path: root/include/gk20a
Diffstat (limited to 'include/gk20a')
-rw-r--r--  include/gk20a/ce2_gk20a.c                     576
-rw-r--r--  include/gk20a/ce2_gk20a.h                     156
-rw-r--r--  include/gk20a/clk_gk20a.h                     134
-rw-r--r--  include/gk20a/css_gr_gk20a.c                  636
-rw-r--r--  include/gk20a/css_gr_gk20a.h                  151
-rw-r--r--  include/gk20a/dbg_gpu_gk20a.c                 388
-rw-r--r--  include/gk20a/dbg_gpu_gk20a.h                 147
-rw-r--r--  include/gk20a/fecs_trace_gk20a.c              744
-rw-r--r--  include/gk20a/fecs_trace_gk20a.h              45
-rw-r--r--  include/gk20a/fence_gk20a.c                   319
-rw-r--r--  include/gk20a/fence_gk20a.h                   100
-rw-r--r--  include/gk20a/fifo_gk20a.c                    4649
-rw-r--r--  include/gk20a/fifo_gk20a.h                    471
-rw-r--r--  include/gk20a/flcn_gk20a.c                    759
-rw-r--r--  include/gk20a/flcn_gk20a.h                    29
-rw-r--r--  include/gk20a/gk20a.c                         590
-rw-r--r--  include/gk20a/gk20a.h                         33
-rw-r--r--  include/gk20a/gr_ctx_gk20a.c                  486
-rw-r--r--  include/gk20a/gr_ctx_gk20a.h                  206
-rw-r--r--  include/gk20a/gr_ctx_gk20a_sim.c              356
-rw-r--r--  include/gk20a/gr_gk20a.c                      8998
-rw-r--r--  include/gk20a/gr_gk20a.h                      851
-rw-r--r--  include/gk20a/gr_pri_gk20a.h                  261
-rw-r--r--  include/gk20a/hw_bus_gk20a.h                  171
-rw-r--r--  include/gk20a/hw_ccsr_gk20a.h                 163
-rw-r--r--  include/gk20a/hw_ce2_gk20a.h                  87
-rw-r--r--  include/gk20a/hw_ctxsw_prog_gk20a.h           447
-rw-r--r--  include/gk20a/hw_falcon_gk20a.h               559
-rw-r--r--  include/gk20a/hw_fb_gk20a.h                   263
-rw-r--r--  include/gk20a/hw_fifo_gk20a.h                 619
-rw-r--r--  include/gk20a/hw_flush_gk20a.h                187
-rw-r--r--  include/gk20a/hw_gmmu_gk20a.h                 283
-rw-r--r--  include/gk20a/hw_gr_gk20a.h                   3807
-rw-r--r--  include/gk20a/hw_ltc_gk20a.h                  455
-rw-r--r--  include/gk20a/hw_mc_gk20a.h                   291
-rw-r--r--  include/gk20a/hw_pbdma_gk20a.h                575
-rw-r--r--  include/gk20a/hw_perf_gk20a.h                 211
-rw-r--r--  include/gk20a/hw_pram_gk20a.h                 63
-rw-r--r--  include/gk20a/hw_pri_ringmaster_gk20a.h       159
-rw-r--r--  include/gk20a/hw_pri_ringstation_fbp_gk20a.h  231
-rw-r--r--  include/gk20a/hw_pri_ringstation_gpc_gk20a.h  79
-rw-r--r--  include/gk20a/hw_pri_ringstation_sys_gk20a.h  91
-rw-r--r--  include/gk20a/hw_proj_gk20a.h                 167
-rw-r--r--  include/gk20a/hw_pwr_gk20a.h                  823
-rw-r--r--  include/gk20a/hw_ram_gk20a.h                  443
-rw-r--r--  include/gk20a/hw_therm_gk20a.h                367
-rw-r--r--  include/gk20a/hw_timer_gk20a.h                127
-rw-r--r--  include/gk20a/hw_top_gk20a.h                  211
-rw-r--r--  include/gk20a/hw_trim_gk20a.h                 315
-rw-r--r--  include/gk20a/mm_gk20a.c                      654
-rw-r--r--  include/gk20a/mm_gk20a.h                      155
-rw-r--r--  include/gk20a/pmu_gk20a.c                     879
-rw-r--r--  include/gk20a/pmu_gk20a.h                     80
-rw-r--r--  include/gk20a/regops_gk20a.c                  472
-rw-r--r--  include/gk20a/regops_gk20a.h                  90
55 files changed, 34609 insertions, 0 deletions
diff --git a/include/gk20a/ce2_gk20a.c b/include/gk20a/ce2_gk20a.c
new file mode 100644
index 0000000..2a40b08
--- /dev/null
+++ b/include/gk20a/ce2_gk20a.c
@@ -0,0 +1,576 @@
1/*
2 * GK20A Graphics Copy Engine (gr host)
3 *
4 * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include <nvgpu/kmem.h>
26#include <nvgpu/dma.h>
27#include <nvgpu/os_sched.h>
28#include <nvgpu/log.h>
29#include <nvgpu/enabled.h>
30#include <nvgpu/io.h>
31#include <nvgpu/utils.h>
32#include <nvgpu/channel.h>
33#include <nvgpu/power_features/cg.h>
34
35#include "gk20a.h"
36#include "gk20a/fence_gk20a.h"
37
38#include <nvgpu/hw/gk20a/hw_ce2_gk20a.h>
39#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
40#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
41#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
42#include <nvgpu/hw/gk20a/hw_top_gk20a.h>
43#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
44#include <nvgpu/barrier.h>
45
46/*
47 * Copy engine defines line size in pixels
48 */
49#define MAX_CE_SHIFT 31 /* 4Gpixels -1 */
50#define MAX_CE_MASK ((u32) (~(~0U << MAX_CE_SHIFT)))
51#define MAX_CE_ALIGN(a) (a & MAX_CE_MASK)
52
53
54static u32 ce2_nonblockpipe_isr(struct gk20a *g, u32 fifo_intr)
55{
56 nvgpu_log(g, gpu_dbg_intr, "ce2 non-blocking pipe interrupt\n");
57
58 return ce2_intr_status_nonblockpipe_pending_f();
59}
60
61static u32 ce2_blockpipe_isr(struct gk20a *g, u32 fifo_intr)
62{
63 nvgpu_log(g, gpu_dbg_intr, "ce2 blocking pipe interrupt\n");
64
65 return ce2_intr_status_blockpipe_pending_f();
66}
67
68static u32 ce2_launcherr_isr(struct gk20a *g, u32 fifo_intr)
69{
70 nvgpu_log(g, gpu_dbg_intr, "ce2 launch error interrupt\n");
71
72 return ce2_intr_status_launcherr_pending_f();
73}
74
75void gk20a_ce2_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
76{
77 u32 ce2_intr = gk20a_readl(g, ce2_intr_status_r());
78 u32 clear_intr = 0;
79
80 nvgpu_log(g, gpu_dbg_intr, "ce2 isr %08x\n", ce2_intr);
81
82	/* clear blocking interrupts: they exhibit broken behavior */
83 if (ce2_intr & ce2_intr_status_blockpipe_pending_f()) {
84 clear_intr |= ce2_blockpipe_isr(g, ce2_intr);
85 }
86
87 if (ce2_intr & ce2_intr_status_launcherr_pending_f()) {
88 clear_intr |= ce2_launcherr_isr(g, ce2_intr);
89 }
90
91 gk20a_writel(g, ce2_intr_status_r(), clear_intr);
92 return;
93}
94
95u32 gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
96{
97 u32 ops = 0;
98 u32 ce2_intr = gk20a_readl(g, ce2_intr_status_r());
99
100 nvgpu_log(g, gpu_dbg_intr, "ce2 nonstall isr %08x\n", ce2_intr);
101
102 if (ce2_intr & ce2_intr_status_nonblockpipe_pending_f()) {
103 gk20a_writel(g, ce2_intr_status_r(),
104 ce2_nonblockpipe_isr(g, ce2_intr));
105 ops |= (GK20A_NONSTALL_OPS_WAKEUP_SEMAPHORE |
106 GK20A_NONSTALL_OPS_POST_EVENTS);
107 }
108 return ops;
109}
110
111/* static CE app api */
112static void gk20a_ce_put_fences(struct gk20a_gpu_ctx *ce_ctx)
113{
114 u32 i;
115
116 for (i = 0; i < NVGPU_CE_MAX_INFLIGHT_JOBS; i++) {
117 struct gk20a_fence **fence = &ce_ctx->postfences[i];
118 if (*fence) {
119 gk20a_fence_put(*fence);
120 }
121 *fence = NULL;
122 }
123}
124
125/* this API is expected to be called with nvgpu_mutex_acquire(&ce_app->app_mutex) held */
126static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
127{
128 struct nvgpu_list_node *list = &ce_ctx->list;
129
130 ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED;
131
132 nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
133
134 if (nvgpu_mem_is_valid(&ce_ctx->cmd_buf_mem)) {
135 gk20a_ce_put_fences(ce_ctx);
136 nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
137 }
138
139 /*
140 * free the channel
141 * gk20a_channel_close() will also unbind the channel from TSG
142 */
143 gk20a_channel_close(ce_ctx->ch);
144 nvgpu_ref_put(&ce_ctx->tsg->refcount, gk20a_tsg_release);
145
146 /* housekeeping on app */
147 if (list->prev && list->next) {
148 nvgpu_list_del(list);
149 }
150
151 nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
152 nvgpu_mutex_destroy(&ce_ctx->gpu_ctx_mutex);
153
154 nvgpu_kfree(ce_ctx->g, ce_ctx);
155}
156
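As a minimal usage sketch of the locking rule noted above the function (hypothetical caller; ce_app and ce_ctx are assumed to be valid, mirroring what gk20a_ce_delete_context_priv() does later in this file):

	nvgpu_mutex_acquire(&ce_app->app_mutex);
	gk20a_ce_delete_gpu_context(ce_ctx);
	nvgpu_mutex_release(&ce_app->app_mutex);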
157static inline unsigned int gk20a_ce_get_method_size(int request_operation,
158 u64 size)
159{
160 /* failure size */
161 unsigned int methodsize = UINT_MAX;
162 unsigned int iterations = 0;
163 u32 shift;
164 u64 chunk = size;
165 u32 height, width;
166
167 while (chunk) {
168 iterations++;
169
170 shift = MAX_CE_ALIGN(chunk) ? __ffs(MAX_CE_ALIGN(chunk)) :
171 MAX_CE_SHIFT;
172 width = chunk >> shift;
173 height = 1 << shift;
174 width = MAX_CE_ALIGN(width);
175
176 chunk -= (u64) height * width;
177 }
178
179 if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) {
180 methodsize = (2 + (16 * iterations)) * sizeof(u32);
181 } else if (request_operation & NVGPU_CE_MEMSET) {
182 methodsize = (2 + (15 * iterations)) * sizeof(u32);
183 }
184
185 return methodsize;
186}
187
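The decomposition loop above can be hard to follow, so here is a hedged standalone sketch of the same arithmetic (userspace C, not part of the driver; POSIX ffs() stands in for the kernel's __ffs(), hence the -1 adjustment):

#include <stdint.h>
#include <stdio.h>
#include <strings.h>

#define MAX_CE_SHIFT 31
#define MAX_CE_MASK ((uint32_t)(~(~0U << MAX_CE_SHIFT)))
#define MAX_CE_ALIGN(a) ((a) & MAX_CE_MASK)

int main(void)
{
	uint64_t chunk = (5ULL << 30) + 4096;	/* example transfer: 5 GiB + 4 KiB */
	unsigned int iterations = 0;

	while (chunk) {
		uint32_t aligned = MAX_CE_ALIGN(chunk);
		uint32_t shift = aligned ? (uint32_t)(ffs((int)aligned) - 1)
					 : MAX_CE_SHIFT;
		uint64_t width = MAX_CE_ALIGN(chunk >> shift);
		uint64_t height = 1ULL << shift;

		iterations++;
		printf("iter %u: width=%llu height=%llu\n", iterations,
		       (unsigned long long)width, (unsigned long long)height);
		chunk -= height * width;
	}
	/* (2 + 16 * iterations) words for a copy, (2 + 15 * iterations) for a memset */
	printf("copy method size = %u bytes\n",
	       (2u + 16u * iterations) * (unsigned int)sizeof(uint32_t));
	return 0;
}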
188int gk20a_ce_prepare_submit(u64 src_buf,
189 u64 dst_buf,
190 u64 size,
191 u32 *cmd_buf_cpu_va,
192 u32 max_cmd_buf_size,
193 unsigned int payload,
194 int launch_flags,
195 int request_operation,
196 u32 dma_copy_class)
197{
198 u32 launch = 0;
199 u32 methodSize = 0;
200 u64 offset = 0;
201 u64 chunk_size = 0;
202 u64 chunk = size;
203
204 /* failure case handling */
205 if ((gk20a_ce_get_method_size(request_operation, size) >
206 max_cmd_buf_size) || (!size) ||
207 (request_operation > NVGPU_CE_MEMSET)) {
208 return 0;
209 }
210
211 /* set the channel object */
212 cmd_buf_cpu_va[methodSize++] = 0x20018000;
213 cmd_buf_cpu_va[methodSize++] = dma_copy_class;
214
215 /*
216	 * The purpose is to clear the memory in 2D rectangles. We use ffs to
217	 * determine the number of lines to copy. The only constraint is that the
218	 * maximum number of pixels per line is 4Gpix - 1, which is awkward for
219	 * calculation, so we settle on 2Gpix per line to make the calculation
220	 * more agreeable.
221 */
222
223	/* The copy engine in 2D mode can have (2^32 - 1) x (2^32 - 1) pixels in
224	 * a single submit, so we clear a range of up to 2Gpix spread across
225	 * multiple lines. Because we want byte-aligned copies we will be
226	 * setting 1 byte pixels */
227
228 /*
229 * per iteration
230 * <------------------------- 40 bits ------------------------------>
231 * 1 <------ ffs ------->
232 * <-----------up to 30 bits----------->
233 */
234 while (chunk) {
235 u32 width, height, shift;
236
237 /*
238 * We will be aligning to bytes, making the maximum number of
239 * pix per line 2Gb
240 */
241
242 shift = MAX_CE_ALIGN(chunk) ? __ffs(MAX_CE_ALIGN(chunk)) :
243 MAX_CE_SHIFT;
244 height = chunk >> shift;
245 width = 1 << shift;
246 height = MAX_CE_ALIGN(height);
247
248 chunk_size = (u64) height * width;
249
250 /* reset launch flag */
251 launch = 0;
252
253 if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) {
254 /* setup the source */
255 cmd_buf_cpu_va[methodSize++] = 0x20028100;
256 cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf +
257 offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
258 cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf +
259 offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
260
261 cmd_buf_cpu_va[methodSize++] = 0x20018098;
262 if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) {
263 cmd_buf_cpu_va[methodSize++] = 0x00000000;
264 } else if (launch_flags &
265 NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) {
266 cmd_buf_cpu_va[methodSize++] = 0x00000002;
267 } else {
268 cmd_buf_cpu_va[methodSize++] = 0x00000001;
269 }
270
271 launch |= 0x00001000;
272 } else if (request_operation & NVGPU_CE_MEMSET) {
273 /* Remap from component A on 1 byte wide pixels */
274 cmd_buf_cpu_va[methodSize++] = 0x200181c2;
275 cmd_buf_cpu_va[methodSize++] = 0x00000004;
276
277 cmd_buf_cpu_va[methodSize++] = 0x200181c0;
278 cmd_buf_cpu_va[methodSize++] = payload;
279
280 launch |= 0x00000400;
281 } else {
282 /* Illegal size */
283 return 0;
284 }
285
286 /* setup the destination/output */
287 cmd_buf_cpu_va[methodSize++] = 0x20068102;
288 cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf +
289 offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
290 cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf +
291 offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
292 /* Pitch in/out */
293 cmd_buf_cpu_va[methodSize++] = width;
294 cmd_buf_cpu_va[methodSize++] = width;
295 /* width and line count */
296 cmd_buf_cpu_va[methodSize++] = width;
297 cmd_buf_cpu_va[methodSize++] = height;
298
299 cmd_buf_cpu_va[methodSize++] = 0x20018099;
300 if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) {
301 cmd_buf_cpu_va[methodSize++] = 0x00000000;
302 } else if (launch_flags &
303 NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) {
304 cmd_buf_cpu_va[methodSize++] = 0x00000002;
305 } else {
306 cmd_buf_cpu_va[methodSize++] = 0x00000001;
307 }
308
309 launch |= 0x00002005;
310
311 if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) {
312 launch |= 0x00000000;
313 } else {
314 launch |= 0x00000080;
315 }
316
317 if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) {
318 launch |= 0x00000000;
319 } else {
320 launch |= 0x00000100;
321 }
322
323 cmd_buf_cpu_va[methodSize++] = 0x200180c0;
324 cmd_buf_cpu_va[methodSize++] = launch;
325 offset += chunk_size;
326 chunk -= chunk_size;
327 }
328
329 return methodSize;
330}
331
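A hedged caller sketch for the routine above; dst_gpu_va and dma_copy_class are assumptions supplied by the caller, the flag and size constants come from ce2_gk20a.h, and a 1 MiB memset fits comfortably in one kickoff-sized buffer:

	u32 cmd_buf[NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF / sizeof(u32)];
	u32 words;

	/* build the methods for a 1 MiB memset of 0xAA bytes */
	words = gk20a_ce_prepare_submit(0ULL,		/* src unused for memset */
			dst_gpu_va,			/* assumed destination GPU VA */
			1024ULL * 1024ULL,		/* size in bytes */
			cmd_buf,
			sizeof(cmd_buf),
			0xAAAAAAAAU,			/* payload pattern */
			NVGPU_CE_DST_LOCATION_LOCAL_FB |
			NVGPU_CE_DST_MEMORY_LAYOUT_PITCH,
			NVGPU_CE_MEMSET,
			dma_copy_class);		/* assumed per-chip copy class */
	if (words == 0) {
		/* rejected: zero size, bad operation, or buffer too small */
	}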
332/* global CE app related apis */
333int gk20a_init_ce_support(struct gk20a *g)
334{
335 struct gk20a_ce_app *ce_app = &g->ce_app;
336 int err;
337 u32 ce_reset_mask;
338
339 ce_reset_mask = gk20a_fifo_get_all_ce_engine_reset_mask(g);
340
341 g->ops.mc.reset(g, ce_reset_mask);
342
343 nvgpu_cg_slcg_ce2_load_enable(g);
344
345 nvgpu_cg_blcg_ce_load_enable(g);
346
347 if (ce_app->initialised) {
348		/* this is assumed to happen during the poweron/poweroff GPU sequence */
349 ce_app->app_state = NVGPU_CE_ACTIVE;
350 return 0;
351 }
352
353 nvgpu_log(g, gpu_dbg_fn, "ce: init");
354
355 err = nvgpu_mutex_init(&ce_app->app_mutex);
356 if (err) {
357 return err;
358 }
359
360 nvgpu_mutex_acquire(&ce_app->app_mutex);
361
362 nvgpu_init_list_node(&ce_app->allocated_contexts);
363 ce_app->ctx_count = 0;
364 ce_app->next_ctx_id = 0;
365 ce_app->initialised = true;
366 ce_app->app_state = NVGPU_CE_ACTIVE;
367
368 nvgpu_mutex_release(&ce_app->app_mutex);
369
370 if (g->ops.ce2.init_prod_values != NULL) {
371 g->ops.ce2.init_prod_values(g);
372 }
373
374 nvgpu_log(g, gpu_dbg_cde_ctx, "ce: init finished");
375
376 return 0;
377}
378
379void gk20a_ce_destroy(struct gk20a *g)
380{
381 struct gk20a_ce_app *ce_app = &g->ce_app;
382 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
383
384 if (!ce_app->initialised) {
385 return;
386 }
387
388 ce_app->app_state = NVGPU_CE_SUSPEND;
389 ce_app->initialised = false;
390
391 nvgpu_mutex_acquire(&ce_app->app_mutex);
392
393 nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
394 &ce_app->allocated_contexts, gk20a_gpu_ctx, list) {
395 gk20a_ce_delete_gpu_context(ce_ctx);
396 }
397
398 nvgpu_init_list_node(&ce_app->allocated_contexts);
399 ce_app->ctx_count = 0;
400 ce_app->next_ctx_id = 0;
401
402 nvgpu_mutex_release(&ce_app->app_mutex);
403
404 nvgpu_mutex_destroy(&ce_app->app_mutex);
405}
406
407void gk20a_ce_suspend(struct gk20a *g)
408{
409 struct gk20a_ce_app *ce_app = &g->ce_app;
410
411 if (!ce_app->initialised) {
412 return;
413 }
414
415 ce_app->app_state = NVGPU_CE_SUSPEND;
416
417 return;
418}
419
420/* CE app utility functions */
421u32 gk20a_ce_create_context(struct gk20a *g,
422 int runlist_id,
423 int timeslice,
424 int runlist_level)
425{
426 struct gk20a_gpu_ctx *ce_ctx;
427 struct gk20a_ce_app *ce_app = &g->ce_app;
428 struct nvgpu_setup_bind_args setup_bind_args;
429 u32 ctx_id = ~0;
430 int err = 0;
431
432 if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE) {
433 return ctx_id;
434 }
435
436 ce_ctx = nvgpu_kzalloc(g, sizeof(*ce_ctx));
437 if (!ce_ctx) {
438 return ctx_id;
439 }
440
441 err = nvgpu_mutex_init(&ce_ctx->gpu_ctx_mutex);
442 if (err) {
443 nvgpu_kfree(g, ce_ctx);
444 return ctx_id;
445 }
446
447 ce_ctx->g = g;
448
449 ce_ctx->cmd_buf_read_queue_offset = 0;
450
451 ce_ctx->vm = g->mm.ce.vm;
452
453 /* allocate a tsg if needed */
454 ce_ctx->tsg = gk20a_tsg_open(g, nvgpu_current_pid(g));
455 if (!ce_ctx->tsg) {
456 nvgpu_err(g, "ce: gk20a tsg not available");
457 err = -ENOMEM;
458 goto end;
459 }
460
461	/* the kernel client always needs a privileged channel */
462 ce_ctx->ch = gk20a_open_new_channel(g, runlist_id, true,
463 nvgpu_current_pid(g), nvgpu_current_tid(g));
464 if (!ce_ctx->ch) {
465 nvgpu_err(g, "ce: gk20a channel not available");
466 err = -ENOMEM;
467 goto end;
468 }
469 ce_ctx->ch->timeout.enabled = false;
470
471 /* bind the channel to the vm */
472 err = g->ops.mm.vm_bind_channel(g->mm.ce.vm, ce_ctx->ch);
473 if (err) {
474 nvgpu_err(g, "ce: could not bind vm");
475 goto end;
476 }
477
478 err = gk20a_tsg_bind_channel(ce_ctx->tsg, ce_ctx->ch);
479 if (err) {
480 nvgpu_err(g, "ce: unable to bind to tsg");
481 goto end;
482 }
483
484 setup_bind_args.num_gpfifo_entries = 1024;
485 setup_bind_args.num_inflight_jobs = 0;
486 setup_bind_args.flags = 0;
487 /* allocate gpfifo (1024 should be more than enough) */
488 err = nvgpu_channel_setup_bind(ce_ctx->ch, &setup_bind_args);
489 if (err) {
490 nvgpu_err(g, "ce: unable to setup and bind channel");
491 goto end;
492 }
493
494 /* allocate command buffer from sysmem */
495 err = nvgpu_dma_alloc_map_sys(ce_ctx->vm,
496 NVGPU_CE_MAX_INFLIGHT_JOBS *
497 NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF,
498 &ce_ctx->cmd_buf_mem);
499 if (err) {
500 nvgpu_err(g,
501 "ce: could not allocate command buffer for CE context");
502 goto end;
503 }
504
505 memset(ce_ctx->cmd_buf_mem.cpu_va, 0x00, ce_ctx->cmd_buf_mem.size);
506
507 /* -1 means default channel timeslice value */
508 if (timeslice != -1) {
509 err = gk20a_fifo_tsg_set_timeslice(ce_ctx->tsg, timeslice);
510 if (err) {
511 nvgpu_err(g,
512 "ce: could not set the channel timeslice value for CE context");
513 goto end;
514 }
515 }
516
517 /* -1 means default channel runlist level */
518 if (runlist_level != -1) {
519 err = gk20a_tsg_set_runlist_interleave(ce_ctx->tsg,
520 runlist_level);
521 if (err) {
522 nvgpu_err(g,
523 "ce: could not set the runlist interleave for CE context");
524 goto end;
525 }
526 }
527
528 nvgpu_mutex_acquire(&ce_app->app_mutex);
529 ctx_id = ce_ctx->ctx_id = ce_app->next_ctx_id;
530 nvgpu_list_add(&ce_ctx->list, &ce_app->allocated_contexts);
531 ++ce_app->next_ctx_id;
532 ++ce_app->ctx_count;
533 nvgpu_mutex_release(&ce_app->app_mutex);
534
535 ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_ALLOCATED;
536
537end:
538 if (ctx_id == (u32)~0) {
539 nvgpu_mutex_acquire(&ce_app->app_mutex);
540 gk20a_ce_delete_gpu_context(ce_ctx);
541 nvgpu_mutex_release(&ce_app->app_mutex);
542 }
543 return ctx_id;
544
545}
546
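A short usage sketch for the helper above (hedged; ce_runlist_id is an assumption that would normally come from the FIFO engine info, and -1 selects the defaults as noted in the comments):

	u32 ce_ctx_id = gk20a_ce_create_context(g,
			ce_runlist_id,	/* assumed: a valid async CE runlist id */
			-1,		/* default channel timeslice */
			-1);		/* default runlist interleave level */
	if (ce_ctx_id == (u32)~0) {
		/* creation failed; nothing to clean up */
	}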
547void gk20a_ce_delete_context(struct gk20a *g,
548 u32 ce_ctx_id)
549{
550 gk20a_ce_delete_context_priv(g, ce_ctx_id);
551}
552
553void gk20a_ce_delete_context_priv(struct gk20a *g,
554 u32 ce_ctx_id)
555{
556 struct gk20a_ce_app *ce_app = &g->ce_app;
557 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
558
559 if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE) {
560 return;
561 }
562
563 nvgpu_mutex_acquire(&ce_app->app_mutex);
564
565 nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
566 &ce_app->allocated_contexts, gk20a_gpu_ctx, list) {
567 if (ce_ctx->ctx_id == ce_ctx_id) {
568 gk20a_ce_delete_gpu_context(ce_ctx);
569 --ce_app->ctx_count;
570 break;
571 }
572 }
573
574 nvgpu_mutex_release(&ce_app->app_mutex);
575 return;
576}
diff --git a/include/gk20a/ce2_gk20a.h b/include/gk20a/ce2_gk20a.h
new file mode 100644
index 0000000..df3a0e8
--- /dev/null
+++ b/include/gk20a/ce2_gk20a.h
@@ -0,0 +1,156 @@
1/*
2 * drivers/video/tegra/host/gk20a/fifo_gk20a.h
3 *
4 * GK20A graphics copy engine (gr host)
5 *
6 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 * DEALINGS IN THE SOFTWARE.
25 */
26#ifndef NVGPU_GK20A_CE2_GK20A_H
27#define NVGPU_GK20A_CE2_GK20A_H
28
29struct channel_gk20a;
30struct tsg_gk20a;
31
32void gk20a_ce2_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
33u32 gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
34
35/* CE command utility macros */
36#define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff
37#define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff
38
39#define NVGPU_CE_MAX_INFLIGHT_JOBS 32
40#define NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF 256
41
42/* dma launch_flags */
43enum {
44 /* location */
45 NVGPU_CE_SRC_LOCATION_COHERENT_SYSMEM = (1 << 0),
46 NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM = (1 << 1),
47 NVGPU_CE_SRC_LOCATION_LOCAL_FB = (1 << 2),
48 NVGPU_CE_DST_LOCATION_COHERENT_SYSMEM = (1 << 3),
49 NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM = (1 << 4),
50 NVGPU_CE_DST_LOCATION_LOCAL_FB = (1 << 5),
51
52 /* memory layout */
53 NVGPU_CE_SRC_MEMORY_LAYOUT_PITCH = (1 << 6),
54 NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR = (1 << 7),
55 NVGPU_CE_DST_MEMORY_LAYOUT_PITCH = (1 << 8),
56 NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR = (1 << 9),
57
58 /* transfer type */
59 NVGPU_CE_DATA_TRANSFER_TYPE_PIPELINED = (1 << 10),
60 NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED = (1 << 11),
61};
62
63/* CE operation mode */
64enum {
65 NVGPU_CE_PHYS_MODE_TRANSFER = (1 << 0),
66 NVGPU_CE_MEMSET = (1 << 1),
67};
68
69/* CE app state machine flags */
70enum {
71 NVGPU_CE_ACTIVE = (1 << 0),
72 NVGPU_CE_SUSPEND = (1 << 1),
73};
74
75/* gpu context state machine flags */
76enum {
77 NVGPU_CE_GPU_CTX_ALLOCATED = (1 << 0),
78 NVGPU_CE_GPU_CTX_DELETED = (1 << 1),
79};
80
81/* global ce app db */
82struct gk20a_ce_app {
83 bool initialised;
84 struct nvgpu_mutex app_mutex;
85 int app_state;
86
87 struct nvgpu_list_node allocated_contexts;
88 u32 ctx_count;
89 u32 next_ctx_id;
90};
91
92/* ce context db */
93struct gk20a_gpu_ctx {
94 struct gk20a *g;
95 u32 ctx_id;
96 struct nvgpu_mutex gpu_ctx_mutex;
97 int gpu_ctx_state;
98
99 /* tsg related data */
100 struct tsg_gk20a *tsg;
101
102 /* channel related data */
103 struct channel_gk20a *ch;
104 struct vm_gk20a *vm;
105
106 /* cmd buf mem_desc */
107 struct nvgpu_mem cmd_buf_mem;
108 struct gk20a_fence *postfences[NVGPU_CE_MAX_INFLIGHT_JOBS];
109
110 struct nvgpu_list_node list;
111
112 u32 cmd_buf_read_queue_offset;
113};
114
115static inline struct gk20a_gpu_ctx *
116gk20a_gpu_ctx_from_list(struct nvgpu_list_node *node)
117{
118 return (struct gk20a_gpu_ctx *)
119 ((uintptr_t)node - offsetof(struct gk20a_gpu_ctx, list));
120};
121
122/* global CE app related apis */
123int gk20a_init_ce_support(struct gk20a *g);
124void gk20a_ce_suspend(struct gk20a *g);
125void gk20a_ce_destroy(struct gk20a *g);
126
127/* CE app utility functions */
128u32 gk20a_ce_create_context(struct gk20a *g,
129 int runlist_id,
130 int timeslice,
131 int runlist_level);
132int gk20a_ce_execute_ops(struct gk20a *g,
133 u32 ce_ctx_id,
134 u64 src_buf,
135 u64 dst_buf,
136 u64 size,
137 unsigned int payload,
138 int launch_flags,
139 int request_operation,
140 u32 submit_flags,
141 struct gk20a_fence **gk20a_fence_out);
142void gk20a_ce_delete_context_priv(struct gk20a *g,
143 u32 ce_ctx_id);
144void gk20a_ce_delete_context(struct gk20a *g,
145 u32 ce_ctx_id);
146int gk20a_ce_prepare_submit(u64 src_buf,
147 u64 dst_buf,
148 u64 size,
149 u32 *cmd_buf_cpu_va,
150 u32 max_cmd_buf_size,
151 unsigned int payload,
152 int launch_flags,
153 int request_operation,
154 u32 dma_copy_class);
155
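As an illustration of how the flag groups above combine for gk20a_ce_execute_ops() (hedged sketch; g, ce_ctx_id, src_gpu_va, dst_gpu_va and size are assumed to come from the caller, and the submit_flags value of 0 is an assumption):

	struct gk20a_fence *ce_fence = NULL;
	int err;

	/* pipelined copy from non-coherent sysmem into vidmem */
	err = gk20a_ce_execute_ops(g, ce_ctx_id,
			src_gpu_va, dst_gpu_va, size,
			0,					/* payload unused for copies */
			NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM |
			NVGPU_CE_DST_LOCATION_LOCAL_FB |
			NVGPU_CE_DATA_TRANSFER_TYPE_PIPELINED,
			NVGPU_CE_PHYS_MODE_TRANSFER,
			0,
			&ce_fence);
	/* on success, ce_fence points to the post-fence for the submitted work */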
156#endif /*NVGPU_GK20A_CE2_GK20A_H*/
diff --git a/include/gk20a/clk_gk20a.h b/include/gk20a/clk_gk20a.h
new file mode 100644
index 0000000..b8ec942
--- /dev/null
+++ b/include/gk20a/clk_gk20a.h
@@ -0,0 +1,134 @@
1/*
2 * Copyright (c) 2011 - 2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22#ifndef CLK_GK20A_H
23#define CLK_GK20A_H
24
25#include <nvgpu/lock.h>
26
27#if defined(CONFIG_COMMON_CLK)
28#include <linux/clk-provider.h>
29#endif
30
31#define GPUFREQ_TABLE_END ~(u32)1
32enum {
33 /* only one PLL for gk20a */
34 GK20A_GPC_PLL = 0,
35 /* 2 PLL revisions for gm20b */
36 GM20B_GPC_PLL_B1,
37 GM20B_GPC_PLL_C1,
38};
39
40enum gpc_pll_mode {
41 GPC_PLL_MODE_F = 0, /* fixed frequency mode a.k.a legacy mode */
42 GPC_PLL_MODE_DVFS, /* DVFS mode a.k.a NA mode */
43};
44
45struct na_dvfs {
46 u32 n_int;
47 u32 sdm_din;
48 int dfs_coeff;
49 int dfs_det_max;
50 int dfs_ext_cal;
51 int uv_cal;
52 int mv;
53};
54
55struct pll {
56 u32 id;
57 u32 clk_in; /* KHz */
58 u32 M;
59 u32 N;
60 u32 PL;
61 u32 freq; /* KHz */
62 bool enabled;
63 enum gpc_pll_mode mode;
64 struct na_dvfs dvfs;
65};
66
67struct pll_parms {
68 u32 min_freq, max_freq; /* KHz */
69 u32 min_vco, max_vco; /* KHz */
70 u32 min_u, max_u; /* KHz */
71 u32 min_M, max_M;
72 u32 min_N, max_N;
73 u32 min_PL, max_PL;
74 /* NA mode parameters*/
75 int coeff_slope, coeff_offs; /* coeff = slope * V + offs */
76 int uvdet_slope, uvdet_offs; /* uV = slope * det + offs */
77 u32 vco_ctrl;
78 /*
79 * Timing parameters in us. Lock timeout is applied to locking in fixed
80 * frequency mode and to dynamic ramp in any mode; does not affect lock
82 * latency, since the lock/ramp done status bit is polled. NA mode lock
83 * and IDDQ exit delays set the time of the respective operations with
83 * no status polling.
84 */
85 u32 lock_timeout;
86 u32 na_lock_delay;
87 u32 iddq_exit_delay;
88 /* NA mode DFS control */
89 u32 dfs_ctrl;
90};
91
92struct namemap_cfg;
93
94struct clk_gk20a {
95 struct gk20a *g;
96#if defined(CONFIG_COMMON_CLK)
97 struct clk *tegra_clk;
98 struct clk *tegra_clk_parent;
99 struct clk_hw hw;
100#endif
101 struct pll gpc_pll;
102 struct pll gpc_pll_last;
103 struct nvgpu_mutex clk_mutex;
104 struct namemap_cfg *clk_namemap;
105 u32 namemap_num;
106 u32 *namemap_xlat_table;
107 bool sw_ready;
108 bool clk_hw_on;
109 bool debugfs_set;
110 int pll_poweron_uv;
111 unsigned long dvfs_safe_max_freq;
112};
113
114#if defined(CONFIG_COMMON_CLK)
115#define to_clk_gk20a(_hw) container_of(_hw, struct clk_gk20a, hw)
116#endif
117
118struct gpu_ops;
119
120#define KHZ 1000
121#define MHZ 1000000
122
123static inline unsigned long rate_gpc2clk_to_gpu(unsigned long rate)
124{
125 /* convert the kHz gpc2clk frequency to Hz gpcpll frequency */
126 return (rate * KHZ) / 2;
127}
128static inline unsigned long rate_gpu_to_gpc2clk(unsigned long rate)
129{
130 /* convert the Hz gpcpll frequency to kHz gpc2clk frequency */
131 return (rate * 2) / KHZ;
132}
133
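A worked example of the conversion helpers above (values are illustrative only):

	/* rate_gpc2clk_to_gpu(2000000 kHz)   = (2000000 * 1000) / 2 = 1000000000 Hz (1 GHz) */
	/* rate_gpu_to_gpc2clk(1000000000 Hz) = (1000000000 * 2) / 1000 = 2000000 kHz */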
134#endif /* CLK_GK20A_H */
diff --git a/include/gk20a/css_gr_gk20a.c b/include/gk20a/css_gr_gk20a.c
new file mode 100644
index 0000000..28a3d49
--- /dev/null
+++ b/include/gk20a/css_gr_gk20a.c
@@ -0,0 +1,636 @@
1/*
2 * GK20A Cycle stats snapshots support (subsystem for gr_gk20a).
3 *
4 * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include <nvgpu/bitops.h>
26#include <nvgpu/kmem.h>
27#include <nvgpu/lock.h>
28#include <nvgpu/dma.h>
29#include <nvgpu/mm.h>
30#include <nvgpu/sizes.h>
31#include <nvgpu/barrier.h>
32#include <nvgpu/log.h>
33#include <nvgpu/bug.h>
34#include <nvgpu/io.h>
35#include <nvgpu/utils.h>
36#include <nvgpu/channel.h>
37#include <nvgpu/unit.h>
38
39#include "gk20a.h"
40#include "css_gr_gk20a.h"
41
42#include <nvgpu/hw/gk20a/hw_perf_gk20a.h>
43
44/* check client for pointed perfmon ownership */
45#define CONTAINS_PERFMON(cl, pm) \
46 ((cl)->perfmon_start <= (pm) && \
47 ((pm) - (cl)->perfmon_start) < (cl)->perfmon_count)
48
49/* address of fifo entry by offset */
50#define CSS_FIFO_ENTRY(fifo, offs) \
51 ((struct gk20a_cs_snapshot_fifo_entry *)(((char *)(fifo)) + (offs)))
52
53/* calculate area capacity in number of fifo entries */
54#define CSS_FIFO_ENTRY_CAPACITY(s) \
55 (((s) - sizeof(struct gk20a_cs_snapshot_fifo)) \
56 / sizeof(struct gk20a_cs_snapshot_fifo_entry))
57
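A worked sizing example for the capacity macro above (hedged standalone sketch; the 64-byte header and 32-byte entry sizes are what the structs in css_gr_gk20a.h add up to, but verify sizeof on the actual build):

#include <stdio.h>

int main(void)
{
	unsigned long buf_size = 8ul * 1024 * 1024;	/* CSS_MIN_HW_SNAPSHOT_SIZE */
	unsigned long fifo_header = 64;			/* 16 x u32 fields */
	unsigned long fifo_entry = 32;			/* packed snapshot entry */

	/* mirrors CSS_FIFO_ENTRY_CAPACITY() */
	printf("capacity = %lu entries\n",
	       (buf_size - fifo_header) / fifo_entry);	/* 262142 */
	return 0;
}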
58/* reserved to indicate failures with data */
59#define CSS_FIRST_PERFMON_ID 32
60/* should correlate with size of gk20a_cs_snapshot_fifo_entry::perfmon_id */
61#define CSS_MAX_PERFMON_IDS 256
62
63/* reports whether the hw queue overflowed */
64bool css_hw_get_overflow_status(struct gk20a *g)
65{
66 const u32 st = perf_pmasys_control_membuf_status_overflowed_f();
67 return st == (gk20a_readl(g, perf_pmasys_control_r()) & st);
68}
69
70/* returns how many snapshot entries are pending */
71u32 css_hw_get_pending_snapshots(struct gk20a *g)
72{
73 return gk20a_readl(g, perf_pmasys_mem_bytes_r()) /
74 sizeof(struct gk20a_cs_snapshot_fifo_entry);
75}
76
77/* informs hw how many snapshots have been processed (frees up fifo space) */
78void css_hw_set_handled_snapshots(struct gk20a *g, u32 done)
79{
80 if (done > 0) {
81 gk20a_writel(g, perf_pmasys_mem_bump_r(),
82 done * sizeof(struct gk20a_cs_snapshot_fifo_entry));
83 }
84}
85
86/* disable streaming to memory */
87static void css_hw_reset_streaming(struct gk20a *g)
88{
89 u32 engine_status;
90
91 /* reset the perfmon */
92 g->ops.mc.reset(g, g->ops.mc.reset_mask(g, NVGPU_UNIT_PERFMON));
93
94	/* RBUFEMPTY must be set -- otherwise we'll pick up */
95	/* snapshots that have been queued up earlier */
96 engine_status = gk20a_readl(g, perf_pmasys_enginestatus_r());
97 WARN_ON(0 == (engine_status
98 & perf_pmasys_enginestatus_rbufempty_empty_f()));
99
100 /* turn off writes */
101 gk20a_writel(g, perf_pmasys_control_r(),
102 perf_pmasys_control_membuf_clear_status_doit_f());
103
104	/* mark all pending snapshots as handled */
105 css_hw_set_handled_snapshots(g, css_hw_get_pending_snapshots(g));
106}
107
108/*
109 * WARNING: all css_gr_XXX functions are local and expected to be called
110 * from locked context (protected by cs_lock)
111 */
112
113static int css_gr_create_shared_data(struct gr_gk20a *gr)
114{
115 struct gk20a_cs_snapshot *data;
116
117 if (gr->cs_data)
118 return 0;
119
120 data = nvgpu_kzalloc(gr->g, sizeof(*data));
121 if (!data)
122 return -ENOMEM;
123
124 nvgpu_init_list_node(&data->clients);
125 gr->cs_data = data;
126
127 return 0;
128}
129
130int css_hw_enable_snapshot(struct channel_gk20a *ch,
131 struct gk20a_cs_snapshot_client *cs_client)
132{
133 struct gk20a *g = ch->g;
134 struct mm_gk20a *mm = &g->mm;
135 struct gr_gk20a *gr = &g->gr;
136 struct gk20a_cs_snapshot *data = gr->cs_data;
137 u32 snapshot_size = cs_client->snapshot_size;
138 int ret;
139
140 u32 virt_addr_lo;
141 u32 virt_addr_hi;
142 u32 inst_pa_page;
143
144 if (data->hw_snapshot)
145 return 0;
146
147 if (snapshot_size < CSS_MIN_HW_SNAPSHOT_SIZE)
148 snapshot_size = CSS_MIN_HW_SNAPSHOT_SIZE;
149
150 ret = nvgpu_dma_alloc_map_sys(g->mm.pmu.vm, snapshot_size,
151 &data->hw_memdesc);
152 if (ret)
153 return ret;
154
155	/* the perf output buffer may not cross a 4GB boundary - with a */
156	/* separate va smaller than that it won't, but check anyway */
157 if (!data->hw_memdesc.cpu_va ||
158 data->hw_memdesc.size < snapshot_size ||
159 data->hw_memdesc.gpu_va + u64_lo32(snapshot_size) > SZ_4G) {
160 ret = -EFAULT;
161 goto failed_allocation;
162 }
163
164 data->hw_snapshot =
165 (struct gk20a_cs_snapshot_fifo_entry *)data->hw_memdesc.cpu_va;
166 data->hw_end = data->hw_snapshot +
167 snapshot_size / sizeof(struct gk20a_cs_snapshot_fifo_entry);
168 data->hw_get = data->hw_snapshot;
169 memset(data->hw_snapshot, 0xff, snapshot_size);
170
171 /* address and size are aligned to 32 bytes, the lowest bits read back
172 * as zeros */
173 virt_addr_lo = u64_lo32(data->hw_memdesc.gpu_va);
174 virt_addr_hi = u64_hi32(data->hw_memdesc.gpu_va);
175
176 css_hw_reset_streaming(g);
177
178 gk20a_writel(g, perf_pmasys_outbase_r(), virt_addr_lo);
179 gk20a_writel(g, perf_pmasys_outbaseupper_r(),
180 perf_pmasys_outbaseupper_ptr_f(virt_addr_hi));
181 gk20a_writel(g, perf_pmasys_outsize_r(), snapshot_size);
182
183 /* this field is aligned to 4K */
184 inst_pa_page = nvgpu_inst_block_addr(g, &g->mm.hwpm.inst_block) >> 12;
185
186 /* A write to MEM_BLOCK triggers the block bind operation. MEM_BLOCK
187 * should be written last */
188 gk20a_writel(g, perf_pmasys_mem_block_r(),
189 perf_pmasys_mem_block_base_f(inst_pa_page) |
190 nvgpu_aperture_mask(g, &mm->hwpm.inst_block,
191 perf_pmasys_mem_block_target_sys_ncoh_f(),
192 perf_pmasys_mem_block_target_sys_coh_f(),
193 perf_pmasys_mem_block_target_lfb_f()) |
194 perf_pmasys_mem_block_valid_true_f());
195
196 nvgpu_log_info(g, "cyclestats: buffer for hardware snapshots enabled\n");
197
198 return 0;
199
200failed_allocation:
201 if (data->hw_memdesc.size) {
202 nvgpu_dma_unmap_free(g->mm.pmu.vm, &data->hw_memdesc);
203 memset(&data->hw_memdesc, 0, sizeof(data->hw_memdesc));
204 }
205 data->hw_snapshot = NULL;
206
207 return ret;
208}
209
210void css_hw_disable_snapshot(struct gr_gk20a *gr)
211{
212 struct gk20a *g = gr->g;
213 struct gk20a_cs_snapshot *data = gr->cs_data;
214
215 if (!data->hw_snapshot)
216 return;
217
218 css_hw_reset_streaming(g);
219
220 gk20a_writel(g, perf_pmasys_outbase_r(), 0);
221 gk20a_writel(g, perf_pmasys_outbaseupper_r(),
222 perf_pmasys_outbaseupper_ptr_f(0));
223 gk20a_writel(g, perf_pmasys_outsize_r(), 0);
224
225 gk20a_writel(g, perf_pmasys_mem_block_r(),
226 perf_pmasys_mem_block_base_f(0) |
227 perf_pmasys_mem_block_valid_false_f() |
228 perf_pmasys_mem_block_target_f(0));
229
230 nvgpu_dma_unmap_free(g->mm.pmu.vm, &data->hw_memdesc);
231 memset(&data->hw_memdesc, 0, sizeof(data->hw_memdesc));
232 data->hw_snapshot = NULL;
233
234 nvgpu_log_info(g, "cyclestats: buffer for hardware snapshots disabled\n");
235}
236
237static void css_gr_free_shared_data(struct gr_gk20a *gr)
238{
239 struct gk20a *g = gr->g;
240
241 if (gr->cs_data) {
242 /* the clients list is expected to be empty */
243 g->ops.css.disable_snapshot(gr);
244
245 /* release the objects */
246 nvgpu_kfree(gr->g, gr->cs_data);
247 gr->cs_data = NULL;
248 }
249}
250
251
252struct gk20a_cs_snapshot_client*
253css_gr_search_client(struct nvgpu_list_node *clients, u32 perfmon)
254{
255 struct gk20a_cs_snapshot_client *client;
256
257 nvgpu_list_for_each_entry(client, clients,
258 gk20a_cs_snapshot_client, list) {
259 if (CONTAINS_PERFMON(client, perfmon))
260 return client;
261 }
262
263 return NULL;
264}
265
266static int css_gr_flush_snapshots(struct channel_gk20a *ch)
267{
268 struct gk20a *g = ch->g;
269 struct gr_gk20a *gr = &g->gr;
270 struct gk20a_cs_snapshot *css = gr->cs_data;
271 struct gk20a_cs_snapshot_client *cur;
272 u32 pending, completed;
273 bool hw_overflow;
274 int err;
275
276 /* variables for iterating over HW entries */
277 u32 sid;
278 struct gk20a_cs_snapshot_fifo_entry *src;
279
280	/* because the buffer is shared with userspace, we only update the */
281	/* overflow counters and the put field in the fifo header */
282 struct gk20a_cs_snapshot_fifo *dst;
283 struct gk20a_cs_snapshot_fifo_entry *dst_get;
284 struct gk20a_cs_snapshot_fifo_entry *dst_put;
285 struct gk20a_cs_snapshot_fifo_entry *dst_nxt;
286 struct gk20a_cs_snapshot_fifo_entry *dst_head;
287 struct gk20a_cs_snapshot_fifo_entry *dst_tail;
288
289 if (!css)
290 return -EINVAL;
291
292 if (nvgpu_list_empty(&css->clients))
293 return -EBADF;
294
295 /* check data available */
296 err = g->ops.css.check_data_available(ch, &pending, &hw_overflow);
297 if (err)
298 return err;
299
300 if (!pending)
301 return 0;
302
303 if (hw_overflow) {
304 nvgpu_list_for_each_entry(cur, &css->clients,
305 gk20a_cs_snapshot_client, list) {
306 cur->snapshot->hw_overflow_events_occured++;
307 }
308
309 nvgpu_warn(g, "cyclestats: hardware overflow detected");
310 }
311
312 /* process all items in HW buffer */
313 sid = 0;
314 completed = 0;
315 cur = NULL;
316 dst = NULL;
317 dst_put = NULL;
318 src = css->hw_get;
319
320	/* process all completed records */
321 while (sid < pending && 0 == src->zero0) {
322		/* we may have a new perfmon_id which requires switching */
323		/* to a new client -> let's forget the current one */
324 if (cur && !CONTAINS_PERFMON(cur, src->perfmon_id)) {
325 dst->put = (char *)dst_put - (char *)dst;
326 dst = NULL;
327 cur = NULL;
328 }
329
330		/* now we have to select a new current client; */
331		/* the client selection rate depends on experiment */
332		/* activity, but on Android it usually happens 1-2 times */
333 if (!cur) {
334 cur = css_gr_search_client(&css->clients,
335 src->perfmon_id);
336 if (cur) {
337 /* found - setup all required data */
338 dst = cur->snapshot;
339 dst_get = CSS_FIFO_ENTRY(dst, dst->get);
340 dst_put = CSS_FIFO_ENTRY(dst, dst->put);
341 dst_head = CSS_FIFO_ENTRY(dst, dst->start);
342 dst_tail = CSS_FIFO_ENTRY(dst, dst->end);
343
344 dst_nxt = dst_put + 1;
345 if (dst_nxt == dst_tail)
346 dst_nxt = dst_head;
347 } else {
348 /* client not found - skipping this entry */
349 nvgpu_warn(g, "cyclestats: orphaned perfmon %u",
350 src->perfmon_id);
351 goto next_hw_fifo_entry;
352 }
353 }
354
355 /* check for software overflows */
356 if (dst_nxt == dst_get) {
357 /* no data copy, no pointer updates */
358 dst->sw_overflow_events_occured++;
359 nvgpu_warn(g, "cyclestats: perfmon %u soft overflow",
360 src->perfmon_id);
361 } else {
362 *dst_put = *src;
363 completed++;
364
365 dst_put = dst_nxt++;
366
367 if (dst_nxt == dst_tail)
368 dst_nxt = dst_head;
369 }
370
371next_hw_fifo_entry:
372 sid++;
373 if (++src >= css->hw_end)
374 src = css->hw_snapshot;
375 }
376
377 /* update client put pointer if necessary */
378 if (cur && dst)
379 dst->put = (char *)dst_put - (char *)dst;
380
381	/* re-initialize the HW buffer after processing, taking wrapping into account */
382 if (css->hw_get < src) {
383 memset(css->hw_get, 0xff, (src - css->hw_get) * sizeof(*src));
384 } else {
385 memset(css->hw_snapshot, 0xff,
386 (src - css->hw_snapshot) * sizeof(*src));
387 memset(css->hw_get, 0xff,
388 (css->hw_end - css->hw_get) * sizeof(*src));
389 }
390 gr->cs_data->hw_get = src;
391
392 if (g->ops.css.set_handled_snapshots)
393 g->ops.css.set_handled_snapshots(g, sid);
394
395 if (completed != sid) {
396		/* not all entries were processed correctly: some problems are */
397		/* reported as overflows, some as orphaned perfmons, but it is */
398		/* better to also give a summary notification about it */
399 nvgpu_warn(g, "cyclestats: completed %u from %u entries",
400 completed, pending);
401 }
402
403 return 0;
404}
405
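The flush loop above implements the shared-fifo protocol described in its comments: advance put toward get, wrap from the end back to the start, and count a software overflow instead of overwriting unread data. A hedged standalone model of that protocol, with the byte offsets replaced by plain indices:

#include <stdio.h>

#define SLOTS 4	/* illustrative capacity; the real one comes from snapshot_size */

struct demo_fifo {
	unsigned int get;	/* consumer index (advanced by userspace) */
	unsigned int put;	/* producer index (advanced by the kernel) */
	unsigned int sw_overflows;
	unsigned int data[SLOTS];
};

/* mirrors the dst_put/dst_nxt/dst_get handling in css_gr_flush_snapshots() */
static void demo_push(struct demo_fifo *f, unsigned int value)
{
	unsigned int nxt = (f->put + 1) % SLOTS;

	if (nxt == f->get) {
		f->sw_overflows++;	/* no copy, no pointer update */
		return;
	}
	f->data[f->put] = value;
	f->put = nxt;
}

int main(void)
{
	struct demo_fifo f = { 0 };
	unsigned int i;

	for (i = 0; i < 6; i++)
		demo_push(&f, i);
	printf("put=%u get=%u sw_overflows=%u\n", f.put, f.get, f.sw_overflows);
	return 0;
}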
406u32 css_gr_allocate_perfmon_ids(struct gk20a_cs_snapshot *data,
407 u32 count)
408{
409 unsigned long *pids = data->perfmon_ids;
410 unsigned int f;
411
412 f = bitmap_find_next_zero_area(pids, CSS_MAX_PERFMON_IDS,
413 CSS_FIRST_PERFMON_ID, count, 0);
414 if (f > CSS_MAX_PERFMON_IDS)
415 f = 0;
416 else
417 bitmap_set(pids, f, count);
418
419 return f;
420}
421
422u32 css_gr_release_perfmon_ids(struct gk20a_cs_snapshot *data,
423 u32 start,
424 u32 count)
425{
426 unsigned long *pids = data->perfmon_ids;
427 u32 end = start + count;
428 u32 cnt = 0;
429
430 if (start >= CSS_FIRST_PERFMON_ID && end <= CSS_MAX_PERFMON_IDS) {
431 bitmap_clear(pids, start, count);
432 cnt = count;
433 }
434
435 return cnt;
436}
437
438
439static int css_gr_free_client_data(struct gk20a *g,
440 struct gk20a_cs_snapshot *data,
441 struct gk20a_cs_snapshot_client *client)
442{
443 int ret = 0;
444
445 if (client->list.next && client->list.prev)
446 nvgpu_list_del(&client->list);
447
448 if (client->perfmon_start && client->perfmon_count
449 && g->ops.css.release_perfmon_ids) {
450 if (client->perfmon_count != g->ops.css.release_perfmon_ids(data,
451 client->perfmon_start, client->perfmon_count))
452 ret = -EINVAL;
453 }
454
455 return ret;
456}
457
458static int css_gr_create_client_data(struct gk20a *g,
459 struct gk20a_cs_snapshot *data,
460 u32 perfmon_count,
461 struct gk20a_cs_snapshot_client *cur)
462{
463 /*
464	 * Special handling in case of rm-server:
465	 *
466	 * the client snapshot buffer will not be mapped
467	 * in the rm-server case; it is only mapped on
468	 * the guest side
469 */
470 if (cur->snapshot) {
471 memset(cur->snapshot, 0, sizeof(*cur->snapshot));
472 cur->snapshot->start = sizeof(*cur->snapshot);
473		/* ensure that all fifo entries can fit here */
474 cur->snapshot->end =
475 CSS_FIFO_ENTRY_CAPACITY(cur->snapshot_size)
476 * sizeof(struct gk20a_cs_snapshot_fifo_entry)
477 + sizeof(struct gk20a_cs_snapshot_fifo);
478 cur->snapshot->get = cur->snapshot->start;
479 cur->snapshot->put = cur->snapshot->start;
480 }
481
482 cur->perfmon_count = perfmon_count;
483
484 /* In virtual case, perfmon ID allocation is handled by the server
485 * at the time of the attach (allocate_perfmon_ids is NULL in this case)
486 */
487 if (cur->perfmon_count && g->ops.css.allocate_perfmon_ids) {
488 cur->perfmon_start = g->ops.css.allocate_perfmon_ids(data,
489 cur->perfmon_count);
490 if (!cur->perfmon_start)
491 return -ENOENT;
492 }
493
494 nvgpu_list_add_tail(&cur->list, &data->clients);
495
496 return 0;
497}
498
499
500int gr_gk20a_css_attach(struct channel_gk20a *ch,
501 u32 perfmon_count,
502 u32 *perfmon_start,
503 struct gk20a_cs_snapshot_client *cs_client)
504{
505 int ret = 0;
506 struct gk20a *g = ch->g;
507 struct gr_gk20a *gr;
508
509 /* we must have a placeholder to store pointer to client structure */
510 if (!cs_client)
511 return -EINVAL;
512
513 if (!perfmon_count ||
514 perfmon_count > CSS_MAX_PERFMON_IDS - CSS_FIRST_PERFMON_ID)
515 return -EINVAL;
516
517 nvgpu_speculation_barrier();
518
519 gr = &g->gr;
520
521 nvgpu_mutex_acquire(&gr->cs_lock);
522
523 ret = css_gr_create_shared_data(gr);
524 if (ret)
525 goto failed;
526
527 ret = css_gr_create_client_data(g, gr->cs_data,
528 perfmon_count,
529 cs_client);
530 if (ret)
531 goto failed;
532
533 ret = g->ops.css.enable_snapshot(ch, cs_client);
534 if (ret)
535 goto failed;
536
537 if (perfmon_start)
538 *perfmon_start = cs_client->perfmon_start;
539
540 nvgpu_mutex_release(&gr->cs_lock);
541
542 return 0;
543
544failed:
545 if (gr->cs_data) {
546 if (cs_client) {
547 css_gr_free_client_data(g, gr->cs_data, cs_client);
548 cs_client = NULL;
549 }
550
551 if (nvgpu_list_empty(&gr->cs_data->clients))
552 css_gr_free_shared_data(gr);
553 }
554 nvgpu_mutex_release(&gr->cs_lock);
555
556 if (perfmon_start)
557 *perfmon_start = 0;
558
559 return ret;
560}
561
562int gr_gk20a_css_detach(struct channel_gk20a *ch,
563 struct gk20a_cs_snapshot_client *cs_client)
564{
565 int ret = 0;
566 struct gk20a *g = ch->g;
567 struct gr_gk20a *gr;
568
569 if (!cs_client)
570 return -EINVAL;
571
572 gr = &g->gr;
573 nvgpu_mutex_acquire(&gr->cs_lock);
574 if (gr->cs_data) {
575 struct gk20a_cs_snapshot *data = gr->cs_data;
576
577 if (g->ops.css.detach_snapshot)
578 g->ops.css.detach_snapshot(ch, cs_client);
579
580 ret = css_gr_free_client_data(g, data, cs_client);
581 if (nvgpu_list_empty(&data->clients))
582 css_gr_free_shared_data(gr);
583 } else {
584 ret = -EBADF;
585 }
586 nvgpu_mutex_release(&gr->cs_lock);
587
588 return ret;
589}
590
591int gr_gk20a_css_flush(struct channel_gk20a *ch,
592 struct gk20a_cs_snapshot_client *cs_client)
593{
594 int ret = 0;
595 struct gk20a *g = ch->g;
596 struct gr_gk20a *gr;
597
598 if (!cs_client)
599 return -EINVAL;
600
601 gr = &g->gr;
602 nvgpu_mutex_acquire(&gr->cs_lock);
603 ret = css_gr_flush_snapshots(ch);
604 nvgpu_mutex_release(&gr->cs_lock);
605
606 return ret;
607}
608
609/* helper function with locking to clean up snapshot code in gr_gk20a.c */
610void gr_gk20a_free_cyclestats_snapshot_data(struct gk20a *g)
611{
612 struct gr_gk20a *gr = &g->gr;
613
614 nvgpu_mutex_acquire(&gr->cs_lock);
615 css_gr_free_shared_data(gr);
616 nvgpu_mutex_release(&gr->cs_lock);
617 nvgpu_mutex_destroy(&gr->cs_lock);
618}
619
620int css_hw_check_data_available(struct channel_gk20a *ch, u32 *pending,
621 bool *hw_overflow)
622{
623 struct gk20a *g = ch->g;
624 struct gr_gk20a *gr = &g->gr;
625 struct gk20a_cs_snapshot *css = gr->cs_data;
626
627 if (!css->hw_snapshot)
628 return -EINVAL;
629
630 *pending = css_hw_get_pending_snapshots(g);
631 if (!*pending)
632 return 0;
633
634 *hw_overflow = css_hw_get_overflow_status(g);
635 return 0;
636}
diff --git a/include/gk20a/css_gr_gk20a.h b/include/gk20a/css_gr_gk20a.h
new file mode 100644
index 0000000..bf8890b
--- /dev/null
+++ b/include/gk20a/css_gr_gk20a.h
@@ -0,0 +1,151 @@
1/*
2 * GK20A Cycle stats snapshots support (subsystem for gr_gk20a).
3 *
4 * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#ifndef CSS_GR_GK20A_H
26#define CSS_GR_GK20A_H
27
28#include <nvgpu/nvgpu_mem.h>
29#include <nvgpu/list.h>
30
31/* the minimal size of HW buffer - should be enough to avoid HW overflows */
32#define CSS_MIN_HW_SNAPSHOT_SIZE (8 * 1024 * 1024)
33
34struct gk20a;
35struct gr_gk20a;
36struct channel_gk20a;
37
38/* cycle stats fifo header (must match NvSnapshotBufferFifo) */
39struct gk20a_cs_snapshot_fifo {
40 /* layout description of the buffer */
41 u32 start;
42 u32 end;
43
44 /* snafu bits */
45 u32 hw_overflow_events_occured;
46 u32 sw_overflow_events_occured;
47
48	/* the kernel copies new entries to put and
49	 * increments put. if put == get then
50	 * overflowEventsOccured++
51 */
52 u32 put;
53 u32 _reserved10;
54 u32 _reserved11;
55 u32 _reserved12;
56
57 /* the driver/client reads from get until
58 * put==get, get++ */
59 u32 get;
60 u32 _reserved20;
61 u32 _reserved21;
62 u32 _reserved22;
63
64 /* unused */
65 u32 _reserved30;
66 u32 _reserved31;
67 u32 _reserved32;
68 u32 _reserved33;
69};
70
71/* cycle stats fifo entry (must match NvSnapshotBufferFifoEntry) */
72struct gk20a_cs_snapshot_fifo_entry {
73 /* global 48 timestamp */
74 u32 timestamp31_00:32;
75 u32 timestamp39_32:8;
76
77 /* id of perfmon, should correlate with CSS_MAX_PERFMON_IDS */
78 u32 perfmon_id:8;
79
80 /* typically samples_counter is wired to #pmtrigger count */
81 u32 samples_counter:12;
82
83 /* DS=Delay Sample, SZ=Size (0=32B, 1=16B) */
84 u32 ds:1;
85 u32 sz:1;
86 u32 zero0:1;
87 u32 zero1:1;
88
89 /* counter results */
90 u32 event_cnt:32;
91 u32 trigger0_cnt:32;
92 u32 trigger1_cnt:32;
93 u32 sample_cnt:32;
94
95 /* Local PmTrigger results for Maxwell+ or padding otherwise */
96 u16 local_trigger_b_count:16;
97 u16 book_mark_b:16;
98 u16 local_trigger_a_count:16;
99 u16 book_mark_a:16;
100};
101
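Given the bitfield layout above, a consumer reassembles the entry timestamp from its two fields; a hedged one-liner (entry is an assumed pointer to a populated gk20a_cs_snapshot_fifo_entry, and the result carries the 40 bits the entry actually stores):

	u64 ts = ((u64)entry->timestamp39_32 << 32) | (u64)entry->timestamp31_00;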
102/* cycle stats snapshot client data (e.g. associated with channel) */
103struct gk20a_cs_snapshot_client {
104 struct nvgpu_list_node list;
105 struct gk20a_cs_snapshot_fifo *snapshot;
106 u32 snapshot_size;
107 u32 perfmon_start;
108 u32 perfmon_count;
109};
110
111static inline struct gk20a_cs_snapshot_client *
112gk20a_cs_snapshot_client_from_list(struct nvgpu_list_node *node)
113{
114 return (struct gk20a_cs_snapshot_client *)
115 ((uintptr_t)node - offsetof(struct gk20a_cs_snapshot_client, list));
116};
117
118/* should correlate with size of gk20a_cs_snapshot_fifo_entry::perfmon_id */
119#define CSS_MAX_PERFMON_IDS 256
120
121/* local definitions to avoid hardcoded sizes and shifts */
122#define PM_BITMAP_SIZE DIV_ROUND_UP(CSS_MAX_PERFMON_IDS, BITS_PER_LONG)
123
124/* cycle stats snapshot control structure for one HW entry and many clients */
125struct gk20a_cs_snapshot {
126 unsigned long perfmon_ids[PM_BITMAP_SIZE];
127 struct nvgpu_list_node clients;
128 struct nvgpu_mem hw_memdesc;
129 /* pointer to allocated cpu_va memory where GPU place data */
130 struct gk20a_cs_snapshot_fifo_entry *hw_snapshot;
131 struct gk20a_cs_snapshot_fifo_entry *hw_end;
132 struct gk20a_cs_snapshot_fifo_entry *hw_get;
133};
134
135bool css_hw_get_overflow_status(struct gk20a *g);
136u32 css_hw_get_pending_snapshots(struct gk20a *g);
137void css_hw_set_handled_snapshots(struct gk20a *g, u32 done);
138int css_hw_enable_snapshot(struct channel_gk20a *ch,
139 struct gk20a_cs_snapshot_client *cs_client);
140void css_hw_disable_snapshot(struct gr_gk20a *gr);
141u32 css_gr_allocate_perfmon_ids(struct gk20a_cs_snapshot *data,
142 u32 count);
143u32 css_gr_release_perfmon_ids(struct gk20a_cs_snapshot *data,
144 u32 start,
145 u32 count);
146int css_hw_check_data_available(struct channel_gk20a *ch, u32 *pending,
147 bool *hw_overflow);
148struct gk20a_cs_snapshot_client*
149css_gr_search_client(struct nvgpu_list_node *clients, u32 perfmon);
150
151#endif /* CSS_GR_GK20A_H */
diff --git a/include/gk20a/dbg_gpu_gk20a.c b/include/gk20a/dbg_gpu_gk20a.c
new file mode 100644
index 0000000..1686d01
--- /dev/null
+++ b/include/gk20a/dbg_gpu_gk20a.c
@@ -0,0 +1,388 @@
1/*
2 * Tegra GK20A GPU Debugger/Profiler Driver
3 *
4 * Copyright (c) 2013-2019, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include <nvgpu/kmem.h>
26#include <nvgpu/log.h>
27#include <nvgpu/vm.h>
28#include <nvgpu/atomic.h>
29#include <nvgpu/mm.h>
30#include <nvgpu/bug.h>
31#include <nvgpu/io.h>
32#include <nvgpu/utils.h>
33#include <nvgpu/channel.h>
34#include <nvgpu/unit.h>
35#include <nvgpu/power_features/power_features.h>
36
37#include "gk20a.h"
38#include "gr_gk20a.h"
39#include "dbg_gpu_gk20a.h"
40#include "regops_gk20a.h"
41
42#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
43#include <nvgpu/hw/gk20a/hw_perf_gk20a.h>
44
45static void gk20a_perfbuf_reset_streaming(struct gk20a *g)
46{
47 u32 engine_status;
48 u32 num_unread_bytes;
49
50 g->ops.mc.reset(g, g->ops.mc.reset_mask(g, NVGPU_UNIT_PERFMON));
51
52 engine_status = gk20a_readl(g, perf_pmasys_enginestatus_r());
53 WARN_ON(0u ==
54 (engine_status & perf_pmasys_enginestatus_rbufempty_empty_f()));
55
56 gk20a_writel(g, perf_pmasys_control_r(),
57 perf_pmasys_control_membuf_clear_status_doit_f());
58
59 num_unread_bytes = gk20a_readl(g, perf_pmasys_mem_bytes_r());
60 if (num_unread_bytes != 0u) {
61 gk20a_writel(g, perf_pmasys_mem_bump_r(), num_unread_bytes);
62 }
63}
64
65/*
66 * API to get first channel from the list of all channels
67 * bound to the debug session
68 */
69struct channel_gk20a *
70nvgpu_dbg_gpu_get_session_channel(struct dbg_session_gk20a *dbg_s)
71{
72 struct dbg_session_channel_data *ch_data;
73 struct channel_gk20a *ch;
74 struct gk20a *g = dbg_s->g;
75
76 nvgpu_mutex_acquire(&dbg_s->ch_list_lock);
77 if (nvgpu_list_empty(&dbg_s->ch_list)) {
78 nvgpu_mutex_release(&dbg_s->ch_list_lock);
79 return NULL;
80 }
81
82 ch_data = nvgpu_list_first_entry(&dbg_s->ch_list,
83 dbg_session_channel_data,
84 ch_entry);
85 ch = g->fifo.channel + ch_data->chid;
86
87 nvgpu_mutex_release(&dbg_s->ch_list_lock);
88
89 return ch;
90}
91
92void gk20a_dbg_gpu_post_events(struct channel_gk20a *ch)
93{
94 struct dbg_session_data *session_data;
95 struct dbg_session_gk20a *dbg_s;
96 struct gk20a *g = ch->g;
97
98 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
99
100 /* guard against the session list being modified */
101 nvgpu_mutex_acquire(&ch->dbg_s_lock);
102
103 nvgpu_list_for_each_entry(session_data, &ch->dbg_s_list,
104 dbg_session_data, dbg_s_entry) {
105 dbg_s = session_data->dbg_s;
106 if (dbg_s->dbg_events.events_enabled) {
107 nvgpu_log(g, gpu_dbg_gpu_dbg, "posting event on session id %d",
108 dbg_s->id);
109 nvgpu_log(g, gpu_dbg_gpu_dbg, "%d events pending",
110 dbg_s->dbg_events.num_pending_events);
111
112 dbg_s->dbg_events.num_pending_events++;
113
114 nvgpu_dbg_session_post_event(dbg_s);
115 }
116 }
117
118 nvgpu_mutex_release(&ch->dbg_s_lock);
119}
120
121bool gk20a_dbg_gpu_broadcast_stop_trigger(struct channel_gk20a *ch)
122{
123 struct dbg_session_data *session_data;
124 struct dbg_session_gk20a *dbg_s;
125 bool broadcast = false;
126 struct gk20a *g = ch->g;
127
128 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg | gpu_dbg_intr, " ");
129
130 /* guard against the session list being modified */
131 nvgpu_mutex_acquire(&ch->dbg_s_lock);
132
133 nvgpu_list_for_each_entry(session_data, &ch->dbg_s_list,
134 dbg_session_data, dbg_s_entry) {
135 dbg_s = session_data->dbg_s;
136 if (dbg_s->broadcast_stop_trigger) {
137 nvgpu_log(g, gpu_dbg_gpu_dbg | gpu_dbg_fn | gpu_dbg_intr,
138 "stop trigger broadcast enabled");
139 broadcast = true;
140 break;
141 }
142 }
143
144 nvgpu_mutex_release(&ch->dbg_s_lock);
145
146 return broadcast;
147}
148
149int gk20a_dbg_gpu_clear_broadcast_stop_trigger(struct channel_gk20a *ch)
150{
151 struct dbg_session_data *session_data;
152 struct dbg_session_gk20a *dbg_s;
153 struct gk20a *g = ch->g;
154
155 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg | gpu_dbg_intr, " ");
156
157 /* guard against the session list being modified */
158 nvgpu_mutex_acquire(&ch->dbg_s_lock);
159
160 nvgpu_list_for_each_entry(session_data, &ch->dbg_s_list,
161 dbg_session_data, dbg_s_entry) {
162 dbg_s = session_data->dbg_s;
163 if (dbg_s->broadcast_stop_trigger) {
164 nvgpu_log(g, gpu_dbg_gpu_dbg | gpu_dbg_fn | gpu_dbg_intr,
165 "stop trigger broadcast disabled");
166 dbg_s->broadcast_stop_trigger = false;
167 }
168 }
169
170 nvgpu_mutex_release(&ch->dbg_s_lock);
171
172 return 0;
173}
174
175u32 nvgpu_set_powergate_locked(struct dbg_session_gk20a *dbg_s,
176 bool mode)
177{
178 u32 err = 0U;
179 struct gk20a *g = dbg_s->g;
180
181 if (dbg_s->is_pg_disabled != mode) {
182 if (mode == false) {
183 g->dbg_powergating_disabled_refcount--;
184 }
185
186 /*
187 * Allow powergate disable or enable only if
188 * the global pg disabled refcount is zero
189 */
190 if (g->dbg_powergating_disabled_refcount == 0) {
191 err = g->ops.dbg_session_ops.dbg_set_powergate(dbg_s,
192 mode);
193 }
194
195 if (mode) {
196 g->dbg_powergating_disabled_refcount++;
197 }
198
199 dbg_s->is_pg_disabled = mode;
200 }
201
202 return err;
203}
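
The refcounting above ensures that only the first session to disable powergating and the last session to re-enable it actually reach dbg_set_powergate(); sessions in between only adjust g->dbg_powergating_disabled_refcount. A minimal standalone sketch of that counting rule, using hypothetical names and plain C in place of the nvgpu types:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the per-device and per-session state. */
static int disable_refcount;      /* like g->dbg_powergating_disabled_refcount */
static bool hw_pg_enabled = true; /* pretend hardware powergate state */

/* Mirror of the counting rule: touch HW only when the count is zero. */
static void set_powergate_locked(bool *session_pg_disabled, bool disable)
{
	if (*session_pg_disabled == disable)
		return;                   /* no change for this session */

	if (!disable)
		disable_refcount--;       /* re-enabling: drop our vote first */

	if (disable_refcount == 0)
		hw_pg_enabled = !disable; /* only the first/last session reaches HW */

	if (disable)
		disable_refcount++;       /* disabling: add our vote afterwards */

	*session_pg_disabled = disable;
}

int main(void)
{
	bool s1 = false, s2 = false;

	set_powergate_locked(&s1, true);   /* first session disables PG in HW */
	set_powergate_locked(&s2, true);   /* second session: count only */
	assert(!hw_pg_enabled);

	set_powergate_locked(&s1, false);  /* still one session holding it */
	assert(!hw_pg_enabled);

	set_powergate_locked(&s2, false);  /* last session restores PG */
	assert(hw_pg_enabled);

	printf("refcount=%d hw_pg_enabled=%d\n", disable_refcount, hw_pg_enabled);
	return 0;
}
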
204
205int dbg_set_powergate(struct dbg_session_gk20a *dbg_s, bool disable_powergate)
206{
207 int err = 0;
208 struct gk20a *g = dbg_s->g;
209
210 /* This function must be called with g->dbg_sessions_lock held */
211
212 nvgpu_log(g, gpu_dbg_fn|gpu_dbg_gpu_dbg, "%s powergate mode = %s",
213 g->name, disable_powergate ? "disable" : "enable");
214
215 /*
216 * Powergate mode here refers to railgate+powergate+clockgate
217 * so in case slcg/blcg/elcg are disabled and railgating is enabled,
218 * disable railgating and then set is_pg_disabled = true
219 * Similarly re-enable railgating and not other features if they are not
220 * enabled when powermode=MODE_ENABLE
221 */
222 if (disable_powergate) {
223 /* save off current powergate, clk state.
224 * set gpu module's can_powergate = 0.
225 * set gpu module's clk to max.
226 * while *a* debug session is active there will be no power or
227 * clocking state changes allowed from mainline code (but they
228 * should be saved).
229 */
230
231 nvgpu_log(g, gpu_dbg_gpu_dbg | gpu_dbg_fn,
232 "module busy");
233 err = gk20a_busy(g);
234 if (err) {
235 return err;
236 }
237
238 err = nvgpu_cg_pg_disable(g);
239
240 if (err == 0) {
241 dbg_s->is_pg_disabled = true;
242 nvgpu_log(g, gpu_dbg_gpu_dbg | gpu_dbg_fn,
243 "pg disabled");
244 }
245 } else {
246 /* restore (can) powergate, clk state */
247 /* release pending exceptions to fault/be handled as usual */
248		/* TBD: ordering of these? */
249
250 err = nvgpu_cg_pg_enable(g);
251
252 nvgpu_log(g, gpu_dbg_gpu_dbg | gpu_dbg_fn, "module idle");
253 gk20a_idle(g);
254
255 if (err == 0) {
256 dbg_s->is_pg_disabled = false;
257 nvgpu_log(g, gpu_dbg_gpu_dbg | gpu_dbg_fn,
258 "pg enabled");
259 }
260 }
261
262 nvgpu_log(g, gpu_dbg_fn|gpu_dbg_gpu_dbg, "%s powergate mode = %s done",
263 g->name, disable_powergate ? "disable" : "enable");
264 return err;
265}
266
267bool nvgpu_check_and_set_global_reservation(
268 struct dbg_session_gk20a *dbg_s,
269 struct dbg_profiler_object_data *prof_obj)
270{
271 struct gk20a *g = dbg_s->g;
272
273 if (g->profiler_reservation_count == 0) {
274 g->global_profiler_reservation_held = true;
275 g->profiler_reservation_count = 1;
276 dbg_s->has_profiler_reservation = true;
277 prof_obj->has_reservation = true;
278 return true;
279 }
280 return false;
281}
282
283bool nvgpu_check_and_set_context_reservation(
284 struct dbg_session_gk20a *dbg_s,
285 struct dbg_profiler_object_data *prof_obj)
286{
287 struct gk20a *g = dbg_s->g;
288
289 /* Assumes that we've already checked that no global reservation
290 * is in effect.
291 */
292 g->profiler_reservation_count++;
293 dbg_s->has_profiler_reservation = true;
294 prof_obj->has_reservation = true;
295 return true;
296}
297
298void nvgpu_release_profiler_reservation(struct dbg_session_gk20a *dbg_s,
299 struct dbg_profiler_object_data *prof_obj)
300{
301 struct gk20a *g = dbg_s->g;
302
303 g->profiler_reservation_count--;
304 if (g->profiler_reservation_count < 0) {
305 nvgpu_err(g, "Negative reservation count!");
306 }
307 dbg_s->has_profiler_reservation = false;
308 prof_obj->has_reservation = false;
309 if (prof_obj->ch == NULL) {
310 g->global_profiler_reservation_held = false;
311 }
312}
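
Taken together, the three helpers above implement a simple reservation policy: a global (device-wide) reservation is granted only when no reservation of any kind exists, per-context reservations just increment the shared count once a global holder has been ruled out, and release decrements the count and drops the global flag when the released object was not bound to a channel. A rough illustrative model of that policy (hypothetical names, locking omitted; the driver runs this under dbg_sessions_lock):

#include <assert.h>
#include <stdbool.h>

/* Hypothetical device-wide profiler reservation state. */
struct prof_state {
	int  reservation_count;
	bool global_held;
};

static bool try_global(struct prof_state *s)
{
	if (s->reservation_count != 0)
		return false;            /* any existing holder blocks a global claim */
	s->global_held = true;
	s->reservation_count = 1;
	return true;
}

static void take_context(struct prof_state *s)
{
	/* caller must already have rejected the request if global_held is set */
	s->reservation_count++;
}

static void release(struct prof_state *s, bool was_global)
{
	s->reservation_count--;
	if (was_global)
		s->global_held = false;
}

int main(void)
{
	struct prof_state s = { 0, false };

	assert(try_global(&s));          /* empty -> global granted */
	assert(!try_global(&s));         /* second global must fail */
	release(&s, true);

	take_context(&s);                /* two per-context reservations */
	take_context(&s);
	assert(!try_global(&s));         /* global blocked while any exist */
	release(&s, false);
	release(&s, false);
	assert(try_global(&s));
	return 0;
}
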
313
314int gk20a_perfbuf_enable_locked(struct gk20a *g, u64 offset, u32 size)
315{
316 struct mm_gk20a *mm = &g->mm;
317 u32 virt_addr_lo;
318 u32 virt_addr_hi;
319 u32 inst_pa_page;
320 int err;
321
322 err = gk20a_busy(g);
323 if (err) {
324 nvgpu_err(g, "failed to poweron");
325 return err;
326 }
327
328 err = g->ops.mm.alloc_inst_block(g, &mm->perfbuf.inst_block);
329 if (err) {
330 return err;
331 }
332
333 g->ops.mm.init_inst_block(&mm->perfbuf.inst_block, mm->perfbuf.vm, 0);
334
335 gk20a_perfbuf_reset_streaming(g);
336
337 virt_addr_lo = u64_lo32(offset);
338 virt_addr_hi = u64_hi32(offset);
339
340 /* address and size are aligned to 32 bytes, the lowest bits read back
341 * as zeros */
342 gk20a_writel(g, perf_pmasys_outbase_r(), virt_addr_lo);
343 gk20a_writel(g, perf_pmasys_outbaseupper_r(),
344 perf_pmasys_outbaseupper_ptr_f(virt_addr_hi));
345 gk20a_writel(g, perf_pmasys_outsize_r(), size);
346
347 /* this field is aligned to 4K */
348 inst_pa_page = nvgpu_inst_block_addr(g, &mm->perfbuf.inst_block) >> 12;
349
350 /* A write to MEM_BLOCK triggers the block bind operation. MEM_BLOCK
351 * should be written last */
352 gk20a_writel(g, perf_pmasys_mem_block_r(),
353 perf_pmasys_mem_block_base_f(inst_pa_page) |
354 nvgpu_aperture_mask(g, &mm->perfbuf.inst_block,
355 perf_pmasys_mem_block_target_sys_ncoh_f(),
356 perf_pmasys_mem_block_target_sys_coh_f(),
357 perf_pmasys_mem_block_target_lfb_f()) |
358 perf_pmasys_mem_block_valid_true_f());
359
360 gk20a_idle(g);
361 return 0;
362}
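
The enable path above splits the 64-bit target GPU VA across the OUTBASE/OUTBASEUPPER registers and programs the instance block as a 4 KB page index (its address shifted right by 12), writing MEM_BLOCK last since that write triggers the bind. A tiny standalone sketch of just the address arithmetic, with made-up example values (the real field encodings come from the hw_perf headers):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t lo32(uint64_t v) { return (uint32_t)(v & 0xffffffffu); }
static uint32_t hi32(uint64_t v) { return (uint32_t)(v >> 32); }

int main(void)
{
	uint64_t buf_va  = 0x0000000123456000ull; /* 32-byte aligned GPU VA */
	uint64_t inst_pa = 0x00000000abcde000ull; /* 4 KB-aligned inst block */

	uint32_t outbase      = lo32(buf_va);               /* low word  */
	uint32_t outbaseupper = hi32(buf_va);                /* high word */
	uint32_t inst_page    = (uint32_t)(inst_pa >> 12);   /* 4 KB page index */

	assert((buf_va & 0x1fu) == 0);   /* low 5 bits read back as zeros */
	assert((inst_pa & 0xfffu) == 0); /* instance block is 4 KB aligned */

	printf("outbase=%08x outbaseupper=%08x mem_block page=%x\n",
	       (unsigned)outbase, (unsigned)outbaseupper, (unsigned)inst_page);
	return 0;
}
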
363
364/* must be called with dbg_sessions_lock held */
365int gk20a_perfbuf_disable_locked(struct gk20a *g)
366{
367 int err = gk20a_busy(g);
368 if (err) {
369 nvgpu_err(g, "failed to poweron");
370 return err;
371 }
372
373 gk20a_perfbuf_reset_streaming(g);
374
375 gk20a_writel(g, perf_pmasys_outbase_r(), 0);
376 gk20a_writel(g, perf_pmasys_outbaseupper_r(),
377 perf_pmasys_outbaseupper_ptr_f(0));
378 gk20a_writel(g, perf_pmasys_outsize_r(), 0);
379
380 gk20a_writel(g, perf_pmasys_mem_block_r(),
381 perf_pmasys_mem_block_base_f(0) |
382 perf_pmasys_mem_block_valid_false_f() |
383 perf_pmasys_mem_block_target_f(0));
384
385 gk20a_idle(g);
386
387 return 0;
388}
diff --git a/include/gk20a/dbg_gpu_gk20a.h b/include/gk20a/dbg_gpu_gk20a.h
new file mode 100644
index 0000000..fb5ae1f
--- /dev/null
+++ b/include/gk20a/dbg_gpu_gk20a.h
@@ -0,0 +1,147 @@
1/*
2 * Tegra GK20A GPU Debugger Driver
3 *
4 * Copyright (c) 2013-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24#ifndef DBG_GPU_H
25#define DBG_GPU_H
26
27#include <nvgpu/cond.h>
28#include <nvgpu/lock.h>
29#include <nvgpu/list.h>
30
31struct gk20a;
32struct channel_gk20a;
33struct dbg_session_gk20a;
34
35/* used by the interrupt handler to post events */
36void gk20a_dbg_gpu_post_events(struct channel_gk20a *fault_ch);
37
38struct channel_gk20a *
39nvgpu_dbg_gpu_get_session_channel(struct dbg_session_gk20a *dbg_s);
40
41struct dbg_gpu_session_events {
42 struct nvgpu_cond wait_queue;
43 bool events_enabled;
44 int num_pending_events;
45};
46
47struct dbg_session_gk20a {
48 /* dbg session id used for trace/prints */
49 int id;
50
51 /* profiler session, if any */
52 bool is_profiler;
53
54 /* has a valid profiler reservation */
55 bool has_profiler_reservation;
56
57 /* power enabled or disabled */
58 bool is_pg_disabled;
59
60 /* timeouts enabled or disabled */
61 bool is_timeout_disabled;
62
63 struct gk20a *g;
64
65 /* list of bound channels, if any */
66 struct nvgpu_list_node ch_list;
67 struct nvgpu_mutex ch_list_lock;
68
69 /* event support */
70 struct dbg_gpu_session_events dbg_events;
71
72 bool broadcast_stop_trigger;
73
74 struct nvgpu_mutex ioctl_lock;
75};
76
77struct dbg_session_data {
78 struct dbg_session_gk20a *dbg_s;
79 struct nvgpu_list_node dbg_s_entry;
80};
81
82static inline struct dbg_session_data *
83dbg_session_data_from_dbg_s_entry(struct nvgpu_list_node *node)
84{
85 return (struct dbg_session_data *)
86 ((uintptr_t)node - offsetof(struct dbg_session_data, dbg_s_entry));
87};
88
89struct dbg_session_channel_data {
90 int channel_fd;
91 u32 chid;
92 struct nvgpu_list_node ch_entry;
93 struct dbg_session_data *session_data;
94 int (*unbind_single_channel)(struct dbg_session_gk20a *dbg_s,
95 struct dbg_session_channel_data *ch_data);
96};
97
98static inline struct dbg_session_channel_data *
99dbg_session_channel_data_from_ch_entry(struct nvgpu_list_node *node)
100{
101 return (struct dbg_session_channel_data *)
102 ((uintptr_t)node - offsetof(struct dbg_session_channel_data, ch_entry));
103};
104
105struct dbg_profiler_object_data {
106 int session_id;
107 u32 prof_handle;
108 struct channel_gk20a *ch;
109 bool has_reservation;
110 struct nvgpu_list_node prof_obj_entry;
111};
112
113static inline struct dbg_profiler_object_data *
114dbg_profiler_object_data_from_prof_obj_entry(struct nvgpu_list_node *node)
115{
116 return (struct dbg_profiler_object_data *)
117 ((uintptr_t)node - offsetof(struct dbg_profiler_object_data, prof_obj_entry));
118};
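
The *_from_*_entry helpers above all recover the containing structure from an embedded nvgpu_list_node by subtracting the member offset, i.e. the classic container_of pattern. A minimal standalone illustration of the same trick with a hypothetical container type:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct list_node { struct list_node *next, *prev; };

/* Hypothetical container with an embedded list node, like dbg_session_data. */
struct session {
	int id;
	struct list_node entry;
};

static struct session *session_from_entry(struct list_node *node)
{
	/* subtract the member offset to get back to the containing struct */
	return (struct session *)((uintptr_t)node - offsetof(struct session, entry));
}

int main(void)
{
	struct session s = { .id = 42 };

	assert(session_from_entry(&s.entry) == &s);
	assert(session_from_entry(&s.entry)->id == 42);
	return 0;
}
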
119
120bool gk20a_dbg_gpu_broadcast_stop_trigger(struct channel_gk20a *ch);
121int gk20a_dbg_gpu_clear_broadcast_stop_trigger(struct channel_gk20a *ch);
122
123int dbg_set_powergate(struct dbg_session_gk20a *dbg_s, bool disable_powergate);
124bool nvgpu_check_and_set_global_reservation(
125 struct dbg_session_gk20a *dbg_s,
126 struct dbg_profiler_object_data *prof_obj);
127bool nvgpu_check_and_set_context_reservation(
128 struct dbg_session_gk20a *dbg_s,
129 struct dbg_profiler_object_data *prof_obj);
130void nvgpu_release_profiler_reservation(struct dbg_session_gk20a *dbg_s,
131 struct dbg_profiler_object_data *prof_obj);
132int gk20a_perfbuf_enable_locked(struct gk20a *g, u64 offset, u32 size);
133int gk20a_perfbuf_disable_locked(struct gk20a *g);
134
135void nvgpu_dbg_session_post_event(struct dbg_session_gk20a *dbg_s);
136u32 nvgpu_set_powergate_locked(struct dbg_session_gk20a *dbg_s,
137 bool mode);
138
139/* PM Context Switch Mode */
140/* This mode says that the pms are not to be context switched. */
141#define NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW (0x00000000)
142/* This mode says that the pms in Mode-B are to be context switched */
143#define NVGPU_DBG_HWPM_CTXSW_MODE_CTXSW (0x00000001)
144/* This mode says that the pms in Mode-E (stream out) are to be context switched. */
145#define NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW (0x00000002)
146
147#endif /* DBG_GPU_H */
diff --git a/include/gk20a/fecs_trace_gk20a.c b/include/gk20a/fecs_trace_gk20a.c
new file mode 100644
index 0000000..5c1c5e0
--- /dev/null
+++ b/include/gk20a/fecs_trace_gk20a.c
@@ -0,0 +1,744 @@
1/*
2 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23#include <nvgpu/kmem.h>
24#include <nvgpu/dma.h>
25#include <nvgpu/enabled.h>
26#include <nvgpu/bug.h>
27#include <nvgpu/hashtable.h>
28#include <nvgpu/circ_buf.h>
29#include <nvgpu/thread.h>
30#include <nvgpu/barrier.h>
31#include <nvgpu/mm.h>
32#include <nvgpu/enabled.h>
33#include <nvgpu/ctxsw_trace.h>
34#include <nvgpu/io.h>
35#include <nvgpu/utils.h>
36#include <nvgpu/timers.h>
37#include <nvgpu/channel.h>
38
39#include "fecs_trace_gk20a.h"
40#include "gk20a.h"
41#include "gr_gk20a.h"
42
43#include <nvgpu/log.h>
44#include <nvgpu/fecs_trace.h>
45
46#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
47#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
48
49struct gk20a_fecs_trace_hash_ent {
50 u32 context_ptr;
51 pid_t pid;
52 struct hlist_node node;
53};
54
55struct gk20a_fecs_trace {
56
57 DECLARE_HASHTABLE(pid_hash_table, GK20A_FECS_TRACE_HASH_BITS);
58 struct nvgpu_mutex hash_lock;
59 struct nvgpu_mutex poll_lock;
60 struct nvgpu_thread poll_task;
61 bool init;
62 struct nvgpu_mutex enable_lock;
63 u32 enable_count;
64};
65
66#ifdef CONFIG_GK20A_CTXSW_TRACE
67u32 gk20a_fecs_trace_record_ts_tag_invalid_ts_v(void)
68{
69 return ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v();
70}
71
72u32 gk20a_fecs_trace_record_ts_tag_v(u64 ts)
73{
74 return ctxsw_prog_record_timestamp_timestamp_hi_tag_v((u32) (ts >> 32));
75}
76
77u64 gk20a_fecs_trace_record_ts_timestamp_v(u64 ts)
78{
79 return ts & ~(((u64)ctxsw_prog_record_timestamp_timestamp_hi_tag_m()) << 32);
80}
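
The two accessors above split each 64-bit record word into a tag kept in the upper bits and a timestamp in the rest, using the field helpers from hw_ctxsw_prog_gk20a.h. A standalone sketch of that split, assuming purely for illustration an 8-bit tag in bits 63:56 (the real field width and position are defined by the hardware header):

#include <assert.h>
#include <stdint.h>

/* Illustrative layout only: tag in bits 63:56, timestamp below it. */
#define TS_TAG_SHIFT 56
#define TS_TAG_MASK  0xffull

static uint32_t ts_tag(uint64_t ts)
{
	return (uint32_t)((ts >> TS_TAG_SHIFT) & TS_TAG_MASK);
}

static uint64_t ts_value(uint64_t ts)
{
	return ts & ~(TS_TAG_MASK << TS_TAG_SHIFT);
}

int main(void)
{
	uint64_t raw = ((uint64_t)0xa5 << TS_TAG_SHIFT) | 0x123456789abull;

	assert(ts_tag(raw) == 0xa5);
	assert(ts_value(raw) == 0x123456789abull);
	return 0;
}
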
81
82static u32 gk20a_fecs_trace_fecs_context_ptr(struct gk20a *g, struct channel_gk20a *ch)
83{
84 return (u32) (nvgpu_inst_block_addr(g, &ch->inst_block) >> 12LL);
85}
86
87int gk20a_fecs_trace_num_ts(void)
88{
89 return (ctxsw_prog_record_timestamp_record_size_in_bytes_v()
90 - sizeof(struct gk20a_fecs_trace_record)) / sizeof(u64);
91}
92
93struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record(
94 struct gk20a *g, int idx)
95{
96 struct nvgpu_mem *mem = &g->gr.global_ctx_buffer[FECS_TRACE_BUFFER].mem;
97
98 return (struct gk20a_fecs_trace_record *)
99 ((u8 *) mem->cpu_va
100 + (idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v()));
101}
102
103bool gk20a_fecs_trace_is_valid_record(struct gk20a_fecs_trace_record *r)
104{
105 /*
106 * testing magic_hi should suffice. magic_lo is sometimes used
107 * as a sequence number in experimental ucode.
108 */
109 return (r->magic_hi
110 == ctxsw_prog_record_timestamp_magic_value_hi_v_value_v());
111}
112
113int gk20a_fecs_trace_get_read_index(struct gk20a *g)
114{
115 return gr_gk20a_elpg_protected_call(g,
116 gk20a_readl(g, gr_fecs_mailbox1_r()));
117}
118
119int gk20a_fecs_trace_get_write_index(struct gk20a *g)
120{
121 return gr_gk20a_elpg_protected_call(g,
122 gk20a_readl(g, gr_fecs_mailbox0_r()));
123}
124
125static int gk20a_fecs_trace_set_read_index(struct gk20a *g, int index)
126{
127 nvgpu_log(g, gpu_dbg_ctxsw, "set read=%d", index);
128 return gr_gk20a_elpg_protected_call(g,
129 (gk20a_writel(g, gr_fecs_mailbox1_r(), index), 0));
130}
131
132void gk20a_fecs_trace_hash_dump(struct gk20a *g)
133{
134 u32 bkt;
135 struct gk20a_fecs_trace_hash_ent *ent;
136 struct gk20a_fecs_trace *trace = g->fecs_trace;
137
138 nvgpu_log(g, gpu_dbg_ctxsw, "dumping hash table");
139
140 nvgpu_mutex_acquire(&trace->hash_lock);
141 hash_for_each(trace->pid_hash_table, bkt, ent, node)
142 {
143 nvgpu_log(g, gpu_dbg_ctxsw, " ent=%p bkt=%x context_ptr=%x pid=%d",
144 ent, bkt, ent->context_ptr, ent->pid);
145
146 }
147 nvgpu_mutex_release(&trace->hash_lock);
148}
149
150static int gk20a_fecs_trace_hash_add(struct gk20a *g, u32 context_ptr, pid_t pid)
151{
152 struct gk20a_fecs_trace_hash_ent *he;
153 struct gk20a_fecs_trace *trace = g->fecs_trace;
154
155 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
156 "adding hash entry context_ptr=%x -> pid=%d", context_ptr, pid);
157
158 he = nvgpu_kzalloc(g, sizeof(*he));
159 if (unlikely(!he)) {
160 nvgpu_warn(g,
161 "can't alloc new hash entry for context_ptr=%x pid=%d",
162 context_ptr, pid);
163 return -ENOMEM;
164 }
165
166 he->context_ptr = context_ptr;
167 he->pid = pid;
168 nvgpu_mutex_acquire(&trace->hash_lock);
169 hash_add(trace->pid_hash_table, &he->node, context_ptr);
170 nvgpu_mutex_release(&trace->hash_lock);
171 return 0;
172}
173
174static void gk20a_fecs_trace_hash_del(struct gk20a *g, u32 context_ptr)
175{
176 struct hlist_node *tmp;
177 struct gk20a_fecs_trace_hash_ent *ent;
178 struct gk20a_fecs_trace *trace = g->fecs_trace;
179
180 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
181 "freeing hash entry context_ptr=%x", context_ptr);
182
183 nvgpu_mutex_acquire(&trace->hash_lock);
184 hash_for_each_possible_safe(trace->pid_hash_table, ent, tmp, node,
185 context_ptr) {
186 if (ent->context_ptr == context_ptr) {
187 hash_del(&ent->node);
188 nvgpu_log(g, gpu_dbg_ctxsw,
189 "freed hash entry=%p context_ptr=%x", ent,
190 ent->context_ptr);
191 nvgpu_kfree(g, ent);
192 break;
193 }
194 }
195 nvgpu_mutex_release(&trace->hash_lock);
196}
197
198static void gk20a_fecs_trace_free_hash_table(struct gk20a *g)
199{
200 u32 bkt;
201 struct hlist_node *tmp;
202 struct gk20a_fecs_trace_hash_ent *ent;
203 struct gk20a_fecs_trace *trace = g->fecs_trace;
204
205 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw, "trace=%p", trace);
206
207 nvgpu_mutex_acquire(&trace->hash_lock);
208 hash_for_each_safe(trace->pid_hash_table, bkt, tmp, ent, node) {
209 hash_del(&ent->node);
210 nvgpu_kfree(g, ent);
211 }
212 nvgpu_mutex_release(&trace->hash_lock);
213
214}
215
216static pid_t gk20a_fecs_trace_find_pid(struct gk20a *g, u32 context_ptr)
217{
218 struct gk20a_fecs_trace_hash_ent *ent;
219 struct gk20a_fecs_trace *trace = g->fecs_trace;
220 pid_t pid = 0;
221
222 nvgpu_mutex_acquire(&trace->hash_lock);
223 hash_for_each_possible(trace->pid_hash_table, ent, node, context_ptr) {
224 if (ent->context_ptr == context_ptr) {
225 nvgpu_log(g, gpu_dbg_ctxsw,
226 "found context_ptr=%x -> pid=%d",
227 ent->context_ptr, ent->pid);
228 pid = ent->pid;
229 break;
230 }
231 }
232 nvgpu_mutex_release(&trace->hash_lock);
233
234 return pid;
235}
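
The hash table manipulated above maps a FECS context pointer (the instance block address shifted down to a 4 KB page index) to the owning process ID so that raw trace records can be attributed to a PID; entries are added at channel bind, removed at unbind, and lookups fall back to pid 0 for unknown contexts. A toy illustration of that mapping using a flat array instead of the kernel hashtable (hypothetical sizes and names, no bounds checking):

#include <assert.h>
#include <stdint.h>

/* Hypothetical flat table standing in for the kernel hashtable. */
struct ent { uint32_t context_ptr; int pid; };
static struct ent tbl[16];
static int nents;

static void map_add(uint32_t ctx, int pid)
{
	tbl[nents++] = (struct ent){ ctx, pid };
}

static int map_find(uint32_t ctx)
{
	for (int i = 0; i < nents; i++)
		if (tbl[i].context_ptr == ctx)
			return tbl[i].pid;
	return 0;   /* unknown context: same "pid 0" fallback as the driver */
}

int main(void)
{
	map_add(0xabcdeu, 1234);    /* bind: context_ptr -> tgid */
	assert(map_find(0xabcdeu) == 1234);
	assert(map_find(0x12345u) == 0);
	return 0;
}
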
236
237/*
238 * Converts HW entry format to userspace-facing format and pushes it to the
239 * queue.
240 */
241static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
242{
243 int i;
244 struct nvgpu_gpu_ctxsw_trace_entry entry = { };
245 struct gk20a_fecs_trace *trace = g->fecs_trace;
246 pid_t cur_pid;
247 pid_t new_pid;
248 int count = 0;
249
250 /* for now, only one VM */
251 const int vmid = 0;
252
253 struct gk20a_fecs_trace_record *r =
254 gk20a_fecs_trace_get_record(g, index);
255
256 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
257 "consuming record trace=%p read=%d record=%p", trace, index, r);
258
259 if (unlikely(!gk20a_fecs_trace_is_valid_record(r))) {
260 nvgpu_warn(g,
261 "trace=%p read=%d record=%p magic_lo=%08x magic_hi=%08x (invalid)",
262 trace, index, r, r->magic_lo, r->magic_hi);
263 return -EINVAL;
264 }
265
266 /* Clear magic_hi to detect cases where CPU could read write index
267	 * before the FECS record is actually written to DRAM. This should
268	 * not happen, as we force FECS writes to SYSMEM by reading through PRAMIN.
269 */
270 r->magic_hi = 0;
271
272 cur_pid = gk20a_fecs_trace_find_pid(g, r->context_ptr);
273 new_pid = gk20a_fecs_trace_find_pid(g, r->new_context_ptr);
274
275 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
276 "context_ptr=%x (pid=%d) new_context_ptr=%x (pid=%d)",
277 r->context_ptr, cur_pid, r->new_context_ptr, new_pid);
278
279 entry.context_id = r->context_id;
280 entry.vmid = vmid;
281
282 /* break out FECS record into trace events */
283 for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
284
285 entry.tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
286 entry.timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
287 entry.timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
288
289 nvgpu_log(g, gpu_dbg_ctxsw,
290 "tag=%x timestamp=%llx context_id=%08x new_context_id=%08x",
291 entry.tag, entry.timestamp, r->context_id,
292 r->new_context_id);
293
294 switch (nvgpu_gpu_ctxsw_tags_to_common_tags(entry.tag)) {
295 case NVGPU_GPU_CTXSW_TAG_RESTORE_START:
296 case NVGPU_GPU_CTXSW_TAG_CONTEXT_START:
297 entry.context_id = r->new_context_id;
298 entry.pid = new_pid;
299 break;
300
301 case NVGPU_GPU_CTXSW_TAG_CTXSW_REQ_BY_HOST:
302 case NVGPU_GPU_CTXSW_TAG_FE_ACK:
303 case NVGPU_GPU_CTXSW_TAG_FE_ACK_WFI:
304 case NVGPU_GPU_CTXSW_TAG_FE_ACK_GFXP:
305 case NVGPU_GPU_CTXSW_TAG_FE_ACK_CTAP:
306 case NVGPU_GPU_CTXSW_TAG_FE_ACK_CILP:
307 case NVGPU_GPU_CTXSW_TAG_SAVE_END:
308 entry.context_id = r->context_id;
309 entry.pid = cur_pid;
310 break;
311
312 default:
313 /* tags are not guaranteed to start at the beginning */
314 WARN_ON(entry.tag && (entry.tag != NVGPU_GPU_CTXSW_TAG_INVALID_TIMESTAMP));
315 continue;
316 }
317
318 nvgpu_log(g, gpu_dbg_ctxsw, "tag=%x context_id=%x pid=%lld",
319 entry.tag, entry.context_id, entry.pid);
320
321 if (!entry.context_id)
322 continue;
323
324 gk20a_ctxsw_trace_write(g, &entry);
325 count++;
326 }
327
328 gk20a_ctxsw_trace_wake_up(g, vmid);
329 return count;
330}
331
332int gk20a_fecs_trace_poll(struct gk20a *g)
333{
334 struct gk20a_fecs_trace *trace = g->fecs_trace;
335
336 int read = 0;
337 int write = 0;
338 int cnt;
339 int err;
340
341 err = gk20a_busy(g);
342 if (unlikely(err))
343 return err;
344
345 nvgpu_mutex_acquire(&trace->poll_lock);
346 write = gk20a_fecs_trace_get_write_index(g);
347 if (unlikely((write < 0) || (write >= GK20A_FECS_TRACE_NUM_RECORDS))) {
348 nvgpu_err(g,
349 "failed to acquire write index, write=%d", write);
350 err = write;
351 goto done;
352 }
353
354 read = gk20a_fecs_trace_get_read_index(g);
355
356 cnt = CIRC_CNT(write, read, GK20A_FECS_TRACE_NUM_RECORDS);
357 if (!cnt)
358 goto done;
359
360 nvgpu_log(g, gpu_dbg_ctxsw,
361 "circular buffer: read=%d (mailbox=%d) write=%d cnt=%d",
362 read, gk20a_fecs_trace_get_read_index(g), write, cnt);
363
364 /* Ensure all FECS writes have made it to SYSMEM */
365 g->ops.mm.fb_flush(g);
366
367 if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
368		/* Bits 30:0 of MAILBOX1 represent the actual read pointer value */
369 read = read & (~(BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT)));
370 }
371
372 while (read != write) {
373 cnt = gk20a_fecs_trace_ring_read(g, read);
374 if (cnt > 0) {
375 nvgpu_log(g, gpu_dbg_ctxsw,
376 "number of trace entries added: %d", cnt);
377 }
378
379 /* Get to next record. */
380 read = (read + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1);
381 }
382
383 if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
384 /*
385 * In the next step, read pointer is going to be updated.
386 * So, MSB of read pointer should be set back to 1. This will
387 * keep FECS trace enabled.
388 */
389 read = read | (BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT));
390 }
391
392	/* ensure FECS records have been updated before incrementing read index */
393 nvgpu_wmb();
394 gk20a_fecs_trace_set_read_index(g, read);
395
396done:
397 nvgpu_mutex_release(&trace->poll_lock);
398 gk20a_idle(g);
399 return err;
400}
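
The poll loop above treats the trace buffer as a power-of-two ring: the write index comes from MAILBOX0, the read index from MAILBOX1, CIRC_CNT() gives the number of pending records, and the read index wraps with a mask rather than a modulo. A standalone sketch of that index arithmetic (NUM_RECORDS is an illustrative value; the driver's constant is GK20A_FECS_TRACE_NUM_RECORDS):

#include <assert.h>
#include <stdio.h>

#define NUM_RECORDS 128   /* must be a power of two, as the driver asserts */

/* CIRC_CNT equivalent: number of filled slots between read and write. */
static int circ_cnt(int write, int read)
{
	return (write - read) & (NUM_RECORDS - 1);
}

int main(void)
{
	int read = 120, write = 5;          /* write index has wrapped */
	int pending = circ_cnt(write, read);

	assert(pending == 13);

	while (read != write) {
		/* consume_record(read) would go here */
		read = (read + 1) & (NUM_RECORDS - 1);
	}
	assert(circ_cnt(write, read) == 0);
	printf("consumed %d records\n", pending);
	return 0;
}
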
401
402static int gk20a_fecs_trace_periodic_polling(void *arg)
403{
404 struct gk20a *g = (struct gk20a *)arg;
405 struct gk20a_fecs_trace *trace = g->fecs_trace;
406
407 pr_info("%s: running\n", __func__);
408
409 while (!nvgpu_thread_should_stop(&trace->poll_task)) {
410
411 nvgpu_usleep_range(GK20A_FECS_TRACE_FRAME_PERIOD_US,
412 GK20A_FECS_TRACE_FRAME_PERIOD_US * 2);
413
414 gk20a_fecs_trace_poll(g);
415 }
416
417 return 0;
418}
419
420size_t gk20a_fecs_trace_buffer_size(struct gk20a *g)
421{
422 return GK20A_FECS_TRACE_NUM_RECORDS
423 * ctxsw_prog_record_timestamp_record_size_in_bytes_v();
424}
425
426int gk20a_fecs_trace_init(struct gk20a *g)
427{
428 struct gk20a_fecs_trace *trace;
429 int err;
430
431 trace = nvgpu_kzalloc(g, sizeof(struct gk20a_fecs_trace));
432 if (!trace) {
433 nvgpu_warn(g, "failed to allocate fecs_trace");
434 return -ENOMEM;
435 }
436 g->fecs_trace = trace;
437
438 err = nvgpu_mutex_init(&trace->poll_lock);
439 if (err)
440 goto clean;
441 err = nvgpu_mutex_init(&trace->hash_lock);
442 if (err)
443 goto clean_poll_lock;
444
445 err = nvgpu_mutex_init(&trace->enable_lock);
446 if (err)
447 goto clean_hash_lock;
448
449 BUG_ON(!is_power_of_2(GK20A_FECS_TRACE_NUM_RECORDS));
450 hash_init(trace->pid_hash_table);
451
452 __nvgpu_set_enabled(g, NVGPU_SUPPORT_FECS_CTXSW_TRACE, true);
453
454 trace->enable_count = 0;
455 trace->init = true;
456
457 return 0;
458
459clean_hash_lock:
460 nvgpu_mutex_destroy(&trace->hash_lock);
461
462clean_poll_lock:
463 nvgpu_mutex_destroy(&trace->poll_lock);
464clean:
465 nvgpu_kfree(g, trace);
466 g->fecs_trace = NULL;
467 return err;
468}
469
470int gk20a_fecs_trace_bind_channel(struct gk20a *g,
471 struct channel_gk20a *ch)
472{
473 /*
474 * map our circ_buf to the context space and store the GPU VA
475 * in the context header.
476 */
477
478 u32 lo;
479 u32 hi;
480 u64 addr;
481 struct tsg_gk20a *tsg;
482 struct nvgpu_gr_ctx *ch_ctx;
483 struct gk20a_fecs_trace *trace = g->fecs_trace;
484 struct nvgpu_mem *mem;
485 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(g, ch);
486 u32 aperture_mask;
487
488 tsg = tsg_gk20a_from_ch(ch);
489 if (tsg == NULL) {
490 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
491 return -EINVAL;
492 }
493
494 nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw,
495 "chid=%d context_ptr=%x inst_block=%llx",
496 ch->chid, context_ptr,
497 nvgpu_inst_block_addr(g, &ch->inst_block));
498
499 tsg = tsg_gk20a_from_ch(ch);
500 if (!tsg)
501 return -EINVAL;
502
503 ch_ctx = &tsg->gr_ctx;
504 mem = &ch_ctx->mem;
505
506 if (!trace)
507 return -ENOMEM;
508
509 mem = &g->gr.global_ctx_buffer[FECS_TRACE_BUFFER].mem;
510
511 if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA)) {
512 addr = ch_ctx->global_ctx_buffer_va[FECS_TRACE_BUFFER_VA];
513 nvgpu_log(g, gpu_dbg_ctxsw, "gpu_va=%llx", addr);
514 aperture_mask = 0;
515 } else {
516 addr = nvgpu_inst_block_addr(g, mem);
517 nvgpu_log(g, gpu_dbg_ctxsw, "pa=%llx", addr);
518 aperture_mask = nvgpu_aperture_mask(g, mem,
519 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_noncoherent_f(),
520 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_coherent_f(),
521 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_vid_mem_f());
522 }
523 if (!addr)
524 return -ENOMEM;
525
526 lo = u64_lo32(addr);
527 hi = u64_hi32(addr);
528
529 mem = &ch_ctx->mem;
530
531 nvgpu_log(g, gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi,
532 lo, GK20A_FECS_TRACE_NUM_RECORDS);
533
534 nvgpu_mem_wr(g, mem,
535 ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
536 ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
537 GK20A_FECS_TRACE_NUM_RECORDS));
538
539 if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA))
540 mem = &ch->ctx_header;
541
542 nvgpu_mem_wr(g, mem,
543 ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
544 lo);
545 nvgpu_mem_wr(g, mem,
546 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
547 ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi) |
548 aperture_mask);
549
550	/* pid (process identifier) in user space corresponds to tgid (thread
551 * group id) in kernel space.
552 */
553 gk20a_fecs_trace_hash_add(g, context_ptr, tsg->tgid);
554
555 return 0;
556}
557
558int gk20a_fecs_trace_unbind_channel(struct gk20a *g, struct channel_gk20a *ch)
559{
560 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(g, ch);
561
562 if (g->fecs_trace) {
563 nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw,
564 "ch=%p context_ptr=%x", ch, context_ptr);
565
566 if (g->ops.fecs_trace.is_enabled(g)) {
567 if (g->ops.fecs_trace.flush)
568 g->ops.fecs_trace.flush(g);
569 gk20a_fecs_trace_poll(g);
570 }
571 gk20a_fecs_trace_hash_del(g, context_ptr);
572 }
573 return 0;
574}
575
576int gk20a_fecs_trace_reset(struct gk20a *g)
577{
578 nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw, " ");
579
580 if (!g->ops.fecs_trace.is_enabled(g))
581 return 0;
582
583 gk20a_fecs_trace_poll(g);
584 return gk20a_fecs_trace_set_read_index(g, 0);
585}
586
587int gk20a_fecs_trace_deinit(struct gk20a *g)
588{
589 struct gk20a_fecs_trace *trace = g->fecs_trace;
590
591 if (!trace->init)
592 return 0;
593
594 /*
595 * Check if tracer was enabled before attempting to stop the
596 * tracer thread.
597 */
598 if (trace->enable_count > 0) {
599 nvgpu_thread_stop(&trace->poll_task);
600 }
601 gk20a_fecs_trace_free_hash_table(g);
602
603 nvgpu_mutex_destroy(&g->fecs_trace->hash_lock);
604 nvgpu_mutex_destroy(&g->fecs_trace->poll_lock);
605 nvgpu_mutex_destroy(&g->fecs_trace->enable_lock);
606
607 nvgpu_kfree(g, g->fecs_trace);
608 g->fecs_trace = NULL;
609 return 0;
610}
611
612int gk20a_gr_max_entries(struct gk20a *g,
613 struct nvgpu_gpu_ctxsw_trace_filter *filter)
614{
615 int n;
616 int tag;
617
618 /* Compute number of entries per record, with given filter */
619 for (n = 0, tag = 0; tag < gk20a_fecs_trace_num_ts(); tag++)
620 n += (NVGPU_GPU_CTXSW_FILTER_ISSET(tag, filter) != 0);
621
622 /* Return max number of entries generated for the whole ring */
623 return n * GK20A_FECS_TRACE_NUM_RECORDS;
624}
625
626int gk20a_fecs_trace_enable(struct gk20a *g)
627{
628 struct gk20a_fecs_trace *trace = g->fecs_trace;
629 int write;
630 int err = 0;
631
632 if (!trace)
633 return -EINVAL;
634
635 nvgpu_mutex_acquire(&trace->enable_lock);
636 trace->enable_count++;
637
638 if (trace->enable_count == 1U) {
639 /* drop data in hw buffer */
640 if (g->ops.fecs_trace.flush)
641 g->ops.fecs_trace.flush(g);
642
643 write = gk20a_fecs_trace_get_write_index(g);
644
645 if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
646 /*
647 * For enabling FECS trace support, MAILBOX1's MSB
648 * (Bit 31:31) should be set to 1. Bits 30:0 represents
649			 * (Bit 31:31) should be set to 1. Bits 30:0 represent the
650			 * actual read pointer value.
651 write = write |
652 (BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT));
653 }
654 gk20a_fecs_trace_set_read_index(g, write);
655
656 /*
657 * FECS ucode does a priv holdoff around the assertion of
658 * context reset. So, pri transactions (e.g. mailbox1 register
659 * write) might fail due to this. Hence, do write with ack
660 * i.e. write and read it back to make sure write happened for
661 * mailbox1.
662 */
663 while (gk20a_fecs_trace_get_read_index(g) != write) {
664 nvgpu_log(g, gpu_dbg_ctxsw, "mailbox1 update failed");
665 gk20a_fecs_trace_set_read_index(g, write);
666 }
667
668 err = nvgpu_thread_create(&trace->poll_task, g,
669 gk20a_fecs_trace_periodic_polling, __func__);
670 if (err) {
671 nvgpu_warn(g,
672 "failed to create FECS polling task");
673 goto done;
674 }
675 }
676
677done:
678 nvgpu_mutex_release(&trace->enable_lock);
679 return err;
680}
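
With NVGPU_FECS_TRACE_FEATURE_CONTROL set, MAILBOX1 carries two things at once, as the comments above describe: bit 31 acts as the tracing on/off switch and bits 30:0 hold the read pointer, so enable sets the MSB, disable clears it, and the poll path masks it off before using the pointer. A small standalone sketch of that bit packing:

#include <assert.h>
#include <stdint.h>

#define FEATURE_CONTROL_BIT  31u
#define FEATURE_CONTROL_MASK (1u << FEATURE_CONTROL_BIT)

static uint32_t enable_with_ptr(uint32_t read_ptr)
{
	return read_ptr | FEATURE_CONTROL_MASK;    /* MSB = 1: trace enabled */
}

static uint32_t pointer_bits(uint32_t mailbox1)
{
	return mailbox1 & ~FEATURE_CONTROL_MASK;   /* bits 30:0: read pointer */
}

int main(void)
{
	uint32_t mb1 = enable_with_ptr(42);

	assert(mb1 & FEATURE_CONTROL_MASK);
	assert(pointer_bits(mb1) == 42);

	mb1 = pointer_bits(mb1);                   /* disable: clear the MSB */
	assert(!(mb1 & FEATURE_CONTROL_MASK));
	return 0;
}
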
681
682int gk20a_fecs_trace_disable(struct gk20a *g)
683{
684 struct gk20a_fecs_trace *trace = g->fecs_trace;
685 int read = 0;
686
687 if (trace == NULL) {
688 return -EINVAL;
689 }
690
691 nvgpu_mutex_acquire(&trace->enable_lock);
692 if (trace->enable_count <= 0U) {
693 nvgpu_mutex_release(&trace->enable_lock);
694 return 0;
695 }
696 trace->enable_count--;
697 if (trace->enable_count == 0U) {
698 if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
699 /*
700 * For disabling FECS trace support, MAILBOX1's MSB
701 * (Bit 31:31) should be set to 0.
702 */
703 read = gk20a_fecs_trace_get_read_index(g) &
704 (~(BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT)));
705
706 gk20a_fecs_trace_set_read_index(g, read);
707
708 /*
709 * FECS ucode does a priv holdoff around the assertion
710 * of context reset. So, pri transactions (e.g.
711 * mailbox1 register write) might fail due to this.
712 * Hence, do write with ack i.e. write and read it back
713 * to make sure write happened for mailbox1.
714 */
715 while (gk20a_fecs_trace_get_read_index(g) != read) {
716 nvgpu_log(g, gpu_dbg_ctxsw,
717 "mailbox1 update failed");
718 gk20a_fecs_trace_set_read_index(g, read);
719 }
720 }
721
722 nvgpu_thread_stop(&trace->poll_task);
723
724 }
725 nvgpu_mutex_release(&trace->enable_lock);
726
727 return -EPERM;
728}
729
730bool gk20a_fecs_trace_is_enabled(struct gk20a *g)
731{
732 struct gk20a_fecs_trace *trace = g->fecs_trace;
733
734 return (trace && nvgpu_thread_is_running(&trace->poll_task));
735}
736
737void gk20a_fecs_trace_reset_buffer(struct gk20a *g)
738{
739 nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw, " ");
740
741 gk20a_fecs_trace_set_read_index(g,
742 gk20a_fecs_trace_get_write_index(g));
743}
744#endif /* CONFIG_GK20A_CTXSW_TRACE */
diff --git a/include/gk20a/fecs_trace_gk20a.h b/include/gk20a/fecs_trace_gk20a.h
new file mode 100644
index 0000000..d33e619
--- /dev/null
+++ b/include/gk20a/fecs_trace_gk20a.h
@@ -0,0 +1,45 @@
1/*
2 * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23#ifndef NVGPU_GK20A_FECS_TRACE_GK20A_H
24#define NVGPU_GK20A_FECS_TRACE_GK20A_H
25
26struct gk20a;
27struct channel_gk20a;
28struct nvgpu_gpu_ctxsw_trace_filter;
29
30int gk20a_fecs_trace_poll(struct gk20a *g);
31int gk20a_fecs_trace_init(struct gk20a *g);
32int gk20a_fecs_trace_bind_channel(struct gk20a *g,
33 struct channel_gk20a *ch);
34int gk20a_fecs_trace_unbind_channel(struct gk20a *g, struct channel_gk20a *ch);
35int gk20a_fecs_trace_reset(struct gk20a *g);
36int gk20a_fecs_trace_deinit(struct gk20a *g);
37int gk20a_gr_max_entries(struct gk20a *g,
38 struct nvgpu_gpu_ctxsw_trace_filter *filter);
39int gk20a_fecs_trace_enable(struct gk20a *g);
40int gk20a_fecs_trace_disable(struct gk20a *g);
41bool gk20a_fecs_trace_is_enabled(struct gk20a *g);
42size_t gk20a_fecs_trace_buffer_size(struct gk20a *g);
43void gk20a_fecs_trace_reset_buffer(struct gk20a *g);
44
45#endif /* NVGPU_GK20A_FECS_TRACE_GK20A_H */
diff --git a/include/gk20a/fence_gk20a.c b/include/gk20a/fence_gk20a.c
new file mode 100644
index 0000000..af42130
--- /dev/null
+++ b/include/gk20a/fence_gk20a.c
@@ -0,0 +1,319 @@
1/*
2 * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23#include "fence_gk20a.h"
24
25#include <nvgpu/semaphore.h>
26#include <nvgpu/kmem.h>
27#include <nvgpu/soc.h>
28#include <nvgpu/nvhost.h>
29#include <nvgpu/barrier.h>
30#include <nvgpu/os_fence.h>
31#include <nvgpu/channel.h>
32
33#include "gk20a.h"
34
35struct gk20a_fence_ops {
36 int (*wait)(struct gk20a_fence *, long timeout);
37 bool (*is_expired)(struct gk20a_fence *);
38 void *(*free)(struct nvgpu_ref *);
39};
40
41static void gk20a_fence_free(struct nvgpu_ref *ref)
42{
43 struct gk20a_fence *f =
44 container_of(ref, struct gk20a_fence, ref);
45 struct gk20a *g = f->g;
46
47 if (nvgpu_os_fence_is_initialized(&f->os_fence)) {
48 f->os_fence.ops->drop_ref(&f->os_fence);
49 }
50
51 if (f->semaphore) {
52 nvgpu_semaphore_put(f->semaphore);
53 }
54
55 if (f->allocator) {
56 if (nvgpu_alloc_initialized(f->allocator)) {
57 nvgpu_free(f->allocator, (u64)(uintptr_t)f);
58 }
59 } else {
60 nvgpu_kfree(g, f);
61 }
62}
63
64void gk20a_fence_put(struct gk20a_fence *f)
65{
66 if (f) {
67 nvgpu_ref_put(&f->ref, gk20a_fence_free);
68 }
69}
70
71struct gk20a_fence *gk20a_fence_get(struct gk20a_fence *f)
72{
73 if (f) {
74 nvgpu_ref_get(&f->ref);
75 }
76 return f;
77}
78
79inline bool gk20a_fence_is_valid(struct gk20a_fence *f)
80{
81 bool valid = f->valid;
82
83 nvgpu_smp_rmb();
84 return valid;
85}
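
gk20a_fence_is_valid pairs its read barrier with the write barrier issued just before f->valid is set in the creation paths later in this file, so a reader that sees valid == true also sees the fence fields written before it. A minimal standalone sketch of that publish/observe pairing, using C11 acquire/release atomics in place of nvgpu_smp_wmb()/nvgpu_smp_rmb() (illustrative only, hypothetical fence type):

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Hypothetical fence with a payload published under a 'valid' flag. */
struct fence {
	int         syncpt_value;
	atomic_bool valid;
};

static void publish(struct fence *f, int value)
{
	f->syncpt_value = value;
	/* release: commit previous writes before setting the valid flag */
	atomic_store_explicit(&f->valid, true, memory_order_release);
}

static bool observe(struct fence *f, int *value)
{
	/* acquire: pairs with the release store above */
	if (!atomic_load_explicit(&f->valid, memory_order_acquire))
		return false;
	*value = f->syncpt_value;
	return true;
}

int main(void)
{
	struct fence f;
	int v;

	f.syncpt_value = 0;
	atomic_init(&f.valid, false);

	assert(!observe(&f, &v));
	publish(&f, 7);
	assert(observe(&f, &v) && v == 7);
	return 0;
}
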
86
87int gk20a_fence_install_fd(struct gk20a_fence *f, int fd)
88{
89 if (!f || !gk20a_fence_is_valid(f) ||
90 !nvgpu_os_fence_is_initialized(&f->os_fence)) {
91 return -EINVAL;
92 }
93
94 f->os_fence.ops->install_fence(&f->os_fence, fd);
95
96 return 0;
97}
98
99int gk20a_fence_wait(struct gk20a *g, struct gk20a_fence *f,
100 unsigned long timeout)
101{
102 if (f && gk20a_fence_is_valid(f)) {
103 if (!nvgpu_platform_is_silicon(g)) {
104 timeout = MAX_SCHEDULE_TIMEOUT;
105 }
106 return f->ops->wait(f, timeout);
107 }
108 return 0;
109}
110
111bool gk20a_fence_is_expired(struct gk20a_fence *f)
112{
113 if (f && gk20a_fence_is_valid(f) && f->ops) {
114 return f->ops->is_expired(f);
115 } else {
116 return true;
117 }
118}
119
120int gk20a_alloc_fence_pool(struct channel_gk20a *c, unsigned int count)
121{
122 int err;
123 size_t size;
124 struct gk20a_fence *fence_pool = NULL;
125
126 size = sizeof(struct gk20a_fence);
127 if (count <= UINT_MAX / size) {
128 size = count * size;
129 fence_pool = nvgpu_vzalloc(c->g, size);
130 }
131
132 if (!fence_pool) {
133 return -ENOMEM;
134 }
135
136 err = nvgpu_lockless_allocator_init(c->g, &c->fence_allocator,
137 "fence_pool", (size_t)fence_pool, size,
138 sizeof(struct gk20a_fence), 0);
139 if (err) {
140 goto fail;
141 }
142
143 return 0;
144
145fail:
146 nvgpu_vfree(c->g, fence_pool);
147 return err;
148}
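
Note how the pool allocation above guards the count * sizeof(fence) multiplication with the check count <= UINT_MAX / size before performing it, so a large caller-supplied count cannot wrap the allocation size. A standalone sketch of that overflow check with a hypothetical element size:

#include <assert.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>

/* Returns 1 and sets *out, or 0 on overflow; mirrors the driver's guard. */
static int pool_size(unsigned int count, size_t elem, size_t *out)
{
	if (elem == 0 || count > UINT_MAX / elem)
		return 0;                /* multiplication would wrap */
	*out = (size_t)count * elem;
	return 1;
}

int main(void)
{
	size_t sz;

	assert(pool_size(1024, 96, &sz) && sz == 1024u * 96u);
	assert(!pool_size(UINT_MAX, 96, &sz));   /* rejected, no wrap */
	printf("pool bytes: %zu\n", sz);
	return 0;
}
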
149
150void gk20a_free_fence_pool(struct channel_gk20a *c)
151{
152 if (nvgpu_alloc_initialized(&c->fence_allocator)) {
153 struct gk20a_fence *fence_pool;
154 fence_pool = (struct gk20a_fence *)(uintptr_t)
155 nvgpu_alloc_base(&c->fence_allocator);
156 nvgpu_alloc_destroy(&c->fence_allocator);
157 nvgpu_vfree(c->g, fence_pool);
158 }
159}
160
161struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c)
162{
163 struct gk20a_fence *fence = NULL;
164
165 if (channel_gk20a_is_prealloc_enabled(c)) {
166 if (nvgpu_alloc_initialized(&c->fence_allocator)) {
167 fence = (struct gk20a_fence *)(uintptr_t)
168 nvgpu_alloc(&c->fence_allocator,
169 sizeof(struct gk20a_fence));
170
171 /* clear the node and reset the allocator pointer */
172 if (fence) {
173 memset(fence, 0, sizeof(*fence));
174 fence->allocator = &c->fence_allocator;
175 }
176 }
177 } else {
178 fence = nvgpu_kzalloc(c->g, sizeof(struct gk20a_fence));
179 }
180
181 if (fence) {
182 nvgpu_ref_init(&fence->ref);
183 fence->g = c->g;
184 }
185
186 return fence;
187}
188
189void gk20a_init_fence(struct gk20a_fence *f,
190 const struct gk20a_fence_ops *ops,
191 struct nvgpu_os_fence os_fence)
192{
193 if (!f) {
194 return;
195 }
196 f->ops = ops;
197 f->syncpt_id = -1;
198 f->semaphore = NULL;
199 f->os_fence = os_fence;
200}
201
202/* Fences that are backed by GPU semaphores: */
203
204static int nvgpu_semaphore_fence_wait(struct gk20a_fence *f, long timeout)
205{
206 if (!nvgpu_semaphore_is_acquired(f->semaphore)) {
207 return 0;
208 }
209
210 return NVGPU_COND_WAIT_INTERRUPTIBLE(
211 f->semaphore_wq,
212 !nvgpu_semaphore_is_acquired(f->semaphore),
213 timeout);
214}
215
216static bool nvgpu_semaphore_fence_is_expired(struct gk20a_fence *f)
217{
218 return !nvgpu_semaphore_is_acquired(f->semaphore);
219}
220
221static const struct gk20a_fence_ops nvgpu_semaphore_fence_ops = {
222 .wait = &nvgpu_semaphore_fence_wait,
223 .is_expired = &nvgpu_semaphore_fence_is_expired,
224};
225
226/* This function takes ownership of the semaphore as well as the os_fence */
227int gk20a_fence_from_semaphore(
228 struct gk20a_fence *fence_out,
229 struct nvgpu_semaphore *semaphore,
230 struct nvgpu_cond *semaphore_wq,
231 struct nvgpu_os_fence os_fence)
232{
233 struct gk20a_fence *f = fence_out;
234
235 gk20a_init_fence(f, &nvgpu_semaphore_fence_ops, os_fence);
236 if (!f) {
237 return -EINVAL;
238 }
239
240
241 f->semaphore = semaphore;
242 f->semaphore_wq = semaphore_wq;
243
244 /* commit previous writes before setting the valid flag */
245 nvgpu_smp_wmb();
246 f->valid = true;
247
248 return 0;
249}
250
251#ifdef CONFIG_TEGRA_GK20A_NVHOST
252/* Fences that are backed by host1x syncpoints: */
253
254static int gk20a_syncpt_fence_wait(struct gk20a_fence *f, long timeout)
255{
256 return nvgpu_nvhost_syncpt_wait_timeout_ext(
257 f->nvhost_dev, f->syncpt_id, f->syncpt_value,
258 (u32)timeout, NULL, NULL);
259}
260
261static bool gk20a_syncpt_fence_is_expired(struct gk20a_fence *f)
262{
263
264 /*
265	 * In cases where we don't register a notifier, we can't expect the
266 * syncpt value to be updated. For this case, we force a read
267 * of the value from HW, and then check for expiration.
268 */
269 if (!nvgpu_nvhost_syncpt_is_expired_ext(f->nvhost_dev, f->syncpt_id,
270 f->syncpt_value)) {
271 u32 val;
272
273 if (!nvgpu_nvhost_syncpt_read_ext_check(f->nvhost_dev,
274 f->syncpt_id, &val)) {
275 return nvgpu_nvhost_syncpt_is_expired_ext(
276 f->nvhost_dev,
277 f->syncpt_id, f->syncpt_value);
278 }
279 }
280
281 return true;
282}
283
284static const struct gk20a_fence_ops gk20a_syncpt_fence_ops = {
285 .wait = &gk20a_syncpt_fence_wait,
286 .is_expired = &gk20a_syncpt_fence_is_expired,
287};
288
289/* This function takes the ownership of the os_fence */
290int gk20a_fence_from_syncpt(
291 struct gk20a_fence *fence_out,
292 struct nvgpu_nvhost_dev *nvhost_dev,
293 u32 id, u32 value, struct nvgpu_os_fence os_fence)
294{
295 struct gk20a_fence *f = fence_out;
296
297 gk20a_init_fence(f, &gk20a_syncpt_fence_ops, os_fence);
298 if (!f)
299 return -EINVAL;
300
301 f->nvhost_dev = nvhost_dev;
302 f->syncpt_id = id;
303 f->syncpt_value = value;
304
305 /* commit previous writes before setting the valid flag */
306 nvgpu_smp_wmb();
307 f->valid = true;
308
309 return 0;
310}
311#else
312int gk20a_fence_from_syncpt(
313 struct gk20a_fence *fence_out,
314 struct nvgpu_nvhost_dev *nvhost_dev,
315 u32 id, u32 value, struct nvgpu_os_fence os_fence)
316{
317 return -EINVAL;
318}
319#endif
diff --git a/include/gk20a/fence_gk20a.h b/include/gk20a/fence_gk20a.h
new file mode 100644
index 0000000..0311279
--- /dev/null
+++ b/include/gk20a/fence_gk20a.h
@@ -0,0 +1,100 @@
1/*
2 * drivers/video/tegra/host/gk20a/fence_gk20a.h
3 *
4 * GK20A Fences
5 *
6 * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 * DEALINGS IN THE SOFTWARE.
25 */
26#ifndef NVGPU_GK20A_FENCE_GK20A_H
27#define NVGPU_GK20A_FENCE_GK20A_H
28
29#include <nvgpu/types.h>
30#include <nvgpu/kref.h>
31#include <nvgpu/os_fence.h>
32
33struct platform_device;
34struct nvgpu_semaphore;
35struct channel_gk20a;
36struct gk20a;
37struct nvgpu_os_fence;
38
39struct gk20a_fence_ops;
40
41struct gk20a_fence {
42 struct gk20a *g;
43
44 /* Valid for all fence types: */
45 bool valid;
46 struct nvgpu_ref ref;
47 const struct gk20a_fence_ops *ops;
48
49 struct nvgpu_os_fence os_fence;
50
51 /* Valid for fences created from semaphores: */
52 struct nvgpu_semaphore *semaphore;
53 struct nvgpu_cond *semaphore_wq;
54
55 /* Valid for fences created from syncpoints: */
56 struct nvgpu_nvhost_dev *nvhost_dev;
57 u32 syncpt_id;
58 u32 syncpt_value;
59
60 /* Valid for fences part of a pre-allocated fence pool */
61 struct nvgpu_allocator *allocator;
62};
63
64/* Fences can be created from semaphores or syncpoint (id, value) pairs */
65int gk20a_fence_from_semaphore(
66 struct gk20a_fence *fence_out,
67 struct nvgpu_semaphore *semaphore,
68 struct nvgpu_cond *semaphore_wq,
69 struct nvgpu_os_fence os_fence);
70
71int gk20a_fence_from_syncpt(
72 struct gk20a_fence *fence_out,
73 struct nvgpu_nvhost_dev *nvhost_dev,
74 u32 id, u32 value,
75 struct nvgpu_os_fence os_fence);
76
77int gk20a_alloc_fence_pool(
78 struct channel_gk20a *c,
79 unsigned int count);
80
81void gk20a_free_fence_pool(
82 struct channel_gk20a *c);
83
84struct gk20a_fence *gk20a_alloc_fence(
85 struct channel_gk20a *c);
86
87void gk20a_init_fence(struct gk20a_fence *f,
88 const struct gk20a_fence_ops *ops,
89 struct nvgpu_os_fence os_fence);
90
91/* Fence operations */
92void gk20a_fence_put(struct gk20a_fence *f);
93struct gk20a_fence *gk20a_fence_get(struct gk20a_fence *f);
94int gk20a_fence_wait(struct gk20a *g, struct gk20a_fence *f,
95 unsigned long timeout);
96bool gk20a_fence_is_expired(struct gk20a_fence *f);
97bool gk20a_fence_is_valid(struct gk20a_fence *f);
98int gk20a_fence_install_fd(struct gk20a_fence *f, int fd);
99
100#endif /* NVGPU_GK20A_FENCE_GK20A_H */
diff --git a/include/gk20a/fifo_gk20a.c b/include/gk20a/fifo_gk20a.c
new file mode 100644
index 0000000..4477f7c
--- /dev/null
+++ b/include/gk20a/fifo_gk20a.c
@@ -0,0 +1,4649 @@
1/*
2 * GK20A Graphics FIFO (gr host)
3 *
4 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include <trace/events/gk20a.h>
26
27#include <nvgpu/mm.h>
28#include <nvgpu/dma.h>
29#include <nvgpu/timers.h>
30#include <nvgpu/semaphore.h>
31#include <nvgpu/enabled.h>
32#include <nvgpu/kmem.h>
33#include <nvgpu/log.h>
34#include <nvgpu/soc.h>
35#include <nvgpu/atomic.h>
36#include <nvgpu/bug.h>
37#include <nvgpu/log2.h>
38#include <nvgpu/debug.h>
39#include <nvgpu/nvhost.h>
40#include <nvgpu/barrier.h>
41#include <nvgpu/ctxsw_trace.h>
42#include <nvgpu/error_notifier.h>
43#include <nvgpu/ptimer.h>
44#include <nvgpu/io.h>
45#include <nvgpu/utils.h>
46#include <nvgpu/channel.h>
47#include <nvgpu/unit.h>
48#include <nvgpu/power_features/power_features.h>
49#include <nvgpu/power_features/cg.h>
50
51#include "gk20a.h"
52#include "mm_gk20a.h"
53
54#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
55#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
56#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
57#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
58#include <nvgpu/hw/gk20a/hw_top_gk20a.h>
59#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
60
61#include <os/linux/os_linux.h>
62
63#define FECS_METHOD_WFI_RESTORE 0x80000
64#define FECS_MAILBOX_0_ACK_RESTORE 0x4
65
66
67static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg);
68
69static const char *const pbdma_intr_fault_type_desc[] = {
70 "MEMREQ timeout", "MEMACK_TIMEOUT", "MEMACK_EXTRA acks",
71 "MEMDAT_TIMEOUT", "MEMDAT_EXTRA acks", "MEMFLUSH noack",
72 "MEMOP noack", "LBCONNECT noack", "NONE - was LBREQ",
73 "LBACK_TIMEOUT", "LBACK_EXTRA acks", "LBDAT_TIMEOUT",
74 "LBDAT_EXTRA acks", "GPFIFO won't fit", "GPPTR invalid",
75 "GPENTRY invalid", "GPCRC mismatch", "PBPTR get>put",
76 "PBENTRY invld", "PBCRC mismatch", "NONE - was XBARC",
77 "METHOD invld", "METHODCRC mismat", "DEVICE sw method",
78 "[ENGINE]", "SEMAPHORE invlid", "ACQUIRE timeout",
79 "PRI forbidden", "ILLEGAL SYNCPT", "[NO_CTXSW_SEG]",
80 "PBSEG badsplit", "SIGNATURE bad"
81};
82
83u32 gk20a_fifo_get_engine_ids(struct gk20a *g,
84 u32 engine_id[], u32 engine_id_sz,
85 u32 engine_enum)
86{
87 struct fifo_gk20a *f = NULL;
88 u32 instance_cnt = 0;
89 u32 engine_id_idx;
90 u32 active_engine_id = 0;
91 struct fifo_engine_info_gk20a *info = NULL;
92
93 if (g && engine_id_sz && (engine_enum < ENGINE_INVAL_GK20A)) {
94 f = &g->fifo;
95 for (engine_id_idx = 0; engine_id_idx < f->num_engines; ++engine_id_idx) {
96 active_engine_id = f->active_engines_list[engine_id_idx];
97 info = &f->engine_info[active_engine_id];
98
99 if (info->engine_enum == engine_enum) {
100 if (instance_cnt < engine_id_sz) {
101 engine_id[instance_cnt] = active_engine_id;
102 ++instance_cnt;
103 } else {
104 nvgpu_log_info(g, "warning engine_id table sz is small %d",
105 engine_id_sz);
106 }
107 }
108 }
109 }
110 return instance_cnt;
111}
112
113struct fifo_engine_info_gk20a *gk20a_fifo_get_engine_info(struct gk20a *g, u32 engine_id)
114{
115 struct fifo_gk20a *f = NULL;
116 u32 engine_id_idx;
117 struct fifo_engine_info_gk20a *info = NULL;
118
119 if (!g) {
120 return info;
121 }
122
123 f = &g->fifo;
124
125 if (engine_id < f->max_engines) {
126 for (engine_id_idx = 0; engine_id_idx < f->num_engines; ++engine_id_idx) {
127 if (engine_id == f->active_engines_list[engine_id_idx]) {
128 info = &f->engine_info[engine_id];
129 break;
130 }
131 }
132 }
133
134 if (!info) {
135 nvgpu_err(g, "engine_id is not in active list/invalid %d", engine_id);
136 }
137
138 return info;
139}
140
141bool gk20a_fifo_is_valid_engine_id(struct gk20a *g, u32 engine_id)
142{
143 struct fifo_gk20a *f = NULL;
144 u32 engine_id_idx;
145 bool valid = false;
146
147 if (!g) {
148 return valid;
149 }
150
151 f = &g->fifo;
152
153 if (engine_id < f->max_engines) {
154 for (engine_id_idx = 0; engine_id_idx < f->num_engines; ++engine_id_idx) {
155 if (engine_id == f->active_engines_list[engine_id_idx]) {
156 valid = true;
157 break;
158 }
159 }
160 }
161
162 if (!valid) {
163 nvgpu_err(g, "engine_id is not in active list/invalid %d", engine_id);
164 }
165
166 return valid;
167}
168
169u32 gk20a_fifo_get_gr_engine_id(struct gk20a *g)
170{
171 u32 gr_engine_cnt = 0;
172 u32 gr_engine_id = FIFO_INVAL_ENGINE_ID;
173
174 /* Consider 1st available GR engine */
175 gr_engine_cnt = gk20a_fifo_get_engine_ids(g, &gr_engine_id,
176 1, ENGINE_GR_GK20A);
177
178 if (!gr_engine_cnt) {
179 nvgpu_err(g, "No GR engine available on this device!");
180 }
181
182 return gr_engine_id;
183}
184
185u32 gk20a_fifo_get_all_ce_engine_reset_mask(struct gk20a *g)
186{
187 u32 reset_mask = 0;
188 u32 engine_enum = ENGINE_INVAL_GK20A;
189 struct fifo_gk20a *f = NULL;
190 u32 engine_id_idx;
191 struct fifo_engine_info_gk20a *engine_info;
192 u32 active_engine_id = 0;
193
194 if (!g) {
195 return reset_mask;
196 }
197
198 f = &g->fifo;
199
200 for (engine_id_idx = 0; engine_id_idx < f->num_engines; ++engine_id_idx) {
201 active_engine_id = f->active_engines_list[engine_id_idx];
202 engine_info = &f->engine_info[active_engine_id];
203 engine_enum = engine_info->engine_enum;
204
205 if ((engine_enum == ENGINE_GRCE_GK20A) ||
206 (engine_enum == ENGINE_ASYNC_CE_GK20A)) {
207 reset_mask |= engine_info->reset_mask;
208 }
209 }
210
211 return reset_mask;
212}
213
214u32 gk20a_fifo_get_fast_ce_runlist_id(struct gk20a *g)
215{
216 u32 ce_runlist_id = gk20a_fifo_get_gr_runlist_id(g);
217 u32 engine_enum = ENGINE_INVAL_GK20A;
218 struct fifo_gk20a *f = NULL;
219 u32 engine_id_idx;
220 struct fifo_engine_info_gk20a *engine_info;
221 u32 active_engine_id = 0;
222
223 if (!g) {
224 return ce_runlist_id;
225 }
226
227 f = &g->fifo;
228
229 for (engine_id_idx = 0; engine_id_idx < f->num_engines; ++engine_id_idx) {
230 active_engine_id = f->active_engines_list[engine_id_idx];
231 engine_info = &f->engine_info[active_engine_id];
232 engine_enum = engine_info->engine_enum;
233
234		/* select the last available ASYNC_CE, if any */
235 if (engine_enum == ENGINE_ASYNC_CE_GK20A) {
236 ce_runlist_id = engine_info->runlist_id;
237 }
238 }
239
240 return ce_runlist_id;
241}
242
243u32 gk20a_fifo_get_gr_runlist_id(struct gk20a *g)
244{
245 u32 gr_engine_cnt = 0;
246 u32 gr_engine_id = FIFO_INVAL_ENGINE_ID;
247 struct fifo_engine_info_gk20a *engine_info;
248 u32 gr_runlist_id = ~0;
249
250 /* Consider 1st available GR engine */
251 gr_engine_cnt = gk20a_fifo_get_engine_ids(g, &gr_engine_id,
252 1, ENGINE_GR_GK20A);
253
254 if (!gr_engine_cnt) {
255 nvgpu_err(g,
256 "No GR engine available on this device!");
257 goto end;
258 }
259
260 engine_info = gk20a_fifo_get_engine_info(g, gr_engine_id);
261
262 if (engine_info) {
263 gr_runlist_id = engine_info->runlist_id;
264 } else {
265 nvgpu_err(g,
266 "gr_engine_id is not in active list/invalid %d", gr_engine_id);
267 }
268
269end:
270 return gr_runlist_id;
271}
272
273bool gk20a_fifo_is_valid_runlist_id(struct gk20a *g, u32 runlist_id)
274{
275 struct fifo_gk20a *f = NULL;
276 u32 engine_id_idx;
277 u32 active_engine_id;
278 struct fifo_engine_info_gk20a *engine_info;
279
280 if (!g) {
281 return false;
282 }
283
284 f = &g->fifo;
285
286 for (engine_id_idx = 0; engine_id_idx < f->num_engines; ++engine_id_idx) {
287 active_engine_id = f->active_engines_list[engine_id_idx];
288 engine_info = gk20a_fifo_get_engine_info(g, active_engine_id);
289 if (engine_info && (engine_info->runlist_id == runlist_id)) {
290 return true;
291 }
292 }
293
294 return false;
295}
296
297/*
298 * Link engine IDs to MMU IDs and vice versa.
299 */
300
301static inline u32 gk20a_engine_id_to_mmu_id(struct gk20a *g, u32 engine_id)
302{
303 u32 fault_id = FIFO_INVAL_ENGINE_ID;
304 struct fifo_engine_info_gk20a *engine_info;
305
306 engine_info = gk20a_fifo_get_engine_info(g, engine_id);
307
308 if (engine_info) {
309 fault_id = engine_info->fault_id;
310 } else {
311 nvgpu_err(g, "engine_id is not in active list/invalid %d", engine_id);
312 }
313 return fault_id;
314}
315
316static inline u32 gk20a_mmu_id_to_engine_id(struct gk20a *g, u32 fault_id)
317{
318 u32 engine_id;
319 u32 active_engine_id;
320 struct fifo_engine_info_gk20a *engine_info;
321 struct fifo_gk20a *f = &g->fifo;
322
323 for (engine_id = 0; engine_id < f->num_engines; engine_id++) {
324 active_engine_id = f->active_engines_list[engine_id];
325 engine_info = &g->fifo.engine_info[active_engine_id];
326
327 if (engine_info->fault_id == fault_id) {
328 break;
329 }
330 active_engine_id = FIFO_INVAL_ENGINE_ID;
331 }
332 return active_engine_id;
333}
334
335int gk20a_fifo_engine_enum_from_type(struct gk20a *g, u32 engine_type,
336 u32 *inst_id)
337{
338 int ret = ENGINE_INVAL_GK20A;
339
340 nvgpu_log_info(g, "engine type %d", engine_type);
341 if (engine_type == top_device_info_type_enum_graphics_v()) {
342 ret = ENGINE_GR_GK20A;
343 } else if ((engine_type >= top_device_info_type_enum_copy0_v()) &&
344 (engine_type <= top_device_info_type_enum_copy2_v())) {
345		/* Assume each CE engine has its own runlist at this point.
346		 * ENGINE_GRCE_GK20A type CEs are identified later by comparing
347		 * their runlist_id with the GR runlist_id in init_engine_info(). */
348 ret = ENGINE_ASYNC_CE_GK20A;
349 /* inst_id starts from CE0 to CE2 */
350 if (inst_id) {
351 *inst_id = (engine_type - top_device_info_type_enum_copy0_v());
352 }
353 }
354
355 return ret;
356}
357
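/*
 * Walk the top-level device info table and populate engine_info[] and
 * active_engines_list[] for every engine entry found.
 */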
358int gk20a_fifo_init_engine_info(struct fifo_gk20a *f)
359{
360 struct gk20a *g = f->g;
361 u32 i;
362 u32 max_info_entries = top_device_info__size_1_v();
363 u32 engine_enum = ENGINE_INVAL_GK20A;
364 u32 engine_id = FIFO_INVAL_ENGINE_ID;
365 u32 runlist_id = ~0;
366 u32 pbdma_id = ~0;
367 u32 intr_id = ~0;
368 u32 reset_id = ~0;
369 u32 inst_id = 0;
370 u32 pri_base = 0;
371 u32 fault_id = 0;
372 u32 gr_runlist_id = ~0;
373 bool found_pbdma_for_runlist = false;
374
375 nvgpu_log_fn(g, " ");
376
377 f->num_engines = 0;
378
379 for (i = 0; i < max_info_entries; i++) {
380 u32 table_entry = gk20a_readl(f->g, top_device_info_r(i));
381 u32 entry = top_device_info_entry_v(table_entry);
382 u32 runlist_bit;
383
384 if (entry == top_device_info_entry_enum_v()) {
385 if (top_device_info_engine_v(table_entry)) {
386 engine_id =
387 top_device_info_engine_enum_v(table_entry);
388 nvgpu_log_info(g, "info: engine_id %d",
389 top_device_info_engine_enum_v(table_entry));
390 }
391
392
393 if (top_device_info_runlist_v(table_entry)) {
394 runlist_id =
395 top_device_info_runlist_enum_v(table_entry);
396 nvgpu_log_info(g, "gr info: runlist_id %d", runlist_id);
397
398 runlist_bit = BIT(runlist_id);
399
400 found_pbdma_for_runlist = false;
401 for (pbdma_id = 0; pbdma_id < f->num_pbdma;
402 pbdma_id++) {
403 if (f->pbdma_map[pbdma_id] &
404 runlist_bit) {
405 nvgpu_log_info(g,
406 "gr info: pbdma_map[%d]=%d",
407 pbdma_id,
408 f->pbdma_map[pbdma_id]);
409 found_pbdma_for_runlist = true;
410 break;
411 }
412 }
413
414 if (!found_pbdma_for_runlist) {
415 nvgpu_err(g, "busted pbdma map");
416 return -EINVAL;
417 }
418 }
419
420 if (top_device_info_intr_v(table_entry)) {
421 intr_id =
422 top_device_info_intr_enum_v(table_entry);
423 nvgpu_log_info(g, "gr info: intr_id %d", intr_id);
424 }
425
426 if (top_device_info_reset_v(table_entry)) {
427 reset_id =
428 top_device_info_reset_enum_v(table_entry);
429 nvgpu_log_info(g, "gr info: reset_id %d",
430 reset_id);
431 }
432 } else if (entry == top_device_info_entry_engine_type_v()) {
433 u32 engine_type =
434 top_device_info_type_enum_v(table_entry);
435 engine_enum =
436 g->ops.fifo.engine_enum_from_type(g,
437 engine_type, &inst_id);
438 } else if (entry == top_device_info_entry_data_v()) {
439 /* gk20a doesn't support device_info_data packet parsing */
440 if (g->ops.fifo.device_info_data_parse) {
441 g->ops.fifo.device_info_data_parse(g,
442 table_entry, &inst_id, &pri_base,
443 &fault_id);
444 }
445 }
446
447 if (!top_device_info_chain_v(table_entry)) {
448 if (engine_enum < ENGINE_INVAL_GK20A) {
449 struct fifo_engine_info_gk20a *info =
450 &g->fifo.engine_info[engine_id];
451
452 info->intr_mask |= BIT(intr_id);
453 info->reset_mask |= BIT(reset_id);
454 info->runlist_id = runlist_id;
455 info->pbdma_id = pbdma_id;
456 info->inst_id = inst_id;
457 info->pri_base = pri_base;
458
459 if (engine_enum == ENGINE_GR_GK20A) {
460 gr_runlist_id = runlist_id;
461 }
462
463				/* GR and GR_COPY share the same runlist_id */
464 if ((engine_enum == ENGINE_ASYNC_CE_GK20A) &&
465 (gr_runlist_id == runlist_id)) {
466 engine_enum = ENGINE_GRCE_GK20A;
467 }
468
469 info->engine_enum = engine_enum;
470
471 if (!fault_id && (engine_enum == ENGINE_GRCE_GK20A)) {
472 fault_id = 0x1b;
473 }
474 info->fault_id = fault_id;
475
476				/* engine_id ranges from 0 to NV_HOST_NUM_ENGINES */
477 f->active_engines_list[f->num_engines] = engine_id;
478
479 ++f->num_engines;
480
481 engine_enum = ENGINE_INVAL_GK20A;
482 }
483 }
484 }
485
486 return 0;
487}
488
489u32 gk20a_fifo_act_eng_interrupt_mask(struct gk20a *g, u32 act_eng_id)
490{
491 struct fifo_engine_info_gk20a *engine_info = NULL;
492
493 engine_info = gk20a_fifo_get_engine_info(g, act_eng_id);
494 if (engine_info) {
495 return engine_info->intr_mask;
496 }
497
498 return 0;
499}
500
501u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g)
502{
503 u32 eng_intr_mask = 0;
504 unsigned int i;
505 u32 active_engine_id = 0;
506 u32 engine_enum = ENGINE_INVAL_GK20A;
507
508 for (i = 0; i < g->fifo.num_engines; i++) {
509 u32 intr_mask;
510 active_engine_id = g->fifo.active_engines_list[i];
511 intr_mask = g->fifo.engine_info[active_engine_id].intr_mask;
512 engine_enum = g->fifo.engine_info[active_engine_id].engine_enum;
513 if (((engine_enum == ENGINE_GRCE_GK20A) ||
514 (engine_enum == ENGINE_ASYNC_CE_GK20A)) &&
515 (!g->ops.ce2.isr_stall || !g->ops.ce2.isr_nonstall)) {
516 continue;
517 }
518
519 eng_intr_mask |= intr_mask;
520 }
521
522 return eng_intr_mask;
523}
524
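/*
 * Free the runlist buffers, active channel/TSG bitmaps and locks that were
 * set up by init_runlist().
 */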
525void gk20a_fifo_delete_runlist(struct fifo_gk20a *f)
526{
527 u32 i;
528 u32 runlist_id;
529 struct fifo_runlist_info_gk20a *runlist;
530 struct gk20a *g = NULL;
531
532 if (!f || !f->runlist_info) {
533 return;
534 }
535
536 g = f->g;
537
538 for (runlist_id = 0; runlist_id < f->max_runlists; runlist_id++) {
539 runlist = &f->runlist_info[runlist_id];
540 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
541 nvgpu_dma_free(g, &runlist->mem[i]);
542 }
543
544 nvgpu_kfree(g, runlist->active_channels);
545 runlist->active_channels = NULL;
546
547 nvgpu_kfree(g, runlist->active_tsgs);
548 runlist->active_tsgs = NULL;
549
550 nvgpu_mutex_destroy(&runlist->runlist_lock);
551
552 }
553 memset(f->runlist_info, 0, (sizeof(struct fifo_runlist_info_gk20a) *
554 f->max_runlists));
555
556 nvgpu_kfree(g, f->runlist_info);
557 f->runlist_info = NULL;
558 f->max_runlists = 0;
559}
560
561static void gk20a_remove_fifo_support(struct fifo_gk20a *f)
562{
563 struct gk20a *g = f->g;
564 unsigned int i = 0;
565
566 nvgpu_log_fn(g, " ");
567
568 nvgpu_channel_worker_deinit(g);
569 /*
570 * Make sure all channels are closed before deleting them.
571 */
572 for (; i < f->num_channels; i++) {
573 struct channel_gk20a *c = f->channel + i;
574 struct tsg_gk20a *tsg = f->tsg + i;
575
576 /*
577		 * Could race, but the worst that happens is we get an error message
578 * from gk20a_free_channel() complaining about multiple closes.
579 */
580 if (c->referenceable) {
581 __gk20a_channel_kill(c);
582 }
583
584 nvgpu_mutex_destroy(&tsg->event_id_list_lock);
585
586 nvgpu_mutex_destroy(&c->ioctl_lock);
587 nvgpu_mutex_destroy(&c->joblist.cleanup_lock);
588 nvgpu_mutex_destroy(&c->joblist.pre_alloc.read_lock);
589 nvgpu_mutex_destroy(&c->sync_lock);
590#if defined(CONFIG_GK20A_CYCLE_STATS)
591 nvgpu_mutex_destroy(&c->cyclestate.cyclestate_buffer_mutex);
592 nvgpu_mutex_destroy(&c->cs_client_mutex);
593#endif
594 nvgpu_mutex_destroy(&c->dbg_s_lock);
595
596 }
597
598 nvgpu_vfree(g, f->channel);
599 nvgpu_vfree(g, f->tsg);
600 if (g->ops.mm.is_bar1_supported(g)) {
601 nvgpu_dma_unmap_free(g->mm.bar1.vm, &f->userd);
602 } else {
603 nvgpu_dma_free(g, &f->userd);
604 }
605
606 gk20a_fifo_delete_runlist(f);
607
608 nvgpu_kfree(g, f->pbdma_map);
609 f->pbdma_map = NULL;
610 nvgpu_kfree(g, f->engine_info);
611 f->engine_info = NULL;
612 nvgpu_kfree(g, f->active_engines_list);
613 f->active_engines_list = NULL;
614}
615
616/* reads info from hardware and fills in pbdma exception info record */
617static inline void get_exception_pbdma_info(
618 struct gk20a *g,
619 struct fifo_engine_info_gk20a *eng_info)
620{
621 struct fifo_pbdma_exception_info_gk20a *e =
622 &eng_info->pbdma_exception_info;
623
624 u32 pbdma_status_r = e->status_r = gk20a_readl(g,
625 fifo_pbdma_status_r(eng_info->pbdma_id));
626 e->id = fifo_pbdma_status_id_v(pbdma_status_r); /* vs. id_hw_v()? */
627 e->id_is_chid = fifo_pbdma_status_id_type_v(pbdma_status_r) ==
628 fifo_pbdma_status_id_type_chid_v();
629 e->chan_status_v = fifo_pbdma_status_chan_status_v(pbdma_status_r);
630 e->next_id_is_chid =
631 fifo_pbdma_status_next_id_type_v(pbdma_status_r) ==
632 fifo_pbdma_status_next_id_type_chid_v();
633 e->next_id = fifo_pbdma_status_next_id_v(pbdma_status_r);
634 e->chsw_in_progress =
635 fifo_pbdma_status_chsw_v(pbdma_status_r) ==
636 fifo_pbdma_status_chsw_in_progress_v();
637}
638
639static void fifo_pbdma_exception_status(struct gk20a *g,
640 struct fifo_engine_info_gk20a *eng_info)
641{
642 struct fifo_pbdma_exception_info_gk20a *e;
643 get_exception_pbdma_info(g, eng_info);
644 e = &eng_info->pbdma_exception_info;
645
646 nvgpu_log_fn(g, "pbdma_id %d, "
647 "id_type %s, id %d, chan_status %d, "
648 "next_id_type %s, next_id %d, "
649 "chsw_in_progress %d",
650 eng_info->pbdma_id,
651 e->id_is_chid ? "chid" : "tsgid", e->id, e->chan_status_v,
652 e->next_id_is_chid ? "chid" : "tsgid", e->next_id,
653 e->chsw_in_progress);
654}
655
656/* reads info from hardware and fills in engine exception info record */
657static inline void get_exception_engine_info(
658 struct gk20a *g,
659 struct fifo_engine_info_gk20a *eng_info)
660{
661 struct fifo_engine_exception_info_gk20a *e =
662 &eng_info->engine_exception_info;
663 u32 engine_status_r = e->status_r =
664 gk20a_readl(g, fifo_engine_status_r(eng_info->engine_id));
665 e->id = fifo_engine_status_id_v(engine_status_r); /* vs. id_hw_v()? */
666 e->id_is_chid = fifo_engine_status_id_type_v(engine_status_r) ==
667 fifo_engine_status_id_type_chid_v();
668 e->ctx_status_v = fifo_engine_status_ctx_status_v(engine_status_r);
669 e->faulted =
670 fifo_engine_status_faulted_v(engine_status_r) ==
671 fifo_engine_status_faulted_true_v();
672 e->idle =
673 fifo_engine_status_engine_v(engine_status_r) ==
674 fifo_engine_status_engine_idle_v();
675 e->ctxsw_in_progress =
676 fifo_engine_status_ctxsw_v(engine_status_r) ==
677 fifo_engine_status_ctxsw_in_progress_v();
678}
679
680static void fifo_engine_exception_status(struct gk20a *g,
681 struct fifo_engine_info_gk20a *eng_info)
682{
683 struct fifo_engine_exception_info_gk20a *e;
684 get_exception_engine_info(g, eng_info);
685 e = &eng_info->engine_exception_info;
686
687 nvgpu_log_fn(g, "engine_id %d, id_type %s, id %d, ctx_status %d, "
688 "faulted %d, idle %d, ctxsw_in_progress %d, ",
689 eng_info->engine_id, e->id_is_chid ? "chid" : "tsgid",
690 e->id, e->ctx_status_v,
691 e->faulted, e->idle, e->ctxsw_in_progress);
692}
693
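/*
 * Allocate per-runlist state: active channel/TSG bitmaps, runlist buffers,
 * the runlist lock, and the pbdma/engine bitmasks derived from the pbdma map
 * and engine info.
 */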
694static int init_runlist(struct gk20a *g, struct fifo_gk20a *f)
695{
696 struct fifo_runlist_info_gk20a *runlist;
697 struct fifo_engine_info_gk20a *engine_info;
698 unsigned int runlist_id;
699 u32 i;
700 size_t runlist_size;
701 u32 active_engine_id, pbdma_id, engine_id;
702 int flags = nvgpu_is_enabled(g, NVGPU_MM_USE_PHYSICAL_SG) ?
703 NVGPU_DMA_FORCE_CONTIGUOUS : 0;
704 int err = 0;
705
706 nvgpu_log_fn(g, " ");
707
708 f->max_runlists = g->ops.fifo.eng_runlist_base_size();
709 f->runlist_info = nvgpu_kzalloc(g,
710 sizeof(struct fifo_runlist_info_gk20a) *
711 f->max_runlists);
712 if (!f->runlist_info) {
713 goto clean_up_runlist;
714 }
715
716 memset(f->runlist_info, 0, (sizeof(struct fifo_runlist_info_gk20a) *
717 f->max_runlists));
718
719 for (runlist_id = 0; runlist_id < f->max_runlists; runlist_id++) {
720 runlist = &f->runlist_info[runlist_id];
721
722 runlist->active_channels =
723 nvgpu_kzalloc(g, DIV_ROUND_UP(f->num_channels,
724 BITS_PER_BYTE));
725 if (!runlist->active_channels) {
726 goto clean_up_runlist;
727 }
728
729 runlist->active_tsgs =
730 nvgpu_kzalloc(g, DIV_ROUND_UP(f->num_channels,
731 BITS_PER_BYTE));
732 if (!runlist->active_tsgs) {
733 goto clean_up_runlist;
734 }
735
736 runlist_size = f->runlist_entry_size * f->num_runlist_entries;
737 nvgpu_log(g, gpu_dbg_info,
738 "runlist_entries %d runlist size %zu",
739 f->num_runlist_entries, runlist_size);
740
741 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
742 err = nvgpu_dma_alloc_flags_sys(g, flags,
743 runlist_size,
744 &runlist->mem[i]);
745 if (err) {
746 nvgpu_err(g, "memory allocation failed");
747 goto clean_up_runlist;
748 }
749 }
750
751 err = nvgpu_mutex_init(&runlist->runlist_lock);
752 if (err != 0) {
753 nvgpu_err(g,
754 "Error in runlist_lock mutex initialization");
755 goto clean_up_runlist;
756 }
757
758		/* None of the buffers is pinned if this value doesn't change.
759 Otherwise, one of them (cur_buffer) must have been pinned. */
760 runlist->cur_buffer = MAX_RUNLIST_BUFFERS;
761
762 for (pbdma_id = 0; pbdma_id < f->num_pbdma; pbdma_id++) {
763 if (f->pbdma_map[pbdma_id] & BIT(runlist_id)) {
764 runlist->pbdma_bitmask |= BIT(pbdma_id);
765 }
766 }
767 nvgpu_log(g, gpu_dbg_info, "runlist %d : pbdma bitmask 0x%x",
768 runlist_id, runlist->pbdma_bitmask);
769
770 for (engine_id = 0; engine_id < f->num_engines; ++engine_id) {
771 active_engine_id = f->active_engines_list[engine_id];
772 engine_info = &f->engine_info[active_engine_id];
773
774 if (engine_info && engine_info->runlist_id == runlist_id) {
775 runlist->eng_bitmask |= BIT(active_engine_id);
776 }
777 }
778 nvgpu_log(g, gpu_dbg_info, "runlist %d : act eng bitmask 0x%x",
779 runlist_id, runlist->eng_bitmask);
780 }
781
782 nvgpu_log_fn(g, "done");
783 return 0;
784
785clean_up_runlist:
786 gk20a_fifo_delete_runlist(f);
787 nvgpu_log_fn(g, "fail");
788 return err;
789}
790
791u32 gk20a_fifo_intr_0_error_mask(struct gk20a *g)
792{
793 u32 intr_0_error_mask =
794 fifo_intr_0_bind_error_pending_f() |
795 fifo_intr_0_sched_error_pending_f() |
796 fifo_intr_0_chsw_error_pending_f() |
797 fifo_intr_0_fb_flush_timeout_pending_f() |
798 fifo_intr_0_dropped_mmu_fault_pending_f() |
799 fifo_intr_0_mmu_fault_pending_f() |
800 fifo_intr_0_lb_error_pending_f() |
801 fifo_intr_0_pio_error_pending_f();
802
803 return intr_0_error_mask;
804}
805
806static u32 gk20a_fifo_intr_0_en_mask(struct gk20a *g)
807{
808 u32 intr_0_en_mask;
809
810 intr_0_en_mask = g->ops.fifo.intr_0_error_mask(g);
811
812 intr_0_en_mask |= fifo_intr_0_runlist_event_pending_f() |
813 fifo_intr_0_pbdma_intr_pending_f();
814
815 return intr_0_en_mask;
816}
817
818int gk20a_init_fifo_reset_enable_hw(struct gk20a *g)
819{
820 u32 intr_stall;
821 u32 mask;
822 u32 timeout;
823 unsigned int i;
824 u32 host_num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA);
825
826 nvgpu_log_fn(g, " ");
827
828 /* enable pmc pfifo */
829 g->ops.mc.reset(g, g->ops.mc.reset_mask(g, NVGPU_UNIT_FIFO));
830
831 nvgpu_cg_slcg_fifo_load_enable(g);
832
833 nvgpu_cg_blcg_fifo_load_enable(g);
834
835 timeout = gk20a_readl(g, fifo_fb_timeout_r());
836 timeout = set_field(timeout, fifo_fb_timeout_period_m(),
837 fifo_fb_timeout_period_max_f());
838 nvgpu_log_info(g, "fifo_fb_timeout reg val = 0x%08x", timeout);
839 gk20a_writel(g, fifo_fb_timeout_r(), timeout);
840
841 /* write pbdma timeout value */
842 for (i = 0; i < host_num_pbdma; i++) {
843 timeout = gk20a_readl(g, pbdma_timeout_r(i));
844 timeout = set_field(timeout, pbdma_timeout_period_m(),
845 pbdma_timeout_period_max_f());
846 nvgpu_log_info(g, "pbdma_timeout reg val = 0x%08x", timeout);
847 gk20a_writel(g, pbdma_timeout_r(i), timeout);
848 }
849 if (g->ops.fifo.apply_pb_timeout) {
850 g->ops.fifo.apply_pb_timeout(g);
851 }
852
853 if (g->ops.fifo.apply_ctxsw_timeout_intr) {
854 g->ops.fifo.apply_ctxsw_timeout_intr(g);
855 } else {
856 timeout = g->fifo_eng_timeout_us;
857 timeout = scale_ptimer(timeout,
858 ptimer_scalingfactor10x(g->ptimer_src_freq));
859 timeout |= fifo_eng_timeout_detection_enabled_f();
860 gk20a_writel(g, fifo_eng_timeout_r(), timeout);
861 }
862
863 /* clear and enable pbdma interrupt */
864 for (i = 0; i < host_num_pbdma; i++) {
865 gk20a_writel(g, pbdma_intr_0_r(i), 0xFFFFFFFF);
866 gk20a_writel(g, pbdma_intr_1_r(i), 0xFFFFFFFF);
867
868 intr_stall = gk20a_readl(g, pbdma_intr_stall_r(i));
869 intr_stall &= ~pbdma_intr_stall_lbreq_enabled_f();
870 gk20a_writel(g, pbdma_intr_stall_r(i), intr_stall);
871 nvgpu_log_info(g, "pbdma id:%u, intr_en_0 0x%08x", i, intr_stall);
872 gk20a_writel(g, pbdma_intr_en_0_r(i), intr_stall);
873 intr_stall = gk20a_readl(g, pbdma_intr_stall_1_r(i));
874 /*
875 * For bug 2082123
876 * Mask the unused HCE_RE_ILLEGAL_OP bit from the interrupt.
877 */
878 intr_stall &= ~pbdma_intr_stall_1_hce_illegal_op_enabled_f();
879 nvgpu_log_info(g, "pbdma id:%u, intr_en_1 0x%08x", i, intr_stall);
880 gk20a_writel(g, pbdma_intr_en_1_r(i), intr_stall);
881 }
882
883 /* reset runlist interrupts */
884 gk20a_writel(g, fifo_intr_runlist_r(), ~0);
885
886 /* clear and enable pfifo interrupt */
887 gk20a_writel(g, fifo_intr_0_r(), 0xFFFFFFFF);
888 mask = gk20a_fifo_intr_0_en_mask(g);
889 nvgpu_log_info(g, "fifo_intr_en_0 0x%08x", mask);
890 gk20a_writel(g, fifo_intr_en_0_r(), mask);
891 nvgpu_log_info(g, "fifo_intr_en_1 = 0x80000000");
892 gk20a_writel(g, fifo_intr_en_1_r(), 0x80000000);
893
894 nvgpu_log_fn(g, "done");
895
896 return 0;
897}
898
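/*
 * Software-side FIFO setup: allocate the channel, TSG, pbdma and engine
 * tables, then build the runlists.
 */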
899int gk20a_init_fifo_setup_sw_common(struct gk20a *g)
900{
901 struct fifo_gk20a *f = &g->fifo;
902 unsigned int chid, i;
903 int err = 0;
904
905 nvgpu_log_fn(g, " ");
906
907 f->g = g;
908
909 err = nvgpu_mutex_init(&f->intr.isr.mutex);
910 if (err) {
911 nvgpu_err(g, "failed to init isr.mutex");
912 return err;
913 }
914
915 err = nvgpu_mutex_init(&f->engines_reset_mutex);
916 if (err) {
917 nvgpu_err(g, "failed to init engines_reset_mutex");
918 return err;
919 }
920
921 g->ops.fifo.init_pbdma_intr_descs(f); /* just filling in data/tables */
922
923 f->num_channels = g->ops.fifo.get_num_fifos(g);
924 f->runlist_entry_size = g->ops.fifo.runlist_entry_size();
925 f->num_runlist_entries = fifo_eng_runlist_length_max_v();
926 f->num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA);
927 f->max_engines = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_ENGINES);
928
929 f->userd_entry_size = 1 << ram_userd_base_shift_v();
930
931 f->channel = nvgpu_vzalloc(g, f->num_channels * sizeof(*f->channel));
932 f->tsg = nvgpu_vzalloc(g, f->num_channels * sizeof(*f->tsg));
933 f->pbdma_map = nvgpu_kzalloc(g, f->num_pbdma * sizeof(*f->pbdma_map));
934 f->engine_info = nvgpu_kzalloc(g, f->max_engines *
935 sizeof(*f->engine_info));
936 f->active_engines_list = nvgpu_kzalloc(g, f->max_engines * sizeof(u32));
937
938 if (!(f->channel && f->tsg && f->pbdma_map && f->engine_info &&
939 f->active_engines_list)) {
940 err = -ENOMEM;
941 goto clean_up;
942 }
943 memset(f->active_engines_list, 0xff, (f->max_engines * sizeof(u32)));
944
945 /* pbdma map needs to be in place before calling engine info init */
946 for (i = 0; i < f->num_pbdma; ++i) {
947 f->pbdma_map[i] = gk20a_readl(g, fifo_pbdma_map_r(i));
948 }
949
950 g->ops.fifo.init_engine_info(f);
951
952 err = init_runlist(g, f);
953 if (err) {
954 nvgpu_err(g, "failed to init runlist");
955 goto clean_up;
956 }
957
958 nvgpu_init_list_node(&f->free_chs);
959
960 err = nvgpu_mutex_init(&f->free_chs_mutex);
961 if (err) {
962 nvgpu_err(g, "failed to init free_chs_mutex");
963 goto clean_up;
964 }
965
966 for (chid = 0; chid < f->num_channels; chid++) {
967 gk20a_init_channel_support(g, chid);
968 gk20a_init_tsg_support(g, chid);
969 }
970
971 err = nvgpu_mutex_init(&f->tsg_inuse_mutex);
972 if (err) {
973 nvgpu_err(g, "failed to init tsg_inuse_mutex");
974 goto clean_up;
975 }
976
977 f->remove_support = gk20a_remove_fifo_support;
978
979 f->deferred_reset_pending = false;
980
981 err = nvgpu_mutex_init(&f->deferred_reset_mutex);
982 if (err) {
983 nvgpu_err(g, "failed to init deferred_reset_mutex");
984 goto clean_up;
985 }
986
987 nvgpu_log_fn(g, "done");
988 return 0;
989
990clean_up:
991 nvgpu_err(g, "fail");
992
993 nvgpu_vfree(g, f->channel);
994 f->channel = NULL;
995 nvgpu_vfree(g, f->tsg);
996 f->tsg = NULL;
997 nvgpu_kfree(g, f->pbdma_map);
998 f->pbdma_map = NULL;
999 nvgpu_kfree(g, f->engine_info);
1000 f->engine_info = NULL;
1001 nvgpu_kfree(g, f->active_engines_list);
1002 f->active_engines_list = NULL;
1003
1004 return err;
1005}
1006
1007int gk20a_init_fifo_setup_sw(struct gk20a *g)
1008{
1009 struct fifo_gk20a *f = &g->fifo;
1010 unsigned int chid;
1011 u64 userd_base;
1012 int err = 0;
1013
1014 nvgpu_log_fn(g, " ");
1015
1016 if (f->sw_ready) {
1017 nvgpu_log_fn(g, "skip init");
1018 return 0;
1019 }
1020
1021 err = gk20a_init_fifo_setup_sw_common(g);
1022 if (err) {
1023 nvgpu_err(g, "fail: err: %d", err);
1024 return err;
1025 }
1026
1027 if (g->ops.mm.is_bar1_supported(g)) {
1028 err = nvgpu_dma_alloc_map_sys(g->mm.bar1.vm,
1029 f->userd_entry_size * f->num_channels,
1030 &f->userd);
1031 } else {
1032 err = nvgpu_dma_alloc_sys(g, f->userd_entry_size *
1033 f->num_channels, &f->userd);
1034 }
1035 if (err) {
1036 nvgpu_err(g, "userd memory allocation failed");
1037 goto clean_up;
1038 }
1039 nvgpu_log(g, gpu_dbg_map, "userd gpu va = 0x%llx", f->userd.gpu_va);
1040
1041 userd_base = nvgpu_mem_get_addr(g, &f->userd);
1042 for (chid = 0; chid < f->num_channels; chid++) {
1043 f->channel[chid].userd_iova = userd_base +
1044 chid * f->userd_entry_size;
1045 f->channel[chid].userd_gpu_va =
1046 f->userd.gpu_va + chid * f->userd_entry_size;
1047 }
1048
1049 err = nvgpu_channel_worker_init(g);
1050 if (err) {
1051 goto clean_up;
1052 }
1053
1054 f->sw_ready = true;
1055
1056 nvgpu_log_fn(g, "done");
1057 return 0;
1058
1059clean_up:
1060 nvgpu_log_fn(g, "fail");
1061 if (nvgpu_mem_is_valid(&f->userd)) {
1062 if (g->ops.mm.is_bar1_supported(g)) {
1063 nvgpu_dma_unmap_free(g->mm.bar1.vm, &f->userd);
1064 } else {
1065 nvgpu_dma_free(g, &f->userd);
1066 }
1067 }
1068
1069 return err;
1070}
1071
1072void gk20a_fifo_handle_runlist_event(struct gk20a *g)
1073{
1074 u32 runlist_event = gk20a_readl(g, fifo_intr_runlist_r());
1075
1076 nvgpu_log(g, gpu_dbg_intr, "runlist event %08x",
1077 runlist_event);
1078
1079 gk20a_writel(g, fifo_intr_runlist_r(), runlist_event);
1080}
1081
1082int gk20a_init_fifo_setup_hw(struct gk20a *g)
1083{
1084 struct fifo_gk20a *f = &g->fifo;
1085
1086 nvgpu_log_fn(g, " ");
1087
1088 /* test write, read through bar1 @ userd region before
1089 * turning on the snooping */
1090 {
1091 struct fifo_gk20a *f = &g->fifo;
1092 u32 v, v1 = 0x33, v2 = 0x55;
1093
1094 u32 bar1_vaddr = f->userd.gpu_va;
1095 volatile u32 *cpu_vaddr = f->userd.cpu_va;
1096
1097 nvgpu_log_info(g, "test bar1 @ vaddr 0x%x",
1098 bar1_vaddr);
1099
1100 v = gk20a_bar1_readl(g, bar1_vaddr);
1101
1102 *cpu_vaddr = v1;
1103 nvgpu_mb();
1104
1105 if (v1 != gk20a_bar1_readl(g, bar1_vaddr)) {
1106			nvgpu_err(g, "bar1 broken @ gk20a: CPU wrote 0x%x, "
1107				"GPU read 0x%x", *cpu_vaddr, gk20a_bar1_readl(g, bar1_vaddr));
1108 return -EINVAL;
1109 }
1110
1111 gk20a_bar1_writel(g, bar1_vaddr, v2);
1112
1113 if (v2 != gk20a_bar1_readl(g, bar1_vaddr)) {
1114			nvgpu_err(g, "bar1 broken @ gk20a: GPU wrote 0x%x, "
1115				"CPU read 0x%x", gk20a_bar1_readl(g, bar1_vaddr), *cpu_vaddr);
1116 return -EINVAL;
1117 }
1118
1119 /* is it visible to the cpu? */
1120 if (*cpu_vaddr != v2) {
1121 nvgpu_err(g,
1122 "cpu didn't see bar1 write @ %p!",
1123 cpu_vaddr);
1124 }
1125
1126 /* put it back */
1127 gk20a_bar1_writel(g, bar1_vaddr, v);
1128 }
1129
1130 /*XXX all manner of flushes and caching worries, etc */
1131
1132 /* set the base for the userd region now */
1133 gk20a_writel(g, fifo_bar1_base_r(),
1134 fifo_bar1_base_ptr_f(f->userd.gpu_va >> 12) |
1135 fifo_bar1_base_valid_true_f());
1136
1137 nvgpu_log_fn(g, "done");
1138
1139 return 0;
1140}
1141
1142int gk20a_init_fifo_support(struct gk20a *g)
1143{
1144 u32 err;
1145
1146 err = g->ops.fifo.setup_sw(g);
1147 if (err) {
1148 return err;
1149 }
1150
1151 if (g->ops.fifo.init_fifo_setup_hw) {
1152 err = g->ops.fifo.init_fifo_setup_hw(g);
1153 }
1154 if (err) {
1155 return err;
1156 }
1157
1158 return err;
1159}
1160
1161/* return with a reference to the channel, caller must put it back */
1162struct channel_gk20a *
1163gk20a_refch_from_inst_ptr(struct gk20a *g, u64 inst_ptr)
1164{
1165 struct fifo_gk20a *f = &g->fifo;
1166 unsigned int ci;
1167 if (unlikely(!f->channel)) {
1168 return NULL;
1169 }
1170 for (ci = 0; ci < f->num_channels; ci++) {
1171 struct channel_gk20a *ch;
1172 u64 ch_inst_ptr;
1173
1174 ch = gk20a_channel_from_id(g, ci);
1175 /* only alive channels are searched */
1176 if (!ch) {
1177 continue;
1178 }
1179
1180 ch_inst_ptr = nvgpu_inst_block_addr(g, &ch->inst_block);
1181 if (inst_ptr == ch_inst_ptr) {
1182 return ch;
1183 }
1184
1185 gk20a_channel_put(ch);
1186 }
1187 return NULL;
1188}
1189
1190/* fault info/descriptions.
1191 * tbd: move to setup
1192 */
1193static const char * const gk20a_fault_type_descs[] = {
1194 "pde", /*fifo_intr_mmu_fault_info_type_pde_v() == 0 */
1195 "pde size",
1196 "pte",
1197 "va limit viol",
1198 "unbound inst",
1199 "priv viol",
1200 "ro viol",
1201 "wo viol",
1202 "pitch mask",
1203 "work creation",
1204 "bad aperture",
1205 "compression failure",
1206 "bad kind",
1207 "region viol",
1208 "dual ptes",
1209 "poisoned",
1210};
1211/* engine subid descriptions */
1212static const char * const engine_subid_descs[] = {
1213 "gpc",
1214 "hub",
1215};
1216
1217static const char * const gk20a_hub_client_descs[] = {
1218 "vip", "ce0", "ce1", "dniso", "fe", "fecs", "host", "host cpu",
1219 "host cpu nb", "iso", "mmu", "mspdec", "msppp", "msvld",
1220 "niso", "p2p", "pd", "perf", "pmu", "raster twod", "scc",
1221 "scc nb", "sec", "ssync", "gr copy", "xv", "mmu nb",
1222 "msenc", "d falcon", "sked", "a falcon", "n/a",
1223};
1224
1225static const char * const gk20a_gpc_client_descs[] = {
1226 "l1 0", "t1 0", "pe 0",
1227 "l1 1", "t1 1", "pe 1",
1228 "l1 2", "t1 2", "pe 2",
1229 "l1 3", "t1 3", "pe 3",
1230 "rast", "gcc", "gpccs",
1231 "prop 0", "prop 1", "prop 2", "prop 3",
1232 "l1 4", "t1 4", "pe 4",
1233 "l1 5", "t1 5", "pe 5",
1234 "l1 6", "t1 6", "pe 6",
1235 "l1 7", "t1 7", "pe 7",
1236};
1237
1238static const char * const does_not_exist[] = {
1239 "does not exist"
1240};
1241
1242/* fill in mmu fault desc */
1243void gk20a_fifo_get_mmu_fault_desc(struct mmu_fault_info *mmfault)
1244{
1245 if (mmfault->fault_type >= ARRAY_SIZE(gk20a_fault_type_descs)) {
1246 WARN_ON(mmfault->fault_type >=
1247 ARRAY_SIZE(gk20a_fault_type_descs));
1248 } else {
1249 mmfault->fault_type_desc =
1250 gk20a_fault_type_descs[mmfault->fault_type];
1251 }
1252}
1253
1254/* fill in mmu fault client description */
1255void gk20a_fifo_get_mmu_fault_client_desc(struct mmu_fault_info *mmfault)
1256{
1257 if (mmfault->client_id >= ARRAY_SIZE(gk20a_hub_client_descs)) {
1258 WARN_ON(mmfault->client_id >=
1259 ARRAY_SIZE(gk20a_hub_client_descs));
1260 } else {
1261 mmfault->client_id_desc =
1262 gk20a_hub_client_descs[mmfault->client_id];
1263 }
1264}
1265
1266/* fill in mmu fault gpc description */
1267void gk20a_fifo_get_mmu_fault_gpc_desc(struct mmu_fault_info *mmfault)
1268{
1269 if (mmfault->client_id >= ARRAY_SIZE(gk20a_gpc_client_descs)) {
1270 WARN_ON(mmfault->client_id >=
1271 ARRAY_SIZE(gk20a_gpc_client_descs));
1272 } else {
1273 mmfault->client_id_desc =
1274 gk20a_gpc_client_descs[mmfault->client_id];
1275 }
1276}
1277
1278static void get_exception_mmu_fault_info(struct gk20a *g, u32 mmu_fault_id,
1279 struct mmu_fault_info *mmfault)
1280{
1281 g->ops.fifo.get_mmu_fault_info(g, mmu_fault_id, mmfault);
1282
1283 /* parse info */
1284 mmfault->fault_type_desc = does_not_exist[0];
1285 if (g->ops.fifo.get_mmu_fault_desc) {
1286 g->ops.fifo.get_mmu_fault_desc(mmfault);
1287 }
1288
1289 if (mmfault->client_type >= ARRAY_SIZE(engine_subid_descs)) {
1290 WARN_ON(mmfault->client_type >= ARRAY_SIZE(engine_subid_descs));
1291 mmfault->client_type_desc = does_not_exist[0];
1292 } else {
1293 mmfault->client_type_desc =
1294 engine_subid_descs[mmfault->client_type];
1295 }
1296
1297 mmfault->client_id_desc = does_not_exist[0];
1298 if ((mmfault->client_type ==
1299 fifo_intr_mmu_fault_info_engine_subid_hub_v())
1300 && g->ops.fifo.get_mmu_fault_client_desc) {
1301 g->ops.fifo.get_mmu_fault_client_desc(mmfault);
1302 } else if ((mmfault->client_type ==
1303 fifo_intr_mmu_fault_info_engine_subid_gpc_v())
1304 && g->ops.fifo.get_mmu_fault_gpc_desc) {
1305 g->ops.fifo.get_mmu_fault_gpc_desc(mmfault);
1306 }
1307}
1308
1309/* reads info from hardware and fills in mmu fault info record */
1310void gk20a_fifo_get_mmu_fault_info(struct gk20a *g, u32 mmu_fault_id,
1311 struct mmu_fault_info *mmfault)
1312{
1313 u32 fault_info;
1314 u32 addr_lo, addr_hi;
1315
1316 nvgpu_log_fn(g, "mmu_fault_id %d", mmu_fault_id);
1317
1318 memset(mmfault, 0, sizeof(*mmfault));
1319
1320 fault_info = gk20a_readl(g,
1321 fifo_intr_mmu_fault_info_r(mmu_fault_id));
1322 mmfault->fault_type =
1323 fifo_intr_mmu_fault_info_type_v(fault_info);
1324 mmfault->access_type =
1325 fifo_intr_mmu_fault_info_write_v(fault_info);
1326 mmfault->client_type =
1327 fifo_intr_mmu_fault_info_engine_subid_v(fault_info);
1328 mmfault->client_id =
1329 fifo_intr_mmu_fault_info_client_v(fault_info);
1330
1331 addr_lo = gk20a_readl(g, fifo_intr_mmu_fault_lo_r(mmu_fault_id));
1332 addr_hi = gk20a_readl(g, fifo_intr_mmu_fault_hi_r(mmu_fault_id));
1333 mmfault->fault_addr = hi32_lo32_to_u64(addr_hi, addr_lo);
1334	/* note: ignoring aperture on gk20a... */
1335 mmfault->inst_ptr = fifo_intr_mmu_fault_inst_ptr_v(
1336 gk20a_readl(g, fifo_intr_mmu_fault_inst_r(mmu_fault_id)));
1337 /* note: inst_ptr is a 40b phys addr. */
1338 mmfault->inst_ptr <<= fifo_intr_mmu_fault_inst_ptr_align_shift_v();
1339}
1340
1341void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id)
1342{
1343 u32 engine_enum = ENGINE_INVAL_GK20A;
1344 struct fifo_engine_info_gk20a *engine_info;
1345
1346 nvgpu_log_fn(g, " ");
1347
1348 if (!g) {
1349 return;
1350 }
1351
1352 engine_info = gk20a_fifo_get_engine_info(g, engine_id);
1353
1354 if (engine_info) {
1355 engine_enum = engine_info->engine_enum;
1356 }
1357
1358 if (engine_enum == ENGINE_INVAL_GK20A) {
1359 nvgpu_err(g, "unsupported engine_id %d", engine_id);
1360 }
1361
1362 if (engine_enum == ENGINE_GR_GK20A) {
1363 if (g->support_pmu) {
1364 if (nvgpu_pg_elpg_disable(g) != 0 ) {
1365 nvgpu_err(g, "failed to set disable elpg");
1366 }
1367 }
1368
1369#ifdef CONFIG_GK20A_CTXSW_TRACE
1370 /*
1371 * Resetting engine will alter read/write index. Need to flush
1372 * circular buffer before re-enabling FECS.
1373 */
1374 if (g->ops.fecs_trace.reset)
1375 g->ops.fecs_trace.reset(g);
1376#endif
1377 if (!nvgpu_platform_is_simulation(g)) {
1378 /*HALT_PIPELINE method, halt GR engine*/
1379 if (gr_gk20a_halt_pipe(g)) {
1380 nvgpu_err(g, "failed to HALT gr pipe");
1381 }
1382 /*
1383 * resetting engine using mc_enable_r() is not
1384 * enough, we do full init sequence
1385 */
1386 nvgpu_log(g, gpu_dbg_info, "resetting gr engine");
1387 gk20a_gr_reset(g);
1388 } else {
1389 nvgpu_log(g, gpu_dbg_info,
1390 "HALT gr pipe not supported and "
1391 "gr cannot be reset without halting gr pipe");
1392 }
1393 if (g->support_pmu) {
1394 if (nvgpu_pg_elpg_enable(g) != 0 ) {
1395 nvgpu_err(g, "failed to set enable elpg");
1396 }
1397 }
1398 }
1399 if ((engine_enum == ENGINE_GRCE_GK20A) ||
1400 (engine_enum == ENGINE_ASYNC_CE_GK20A)) {
1401 g->ops.mc.reset(g, engine_info->reset_mask);
1402 }
1403}
1404
1405static void gk20a_fifo_handle_chsw_fault(struct gk20a *g)
1406{
1407 u32 intr;
1408
1409 intr = gk20a_readl(g, fifo_intr_chsw_error_r());
1410 nvgpu_err(g, "chsw: %08x", intr);
1411 gk20a_fecs_dump_falcon_stats(g);
1412 gk20a_writel(g, fifo_intr_chsw_error_r(), intr);
1413}
1414
1415static void gk20a_fifo_handle_dropped_mmu_fault(struct gk20a *g)
1416{
1417 u32 fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r());
1418 nvgpu_err(g, "dropped mmu fault (0x%08x)", fault_id);
1419}
1420
1421bool gk20a_is_fault_engine_subid_gpc(struct gk20a *g, u32 engine_subid)
1422{
1423 return (engine_subid == fifo_intr_mmu_fault_info_engine_subid_gpc_v());
1424}
1425
1426bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id,
1427 u32 engine_subid, bool fake_fault)
1428{
1429 u32 engine_enum = ENGINE_INVAL_GK20A;
1430 struct fifo_engine_info_gk20a *engine_info;
1431
1432 if (!g) {
1433 return false;
1434 }
1435
1436 engine_info = gk20a_fifo_get_engine_info(g, engine_id);
1437
1438 if (engine_info) {
1439 engine_enum = engine_info->engine_enum;
1440 }
1441
1442 if (engine_enum == ENGINE_INVAL_GK20A) {
1443 return false;
1444 }
1445
1446 /* channel recovery is only deferred if an sm debugger
1447	   is attached and MMU debug mode is enabled */
1448 if (!g->ops.gr.sm_debugger_attached(g) ||
1449 !g->ops.fb.is_debug_mode_enabled(g)) {
1450 return false;
1451 }
1452
1453 /* if this fault is fake (due to RC recovery), don't defer recovery */
1454 if (fake_fault) {
1455 return false;
1456 }
1457
1458 if (engine_enum != ENGINE_GR_GK20A) {
1459 return false;
1460 }
1461
1462 return g->ops.fifo.is_fault_engine_subid_gpc(g, engine_subid);
1463}
1464
1465/* caller must hold a channel reference */
1466static bool gk20a_fifo_ch_timeout_debug_dump_state(struct gk20a *g,
1467 struct channel_gk20a *refch)
1468{
1469 bool verbose = false;
1470 if (!refch) {
1471 return verbose;
1472 }
1473
1474 if (nvgpu_is_error_notifier_set(refch,
1475 NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT)) {
1476 verbose = refch->timeout_debug_dump;
1477 }
1478
1479 return verbose;
1480}
1481
1482/* caller must hold a channel reference */
1483static void gk20a_fifo_set_has_timedout_and_wake_up_wqs(struct gk20a *g,
1484 struct channel_gk20a *refch)
1485{
1486 if (refch) {
1487 /* mark channel as faulted */
1488 gk20a_channel_set_timedout(refch);
1489
1490 /* unblock pending waits */
1491 nvgpu_cond_broadcast_interruptible(&refch->semaphore_wq);
1492 nvgpu_cond_broadcast_interruptible(&refch->notifier_wq);
1493 }
1494}
1495
1496/* caller must hold a channel reference */
1497bool gk20a_fifo_error_ch(struct gk20a *g,
1498 struct channel_gk20a *refch)
1499{
1500 bool verbose;
1501
1502 verbose = gk20a_fifo_ch_timeout_debug_dump_state(g, refch);
1503 gk20a_fifo_set_has_timedout_and_wake_up_wqs(g, refch);
1504
1505 return verbose;
1506}
1507
1508bool gk20a_fifo_error_tsg(struct gk20a *g,
1509 struct tsg_gk20a *tsg)
1510{
1511 struct channel_gk20a *ch = NULL;
1512 bool verbose = false;
1513
1514 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
1515 nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
1516 if (gk20a_channel_get(ch)) {
1517 if (gk20a_fifo_error_ch(g, ch)) {
1518 verbose = true;
1519 }
1520 gk20a_channel_put(ch);
1521 }
1522 }
1523 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
1524
1525 return verbose;
1526
1527}
1528/* caller must hold a channel reference */
1529void gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
1530 struct channel_gk20a *refch)
1531{
1532 nvgpu_err(g,
1533 "channel %d generated a mmu fault", refch->chid);
1534 g->ops.fifo.set_error_notifier(refch,
1535 NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT);
1536}
1537
1538void gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
1539 struct tsg_gk20a *tsg)
1540{
1541 struct channel_gk20a *ch = NULL;
1542
1543 nvgpu_err(g,
1544 "TSG %d generated a mmu fault", tsg->tsgid);
1545
1546 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
1547 nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
1548 if (gk20a_channel_get(ch)) {
1549 gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
1550 gk20a_channel_put(ch);
1551 }
1552 }
1553 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
1554
1555}
1556
1557void gk20a_fifo_abort_tsg(struct gk20a *g, struct tsg_gk20a *tsg, bool preempt)
1558{
1559 struct channel_gk20a *ch = NULL;
1560
1561 nvgpu_log_fn(g, " ");
1562
1563 g->ops.fifo.disable_tsg(tsg);
1564
1565 if (preempt) {
1566 g->ops.fifo.preempt_tsg(g, tsg);
1567 }
1568
1569 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
1570 nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
1571 if (gk20a_channel_get(ch)) {
1572 gk20a_channel_set_timedout(ch);
1573 if (ch->g->ops.fifo.ch_abort_clean_up) {
1574 ch->g->ops.fifo.ch_abort_clean_up(ch);
1575 }
1576 gk20a_channel_put(ch);
1577 }
1578 }
1579 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
1580}
1581
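/*
 * Handle engine resets that were deferred while an SM debugger was attached
 * (done during channel free); runs with ctxsw disabled so engine assignments
 * cannot change underneath us.
 */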
1582int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch)
1583{
1584 unsigned long engine_id, engines = 0U;
1585 struct tsg_gk20a *tsg;
1586 bool deferred_reset_pending;
1587 struct fifo_gk20a *f = &g->fifo;
1588
1589 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
1590
1591 nvgpu_mutex_acquire(&f->deferred_reset_mutex);
1592 deferred_reset_pending = g->fifo.deferred_reset_pending;
1593 nvgpu_mutex_release(&f->deferred_reset_mutex);
1594
1595 if (!deferred_reset_pending) {
1596 nvgpu_mutex_release(&g->dbg_sessions_lock);
1597 return 0;
1598 }
1599
1600 gr_gk20a_disable_ctxsw(g);
1601
1602 tsg = tsg_gk20a_from_ch(ch);
1603 if (tsg != NULL) {
1604 engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true);
1605 } else {
1606 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
1607 }
1608
1609 if (engines == 0U) {
1610 goto clean_up;
1611 }
1612
1613 /*
1614 * If deferred reset is set for an engine, and channel is running
1615 * on that engine, reset it
1616 */
1617 for_each_set_bit(engine_id, &g->fifo.deferred_fault_engines, 32) {
1618 if (BIT(engine_id) & engines) {
1619 gk20a_fifo_reset_engine(g, engine_id);
1620 }
1621 }
1622
1623 nvgpu_mutex_acquire(&f->deferred_reset_mutex);
1624 g->fifo.deferred_fault_engines = 0;
1625 g->fifo.deferred_reset_pending = false;
1626 nvgpu_mutex_release(&f->deferred_reset_mutex);
1627
1628clean_up:
1629 gr_gk20a_enable_ctxsw(g);
1630 nvgpu_mutex_release(&g->dbg_sessions_lock);
1631
1632 return 0;
1633}
1634
1635static bool gk20a_fifo_handle_mmu_fault_locked(
1636 struct gk20a *g,
1637 u32 mmu_fault_engines, /* queried from HW if 0 */
1638 u32 hw_id, /* queried from HW if ~(u32)0 OR mmu_fault_engines == 0*/
1639 bool id_is_tsg)
1640{
1641 bool fake_fault;
1642 unsigned long fault_id;
1643 unsigned long engine_mmu_fault_id;
1644 bool verbose = true;
1645 u32 grfifo_ctl;
1646
1647 bool deferred_reset_pending = false;
1648 struct fifo_gk20a *f = &g->fifo;
1649
1650 nvgpu_log_fn(g, " ");
1651
1652 /* Disable power management */
1653 if (g->support_pmu) {
1654 if (nvgpu_cg_pg_disable(g) != 0) {
1655 nvgpu_warn(g, "fail to disable power mgmt");
1656 }
1657 }
1658
1659 /* Disable fifo access */
1660 grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
1661 grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
1662 grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
1663
1664 gk20a_writel(g, gr_gpfifo_ctl_r(),
1665 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
1666 gr_gpfifo_ctl_semaphore_access_f(0));
1667
1668 if (mmu_fault_engines) {
1669 fault_id = mmu_fault_engines;
1670 fake_fault = true;
1671 } else {
1672 fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r());
1673 fake_fault = false;
1674 gk20a_debug_dump(g);
1675 }
1676
1677 nvgpu_mutex_acquire(&f->deferred_reset_mutex);
1678 g->fifo.deferred_reset_pending = false;
1679 nvgpu_mutex_release(&f->deferred_reset_mutex);
1680
1681 /* go through all faulted engines */
1682 for_each_set_bit(engine_mmu_fault_id, &fault_id, 32) {
1683 /* bits in fifo_intr_mmu_fault_id_r do not correspond 1:1 to
1684 * engines. Convert engine_mmu_id to engine_id */
1685 u32 engine_id = gk20a_mmu_id_to_engine_id(g,
1686 engine_mmu_fault_id);
1687 struct mmu_fault_info mmfault_info;
1688 struct channel_gk20a *ch = NULL;
1689 struct tsg_gk20a *tsg = NULL;
1690 struct channel_gk20a *refch = NULL;
1691 /* read and parse engine status */
1692 u32 status = gk20a_readl(g, fifo_engine_status_r(engine_id));
1693 u32 ctx_status = fifo_engine_status_ctx_status_v(status);
1694 bool ctxsw = (ctx_status ==
1695 fifo_engine_status_ctx_status_ctxsw_switch_v()
1696 || ctx_status ==
1697 fifo_engine_status_ctx_status_ctxsw_save_v()
1698 || ctx_status ==
1699 fifo_engine_status_ctx_status_ctxsw_load_v());
1700
1701 get_exception_mmu_fault_info(g, engine_mmu_fault_id,
1702 &mmfault_info);
1703 trace_gk20a_mmu_fault(mmfault_info.fault_addr,
1704 mmfault_info.fault_type,
1705 mmfault_info.access_type,
1706 mmfault_info.inst_ptr,
1707 engine_id,
1708 mmfault_info.client_type_desc,
1709 mmfault_info.client_id_desc,
1710 mmfault_info.fault_type_desc);
1711 nvgpu_err(g, "%s mmu fault on engine %d, "
1712 "engine subid %d (%s), client %d (%s), "
1713 "addr 0x%llx, type %d (%s), access_type 0x%08x,"
1714 "inst_ptr 0x%llx",
1715 fake_fault ? "fake" : "",
1716 engine_id,
1717 mmfault_info.client_type,
1718 mmfault_info.client_type_desc,
1719 mmfault_info.client_id, mmfault_info.client_id_desc,
1720 mmfault_info.fault_addr,
1721 mmfault_info.fault_type,
1722 mmfault_info.fault_type_desc,
1723 mmfault_info.access_type, mmfault_info.inst_ptr);
1724
1725 if (ctxsw) {
1726 gk20a_fecs_dump_falcon_stats(g);
1727 nvgpu_err(g, "gr_status_r : 0x%x",
1728 gk20a_readl(g, gr_status_r()));
1729 }
1730
1731 /* get the channel/TSG */
1732 if (fake_fault) {
1733 /* use next_id if context load is failing */
1734 u32 id, type;
1735
1736 if (hw_id == ~(u32)0) {
1737 id = (ctx_status ==
1738 fifo_engine_status_ctx_status_ctxsw_load_v()) ?
1739 fifo_engine_status_next_id_v(status) :
1740 fifo_engine_status_id_v(status);
1741 type = (ctx_status ==
1742 fifo_engine_status_ctx_status_ctxsw_load_v()) ?
1743 fifo_engine_status_next_id_type_v(status) :
1744 fifo_engine_status_id_type_v(status);
1745 } else {
1746 id = hw_id;
1747 type = id_is_tsg ?
1748 fifo_engine_status_id_type_tsgid_v() :
1749 fifo_engine_status_id_type_chid_v();
1750 }
1751
1752 if (type == fifo_engine_status_id_type_tsgid_v()) {
1753 tsg = &g->fifo.tsg[id];
1754 } else if (type == fifo_engine_status_id_type_chid_v()) {
1755 ch = &g->fifo.channel[id];
1756 refch = gk20a_channel_get(ch);
1757 if (refch != NULL) {
1758 tsg = tsg_gk20a_from_ch(refch);
1759 }
1760 }
1761 } else {
1762 /* read channel based on instruction pointer */
1763 ch = gk20a_refch_from_inst_ptr(g,
1764 mmfault_info.inst_ptr);
1765 refch = ch;
1766 if (refch != NULL) {
1767 tsg = tsg_gk20a_from_ch(refch);
1768 }
1769 }
1770
1771 /* check if engine reset should be deferred */
1772 if (engine_id != FIFO_INVAL_ENGINE_ID) {
1773 bool defer = gk20a_fifo_should_defer_engine_reset(g,
1774 engine_id, mmfault_info.client_type,
1775 fake_fault);
1776 if ((ch || tsg) && defer) {
1777 g->fifo.deferred_fault_engines |= BIT(engine_id);
1778
1779 /* handled during channel free */
1780 nvgpu_mutex_acquire(&f->deferred_reset_mutex);
1781 g->fifo.deferred_reset_pending = true;
1782 nvgpu_mutex_release(&f->deferred_reset_mutex);
1783
1784 deferred_reset_pending = true;
1785
1786 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
1787 "sm debugger attached,"
1788 " deferring channel recovery to channel free");
1789 } else {
1790 gk20a_fifo_reset_engine(g, engine_id);
1791 }
1792 }
1793
1794#ifdef CONFIG_GK20A_CTXSW_TRACE
1795 if (tsg) {
1796 gk20a_ctxsw_trace_tsg_reset(g, tsg);
1797 }
1798#endif
1799 /*
1800 * Disable the channel/TSG from hw and increment syncpoints.
1801 */
1802 if (tsg) {
1803 if (deferred_reset_pending) {
1804 gk20a_disable_tsg(tsg);
1805 } else {
1806 if (!fake_fault) {
1807 gk20a_fifo_set_ctx_mmu_error_tsg(g,
1808 tsg);
1809 }
1810 verbose = gk20a_fifo_error_tsg(g, tsg);
1811 gk20a_fifo_abort_tsg(g, tsg, false);
1812 }
1813
1814 /* put back the ref taken early above */
1815 if (refch) {
1816 gk20a_channel_put(ch);
1817 }
1818 } else if (refch != NULL) {
1819 nvgpu_err(g, "mmu error in unbound channel %d",
1820 ch->chid);
1821 gk20a_channel_put(ch);
1822 } else if (mmfault_info.inst_ptr ==
1823 nvgpu_inst_block_addr(g, &g->mm.bar1.inst_block)) {
1824 nvgpu_err(g, "mmu fault from bar1");
1825 } else if (mmfault_info.inst_ptr ==
1826 nvgpu_inst_block_addr(g, &g->mm.pmu.inst_block)) {
1827 nvgpu_err(g, "mmu fault from pmu");
1828 } else {
1829 nvgpu_err(g, "couldn't locate channel for mmu fault");
1830 }
1831 }
1832
1833 /* clear interrupt */
1834 gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id);
1835
1836 /* resume scheduler */
1837 gk20a_writel(g, fifo_error_sched_disable_r(),
1838 gk20a_readl(g, fifo_error_sched_disable_r()));
1839
1840 /* Re-enable fifo access */
1841 gk20a_writel(g, gr_gpfifo_ctl_r(),
1842 gr_gpfifo_ctl_access_enabled_f() |
1843 gr_gpfifo_ctl_semaphore_access_enabled_f());
1844
1845 /* It is safe to enable ELPG again. */
1846 if (g->support_pmu) {
1847 if (nvgpu_cg_pg_enable(g) != 0) {
1848 nvgpu_warn(g, "fail to enable power mgmt");
1849 }
1850 }
1851
1852 return verbose;
1853}
1854
1855static bool gk20a_fifo_handle_mmu_fault(
1856 struct gk20a *g,
1857 u32 mmu_fault_engines, /* queried from HW if 0 */
1858 u32 hw_id, /* queried from HW if ~(u32)0 OR mmu_fault_engines == 0*/
1859 bool id_is_tsg)
1860{
1861 u32 rlid;
1862 bool verbose;
1863
1864 nvgpu_log_fn(g, " ");
1865
1866 nvgpu_log_info(g, "acquire engines_reset_mutex");
1867 nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
1868
1869 nvgpu_log_info(g, "acquire runlist_lock for all runlists");
1870 for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
1871 nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock);
1872 }
1873
1874 verbose = gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines,
1875 hw_id, id_is_tsg);
1876
1877 nvgpu_log_info(g, "release runlist_lock for all runlists");
1878 for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
1879 nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock);
1880 }
1881
1882 nvgpu_log_info(g, "release engines_reset_mutex");
1883 nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
1884
1885 return verbose;
1886}
1887
1888static void gk20a_fifo_get_faulty_id_type(struct gk20a *g, int engine_id,
1889 u32 *id, u32 *type)
1890{
1891 u32 status = gk20a_readl(g, fifo_engine_status_r(engine_id));
1892 u32 ctx_status = fifo_engine_status_ctx_status_v(status);
1893
1894 /* use next_id if context load is failing */
1895 *id = (ctx_status ==
1896 fifo_engine_status_ctx_status_ctxsw_load_v()) ?
1897 fifo_engine_status_next_id_v(status) :
1898 fifo_engine_status_id_v(status);
1899
1900 *type = (ctx_status ==
1901 fifo_engine_status_ctx_status_ctxsw_load_v()) ?
1902 fifo_engine_status_next_id_type_v(status) :
1903 fifo_engine_status_id_type_v(status);
1904}
1905
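/*
 * Return a bitmask of busy engines whose current context (or next context,
 * while a ctxsw load is in progress) matches the given channel/TSG id.
 */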
1906static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg)
1907{
1908 unsigned int i;
1909 u32 engines = 0;
1910
1911 for (i = 0; i < g->fifo.num_engines; i++) {
1912 u32 active_engine_id = g->fifo.active_engines_list[i];
1913 u32 status = gk20a_readl(g, fifo_engine_status_r(active_engine_id));
1914 u32 ctx_status =
1915 fifo_engine_status_ctx_status_v(status);
1916 u32 ctx_id = (ctx_status ==
1917 fifo_engine_status_ctx_status_ctxsw_load_v()) ?
1918 fifo_engine_status_next_id_v(status) :
1919 fifo_engine_status_id_v(status);
1920 u32 type = (ctx_status ==
1921 fifo_engine_status_ctx_status_ctxsw_load_v()) ?
1922 fifo_engine_status_next_id_type_v(status) :
1923 fifo_engine_status_id_type_v(status);
1924 bool busy = fifo_engine_status_engine_v(status) ==
1925 fifo_engine_status_engine_busy_v();
1926 if (busy && ctx_id == id) {
1927 if ((is_tsg && type ==
1928 fifo_engine_status_id_type_tsgid_v()) ||
1929 (!is_tsg && type ==
1930 fifo_engine_status_id_type_chid_v())) {
1931 engines |= BIT(active_engine_id);
1932 }
1933 }
1934 }
1935
1936 return engines;
1937}
1938
1939void gk20a_fifo_recover_ch(struct gk20a *g, struct channel_gk20a *ch,
1940 bool verbose, u32 rc_type)
1941{
1942 u32 engines;
1943
1944 /* stop context switching to prevent engine assignments from
1945 changing until channel is recovered */
1946 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
1947 gr_gk20a_disable_ctxsw(g);
1948
1949 engines = gk20a_fifo_engines_on_id(g, ch->chid, false);
1950
1951 if (engines) {
1952 gk20a_fifo_recover(g, engines, ch->chid, false, true, verbose,
1953 rc_type);
1954 } else {
1955 gk20a_channel_abort(ch, false);
1956
1957 if (gk20a_fifo_error_ch(g, ch)) {
1958 gk20a_debug_dump(g);
1959 }
1960 }
1961
1962 gr_gk20a_enable_ctxsw(g);
1963 nvgpu_mutex_release(&g->dbg_sessions_lock);
1964}
1965
1966void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
1967 bool verbose, u32 rc_type)
1968{
1969 u32 engines = 0U;
1970 int err;
1971
1972 /* stop context switching to prevent engine assignments from
1973 changing until TSG is recovered */
1974 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
1975
1976 /* disable tsg so that it does not get scheduled again */
1977 g->ops.fifo.disable_tsg(tsg);
1978
1979 /*
1980 * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
1981 * fifo_engine_status register. Also while the engine is held in reset
1982 * h/w passes busy/idle straight through. fifo_engine_status registers
1983 * are correct in that there is no context switch outstanding
1984 * as the CTXSW is aborted when reset is asserted.
1985 */
1986 nvgpu_log_info(g, "acquire engines_reset_mutex");
1987 nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
1988
1989 /*
1990 * stop context switching to prevent engine assignments from
1991 * changing until engine status is checked to make sure tsg
1992 * being recovered is not loaded on the engines
1993 */
1994 err = gr_gk20a_disable_ctxsw(g);
1995
1996 if (err != 0) {
1997 /* if failed to disable ctxsw, just abort tsg */
1998 nvgpu_err(g, "failed to disable ctxsw");
1999 } else {
2000 /* recover engines if tsg is loaded on the engines */
2001 engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true);
2002
2003 /*
2004		 * It is ok to enable ctxsw before the tsg is recovered. If engines
2005		 * is 0, no engine recovery is needed; if it is non-zero,
2006		 * gk20a_fifo_recover will call get_engines_mask_on_id again.
2007		 * By that time, if the tsg is no longer on the engine, the engine
2008		 * need not be reset.
2009 */
2010 err = gr_gk20a_enable_ctxsw(g);
2011 if (err != 0) {
2012 nvgpu_err(g, "failed to enable ctxsw");
2013 }
2014 }
2015
2016 nvgpu_log_info(g, "release engines_reset_mutex");
2017 nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
2018
2019 if (engines) {
2020 gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose,
2021 rc_type);
2022 } else {
2023 if (gk20a_fifo_error_tsg(g, tsg) && verbose) {
2024 gk20a_debug_dump(g);
2025 }
2026
2027 gk20a_fifo_abort_tsg(g, tsg, false);
2028 }
2029
2030 nvgpu_mutex_release(&g->dbg_sessions_lock);
2031}
2032
2033void gk20a_fifo_teardown_mask_intr(struct gk20a *g)
2034{
2035 u32 val;
2036
2037 val = gk20a_readl(g, fifo_intr_en_0_r());
2038 val &= ~(fifo_intr_en_0_sched_error_m() |
2039 fifo_intr_en_0_mmu_fault_m());
2040 gk20a_writel(g, fifo_intr_en_0_r(), val);
2041 gk20a_writel(g, fifo_intr_0_r(), fifo_intr_0_sched_error_reset_f());
2042}
2043
2044void gk20a_fifo_teardown_unmask_intr(struct gk20a *g)
2045{
2046 u32 val;
2047
2048 val = gk20a_readl(g, fifo_intr_en_0_r());
2049 val |= fifo_intr_en_0_mmu_fault_f(1) | fifo_intr_en_0_sched_error_f(1);
2050 gk20a_writel(g, fifo_intr_en_0_r(), val);
2051
2052}
2053
2054void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
2055 u32 hw_id, unsigned int id_type, unsigned int rc_type,
2056 struct mmu_fault_info *mmfault)
2057{
2058 unsigned long engine_id, i;
2059 unsigned long _engine_ids = __engine_ids;
2060 unsigned long engine_ids = 0;
2061 u32 mmu_fault_engines = 0;
2062 u32 ref_type;
2063 u32 ref_id;
2064 u32 ref_id_is_tsg = false;
2065 bool id_is_known = (id_type != ID_TYPE_UNKNOWN) ? true : false;
2066 bool id_is_tsg = (id_type == ID_TYPE_TSG) ? true : false;
2067 u32 rlid;
2068
2069 nvgpu_log_info(g, "acquire engines_reset_mutex");
2070 nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex);
2071
2072 nvgpu_log_info(g, "acquire runlist_lock for all runlists");
2073 for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
2074 nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock);
2075 }
2076
2077 if (id_is_known) {
2078 engine_ids = gk20a_fifo_engines_on_id(g, hw_id, id_is_tsg);
2079 ref_id = hw_id;
2080 ref_type = id_is_tsg ?
2081 fifo_engine_status_id_type_tsgid_v() :
2082 fifo_engine_status_id_type_chid_v();
2083 ref_id_is_tsg = id_is_tsg;
2084		/* at least one engine will get passed during sched err */
2085 engine_ids |= __engine_ids;
2086 for_each_set_bit(engine_id, &engine_ids, 32) {
2087 u32 mmu_id = gk20a_engine_id_to_mmu_id(g, engine_id);
2088
2089 if (mmu_id != FIFO_INVAL_ENGINE_ID) {
2090 mmu_fault_engines |= BIT(mmu_id);
2091 }
2092 }
2093 } else {
2094 /* store faulted engines in advance */
2095 for_each_set_bit(engine_id, &_engine_ids, 32) {
2096 gk20a_fifo_get_faulty_id_type(g, engine_id, &ref_id,
2097 &ref_type);
2098 if (ref_type == fifo_engine_status_id_type_tsgid_v()) {
2099 ref_id_is_tsg = true;
2100 } else {
2101 ref_id_is_tsg = false;
2102 }
2103 /* Reset *all* engines that use the
2104			 * same channel as the faulty engine */
2105 for (i = 0; i < g->fifo.num_engines; i++) {
2106 u32 active_engine_id = g->fifo.active_engines_list[i];
2107 u32 type;
2108 u32 id;
2109
2110 gk20a_fifo_get_faulty_id_type(g, active_engine_id, &id, &type);
2111 if (ref_type == type && ref_id == id) {
2112 u32 mmu_id = gk20a_engine_id_to_mmu_id(g, active_engine_id);
2113
2114 engine_ids |= BIT(active_engine_id);
2115 if (mmu_id != FIFO_INVAL_ENGINE_ID) {
2116 mmu_fault_engines |= BIT(mmu_id);
2117 }
2118 }
2119 }
2120 }
2121 }
2122
2123 if (mmu_fault_engines) {
2124 g->ops.fifo.teardown_mask_intr(g);
2125 g->ops.fifo.trigger_mmu_fault(g, engine_ids);
2126 gk20a_fifo_handle_mmu_fault_locked(g, mmu_fault_engines, ref_id,
2127 ref_id_is_tsg);
2128
2129 g->ops.fifo.teardown_unmask_intr(g);
2130 }
2131
2132 nvgpu_log_info(g, "release runlist_lock for all runlists");
2133 for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) {
2134 nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock);
2135 }
2136
2137 nvgpu_log_info(g, "release engines_reset_mutex");
2138 nvgpu_mutex_release(&g->fifo.engines_reset_mutex);
2139}
2140
2141void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
2142 u32 hw_id, bool id_is_tsg,
2143 bool id_is_known, bool verbose, int rc_type)
2144{
2145 unsigned int id_type;
2146
2147 if (verbose) {
2148 gk20a_debug_dump(g);
2149 }
2150
2151 if (g->ops.ltc.flush) {
2152 g->ops.ltc.flush(g);
2153 }
2154
2155 if (id_is_known) {
2156 id_type = id_is_tsg ? ID_TYPE_TSG : ID_TYPE_CHANNEL;
2157 } else {
2158 id_type = ID_TYPE_UNKNOWN;
2159 }
2160
2161 g->ops.fifo.teardown_ch_tsg(g, __engine_ids, hw_id, id_type,
2162 rc_type, NULL);
2163}
2164
2165/* force reset channel and tsg */
2166int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch,
2167 u32 err_code, bool verbose)
2168{
2169 struct channel_gk20a *ch_tsg = NULL;
2170 struct gk20a *g = ch->g;
2171
2172 struct tsg_gk20a *tsg = tsg_gk20a_from_ch(ch);
2173
2174 if (tsg != NULL) {
2175 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
2176
2177 nvgpu_list_for_each_entry(ch_tsg, &tsg->ch_list,
2178 channel_gk20a, ch_entry) {
2179 if (gk20a_channel_get(ch_tsg)) {
2180 g->ops.fifo.set_error_notifier(ch_tsg,
2181 err_code);
2182 gk20a_channel_put(ch_tsg);
2183 }
2184 }
2185
2186 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
2187 gk20a_fifo_recover_tsg(g, tsg, verbose,
2188 RC_TYPE_FORCE_RESET);
2189 } else {
2190 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
2191 }
2192
2193 return 0;
2194}
2195
2196int gk20a_fifo_tsg_unbind_channel_verify_status(struct channel_gk20a *ch)
2197{
2198 struct gk20a *g = ch->g;
2199
2200 if (gk20a_fifo_channel_status_is_next(g, ch->chid)) {
2201 nvgpu_err(g, "Channel %d to be removed from TSG %d has NEXT set!",
2202 ch->chid, ch->tsgid);
2203 return -EINVAL;
2204 }
2205
2206 if (g->ops.fifo.tsg_verify_status_ctx_reload) {
2207 g->ops.fifo.tsg_verify_status_ctx_reload(ch);
2208 }
2209
2210 if (g->ops.fifo.tsg_verify_status_faulted) {
2211 g->ops.fifo.tsg_verify_status_faulted(ch);
2212 }
2213
2214 return 0;
2215}
2216
2217static bool gk20a_fifo_tsg_is_multi_channel(struct tsg_gk20a *tsg)
2218{
2219 bool ret = false;
2220
2221 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
2222 if (nvgpu_list_first_entry(&tsg->ch_list, channel_gk20a,
2223 ch_entry) !=
2224 nvgpu_list_last_entry(&tsg->ch_list, channel_gk20a,
2225 ch_entry)) {
2226 ret = true;
2227 }
2228 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
2229
2230 return ret;
2231}
2232
2233int gk20a_fifo_tsg_unbind_channel(struct channel_gk20a *ch)
2234{
2235 struct gk20a *g = ch->g;
2236 struct tsg_gk20a *tsg = tsg_gk20a_from_ch(ch);
2237 int err;
2238 bool tsg_timedout = false;
2239
2240 if (tsg == NULL) {
2241 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
2242 return 0;
2243 }
2244
2245 /* If one channel in TSG times out, we disable all channels */
2246 nvgpu_rwsem_down_write(&tsg->ch_list_lock);
2247 tsg_timedout = gk20a_channel_check_timedout(ch);
2248 nvgpu_rwsem_up_write(&tsg->ch_list_lock);
2249
2250 /* Disable TSG and examine status before unbinding channel */
2251 g->ops.fifo.disable_tsg(tsg);
2252
2253 err = g->ops.fifo.preempt_tsg(g, tsg);
2254 if (err != 0) {
2255 goto fail_enable_tsg;
2256 }
2257
2258 /*
2259 * State validation is only necessary if there are multiple channels in
2260 * the TSG.
2261 */
2262 if (gk20a_fifo_tsg_is_multi_channel(tsg) &&
2263 g->ops.fifo.tsg_verify_channel_status && !tsg_timedout) {
2264 err = g->ops.fifo.tsg_verify_channel_status(ch);
2265 if (err) {
2266 goto fail_enable_tsg;
2267 }
2268 }
2269
2270 /* Channel should be seen as TSG channel while updating runlist */
2271 err = channel_gk20a_update_runlist(ch, false);
2272 if (err) {
2273 goto fail_enable_tsg;
2274 }
2275
2276 while (ch->mmu_debug_mode_refcnt > 0U) {
2277 err = nvgpu_tsg_set_mmu_debug_mode(ch, false);
2278 if (err != 0) {
2279 nvgpu_err(g, "disable mmu debug mode failed ch:%u",
2280 ch->chid);
2281 break;
2282 }
2283 }
2284
2285 /* Remove channel from TSG and re-enable rest of the channels */
2286 nvgpu_rwsem_down_write(&tsg->ch_list_lock);
2287 nvgpu_list_del(&ch->ch_entry);
2288 ch->tsgid = NVGPU_INVALID_TSG_ID;
2289
2290 /* another thread could have re-enabled the channel because it was
2291 * still on the list at that time, so make sure it's truly disabled
2292 */
2293 g->ops.fifo.disable_channel(ch);
2294 nvgpu_rwsem_up_write(&tsg->ch_list_lock);
2295
2296	/*
2297	 * Don't re-enable all channels if the TSG has timed out already.
2298	 *
2299	 * Note that we could also skip disabling and preempting the TSG in
2300	 * case of a timeout, but we keep that to ensure the TSG is kicked out.
2301	 */
2302 if (!tsg_timedout) {
2303 g->ops.fifo.enable_tsg(tsg);
2304 }
2305
2306 if (ch->g->ops.fifo.ch_abort_clean_up) {
2307 ch->g->ops.fifo.ch_abort_clean_up(ch);
2308 }
2309
2310 return 0;
2311
2312fail_enable_tsg:
2313 if (!tsg_timedout) {
2314 g->ops.fifo.enable_tsg(tsg);
2315 }
2316 return err;
2317}
2318
2319u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
2320 int *__id, bool *__is_tsg)
2321{
2322 u32 engine_id;
2323 int id = -1;
2324 bool is_tsg = false;
2325 u32 mailbox2;
2326 u32 active_engine_id = FIFO_INVAL_ENGINE_ID;
2327
2328 for (engine_id = 0; engine_id < g->fifo.num_engines; engine_id++) {
2329 u32 status;
2330 u32 ctx_status;
2331 bool failing_engine;
2332
2333 active_engine_id = g->fifo.active_engines_list[engine_id];
2334 status = gk20a_readl(g, fifo_engine_status_r(active_engine_id));
2335 ctx_status = fifo_engine_status_ctx_status_v(status);
2336
2337 /* we are interested in busy engines */
2338 failing_engine = fifo_engine_status_engine_v(status) ==
2339 fifo_engine_status_engine_busy_v();
2340
2341 /* ..that are doing context switch */
2342 failing_engine = failing_engine &&
2343 (ctx_status ==
2344 fifo_engine_status_ctx_status_ctxsw_switch_v()
2345 || ctx_status ==
2346 fifo_engine_status_ctx_status_ctxsw_save_v()
2347 || ctx_status ==
2348 fifo_engine_status_ctx_status_ctxsw_load_v());
2349
2350 if (!failing_engine) {
2351 active_engine_id = FIFO_INVAL_ENGINE_ID;
2352 continue;
2353 }
2354
2355 if (ctx_status ==
2356 fifo_engine_status_ctx_status_ctxsw_load_v()) {
2357 id = fifo_engine_status_next_id_v(status);
2358 is_tsg = fifo_engine_status_next_id_type_v(status) !=
2359 fifo_engine_status_next_id_type_chid_v();
2360 } else if (ctx_status ==
2361 fifo_engine_status_ctx_status_ctxsw_switch_v()) {
2362 mailbox2 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(2));
2363 if (mailbox2 & FECS_METHOD_WFI_RESTORE) {
2364 id = fifo_engine_status_next_id_v(status);
2365 is_tsg = fifo_engine_status_next_id_type_v(status) !=
2366 fifo_engine_status_next_id_type_chid_v();
2367 } else {
2368 id = fifo_engine_status_id_v(status);
2369 is_tsg = fifo_engine_status_id_type_v(status) !=
2370 fifo_engine_status_id_type_chid_v();
2371 }
2372 } else {
2373 id = fifo_engine_status_id_v(status);
2374 is_tsg = fifo_engine_status_id_type_v(status) !=
2375 fifo_engine_status_id_type_chid_v();
2376 }
2377 break;
2378 }
2379
2380 *__id = id;
2381 *__is_tsg = is_tsg;
2382
2383 return active_engine_id;
2384}
2385
2386bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch,
2387 bool *verbose, u32 *ms)
2388{
2389 bool recover = false;
2390 bool progress = false;
2391 struct gk20a *g = ch->g;
2392
2393 if (gk20a_channel_get(ch)) {
2394 recover = gk20a_channel_update_and_check_timeout(ch,
2395 g->fifo_eng_timeout_us / 1000,
2396 &progress);
2397 *verbose = ch->timeout_debug_dump;
2398 *ms = ch->timeout_accumulated_ms;
2399 if (recover) {
2400 g->ops.fifo.set_error_notifier(ch,
2401 NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
2402 }
2403
2404 gk20a_channel_put(ch);
2405 }
2406 return recover;
2407}
2408
2409bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
2410 bool *verbose, u32 *ms)
2411{
2412 struct channel_gk20a *ch;
2413 bool recover = false;
2414 bool progress = false;
2415 struct gk20a *g = tsg->g;
2416
2417 *verbose = false;
2418 *ms = g->fifo_eng_timeout_us / 1000;
2419
2420 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
2421
2422 /* check if there was some progress on any of the TSG channels.
2423 * fifo recovery is needed if at least one channel reached the
2424 * maximum timeout without progress (update in gpfifo pointers).
2425 */
2426 nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
2427 if (gk20a_channel_get(ch)) {
2428 recover = gk20a_channel_update_and_check_timeout(ch,
2429 *ms, &progress);
2430 if (progress || recover) {
2431 break;
2432 }
2433 gk20a_channel_put(ch);
2434 }
2435 }
2436
2437 if (recover) {
2438 /*
2439 * if one channel is presumed dead (no progress for too long),
2440 * then fifo recovery is needed. we can't really figure out
2441 * which channel caused the problem, so set timeout error
2442 * notifier for all channels.
2443 */
2444 nvgpu_log_info(g, "timeout on tsg=%d ch=%d",
2445 tsg->tsgid, ch->chid);
2446 *ms = ch->timeout_accumulated_ms;
2447 gk20a_channel_put(ch);
2448 nvgpu_list_for_each_entry(ch, &tsg->ch_list,
2449 channel_gk20a, ch_entry) {
2450 if (gk20a_channel_get(ch)) {
2451 ch->g->ops.fifo.set_error_notifier(ch,
2452 NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
2453 if (ch->timeout_debug_dump) {
2454 *verbose = true;
2455 }
2456 gk20a_channel_put(ch);
2457 }
2458 }
2459 } else if (progress) {
2460 /*
2461 * if at least one channel in the TSG made some progress, reset
2462 * accumulated timeout for all channels in the TSG. In
2463 * particular, this resets timeout for channels that already
2464 * completed their work
2465 */
2466 nvgpu_log_info(g, "progress on tsg=%d ch=%d",
2467 tsg->tsgid, ch->chid);
2468 gk20a_channel_put(ch);
2469 *ms = g->fifo_eng_timeout_us / 1000;
2470 nvgpu_list_for_each_entry(ch, &tsg->ch_list,
2471 channel_gk20a, ch_entry) {
2472 if (gk20a_channel_get(ch)) {
2473 ch->timeout_accumulated_ms = *ms;
2474 gk20a_channel_put(ch);
2475 }
2476 }
2477 }
2478
2479	/* if we could not detect progress on any of the channels, but none
2480	 * of them has reached the timeout, there is nothing more to do:
2481	 * timeout_accumulated_ms has been updated for all of them.
2482	 */
2483 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
2484 return recover;
2485}
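
/*
 * Summary of gk20a_fifo_check_tsg_ctxsw_timeout() above (a descriptive
 * note, not new behavior):
 *   recover  - at least one channel hit its accumulated ctxsw timeout with
 *              no gpfifo progress; the timeout notifier is set on every
 *              channel in the TSG and the caller is expected to recover.
 *   progress - some channel advanced its gpfifo pointers; the accumulated
 *              timeout is reset for all channels in the TSG.
 *   neither  - timeouts were only accumulated; nothing more to do yet.
 */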
2486
2487bool gk20a_fifo_handle_sched_error(struct gk20a *g)
2488{
2489 u32 sched_error;
2490 u32 engine_id;
2491 int id = -1;
2492 bool is_tsg = false;
2493 bool ret = false;
2494
2495 /* read the scheduler error register */
2496 sched_error = gk20a_readl(g, fifo_intr_sched_error_r());
2497
2498 engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg);
2499 /*
2500 * Could not find the engine
2501 * Possible Causes:
2502 * a)
2503 * On hitting engine reset, h/w drops the ctxsw_status to INVALID in
2504 * fifo_engine_status register. Also while the engine is held in reset
2505 * h/w passes busy/idle straight through. fifo_engine_status registers
2506 * are correct in that there is no context switch outstanding
2507 * as the CTXSW is aborted when reset is asserted.
2508 * This is just a side effect of how gv100 and earlier versions of
2509 * ctxsw_timeout behave.
2510 * With gv11b and later, h/w snaps the context at the point of error
2511 * so that s/w can see the tsg_id which caused the HW timeout.
2512 * b)
2513	 * If engines are not busy and the ctxsw state is valid, then the intr
2514	 * occurred in the past, and if the ctxsw state has moved on to VALID
2515	 * from LOAD or SAVE, it means that whatever timed out eventually
2516	 * finished anyway. The problem with this is that s/w cannot conclude
2517	 * which context caused the problem, as more switches may have occurred
2518	 * before the intr was handled.
2519 */
2520 if (engine_id == FIFO_INVAL_ENGINE_ID) {
2521		nvgpu_info(g, "fifo sched error: 0x%08x, failed to find engine "
2522			"that is busy doing ctxsw. "
2523			"Maybe ctxsw already happened", sched_error);
2524 ret = false;
2525 goto err;
2526 }
2527
2528 /* could not find the engine - should never happen */
2529 if (!gk20a_fifo_is_valid_engine_id(g, engine_id)) {
2530 nvgpu_err(g, "fifo sched error : 0x%08x, failed to find engine",
2531 sched_error);
2532 ret = false;
2533 goto err;
2534 }
2535
2536 if (fifo_intr_sched_error_code_f(sched_error) ==
2537 fifo_intr_sched_error_code_ctxsw_timeout_v()) {
2538 struct fifo_gk20a *f = &g->fifo;
2539 u32 ms = 0;
2540 bool verbose = false;
2541
2542 if (is_tsg) {
2543 ret = g->ops.fifo.check_tsg_ctxsw_timeout(
2544 &f->tsg[id], &verbose, &ms);
2545 } else {
2546 ret = g->ops.fifo.check_ch_ctxsw_timeout(
2547 &f->channel[id], &verbose, &ms);
2548 }
2549
2550 if (ret) {
2551 nvgpu_err(g,
2552 "fifo sched ctxsw timeout error: "
2553 "engine=%u, %s=%d, ms=%u",
2554 engine_id, is_tsg ? "tsg" : "ch", id, ms);
2555 /*
2556 * Cancel all channels' timeout since SCHED error might
2557 * trigger multiple watchdogs at a time
2558 */
2559 gk20a_channel_timeout_restart_all_channels(g);
2560 gk20a_fifo_recover(g, BIT(engine_id), id,
2561 is_tsg, true, verbose,
2562 RC_TYPE_CTXSW_TIMEOUT);
2563 } else {
2564 nvgpu_log_info(g,
2565 "fifo is waiting for ctx switch for %d ms, "
2566 "%s=%d", ms, is_tsg ? "tsg" : "ch", id);
2567 }
2568 } else {
2569 nvgpu_err(g,
2570 "fifo sched error : 0x%08x, engine=%u, %s=%d",
2571 sched_error, engine_id, is_tsg ? "tsg" : "ch", id);
2572 }
2573
2574err:
2575 return ret;
2576}
2577
2578static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr)
2579{
2580 bool print_channel_reset_log = false;
2581 u32 handled = 0;
2582
2583 nvgpu_log_fn(g, "fifo_intr=0x%08x", fifo_intr);
2584
2585 if (fifo_intr & fifo_intr_0_pio_error_pending_f()) {
2586 /* pio mode is unused. this shouldn't happen, ever. */
2587 /* should we clear it or just leave it pending? */
2588 nvgpu_err(g, "fifo pio error!");
2589 BUG_ON(1);
2590 }
2591
2592 if (fifo_intr & fifo_intr_0_bind_error_pending_f()) {
2593 u32 bind_error = gk20a_readl(g, fifo_intr_bind_error_r());
2594 nvgpu_err(g, "fifo bind error: 0x%08x", bind_error);
2595 print_channel_reset_log = true;
2596 handled |= fifo_intr_0_bind_error_pending_f();
2597 }
2598
2599 if (fifo_intr & fifo_intr_0_sched_error_pending_f()) {
2600 print_channel_reset_log = g->ops.fifo.handle_sched_error(g);
2601 handled |= fifo_intr_0_sched_error_pending_f();
2602 }
2603
2604 if (fifo_intr & fifo_intr_0_chsw_error_pending_f()) {
2605 gk20a_fifo_handle_chsw_fault(g);
2606 handled |= fifo_intr_0_chsw_error_pending_f();
2607 }
2608
2609 if (fifo_intr & fifo_intr_0_mmu_fault_pending_f()) {
2610 if (gk20a_fifo_handle_mmu_fault(g, 0, ~(u32)0, false)) {
2611 print_channel_reset_log = true;
2612 }
2613 handled |= fifo_intr_0_mmu_fault_pending_f();
2614 }
2615
2616 if (fifo_intr & fifo_intr_0_dropped_mmu_fault_pending_f()) {
2617 gk20a_fifo_handle_dropped_mmu_fault(g);
2618 handled |= fifo_intr_0_dropped_mmu_fault_pending_f();
2619 }
2620
2621 print_channel_reset_log = !g->fifo.deferred_reset_pending
2622 && print_channel_reset_log;
2623
2624 if (print_channel_reset_log) {
2625 unsigned int engine_id;
2626 nvgpu_err(g,
2627 "channel reset initiated from %s; intr=0x%08x",
2628 __func__, fifo_intr);
2629 for (engine_id = 0;
2630 engine_id < g->fifo.num_engines;
2631 engine_id++) {
2632 u32 active_engine_id = g->fifo.active_engines_list[engine_id];
2633 u32 engine_enum = g->fifo.engine_info[active_engine_id].engine_enum;
2634 nvgpu_log_fn(g, "enum:%d -> engine_id:%d", engine_enum,
2635 active_engine_id);
2636 fifo_pbdma_exception_status(g,
2637 &g->fifo.engine_info[active_engine_id]);
2638 fifo_engine_exception_status(g,
2639 &g->fifo.engine_info[active_engine_id]);
2640 }
2641 }
2642
2643 return handled;
2644}
2645
2646static inline void gk20a_fifo_reset_pbdma_header(struct gk20a *g, int pbdma_id)
2647{
2648 gk20a_writel(g, pbdma_pb_header_r(pbdma_id),
2649 pbdma_pb_header_first_true_f() |
2650 pbdma_pb_header_type_non_inc_f());
2651}
2652
2653void gk20a_fifo_reset_pbdma_method(struct gk20a *g, int pbdma_id,
2654 int pbdma_method_index)
2655{
2656 u32 pbdma_method_stride;
2657 u32 pbdma_method_reg;
2658
2659 pbdma_method_stride = pbdma_method1_r(pbdma_id) -
2660 pbdma_method0_r(pbdma_id);
2661
2662 pbdma_method_reg = pbdma_method0_r(pbdma_id) +
2663 (pbdma_method_index * pbdma_method_stride);
2664
2665 gk20a_writel(g, pbdma_method_reg,
2666 pbdma_method0_valid_true_f() |
2667 pbdma_method0_first_true_f() |
2668 pbdma_method0_addr_f(
2669 pbdma_udma_nop_r() >> 2));
2670}
2671
2672static bool gk20a_fifo_is_sw_method_subch(struct gk20a *g, int pbdma_id,
2673 int pbdma_method_index)
2674{
2675 u32 pbdma_method_stride;
2676 u32 pbdma_method_reg, pbdma_method_subch;
2677
2678 pbdma_method_stride = pbdma_method1_r(pbdma_id) -
2679 pbdma_method0_r(pbdma_id);
2680
2681 pbdma_method_reg = pbdma_method0_r(pbdma_id) +
2682 (pbdma_method_index * pbdma_method_stride);
2683
2684 pbdma_method_subch = pbdma_method0_subch_v(
2685 gk20a_readl(g, pbdma_method_reg));
2686
2687 if (pbdma_method_subch == 5 ||
2688 pbdma_method_subch == 6 ||
2689 pbdma_method_subch == 7) {
2690 return true;
2691 }
2692
2693 return false;
2694}
2695
2696unsigned int gk20a_fifo_handle_pbdma_intr_0(struct gk20a *g, u32 pbdma_id,
2697 u32 pbdma_intr_0, u32 *handled, u32 *error_notifier)
2698{
2699 struct fifo_gk20a *f = &g->fifo;
2700 unsigned int rc_type = RC_TYPE_NO_RC;
2701 int i;
2702 unsigned long pbdma_intr_err;
2703 u32 bit;
2704
2705 if ((f->intr.pbdma.device_fatal_0 |
2706 f->intr.pbdma.channel_fatal_0 |
2707 f->intr.pbdma.restartable_0) & pbdma_intr_0) {
2708
2709 pbdma_intr_err = (unsigned long)pbdma_intr_0;
2710 for_each_set_bit(bit, &pbdma_intr_err, 32) {
2711 nvgpu_err(g, "PBDMA intr %s Error",
2712 pbdma_intr_fault_type_desc[bit]);
2713 }
2714
2715 nvgpu_err(g,
2716 "pbdma_intr_0(%d):0x%08x PBH: %08x "
2717			"SHADOW: %08x gp shadow0: %08x gp shadow1: %08x "
2718 "M0: %08x %08x %08x %08x ",
2719 pbdma_id, pbdma_intr_0,
2720 gk20a_readl(g, pbdma_pb_header_r(pbdma_id)),
2721 gk20a_readl(g, pbdma_hdr_shadow_r(pbdma_id)),
2722 gk20a_readl(g, pbdma_gp_shadow_0_r(pbdma_id)),
2723 gk20a_readl(g, pbdma_gp_shadow_1_r(pbdma_id)),
2724 gk20a_readl(g, pbdma_method0_r(pbdma_id)),
2725 gk20a_readl(g, pbdma_method1_r(pbdma_id)),
2726 gk20a_readl(g, pbdma_method2_r(pbdma_id)),
2727 gk20a_readl(g, pbdma_method3_r(pbdma_id))
2728 );
2729
2730 rc_type = RC_TYPE_PBDMA_FAULT;
2731 *handled |= ((f->intr.pbdma.device_fatal_0 |
2732 f->intr.pbdma.channel_fatal_0 |
2733 f->intr.pbdma.restartable_0) &
2734 pbdma_intr_0);
2735 }
2736
2737 if (pbdma_intr_0 & pbdma_intr_0_acquire_pending_f()) {
2738 u32 val = gk20a_readl(g, pbdma_acquire_r(pbdma_id));
2739
2740 val &= ~pbdma_acquire_timeout_en_enable_f();
2741 gk20a_writel(g, pbdma_acquire_r(pbdma_id), val);
2742 if (nvgpu_is_timeouts_enabled(g)) {
2743 rc_type = RC_TYPE_PBDMA_FAULT;
2744 nvgpu_err(g,
2745 "semaphore acquire timeout!");
2746 *error_notifier = NVGPU_ERR_NOTIFIER_GR_SEMAPHORE_TIMEOUT;
2747 }
2748 *handled |= pbdma_intr_0_acquire_pending_f();
2749 }
2750
2751 if (pbdma_intr_0 & pbdma_intr_0_pbentry_pending_f()) {
2752 gk20a_fifo_reset_pbdma_header(g, pbdma_id);
2753 gk20a_fifo_reset_pbdma_method(g, pbdma_id, 0);
2754 rc_type = RC_TYPE_PBDMA_FAULT;
2755 }
2756
2757 if (pbdma_intr_0 & pbdma_intr_0_method_pending_f()) {
2758 gk20a_fifo_reset_pbdma_method(g, pbdma_id, 0);
2759 rc_type = RC_TYPE_PBDMA_FAULT;
2760 }
2761
2762 if (pbdma_intr_0 & pbdma_intr_0_pbcrc_pending_f()) {
2763 *error_notifier =
2764 NVGPU_ERR_NOTIFIER_PBDMA_PUSHBUFFER_CRC_MISMATCH;
2765 rc_type = RC_TYPE_PBDMA_FAULT;
2766 }
2767
2768 if (pbdma_intr_0 & pbdma_intr_0_device_pending_f()) {
2769 gk20a_fifo_reset_pbdma_header(g, pbdma_id);
2770
2771 for (i = 0; i < 4; i++) {
2772 if (gk20a_fifo_is_sw_method_subch(g,
2773 pbdma_id, i)) {
2774 gk20a_fifo_reset_pbdma_method(g,
2775 pbdma_id, i);
2776 }
2777 }
2778 rc_type = RC_TYPE_PBDMA_FAULT;
2779 }
2780
2781 return rc_type;
2782}
2783
2784unsigned int gk20a_fifo_handle_pbdma_intr_1(struct gk20a *g,
2785 u32 pbdma_id, u32 pbdma_intr_1,
2786 u32 *handled, u32 *error_notifier)
2787{
2788 unsigned int rc_type = RC_TYPE_PBDMA_FAULT;
2789
2790 /*
2791 * all of the interrupts in _intr_1 are "host copy engine"
2792 * related, which is not supported. For now just make them
2793 * channel fatal.
2794 */
2795 nvgpu_err(g, "hce err: pbdma_intr_1(%d):0x%08x",
2796 pbdma_id, pbdma_intr_1);
2797 *handled |= pbdma_intr_1;
2798
2799 return rc_type;
2800}
2801
2802static void gk20a_fifo_pbdma_fault_rc(struct gk20a *g,
2803 struct fifo_gk20a *f, u32 pbdma_id,
2804 u32 error_notifier, u32 status)
2805{
2806 u32 id;
2807
2808 nvgpu_log(g, gpu_dbg_info, "pbdma id %d error notifier %d",
2809 pbdma_id, error_notifier);
2810 /* Remove channel from runlist */
2811 id = fifo_pbdma_status_id_v(status);
2812 if (fifo_pbdma_status_id_type_v(status)
2813 == fifo_pbdma_status_id_type_chid_v()) {
2814 struct channel_gk20a *ch = gk20a_channel_from_id(g, id);
2815
2816 if (ch != NULL) {
2817 g->ops.fifo.set_error_notifier(ch, error_notifier);
2818 gk20a_fifo_recover_ch(g, ch, true, RC_TYPE_PBDMA_FAULT);
2819 gk20a_channel_put(ch);
2820 }
2821 } else if (fifo_pbdma_status_id_type_v(status)
2822 == fifo_pbdma_status_id_type_tsgid_v()) {
2823 struct tsg_gk20a *tsg = &f->tsg[id];
2824 struct channel_gk20a *ch = NULL;
2825
2826 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
2827 nvgpu_list_for_each_entry(ch, &tsg->ch_list,
2828 channel_gk20a, ch_entry) {
2829 if (gk20a_channel_get(ch)) {
2830 g->ops.fifo.set_error_notifier(ch,
2831 error_notifier);
2832 gk20a_channel_put(ch);
2833 }
2834 }
2835 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
2836 gk20a_fifo_recover_tsg(g, tsg, true, RC_TYPE_PBDMA_FAULT);
2837 }
2838}
2839
2840u32 gk20a_fifo_handle_pbdma_intr(struct gk20a *g, struct fifo_gk20a *f,
2841 u32 pbdma_id, unsigned int rc)
2842{
2843 u32 pbdma_intr_0 = gk20a_readl(g, pbdma_intr_0_r(pbdma_id));
2844 u32 pbdma_intr_1 = gk20a_readl(g, pbdma_intr_1_r(pbdma_id));
2845
2846 u32 handled = 0;
2847 u32 error_notifier = NVGPU_ERR_NOTIFIER_PBDMA_ERROR;
2848 unsigned int rc_type = RC_TYPE_NO_RC;
2849 u32 pbdma_status_info = 0;
2850
2851 if (pbdma_intr_0) {
2852 nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
2853 "pbdma id %d intr_0 0x%08x pending",
2854 pbdma_id, pbdma_intr_0);
2855
2856 if (g->ops.fifo.handle_pbdma_intr_0(g, pbdma_id, pbdma_intr_0,
2857 &handled, &error_notifier) != RC_TYPE_NO_RC) {
2858 rc_type = RC_TYPE_PBDMA_FAULT;
2859
2860 pbdma_status_info = gk20a_readl(g,
2861 fifo_pbdma_status_r(pbdma_id));
2862 }
2863 gk20a_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);
2864 }
2865
2866 if (pbdma_intr_1) {
2867 nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
2868 "pbdma id %d intr_1 0x%08x pending",
2869 pbdma_id, pbdma_intr_1);
2870
2871 if (g->ops.fifo.handle_pbdma_intr_1(g, pbdma_id, pbdma_intr_1,
2872 &handled, &error_notifier) != RC_TYPE_NO_RC) {
2873 rc_type = RC_TYPE_PBDMA_FAULT;
2874
2875 pbdma_status_info = gk20a_readl(g,
2876 fifo_pbdma_status_r(pbdma_id));
2877 }
2878 gk20a_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1);
2879 }
2880
2881 if (rc == RC_YES && rc_type == RC_TYPE_PBDMA_FAULT) {
2882 gk20a_fifo_pbdma_fault_rc(g, f, pbdma_id, error_notifier,
2883 pbdma_status_info);
2884 }
2885
2886 return handled;
2887}
2888
2889static u32 fifo_pbdma_isr(struct gk20a *g, u32 fifo_intr)
2890{
2891 struct fifo_gk20a *f = &g->fifo;
2892 u32 clear_intr = 0, i;
2893 u32 host_num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA);
2894 u32 pbdma_pending = gk20a_readl(g, fifo_intr_pbdma_id_r());
2895
2896 for (i = 0; i < host_num_pbdma; i++) {
2897 if (fifo_intr_pbdma_id_status_v(pbdma_pending, i)) {
2898 nvgpu_log(g, gpu_dbg_intr, "pbdma id %d intr pending", i);
2899 clear_intr |=
2900 gk20a_fifo_handle_pbdma_intr(g, f, i, RC_YES);
2901 }
2902 }
2903 return fifo_intr_0_pbdma_intr_pending_f();
2904}
2905
2906void gk20a_fifo_isr(struct gk20a *g)
2907{
2908 u32 error_intr_mask;
2909 u32 clear_intr = 0;
2910 u32 fifo_intr = gk20a_readl(g, fifo_intr_0_r());
2911
2912 error_intr_mask = g->ops.fifo.intr_0_error_mask(g);
2913
2914 if (g->fifo.sw_ready) {
2915 /* note we're not actually in an "isr", but rather
2916 * in a threaded interrupt context... */
2917 nvgpu_mutex_acquire(&g->fifo.intr.isr.mutex);
2918
2919 nvgpu_log(g, gpu_dbg_intr, "fifo isr %08x\n", fifo_intr);
2920
2921 /* handle runlist update */
2922 if (fifo_intr & fifo_intr_0_runlist_event_pending_f()) {
2923 gk20a_fifo_handle_runlist_event(g);
2924 clear_intr |= fifo_intr_0_runlist_event_pending_f();
2925 }
2926 if (fifo_intr & fifo_intr_0_pbdma_intr_pending_f()) {
2927 clear_intr |= fifo_pbdma_isr(g, fifo_intr);
2928 }
2929
2930 if (g->ops.fifo.handle_ctxsw_timeout) {
2931 g->ops.fifo.handle_ctxsw_timeout(g, fifo_intr);
2932 }
2933
2934 if (unlikely((fifo_intr & error_intr_mask) != 0U)) {
2935 clear_intr |= fifo_error_isr(g, fifo_intr);
2936 }
2937
2938 nvgpu_mutex_release(&g->fifo.intr.isr.mutex);
2939 }
2940 gk20a_writel(g, fifo_intr_0_r(), clear_intr);
2941
2942 return;
2943}
2944
2945u32 gk20a_fifo_nonstall_isr(struct gk20a *g)
2946{
2947 u32 fifo_intr = gk20a_readl(g, fifo_intr_0_r());
2948 u32 clear_intr = 0;
2949
2950 nvgpu_log(g, gpu_dbg_intr, "fifo nonstall isr %08x\n", fifo_intr);
2951
2952 if (fifo_intr & fifo_intr_0_channel_intr_pending_f()) {
2953 clear_intr = fifo_intr_0_channel_intr_pending_f();
2954 }
2955
2956 gk20a_writel(g, fifo_intr_0_r(), clear_intr);
2957
2958 return GK20A_NONSTALL_OPS_WAKEUP_SEMAPHORE;
2959}
2960
2961void gk20a_fifo_issue_preempt(struct gk20a *g, u32 id, bool is_tsg)
2962{
2963 if (is_tsg) {
2964 gk20a_writel(g, fifo_preempt_r(),
2965 fifo_preempt_id_f(id) |
2966 fifo_preempt_type_tsg_f());
2967 } else {
2968 gk20a_writel(g, fifo_preempt_r(),
2969 fifo_preempt_chid_f(id) |
2970 fifo_preempt_type_channel_f());
2971 }
2972}
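
/*
 * Illustrative flow (a sketch; this mirrors __locked_fifo_preempt() below,
 * whose callers take the runlist locks and the PMU FIFO mutex first): the
 * register write above only issues the preempt request, so completion has
 * to be polled separately.
 *
 *	gk20a_fifo_issue_preempt(g, id, is_tsg);
 *	g->ops.fifo.is_preempt_pending(g, id,
 *		is_tsg ? ID_TYPE_TSG : ID_TYPE_CHANNEL);
 */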
2973
2974static u32 gk20a_fifo_get_preempt_timeout(struct gk20a *g)
2975{
2976	/* Use fifo_eng_timeout converted to ms for preempt
2977	 * polling. gr_idle_timeout, i.e. 3000 ms, is not appropriate
2978	 * for polling preempt completion, because the context switch
2979	 * timeout gets triggered every 100 ms and context switch
2980	 * recovery happens every 3000 ms */
2981
2982 return g->fifo_eng_timeout_us / 1000;
2983}
2984
2985int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id,
2986 unsigned int id_type)
2987{
2988 struct nvgpu_timeout timeout;
2989 u32 delay = GR_IDLE_CHECK_DEFAULT;
2990 int ret = -EBUSY;
2991
2992 nvgpu_timeout_init(g, &timeout, gk20a_fifo_get_preempt_timeout(g),
2993 NVGPU_TIMER_CPU_TIMER);
2994 do {
2995 if (!(gk20a_readl(g, fifo_preempt_r()) &
2996 fifo_preempt_pending_true_f())) {
2997 ret = 0;
2998 break;
2999 }
3000
3001 nvgpu_usleep_range(delay, delay * 2);
3002 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
3003 } while (!nvgpu_timeout_expired(&timeout));
3004
3005 if (ret) {
3006 nvgpu_err(g, "preempt timeout: id: %u id_type: %d ",
3007 id, id_type);
3008 }
3009 return ret;
3010}
3011
3012void gk20a_fifo_preempt_timeout_rc_tsg(struct gk20a *g, struct tsg_gk20a *tsg)
3013{
3014 struct channel_gk20a *ch = NULL;
3015
3016 nvgpu_err(g, "preempt TSG %d timeout", tsg->tsgid);
3017
3018 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
3019 nvgpu_list_for_each_entry(ch, &tsg->ch_list,
3020 channel_gk20a, ch_entry) {
3021 if (!gk20a_channel_get(ch)) {
3022 continue;
3023 }
3024 g->ops.fifo.set_error_notifier(ch,
3025 NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
3026 gk20a_channel_put(ch);
3027 }
3028 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
3029 gk20a_fifo_recover_tsg(g, tsg, true, RC_TYPE_PREEMPT_TIMEOUT);
3030}
3031
3032void gk20a_fifo_preempt_timeout_rc(struct gk20a *g, struct channel_gk20a *ch)
3033{
3034 nvgpu_err(g, "preempt channel %d timeout", ch->chid);
3035
3036 g->ops.fifo.set_error_notifier(ch,
3037 NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT);
3038 gk20a_fifo_recover_ch(g, ch, true,
3039 RC_TYPE_PREEMPT_TIMEOUT);
3040}
3041
3042int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg)
3043{
3044 int ret;
3045 unsigned int id_type;
3046
3047 nvgpu_log_fn(g, "id: %d is_tsg: %d", id, is_tsg);
3048
3049 /* issue preempt */
3050 gk20a_fifo_issue_preempt(g, id, is_tsg);
3051
3052 id_type = is_tsg ? ID_TYPE_TSG : ID_TYPE_CHANNEL;
3053
3054 /* wait for preempt */
3055 ret = g->ops.fifo.is_preempt_pending(g, id, id_type);
3056
3057 return ret;
3058}
3059
3060int gk20a_fifo_preempt_channel(struct gk20a *g, struct channel_gk20a *ch)
3061{
3062 struct fifo_gk20a *f = &g->fifo;
3063 u32 ret = 0;
3064 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
3065 u32 mutex_ret = 0;
3066 u32 i;
3067
3068 nvgpu_log_fn(g, "chid: %d", ch->chid);
3069
3070 /* we have no idea which runlist we are using. lock all */
3071 for (i = 0; i < g->fifo.max_runlists; i++) {
3072 nvgpu_mutex_acquire(&f->runlist_info[i].runlist_lock);
3073 }
3074
3075 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
3076
3077 ret = __locked_fifo_preempt(g, ch->chid, false);
3078
3079 if (!mutex_ret) {
3080 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
3081 }
3082
3083 for (i = 0; i < g->fifo.max_runlists; i++) {
3084 nvgpu_mutex_release(&f->runlist_info[i].runlist_lock);
3085 }
3086
3087 if (ret) {
3088 if (nvgpu_platform_is_silicon(g)) {
3089 nvgpu_err(g, "preempt timed out for chid: %u, "
3090 "ctxsw timeout will trigger recovery if needed",
3091 ch->chid);
3092 } else {
3093 gk20a_fifo_preempt_timeout_rc(g, ch);
3094 }
3095 }
3096
3097 return ret;
3098}
3099
3100int gk20a_fifo_preempt_tsg(struct gk20a *g, struct tsg_gk20a *tsg)
3101{
3102 struct fifo_gk20a *f = &g->fifo;
3103 u32 ret = 0;
3104 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
3105 u32 mutex_ret = 0;
3106 u32 i;
3107
3108 nvgpu_log_fn(g, "tsgid: %d", tsg->tsgid);
3109
3110 /* we have no idea which runlist we are using. lock all */
3111 for (i = 0; i < g->fifo.max_runlists; i++) {
3112 nvgpu_mutex_acquire(&f->runlist_info[i].runlist_lock);
3113 }
3114
3115 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
3116
3117 ret = __locked_fifo_preempt(g, tsg->tsgid, true);
3118
3119 if (!mutex_ret) {
3120 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
3121 }
3122
3123 for (i = 0; i < g->fifo.max_runlists; i++) {
3124 nvgpu_mutex_release(&f->runlist_info[i].runlist_lock);
3125 }
3126
3127 if (ret) {
3128 if (nvgpu_platform_is_silicon(g)) {
3129 nvgpu_err(g, "preempt timed out for tsgid: %u, "
3130 "ctxsw timeout will trigger recovery if needed",
3131 tsg->tsgid);
3132 } else {
3133 gk20a_fifo_preempt_timeout_rc_tsg(g, tsg);
3134 }
3135 }
3136
3137 return ret;
3138}
3139
3140int gk20a_fifo_preempt(struct gk20a *g, struct channel_gk20a *ch)
3141{
3142 int err;
3143 struct tsg_gk20a *tsg = tsg_gk20a_from_ch(ch);
3144
3145 if (tsg != NULL) {
3146 err = g->ops.fifo.preempt_tsg(ch->g, tsg);
3147 } else {
3148 err = g->ops.fifo.preempt_channel(ch->g, ch);
3149 }
3150
3151 return err;
3152}
3153
3154static void gk20a_fifo_sched_disable_rw(struct gk20a *g, u32 runlists_mask,
3155 u32 runlist_state)
3156{
3157 u32 reg_val;
3158
3159 reg_val = gk20a_readl(g, fifo_sched_disable_r());
3160
3161 if (runlist_state == RUNLIST_DISABLED) {
3162 reg_val |= runlists_mask;
3163 } else {
3164 reg_val &= (~runlists_mask);
3165 }
3166
3167 gk20a_writel(g, fifo_sched_disable_r(), reg_val);
3168
3169}
3170
3171void gk20a_fifo_set_runlist_state(struct gk20a *g, u32 runlists_mask,
3172 u32 runlist_state)
3173{
3174 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
3175 u32 mutex_ret;
3176
3177 nvgpu_log(g, gpu_dbg_info, "runlist mask = 0x%08x state = 0x%08x",
3178 runlists_mask, runlist_state);
3179
3180 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
3181
3182 gk20a_fifo_sched_disable_rw(g, runlists_mask, runlist_state);
3183
3184 if (!mutex_ret) {
3185 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
3186 }
3187}
3188
3189void gk20a_fifo_enable_tsg_sched(struct gk20a *g, struct tsg_gk20a *tsg)
3190{
3191 gk20a_fifo_set_runlist_state(g, fifo_sched_disable_runlist_m(
3192 tsg->runlist_id), RUNLIST_ENABLED);
3193
3194}
3195
3196void gk20a_fifo_disable_tsg_sched(struct gk20a *g, struct tsg_gk20a *tsg)
3197{
3198 gk20a_fifo_set_runlist_state(g, fifo_sched_disable_runlist_m(
3199 tsg->runlist_id), RUNLIST_DISABLED);
3200}
3201
3202int gk20a_fifo_enable_engine_activity(struct gk20a *g,
3203 struct fifo_engine_info_gk20a *eng_info)
3204{
3205 nvgpu_log(g, gpu_dbg_info, "start");
3206
3207 gk20a_fifo_set_runlist_state(g, fifo_sched_disable_runlist_m(
3208 eng_info->runlist_id), RUNLIST_ENABLED);
3209 return 0;
3210}
3211
3212int gk20a_fifo_enable_all_engine_activity(struct gk20a *g)
3213{
3214 unsigned int i;
3215 int err = 0, ret = 0;
3216
3217 for (i = 0; i < g->fifo.num_engines; i++) {
3218 u32 active_engine_id = g->fifo.active_engines_list[i];
3219 err = gk20a_fifo_enable_engine_activity(g,
3220 &g->fifo.engine_info[active_engine_id]);
3221 if (err) {
3222 nvgpu_err(g,
3223 "failed to enable engine %d activity", active_engine_id);
3224 ret = err;
3225 }
3226 }
3227
3228 return ret;
3229}
3230
3231int gk20a_fifo_disable_engine_activity(struct gk20a *g,
3232 struct fifo_engine_info_gk20a *eng_info,
3233 bool wait_for_idle)
3234{
3235 u32 gr_stat, pbdma_stat, chan_stat, eng_stat, ctx_stat;
3236 u32 pbdma_chid = FIFO_INVAL_CHANNEL_ID;
3237 u32 engine_chid = FIFO_INVAL_CHANNEL_ID;
3238 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
3239 int mutex_ret;
3240 struct channel_gk20a *ch = NULL;
3241 int err = 0;
3242
3243 nvgpu_log_fn(g, " ");
3244
3245 gr_stat =
3246 gk20a_readl(g, fifo_engine_status_r(eng_info->engine_id));
3247 if (fifo_engine_status_engine_v(gr_stat) ==
3248 fifo_engine_status_engine_busy_v() && !wait_for_idle) {
3249 return -EBUSY;
3250 }
3251
3252 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
3253
3254 gk20a_fifo_set_runlist_state(g, fifo_sched_disable_runlist_m(
3255 eng_info->runlist_id), RUNLIST_DISABLED);
3256
3257 /* chid from pbdma status */
3258 pbdma_stat = gk20a_readl(g, fifo_pbdma_status_r(eng_info->pbdma_id));
3259 chan_stat = fifo_pbdma_status_chan_status_v(pbdma_stat);
3260 if (chan_stat == fifo_pbdma_status_chan_status_valid_v() ||
3261 chan_stat == fifo_pbdma_status_chan_status_chsw_save_v()) {
3262 pbdma_chid = fifo_pbdma_status_id_v(pbdma_stat);
3263 } else if (chan_stat == fifo_pbdma_status_chan_status_chsw_load_v() ||
3264 chan_stat == fifo_pbdma_status_chan_status_chsw_switch_v()) {
3265 pbdma_chid = fifo_pbdma_status_next_id_v(pbdma_stat);
3266 }
3267
3268 if (pbdma_chid != FIFO_INVAL_CHANNEL_ID) {
3269 ch = gk20a_channel_from_id(g, pbdma_chid);
3270 if (ch != NULL) {
3271 err = g->ops.fifo.preempt_channel(g, ch);
3272 gk20a_channel_put(ch);
3273 }
3274 if (err != 0) {
3275 goto clean_up;
3276 }
3277 }
3278
3279 /* chid from engine status */
3280 eng_stat = gk20a_readl(g, fifo_engine_status_r(eng_info->engine_id));
3281 ctx_stat = fifo_engine_status_ctx_status_v(eng_stat);
3282 if (ctx_stat == fifo_engine_status_ctx_status_valid_v() ||
3283 ctx_stat == fifo_engine_status_ctx_status_ctxsw_save_v()) {
3284 engine_chid = fifo_engine_status_id_v(eng_stat);
3285 } else if (ctx_stat == fifo_engine_status_ctx_status_ctxsw_load_v() ||
3286 ctx_stat == fifo_engine_status_ctx_status_ctxsw_switch_v()) {
3287 engine_chid = fifo_engine_status_next_id_v(eng_stat);
3288 }
3289
3290	if (engine_chid != FIFO_INVAL_CHANNEL_ID && engine_chid != pbdma_chid) {
3291 ch = gk20a_channel_from_id(g, engine_chid);
3292 if (ch != NULL) {
3293 err = g->ops.fifo.preempt_channel(g, ch);
3294 gk20a_channel_put(ch);
3295 }
3296 if (err != 0) {
3297 goto clean_up;
3298 }
3299 }
3300
3301clean_up:
3302 if (!mutex_ret) {
3303 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
3304 }
3305
3306 if (err) {
3307 nvgpu_log_fn(g, "failed");
3308 if (gk20a_fifo_enable_engine_activity(g, eng_info)) {
3309 nvgpu_err(g,
3310 "failed to enable gr engine activity");
3311 }
3312 } else {
3313 nvgpu_log_fn(g, "done");
3314 }
3315 return err;
3316}
3317
3318int gk20a_fifo_disable_all_engine_activity(struct gk20a *g,
3319 bool wait_for_idle)
3320{
3321 unsigned int i;
3322 int err = 0, ret = 0;
3323 u32 active_engine_id;
3324
3325 for (i = 0; i < g->fifo.num_engines; i++) {
3326 active_engine_id = g->fifo.active_engines_list[i];
3327 err = gk20a_fifo_disable_engine_activity(g,
3328 &g->fifo.engine_info[active_engine_id],
3329 wait_for_idle);
3330 if (err) {
3331 nvgpu_err(g, "failed to disable engine %d activity",
3332 active_engine_id);
3333 ret = err;
3334 break;
3335 }
3336 }
3337
3338 if (err) {
3339 while (i-- != 0) {
3340 active_engine_id = g->fifo.active_engines_list[i];
3341 err = gk20a_fifo_enable_engine_activity(g,
3342 &g->fifo.engine_info[active_engine_id]);
3343 if (err) {
3344 nvgpu_err(g,
3345 "failed to re-enable engine %d activity",
3346 active_engine_id);
3347 }
3348 }
3349 }
3350
3351 return ret;
3352}
3353
3354static void gk20a_fifo_runlist_reset_engines(struct gk20a *g, u32 runlist_id)
3355{
3356 struct fifo_gk20a *f = &g->fifo;
3357 u32 engines = 0;
3358 unsigned int i;
3359
3360 for (i = 0; i < f->num_engines; i++) {
3361 u32 active_engine_id = g->fifo.active_engines_list[i];
3362 u32 status = gk20a_readl(g, fifo_engine_status_r(active_engine_id));
3363 bool engine_busy = fifo_engine_status_engine_v(status) ==
3364 fifo_engine_status_engine_busy_v();
3365
3366 if (engine_busy &&
3367 (f->engine_info[active_engine_id].runlist_id == runlist_id)) {
3368 engines |= BIT(active_engine_id);
3369 }
3370 }
3371
3372 if (engines) {
3373 gk20a_fifo_recover(g, engines, ~(u32)0, false, false, true,
3374 RC_TYPE_RUNLIST_UPDATE_TIMEOUT);
3375 }
3376}
3377
3378int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id)
3379{
3380 struct nvgpu_timeout timeout;
3381 unsigned long delay = GR_IDLE_CHECK_DEFAULT;
3382 int ret = -ETIMEDOUT;
3383
3384 nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
3385 NVGPU_TIMER_CPU_TIMER);
3386
3387 do {
3388 if ((gk20a_readl(g, fifo_eng_runlist_r(runlist_id)) &
3389 fifo_eng_runlist_pending_true_f()) == 0) {
3390 ret = 0;
3391 break;
3392 }
3393
3394 nvgpu_usleep_range(delay, delay * 2);
3395 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
3396 } while (!nvgpu_timeout_expired(&timeout));
3397
3398 if (ret) {
3399 nvgpu_err(g, "runlist wait timeout: runlist id: %u",
3400 runlist_id);
3401 }
3402
3403 return ret;
3404}
3405
3406void gk20a_get_tsg_runlist_entry(struct tsg_gk20a *tsg, u32 *runlist)
3407{
3408
3409 u32 runlist_entry_0 = ram_rl_entry_id_f(tsg->tsgid) |
3410 ram_rl_entry_type_tsg_f() |
3411 ram_rl_entry_tsg_length_f(tsg->num_active_channels);
3412
3413 if (tsg->timeslice_timeout) {
3414 runlist_entry_0 |=
3415 ram_rl_entry_timeslice_scale_f(tsg->timeslice_scale) |
3416 ram_rl_entry_timeslice_timeout_f(tsg->timeslice_timeout);
3417 } else {
3418 runlist_entry_0 |=
3419 ram_rl_entry_timeslice_scale_f(
3420 NVGPU_FIFO_DEFAULT_TIMESLICE_SCALE) |
3421 ram_rl_entry_timeslice_timeout_f(
3422 NVGPU_FIFO_DEFAULT_TIMESLICE_TIMEOUT);
3423 }
3424
3425 runlist[0] = runlist_entry_0;
3426 runlist[1] = 0;
3427
3428}
3429
3430u32 gk20a_fifo_default_timeslice_us(struct gk20a *g)
3431{
3432 return (((u64)(NVGPU_FIFO_DEFAULT_TIMESLICE_TIMEOUT <<
3433 NVGPU_FIFO_DEFAULT_TIMESLICE_SCALE) *
3434 (u64)g->ptimer_src_freq) /
3435 (u64)PTIMER_REF_FREQ_HZ);
3436}
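
/*
 * Worked example for the formula above (the numbers are illustrative
 * assumptions, not values taken from this file): if the default timeslice
 * timeout encoding were 128, the default scale were 3, and ptimer_src_freq
 * equalled PTIMER_REF_FREQ_HZ, then
 *
 *	timeslice_us = ((128 << 3) * freq) / freq = 1024 us
 *
 * i.e. roughly a 1 ms default timeslice; a faster or slower ptimer source
 * clock scales the result proportionally.
 */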
3437
3438void gk20a_get_ch_runlist_entry(struct channel_gk20a *ch, u32 *runlist)
3439{
3440 runlist[0] = ram_rl_entry_chid_f(ch->chid);
3441 runlist[1] = 0;
3442}
3443
3444/* recursively construct a runlist with interleaved bare channels and TSGs */
3445u32 *gk20a_runlist_construct_locked(struct fifo_gk20a *f,
3446 struct fifo_runlist_info_gk20a *runlist,
3447 u32 cur_level,
3448 u32 *runlist_entry,
3449 bool interleave_enabled,
3450 bool prev_empty,
3451 u32 *entries_left)
3452{
3453 bool last_level = cur_level == NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH;
3454 struct channel_gk20a *ch;
3455 bool skip_next = false;
3456 u32 tsgid, count = 0;
3457 u32 runlist_entry_words = f->runlist_entry_size / sizeof(u32);
3458 struct gk20a *g = f->g;
3459
3460 nvgpu_log_fn(g, " ");
3461
3462 /* for each TSG, T, on this level, insert all higher-level channels
3463 and TSGs before inserting T. */
3464 for_each_set_bit(tsgid, runlist->active_tsgs, f->num_channels) {
3465 struct tsg_gk20a *tsg = &f->tsg[tsgid];
3466
3467 if (tsg->interleave_level != cur_level) {
3468 continue;
3469 }
3470
3471 if (!last_level && !skip_next) {
3472 runlist_entry = gk20a_runlist_construct_locked(f,
3473 runlist,
3474 cur_level + 1,
3475 runlist_entry,
3476 interleave_enabled,
3477 false,
3478 entries_left);
3479 if (!interleave_enabled) {
3480 skip_next = true;
3481 }
3482 }
3483
3484 if (*entries_left == 0U) {
3485 return NULL;
3486 }
3487
3488 /* add TSG entry */
3489 nvgpu_log_info(g, "add TSG %d to runlist", tsg->tsgid);
3490 f->g->ops.fifo.get_tsg_runlist_entry(tsg, runlist_entry);
3491 nvgpu_log_info(g, "tsg runlist count %d runlist [0] %x [1] %x\n",
3492 count, runlist_entry[0], runlist_entry[1]);
3493 runlist_entry += runlist_entry_words;
3494 count++;
3495 (*entries_left)--;
3496
3497 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
3498 /* add runnable channels bound to this TSG */
3499 nvgpu_list_for_each_entry(ch, &tsg->ch_list,
3500 channel_gk20a, ch_entry) {
3501 if (!test_bit((int)ch->chid,
3502 runlist->active_channels)) {
3503 continue;
3504 }
3505
3506 if (*entries_left == 0U) {
3507 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
3508 return NULL;
3509 }
3510
3511 nvgpu_log_info(g, "add channel %d to runlist",
3512 ch->chid);
3513 f->g->ops.fifo.get_ch_runlist_entry(ch, runlist_entry);
3514 nvgpu_log_info(g,
3515 "run list count %d runlist [0] %x [1] %x\n",
3516 count, runlist_entry[0], runlist_entry[1]);
3517 count++;
3518 runlist_entry += runlist_entry_words;
3519 (*entries_left)--;
3520 }
3521 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
3522 }
3523
3524 /* append entries from higher level if this level is empty */
3525 if (!count && !last_level) {
3526 runlist_entry = gk20a_runlist_construct_locked(f,
3527 runlist,
3528 cur_level + 1,
3529 runlist_entry,
3530 interleave_enabled,
3531 true,
3532 entries_left);
3533 }
3534
3535 /*
3536 * if previous and this level have entries, append
3537 * entries from higher level.
3538 *
3539 * ex. dropping from MEDIUM to LOW, need to insert HIGH
3540 */
3541 if (interleave_enabled && count && !prev_empty && !last_level) {
3542 runlist_entry = gk20a_runlist_construct_locked(f,
3543 runlist,
3544 cur_level + 1,
3545 runlist_entry,
3546 interleave_enabled,
3547 false,
3548 entries_left);
3549 }
3550 return runlist_entry;
3551}
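
/*
 * Illustrative ordering produced by the recursion above (a sketch, assuming
 * one TSG per interleave level): with interleaving enabled, higher levels
 * are emitted before and between lower-level entries, so starting from the
 * LOW level the runlist comes out roughly as
 *
 *	HIGH, MEDIUM, HIGH, LOW
 *
 * With interleaving disabled, each higher level is emitted only once
 * (skip_next), giving HIGH, MEDIUM, LOW.
 */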
3552
3553int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
3554 u32 id,
3555 u32 runlist_id,
3556 u32 new_level)
3557{
3558 nvgpu_log_fn(g, " ");
3559
3560 g->fifo.tsg[id].interleave_level = new_level;
3561
3562 return 0;
3563}
3564
3565int gk20a_fifo_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice)
3566{
3567 struct gk20a *g = tsg->g;
3568
3569 if (timeslice < g->min_timeslice_us ||
3570 timeslice > g->max_timeslice_us) {
3571 return -EINVAL;
3572 }
3573
3574 gk20a_channel_get_timescale_from_timeslice(g, timeslice,
3575 &tsg->timeslice_timeout, &tsg->timeslice_scale);
3576
3577 tsg->timeslice_us = timeslice;
3578
3579 return g->ops.fifo.update_runlist(g, tsg->runlist_id, ~0, true, true);
3580}
3581
3582void gk20a_fifo_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
3583 u32 count, u32 buffer_index)
3584{
3585 struct fifo_runlist_info_gk20a *runlist = NULL;
3586 u64 runlist_iova;
3587 u32 val_wrote;
3588 struct nvgpu_os_linux *l;
3589
3590 runlist = &g->fifo.runlist_info[runlist_id];
3591 runlist_iova = nvgpu_mem_get_addr(g, &runlist->mem[buffer_index]);
3592
3593
3594 if (count != 0) {
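		/* Debug dump: print the runlist buffer's kernel VA, physical,
		 * DMA and IOVA addresses, plus basic device state, before
		 * programming the runlist base register below. */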
3595 printk(KERN_INFO "Runlist base register: %0x\n", fifo_runlist_base_r());
3596 printk(KERN_INFO "Runlist KVA: %px\n", (void*)(runlist->mem[buffer_index].cpu_va));
3597 printk(KERN_INFO "Runlist PA: %px\n", (void*)virt_to_phys((runlist->mem[buffer_index].cpu_va)));
3598 printk(KERN_INFO "Runlist dma_address: %px\n", (void*)(runlist->mem[buffer_index].priv.sgt->sgl->dma_address));
3599 printk(KERN_INFO "Runlist pages KVA: %px\n", (void*)(runlist->mem[buffer_index].priv.pages));
3600 printk(KERN_INFO "Runlist pages PA: %px\n", (void*)virt_to_phys(runlist->mem[buffer_index].priv.pages));
3602 printk(KERN_INFO "Runlist page_to_phys %px + offset %px\n", (void*)(page_to_phys(sg_page(runlist->mem[buffer_index].priv.sgt->sgl))), (void*)(runlist->mem[buffer_index].priv.sgt->sgl->offset));
3603 printk(KERN_INFO "Runlist IOVA: %px\n", (void*)runlist_iova);
3604 printk(KERN_INFO "Using struct gk20* %px\n", g);
3605 printk(KERN_INFO "g->name: %s, g->power_on: %d, g->sw_ready: %d, g->is_virtual %d\n", g->name, g->power_on, g->sw_ready, g->is_virtual);
3606 printk(KERN_INFO "COHERENT_SYSMEM? %d, iommuable? %d\n", nvgpu_is_enabled(g, NVGPU_USE_COHERENT_SYSMEM), nvgpu_iommuable(g));
3607 l = container_of(g, struct nvgpu_os_linux, g);
3608 printk(KERN_INFO "l->regs %px\n", l->regs);
3609 gk20a_writel(g, fifo_runlist_base_r(),
3610 fifo_runlist_base_ptr_f(u64_lo32(runlist_iova >> 12)) |
3611 nvgpu_aperture_mask(g, &runlist->mem[buffer_index],
3612 fifo_runlist_base_target_sys_mem_ncoh_f(),
3613 fifo_runlist_base_target_sys_mem_coh_f(),
3614 fifo_runlist_base_target_vid_mem_f()));
3615 val_wrote = nvgpu_readl(g, 0x2270);
3616 printk(KERN_INFO "Wrote runlist base as %0llx\n", (u64)(val_wrote & 0x0fffffff) << 12);
3617 }
3618
3619 gk20a_writel(g, fifo_runlist_r(),
3620 fifo_runlist_engine_f(runlist_id) |
3621 fifo_eng_runlist_length_f(count));
3622}
3623
3624int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
3625 u32 chid, bool add,
3626 bool wait_for_finish)
3627{
3628 int ret = 0;
3629 struct fifo_gk20a *f = &g->fifo;
3630 struct fifo_runlist_info_gk20a *runlist = NULL;
3631 u32 *runlist_entry_base = NULL;
3632 u64 runlist_iova;
3633 u32 new_buf;
3634 struct channel_gk20a *ch = NULL;
3635 struct tsg_gk20a *tsg = NULL;
3636 u32 runlist_entry_words = f->runlist_entry_size / sizeof(u32);
3637
3638 runlist = &f->runlist_info[runlist_id];
3639
3640 /* valid channel, add/remove it from active list.
3641 Otherwise, keep active list untouched for suspend/resume. */
3642 if (chid != FIFO_INVAL_CHANNEL_ID) {
3643 ch = &f->channel[chid];
3644 tsg = tsg_gk20a_from_ch(ch);
3645
3646 if (add) {
3647 if (test_and_set_bit(chid,
3648 runlist->active_channels) == 1) {
3649 return 0;
3650 }
3651 if (tsg && ++tsg->num_active_channels) {
3652 set_bit((int)f->channel[chid].tsgid,
3653 runlist->active_tsgs);
3654 }
3655 } else {
3656 if (test_and_clear_bit(chid,
3657 runlist->active_channels) == 0) {
3658 return 0;
3659 }
3660 if (tsg && --tsg->num_active_channels == 0) {
3661 clear_bit((int)f->channel[chid].tsgid,
3662 runlist->active_tsgs);
3663 }
3664 }
3665 }
3666
3667 new_buf = !runlist->cur_buffer;
3668
3669 runlist_iova = nvgpu_mem_get_addr(g, &runlist->mem[new_buf]);
3670
3671 nvgpu_log_info(g, "runlist_id : %d, switch to new buffer 0x%16llx",
3672 runlist_id, (u64)runlist_iova);
3673
3674 if (!runlist_iova) {
3675 ret = -EINVAL;
3676 goto clean_up;
3677 }
3678
3679 runlist_entry_base = runlist->mem[new_buf].cpu_va;
3680 if (!runlist_entry_base) {
3681 ret = -ENOMEM;
3682 goto clean_up;
3683 }
3684
3685 if (chid != FIFO_INVAL_CHANNEL_ID || /* add/remove a valid channel */
3686 add /* resume to add all channels back */) {
3687 u32 max_entries = f->num_runlist_entries;
3688 u32 *runlist_end;
3689
3690 runlist_end = gk20a_runlist_construct_locked(f,
3691 runlist,
3692 0,
3693 runlist_entry_base,
3694 g->runlist_interleave,
3695 true,
3696 &max_entries);
3697 if (!runlist_end) {
3698 ret = -E2BIG;
3699 goto clean_up;
3700 }
3701 runlist->count = (runlist_end - runlist_entry_base) /
3702 runlist_entry_words;
3703 WARN_ON(runlist->count > f->num_runlist_entries);
3704 } else {
3705 /* suspend to remove all channels */
3706 runlist->count = 0;
3707 }
3708
3709 g->ops.fifo.runlist_hw_submit(g, runlist_id, runlist->count, new_buf);
3710
3711 if (wait_for_finish) {
3712 ret = g->ops.fifo.runlist_wait_pending(g, runlist_id);
3713
3714 if (ret == -ETIMEDOUT) {
3715 nvgpu_err(g, "runlist %d update timeout", runlist_id);
3716 /* trigger runlist update timeout recovery */
3717 return ret;
3718
3719 } else if (ret == -EINTR) {
3720 nvgpu_err(g, "runlist update interrupted");
3721 }
3722 }
3723
3724 runlist->cur_buffer = new_buf;
3725
3726clean_up:
3727 return ret;
3728}
3729
3730int gk20a_fifo_update_runlist_ids(struct gk20a *g, u32 runlist_ids, u32 chid,
3731 bool add, bool wait_for_finish)
3732{
3733 u32 ret = -EINVAL;
3734 u32 runlist_id = 0;
3735 u32 errcode;
3736 unsigned long ulong_runlist_ids = (unsigned long)runlist_ids;
3737
3738 if (!g) {
3739 goto end;
3740 }
3741
3742 ret = 0;
3743 for_each_set_bit(runlist_id, &ulong_runlist_ids, 32) {
3744 /* Capture the last failure error code */
3745 errcode = g->ops.fifo.update_runlist(g, runlist_id, chid, add, wait_for_finish);
3746 if (errcode) {
3747 nvgpu_err(g,
3748 "failed to update_runlist %d %d", runlist_id, errcode);
3749 ret = errcode;
3750 }
3751 }
3752end:
3753 return ret;
3754}
3755
3756/* trigger host preempt of GR pending load ctx if that ctx is not for ch */
3757static int __locked_fifo_reschedule_preempt_next(struct channel_gk20a *ch,
3758 bool wait_preempt)
3759{
3760 struct gk20a *g = ch->g;
3761 struct fifo_runlist_info_gk20a *runlist =
3762 &g->fifo.runlist_info[ch->runlist_id];
3763 int ret = 0;
3764 u32 gr_eng_id = 0;
3765 u32 engstat = 0, ctxstat = 0, fecsstat0 = 0, fecsstat1 = 0;
3766 u32 preempt_id;
3767 u32 preempt_type = 0;
3768
3769 if (1 != gk20a_fifo_get_engine_ids(
3770 g, &gr_eng_id, 1, ENGINE_GR_GK20A)) {
3771 return ret;
3772 }
3773 if (!(runlist->eng_bitmask & (1 << gr_eng_id))) {
3774 return ret;
3775 }
3776
3777 if (wait_preempt && gk20a_readl(g, fifo_preempt_r()) &
3778 fifo_preempt_pending_true_f()) {
3779 return ret;
3780 }
3781
3782 fecsstat0 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(0));
3783 engstat = gk20a_readl(g, fifo_engine_status_r(gr_eng_id));
3784 ctxstat = fifo_engine_status_ctx_status_v(engstat);
3785 if (ctxstat == fifo_engine_status_ctx_status_ctxsw_switch_v()) {
3786 /* host switching to next context, preempt that if needed */
3787 preempt_id = fifo_engine_status_next_id_v(engstat);
3788 preempt_type = fifo_engine_status_next_id_type_v(engstat);
3789 } else {
3790 return ret;
3791 }
3792 if (preempt_id == ch->tsgid && preempt_type) {
3793 return ret;
3794 }
3795 fecsstat1 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(0));
3796 if (fecsstat0 != FECS_MAILBOX_0_ACK_RESTORE ||
3797 fecsstat1 != FECS_MAILBOX_0_ACK_RESTORE) {
3798 /* preempt useless if FECS acked save and started restore */
3799 return ret;
3800 }
3801
3802 gk20a_fifo_issue_preempt(g, preempt_id, preempt_type);
3803#ifdef TRACEPOINTS_ENABLED
3804 trace_gk20a_reschedule_preempt_next(ch->chid, fecsstat0, engstat,
3805 fecsstat1, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(0)),
3806 gk20a_readl(g, fifo_preempt_r()));
3807#endif
3808 if (wait_preempt) {
3809 g->ops.fifo.is_preempt_pending(g, preempt_id, preempt_type);
3810 }
3811#ifdef TRACEPOINTS_ENABLED
3812 trace_gk20a_reschedule_preempted_next(ch->chid);
3813#endif
3814 return ret;
3815}
3816
3817int gk20a_fifo_reschedule_runlist(struct channel_gk20a *ch, bool preempt_next)
3818{
3819 return nvgpu_fifo_reschedule_runlist(ch, preempt_next, true);
3820}
3821
3822/* trigger host to expire current timeslice and reschedule runlist from front */
3823int nvgpu_fifo_reschedule_runlist(struct channel_gk20a *ch, bool preempt_next,
3824 bool wait_preempt)
3825{
3826 struct gk20a *g = ch->g;
3827 struct fifo_runlist_info_gk20a *runlist;
3828 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
3829 u32 mutex_ret;
3830 int ret = 0;
3831
3832 runlist = &g->fifo.runlist_info[ch->runlist_id];
3833 if (!nvgpu_mutex_tryacquire(&runlist->runlist_lock)) {
3834 return -EBUSY;
3835 }
3836
3837 mutex_ret = nvgpu_pmu_mutex_acquire(
3838 &g->pmu, PMU_MUTEX_ID_FIFO, &token);
3839
3840 g->ops.fifo.runlist_hw_submit(
3841 g, ch->runlist_id, runlist->count, runlist->cur_buffer);
3842
3843 if (preempt_next) {
3844 __locked_fifo_reschedule_preempt_next(ch, wait_preempt);
3845 }
3846
3847 gk20a_fifo_runlist_wait_pending(g, ch->runlist_id);
3848
3849 if (!mutex_ret) {
3850 nvgpu_pmu_mutex_release(
3851 &g->pmu, PMU_MUTEX_ID_FIFO, &token);
3852 }
3853 nvgpu_mutex_release(&runlist->runlist_lock);
3854
3855 return ret;
3856}
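
/*
 * Descriptive note (no new behavior): nvgpu_fifo_reschedule_runlist() simply
 * resubmits the current runlist buffer so host re-evaluates it from the
 * front, and with preempt_next it additionally preempts whatever context
 * host is switching in on GR, skipping the preempt when it would be useless
 * (the next context already belongs to the caller's TSG, or FECS has already
 * acked the save and started the restore).
 */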
3857
3858/* add/remove a channel from runlist
3859 special cases below: runlist->active_channels will NOT be changed.
3860 (chid == ~0 && !add) means remove all active channels from runlist.
3861 (chid == ~0 && add) means restore all active channels on runlist. */
3862int gk20a_fifo_update_runlist(struct gk20a *g, u32 runlist_id, u32 chid,
3863 bool add, bool wait_for_finish)
3864{
3865 struct fifo_runlist_info_gk20a *runlist = NULL;
3866 struct fifo_gk20a *f = &g->fifo;
3867 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
3868 u32 mutex_ret;
3869 int ret = 0;
3870
3871 nvgpu_log_fn(g, " ");
3872
3873 runlist = &f->runlist_info[runlist_id];
3874
3875 nvgpu_mutex_acquire(&runlist->runlist_lock);
3876
3877 mutex_ret = nvgpu_pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
3878
3879 ret = gk20a_fifo_update_runlist_locked(g, runlist_id, chid, add,
3880 wait_for_finish);
3881
3882 if (!mutex_ret) {
3883 nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
3884 }
3885
3886 nvgpu_mutex_release(&runlist->runlist_lock);
3887
3888 if (ret == -ETIMEDOUT) {
3889 gk20a_fifo_runlist_reset_engines(g, runlist_id);
3890 }
3891
3892 return ret;
3893}
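
/*
 * Illustrative usage of the special cases documented above (a sketch, not
 * a call site in this file):
 *
 *	gk20a_fifo_update_runlist(g, rlid, FIFO_INVAL_CHANNEL_ID, false, true);
 *		removes all active channels (suspend path)
 *	gk20a_fifo_update_runlist(g, rlid, FIFO_INVAL_CHANNEL_ID, true, true);
 *		restores all active channels (resume path)
 *
 * In both cases runlist->active_channels itself is left untouched.
 */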
3894
3895int gk20a_fifo_suspend(struct gk20a *g)
3896{
3897 nvgpu_log_fn(g, " ");
3898
3899 /* stop bar1 snooping */
3900 if (g->ops.mm.is_bar1_supported(g)) {
3901 gk20a_writel(g, fifo_bar1_base_r(),
3902 fifo_bar1_base_valid_false_f());
3903 }
3904
3905 /* disable fifo intr */
3906 gk20a_writel(g, fifo_intr_en_0_r(), 0);
3907 gk20a_writel(g, fifo_intr_en_1_r(), 0);
3908
3909 nvgpu_log_fn(g, "done");
3910 return 0;
3911}
3912
3913bool gk20a_fifo_mmu_fault_pending(struct gk20a *g)
3914{
3915 if (gk20a_readl(g, fifo_intr_0_r()) &
3916 fifo_intr_0_mmu_fault_pending_f()) {
3917 return true;
3918 } else {
3919 return false;
3920 }
3921}
3922
3923bool gk20a_fifo_is_engine_busy(struct gk20a *g)
3924{
3925 u32 i, host_num_engines;
3926
3927 host_num_engines = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_ENGINES);
3928
3929 for (i = 0; i < host_num_engines; i++) {
3930 u32 status = gk20a_readl(g, fifo_engine_status_r(i));
3931 if (fifo_engine_status_engine_v(status) ==
3932 fifo_engine_status_engine_busy_v()) {
3933 return true;
3934 }
3935 }
3936 return false;
3937}
3938
3939int gk20a_fifo_wait_engine_idle(struct gk20a *g)
3940{
3941 struct nvgpu_timeout timeout;
3942 unsigned long delay = GR_IDLE_CHECK_DEFAULT;
3943 int ret = -ETIMEDOUT;
3944 u32 i, host_num_engines;
3945
3946 nvgpu_log_fn(g, " ");
3947
3948 host_num_engines =
3949 nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_ENGINES);
3950
3951 nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
3952 NVGPU_TIMER_CPU_TIMER);
3953
3954 for (i = 0; i < host_num_engines; i++) {
3955 do {
3956 u32 status = gk20a_readl(g, fifo_engine_status_r(i));
3957 if (!fifo_engine_status_engine_v(status)) {
3958 ret = 0;
3959 break;
3960 }
3961
3962 nvgpu_usleep_range(delay, delay * 2);
3963 delay = min_t(unsigned long,
3964 delay << 1, GR_IDLE_CHECK_MAX);
3965 } while (!nvgpu_timeout_expired(&timeout));
3966
3967 if (ret) {
3968 nvgpu_log_info(g, "cannot idle engine %u", i);
3969 break;
3970 }
3971 }
3972
3973 nvgpu_log_fn(g, "done");
3974
3975 return ret;
3976}
3977
3978u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g)
3979{
3980 return pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f();
3981}
3982
3983static const char * const ccsr_chan_status_str[] = {
3984 "idle",
3985 "pending",
3986 "pending_ctx_reload",
3987 "pending_acquire",
3988 "pending_acq_ctx_reload",
3989 "on_pbdma",
3990 "on_pbdma_and_eng",
3991 "on_eng",
3992 "on_eng_pending_acquire",
3993 "on_eng_pending",
3994 "on_pbdma_ctx_reload",
3995 "on_pbdma_and_eng_ctx_reload",
3996 "on_eng_ctx_reload",
3997 "on_eng_pending_ctx_reload",
3998 "on_eng_pending_acq_ctx_reload",
3999};
4000
4001static const char * const pbdma_chan_eng_ctx_status_str[] = {
4002 "invalid",
4003 "valid",
4004 "NA",
4005 "NA",
4006 "NA",
4007 "load",
4008 "save",
4009 "switch",
4010};
4011
4012static const char * const not_found_str[] = {
4013 "NOT FOUND"
4014};
4015
4016const char *gk20a_decode_ccsr_chan_status(u32 index)
4017{
4018 if (index >= ARRAY_SIZE(ccsr_chan_status_str)) {
4019 return not_found_str[0];
4020 } else {
4021 return ccsr_chan_status_str[index];
4022 }
4023}
4024
4025const char *gk20a_decode_pbdma_chan_eng_ctx_status(u32 index)
4026{
4027 if (index >= ARRAY_SIZE(pbdma_chan_eng_ctx_status_str)) {
4028 return not_found_str[0];
4029 } else {
4030 return pbdma_chan_eng_ctx_status_str[index];
4031 }
4032}
4033
4034bool gk20a_fifo_channel_status_is_next(struct gk20a *g, u32 chid)
4035{
4036 u32 channel = gk20a_readl(g, ccsr_channel_r(chid));
4037
4038 return ccsr_channel_next_v(channel) == ccsr_channel_next_true_v();
4039}
4040
4041bool gk20a_fifo_channel_status_is_ctx_reload(struct gk20a *g, u32 chid)
4042{
4043 u32 channel = gk20a_readl(g, ccsr_channel_r(chid));
4044 u32 status = ccsr_channel_status_v(channel);
4045
4046 return (status == ccsr_channel_status_pending_ctx_reload_v() ||
4047 status == ccsr_channel_status_pending_acq_ctx_reload_v() ||
4048 status == ccsr_channel_status_on_pbdma_ctx_reload_v() ||
4049 status == ccsr_channel_status_on_pbdma_and_eng_ctx_reload_v() ||
4050 status == ccsr_channel_status_on_eng_ctx_reload_v() ||
4051 status == ccsr_channel_status_on_eng_pending_ctx_reload_v() ||
4052 status == ccsr_channel_status_on_eng_pending_acq_ctx_reload_v());
4053}
4054
4055void gk20a_dump_channel_status_ramfc(struct gk20a *g,
4056 struct gk20a_debug_output *o,
4057 u32 chid,
4058 struct ch_state *ch_state)
4059{
4060 u32 channel = gk20a_readl(g, ccsr_channel_r(chid));
4061 u32 status = ccsr_channel_status_v(channel);
4062 u32 syncpointa, syncpointb;
4063 u32 *inst_mem;
4064 struct channel_gk20a *c = g->fifo.channel + chid;
4065 struct nvgpu_semaphore_int *hw_sema = NULL;
4066
4067 if (c->hw_sema) {
4068 hw_sema = c->hw_sema;
4069 }
4070
4071 if (!ch_state) {
4072 return;
4073 }
4074
4075 inst_mem = &ch_state->inst_block[0];
4076
4077 syncpointa = inst_mem[ram_fc_syncpointa_w()];
4078 syncpointb = inst_mem[ram_fc_syncpointb_w()];
4079
4080 gk20a_debug_output(o, "%d-%s, pid %d, refs %d%s: ", chid,
4081 g->name,
4082 ch_state->pid,
4083 ch_state->refs,
4084 ch_state->deterministic ? ", deterministic" : "");
4085 gk20a_debug_output(o, "channel status: %s in use %s %s\n",
4086 ccsr_channel_enable_v(channel) ? "" : "not",
4087 gk20a_decode_ccsr_chan_status(status),
4088 ccsr_channel_busy_v(channel) ? "busy" : "not busy");
4089 gk20a_debug_output(o, "RAMFC : TOP: %016llx PUT: %016llx GET: %016llx "
4090 "FETCH: %016llx\nHEADER: %08x COUNT: %08x\n"
4091 "SYNCPOINT %08x %08x SEMAPHORE %08x %08x %08x %08x\n",
4092 (u64)inst_mem[ram_fc_pb_top_level_get_w()] +
4093 ((u64)inst_mem[ram_fc_pb_top_level_get_hi_w()] << 32ULL),
4094 (u64)inst_mem[ram_fc_pb_put_w()] +
4095 ((u64)inst_mem[ram_fc_pb_put_hi_w()] << 32ULL),
4096 (u64)inst_mem[ram_fc_pb_get_w()] +
4097 ((u64)inst_mem[ram_fc_pb_get_hi_w()] << 32ULL),
4098 (u64)inst_mem[ram_fc_pb_fetch_w()] +
4099 ((u64)inst_mem[ram_fc_pb_fetch_hi_w()] << 32ULL),
4100 inst_mem[ram_fc_pb_header_w()],
4101 inst_mem[ram_fc_pb_count_w()],
4102 syncpointa,
4103 syncpointb,
4104 inst_mem[ram_fc_semaphorea_w()],
4105 inst_mem[ram_fc_semaphoreb_w()],
4106 inst_mem[ram_fc_semaphorec_w()],
4107 inst_mem[ram_fc_semaphored_w()]);
4108 if (hw_sema) {
4109 gk20a_debug_output(o, "SEMA STATE: value: 0x%08x "
4110 "next_val: 0x%08x addr: 0x%010llx\n",
4111 __nvgpu_semaphore_read(hw_sema),
4112 nvgpu_atomic_read(&hw_sema->next_value),
4113 nvgpu_hw_sema_addr(hw_sema));
4114 }
4115
4116#ifdef CONFIG_TEGRA_GK20A_NVHOST
4117 if ((pbdma_syncpointb_op_v(syncpointb) == pbdma_syncpointb_op_wait_v())
4118 && (pbdma_syncpointb_wait_switch_v(syncpointb) ==
4119 pbdma_syncpointb_wait_switch_en_v()))
4120 gk20a_debug_output(o, "%s on syncpt %u (%s) val %u\n",
4121 (status == 3 || status == 8) ? "Waiting" : "Waited",
4122 pbdma_syncpointb_syncpt_index_v(syncpointb),
4123 nvgpu_nvhost_syncpt_get_name(g->nvhost_dev,
4124 pbdma_syncpointb_syncpt_index_v(syncpointb)),
4125 pbdma_syncpointa_payload_v(syncpointa));
4126#endif
4127
4128 gk20a_debug_output(o, "\n");
4129}
4130
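/*
 * The dump below works from a snapshot: channel refs are taken and
 * per-channel buffers allocated first, each instance block is copied out
 * while the ref is held, and the output is formatted afterwards from the
 * copies.
 */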
4131void gk20a_debug_dump_all_channel_status_ramfc(struct gk20a *g,
4132 struct gk20a_debug_output *o)
4133{
4134 struct fifo_gk20a *f = &g->fifo;
4135 u32 chid;
4136 struct ch_state **ch_state;
4137
4138 ch_state = nvgpu_kzalloc(g, sizeof(*ch_state) * f->num_channels);
4139 if (!ch_state) {
4140 gk20a_debug_output(o, "cannot alloc memory for channels\n");
4141 return;
4142 }
4143
4144 for (chid = 0; chid < f->num_channels; chid++) {
4145 struct channel_gk20a *ch = gk20a_channel_from_id(g, chid);
4146 if (ch != NULL) {
4147 ch_state[chid] =
4148 nvgpu_kmalloc(g, sizeof(struct ch_state) +
4149 ram_in_alloc_size_v());
4150 			/* the channel ref taken above is kept for the
4151 			 * loop below when the alloc succeeds */
4152 if (!ch_state[chid]) {
4153 gk20a_channel_put(ch);
4154 }
4155 }
4156 }
4157
4158 for (chid = 0; chid < f->num_channels; chid++) {
4159 struct channel_gk20a *ch = &f->channel[chid];
4160 if (!ch_state[chid]) {
4161 continue;
4162 }
4163
4164 ch_state[chid]->pid = ch->pid;
4165 ch_state[chid]->refs = nvgpu_atomic_read(&ch->ref_count);
4166 ch_state[chid]->deterministic = ch->deterministic;
4167 nvgpu_mem_rd_n(g, &ch->inst_block, 0,
4168 &ch_state[chid]->inst_block[0],
4169 ram_in_alloc_size_v());
4170 gk20a_channel_put(ch);
4171 }
4172 for (chid = 0; chid < f->num_channels; chid++) {
4173 if (ch_state[chid]) {
4174 g->ops.fifo.dump_channel_status_ramfc(g, o, chid,
4175 ch_state[chid]);
4176 nvgpu_kfree(g, ch_state[chid]);
4177 }
4178 }
4179 nvgpu_kfree(g, ch_state);
4180}
4181
4182void gk20a_dump_pbdma_status(struct gk20a *g,
4183 struct gk20a_debug_output *o)
4184{
4185 u32 i, host_num_pbdma;
4186
4187 host_num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA);
4188
4189 for (i = 0; i < host_num_pbdma; i++) {
4190 u32 status = gk20a_readl(g, fifo_pbdma_status_r(i));
4191 u32 chan_status = fifo_pbdma_status_chan_status_v(status);
4192
4193 gk20a_debug_output(o, "%s pbdma %d: ", g->name, i);
4194 gk20a_debug_output(o,
4195 "id: %d (%s), next_id: %d (%s) chan status: %s\n",
4196 fifo_pbdma_status_id_v(status),
4197 fifo_pbdma_status_id_type_v(status) ?
4198 "tsg" : "channel",
4199 fifo_pbdma_status_next_id_v(status),
4200 fifo_pbdma_status_next_id_type_v(status) ?
4201 "tsg" : "channel",
4202 gk20a_decode_pbdma_chan_eng_ctx_status(chan_status));
4203 gk20a_debug_output(o, "PBDMA_PUT: %016llx PBDMA_GET: %016llx "
4204 "GP_PUT: %08x GP_GET: %08x "
4205 "FETCH: %08x HEADER: %08x\n"
4206 "HDR: %08x SHADOW0: %08x SHADOW1: %08x",
4207 (u64)gk20a_readl(g, pbdma_put_r(i)) +
4208 ((u64)gk20a_readl(g, pbdma_put_hi_r(i)) << 32ULL),
4209 (u64)gk20a_readl(g, pbdma_get_r(i)) +
4210 ((u64)gk20a_readl(g, pbdma_get_hi_r(i)) << 32ULL),
4211 gk20a_readl(g, pbdma_gp_put_r(i)),
4212 gk20a_readl(g, pbdma_gp_get_r(i)),
4213 gk20a_readl(g, pbdma_gp_fetch_r(i)),
4214 gk20a_readl(g, pbdma_pb_header_r(i)),
4215 gk20a_readl(g, pbdma_hdr_shadow_r(i)),
4216 gk20a_readl(g, pbdma_gp_shadow_0_r(i)),
4217 gk20a_readl(g, pbdma_gp_shadow_1_r(i)));
4218 }
4219 gk20a_debug_output(o, "\n");
4220}
4221
4222void gk20a_dump_eng_status(struct gk20a *g,
4223 struct gk20a_debug_output *o)
4224{
4225 u32 i, host_num_engines;
4226
4227 host_num_engines = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_ENGINES);
4228
4229 for (i = 0; i < host_num_engines; i++) {
4230 u32 status = gk20a_readl(g, fifo_engine_status_r(i));
4231 u32 ctx_status = fifo_engine_status_ctx_status_v(status);
4232
4233 gk20a_debug_output(o, "%s eng %d: ", g->name, i);
4234 gk20a_debug_output(o,
4235 "id: %d (%s), next_id: %d (%s), ctx status: %s ",
4236 fifo_engine_status_id_v(status),
4237 fifo_engine_status_id_type_v(status) ?
4238 "tsg" : "channel",
4239 fifo_engine_status_next_id_v(status),
4240 fifo_engine_status_next_id_type_v(status) ?
4241 "tsg" : "channel",
4242 gk20a_decode_pbdma_chan_eng_ctx_status(ctx_status));
4243
4244 if (fifo_engine_status_faulted_v(status)) {
4245 gk20a_debug_output(o, "faulted ");
4246 }
4247 if (fifo_engine_status_engine_v(status)) {
4248 gk20a_debug_output(o, "busy ");
4249 }
4250 gk20a_debug_output(o, "\n");
4251 }
4252 gk20a_debug_output(o, "\n");
4253}
4254
4255void gk20a_fifo_enable_channel(struct channel_gk20a *ch)
4256{
4257 gk20a_writel(ch->g, ccsr_channel_r(ch->chid),
4258 gk20a_readl(ch->g, ccsr_channel_r(ch->chid)) |
4259 ccsr_channel_enable_set_true_f());
4260}
4261
4262void gk20a_fifo_disable_channel(struct channel_gk20a *ch)
4263{
4264 gk20a_writel(ch->g, ccsr_channel_r(ch->chid),
4265 gk20a_readl(ch->g,
4266 ccsr_channel_r(ch->chid)) |
4267 ccsr_channel_enable_clr_true_f());
4268}
4269
4270void gk20a_fifo_channel_unbind(struct channel_gk20a *ch_gk20a)
4271{
4272 struct gk20a *g = ch_gk20a->g;
4273
4274 nvgpu_log_fn(g, " ");
4275
4276 if (nvgpu_atomic_cmpxchg(&ch_gk20a->bound, true, false)) {
4277 gk20a_writel(g, ccsr_channel_inst_r(ch_gk20a->chid),
4278 ccsr_channel_inst_ptr_f(0) |
4279 ccsr_channel_inst_bind_false_f());
4280 }
4281}
4282
4283static int gk20a_fifo_commit_userd(struct channel_gk20a *c)
4284{
4285 u32 addr_lo;
4286 u32 addr_hi;
4287 struct gk20a *g = c->g;
4288
4289 nvgpu_log_fn(g, " ");
4290
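	/*
	 * The USERD pointer is written as two RAMFC words: the low word
	 * carries the aperture target plus the base-shifted address, the
	 * high word the upper 32 bits of the IOVA.
	 */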
4291 addr_lo = u64_lo32(c->userd_iova >> ram_userd_base_shift_v());
4292 addr_hi = u64_hi32(c->userd_iova);
4293
4294 nvgpu_log_info(g, "channel %d : set ramfc userd 0x%16llx",
4295 c->chid, (u64)c->userd_iova);
4296
4297 nvgpu_mem_wr32(g, &c->inst_block,
4298 ram_in_ramfc_w() + ram_fc_userd_w(),
4299 nvgpu_aperture_mask(g, &g->fifo.userd,
4300 pbdma_userd_target_sys_mem_ncoh_f(),
4301 pbdma_userd_target_sys_mem_coh_f(),
4302 pbdma_userd_target_vid_mem_f()) |
4303 pbdma_userd_addr_f(addr_lo));
4304
4305 nvgpu_mem_wr32(g, &c->inst_block,
4306 ram_in_ramfc_w() + ram_fc_userd_hi_w(),
4307 pbdma_userd_hi_addr_f(addr_hi));
4308
4309 return 0;
4310}
4311
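/*
 * Programs the channel's RAMFC: GPFIFO base and size, PBDMA signature and
 * formats, default runlist/PB timeslices and the acquire timeout, then
 * commits the USERD pointer via gk20a_fifo_commit_userd().
 */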
4312int gk20a_fifo_setup_ramfc(struct channel_gk20a *c,
4313 u64 gpfifo_base, u32 gpfifo_entries,
4314 unsigned long timeout,
4315 u32 flags)
4316{
4317 struct gk20a *g = c->g;
4318 struct nvgpu_mem *mem = &c->inst_block;
4319
4320 nvgpu_log_fn(g, " ");
4321
4322 nvgpu_memset(g, mem, 0, 0, ram_fc_size_val_v());
4323
4324 nvgpu_mem_wr32(g, mem, ram_fc_gp_base_w(),
4325 pbdma_gp_base_offset_f(
4326 u64_lo32(gpfifo_base >> pbdma_gp_base_rsvd_s())));
4327
4328 nvgpu_mem_wr32(g, mem, ram_fc_gp_base_hi_w(),
4329 pbdma_gp_base_hi_offset_f(u64_hi32(gpfifo_base)) |
4330 pbdma_gp_base_hi_limit2_f(ilog2(gpfifo_entries)));
4331
4332 nvgpu_mem_wr32(g, mem, ram_fc_signature_w(),
4333 c->g->ops.fifo.get_pbdma_signature(c->g));
4334
4335 nvgpu_mem_wr32(g, mem, ram_fc_formats_w(),
4336 pbdma_formats_gp_fermi0_f() |
4337 pbdma_formats_pb_fermi1_f() |
4338 pbdma_formats_mp_fermi0_f());
4339
4340 nvgpu_mem_wr32(g, mem, ram_fc_pb_header_w(),
4341 pbdma_pb_header_priv_user_f() |
4342 pbdma_pb_header_method_zero_f() |
4343 pbdma_pb_header_subchannel_zero_f() |
4344 pbdma_pb_header_level_main_f() |
4345 pbdma_pb_header_first_true_f() |
4346 pbdma_pb_header_type_inc_f());
4347
4348 nvgpu_mem_wr32(g, mem, ram_fc_subdevice_w(),
4349 pbdma_subdevice_id_f(1) |
4350 pbdma_subdevice_status_active_f() |
4351 pbdma_subdevice_channel_dma_enable_f());
4352
4353 nvgpu_mem_wr32(g, mem, ram_fc_target_w(), pbdma_target_engine_sw_f());
4354
4355 nvgpu_mem_wr32(g, mem, ram_fc_acquire_w(),
4356 g->ops.fifo.pbdma_acquire_val(timeout));
4357
4358 nvgpu_mem_wr32(g, mem, ram_fc_runlist_timeslice_w(),
4359 fifo_runlist_timeslice_timeout_128_f() |
4360 fifo_runlist_timeslice_timescale_3_f() |
4361 fifo_runlist_timeslice_enable_true_f());
4362
4363 nvgpu_mem_wr32(g, mem, ram_fc_pb_timeslice_w(),
4364 fifo_pb_timeslice_timeout_16_f() |
4365 fifo_pb_timeslice_timescale_0_f() |
4366 fifo_pb_timeslice_enable_true_f());
4367
4368 nvgpu_mem_wr32(g, mem, ram_fc_chid_w(), ram_fc_chid_id_f(c->chid));
4369
4370 if (c->is_privileged_channel) {
4371 gk20a_fifo_setup_ramfc_for_privileged_channel(c);
4372 }
4373
4374 return gk20a_fifo_commit_userd(c);
4375}
4376
4377void gk20a_fifo_setup_ramfc_for_privileged_channel(struct channel_gk20a *c)
4378{
4379 struct gk20a *g = c->g;
4380 struct nvgpu_mem *mem = &c->inst_block;
4381
4382 nvgpu_log_info(g, "channel %d : set ramfc privileged_channel", c->chid);
4383
4384 /* Enable HCE priv mode for phys mode transfer */
4385 nvgpu_mem_wr32(g, mem, ram_fc_hce_ctrl_w(),
4386 pbdma_hce_ctrl_hce_priv_mode_yes_f());
4387}
4388
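/*
 * Clears the channel's USERD words (GP/PB get/put pointers and related
 * fields). Channels set up for usermode submit use their own USERD
 * allocation; others use their slot in the global USERD region.
 */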
4389int gk20a_fifo_setup_userd(struct channel_gk20a *c)
4390{
4391 struct gk20a *g = c->g;
4392 struct nvgpu_mem *mem;
4393 u32 offset;
4394
4395 nvgpu_log_fn(g, " ");
4396
4397 if (nvgpu_mem_is_valid(&c->usermode_userd)) {
4398 mem = &c->usermode_userd;
4399 offset = 0;
4400 } else {
4401 mem = &g->fifo.userd;
4402 offset = c->chid * g->fifo.userd_entry_size / sizeof(u32);
4403 }
4404
4405 nvgpu_mem_wr32(g, mem, offset + ram_userd_put_w(), 0);
4406 nvgpu_mem_wr32(g, mem, offset + ram_userd_get_w(), 0);
4407 nvgpu_mem_wr32(g, mem, offset + ram_userd_ref_w(), 0);
4408 nvgpu_mem_wr32(g, mem, offset + ram_userd_put_hi_w(), 0);
4409 nvgpu_mem_wr32(g, mem, offset + ram_userd_ref_threshold_w(), 0);
4410 nvgpu_mem_wr32(g, mem, offset + ram_userd_gp_top_level_get_w(), 0);
4411 nvgpu_mem_wr32(g, mem, offset + ram_userd_gp_top_level_get_hi_w(), 0);
4412 nvgpu_mem_wr32(g, mem, offset + ram_userd_get_hi_w(), 0);
4413 nvgpu_mem_wr32(g, mem, offset + ram_userd_gp_get_w(), 0);
4414 nvgpu_mem_wr32(g, mem, offset + ram_userd_gp_put_w(), 0);
4415
4416 return 0;
4417}
4418
4419int gk20a_fifo_alloc_inst(struct gk20a *g, struct channel_gk20a *ch)
4420{
4421 int err;
4422
4423 nvgpu_log_fn(g, " ");
4424
4425 err = g->ops.mm.alloc_inst_block(g, &ch->inst_block);
4426 if (err) {
4427 return err;
4428 }
4429
4430 nvgpu_log_info(g, "channel %d inst block physical addr: 0x%16llx",
4431 ch->chid, nvgpu_inst_block_addr(g, &ch->inst_block));
4432
4433 nvgpu_log_fn(g, "done");
4434 return 0;
4435}
4436
4437void gk20a_fifo_free_inst(struct gk20a *g, struct channel_gk20a *ch)
4438{
4439 nvgpu_free_inst_block(g, &ch->inst_block);
4440}
4441
4442u32 gk20a_fifo_userd_gp_get(struct gk20a *g, struct channel_gk20a *c)
4443{
4444 return gk20a_bar1_readl(g,
4445 c->userd_gpu_va + sizeof(u32) * ram_userd_gp_get_w());
4446}
4447
4448u64 gk20a_fifo_userd_pb_get(struct gk20a *g, struct channel_gk20a *c)
4449{
4450 u32 lo = gk20a_bar1_readl(g,
4451 c->userd_gpu_va + sizeof(u32) * ram_userd_get_w());
4452 u32 hi = gk20a_bar1_readl(g,
4453 c->userd_gpu_va + sizeof(u32) * ram_userd_get_hi_w());
4454
4455 return ((u64)hi << 32) | lo;
4456}
4457
4458void gk20a_fifo_userd_gp_put(struct gk20a *g, struct channel_gk20a *c)
4459{
4460 gk20a_bar1_writel(g,
4461 c->userd_gpu_va + sizeof(u32) * ram_userd_gp_put_w(),
4462 c->gpfifo.put);
4463}
4464
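/*
 * The acquire timeout is encoded as mantissa * 2^exp in units of 1024 ns.
 * The requested timeout is first scaled to 80% of the channel watchdog
 * value; if it does not fit in the 16-bit mantissa it is shifted into the
 * exponent, clamping at the hardware maximum.
 */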
4465u32 gk20a_fifo_pbdma_acquire_val(u64 timeout)
4466{
4467 u32 val, exp, man;
4468 unsigned int val_len;
4469
4470 val = pbdma_acquire_retry_man_2_f() |
4471 pbdma_acquire_retry_exp_2_f();
4472
4473 if (!timeout) {
4474 return val;
4475 }
4476
4477 timeout *= 80UL;
4478 do_div(timeout, 100); /* set acquire timeout to 80% of channel wdt */
4479 timeout *= 1000000UL; /* ms -> ns */
4480 do_div(timeout, 1024); /* in unit of 1024ns */
4481 val_len = fls(timeout >> 32) + 32;
4482 if (val_len == 32) {
4483 val_len = fls(timeout);
4484 }
4485 	if (val_len > 16U + pbdma_acquire_timeout_exp_max_v()) { /* mantissa: 16 bits */
4486 exp = pbdma_acquire_timeout_exp_max_v();
4487 man = pbdma_acquire_timeout_man_max_v();
4488 } else if (val_len > 16) {
4489 exp = val_len - 16;
4490 man = timeout >> exp;
4491 } else {
4492 exp = 0;
4493 man = timeout;
4494 }
4495
4496 val |= pbdma_acquire_timeout_exp_f(exp) |
4497 pbdma_acquire_timeout_man_f(man) |
4498 pbdma_acquire_timeout_en_enable_f();
4499
4500 return val;
4501}
4502
4503const char *gk20a_fifo_interleave_level_name(u32 interleave_level)
4504{
4505 switch (interleave_level) {
4506 case NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW:
4507 return "LOW";
4508
4509 case NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_MEDIUM:
4510 return "MEDIUM";
4511
4512 case NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH:
4513 return "HIGH";
4514
4515 default:
4516 return "?";
4517 }
4518}
4519
4520u32 gk20a_fifo_get_sema_wait_cmd_size(void)
4521{
4522 return 8;
4523}
4524
4525u32 gk20a_fifo_get_sema_incr_cmd_size(void)
4526{
4527 return 10;
4528}
4529
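/*
 * Emits the semaphore GPU VA via the SEMAPHORE_A/B methods, then either an
 * acquire (acq_geq with channel switch enabled) or a release (optionally
 * without WFI) followed by a non-stall interrupt. The resulting sizes match
 * gk20a_fifo_get_sema_wait_cmd_size() (8 words) and
 * gk20a_fifo_get_sema_incr_cmd_size() (10 words) above.
 */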
4530void gk20a_fifo_add_sema_cmd(struct gk20a *g,
4531 struct nvgpu_semaphore *s, u64 sema_va,
4532 struct priv_cmd_entry *cmd,
4533 u32 off, bool acquire, bool wfi)
4534{
4535 nvgpu_log_fn(g, " ");
4536
4537 /* semaphore_a */
4538 nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010004);
4539 /* offset_upper */
4540 nvgpu_mem_wr32(g, cmd->mem, off++, (sema_va >> 32) & 0xff);
4541 /* semaphore_b */
4542 nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010005);
4543 /* offset */
4544 nvgpu_mem_wr32(g, cmd->mem, off++, sema_va & 0xffffffff);
4545
4546 if (acquire) {
4547 /* semaphore_c */
4548 nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010006);
4549 /* payload */
4550 nvgpu_mem_wr32(g, cmd->mem, off++,
4551 nvgpu_semaphore_get_value(s));
4552 /* semaphore_d */
4553 nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010007);
4554 /* operation: acq_geq, switch_en */
4555 nvgpu_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12));
4556 } else {
4557 /* semaphore_c */
4558 nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010006);
4559 /* payload */
4560 nvgpu_mem_wr32(g, cmd->mem, off++,
4561 nvgpu_semaphore_get_value(s));
4562 /* semaphore_d */
4563 nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010007);
4564 /* operation: release, wfi */
4565 nvgpu_mem_wr32(g, cmd->mem, off++,
4566 0x2 | ((wfi ? 0x0 : 0x1) << 20));
4567 /* non_stall_int */
4568 nvgpu_mem_wr32(g, cmd->mem, off++, 0x20010008);
4569 /* ignored */
4570 nvgpu_mem_wr32(g, cmd->mem, off++, 0);
4571 }
4572}
4573
4574#ifdef CONFIG_TEGRA_GK20A_NVHOST
4575void gk20a_fifo_add_syncpt_wait_cmd(struct gk20a *g,
4576 struct priv_cmd_entry *cmd, u32 off,
4577 u32 id, u32 thresh, u64 gpu_va)
4578{
4579 nvgpu_log_fn(g, " ");
4580
4581 off = cmd->off + off;
4582 /* syncpoint_a */
4583 nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001C);
4584 /* payload */
4585 nvgpu_mem_wr32(g, cmd->mem, off++, thresh);
4586 /* syncpoint_b */
4587 nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001D);
4588 /* syncpt_id, switch_en, wait */
4589 nvgpu_mem_wr32(g, cmd->mem, off++, (id << 8) | 0x10);
4590}
4591
4592u32 gk20a_fifo_get_syncpt_wait_cmd_size(void)
4593{
4594 return 4;
4595}
4596
4597u32 gk20a_fifo_get_syncpt_incr_per_release(void)
4598{
4599 return 2;
4600}
4601
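/*
 * Each release increments the syncpoint twice (matching
 * gk20a_fifo_get_syncpt_incr_per_release()); with the optional WFI pair
 * prepended this gives the 6- or 8-word sizes reported by
 * gk20a_fifo_get_syncpt_incr_cmd_size().
 */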
4602void gk20a_fifo_add_syncpt_incr_cmd(struct gk20a *g,
4603 bool wfi_cmd, struct priv_cmd_entry *cmd,
4604 u32 id, u64 gpu_va)
4605{
4606 u32 off = cmd->off;
4607
4608 nvgpu_log_fn(g, " ");
4609 if (wfi_cmd) {
4610 /* wfi */
4611 nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001E);
4612 /* handle, ignored */
4613 nvgpu_mem_wr32(g, cmd->mem, off++, 0x00000000);
4614 }
4615 /* syncpoint_a */
4616 nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001C);
4617 /* payload, ignored */
4618 nvgpu_mem_wr32(g, cmd->mem, off++, 0);
4619 /* syncpoint_b */
4620 nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001D);
4621 /* syncpt_id, incr */
4622 nvgpu_mem_wr32(g, cmd->mem, off++, (id << 8) | 0x1);
4623 /* syncpoint_b */
4624 nvgpu_mem_wr32(g, cmd->mem, off++, 0x2001001D);
4625 /* syncpt_id, incr */
4626 nvgpu_mem_wr32(g, cmd->mem, off++, (id << 8) | 0x1);
4627
4628}
4629
4630u32 gk20a_fifo_get_syncpt_incr_cmd_size(bool wfi_cmd)
4631{
4632 if (wfi_cmd)
4633 return 8;
4634 else
4635 return 6;
4636}
4637
4638void gk20a_fifo_free_syncpt_buf(struct channel_gk20a *c,
4639 struct nvgpu_mem *syncpt_buf)
4640{
4641
4642}
4643
4644int gk20a_fifo_alloc_syncpt_buf(struct channel_gk20a *c,
4645 u32 syncpt_id, struct nvgpu_mem *syncpt_buf)
4646{
4647 return 0;
4648}
4649#endif
diff --git a/include/gk20a/fifo_gk20a.h b/include/gk20a/fifo_gk20a.h
new file mode 100644
index 0000000..26365ca
--- /dev/null
+++ b/include/gk20a/fifo_gk20a.h
@@ -0,0 +1,471 @@
1/*
2 * GK20A graphics fifo (gr host)
3 *
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24#ifndef FIFO_GK20A_H
25#define FIFO_GK20A_H
26
27#include <nvgpu/kref.h>
28
29struct gk20a_debug_output;
30struct mmu_fault_info;
31struct nvgpu_semaphore;
32struct channel_gk20a;
33struct tsg_gk20a;
34
35enum {
36 NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW = 0,
37 NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_MEDIUM,
38 NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_HIGH,
39 NVGPU_FIFO_RUNLIST_INTERLEAVE_NUM_LEVELS,
40};
41
42#define MAX_RUNLIST_BUFFERS 2
43
44#define FIFO_INVAL_ENGINE_ID ((u32)~0)
45#define FIFO_INVAL_CHANNEL_ID ((u32)~0)
46#define FIFO_INVAL_TSG_ID ((u32)~0)
47#define FIFO_INVAL_RUNLIST_ID ((u32)~0)
48
49#define ID_TYPE_CHANNEL 0
50#define ID_TYPE_TSG 1
51#define ID_TYPE_UNKNOWN ((u32)~0)
52
53#define RC_YES 1
54#define RC_NO 0
55
56#define GRFIFO_TIMEOUT_CHECK_PERIOD_US 100000
57
58#define RC_TYPE_NO_RC 0
59#define RC_TYPE_MMU_FAULT 1
60#define RC_TYPE_PBDMA_FAULT 2
61#define RC_TYPE_GR_FAULT 3
62#define RC_TYPE_PREEMPT_TIMEOUT 4
63#define RC_TYPE_CTXSW_TIMEOUT 5
64#define RC_TYPE_RUNLIST_UPDATE_TIMEOUT 6
65#define RC_TYPE_FORCE_RESET 7
66#define RC_TYPE_SCHED_ERR 8
67
68#define NVGPU_FIFO_DEFAULT_TIMESLICE_TIMEOUT 128UL
69#define NVGPU_FIFO_DEFAULT_TIMESLICE_SCALE 3UL
70
71/*
72 * Number of entries in the kickoff latency buffer used to build the
73 * profiling histogram. The value is chosen to be statistically
74 * significant for a histogram with 5% steps.
75 */
76#ifdef CONFIG_DEBUG_FS
77#define FIFO_PROFILING_ENTRIES 16384
78#endif
79
80#define RUNLIST_DISABLED 0
81#define RUNLIST_ENABLED 1
82
83/* generally corresponds to the "pbdma" engine */
84
85struct fifo_runlist_info_gk20a {
86 unsigned long *active_channels;
87 unsigned long *active_tsgs;
88 /* Each engine has its own SW and HW runlist buffer.*/
89 struct nvgpu_mem mem[MAX_RUNLIST_BUFFERS];
90 u32 cur_buffer;
91 u32 total_entries;
92 u32 pbdma_bitmask; /* pbdmas supported for this runlist*/
93 u32 eng_bitmask; /* engines using this runlist */
94 u32 reset_eng_bitmask; /* engines to be reset during recovery */
95 u32 count; /* cached runlist_hw_submit parameter */
96 bool stopped;
97 bool support_tsg;
98 /* protect ch/tsg/runlist preempt & runlist update */
99 struct nvgpu_mutex runlist_lock;
100};
101
102enum {
103 ENGINE_GR_GK20A = 0U,
104 ENGINE_GRCE_GK20A = 1U,
105 ENGINE_ASYNC_CE_GK20A = 2U,
106 ENGINE_INVAL_GK20A = 3U,
107};
108
109struct fifo_pbdma_exception_info_gk20a {
110 u32 status_r; /* raw register value from hardware */
111 u32 id, next_id;
112 u32 chan_status_v; /* raw value from hardware */
113 bool id_is_chid, next_id_is_chid;
114 bool chsw_in_progress;
115};
116
117struct fifo_engine_exception_info_gk20a {
118 u32 status_r; /* raw register value from hardware */
119 u32 id, next_id;
120 u32 ctx_status_v; /* raw value from hardware */
121 bool id_is_chid, next_id_is_chid;
122 bool faulted, idle, ctxsw_in_progress;
123};
124
125struct fifo_engine_info_gk20a {
126 u32 engine_id;
127 u32 runlist_id;
128 u32 intr_mask;
129 u32 reset_mask;
130 u32 pbdma_id;
131 u32 inst_id;
132 u32 pri_base;
133 u32 fault_id;
134 u32 engine_enum;
135 struct fifo_pbdma_exception_info_gk20a pbdma_exception_info;
136 struct fifo_engine_exception_info_gk20a engine_exception_info;
137};
138
139enum {
140 PROFILE_IOCTL_ENTRY = 0U,
141 PROFILE_ENTRY,
142 PROFILE_JOB_TRACKING,
143 PROFILE_APPEND,
144 PROFILE_END,
145 PROFILE_IOCTL_EXIT,
146 PROFILE_MAX
147};
148
149struct fifo_profile_gk20a {
150 u64 timestamp[PROFILE_MAX];
151};
152
153struct fifo_gk20a {
154 struct gk20a *g;
155 unsigned int num_channels;
156 unsigned int runlist_entry_size;
157 unsigned int num_runlist_entries;
158
159 unsigned int num_pbdma;
160 u32 *pbdma_map;
161
162 struct fifo_engine_info_gk20a *engine_info;
163 u32 max_engines;
164 u32 num_engines;
165 u32 *active_engines_list;
166
167 struct fifo_runlist_info_gk20a *runlist_info;
168 u32 max_runlists;
169#ifdef CONFIG_DEBUG_FS
170 struct {
171 struct fifo_profile_gk20a *data;
172 nvgpu_atomic_t get;
173 bool enabled;
174 u64 *sorted;
175 struct nvgpu_ref ref;
176 struct nvgpu_mutex lock;
177 } profile;
178#endif
179 struct nvgpu_mem userd;
180 u32 userd_entry_size;
181
182 unsigned int used_channels;
183 struct channel_gk20a *channel;
184 /* zero-kref'd channels here */
185 struct nvgpu_list_node free_chs;
186 struct nvgpu_mutex free_chs_mutex;
187 struct nvgpu_mutex engines_reset_mutex;
188
189 struct tsg_gk20a *tsg;
190 struct nvgpu_mutex tsg_inuse_mutex;
191
192 void (*remove_support)(struct fifo_gk20a *);
193 bool sw_ready;
194 struct {
195 /* share info between isrs and non-isr code */
196 struct {
197 struct nvgpu_mutex mutex;
198 } isr;
199 struct {
200 u32 device_fatal_0;
201 u32 channel_fatal_0;
202 u32 restartable_0;
203 } pbdma;
204 struct {
205
206 } engine;
207
208
209 } intr;
210
211 unsigned long deferred_fault_engines;
212 bool deferred_reset_pending;
213 struct nvgpu_mutex deferred_reset_mutex;
214
215 u32 max_subctx_count;
216 u32 channel_base;
217};
218
219struct ch_state {
220 int pid;
221 int refs;
222 bool deterministic;
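	/* followed by a copy of the channel's instance block,
	 * allocated with ram_in_alloc_size_v() extra bytes */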
223 u32 inst_block[0];
224};
225
226int gk20a_init_fifo_support(struct gk20a *g);
227
228int gk20a_init_fifo_setup_hw(struct gk20a *g);
229
230void gk20a_fifo_isr(struct gk20a *g);
231u32 gk20a_fifo_nonstall_isr(struct gk20a *g);
232
233int gk20a_fifo_preempt_channel(struct gk20a *g, struct channel_gk20a *ch);
234int gk20a_fifo_preempt_tsg(struct gk20a *g, struct tsg_gk20a *tsg);
235int gk20a_fifo_preempt(struct gk20a *g, struct channel_gk20a *ch);
236
237int gk20a_fifo_enable_engine_activity(struct gk20a *g,
238 struct fifo_engine_info_gk20a *eng_info);
239int gk20a_fifo_enable_all_engine_activity(struct gk20a *g);
240int gk20a_fifo_disable_engine_activity(struct gk20a *g,
241 struct fifo_engine_info_gk20a *eng_info,
242 bool wait_for_idle);
243int gk20a_fifo_disable_all_engine_activity(struct gk20a *g,
244 bool wait_for_idle);
245void gk20a_fifo_enable_tsg_sched(struct gk20a *g, struct tsg_gk20a *tsg);
246void gk20a_fifo_disable_tsg_sched(struct gk20a *g, struct tsg_gk20a *tsg);
247
248u32 gk20a_fifo_engines_on_ch(struct gk20a *g, u32 chid);
249
250int gk20a_fifo_reschedule_runlist(struct channel_gk20a *ch, bool preempt_next);
251int nvgpu_fifo_reschedule_runlist(struct channel_gk20a *ch, bool preempt_next,
252 bool wait_preempt);
253
254int gk20a_fifo_update_runlist(struct gk20a *g, u32 engine_id, u32 chid,
255 bool add, bool wait_for_finish);
256
257int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
258 u32 chid, bool add,
259 bool wait_for_finish);
260int gk20a_fifo_suspend(struct gk20a *g);
261
262bool gk20a_fifo_mmu_fault_pending(struct gk20a *g);
263
264void gk20a_fifo_recover(struct gk20a *g,
265 u32 engine_ids, /* if zero, will be queried from HW */
266 u32 hw_id, /* if ~0, will be queried from HW */
267 bool id_is_tsg, /* ignored if hw_id == ~0 */
268 bool id_is_known, bool verbose, int rc_type);
269void gk20a_fifo_recover_ch(struct gk20a *g, struct channel_gk20a *ch,
270 bool verbose, u32 rc_type);
271void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg,
272 bool verbose, u32 rc_type);
273int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch,
274 u32 err_code, bool verbose);
275void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id);
276int gk20a_init_fifo_reset_enable_hw(struct gk20a *g);
277int gk20a_fifo_tsg_unbind_channel(struct channel_gk20a *ch);
278
279void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g,
280 unsigned long fault_id);
281int gk20a_fifo_wait_engine_idle(struct gk20a *g);
282bool gk20a_fifo_is_engine_busy(struct gk20a *g);
283u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g);
284u32 gk20a_fifo_act_eng_interrupt_mask(struct gk20a *g, u32 act_eng_id);
285u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g);
286u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
287 int *__id, bool *__is_tsg);
288void gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
289 struct tsg_gk20a *tsg);
290void gk20a_fifo_abort_tsg(struct gk20a *g, struct tsg_gk20a *tsg, bool preempt);
291void gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
292 struct channel_gk20a *refch);
293bool gk20a_fifo_error_tsg(struct gk20a *g, struct tsg_gk20a *tsg);
294bool gk20a_fifo_error_ch(struct gk20a *g, struct channel_gk20a *refch);
295
296void gk20a_fifo_issue_preempt(struct gk20a *g, u32 id, bool is_tsg);
297int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
298 u32 id,
299 u32 runlist_id,
300 u32 new_level);
301int gk20a_fifo_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice);
302
303const char *gk20a_fifo_interleave_level_name(u32 interleave_level);
304
305int gk20a_fifo_engine_enum_from_type(struct gk20a *g, u32 engine_type,
306 u32 *inst_id);
307
308u32 gk20a_fifo_get_engine_ids(struct gk20a *g, u32 engine_id[],
309 u32 engine_id_sz, u32 engine_enum);
310
311void gk20a_fifo_delete_runlist(struct fifo_gk20a *f);
312
313struct fifo_engine_info_gk20a *gk20a_fifo_get_engine_info(struct gk20a *g,
314 u32 engine_id);
315
316bool gk20a_fifo_is_valid_engine_id(struct gk20a *g, u32 engine_id);
317
318u32 gk20a_fifo_get_gr_engine_id(struct gk20a *g);
319
320int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch);
321
322u32 gk20a_fifo_get_all_ce_engine_reset_mask(struct gk20a *g);
323
324u32 gk20a_fifo_get_fast_ce_runlist_id(struct gk20a *g);
325
326u32 gk20a_fifo_get_gr_runlist_id(struct gk20a *g);
327
328bool gk20a_fifo_is_valid_runlist_id(struct gk20a *g, u32 runlist_id);
329
330int gk20a_fifo_update_runlist_ids(struct gk20a *g, u32 runlist_ids, u32 chid,
331 bool add, bool wait_for_finish);
332
333int gk20a_fifo_init_engine_info(struct fifo_gk20a *f);
334
335void gk20a_get_tsg_runlist_entry(struct tsg_gk20a *tsg, u32 *runlist);
336void gk20a_get_ch_runlist_entry(struct channel_gk20a *ch, u32 *runlist);
337void gk20a_fifo_set_runlist_state(struct gk20a *g, u32 runlists_mask,
338 u32 runlist_state);
339
340u32 gk20a_fifo_userd_gp_get(struct gk20a *g, struct channel_gk20a *c);
341void gk20a_fifo_userd_gp_put(struct gk20a *g, struct channel_gk20a *c);
342u64 gk20a_fifo_userd_pb_get(struct gk20a *g, struct channel_gk20a *c);
343
344bool gk20a_is_fault_engine_subid_gpc(struct gk20a *g, u32 engine_subid);
345#ifdef CONFIG_DEBUG_FS
346struct fifo_profile_gk20a *gk20a_fifo_profile_acquire(struct gk20a *g);
347void gk20a_fifo_profile_release(struct gk20a *g,
348 struct fifo_profile_gk20a *profile);
349void gk20a_fifo_profile_snapshot(struct fifo_profile_gk20a *profile, int idx);
350#else
351static inline struct fifo_profile_gk20a *
352gk20a_fifo_profile_acquire(struct gk20a *g)
353{
354 return NULL;
355}
356static inline void gk20a_fifo_profile_release(struct gk20a *g,
357 struct fifo_profile_gk20a *profile)
358{
359}
360static inline void gk20a_fifo_profile_snapshot(
361 struct fifo_profile_gk20a *profile, int idx)
362{
363}
364#endif
365
366void gk20a_dump_channel_status_ramfc(struct gk20a *g,
367 struct gk20a_debug_output *o,
368 u32 chid,
369 struct ch_state *ch_state);
370void gk20a_debug_dump_all_channel_status_ramfc(struct gk20a *g,
371 struct gk20a_debug_output *o);
372void gk20a_dump_pbdma_status(struct gk20a *g,
373 struct gk20a_debug_output *o);
374void gk20a_dump_eng_status(struct gk20a *g,
375 struct gk20a_debug_output *o);
376const char *gk20a_decode_ccsr_chan_status(u32 index);
377const char *gk20a_decode_pbdma_chan_eng_ctx_status(u32 index);
378void gk20a_fifo_enable_channel(struct channel_gk20a *ch);
379void gk20a_fifo_disable_channel(struct channel_gk20a *ch);
380
381bool gk20a_fifo_channel_status_is_next(struct gk20a *g, u32 chid);
382bool gk20a_fifo_channel_status_is_ctx_reload(struct gk20a *g, u32 chid);
383int gk20a_fifo_tsg_unbind_channel_verify_status(struct channel_gk20a *ch);
384
385struct channel_gk20a *gk20a_refch_from_inst_ptr(struct gk20a *g, u64 inst_ptr);
386void gk20a_fifo_channel_unbind(struct channel_gk20a *ch_gk20a);
387
388u32 gk20a_fifo_intr_0_error_mask(struct gk20a *g);
389
390int gk20a_fifo_is_preempt_pending(struct gk20a *g, u32 id,
391 unsigned int id_type);
392int __locked_fifo_preempt(struct gk20a *g, u32 id, bool is_tsg);
393void gk20a_fifo_preempt_timeout_rc_tsg(struct gk20a *g, struct tsg_gk20a *tsg);
394void gk20a_fifo_preempt_timeout_rc(struct gk20a *g, struct channel_gk20a *ch);
395int gk20a_fifo_setup_ramfc(struct channel_gk20a *c,
396 u64 gpfifo_base, u32 gpfifo_entries,
397 unsigned long timeout, u32 flags);
398void gk20a_fifo_setup_ramfc_for_privileged_channel(struct channel_gk20a *c);
399int gk20a_fifo_alloc_inst(struct gk20a *g, struct channel_gk20a *ch);
400void gk20a_fifo_free_inst(struct gk20a *g, struct channel_gk20a *ch);
401int gk20a_fifo_setup_userd(struct channel_gk20a *c);
402u32 gk20a_fifo_pbdma_acquire_val(u64 timeout);
403
404
405u32 *gk20a_runlist_construct_locked(struct fifo_gk20a *f,
406 struct fifo_runlist_info_gk20a *runlist,
407 u32 cur_level,
408 u32 *runlist_entry,
409 bool interleave_enabled,
410 bool prev_empty,
411 u32 *entries_left);
412void gk20a_fifo_runlist_hw_submit(struct gk20a *g, u32 runlist_id,
413 u32 count, u32 buffer_index);
414int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id);
415int gk20a_init_fifo_setup_sw_common(struct gk20a *g);
416int gk20a_init_fifo_setup_sw(struct gk20a *g);
417void gk20a_fifo_handle_runlist_event(struct gk20a *g);
418bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id,
419 u32 engine_subid, bool fake_fault);
420
421void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids,
422 u32 hw_id, unsigned int id_type, unsigned int rc_type,
423 struct mmu_fault_info *mmfault);
424
425bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch,
426 bool *verbose, u32 *ms);
427bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
428 bool *verbose, u32 *ms);
429void gk20a_fifo_teardown_mask_intr(struct gk20a *g);
430void gk20a_fifo_teardown_unmask_intr(struct gk20a *g);
431bool gk20a_fifo_handle_sched_error(struct gk20a *g);
432
433void gk20a_fifo_reset_pbdma_method(struct gk20a *g, int pbdma_id,
434 int pbdma_method_index);
435unsigned int gk20a_fifo_handle_pbdma_intr_0(struct gk20a *g, u32 pbdma_id,
436 u32 pbdma_intr_0, u32 *handled, u32 *error_notifier);
437unsigned int gk20a_fifo_handle_pbdma_intr_1(struct gk20a *g, u32 pbdma_id,
438 u32 pbdma_intr_1, u32 *handled, u32 *error_notifier);
439u32 gk20a_fifo_handle_pbdma_intr(struct gk20a *g, struct fifo_gk20a *f,
440 u32 pbdma_id, unsigned int rc);
441
442u32 gk20a_fifo_default_timeslice_us(struct gk20a *g);
443
444#ifdef CONFIG_TEGRA_GK20A_NVHOST
445void gk20a_fifo_add_syncpt_wait_cmd(struct gk20a *g,
446 struct priv_cmd_entry *cmd, u32 off,
447 u32 id, u32 thresh, u64 gpu_va);
448u32 gk20a_fifo_get_syncpt_wait_cmd_size(void);
449u32 gk20a_fifo_get_syncpt_incr_per_release(void);
450void gk20a_fifo_add_syncpt_incr_cmd(struct gk20a *g,
451 bool wfi_cmd, struct priv_cmd_entry *cmd,
452 u32 id, u64 gpu_va);
453u32 gk20a_fifo_get_syncpt_incr_cmd_size(bool wfi_cmd);
454void gk20a_fifo_free_syncpt_buf(struct channel_gk20a *c,
455 struct nvgpu_mem *syncpt_buf);
456int gk20a_fifo_alloc_syncpt_buf(struct channel_gk20a *c,
457 u32 syncpt_id, struct nvgpu_mem *syncpt_buf);
458#endif
459
460void gk20a_fifo_get_mmu_fault_info(struct gk20a *g, u32 mmu_fault_id,
461 struct mmu_fault_info *mmfault);
462void gk20a_fifo_get_mmu_fault_desc(struct mmu_fault_info *mmfault);
463void gk20a_fifo_get_mmu_fault_client_desc(struct mmu_fault_info *mmfault);
464void gk20a_fifo_get_mmu_fault_gpc_desc(struct mmu_fault_info *mmfault);
465u32 gk20a_fifo_get_sema_wait_cmd_size(void);
466u32 gk20a_fifo_get_sema_incr_cmd_size(void);
467void gk20a_fifo_add_sema_cmd(struct gk20a *g,
468 struct nvgpu_semaphore *s, u64 sema_va,
469 struct priv_cmd_entry *cmd,
470 u32 off, bool acquire, bool wfi);
471#endif /* FIFO_GK20A_H */
diff --git a/include/gk20a/flcn_gk20a.c b/include/gk20a/flcn_gk20a.c
new file mode 100644
index 0000000..fdcaef9
--- /dev/null
+++ b/include/gk20a/flcn_gk20a.c
@@ -0,0 +1,759 @@
1/*
2 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22#include <nvgpu/falcon.h>
23#include <nvgpu/pmu.h>
24#include <nvgpu/io.h>
25
26#include "gk20a/gk20a.h"
27#include "gk20a/flcn_gk20a.h"
28
29#include <nvgpu/hw/gm20b/hw_falcon_gm20b.h>
30
31static int gk20a_flcn_reset(struct nvgpu_falcon *flcn)
32{
33 struct gk20a *g = flcn->g;
34 u32 base_addr = flcn->flcn_base;
35 u32 unit_status = 0;
36 int status = 0;
37
38 if (flcn->flcn_engine_dep_ops.reset_eng) {
39 /* falcon & engine reset */
40 status = flcn->flcn_engine_dep_ops.reset_eng(g);
41 } else {
42 /* do falcon CPU hard reset */
43 unit_status = gk20a_readl(g, base_addr +
44 falcon_falcon_cpuctl_r());
45 gk20a_writel(g, base_addr + falcon_falcon_cpuctl_r(),
46 (unit_status | falcon_falcon_cpuctl_hreset_f(1)));
47 }
48
49 return status;
50}
51
52static bool gk20a_flcn_clear_halt_interrupt_status(struct nvgpu_falcon *flcn)
53{
54 struct gk20a *g = flcn->g;
55 u32 base_addr = flcn->flcn_base;
56 u32 data = 0;
57 bool status = false;
58
59 gk20a_writel(g, base_addr + falcon_falcon_irqsclr_r(),
60 gk20a_readl(g, base_addr + falcon_falcon_irqsclr_r()) |
61 (0x10));
62 data = gk20a_readl(g, (base_addr + falcon_falcon_irqstat_r()));
63
64 if ((data & falcon_falcon_irqstat_halt_true_f()) !=
65 falcon_falcon_irqstat_halt_true_f()) {
66		/* halt irq is clear */
67 status = true;
68 }
69
70 return status;
71}
72
73static void gk20a_flcn_set_irq(struct nvgpu_falcon *flcn, bool enable)
74{
75 struct gk20a *g = flcn->g;
76 u32 base_addr = flcn->flcn_base;
77
78 if (!flcn->is_interrupt_enabled) {
79 nvgpu_warn(g, "Interrupt not supported on flcn 0x%x ",
80 flcn->flcn_id);
81 /* Keep interrupt disabled */
82 enable = false;
83 }
84
85 if (enable) {
86 gk20a_writel(g, base_addr + falcon_falcon_irqmset_r(),
87 flcn->intr_mask);
88 gk20a_writel(g, base_addr + falcon_falcon_irqdest_r(),
89 flcn->intr_dest);
90 } else {
91 gk20a_writel(g, base_addr + falcon_falcon_irqmclr_r(),
92 0xffffffff);
93 }
94}
95
96static bool gk20a_is_falcon_cpu_halted(struct nvgpu_falcon *flcn)
97{
98 struct gk20a *g = flcn->g;
99 u32 base_addr = flcn->flcn_base;
100
101 return (gk20a_readl(g, base_addr + falcon_falcon_cpuctl_r()) &
102 falcon_falcon_cpuctl_halt_intr_m() ?
103 true : false);
104}
105
106static bool gk20a_is_falcon_idle(struct nvgpu_falcon *flcn)
107{
108 struct gk20a *g = flcn->g;
109 u32 base_addr = flcn->flcn_base;
110 u32 unit_status = 0;
111 bool status = false;
112
113 unit_status = gk20a_readl(g,
114 base_addr + falcon_falcon_idlestate_r());
115
116 if (falcon_falcon_idlestate_falcon_busy_v(unit_status) == 0 &&
117 falcon_falcon_idlestate_ext_busy_v(unit_status) == 0) {
118 status = true;
119 } else {
120 status = false;
121 }
122
123 return status;
124}
125
126static bool gk20a_is_falcon_scrubbing_done(struct nvgpu_falcon *flcn)
127{
128 struct gk20a *g = flcn->g;
129 u32 base_addr = flcn->flcn_base;
130 u32 unit_status = 0;
131 bool status = false;
132
133 unit_status = gk20a_readl(g,
134 base_addr + falcon_falcon_dmactl_r());
135
136 if (unit_status & (falcon_falcon_dmactl_dmem_scrubbing_m() |
137 falcon_falcon_dmactl_imem_scrubbing_m())) {
138 status = false;
139 } else {
140 status = true;
141 }
142
143 return status;
144}
145
146static u32 gk20a_falcon_get_mem_size(struct nvgpu_falcon *flcn,
147 enum flcn_mem_type mem_type)
148{
149 struct gk20a *g = flcn->g;
150 u32 mem_size = 0;
151 u32 hw_cfg_reg = gk20a_readl(g,
152 flcn->flcn_base + falcon_falcon_hwcfg_r());
153
154 if (mem_type == MEM_DMEM) {
155 mem_size = falcon_falcon_hwcfg_dmem_size_v(hw_cfg_reg)
156 << GK20A_PMU_DMEM_BLKSIZE2;
157 } else {
158 mem_size = falcon_falcon_hwcfg_imem_size_v(hw_cfg_reg)
159 << GK20A_PMU_DMEM_BLKSIZE2;
160 }
161
162 return mem_size;
163}
164
165static int flcn_mem_overflow_check(struct nvgpu_falcon *flcn,
166 u32 offset, u32 size, enum flcn_mem_type mem_type)
167{
168 struct gk20a *g = flcn->g;
169 u32 mem_size = 0;
170
171 if (size == 0) {
172 nvgpu_err(g, "size is zero");
173 return -EINVAL;
174 }
175
176 if (offset & 0x3) {
177 nvgpu_err(g, "offset (0x%08x) not 4-byte aligned", offset);
178 return -EINVAL;
179 }
180
181 mem_size = gk20a_falcon_get_mem_size(flcn, mem_type);
182 if (!(offset <= mem_size && (offset + size) <= mem_size)) {
183 nvgpu_err(g, "flcn-id 0x%x, copy overflow ",
184 flcn->flcn_id);
185 nvgpu_err(g, "total size 0x%x, offset 0x%x, copy size 0x%x",
186 mem_size, offset, size);
187 return -EINVAL;
188 }
189
190 return 0;
191}
192
193static int gk20a_flcn_copy_from_dmem(struct nvgpu_falcon *flcn,
194 u32 src, u8 *dst, u32 size, u8 port)
195{
196 struct gk20a *g = flcn->g;
197 u32 base_addr = flcn->flcn_base;
198 u32 i, words, bytes;
199 u32 data, addr_mask;
200 u32 *dst_u32 = (u32 *)dst;
201
202 nvgpu_log_fn(g, " src dmem offset - %x, size - %x", src, size);
203
204 if (flcn_mem_overflow_check(flcn, src, size, MEM_DMEM)) {
205 nvgpu_err(g, "incorrect parameters");
206 return -EINVAL;
207 }
208
209 nvgpu_mutex_acquire(&flcn->copy_lock);
210
211 words = size >> 2;
212 bytes = size & 0x3;
213
214 addr_mask = falcon_falcon_dmemc_offs_m() |
215 falcon_falcon_dmemc_blk_m();
216
217 src &= addr_mask;
218
219 gk20a_writel(g, base_addr + falcon_falcon_dmemc_r(port),
220 src | falcon_falcon_dmemc_aincr_f(1));
221
222 for (i = 0; i < words; i++) {
223 dst_u32[i] = gk20a_readl(g,
224 base_addr + falcon_falcon_dmemd_r(port));
225 }
226
227 if (bytes > 0) {
228 data = gk20a_readl(g, base_addr + falcon_falcon_dmemd_r(port));
229 for (i = 0; i < bytes; i++) {
230 dst[(words << 2) + i] = ((u8 *)&data)[i];
231 }
232 }
233
234 nvgpu_mutex_release(&flcn->copy_lock);
235 return 0;
236}
237
238static int gk20a_flcn_copy_to_dmem(struct nvgpu_falcon *flcn,
239 u32 dst, u8 *src, u32 size, u8 port)
240{
241 struct gk20a *g = flcn->g;
242 u32 base_addr = flcn->flcn_base;
243 u32 i, words, bytes;
244 u32 data, addr_mask;
245 u32 *src_u32 = (u32 *)src;
246
247 nvgpu_log_fn(g, "dest dmem offset - %x, size - %x", dst, size);
248
249 if (flcn_mem_overflow_check(flcn, dst, size, MEM_DMEM)) {
250 nvgpu_err(g, "incorrect parameters");
251 return -EINVAL;
252 }
253
254 nvgpu_mutex_acquire(&flcn->copy_lock);
255
256 words = size >> 2;
257 bytes = size & 0x3;
258
259 addr_mask = falcon_falcon_dmemc_offs_m() |
260 falcon_falcon_dmemc_blk_m();
261
262 dst &= addr_mask;
263
264 gk20a_writel(g, base_addr + falcon_falcon_dmemc_r(port),
265 dst | falcon_falcon_dmemc_aincw_f(1));
266
267 for (i = 0; i < words; i++) {
268 gk20a_writel(g,
269 base_addr + falcon_falcon_dmemd_r(port), src_u32[i]);
270 }
271
272 if (bytes > 0) {
273 data = 0;
274 for (i = 0; i < bytes; i++) {
275 ((u8 *)&data)[i] = src[(words << 2) + i];
276 }
277 gk20a_writel(g, base_addr + falcon_falcon_dmemd_r(port), data);
278 }
279
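	/*
	 * Sanity check: with auto-increment enabled, DMEMC should now point
	 * just past the last word written; warn if it does not.
	 */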
280 size = ALIGN(size, 4);
281 data = gk20a_readl(g,
282 base_addr + falcon_falcon_dmemc_r(port)) & addr_mask;
283 if (data != ((dst + size) & addr_mask)) {
284 nvgpu_warn(g, "copy failed. bytes written %d, expected %d",
285 data - dst, size);
286 }
287
288 nvgpu_mutex_release(&flcn->copy_lock);
289
290 return 0;
291}
292
293static int gk20a_flcn_copy_from_imem(struct nvgpu_falcon *flcn, u32 src,
294 u8 *dst, u32 size, u8 port)
295{
296 struct gk20a *g = flcn->g;
297 u32 base_addr = flcn->flcn_base;
298 u32 *dst_u32 = (u32 *)dst;
299 u32 words = 0;
300 u32 bytes = 0;
301 u32 data = 0;
302 u32 blk = 0;
303 u32 i = 0;
304
305 nvgpu_log_info(g, "download %d bytes from 0x%x", size, src);
306
307 if (flcn_mem_overflow_check(flcn, src, size, MEM_IMEM)) {
308 nvgpu_err(g, "incorrect parameters");
309 return -EINVAL;
310 }
311
312 nvgpu_mutex_acquire(&flcn->copy_lock);
313
314 words = size >> 2;
315 bytes = size & 0x3;
316 blk = src >> 8;
317
318 nvgpu_log_info(g, "download %d words from 0x%x block %d",
319 words, src, blk);
320
321 gk20a_writel(g, base_addr + falcon_falcon_imemc_r(port),
322 falcon_falcon_imemc_offs_f(src >> 2) |
323 falcon_falcon_imemc_blk_f(blk) |
324 falcon_falcon_dmemc_aincr_f(1));
325
326 for (i = 0; i < words; i++) {
327 dst_u32[i] = gk20a_readl(g,
328 base_addr + falcon_falcon_imemd_r(port));
329 }
330
331 if (bytes > 0) {
332 data = gk20a_readl(g, base_addr + falcon_falcon_imemd_r(port));
333 for (i = 0; i < bytes; i++) {
334 dst[(words << 2) + i] = ((u8 *)&data)[i];
335 }
336 }
337
338 nvgpu_mutex_release(&flcn->copy_lock);
339
340 return 0;
341}
342
343static int gk20a_flcn_copy_to_imem(struct nvgpu_falcon *flcn, u32 dst,
344 u8 *src, u32 size, u8 port, bool sec, u32 tag)
345{
346 struct gk20a *g = flcn->g;
347 u32 base_addr = flcn->flcn_base;
348 u32 *src_u32 = (u32 *)src;
349 u32 words = 0;
350 u32 blk = 0;
351 u32 i = 0;
352
353 nvgpu_log_info(g, "upload %d bytes to 0x%x", size, dst);
354
355 if (flcn_mem_overflow_check(flcn, dst, size, MEM_IMEM)) {
356 nvgpu_err(g, "incorrect parameters");
357 return -EINVAL;
358 }
359
360 nvgpu_mutex_acquire(&flcn->copy_lock);
361
362 words = size >> 2;
363 blk = dst >> 8;
364
365 nvgpu_log_info(g, "upload %d words to 0x%x block %d, tag 0x%x",
366 words, dst, blk, tag);
367
368 gk20a_writel(g, base_addr + falcon_falcon_imemc_r(port),
369 falcon_falcon_imemc_offs_f(dst >> 2) |
370 falcon_falcon_imemc_blk_f(blk) |
371 /* Set Auto-Increment on write */
372 falcon_falcon_imemc_aincw_f(1) |
373 falcon_falcon_imemc_secure_f(sec ? 1U : 0U));
374
375 for (i = 0; i < words; i++) {
376 if (i % 64 == 0) {
377 /* tag is always 256B aligned */
378 gk20a_writel(g, base_addr + falcon_falcon_imemt_r(0),
379 tag);
380 tag++;
381 }
382
383 gk20a_writel(g, base_addr + falcon_falcon_imemd_r(port),
384 src_u32[i]);
385 }
386
387 /* WARNING : setting remaining bytes in block to 0x0 */
388 while (i % 64) {
389 gk20a_writel(g, base_addr + falcon_falcon_imemd_r(port), 0);
390 i++;
391 }
392
393 nvgpu_mutex_release(&flcn->copy_lock);
394
395 return 0;
396}
397
398static int gk20a_falcon_bootstrap(struct nvgpu_falcon *flcn,
399 u32 boot_vector)
400{
401 struct gk20a *g = flcn->g;
402 u32 base_addr = flcn->flcn_base;
403
404 nvgpu_log_info(g, "boot vec 0x%x", boot_vector);
405
406 gk20a_writel(g, base_addr + falcon_falcon_dmactl_r(),
407 falcon_falcon_dmactl_require_ctx_f(0));
408
409 gk20a_writel(g, base_addr + falcon_falcon_bootvec_r(),
410 falcon_falcon_bootvec_vec_f(boot_vector));
411
412 gk20a_writel(g, base_addr + falcon_falcon_cpuctl_r(),
413 falcon_falcon_cpuctl_startcpu_f(1));
414
415 return 0;
416}
417
418static u32 gk20a_falcon_mailbox_read(struct nvgpu_falcon *flcn,
419 u32 mailbox_index)
420{
421 struct gk20a *g = flcn->g;
422 u32 data = 0;
423
424 if (mailbox_index < FALCON_MAILBOX_COUNT) {
425 data = gk20a_readl(g, flcn->flcn_base + (mailbox_index ?
426 falcon_falcon_mailbox1_r() :
427 falcon_falcon_mailbox0_r()));
428 } else {
429 nvgpu_err(g, "incorrect mailbox id %d", mailbox_index);
430 }
431
432 return data;
433}
434
435static void gk20a_falcon_mailbox_write(struct nvgpu_falcon *flcn,
436 u32 mailbox_index, u32 data)
437{
438 struct gk20a *g = flcn->g;
439
440 if (mailbox_index < FALCON_MAILBOX_COUNT) {
441 gk20a_writel(g, flcn->flcn_base + (mailbox_index ?
442 falcon_falcon_mailbox1_r() :
443 falcon_falcon_mailbox0_r()),
444 data);
445 } else {
446 nvgpu_err(g, "incorrect mailbox id %d", mailbox_index);
447 }
448}
449
450static int gk20a_falcon_bl_bootstrap(struct nvgpu_falcon *flcn,
451 struct nvgpu_falcon_bl_info *bl_info)
452{
453 struct gk20a *g = flcn->g;
454 u32 base_addr = flcn->flcn_base;
455 u32 virt_addr = 0;
456 u32 dst = 0;
457 int err = 0;
458
459	/* copy bootloader interface structure to dmem */
460 err = gk20a_flcn_copy_to_dmem(flcn, 0, (u8 *)bl_info->bl_desc,
461 bl_info->bl_desc_size, (u8)0);
462 if (err != 0) {
463 goto exit;
464 }
465
466 /* copy bootloader to TOP of IMEM */
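	/* hwcfg reports IMEM size in 256-byte blocks, hence the << 8 */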
467 dst = (falcon_falcon_hwcfg_imem_size_v(gk20a_readl(g,
468 base_addr + falcon_falcon_hwcfg_r())) << 8) - bl_info->bl_size;
469
470 err = gk20a_flcn_copy_to_imem(flcn, dst, (u8 *)(bl_info->bl_src),
471 bl_info->bl_size, (u8)0, false, bl_info->bl_start_tag);
472 if (err != 0) {
473 goto exit;
474 }
475
476 gk20a_falcon_mailbox_write(flcn, FALCON_MAILBOX_0, 0xDEADA5A5U);
477
478 virt_addr = bl_info->bl_start_tag << 8;
479
480 err = gk20a_falcon_bootstrap(flcn, virt_addr);
481
482exit:
483 if (err != 0) {
484 nvgpu_err(g, "falcon id-0x%x bootstrap failed", flcn->flcn_id);
485 }
486
487 return err;
488}
489
490static void gk20a_falcon_dump_imblk(struct nvgpu_falcon *flcn)
491{
492 struct gk20a *g = flcn->g;
493 u32 base_addr = flcn->flcn_base;
494 u32 i = 0, j = 0;
495 u32 data[8] = {0};
496 u32 block_count = 0;
497
498 block_count = falcon_falcon_hwcfg_imem_size_v(gk20a_readl(g,
499 flcn->flcn_base + falcon_falcon_hwcfg_r()));
500
501 /* block_count must be multiple of 8 */
502 block_count &= ~0x7;
503 nvgpu_err(g, "FALCON IMEM BLK MAPPING (PA->VA) (%d TOTAL):",
504 block_count);
505
506 for (i = 0; i < block_count; i += 8) {
507 for (j = 0; j < 8; j++) {
508 gk20a_writel(g, flcn->flcn_base +
509 falcon_falcon_imctl_debug_r(),
510 falcon_falcon_imctl_debug_cmd_f(0x2) |
511 falcon_falcon_imctl_debug_addr_blk_f(i + j));
512
513 data[j] = gk20a_readl(g, base_addr +
514 falcon_falcon_imstat_r());
515 }
516
517 nvgpu_err(g, " %#04x: %#010x %#010x %#010x %#010x",
518 i, data[0], data[1], data[2], data[3]);
519 nvgpu_err(g, " %#04x: %#010x %#010x %#010x %#010x",
520 i + 4, data[4], data[5], data[6], data[7]);
521 }
522}
523
524static void gk20a_falcon_dump_pc_trace(struct nvgpu_falcon *flcn)
525{
526 struct gk20a *g = flcn->g;
527 u32 base_addr = flcn->flcn_base;
528 u32 trace_pc_count = 0;
529 u32 pc = 0;
530 u32 i = 0;
531
532 if (gk20a_readl(g, base_addr + falcon_falcon_sctl_r()) & 0x02) {
533 nvgpu_err(g, " falcon is in HS mode, PC TRACE dump not supported");
534 return;
535 }
536
537 trace_pc_count = falcon_falcon_traceidx_maxidx_v(gk20a_readl(g,
538 base_addr + falcon_falcon_traceidx_r()));
539 nvgpu_err(g,
540 "PC TRACE (TOTAL %d ENTRIES. entry 0 is the most recent branch):",
541 trace_pc_count);
542
543 for (i = 0; i < trace_pc_count; i++) {
544 gk20a_writel(g, base_addr + falcon_falcon_traceidx_r(),
545 falcon_falcon_traceidx_idx_f(i));
546
547 pc = falcon_falcon_tracepc_pc_v(gk20a_readl(g,
548 base_addr + falcon_falcon_tracepc_r()));
549 nvgpu_err(g, "FALCON_TRACEPC(%d) : %#010x", i, pc);
550 }
551}
552
553void gk20a_falcon_dump_stats(struct nvgpu_falcon *flcn)
554{
555 struct gk20a *g = flcn->g;
556 u32 base_addr = flcn->flcn_base;
557 unsigned int i;
558
559 nvgpu_err(g, "<<< FALCON id-%d DEBUG INFORMATION - START >>>",
560 flcn->flcn_id);
561
562 /* imblk dump */
563 gk20a_falcon_dump_imblk(flcn);
564 /* PC trace dump */
565 gk20a_falcon_dump_pc_trace(flcn);
566
567 nvgpu_err(g, "FALCON ICD REGISTERS DUMP");
568
569 for (i = 0; i < 4; i++) {
570 gk20a_writel(g, base_addr + falcon_falcon_icd_cmd_r(),
571 falcon_falcon_icd_cmd_opc_rreg_f() |
572 falcon_falcon_icd_cmd_idx_f(FALCON_REG_PC));
573 nvgpu_err(g, "FALCON_REG_PC : 0x%x",
574 gk20a_readl(g, base_addr +
575 falcon_falcon_icd_rdata_r()));
576
577 gk20a_writel(g, base_addr + falcon_falcon_icd_cmd_r(),
578 falcon_falcon_icd_cmd_opc_rreg_f() |
579 falcon_falcon_icd_cmd_idx_f(FALCON_REG_SP));
580 nvgpu_err(g, "FALCON_REG_SP : 0x%x",
581 gk20a_readl(g, base_addr +
582 falcon_falcon_icd_rdata_r()));
583 }
584
585 gk20a_writel(g, base_addr + falcon_falcon_icd_cmd_r(),
586 falcon_falcon_icd_cmd_opc_rreg_f() |
587 falcon_falcon_icd_cmd_idx_f(FALCON_REG_IMB));
588 nvgpu_err(g, "FALCON_REG_IMB : 0x%x",
589 gk20a_readl(g, base_addr + falcon_falcon_icd_rdata_r()));
590
591 gk20a_writel(g, base_addr + falcon_falcon_icd_cmd_r(),
592 falcon_falcon_icd_cmd_opc_rreg_f() |
593 falcon_falcon_icd_cmd_idx_f(FALCON_REG_DMB));
594 nvgpu_err(g, "FALCON_REG_DMB : 0x%x",
595 gk20a_readl(g, base_addr + falcon_falcon_icd_rdata_r()));
596
597 gk20a_writel(g, base_addr + falcon_falcon_icd_cmd_r(),
598 falcon_falcon_icd_cmd_opc_rreg_f() |
599 falcon_falcon_icd_cmd_idx_f(FALCON_REG_CSW));
600 nvgpu_err(g, "FALCON_REG_CSW : 0x%x",
601 gk20a_readl(g, base_addr + falcon_falcon_icd_rdata_r()));
602
603 gk20a_writel(g, base_addr + falcon_falcon_icd_cmd_r(),
604 falcon_falcon_icd_cmd_opc_rreg_f() |
605 falcon_falcon_icd_cmd_idx_f(FALCON_REG_CTX));
606 nvgpu_err(g, "FALCON_REG_CTX : 0x%x",
607 gk20a_readl(g, base_addr + falcon_falcon_icd_rdata_r()));
608
609 gk20a_writel(g, base_addr + falcon_falcon_icd_cmd_r(),
610 falcon_falcon_icd_cmd_opc_rreg_f() |
611 falcon_falcon_icd_cmd_idx_f(FALCON_REG_EXCI));
612 nvgpu_err(g, "FALCON_REG_EXCI : 0x%x",
613 gk20a_readl(g, base_addr + falcon_falcon_icd_rdata_r()));
614
615 for (i = 0; i < 6; i++) {
616 gk20a_writel(g, base_addr + falcon_falcon_icd_cmd_r(),
617 falcon_falcon_icd_cmd_opc_rreg_f() |
618 falcon_falcon_icd_cmd_idx_f(
619 falcon_falcon_icd_cmd_opc_rstat_f()));
620 nvgpu_err(g, "FALCON_REG_RSTAT[%d] : 0x%x", i,
621 gk20a_readl(g, base_addr +
622 falcon_falcon_icd_rdata_r()));
623 }
624
625 nvgpu_err(g, " FALCON REGISTERS DUMP");
626 nvgpu_err(g, "falcon_falcon_os_r : %d",
627 gk20a_readl(g, base_addr + falcon_falcon_os_r()));
628 nvgpu_err(g, "falcon_falcon_cpuctl_r : 0x%x",
629 gk20a_readl(g, base_addr + falcon_falcon_cpuctl_r()));
630 nvgpu_err(g, "falcon_falcon_idlestate_r : 0x%x",
631 gk20a_readl(g, base_addr + falcon_falcon_idlestate_r()));
632 nvgpu_err(g, "falcon_falcon_mailbox0_r : 0x%x",
633 gk20a_readl(g, base_addr + falcon_falcon_mailbox0_r()));
634 nvgpu_err(g, "falcon_falcon_mailbox1_r : 0x%x",
635 gk20a_readl(g, base_addr + falcon_falcon_mailbox1_r()));
636 nvgpu_err(g, "falcon_falcon_irqstat_r : 0x%x",
637 gk20a_readl(g, base_addr + falcon_falcon_irqstat_r()));
638 nvgpu_err(g, "falcon_falcon_irqmode_r : 0x%x",
639 gk20a_readl(g, base_addr + falcon_falcon_irqmode_r()));
640 nvgpu_err(g, "falcon_falcon_irqmask_r : 0x%x",
641 gk20a_readl(g, base_addr + falcon_falcon_irqmask_r()));
642 nvgpu_err(g, "falcon_falcon_irqdest_r : 0x%x",
643 gk20a_readl(g, base_addr + falcon_falcon_irqdest_r()));
644 nvgpu_err(g, "falcon_falcon_debug1_r : 0x%x",
645 gk20a_readl(g, base_addr + falcon_falcon_debug1_r()));
646 nvgpu_err(g, "falcon_falcon_debuginfo_r : 0x%x",
647 gk20a_readl(g, base_addr + falcon_falcon_debuginfo_r()));
648 nvgpu_err(g, "falcon_falcon_bootvec_r : 0x%x",
649 gk20a_readl(g, base_addr + falcon_falcon_bootvec_r()));
650 nvgpu_err(g, "falcon_falcon_hwcfg_r : 0x%x",
651 gk20a_readl(g, base_addr + falcon_falcon_hwcfg_r()));
652 nvgpu_err(g, "falcon_falcon_engctl_r : 0x%x",
653 gk20a_readl(g, base_addr + falcon_falcon_engctl_r()));
654 nvgpu_err(g, "falcon_falcon_curctx_r : 0x%x",
655 gk20a_readl(g, base_addr + falcon_falcon_curctx_r()));
656 nvgpu_err(g, "falcon_falcon_nxtctx_r : 0x%x",
657 gk20a_readl(g, base_addr + falcon_falcon_nxtctx_r()));
658 nvgpu_err(g, "falcon_falcon_exterrstat_r : 0x%x",
659 gk20a_readl(g, base_addr + falcon_falcon_exterrstat_r()));
660 nvgpu_err(g, "falcon_falcon_exterraddr_r : 0x%x",
661 gk20a_readl(g, base_addr + falcon_falcon_exterraddr_r()));
662}
663
664static void gk20a_falcon_engine_dependency_ops(struct nvgpu_falcon *flcn)
665{
666 struct gk20a *g = flcn->g;
667 struct nvgpu_falcon_engine_dependency_ops *flcn_eng_dep_ops =
668 &flcn->flcn_engine_dep_ops;
669
670 switch (flcn->flcn_id) {
671 case FALCON_ID_PMU:
672 flcn_eng_dep_ops->reset_eng = nvgpu_pmu_reset;
673 flcn_eng_dep_ops->queue_head = g->ops.pmu.pmu_queue_head;
674 flcn_eng_dep_ops->queue_tail = g->ops.pmu.pmu_queue_tail;
675 break;
676 default:
677		/* The NULL assignment makes sure the CPU hard reset in
678		 * gk20a_flcn_reset() gets executed when the falcon does not
679		 * need an engine-specific reset implementation.
680		 */
681 flcn_eng_dep_ops->reset_eng = NULL;
682 break;
683 }
684}
685
686void gk20a_falcon_ops(struct nvgpu_falcon *flcn)
687{
688 struct nvgpu_falcon_ops *flcn_ops = &flcn->flcn_ops;
689
690 flcn_ops->reset = gk20a_flcn_reset;
691 flcn_ops->set_irq = gk20a_flcn_set_irq;
692 flcn_ops->clear_halt_interrupt_status =
693 gk20a_flcn_clear_halt_interrupt_status;
694 flcn_ops->is_falcon_cpu_halted = gk20a_is_falcon_cpu_halted;
695 flcn_ops->is_falcon_idle = gk20a_is_falcon_idle;
696 flcn_ops->is_falcon_scrubbing_done = gk20a_is_falcon_scrubbing_done;
697 flcn_ops->copy_from_dmem = gk20a_flcn_copy_from_dmem;
698 flcn_ops->copy_to_dmem = gk20a_flcn_copy_to_dmem;
699 flcn_ops->copy_to_imem = gk20a_flcn_copy_to_imem;
700 flcn_ops->copy_from_imem = gk20a_flcn_copy_from_imem;
701 flcn_ops->bootstrap = gk20a_falcon_bootstrap;
702 flcn_ops->dump_falcon_stats = gk20a_falcon_dump_stats;
703 flcn_ops->mailbox_read = gk20a_falcon_mailbox_read;
704 flcn_ops->mailbox_write = gk20a_falcon_mailbox_write;
705 flcn_ops->bl_bootstrap = gk20a_falcon_bl_bootstrap;
706
707 gk20a_falcon_engine_dependency_ops(flcn);
708}
709
710int gk20a_falcon_hal_sw_init(struct nvgpu_falcon *flcn)
711{
712 struct gk20a *g = flcn->g;
713 int err = 0;
714
715 switch (flcn->flcn_id) {
716 case FALCON_ID_PMU:
717 flcn->flcn_base = FALCON_PWR_BASE;
718 flcn->is_falcon_supported = true;
719 flcn->is_interrupt_enabled = true;
720 break;
721 case FALCON_ID_SEC2:
722 flcn->flcn_base = FALCON_SEC_BASE;
723 flcn->is_falcon_supported = false;
724 flcn->is_interrupt_enabled = false;
725 break;
726 case FALCON_ID_FECS:
727 flcn->flcn_base = FALCON_FECS_BASE;
728 flcn->is_falcon_supported = true;
729 flcn->is_interrupt_enabled = false;
730 break;
731 case FALCON_ID_GPCCS:
732 flcn->flcn_base = FALCON_GPCCS_BASE;
733 flcn->is_falcon_supported = true;
734 flcn->is_interrupt_enabled = false;
735 break;
736 case FALCON_ID_NVDEC:
737 flcn->flcn_base = FALCON_NVDEC_BASE;
738 flcn->is_falcon_supported = false;
739 flcn->is_interrupt_enabled = false;
740 break;
741 default:
742 flcn->is_falcon_supported = false;
743 break;
744 }
745
746 if (flcn->is_falcon_supported) {
747 err = nvgpu_mutex_init(&flcn->copy_lock);
748 if (err != 0) {
749 nvgpu_err(g, "Error in flcn.copy_lock mutex initialization");
750 } else {
751 gk20a_falcon_ops(flcn);
752 }
753 } else {
754 nvgpu_log_info(g, "falcon 0x%x not supported on %s",
755 flcn->flcn_id, g->name);
756 }
757
758 return err;
759}
diff --git a/include/gk20a/flcn_gk20a.h b/include/gk20a/flcn_gk20a.h
new file mode 100644
index 0000000..9d27b38
--- /dev/null
+++ b/include/gk20a/flcn_gk20a.h
@@ -0,0 +1,29 @@
1/*
2 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22#ifndef NVGPU_GK20A_FLCN_GK20A_H
23#define NVGPU_GK20A_FLCN_GK20A_H
24
25void gk20a_falcon_ops(struct nvgpu_falcon *flcn);
26int gk20a_falcon_hal_sw_init(struct nvgpu_falcon *flcn);
27void gk20a_falcon_dump_stats(struct nvgpu_falcon *flcn);
28
29#endif /* NVGPU_GK20A_FLCN_GK20A_H */
diff --git a/include/gk20a/gk20a.c b/include/gk20a/gk20a.c
new file mode 100644
index 0000000..c3068b7
--- /dev/null
+++ b/include/gk20a/gk20a.c
@@ -0,0 +1,590 @@
1/*
2 * GK20A Graphics
3 *
4 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include <nvgpu/nvgpu_common.h>
26#include <nvgpu/kmem.h>
27#include <nvgpu/allocator.h>
28#include <nvgpu/timers.h>
29#include <nvgpu/soc.h>
30#include <nvgpu/enabled.h>
31#include <nvgpu/pmu.h>
32#include <nvgpu/gmmu.h>
33#include <nvgpu/ltc.h>
34#include <nvgpu/vidmem.h>
35#include <nvgpu/mm.h>
36#include <nvgpu/ctxsw_trace.h>
37#include <nvgpu/soc.h>
38#include <nvgpu/clk_arb.h>
39#include <nvgpu/therm.h>
40#include <nvgpu/mc.h>
41#include <nvgpu/channel_sync.h>
42
43#include <trace/events/gk20a.h>
44
45#include "gk20a.h"
46
47#include "dbg_gpu_gk20a.h"
48#include "pstate/pstate.h"
49
50void __nvgpu_check_gpu_state(struct gk20a *g)
51{
52 u32 boot_0 = 0xffffffff;
53
54 boot_0 = nvgpu_mc_boot_0(g, NULL, NULL, NULL);
55 if (boot_0 == 0xffffffff) {
56 nvgpu_err(g, "GPU has disappeared from bus!!");
57 nvgpu_err(g, "Rebooting system!!");
58 nvgpu_kernel_restart(NULL);
59 }
60}
61
62void __gk20a_warn_on_no_regs(void)
63{
64 WARN_ONCE(1, "Attempted access to GPU regs after unmapping!");
65}
66
67static void gk20a_mask_interrupts(struct gk20a *g)
68{
69 if (g->ops.mc.intr_mask != NULL) {
70 g->ops.mc.intr_mask(g);
71 }
72
73 if (g->ops.mc.log_pending_intrs != NULL) {
74 g->ops.mc.log_pending_intrs(g);
75 }
76}
77
78int gk20a_prepare_poweroff(struct gk20a *g)
79{
80 int ret = 0;
81
82 nvgpu_log_fn(g, " ");
83
84 if (g->ops.fifo.channel_suspend) {
85 ret = g->ops.fifo.channel_suspend(g);
86 if (ret) {
87 return ret;
88 }
89 }
90
91 /* disable elpg before gr or fifo suspend */
92 if (g->ops.pmu.is_pmu_supported(g)) {
93 ret |= nvgpu_pmu_destroy(g);
94 }
95
96 if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEC2_RTOS)) {
97 ret |= nvgpu_sec2_destroy(g);
98 }
99
100 ret |= gk20a_gr_suspend(g);
101 ret |= nvgpu_mm_suspend(g);
102 ret |= gk20a_fifo_suspend(g);
103
104 gk20a_ce_suspend(g);
105
106 /* Disable GPCPLL */
107 if (g->ops.clk.suspend_clk_support) {
108 ret |= g->ops.clk.suspend_clk_support(g);
109 }
110
111 if (nvgpu_is_enabled(g, NVGPU_PMU_PSTATE)) {
112 gk20a_deinit_pstate_support(g);
113 }
114
115 gk20a_mask_interrupts(g);
116
117 g->power_on = false;
118
119 return ret;
120}
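/*
 * Note: the return codes above are ORed together, so poweroff keeps tearing
 * the remaining units down even if an earlier step fails; the caller only
 * sees a single non-zero value when anything in the sequence went wrong.
 */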
121
122int gk20a_finalize_poweron(struct gk20a *g)
123{
124 int err = 0;
125#if defined(CONFIG_TEGRA_GK20A_NVHOST)
126 u32 nr_pages;
127#endif
128
129 u32 fuse_status;
130
131 nvgpu_log_fn(g, " ");
132
133 if (g->power_on) {
134 return 0;
135 }
136
137 g->power_on = true;
138
139 /*
140 * Before probing the GPU make sure the GPU's state is cleared. This is
141 * relevant for rebind operations.
142 */
143 if (g->ops.xve.reset_gpu && !g->gpu_reset_done) {
144 g->ops.xve.reset_gpu(g);
145 g->gpu_reset_done = true;
146 }
147
148 if (g->ops.clock_gating.slcg_acb_load_gating_prod != NULL) {
149 g->ops.clock_gating.slcg_acb_load_gating_prod(g, true);
150 }
151
152 /*
153 * Do this early so any early VMs that get made are capable of mapping
154 * buffers.
155 */
156 err = nvgpu_pd_cache_init(g);
157 if (err) {
158 return err;
159 }
160
161 /* init interface layer support for PMU falcon */
162 err = nvgpu_flcn_sw_init(g, FALCON_ID_PMU);
163 if (err != 0) {
164 nvgpu_err(g, "failed to sw init FALCON_ID_PMU");
165 goto done;
166 }
167 err = nvgpu_flcn_sw_init(g, FALCON_ID_SEC2);
168 if (err != 0) {
169 nvgpu_err(g, "failed to sw init FALCON_ID_SEC2");
170 goto done;
171 }
172 err = nvgpu_flcn_sw_init(g, FALCON_ID_NVDEC);
173 if (err != 0) {
174 nvgpu_err(g, "failed to sw init FALCON_ID_NVDEC");
175 goto done;
176 }
177 err = nvgpu_flcn_sw_init(g, FALCON_ID_GSPLITE);
178 if (err != 0) {
179 nvgpu_err(g, "failed to sw init FALCON_ID_GSPLITE");
180 goto done;
181 }
182
183 if (g->ops.acr.acr_sw_init != NULL &&
184 nvgpu_is_enabled(g, NVGPU_SEC_PRIVSECURITY)) {
185 g->ops.acr.acr_sw_init(g, &g->acr);
186 }
187
188 if (g->ops.bios.init) {
189 err = g->ops.bios.init(g);
190 }
191 if (err) {
192 goto done;
193 }
194
195 g->ops.bus.init_hw(g);
196
197 if (g->ops.clk.disable_slowboot) {
198 g->ops.clk.disable_slowboot(g);
199 }
200
201 g->ops.priv_ring.enable_priv_ring(g);
202
203	/* TBD: move this after graphics init, where blcg/slcg are enabled.
204	   This function removes SlowdownOnBoot, which applies a 32x divider
205	   on the gpcpll bypass path. The purpose of slowdown is to save power
206	   during boot, but it also significantly slows down gk20a init on
207	   simulation and emulation. We should remove SOB after the graphics
208	   power saving features (blcg/slcg) are enabled. For now, do it here. */
209 if (g->ops.clk.init_clk_support) {
210 err = g->ops.clk.init_clk_support(g);
211 if (err) {
212 nvgpu_err(g, "failed to init gk20a clk");
213 goto done;
214 }
215 }
216
217 if (nvgpu_is_enabled(g, NVGPU_SUPPORT_NVLINK)) {
218 err = g->ops.nvlink.init(g);
219 if (err) {
220 nvgpu_err(g, "failed to init nvlink");
221 goto done;
222 }
223 }
224
225 if (g->ops.fb.init_fbpa) {
226 err = g->ops.fb.init_fbpa(g);
227 if (err) {
228 nvgpu_err(g, "failed to init fbpa");
229 goto done;
230 }
231 }
232
233 if (g->ops.fb.mem_unlock) {
234 err = g->ops.fb.mem_unlock(g);
235 if (err) {
236 nvgpu_err(g, "failed to unlock memory");
237 goto done;
238 }
239 }
240
241 err = g->ops.fifo.reset_enable_hw(g);
242
243 if (err) {
244 nvgpu_err(g, "failed to reset gk20a fifo");
245 goto done;
246 }
247
248 err = nvgpu_init_ltc_support(g);
249 if (err) {
250 nvgpu_err(g, "failed to init ltc");
251 goto done;
252 }
253
254 err = nvgpu_init_mm_support(g);
255 if (err) {
256 nvgpu_err(g, "failed to init gk20a mm");
257 goto done;
258 }
259
260 err = gk20a_init_fifo_support(g);
261 if (err) {
262 nvgpu_err(g, "failed to init gk20a fifo");
263 goto done;
264 }
265
266 if (g->ops.therm.elcg_init_idle_filters) {
267 g->ops.therm.elcg_init_idle_filters(g);
268 }
269
270 g->ops.mc.intr_enable(g);
271
272 /*
273 * Power gate the chip as per the TPC PG mask
274 * and the fuse_status register.
275 * If TPC PG mask is invalid halt the GPU poweron.
276 */
277 g->can_tpc_powergate = false;
278 fuse_status = g->ops.fuse.fuse_status_opt_tpc_gpc(g, 0);
279
280 if (g->ops.tpc.tpc_powergate) {
281 err = g->ops.tpc.tpc_powergate(g, fuse_status);
282 }
283
284 if (err) {
285 nvgpu_err(g, "failed to power ON GPU");
286 goto done;
287 }
288
289 nvgpu_mutex_acquire(&g->tpc_pg_lock);
290
291 if (g->can_tpc_powergate) {
292 if (g->ops.gr.powergate_tpc != NULL)
293 g->ops.gr.powergate_tpc(g);
294 }
295
296 err = gk20a_enable_gr_hw(g);
297 if (err) {
298 nvgpu_err(g, "failed to enable gr");
299 nvgpu_mutex_release(&g->tpc_pg_lock);
300 goto done;
301 }
302
303 if (g->ops.pmu.is_pmu_supported(g)) {
304 if (g->ops.pmu.prepare_ucode) {
305 err = g->ops.pmu.prepare_ucode(g);
306 }
307 if (err) {
308 nvgpu_err(g, "failed to init pmu ucode");
309 nvgpu_mutex_release(&g->tpc_pg_lock);
310 goto done;
311 }
312 }
313
314 if (nvgpu_is_enabled(g, NVGPU_PMU_PSTATE)) {
315 err = gk20a_init_pstate_support(g);
316 if (err) {
317 nvgpu_err(g, "failed to init pstates");
318 nvgpu_mutex_release(&g->tpc_pg_lock);
319 goto done;
320 }
321 }
322
323 if (g->acr.bootstrap_hs_acr != NULL &&
324 nvgpu_is_enabled(g, NVGPU_SEC_PRIVSECURITY)) {
325 err = g->acr.bootstrap_hs_acr(g, &g->acr, &g->acr.acr);
326 if (err != 0) {
327 nvgpu_err(g, "ACR bootstrap failed");
328 nvgpu_mutex_release(&g->tpc_pg_lock);
329 goto done;
330 }
331 }
332
333 if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEC2_RTOS)) {
334 err = nvgpu_init_sec2_support(g);
335 if (err != 0) {
336 nvgpu_err(g, "failed to init sec2");
337 nvgpu_mutex_release(&g->tpc_pg_lock);
338 goto done;
339 }
340 }
341
342 if (g->ops.pmu.is_pmu_supported(g)) {
343 err = nvgpu_init_pmu_support(g);
344 if (err) {
345 nvgpu_err(g, "failed to init gk20a pmu");
346 nvgpu_mutex_release(&g->tpc_pg_lock);
347 goto done;
348 }
349 }
350
351 err = gk20a_init_gr_support(g);
352 if (err) {
353 nvgpu_err(g, "failed to init gk20a gr");
354 nvgpu_mutex_release(&g->tpc_pg_lock);
355 goto done;
356 }
357
358 nvgpu_mutex_release(&g->tpc_pg_lock);
359
360 if (nvgpu_is_enabled(g, NVGPU_PMU_PSTATE)) {
361 err = gk20a_init_pstate_pmu_support(g);
362 if (err) {
363 nvgpu_err(g, "failed to init pstates");
364 goto done;
365 }
366 }
367
368 if (g->ops.pmu_ver.clk.clk_set_boot_clk && nvgpu_is_enabled(g, NVGPU_PMU_PSTATE)) {
369 g->ops.pmu_ver.clk.clk_set_boot_clk(g);
370 } else {
371 err = nvgpu_clk_arb_init_arbiter(g);
372 if (err) {
373 nvgpu_err(g, "failed to init clk arb");
374 goto done;
375 }
376 }
377
378 err = nvgpu_init_therm_support(g);
379 if (err) {
380 nvgpu_err(g, "failed to init gk20a therm");
381 goto done;
382 }
383
384 err = g->ops.chip_init_gpu_characteristics(g);
385 if (err) {
386 nvgpu_err(g, "failed to init gk20a gpu characteristics");
387 goto done;
388 }
389
390#ifdef CONFIG_GK20A_CTXSW_TRACE
391 err = gk20a_ctxsw_trace_init(g);
392 if (err)
393 nvgpu_warn(g, "could not initialize ctxsw tracing");
394#endif
395
396 /* Restore the debug setting */
397 g->ops.fb.set_debug_mode(g, g->mmu_debug_ctrl);
398
399 gk20a_init_ce_support(g);
400
401 if (g->ops.xve.available_speeds) {
402 u32 speed;
403
404 if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_ASPM) && g->ops.xve.disable_aspm) {
405 g->ops.xve.disable_aspm(g);
406 }
407
408 g->ops.xve.available_speeds(g, &speed);
409
410 /* Set to max speed */
411 speed = 1 << (fls(speed) - 1);
412 err = g->ops.xve.set_speed(g, speed);
413 if (err) {
414 nvgpu_err(g, "Failed to set PCIe bus speed!");
415 goto done;
416 }
417 }
418
419#if defined(CONFIG_TEGRA_GK20A_NVHOST)
420 if (nvgpu_has_syncpoints(g) && g->syncpt_unit_size) {
421 if (!nvgpu_mem_is_valid(&g->syncpt_mem)) {
422 nr_pages = DIV_ROUND_UP(g->syncpt_unit_size, PAGE_SIZE);
423 __nvgpu_mem_create_from_phys(g, &g->syncpt_mem,
424 g->syncpt_unit_base, nr_pages);
425 }
426 }
427#endif
428
429 if (g->ops.fifo.channel_resume) {
430 g->ops.fifo.channel_resume(g);
431 }
432
433done:
434 if (err) {
435 g->power_on = false;
436 }
437
438 return err;
439}
440
441int gk20a_wait_for_idle(struct gk20a *g)
442{
443 int wait_length = 150; /* 3 second overall max wait. */
444 int target_usage_count = 0;
445
446 if (!g) {
447 return -ENODEV;
448 }
449
450 while ((nvgpu_atomic_read(&g->usage_count) != target_usage_count)
451 && (wait_length-- >= 0)) {
452 nvgpu_msleep(20);
453 }
454
455 if (wait_length < 0) {
456 nvgpu_warn(g, "Timed out waiting for idle (%d)!\n",
457 nvgpu_atomic_read(&g->usage_count));
458 return -ETIMEDOUT;
459 }
460
461 return 0;
462}
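/*
 * The polling budget above is about 150 iterations of 20 ms, matching the
 * "3 second overall max wait" comment; a negative wait_length after the
 * loop is treated as a timeout and reported with the current usage_count.
 */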
463
464int gk20a_init_gpu_characteristics(struct gk20a *g)
465{
466 __nvgpu_set_enabled(g, NVGPU_SUPPORT_PARTIAL_MAPPINGS, true);
467 __nvgpu_set_enabled(g, NVGPU_SUPPORT_MAP_DIRECT_KIND_CTRL, true);
468 __nvgpu_set_enabled(g, NVGPU_SUPPORT_MAP_BUFFER_BATCH, true);
469
470 if (IS_ENABLED(CONFIG_SYNC)) {
471 __nvgpu_set_enabled(g, NVGPU_SUPPORT_SYNC_FENCE_FDS, true);
472 }
473
474 if (g->ops.mm.support_sparse && g->ops.mm.support_sparse(g)) {
475 __nvgpu_set_enabled(g, NVGPU_SUPPORT_SPARSE_ALLOCS, true);
476 }
477
478 /*
479 * Fast submits are supported as long as the user doesn't request
480 * anything that depends on job tracking. (Here, fast means strictly no
481 * metadata, just the gpfifo contents are copied and gp_put updated).
482 */
483 __nvgpu_set_enabled(g,
484 NVGPU_SUPPORT_DETERMINISTIC_SUBMIT_NO_JOBTRACKING,
485 true);
486
487 /*
488 * Sync framework requires deferred job cleanup, wrapping syncs in FDs,
489 * and other heavy stuff, which prevents deterministic submits. This is
490 * supported otherwise, provided that the user doesn't request anything
491 * that depends on deferred cleanup.
492 */
493 if (!nvgpu_channel_sync_needs_os_fence_framework(g)) {
494 __nvgpu_set_enabled(g,
495 NVGPU_SUPPORT_DETERMINISTIC_SUBMIT_FULL,
496 true);
497 }
498
499 __nvgpu_set_enabled(g, NVGPU_SUPPORT_DETERMINISTIC_OPTS, true);
500
501 __nvgpu_set_enabled(g, NVGPU_SUPPORT_USERSPACE_MANAGED_AS, true);
502 __nvgpu_set_enabled(g, NVGPU_SUPPORT_TSG, true);
503
504 if (g->ops.clk_arb.get_arbiter_clk_domains != NULL &&
505 g->ops.clk.support_clk_freq_controller) {
506 __nvgpu_set_enabled(g, NVGPU_SUPPORT_CLOCK_CONTROLS, true);
507 }
508
509 g->ops.gr.detect_sm_arch(g);
510
511 if (g->ops.gr.init_cyclestats) {
512 g->ops.gr.init_cyclestats(g);
513 }
514
515 g->ops.gr.get_rop_l2_en_mask(g);
516
517 return 0;
518}
519
520/*
521 * Free the gk20a struct.
522 */
523static void gk20a_free_cb(struct nvgpu_ref *refcount)
524{
525 struct gk20a *g = container_of(refcount,
526 struct gk20a, refcount);
527
528 nvgpu_log(g, gpu_dbg_shutdown, "Freeing GK20A struct!");
529
530 gk20a_ce_destroy(g);
531
532 if (g->remove_support) {
533 g->remove_support(g);
534 }
535
536 if (g->free) {
537 g->free(g);
538 }
539}
540
541/**
542 * gk20a_get() - Increment ref count on driver
543 *
544 * @g - The driver to increment
545 * This will fail if the driver is in the process of being released. In that
546 * case it will return NULL. Otherwise a pointer to the driver passed in will
547 * be returned.
548 */
549struct gk20a * __must_check gk20a_get(struct gk20a *g)
550{
551 int success;
552
553 /*
554 * Handle the possibility we are still freeing the gk20a struct while
555 * gk20a_get() is called. Unlikely but plausible race condition. Ideally
556 * the code will never be in such a situation that this race is
557 * possible.
558 */
559 success = nvgpu_ref_get_unless_zero(&g->refcount);
560
561 nvgpu_log(g, gpu_dbg_shutdown, "GET: refs currently %d %s",
562 nvgpu_atomic_read(&g->refcount.refcount),
563 success ? "" : "(FAILED)");
564
565 return success ? g : NULL;
566}
567
568/**
569 * gk20a_put() - Decrement ref count on driver
570 *
571 * @g - The driver to decrement
572 *
573 * Decrement the driver ref-count. If necessary, also free the underlying driver
574 * memory.
575 */
576void gk20a_put(struct gk20a *g)
577{
578 /*
579 * Note - this is racy; two instances of this could run before the
580 * actual kref_put() runs, and you could see something like:
581 *
582 * ... PUT: refs currently 2
583 * ... PUT: refs currently 2
584 * ... Freeing GK20A struct!
585 */
586 nvgpu_log(g, gpu_dbg_shutdown, "PUT: refs currently %d",
587 nvgpu_atomic_read(&g->refcount.refcount));
588
589 nvgpu_ref_put(&g->refcount, gk20a_free_cb);
590}
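/*
 * Usage sketch (illustration only, not part of the original file): callers
 * that may race with driver teardown take a reference before touching the
 * device and drop it when done, e.g.
 *
 *	struct gk20a *g = gk20a_get(some_gk20a_ptr);
 *
 *	if (g == NULL)
 *		return -ENODEV;	// driver is being released
 *	// ... use the device ...
 *	gk20a_put(g);
 *
 * "some_gk20a_ptr" stands in for however the caller already obtained the
 * struct gk20a pointer; it is a placeholder, not an existing helper.
 */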
diff --git a/include/gk20a/gk20a.h b/include/gk20a/gk20a.h
new file mode 100644
index 0000000..16a2453
--- /dev/null
+++ b/include/gk20a/gk20a.h
@@ -0,0 +1,33 @@
1/*
2 * This file is used as a temporary redirection header for <nvgpu/gk20a.h>
3 *
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * GK20A Graphics
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 * DEALINGS IN THE SOFTWARE.
25 */
26
27#ifndef GK20A_GK20A_H
28#define GK20A_GK20A_H
29
30/* no new headers should be added here */
31#include <nvgpu/gk20a.h>
32
33#endif
diff --git a/include/gk20a/gr_ctx_gk20a.c b/include/gk20a/gr_ctx_gk20a.c
new file mode 100644
index 0000000..8b9ac32
--- /dev/null
+++ b/include/gk20a/gr_ctx_gk20a.c
@@ -0,0 +1,486 @@
1/*
2 * GK20A Graphics Context
3 *
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include <nvgpu/nvgpu_common.h>
26#include <nvgpu/kmem.h>
27#include <nvgpu/log.h>
28#include <nvgpu/firmware.h>
29#include <nvgpu/enabled.h>
30#include <nvgpu/io.h>
31
32#include "gk20a.h"
33#include "gr_ctx_gk20a.h"
34
35#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
36
37static int gr_gk20a_alloc_load_netlist_u32(struct gk20a *g, u32 *src, u32 len,
38 struct u32_list_gk20a *u32_list)
39{
40 u32_list->count = (len + sizeof(u32) - 1) / sizeof(u32);
41 if (!alloc_u32_list_gk20a(g, u32_list)) {
42 return -ENOMEM;
43 }
44
45 memcpy(u32_list->l, src, len);
46
47 return 0;
48}
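/*
 * Note: the count computed above is a round-up division, so a byte length
 * that is not a multiple of sizeof(u32) still gets enough u32 slots; the
 * av/av64/aiv loaders below instead truncate to whole entries.
 */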
49
50static int gr_gk20a_alloc_load_netlist_av(struct gk20a *g, u32 *src, u32 len,
51 struct av_list_gk20a *av_list)
52{
53 av_list->count = len / sizeof(struct av_gk20a);
54 if (!alloc_av_list_gk20a(g, av_list)) {
55 return -ENOMEM;
56 }
57
58 memcpy(av_list->l, src, len);
59
60 return 0;
61}
62
63static int gr_gk20a_alloc_load_netlist_av64(struct gk20a *g, u32 *src, u32 len,
64 struct av64_list_gk20a *av64_list)
65{
66 av64_list->count = len / sizeof(struct av64_gk20a);
67 if (!alloc_av64_list_gk20a(g, av64_list)) {
68 return -ENOMEM;
69 }
70
71 memcpy(av64_list->l, src, len);
72
73 return 0;
74}
75
76static int gr_gk20a_alloc_load_netlist_aiv(struct gk20a *g, u32 *src, u32 len,
77 struct aiv_list_gk20a *aiv_list)
78{
79 aiv_list->count = len / sizeof(struct aiv_gk20a);
80 if (!alloc_aiv_list_gk20a(g, aiv_list)) {
81 return -ENOMEM;
82 }
83
84 memcpy(aiv_list->l, src, len);
85
86 return 0;
87}
88
89static int gr_gk20a_init_ctx_vars_fw(struct gk20a *g, struct gr_gk20a *gr)
90{
91 struct nvgpu_firmware *netlist_fw;
92 struct netlist_image *netlist = NULL;
93 char name[MAX_NETLIST_NAME];
94 u32 i, major_v = ~0, major_v_hw, netlist_num;
95 int net, max, err = -ENOENT;
96
97 nvgpu_log_fn(g, " ");
98
99 if (g->ops.gr_ctx.is_fw_defined()) {
100 net = NETLIST_FINAL;
101 max = 0;
102 major_v_hw = ~0;
103 g->gr.ctx_vars.dynamic = false;
104 } else {
105 net = NETLIST_SLOT_A;
106 max = MAX_NETLIST;
107 major_v_hw = gk20a_readl(g,
108 gr_fecs_ctx_state_store_major_rev_id_r());
109 g->gr.ctx_vars.dynamic = true;
110 }
111
112 for (; net < max; net++) {
113 if (g->ops.gr_ctx.get_netlist_name(g, net, name) != 0) {
114 nvgpu_warn(g, "invalid netlist index %d", net);
115 continue;
116 }
117
118 netlist_fw = nvgpu_request_firmware(g, name, 0);
119 if (!netlist_fw) {
120 nvgpu_warn(g, "failed to load netlist %s", name);
121 continue;
122 }
123
124 netlist = (struct netlist_image *)netlist_fw->data;
125
126 for (i = 0; i < netlist->header.regions; i++) {
127 u32 *src = (u32 *)((u8 *)netlist + netlist->regions[i].data_offset);
128 u32 size = netlist->regions[i].data_size;
129
130 switch (netlist->regions[i].region_id) {
131 case NETLIST_REGIONID_FECS_UCODE_DATA:
132 nvgpu_log_info(g, "NETLIST_REGIONID_FECS_UCODE_DATA");
133 err = gr_gk20a_alloc_load_netlist_u32(g,
134 src, size, &g->gr.ctx_vars.ucode.fecs.data);
135 if (err) {
136 goto clean_up;
137 }
138 break;
139 case NETLIST_REGIONID_FECS_UCODE_INST:
140 nvgpu_log_info(g, "NETLIST_REGIONID_FECS_UCODE_INST");
141 err = gr_gk20a_alloc_load_netlist_u32(g,
142 src, size, &g->gr.ctx_vars.ucode.fecs.inst);
143 if (err) {
144 goto clean_up;
145 }
146 break;
147 case NETLIST_REGIONID_GPCCS_UCODE_DATA:
148 nvgpu_log_info(g, "NETLIST_REGIONID_GPCCS_UCODE_DATA");
149 err = gr_gk20a_alloc_load_netlist_u32(g,
150 src, size, &g->gr.ctx_vars.ucode.gpccs.data);
151 if (err) {
152 goto clean_up;
153 }
154 break;
155 case NETLIST_REGIONID_GPCCS_UCODE_INST:
156 nvgpu_log_info(g, "NETLIST_REGIONID_GPCCS_UCODE_INST");
157 err = gr_gk20a_alloc_load_netlist_u32(g,
158 src, size, &g->gr.ctx_vars.ucode.gpccs.inst);
159 if (err) {
160 goto clean_up;
161 }
162 break;
163 case NETLIST_REGIONID_SW_BUNDLE_INIT:
164 nvgpu_log_info(g, "NETLIST_REGIONID_SW_BUNDLE_INIT");
165 err = gr_gk20a_alloc_load_netlist_av(g,
166 src, size, &g->gr.ctx_vars.sw_bundle_init);
167 if (err) {
168 goto clean_up;
169 }
170 break;
171 case NETLIST_REGIONID_SW_METHOD_INIT:
172 nvgpu_log_info(g, "NETLIST_REGIONID_SW_METHOD_INIT");
173 err = gr_gk20a_alloc_load_netlist_av(g,
174 src, size, &g->gr.ctx_vars.sw_method_init);
175 if (err) {
176 goto clean_up;
177 }
178 break;
179 case NETLIST_REGIONID_SW_CTX_LOAD:
180 nvgpu_log_info(g, "NETLIST_REGIONID_SW_CTX_LOAD");
181 err = gr_gk20a_alloc_load_netlist_aiv(g,
182 src, size, &g->gr.ctx_vars.sw_ctx_load);
183 if (err) {
184 goto clean_up;
185 }
186 break;
187 case NETLIST_REGIONID_SW_NON_CTX_LOAD:
188 nvgpu_log_info(g, "NETLIST_REGIONID_SW_NON_CTX_LOAD");
189 err = gr_gk20a_alloc_load_netlist_av(g,
190 src, size, &g->gr.ctx_vars.sw_non_ctx_load);
191 if (err) {
192 goto clean_up;
193 }
194 break;
195 case NETLIST_REGIONID_SWVEIDBUNDLEINIT:
196 nvgpu_log_info(g,
197 "NETLIST_REGIONID_SW_VEID_BUNDLE_INIT");
198 err = gr_gk20a_alloc_load_netlist_av(g,
199 src, size,
200 &g->gr.ctx_vars.sw_veid_bundle_init);
201 if (err) {
202 goto clean_up;
203 }
204 break;
205 case NETLIST_REGIONID_CTXREG_SYS:
206 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_SYS");
207 err = gr_gk20a_alloc_load_netlist_aiv(g,
208 src, size, &g->gr.ctx_vars.ctxsw_regs.sys);
209 if (err) {
210 goto clean_up;
211 }
212 break;
213 case NETLIST_REGIONID_CTXREG_GPC:
214 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_GPC");
215 err = gr_gk20a_alloc_load_netlist_aiv(g,
216 src, size, &g->gr.ctx_vars.ctxsw_regs.gpc);
217 if (err) {
218 goto clean_up;
219 }
220 break;
221 case NETLIST_REGIONID_CTXREG_TPC:
222 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_TPC");
223 err = gr_gk20a_alloc_load_netlist_aiv(g,
224 src, size, &g->gr.ctx_vars.ctxsw_regs.tpc);
225 if (err) {
226 goto clean_up;
227 }
228 break;
229 case NETLIST_REGIONID_CTXREG_ZCULL_GPC:
230 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_ZCULL_GPC");
231 err = gr_gk20a_alloc_load_netlist_aiv(g,
232 src, size, &g->gr.ctx_vars.ctxsw_regs.zcull_gpc);
233 if (err) {
234 goto clean_up;
235 }
236 break;
237 case NETLIST_REGIONID_CTXREG_PPC:
238 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_PPC");
239 err = gr_gk20a_alloc_load_netlist_aiv(g,
240 src, size, &g->gr.ctx_vars.ctxsw_regs.ppc);
241 if (err) {
242 goto clean_up;
243 }
244 break;
245 case NETLIST_REGIONID_CTXREG_PM_SYS:
246 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_PM_SYS");
247 err = gr_gk20a_alloc_load_netlist_aiv(g,
248 src, size, &g->gr.ctx_vars.ctxsw_regs.pm_sys);
249 if (err) {
250 goto clean_up;
251 }
252 break;
253 case NETLIST_REGIONID_CTXREG_PM_GPC:
254 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_PM_GPC");
255 err = gr_gk20a_alloc_load_netlist_aiv(g,
256 src, size, &g->gr.ctx_vars.ctxsw_regs.pm_gpc);
257 if (err) {
258 goto clean_up;
259 }
260 break;
261 case NETLIST_REGIONID_CTXREG_PM_TPC:
262 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_PM_TPC");
263 err = gr_gk20a_alloc_load_netlist_aiv(g,
264 src, size, &g->gr.ctx_vars.ctxsw_regs.pm_tpc);
265 if (err) {
266 goto clean_up;
267 }
268 break;
269 case NETLIST_REGIONID_BUFFER_SIZE:
270 g->gr.ctx_vars.buffer_size = *src;
271 nvgpu_log_info(g, "NETLIST_REGIONID_BUFFER_SIZE : %d",
272 g->gr.ctx_vars.buffer_size);
273 break;
274 case NETLIST_REGIONID_CTXSW_REG_BASE_INDEX:
275 g->gr.ctx_vars.regs_base_index = *src;
276 nvgpu_log_info(g, "NETLIST_REGIONID_CTXSW_REG_BASE_INDEX : %u",
277 g->gr.ctx_vars.regs_base_index);
278 break;
279 case NETLIST_REGIONID_MAJORV:
280 major_v = *src;
281 nvgpu_log_info(g, "NETLIST_REGIONID_MAJORV : %d",
282 major_v);
283 break;
284 case NETLIST_REGIONID_NETLIST_NUM:
285 netlist_num = *src;
286 nvgpu_log_info(g, "NETLIST_REGIONID_NETLIST_NUM : %d",
287 netlist_num);
288 break;
289 case NETLIST_REGIONID_CTXREG_PMPPC:
290 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_PMPPC");
291 err = gr_gk20a_alloc_load_netlist_aiv(g,
292 src, size, &g->gr.ctx_vars.ctxsw_regs.pm_ppc);
293 if (err) {
294 goto clean_up;
295 }
296 break;
297 case NETLIST_REGIONID_NVPERF_CTXREG_SYS:
298 nvgpu_log_info(g, "NETLIST_REGIONID_NVPERF_CTXREG_SYS");
299 err = gr_gk20a_alloc_load_netlist_aiv(g,
300 src, size, &g->gr.ctx_vars.ctxsw_regs.perf_sys);
301 if (err) {
302 goto clean_up;
303 }
304 break;
305 case NETLIST_REGIONID_NVPERF_FBP_CTXREGS:
306 nvgpu_log_info(g, "NETLIST_REGIONID_NVPERF_FBP_CTXREGS");
307 err = gr_gk20a_alloc_load_netlist_aiv(g,
308 src, size, &g->gr.ctx_vars.ctxsw_regs.fbp);
309 if (err) {
310 goto clean_up;
311 }
312 break;
313 case NETLIST_REGIONID_NVPERF_CTXREG_GPC:
314 nvgpu_log_info(g, "NETLIST_REGIONID_NVPERF_CTXREG_GPC");
315 err = gr_gk20a_alloc_load_netlist_aiv(g,
316 src, size, &g->gr.ctx_vars.ctxsw_regs.perf_gpc);
317 if (err) {
318 goto clean_up;
319 }
320 break;
321 case NETLIST_REGIONID_NVPERF_FBP_ROUTER:
322 nvgpu_log_info(g, "NETLIST_REGIONID_NVPERF_FBP_ROUTER");
323 err = gr_gk20a_alloc_load_netlist_aiv(g,
324 src, size, &g->gr.ctx_vars.ctxsw_regs.fbp_router);
325 if (err) {
326 goto clean_up;
327 }
328 break;
329 case NETLIST_REGIONID_NVPERF_GPC_ROUTER:
330 nvgpu_log_info(g, "NETLIST_REGIONID_NVPERF_GPC_ROUTER");
331 err = gr_gk20a_alloc_load_netlist_aiv(g,
332 src, size, &g->gr.ctx_vars.ctxsw_regs.gpc_router);
333 if (err) {
334 goto clean_up;
335 }
336 break;
337 case NETLIST_REGIONID_CTXREG_PMLTC:
338 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_PMLTC");
339 err = gr_gk20a_alloc_load_netlist_aiv(g,
340 src, size, &g->gr.ctx_vars.ctxsw_regs.pm_ltc);
341 if (err) {
342 goto clean_up;
343 }
344 break;
345 case NETLIST_REGIONID_CTXREG_PMFBPA:
346 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_PMFBPA");
347 err = gr_gk20a_alloc_load_netlist_aiv(g,
348 src, size, &g->gr.ctx_vars.ctxsw_regs.pm_fbpa);
349 if (err) {
350 goto clean_up;
351 }
352 break;
353 case NETLIST_REGIONID_NVPERF_SYS_ROUTER:
354 nvgpu_log_info(g, "NETLIST_REGIONID_NVPERF_SYS_ROUTER");
355 err = gr_gk20a_alloc_load_netlist_aiv(g,
356 src, size, &g->gr.ctx_vars.ctxsw_regs.perf_sys_router);
357 if (err) {
358 goto clean_up;
359 }
360 break;
361 case NETLIST_REGIONID_NVPERF_PMA:
362 nvgpu_log_info(g, "NETLIST_REGIONID_NVPERF_PMA");
363 err = gr_gk20a_alloc_load_netlist_aiv(g,
364 src, size, &g->gr.ctx_vars.ctxsw_regs.perf_pma);
365 if (err) {
366 goto clean_up;
367 }
368 break;
369 case NETLIST_REGIONID_CTXREG_PMROP:
370 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_PMROP");
371 err = gr_gk20a_alloc_load_netlist_aiv(g,
372 src, size, &g->gr.ctx_vars.ctxsw_regs.pm_rop);
373 if (err) {
374 goto clean_up;
375 }
376 break;
377 case NETLIST_REGIONID_CTXREG_PMUCGPC:
378 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_PMUCGPC");
379 err = gr_gk20a_alloc_load_netlist_aiv(g,
380 src, size, &g->gr.ctx_vars.ctxsw_regs.pm_ucgpc);
381 if (err) {
382 goto clean_up;
383 }
384 break;
385 case NETLIST_REGIONID_CTXREG_ETPC:
386 nvgpu_log_info(g, "NETLIST_REGIONID_CTXREG_ETPC");
387 err = gr_gk20a_alloc_load_netlist_aiv(g,
388 src, size, &g->gr.ctx_vars.ctxsw_regs.etpc);
389 if (err) {
390 goto clean_up;
391 }
392 break;
393 case NETLIST_REGIONID_SW_BUNDLE64_INIT:
394 nvgpu_log_info(g, "NETLIST_REGIONID_SW_BUNDLE64_INIT");
395 err = gr_gk20a_alloc_load_netlist_av64(g,
396 src, size,
397 &g->gr.ctx_vars.sw_bundle64_init);
398 if (err) {
399 goto clean_up;
400 }
401 break;
402 case NETLIST_REGIONID_NVPERF_PMCAU:
403 nvgpu_log_info(g, "NETLIST_REGIONID_NVPERF_PMCAU");
404 err = gr_gk20a_alloc_load_netlist_aiv(g,
405 src, size,
406 &g->gr.ctx_vars.ctxsw_regs.pm_cau);
407 if (err) {
408 goto clean_up;
409 }
410 break;
411
412 default:
413 nvgpu_log_info(g, "unrecognized region %d skipped", i);
414 break;
415 }
416 }
417
418 if (net != NETLIST_FINAL && major_v != major_v_hw) {
419 nvgpu_log_info(g, "skip %s: major_v 0x%08x doesn't match hw 0x%08x",
420 name, major_v, major_v_hw);
421 goto clean_up;
422 }
423
424 g->gr.ctx_vars.valid = true;
425 g->gr.netlist = net;
426
427 nvgpu_release_firmware(g, netlist_fw);
428 nvgpu_log_fn(g, "done");
429 goto done;
430
431clean_up:
432 g->gr.ctx_vars.valid = false;
433 nvgpu_kfree(g, g->gr.ctx_vars.ucode.fecs.inst.l);
434 nvgpu_kfree(g, g->gr.ctx_vars.ucode.fecs.data.l);
435 nvgpu_kfree(g, g->gr.ctx_vars.ucode.gpccs.inst.l);
436 nvgpu_kfree(g, g->gr.ctx_vars.ucode.gpccs.data.l);
437 nvgpu_kfree(g, g->gr.ctx_vars.sw_bundle_init.l);
438 nvgpu_kfree(g, g->gr.ctx_vars.sw_method_init.l);
439 nvgpu_kfree(g, g->gr.ctx_vars.sw_ctx_load.l);
440 nvgpu_kfree(g, g->gr.ctx_vars.sw_non_ctx_load.l);
441 nvgpu_kfree(g, g->gr.ctx_vars.sw_veid_bundle_init.l);
442 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.sys.l);
443 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.gpc.l);
444 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.tpc.l);
445 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.zcull_gpc.l);
446 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.ppc.l);
447 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_sys.l);
448 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_gpc.l);
449 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_tpc.l);
450 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_ppc.l);
451 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.perf_sys.l);
452 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.fbp.l);
453 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.perf_gpc.l);
454 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.fbp_router.l);
455 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.gpc_router.l);
456 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_ltc.l);
457 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_fbpa.l);
458 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.perf_sys_router.l);
459 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.perf_pma.l);
460 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_rop.l);
461 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_ucgpc.l);
462 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.etpc.l);
463 nvgpu_kfree(g, g->gr.ctx_vars.sw_bundle64_init.l);
464 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_cau.l);
465 nvgpu_release_firmware(g, netlist_fw);
466 err = -ENOENT;
467 }
468
469done:
470 if (g->gr.ctx_vars.valid) {
471 nvgpu_log_info(g, "netlist image %s loaded", name);
472 return 0;
473 } else {
474 nvgpu_err(g, "failed to load netlist image!!");
475 return err;
476 }
477}
478
479int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr)
480{
481 if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
482 return gr_gk20a_init_ctx_vars_sim(g, gr);
483 } else {
484 return gr_gk20a_init_ctx_vars_fw(g, gr);
485 }
486}
diff --git a/include/gk20a/gr_ctx_gk20a.h b/include/gk20a/gr_ctx_gk20a.h
new file mode 100644
index 0000000..e75472c
--- /dev/null
+++ b/include/gk20a/gr_ctx_gk20a.h
@@ -0,0 +1,206 @@
1/*
2 * GK20A Graphics Context
3 *
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24#ifndef NVGPU_GK20A_GR_CTX_GK20A_H
25#define NVGPU_GK20A_GR_CTX_GK20A_H
26
27#include <nvgpu/kmem.h>
28
29struct gr_gk20a;
30
31/* emulation netlists, match majorV with HW */
32#define GK20A_NETLIST_IMAGE_A "NETA_img.bin"
33#define GK20A_NETLIST_IMAGE_B "NETB_img.bin"
34#define GK20A_NETLIST_IMAGE_C "NETC_img.bin"
35#define GK20A_NETLIST_IMAGE_D "NETD_img.bin"
36
37/*
38 * To support multiple ARCHes within the same GPU family, a path like
39 * ARCH/NETIMAGE must be provided so that the correct netimage within
40 * the GPU family is picked up.
41 * Example: gm20x can support gm204 or gm206, so the path
42 * for the netimage is gm204/NETC_img.bin; the '/' char
43 * is inserted at the null terminator of "GAxxx"
44 * to build the complete path, e.g. gm204/NETC_img.bin.
45 */
46#define GPU_ARCH "GAxxx"
47
48union __max_name {
49#ifdef GK20A_NETLIST_IMAGE_A
50 char __name_a[sizeof(GK20A_NETLIST_IMAGE_A)];
51#endif
52#ifdef GK20A_NETLIST_IMAGE_B
53 char __name_b[sizeof(GK20A_NETLIST_IMAGE_B)];
54#endif
55#ifdef GK20A_NETLIST_IMAGE_C
56 char __name_c[sizeof(GK20A_NETLIST_IMAGE_C)];
57#endif
58#ifdef GK20A_NETLIST_IMAGE_D
59 char __name_d[sizeof(GK20A_NETLIST_IMAGE_D)];
60#endif
61};
62
63#define MAX_NETLIST_NAME (sizeof(GPU_ARCH) + sizeof(union __max_name))
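/*
 * sizeof(union __max_name) evaluates to the longest of the netlist image
 * names defined above, so MAX_NETLIST_NAME leaves room for the "GAxxx/"
 * style arch prefix, the longest file name and its terminating NUL.
 */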
64
65/* index for emulation netlists */
66#define NETLIST_FINAL -1
67#define NETLIST_SLOT_A 0
68#define NETLIST_SLOT_B 1
69#define NETLIST_SLOT_C 2
70#define NETLIST_SLOT_D 3
71#define MAX_NETLIST 4
72
73/* netlist regions */
74#define NETLIST_REGIONID_FECS_UCODE_DATA 0
75#define NETLIST_REGIONID_FECS_UCODE_INST 1
76#define NETLIST_REGIONID_GPCCS_UCODE_DATA 2
77#define NETLIST_REGIONID_GPCCS_UCODE_INST 3
78#define NETLIST_REGIONID_SW_BUNDLE_INIT 4
79#define NETLIST_REGIONID_SW_CTX_LOAD 5
80#define NETLIST_REGIONID_SW_NON_CTX_LOAD 6
81#define NETLIST_REGIONID_SW_METHOD_INIT 7
82#define NETLIST_REGIONID_CTXREG_SYS 8
83#define NETLIST_REGIONID_CTXREG_GPC 9
84#define NETLIST_REGIONID_CTXREG_TPC 10
85#define NETLIST_REGIONID_CTXREG_ZCULL_GPC 11
86#define NETLIST_REGIONID_CTXREG_PM_SYS 12
87#define NETLIST_REGIONID_CTXREG_PM_GPC 13
88#define NETLIST_REGIONID_CTXREG_PM_TPC 14
89#define NETLIST_REGIONID_MAJORV 15
90#define NETLIST_REGIONID_BUFFER_SIZE 16
91#define NETLIST_REGIONID_CTXSW_REG_BASE_INDEX 17
92#define NETLIST_REGIONID_NETLIST_NUM 18
93#define NETLIST_REGIONID_CTXREG_PPC 19
94#define NETLIST_REGIONID_CTXREG_PMPPC 20
95#define NETLIST_REGIONID_NVPERF_CTXREG_SYS 21
96#define NETLIST_REGIONID_NVPERF_FBP_CTXREGS 22
97#define NETLIST_REGIONID_NVPERF_CTXREG_GPC 23
98#define NETLIST_REGIONID_NVPERF_FBP_ROUTER 24
99#define NETLIST_REGIONID_NVPERF_GPC_ROUTER 25
100#define NETLIST_REGIONID_CTXREG_PMLTC 26
101#define NETLIST_REGIONID_CTXREG_PMFBPA 27
102#define NETLIST_REGIONID_SWVEIDBUNDLEINIT 28
103#define NETLIST_REGIONID_NVPERF_SYS_ROUTER 29
104#define NETLIST_REGIONID_NVPERF_PMA 30
105#define NETLIST_REGIONID_CTXREG_PMROP 31
106#define NETLIST_REGIONID_CTXREG_PMUCGPC 32
107#define NETLIST_REGIONID_CTXREG_ETPC 33
108#define NETLIST_REGIONID_SW_BUNDLE64_INIT 34
109#define NETLIST_REGIONID_NVPERF_PMCAU 35
110
111struct netlist_region {
112 u32 region_id;
113 u32 data_size;
114 u32 data_offset;
115};
116
117struct netlist_image_header {
118 u32 version;
119 u32 regions;
120};
121
122struct netlist_image {
123 struct netlist_image_header header;
124 struct netlist_region regions[1];
125};
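/*
 * A netlist image is parsed as header.regions entries of struct
 * netlist_region laid out directly after the header; each entry's
 * data_offset and data_size are relative to the start of the image,
 * which is how gr_gk20a_init_ctx_vars_fw() walks the firmware blob.
 */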
126
127struct av_gk20a {
128 u32 addr;
129 u32 value;
130};
131struct av64_gk20a {
132 u32 addr;
133 u32 value_lo;
134 u32 value_hi;
135};
136struct aiv_gk20a {
137 u32 addr;
138 u32 index;
139 u32 value;
140};
141struct aiv_list_gk20a {
142 struct aiv_gk20a *l;
143 u32 count;
144};
145struct av_list_gk20a {
146 struct av_gk20a *l;
147 u32 count;
148};
149struct av64_list_gk20a {
150 struct av64_gk20a *l;
151 u32 count;
152};
153struct u32_list_gk20a {
154 u32 *l;
155 u32 count;
156};
157
158struct ctxsw_buf_offset_map_entry {
159 u32 addr; /* Register address */
160 u32 offset; /* Offset in ctxt switch buffer */
161};
162
163static inline
164struct av_gk20a *alloc_av_list_gk20a(struct gk20a *g, struct av_list_gk20a *avl)
165{
166 avl->l = nvgpu_kzalloc(g, avl->count * sizeof(*avl->l));
167 return avl->l;
168}
169
170static inline
171struct av64_gk20a *alloc_av64_list_gk20a(struct gk20a *g, struct av64_list_gk20a *avl)
172{
173 avl->l = nvgpu_kzalloc(g, avl->count * sizeof(*avl->l));
174 return avl->l;
175}
176
177static inline
178struct aiv_gk20a *alloc_aiv_list_gk20a(struct gk20a *g,
179 struct aiv_list_gk20a *aivl)
180{
181 aivl->l = nvgpu_kzalloc(g, aivl->count * sizeof(*aivl->l));
182 return aivl->l;
183}
184
185static inline
186u32 *alloc_u32_list_gk20a(struct gk20a *g, struct u32_list_gk20a *u32l)
187{
188 u32l->l = nvgpu_kzalloc(g, u32l->count * sizeof(*u32l->l));
189 return u32l->l;
190}
191
192struct gr_ucode_gk20a {
193 struct {
194 struct u32_list_gk20a inst;
195 struct u32_list_gk20a data;
196 } gpccs, fecs;
197};
198
199/* main entry for grctx loading */
200int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr);
201int gr_gk20a_init_ctx_vars_sim(struct gk20a *g, struct gr_gk20a *gr);
202
203struct gpu_ops;
204void gk20a_init_gr_ctx(struct gpu_ops *gops);
205
206#endif /*NVGPU_GK20A_GR_CTX_GK20A_H*/
diff --git a/include/gk20a/gr_ctx_gk20a_sim.c b/include/gk20a/gr_ctx_gk20a_sim.c
new file mode 100644
index 0000000..ce65c77
--- /dev/null
+++ b/include/gk20a/gr_ctx_gk20a_sim.c
@@ -0,0 +1,356 @@
1/*
2 * GK20A Graphics Context for Simulation
3 *
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include "gk20a.h"
26#include <nvgpu/sim.h>
27#include "gr_ctx_gk20a.h"
28
29#include <nvgpu/log.h>
30
31int gr_gk20a_init_ctx_vars_sim(struct gk20a *g, struct gr_gk20a *gr)
32{
33 int err = -ENOMEM;
34 u32 i, temp;
35
36 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_info,
37 "querying grctx info from chiplib");
38
39 g->gr.ctx_vars.dynamic = true;
40 g->gr.netlist = GR_NETLIST_DYNAMIC;
41
42 if (g->sim->esc_readl == NULL) {
43 nvgpu_err(g, "Invalid pointer to query function.");
44 err = -ENOENT;
45 goto fail;
46 }
47
48 /* query sizes and counts */
49 g->sim->esc_readl(g, "GRCTX_UCODE_INST_FECS_COUNT", 0,
50 &g->gr.ctx_vars.ucode.fecs.inst.count);
51 g->sim->esc_readl(g, "GRCTX_UCODE_DATA_FECS_COUNT", 0,
52 &g->gr.ctx_vars.ucode.fecs.data.count);
53 g->sim->esc_readl(g, "GRCTX_UCODE_INST_GPCCS_COUNT", 0,
54 &g->gr.ctx_vars.ucode.gpccs.inst.count);
55 g->sim->esc_readl(g, "GRCTX_UCODE_DATA_GPCCS_COUNT", 0,
56 &g->gr.ctx_vars.ucode.gpccs.data.count);
57 g->sim->esc_readl(g, "GRCTX_ALL_CTX_TOTAL_WORDS", 0, &temp);
58 g->gr.ctx_vars.buffer_size = temp << 2;
59 g->sim->esc_readl(g, "GRCTX_SW_BUNDLE_INIT_SIZE", 0,
60 &g->gr.ctx_vars.sw_bundle_init.count);
61 g->sim->esc_readl(g, "GRCTX_SW_METHOD_INIT_SIZE", 0,
62 &g->gr.ctx_vars.sw_method_init.count);
63 g->sim->esc_readl(g, "GRCTX_SW_CTX_LOAD_SIZE", 0,
64 &g->gr.ctx_vars.sw_ctx_load.count);
65 g->sim->esc_readl(g, "GRCTX_SW_VEID_BUNDLE_INIT_SIZE", 0,
66 &g->gr.ctx_vars.sw_veid_bundle_init.count);
67 g->sim->esc_readl(g, "GRCTX_SW_BUNDLE64_INIT_SIZE", 0,
68 &g->gr.ctx_vars.sw_bundle64_init.count);
69
70 g->sim->esc_readl(g, "GRCTX_NONCTXSW_REG_SIZE", 0,
71 &g->gr.ctx_vars.sw_non_ctx_load.count);
72 g->sim->esc_readl(g, "GRCTX_REG_LIST_SYS_COUNT", 0,
73 &g->gr.ctx_vars.ctxsw_regs.sys.count);
74 g->sim->esc_readl(g, "GRCTX_REG_LIST_GPC_COUNT", 0,
75 &g->gr.ctx_vars.ctxsw_regs.gpc.count);
76 g->sim->esc_readl(g, "GRCTX_REG_LIST_TPC_COUNT", 0,
77 &g->gr.ctx_vars.ctxsw_regs.tpc.count);
78 g->sim->esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC_COUNT", 0,
79 &g->gr.ctx_vars.ctxsw_regs.zcull_gpc.count);
80 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_SYS_COUNT", 0,
81 &g->gr.ctx_vars.ctxsw_regs.pm_sys.count);
82 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_GPC_COUNT", 0,
83 &g->gr.ctx_vars.ctxsw_regs.pm_gpc.count);
84 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_TPC_COUNT", 0,
85 &g->gr.ctx_vars.ctxsw_regs.pm_tpc.count);
86 g->sim->esc_readl(g, "GRCTX_REG_LIST_PPC_COUNT", 0,
87 &g->gr.ctx_vars.ctxsw_regs.ppc.count);
88 g->sim->esc_readl(g, "GRCTX_REG_LIST_ETPC_COUNT", 0,
89 &g->gr.ctx_vars.ctxsw_regs.etpc.count);
90 g->sim->esc_readl(g, "GRCTX_REG_LIST_PPC_COUNT", 0,
91 &g->gr.ctx_vars.ctxsw_regs.ppc.count);
92
93 if (alloc_u32_list_gk20a(g, &g->gr.ctx_vars.ucode.fecs.inst) == NULL) {
94 goto fail;
95 }
96 if (alloc_u32_list_gk20a(g, &g->gr.ctx_vars.ucode.fecs.data) == NULL) {
97 goto fail;
98 }
99 if (alloc_u32_list_gk20a(g, &g->gr.ctx_vars.ucode.gpccs.inst) == NULL) {
100 goto fail;
101 }
102 if (alloc_u32_list_gk20a(g, &g->gr.ctx_vars.ucode.gpccs.data) == NULL) {
103 goto fail;
104 }
105 if (alloc_av_list_gk20a(g, &g->gr.ctx_vars.sw_bundle_init) == NULL) {
106 goto fail;
107 }
108 if (alloc_av64_list_gk20a(g,
109 &g->gr.ctx_vars.sw_bundle64_init) == NULL) {
110 goto fail;
111 }
112 if (alloc_av_list_gk20a(g, &g->gr.ctx_vars.sw_method_init) == NULL) {
113 goto fail;
114 }
115 if (alloc_aiv_list_gk20a(g, &g->gr.ctx_vars.sw_ctx_load) == NULL) {
116 goto fail;
117 }
118 if (alloc_av_list_gk20a(g, &g->gr.ctx_vars.sw_non_ctx_load) == NULL) {
119 goto fail;
120 }
121 if (alloc_av_list_gk20a(g,
122 &g->gr.ctx_vars.sw_veid_bundle_init) == NULL) {
123 goto fail;
124 }
125 if (alloc_aiv_list_gk20a(g, &g->gr.ctx_vars.ctxsw_regs.sys) == NULL) {
126 goto fail;
127 }
128 if (alloc_aiv_list_gk20a(g, &g->gr.ctx_vars.ctxsw_regs.gpc) == NULL) {
129 goto fail;
130 }
131 if (alloc_aiv_list_gk20a(g, &g->gr.ctx_vars.ctxsw_regs.tpc) == NULL) {
132 goto fail;
133 }
134 if (alloc_aiv_list_gk20a(g,
135 &g->gr.ctx_vars.ctxsw_regs.zcull_gpc) == NULL) {
136 goto fail;
137 }
138 if (alloc_aiv_list_gk20a(g, &g->gr.ctx_vars.ctxsw_regs.ppc) == NULL) {
139 goto fail;
140 }
141 if (alloc_aiv_list_gk20a(g,
142 &g->gr.ctx_vars.ctxsw_regs.pm_sys) == NULL) {
143 goto fail;
144 }
145 if (alloc_aiv_list_gk20a(g,
146 &g->gr.ctx_vars.ctxsw_regs.pm_gpc) == NULL) {
147 goto fail;
148 }
149 if (alloc_aiv_list_gk20a(g,
150 &g->gr.ctx_vars.ctxsw_regs.pm_tpc) == NULL) {
151 goto fail;
152 }
153 if (alloc_aiv_list_gk20a(g, &g->gr.ctx_vars.ctxsw_regs.etpc) == NULL) {
154 goto fail;
155 }
156
157 for (i = 0; i < g->gr.ctx_vars.ucode.fecs.inst.count; i++) {
158 g->sim->esc_readl(g, "GRCTX_UCODE_INST_FECS",
159 i, &g->gr.ctx_vars.ucode.fecs.inst.l[i]);
160 }
161
162 for (i = 0; i < g->gr.ctx_vars.ucode.fecs.data.count; i++) {
163 g->sim->esc_readl(g, "GRCTX_UCODE_DATA_FECS",
164 i, &g->gr.ctx_vars.ucode.fecs.data.l[i]);
165 }
166
167 for (i = 0; i < g->gr.ctx_vars.ucode.gpccs.inst.count; i++) {
168 g->sim->esc_readl(g, "GRCTX_UCODE_INST_GPCCS",
169 i, &g->gr.ctx_vars.ucode.gpccs.inst.l[i]);
170 }
171
172 for (i = 0; i < g->gr.ctx_vars.ucode.gpccs.data.count; i++) {
173 g->sim->esc_readl(g, "GRCTX_UCODE_DATA_GPCCS",
174 i, &g->gr.ctx_vars.ucode.gpccs.data.l[i]);
175 }
176
177 for (i = 0; i < g->gr.ctx_vars.sw_bundle_init.count; i++) {
178 struct av_gk20a *l = g->gr.ctx_vars.sw_bundle_init.l;
179 g->sim->esc_readl(g, "GRCTX_SW_BUNDLE_INIT:ADDR",
180 i, &l[i].addr);
181 g->sim->esc_readl(g, "GRCTX_SW_BUNDLE_INIT:VALUE",
182 i, &l[i].value);
183 }
184
185 for (i = 0; i < g->gr.ctx_vars.sw_method_init.count; i++) {
186 struct av_gk20a *l = g->gr.ctx_vars.sw_method_init.l;
187 g->sim->esc_readl(g, "GRCTX_SW_METHOD_INIT:ADDR",
188 i, &l[i].addr);
189 g->sim->esc_readl(g, "GRCTX_SW_METHOD_INIT:VALUE",
190 i, &l[i].value);
191 }
192
193 for (i = 0; i < g->gr.ctx_vars.sw_ctx_load.count; i++) {
194 struct aiv_gk20a *l = g->gr.ctx_vars.sw_ctx_load.l;
195 g->sim->esc_readl(g, "GRCTX_SW_CTX_LOAD:ADDR",
196 i, &l[i].addr);
197 g->sim->esc_readl(g, "GRCTX_SW_CTX_LOAD:INDEX",
198 i, &l[i].index);
199 g->sim->esc_readl(g, "GRCTX_SW_CTX_LOAD:VALUE",
200 i, &l[i].value);
201 }
202
203 for (i = 0; i < g->gr.ctx_vars.sw_non_ctx_load.count; i++) {
204 struct av_gk20a *l = g->gr.ctx_vars.sw_non_ctx_load.l;
205 g->sim->esc_readl(g, "GRCTX_NONCTXSW_REG:REG",
206 i, &l[i].addr);
207 g->sim->esc_readl(g, "GRCTX_NONCTXSW_REG:VALUE",
208 i, &l[i].value);
209 }
210
211 for (i = 0; i < g->gr.ctx_vars.sw_veid_bundle_init.count; i++) {
212 struct av_gk20a *l = g->gr.ctx_vars.sw_veid_bundle_init.l;
213
214 g->sim->esc_readl(g, "GRCTX_SW_VEID_BUNDLE_INIT:ADDR",
215 i, &l[i].addr);
216 g->sim->esc_readl(g, "GRCTX_SW_VEID_BUNDLE_INIT:VALUE",
217 i, &l[i].value);
218 }
219
220 for (i = 0; i < g->gr.ctx_vars.sw_bundle64_init.count; i++) {
221 struct av64_gk20a *l = g->gr.ctx_vars.sw_bundle64_init.l;
222
223 g->sim->esc_readl(g, "GRCTX_SW_BUNDLE64_INIT:ADDR",
224 i, &l[i].addr);
225 g->sim->esc_readl(g, "GRCTX_SW_BUNDLE64_INIT:VALUE_LO",
226 i, &l[i].value_lo);
227 g->sim->esc_readl(g, "GRCTX_SW_BUNDLE64_INIT:VALUE_HI",
228 i, &l[i].value_hi);
229 }
230
231 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
232 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.sys.l;
233 g->sim->esc_readl(g, "GRCTX_REG_LIST_SYS:ADDR",
234 i, &l[i].addr);
235 g->sim->esc_readl(g, "GRCTX_REG_LIST_SYS:INDEX",
236 i, &l[i].index);
237 g->sim->esc_readl(g, "GRCTX_REG_LIST_SYS:VALUE",
238 i, &l[i].value);
239 }
240
241 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
242 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.gpc.l;
243 g->sim->esc_readl(g, "GRCTX_REG_LIST_GPC:ADDR",
244 i, &l[i].addr);
245 g->sim->esc_readl(g, "GRCTX_REG_LIST_GPC:INDEX",
246 i, &l[i].index);
247 g->sim->esc_readl(g, "GRCTX_REG_LIST_GPC:VALUE",
248 i, &l[i].value);
249 }
250
251 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
252 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.tpc.l;
253 g->sim->esc_readl(g, "GRCTX_REG_LIST_TPC:ADDR",
254 i, &l[i].addr);
255 g->sim->esc_readl(g, "GRCTX_REG_LIST_TPC:INDEX",
256 i, &l[i].index);
257 g->sim->esc_readl(g, "GRCTX_REG_LIST_TPC:VALUE",
258 i, &l[i].value);
259 }
260
261 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
262 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.ppc.l;
263 g->sim->esc_readl(g, "GRCTX_REG_LIST_PPC:ADDR",
264 i, &l[i].addr);
265 g->sim->esc_readl(g, "GRCTX_REG_LIST_PPC:INDEX",
266 i, &l[i].index);
267 g->sim->esc_readl(g, "GRCTX_REG_LIST_PPC:VALUE",
268 i, &l[i].value);
269 }
270
271 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.zcull_gpc.count; i++) {
272 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.zcull_gpc.l;
273 g->sim->esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:ADDR",
274 i, &l[i].addr);
275 g->sim->esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:INDEX",
276 i, &l[i].index);
277 g->sim->esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:VALUE",
278 i, &l[i].value);
279 }
280
281 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_sys.count; i++) {
282 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_sys.l;
283 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_SYS:ADDR",
284 i, &l[i].addr);
285 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_SYS:INDEX",
286 i, &l[i].index);
287 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_SYS:VALUE",
288 i, &l[i].value);
289 }
290
291 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_gpc.count; i++) {
292 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_gpc.l;
293 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_GPC:ADDR",
294 i, &l[i].addr);
295 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_GPC:INDEX",
296 i, &l[i].index);
297 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_GPC:VALUE",
298 i, &l[i].value);
299 }
300
301 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_tpc.count; i++) {
302 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_tpc.l;
303 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_TPC:ADDR",
304 i, &l[i].addr);
305 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_TPC:INDEX",
306 i, &l[i].index);
307 g->sim->esc_readl(g, "GRCTX_REG_LIST_PM_TPC:VALUE",
308 i, &l[i].value);
309 }
310
311 nvgpu_log(g, gpu_dbg_info | gpu_dbg_fn, "query GRCTX_REG_LIST_ETPC");
312 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.etpc.count; i++) {
313 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.etpc.l;
314 g->sim->esc_readl(g, "GRCTX_REG_LIST_ETPC:ADDR",
315 i, &l[i].addr);
316 g->sim->esc_readl(g, "GRCTX_REG_LIST_ETPC:INDEX",
317 i, &l[i].index);
318 g->sim->esc_readl(g, "GRCTX_REG_LIST_ETPC:VALUE",
319 i, &l[i].value);
320 nvgpu_log(g, gpu_dbg_info | gpu_dbg_fn,
321				"addr:0x%08x index:0x%08x value:0x%08x",
322 l[i].addr, l[i].index, l[i].value);
323 }
324
325 g->gr.ctx_vars.valid = true;
326
327 g->sim->esc_readl(g, "GRCTX_GEN_CTX_REGS_BASE_INDEX", 0,
328 &g->gr.ctx_vars.regs_base_index);
329
330 nvgpu_log(g, gpu_dbg_info | gpu_dbg_fn, "finished querying grctx info from chiplib");
331 return 0;
332fail:
333 nvgpu_err(g, "failed querying grctx info from chiplib");
334
335 nvgpu_kfree(g, g->gr.ctx_vars.ucode.fecs.inst.l);
336 nvgpu_kfree(g, g->gr.ctx_vars.ucode.fecs.data.l);
337 nvgpu_kfree(g, g->gr.ctx_vars.ucode.gpccs.inst.l);
338 nvgpu_kfree(g, g->gr.ctx_vars.ucode.gpccs.data.l);
339 nvgpu_kfree(g, g->gr.ctx_vars.sw_bundle_init.l);
340 nvgpu_kfree(g, g->gr.ctx_vars.sw_bundle64_init.l);
341 nvgpu_kfree(g, g->gr.ctx_vars.sw_method_init.l);
342 nvgpu_kfree(g, g->gr.ctx_vars.sw_ctx_load.l);
343 nvgpu_kfree(g, g->gr.ctx_vars.sw_non_ctx_load.l);
344 nvgpu_kfree(g, g->gr.ctx_vars.sw_veid_bundle_init.l);
345 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.sys.l);
346 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.gpc.l);
347 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.tpc.l);
348 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.zcull_gpc.l);
349 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.ppc.l);
350 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_sys.l);
351 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_gpc.l);
352 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.pm_tpc.l);
353 nvgpu_kfree(g, g->gr.ctx_vars.ctxsw_regs.etpc.l);
354
355 return err;
356}
diff --git a/include/gk20a/gr_gk20a.c b/include/gk20a/gr_gk20a.c
new file mode 100644
index 0000000..7bcf528
--- /dev/null
+++ b/include/gk20a/gr_gk20a.c
@@ -0,0 +1,8998 @@
1/*
2 * GK20A Graphics
3 *
4 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include <nvgpu/dma.h>
26#include <nvgpu/kmem.h>
27#include <nvgpu/gmmu.h>
28#include <nvgpu/timers.h>
29#include <nvgpu/nvgpu_common.h>
30#include <nvgpu/log.h>
31#include <nvgpu/bsearch.h>
32#include <nvgpu/sort.h>
33#include <nvgpu/bug.h>
34#include <nvgpu/firmware.h>
35#include <nvgpu/enabled.h>
36#include <nvgpu/debug.h>
37#include <nvgpu/barrier.h>
38#include <nvgpu/mm.h>
39#include <nvgpu/ctxsw_trace.h>
40#include <nvgpu/error_notifier.h>
41#include <nvgpu/ecc.h>
42#include <nvgpu/io.h>
43#include <nvgpu/utils.h>
44#include <nvgpu/channel.h>
45#include <nvgpu/unit.h>
46#include <nvgpu/power_features/pg.h>
47#include <nvgpu/power_features/cg.h>
48
49#include "gk20a.h"
50#include "gr_gk20a.h"
51#include "gk20a/fecs_trace_gk20a.h"
52#include "gr_ctx_gk20a.h"
53#include "gr_pri_gk20a.h"
54#include "regops_gk20a.h"
55#include "dbg_gpu_gk20a.h"
56
57#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
58#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
59#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
60#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
61#include <nvgpu/hw/gk20a/hw_gmmu_gk20a.h>
62#include <nvgpu/hw/gk20a/hw_mc_gk20a.h>
63#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
64#include <nvgpu/hw/gk20a/hw_pri_ringmaster_gk20a.h>
65#include <nvgpu/hw/gk20a/hw_top_gk20a.h>
66#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
67
68#define BLK_SIZE (256)
69#define NV_PERF_PMM_FBP_ROUTER_STRIDE 0x0200
70#define NV_PERF_PMMGPCROUTER_STRIDE 0x0200
71#define NV_PCFG_BASE 0x00088000
72#define NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE 0x0020
73#define FE_PWR_MODE_TIMEOUT_MAX 2000
74#define FE_PWR_MODE_TIMEOUT_DEFAULT 10
75#define CTXSW_MEM_SCRUBBING_TIMEOUT_MAX 1000
76#define CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT 10
77#define FECS_ARB_CMD_TIMEOUT_MAX 40
78#define FECS_ARB_CMD_TIMEOUT_DEFAULT 2
79
80static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g);
81
82static void gr_gk20a_free_channel_pm_ctx(struct gk20a *g,
83 struct vm_gk20a *vm,
84 struct nvgpu_gr_ctx *gr_ctx);
85
86/* channel patch ctx buffer */
87static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
88 struct channel_gk20a *c);
89static void gr_gk20a_free_channel_patch_ctx(struct gk20a *g,
90 struct vm_gk20a *vm,
91 struct nvgpu_gr_ctx *gr_ctx);
92
93/* golden ctx image */
94static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
95 struct channel_gk20a *c);
96
97int gr_gk20a_get_ctx_id(struct gk20a *g,
98 struct channel_gk20a *c,
99 u32 *ctx_id)
100{
101 struct tsg_gk20a *tsg;
102 struct nvgpu_gr_ctx *gr_ctx = NULL;
103 struct nvgpu_mem *mem = NULL;
104
105 tsg = tsg_gk20a_from_ch(c);
106 if (tsg == NULL) {
107 return -EINVAL;
108 }
109
110 gr_ctx = &tsg->gr_ctx;
111 mem = &gr_ctx->mem;
112
113 /* Channel gr_ctx buffer is gpu cacheable.
114 Flush and invalidate before cpu update. */
115 g->ops.mm.l2_flush(g, true);
116
117 *ctx_id = nvgpu_mem_rd(g, mem,
118 ctxsw_prog_main_image_context_id_o());
119 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, "ctx_id: 0x%x", *ctx_id);
120
121 return 0;
122}
123
124void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
125{
126 unsigned int i;
127
128 nvgpu_err(g, "gr_fecs_os_r : %d",
129 gk20a_readl(g, gr_fecs_os_r()));
130 nvgpu_err(g, "gr_fecs_cpuctl_r : 0x%x",
131 gk20a_readl(g, gr_fecs_cpuctl_r()));
132 nvgpu_err(g, "gr_fecs_idlestate_r : 0x%x",
133 gk20a_readl(g, gr_fecs_idlestate_r()));
134 nvgpu_err(g, "gr_fecs_mailbox0_r : 0x%x",
135 gk20a_readl(g, gr_fecs_mailbox0_r()));
136 nvgpu_err(g, "gr_fecs_mailbox1_r : 0x%x",
137 gk20a_readl(g, gr_fecs_mailbox1_r()));
138 nvgpu_err(g, "gr_fecs_irqstat_r : 0x%x",
139 gk20a_readl(g, gr_fecs_irqstat_r()));
140 nvgpu_err(g, "gr_fecs_irqmode_r : 0x%x",
141 gk20a_readl(g, gr_fecs_irqmode_r()));
142 nvgpu_err(g, "gr_fecs_irqmask_r : 0x%x",
143 gk20a_readl(g, gr_fecs_irqmask_r()));
144 nvgpu_err(g, "gr_fecs_irqdest_r : 0x%x",
145 gk20a_readl(g, gr_fecs_irqdest_r()));
146 nvgpu_err(g, "gr_fecs_debug1_r : 0x%x",
147 gk20a_readl(g, gr_fecs_debug1_r()));
148 nvgpu_err(g, "gr_fecs_debuginfo_r : 0x%x",
149 gk20a_readl(g, gr_fecs_debuginfo_r()));
150 nvgpu_err(g, "gr_fecs_ctxsw_status_1_r : 0x%x",
151 gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
152
153 for (i = 0; i < g->ops.gr.fecs_ctxsw_mailbox_size(); i++) {
154 nvgpu_err(g, "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
155 i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
156 }
157
158 nvgpu_err(g, "gr_fecs_engctl_r : 0x%x",
159 gk20a_readl(g, gr_fecs_engctl_r()));
160 nvgpu_err(g, "gr_fecs_curctx_r : 0x%x",
161 gk20a_readl(g, gr_fecs_curctx_r()));
162 nvgpu_err(g, "gr_fecs_nxtctx_r : 0x%x",
163 gk20a_readl(g, gr_fecs_nxtctx_r()));
164
165 gk20a_writel(g, gr_fecs_icd_cmd_r(),
166 gr_fecs_icd_cmd_opc_rreg_f() |
167 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
168 nvgpu_err(g, "FECS_FALCON_REG_IMB : 0x%x",
169 gk20a_readl(g, gr_fecs_icd_rdata_r()));
170
171 gk20a_writel(g, gr_fecs_icd_cmd_r(),
172 gr_fecs_icd_cmd_opc_rreg_f() |
173 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
174 nvgpu_err(g, "FECS_FALCON_REG_DMB : 0x%x",
175 gk20a_readl(g, gr_fecs_icd_rdata_r()));
176
177 gk20a_writel(g, gr_fecs_icd_cmd_r(),
178 gr_fecs_icd_cmd_opc_rreg_f() |
179 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
180 nvgpu_err(g, "FECS_FALCON_REG_CSW : 0x%x",
181 gk20a_readl(g, gr_fecs_icd_rdata_r()));
182
183 gk20a_writel(g, gr_fecs_icd_cmd_r(),
184 gr_fecs_icd_cmd_opc_rreg_f() |
185 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
186 nvgpu_err(g, "FECS_FALCON_REG_CTX : 0x%x",
187 gk20a_readl(g, gr_fecs_icd_rdata_r()));
188
189 gk20a_writel(g, gr_fecs_icd_cmd_r(),
190 gr_fecs_icd_cmd_opc_rreg_f() |
191 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
192 nvgpu_err(g, "FECS_FALCON_REG_EXCI : 0x%x",
193 gk20a_readl(g, gr_fecs_icd_rdata_r()));
194
195 for (i = 0; i < 4; i++) {
196 gk20a_writel(g, gr_fecs_icd_cmd_r(),
197 gr_fecs_icd_cmd_opc_rreg_f() |
198 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
199 nvgpu_err(g, "FECS_FALCON_REG_PC : 0x%x",
200 gk20a_readl(g, gr_fecs_icd_rdata_r()));
201
202 gk20a_writel(g, gr_fecs_icd_cmd_r(),
203 gr_fecs_icd_cmd_opc_rreg_f() |
204 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
205 nvgpu_err(g, "FECS_FALCON_REG_SP : 0x%x",
206 gk20a_readl(g, gr_fecs_icd_rdata_r()));
207 }
208}
209
210static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
211{
212 u32 i, ucode_u32_size;
213 const u32 *ucode_u32_data;
214 u32 checksum;
215
216 nvgpu_log_fn(g, " ");
217
218 gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
219 gr_gpccs_dmemc_blk_f(0) |
220 gr_gpccs_dmemc_aincw_f(1)));
221
222 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
223 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
224
225 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
226 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
227 checksum += ucode_u32_data[i];
228 }
229
230 gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
231 gr_fecs_dmemc_blk_f(0) |
232 gr_fecs_dmemc_aincw_f(1)));
233
234 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
235 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
236
237 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
238 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
239 checksum += ucode_u32_data[i];
240 }
241 nvgpu_log_fn(g, "done");
242}
243
244static void gr_gk20a_load_falcon_imem(struct gk20a *g)
245{
246 u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
247 const u32 *ucode_u32_data;
248 u32 tag, i, pad_start, pad_end;
249 u32 checksum;
250
251 nvgpu_log_fn(g, " ");
252
253 cfg = gk20a_readl(g, gr_fecs_cfg_r());
254 fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
255
256 cfg = gk20a_readl(g, gr_gpc0_cfg_r());
257 gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
258
259 /* Use the broadcast address to access all of the GPCCS units. */
260 gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
261 gr_gpccs_imemc_blk_f(0) |
262 gr_gpccs_imemc_aincw_f(1)));
263
264 /* Setup the tags for the instruction memory. */
265 tag = 0;
266 gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
267
268 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
269 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
270
271 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
272 if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) {
273 tag++;
274 gk20a_writel(g, gr_gpccs_imemt_r(0),
275 gr_gpccs_imemt_tag_f(tag));
276 }
277 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
278 checksum += ucode_u32_data[i];
279 }
280
281 pad_start = i * 4U;
282 pad_end = pad_start + (256U - pad_start % 256U) + 256U;
283 for (i = pad_start;
284 (i < gpccs_imem_size * 256U) && (i < pad_end);
285 i += 4U) {
286 if ((i != 0U) && ((i % 256U) == 0U)) {
287 tag++;
288 gk20a_writel(g, gr_gpccs_imemt_r(0),
289 gr_gpccs_imemt_tag_f(tag));
290 }
291 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
292 }
293
294 gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
295 gr_fecs_imemc_blk_f(0) |
296 gr_fecs_imemc_aincw_f(1)));
297
298 /* Setup the tags for the instruction memory. */
299 tag = 0;
300 gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
301
302 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
303 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
304
305 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
306 if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) {
307 tag++;
308 gk20a_writel(g, gr_fecs_imemt_r(0),
309 gr_fecs_imemt_tag_f(tag));
310 }
311 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
312 checksum += ucode_u32_data[i];
313 }
314
315 pad_start = i * 4U;
316 pad_end = pad_start + (256U - pad_start % 256U) + 256U;
317 for (i = pad_start;
318 (i < fecs_imem_size * 256U) && i < pad_end;
319 i += 4U) {
320 if ((i != 0U) && ((i % 256U) == 0U)) {
321 tag++;
322 gk20a_writel(g, gr_fecs_imemt_r(0),
323 gr_fecs_imemt_tag_f(tag));
324 }
325 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
326 }
327}
328
329int gr_gk20a_wait_idle(struct gk20a *g, unsigned long duration_ms,
330 u32 expect_delay)
331{
332 u32 delay = expect_delay;
333 bool ctxsw_active;
334 bool gr_busy;
335 u32 gr_engine_id;
336 u32 engine_status;
337 bool ctx_status_invalid;
338 struct nvgpu_timeout timeout;
339
340 nvgpu_log_fn(g, " ");
341
342 gr_engine_id = gk20a_fifo_get_gr_engine_id(g);
343
344 nvgpu_timeout_init(g, &timeout, duration_ms, NVGPU_TIMER_CPU_TIMER);
345
346 do {
347 /* fmodel: host gets fifo_engine_status(gr) from gr
348 only when gr_status is read */
349 (void) gk20a_readl(g, gr_status_r());
350
351 engine_status = gk20a_readl(g,
352 fifo_engine_status_r(gr_engine_id));
353
354 ctxsw_active = engine_status &
355 fifo_engine_status_ctxsw_in_progress_f();
356
357 ctx_status_invalid =
358 (fifo_engine_status_ctx_status_v(engine_status) ==
359 fifo_engine_status_ctx_status_invalid_v());
360
361 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
362 gr_engine_status_value_busy_f();
363
364 if (ctx_status_invalid || (!gr_busy && !ctxsw_active)) {
365 nvgpu_log_fn(g, "done");
366 return 0;
367 }
368
369 nvgpu_usleep_range(delay, delay * 2);
370 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
371
372 } while (nvgpu_timeout_expired(&timeout) == 0);
373
374 nvgpu_err(g,
375 "timeout, ctxsw busy : %d, gr busy : %d",
376 ctxsw_active, gr_busy);
377
378 return -EAGAIN;
379}
380
381int gr_gk20a_wait_fe_idle(struct gk20a *g, unsigned long duration_ms,
382 u32 expect_delay)
383{
384 u32 val;
385 u32 delay = expect_delay;
386 struct nvgpu_timeout timeout;
387
388 if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
389 return 0;
390 }
391
392 nvgpu_log_fn(g, " ");
393
394 nvgpu_timeout_init(g, &timeout, duration_ms, NVGPU_TIMER_CPU_TIMER);
395
396 do {
397 val = gk20a_readl(g, gr_status_r());
398
399 if (gr_status_fe_method_lower_v(val) == 0U) {
400 nvgpu_log_fn(g, "done");
401 return 0;
402 }
403
404 nvgpu_usleep_range(delay, delay * 2);
405 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
406 } while (nvgpu_timeout_expired(&timeout) == 0);
407
408 nvgpu_err(g,
409 "timeout, fe busy : %x", val);
410
411 return -EAGAIN;
412}
413
414int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
415 u32 *mailbox_ret, u32 opc_success,
416 u32 mailbox_ok, u32 opc_fail,
417 u32 mailbox_fail, bool sleepduringwait)
418{
419 struct nvgpu_timeout timeout;
420 u32 delay = GR_FECS_POLL_INTERVAL;
421 u32 check = WAIT_UCODE_LOOP;
422 u32 reg;
423
424 nvgpu_log_fn(g, " ");
425
426 if (sleepduringwait) {
427 delay = GR_IDLE_CHECK_DEFAULT;
428 }
429
430 nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
431 NVGPU_TIMER_CPU_TIMER);
432
433 while (check == WAIT_UCODE_LOOP) {
434 if (nvgpu_timeout_expired(&timeout)) {
435 check = WAIT_UCODE_TIMEOUT;
436 }
437
438 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
439
440 if (mailbox_ret) {
441 *mailbox_ret = reg;
442 }
443
444 switch (opc_success) {
445 case GR_IS_UCODE_OP_EQUAL:
446 if (reg == mailbox_ok) {
447 check = WAIT_UCODE_OK;
448 }
449 break;
450 case GR_IS_UCODE_OP_NOT_EQUAL:
451 if (reg != mailbox_ok) {
452 check = WAIT_UCODE_OK;
453 }
454 break;
455 case GR_IS_UCODE_OP_AND:
456 if (reg & mailbox_ok) {
457 check = WAIT_UCODE_OK;
458 }
459 break;
460 case GR_IS_UCODE_OP_LESSER:
461 if (reg < mailbox_ok) {
462 check = WAIT_UCODE_OK;
463 }
464 break;
465 case GR_IS_UCODE_OP_LESSER_EQUAL:
466 if (reg <= mailbox_ok) {
467 check = WAIT_UCODE_OK;
468 }
469 break;
470 case GR_IS_UCODE_OP_SKIP:
471			/* no success check */
472 break;
473 default:
474 nvgpu_err(g,
475 "invalid success opcode 0x%x", opc_success);
476
477 check = WAIT_UCODE_ERROR;
478 break;
479 }
480
481 switch (opc_fail) {
482 case GR_IS_UCODE_OP_EQUAL:
483 if (reg == mailbox_fail) {
484 check = WAIT_UCODE_ERROR;
485 }
486 break;
487 case GR_IS_UCODE_OP_NOT_EQUAL:
488 if (reg != mailbox_fail) {
489 check = WAIT_UCODE_ERROR;
490 }
491 break;
492 case GR_IS_UCODE_OP_AND:
493 if (reg & mailbox_fail) {
494 check = WAIT_UCODE_ERROR;
495 }
496 break;
497 case GR_IS_UCODE_OP_LESSER:
498 if (reg < mailbox_fail) {
499 check = WAIT_UCODE_ERROR;
500 }
501 break;
502 case GR_IS_UCODE_OP_LESSER_EQUAL:
503 if (reg <= mailbox_fail) {
504 check = WAIT_UCODE_ERROR;
505 }
506 break;
507 case GR_IS_UCODE_OP_SKIP:
508			/* no check on fail */
509 break;
510 default:
511 nvgpu_err(g,
512 "invalid fail opcode 0x%x", opc_fail);
513 check = WAIT_UCODE_ERROR;
514 break;
515 }
516
517 if (sleepduringwait) {
518 nvgpu_usleep_range(delay, delay * 2);
519 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
520 } else {
521 nvgpu_udelay(delay);
522 }
523 }
524
525 if (check == WAIT_UCODE_TIMEOUT) {
526 nvgpu_err(g,
527 "timeout waiting on mailbox=%d value=0x%08x",
528 mailbox_id, reg);
529 gk20a_fecs_dump_falcon_stats(g);
530 gk20a_gr_debug_dump(g);
531 return -1;
532 } else if (check == WAIT_UCODE_ERROR) {
533 nvgpu_err(g,
534 "ucode method failed on mailbox=%d value=0x%08x",
535 mailbox_id, reg);
536 gk20a_fecs_dump_falcon_stats(g);
537 return -1;
538 }
539
540 nvgpu_log_fn(g, "done");
541 return 0;
542}
543
544int gr_gk20a_submit_fecs_method_op_locked(struct gk20a *g,
545 struct fecs_method_op_gk20a op,
546 bool sleepduringwait)
547{
548 int ret;
549
550 if (op.mailbox.id != 0) {
551 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
552 op.mailbox.data);
553 }
554
555 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
556 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
557
558 gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
559 gk20a_writel(g, gr_fecs_method_push_r(),
560 gr_fecs_method_push_adr_f(op.method.addr));
561
562	/* op.mailbox.id == 4 cases require waiting for completion
563	 * on op.mailbox.id == 0, so remap the id before the wait */
564 if (op.mailbox.id == 4) {
565 op.mailbox.id = 0;
566 }
567
568 ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
569 op.cond.ok, op.mailbox.ok,
570 op.cond.fail, op.mailbox.fail,
571 sleepduringwait);
572 if (ret) {
573		nvgpu_err(g, "fecs method: data=0x%08x push adr=0x%08x",
574 op.method.data, op.method.addr);
575 }
576
577 return ret;
578}
579
580/* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...).
581 * We should replace most, if not all, fecs method calls with this instead. */
582int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
583 struct fecs_method_op_gk20a op,
584 bool sleepduringwait)
585{
586 struct gr_gk20a *gr = &g->gr;
587 int ret;
588
589 nvgpu_mutex_acquire(&gr->fecs_mutex);
590
591 ret = gr_gk20a_submit_fecs_method_op_locked(g, op, sleepduringwait);
592
593 nvgpu_mutex_release(&gr->fecs_mutex);
594
595 return ret;
596}
597
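A minimal usage sketch of the helper above, assuming a caller that already holds a valid struct gk20a pointer: example_submit_fecs_method() is hypothetical, the method address and data are placeholders, and the mailbox/condition values mirror the pass/fail convention used by gr_gk20a_ctrl_ctxsw() later in this file.

static int example_submit_fecs_method(struct gk20a *g, u32 method_addr,
				      u32 method_data, u32 *mailbox_ret)
{
	/* wait (with sleeps) until mailbox 1 reports PASS or FAIL */
	return gr_gk20a_submit_fecs_method_op(g,
		(struct fecs_method_op_gk20a) {
			.method.addr = method_addr,
			.method.data = method_data,
			.mailbox = { .id = 1,
				.data = ~0, .clr = ~0, .ret = mailbox_ret,
				.ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
				.fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
			.cond.ok = GR_IS_UCODE_OP_EQUAL,
			.cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
}
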
598/* Sideband mailbox writes are done a bit differently */
599int gr_gk20a_submit_fecs_sideband_method_op(struct gk20a *g,
600 struct fecs_method_op_gk20a op)
601{
602 struct gr_gk20a *gr = &g->gr;
603 int ret;
604
605 nvgpu_mutex_acquire(&gr->fecs_mutex);
606
607 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(op.mailbox.id),
608 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
609
610 gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
611 gk20a_writel(g, gr_fecs_method_push_r(),
612 gr_fecs_method_push_adr_f(op.method.addr));
613
614 ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
615 op.cond.ok, op.mailbox.ok,
616 op.cond.fail, op.mailbox.fail,
617 false);
618 if (ret) {
619		nvgpu_err(g, "fecs method: data=0x%08x push adr=0x%08x",
620 op.method.data, op.method.addr);
621 }
622
623 nvgpu_mutex_release(&gr->fecs_mutex);
624
625 return ret;
626}
627
628static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
629{
630 return gr_gk20a_submit_fecs_method_op(g,
631 (struct fecs_method_op_gk20a) {
632 .method.addr = fecs_method,
633 .method.data = ~0,
634 .mailbox = { .id = 1, /*sideband?*/
635 .data = ~0, .clr = ~0, .ret = ret,
636 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
637 .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
638 .cond.ok = GR_IS_UCODE_OP_EQUAL,
639 .cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
640}
641
642/**
643 * Stop processing (stall) context switches at FECS.
644 * If FECS is sent the stop_ctxsw method, ELPG entry/exit cannot happen
645 * and may time out. This can manifest as different error signatures
646 * depending on when the stop_ctxsw method is sent with respect to the
647 * PMU ELPG sequence: it may appear as a PMU halt, an abort, or
648 * possibly an external error.
649 */
650int gr_gk20a_disable_ctxsw(struct gk20a *g)
651{
652 int err = 0;
653
654 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
655
656 nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
657 g->ctxsw_disable_count++;
658 if (g->ctxsw_disable_count == 1) {
659 err = nvgpu_pg_elpg_disable(g);
660 if (err != 0) {
661 nvgpu_err(g, "failed to disable elpg. not safe to "
662 "stop_ctxsw");
663 /* stop ctxsw command is not sent */
664 g->ctxsw_disable_count--;
665 } else {
666 err = gr_gk20a_ctrl_ctxsw(g,
667 gr_fecs_method_push_adr_stop_ctxsw_v(), NULL);
668 if (err != 0) {
669 nvgpu_err(g, "failed to stop fecs ctxsw");
670 /* stop ctxsw failed */
671 g->ctxsw_disable_count--;
672 }
673 }
674 } else {
675 nvgpu_log_info(g, "ctxsw disabled, ctxsw_disable_count: %d",
676 g->ctxsw_disable_count);
677 }
678 nvgpu_mutex_release(&g->ctxsw_disable_lock);
679
680 return err;
681}
682
683/* Start processing (continue) context switches at FECS */
684int gr_gk20a_enable_ctxsw(struct gk20a *g)
685{
686 int err = 0;
687
688 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
689
690 nvgpu_mutex_acquire(&g->ctxsw_disable_lock);
691
692 if (g->ctxsw_disable_count == 0) {
693 goto ctxsw_already_enabled;
694 }
695 g->ctxsw_disable_count--;
696 WARN_ON(g->ctxsw_disable_count < 0);
697 if (g->ctxsw_disable_count == 0) {
698 err = gr_gk20a_ctrl_ctxsw(g,
699 gr_fecs_method_push_adr_start_ctxsw_v(), NULL);
700 if (err != 0) {
701 nvgpu_err(g, "failed to start fecs ctxsw");
702 } else {
703 if (nvgpu_pg_elpg_enable(g) != 0) {
704 nvgpu_err(g, "failed to enable elpg "
705 "after start_ctxsw");
706 }
707 }
708 } else {
709 nvgpu_log_info(g, "ctxsw_disable_count: %d is not 0 yet",
710 g->ctxsw_disable_count);
711 }
712ctxsw_already_enabled:
713 nvgpu_mutex_release(&g->ctxsw_disable_lock);
714
715 return err;
716}
717
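A minimal sketch of how the disable/enable pair above is meant to bracket work that must not race with FECS context switching; example_update_with_ctxsw_stalled() is hypothetical. Because the calls nest via ctxsw_disable_count, independent callers compose safely.

static int example_update_with_ctxsw_stalled(struct gk20a *g)
{
	int err = gr_gk20a_disable_ctxsw(g);

	if (err != 0) {
		return err;
	}

	/* ... modify state that must not race with FECS ctxsw ... */

	return gr_gk20a_enable_ctxsw(g);
}
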
718int gr_gk20a_halt_pipe(struct gk20a *g)
719{
720 return gr_gk20a_submit_fecs_method_op(g,
721 (struct fecs_method_op_gk20a) {
722 .method.addr =
723 gr_fecs_method_push_adr_halt_pipeline_v(),
724 .method.data = ~0,
725 .mailbox = { .id = 1, /*sideband?*/
726 .data = ~0, .clr = ~0, .ret = NULL,
727 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
728 .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
729 .cond.ok = GR_IS_UCODE_OP_EQUAL,
730 .cond.fail = GR_IS_UCODE_OP_EQUAL }, false);
731}
732
733
734int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
735{
736 u32 addr_lo;
737 u32 addr_hi;
738
739 nvgpu_log_fn(c->g, " ");
740
741 addr_lo = u64_lo32(gpu_va) >> 12;
742 addr_hi = u64_hi32(gpu_va);
743
744 nvgpu_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_target_w(),
745 ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
746 ram_in_gr_wfi_ptr_lo_f(addr_lo));
747
748 nvgpu_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_ptr_hi_w(),
749 ram_in_gr_wfi_ptr_hi_f(addr_hi));
750
751 return 0;
752}
753
754/*
755 * Context state can be written directly, or "patched" at times. So that the
756 * code can be used in either situation, it is written using a series of
757 * _ctx_patch_write(..., patch) statements. However, any necessary map overhead
758 * should be minimized; thus, bundle the sequence of these writes together, and
759 * open and close them with _ctx_patch_write_begin/_ctx_patch_write_end.
760 */
761
762int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
763 struct nvgpu_gr_ctx *gr_ctx,
764 bool update_patch_count)
765{
766 if (update_patch_count) {
767 /* reset patch count if ucode has already processed it */
768 gr_ctx->patch_ctx.data_count = nvgpu_mem_rd(g,
769 &gr_ctx->mem,
770 ctxsw_prog_main_image_patch_count_o());
771 nvgpu_log(g, gpu_dbg_info, "patch count reset to %d",
772 gr_ctx->patch_ctx.data_count);
773 }
774 return 0;
775}
776
777void gr_gk20a_ctx_patch_write_end(struct gk20a *g,
778 struct nvgpu_gr_ctx *gr_ctx,
779 bool update_patch_count)
780{
781 /* Write context count to context image if it is mapped */
782 if (update_patch_count) {
783 nvgpu_mem_wr(g, &gr_ctx->mem,
784 ctxsw_prog_main_image_patch_count_o(),
785 gr_ctx->patch_ctx.data_count);
786 nvgpu_log(g, gpu_dbg_info, "write patch count %d",
787 gr_ctx->patch_ctx.data_count);
788 }
789}
790
791void gr_gk20a_ctx_patch_write(struct gk20a *g,
792 struct nvgpu_gr_ctx *gr_ctx,
793 u32 addr, u32 data, bool patch)
794{
795 if (patch) {
796 u32 patch_slot = gr_ctx->patch_ctx.data_count *
797 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY;
798 if (patch_slot > (PATCH_CTX_ENTRIES_FROM_SIZE(
799 gr_ctx->patch_ctx.mem.size) -
800 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY)) {
801 nvgpu_err(g, "failed to access patch_slot %d",
802 patch_slot);
803 return;
804 }
805 nvgpu_mem_wr32(g, &gr_ctx->patch_ctx.mem, patch_slot, addr);
806 nvgpu_mem_wr32(g, &gr_ctx->patch_ctx.mem, patch_slot + 1, data);
807 gr_ctx->patch_ctx.data_count++;
808 nvgpu_log(g, gpu_dbg_info,
809 "patch addr = 0x%x data = 0x%x data_count %d",
810 addr, data, gr_ctx->patch_ctx.data_count);
811 } else {
812 gk20a_writel(g, addr, data);
813 }
814}
815
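A minimal sketch of the begin/write/end pattern described above; example_patch_two_regs() is hypothetical and the data values are placeholders. With patch == true each write is appended to the patch buffer for the ucode to apply, while patch == false falls through to a direct gk20a_writel().

static int example_patch_two_regs(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx)
{
	int err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, false);

	if (err != 0) {
		return err;
	}

	/* each call consumes one (addr, data) slot in patch_ctx.mem */
	gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gpm_pd_cfg_r(), 0, true);
	gr_gk20a_ctx_patch_write(g, gr_ctx, gr_ds_debug_r(), 0, true);

	gr_gk20a_ctx_patch_write_end(g, gr_ctx, false);
	return 0;
}
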
816static u32 fecs_current_ctx_data(struct gk20a *g, struct nvgpu_mem *inst_block)
817{
818 u64 ptr = nvgpu_inst_block_addr(g, inst_block) >>
819 ram_in_base_shift_v();
820 u32 aperture = nvgpu_aperture_mask(g, inst_block,
821 gr_fecs_current_ctx_target_sys_mem_ncoh_f(),
822 gr_fecs_current_ctx_target_sys_mem_coh_f(),
823 gr_fecs_current_ctx_target_vid_mem_f());
824
825 return gr_fecs_current_ctx_ptr_f(u64_lo32(ptr)) | aperture |
826 gr_fecs_current_ctx_valid_f(1);
827}
828
829int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
830 struct channel_gk20a *c)
831{
832 u32 inst_base_ptr = u64_lo32(nvgpu_inst_block_addr(g, &c->inst_block)
833 >> ram_in_base_shift_v());
834 u32 data = fecs_current_ctx_data(g, &c->inst_block);
835 u32 ret;
836
837 nvgpu_log_info(g, "bind channel %d inst ptr 0x%08x",
838 c->chid, inst_base_ptr);
839
840 ret = gr_gk20a_submit_fecs_method_op(g,
841 (struct fecs_method_op_gk20a) {
842 .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
843 .method.data = data,
844 .mailbox = { .id = 0, .data = 0,
845 .clr = 0x30,
846 .ret = NULL,
847 .ok = 0x10,
848 .fail = 0x20, },
849 .cond.ok = GR_IS_UCODE_OP_AND,
850 .cond.fail = GR_IS_UCODE_OP_AND}, true);
851 if (ret) {
852 nvgpu_err(g,
853 "bind channel instance failed");
854 }
855
856 return ret;
857}
858
859void gr_gk20a_write_zcull_ptr(struct gk20a *g,
860 struct nvgpu_mem *mem, u64 gpu_va)
861{
862 u32 va = u64_lo32(gpu_va >> 8);
863
864 nvgpu_mem_wr(g, mem,
865 ctxsw_prog_main_image_zcull_ptr_o(), va);
866}
867
868void gr_gk20a_write_pm_ptr(struct gk20a *g,
869 struct nvgpu_mem *mem, u64 gpu_va)
870{
871 u32 va = u64_lo32(gpu_va >> 8);
872
873 nvgpu_mem_wr(g, mem,
874 ctxsw_prog_main_image_pm_ptr_o(), va);
875}
876
877static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c)
878{
879 struct tsg_gk20a *tsg;
880 struct nvgpu_gr_ctx *gr_ctx = NULL;
881 struct nvgpu_mem *mem = NULL;
882 struct nvgpu_mem *ctxheader = &c->ctx_header;
883 int ret = 0;
884
885 nvgpu_log_fn(g, " ");
886
887 tsg = tsg_gk20a_from_ch(c);
888 if (tsg == NULL) {
889 return -EINVAL;
890 }
891
892 gr_ctx = &tsg->gr_ctx;
893 mem = &gr_ctx->mem;
894
895 if (gr_ctx->zcull_ctx.gpu_va == 0 &&
896 gr_ctx->zcull_ctx.ctx_sw_mode ==
897 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
898 return -EINVAL;
899 }
900
901 ret = gk20a_disable_channel_tsg(g, c);
902 if (ret) {
903 nvgpu_err(g, "failed to disable channel/TSG");
904 return ret;
905 }
906 ret = gk20a_fifo_preempt(g, c);
907 if (ret) {
908 gk20a_enable_channel_tsg(g, c);
909 nvgpu_err(g, "failed to preempt channel/TSG");
910 return ret;
911 }
912
913 nvgpu_mem_wr(g, mem,
914 ctxsw_prog_main_image_zcull_o(),
915 gr_ctx->zcull_ctx.ctx_sw_mode);
916
917 if (ctxheader->gpu_va) {
918 g->ops.gr.write_zcull_ptr(g, ctxheader,
919 gr_ctx->zcull_ctx.gpu_va);
920 } else {
921 g->ops.gr.write_zcull_ptr(g, mem, gr_ctx->zcull_ctx.gpu_va);
922 }
923
924 gk20a_enable_channel_tsg(g, c);
925
926 return ret;
927}
928
929u32 gk20a_gr_gpc_offset(struct gk20a *g, u32 gpc)
930{
931 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
932 u32 gpc_offset = gpc_stride * gpc;
933
934 return gpc_offset;
935}
936
937u32 gk20a_gr_tpc_offset(struct gk20a *g, u32 tpc)
938{
939 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
940 GPU_LIT_TPC_IN_GPC_STRIDE);
941 u32 tpc_offset = tpc_in_gpc_stride * tpc;
942
943 return tpc_offset;
944}
945
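The two stride helpers above are typically summed with a register's GPC0/TPC0 base offset to address one specific unit instance. A sketch, where 'reg' stands in for any hypothetical per-TPC register offset:

static u32 example_read_per_tpc_reg(struct gk20a *g, u32 gpc, u32 tpc, u32 reg)
{
	/* reg is the GPC0/TPC0 offset of a per-TPC register (placeholder) */
	return gk20a_readl(g, reg + gk20a_gr_gpc_offset(g, gpc) +
			   gk20a_gr_tpc_offset(g, tpc));
}
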
946int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
947 struct channel_gk20a *c, bool patch)
948{
949 struct gr_gk20a *gr = &g->gr;
950 struct tsg_gk20a *tsg;
951 struct nvgpu_gr_ctx *gr_ctx = NULL;
952 u64 addr;
953 u32 size;
954
955 nvgpu_log_fn(g, " ");
956
957 tsg = tsg_gk20a_from_ch(c);
958 if (tsg == NULL) {
959 return -EINVAL;
960 }
961
962 gr_ctx = &tsg->gr_ctx;
963 if (patch) {
964 int err;
965 err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, false);
966 if (err != 0) {
967 return err;
968 }
969 }
970
971 /* global pagepool buffer */
972 addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
973 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
974 (u64_hi32(gr_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
975 (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
976
977 size = gr->global_ctx_buffer[PAGEPOOL].mem.size /
978 gr_scc_pagepool_total_pages_byte_granularity_v();
979
980 if (size == g->ops.gr.pagepool_default_size(g)) {
981 size = gr_scc_pagepool_total_pages_hwmax_v();
982 }
983
984 nvgpu_log_info(g, "pagepool buffer addr : 0x%016llx, size : %d",
985 addr, size);
986
987 g->ops.gr.commit_global_pagepool(g, gr_ctx, addr, size, patch);
988
989 /* global bundle cb */
990 addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
991 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
992 (u64_hi32(gr_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
993 (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
994
995 size = gr->bundle_cb_default_size;
996
997 nvgpu_log_info(g, "bundle cb addr : 0x%016llx, size : %d",
998 addr, size);
999
1000 g->ops.gr.commit_global_bundle_cb(g, gr_ctx, addr, size, patch);
1001
1002 /* global attrib cb */
1003 addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
1004 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
1005 (u64_hi32(gr_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
1006 (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
1007
1008 nvgpu_log_info(g, "attrib cb addr : 0x%016llx", addr);
1009 g->ops.gr.commit_global_attrib_cb(g, gr_ctx, addr, patch);
1010 g->ops.gr.commit_global_cb_manager(g, c, patch);
1011
1012 if (patch) {
1013 gr_gk20a_ctx_patch_write_end(g, gr_ctx, false);
1014 }
1015
1016 return 0;
1017}
1018
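The shift pairs in the function above pack bits 39:8 of the 40-bit buffer GPU VA into a 32-bit register field (assuming the *_39_8_align_bits_v() helpers return 8, as the field names suggest). For example, a pagepool VA of 0x1_2345_6700 gives (0x23456700 >> 8) | (0x1 << 24) = 0x01234567, i.e. the VA right-shifted by 8.
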
1019int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c)
1020{
1021 struct gr_gk20a *gr = &g->gr;
1022 struct nvgpu_gr_ctx *gr_ctx = NULL;
1023 u32 gpm_pd_cfg;
1024 u32 pd_ab_dist_cfg0;
1025 u32 ds_debug;
1026 u32 mpc_vtg_debug;
1027 u32 pe_vaf;
1028 u32 pe_vsc_vpc;
1029
1030 nvgpu_log_fn(g, " ");
1031
1032 gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
1033 pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
1034 ds_debug = gk20a_readl(g, gr_ds_debug_r());
1035 mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
1036
1037 if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
1038 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
1039 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
1040
1041 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
1042 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
1043 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
1044 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
1045 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
1046 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
1047
1048 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false);
1049 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, false);
1050 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, false);
1051 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false);
1052 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_ds_debug_r(), ds_debug, false);
1053 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false);
1054 } else {
1055 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
1056 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
1057 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1058 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1059
1060 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false);
1061 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false);
1062 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_ds_debug_r(), ds_debug, false);
1063 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false);
1064 }
1065
1066 return 0;
1067}
1068
1069/*
1070 * Return map tiles count for given index
1071 * Return 0 if index is out-of-bounds
1072 */
1073static u32 gr_gk20a_get_map_tile_count(struct gr_gk20a *gr, u32 index)
1074{
1075 if (index >= gr->map_tile_count) {
1076 return 0;
1077 }
1078
1079 return gr->map_tiles[index];
1080}
1081
1082int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr)
1083{
1084 u32 norm_entries, norm_shift;
1085 u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1086 u32 map0, map1, map2, map3, map4, map5;
1087
1088 if (gr->map_tiles == NULL) {
1089 return -1;
1090 }
1091
1092 nvgpu_log_fn(g, " ");
1093
1094 gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1095 gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1096 gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1097
1098 map0 = gr_crstr_gpc_map0_tile0_f(gr_gk20a_get_map_tile_count(gr, 0)) |
1099 gr_crstr_gpc_map0_tile1_f(gr_gk20a_get_map_tile_count(gr, 1)) |
1100 gr_crstr_gpc_map0_tile2_f(gr_gk20a_get_map_tile_count(gr, 2)) |
1101 gr_crstr_gpc_map0_tile3_f(gr_gk20a_get_map_tile_count(gr, 3)) |
1102 gr_crstr_gpc_map0_tile4_f(gr_gk20a_get_map_tile_count(gr, 4)) |
1103 gr_crstr_gpc_map0_tile5_f(gr_gk20a_get_map_tile_count(gr, 5));
1104
1105 map1 = gr_crstr_gpc_map1_tile6_f(gr_gk20a_get_map_tile_count(gr, 6)) |
1106 gr_crstr_gpc_map1_tile7_f(gr_gk20a_get_map_tile_count(gr, 7)) |
1107 gr_crstr_gpc_map1_tile8_f(gr_gk20a_get_map_tile_count(gr, 8)) |
1108 gr_crstr_gpc_map1_tile9_f(gr_gk20a_get_map_tile_count(gr, 9)) |
1109 gr_crstr_gpc_map1_tile10_f(gr_gk20a_get_map_tile_count(gr, 10)) |
1110 gr_crstr_gpc_map1_tile11_f(gr_gk20a_get_map_tile_count(gr, 11));
1111
1112 map2 = gr_crstr_gpc_map2_tile12_f(gr_gk20a_get_map_tile_count(gr, 12)) |
1113 gr_crstr_gpc_map2_tile13_f(gr_gk20a_get_map_tile_count(gr, 13)) |
1114 gr_crstr_gpc_map2_tile14_f(gr_gk20a_get_map_tile_count(gr, 14)) |
1115 gr_crstr_gpc_map2_tile15_f(gr_gk20a_get_map_tile_count(gr, 15)) |
1116 gr_crstr_gpc_map2_tile16_f(gr_gk20a_get_map_tile_count(gr, 16)) |
1117 gr_crstr_gpc_map2_tile17_f(gr_gk20a_get_map_tile_count(gr, 17));
1118
1119 map3 = gr_crstr_gpc_map3_tile18_f(gr_gk20a_get_map_tile_count(gr, 18)) |
1120 gr_crstr_gpc_map3_tile19_f(gr_gk20a_get_map_tile_count(gr, 19)) |
1121 gr_crstr_gpc_map3_tile20_f(gr_gk20a_get_map_tile_count(gr, 20)) |
1122 gr_crstr_gpc_map3_tile21_f(gr_gk20a_get_map_tile_count(gr, 21)) |
1123 gr_crstr_gpc_map3_tile22_f(gr_gk20a_get_map_tile_count(gr, 22)) |
1124 gr_crstr_gpc_map3_tile23_f(gr_gk20a_get_map_tile_count(gr, 23));
1125
1126 map4 = gr_crstr_gpc_map4_tile24_f(gr_gk20a_get_map_tile_count(gr, 24)) |
1127 gr_crstr_gpc_map4_tile25_f(gr_gk20a_get_map_tile_count(gr, 25)) |
1128 gr_crstr_gpc_map4_tile26_f(gr_gk20a_get_map_tile_count(gr, 26)) |
1129 gr_crstr_gpc_map4_tile27_f(gr_gk20a_get_map_tile_count(gr, 27)) |
1130 gr_crstr_gpc_map4_tile28_f(gr_gk20a_get_map_tile_count(gr, 28)) |
1131 gr_crstr_gpc_map4_tile29_f(gr_gk20a_get_map_tile_count(gr, 29));
1132
1133 map5 = gr_crstr_gpc_map5_tile30_f(gr_gk20a_get_map_tile_count(gr, 30)) |
1134 gr_crstr_gpc_map5_tile31_f(gr_gk20a_get_map_tile_count(gr, 31)) |
1135 gr_crstr_gpc_map5_tile32_f(0) |
1136 gr_crstr_gpc_map5_tile33_f(0) |
1137 gr_crstr_gpc_map5_tile34_f(0) |
1138 gr_crstr_gpc_map5_tile35_f(0);
1139
1140 gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1141 gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1142 gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1143 gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1144 gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1145 gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1146
1147 switch (gr->tpc_count) {
1148 case 1:
1149 norm_shift = 4;
1150 break;
1151 case 2:
1152 case 3:
1153 norm_shift = 3;
1154 break;
1155 case 4:
1156 case 5:
1157 case 6:
1158 case 7:
1159 norm_shift = 2;
1160 break;
1161 case 8:
1162 case 9:
1163 case 10:
1164 case 11:
1165 case 12:
1166 case 13:
1167 case 14:
1168 case 15:
1169 norm_shift = 1;
1170 break;
1171 default:
1172 norm_shift = 0;
1173 break;
1174 }
1175
1176 norm_entries = gr->tpc_count << norm_shift;
1177 coeff5_mod = (1 << 5) % norm_entries;
1178 coeff6_mod = (1 << 6) % norm_entries;
1179 coeff7_mod = (1 << 7) % norm_entries;
1180 coeff8_mod = (1 << 8) % norm_entries;
1181 coeff9_mod = (1 << 9) % norm_entries;
1182 coeff10_mod = (1 << 10) % norm_entries;
1183 coeff11_mod = (1 << 11) % norm_entries;
1184
1185 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1186 gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1187 gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1188 gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1189 gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1190 gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1191
1192 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1193 gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1194 gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1195 gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1196 gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1197 gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1198 gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1199
1200 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1201 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1202 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1203 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1204 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1205 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1206
1207 gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1208 gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1209 gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1210
1211 gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1212 gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1213 gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1214 gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1215 gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1216 gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1217
1218 return 0;
1219}
1220
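As a worked example of the normalization above: with gr->tpc_count == 3, norm_shift is 3, so norm_entries = 3 << 3 = 24, and the coefficients coeff5_mod through coeff11_mod become 32 % 24 = 8, 64 % 24 = 16, 128 % 24 = 8, 256 % 24 = 16, 512 % 24 = 8, 1024 % 24 = 16 and 2048 % 24 = 8.
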
1221static inline u32 count_bits(u32 mask)
1222{
1223 u32 temp = mask;
1224 u32 count;
1225 for (count = 0; temp != 0; count++) {
1226 temp &= temp - 1;
1227 }
1228
1229 return count;
1230}
1231
1232int gr_gk20a_init_sm_id_table(struct gk20a *g)
1233{
1234 u32 gpc, tpc;
1235 u32 sm_id = 0;
1236
1237 for (tpc = 0; tpc < g->gr.max_tpc_per_gpc_count; tpc++) {
1238 for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
1239
1240 if (tpc < g->gr.gpc_tpc_count[gpc]) {
1241 g->gr.sm_to_cluster[sm_id].tpc_index = tpc;
1242 g->gr.sm_to_cluster[sm_id].gpc_index = gpc;
1243 g->gr.sm_to_cluster[sm_id].sm_index = 0;
1244 g->gr.sm_to_cluster[sm_id].global_tpc_index =
1245 sm_id;
1246 sm_id++;
1247 }
1248 }
1249 }
1250 g->gr.no_of_sm = sm_id;
1251 return 0;
1252}
1253
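The loop above assigns SM IDs TPC-major across GPCs. For a hypothetical configuration with gpc_count == 2 and gpc_tpc_count == {2, 1}, sm_id 0 maps to (gpc 0, tpc 0), sm_id 1 to (gpc 1, tpc 0) and sm_id 2 to (gpc 0, tpc 1), giving no_of_sm == 3.
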
1254/*
1255 * Return number of TPCs in a GPC
1256 * Return 0 if GPC index is invalid i.e. GPC is disabled
1257 */
1258u32 gr_gk20a_get_tpc_count(struct gr_gk20a *gr, u32 gpc_index)
1259{
1260 if (gpc_index >= gr->gpc_count) {
1261 return 0;
1262 }
1263
1264 return gr->gpc_tpc_count[gpc_index];
1265}
1266
1267int gr_gk20a_init_fs_state(struct gk20a *g)
1268{
1269 struct gr_gk20a *gr = &g->gr;
1270 u32 tpc_index, gpc_index;
1271 u32 sm_id = 0, gpc_id = 0;
1272 u32 tpc_per_gpc;
1273 u32 fuse_tpc_mask;
1274 u32 reg_index;
1275 int err;
1276
1277 nvgpu_log_fn(g, " ");
1278
1279 if (g->ops.gr.init_sm_id_table) {
1280 err = g->ops.gr.init_sm_id_table(g);
1281 if (err != 0) {
1282 return err;
1283 }
1284
1285 /* Is table empty ? */
1286 if (g->gr.no_of_sm == 0) {
1287 return -EINVAL;
1288 }
1289 }
1290
1291 for (sm_id = 0; sm_id < g->gr.no_of_sm; sm_id++) {
1292 tpc_index = g->gr.sm_to_cluster[sm_id].tpc_index;
1293 gpc_index = g->gr.sm_to_cluster[sm_id].gpc_index;
1294
1295 g->ops.gr.program_sm_id_numbering(g, gpc_index, tpc_index, sm_id);
1296
1297 if (g->ops.gr.program_active_tpc_counts) {
1298 g->ops.gr.program_active_tpc_counts(g, gpc_index);
1299 }
1300 }
1301
1302 for (reg_index = 0, gpc_id = 0;
1303 reg_index < gr_pd_num_tpc_per_gpc__size_1_v();
1304 reg_index++, gpc_id += 8) {
1305
1306 tpc_per_gpc =
1307 gr_pd_num_tpc_per_gpc_count0_f(gr_gk20a_get_tpc_count(gr, gpc_id + 0)) |
1308 gr_pd_num_tpc_per_gpc_count1_f(gr_gk20a_get_tpc_count(gr, gpc_id + 1)) |
1309 gr_pd_num_tpc_per_gpc_count2_f(gr_gk20a_get_tpc_count(gr, gpc_id + 2)) |
1310 gr_pd_num_tpc_per_gpc_count3_f(gr_gk20a_get_tpc_count(gr, gpc_id + 3)) |
1311 gr_pd_num_tpc_per_gpc_count4_f(gr_gk20a_get_tpc_count(gr, gpc_id + 4)) |
1312 gr_pd_num_tpc_per_gpc_count5_f(gr_gk20a_get_tpc_count(gr, gpc_id + 5)) |
1313 gr_pd_num_tpc_per_gpc_count6_f(gr_gk20a_get_tpc_count(gr, gpc_id + 6)) |
1314 gr_pd_num_tpc_per_gpc_count7_f(gr_gk20a_get_tpc_count(gr, gpc_id + 7));
1315
1316 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(reg_index), tpc_per_gpc);
1317 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(reg_index), tpc_per_gpc);
1318 }
1319
1320 /* gr__setup_pd_mapping stubbed for gk20a */
1321 g->ops.gr.setup_rop_mapping(g, gr);
1322 if (g->ops.gr.setup_alpha_beta_tables) {
1323 g->ops.gr.setup_alpha_beta_tables(g, gr);
1324 }
1325
1326 for (gpc_index = 0;
1327 gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1328 gpc_index += 4) {
1329
1330 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
1331 (gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) != 0U) ||
1332 (gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) != 0U) ||
1333 (gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) != 0U) ||
1334 (gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]) != 0U));
1335 }
1336
1337 fuse_tpc_mask = g->ops.gr.get_gpc_tpc_mask(g, 0);
1338 if ((g->tpc_fs_mask_user != 0U) &&
1339 (fuse_tpc_mask == BIT32(gr->max_tpc_count) - 1U)) {
1340 u32 val = g->tpc_fs_mask_user;
1341 val &= (0x1U << gr->max_tpc_count) - 1U;
1342 gk20a_writel(g, gr_cwd_fs_r(),
1343 gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1344 gr_cwd_fs_num_tpcs_f(hweight32(val)));
1345 } else {
1346 gk20a_writel(g, gr_cwd_fs_r(),
1347 gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1348 gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1349 }
1350
1351 gk20a_writel(g, gr_bes_zrop_settings_r(),
1352 gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1353 gk20a_writel(g, gr_bes_crop_settings_r(),
1354 gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1355
1356 return 0;
1357}
1358
1359int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1360{
1361 struct gk20a *g = c->g;
1362 int ret;
1363
1364 nvgpu_log_fn(g, " ");
1365
1366 ret = gr_gk20a_submit_fecs_method_op(g,
1367 (struct fecs_method_op_gk20a) {
1368 .method.addr = save_type,
1369 .method.data = fecs_current_ctx_data(g, &c->inst_block),
1370 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1371 .ok = 1, .fail = 2,
1372 },
1373 .cond.ok = GR_IS_UCODE_OP_AND,
1374 .cond.fail = GR_IS_UCODE_OP_AND,
1375 }, true);
1376
1377 if (ret) {
1378 nvgpu_err(g, "save context image failed");
1379 }
1380
1381 return ret;
1382}
1383
1384u32 gk20a_init_sw_bundle(struct gk20a *g)
1385{
1386 struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
1387 u32 last_bundle_data = 0;
1388 u32 err = 0;
1389 unsigned int i;
1390
1391 /* disable fe_go_idle */
1392 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1393 gr_fe_go_idle_timeout_count_disabled_f());
1394 /* enable pipe mode override */
1395 gk20a_writel(g, gr_pipe_bundle_config_r(),
1396 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
1397
1398 /* load bundle init */
1399 for (i = 0; i < sw_bundle_init->count; i++) {
1400 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
1401 gk20a_writel(g, gr_pipe_bundle_data_r(),
1402 sw_bundle_init->l[i].value);
1403 last_bundle_data = sw_bundle_init->l[i].value;
1404 }
1405
1406 gk20a_writel(g, gr_pipe_bundle_address_r(),
1407 sw_bundle_init->l[i].addr);
1408
1409 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
1410 GR_GO_IDLE_BUNDLE) {
1411 err = gr_gk20a_wait_idle(g,
1412 gk20a_get_gr_idle_timeout(g),
1413 GR_IDLE_CHECK_DEFAULT);
1414 if (err != 0U) {
1415 goto error;
1416 }
1417 }
1418
1419 err = gr_gk20a_wait_fe_idle(g, gk20a_get_gr_idle_timeout(g),
1420 GR_IDLE_CHECK_DEFAULT);
1421 if (err != 0U) {
1422 goto error;
1423 }
1424 }
1425
1426 if ((err == 0U) && (g->ops.gr.init_sw_veid_bundle != NULL)) {
1427 err = g->ops.gr.init_sw_veid_bundle(g);
1428 if (err != 0U) {
1429 goto error;
1430 }
1431 }
1432
1433 if (g->ops.gr.init_sw_bundle64) {
1434 err = g->ops.gr.init_sw_bundle64(g);
1435 if (err != 0U) {
1436 goto error;
1437 }
1438 }
1439
1440 /* disable pipe mode override */
1441 gk20a_writel(g, gr_pipe_bundle_config_r(),
1442 gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1443
1444 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1445 GR_IDLE_CHECK_DEFAULT);
1446
1447 /* restore fe_go_idle */
1448 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1449 gr_fe_go_idle_timeout_count_prod_f());
1450
1451 return err;
1452
1453error:
1454 /* in case of error skip waiting for GR idle - just restore state */
1455 gk20a_writel(g, gr_pipe_bundle_config_r(),
1456 gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1457
1458 /* restore fe_go_idle */
1459 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1460 gr_fe_go_idle_timeout_count_prod_f());
1461
1462 return err;
1463}
1464
1465/* init global golden image from a fresh gr_ctx in channel ctx.
1466 save a copy in local_golden_image in ctx_vars */
1467static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1468 struct channel_gk20a *c)
1469{
1470 struct gr_gk20a *gr = &g->gr;
1471 struct tsg_gk20a *tsg;
1472 struct nvgpu_gr_ctx *gr_ctx = NULL;
1473 u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1474 u32 ctx_header_words;
1475 u32 i;
1476 u32 data;
1477 struct nvgpu_mem *gold_mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem;
1478 struct nvgpu_mem *gr_mem;
1479 u32 err = 0;
1480 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
1481 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
1482 u32 last_method_data = 0;
1483
1484 nvgpu_log_fn(g, " ");
1485
1486 tsg = tsg_gk20a_from_ch(c);
1487 if (tsg == NULL) {
1488 return -EINVAL;
1489 }
1490
1491 gr_ctx = &tsg->gr_ctx;
1492 gr_mem = &gr_ctx->mem;
1493
1494	/* golden ctx is global to all channels. Although only the first
1495	   channel initializes the golden image, the driver needs to prevent
1496	   multiple channels from initializing the golden ctx at the same time */
1497 nvgpu_mutex_acquire(&gr->ctx_mutex);
1498
1499 if (gr->ctx_vars.golden_image_initialized) {
1500 goto clean_up;
1501 }
1502 if (!nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
1503 struct nvgpu_timeout timeout;
1504
1505 nvgpu_timeout_init(g, &timeout,
1506 FE_PWR_MODE_TIMEOUT_MAX /
1507 FE_PWR_MODE_TIMEOUT_DEFAULT,
1508 NVGPU_TIMER_RETRY_TIMER);
1509 gk20a_writel(g, gr_fe_pwr_mode_r(),
1510 gr_fe_pwr_mode_req_send_f() | gr_fe_pwr_mode_mode_force_on_f());
1511 do {
1512 u32 req = gr_fe_pwr_mode_req_v(gk20a_readl(g, gr_fe_pwr_mode_r()));
1513 if (req == gr_fe_pwr_mode_req_done_v()) {
1514 break;
1515 }
1516 nvgpu_udelay(FE_PWR_MODE_TIMEOUT_DEFAULT);
1517 } while (nvgpu_timeout_expired_msg(&timeout,
1518 "timeout forcing FE on") == 0);
1519 }
1520
1521
1522 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
1523 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
1524 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
1525 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
1526 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
1527 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
1528 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
1529 gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
1530 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
1531 gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
1532 (void) gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
1533 nvgpu_udelay(10);
1534
1535 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
1536 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
1537 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
1538 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
1539 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
1540 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
1541 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
1542 gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
1543 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
1544 gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
1545 (void) gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
1546 nvgpu_udelay(10);
1547
1548 if (!nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
1549 struct nvgpu_timeout timeout;
1550
1551 nvgpu_timeout_init(g, &timeout,
1552 FE_PWR_MODE_TIMEOUT_MAX /
1553 FE_PWR_MODE_TIMEOUT_DEFAULT,
1554 NVGPU_TIMER_RETRY_TIMER);
1555 gk20a_writel(g, gr_fe_pwr_mode_r(),
1556 gr_fe_pwr_mode_req_send_f() | gr_fe_pwr_mode_mode_auto_f());
1557
1558 do {
1559 u32 req = gr_fe_pwr_mode_req_v(gk20a_readl(g, gr_fe_pwr_mode_r()));
1560 if (req == gr_fe_pwr_mode_req_done_v()) {
1561 break;
1562 }
1563 nvgpu_udelay(FE_PWR_MODE_TIMEOUT_DEFAULT);
1564 } while (nvgpu_timeout_expired_msg(&timeout,
1565 "timeout setting FE power to auto") == 0);
1566 }
1567
1568 /* clear scc ram */
1569 gk20a_writel(g, gr_scc_init_r(),
1570 gr_scc_init_ram_trigger_f());
1571
1572 err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1573 if (err != 0U) {
1574 goto clean_up;
1575 }
1576
1577 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1578 GR_IDLE_CHECK_DEFAULT);
1579
1580 /* load ctx init */
1581 for (i = 0; i < sw_ctx_load->count; i++) {
1582 gk20a_writel(g, sw_ctx_load->l[i].addr,
1583 sw_ctx_load->l[i].value);
1584 }
1585
1586 if (g->ops.gr.init_preemption_state) {
1587 g->ops.gr.init_preemption_state(g);
1588 }
1589
1590 if (g->ops.clock_gating.blcg_gr_load_gating_prod) {
1591 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
1592 }
1593
1594 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1595 GR_IDLE_CHECK_DEFAULT);
1596 if (err != 0U) {
1597 goto clean_up;
1598 }
1599
1600 /* disable fe_go_idle */
1601 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1602 gr_fe_go_idle_timeout_count_disabled_f());
1603
1604 err = g->ops.gr.commit_global_ctx_buffers(g, c, false);
1605 if (err != 0U) {
1606 goto clean_up;
1607 }
1608
1609 /* override a few ctx state registers */
1610 g->ops.gr.commit_global_timeslice(g, c);
1611
1612 /* floorsweep anything left */
1613 err = g->ops.gr.init_fs_state(g);
1614 if (err != 0U) {
1615 goto clean_up;
1616 }
1617
1618 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1619 GR_IDLE_CHECK_DEFAULT);
1620 if (err != 0U) {
1621 goto restore_fe_go_idle;
1622 }
1623
1624 err = gk20a_init_sw_bundle(g);
1625 if (err != 0U) {
1626 goto clean_up;
1627 }
1628
1629restore_fe_go_idle:
1630 /* restore fe_go_idle */
1631 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
1632 gr_fe_go_idle_timeout_count_prod_f());
1633
1634 if ((err != 0U) || (gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1635 GR_IDLE_CHECK_DEFAULT) != 0)) {
1636 goto clean_up;
1637 }
1638
1639 /* load method init */
1640 if (sw_method_init->count) {
1641 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
1642 sw_method_init->l[0].value);
1643 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
1644 gr_pri_mme_shadow_raw_index_write_trigger_f() |
1645 sw_method_init->l[0].addr);
1646 last_method_data = sw_method_init->l[0].value;
1647 }
1648 for (i = 1; i < sw_method_init->count; i++) {
1649 if (sw_method_init->l[i].value != last_method_data) {
1650 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
1651 sw_method_init->l[i].value);
1652 last_method_data = sw_method_init->l[i].value;
1653 }
1654 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
1655 gr_pri_mme_shadow_raw_index_write_trigger_f() |
1656 sw_method_init->l[i].addr);
1657 }
1658
1659 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
1660 GR_IDLE_CHECK_DEFAULT);
1661 if (err != 0U) {
1662 goto clean_up;
1663 }
1664
1665 ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1666 ctx_header_words >>= 2;
1667
1668 g->ops.mm.l2_flush(g, true);
1669
1670 for (i = 0; i < ctx_header_words; i++) {
1671 data = nvgpu_mem_rd32(g, gr_mem, i);
1672 nvgpu_mem_wr32(g, gold_mem, i, data);
1673 }
1674 nvgpu_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_o(),
1675 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1676
1677 g->ops.gr.write_zcull_ptr(g, gold_mem, 0);
1678
1679 err = g->ops.gr.commit_inst(c, gr_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1680 if (err != 0U) {
1681 goto clean_up;
1682 }
1683
1684 gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1685
1686
1687
1688 if (gr->ctx_vars.local_golden_image == NULL) {
1689
1690 gr->ctx_vars.local_golden_image =
1691 nvgpu_vzalloc(g, gr->ctx_vars.golden_image_size);
1692
1693 if (gr->ctx_vars.local_golden_image == NULL) {
1694 err = -ENOMEM;
1695 goto clean_up;
1696 }
1697 nvgpu_mem_rd_n(g, gold_mem, 0,
1698 gr->ctx_vars.local_golden_image,
1699 gr->ctx_vars.golden_image_size);
1700
1701 }
1702
1703 err = g->ops.gr.commit_inst(c, gr_mem->gpu_va);
1704 if (err != 0U) {
1705 goto clean_up;
1706 }
1707
1708 gr->ctx_vars.golden_image_initialized = true;
1709
1710 gk20a_writel(g, gr_fecs_current_ctx_r(),
1711 gr_fecs_current_ctx_valid_false_f());
1712
1713clean_up:
1714 if (err != 0U) {
1715 nvgpu_err(g, "fail");
1716 } else {
1717 nvgpu_log_fn(g, "done");
1718 }
1719
1720 nvgpu_mutex_release(&gr->ctx_mutex);
1721 return err;
1722}
1723
1724int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1725 struct channel_gk20a *c,
1726 bool enable_smpc_ctxsw)
1727{
1728 struct tsg_gk20a *tsg;
1729 struct nvgpu_gr_ctx *gr_ctx = NULL;
1730 struct nvgpu_mem *mem = NULL;
1731 u32 data;
1732 int ret;
1733
1734 nvgpu_log_fn(g, " ");
1735
1736 tsg = tsg_gk20a_from_ch(c);
1737 if (tsg == NULL) {
1738 return -EINVAL;
1739 }
1740
1741 gr_ctx = &tsg->gr_ctx;
1742 mem = &gr_ctx->mem;
1743 if (!nvgpu_mem_is_valid(mem)) {
1744 nvgpu_err(g, "no graphics context allocated");
1745 return -EFAULT;
1746 }
1747
1748 ret = gk20a_disable_channel_tsg(g, c);
1749 if (ret) {
1750 nvgpu_err(g, "failed to disable channel/TSG");
1751 goto out;
1752 }
1753 ret = gk20a_fifo_preempt(g, c);
1754 if (ret) {
1755 gk20a_enable_channel_tsg(g, c);
1756 nvgpu_err(g, "failed to preempt channel/TSG");
1757 goto out;
1758 }
1759
1760 /* Channel gr_ctx buffer is gpu cacheable.
1761 Flush and invalidate before cpu update. */
1762 g->ops.mm.l2_flush(g, true);
1763
1764 data = nvgpu_mem_rd(g, mem,
1765 ctxsw_prog_main_image_pm_o());
1766
1767 data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1768 data |= enable_smpc_ctxsw ?
1769 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1770 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1771
1772 nvgpu_mem_wr(g, mem,
1773 ctxsw_prog_main_image_pm_o(), data);
1774
1775out:
1776 gk20a_enable_channel_tsg(g, c);
1777 return ret;
1778}
1779
1780int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
1781 struct channel_gk20a *c,
1782 u64 gpu_va,
1783 u32 mode)
1784{
1785 struct tsg_gk20a *tsg;
1786 struct nvgpu_mem *gr_mem = NULL;
1787 struct nvgpu_gr_ctx *gr_ctx;
1788 struct pm_ctx_desc *pm_ctx;
1789 u32 data;
1790 u64 virt_addr = 0;
1791 struct nvgpu_mem *ctxheader = &c->ctx_header;
1792 int ret;
1793
1794 nvgpu_log_fn(g, " ");
1795
1796 tsg = tsg_gk20a_from_ch(c);
1797 if (tsg == NULL) {
1798 return -EINVAL;
1799 }
1800
1801 gr_ctx = &tsg->gr_ctx;
1802 pm_ctx = &gr_ctx->pm_ctx;
1803 gr_mem = &gr_ctx->mem;
1804 if (!nvgpu_mem_is_valid(gr_mem)) {
1805 nvgpu_err(g, "no graphics context allocated");
1806 return -EFAULT;
1807 }
1808
1809 if ((mode == NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW) &&
1810 (g->ops.gr.get_hw_accessor_stream_out_mode == NULL)) {
1811 nvgpu_err(g, "Mode-E hwpm context switch mode is not supported");
1812 return -EINVAL;
1813 }
1814
1815 switch (mode) {
1816 case NVGPU_DBG_HWPM_CTXSW_MODE_CTXSW:
1817 if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) {
1818 return 0;
1819 }
1820 break;
1821 case NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW:
1822 if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) {
1823 return 0;
1824 }
1825 break;
1826 case NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW:
1827 if (pm_ctx->pm_mode == g->ops.gr.get_hw_accessor_stream_out_mode()) {
1828 return 0;
1829 }
1830 break;
1831 default:
1832 nvgpu_err(g, "invalid hwpm context switch mode");
1833 return -EINVAL;
1834 }
1835
1836 ret = gk20a_disable_channel_tsg(g, c);
1837 if (ret) {
1838 nvgpu_err(g, "failed to disable channel/TSG");
1839 return ret;
1840 }
1841
1842 ret = gk20a_fifo_preempt(g, c);
1843 if (ret) {
1844 gk20a_enable_channel_tsg(g, c);
1845 nvgpu_err(g, "failed to preempt channel/TSG");
1846 return ret;
1847 }
1848
1849 /* Channel gr_ctx buffer is gpu cacheable.
1850 Flush and invalidate before cpu update. */
1851 g->ops.mm.l2_flush(g, true);
1852
1853 if (mode != NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW) {
1854 /* Allocate buffer if necessary */
1855 if (pm_ctx->mem.gpu_va == 0) {
1856 ret = nvgpu_dma_alloc_sys(g,
1857 g->gr.ctx_vars.pm_ctxsw_image_size,
1858 &pm_ctx->mem);
1859 if (ret) {
1860 c->g->ops.fifo.enable_channel(c);
1861 nvgpu_err(g,
1862 "failed to allocate pm ctxt buffer");
1863 return ret;
1864 }
1865
1866 pm_ctx->mem.gpu_va = nvgpu_gmmu_map_fixed(c->vm,
1867 &pm_ctx->mem,
1868 gpu_va,
1869 pm_ctx->mem.size,
1870 NVGPU_VM_MAP_CACHEABLE,
1871 gk20a_mem_flag_none, true,
1872 pm_ctx->mem.aperture);
1873 if (pm_ctx->mem.gpu_va == 0ULL) {
1874 nvgpu_err(g,
1875 "failed to map pm ctxt buffer");
1876 nvgpu_dma_free(g, &pm_ctx->mem);
1877 c->g->ops.fifo.enable_channel(c);
1878 return -ENOMEM;
1879 }
1880 }
1881
1882 if ((mode == NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW) &&
1883 (g->ops.gr.init_hwpm_pmm_register != NULL)) {
1884 g->ops.gr.init_hwpm_pmm_register(g);
1885 }
1886 }
1887
1888 data = nvgpu_mem_rd(g, gr_mem, ctxsw_prog_main_image_pm_o());
1889 data = data & ~ctxsw_prog_main_image_pm_mode_m();
1890
1891 switch (mode) {
1892 case NVGPU_DBG_HWPM_CTXSW_MODE_CTXSW:
1893 pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_ctxsw_f();
1894 virt_addr = pm_ctx->mem.gpu_va;
1895 break;
1896 case NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW:
1897 pm_ctx->pm_mode = g->ops.gr.get_hw_accessor_stream_out_mode();
1898 virt_addr = pm_ctx->mem.gpu_va;
1899 break;
1900 case NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW:
1901 pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
1902 virt_addr = 0;
1903 }
1904
1905 data |= pm_ctx->pm_mode;
1906
1907 nvgpu_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_o(), data);
1908
1909 if (ctxheader->gpu_va) {
1910 struct channel_gk20a *ch;
1911
1912 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
1913 nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
1914 g->ops.gr.write_pm_ptr(g, &ch->ctx_header, virt_addr);
1915 }
1916 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
1917 } else {
1918 g->ops.gr.write_pm_ptr(g, gr_mem, virt_addr);
1919 }
1920
1921 /* enable channel */
1922 gk20a_enable_channel_tsg(g, c);
1923
1924 return 0;
1925}
1926
1927void gk20a_gr_init_ctxsw_hdr_data(struct gk20a *g,
1928 struct nvgpu_mem *mem)
1929{
1930 nvgpu_mem_wr(g, mem,
1931 ctxsw_prog_main_image_num_save_ops_o(), 0);
1932 nvgpu_mem_wr(g, mem,
1933 ctxsw_prog_main_image_num_restore_ops_o(), 0);
1934}
1935
1936 /* load a saved fresh copy of the golden image into the channel gr_ctx */
1937int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1938 struct channel_gk20a *c)
1939{
1940 struct gr_gk20a *gr = &g->gr;
1941 struct tsg_gk20a *tsg;
1942 struct nvgpu_gr_ctx *gr_ctx;
1943 u32 virt_addr_lo;
1944 u32 virt_addr_hi;
1945 u64 virt_addr = 0;
1946 u32 v, data;
1947 int ret = 0;
1948 struct nvgpu_mem *mem;
1949
1950 nvgpu_log_fn(g, " ");
1951
1952 tsg = tsg_gk20a_from_ch(c);
1953 if (tsg == NULL) {
1954 return -EINVAL;
1955 }
1956
1957 gr_ctx = &tsg->gr_ctx;
1958 mem = &gr_ctx->mem;
1959 if (gr->ctx_vars.local_golden_image == NULL) {
1960 return -EINVAL;
1961 }
1962
1963 /* Channel gr_ctx buffer is gpu cacheable.
1964 Flush and invalidate before cpu update. */
1965 g->ops.mm.l2_flush(g, true);
1966
1967 nvgpu_mem_wr_n(g, mem, 0,
1968 gr->ctx_vars.local_golden_image,
1969 gr->ctx_vars.golden_image_size);
1970
1971 if (g->ops.gr.init_ctxsw_hdr_data) {
1972 g->ops.gr.init_ctxsw_hdr_data(g, mem);
1973 }
1974
1975 if ((g->ops.gr.enable_cde_in_fecs != NULL) && c->cde) {
1976 g->ops.gr.enable_cde_in_fecs(g, mem);
1977 }
1978
1979 /* set priv access map */
1980 virt_addr_lo =
1981 u64_lo32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1982 virt_addr_hi =
1983 u64_hi32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1984
1985 if (g->allow_all) {
1986 data = ctxsw_prog_main_image_priv_access_map_config_mode_allow_all_f();
1987 } else {
1988 data = ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f();
1989 }
1990
1991 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_config_o(),
1992 data);
1993
1994 nvgpu_mem_wr(g, mem,
1995 ctxsw_prog_main_image_priv_access_map_addr_lo_o(),
1996 virt_addr_lo);
1997 nvgpu_mem_wr(g, mem,
1998 ctxsw_prog_main_image_priv_access_map_addr_hi_o(),
1999 virt_addr_hi);
2000
2001 /* disable verif features */
2002 v = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_misc_options_o());
2003 v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
2004 v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
2005 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_misc_options_o(), v);
2006
2007 if (g->ops.gr.update_ctxsw_preemption_mode) {
2008 g->ops.gr.update_ctxsw_preemption_mode(g, c, mem);
2009 }
2010
2011 if (g->ops.gr.update_boosted_ctx) {
2012 g->ops.gr.update_boosted_ctx(g, mem, gr_ctx);
2013 }
2014
2015 virt_addr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va);
2016 virt_addr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va);
2017
2018 nvgpu_log(g, gpu_dbg_info, "write patch count = %d",
2019 gr_ctx->patch_ctx.data_count);
2020 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(),
2021 gr_ctx->patch_ctx.data_count);
2022
2023 nvgpu_mem_wr(g, mem,
2024 ctxsw_prog_main_image_patch_adr_lo_o(),
2025 virt_addr_lo);
2026 nvgpu_mem_wr(g, mem,
2027 ctxsw_prog_main_image_patch_adr_hi_o(),
2028 virt_addr_hi);
2029
2030 /* Update main header region of the context buffer with the info needed
2031 * for PM context switching, including mode and possibly a pointer to
2032 * the PM backing store.
2033 */
2034 if (gr_ctx->pm_ctx.pm_mode != ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) {
2035 if (gr_ctx->pm_ctx.mem.gpu_va == 0) {
2036 nvgpu_err(g,
2037 "context switched pm with no pm buffer!");
2038 return -EFAULT;
2039 }
2040
2041 virt_addr = gr_ctx->pm_ctx.mem.gpu_va;
2042 } else {
2043 virt_addr = 0;
2044 }
2045
2046 data = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_pm_o());
2047 data = data & ~ctxsw_prog_main_image_pm_mode_m();
2048 data |= gr_ctx->pm_ctx.pm_mode;
2049
2050 nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_pm_o(), data);
2051
2052 g->ops.gr.write_pm_ptr(g, mem, virt_addr);
2053
2054 return ret;
2055}
2056
2057static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
2058{
2059 nvgpu_log_fn(g, " ");
2060
2061 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
2062 gr_fecs_ctxsw_mailbox_clear_value_f(~0));
2063
2064 gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
2065 gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
2066
2067 gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
2068 gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
2069
2070 nvgpu_log_fn(g, "done");
2071}
2072
2073static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
2074{
2075 struct mm_gk20a *mm = &g->mm;
2076 struct vm_gk20a *vm = mm->pmu.vm;
2077 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2078 int err;
2079
2080 err = g->ops.mm.alloc_inst_block(g, &ucode_info->inst_blk_desc);
2081 if (err != 0) {
2082 return err;
2083 }
2084
2085 g->ops.mm.init_inst_block(&ucode_info->inst_blk_desc, vm, 0);
2086
2087 /* Map ucode surface to GMMU */
2088 ucode_info->surface_desc.gpu_va = nvgpu_gmmu_map(vm,
2089 &ucode_info->surface_desc,
2090 ucode_info->surface_desc.size,
2091 0, /* flags */
2092 gk20a_mem_flag_read_only,
2093 false,
2094 ucode_info->surface_desc.aperture);
2095 if (ucode_info->surface_desc.gpu_va == 0ULL) {
2096 nvgpu_err(g, "failed to update gmmu ptes");
2097 return -ENOMEM;
2098 }
2099
2100 return 0;
2101}
2102
2103static void gr_gk20a_init_ctxsw_ucode_segment(
2104 struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
2105{
2106 p_seg->offset = *offset;
2107 p_seg->size = size;
2108 *offset = ALIGN(*offset + size, BLK_SIZE);
2109}
2110
2111static void gr_gk20a_init_ctxsw_ucode_segments(
2112 struct gk20a_ctxsw_ucode_segments *segments, u32 *offset,
2113 struct gk20a_ctxsw_bootloader_desc *bootdesc,
2114 u32 code_size, u32 data_size)
2115{
2116 u32 boot_size = ALIGN(bootdesc->size, sizeof(u32));
2117 segments->boot_entry = bootdesc->entry_point;
2118 segments->boot_imem_offset = bootdesc->imem_offset;
2119 gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size);
2120 gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size);
2121 gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size);
2122}
2123
2124static int gr_gk20a_copy_ctxsw_ucode_segments(
2125 struct gk20a *g,
2126 struct nvgpu_mem *dst,
2127 struct gk20a_ctxsw_ucode_segments *segments,
2128 u32 *bootimage,
2129 u32 *code, u32 *data)
2130{
2131 unsigned int i;
2132
2133 nvgpu_mem_wr_n(g, dst, segments->boot.offset, bootimage,
2134 segments->boot.size);
2135 nvgpu_mem_wr_n(g, dst, segments->code.offset, code,
2136 segments->code.size);
2137 nvgpu_mem_wr_n(g, dst, segments->data.offset, data,
2138 segments->data.size);
2139
2140 /* compute a "checksum" for the boot binary to detect its version */
2141 segments->boot_signature = 0;
2142 for (i = 0; i < segments->boot.size / sizeof(u32); i++) {
2143 segments->boot_signature += bootimage[i];
2144 }
2145
2146 return 0;
2147}
2148
2149int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
2150{
2151 struct mm_gk20a *mm = &g->mm;
2152 struct vm_gk20a *vm = mm->pmu.vm;
2153 struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc;
2154 struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc;
2155 struct nvgpu_firmware *fecs_fw;
2156 struct nvgpu_firmware *gpccs_fw;
2157 u32 *fecs_boot_image;
2158 u32 *gpccs_boot_image;
2159 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2160 u32 ucode_size;
2161 int err = 0;
2162
2163 fecs_fw = nvgpu_request_firmware(g, GK20A_FECS_UCODE_IMAGE, 0);
2164 if (fecs_fw == NULL) {
2165 nvgpu_err(g, "failed to load fecs ucode!!");
2166 return -ENOENT;
2167 }
2168
2169 fecs_boot_desc = (void *)fecs_fw->data;
2170 fecs_boot_image = (void *)(fecs_fw->data +
2171 sizeof(struct gk20a_ctxsw_bootloader_desc));
2172
2173 gpccs_fw = nvgpu_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE, 0);
2174 if (gpccs_fw == NULL) {
2175 nvgpu_release_firmware(g, fecs_fw);
2176 nvgpu_err(g, "failed to load gpccs ucode!!");
2177 return -ENOENT;
2178 }
2179
2180 gpccs_boot_desc = (void *)gpccs_fw->data;
2181 gpccs_boot_image = (void *)(gpccs_fw->data +
2182 sizeof(struct gk20a_ctxsw_bootloader_desc));
2183
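 /* Lay out the FECS and GPCCS boot/code/data segments back to back in a single
  * surface; ucode_size accumulates the running total as each segment is added. */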
2184 ucode_size = 0;
2185 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size,
2186 fecs_boot_desc,
2187 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
2188 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
2189 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size,
2190 gpccs_boot_desc,
2191 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
2192 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
2193
2194 err = nvgpu_dma_alloc_sys(g, ucode_size, &ucode_info->surface_desc);
2195 if (err != 0) {
2196 goto clean_up;
2197 }
2198
2199 gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc,
2200 &ucode_info->fecs,
2201 fecs_boot_image,
2202 g->gr.ctx_vars.ucode.fecs.inst.l,
2203 g->gr.ctx_vars.ucode.fecs.data.l);
2204
2205 nvgpu_release_firmware(g, fecs_fw);
2206 fecs_fw = NULL;
2207
2208 gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc,
2209 &ucode_info->gpccs,
2210 gpccs_boot_image,
2211 g->gr.ctx_vars.ucode.gpccs.inst.l,
2212 g->gr.ctx_vars.ucode.gpccs.data.l);
2213
2214 nvgpu_release_firmware(g, gpccs_fw);
2215 gpccs_fw = NULL;
2216
2217 err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
2218 if (err != 0) {
2219 goto clean_up;
2220 }
2221
2222 return 0;
2223
2224clean_up:
2225 if (ucode_info->surface_desc.gpu_va) {
2226 nvgpu_gmmu_unmap(vm, &ucode_info->surface_desc,
2227 ucode_info->surface_desc.gpu_va);
2228 }
2229 nvgpu_dma_free(g, &ucode_info->surface_desc);
2230
2231 nvgpu_release_firmware(g, gpccs_fw);
2232 gpccs_fw = NULL;
2233 nvgpu_release_firmware(g, fecs_fw);
2234 fecs_fw = NULL;
2235
2236 return err;
2237}
2238
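/* Poll until the FECS arbiter has consumed the last context command and then
 * until it reports not busy; each wait is bounded by FECS_ARB_CMD_TIMEOUT_MAX
 * worth of FECS_ARB_CMD_TIMEOUT_DEFAULT delays. */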
2239static void gr_gk20a_wait_for_fecs_arb_idle(struct gk20a *g)
2240{
2241 int retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2242 u32 val;
2243
2244 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2245 while ((gr_fecs_arb_ctx_cmd_cmd_v(val) != 0U) && (retries != 0)) {
2246 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2247 retries--;
2248 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2249 }
2250
2251 if (retries == 0) {
2252 nvgpu_err(g, "arbiter cmd timeout, fecs arb ctx cmd: 0x%08x",
2253 gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
2254 }
2255
2256 retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2257 while (((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
2258 gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
2259 (retries != 0)) {
2260 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2261 retries--;
2262 }
2263 if (retries == 0) {
2264 nvgpu_err(g,
2265 "arbiter idle timeout, fecs ctxsw status: 0x%08x",
2266 gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
2267 }
2268}
2269
2270void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
2271{
2272 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2273 int retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
2274 u64 inst_ptr;
2275
2276 while (((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
2277 gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
2278 (retries != 0)) {
2279 nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
2280 retries--;
2281 }
2282 if (retries == 0) {
2283 nvgpu_err(g,
2284 "arbiter idle timeout, status: %08x",
2285 gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
2286 }
2287
2288 gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
2289
2290 inst_ptr = nvgpu_inst_block_addr(g, &ucode_info->inst_blk_desc);
2291 gk20a_writel(g, gr_fecs_new_ctx_r(),
2292 gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
2293 nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc,
2294 gr_fecs_new_ctx_target_sys_mem_ncoh_f(),
2295 gr_fecs_new_ctx_target_sys_mem_coh_f(),
2296 gr_fecs_new_ctx_target_vid_mem_f()) |
2297 gr_fecs_new_ctx_valid_m());
2298
2299 gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
2300 gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
2301 nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc,
2302 gr_fecs_arb_ctx_ptr_target_sys_mem_ncoh_f(),
2303 gr_fecs_arb_ctx_ptr_target_sys_mem_coh_f(),
2304 gr_fecs_arb_ctx_ptr_target_vid_mem_f()));
2305
2306 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
2307
2308 /* Wait for arbiter command to complete */
2309 gr_gk20a_wait_for_fecs_arb_idle(g);
2310
2311 gk20a_writel(g, gr_fecs_current_ctx_r(),
2312 gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
2313 gr_fecs_current_ctx_target_m() |
2314 gr_fecs_current_ctx_valid_m());
2315 /* Send command to arbiter to flush */
2316 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
2317
2318 gr_gk20a_wait_for_fecs_arb_idle(g);
2319
2320}
2321
2322void gr_gk20a_load_ctxsw_ucode_header(struct gk20a *g, u64 addr_base,
2323 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2324{
2325 u32 addr_code32;
2326 u32 addr_data32;
2327
2328 addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
2329 addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
2330
2331 /*
2332 * Copy falcon bootloader header into dmem at offset 0.
2333 * Configure dmem port 0 for auto-incrementing writes starting at dmem
2334 * offset 0.
2335 */
2336 gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
2337 gr_fecs_dmemc_offs_f(0) |
2338 gr_fecs_dmemc_blk_f(0) |
2339 gr_fecs_dmemc_aincw_f(1));
2340
2341 /* Write out the actual data */
2342 switch (segments->boot_signature) {
2343 case FALCON_UCODE_SIG_T18X_GPCCS_WITH_RESERVED:
2344 case FALCON_UCODE_SIG_T21X_FECS_WITH_DMEM_SIZE:
2345 case FALCON_UCODE_SIG_T21X_FECS_WITH_RESERVED:
2346 case FALCON_UCODE_SIG_T21X_GPCCS_WITH_RESERVED:
2347 case FALCON_UCODE_SIG_T12X_FECS_WITH_RESERVED:
2348 case FALCON_UCODE_SIG_T12X_GPCCS_WITH_RESERVED:
2349 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2350 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2351 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2352 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2353 /* fallthrough */
2354 case FALCON_UCODE_SIG_T12X_FECS_WITHOUT_RESERVED:
2355 case FALCON_UCODE_SIG_T12X_GPCCS_WITHOUT_RESERVED:
2356 case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED:
2357 case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED2:
2358 case FALCON_UCODE_SIG_T21X_GPCCS_WITHOUT_RESERVED:
2359 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2360 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2361 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2362 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2363 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 4);
2364 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2365 addr_code32);
2366 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2367 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2368 segments->code.size);
2369 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2370 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2371 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2372 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2373 addr_data32);
2374 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2375 segments->data.size);
2376 break;
2377 case FALCON_UCODE_SIG_T12X_FECS_OLDER:
2378 case FALCON_UCODE_SIG_T12X_GPCCS_OLDER:
2379 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2380 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2381 addr_code32);
2382 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2383 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2384 segments->code.size);
2385 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2386 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2387 addr_data32);
2388 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2389 segments->data.size);
2390 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
2391 addr_code32);
2392 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2393 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2394 break;
2395 default:
2396 nvgpu_err(g,
2397 "unknown falcon ucode boot signature 0x%08x"
2398 " with reg_offset 0x%08x",
2399 segments->boot_signature, reg_offset);
2400 BUG();
2401 }
2402}
2403
2404void gr_gk20a_load_ctxsw_ucode_boot(struct gk20a *g, u64 addr_base,
2405 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2406{
2407 u32 addr_load32;
2408 u32 blocks;
2409 u32 b;
2410 u32 dst;
2411
2412 addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
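 /* Round the boot image up to whole 256-byte IMEM blocks and convert the size
  * to a block count. */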
2413 blocks = ((segments->boot.size + 0xFF) & ~0xFF) >> 8;
2414
2415 /*
2416 * Set the base FB address for the DMA transfer. Subtract off the 256
2417 * byte IMEM block offset such that the relative FB and IMEM offsets
2418 * match, allowing the IMEM tags to be properly created.
2419 */
2420
2421 dst = segments->boot_imem_offset;
2422 gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
2423 (addr_load32 - (dst >> 8)));
2424
2425 for (b = 0; b < blocks; b++) {
2426 /* Setup destination IMEM offset */
2427 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
2428 dst + (b << 8));
2429
2430 /* Setup source offset (relative to BASE) */
2431 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2432 dst + (b << 8));
2433
2434 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2435 gr_fecs_dmatrfcmd_imem_f(0x01) |
2436 gr_fecs_dmatrfcmd_write_f(0x00) |
2437 gr_fecs_dmatrfcmd_size_f(0x06) |
2438 gr_fecs_dmatrfcmd_ctxdma_f(0));
2439 }
2440
2441 /* Specify the falcon boot vector */
2442 gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2443 gr_fecs_bootvec_vec_f(segments->boot_entry));
2444}
2445
2446static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2447{
2448 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2449 u64 addr_base = ucode_info->surface_desc.gpu_va;
2450
2451 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2452
2453 gr_gk20a_load_falcon_bind_instblk(g);
2454
2455 g->ops.gr.falcon_load_ucode(g, addr_base,
2456 &g->ctxsw_ucode_info.fecs, 0);
2457
2458 g->ops.gr.falcon_load_ucode(g, addr_base,
2459 &g->ctxsw_ucode_info.gpccs,
2460 gr_gpcs_gpccs_falcon_hwcfg_r() -
2461 gr_fecs_falcon_hwcfg_r());
2462}
2463
2464int gr_gk20a_load_ctxsw_ucode(struct gk20a *g)
2465{
2466 int err;
2467
2468 nvgpu_log_fn(g, " ");
2469
2470 if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
2471 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2472 gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2473 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2474 gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2475 }
2476
2477 /*
2478 * In case bootloader is not supported, revert to the old way of
2479 * loading gr ucode, without the faster bootstrap routine.
2480 */
2481 if (!nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP)) {
2482 gr_gk20a_load_falcon_dmem(g);
2483 gr_gk20a_load_falcon_imem(g);
2484 gr_gk20a_start_falcon_ucode(g);
2485 } else {
2486 if (!g->gr.skip_ucode_init) {
2487 err = gr_gk20a_init_ctxsw_ucode(g);
2488
2489 if (err != 0) {
2490 return err;
2491 }
2492 }
2493 gr_gk20a_load_falcon_with_bootloader(g);
2494 g->gr.skip_ucode_init = true;
2495 }
2496 nvgpu_log_fn(g, "done");
2497 return 0;
2498}
2499
2500int gr_gk20a_set_fecs_watchdog_timeout(struct gk20a *g)
2501{
2502 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2503 gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2504 gk20a_writel(g, gr_fecs_method_push_r(),
2505 gr_fecs_method_push_adr_set_watchdog_timeout_f());
2506
2507 return 0;
2508}
2509
2510static int gr_gk20a_wait_ctxsw_ready(struct gk20a *g)
2511{
2512 u32 ret;
2513
2514 nvgpu_log_fn(g, " ");
2515
2516 ret = gr_gk20a_ctx_wait_ucode(g, 0, NULL,
2517 GR_IS_UCODE_OP_EQUAL,
2518 eUcodeHandshakeInitComplete,
2519 GR_IS_UCODE_OP_SKIP, 0, false);
2520 if (ret) {
2521 nvgpu_err(g, "falcon ucode init timeout");
2522 return ret;
2523 }
2524
2525 if (nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP) ||
2526 nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
2527 gk20a_writel(g, gr_fecs_current_ctx_r(),
2528 gr_fecs_current_ctx_valid_false_f());
2529 }
2530
2531 ret = g->ops.gr.set_fecs_watchdog_timeout(g);
2532 if (ret) {
2533 nvgpu_err(g, "fail to set watchdog timeout");
2534 return ret;
2535 }
2536
2537 nvgpu_log_fn(g, "done");
2538 return 0;
2539}
2540
2541int gr_gk20a_init_ctx_state(struct gk20a *g)
2542{
2543 u32 ret;
2544 struct fecs_method_op_gk20a op = {
2545 .mailbox = { .id = 0, .data = 0,
2546 .clr = ~0, .ok = 0, .fail = 0},
2547 .method.data = 0,
2548 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2549 .cond.fail = GR_IS_UCODE_OP_SKIP,
2550 };
2551
2552 nvgpu_log_fn(g, " ");
2553 /* query ctxsw image sizes, if the golden context has not been created yet */
2554 if (!g->gr.ctx_vars.golden_image_initialized) {
2555 op.method.addr =
2556 gr_fecs_method_push_adr_discover_image_size_v();
2557 op.mailbox.ret = &g->gr.ctx_vars.golden_image_size;
2558 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2559 if (ret) {
2560 nvgpu_err(g,
2561 "query golden image size failed");
2562 return ret;
2563 }
2564 op.method.addr =
2565 gr_fecs_method_push_adr_discover_zcull_image_size_v();
2566 op.mailbox.ret = &g->gr.ctx_vars.zcull_ctxsw_image_size;
2567 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2568 if (ret) {
2569 nvgpu_err(g,
2570 "query zcull ctx image size failed");
2571 return ret;
2572 }
2573 op.method.addr =
2574 gr_fecs_method_push_adr_discover_pm_image_size_v();
2575 op.mailbox.ret = &g->gr.ctx_vars.pm_ctxsw_image_size;
2576 ret = gr_gk20a_submit_fecs_method_op(g, op, false);
2577 if (ret) {
2578 nvgpu_err(g,
2579 "query pm ctx image size failed");
2580 return ret;
2581 }
2582 g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
2583#ifdef CONFIG_GK20A_CTXSW_TRACE
2584 g->gr.ctx_vars.fecs_trace_buffer_size =
2585 gk20a_fecs_trace_buffer_size(g);
2586#endif
2587 }
2588
2589 nvgpu_log_fn(g, "done");
2590 return 0;
2591}
2592
2593void gk20a_gr_destroy_ctx_buffer(struct gk20a *g,
2594 struct gr_ctx_buffer_desc *desc)
2595{
2596 if (desc == NULL) {
2597 return;
2598 }
2599 nvgpu_dma_free(g, &desc->mem);
2600 desc->destroy = NULL;
2601}
2602
2603int gk20a_gr_alloc_ctx_buffer(struct gk20a *g,
2604 struct gr_ctx_buffer_desc *desc,
2605 size_t size)
2606{
2607 int err = 0;
2608
2609 nvgpu_log_fn(g, " ");
2610
2611 if (nvgpu_mem_is_valid(&desc->mem)) {
2612 return 0;
2613 }
2614
2615 err = nvgpu_dma_alloc_sys(g, size, &desc->mem);
2616 if (err != 0) {
2617 return err;
2618 }
2619
2620 desc->destroy = gk20a_gr_destroy_ctx_buffer;
2621
2622 return err;
2623}
2624
2625static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2626{
2627 struct gr_gk20a *gr = &g->gr;
2628 u32 i;
2629
2630 for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2631 /* destroy exists iff buffer is allocated */
2632 if (gr->global_ctx_buffer[i].destroy) {
2633 gr->global_ctx_buffer[i].destroy(g,
2634 &gr->global_ctx_buffer[i]);
2635 }
2636 }
2637
2638 nvgpu_log_fn(g, "done");
2639}
2640
2641int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2642{
2643 struct gr_gk20a *gr = &g->gr;
2644 int attr_buffer_size, err;
2645
2646 u32 cb_buffer_size = gr->bundle_cb_default_size *
2647 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2648
2649 u32 pagepool_buffer_size = g->ops.gr.pagepool_default_size(g) *
2650 gr_scc_pagepool_total_pages_byte_granularity_v();
2651
2652 nvgpu_log_fn(g, " ");
2653
2654 attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g);
2655
2656 nvgpu_log_info(g, "cb_buffer_size : %d", cb_buffer_size);
2657
2658 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[CIRCULAR],
2659 cb_buffer_size);
2660 if (err != 0) {
2661 goto clean_up;
2662 }
2663
2664 if (g->ops.secure_alloc) {
2665 err = g->ops.secure_alloc(g,
2666 &gr->global_ctx_buffer[CIRCULAR_VPR],
2667 cb_buffer_size);
2668 if (err != 0) {
2669 goto clean_up;
2670 }
2671 }
2672
2673 nvgpu_log_info(g, "pagepool_buffer_size : %d", pagepool_buffer_size);
2674
2675 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[PAGEPOOL],
2676 pagepool_buffer_size);
2677 if (err != 0) {
2678 goto clean_up;
2679 }
2680
2681 if (g->ops.secure_alloc) {
2682 err = g->ops.secure_alloc(g,
2683 &gr->global_ctx_buffer[PAGEPOOL_VPR],
2684 pagepool_buffer_size);
2685 if (err != 0) {
2686 goto clean_up;
2687 }
2688 }
2689
2690 nvgpu_log_info(g, "attr_buffer_size : %d", attr_buffer_size);
2691
2692 err = gk20a_gr_alloc_ctx_buffer(g, &gr->global_ctx_buffer[ATTRIBUTE],
2693 attr_buffer_size);
2694 if (err != 0) {
2695 goto clean_up;
2696 }
2697
2698 if (g->ops.secure_alloc) {
2699 err = g->ops.secure_alloc(g,
2700 &gr->global_ctx_buffer[ATTRIBUTE_VPR],
2701 attr_buffer_size);
2702 if (err != 0) {
2703 goto clean_up;
2704 }
2705 }
2706
2707 nvgpu_log_info(g, "golden_image_size : %d",
2708 gr->ctx_vars.golden_image_size);
2709
2710 err = gk20a_gr_alloc_ctx_buffer(g,
2711 &gr->global_ctx_buffer[GOLDEN_CTX],
2712 gr->ctx_vars.golden_image_size);
2713 if (err != 0) {
2714 goto clean_up;
2715 }
2716
2717 nvgpu_log_info(g, "priv_access_map_size : %d",
2718 gr->ctx_vars.priv_access_map_size);
2719
2720 err = gk20a_gr_alloc_ctx_buffer(g,
2721 &gr->global_ctx_buffer[PRIV_ACCESS_MAP],
2722 gr->ctx_vars.priv_access_map_size);
2723
2724 if (err != 0) {
2725 goto clean_up;
2726 }
2727
2728#ifdef CONFIG_GK20A_CTXSW_TRACE
2729 nvgpu_log_info(g, "fecs_trace_buffer_size : %d",
2730 gr->ctx_vars.fecs_trace_buffer_size);
2731
2732 err = nvgpu_dma_alloc_sys(g,
2733 gr->ctx_vars.fecs_trace_buffer_size,
2734 &gr->global_ctx_buffer[FECS_TRACE_BUFFER].mem);
2735 if (err != 0) {
2736 goto clean_up;
2737 }
2738
2739 gr->global_ctx_buffer[FECS_TRACE_BUFFER].destroy =
2740 gk20a_gr_destroy_ctx_buffer;
2741#endif
2742
2743 nvgpu_log_fn(g, "done");
2744 return 0;
2745
2746 clean_up:
2747 nvgpu_err(g, "fail");
2748 gr_gk20a_free_global_ctx_buffers(g);
2749 return -ENOMEM;
2750}
2751
2752static void gr_gk20a_unmap_global_ctx_buffers(struct gk20a *g,
2753 struct vm_gk20a *vm,
2754 struct nvgpu_gr_ctx *gr_ctx)
2755{
2756 u64 *g_bfr_va = gr_ctx->global_ctx_buffer_va;
2757 u64 *g_bfr_size = gr_ctx->global_ctx_buffer_size;
2758 int *g_bfr_index = gr_ctx->global_ctx_buffer_index;
2759 u32 i;
2760
2761 nvgpu_log_fn(g, " ");
2762
2763 for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2764 if (g_bfr_index[i]) {
2765 struct nvgpu_mem *mem;
2766
2767 /*
2768 * Translate from VA index to buffer index to determine
2769 * the correct struct nvgpu_mem to use. Handles the VPR
2770 * vs non-VPR difference in context images.
2771 */
2772 mem = &g->gr.global_ctx_buffer[g_bfr_index[i]].mem;
2773
2774 nvgpu_gmmu_unmap(vm, mem, g_bfr_va[i]);
2775 }
2776 }
2777
2778 memset(g_bfr_va, 0, sizeof(gr_ctx->global_ctx_buffer_va));
2779 memset(g_bfr_size, 0, sizeof(gr_ctx->global_ctx_buffer_size));
2780 memset(g_bfr_index, 0, sizeof(gr_ctx->global_ctx_buffer_index));
2781
2782 gr_ctx->global_ctx_buffer_mapped = false;
2783}
2784
2785int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2786 struct channel_gk20a *c)
2787{
2788 struct tsg_gk20a *tsg;
2789 struct vm_gk20a *ch_vm = c->vm;
2790 u64 *g_bfr_va;
2791 u64 *g_bfr_size;
2792 int *g_bfr_index;
2793 struct gr_gk20a *gr = &g->gr;
2794 struct nvgpu_mem *mem;
2795 u64 gpu_va;
2796
2797 nvgpu_log_fn(g, " ");
2798
2799 tsg = tsg_gk20a_from_ch(c);
2800 if (tsg == NULL) {
2801 return -EINVAL;
2802 }
2803
2804 g_bfr_va = tsg->gr_ctx.global_ctx_buffer_va;
2805 g_bfr_size = tsg->gr_ctx.global_ctx_buffer_size;
2806 g_bfr_index = tsg->gr_ctx.global_ctx_buffer_index;
2807
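 /* For the circular, attribute and page-pool buffers below, prefer the VPR
  * copy when the channel is a VPR channel and a VPR copy exists; record the
  * chosen backing buffer in g_bfr_index and map it into the channel VM. */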
2808 /* Circular Buffer */
2809 if (c->vpr &&
2810 nvgpu_mem_is_valid(&gr->global_ctx_buffer[CIRCULAR_VPR].mem)) {
2811 mem = &gr->global_ctx_buffer[CIRCULAR_VPR].mem;
2812 g_bfr_index[CIRCULAR_VA] = CIRCULAR_VPR;
2813 } else {
2814 mem = &gr->global_ctx_buffer[CIRCULAR].mem;
2815 g_bfr_index[CIRCULAR_VA] = CIRCULAR;
2816 }
2817
2818 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2819 NVGPU_VM_MAP_CACHEABLE,
2820 gk20a_mem_flag_none, true, mem->aperture);
2821 if (gpu_va == 0ULL) {
2822 goto clean_up;
2823 }
2824 g_bfr_va[CIRCULAR_VA] = gpu_va;
2825 g_bfr_size[CIRCULAR_VA] = mem->size;
2826
2827 /* Attribute Buffer */
2828 if (c->vpr &&
2829 nvgpu_mem_is_valid(&gr->global_ctx_buffer[ATTRIBUTE_VPR].mem)) {
2830 mem = &gr->global_ctx_buffer[ATTRIBUTE_VPR].mem;
2831 g_bfr_index[ATTRIBUTE_VA] = ATTRIBUTE_VPR;
2832 } else {
2833 mem = &gr->global_ctx_buffer[ATTRIBUTE].mem;
2834 g_bfr_index[ATTRIBUTE_VA] = ATTRIBUTE;
2835 }
2836
2837 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2838 NVGPU_VM_MAP_CACHEABLE,
2839 gk20a_mem_flag_none, false, mem->aperture);
2840 if (gpu_va == 0ULL) {
2841 goto clean_up;
2842 }
2843 g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2844 g_bfr_size[ATTRIBUTE_VA] = mem->size;
2845
2846 /* Page Pool */
2847 if (c->vpr &&
2848 nvgpu_mem_is_valid(&gr->global_ctx_buffer[PAGEPOOL_VPR].mem)) {
2849 mem = &gr->global_ctx_buffer[PAGEPOOL_VPR].mem;
2850 g_bfr_index[PAGEPOOL_VA] = PAGEPOOL_VPR;
2851 } else {
2852 mem = &gr->global_ctx_buffer[PAGEPOOL].mem;
2853 g_bfr_index[PAGEPOOL_VA] = PAGEPOOL;
2854 }
2855
2856 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size,
2857 NVGPU_VM_MAP_CACHEABLE,
2858 gk20a_mem_flag_none, true, mem->aperture);
2859 if (gpu_va == 0ULL) {
2860 goto clean_up;
2861 }
2862 g_bfr_va[PAGEPOOL_VA] = gpu_va;
2863 g_bfr_size[PAGEPOOL_VA] = mem->size;
2864
2865 /* Golden Image */
2866 mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem;
2867 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size, 0,
2868 gk20a_mem_flag_none, true, mem->aperture);
2869 if (gpu_va == 0ULL) {
2870 goto clean_up;
2871 }
2872 g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2873 g_bfr_size[GOLDEN_CTX_VA] = mem->size;
2874 g_bfr_index[GOLDEN_CTX_VA] = GOLDEN_CTX;
2875
2876 /* Priv register Access Map */
2877 mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem;
2878 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size, 0,
2879 gk20a_mem_flag_none, true, mem->aperture);
2880 if (gpu_va == 0ULL) {
2881 goto clean_up;
2882 }
2883 g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
2884 g_bfr_size[PRIV_ACCESS_MAP_VA] = mem->size;
2885 g_bfr_index[PRIV_ACCESS_MAP_VA] = PRIV_ACCESS_MAP;
2886
2887 tsg->gr_ctx.global_ctx_buffer_mapped = true;
2888
2889#ifdef CONFIG_GK20A_CTXSW_TRACE
2890 /* FECS trace buffer */
2891 if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA)) {
2892 mem = &gr->global_ctx_buffer[FECS_TRACE_BUFFER].mem;
2893 gpu_va = nvgpu_gmmu_map(ch_vm, mem, mem->size, 0,
2894 gk20a_mem_flag_none, true, mem->aperture);
2895 if (!gpu_va)
2896 goto clean_up;
2897 g_bfr_va[FECS_TRACE_BUFFER_VA] = gpu_va;
2898 g_bfr_size[FECS_TRACE_BUFFER_VA] = mem->size;
2899 g_bfr_index[FECS_TRACE_BUFFER_VA] = FECS_TRACE_BUFFER;
2900 }
2901#endif
2902
2903 return 0;
2904
2905clean_up:
2906 gr_gk20a_unmap_global_ctx_buffers(g, ch_vm, &tsg->gr_ctx);
2907
2908 return -ENOMEM;
2909}
2910
2911int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
2912 struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm,
2913 u32 class,
2914 u32 padding)
2915{
2916 struct gr_gk20a *gr = &g->gr;
2917 int err = 0;
2918
2919 nvgpu_log_fn(g, " ");
2920
2921 if (gr->ctx_vars.buffer_size == 0) {
2922 return 0;
2923 }
2924
2925 /* alloc channel gr ctx buffer */
2926 gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2927 gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2928
2929 err = nvgpu_dma_alloc(g, gr->ctx_vars.buffer_total_size, &gr_ctx->mem);
2930 if (err != 0) {
2931 return err;
2932 }
2933
2934 gr_ctx->mem.gpu_va = nvgpu_gmmu_map(vm,
2935 &gr_ctx->mem,
2936 gr_ctx->mem.size,
2937 0, /* not GPU-cacheable */
2938 gk20a_mem_flag_none, true,
2939 gr_ctx->mem.aperture);
2940 if (gr_ctx->mem.gpu_va == 0ULL) {
2941 goto err_free_mem;
2942 }
2943
2944 return 0;
2945
2946 err_free_mem:
2947 nvgpu_dma_free(g, &gr_ctx->mem);
2948
2949 return err;
2950}
2951
2952static int gr_gk20a_alloc_tsg_gr_ctx(struct gk20a *g,
2953 struct tsg_gk20a *tsg, u32 class, u32 padding)
2954{
2955 struct nvgpu_gr_ctx *gr_ctx = &tsg->gr_ctx;
2956 int err;
2957
2958 if (tsg->vm == NULL) {
2959 nvgpu_err(tsg->g, "No address space bound");
2960 return -ENOMEM;
2961 }
2962
2963 err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, tsg->vm, class, padding);
2964 if (err != 0) {
2965 return err;
2966 }
2967
2968 gr_ctx->tsgid = tsg->tsgid;
2969
2970 return 0;
2971}
2972
2973void gr_gk20a_free_gr_ctx(struct gk20a *g,
2974 struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx)
2975{
2976 nvgpu_log_fn(g, " ");
2977
2978 if (gr_ctx->mem.gpu_va) {
2979 gr_gk20a_unmap_global_ctx_buffers(g, vm, gr_ctx);
2980 gr_gk20a_free_channel_patch_ctx(g, vm, gr_ctx);
2981 gr_gk20a_free_channel_pm_ctx(g, vm, gr_ctx);
2982
2983 if ((g->ops.gr.dump_ctxsw_stats != NULL) &&
2984 g->gr.ctx_vars.dump_ctxsw_stats_on_channel_close) {
2985 g->ops.gr.dump_ctxsw_stats(g, vm, gr_ctx);
2986 }
2987
2988 nvgpu_dma_unmap_free(vm, &gr_ctx->pagepool_ctxsw_buffer);
2989 nvgpu_dma_unmap_free(vm, &gr_ctx->betacb_ctxsw_buffer);
2990 nvgpu_dma_unmap_free(vm, &gr_ctx->spill_ctxsw_buffer);
2991 nvgpu_dma_unmap_free(vm, &gr_ctx->preempt_ctxsw_buffer);
2992 nvgpu_dma_unmap_free(vm, &gr_ctx->mem);
2993
2994 memset(gr_ctx, 0, sizeof(*gr_ctx));
2995 }
2996}
2997
2998void gr_gk20a_free_tsg_gr_ctx(struct tsg_gk20a *tsg)
2999{
3000 struct gk20a *g = tsg->g;
3001
3002 if (tsg->vm == NULL) {
3003 nvgpu_err(g, "No address space bound");
3004 return;
3005 }
3006 tsg->g->ops.gr.free_gr_ctx(g, tsg->vm, &tsg->gr_ctx);
3007}
3008
3009u32 gr_gk20a_get_patch_slots(struct gk20a *g)
3010{
3011 return PATCH_CTX_SLOTS_PER_PAGE;
3012}
3013
3014static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
3015 struct channel_gk20a *c)
3016{
3017 struct tsg_gk20a *tsg;
3018 struct patch_desc *patch_ctx;
3019 struct vm_gk20a *ch_vm = c->vm;
3020 u32 alloc_size;
3021 int err = 0;
3022
3023 nvgpu_log_fn(g, " ");
3024
3025 tsg = tsg_gk20a_from_ch(c);
3026 if (tsg == NULL) {
3027 return -EINVAL;
3028 }
3029
3030 patch_ctx = &tsg->gr_ctx.patch_ctx;
3031 alloc_size = g->ops.gr.get_patch_slots(g) *
3032 PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY;
3033
3034 nvgpu_log(g, gpu_dbg_info, "patch buffer size in entries: %d",
3035 alloc_size);
3036
3037 err = nvgpu_dma_alloc_map_sys(ch_vm,
3038 alloc_size * sizeof(u32), &patch_ctx->mem);
3039 if (err != 0) {
3040 return err;
3041 }
3042
3043 nvgpu_log_fn(g, "done");
3044 return 0;
3045}
3046
3047static void gr_gk20a_free_channel_patch_ctx(struct gk20a *g,
3048 struct vm_gk20a *vm,
3049 struct nvgpu_gr_ctx *gr_ctx)
3050{
3051 struct patch_desc *patch_ctx = &gr_ctx->patch_ctx;
3052
3053 nvgpu_log_fn(g, " ");
3054
3055 if (patch_ctx->mem.gpu_va) {
3056 nvgpu_gmmu_unmap(vm, &patch_ctx->mem,
3057 patch_ctx->mem.gpu_va);
3058 }
3059
3060 nvgpu_dma_free(g, &patch_ctx->mem);
3061 patch_ctx->data_count = 0;
3062}
3063
3064static void gr_gk20a_free_channel_pm_ctx(struct gk20a *g,
3065 struct vm_gk20a *vm,
3066 struct nvgpu_gr_ctx *gr_ctx)
3067{
3068 struct pm_ctx_desc *pm_ctx = &gr_ctx->pm_ctx;
3069
3070 nvgpu_log_fn(g, " ");
3071
3072 if (pm_ctx->mem.gpu_va) {
3073 nvgpu_gmmu_unmap(vm, &pm_ctx->mem, pm_ctx->mem.gpu_va);
3074
3075 nvgpu_dma_free(g, &pm_ctx->mem);
3076 }
3077}
3078
3079int gk20a_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags)
3080{
3081 struct gk20a *g = c->g;
3082 struct nvgpu_gr_ctx *gr_ctx;
3083 struct tsg_gk20a *tsg = NULL;
3084 int err = 0;
3085
3086 nvgpu_log_fn(g, " ");
3087
3088 /* An address space needs to have been bound at this point. */
3089 if (!gk20a_channel_as_bound(c) && (c->vm == NULL)) {
3090 nvgpu_err(g,
3091 "not bound to address space at time"
3092 " of grctx allocation");
3093 return -EINVAL;
3094 }
3095
3096 if (!g->ops.gr.is_valid_class(g, class_num)) {
3097 nvgpu_err(g,
3098 "invalid obj class 0x%x", class_num);
3099 err = -EINVAL;
3100 goto out;
3101 }
3102 c->obj_class = class_num;
3103
3104 tsg = tsg_gk20a_from_ch(c);
3105 if (tsg == NULL) {
3106 return -EINVAL;
3107 }
3108
3109 gr_ctx = &tsg->gr_ctx;
3110
3111 if (!nvgpu_mem_is_valid(&gr_ctx->mem)) {
3112 tsg->vm = c->vm;
3113 nvgpu_vm_get(tsg->vm);
3114 err = gr_gk20a_alloc_tsg_gr_ctx(g, tsg,
3115 class_num,
3116 flags);
3117 if (err != 0) {
3118 nvgpu_err(g,
3119 "fail to allocate TSG gr ctx buffer");
3120 nvgpu_vm_put(tsg->vm);
3121 tsg->vm = NULL;
3122 goto out;
3123 }
3124
3125 /* allocate patch buffer */
3126 if (!nvgpu_mem_is_valid(&gr_ctx->patch_ctx.mem)) {
3127 gr_ctx->patch_ctx.data_count = 0;
3128 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
3129 if (err != 0) {
3130 nvgpu_err(g,
3131 "fail to allocate patch buffer");
3132 goto out;
3133 }
3134 }
3135
3136 /* map global buffer to channel gpu_va and commit */
3137 err = g->ops.gr.map_global_ctx_buffers(g, c);
3138 if (err != 0) {
3139 nvgpu_err(g,
3140 "fail to map global ctx buffer");
3141 goto out;
3142 }
3143 g->ops.gr.commit_global_ctx_buffers(g, c, true);
3144
3145 /* commit gr ctx buffer */
3146 err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va);
3147 if (err != 0) {
3148 nvgpu_err(g,
3149 "fail to commit gr ctx buffer");
3150 goto out;
3151 }
3152
3153 /* init golden image */
3154 err = gr_gk20a_init_golden_ctx_image(g, c);
3155 if (err != 0) {
3156 nvgpu_err(g,
3157 "fail to init golden ctx image");
3158 goto out;
3159 }
3160
3161 /* Re-enable ELPG now that golden image has been initialized.
3162 * The PMU PG init code may already have tried to enable elpg, but
3163 * would not have been able to complete this action since the golden
3164 * image hadn't been initialized yet, so do this now.
3165 */
3166 err = nvgpu_pmu_reenable_elpg(g);
3167 if (err != 0) {
3168 nvgpu_err(g, "fail to re-enable elpg");
3169 goto out;
3170 }
3171
3172 /* load golden image */
3173 err = gr_gk20a_load_golden_ctx_image(g, c);
3174 if (err != 0) {
3175 nvgpu_err(g,
3176 "fail to load golden ctx image");
3177 goto out;
3178 }
3179#ifdef CONFIG_GK20A_CTXSW_TRACE
3180 if (g->ops.fecs_trace.bind_channel && !c->vpr) {
3181 err = g->ops.fecs_trace.bind_channel(g, c);
3182 if (err != 0) {
3183 nvgpu_warn(g,
3184 "fail to bind channel for ctxsw trace");
3185 }
3186 }
3187#endif
3188
3189 if (g->ops.gr.set_czf_bypass) {
3190 g->ops.gr.set_czf_bypass(g, c);
3191 }
3192
3193 /* PM ctxt switch is off by default */
3194 gr_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
3195 } else {
3196 /* commit gr ctx buffer */
3197 err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va);
3198 if (err != 0) {
3199 nvgpu_err(g,
3200 "fail to commit gr ctx buffer");
3201 goto out;
3202 }
3203#ifdef CONFIG_GK20A_CTXSW_TRACE
3204 if (g->ops.fecs_trace.bind_channel && !c->vpr) {
3205 err = g->ops.fecs_trace.bind_channel(g, c);
3206 if (err != 0) {
3207 nvgpu_warn(g,
3208 "fail to bind channel for ctxsw trace");
3209 }
3210 }
3211#endif
3212 }
3213
3214 nvgpu_log_fn(g, "done");
3215 return 0;
3216out:
3217 /* 1. gr_ctx, patch_ctx and global ctx buffer mapping
3218 can be reused so no need to release them.
3219 2. golden image init and load is a one time thing so if
3220 they pass, no need to undo. */
3221 nvgpu_err(g, "fail");
3222 return err;
3223}
3224
3225static void gk20a_remove_gr_support(struct gr_gk20a *gr)
3226{
3227 struct gk20a *g = gr->g;
3228
3229 nvgpu_log_fn(g, " ");
3230
3231 gr_gk20a_free_cyclestats_snapshot_data(g);
3232
3233 gr_gk20a_free_global_ctx_buffers(g);
3234
3235 nvgpu_dma_free(g, &gr->compbit_store.mem);
3236
3237 memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
3238
3239 nvgpu_kfree(g, gr->gpc_tpc_count);
3240 nvgpu_kfree(g, gr->gpc_zcb_count);
3241 nvgpu_kfree(g, gr->gpc_ppc_count);
3242 nvgpu_kfree(g, gr->pes_tpc_count[0]);
3243 nvgpu_kfree(g, gr->pes_tpc_count[1]);
3244 nvgpu_kfree(g, gr->pes_tpc_mask[0]);
3245 nvgpu_kfree(g, gr->pes_tpc_mask[1]);
3246 nvgpu_kfree(g, gr->sm_to_cluster);
3247 nvgpu_kfree(g, gr->gpc_skip_mask);
3248 nvgpu_kfree(g, gr->map_tiles);
3249 nvgpu_kfree(g, gr->fbp_rop_l2_en_mask);
3250 gr->gpc_tpc_count = NULL;
3251 gr->gpc_zcb_count = NULL;
3252 gr->gpc_ppc_count = NULL;
3253 gr->pes_tpc_count[0] = NULL;
3254 gr->pes_tpc_count[1] = NULL;
3255 gr->pes_tpc_mask[0] = NULL;
3256 gr->pes_tpc_mask[1] = NULL;
3257 gr->gpc_skip_mask = NULL;
3258 gr->map_tiles = NULL;
3259 gr->fbp_rop_l2_en_mask = NULL;
3260
3261 gr->ctx_vars.valid = false;
3262 nvgpu_kfree(g, gr->ctx_vars.ucode.fecs.inst.l);
3263 nvgpu_kfree(g, gr->ctx_vars.ucode.fecs.data.l);
3264 nvgpu_kfree(g, gr->ctx_vars.ucode.gpccs.inst.l);
3265 nvgpu_kfree(g, gr->ctx_vars.ucode.gpccs.data.l);
3266 nvgpu_kfree(g, gr->ctx_vars.sw_bundle_init.l);
3267 nvgpu_kfree(g, gr->ctx_vars.sw_veid_bundle_init.l);
3268 nvgpu_kfree(g, gr->ctx_vars.sw_method_init.l);
3269 nvgpu_kfree(g, gr->ctx_vars.sw_ctx_load.l);
3270 nvgpu_kfree(g, gr->ctx_vars.sw_non_ctx_load.l);
3271 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.sys.l);
3272 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.gpc.l);
3273 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.tpc.l);
3274 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
3275 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.ppc.l);
3276 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_sys.l);
3277 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_gpc.l);
3278 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_tpc.l);
3279 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_ppc.l);
3280 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.perf_sys.l);
3281 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.fbp.l);
3282 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.perf_gpc.l);
3283 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.fbp_router.l);
3284 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.gpc_router.l);
3285 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_ltc.l);
3286 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_fbpa.l);
3287 nvgpu_kfree(g, gr->ctx_vars.sw_bundle64_init.l);
3288 nvgpu_kfree(g, gr->ctx_vars.ctxsw_regs.pm_cau.l);
3289
3290 nvgpu_vfree(g, gr->ctx_vars.local_golden_image);
3291 gr->ctx_vars.local_golden_image = NULL;
3292
3293 if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map) {
3294 nvgpu_big_free(g, gr->ctx_vars.hwpm_ctxsw_buffer_offset_map);
3295 }
3296 gr->ctx_vars.hwpm_ctxsw_buffer_offset_map = NULL;
3297
3298 gk20a_comptag_allocator_destroy(g, &gr->comp_tags);
3299
3300 nvgpu_ecc_remove_support(g);
3301}
3302
3303static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
3304{
3305 u32 gpc_index, pes_index;
3306 u32 pes_tpc_mask;
3307 u32 pes_tpc_count;
3308 u32 pes_heavy_index;
3309 u32 gpc_new_skip_mask;
3310 u32 tmp;
3311 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
3312 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
3313
3314 tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
3315 gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
3316
3317 tmp = gk20a_readl(g, top_num_gpcs_r());
3318 gr->max_gpc_count = top_num_gpcs_value_v(tmp);
3319
3320 tmp = gk20a_readl(g, top_num_fbps_r());
3321 gr->max_fbps_count = top_num_fbps_value_v(tmp);
3322
3323 gr->fbp_en_mask = g->ops.gr.get_fbp_en_mask(g);
3324
3325 if (gr->fbp_rop_l2_en_mask == NULL) {
3326 gr->fbp_rop_l2_en_mask =
3327 nvgpu_kzalloc(g, gr->max_fbps_count * sizeof(u32));
3328 if (gr->fbp_rop_l2_en_mask == NULL) {
3329 goto clean_up;
3330 }
3331 } else {
3332 memset(gr->fbp_rop_l2_en_mask, 0, gr->max_fbps_count *
3333 sizeof(u32));
3334 }
3335
3336 tmp = gk20a_readl(g, top_tpc_per_gpc_r());
3337 gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
3338
3339 gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
3340
3341 tmp = gk20a_readl(g, top_num_fbps_r());
3342 gr->sys_count = top_num_fbps_value_v(tmp);
3343
3344 tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
3345 gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
3346
3347 gr->pe_count_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC);
3348 if (WARN(gr->pe_count_per_gpc > GK20A_GR_MAX_PES_PER_GPC,
3349 "too many pes per gpc\n")) {
3350 goto clean_up;
3351 }
3352
3353 gr->max_zcull_per_gpc_count = nvgpu_get_litter_value(g, GPU_LIT_NUM_ZCULL_BANKS);
3354
3355 if (gr->gpc_count == 0U) {
3356 nvgpu_err(g, "gpc_count==0!");
3357 goto clean_up;
3358 }
3359
3360 if (gr->gpc_tpc_count == NULL) {
3361 gr->gpc_tpc_count = nvgpu_kzalloc(g, gr->gpc_count *
3362 sizeof(u32));
3363 } else {
3364 memset(gr->gpc_tpc_count, 0, gr->gpc_count *
3365 sizeof(u32));
3366 }
3367
3368 if (gr->gpc_tpc_mask == NULL) {
3369 gr->gpc_tpc_mask = nvgpu_kzalloc(g, gr->max_gpc_count *
3370 sizeof(u32));
3371 } else {
3372 memset(gr->gpc_tpc_mask, 0, gr->max_gpc_count *
3373 sizeof(u32));
3374 }
3375
3376 if (gr->gpc_zcb_count == NULL) {
3377 gr->gpc_zcb_count = nvgpu_kzalloc(g, gr->gpc_count *
3378 sizeof(u32));
3379 } else {
3380 memset(gr->gpc_zcb_count, 0, gr->gpc_count *
3381 sizeof(u32));
3382 }
3383
3384 if (gr->gpc_ppc_count == NULL) {
3385 gr->gpc_ppc_count = nvgpu_kzalloc(g, gr->gpc_count *
3386 sizeof(u32));
3387 } else {
3388 memset(gr->gpc_ppc_count, 0, gr->gpc_count *
3389 sizeof(u32));
3390 }
3391
3392 if (gr->gpc_skip_mask == NULL) {
3393 gr->gpc_skip_mask =
3394 nvgpu_kzalloc(g, gr_pd_dist_skip_table__size_1_v() *
3395 4 * sizeof(u32));
3396 } else {
3397 memset(gr->gpc_skip_mask, 0, gr_pd_dist_skip_table__size_1_v() *
3398 4 * sizeof(u32));
3399 }
3400
3401 if ((gr->gpc_tpc_count == NULL) || (gr->gpc_tpc_mask == NULL) ||
3402 (gr->gpc_zcb_count == NULL) || (gr->gpc_ppc_count == NULL) ||
3403 (gr->gpc_skip_mask == NULL)) {
3404 goto clean_up;
3405 }
3406
3407 for (gpc_index = 0; gpc_index < gr->max_gpc_count; gpc_index++) {
3408 if (g->ops.gr.get_gpc_tpc_mask) {
3409 gr->gpc_tpc_mask[gpc_index] =
3410 g->ops.gr.get_gpc_tpc_mask(g, gpc_index);
3411 }
3412 }
3413
3414 gr->ppc_count = 0;
3415 gr->tpc_count = 0;
3416 gr->zcb_count = 0;
3417 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3418 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r() +
3419 gpc_stride * gpc_index);
3420
3421 gr->gpc_tpc_count[gpc_index] =
3422 gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
3423 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
3424
3425 gr->gpc_zcb_count[gpc_index] =
3426 gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
3427 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
3428
3429 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
3430 if (gr->pes_tpc_count[pes_index] == NULL) {
3431 gr->pes_tpc_count[pes_index] =
3432 nvgpu_kzalloc(g, gr->gpc_count *
3433 sizeof(u32));
3434 gr->pes_tpc_mask[pes_index] =
3435 nvgpu_kzalloc(g, gr->gpc_count *
3436 sizeof(u32));
3437 if ((gr->pes_tpc_count[pes_index] == NULL) ||
3438 (gr->pes_tpc_mask[pes_index] == NULL)) {
3439 goto clean_up;
3440 }
3441 }
3442
3443 tmp = gk20a_readl(g,
3444 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
3445 gpc_index * gpc_stride);
3446
3447 pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
3448 pes_tpc_count = count_bits(pes_tpc_mask);
3449
3450 /* detect PES presence by seeing if there are
3451 * TPCs connected to it.
3452 */
3453 if (pes_tpc_count != 0) {
3454 gr->gpc_ppc_count[gpc_index]++;
3455 }
3456
3457 gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
3458 gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
3459 }
3460
3461 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
3462
3463 gpc_new_skip_mask = 0;
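 /* When the two PESes of a GPC share their TPCs unevenly (5 TPCs total, or 4
  * split unevenly), record the lowest-numbered TPC of the heavier PES in the
  * skip mask; x ^ (x & (x - 1)) below isolates that lowest set bit. This
  * appears intended to even out the TPC distribution between the PESes. */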
3464 if (gr->pe_count_per_gpc > 1 &&
3465 gr->pes_tpc_count[0][gpc_index] +
3466 gr->pes_tpc_count[1][gpc_index] == 5) {
3467 pes_heavy_index =
3468 gr->pes_tpc_count[0][gpc_index] >
3469 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3470
3471 gpc_new_skip_mask =
3472 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3473 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3474 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3475
3476 } else if (gr->pe_count_per_gpc > 1 &&
3477 (gr->pes_tpc_count[0][gpc_index] +
3478 gr->pes_tpc_count[1][gpc_index] == 4) &&
3479 (gr->pes_tpc_count[0][gpc_index] !=
3480 gr->pes_tpc_count[1][gpc_index])) {
3481 pes_heavy_index =
3482 gr->pes_tpc_count[0][gpc_index] >
3483 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3484
3485 gpc_new_skip_mask =
3486 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3487 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3488 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3489 }
3490 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
3491 }
3492
3493 /* allocate for max tpc per gpc */
3494 if (gr->sm_to_cluster == NULL) {
3495 gr->sm_to_cluster = nvgpu_kzalloc(g, gr->gpc_count *
3496 gr->max_tpc_per_gpc_count *
3497 sm_per_tpc * sizeof(struct sm_info));
3498 if (!gr->sm_to_cluster)
3499 goto clean_up;
3500 } else {
3501 memset(gr->sm_to_cluster, 0, gr->gpc_count *
3502 gr->max_tpc_per_gpc_count *
3503 sm_per_tpc * sizeof(struct sm_info));
3504 }
3505 gr->no_of_sm = 0;
3506
3507 nvgpu_log_info(g, "fbps: %d", gr->num_fbps);
3508 nvgpu_log_info(g, "max_gpc_count: %d", gr->max_gpc_count);
3509 nvgpu_log_info(g, "max_fbps_count: %d", gr->max_fbps_count);
3510 nvgpu_log_info(g, "max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
3511 nvgpu_log_info(g, "max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
3512 nvgpu_log_info(g, "max_tpc_count: %d", gr->max_tpc_count);
3513 nvgpu_log_info(g, "sys_count: %d", gr->sys_count);
3514 nvgpu_log_info(g, "gpc_count: %d", gr->gpc_count);
3515 nvgpu_log_info(g, "pe_count_per_gpc: %d", gr->pe_count_per_gpc);
3516 nvgpu_log_info(g, "tpc_count: %d", gr->tpc_count);
3517 nvgpu_log_info(g, "ppc_count: %d", gr->ppc_count);
3518
3519 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3520 nvgpu_log_info(g, "gpc_tpc_count[%d] : %d",
3521 gpc_index, gr->gpc_tpc_count[gpc_index]);
3522 }
3523 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3524 nvgpu_log_info(g, "gpc_zcb_count[%d] : %d",
3525 gpc_index, gr->gpc_zcb_count[gpc_index]);
3526 }
3527 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3528 nvgpu_log_info(g, "gpc_ppc_count[%d] : %d",
3529 gpc_index, gr->gpc_ppc_count[gpc_index]);
3530 }
3531 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3532 nvgpu_log_info(g, "gpc_skip_mask[%d] : %d",
3533 gpc_index, gr->gpc_skip_mask[gpc_index]);
3534 }
3535 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3536 for (pes_index = 0;
3537 pes_index < gr->pe_count_per_gpc;
3538 pes_index++) {
3539 nvgpu_log_info(g, "pes_tpc_count[%d][%d] : %d",
3540 pes_index, gpc_index,
3541 gr->pes_tpc_count[pes_index][gpc_index]);
3542 }
3543 }
3544
3545 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3546 for (pes_index = 0;
3547 pes_index < gr->pe_count_per_gpc;
3548 pes_index++) {
3549 nvgpu_log_info(g, "pes_tpc_mask[%d][%d] : %d",
3550 pes_index, gpc_index,
3551 gr->pes_tpc_mask[pes_index][gpc_index]);
3552 }
3553 }
3554
3555 g->ops.gr.bundle_cb_defaults(g);
3556 g->ops.gr.cb_size_default(g);
3557 g->ops.gr.calc_global_ctx_buffer_size(g);
3558 gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
3559
3560 nvgpu_log_info(g, "bundle_cb_default_size: %d",
3561 gr->bundle_cb_default_size);
3562 nvgpu_log_info(g, "min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
3563 nvgpu_log_info(g, "bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
3564 nvgpu_log_info(g, "attrib_cb_default_size: %d",
3565 gr->attrib_cb_default_size);
3566 nvgpu_log_info(g, "attrib_cb_size: %d", gr->attrib_cb_size);
3567 nvgpu_log_info(g, "alpha_cb_default_size: %d", gr->alpha_cb_default_size);
3568 nvgpu_log_info(g, "alpha_cb_size: %d", gr->alpha_cb_size);
3569 nvgpu_log_info(g, "timeslice_mode: %d", gr->timeslice_mode);
3570
3571 return 0;
3572
3573clean_up:
3574 return -ENOMEM;
3575}
3576
3577static u32 prime_set[18] = {
3578 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
3579
3580static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
3581{
3582 s32 comm_denom;
3583 s32 mul_factor;
3584 s32 *init_frac = NULL;
3585 s32 *init_err = NULL;
3586 s32 *run_err = NULL;
3587 s32 *sorted_num_tpcs = NULL;
3588 s32 *sorted_to_unsorted_gpc_map = NULL;
3589 u32 gpc_index;
3590 u32 gpc_mark = 0;
3591 u32 num_tpc;
3592 u32 max_tpc_count = 0;
3593 u32 swap;
3594 u32 tile_count;
3595 u32 index;
3596 bool delete_map = false;
3597 bool gpc_sorted;
3598 int ret = 0;
3599 int num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
3600 int num_tpc_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_TPC_PER_GPC);
3601 int map_tile_count = num_gpcs * num_tpc_per_gpc;
3602
3603 init_frac = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3604 init_err = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3605 run_err = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3606 sorted_num_tpcs =
3607 nvgpu_kzalloc(g, num_gpcs * num_tpc_per_gpc * sizeof(s32));
3608 sorted_to_unsorted_gpc_map =
3609 nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
3610
3611 if (!((init_frac != NULL) &&
3612 (init_err != NULL) &&
3613 (run_err != NULL) &&
3614 (sorted_num_tpcs != NULL) &&
3615 (sorted_to_unsorted_gpc_map != NULL))) {
3616 ret = -ENOMEM;
3617 goto clean_up;
3618 }
3619
3620 gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
3621
3622 if (gr->tpc_count == 3) {
3623 gr->map_row_offset = 2;
3624 } else if (gr->tpc_count < 3) {
3625 gr->map_row_offset = 1;
3626 } else {
3627 gr->map_row_offset = 3;
3628
3629 for (index = 1; index < 18; index++) {
3630 u32 prime = prime_set[index];
3631 if ((gr->tpc_count % prime) != 0) {
3632 gr->map_row_offset = prime;
3633 break;
3634 }
3635 }
3636 }
3637
3638 switch (gr->tpc_count) {
3639 case 15:
3640 gr->map_row_offset = 6;
3641 break;
3642 case 14:
3643 gr->map_row_offset = 5;
3644 break;
3645 case 13:
3646 gr->map_row_offset = 2;
3647 break;
3648 case 11:
3649 gr->map_row_offset = 7;
3650 break;
3651 case 10:
3652 gr->map_row_offset = 6;
3653 break;
3654 case 7:
3655 case 5:
3656 gr->map_row_offset = 1;
3657 break;
3658 default:
3659 break;
3660 }
3661
3662 if (gr->map_tiles) {
3663 if (gr->map_tile_count != gr->tpc_count) {
3664 delete_map = true;
3665 }
3666
3667 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3668 if (gr_gk20a_get_map_tile_count(gr, tile_count)
3669 >= gr->tpc_count) {
3670 delete_map = true;
3671 }
3672 }
3673
3674 if (delete_map) {
3675 nvgpu_kfree(g, gr->map_tiles);
3676 gr->map_tiles = NULL;
3677 gr->map_tile_count = 0;
3678 }
3679 }
3680
3681 if (gr->map_tiles == NULL) {
3682 gr->map_tiles = nvgpu_kzalloc(g, map_tile_count * sizeof(u8));
3683 if (gr->map_tiles == NULL) {
3684 ret = -ENOMEM;
3685 goto clean_up;
3686 }
3687 gr->map_tile_count = map_tile_count;
3688
3689 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3690 sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3691 sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3692 }
3693
3694 gpc_sorted = false;
3695 while (!gpc_sorted) {
3696 gpc_sorted = true;
3697 for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3698 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3699 gpc_sorted = false;
3700 swap = sorted_num_tpcs[gpc_index];
3701 sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3702 sorted_num_tpcs[gpc_index + 1] = swap;
3703 swap = sorted_to_unsorted_gpc_map[gpc_index];
3704 sorted_to_unsorted_gpc_map[gpc_index] =
3705 sorted_to_unsorted_gpc_map[gpc_index + 1];
3706 sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3707 }
3708 }
3709 }
3710
3711 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3712 if (gr->gpc_tpc_count[gpc_index] > max_tpc_count) {
3713 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3714 }
3715 }
3716
3717 mul_factor = gr->gpc_count * max_tpc_count;
3718 if (mul_factor & 0x1) {
3719 mul_factor = 2;
3720 } else {
3721 mul_factor = 1;
3722 }
3723
3724 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3725
3726 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3727 num_tpc = sorted_num_tpcs[gpc_index];
3728
3729 init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3730
3731 if (num_tpc != 0) {
3732 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3733 } else {
3734 init_err[gpc_index] = 0;
3735 }
3736
3737 run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3738 }
3739
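 /* Build map_tiles by interleaving GPC indices in proportion to each GPC's TPC
  * count: every pass adds init_frac to each GPC's running error and emits that
  * GPC's (unsorted) index whenever the error crosses half of comm_denom, a
  * Bresenham-style distribution. */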
3740 while (gpc_mark < gr->tpc_count) {
3741 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3742 if ((run_err[gpc_index] * 2) >= comm_denom) {
3743 gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3744 run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3745 } else {
3746 run_err[gpc_index] += init_frac[gpc_index];
3747 }
3748 }
3749 }
3750 }
3751
3752clean_up:
3753 nvgpu_kfree(g, init_frac);
3754 nvgpu_kfree(g, init_err);
3755 nvgpu_kfree(g, run_err);
3756 nvgpu_kfree(g, sorted_num_tpcs);
3757 nvgpu_kfree(g, sorted_to_unsorted_gpc_map);
3758
3759 if (ret) {
3760 nvgpu_err(g, "fail");
3761 } else {
3762 nvgpu_log_fn(g, "done");
3763 }
3764
3765 return ret;
3766}
3767
3768static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3769{
3770 struct gr_zcull_gk20a *zcull = &gr->zcull;
3771
3772 zcull->aliquot_width = gr->tpc_count * 16;
3773 zcull->aliquot_height = 16;
3774
3775 zcull->width_align_pixels = gr->tpc_count * 16;
3776 zcull->height_align_pixels = 32;
3777
3778 zcull->aliquot_size =
3779 zcull->aliquot_width * zcull->aliquot_height;
3780
3781 /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3782 zcull->pixel_squares_by_aliquots =
3783 gr->zcb_count * 16 * 16 * gr->tpc_count /
3784 (gr->gpc_count * gr->gpc_tpc_count[0]);
3785
3786 zcull->total_aliquots =
3787 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3788 gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3789
3790 return 0;
3791}
3792
3793u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3794{
3795 /* assuming gr has already been initialized */
3796 return gr->ctx_vars.zcull_ctxsw_image_size;
3797}
3798
3799int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3800 struct channel_gk20a *c, u64 zcull_va, u32 mode)
3801{
3802 struct tsg_gk20a *tsg;
3803 struct zcull_ctx_desc *zcull_ctx;
3804
3805 tsg = tsg_gk20a_from_ch(c);
3806 if (tsg == NULL) {
3807 return -EINVAL;
3808 }
3809
3810 zcull_ctx = &tsg->gr_ctx.zcull_ctx;
3811 zcull_ctx->ctx_sw_mode = mode;
3812 zcull_ctx->gpu_va = zcull_va;
3813
3814 /* TBD: don't disable channel in sw method processing */
3815 return gr_gk20a_ctx_zcull_setup(g, c);
3816}
3817
3818int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3819 struct gr_zcull_info *zcull_params)
3820{
3821 struct gr_zcull_gk20a *zcull = &gr->zcull;
3822
3823 zcull_params->width_align_pixels = zcull->width_align_pixels;
3824 zcull_params->height_align_pixels = zcull->height_align_pixels;
3825 zcull_params->pixel_squares_by_aliquots =
3826 zcull->pixel_squares_by_aliquots;
3827 zcull_params->aliquot_total = zcull->total_aliquots;
3828
3829 zcull_params->region_byte_multiplier =
3830 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3831 zcull_params->region_header_size =
3832 nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS) *
3833 gr_zcull_save_restore_header_bytes_per_gpc_v();
3834
3835 zcull_params->subregion_header_size =
3836 nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS) *
3837 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3838
3839 zcull_params->subregion_width_align_pixels =
3840 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3841 zcull_params->subregion_height_align_pixels =
3842 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3843 zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3844
3845 return 0;
3846}
3847
3848int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3849 struct zbc_entry *color_val, u32 index)
3850{
3851 u32 i;
3852
3853 /* update l2 table */
3854 g->ops.ltc.set_zbc_color_entry(g, color_val, index);
3855
3856 /* update ds table */
3857 gk20a_writel(g, gr_ds_zbc_color_r_r(),
3858 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3859 gk20a_writel(g, gr_ds_zbc_color_g_r(),
3860 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3861 gk20a_writel(g, gr_ds_zbc_color_b_r(),
3862 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3863 gk20a_writel(g, gr_ds_zbc_color_a_r(),
3864 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3865
3866 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3867 gr_ds_zbc_color_fmt_val_f(color_val->format));
3868
3869 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3870 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3871
3872 /* trigger the write */
3873 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3874 gr_ds_zbc_tbl_ld_select_c_f() |
3875 gr_ds_zbc_tbl_ld_action_write_f() |
3876 gr_ds_zbc_tbl_ld_trigger_active_f());
3877
3878 /* update local copy */
3879 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3880 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3881 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3882 }
3883 gr->zbc_col_tbl[index].format = color_val->format;
3884 gr->zbc_col_tbl[index].ref_cnt++;
3885
3886 return 0;
3887}
3888
3889int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3890 struct zbc_entry *depth_val, u32 index)
3891{
3892 /* update l2 table */
3893 g->ops.ltc.set_zbc_depth_entry(g, depth_val, index);
3894
3895 /* update ds table */
3896 gk20a_writel(g, gr_ds_zbc_z_r(),
3897 gr_ds_zbc_z_val_f(depth_val->depth));
3898
3899 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3900 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3901
3902 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3903 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3904
3905 /* trigger the write */
3906 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3907 gr_ds_zbc_tbl_ld_select_z_f() |
3908 gr_ds_zbc_tbl_ld_action_write_f() |
3909 gr_ds_zbc_tbl_ld_trigger_active_f());
3910
3911 /* update local copy */
3912 gr->zbc_dep_tbl[index].depth = depth_val->depth;
3913 gr->zbc_dep_tbl[index].format = depth_val->format;
3914 gr->zbc_dep_tbl[index].ref_cnt++;
3915
3916 return 0;
3917}
3918
3919void gr_gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
3920{
3921 struct fifo_gk20a *f = &g->fifo;
3922 struct fifo_engine_info_gk20a *gr_info = NULL;
3923 u32 ret;
3924 u32 engine_id;
3925
3926 engine_id = gk20a_fifo_get_gr_engine_id(g);
3927 gr_info = (f->engine_info + engine_id);
3928
3929 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3930 if (ret) {
3931 nvgpu_err(g,
3932 "failed to disable gr engine activity");
3933 return;
3934 }
3935
3936 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
3937 GR_IDLE_CHECK_DEFAULT);
3938 if (ret) {
3939 nvgpu_err(g,
3940 "failed to idle graphics");
3941 goto clean_up;
3942 }
3943
3944 /* update zbc */
3945 g->ops.gr.pmu_save_zbc(g, entries);
3946
3947clean_up:
3948 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3949 if (ret) {
3950 nvgpu_err(g,
3951 "failed to enable gr engine activity");
3952 }
3953}
3954
3955int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3956 struct zbc_entry *zbc_val)
3957{
3958 struct zbc_color_table *c_tbl;
3959 struct zbc_depth_table *d_tbl;
3960 u32 i;
3961 int ret = -ENOSPC;
3962 bool added = false;
3963 u32 entries;
3964
3965 /* no endian swap ? */
3966
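	/* Policy: reuse an existing table entry that matches the requested
	 * value (just bump its ref count); otherwise, if there is room,
	 * program the value into the next free slot via the per-chip
	 * add_zbc_color/add_zbc_depth hooks. */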
3967 nvgpu_mutex_acquire(&gr->zbc_lock);
3968 nvgpu_speculation_barrier();
3969 switch (zbc_val->type) {
3970 case GK20A_ZBC_TYPE_COLOR:
3971 /* search existing tables */
3972 for (i = 0; i < gr->max_used_color_index; i++) {
3973
3974 c_tbl = &gr->zbc_col_tbl[i];
3975
3976 if ((c_tbl->ref_cnt != 0U) &&
3977 (c_tbl->format == zbc_val->format) &&
3978 (memcmp(c_tbl->color_ds, zbc_val->color_ds,
3979 sizeof(zbc_val->color_ds)) == 0) &&
3980 (memcmp(c_tbl->color_l2, zbc_val->color_l2,
3981 sizeof(zbc_val->color_l2)) == 0)) {
3982
3983 added = true;
3984 c_tbl->ref_cnt++;
3985 ret = 0;
3986 break;
3987 }
3988 }
3989 /* add new table */
3990 if (!added &&
3991 gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3992
3993 c_tbl =
3994 &gr->zbc_col_tbl[gr->max_used_color_index];
3995 WARN_ON(c_tbl->ref_cnt != 0);
3996
3997 ret = g->ops.gr.add_zbc_color(g, gr,
3998 zbc_val, gr->max_used_color_index);
3999
4000 if (ret == 0) {
4001 gr->max_used_color_index++;
4002 }
4003 }
4004 break;
4005 case GK20A_ZBC_TYPE_DEPTH:
4006 /* search existing tables */
4007 for (i = 0; i < gr->max_used_depth_index; i++) {
4008
4009 d_tbl = &gr->zbc_dep_tbl[i];
4010
4011 if ((d_tbl->ref_cnt != 0U) &&
4012 (d_tbl->depth == zbc_val->depth) &&
4013 (d_tbl->format == zbc_val->format)) {
4014 added = true;
4015 d_tbl->ref_cnt++;
4016 ret = 0;
4017 break;
4018 }
4019 }
4020 /* add new table */
4021 if (!added &&
4022 gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
4023
4024 d_tbl =
4025 &gr->zbc_dep_tbl[gr->max_used_depth_index];
4026 WARN_ON(d_tbl->ref_cnt != 0);
4027
4028 ret = g->ops.gr.add_zbc_depth(g, gr,
4029 zbc_val, gr->max_used_depth_index);
4030
4031 if (ret == 0) {
4032 gr->max_used_depth_index++;
4033 }
4034 }
4035 break;
4036 case T19X_ZBC:
4037 if (g->ops.gr.add_zbc_type_s) {
4038 added = g->ops.gr.add_zbc_type_s(g, gr, zbc_val, &ret);
4039 } else {
4040 nvgpu_err(g,
4041 "invalid zbc table type %d", zbc_val->type);
4042 ret = -EINVAL;
4043 goto err_mutex;
4044 }
4045 break;
4046 default:
4047 nvgpu_err(g,
4048 "invalid zbc table type %d", zbc_val->type);
4049 ret = -EINVAL;
4050 goto err_mutex;
4051 }
4052
4053 if (!added && ret == 0) {
4054 /* update zbc for elpg only when new entry is added */
4055 entries = max(gr->max_used_color_index,
4056 gr->max_used_depth_index);
4057 g->ops.gr.pmu_save_zbc(g, entries);
4058 }
4059
4060err_mutex:
4061 nvgpu_mutex_release(&gr->zbc_lock);
4062 return ret;
4063}
4064
4065/* get a zbc table entry specified by index
4066 * return table size when type is invalid */
4067int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
4068 struct zbc_query_params *query_params)
4069{
4070 u32 index = query_params->index_size;
4071 u32 i;
4072
4073 nvgpu_speculation_barrier();
4074 switch (query_params->type) {
4075 case GK20A_ZBC_TYPE_INVALID:
4076 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
4077 break;
4078 case GK20A_ZBC_TYPE_COLOR:
4079 if (index >= GK20A_ZBC_TABLE_SIZE) {
4080 nvgpu_err(g,
4081 "invalid zbc color table index");
4082 return -EINVAL;
4083 }
4084
4085 nvgpu_speculation_barrier();
4086 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4087 query_params->color_l2[i] =
4088 gr->zbc_col_tbl[index].color_l2[i];
4089 query_params->color_ds[i] =
4090 gr->zbc_col_tbl[index].color_ds[i];
4091 }
4092 query_params->format = gr->zbc_col_tbl[index].format;
4093 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
4094 break;
4095 case GK20A_ZBC_TYPE_DEPTH:
4096 if (index >= GK20A_ZBC_TABLE_SIZE) {
4097 nvgpu_err(g,
4098 "invalid zbc depth table index");
4099 return -EINVAL;
4100 }
4101
4102 nvgpu_speculation_barrier();
4103 query_params->depth = gr->zbc_dep_tbl[index].depth;
4104 query_params->format = gr->zbc_dep_tbl[index].format;
4105 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
4106 break;
4107 case T19X_ZBC:
4108 if (g->ops.gr.zbc_s_query_table) {
4109 return g->ops.gr.zbc_s_query_table(g, gr,
4110 query_params);
4111 } else {
4112 nvgpu_err(g,
4113 "invalid zbc table type");
4114 return -EINVAL;
4115 }
4116 break;
4117 default:
4118 nvgpu_err(g,
4119 "invalid zbc table type");
4120 return -EINVAL;
4121 }
4122
4123 return 0;
4124}
4125
4126static int gr_gk20a_load_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
4127{
4128 unsigned int i;
4129 int ret;
4130
4131 for (i = 0; i < gr->max_used_color_index; i++) {
4132 struct zbc_color_table *c_tbl = &gr->zbc_col_tbl[i];
4133 struct zbc_entry zbc_val;
4134
4135 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
4136 memcpy(zbc_val.color_ds,
4137 c_tbl->color_ds, sizeof(zbc_val.color_ds));
4138 memcpy(zbc_val.color_l2,
4139 c_tbl->color_l2, sizeof(zbc_val.color_l2));
4140 zbc_val.format = c_tbl->format;
4141
4142 ret = g->ops.gr.add_zbc_color(g, gr, &zbc_val, i);
4143
4144 if (ret) {
4145 return ret;
4146 }
4147 }
4148 for (i = 0; i < gr->max_used_depth_index; i++) {
4149 struct zbc_depth_table *d_tbl = &gr->zbc_dep_tbl[i];
4150 struct zbc_entry zbc_val;
4151
4152 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
4153 zbc_val.depth = d_tbl->depth;
4154 zbc_val.format = d_tbl->format;
4155
4156 ret = g->ops.gr.add_zbc_depth(g, gr, &zbc_val, i);
4157 if (ret) {
4158 return ret;
4159 }
4160 }
4161
4162 if (g->ops.gr.load_zbc_s_tbl) {
4163 ret = g->ops.gr.load_zbc_s_tbl(g, gr);
4164 if (ret) {
4165 return ret;
4166 }
4167 }
4168
4169 return 0;
4170}
4171
4172int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
4173{
4174 struct zbc_entry zbc_val;
4175 u32 i = 0;
4176 int err = 0;
4177
4178 err = nvgpu_mutex_init(&gr->zbc_lock);
4179 if (err != 0) {
4180 nvgpu_err(g, "Error in zbc_lock mutex initialization");
4181 return err;
4182 }
4183
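	/* Note: 0x3f800000 below is IEEE-754 1.0f (full alpha / white color
	 * components and the 1.0 depth clear value). */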
4184 /* load default color table */
4185 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
4186
4187 /* Opaque black (i.e. solid black, fmt 0x28 = A8B8G8R8) */
4188 zbc_val.format = gr_ds_zbc_color_fmt_val_a8_b8_g8_r8_v();
4189 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4190 zbc_val.color_ds[i] = 0;
4191 zbc_val.color_l2[i] = 0;
4192 }
4193 zbc_val.color_l2[0] = 0xff000000;
4194 zbc_val.color_ds[3] = 0x3f800000;
4195 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4196 if (err != 0) {
4197 goto color_fail;
4198 }
4199
4200 /* Transparent black = (fmt 1 = zero) */
4201 zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
4202 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4203 zbc_val.color_ds[i] = 0;
4204 zbc_val.color_l2[i] = 0;
4205 }
4206 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4207 if (err != 0) {
4208 goto color_fail;
4209 }
4210
4211 /* Opaque white (i.e. solid white) = (fmt 2 = uniform 1) */
4212 zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
4213 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
4214 zbc_val.color_ds[i] = 0x3f800000;
4215 zbc_val.color_l2[i] = 0xffffffff;
4216 }
4217 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4218 if (err != 0) {
4219 goto color_fail;
4220 }
4221
4222 gr->max_default_color_index = 3;
4223
4224 /* load default depth table */
4225 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
4226
4227 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
4228 zbc_val.depth = 0x3f800000;
4229 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4230 if (err != 0) {
4231 goto depth_fail;
4232 }
4233
4234 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
4235 zbc_val.depth = 0;
4236 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
4237 if (err != 0) {
4238 goto depth_fail;
4239 }
4240
4241 gr->max_default_depth_index = 2;
4242
4243 if (g->ops.gr.load_zbc_s_default_tbl) {
4244 err = g->ops.gr.load_zbc_s_default_tbl(g, gr);
4245 if (err != 0) {
4246 return err;
4247 }
4248 }
4249
4250 return 0;
4251
4252color_fail:
4253 nvgpu_err(g, "fail to load default zbc color table");
4254 return err;
4255depth_fail:
4256 nvgpu_err(g, "fail to load default zbc depth table");
4257 return err;
4258}
4259
4260int _gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
4261 struct zbc_entry *zbc_val)
4262{
4263 struct fifo_gk20a *f = &g->fifo;
4264 struct fifo_engine_info_gk20a *gr_info = NULL;
4265 int ret;
4266 u32 engine_id;
4267
4268 engine_id = gk20a_fifo_get_gr_engine_id(g);
4269 gr_info = (f->engine_info + engine_id);
4270
4271 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
4272 if (ret) {
4273 nvgpu_err(g,
4274 "failed to disable gr engine activity");
4275 return ret;
4276 }
4277
4278 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
4279 GR_IDLE_CHECK_DEFAULT);
4280 if (ret) {
4281 nvgpu_err(g,
4282 "failed to idle graphics");
4283 goto clean_up;
4284 }
4285
4286 ret = gr_gk20a_add_zbc(g, gr, zbc_val);
4287
4288clean_up:
4289 if (gk20a_fifo_enable_engine_activity(g, gr_info)) {
4290 nvgpu_err(g,
4291 "failed to enable gr engine activity");
4292 }
4293
4294 return ret;
4295}
4296
4297int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
4298 struct zbc_entry *zbc_val)
4299{
4300 nvgpu_log_fn(g, " ");
4301
4302 return gr_gk20a_elpg_protected_call(g,
4303 gr_gk20a_add_zbc(g, gr, zbc_val));
4304}
4305
4306void gr_gk20a_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries,
4307 u32 *zcull_map_tiles)
4308{
4309 u32 val;
4310
4311 nvgpu_log_fn(g, " ");
4312
4313 if (zcull_num_entries >= 8) {
4314 nvgpu_log_fn(g, "map0");
4315 val =
4316 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(
4317 zcull_map_tiles[0]) |
4318 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(
4319 zcull_map_tiles[1]) |
4320 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(
4321 zcull_map_tiles[2]) |
4322 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(
4323 zcull_map_tiles[3]) |
4324 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(
4325 zcull_map_tiles[4]) |
4326 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(
4327 zcull_map_tiles[5]) |
4328 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(
4329 zcull_map_tiles[6]) |
4330 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(
4331 zcull_map_tiles[7]);
4332
4333 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(), val);
4334 }
4335
4336 if (zcull_num_entries >= 16) {
4337 nvgpu_log_fn(g, "map1");
4338 val =
4339 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(
4340 zcull_map_tiles[8]) |
4341 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(
4342 zcull_map_tiles[9]) |
4343 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(
4344 zcull_map_tiles[10]) |
4345 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(
4346 zcull_map_tiles[11]) |
4347 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(
4348 zcull_map_tiles[12]) |
4349 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(
4350 zcull_map_tiles[13]) |
4351 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(
4352 zcull_map_tiles[14]) |
4353 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(
4354 zcull_map_tiles[15]);
4355
4356 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(), val);
4357 }
4358
4359 if (zcull_num_entries >= 24) {
4360 nvgpu_log_fn(g, "map2");
4361 val =
4362 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(
4363 zcull_map_tiles[16]) |
4364 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(
4365 zcull_map_tiles[17]) |
4366 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(
4367 zcull_map_tiles[18]) |
4368 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(
4369 zcull_map_tiles[19]) |
4370 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(
4371 zcull_map_tiles[20]) |
4372 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(
4373 zcull_map_tiles[21]) |
4374 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(
4375 zcull_map_tiles[22]) |
4376 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(
4377 zcull_map_tiles[23]);
4378
4379 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(), val);
4380 }
4381
4382 if (zcull_num_entries >= 32) {
4383 nvgpu_log_fn(g, "map3");
4384 val =
4385 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(
4386 zcull_map_tiles[24]) |
4387 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(
4388 zcull_map_tiles[25]) |
4389 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(
4390 zcull_map_tiles[26]) |
4391 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(
4392 zcull_map_tiles[27]) |
4393 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(
4394 zcull_map_tiles[28]) |
4395 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(
4396 zcull_map_tiles[29]) |
4397 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(
4398 zcull_map_tiles[30]) |
4399 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(
4400 zcull_map_tiles[31]);
4401
4402 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(), val);
4403 }
4404
4405}
4406
4407static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
4408{
4409 u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
4410 u32 *zcull_map_tiles, *zcull_bank_counters;
4411 u32 map_counter;
4412 u32 rcp_conserv;
4413 u32 offset;
4414 bool floorsweep = false;
4415 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
4416 u32 num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
4417 u32 num_tpc_per_gpc = nvgpu_get_litter_value(g,
4418 GPU_LIT_NUM_TPC_PER_GPC);
4419 u32 zcull_alloc_num = num_gpcs * num_tpc_per_gpc;
4420 u32 map_tile_count;
4421
4422 if (gr->map_tiles == NULL) {
4423 return -1;
4424 }
4425
4426 if (zcull_alloc_num % 8 != 0) {
4427 /* Total 8 fields per map reg i.e. tile_0 to tile_7*/
4428 zcull_alloc_num += (zcull_alloc_num % 8);
4429 }
4430 zcull_map_tiles = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32));
4431
4432 if (zcull_map_tiles == NULL) {
4433 nvgpu_err(g,
4434 "failed to allocate zcull map titles");
4435 return -ENOMEM;
4436 }
4437
4438 zcull_bank_counters = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32));
4439
4440 if (zcull_bank_counters == NULL) {
4441 nvgpu_err(g,
4442 "failed to allocate zcull bank counters");
4443 nvgpu_kfree(g, zcull_map_tiles);
4444 return -ENOMEM;
4445 }
4446
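	/* For each entry of the GPC map, hand out the next unused zcull bank
	 * within that GPC; zcull_bank_counters[] counts how many banks each
	 * GPC has handed out so far. */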
4447 for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
4448 map_tile_count = gr_gk20a_get_map_tile_count(gr, map_counter);
4449 zcull_map_tiles[map_counter] =
4450 zcull_bank_counters[map_tile_count];
4451 zcull_bank_counters[map_tile_count]++;
4452 }
4453
4454 if (g->ops.gr.program_zcull_mapping != NULL) {
4455 g->ops.gr.program_zcull_mapping(g, zcull_alloc_num,
4456 zcull_map_tiles);
4457 }
4458
4459 nvgpu_kfree(g, zcull_map_tiles);
4460 nvgpu_kfree(g, zcull_bank_counters);
4461
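	/* A GPC with a non-zero zcull bank count below the per-GPC maximum
	 * marks the configuration as floorswept; the programming loop below
	 * then uses the maximum bank count for the hypertile row field
	 * instead of the per-GPC TPC count. */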
4462 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4463 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
4464 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
4465
4466 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4467 gpc_zcull_count < gpc_tpc_count) {
4468 nvgpu_err(g,
4469 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
4470 gpc_zcull_count, gpc_tpc_count, gpc_index);
4471 return -EINVAL;
4472 }
4473 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
4474 gpc_zcull_count != 0) {
4475 floorsweep = true;
4476 }
4477 }
4478
4479 /* ceil(1.0f / SM_NUM * gr_gpc0_zcull_sm_num_rcp_conservative__max_v()) */
4480 rcp_conserv = DIV_ROUND_UP(gr_gpc0_zcull_sm_num_rcp_conservative__max_v(),
4481 gr->gpc_tpc_count[0]);
4482
4483 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4484 offset = gpc_index * gpc_stride;
4485
4486 if (floorsweep) {
4487 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4488 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4489 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4490 gr->max_zcull_per_gpc_count));
4491 } else {
4492 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
4493 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
4494 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
4495 gr->gpc_tpc_count[gpc_index]));
4496 }
4497
4498 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
4499 gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
4500 gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
4501
4502 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
4503 gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
4504 }
4505
4506 gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
4507 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
4508
4509 return 0;
4510}
4511
4512void gk20a_gr_enable_exceptions(struct gk20a *g)
4513{
4514 gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4515 gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4516 gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4517 gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4518 gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4519 gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4520}
4521
4522void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4523{
4524 struct gr_gk20a *gr = &g->gr;
4525 u32 tpc_mask;
4526
4527 gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(),
4528 gr_gpcs_tpcs_tpccs_tpc_exception_en_tex_enabled_f() |
4529 gr_gpcs_tpcs_tpccs_tpc_exception_en_sm_enabled_f());
4530
4531 tpc_mask =
4532 gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->max_tpc_per_gpc_count) - 1);
4533
4534 gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), tpc_mask);
4535}
4536
4537
4538void gr_gk20a_enable_hww_exceptions(struct gk20a *g)
4539{
4540 /* enable exceptions */
4541 gk20a_writel(g, gr_fe_hww_esr_r(),
4542 gr_fe_hww_esr_en_enable_f() |
4543 gr_fe_hww_esr_reset_active_f());
4544 gk20a_writel(g, gr_memfmt_hww_esr_r(),
4545 gr_memfmt_hww_esr_en_enable_f() |
4546 gr_memfmt_hww_esr_reset_active_f());
4547}
4548
4549void gr_gk20a_fecs_host_int_enable(struct gk20a *g)
4550{
4551 gk20a_writel(g, gr_fecs_host_int_enable_r(),
4552 gr_fecs_host_int_enable_ctxsw_intr1_enable_f() |
4553 gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4554 gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4555 gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4556 gr_fecs_host_int_enable_watchdog_enable_f());
4557}
4558
4559static int gk20a_init_gr_setup_hw(struct gk20a *g)
4560{
4561 struct gr_gk20a *gr = &g->gr;
4562 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4563 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4564 u32 data;
4565 u32 last_method_data = 0;
4566 u32 i, err;
4567
4568 nvgpu_log_fn(g, " ");
4569
4570 if (g->ops.gr.init_gpc_mmu) {
4571 g->ops.gr.init_gpc_mmu(g);
4572 }
4573
4574 /* load gr floorsweeping registers */
4575 data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4576 data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4577 gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4578 gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4579
4580 gr_gk20a_zcull_init_hw(g, gr);
4581
4582 if (g->ops.priv_ring.set_ppriv_timeout_settings != NULL) {
4583 g->ops.priv_ring.set_ppriv_timeout_settings(g);
4584 }
4585
4586 /* enable fifo access */
4587 gk20a_writel(g, gr_gpfifo_ctl_r(),
4588 gr_gpfifo_ctl_access_enabled_f() |
4589 gr_gpfifo_ctl_semaphore_access_enabled_f());
4590
4591 /* TBD: reload gr ucode when needed */
4592
4593 /* enable interrupts */
4594 gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4595 gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4596
4597 /* enable fecs error interrupts */
4598 g->ops.gr.fecs_host_int_enable(g);
4599
4600 g->ops.gr.enable_hww_exceptions(g);
4601 g->ops.gr.set_hww_esr_report_mask(g);
4602
4603 /* enable TPC exceptions per GPC */
4604 if (g->ops.gr.enable_gpc_exceptions) {
4605 g->ops.gr.enable_gpc_exceptions(g);
4606 }
4607
4608 /* enable ECC for L1/SM */
4609 if (g->ops.gr.ecc_init_scrub_reg) {
4610 g->ops.gr.ecc_init_scrub_reg(g);
4611 }
4612
4613 /* TBD: enable per BE exceptions */
4614
4615 /* reset and enable exceptions */
4616 g->ops.gr.enable_exceptions(g);
4617
4618 gr_gk20a_load_zbc_table(g, gr);
4619
4620 if (g->ops.ltc.init_cbc) {
4621 g->ops.ltc.init_cbc(g, gr);
4622 }
4623
4624 if (g->ops.fb.init_cbc) {
4625 g->ops.fb.init_cbc(g, gr);
4626 }
4627
4628 if (g->ops.gr.disable_rd_coalesce) {
4629 g->ops.gr.disable_rd_coalesce(g);
4630 }
4631
4632 /* load ctx init */
4633 for (i = 0; i < sw_ctx_load->count; i++) {
4634 gk20a_writel(g, sw_ctx_load->l[i].addr,
4635 sw_ctx_load->l[i].value);
4636 }
4637
4638 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4639 GR_IDLE_CHECK_DEFAULT);
4640 if (err != 0U) {
4641 goto out;
4642 }
4643
4644 if (g->ops.gr.init_preemption_state) {
4645 err = g->ops.gr.init_preemption_state(g);
4646 if (err != 0U) {
4647 goto out;
4648 }
4649 }
4650
4651 /* disable fe_go_idle */
4652 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4653 gr_fe_go_idle_timeout_count_disabled_f());
4654
4655 /* override a few ctx state registers */
4656 g->ops.gr.commit_global_timeslice(g, NULL);
4657
4658 /* floorsweep anything left */
4659 err = g->ops.gr.init_fs_state(g);
4660 if (err != 0U) {
4661 goto out;
4662 }
4663
4664 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4665 GR_IDLE_CHECK_DEFAULT);
4666 if (err != 0U) {
4667 goto restore_fe_go_idle;
4668 }
4669
4670restore_fe_go_idle:
4671 /* restore fe_go_idle */
4672 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4673 gr_fe_go_idle_timeout_count_prod_f());
4674
4675 if ((err != 0U) || (gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4676 GR_IDLE_CHECK_DEFAULT) != 0)) {
4677 goto out;
4678 }
4679
4680 /* load method init */
4681 if (sw_method_init->count) {
4682 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4683 sw_method_init->l[0].value);
4684 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4685 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4686 sw_method_init->l[0].addr);
4687 last_method_data = sw_method_init->l[0].value;
4688 }
4689 for (i = 1; i < sw_method_init->count; i++) {
4690 if (sw_method_init->l[i].value != last_method_data) {
4691 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4692 sw_method_init->l[i].value);
4693 last_method_data = sw_method_init->l[i].value;
4694 }
4695 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4696 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4697 sw_method_init->l[i].addr);
4698 }
4699
4700 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4701 GR_IDLE_CHECK_DEFAULT);
4702out:
4703 nvgpu_log_fn(g, "done");
4704 return err;
4705}
4706
4707static int gk20a_init_gr_prepare(struct gk20a *g)
4708{
4709 u32 err = 0;
4710
4711 /* reset gr engine */
4712 g->ops.mc.reset(g, g->ops.mc.reset_mask(g, NVGPU_UNIT_GRAPH) |
4713 g->ops.mc.reset_mask(g, NVGPU_UNIT_BLG) |
4714 g->ops.mc.reset_mask(g, NVGPU_UNIT_PERFMON));
4715
4716 nvgpu_cg_init_gr_load_gating_prod(g);
4717
4718 /* Disable elcg until it gets enabled later in the init*/
4719 nvgpu_cg_elcg_disable_no_wait(g);
4720
4721 /* enable fifo access */
4722 gk20a_writel(g, gr_gpfifo_ctl_r(),
4723 gr_gpfifo_ctl_access_enabled_f() |
4724 gr_gpfifo_ctl_semaphore_access_enabled_f());
4725
4726 if (!g->gr.ctx_vars.valid) {
4727 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4728 if (err != 0U) {
4729 nvgpu_err(g,
4730 "fail to load gr init ctx");
4731 }
4732 }
4733 return err;
4734}
4735
4736static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g)
4737{
4738 struct nvgpu_timeout timeout;
4739 bool fecs_scrubbing;
4740 bool gpccs_scrubbing;
4741
4742 nvgpu_log_fn(g, " ");
4743
4744 nvgpu_timeout_init(g, &timeout,
4745 CTXSW_MEM_SCRUBBING_TIMEOUT_MAX /
4746 CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT,
4747 NVGPU_TIMER_RETRY_TIMER);
4748 do {
4749 fecs_scrubbing = gk20a_readl(g, gr_fecs_dmactl_r()) &
4750 (gr_fecs_dmactl_imem_scrubbing_m() |
4751 gr_fecs_dmactl_dmem_scrubbing_m());
4752
4753 gpccs_scrubbing = gk20a_readl(g, gr_gpccs_dmactl_r()) &
4754 (gr_gpccs_dmactl_imem_scrubbing_m() |
4755 gr_gpccs_dmactl_dmem_scrubbing_m());
4756
4757 if (!fecs_scrubbing && !gpccs_scrubbing) {
4758 nvgpu_log_fn(g, "done");
4759 return 0;
4760 }
4761
4762 nvgpu_udelay(CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT);
4763 } while (nvgpu_timeout_expired(&timeout) == 0);
4764
4765 nvgpu_err(g, "Falcon mem scrubbing timeout");
4766 return -ETIMEDOUT;
4767}
4768
4769static int gr_gk20a_init_ctxsw(struct gk20a *g)
4770{
4771 u32 err = 0;
4772
4773 err = g->ops.gr.load_ctxsw_ucode(g);
4774 if (err != 0U) {
4775 goto out;
4776 }
4777
4778 err = gr_gk20a_wait_ctxsw_ready(g);
4779 if (err != 0U) {
4780 goto out;
4781 }
4782
4783out:
4784 if (err != 0U) {
4785 nvgpu_err(g, "fail");
4786 } else {
4787 nvgpu_log_fn(g, "done");
4788 }
4789
4790 return err;
4791}
4792
4793static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4794{
4795 struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4796 u32 i, err = 0;
4797
4798 nvgpu_log_fn(g, " ");
4799
4800 /* enable interrupts */
4801 gk20a_writel(g, gr_intr_r(), ~0);
4802 gk20a_writel(g, gr_intr_en_r(), ~0);
4803
4804 /* load non_ctx init */
4805 for (i = 0; i < sw_non_ctx_load->count; i++) {
4806 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4807 sw_non_ctx_load->l[i].value);
4808 }
4809
4810 err = gr_gk20a_wait_mem_scrubbing(g);
4811 if (err != 0U) {
4812 goto out;
4813 }
4814
4815 err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
4816 GR_IDLE_CHECK_DEFAULT);
4817 if (err != 0U) {
4818 goto out;
4819 }
4820
4821out:
4822 if (err != 0U) {
4823 nvgpu_err(g, "fail");
4824 } else {
4825 nvgpu_log_fn(g, "done");
4826 }
4827
4828 return err;
4829}
4830
4831static int gr_gk20a_init_access_map(struct gk20a *g)
4832{
4833 struct gr_gk20a *gr = &g->gr;
4834 struct nvgpu_mem *mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem;
4835 u32 nr_pages =
4836 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4837 PAGE_SIZE);
4838 u32 *whitelist = NULL;
4839 int w, num_entries = 0;
4840
4841 nvgpu_memset(g, mem, 0, 0, PAGE_SIZE * nr_pages);
4842
4843 g->ops.gr.get_access_map(g, &whitelist, &num_entries);
4844
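	/* Each whitelisted register corresponds to one bit (one bit per
	 * 32-bit register offset) in the PRIV access map; set that bit. */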
4845 for (w = 0; w < num_entries; w++) {
4846 u32 map_bit, map_byte, map_shift, x;
4847 map_bit = whitelist[w] >> 2;
4848 map_byte = map_bit >> 3;
4849 map_shift = map_bit & 0x7; /* i.e. 0-7 */
4850 nvgpu_log_info(g, "access map addr:0x%x byte:0x%x bit:%d",
4851 whitelist[w], map_byte, map_shift);
4852 x = nvgpu_mem_rd32(g, mem, map_byte / sizeof(u32));
4853 x |= 1 << (
4854 (map_byte % sizeof(u32) * BITS_PER_BYTE)
4855 + map_shift);
4856 nvgpu_mem_wr32(g, mem, map_byte / sizeof(u32), x);
4857 }
4858
4859 return 0;
4860}
4861
4862static int gk20a_init_gr_setup_sw(struct gk20a *g)
4863{
4864 struct gr_gk20a *gr = &g->gr;
4865 int err = 0;
4866
4867 nvgpu_log_fn(g, " ");
4868
4869 if (gr->sw_ready) {
4870 nvgpu_log_fn(g, "skip init");
4871 return 0;
4872 }
4873
4874 gr->g = g;
4875
4876#if defined(CONFIG_GK20A_CYCLE_STATS)
4877 err = nvgpu_mutex_init(&g->gr.cs_lock);
4878 if (err != 0) {
4879 nvgpu_err(g, "Error in gr.cs_lock mutex initialization");
4880 return err;
4881 }
4882#endif
4883
4884 err = gr_gk20a_init_gr_config(g, gr);
4885 if (err != 0) {
4886 goto clean_up;
4887 }
4888
4889 err = gr_gk20a_init_map_tiles(g, gr);
4890 if (err != 0) {
4891 goto clean_up;
4892 }
4893
4894 if (g->ops.ltc.init_comptags) {
4895 err = g->ops.ltc.init_comptags(g, gr);
4896 if (err != 0) {
4897 goto clean_up;
4898 }
4899 }
4900
4901 err = gr_gk20a_init_zcull(g, gr);
4902 if (err != 0) {
4903 goto clean_up;
4904 }
4905
4906 err = g->ops.gr.alloc_global_ctx_buffers(g);
4907 if (err != 0) {
4908 goto clean_up;
4909 }
4910
4911 err = gr_gk20a_init_access_map(g);
4912 if (err != 0) {
4913 goto clean_up;
4914 }
4915
4916 gr_gk20a_load_zbc_default_table(g, gr);
4917
4918 if (g->ops.gr.init_czf_bypass) {
4919 g->ops.gr.init_czf_bypass(g);
4920 }
4921
4922 if (g->ops.gr.init_gfxp_wfi_timeout_count) {
4923 g->ops.gr.init_gfxp_wfi_timeout_count(g);
4924 }
4925
4926 err = nvgpu_mutex_init(&gr->ctx_mutex);
4927 if (err != 0) {
4928 nvgpu_err(g, "Error in gr.ctx_mutex initialization");
4929 goto clean_up;
4930 }
4931
4932 nvgpu_spinlock_init(&gr->ch_tlb_lock);
4933
4934 gr->remove_support = gk20a_remove_gr_support;
4935 gr->sw_ready = true;
4936
4937 err = nvgpu_ecc_init_support(g);
4938 if (err != 0) {
4939 goto clean_up;
4940 }
4941
4942 nvgpu_log_fn(g, "done");
4943 return 0;
4944
4945clean_up:
4946 nvgpu_err(g, "fail");
4947 gk20a_remove_gr_support(gr);
4948 return err;
4949}
4950
4951static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g)
4952{
4953 struct nvgpu_pmu *pmu = &g->pmu;
4954 struct mm_gk20a *mm = &g->mm;
4955 struct vm_gk20a *vm = mm->pmu.vm;
4956 int err = 0;
4957
4958 u32 size;
4959
4960 nvgpu_log_fn(g, " ");
4961
4962 size = 0;
4963
4964 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
4965 if (err != 0) {
4966 nvgpu_err(g,
4967 "fail to query fecs pg buffer size");
4968 return err;
4969 }
4970
4971 if (pmu->pg_buf.cpu_va == NULL) {
4972 err = nvgpu_dma_alloc_map_sys(vm, size, &pmu->pg_buf);
4973 if (err != 0) {
4974 nvgpu_err(g, "failed to allocate memory");
4975 return -ENOMEM;
4976 }
4977 }
4978
4979
4980 err = gr_gk20a_fecs_set_reglist_bind_inst(g, &mm->pmu.inst_block);
4981 if (err != 0) {
4982 nvgpu_err(g,
4983 "fail to bind pmu inst to gr");
4984 return err;
4985 }
4986
4987 err = gr_gk20a_fecs_set_reglist_virtual_addr(g, pmu->pg_buf.gpu_va);
4988 if (err != 0) {
4989 nvgpu_err(g,
4990 "fail to set pg buffer pmu va");
4991 return err;
4992 }
4993
4994 return err;
4995}
4996
4997int gk20a_init_gr_support(struct gk20a *g)
4998{
4999 int err = 0;
5000
5001 nvgpu_log_fn(g, " ");
5002
5003 g->gr.initialized = false;
5004
5005 /* this is required before gr_gk20a_init_ctx_state */
5006 err = nvgpu_mutex_init(&g->gr.fecs_mutex);
5007 if (err != 0) {
5008 nvgpu_err(g, "Error in gr.fecs_mutex initialization");
5009 return err;
5010 }
5011
5012 err = gr_gk20a_init_ctxsw(g);
5013 if (err != 0) {
5014 return err;
5015 }
5016
5017 /* this appears to query sw state, but fecs actually initializes the
5018 ramchain, etc., so this is hw init */
5019 err = g->ops.gr.init_ctx_state(g);
5020 if (err != 0) {
5021 return err;
5022 }
5023
5024 err = gk20a_init_gr_setup_sw(g);
5025 if (err != 0) {
5026 return err;
5027 }
5028
5029 err = gk20a_init_gr_setup_hw(g);
5030 if (err != 0) {
5031 return err;
5032 }
5033
5034 if (g->can_elpg) {
5035 err = gk20a_init_gr_bind_fecs_elpg(g);
5036 if (err != 0) {
5037 return err;
5038 }
5039 }
5040
5041 /* GR is initialized, signal possible waiters */
5042 g->gr.initialized = true;
5043 nvgpu_cond_signal(&g->gr.init_wq);
5044
5045 return 0;
5046}
5047
5048/* Wait until GR is initialized */
5049void gk20a_gr_wait_initialized(struct gk20a *g)
5050{
5051 NVGPU_COND_WAIT(&g->gr.init_wq, g->gr.initialized, 0);
5052}
5053
5054#define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
5055#define NVA297_SET_CIRCULAR_BUFFER_SIZE 0x1280
5056#define NVA297_SET_SHADER_EXCEPTIONS 0x1528
5057#define NVA0C0_SET_SHADER_EXCEPTIONS 0x1528
5058
5059#define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
5060
5061void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data)
5062{
5063 nvgpu_log_fn(g, " ");
5064
5065 if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) {
5066 gk20a_writel(g,
5067 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0);
5068 gk20a_writel(g,
5069 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0);
5070 } else {
5071 /* setup sm warp esr report masks */
5072 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
5073 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
5074 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
5075 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
5076 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
5077 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
5078 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
5079 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
5080 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
5081 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
5082 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
5083 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
5084 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
5085 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
5086 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
5087 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
5088 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
5089 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
5090 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
5091 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
5092 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
5093
5094 /* setup sm global esr report mask */
5095 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
5096 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
5097 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
5098 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
5099 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
5100 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
5101 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
5102 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
5103 }
5104}
5105
5106int gk20a_enable_gr_hw(struct gk20a *g)
5107{
5108 int err;
5109
5110 nvgpu_log_fn(g, " ");
5111
5112 err = gk20a_init_gr_prepare(g);
5113 if (err != 0) {
5114 return err;
5115 }
5116
5117 err = gk20a_init_gr_reset_enable_hw(g);
5118 if (err != 0) {
5119 return err;
5120 }
5121
5122 nvgpu_log_fn(g, "done");
5123
5124 return 0;
5125}
5126
5127int gk20a_gr_reset(struct gk20a *g)
5128{
5129 int err;
5130 u32 size;
5131
5132 g->gr.initialized = false;
5133
5134 nvgpu_mutex_acquire(&g->gr.fecs_mutex);
5135
5136 err = gk20a_enable_gr_hw(g);
5137 if (err != 0) {
5138 nvgpu_mutex_release(&g->gr.fecs_mutex);
5139 return err;
5140 }
5141
5142 err = gk20a_init_gr_setup_hw(g);
5143 if (err != 0) {
5144 nvgpu_mutex_release(&g->gr.fecs_mutex);
5145 return err;
5146 }
5147
5148 err = gr_gk20a_init_ctxsw(g);
5149 if (err != 0) {
5150 nvgpu_mutex_release(&g->gr.fecs_mutex);
5151 return err;
5152 }
5153
5154 nvgpu_mutex_release(&g->gr.fecs_mutex);
5155
5156 /* this appears to query sw state, but fecs actually initializes the
5157 ramchain, etc., so this is hw init */
5158 err = g->ops.gr.init_ctx_state(g);
5159 if (err != 0) {
5160 return err;
5161 }
5162
5163 size = 0;
5164 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
5165 if (err != 0) {
5166 nvgpu_err(g,
5167 "fail to query fecs pg buffer size");
5168 return err;
5169 }
5170
5171 err = gr_gk20a_fecs_set_reglist_bind_inst(g, &g->mm.pmu.inst_block);
5172 if (err != 0) {
5173 nvgpu_err(g,
5174 "fail to bind pmu inst to gr");
5175 return err;
5176 }
5177
5178 err = gr_gk20a_fecs_set_reglist_virtual_addr(g, g->pmu.pg_buf.gpu_va);
5179 if (err != 0) {
5180 nvgpu_err(g,
5181 "fail to set pg buffer pmu va");
5182 return err;
5183 }
5184
5185 nvgpu_cg_init_gr_load_gating_prod(g);
5186 nvgpu_cg_elcg_enable_no_wait(g);
5187
5188 /* GR is initialized, signal possible waiters */
5189 g->gr.initialized = true;
5190 nvgpu_cond_signal(&g->gr.init_wq);
5191
5192 return err;
5193}
5194
5195static void gk20a_gr_set_error_notifier(struct gk20a *g,
5196 struct gr_gk20a_isr_data *isr_data, u32 error_notifier)
5197{
5198 struct channel_gk20a *ch;
5199 struct tsg_gk20a *tsg;
5200 struct channel_gk20a *ch_tsg;
5201
5202 ch = isr_data->ch;
5203
5204 if (ch == NULL) {
5205 return;
5206 }
5207
5208 tsg = tsg_gk20a_from_ch(ch);
5209 if (tsg != NULL) {
5210 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
5211 nvgpu_list_for_each_entry(ch_tsg, &tsg->ch_list,
5212 channel_gk20a, ch_entry) {
5213 if (gk20a_channel_get(ch_tsg)) {
5214 g->ops.fifo.set_error_notifier(ch_tsg,
5215 error_notifier);
5216 gk20a_channel_put(ch_tsg);
5217 }
5218
5219 }
5220 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
5221 } else {
5222 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
5223 }
5224}
5225
5226static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
5227 struct gr_gk20a_isr_data *isr_data)
5228{
5229 nvgpu_log_fn(g, " ");
5230 gk20a_gr_set_error_notifier(g, isr_data,
5231 NVGPU_ERR_NOTIFIER_GR_SEMAPHORE_TIMEOUT);
5232 nvgpu_err(g,
5233 "gr semaphore timeout");
5234 return -EINVAL;
5235}
5236
5237static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
5238 struct gr_gk20a_isr_data *isr_data)
5239{
5240 nvgpu_log_fn(g, " ");
5241 gk20a_gr_set_error_notifier(g, isr_data,
5242 NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
5243 /* This is an unrecoverable error, reset is needed */
5244 nvgpu_err(g,
5245 "gr semaphore timeout");
5246 return -EINVAL;
5247}
5248
5249static int gk20a_gr_handle_illegal_method(struct gk20a *g,
5250 struct gr_gk20a_isr_data *isr_data)
5251{
5252 int ret = g->ops.gr.handle_sw_method(g, isr_data->addr,
5253 isr_data->class_num, isr_data->offset,
5254 isr_data->data_lo);
5255 if (ret) {
5256 gk20a_gr_set_error_notifier(g, isr_data,
5257 NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
5258 nvgpu_err(g, "invalid method class 0x%08x"
5259 ", offset 0x%08x address 0x%08x",
5260 isr_data->class_num, isr_data->offset, isr_data->addr);
5261 }
5262 return ret;
5263}
5264
5265static int gk20a_gr_handle_illegal_class(struct gk20a *g,
5266 struct gr_gk20a_isr_data *isr_data)
5267{
5268 nvgpu_log_fn(g, " ");
5269 gk20a_gr_set_error_notifier(g, isr_data,
5270 NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
5271 nvgpu_err(g,
5272 "invalid class 0x%08x, offset 0x%08x",
5273 isr_data->class_num, isr_data->offset);
5274 return -EINVAL;
5275}
5276
5277int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
5278 struct gr_gk20a_isr_data *isr_data)
5279{
5280 u32 gr_fecs_intr = gk20a_readl(g, gr_fecs_host_int_status_r());
5281 int ret = 0;
5282 u32 chid = isr_data->ch != NULL ?
5283 isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID;
5284
5285 if (gr_fecs_intr == 0U) {
5286 return 0;
5287 }
5288
5289 if (gr_fecs_intr & gr_fecs_host_int_status_umimp_firmware_method_f(1)) {
5290 gk20a_gr_set_error_notifier(g, isr_data,
5291 NVGPU_ERR_NOTIFIER_FECS_ERR_UNIMP_FIRMWARE_METHOD);
5292 nvgpu_err(g,
5293 "firmware method error 0x%08x for offset 0x%04x",
5294 gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6)),
5295 isr_data->data_lo);
5296 ret = -1;
5297 } else if ((gr_fecs_intr &
5298 gr_fecs_host_int_status_watchdog_active_f()) != 0U) {
5299 /* currently, recovery is not initiated */
5300 nvgpu_err(g, "fecs watchdog triggered for channel %u, "
5301 "cannot ctxsw anymore !!", chid);
5302 gk20a_fecs_dump_falcon_stats(g);
5303 } else if ((gr_fecs_intr &
5304 gr_fecs_host_int_status_ctxsw_intr_f(CTXSW_INTR0)) != 0U) {
5305 u32 mailbox_value = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6));
5306
5307 if (mailbox_value == MAILBOX_VALUE_TIMESTAMP_BUFFER_FULL) {
5308 nvgpu_info(g, "ctxsw intr0 set by ucode, "
5309 "timestamp buffer full");
5310#ifdef CONFIG_GK20A_CTXSW_TRACE
5311 gk20a_fecs_trace_reset_buffer(g);
5312#else
5313 ret = -1;
5314#endif
5315 } else {
5316 nvgpu_err(g,
5317 "ctxsw intr0 set by ucode, error_code: 0x%08x",
5318 mailbox_value);
5319 ret = -1;
5320 }
5321 } else {
5322 nvgpu_err(g,
5323 "unhandled fecs error interrupt 0x%08x for channel %u",
5324 gr_fecs_intr, chid);
5325 gk20a_fecs_dump_falcon_stats(g);
5326 }
5327
5328 gk20a_writel(g, gr_fecs_host_int_clear_r(), gr_fecs_intr);
5329 return ret;
5330}
5331
5332static int gk20a_gr_handle_class_error(struct gk20a *g,
5333 struct gr_gk20a_isr_data *isr_data)
5334{
5335 u32 gr_class_error;
5336 u32 chid = isr_data->ch != NULL ?
5337 isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID;
5338
5339 nvgpu_log_fn(g, " ");
5340
5341 gr_class_error =
5342 gr_class_error_code_v(gk20a_readl(g, gr_class_error_r()));
5343 gk20a_gr_set_error_notifier(g, isr_data,
5344 NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
5345 nvgpu_err(g, "class error 0x%08x, offset 0x%08x,"
5346 "sub channel 0x%08x mme generated %d,"
5347 " mme pc 0x%08xdata high %d priv status %d"
5348 " unhandled intr 0x%08x for channel %u",
5349 isr_data->class_num, (isr_data->offset << 2),
5350 gr_trapped_addr_subch_v(isr_data->addr),
5351 gr_trapped_addr_mme_generated_v(isr_data->addr),
5352 gr_trapped_data_mme_pc_v(
5353 gk20a_readl(g, gr_trapped_data_mme_r())),
5354 gr_trapped_addr_datahigh_v(isr_data->addr),
5355 gr_trapped_addr_priv_v(isr_data->addr),
5356 gr_class_error, chid);
5357
5358 nvgpu_err(g, "trapped data low 0x%08x",
5359 gk20a_readl(g, gr_trapped_data_lo_r()));
5360 if (gr_trapped_addr_datahigh_v(isr_data->addr)) {
5361 nvgpu_err(g, "trapped data high 0x%08x",
5362 gk20a_readl(g, gr_trapped_data_hi_r()));
5363 }
5364
5365 return -EINVAL;
5366}
5367
5368static int gk20a_gr_handle_firmware_method(struct gk20a *g,
5369 struct gr_gk20a_isr_data *isr_data)
5370{
5371 u32 chid = isr_data->ch != NULL ?
5372 isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID;
5373
5374 nvgpu_log_fn(g, " ");
5375
5376 gk20a_gr_set_error_notifier(g, isr_data,
5377 NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
5378 nvgpu_err(g,
5379 "firmware method 0x%08x, offset 0x%08x for channel %u",
5380 isr_data->class_num, isr_data->offset,
5381 chid);
5382 return -EINVAL;
5383}
5384
5385int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
5386 struct gr_gk20a_isr_data *isr_data)
5387{
5388 struct channel_gk20a *ch = isr_data->ch;
5389 struct tsg_gk20a *tsg;
5390
5391 if (ch == NULL) {
5392 return 0;
5393 }
5394
5395 tsg = tsg_gk20a_from_ch(ch);
5396 if (tsg != NULL) {
5397 g->ops.fifo.post_event_id(tsg,
5398 NVGPU_EVENT_ID_GR_SEMAPHORE_WRITE_AWAKEN);
5399
5400 nvgpu_cond_broadcast(&ch->semaphore_wq);
5401 } else {
5402 nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
5403 }
5404
5405 return 0;
5406}
5407
5408#if defined(CONFIG_GK20A_CYCLE_STATS)
5409static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g,
5410 u32 offset)
5411{
5412 /* support only 24-bit 4-byte aligned offsets */
5413 bool valid = !(offset & 0xFF000003);
5414
5415 if (g->allow_all)
5416 return true;
5417
5418 /* whitelist check */
5419 valid = valid &&
5420 is_bar0_global_offset_whitelisted_gk20a(g, offset);
5421 /* resource size check in case there was a problem
5422 * with allocating the assumed size of bar0 */
5423 valid = valid && gk20a_io_valid_reg(g, offset);
5424 return valid;
5425}
5426#endif
5427
5428int gk20a_gr_handle_notify_pending(struct gk20a *g,
5429 struct gr_gk20a_isr_data *isr_data)
5430{
5431 struct channel_gk20a *ch = isr_data->ch;
5432
5433#if defined(CONFIG_GK20A_CYCLE_STATS)
5434 void *virtual_address;
5435 u32 buffer_size;
5436 u32 offset;
5437 bool exit;
5438#endif
5439 if (ch == NULL || tsg_gk20a_from_ch(ch) == NULL) {
5440 return 0;
5441 }
5442
5443#if defined(CONFIG_GK20A_CYCLE_STATS)
5444 /* GL will never use payload 0 for cycle state */
5445 if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
5446 return 0;
5447
5448 nvgpu_mutex_acquire(&ch->cyclestate.cyclestate_buffer_mutex);
5449
5450 virtual_address = ch->cyclestate.cyclestate_buffer;
5451 buffer_size = ch->cyclestate.cyclestate_buffer_size;
5452 offset = isr_data->data_lo;
5453 exit = false;
5454 while (!exit) {
5455 struct share_buffer_head *sh_hdr;
5456 u32 min_element_size;
5457
5458 /* validate offset */
5459 if (offset + sizeof(struct share_buffer_head) > buffer_size ||
5460 offset + sizeof(struct share_buffer_head) < offset) {
5461 nvgpu_err(g,
5462 "cyclestats buffer overrun at offset 0x%x",
5463 offset);
5464 break;
5465 }
5466
5467 sh_hdr = (struct share_buffer_head *)
5468 ((char *)virtual_address + offset);
5469
5470 min_element_size =
5471 (sh_hdr->operation == OP_END ?
5472 sizeof(struct share_buffer_head) :
5473 sizeof(struct gk20a_cyclestate_buffer_elem));
5474
5475 /* validate sh_hdr->size */
5476 if (sh_hdr->size < min_element_size ||
5477 offset + sh_hdr->size > buffer_size ||
5478 offset + sh_hdr->size < offset) {
5479 nvgpu_err(g,
5480 "bad cyclestate buffer header size at offset 0x%x",
5481 offset);
5482 sh_hdr->failed = true;
5483 break;
5484 }
5485
5486 switch (sh_hdr->operation) {
5487 case OP_END:
5488 exit = true;
5489 break;
5490
5491 case BAR0_READ32:
5492 case BAR0_WRITE32:
5493 {
5494 struct gk20a_cyclestate_buffer_elem *op_elem =
5495 (struct gk20a_cyclestate_buffer_elem *)sh_hdr;
5496 bool valid = is_valid_cyclestats_bar0_offset_gk20a(
5497 g, op_elem->offset_bar0);
5498 u32 raw_reg;
5499 u64 mask_orig;
5500 u64 v;
5501
5502 if (!valid) {
5503 nvgpu_err(g,
5504 "invalid cycletstats op offset: 0x%x",
5505 op_elem->offset_bar0);
5506
5507 sh_hdr->failed = exit = true;
5508 break;
5509 }
5510
5511
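	/* mask covering bits first_bit..last_bit inclusive */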
5512 mask_orig =
5513 ((1ULL <<
5514 (op_elem->last_bit + 1))
5515 -1)&~((1ULL <<
5516 op_elem->first_bit)-1);
5517
5518 raw_reg =
5519 gk20a_readl(g,
5520 op_elem->offset_bar0);
5521
5522 switch (sh_hdr->operation) {
5523 case BAR0_READ32:
5524 op_elem->data =
5525 (raw_reg & mask_orig)
5526 >> op_elem->first_bit;
5527 break;
5528
5529 case BAR0_WRITE32:
5530 v = 0;
5531 if ((unsigned int)mask_orig !=
5532 (unsigned int)~0) {
5533 v = (unsigned int)
5534 (raw_reg & ~mask_orig);
5535 }
5536
5537 v |= ((op_elem->data
5538 << op_elem->first_bit)
5539 & mask_orig);
5540
5541 gk20a_writel(g,
5542 op_elem->offset_bar0,
5543 (unsigned int)v);
5544 break;
5545 default:
5546 /* nop ok?*/
5547 break;
5548 }
5549 }
5550 break;
5551
5552 default:
5553 /* no operation content case */
5554 exit = true;
5555 break;
5556 }
5557 sh_hdr->completed = true;
5558 offset += sh_hdr->size;
5559 }
5560 nvgpu_mutex_release(&ch->cyclestate.cyclestate_buffer_mutex);
5561#endif
5562 nvgpu_log_fn(g, " ");
5563 nvgpu_cond_broadcast_interruptible(&ch->notifier_wq);
5564 return 0;
5565}
5566
5567/* Used by sw interrupt thread to translate current ctx to chid.
5568 * Also used by regops to translate current ctx to chid and tsgid.
5569 * For performance, we don't want to go through 128 channels every time.
5570 * curr_ctx should be the value read from gr_fecs_current_ctx_r().
5571 * A small tlb is used here to cache translation.
5572 *
5573 * Returned channel must be freed with gk20a_channel_put() */
5574static struct channel_gk20a *gk20a_gr_get_channel_from_ctx(
5575 struct gk20a *g, u32 curr_ctx, u32 *curr_tsgid)
5576{
5577 struct fifo_gk20a *f = &g->fifo;
5578 struct gr_gk20a *gr = &g->gr;
5579 u32 chid = -1;
5580 u32 tsgid = NVGPU_INVALID_TSG_ID;
5581 u32 i;
5582 struct channel_gk20a *ret = NULL;
5583
5584 /* when contexts are unloaded from GR, the valid bit is reset
5585 * but the instance pointer information remains intact.
5586 * This might be called from gr_isr where contexts might be
5587 * unloaded. No need to check ctx_valid bit
5588 */
5589
5590 nvgpu_spinlock_acquire(&gr->ch_tlb_lock);
5591
5592 /* check cache first */
5593 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5594 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
5595 chid = gr->chid_tlb[i].chid;
5596 tsgid = gr->chid_tlb[i].tsgid;
5597 ret = gk20a_channel_from_id(g, chid);
5598 goto unlock;
5599 }
5600 }
5601
5602 /* slow path */
5603 for (chid = 0; chid < f->num_channels; chid++) {
5604 struct channel_gk20a *ch = gk20a_channel_from_id(g, chid);
5605
5606 if (ch == NULL) {
5607 continue;
5608 }
5609
5610 if ((u32)(nvgpu_inst_block_addr(g, &ch->inst_block) >>
5611 ram_in_base_shift_v()) ==
5612 gr_fecs_current_ctx_ptr_v(curr_ctx)) {
5613 tsgid = ch->tsgid;
5614 /* found it */
5615 ret = ch;
5616 break;
5617 }
5618 gk20a_channel_put(ch);
5619 }
5620
5621 if (ret == NULL) {
5622 goto unlock;
5623 }
5624
5625 /* add to free tlb entry */
5626 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
5627 if (gr->chid_tlb[i].curr_ctx == 0) {
5628 gr->chid_tlb[i].curr_ctx = curr_ctx;
5629 gr->chid_tlb[i].chid = chid;
5630 gr->chid_tlb[i].tsgid = tsgid;
5631 goto unlock;
5632 }
5633 }
5634
5635 /* no free entry, flush one */
5636 gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5637 gr->chid_tlb[gr->channel_tlb_flush_index].chid = chid;
5638 gr->chid_tlb[gr->channel_tlb_flush_index].tsgid = tsgid;
5639
5640 gr->channel_tlb_flush_index =
5641 (gr->channel_tlb_flush_index + 1) &
5642 (GR_CHANNEL_MAP_TLB_SIZE - 1);
5643
5644unlock:
5645 nvgpu_spinlock_release(&gr->ch_tlb_lock);
5646 if (curr_tsgid) {
5647 *curr_tsgid = tsgid;
5648 }
5649 return ret;
5650}
5651
5652int gk20a_gr_lock_down_sm(struct gk20a *g,
5653 u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask,
5654 bool check_errors)
5655{
5656 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5657 u32 dbgr_control0;
5658
5659 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5660 "GPC%d TPC%d SM%d: assert stop trigger", gpc, tpc, sm);
5661
5662 /* assert stop trigger */
5663 dbgr_control0 =
5664 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
5665 dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5666 gk20a_writel(g,
5667 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
5668
5669 return g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm, global_esr_mask,
5670 check_errors);
5671}
5672
5673bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5674{
5675 u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5676
5677 /* check if an sm debugger is attached.
5678 * assumption: all SMs will have debug mode enabled/disabled
5679 * uniformly. */
5680 if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5681 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v()) {
5682 return true;
5683 }
5684
5685 return false;
5686}
5687
5688int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
5689 bool *post_event, struct channel_gk20a *fault_ch,
5690 u32 *hww_global_esr)
5691{
5692 int ret = 0;
5693 bool do_warp_sync = false, early_exit = false, ignore_debugger = false;
5694 bool disable_sm_exceptions = true;
5695 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5696 bool sm_debugger_attached;
5697 u32 global_esr, warp_esr, global_mask;
5698
5699 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
5700
5701 sm_debugger_attached = g->ops.gr.sm_debugger_attached(g);
5702
5703 global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm);
5704 *hww_global_esr = global_esr;
5705 warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
5706 global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
5707
5708 if (!sm_debugger_attached) {
5709 nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x",
5710 global_esr, warp_esr);
5711 return -EFAULT;
5712 }
5713
5714 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5715 "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);
5716
5717 gr_gk20a_elpg_protected_call(g,
5718 g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch));
5719
5720 if (g->ops.gr.pre_process_sm_exception) {
5721 ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
5722 global_esr, warp_esr,
5723 sm_debugger_attached,
5724 fault_ch,
5725 &early_exit,
5726 &ignore_debugger);
5727 if (ret) {
5728 nvgpu_err(g, "could not pre-process sm error!");
5729 return ret;
5730 }
5731 }
5732
5733 if (early_exit) {
5734 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5735 "returning early");
5736 return ret;
5737 }
5738
5739 /*
5740 * Disable forwarding of tpc exceptions,
5741 * the debugger will reenable exceptions after servicing them.
5742 *
5743 * Do not disable exceptions if the only SM exception is BPT_INT
5744 */
5745 if ((global_esr == gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f())
5746 && (warp_esr == 0)) {
5747 disable_sm_exceptions = false;
5748 }
5749
5750 if (!ignore_debugger && disable_sm_exceptions) {
5751 u32 tpc_exception_en = gk20a_readl(g,
5752 gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
5753 offset);
5754 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5755 gk20a_writel(g,
5756 gr_gpc0_tpc0_tpccs_tpc_exception_en_r() + offset,
5757 tpc_exception_en);
5758 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM Exceptions disabled");
5759 }
5760
5761 /* if a debugger is present and an error has occurred, do a warp sync */
5762 if (!ignore_debugger &&
5763 ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5764 nvgpu_log(g, gpu_dbg_intr, "warp sync needed");
5765 do_warp_sync = true;
5766 }
5767
5768 if (do_warp_sync) {
5769 ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
5770 global_mask, true);
5771 if (ret) {
5772 nvgpu_err(g, "sm did not lock down!");
5773 return ret;
5774 }
5775 }
5776
5777 if (ignore_debugger) {
5778 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5779 "ignore_debugger set, skipping event posting");
5780 } else {
5781 *post_event = true;
5782 }
5783
5784 return ret;
5785}
5786
5787int gr_gk20a_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
5788 bool *post_event)
5789{
5790 int ret = 0;
5791 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
5792 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
5793 u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
5794 u32 esr;
5795
5796 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
5797
5798 esr = gk20a_readl(g,
5799 gr_gpc0_tpc0_tex_m_hww_esr_r() + offset);
5800 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "0x%08x", esr);
5801
5802 gk20a_writel(g,
5803 gr_gpc0_tpc0_tex_m_hww_esr_r() + offset,
5804 esr);
5805
5806 return ret;
5807}
5808
5809void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc,
5810 u32 *esr_sm_sel)
5811{
5812 *esr_sm_sel = 1;
5813}
5814
5815static int gk20a_gr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
5816 bool *post_event, struct channel_gk20a *fault_ch,
5817 u32 *hww_global_esr)
5818{
5819 int ret = 0;
5820 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
5821 u32 tpc_exception = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_r()
5822 + offset);
5823 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
5824
5825 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5826 "GPC%d TPC%d: pending exception 0x%x",
5827 gpc, tpc, tpc_exception);
5828
5829	/* check if an SM exception is pending */
5830 if (gr_gpc0_tpc0_tpccs_tpc_exception_sm_v(tpc_exception) ==
5831 gr_gpc0_tpc0_tpccs_tpc_exception_sm_pending_v()) {
5832 u32 esr_sm_sel, sm;
5833
5834 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5835 "GPC%d TPC%d: SM exception pending", gpc, tpc);
5836
5837 if (g->ops.gr.handle_tpc_sm_ecc_exception) {
5838 g->ops.gr.handle_tpc_sm_ecc_exception(g, gpc, tpc,
5839 post_event, fault_ch, hww_global_esr);
5840 }
5841
5842 g->ops.gr.get_esr_sm_sel(g, gpc, tpc, &esr_sm_sel);
5843
5844 for (sm = 0; sm < sm_per_tpc; sm++) {
5845
5846 if ((esr_sm_sel & BIT32(sm)) == 0U) {
5847 continue;
5848 }
5849
5850 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5851 "GPC%d TPC%d: SM%d exception pending",
5852 gpc, tpc, sm);
5853
5854 ret |= g->ops.gr.handle_sm_exception(g,
5855 gpc, tpc, sm, post_event, fault_ch,
5856 hww_global_esr);
5857			/* clear the HWWs; this also clears the TPC and
5858			 * GPC exceptions. They should be cleared only
5859			 * if the SM is locked down or empty.
5860			 */
5861 g->ops.gr.clear_sm_hww(g,
5862 gpc, tpc, sm, *hww_global_esr);
5863
5864 }
5865
5866 }
5867
5868	/* check if a TEX exception is pending */
5869 if (gr_gpc0_tpc0_tpccs_tpc_exception_tex_v(tpc_exception) ==
5870 gr_gpc0_tpc0_tpccs_tpc_exception_tex_pending_v()) {
5871 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5872 "GPC%d TPC%d: TEX exception pending", gpc, tpc);
5873 ret |= g->ops.gr.handle_tex_exception(g, gpc, tpc, post_event);
5874 }
5875
5876 if (g->ops.gr.handle_tpc_mpc_exception) {
5877 ret |= g->ops.gr.handle_tpc_mpc_exception(g,
5878 gpc, tpc, post_event);
5879 }
5880
5881 return ret;
5882}
5883
5884static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
5885 struct channel_gk20a *fault_ch, u32 *hww_global_esr)
5886{
5887 int ret = 0;
5888 u32 gpc_offset, gpc, tpc;
5889 struct gr_gk20a *gr = &g->gr;
5890 u32 exception1 = gk20a_readl(g, gr_exception1_r());
5891 u32 gpc_exception;
5892
5893 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, " ");
5894
5895 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
5896 if ((exception1 & (1 << gpc)) == 0) {
5897 continue;
5898 }
5899
5900 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5901 "GPC%d exception pending", gpc);
5902
5903 gpc_offset = gk20a_gr_gpc_offset(g, gpc);
5904
5905 gpc_exception = gk20a_readl(g, gr_gpc0_gpccs_gpc_exception_r()
5906 + gpc_offset);
5907
5908 /* check if any tpc has an exception */
5909 for (tpc = 0; tpc < gr->gpc_tpc_count[gpc]; tpc++) {
5910 if ((gr_gpc0_gpccs_gpc_exception_tpc_v(gpc_exception) &
5911 (1 << tpc)) == 0) {
5912 continue;
5913 }
5914
5915 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
5916 "GPC%d: TPC%d exception pending", gpc, tpc);
5917
5918 ret |= gk20a_gr_handle_tpc_exception(g, gpc, tpc,
5919 post_event, fault_ch, hww_global_esr);
5920
5921 }
5922
5923 /* Handle GCC exception */
5924 if ((gr_gpc0_gpccs_gpc_exception_gcc_v(gpc_exception) != 0U) &&
5925 (g->ops.gr.handle_gcc_exception != NULL)) {
5926 int gcc_ret = 0;
5927 gcc_ret = g->ops.gr.handle_gcc_exception(g, gpc, tpc,
5928 post_event, fault_ch, hww_global_esr);
5929 ret |= (ret != 0) ? ret : gcc_ret;
5930 }
5931
5932 /* Handle GPCCS exceptions */
5933 if (g->ops.gr.handle_gpc_gpccs_exception) {
5934 int ret_ecc = 0;
5935 ret_ecc = g->ops.gr.handle_gpc_gpccs_exception(g, gpc,
5936 gpc_exception);
5937 ret |= (ret != 0) ? ret : ret_ecc;
5938 }
5939
5940 /* Handle GPCMMU exceptions */
5941 if (g->ops.gr.handle_gpc_gpcmmu_exception) {
5942 int ret_mmu = 0;
5943
5944 ret_mmu = g->ops.gr.handle_gpc_gpcmmu_exception(g, gpc,
5945 gpc_exception);
5946 ret |= (ret != 0) ? ret : ret_mmu;
5947 }
5948
5949 }
5950
5951 return ret;
5952}
5953
5954static int gk20a_gr_post_bpt_events(struct gk20a *g, struct tsg_gk20a *tsg,
5955 u32 global_esr)
5956{
5957 if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()) {
5958 g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_INT);
5959 }
5960
5961 if (global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f()) {
5962 g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_PAUSE);
5963 }
5964
5965 return 0;
5966}
5967
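/*
 * Top-level PGRAPH stall interrupt handler. In outline it: reads gr_intr,
 * blocks gpfifo access and semaphores, latches the trapped method and the
 * faulting channel/TSG, services each pending interrupt bit (notify,
 * semaphore, illegal method/class, FECS error, class error, firmware
 * method, exceptions), triggers engine recovery if any handler asked for
 * a reset, clears anything left unhandled, re-enables gpfifo access and
 * finally posts pending BPT events for the TSG.
 */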
5968int gk20a_gr_isr(struct gk20a *g)
5969{
5970 struct gr_gk20a_isr_data isr_data;
5971 u32 grfifo_ctl;
5972 u32 obj_table;
5973 bool need_reset = false;
5974 u32 gr_intr = gk20a_readl(g, gr_intr_r());
5975 struct channel_gk20a *ch = NULL;
5976 struct channel_gk20a *fault_ch = NULL;
5977 u32 tsgid = NVGPU_INVALID_TSG_ID;
5978 struct tsg_gk20a *tsg = NULL;
5979 u32 gr_engine_id;
5980 u32 global_esr = 0;
5981 u32 chid;
5982
5983 nvgpu_log_fn(g, " ");
5984 nvgpu_log(g, gpu_dbg_intr, "pgraph intr 0x%08x", gr_intr);
5985
5986 if (gr_intr == 0U) {
5987 return 0;
5988 }
5989
5990 gr_engine_id = gk20a_fifo_get_gr_engine_id(g);
5991 if (gr_engine_id != FIFO_INVAL_ENGINE_ID) {
5992 gr_engine_id = BIT(gr_engine_id);
5993 }
5994
5995 grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5996 grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5997 grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5998
5999 gk20a_writel(g, gr_gpfifo_ctl_r(),
6000 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
6001 gr_gpfifo_ctl_semaphore_access_f(0));
6002
6003 isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
6004 isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
6005 isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
6006 isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
6007 isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
6008 isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
6009 obj_table = (isr_data.sub_chan < 4) ? gk20a_readl(g,
6010 gr_fe_object_table_r(isr_data.sub_chan)) : 0;
6011 isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
6012
6013 ch = gk20a_gr_get_channel_from_ctx(g, isr_data.curr_ctx, &tsgid);
6014 isr_data.ch = ch;
6015 chid = ch != NULL ? ch->chid : FIFO_INVAL_CHANNEL_ID;
6016
6017 if (ch == NULL) {
6018 nvgpu_err(g, "pgraph intr: 0x%08x, chid: INVALID", gr_intr);
6019 } else {
6020 tsg = tsg_gk20a_from_ch(ch);
6021 if (tsg == NULL) {
6022 nvgpu_err(g, "pgraph intr: 0x%08x, chid: %d "
6023 "not bound to tsg", gr_intr, chid);
6024 }
6025 }
6026
6027 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
6028 "channel %d: addr 0x%08x, "
6029 "data 0x%08x 0x%08x,"
6030 "ctx 0x%08x, offset 0x%08x, "
6031 "subchannel 0x%08x, class 0x%08x",
6032 chid, isr_data.addr,
6033 isr_data.data_hi, isr_data.data_lo,
6034 isr_data.curr_ctx, isr_data.offset,
6035 isr_data.sub_chan, isr_data.class_num);
6036
6037 if (gr_intr & gr_intr_notify_pending_f()) {
6038 g->ops.gr.handle_notify_pending(g, &isr_data);
6039 gk20a_writel(g, gr_intr_r(),
6040 gr_intr_notify_reset_f());
6041 gr_intr &= ~gr_intr_notify_pending_f();
6042 }
6043
6044 if (gr_intr & gr_intr_semaphore_pending_f()) {
6045 g->ops.gr.handle_semaphore_pending(g, &isr_data);
6046 gk20a_writel(g, gr_intr_r(),
6047 gr_intr_semaphore_reset_f());
6048 gr_intr &= ~gr_intr_semaphore_pending_f();
6049 }
6050
6051 if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
6052 if (gk20a_gr_handle_semaphore_timeout_pending(g,
6053 &isr_data) != 0) {
6054 need_reset = true;
6055 }
6056 gk20a_writel(g, gr_intr_r(),
6057 gr_intr_semaphore_reset_f());
6058 gr_intr &= ~gr_intr_semaphore_pending_f();
6059 }
6060
6061 if (gr_intr & gr_intr_illegal_notify_pending_f()) {
6062 if (gk20a_gr_intr_illegal_notify_pending(g,
6063 &isr_data) != 0) {
6064 need_reset = true;
6065 }
6066 gk20a_writel(g, gr_intr_r(),
6067 gr_intr_illegal_notify_reset_f());
6068 gr_intr &= ~gr_intr_illegal_notify_pending_f();
6069 }
6070
6071 if (gr_intr & gr_intr_illegal_method_pending_f()) {
6072 if (gk20a_gr_handle_illegal_method(g, &isr_data) != 0) {
6073 need_reset = true;
6074 }
6075 gk20a_writel(g, gr_intr_r(),
6076 gr_intr_illegal_method_reset_f());
6077 gr_intr &= ~gr_intr_illegal_method_pending_f();
6078 }
6079
6080 if (gr_intr & gr_intr_illegal_class_pending_f()) {
6081 if (gk20a_gr_handle_illegal_class(g, &isr_data) != 0) {
6082 need_reset = true;
6083 }
6084 gk20a_writel(g, gr_intr_r(),
6085 gr_intr_illegal_class_reset_f());
6086 gr_intr &= ~gr_intr_illegal_class_pending_f();
6087 }
6088
6089 if (gr_intr & gr_intr_fecs_error_pending_f()) {
6090 if (g->ops.gr.handle_fecs_error(g, ch, &isr_data) != 0) {
6091 need_reset = true;
6092 }
6093 gk20a_writel(g, gr_intr_r(),
6094 gr_intr_fecs_error_reset_f());
6095 gr_intr &= ~gr_intr_fecs_error_pending_f();
6096 }
6097
6098 if (gr_intr & gr_intr_class_error_pending_f()) {
6099 if (gk20a_gr_handle_class_error(g, &isr_data) != 0) {
6100 need_reset = true;
6101 }
6102 gk20a_writel(g, gr_intr_r(),
6103 gr_intr_class_error_reset_f());
6104 gr_intr &= ~gr_intr_class_error_pending_f();
6105 }
6106
6107 /* this one happens if someone tries to hit a non-whitelisted
6108 * register using set_falcon[4] */
6109 if (gr_intr & gr_intr_firmware_method_pending_f()) {
6110 if (gk20a_gr_handle_firmware_method(g, &isr_data) != 0) {
6111 need_reset = true;
6112 }
6113 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n");
6114 gk20a_writel(g, gr_intr_r(),
6115 gr_intr_firmware_method_reset_f());
6116 gr_intr &= ~gr_intr_firmware_method_pending_f();
6117 }
6118
6119 if (gr_intr & gr_intr_exception_pending_f()) {
6120 u32 exception = gk20a_readl(g, gr_exception_r());
6121
6122 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception);
6123
6124 if (exception & gr_exception_fe_m()) {
6125 u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
6126 u32 info = gk20a_readl(g, gr_fe_hww_esr_info_r());
6127
6128 nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
6129 fe, info);
6130 gk20a_writel(g, gr_fe_hww_esr_r(),
6131 gr_fe_hww_esr_reset_active_f());
6132 need_reset = true;
6133 }
6134
6135 if (exception & gr_exception_memfmt_m()) {
6136 u32 memfmt = gk20a_readl(g, gr_memfmt_hww_esr_r());
6137
6138 nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
6139 gk20a_writel(g, gr_memfmt_hww_esr_r(),
6140 gr_memfmt_hww_esr_reset_active_f());
6141 need_reset = true;
6142 }
6143
6144 if (exception & gr_exception_pd_m()) {
6145 u32 pd = gk20a_readl(g, gr_pd_hww_esr_r());
6146
6147 nvgpu_err(g, "pd exception: esr 0x%08x", pd);
6148 gk20a_writel(g, gr_pd_hww_esr_r(),
6149 gr_pd_hww_esr_reset_active_f());
6150 need_reset = true;
6151 }
6152
6153 if (exception & gr_exception_scc_m()) {
6154 u32 scc = gk20a_readl(g, gr_scc_hww_esr_r());
6155
6156 nvgpu_err(g, "scc exception: esr 0x%08x", scc);
6157 gk20a_writel(g, gr_scc_hww_esr_r(),
6158 gr_scc_hww_esr_reset_active_f());
6159 need_reset = true;
6160 }
6161
6162 if (exception & gr_exception_ds_m()) {
6163 u32 ds = gk20a_readl(g, gr_ds_hww_esr_r());
6164
6165 nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
6166 gk20a_writel(g, gr_ds_hww_esr_r(),
6167 gr_ds_hww_esr_reset_task_f());
6168 need_reset = true;
6169 }
6170
6171 if (exception & gr_exception_ssync_m()) {
6172 if (g->ops.gr.handle_ssync_hww) {
6173 if (g->ops.gr.handle_ssync_hww(g) != 0) {
6174 need_reset = true;
6175 }
6176 } else {
6177 nvgpu_err(g, "unhandled ssync exception");
6178 }
6179 }
6180
6181 if (exception & gr_exception_mme_m()) {
6182 u32 mme = gk20a_readl(g, gr_mme_hww_esr_r());
6183 u32 info = gk20a_readl(g, gr_mme_hww_esr_info_r());
6184
6185 nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x",
6186 mme, info);
6187 gk20a_writel(g, gr_mme_hww_esr_r(),
6188 gr_mme_hww_esr_reset_active_f());
6189 need_reset = true;
6190 }
6191
6192 if (exception & gr_exception_sked_m()) {
6193 u32 sked = gk20a_readl(g, gr_sked_hww_esr_r());
6194
6195 nvgpu_err(g, "sked exception: esr 0x%08x", sked);
6196 gk20a_writel(g, gr_sked_hww_esr_r(),
6197 gr_sked_hww_esr_reset_active_f());
6198 need_reset = true;
6199 }
6200
6201 /* check if a gpc exception has occurred */
6202 if (((exception & gr_exception_gpc_m()) != 0U) &&
6203 !need_reset) {
6204 bool post_event = false;
6205
6206 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
6207 "GPC exception pending");
6208
6209 if (tsg != NULL) {
6210 fault_ch = isr_data.ch;
6211 }
6212
6213 /* fault_ch can be NULL */
6214 /* check if any gpc has an exception */
6215 if (gk20a_gr_handle_gpc_exception(g, &post_event,
6216 fault_ch, &global_esr) != 0) {
6217 need_reset = true;
6218 }
6219
6220 /* signal clients waiting on an event */
6221 if (g->ops.gr.sm_debugger_attached(g) &&
6222 post_event && (fault_ch != NULL)) {
6223 g->ops.debugger.post_events(fault_ch);
6224 }
6225 }
6226
6227 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
6228 gr_intr &= ~gr_intr_exception_pending_f();
6229
6230 if (need_reset) {
6231 nvgpu_err(g, "set gr exception notifier");
6232 gk20a_gr_set_error_notifier(g, &isr_data,
6233 NVGPU_ERR_NOTIFIER_GR_EXCEPTION);
6234 }
6235 }
6236
6237 if (need_reset) {
6238 if (tsg != NULL) {
6239 gk20a_fifo_recover(g, gr_engine_id,
6240 tsgid, true, true, true,
6241 RC_TYPE_GR_FAULT);
6242 } else {
6243 if (ch != NULL) {
6244 nvgpu_err(g, "chid: %d referenceable but not "
6245 "bound to tsg", chid);
6246 }
6247 gk20a_fifo_recover(g, gr_engine_id,
6248 0, false, false, true,
6249 RC_TYPE_GR_FAULT);
6250 }
6251 }
6252
6253 if (gr_intr != 0U) {
6254 /* clear unhandled interrupts */
6255 if (ch == NULL) {
6256 /*
6257 * This is probably an interrupt during
6258 * gk20a_free_channel()
6259 */
6260 nvgpu_err(g, "unhandled gr intr 0x%08x for "
6261 "unreferenceable channel, clearing",
6262 gr_intr);
6263 } else {
6264 nvgpu_err(g, "unhandled gr intr 0x%08x for chid: %d",
6265 gr_intr, chid);
6266 }
6267 gk20a_writel(g, gr_intr_r(), gr_intr);
6268 }
6269
6270 gk20a_writel(g, gr_gpfifo_ctl_r(),
6271 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
6272 gr_gpfifo_ctl_semaphore_access_f(1));
6273
6274
6275 /* Posting of BPT events should be the last thing in this function */
6276 if ((global_esr != 0U) && (tsg != NULL)) {
6277 gk20a_gr_post_bpt_events(g, tsg, global_esr);
6278 }
6279
6280 if (ch) {
6281 gk20a_channel_put(ch);
6282 }
6283
6284 return 0;
6285}
6286
6287u32 gk20a_gr_nonstall_isr(struct gk20a *g)
6288{
6289 u32 ops = 0;
6290 u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
6291
6292 nvgpu_log(g, gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr);
6293
6294 if ((gr_intr & gr_intr_nonstall_trap_pending_f()) != 0U) {
6295 /* Clear the interrupt */
6296 gk20a_writel(g, gr_intr_nonstall_r(),
6297 gr_intr_nonstall_trap_pending_f());
6298 ops |= (GK20A_NONSTALL_OPS_WAKEUP_SEMAPHORE |
6299 GK20A_NONSTALL_OPS_POST_EVENTS);
6300 }
6301 return ops;
6302}
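
/*
 * A minimal caller-side sketch. The assumption here is that a nonstall
 * worker acts on the returned ops mask; the two helpers named in the
 * body are placeholders, not functions from this driver:
 *
 *	u32 ops = gk20a_gr_nonstall_isr(g);
 *
 *	if ((ops & GK20A_NONSTALL_OPS_WAKEUP_SEMAPHORE) != 0U)
 *		wake_semaphore_waiters(g);	(placeholder)
 *	if ((ops & GK20A_NONSTALL_OPS_POST_EVENTS) != 0U)
 *		post_nonstall_events(g);	(placeholder)
 */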
6303
6304int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
6305{
6306 BUG_ON(size == NULL);
6307 return gr_gk20a_submit_fecs_method_op(g,
6308 (struct fecs_method_op_gk20a) {
6309 .mailbox.id = 0,
6310 .mailbox.data = 0,
6311 .mailbox.clr = ~0,
6312 .method.data = 1,
6313 .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
6314 .mailbox.ret = size,
6315 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
6316 .mailbox.ok = 0,
6317 .cond.fail = GR_IS_UCODE_OP_SKIP,
6318 .mailbox.fail = 0}, false);
6319}
6320
6321int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g,
6322 struct nvgpu_mem *inst_block)
6323{
6324 u32 data = fecs_current_ctx_data(g, inst_block);
6325
6326 return gr_gk20a_submit_fecs_method_op(g,
6327 (struct fecs_method_op_gk20a){
6328 .mailbox.id = 4,
6329 .mailbox.data = data,
6330 .mailbox.clr = ~0,
6331 .method.data = 1,
6332 .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
6333 .mailbox.ret = NULL,
6334 .cond.ok = GR_IS_UCODE_OP_EQUAL,
6335 .mailbox.ok = 1,
6336 .cond.fail = GR_IS_UCODE_OP_SKIP,
6337 .mailbox.fail = 0}, false);
6338}
6339
6340int gr_gk20a_fecs_set_reglist_virtual_addr(struct gk20a *g, u64 pmu_va)
6341{
6342 return gr_gk20a_submit_fecs_method_op(g,
6343 (struct fecs_method_op_gk20a) {
6344 .mailbox.id = 4,
6345 .mailbox.data = u64_lo32(pmu_va >> 8),
6346 .mailbox.clr = ~0,
6347 .method.data = 1,
6348 .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
6349 .mailbox.ret = NULL,
6350 .cond.ok = GR_IS_UCODE_OP_EQUAL,
6351 .mailbox.ok = 1,
6352 .cond.fail = GR_IS_UCODE_OP_SKIP,
6353 .mailbox.fail = 0}, false);
6354}
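
/*
 * A rough sequence sketch for the three reglist helpers above. The
 * ordering is an assumption based on their semantics (query the image
 * size first, then tell FECS which instance block and GPU VA to use);
 * the allocation step in the middle is elided:
 *
 *	u32 size;
 *
 *	err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
 *	... allocate and map a buffer of "size" bytes for the PMU ...
 *	err = gr_gk20a_fecs_set_reglist_bind_inst(g, inst_block);
 *	err = gr_gk20a_fecs_set_reglist_virtual_addr(g, pmu_va);
 */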
6355
6356int gk20a_gr_suspend(struct gk20a *g)
6357{
6358	int ret = 0;
6359
6360 nvgpu_log_fn(g, " ");
6361
6362 ret = g->ops.gr.wait_empty(g, gk20a_get_gr_idle_timeout(g),
6363 GR_IDLE_CHECK_DEFAULT);
6364 if (ret) {
6365 return ret;
6366 }
6367
6368 gk20a_writel(g, gr_gpfifo_ctl_r(),
6369 gr_gpfifo_ctl_access_disabled_f());
6370
6371 /* disable gr intr */
6372 gk20a_writel(g, gr_intr_r(), 0);
6373 gk20a_writel(g, gr_intr_en_r(), 0);
6374
6375 /* disable all exceptions */
6376 gk20a_writel(g, gr_exception_r(), 0);
6377 gk20a_writel(g, gr_exception_en_r(), 0);
6378 gk20a_writel(g, gr_exception1_r(), 0);
6379 gk20a_writel(g, gr_exception1_en_r(), 0);
6380 gk20a_writel(g, gr_exception2_r(), 0);
6381 gk20a_writel(g, gr_exception2_en_r(), 0);
6382
6383 gk20a_gr_flush_channel_tlb(&g->gr);
6384
6385 g->gr.initialized = false;
6386
6387 nvgpu_log_fn(g, "done");
6388 return ret;
6389}
6390
6391static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
6392 u32 addr,
6393 bool is_quad, u32 quad,
6394 u32 *context_buffer,
6395 u32 context_buffer_size,
6396 u32 *priv_offset);
6397
6398static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g,
6399 u32 addr,
6400 u32 *priv_offset);
6401
6402/* This function will decode a priv address and return the partition type and numbers. */
6403int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
6404 enum ctxsw_addr_type *addr_type,
6405 u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
6406 u32 *broadcast_flags)
6407{
6408 u32 gpc_addr;
6409
6410 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6411
6412 /* setup defaults */
6413 *addr_type = CTXSW_ADDR_TYPE_SYS;
6414 *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
6415 *gpc_num = 0;
6416 *tpc_num = 0;
6417 *ppc_num = 0;
6418 *be_num = 0;
6419
6420 if (pri_is_gpc_addr(g, addr)) {
6421 *addr_type = CTXSW_ADDR_TYPE_GPC;
6422 gpc_addr = pri_gpccs_addr_mask(addr);
6423 if (pri_is_gpc_addr_shared(g, addr)) {
6424 *addr_type = CTXSW_ADDR_TYPE_GPC;
6425 *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
6426 } else {
6427 *gpc_num = pri_get_gpc_num(g, addr);
6428 }
6429
6430 if (pri_is_ppc_addr(g, gpc_addr)) {
6431 *addr_type = CTXSW_ADDR_TYPE_PPC;
6432 if (pri_is_ppc_addr_shared(g, gpc_addr)) {
6433 *broadcast_flags |= PRI_BROADCAST_FLAGS_PPC;
6434 return 0;
6435 }
6436 }
6437 if (g->ops.gr.is_tpc_addr(g, gpc_addr)) {
6438 *addr_type = CTXSW_ADDR_TYPE_TPC;
6439 if (pri_is_tpc_addr_shared(g, gpc_addr)) {
6440 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
6441 return 0;
6442 }
6443 *tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
6444 }
6445 return 0;
6446 } else if (pri_is_be_addr(g, addr)) {
6447 *addr_type = CTXSW_ADDR_TYPE_BE;
6448 if (pri_is_be_addr_shared(g, addr)) {
6449 *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
6450 return 0;
6451 }
6452 *be_num = pri_get_be_num(g, addr);
6453 return 0;
6454 } else if (g->ops.ltc.pri_is_ltc_addr(g, addr)) {
6455 *addr_type = CTXSW_ADDR_TYPE_LTCS;
6456 if (g->ops.ltc.is_ltcs_ltss_addr(g, addr)) {
6457 *broadcast_flags |= PRI_BROADCAST_FLAGS_LTCS;
6458 } else if (g->ops.ltc.is_ltcn_ltss_addr(g, addr)) {
6459 *broadcast_flags |= PRI_BROADCAST_FLAGS_LTSS;
6460 }
6461 return 0;
6462 } else if (pri_is_fbpa_addr(g, addr)) {
6463 *addr_type = CTXSW_ADDR_TYPE_FBPA;
6464 if (pri_is_fbpa_addr_shared(g, addr)) {
6465 *broadcast_flags |= PRI_BROADCAST_FLAGS_FBPA;
6466 return 0;
6467 }
6468 return 0;
6469 } else if ((g->ops.gr.is_egpc_addr != NULL) &&
6470 g->ops.gr.is_egpc_addr(g, addr)) {
6471 return g->ops.gr.decode_egpc_addr(g,
6472 addr, addr_type, gpc_num,
6473 tpc_num, broadcast_flags);
6474 } else {
6475 *addr_type = CTXSW_ADDR_TYPE_SYS;
6476 return 0;
6477 }
6478 /* PPC!?!?!?! */
6479
6480 /*NOTREACHED*/
6481 return -EINVAL;
6482}
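
/*
 * Worked example (illustrative; exact values depend on the chip's pri
 * layout): decoding a gr_gpcs_tpcs_* broadcast register returns
 * addr_type == CTXSW_ADDR_TYPE_TPC with both PRI_BROADCAST_FLAGS_GPC and
 * PRI_BROADCAST_FLAGS_TPC set, while the matching unicast GPC0/TPC1
 * register returns gpc_num == 0, tpc_num == 1 and no broadcast flags:
 *
 *	enum ctxsw_addr_type type;
 *	u32 gpc, tpc, ppc, be, flags;
 *
 *	err = gr_gk20a_decode_priv_addr(g, addr, &type,
 *					&gpc, &tpc, &ppc, &be, &flags);
 */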
6483
6484void gr_gk20a_split_fbpa_broadcast_addr(struct gk20a *g, u32 addr,
6485 u32 num_fbpas,
6486 u32 *priv_addr_table, u32 *t)
6487{
6488 u32 fbpa_id;
6489
6490 for (fbpa_id = 0; fbpa_id < num_fbpas; fbpa_id++) {
6491 priv_addr_table[(*t)++] = pri_fbpa_addr(g,
6492 pri_fbpa_addr_mask(g, addr), fbpa_id);
6493 }
6494}
6495
6496int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
6497 u32 gpc_num,
6498 u32 *priv_addr_table, u32 *t)
6499{
6500 u32 ppc_num;
6501
6502 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6503
6504 for (ppc_num = 0; ppc_num < g->gr.gpc_ppc_count[gpc_num]; ppc_num++) {
6505 priv_addr_table[(*t)++] = pri_ppc_addr(g, pri_ppccs_addr_mask(addr),
6506 gpc_num, ppc_num);
6507 }
6508
6509 return 0;
6510}
6511
6512/*
6513 * The context buffer is indexed using BE broadcast addresses and GPC/TPC
6514 * unicast addresses. This function will convert a BE unicast address to a BE
6515 * broadcast address and split a GPC/TPC broadcast address into a table of
6516 * GPC/TPC addresses. The addresses generated by this function can be
6517 * successfully processed by gr_gk20a_find_priv_offset_in_buffer
6518 */
6519int gr_gk20a_create_priv_addr_table(struct gk20a *g,
6520 u32 addr,
6521 u32 *priv_addr_table,
6522 u32 *num_registers)
6523{
6524 enum ctxsw_addr_type addr_type;
6525 u32 gpc_num, tpc_num, ppc_num, be_num;
6526 u32 priv_addr, gpc_addr;
6527 u32 broadcast_flags;
6528 u32 t;
6529 int err;
6530
6531 t = 0;
6532 *num_registers = 0;
6533
6534 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6535
6536 err = g->ops.gr.decode_priv_addr(g, addr, &addr_type,
6537 &gpc_num, &tpc_num, &ppc_num, &be_num,
6538 &broadcast_flags);
6539 nvgpu_log(g, gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
6540 if (err != 0) {
6541 return err;
6542 }
6543
6544 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
6545 (addr_type == CTXSW_ADDR_TYPE_BE)) {
6546 /* The BE broadcast registers are included in the compressed PRI
6547 * table. Convert a BE unicast address to a broadcast address
6548 * so that we can look up the offset. */
6549 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
6550 ((broadcast_flags & PRI_BROADCAST_FLAGS_BE) == 0U)) {
6551 priv_addr_table[t++] = pri_be_shared_addr(g, addr);
6552 } else {
6553 priv_addr_table[t++] = addr;
6554 }
6555
6556 *num_registers = t;
6557 return 0;
6558 }
6559
6560 /* The GPC/TPC unicast registers are included in the compressed PRI
6561 * tables. Convert a GPC/TPC broadcast address to unicast addresses so
6562 * that we can look up the offsets. */
6563 if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
6564 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
6565
6566 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC) {
6567 for (tpc_num = 0;
6568 tpc_num < g->gr.gpc_tpc_count[gpc_num];
6569 tpc_num++) {
6570 priv_addr_table[t++] =
6571 pri_tpc_addr(g, pri_tpccs_addr_mask(addr),
6572 gpc_num, tpc_num);
6573 }
6574
6575 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
6576 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
6577 priv_addr_table, &t);
6578 if (err != 0) {
6579 return err;
6580 }
6581 } else {
6582 priv_addr = pri_gpc_addr(g,
6583 pri_gpccs_addr_mask(addr),
6584 gpc_num);
6585
6586 gpc_addr = pri_gpccs_addr_mask(priv_addr);
6587 tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
6588 if (tpc_num >= g->gr.gpc_tpc_count[gpc_num]) {
6589 continue;
6590 }
6591
6592 priv_addr_table[t++] = priv_addr;
6593 }
6594 }
6595 } else if (((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
6596 (addr_type == CTXSW_ADDR_TYPE_ETPC)) &&
6597 (g->ops.gr.egpc_etpc_priv_addr_table != NULL)) {
6598 nvgpu_log(g, gpu_dbg_gpu_dbg, "addr_type : EGPC/ETPC");
6599 g->ops.gr.egpc_etpc_priv_addr_table(g, addr, gpc_num, tpc_num,
6600 broadcast_flags, priv_addr_table, &t);
6601 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_LTSS) {
6602 g->ops.ltc.split_lts_broadcast_addr(g, addr,
6603 priv_addr_table, &t);
6604 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_LTCS) {
6605 g->ops.ltc.split_ltc_broadcast_addr(g, addr,
6606 priv_addr_table, &t);
6607 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_FBPA) {
6608 g->ops.gr.split_fbpa_broadcast_addr(g, addr,
6609 nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS),
6610 priv_addr_table, &t);
6611 } else if ((broadcast_flags & PRI_BROADCAST_FLAGS_GPC) == 0U) {
6612 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC) {
6613 for (tpc_num = 0;
6614 tpc_num < g->gr.gpc_tpc_count[gpc_num];
6615 tpc_num++) {
6616 priv_addr_table[t++] =
6617 pri_tpc_addr(g, pri_tpccs_addr_mask(addr),
6618 gpc_num, tpc_num);
6619 }
6620 } else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
6621 err = gr_gk20a_split_ppc_broadcast_addr(g,
6622 addr, gpc_num, priv_addr_table, &t);
6623 } else {
6624 priv_addr_table[t++] = addr;
6625 }
6626 }
6627
6628 *num_registers = t;
6629 return 0;
6630}
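
/*
 * Illustrative expansion, assuming a configuration with 2 GPCs and
 * 2 TPCs per GPC: a gr_gpcs_tpcs_* broadcast address fed to
 * gr_gk20a_create_priv_addr_table() comes back as four unicast entries,
 * one per GPC/TPC pair:
 *
 *	priv_addr_table[0] = pri_tpc_addr(g, pri_tpccs_addr_mask(addr), 0, 0);
 *	priv_addr_table[1] = pri_tpc_addr(g, pri_tpccs_addr_mask(addr), 0, 1);
 *	priv_addr_table[2] = pri_tpc_addr(g, pri_tpccs_addr_mask(addr), 1, 0);
 *	priv_addr_table[3] = pri_tpc_addr(g, pri_tpccs_addr_mask(addr), 1, 1);
 *	*num_registers = 4;
 */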
6631
6632int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
6633 u32 addr,
6634 u32 max_offsets,
6635 u32 *offsets, u32 *offset_addrs,
6636 u32 *num_offsets,
6637 bool is_quad, u32 quad)
6638{
6639 u32 i;
6640 u32 priv_offset = 0;
6641 u32 *priv_registers;
6642 u32 num_registers = 0;
6643 int err = 0;
6644 struct gr_gk20a *gr = &g->gr;
6645 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
6646 u32 potential_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
6647 sm_per_tpc;
6648
6649 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6650
6651	/* implementation is crossed-up if either of these happens */
6652 if (max_offsets > potential_offsets) {
6653 nvgpu_log_fn(g, "max_offsets > potential_offsets");
6654 return -EINVAL;
6655 }
6656
6657 if (!g->gr.ctx_vars.golden_image_initialized) {
6658 return -ENODEV;
6659 }
6660
6661 priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets);
6662 if (priv_registers == NULL) {
6663 nvgpu_log_fn(g, "failed alloc for potential_offsets=%d", potential_offsets);
6664		err = -ENOMEM;
6665 goto cleanup;
6666 }
6667 memset(offsets, 0, sizeof(u32) * max_offsets);
6668 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
6669 *num_offsets = 0;
6670
6671 g->ops.gr.create_priv_addr_table(g, addr, &priv_registers[0],
6672 &num_registers);
6673
6674 if ((max_offsets > 1) && (num_registers > max_offsets)) {
6675 nvgpu_log_fn(g, "max_offsets = %d, num_registers = %d",
6676 max_offsets, num_registers);
6677 err = -EINVAL;
6678 goto cleanup;
6679 }
6680
6681 if ((max_offsets == 1) && (num_registers > 1)) {
6682 num_registers = 1;
6683 }
6684
6685 if (g->gr.ctx_vars.local_golden_image == NULL) {
6686 nvgpu_log_fn(g, "no context switch header info to work with");
6687 err = -EINVAL;
6688 goto cleanup;
6689 }
6690
6691 for (i = 0; i < num_registers; i++) {
6692 err = gr_gk20a_find_priv_offset_in_buffer(g,
6693 priv_registers[i],
6694 is_quad, quad,
6695 g->gr.ctx_vars.local_golden_image,
6696 g->gr.ctx_vars.golden_image_size,
6697 &priv_offset);
6698 if (err != 0) {
6699 nvgpu_log_fn(g, "Could not determine priv_offset for addr:0x%x",
6700 addr); /*, grPriRegStr(addr)));*/
6701 goto cleanup;
6702 }
6703
6704 offsets[i] = priv_offset;
6705 offset_addrs[i] = priv_registers[i];
6706 }
6707
6708 *num_offsets = num_registers;
6709cleanup:
6710 if (!IS_ERR_OR_NULL(priv_registers)) {
6711 nvgpu_kfree(g, priv_registers);
6712 }
6713
6714 return err;
6715}
6716
6717int gr_gk20a_get_pm_ctx_buffer_offsets(struct gk20a *g,
6718 u32 addr,
6719 u32 max_offsets,
6720 u32 *offsets, u32 *offset_addrs,
6721 u32 *num_offsets)
6722{
6723 u32 i;
6724 u32 priv_offset = 0;
6725 u32 *priv_registers;
6726 u32 num_registers = 0;
6727 int err = 0;
6728 struct gr_gk20a *gr = &g->gr;
6729 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
6730 u32 potential_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
6731 sm_per_tpc;
6732
6733 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6734
6735	/* implementation is crossed-up if either of these happens */
6736 if (max_offsets > potential_offsets) {
6737 return -EINVAL;
6738 }
6739
6740 if (!g->gr.ctx_vars.golden_image_initialized) {
6741 return -ENODEV;
6742 }
6743
6744 priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets);
6745 if (priv_registers == NULL) {
6746 nvgpu_log_fn(g, "failed alloc for potential_offsets=%d", potential_offsets);
6747 return -ENOMEM;
6748 }
6749 memset(offsets, 0, sizeof(u32) * max_offsets);
6750 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
6751 *num_offsets = 0;
6752
6753 g->ops.gr.create_priv_addr_table(g, addr, priv_registers,
6754 &num_registers);
6755
6756 if ((max_offsets > 1) && (num_registers > max_offsets)) {
6757 err = -EINVAL;
6758 goto cleanup;
6759 }
6760
6761 if ((max_offsets == 1) && (num_registers > 1)) {
6762 num_registers = 1;
6763 }
6764
6765 if (g->gr.ctx_vars.local_golden_image == NULL) {
6766 nvgpu_log_fn(g, "no context switch header info to work with");
6767 err = -EINVAL;
6768 goto cleanup;
6769 }
6770
6771 for (i = 0; i < num_registers; i++) {
6772 err = gr_gk20a_find_priv_offset_in_pm_buffer(g,
6773 priv_registers[i],
6774 &priv_offset);
6775 if (err != 0) {
6776 nvgpu_log_fn(g, "Could not determine priv_offset for addr:0x%x",
6777 addr); /*, grPriRegStr(addr)));*/
6778 goto cleanup;
6779 }
6780
6781 offsets[i] = priv_offset;
6782 offset_addrs[i] = priv_registers[i];
6783 }
6784
6785 *num_offsets = num_registers;
6786cleanup:
6787 nvgpu_kfree(g, priv_registers);
6788
6789 return err;
6790}
6791
6792/* Set up some register tables. This looks hacky: our
6793 * register/offset accessors are functions, so they
6794 * can't be used as static initializers. TBD: fix this to
6795 * generate constants, at least on an as-needed basis.
6796 */
6797static const u32 _num_ovr_perf_regs = 17;
6798static u32 _ovr_perf_regs[17] = { 0, };
6799/* Following are the blocks of registers that the ucode
6800 * stores in the extended region. */
6801
6802void gk20a_gr_init_ovr_sm_dsm_perf(void)
6803{
6804 if (_ovr_perf_regs[0] != 0) {
6805 return;
6806 }
6807
6808 _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
6809 _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
6810 _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
6811 _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
6812 _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
6813 _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
6814 _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
6815 _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
6816 _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
6817 _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
6818 _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
6819 _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
6820 _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
6821 _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
6822 _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
6823 _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
6824 _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
6825
6826}
6827
6828/* TBD: would like to handle this elsewhere, at a higher level.
6829 * These are currently constructed in a "test-then-write" style,
6830 * which makes it impossible to know externally whether a ctx
6831 * write will actually occur. So later we should put a lazy,
6832 * map-and-hold system in the patch write state. */
6833static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
6834 struct channel_gk20a *ch,
6835 u32 addr, u32 data,
6836 struct nvgpu_mem *mem)
6837{
6838 u32 num_gpc = g->gr.gpc_count;
6839 u32 num_tpc;
6840 u32 tpc, gpc, reg;
6841 u32 chk_addr;
6842 u32 vaddr_lo;
6843 u32 vaddr_hi;
6844 u32 tmp;
6845 u32 num_ovr_perf_regs = 0;
6846 u32 *ovr_perf_regs = NULL;
6847 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
6848 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
6849 struct tsg_gk20a *tsg;
6850 struct nvgpu_gr_ctx *gr_ctx;
6851 struct nvgpu_mem *ctxheader = &ch->ctx_header;
6852
6853 tsg = tsg_gk20a_from_ch(ch);
6854 if (tsg == NULL) {
6855 return -EINVAL;
6856 }
6857
6858 gr_ctx = &tsg->gr_ctx;
6859 g->ops.gr.init_ovr_sm_dsm_perf();
6860 g->ops.gr.init_sm_dsm_reg_info();
6861 g->ops.gr.get_ovr_perf_regs(g, &num_ovr_perf_regs, &ovr_perf_regs);
6862
6863 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6864
6865 for (reg = 0; reg < num_ovr_perf_regs; reg++) {
6866 for (gpc = 0; gpc < num_gpc; gpc++) {
6867 num_tpc = g->gr.gpc_tpc_count[gpc];
6868 for (tpc = 0; tpc < num_tpc; tpc++) {
6869 chk_addr = ((gpc_stride * gpc) +
6870 (tpc_in_gpc_stride * tpc) +
6871 ovr_perf_regs[reg]);
6872 if (chk_addr != addr) {
6873 continue;
6874 }
6875 /* reset the patch count from previous
6876				   runs, if ucode has already processed
6877 it */
6878 tmp = nvgpu_mem_rd(g, mem,
6879 ctxsw_prog_main_image_patch_count_o());
6880
6881 if (tmp == 0U) {
6882 gr_ctx->patch_ctx.data_count = 0;
6883 }
6884
6885 gr_gk20a_ctx_patch_write(g, gr_ctx,
6886 addr, data, true);
6887
6888 vaddr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va);
6889 vaddr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va);
6890
6891 nvgpu_mem_wr(g, mem,
6892 ctxsw_prog_main_image_patch_count_o(),
6893 gr_ctx->patch_ctx.data_count);
6894 if (ctxheader->gpu_va) {
6895 nvgpu_mem_wr(g, ctxheader,
6896 ctxsw_prog_main_image_patch_adr_lo_o(),
6897 vaddr_lo);
6898 nvgpu_mem_wr(g, ctxheader,
6899 ctxsw_prog_main_image_patch_adr_hi_o(),
6900 vaddr_hi);
6901 } else {
6902 nvgpu_mem_wr(g, mem,
6903 ctxsw_prog_main_image_patch_adr_lo_o(),
6904 vaddr_lo);
6905 nvgpu_mem_wr(g, mem,
6906 ctxsw_prog_main_image_patch_adr_hi_o(),
6907 vaddr_hi);
6908 }
6909
6910				/* we're not caching these on the cpu side
6911				   yet, but watch for that later */
6912 return 0;
6913 }
6914 }
6915 }
6916
6917 return 0;
6918}
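
/*
 * In short, the function above redirects an SMPC override register write
 * into the patch context: the (addr, data) pair is appended with
 * gr_gk20a_ctx_patch_write(), and the context image (or the ctx header,
 * when present) is updated with the new patch count and the patch buffer
 * GPU VA so that the ctxsw ucode can apply the write, presumably on the
 * next context load.
 */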
6919
6920#define ILLEGAL_ID ((u32)~0)
6921
6922static inline bool check_main_image_header_magic(u8 *context)
6923{
6924 u32 magic = *(u32 *)(context + ctxsw_prog_main_image_magic_value_o());
6925 return magic == ctxsw_prog_main_image_magic_value_v_value_v();
6926}
6927static inline bool check_local_header_magic(u8 *context)
6928{
6929 u32 magic = *(u32 *)(context + ctxsw_prog_local_magic_value_o());
6930 return magic == ctxsw_prog_local_magic_value_v_value_v();
6931
6932}
6933
6934/* most likely dupe of ctxsw_gpccs_header__size_1_v() */
6935static inline int ctxsw_prog_ucode_header_size_in_bytes(void)
6936{
6937 return 256;
6938}
6939
6940void gk20a_gr_get_ovr_perf_regs(struct gk20a *g, u32 *num_ovr_perf_regs,
6941 u32 **ovr_perf_regs)
6942{
6943 *num_ovr_perf_regs = _num_ovr_perf_regs;
6944 *ovr_perf_regs = _ovr_perf_regs;
6945}
6946
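/*
 * Layout sketch for the extended ctxsw buffer walked by the function
 * below, derived from the offset math in its body (segments are the
 * 256-byte chunks described by the ctxsw_prog headers):
 *
 *	[FECS extended header segment]
 *	[GPC0 block][GPC1 block]...		one block per GPC, where
 *	  each block = marker,
 *	               per-TPC SM DSM perf control registers,
 *	               marker,
 *	               per-TPC, per-quadrant SM DSM perf counter registers
 *
 * The returned priv_offset is the byte offset of the requested register
 * within the context image.
 */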
6947static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
6948 u32 addr,
6949 bool is_quad, u32 quad,
6950 u32 *context_buffer,
6951 u32 context_buffer_size,
6952 u32 *priv_offset)
6953{
6954 u32 i, data32;
6955 u32 gpc_num, tpc_num;
6956 u32 num_gpcs, num_tpcs;
6957 u32 chk_addr;
6958 u32 ext_priv_offset, ext_priv_size;
6959 u8 *context;
6960 u32 offset_to_segment, offset_to_segment_end;
6961 u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
6962 u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
6963 u32 num_ext_gpccs_ext_buffer_segments;
6964 u32 inter_seg_offset;
6965 u32 max_tpc_count;
6966 u32 *sm_dsm_perf_ctrl_regs = NULL;
6967 u32 num_sm_dsm_perf_ctrl_regs = 0;
6968 u32 *sm_dsm_perf_regs = NULL;
6969 u32 num_sm_dsm_perf_regs = 0;
6970 u32 buffer_segments_size = 0;
6971 u32 marker_size = 0;
6972 u32 control_register_stride = 0;
6973 u32 perf_register_stride = 0;
6974 struct gr_gk20a *gr = &g->gr;
6975 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
6976 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
6977 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
6978 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
6979 u32 tpc_gpc_mask = (tpc_in_gpc_stride - 1);
6980
6981	/* Only TPC registers live in the extended region, so if this is not
6982	   a TPC reg, return an error so the caller can look elsewhere. */
6983 if (pri_is_gpc_addr(g, addr)) {
6984 u32 gpc_addr = 0;
6985 gpc_num = pri_get_gpc_num(g, addr);
6986 gpc_addr = pri_gpccs_addr_mask(addr);
6987 if (g->ops.gr.is_tpc_addr(g, gpc_addr)) {
6988 tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
6989 } else {
6990 return -EINVAL;
6991 }
6992
6993 nvgpu_log_info(g, " gpc = %d tpc = %d",
6994 gpc_num, tpc_num);
6995 } else if ((g->ops.gr.is_etpc_addr != NULL) &&
6996 g->ops.gr.is_etpc_addr(g, addr)) {
6997 g->ops.gr.get_egpc_etpc_num(g, addr, &gpc_num, &tpc_num);
6998 gpc_base = g->ops.gr.get_egpc_base(g);
6999 } else {
7000 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7001 "does not exist in extended region");
7002 return -EINVAL;
7003 }
7004
7005 buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
7006 /* note below is in words/num_registers */
7007 marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
7008
7009 context = (u8 *)context_buffer;
7010 /* sanity check main header */
7011 if (!check_main_image_header_magic(context)) {
7012 nvgpu_err(g,
7013 "Invalid main header: magic value");
7014 return -EINVAL;
7015 }
7016 num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
7017 if (gpc_num >= num_gpcs) {
7018 nvgpu_err(g,
7019 "GPC 0x%08x is greater than total count 0x%08x!",
7020 gpc_num, num_gpcs);
7021 return -EINVAL;
7022 }
7023
7024 data32 = *(u32 *)(context + ctxsw_prog_main_extended_buffer_ctl_o());
7025 ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
7026 if (0 == ext_priv_size) {
7027 nvgpu_log_info(g, " No extended memory in context buffer");
7028 return -EINVAL;
7029 }
7030 ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
7031
7032 offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
7033 offset_to_segment_end = offset_to_segment +
7034 (ext_priv_size * buffer_segments_size);
7035
7036 /* check local header magic */
7037 context += ctxsw_prog_ucode_header_size_in_bytes();
7038 if (!check_local_header_magic(context)) {
7039 nvgpu_err(g,
7040 "Invalid local header: magic value");
7041 return -EINVAL;
7042 }
7043
7044 /*
7045 * See if the incoming register address is in the first table of
7046 * registers. We check this by decoding only the TPC addr portion.
7047 * If we get a hit on the TPC bit, we then double check the address
7048 * by computing it from the base gpc/tpc strides. Then make sure
7049 * it is a real match.
7050 */
7051 g->ops.gr.get_sm_dsm_perf_regs(g, &num_sm_dsm_perf_regs,
7052 &sm_dsm_perf_regs,
7053 &perf_register_stride);
7054
7055 g->ops.gr.init_sm_dsm_reg_info();
7056
7057 for (i = 0; i < num_sm_dsm_perf_regs; i++) {
7058 if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
7059 sm_dsm_perf_reg_id = i;
7060
7061 nvgpu_log_info(g, "register match: 0x%08x",
7062 sm_dsm_perf_regs[i]);
7063
7064 chk_addr = (gpc_base + gpc_stride * gpc_num) +
7065 tpc_in_gpc_base +
7066 (tpc_in_gpc_stride * tpc_num) +
7067 (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask);
7068
7069 if (chk_addr != addr) {
7070 nvgpu_err(g,
7071				   "addr mismatch: 0x%08x != 0x%08x",
7072 addr, chk_addr);
7073 return -EINVAL;
7074 }
7075 break;
7076 }
7077 }
7078
7079	/* Didn't find the reg in supported group 1,
7080	 * so try the second group now. */
7081 g->ops.gr.get_sm_dsm_perf_ctrl_regs(g, &num_sm_dsm_perf_ctrl_regs,
7082 &sm_dsm_perf_ctrl_regs,
7083 &control_register_stride);
7084
7085 if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
7086 for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
7087 if ((addr & tpc_gpc_mask) ==
7088 (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
7089 sm_dsm_perf_ctrl_reg_id = i;
7090
7091 nvgpu_log_info(g, "register match: 0x%08x",
7092 sm_dsm_perf_ctrl_regs[i]);
7093
7094 chk_addr = (gpc_base + gpc_stride * gpc_num) +
7095 tpc_in_gpc_base +
7096 tpc_in_gpc_stride * tpc_num +
7097 (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
7098 tpc_gpc_mask);
7099
7100 if (chk_addr != addr) {
7101 nvgpu_err(g,
7102						  "addr mismatch: 0x%08x != 0x%08x",
7103 addr, chk_addr);
7104 return -EINVAL;
7105
7106 }
7107
7108 break;
7109 }
7110 }
7111 }
7112
7113 if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
7114 (ILLEGAL_ID == sm_dsm_perf_reg_id)) {
7115 return -EINVAL;
7116 }
7117
7118 /* Skip the FECS extended header, nothing there for us now. */
7119 offset_to_segment += buffer_segments_size;
7120
7121 /* skip through the GPCCS extended headers until we get to the data for
7122 * our GPC. The size of each gpc extended segment is enough to hold the
7123	 * max tpc count for the gpcs, in 256b chunks.
7124 */
7125
7126 max_tpc_count = gr->max_tpc_per_gpc_count;
7127
7128 num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2);
7129
7130 offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
7131 buffer_segments_size * gpc_num);
7132
7133 num_tpcs = g->gr.gpc_tpc_count[gpc_num];
7134
7135 /* skip the head marker to start with */
7136 inter_seg_offset = marker_size;
7137
7138 if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
7139		/* skip over the control regs of the TPCs before the one we
7140		 * want, then skip to the register in this tpc */
7141 inter_seg_offset = inter_seg_offset +
7142 (tpc_num * control_register_stride) +
7143 sm_dsm_perf_ctrl_reg_id;
7144 } else {
7145 /* skip all the control registers */
7146 inter_seg_offset = inter_seg_offset +
7147 (num_tpcs * control_register_stride);
7148
7149 /* skip the marker between control and counter segments */
7150 inter_seg_offset += marker_size;
7151
7152 /* skip over counter regs of TPCs before the one we want */
7153 inter_seg_offset = inter_seg_offset +
7154 (tpc_num * perf_register_stride) *
7155 ctxsw_prog_extended_num_smpc_quadrants_v();
7156
7157		/* skip over the registers for the quadrants we do not want,
7158		 * then skip to the register in this tpc */
7159 inter_seg_offset = inter_seg_offset +
7160 (perf_register_stride * quad) +
7161 sm_dsm_perf_reg_id;
7162 }
7163
7164	/* set the offset to the segment offset plus the inter-segment
7165	 * offset of our register */
7166 offset_to_segment += (inter_seg_offset * 4);
7167
7168 /* last sanity check: did we somehow compute an offset outside the
7169 * extended buffer? */
7170 if (offset_to_segment > offset_to_segment_end) {
7171 nvgpu_err(g,
7172 "Overflow ctxsw buffer! 0x%08x > 0x%08x",
7173 offset_to_segment, offset_to_segment_end);
7174 return -EINVAL;
7175 }
7176
7177 *priv_offset = offset_to_segment;
7178
7179 return 0;
7180}
7181
7182
7183static int
7184gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
7185 enum ctxsw_addr_type addr_type,
7186 u32 pri_addr,
7187 u32 gpc_num, u32 num_tpcs,
7188 u32 num_ppcs, u32 ppc_mask,
7189 u32 *priv_offset)
7190{
7191 u32 i;
7192 u32 address, base_address;
7193 u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
7194 u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
7195 struct aiv_gk20a *reg;
7196 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
7197 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
7198 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE);
7199 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
7200 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
7201 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
7202
7203 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);
7204
7205 if (!g->gr.ctx_vars.valid) {
7206 return -EINVAL;
7207 }
7208
7209 /* Process the SYS/BE segment. */
7210 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
7211 (addr_type == CTXSW_ADDR_TYPE_BE)) {
7212 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
7213 reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i];
7214 address = reg->addr;
7215 sys_offset = reg->index;
7216
7217 if (pri_addr == address) {
7218 *priv_offset = sys_offset;
7219 return 0;
7220 }
7221 }
7222 }
7223
7224 /* Process the TPC segment. */
7225 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
7226 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
7227 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
7228 reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i];
7229 address = reg->addr;
7230 tpc_addr = pri_tpccs_addr_mask(address);
7231 base_address = gpc_base +
7232 (gpc_num * gpc_stride) +
7233 tpc_in_gpc_base +
7234 (tpc_num * tpc_in_gpc_stride);
7235 address = base_address + tpc_addr;
7236 /*
7237 * The data for the TPCs is interleaved in the context buffer.
7238 * Example with num_tpcs = 2
7239 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
7240 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
7241 */
7242 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
7243
7244 if (pri_addr == address) {
7245 *priv_offset = tpc_offset;
7246 return 0;
7247 }
7248 }
7249 }
7250 } else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
7251 (addr_type == CTXSW_ADDR_TYPE_ETPC)) {
7252 if (g->ops.gr.get_egpc_base == NULL) {
7253 return -EINVAL;
7254 }
7255
7256 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
7257 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.etpc.count; i++) {
7258 reg = &g->gr.ctx_vars.ctxsw_regs.etpc.l[i];
7259 address = reg->addr;
7260 tpc_addr = pri_tpccs_addr_mask(address);
7261 base_address = g->ops.gr.get_egpc_base(g) +
7262 (gpc_num * gpc_stride) +
7263 tpc_in_gpc_base +
7264 (tpc_num * tpc_in_gpc_stride);
7265 address = base_address + tpc_addr;
7266 /*
7267 * The data for the TPCs is interleaved in the context buffer.
7268 * Example with num_tpcs = 2
7269 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
7270 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
7271 */
7272 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
7273
7274 if (pri_addr == address) {
7275 *priv_offset = tpc_offset;
7276 nvgpu_log(g,
7277 gpu_dbg_fn | gpu_dbg_gpu_dbg,
7278						"egpc/etpc priv_offset=0x%08x",
7279 *priv_offset);
7280 return 0;
7281 }
7282 }
7283 }
7284 }
7285
7286
7287 /* Process the PPC segment. */
7288 if (addr_type == CTXSW_ADDR_TYPE_PPC) {
7289 for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
7290 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
7291 reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i];
7292 address = reg->addr;
7293 ppc_addr = pri_ppccs_addr_mask(address);
7294 base_address = gpc_base +
7295 (gpc_num * gpc_stride) +
7296 ppc_in_gpc_base +
7297 (ppc_num * ppc_in_gpc_stride);
7298 address = base_address + ppc_addr;
7299 /*
7300 * The data for the PPCs is interleaved in the context buffer.
7301 * Example with numPpcs = 2
7302 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
7303 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
7304 */
7305 ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4);
7306
7307 if (pri_addr == address) {
7308 *priv_offset = ppc_offset;
7309 return 0;
7310 }
7311 }
7312 }
7313 }
7314
7315
7316 /* Process the GPC segment. */
7317 if (addr_type == CTXSW_ADDR_TYPE_GPC) {
7318 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
7319 reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i];
7320
7321 address = reg->addr;
7322 gpc_addr = pri_gpccs_addr_mask(address);
7323 gpc_offset = reg->index;
7324
7325 base_address = gpc_base + (gpc_num * gpc_stride);
7326 address = base_address + gpc_addr;
7327
7328 if (pri_addr == address) {
7329 *priv_offset = gpc_offset;
7330 return 0;
7331 }
7332 }
7333 }
7334 return -EINVAL;
7335}
7336
7337static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
7338 u8 *context,
7339 u32 *num_ppcs, u32 *ppc_mask,
7340 u32 *reg_ppc_count)
7341{
7342 u32 data32;
7343 u32 num_pes_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC);
7344
7345 /*
7346 * if there is only 1 PES_PER_GPC, then we put the PES registers
7347 * in the GPC reglist, so we can't error out if ppc.count == 0
7348 */
7349 if ((!g->gr.ctx_vars.valid) ||
7350 ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) &&
7351 (num_pes_per_gpc > 1))) {
7352 return -EINVAL;
7353 }
7354
7355 data32 = *(u32 *)(context + ctxsw_prog_local_image_ppc_info_o());
7356
7357 *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
7358 *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
7359
7360 *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count;
7361
7362 return 0;
7363}
7364
7365int gr_gk20a_get_offset_in_gpccs_segment(struct gk20a *g,
7366 enum ctxsw_addr_type addr_type,
7367 u32 num_tpcs,
7368 u32 num_ppcs,
7369 u32 reg_list_ppc_count,
7370 u32 *__offset_in_segment)
7371{
7372 u32 offset_in_segment = 0;
7373 struct gr_gk20a *gr = &g->gr;
7374
7375 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
7376 /*
7377 * reg = gr->ctx_vars.ctxsw_regs.tpc.l;
7378 * offset_in_segment = 0;
7379 */
7380 } else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
7381 (addr_type == CTXSW_ADDR_TYPE_ETPC)) {
7382 offset_in_segment =
7383 ((gr->ctx_vars.ctxsw_regs.tpc.count *
7384 num_tpcs) << 2);
7385
7386 nvgpu_log(g, gpu_dbg_info | gpu_dbg_gpu_dbg,
7387			"egpc etpc offset_in_segment 0x%08x",
7388 offset_in_segment);
7389 } else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
7390 /*
7391 * The ucode stores TPC data before PPC data.
7392 * Advance offset past TPC data to PPC data.
7393 */
7394 offset_in_segment =
7395 (((gr->ctx_vars.ctxsw_regs.tpc.count +
7396 gr->ctx_vars.ctxsw_regs.etpc.count) *
7397 num_tpcs) << 2);
7398 } else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
7399 /*
7400 * The ucode stores TPC/PPC data before GPC data.
7401 * Advance offset past TPC/PPC data to GPC data.
7402 *
7403 * Note 1 PES_PER_GPC case
7404 */
7405 u32 num_pes_per_gpc = nvgpu_get_litter_value(g,
7406 GPU_LIT_NUM_PES_PER_GPC);
7407 if (num_pes_per_gpc > 1) {
7408 offset_in_segment =
7409 ((((gr->ctx_vars.ctxsw_regs.tpc.count +
7410 gr->ctx_vars.ctxsw_regs.etpc.count) *
7411 num_tpcs) << 2) +
7412 ((reg_list_ppc_count * num_ppcs) << 2));
7413 } else {
7414 offset_in_segment =
7415 (((gr->ctx_vars.ctxsw_regs.tpc.count +
7416 gr->ctx_vars.ctxsw_regs.etpc.count) *
7417 num_tpcs) << 2);
7418 }
7419 } else {
7420 nvgpu_log_fn(g, "Unknown address type.");
7421 return -EINVAL;
7422 }
7423
7424 *__offset_in_segment = offset_in_segment;
7425 return 0;
7426}
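
/*
 * The offsets computed above assume the following order of register
 * blocks inside one GPCCS priv segment, with each block holding
 * num_<unit> interleaved copies of the corresponding ctxsw_regs list,
 * 4 bytes per register:
 *
 *	TPC regs | ETPC regs | PPC regs (only if PES_PER_GPC > 1) | GPC regs
 *
 * e.g. for CTXSW_ADDR_TYPE_GPC with more than one PES per GPC:
 *
 *	offset_in_segment = ((tpc.count + etpc.count) * num_tpcs) * 4 +
 *			    (reg_list_ppc_count * num_ppcs) * 4;
 */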
7427
7428/*
7429 * This function will return the 32 bit offset for a priv register if it is
7430 * present in the context buffer. The context buffer is in CPU memory.
7431 */
7432static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
7433 u32 addr,
7434 bool is_quad, u32 quad,
7435 u32 *context_buffer,
7436 u32 context_buffer_size,
7437 u32 *priv_offset)
7438{
7439 u32 i, data32;
7440 int err;
7441 enum ctxsw_addr_type addr_type;
7442 u32 broadcast_flags;
7443 u32 gpc_num, tpc_num, ppc_num, be_num;
7444 u32 num_gpcs, num_tpcs, num_ppcs;
7445 u32 offset;
7446 u32 sys_priv_offset, gpc_priv_offset;
7447 u32 ppc_mask, reg_list_ppc_count;
7448 u8 *context;
7449 u32 offset_to_segment, offset_in_segment = 0;
7450
7451 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
7452
7453 err = g->ops.gr.decode_priv_addr(g, addr, &addr_type,
7454 &gpc_num, &tpc_num, &ppc_num, &be_num,
7455 &broadcast_flags);
7456 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7457 "addr_type = %d, broadcast_flags: %08x",
7458 addr_type, broadcast_flags);
7459 if (err != 0) {
7460 return err;
7461 }
7462
7463 context = (u8 *)context_buffer;
7464 if (!check_main_image_header_magic(context)) {
7465 nvgpu_err(g,
7466 "Invalid main header: magic value");
7467 return -EINVAL;
7468 }
7469 num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
7470
7471 /* Parse the FECS local header. */
7472 context += ctxsw_prog_ucode_header_size_in_bytes();
7473 if (!check_local_header_magic(context)) {
7474 nvgpu_err(g,
7475 "Invalid FECS local header: magic value");
7476 return -EINVAL;
7477 }
7478 data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
7479 sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
7480 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "sys_priv_offset=0x%x", sys_priv_offset);
7481
7482 /* If found in Ext buffer, ok.
7483 * If it failed and we expected to find it there (quad offset)
7484 * then return the error. Otherwise continue on.
7485 */
7486 err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
7487 addr, is_quad, quad, context_buffer,
7488 context_buffer_size, priv_offset);
7489 if ((err == 0) || ((err != 0) && is_quad)) {
7490 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7491 "err = %d, is_quad = %s",
7492 err, is_quad ? "true" : "false");
7493 return err;
7494 }
7495
7496 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
7497 (addr_type == CTXSW_ADDR_TYPE_BE)) {
7498 /* Find the offset in the FECS segment. */
7499 offset_to_segment = sys_priv_offset *
7500 ctxsw_prog_ucode_header_size_in_bytes();
7501
7502 err = gr_gk20a_process_context_buffer_priv_segment(g,
7503 addr_type, addr,
7504 0, 0, 0, 0,
7505 &offset);
7506 if (err != 0) {
7507 return err;
7508 }
7509
7510 *priv_offset = (offset_to_segment + offset);
7511 return 0;
7512 }
7513
7514 if ((gpc_num + 1) > num_gpcs) {
7515 nvgpu_err(g,
7516 "GPC %d not in this context buffer.",
7517 gpc_num);
7518 return -EINVAL;
7519 }
7520
7521 /* Parse the GPCCS local header(s).*/
7522 for (i = 0; i < num_gpcs; i++) {
7523 context += ctxsw_prog_ucode_header_size_in_bytes();
7524 if (!check_local_header_magic(context)) {
7525 nvgpu_err(g,
7526 "Invalid GPCCS local header: magic value");
7527 return -EINVAL;
7528
7529 }
7530 data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
7531 gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
7532
7533 err = gr_gk20a_determine_ppc_configuration(g, context,
7534 &num_ppcs, &ppc_mask,
7535 &reg_list_ppc_count);
7536 if (err != 0) {
7537 nvgpu_err(g, "determine ppc configuration failed");
7538 return err;
7539 }
7540
7541
7542 num_tpcs = *(u32 *)(context + ctxsw_prog_local_image_num_tpcs_o());
7543
7544 if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
7545 nvgpu_err(g,
7546 "GPC %d TPC %d not in this context buffer.",
7547 gpc_num, tpc_num);
7548 return -EINVAL;
7549 }
7550
7551 /* Find the offset in the GPCCS segment.*/
7552 if (i == gpc_num) {
7553 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7554				"gpc_priv_offset 0x%08x",
7555 gpc_priv_offset);
7556 offset_to_segment = gpc_priv_offset *
7557 ctxsw_prog_ucode_header_size_in_bytes();
7558
7559 err = g->ops.gr.get_offset_in_gpccs_segment(g,
7560 addr_type,
7561 num_tpcs, num_ppcs, reg_list_ppc_count,
7562 &offset_in_segment);
7563 if (err != 0) {
7564 return -EINVAL;
7565 }
7566
7567 offset_to_segment += offset_in_segment;
7568 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7569				"offset_to_segment 0x%08x",
7570 offset_to_segment);
7571
7572 err = gr_gk20a_process_context_buffer_priv_segment(g,
7573 addr_type, addr,
7574 i, num_tpcs,
7575 num_ppcs, ppc_mask,
7576 &offset);
7577 if (err != 0) {
7578 return -EINVAL;
7579 }
7580
7581 *priv_offset = offset_to_segment + offset;
7582 return 0;
7583 }
7584 }
7585
7586 return -EINVAL;
7587}
7588
7589static int map_cmp(const void *a, const void *b)
7590{
7591 struct ctxsw_buf_offset_map_entry *e1 =
7592 (struct ctxsw_buf_offset_map_entry *)a;
7593 struct ctxsw_buf_offset_map_entry *e2 =
7594 (struct ctxsw_buf_offset_map_entry *)b;
7595
7596 if (e1->addr < e2->addr) {
7597 return -1;
7598 }
7599
7600 if (e1->addr > e2->addr) {
7601 return 1;
7602 }
7603 return 0;
7604}
7605
7606static int add_ctxsw_buffer_map_entries_pmsys(struct ctxsw_buf_offset_map_entry *map,
7607 struct aiv_list_gk20a *regs,
7608 u32 *count, u32 *offset,
7609 u32 max_cnt, u32 base, u32 mask)
7610{
7611 u32 idx;
7612 u32 cnt = *count;
7613 u32 off = *offset;
7614
7615 if ((cnt + regs->count) > max_cnt) {
7616 return -EINVAL;
7617 }
7618
7619 for (idx = 0; idx < regs->count; idx++) {
7620 if ((base + (regs->l[idx].addr & mask)) < 0xFFF) {
7621 map[cnt].addr = base + (regs->l[idx].addr & mask)
7622 + NV_PCFG_BASE;
7623 } else {
7624 map[cnt].addr = base + (regs->l[idx].addr & mask);
7625 }
7626 map[cnt++].offset = off;
7627 off += 4;
7628 }
7629 *count = cnt;
7630 *offset = off;
7631 return 0;
7632}
7633
7634static int add_ctxsw_buffer_map_entries_pmgpc(struct gk20a *g,
7635 struct ctxsw_buf_offset_map_entry *map,
7636 struct aiv_list_gk20a *regs,
7637 u32 *count, u32 *offset,
7638 u32 max_cnt, u32 base, u32 mask)
7639{
7640 u32 idx;
7641 u32 cnt = *count;
7642 u32 off = *offset;
7643
7644 if ((cnt + regs->count) > max_cnt) {
7645 return -EINVAL;
7646 }
7647
7648	/* NOTE: The PPC offsets get added to the pm_gpc list if numPpc <= 1.
7649	 * To handle the case of PPC registers being folded into the GPC list,
7650	 * the code below specifically checks for any PPC offsets and adds
7651	 * them using the proper mask.
7652 */
7653 for (idx = 0; idx < regs->count; idx++) {
7654 /* Check if the address is PPC address */
7655 if (pri_is_ppc_addr_shared(g, regs->l[idx].addr & mask)) {
7656 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g,
7657 GPU_LIT_PPC_IN_GPC_BASE);
7658 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g,
7659 GPU_LIT_PPC_IN_GPC_STRIDE);
7660 /* Use PPC mask instead of the GPC mask provided */
7661 u32 ppcmask = ppc_in_gpc_stride - 1;
7662
7663 map[cnt].addr = base + ppc_in_gpc_base
7664 + (regs->l[idx].addr & ppcmask);
7665 } else {
7666 map[cnt].addr = base + (regs->l[idx].addr & mask);
7667 }
7668 map[cnt++].offset = off;
7669 off += 4;
7670 }
7671 *count = cnt;
7672 *offset = off;
7673 return 0;
7674}
7675
7676static int add_ctxsw_buffer_map_entries(struct ctxsw_buf_offset_map_entry *map,
7677 struct aiv_list_gk20a *regs,
7678 u32 *count, u32 *offset,
7679 u32 max_cnt, u32 base, u32 mask)
7680{
7681 u32 idx;
7682 u32 cnt = *count;
7683 u32 off = *offset;
7684
7685 if ((cnt + regs->count) > max_cnt) {
7686 return -EINVAL;
7687 }
7688
7689 for (idx = 0; idx < regs->count; idx++) {
7690 map[cnt].addr = base + (regs->l[idx].addr & mask);
7691 map[cnt++].offset = off;
7692 off += 4;
7693 }
7694 *count = cnt;
7695 *offset = off;
7696 return 0;
7697}
7698
7699/* Helper function to add register entries to the register map for all
7700 * subunits
7701 */
7702static int add_ctxsw_buffer_map_entries_subunits(
7703 struct ctxsw_buf_offset_map_entry *map,
7704 struct aiv_list_gk20a *regs,
7705 u32 *count, u32 *offset,
7706 u32 max_cnt, u32 base,
7707 u32 num_units, u32 stride, u32 mask)
7708{
7709 u32 unit;
7710 u32 idx;
7711 u32 cnt = *count;
7712 u32 off = *offset;
7713
7714 if ((cnt + (regs->count * num_units)) > max_cnt) {
7715 return -EINVAL;
7716 }
7717
7718 /* Data is interleaved for units in ctxsw buffer */
7719 for (idx = 0; idx < regs->count; idx++) {
7720 for (unit = 0; unit < num_units; unit++) {
7721 map[cnt].addr = base + (regs->l[idx].addr & mask) +
7722 (unit * stride);
7723 map[cnt++].offset = off;
7724 off += 4;
7725 }
7726 }
7727 *count = cnt;
7728 *offset = off;
7729 return 0;
7730}
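/*
 * Worked example (illustrative only, not from the original source): for a
 * two-register list with num_units = 3, stride S and base B, the loop above
 * emits entries unit-major for each register, so consecutive 4-byte words in
 * the ctxsw buffer hold the same register for successive units. Writing r0
 * and r1 for regs->l[0].addr & mask and regs->l[1].addr & mask:
 *
 *   map[cnt + 0] = { .addr = B + r0,         .offset = off + 0x0 }
 *   map[cnt + 1] = { .addr = B + r0 + 1 * S, .offset = off + 0x4 }
 *   map[cnt + 2] = { .addr = B + r0 + 2 * S, .offset = off + 0x8 }
 *   map[cnt + 3] = { .addr = B + r1,         .offset = off + 0xc }
 *   ... and so on for the remaining (register, unit) pairs.
 */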
7731
7732int gr_gk20a_add_ctxsw_reg_pm_fbpa(struct gk20a *g,
7733 struct ctxsw_buf_offset_map_entry *map,
7734 struct aiv_list_gk20a *regs,
7735 u32 *count, u32 *offset,
7736 u32 max_cnt, u32 base,
7737 u32 num_fbpas, u32 stride, u32 mask)
7738{
7739 return add_ctxsw_buffer_map_entries_subunits(map, regs, count, offset,
7740 max_cnt, base, num_fbpas, stride, mask);
7741}
7742
7743static int add_ctxsw_buffer_map_entries_gpcs(struct gk20a *g,
7744 struct ctxsw_buf_offset_map_entry *map,
7745 u32 *count, u32 *offset, u32 max_cnt)
7746{
7747 u32 num_gpcs = g->gr.gpc_count;
7748 u32 num_ppcs, num_tpcs, gpc_num, base;
7749 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
7750 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
7751 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE);
7752 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
7753 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
7754 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
7755
7756 for (gpc_num = 0; gpc_num < num_gpcs; gpc_num++) {
7757 num_tpcs = g->gr.gpc_tpc_count[gpc_num];
7758 base = gpc_base + (gpc_stride * gpc_num) + tpc_in_gpc_base;
7759 if (add_ctxsw_buffer_map_entries_subunits(map,
7760 &g->gr.ctx_vars.ctxsw_regs.pm_tpc,
7761 count, offset, max_cnt, base, num_tpcs,
7762 tpc_in_gpc_stride,
7763 (tpc_in_gpc_stride - 1))) {
7764 return -EINVAL;
7765 }
7766
7767 num_ppcs = g->gr.gpc_ppc_count[gpc_num];
7768 base = gpc_base + (gpc_stride * gpc_num) + ppc_in_gpc_base;
7769 if (add_ctxsw_buffer_map_entries_subunits(map,
7770 &g->gr.ctx_vars.ctxsw_regs.pm_ppc,
7771 count, offset, max_cnt, base, num_ppcs,
7772 ppc_in_gpc_stride,
7773 (ppc_in_gpc_stride - 1))) {
7774 return -EINVAL;
7775 }
7776
7777 base = gpc_base + (gpc_stride * gpc_num);
7778 if (add_ctxsw_buffer_map_entries_pmgpc(g, map,
7779 &g->gr.ctx_vars.ctxsw_regs.pm_gpc,
7780 count, offset, max_cnt, base,
7781 (gpc_stride - 1))) {
7782 return -EINVAL;
7783 }
7784
7785 base = NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE * gpc_num;
7786 if (add_ctxsw_buffer_map_entries(map,
7787 &g->gr.ctx_vars.ctxsw_regs.pm_ucgpc,
7788 count, offset, max_cnt, base, ~0)) {
7789 return -EINVAL;
7790 }
7791
7792 base = (g->ops.gr.get_pmm_per_chiplet_offset() * gpc_num);
7793 if (add_ctxsw_buffer_map_entries(map,
7794 &g->gr.ctx_vars.ctxsw_regs.perf_gpc,
7795 count, offset, max_cnt, base, ~0)) {
7796 return -EINVAL;
7797 }
7798
7799 base = (NV_PERF_PMMGPCROUTER_STRIDE * gpc_num);
7800 if (add_ctxsw_buffer_map_entries(map,
7801 &g->gr.ctx_vars.ctxsw_regs.gpc_router,
7802 count, offset, max_cnt, base, ~0)) {
7803 return -EINVAL;
7804 }
7805
7806 /* Counter Aggregation Unit, if available */
7807 if (g->gr.ctx_vars.ctxsw_regs.pm_cau.count) {
7808 base = gpc_base + (gpc_stride * gpc_num)
7809 + tpc_in_gpc_base;
7810 if (add_ctxsw_buffer_map_entries_subunits(map,
7811 &g->gr.ctx_vars.ctxsw_regs.pm_cau,
7812 count, offset, max_cnt, base, num_tpcs,
7813 tpc_in_gpc_stride,
7814 (tpc_in_gpc_stride - 1))) {
7815 return -EINVAL;
7816 }
7817 }
7818
7819 *offset = ALIGN(*offset, 256);
7820 }
7821 return 0;
7822}
7823
7824int gr_gk20a_add_ctxsw_reg_perf_pma(struct ctxsw_buf_offset_map_entry *map,
7825 struct aiv_list_gk20a *regs,
7826 u32 *count, u32 *offset,
7827 u32 max_cnt, u32 base, u32 mask)
7828{
7829 return add_ctxsw_buffer_map_entries(map, regs,
7830 count, offset, max_cnt, base, mask);
7831}
7832
7833/*
7834 * PM CTXSW BUFFER LAYOUT :
7835 *|---------------------------------------------|0x00 <----PM CTXSW BUFFER BASE
7836 *| |
7837 *| LIST_compressed_pm_ctx_reg_SYS |Space allocated: numRegs words
7838 *|---------------------------------------------|
7839 *| |
7840 *| LIST_compressed_nv_perf_ctx_reg_SYS |Space allocated: numRegs words
7841 *|---------------------------------------------|
7842 *| |
7843 *| LIST_compressed_nv_perf_ctx_reg_sysrouter|Space allocated: numRegs words
7844 *|---------------------------------------------|
7845 *| |
7846 *| LIST_compressed_nv_perf_ctx_reg_PMA |Space allocated: numRegs words
7847 *|---------------------------------------------|
7848 *| PADDING for 256 byte alignment |
7849 *|---------------------------------------------|<----256 byte aligned
7850 *| LIST_compressed_nv_perf_fbp_ctx_regs |
7851 *| |Space allocated: numRegs * n words (for n FB units)
7852 *|---------------------------------------------|
7853 *| LIST_compressed_nv_perf_fbprouter_ctx_regs |
7854 *| |Space allocated: numRegs * n words (for n FB units)
7855 *|---------------------------------------------|
7856 *| LIST_compressed_pm_fbpa_ctx_regs |
7857 *| |Space allocated: numRegs * n words (for n FB units)
7858 *|---------------------------------------------|
7859 *| LIST_compressed_pm_rop_ctx_regs |
7860 *|---------------------------------------------|
7861 *| LIST_compressed_pm_ltc_ctx_regs |
7862 *| LTC0 LTS0 |
7863 *| LTC1 LTS0 |Space allocated: numRegs * n words (for n LTC units)
7864 *| LTCn LTS0 |
7865 *| LTC0 LTS1 |
7866 *| LTC1 LTS1 |
7867 *| LTCn LTS1 |
7868 *| LTC0 LTSn |
7869 *| LTC1 LTSn |
7870 *| LTCn LTSn |
7871 *|---------------------------------------------|
7872 *| PADDING for 256 byte alignment |
7873 *|---------------------------------------------|<----256 byte aligned
7874 *| GPC0 REG0 TPC0 |Each GPC has space allocated to accommodate
7875 *| REG0 TPC1 | all the GPC/TPC register lists
7876 *| Lists in each GPC region: REG0 TPCn |Per GPC allocated space is always 256 byte aligned
7877 *| LIST_pm_ctx_reg_TPC REG1 TPC0 |
7878 *| * numTpcs REG1 TPC1 |
7879 *| LIST_pm_ctx_reg_PPC REG1 TPCn |
7880 *| * numPpcs REGn TPC0 |
7881 *| LIST_pm_ctx_reg_GPC REGn TPC1 |
7882 *| List_pm_ctx_reg_uc_GPC REGn TPCn |
7883 *| LIST_nv_perf_ctx_reg_GPC |
7884 *| LIST_nv_perf_gpcrouter_ctx_reg |
7885 *| LIST_nv_perf_ctx_reg_CAU |
7886 *| ---- |--
7887 *| GPC1 . |
7888 *| . |<----
7889 *|---------------------------------------------|
7890 *= =
7891 *| GPCn |
7892 *= =
7893 *|---------------------------------------------|
7894 */
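/*
 * Worked example (illustrative only): the ALIGN(offset, 256) steps in the
 * code below implement the "PADDING for 256 byte alignment" rows above.
 * For instance, if the SYS/PMA lists end at byte offset 0x2f4, the FBP
 * section that follows starts at ALIGN(0x2f4, 256) = 0x300; likewise each
 * per-GPC region is padded up to the next 256-byte boundary.
 */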
7895
7896static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g)
7897{
7898 u32 hwpm_ctxsw_buffer_size = g->gr.ctx_vars.pm_ctxsw_image_size;
7899 u32 hwpm_ctxsw_reg_count_max;
7900 u32 map_size;
7901 u32 i, count = 0;
7902 u32 offset = 0;
7903 struct ctxsw_buf_offset_map_entry *map;
7904 u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE);
7905 u32 num_fbpas = nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS);
7906 u32 fbpa_stride = nvgpu_get_litter_value(g, GPU_LIT_FBPA_STRIDE);
7907 u32 num_ltc = g->ops.gr.get_max_ltc_per_fbp(g) * g->gr.num_fbps;
7908
7909 if (hwpm_ctxsw_buffer_size == 0) {
7910 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
7911 "no PM Ctxsw buffer memory in context buffer");
7912 return -EINVAL;
7913 }
7914
7915 hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2;
7916 map_size = hwpm_ctxsw_reg_count_max * sizeof(*map);
7917
7918 map = nvgpu_big_zalloc(g, map_size);
7919 if (map == NULL) {
7920 return -ENOMEM;
7921 }
7922
7923 /* Add entries from _LIST_pm_ctx_reg_SYS */
7924 if (add_ctxsw_buffer_map_entries_pmsys(map, &g->gr.ctx_vars.ctxsw_regs.pm_sys,
7925 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
7926 goto cleanup;
7927 }
7928
7929 /* Add entries from _LIST_nv_perf_ctx_reg_SYS */
7930 if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.perf_sys,
7931 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
7932 goto cleanup;
7933 }
7934
7935 /* Add entries from _LIST_nv_perf_sysrouter_ctx_reg*/
7936 if (add_ctxsw_buffer_map_entries(map, &g->gr.ctx_vars.ctxsw_regs.perf_sys_router,
7937 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
7938 goto cleanup;
7939 }
7940
7941 /* Add entries from _LIST_nv_perf_pma_ctx_reg*/
7942 if (g->ops.gr.add_ctxsw_reg_perf_pma(map, &g->gr.ctx_vars.ctxsw_regs.perf_pma,
7943 &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~0)) {
7944 goto cleanup;
7945 }
7946
7947 offset = ALIGN(offset, 256);
7948
7949 /* Add entries from _LIST_nv_perf_fbp_ctx_regs */
7950 if (add_ctxsw_buffer_map_entries_subunits(map,
7951 &g->gr.ctx_vars.ctxsw_regs.fbp,
7952 &count, &offset,
7953 hwpm_ctxsw_reg_count_max, 0,
7954 g->gr.num_fbps,
7955 g->ops.gr.get_pmm_per_chiplet_offset(),
7956 ~0)) {
7957 goto cleanup;
7958 }
7959
7960 /* Add entries from _LIST_nv_perf_fbprouter_ctx_regs */
7961 if (add_ctxsw_buffer_map_entries_subunits(map,
7962 &g->gr.ctx_vars.ctxsw_regs.fbp_router,
7963 &count, &offset,
7964 hwpm_ctxsw_reg_count_max, 0, g->gr.num_fbps,
7965 NV_PERF_PMM_FBP_ROUTER_STRIDE, ~0)) {
7966 goto cleanup;
7967 }
7968
7969 /* Add entries from _LIST_nv_pm_fbpa_ctx_regs */
7970 if (g->ops.gr.add_ctxsw_reg_pm_fbpa(g, map,
7971 &g->gr.ctx_vars.ctxsw_regs.pm_fbpa,
7972 &count, &offset,
7973 hwpm_ctxsw_reg_count_max, 0,
7974 num_fbpas, fbpa_stride, ~0)) {
7975 goto cleanup;
7976 }
7977
7978 /* Add entries from _LIST_nv_pm_rop_ctx_regs */
7979 if (add_ctxsw_buffer_map_entries(map,
7980 &g->gr.ctx_vars.ctxsw_regs.pm_rop,
7981 &count, &offset,
7982 hwpm_ctxsw_reg_count_max, 0, ~0)) {
7983 goto cleanup;
7984 }
7985
7986 /* Add entries from _LIST_compressed_nv_pm_ltc_ctx_regs */
7987 if (add_ctxsw_buffer_map_entries_subunits(map,
7988 &g->gr.ctx_vars.ctxsw_regs.pm_ltc,
7989 &count, &offset,
7990 hwpm_ctxsw_reg_count_max, 0,
7991 num_ltc, ltc_stride, ~0)) {
7992 goto cleanup;
7993 }
7994
7995 offset = ALIGN(offset, 256);
7996
7997 /* Add GPC entries */
7998 if (add_ctxsw_buffer_map_entries_gpcs(g, map, &count, &offset,
7999 hwpm_ctxsw_reg_count_max)) {
8000 goto cleanup;
8001 }
8002
8003 if (offset > hwpm_ctxsw_buffer_size) {
8004 nvgpu_err(g, "offset > buffer size");
8005 goto cleanup;
8006 }
8007
8008 sort(map, count, sizeof(*map), map_cmp, NULL);
8009
8010 g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map = map;
8011 g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map_count = count;
8012
8013 nvgpu_log_info(g, "Reg Addr => HWPM Ctxt switch buffer offset");
8014
8015 for (i = 0; i < count; i++) {
8016 nvgpu_log_info(g, "%08x => %08x", map[i].addr, map[i].offset);
8017 }
8018
8019 return 0;
8020cleanup:
8021 nvgpu_err(g, "Failed to create HWPM buffer offset map");
8022 nvgpu_big_free(g, map);
8023 return -EINVAL;
8024}
8025
8026/*
8027 * This function will return the 32 bit offset for a priv register if it is
8028 * present in the PM context buffer.
8029 */
8030static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g,
8031 u32 addr,
8032 u32 *priv_offset)
8033{
8034 struct gr_gk20a *gr = &g->gr;
8035 int err = 0;
8036 u32 count;
8037 struct ctxsw_buf_offset_map_entry *map, *result, map_key;
8038
8039 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
8040
8041 /* Create map of pri address and pm offset if necessary */
8042 if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map == NULL) {
8043 err = gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(g);
8044 if (err != 0) {
8045 return err;
8046 }
8047 }
8048
8049 *priv_offset = 0;
8050
8051 map = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map;
8052 count = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map_count;
8053
8054 map_key.addr = addr;
8055 result = bsearch(&map_key, map, count, sizeof(*map), map_cmp);
8056
8057 if (result) {
8058 *priv_offset = result->offset;
8059 } else {
8060 nvgpu_err(g, "Lookup failed for address 0x%x", addr);
8061 err = -EINVAL;
8062 }
8063 return err;
8064}
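/*
 * Illustrative sketch only (not part of the original driver, hence guarded
 * out with #if 0): how a caller within this file could resolve a priv
 * register address to its byte offset inside the PM ctxsw buffer using the
 * sorted map that is built and bsearch()ed above. The function name is
 * hypothetical.
 */
#if 0
static int example_pm_offset_lookup(struct gk20a *g, u32 reg_addr)
{
	u32 pm_offset = 0U;
	int err;

	/* Builds the offset map on first use, then binary-searches it. */
	err = gr_gk20a_find_priv_offset_in_pm_buffer(g, reg_addr, &pm_offset);
	if (err != 0) {
		return err;
	}

	/* pm_offset now holds the buffer offset of reg_addr's saved value. */
	nvgpu_log_info(g, "0x%08x => 0x%08x", reg_addr, pm_offset);
	return 0;
}
#endif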
8065
8066bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch)
8067{
8068 int curr_gr_ctx;
8069 u32 curr_gr_tsgid;
8070 struct gk20a *g = ch->g;
8071 struct channel_gk20a *curr_ch;
8072 bool ret = false;
8073 struct tsg_gk20a *tsg;
8074
8075 curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
8076
8077 /* when contexts are unloaded from GR, the valid bit is reset
8078 * but the instance pointer information remains intact. So the
8079 * valid bit must be checked to be absolutely certain that a
8080 * valid context is currently resident.
8081 */
8082 if (gr_fecs_current_ctx_valid_v(curr_gr_ctx) == 0U) {
8083		return false;
8084 }
8085
8086 curr_ch = gk20a_gr_get_channel_from_ctx(g, curr_gr_ctx,
8087 &curr_gr_tsgid);
8088
8089 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
8090 "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d"
8091 " ch->chid=%d",
8092 (curr_ch != NULL) ? curr_ch->chid : U32_MAX,
8093 curr_gr_tsgid,
8094 ch->tsgid,
8095 ch->chid);
8096
8097 if (curr_ch == NULL) {
8098 return false;
8099 }
8100
8101 if (ch->chid == curr_ch->chid) {
8102 ret = true;
8103 }
8104
8105 tsg = tsg_gk20a_from_ch(ch);
8106 if ((tsg != NULL) && (tsg->tsgid == curr_gr_tsgid)) {
8107 ret = true;
8108 }
8109
8110 gk20a_channel_put(curr_ch);
8111 return ret;
8112}
8113
8114int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
8115 struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
8116 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
8117 bool ch_is_curr_ctx)
8118{
8119 struct gk20a *g = ch->g;
8120 struct tsg_gk20a *tsg;
8121 struct nvgpu_gr_ctx *gr_ctx;
8122 bool gr_ctx_ready = false;
8123 bool pm_ctx_ready = false;
8124 struct nvgpu_mem *current_mem = NULL;
8125 u32 i, j, offset, v;
8126 struct gr_gk20a *gr = &g->gr;
8127 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
8128 u32 max_offsets = gr->max_gpc_count * gr->max_tpc_per_gpc_count *
8129 sm_per_tpc;
8130 u32 *offsets = NULL;
8131 u32 *offset_addrs = NULL;
8132 u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
8133 int err = 0, pass;
8134
8135 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
8136 num_ctx_wr_ops, num_ctx_rd_ops);
8137
8138 tsg = tsg_gk20a_from_ch(ch);
8139 if (tsg == NULL) {
8140 return -EINVAL;
8141 }
8142
8143 gr_ctx = &tsg->gr_ctx;
8144
8145 if (ch_is_curr_ctx) {
8146 for (pass = 0; pass < 2; pass++) {
8147 ctx_op_nr = 0;
8148 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
8149 /* only do ctx ops and only on the right pass */
8150 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
8151 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
8152 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) {
8153 continue;
8154 }
8155
8156 /* if this is a quad access, setup for special access*/
8157 if ((ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD))
8158 && (g->ops.gr.access_smpc_reg != NULL)) {
8159 g->ops.gr.access_smpc_reg(g,
8160 ctx_ops[i].quad,
8161 ctx_ops[i].offset);
8162 }
8163 offset = ctx_ops[i].offset;
8164
8165 if (pass == 0) { /* write pass */
8166 v = gk20a_readl(g, offset);
8167 v &= ~ctx_ops[i].and_n_mask_lo;
8168 v |= ctx_ops[i].value_lo;
8169 gk20a_writel(g, offset, v);
8170
8171 nvgpu_log(g, gpu_dbg_gpu_dbg,
8172 "direct wr: offset=0x%x v=0x%x",
8173 offset, v);
8174
8175 if (ctx_ops[i].op == REGOP(WRITE_64)) {
8176 v = gk20a_readl(g, offset + 4);
8177 v &= ~ctx_ops[i].and_n_mask_hi;
8178 v |= ctx_ops[i].value_hi;
8179 gk20a_writel(g, offset + 4, v);
8180
8181 nvgpu_log(g, gpu_dbg_gpu_dbg,
8182 "direct wr: offset=0x%x v=0x%x",
8183 offset + 4, v);
8184 }
8185
8186 } else { /* read pass */
8187 ctx_ops[i].value_lo =
8188 gk20a_readl(g, offset);
8189
8190 nvgpu_log(g, gpu_dbg_gpu_dbg,
8191 "direct rd: offset=0x%x v=0x%x",
8192 offset, ctx_ops[i].value_lo);
8193
8194 if (ctx_ops[i].op == REGOP(READ_64)) {
8195 ctx_ops[i].value_hi =
8196 gk20a_readl(g, offset + 4);
8197
8198 nvgpu_log(g, gpu_dbg_gpu_dbg,
8199 "direct rd: offset=0x%x v=0x%x",
8200						   offset + 4, ctx_ops[i].value_hi);
8201 } else {
8202 ctx_ops[i].value_hi = 0;
8203 }
8204 }
8205 ctx_op_nr++;
8206 }
8207 }
8208 goto cleanup;
8209 }
8210
8211 /* they're the same size, so just use one alloc for both */
8212 offsets = nvgpu_kzalloc(g, 2 * sizeof(u32) * max_offsets);
8213 if (offsets == NULL) {
8214 err = -ENOMEM;
8215 goto cleanup;
8216 }
8217 offset_addrs = offsets + max_offsets;
8218
8219 err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, false);
8220 if (err != 0) {
8221 goto cleanup;
8222 }
8223
8224 g->ops.mm.l2_flush(g, true);
8225
8226 /* write to appropriate place in context image,
8227 * first have to figure out where that really is */
8228
8229 /* first pass is writes, second reads */
8230 for (pass = 0; pass < 2; pass++) {
8231 ctx_op_nr = 0;
8232 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
8233 u32 num_offsets;
8234
8235 /* only do ctx ops and only on the right pass */
8236 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
8237 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
8238 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) {
8239 continue;
8240 }
8241
8242 err = gr_gk20a_get_ctx_buffer_offsets(g,
8243 ctx_ops[i].offset,
8244 max_offsets,
8245 offsets, offset_addrs,
8246 &num_offsets,
8247 ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
8248 ctx_ops[i].quad);
8249 if (err == 0) {
8250 if (!gr_ctx_ready) {
8251 gr_ctx_ready = true;
8252 }
8253 current_mem = &gr_ctx->mem;
8254 } else {
8255 err = gr_gk20a_get_pm_ctx_buffer_offsets(g,
8256 ctx_ops[i].offset,
8257 max_offsets,
8258 offsets, offset_addrs,
8259 &num_offsets);
8260 if (err != 0) {
8261 nvgpu_log(g, gpu_dbg_gpu_dbg,
8262 "ctx op invalid offset: offset=0x%x",
8263 ctx_ops[i].offset);
8264 ctx_ops[i].status =
8265 REGOP(STATUS_INVALID_OFFSET);
8266 continue;
8267 }
8268 if (!pm_ctx_ready) {
8269 /* Make sure ctx buffer was initialized */
8270 if (!nvgpu_mem_is_valid(&gr_ctx->pm_ctx.mem)) {
8271 nvgpu_err(g,
8272 "Invalid ctx buffer");
8273 err = -EINVAL;
8274 goto cleanup;
8275 }
8276 pm_ctx_ready = true;
8277 }
8278 current_mem = &gr_ctx->pm_ctx.mem;
8279 }
8280
8281 /* if this is a quad access, setup for special access*/
8282 if ((ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)) &&
8283 (g->ops.gr.access_smpc_reg != NULL)) {
8284 g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad,
8285 ctx_ops[i].offset);
8286 }
8287
8288 for (j = 0; j < num_offsets; j++) {
8289				/* sanity check gr ctx offsets so that, worst
8290				 * case, we never write outside the buffer
8291 */
8292 if ((current_mem == &gr_ctx->mem) &&
8293 (offsets[j] >= g->gr.ctx_vars.golden_image_size)) {
8294 continue;
8295 }
8296 if (pass == 0) { /* write pass */
8297 v = nvgpu_mem_rd(g, current_mem, offsets[j]);
8298 v &= ~ctx_ops[i].and_n_mask_lo;
8299 v |= ctx_ops[i].value_lo;
8300 nvgpu_mem_wr(g, current_mem, offsets[j], v);
8301
8302 nvgpu_log(g, gpu_dbg_gpu_dbg,
8303 "context wr: offset=0x%x v=0x%x",
8304 offsets[j], v);
8305
8306 if (ctx_ops[i].op == REGOP(WRITE_64)) {
8307 v = nvgpu_mem_rd(g, current_mem, offsets[j] + 4);
8308 v &= ~ctx_ops[i].and_n_mask_hi;
8309 v |= ctx_ops[i].value_hi;
8310 nvgpu_mem_wr(g, current_mem, offsets[j] + 4, v);
8311
8312 nvgpu_log(g, gpu_dbg_gpu_dbg,
8313 "context wr: offset=0x%x v=0x%x",
8314 offsets[j] + 4, v);
8315 }
8316
8317				/* check to see if we need to add a special
8318				 * WAR for some of the SMPC perf regs */
8319 gr_gk20a_ctx_patch_smpc(g, ch, offset_addrs[j],
8320 v, current_mem);
8321
8322 } else { /* read pass */
8323 ctx_ops[i].value_lo =
8324 nvgpu_mem_rd(g, current_mem, offsets[0]);
8325
8326 nvgpu_log(g, gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
8327 offsets[0], ctx_ops[i].value_lo);
8328
8329 if (ctx_ops[i].op == REGOP(READ_64)) {
8330 ctx_ops[i].value_hi =
8331 nvgpu_mem_rd(g, current_mem, offsets[0] + 4);
8332
8333 nvgpu_log(g, gpu_dbg_gpu_dbg,
8334 "context rd: offset=0x%x v=0x%x",
8335 offsets[0] + 4, ctx_ops[i].value_hi);
8336 } else {
8337 ctx_ops[i].value_hi = 0;
8338 }
8339 }
8340 }
8341 ctx_op_nr++;
8342 }
8343 }
8344
8345 cleanup:
8346 if (offsets) {
8347 nvgpu_kfree(g, offsets);
8348 }
8349
8350 if (gr_ctx->patch_ctx.mem.cpu_va) {
8351 gr_gk20a_ctx_patch_write_end(g, gr_ctx, gr_ctx_ready);
8352 }
8353
8354 return err;
8355}
8356
8357int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
8358 struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
8359 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
8360 bool *is_curr_ctx)
8361{
8362 struct gk20a *g = ch->g;
8363 int err, tmp_err;
8364 bool ch_is_curr_ctx;
8365
8366 /* disable channel switching.
8367 * at that point the hardware state can be inspected to
8368 * determine if the context we're interested in is current.
8369 */
8370 err = gr_gk20a_disable_ctxsw(g);
8371 if (err != 0) {
8372 nvgpu_err(g, "unable to stop gr ctxsw");
8373 /* this should probably be ctx-fatal... */
8374 return err;
8375 }
8376
8377 ch_is_curr_ctx = gk20a_is_channel_ctx_resident(ch);
8378 if (is_curr_ctx != NULL) {
8379 *is_curr_ctx = ch_is_curr_ctx;
8380 }
8381 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d",
8382 ch_is_curr_ctx);
8383
8384 err = __gr_gk20a_exec_ctx_ops(ch, ctx_ops, num_ops, num_ctx_wr_ops,
8385 num_ctx_rd_ops, ch_is_curr_ctx);
8386
8387 tmp_err = gr_gk20a_enable_ctxsw(g);
8388 if (tmp_err) {
8389 nvgpu_err(g, "unable to restart ctxsw!");
8390 err = tmp_err;
8391 }
8392
8393 return err;
8394}
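/*
 * Illustrative sketch only (not part of the original driver, hence guarded
 * out with #if 0): issuing a single context-register read through
 * gr_gk20a_exec_ctx_ops(). REGOP(READ_32) is assumed to exist alongside the
 * READ_64/WRITE_32 variants used above, and reg_offset is a placeholder.
 */
#if 0
static int example_read_ctx_reg(struct channel_gk20a *ch, u32 reg_offset,
				u32 *value)
{
	struct nvgpu_dbg_reg_op op;
	bool is_curr_ctx = false;
	int err;

	memset(&op, 0, sizeof(op));
	op.op = REGOP(READ_32);
	op.type = REGOP(TYPE_GR_CTX);
	op.offset = reg_offset;

	/* one op in total: zero context writes, one context read */
	err = gr_gk20a_exec_ctx_ops(ch, &op, 1, 0, 1, &is_curr_ctx);
	if (err == 0) {
		*value = op.value_lo;
	}

	return err;
}
#endif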
8395
8396void gr_gk20a_commit_global_pagepool(struct gk20a *g,
8397 struct nvgpu_gr_ctx *gr_ctx,
8398 u64 addr, u32 size, bool patch)
8399{
8400 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_base_r(),
8401 gr_scc_pagepool_base_addr_39_8_f(addr), patch);
8402
8403 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_r(),
8404 gr_scc_pagepool_total_pages_f(size) |
8405 gr_scc_pagepool_valid_true_f(), patch);
8406
8407 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_base_r(),
8408 gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
8409
8410 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_r(),
8411 gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
8412
8413 gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_pagepool_r(),
8414 gr_pd_pagepool_total_pages_f(size) |
8415 gr_pd_pagepool_valid_true_f(), patch);
8416}
8417
8418void gk20a_init_gr(struct gk20a *g)
8419{
8420 nvgpu_cond_init(&g->gr.init_wq);
8421}
8422
8423int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
8424 u32 global_esr_mask, bool check_errors)
8425{
8426 bool locked_down;
8427 bool no_error_pending;
8428 u32 delay = GR_IDLE_CHECK_DEFAULT;
8429 bool mmu_debug_mode_enabled = g->ops.fb.is_debug_mode_enabled(g);
8430 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8431 u32 dbgr_status0 = 0, dbgr_control0 = 0;
8432 u64 warps_valid = 0, warps_paused = 0, warps_trapped = 0;
8433 struct nvgpu_timeout timeout;
8434 u32 warp_esr;
8435
8436 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
8437 "GPC%d TPC%d SM%d: locking down SM", gpc, tpc, sm);
8438
8439 nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g),
8440 NVGPU_TIMER_CPU_TIMER);
8441
8442 /* wait for the sm to lock down */
8443 do {
8444 u32 global_esr = g->ops.gr.get_sm_hww_global_esr(g,
8445 gpc, tpc, sm);
8446 dbgr_status0 = gk20a_readl(g,
8447 gr_gpc0_tpc0_sm_dbgr_status0_r() + offset);
8448
8449 warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
8450
8451 locked_down =
8452 (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
8453 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
8454 no_error_pending =
8455 check_errors &&
8456 (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) ==
8457 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) &&
8458 ((global_esr & ~global_esr_mask) == 0);
8459
8460 if (locked_down || no_error_pending) {
8461 nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
8462 "GPC%d TPC%d SM%d: locked down SM",
8463 gpc, tpc, sm);
8464 return 0;
8465 }
8466
8467 /* if an mmu fault is pending and mmu debug mode is not
8468 * enabled, the sm will never lock down. */
8469 if (!mmu_debug_mode_enabled &&
8470 (g->ops.mm.mmu_fault_pending(g))) {
8471 nvgpu_err(g,
8472 "GPC%d TPC%d: mmu fault pending,"
8473 " SM%d will never lock down!", gpc, tpc, sm);
8474 return -EFAULT;
8475 }
8476
8477 nvgpu_usleep_range(delay, delay * 2);
8478 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
8479 } while (nvgpu_timeout_expired(&timeout) == 0);
8480
8481 dbgr_control0 = gk20a_readl(g,
8482 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8483
8484 /* 64 bit read */
8485 warps_valid = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_1_r() + offset) << 32;
8486 warps_valid |= gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + offset);
8487
8488 /* 64 bit read */
8489 warps_paused = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_1_r() + offset) << 32;
8490 warps_paused |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + offset);
8491
8492 /* 64 bit read */
8493 warps_trapped = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_1_r() + offset) << 32;
8494 warps_trapped |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + offset);
8495
8496 nvgpu_err(g,
8497 "GPC%d TPC%d: timed out while trying to lock down SM", gpc, tpc);
8498 nvgpu_err(g,
8499 "STATUS0(0x%x)=0x%x CONTROL0=0x%x VALID_MASK=0x%llx PAUSE_MASK=0x%llx TRAP_MASK=0x%llx",
8500 gr_gpc0_tpc0_sm_dbgr_status0_r() + offset, dbgr_status0, dbgr_control0,
8501 warps_valid, warps_paused, warps_trapped);
8502
8503 return -ETIMEDOUT;
8504}
8505
8506void gk20a_gr_suspend_single_sm(struct gk20a *g,
8507 u32 gpc, u32 tpc, u32 sm,
8508 u32 global_esr_mask, bool check_errors)
8509{
8510 int err;
8511 u32 dbgr_control0;
8512 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8513
8514 /* if an SM debugger isn't attached, skip suspend */
8515 if (!g->ops.gr.sm_debugger_attached(g)) {
8516 nvgpu_err(g,
8517 "SM debugger not attached, skipping suspend!");
8518 return;
8519 }
8520
8521 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
8522 "suspending gpc:%d, tpc:%d, sm%d", gpc, tpc, sm);
8523
8524 /* assert stop trigger. */
8525 dbgr_control0 = gk20a_readl(g,
8526 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8527 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8528 gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset,
8529 dbgr_control0);
8530
8531 err = g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm,
8532 global_esr_mask, check_errors);
8533 if (err != 0) {
8534 nvgpu_err(g,
8535 "SuspendSm failed");
8536 return;
8537 }
8538}
8539
8540void gk20a_gr_suspend_all_sms(struct gk20a *g,
8541 u32 global_esr_mask, bool check_errors)
8542{
8543 struct gr_gk20a *gr = &g->gr;
8544 u32 gpc, tpc, sm;
8545 int err;
8546 u32 dbgr_control0;
8547 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
8548
8549 /* if an SM debugger isn't attached, skip suspend */
8550 if (!g->ops.gr.sm_debugger_attached(g)) {
8551 nvgpu_err(g,
8552 "SM debugger not attached, skipping suspend!");
8553 return;
8554 }
8555
8556 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "suspending all sms");
8557 /* assert stop trigger. uniformity assumption: all SMs will have
8558 * the same state in dbg_control0.
8559 */
8560 dbgr_control0 =
8561 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
8562 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8563
8564 /* broadcast write */
8565 gk20a_writel(g,
8566 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8567
8568 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
8569 for (tpc = 0; tpc < gr_gk20a_get_tpc_count(gr, gpc); tpc++) {
8570 for (sm = 0; sm < sm_per_tpc; sm++) {
8571 err = g->ops.gr.wait_for_sm_lock_down(g,
8572 gpc, tpc, sm,
8573 global_esr_mask, check_errors);
8574 if (err != 0) {
8575 nvgpu_err(g, "SuspendAllSms failed");
8576 return;
8577 }
8578 }
8579 }
8580 }
8581}
8582
8583void gk20a_gr_resume_single_sm(struct gk20a *g,
8584 u32 gpc, u32 tpc, u32 sm)
8585{
8586 u32 dbgr_control0;
8587 u32 offset;
8588 /*
8589 * The following requires some clarification. Despite the fact that both
8590 * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their
8591 * names, only one is actually a trigger, and that is the STOP_TRIGGER.
8592 * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to
8593 * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0
8594 * (_DISABLE) as well.
8595	 *
8596 * Advice from the arch group: Disable the stop trigger first, as a
8597 * separate operation, in order to ensure that the trigger has taken
8598 * effect, before enabling the run trigger.
8599 */
8600
8601 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8602
8603 /*De-assert stop trigger */
8604 dbgr_control0 =
8605 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
8606 dbgr_control0 = set_field(dbgr_control0,
8607 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_m(),
8608 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_disable_f());
8609 gk20a_writel(g,
8610 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
8611
8612 /* Run trigger */
8613 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f();
8614 gk20a_writel(g,
8615 gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
8616}
8617
8618void gk20a_gr_resume_all_sms(struct gk20a *g)
8619{
8620 u32 dbgr_control0;
8621 /*
8622 * The following requires some clarification. Despite the fact that both
8623 * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their
8624 * names, only one is actually a trigger, and that is the STOP_TRIGGER.
8625 * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to
8626 * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0
8627 * (_DISABLE) as well.
8628	 *
8629 * Advice from the arch group: Disable the stop trigger first, as a
8630 * separate operation, in order to ensure that the trigger has taken
8631 * effect, before enabling the run trigger.
8632 */
8633
8634 /*De-assert stop trigger */
8635 dbgr_control0 =
8636 gk20a_readl(g, gr_gpcs_tpcs_sm_dbgr_control0_r());
8637 dbgr_control0 &= ~gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8638 gk20a_writel(g,
8639 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8640
8641 /* Run trigger */
8642 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f();
8643 gk20a_writel(g,
8644 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8645}
8646
8647int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
8648 struct channel_gk20a *ch, u64 sms, bool enable)
8649{
8650 struct nvgpu_dbg_reg_op *ops;
8651 unsigned int i = 0, sm_id;
8652 int err;
8653 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
8654 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
8655
8656 ops = nvgpu_kcalloc(g, g->gr.no_of_sm, sizeof(*ops));
8657 if (ops == NULL) {
8658 return -ENOMEM;
8659 }
8660 for (sm_id = 0; sm_id < g->gr.no_of_sm; sm_id++) {
8661 int gpc, tpc;
8662 u32 tpc_offset, gpc_offset, reg_offset, reg_mask, reg_val;
8663
8664 if ((sms & BIT64(sm_id)) == 0ULL) {
8665 continue;
8666 }
8667
8668 gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
8669 tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
8670
8671 tpc_offset = tpc_in_gpc_stride * tpc;
8672 gpc_offset = gpc_stride * gpc;
8673 reg_offset = tpc_offset + gpc_offset;
8674
8675 ops[i].op = REGOP(WRITE_32);
8676 ops[i].type = REGOP(TYPE_GR_CTX);
8677 ops[i].offset = gr_gpc0_tpc0_sm_dbgr_control0_r() + reg_offset;
8678
8679 reg_mask = 0;
8680 reg_val = 0;
8681 if (enable) {
8682 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m();
8683 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_f();
8684 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_m();
8685 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_disable_f();
8686 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_m();
8687 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_disable_f();
8688 } else {
8689 reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m();
8690 reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_off_f();
8691 }
8692
8693 ops[i].and_n_mask_lo = reg_mask;
8694 ops[i].value_lo = reg_val;
8695 i++;
8696 }
8697
8698 err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0, NULL);
8699 if (err != 0) {
8700 nvgpu_err(g, "Failed to access register");
8701 }
8702 nvgpu_kfree(g, ops);
8703 return err;
8704}
8705
8706/*
8707 * gr_gk20a_suspend_context()
8708 * This API should be called with the dbg_session lock held
8709 * and ctxsw disabled.
8710 * Returns a bool indicating whether the context was resident
8711 * or not.
8712 */
8713bool gr_gk20a_suspend_context(struct channel_gk20a *ch)
8714{
8715 struct gk20a *g = ch->g;
8716 bool ctx_resident = false;
8717
8718 if (gk20a_is_channel_ctx_resident(ch)) {
8719 g->ops.gr.suspend_all_sms(g, 0, false);
8720 ctx_resident = true;
8721 } else {
8722 gk20a_disable_channel_tsg(g, ch);
8723 }
8724
8725 return ctx_resident;
8726}
8727
8728bool gr_gk20a_resume_context(struct channel_gk20a *ch)
8729{
8730 struct gk20a *g = ch->g;
8731 bool ctx_resident = false;
8732
8733 if (gk20a_is_channel_ctx_resident(ch)) {
8734 g->ops.gr.resume_all_sms(g);
8735 ctx_resident = true;
8736 } else {
8737 gk20a_enable_channel_tsg(g, ch);
8738 }
8739
8740 return ctx_resident;
8741}
8742
8743int gr_gk20a_suspend_contexts(struct gk20a *g,
8744 struct dbg_session_gk20a *dbg_s,
8745 int *ctx_resident_ch_fd)
8746{
8747 int local_ctx_resident_ch_fd = -1;
8748 bool ctx_resident;
8749 struct channel_gk20a *ch;
8750 struct dbg_session_channel_data *ch_data;
8751 int err = 0;
8752
8753 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
8754
8755 err = gr_gk20a_disable_ctxsw(g);
8756 if (err != 0) {
8757 nvgpu_err(g, "unable to stop gr ctxsw");
8758 goto clean_up;
8759 }
8760
8761 nvgpu_mutex_acquire(&dbg_s->ch_list_lock);
8762
8763 nvgpu_list_for_each_entry(ch_data, &dbg_s->ch_list,
8764 dbg_session_channel_data, ch_entry) {
8765 ch = g->fifo.channel + ch_data->chid;
8766
8767 ctx_resident = gr_gk20a_suspend_context(ch);
8768 if (ctx_resident) {
8769 local_ctx_resident_ch_fd = ch_data->channel_fd;
8770 }
8771 }
8772
8773 nvgpu_mutex_release(&dbg_s->ch_list_lock);
8774
8775 err = gr_gk20a_enable_ctxsw(g);
8776 if (err != 0) {
8777 nvgpu_err(g, "unable to restart ctxsw!");
8778 }
8779
8780 *ctx_resident_ch_fd = local_ctx_resident_ch_fd;
8781
8782clean_up:
8783 nvgpu_mutex_release(&g->dbg_sessions_lock);
8784
8785 return err;
8786}
8787
8788int gr_gk20a_resume_contexts(struct gk20a *g,
8789 struct dbg_session_gk20a *dbg_s,
8790 int *ctx_resident_ch_fd)
8791{
8792 int local_ctx_resident_ch_fd = -1;
8793 bool ctx_resident;
8794 struct channel_gk20a *ch;
8795 int err = 0;
8796 struct dbg_session_channel_data *ch_data;
8797
8798 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
8799
8800 err = gr_gk20a_disable_ctxsw(g);
8801 if (err != 0) {
8802 nvgpu_err(g, "unable to stop gr ctxsw");
8803 goto clean_up;
8804 }
8805
8806 nvgpu_list_for_each_entry(ch_data, &dbg_s->ch_list,
8807 dbg_session_channel_data, ch_entry) {
8808 ch = g->fifo.channel + ch_data->chid;
8809
8810 ctx_resident = gr_gk20a_resume_context(ch);
8811 if (ctx_resident) {
8812 local_ctx_resident_ch_fd = ch_data->channel_fd;
8813 }
8814 }
8815
8816 err = gr_gk20a_enable_ctxsw(g);
8817 if (err != 0) {
8818 nvgpu_err(g, "unable to restart ctxsw!");
8819 }
8820
8821 *ctx_resident_ch_fd = local_ctx_resident_ch_fd;
8822
8823clean_up:
8824 nvgpu_mutex_release(&g->dbg_sessions_lock);
8825
8826 return err;
8827}
8828
8829int gr_gk20a_trigger_suspend(struct gk20a *g)
8830{
8831 int err = 0;
8832 u32 dbgr_control0;
8833
8834 /* assert stop trigger. uniformity assumption: all SMs will have
8835 * the same state in dbg_control0. */
8836 dbgr_control0 =
8837 gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
8838 dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
8839
8840 /* broadcast write */
8841 gk20a_writel(g,
8842 gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
8843
8844 return err;
8845}
8846
8847int gr_gk20a_wait_for_pause(struct gk20a *g, struct nvgpu_warpstate *w_state)
8848{
8849 int err = 0;
8850 struct gr_gk20a *gr = &g->gr;
8851 u32 gpc, tpc, sm, sm_id;
8852 u32 global_mask;
8853
8854 /* Wait for the SMs to reach full stop. This condition is:
8855 * 1) All SMs with valid warps must be in the trap handler (SM_IN_TRAP_MODE)
8856 * 2) All SMs in the trap handler must have equivalent VALID and PAUSED warp
8857 * masks.
8858 */
8859 global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
8860
8861 /* Lock down all SMs */
8862 for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
8863
8864 gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
8865 tpc = g->gr.sm_to_cluster[sm_id].tpc_index;
8866 sm = g->gr.sm_to_cluster[sm_id].sm_index;
8867
8868 err = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
8869 global_mask, false);
8870 if (err != 0) {
8871 nvgpu_err(g, "sm did not lock down!");
8872 return err;
8873 }
8874 }
8875
8876 /* Read the warp status */
8877 g->ops.gr.bpt_reg_info(g, w_state);
8878
8879 return 0;
8880}
8881
8882int gr_gk20a_resume_from_pause(struct gk20a *g)
8883{
8884 int err = 0;
8885 u32 reg_val;
8886
8887 /* Clear the pause mask to tell the GPU we want to resume everyone */
8888 gk20a_writel(g,
8889 gr_gpcs_tpcs_sm_dbgr_bpt_pause_mask_r(), 0);
8890
8891 /* explicitly re-enable forwarding of SM interrupts upon any resume */
8892 reg_val = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
8893 reg_val |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
8894 gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(), reg_val);
8895
8896	/* Now resume all SMs: write a 0 to the stop trigger,
8897	 * then a 1 to the run trigger */
8898 g->ops.gr.resume_all_sms(g);
8899
8900 return err;
8901}
8902
8903int gr_gk20a_clear_sm_errors(struct gk20a *g)
8904{
8905 int ret = 0;
8906 u32 gpc, tpc, sm;
8907 struct gr_gk20a *gr = &g->gr;
8908 u32 global_esr;
8909 u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
8910
8911 for (gpc = 0; gpc < gr->gpc_count; gpc++) {
8912
8913 /* check if any tpc has an exception */
8914 for (tpc = 0; tpc < gr->gpc_tpc_count[gpc]; tpc++) {
8915
8916 for (sm = 0; sm < sm_per_tpc; sm++) {
8917 global_esr = g->ops.gr.get_sm_hww_global_esr(g,
8918 gpc, tpc, sm);
8919
8920				/* clearing hwws also causes tpc and gpc
8921 * exceptions to be cleared
8922 */
8923 g->ops.gr.clear_sm_hww(g,
8924 gpc, tpc, sm, global_esr);
8925 }
8926 }
8927 }
8928
8929 return ret;
8930}
8931
8932u32 gr_gk20a_tpc_enabled_exceptions(struct gk20a *g)
8933{
8934 struct gr_gk20a *gr = &g->gr;
8935 u32 sm_id, tpc_exception_en = 0;
8936 u32 offset, regval, tpc_offset, gpc_offset;
8937 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
8938 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
8939
8940 for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) {
8941
8942 tpc_offset = tpc_in_gpc_stride * g->gr.sm_to_cluster[sm_id].tpc_index;
8943 gpc_offset = gpc_stride * g->gr.sm_to_cluster[sm_id].gpc_index;
8944 offset = tpc_offset + gpc_offset;
8945
8946 regval = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
8947 offset);
8948		/* Each bit represents the corresponding enablement state; bit 0 corresponds to SM0 */
8949 tpc_exception_en |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_v(regval) << sm_id;
8950 }
8951
8952 return tpc_exception_en;
8953}
8954
8955u32 gk20a_gr_get_sm_hww_warp_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm)
8956{
8957 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8958 u32 hww_warp_esr = gk20a_readl(g,
8959 gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
8960 return hww_warp_esr;
8961}
8962
8963u32 gk20a_gr_get_sm_hww_global_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm)
8964{
8965 u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc);
8966
8967 u32 hww_global_esr = gk20a_readl(g,
8968 gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
8969
8970 return hww_global_esr;
8971}
8972
8973u32 gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g)
8974{
8975 /*
8976 * These three interrupts don't require locking down the SM. They can
8977 * be handled by usermode clients as they aren't fatal. Additionally,
8978 * usermode clients may wish to allow some warps to execute while others
8979 * are at breakpoints, as opposed to fatal errors where all warps should
8980 * halt.
8981 */
8982 u32 global_esr_mask =
8983 gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() |
8984 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
8985 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
8986
8987 return global_esr_mask;
8988}
8989
8990/* invalidate channel lookup tlb */
8991void gk20a_gr_flush_channel_tlb(struct gr_gk20a *gr)
8992{
8993 nvgpu_spinlock_acquire(&gr->ch_tlb_lock);
8994 memset(gr->chid_tlb, 0,
8995 sizeof(struct gr_channel_map_tlb_entry) *
8996 GR_CHANNEL_MAP_TLB_SIZE);
8997 nvgpu_spinlock_release(&gr->ch_tlb_lock);
8998}
diff --git a/include/gk20a/gr_gk20a.h b/include/gk20a/gr_gk20a.h
new file mode 100644
index 0000000..08b81e8
--- /dev/null
+++ b/include/gk20a/gr_gk20a.h
@@ -0,0 +1,851 @@
1/*
2 * GK20A Graphics Engine
3 *
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24#ifndef GR_GK20A_H
25#define GR_GK20A_H
26
27#include <nvgpu/types.h>
28
29#include "gr_ctx_gk20a.h"
30#include "mm_gk20a.h"
31#include <nvgpu/power_features/pg.h>
32
33#include <nvgpu/comptags.h>
34#include <nvgpu/cond.h>
35
36#define GR_IDLE_CHECK_DEFAULT 10 /* usec */
37#define GR_IDLE_CHECK_MAX 200 /* usec */
38#define GR_FECS_POLL_INTERVAL 5 /* usec */
39
40#define INVALID_SCREEN_TILE_ROW_OFFSET 0xFFFFFFFF
41#define INVALID_MAX_WAYS 0xFFFFFFFF
42
43#define GK20A_FECS_UCODE_IMAGE "fecs.bin"
44#define GK20A_GPCCS_UCODE_IMAGE "gpccs.bin"
45
46#define GK20A_GR_MAX_PES_PER_GPC 3
47
48#define GK20A_TIMEOUT_FPGA 100000 /* 100 sec */
49
50/* Flags to be passed to g->ops.gr.alloc_obj_ctx() */
51#define NVGPU_OBJ_CTX_FLAGS_SUPPORT_GFXP (1 << 1)
52#define NVGPU_OBJ_CTX_FLAGS_SUPPORT_CILP (1 << 2)
53
54/*
55 * allocate a minimum of 1 page (4KB) worth of patch space; this gives
56 * 512 entries of address and data pairs
57 */
58#define PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY 2
59#define PATCH_CTX_SLOTS_PER_PAGE \
60 (PAGE_SIZE/(PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY * sizeof(u32)))
61#define PATCH_CTX_ENTRIES_FROM_SIZE(size) ((size)/sizeof(u32))
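/*
 * Worked example (illustrative only): with PAGE_SIZE = 4096 and two slots of
 * sizeof(u32) = 4 bytes per patch entry, PATCH_CTX_SLOTS_PER_PAGE evaluates
 * to 4096 / (2 * 4) = 512, matching the "512 entries" figure in the comment
 * above.
 */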
62
63#define NVGPU_PREEMPTION_MODE_GRAPHICS_WFI (1 << 0)
64#define NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP (1 << 1)
65
66#define NVGPU_PREEMPTION_MODE_COMPUTE_WFI (1 << 0)
67#define NVGPU_PREEMPTION_MODE_COMPUTE_CTA (1 << 1)
68#define NVGPU_PREEMPTION_MODE_COMPUTE_CILP (1 << 2)
69
70#define CTXSW_INTR0 BIT32(0)
71#define CTXSW_INTR1 BIT32(1)
72
73#define MAILBOX_VALUE_TIMESTAMP_BUFFER_FULL 0x26
74
75struct tsg_gk20a;
76struct channel_gk20a;
77struct nvgpu_warpstate;
78
79enum ctxsw_addr_type;
80
81enum /* global_ctx_buffer */ {
82 CIRCULAR = 0,
83 PAGEPOOL = 1,
84 ATTRIBUTE = 2,
85 CIRCULAR_VPR = 3,
86 PAGEPOOL_VPR = 4,
87 ATTRIBUTE_VPR = 5,
88 GOLDEN_CTX = 6,
89 PRIV_ACCESS_MAP = 7,
90 /* #8 is reserved */
91 FECS_TRACE_BUFFER = 9,
92 NR_GLOBAL_CTX_BUF = 10
93};
94
95/* either ATTRIBUTE or ATTRIBUTE_VPR maps to ATTRIBUTE_VA */
96enum /* global_ctx_buffer_va */ {
97 CIRCULAR_VA = 0,
98 PAGEPOOL_VA = 1,
99 ATTRIBUTE_VA = 2,
100 GOLDEN_CTX_VA = 3,
101 PRIV_ACCESS_MAP_VA = 4,
102 /* #5 is reserved */
103 FECS_TRACE_BUFFER_VA = 6,
104 NR_GLOBAL_CTX_BUF_VA = 7
105};
106
107enum {
108 WAIT_UCODE_LOOP,
109 WAIT_UCODE_TIMEOUT,
110 WAIT_UCODE_ERROR,
111 WAIT_UCODE_OK
112};
113
114enum {
115 GR_IS_UCODE_OP_EQUAL,
116 GR_IS_UCODE_OP_NOT_EQUAL,
117 GR_IS_UCODE_OP_AND,
118 GR_IS_UCODE_OP_LESSER,
119 GR_IS_UCODE_OP_LESSER_EQUAL,
120 GR_IS_UCODE_OP_SKIP
121};
122
123enum {
124 eUcodeHandshakeInitComplete = 1,
125 eUcodeHandshakeMethodFinished
126};
127
128enum {
129 ELCG_MODE = (1 << 0),
130 BLCG_MODE = (1 << 1),
131 INVALID_MODE = (1 << 2)
132};
133
134enum {
135 NVGPU_EVENT_ID_BPT_INT = 0,
136 NVGPU_EVENT_ID_BPT_PAUSE,
137 NVGPU_EVENT_ID_BLOCKING_SYNC,
138 NVGPU_EVENT_ID_CILP_PREEMPTION_STARTED,
139 NVGPU_EVENT_ID_CILP_PREEMPTION_COMPLETE,
140 NVGPU_EVENT_ID_GR_SEMAPHORE_WRITE_AWAKEN,
141 NVGPU_EVENT_ID_MAX,
142};
143
144#ifndef GR_GO_IDLE_BUNDLE
145#define GR_GO_IDLE_BUNDLE 0x0000e100 /* --V-B */
146#endif
147
148struct gr_channel_map_tlb_entry {
149 u32 curr_ctx;
150 u32 chid;
151 u32 tsgid;
152};
153
154struct gr_zcull_gk20a {
155 u32 aliquot_width;
156 u32 aliquot_height;
157 u32 aliquot_size;
158 u32 total_aliquots;
159
160 u32 width_align_pixels;
161 u32 height_align_pixels;
162 u32 pixel_squares_by_aliquots;
163};
164
165struct gr_zcull_info {
166 u32 width_align_pixels;
167 u32 height_align_pixels;
168 u32 pixel_squares_by_aliquots;
169 u32 aliquot_total;
170 u32 region_byte_multiplier;
171 u32 region_header_size;
172 u32 subregion_header_size;
173 u32 subregion_width_align_pixels;
174 u32 subregion_height_align_pixels;
175 u32 subregion_count;
176};
177
178#define GK20A_ZBC_COLOR_VALUE_SIZE 4 /* RGBA */
179
180#define GK20A_STARTOF_ZBC_TABLE 1U /* index zero reserved to indicate "not ZBCd" */
181#define GK20A_SIZEOF_ZBC_TABLE 16 /* match ltcs_ltss_dstg_zbc_index_address width (4) */
182#define GK20A_ZBC_TABLE_SIZE (16 - 1)
183
184#define GK20A_ZBC_TYPE_INVALID 0
185#define GK20A_ZBC_TYPE_COLOR 1
186#define GK20A_ZBC_TYPE_DEPTH 2
187#define T19X_ZBC 3
188
189struct zbc_color_table {
190 u32 color_ds[GK20A_ZBC_COLOR_VALUE_SIZE];
191 u32 color_l2[GK20A_ZBC_COLOR_VALUE_SIZE];
192 u32 format;
193 u32 ref_cnt;
194};
195
196struct zbc_depth_table {
197 u32 depth;
198 u32 format;
199 u32 ref_cnt;
200};
201
202struct zbc_s_table {
203 u32 stencil;
204 u32 format;
205 u32 ref_cnt;
206};
207
208struct zbc_entry {
209 u32 color_ds[GK20A_ZBC_COLOR_VALUE_SIZE];
210 u32 color_l2[GK20A_ZBC_COLOR_VALUE_SIZE];
211 u32 depth;
212 u32 type; /* color or depth */
213 u32 format;
214};
215
216struct zbc_query_params {
217 u32 color_ds[GK20A_ZBC_COLOR_VALUE_SIZE];
218 u32 color_l2[GK20A_ZBC_COLOR_VALUE_SIZE];
219 u32 depth;
220 u32 ref_cnt;
221 u32 format;
222 u32 type; /* color or depth */
223 u32 index_size; /* [out] size, [in] index */
224};
225
226struct sm_info {
227 u32 gpc_index;
228 u32 tpc_index;
229 u32 sm_index;
230 u32 global_tpc_index;
231};
232
233#if defined(CONFIG_GK20A_CYCLE_STATS)
234struct gk20a_cs_snapshot_client;
235struct gk20a_cs_snapshot;
236#endif
237
238struct gr_gk20a_isr_data {
239 u32 addr;
240 u32 data_lo;
241 u32 data_hi;
242 u32 curr_ctx;
243 struct channel_gk20a *ch;
244 u32 offset;
245 u32 sub_chan;
246 u32 class_num;
247};
248
249struct gr_ctx_buffer_desc {
250 void (*destroy)(struct gk20a *, struct gr_ctx_buffer_desc *);
251 struct nvgpu_mem mem;
252 void *priv;
253};
254
255struct nvgpu_preemption_modes_rec {
256 u32 graphics_preemption_mode_flags; /* supported preemption modes */
257 u32 compute_preemption_mode_flags; /* supported preemption modes */
258
259 u32 default_graphics_preempt_mode; /* default mode */
260 u32 default_compute_preempt_mode; /* default mode */
261};
262
263struct gr_gk20a {
264 struct gk20a *g;
265 struct {
266 bool dynamic;
267
268 u32 buffer_size;
269 u32 buffer_total_size;
270
271 bool golden_image_initialized;
272 u32 golden_image_size;
273 u32 *local_golden_image;
274
275 u32 hwpm_ctxsw_buffer_offset_map_count;
276 struct ctxsw_buf_offset_map_entry *hwpm_ctxsw_buffer_offset_map;
277
278 u32 zcull_ctxsw_image_size;
279
280 u32 pm_ctxsw_image_size;
281
282 u32 buffer_header_size;
283
284 u32 priv_access_map_size;
285
286 u32 fecs_trace_buffer_size;
287
288 struct gr_ucode_gk20a ucode;
289
290 struct av_list_gk20a sw_bundle_init;
291 struct av_list_gk20a sw_method_init;
292 struct aiv_list_gk20a sw_ctx_load;
293 struct av_list_gk20a sw_non_ctx_load;
294 struct av_list_gk20a sw_veid_bundle_init;
295 struct av64_list_gk20a sw_bundle64_init;
296 struct {
297 struct aiv_list_gk20a sys;
298 struct aiv_list_gk20a gpc;
299 struct aiv_list_gk20a tpc;
300 struct aiv_list_gk20a zcull_gpc;
301 struct aiv_list_gk20a ppc;
302 struct aiv_list_gk20a pm_sys;
303 struct aiv_list_gk20a pm_gpc;
304 struct aiv_list_gk20a pm_tpc;
305 struct aiv_list_gk20a pm_ppc;
306 struct aiv_list_gk20a perf_sys;
307 struct aiv_list_gk20a perf_gpc;
308 struct aiv_list_gk20a fbp;
309 struct aiv_list_gk20a fbp_router;
310 struct aiv_list_gk20a gpc_router;
311 struct aiv_list_gk20a pm_ltc;
312 struct aiv_list_gk20a pm_fbpa;
313 struct aiv_list_gk20a perf_sys_router;
314 struct aiv_list_gk20a perf_pma;
315 struct aiv_list_gk20a pm_rop;
316 struct aiv_list_gk20a pm_ucgpc;
317 struct aiv_list_gk20a etpc;
318 struct aiv_list_gk20a pm_cau;
319 } ctxsw_regs;
320 u32 regs_base_index;
321 bool valid;
322
323 u32 preempt_image_size;
324 bool force_preemption_gfxp;
325 bool force_preemption_cilp;
326 bool dump_ctxsw_stats_on_channel_close;
327 } ctx_vars;
328
329 struct nvgpu_mutex ctx_mutex; /* protect golden ctx init */
330 struct nvgpu_mutex fecs_mutex; /* protect fecs method */
331
332#define GR_NETLIST_DYNAMIC -1
333#define GR_NETLIST_STATIC_A 'A'
334 int netlist;
335
336 struct nvgpu_cond init_wq;
337 int initialized;
338
339 u32 num_fbps;
340
341 u32 max_comptag_lines;
342 u32 compbit_backing_size;
343 u32 comptags_per_cacheline;
344 u32 slices_per_ltc;
345 u32 cacheline_size;
346 u32 gobs_per_comptagline_per_slice;
347
348 u32 max_gpc_count;
349 u32 max_fbps_count;
350 u32 max_tpc_per_gpc_count;
351 u32 max_zcull_per_gpc_count;
352 u32 max_tpc_count;
353
354 u32 sys_count;
355 u32 gpc_count;
356 u32 pe_count_per_gpc;
357 u32 ppc_count;
358 u32 *gpc_ppc_count;
359 u32 tpc_count;
360 u32 *gpc_tpc_count;
361 u32 *gpc_tpc_mask;
362 u32 zcb_count;
363 u32 *gpc_zcb_count;
364 u32 *pes_tpc_count[GK20A_GR_MAX_PES_PER_GPC];
365 u32 *pes_tpc_mask[GK20A_GR_MAX_PES_PER_GPC];
366 u32 *gpc_skip_mask;
367
368 u32 bundle_cb_default_size;
369 u32 min_gpm_fifo_depth;
370 u32 bundle_cb_token_limit;
371 u32 attrib_cb_default_size;
372 u32 attrib_cb_size;
373 u32 attrib_cb_gfxp_default_size;
374 u32 attrib_cb_gfxp_size;
375 u32 alpha_cb_default_size;
376 u32 alpha_cb_size;
377 u32 timeslice_mode;
378 u32 czf_bypass;
379 u32 pd_max_batches;
380 u32 gfxp_wfi_timeout_count;
381 u32 gfxp_wfi_timeout_unit;
382
383 /*
384	 * The deductible memory size for max_comptag_mem (in MBytes).
385	 * Usually close to the amount of memory the running system is using.
386 */
387 u32 comptag_mem_deduct;
388
389 struct gr_ctx_buffer_desc global_ctx_buffer[NR_GLOBAL_CTX_BUF];
390
391 u8 *map_tiles;
392 u32 map_tile_count;
393 u32 map_row_offset;
394
395 u32 max_comptag_mem; /* max memory size (MB) for comptag */
396 struct compbit_store_desc compbit_store;
397 struct gk20a_comptag_allocator comp_tags;
398
399 struct gr_zcull_gk20a zcull;
400
401 struct nvgpu_mutex zbc_lock;
402 struct zbc_color_table zbc_col_tbl[GK20A_ZBC_TABLE_SIZE];
403 struct zbc_depth_table zbc_dep_tbl[GK20A_ZBC_TABLE_SIZE];
404 struct zbc_s_table zbc_s_tbl[GK20A_ZBC_TABLE_SIZE];
405 s32 max_default_color_index;
406 s32 max_default_depth_index;
407 s32 max_default_s_index;
408
409 u32 max_used_color_index;
410 u32 max_used_depth_index;
411 u32 max_used_s_index;
412
413#define GR_CHANNEL_MAP_TLB_SIZE 2 /* must be a power of 2 */
414 struct gr_channel_map_tlb_entry chid_tlb[GR_CHANNEL_MAP_TLB_SIZE];
415 u32 channel_tlb_flush_index;
416 struct nvgpu_spinlock ch_tlb_lock;
417
418 void (*remove_support)(struct gr_gk20a *gr);
419 bool sw_ready;
420 bool skip_ucode_init;
421
422 struct nvgpu_preemption_modes_rec preemption_mode_rec;
423
424 u32 fecs_feature_override_ecc_val;
425
426 int cilp_preempt_pending_chid;
427
428 u32 fbp_en_mask;
429 u32 *fbp_rop_l2_en_mask;
430 u32 no_of_sm;
431 struct sm_info *sm_to_cluster;
432
433#if defined(CONFIG_GK20A_CYCLE_STATS)
434 struct nvgpu_mutex cs_lock;
435 struct gk20a_cs_snapshot *cs_data;
436#endif
437 u32 max_css_buffer_size;
438};
439
440void gk20a_fecs_dump_falcon_stats(struct gk20a *g);
441
442/* contexts associated with a TSG */
443struct nvgpu_gr_ctx {
444 struct nvgpu_mem mem;
445
446 u32 graphics_preempt_mode;
447 u32 compute_preempt_mode;
448
449 struct nvgpu_mem preempt_ctxsw_buffer;
450 struct nvgpu_mem spill_ctxsw_buffer;
451 struct nvgpu_mem betacb_ctxsw_buffer;
452 struct nvgpu_mem pagepool_ctxsw_buffer;
453 u32 ctx_id;
454 bool ctx_id_valid;
455 bool cilp_preempt_pending;
456 bool boosted_ctx;
457 bool golden_img_loaded;
458
459#ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
460 u64 virt_ctx;
461#endif
462
463 struct patch_desc patch_ctx;
464 struct zcull_ctx_desc zcull_ctx;
465 struct pm_ctx_desc pm_ctx;
466 u64 global_ctx_buffer_va[NR_GLOBAL_CTX_BUF_VA];
467 u64 global_ctx_buffer_size[NR_GLOBAL_CTX_BUF_VA];
468 int global_ctx_buffer_index[NR_GLOBAL_CTX_BUF_VA];
469 bool global_ctx_buffer_mapped;
470
471 u32 tsgid;
472};
473
474struct gk20a_ctxsw_ucode_segment {
475 u32 offset;
476 u32 size;
477};
478
479struct gk20a_ctxsw_ucode_segments {
480 u32 boot_entry;
481 u32 boot_imem_offset;
482 u32 boot_signature;
483 struct gk20a_ctxsw_ucode_segment boot;
484 struct gk20a_ctxsw_ucode_segment code;
485 struct gk20a_ctxsw_ucode_segment data;
486};
487
488/* sums over the ucode files, computed as sequences of u32 and matched
489 * against the boot_signature field in the structure above */
490
491/* T18X FECS remains the same as T21X,
492 * so FALCON_UCODE_SIG_T21X_FECS_WITH_RESERVED is used
493 * for T18X as well. */
494#define FALCON_UCODE_SIG_T18X_GPCCS_WITH_RESERVED 0x68edab34
495#define FALCON_UCODE_SIG_T21X_FECS_WITH_DMEM_SIZE 0x9121ab5c
496#define FALCON_UCODE_SIG_T21X_FECS_WITH_RESERVED 0x9125ab5c
497#define FALCON_UCODE_SIG_T12X_FECS_WITH_RESERVED 0x8a621f78
498#define FALCON_UCODE_SIG_T12X_FECS_WITHOUT_RESERVED 0x67e5344b
499#define FALCON_UCODE_SIG_T12X_FECS_OLDER 0x56da09f
500
501#define FALCON_UCODE_SIG_T21X_GPCCS_WITH_RESERVED 0x3d3d65e2
502#define FALCON_UCODE_SIG_T12X_GPCCS_WITH_RESERVED 0x303465d5
503#define FALCON_UCODE_SIG_T12X_GPCCS_WITHOUT_RESERVED 0x3fdd33d3
504#define FALCON_UCODE_SIG_T12X_GPCCS_OLDER 0x53d7877
505
506#define FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED 0x93671b7d
507#define FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED2 0x4d6cbc10
508
509#define FALCON_UCODE_SIG_T21X_GPCCS_WITHOUT_RESERVED 0x393161da
510
511struct gk20a_ctxsw_ucode_info {
512 u64 *p_va;
513 struct nvgpu_mem inst_blk_desc;
514 struct nvgpu_mem surface_desc;
515 struct gk20a_ctxsw_ucode_segments fecs;
516 struct gk20a_ctxsw_ucode_segments gpccs;
517};
518
519struct gk20a_ctxsw_bootloader_desc {
520 u32 start_offset;
521 u32 size;
522 u32 imem_offset;
523 u32 entry_point;
524};
525
526struct fecs_method_op_gk20a {
527 struct {
528 u32 addr;
529 u32 data;
530 } method;
531
532 struct {
533 u32 id;
534 u32 data;
535 u32 clr;
536 u32 *ret;
537 u32 ok;
538 u32 fail;
539 } mailbox;
540
541 struct {
542 u32 ok;
543 u32 fail;
544 } cond;
545
546};
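As a rough illustration of how a struct fecs_method_op_gk20a might be filled in before being handed to gr_gk20a_submit_fecs_method_op() (declared later in this header), consider the sketch below. The method address/data, mailbox values, and comparison codes are placeholders chosen for the example, not values defined here.

	/* Sketch: issue one FECS method and wait on mailbox 0.
	 * All numeric values below are placeholders. */
	static int example_submit_fecs_op(struct gk20a *g)
	{
		struct fecs_method_op_gk20a op = {
			.method  = { .addr = 0x0, .data = 0x0 },
			.mailbox = { .id = 0U, .data = 0U, .clr = ~0U,
				     .ret = NULL, .ok = 1U, .fail = 2U },
			.cond    = { .ok = 0U, .fail = 0U }, /* placeholder compare-op codes */
		};

		return gr_gk20a_submit_fecs_method_op(g, op, false /* sleepduringwait */);
	}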
547
548struct nvgpu_warpstate {
549 u64 valid_warps[2];
550 u64 trapped_warps[2];
551 u64 paused_warps[2];
552};
553
554struct gpu_ops;
555int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
556 struct channel_gk20a *c);
557void gk20a_init_gr(struct gk20a *g);
558int gk20a_init_gr_support(struct gk20a *g);
559int gk20a_enable_gr_hw(struct gk20a *g);
560int gk20a_gr_reset(struct gk20a *g);
561void gk20a_gr_wait_initialized(struct gk20a *g);
562
563int gk20a_init_gr_channel(struct channel_gk20a *ch_gk20a);
564
565int gk20a_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags);
566
567int gk20a_gr_isr(struct gk20a *g);
568u32 gk20a_gr_nonstall_isr(struct gk20a *g);
569
570/* zcull */
571u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr);
572int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
573 struct channel_gk20a *c, u64 zcull_va, u32 mode);
574int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
575 struct gr_zcull_info *zcull_params);
576void gr_gk20a_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries,
577 u32 *zcull_map_tiles);
578/* zbc */
579int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
580 struct zbc_entry *zbc_val);
581int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
582 struct zbc_query_params *query_params);
583int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
584 struct zbc_entry *zbc_val);
585int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr);
586
587/* pmu */
588int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size);
589int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g,
590 struct nvgpu_mem *inst_block);
591int gr_gk20a_fecs_set_reglist_virtual_addr(struct gk20a *g, u64 pmu_va);
592
593void gr_gk20a_init_cg_mode(struct gk20a *g, u32 cgmode, u32 mode_config);
594
595/* sm */
596bool gk20a_gr_sm_debugger_attached(struct gk20a *g);
597u32 gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g);
598
599#define gr_gk20a_elpg_protected_call(g, func) \
600 ({ \
601 int err = 0; \
602 if (g->support_pmu) {\
603 err = nvgpu_pg_elpg_disable(g);\
604 if (err != 0) {\
605 err = nvgpu_pg_elpg_enable(g); \
606 } \
607 } \
608 if (err == 0) { \
609 err = func; \
610 if (g->support_pmu) {\
611 (void)nvgpu_pg_elpg_enable(g); \
612 } \
613 } \
614 err; \
615 })
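The macro above disables ELPG (when a PMU is present) around an int-returning expression and re-enables it afterwards. A short usage sketch follows; gr_gk20a_init_fs_state() is simply one of the functions declared later in this header, used here only as a plausible callee.

	/* Sketch: run FS-state init with ELPG disabled for the duration. */
	static int example_protected_init(struct gk20a *g)
	{
		return gr_gk20a_elpg_protected_call(g, gr_gk20a_init_fs_state(g));
	}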
616
617int gk20a_gr_suspend(struct gk20a *g);
618
619struct nvgpu_dbg_reg_op;
620int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
621 struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
622 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
623 bool *is_curr_ctx);
624int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
625 struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
626 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
627 bool ch_is_curr_ctx);
628int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
629 u32 addr,
630 u32 max_offsets,
631 u32 *offsets, u32 *offset_addrs,
632 u32 *num_offsets,
633 bool is_quad, u32 quad);
634int gr_gk20a_get_pm_ctx_buffer_offsets(struct gk20a *g,
635 u32 addr,
636 u32 max_offsets,
637 u32 *offsets, u32 *offset_addrs,
638 u32 *num_offsets);
639int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
640 struct channel_gk20a *c,
641 bool enable_smpc_ctxsw);
642int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
643 struct channel_gk20a *c,
644 u64 gpu_va,
645 u32 mode);
646
647struct nvgpu_gr_ctx;
648void gr_gk20a_ctx_patch_write(struct gk20a *g, struct nvgpu_gr_ctx *ch_ctx,
649 u32 addr, u32 data, bool patch);
650int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
651 struct nvgpu_gr_ctx *ch_ctx,
652 bool update_patch_count);
653void gr_gk20a_ctx_patch_write_end(struct gk20a *g,
654 struct nvgpu_gr_ctx *ch_ctx,
655 bool update_patch_count);
656void gr_gk20a_commit_global_pagepool(struct gk20a *g,
657 struct nvgpu_gr_ctx *ch_ctx,
658 u64 addr, u32 size, bool patch);
659void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data);
660void gr_gk20a_enable_hww_exceptions(struct gk20a *g);
661int gr_gk20a_init_fs_state(struct gk20a *g);
662int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr);
663int gr_gk20a_init_ctxsw_ucode(struct gk20a *g);
664int gr_gk20a_load_ctxsw_ucode(struct gk20a *g);
665void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g);
666void gr_gk20a_load_ctxsw_ucode_header(struct gk20a *g, u64 addr_base,
667 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset);
668void gr_gk20a_load_ctxsw_ucode_boot(struct gk20a *g, u64 addr_base,
669 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset);
670
671
672void gr_gk20a_free_tsg_gr_ctx(struct tsg_gk20a *c);
673int gr_gk20a_disable_ctxsw(struct gk20a *g);
674int gr_gk20a_enable_ctxsw(struct gk20a *g);
675void gk20a_gr_resume_single_sm(struct gk20a *g,
676 u32 gpc, u32 tpc, u32 sm);
677void gk20a_gr_resume_all_sms(struct gk20a *g);
678void gk20a_gr_suspend_single_sm(struct gk20a *g,
679 u32 gpc, u32 tpc, u32 sm,
680 u32 global_esr_mask, bool check_errors);
681void gk20a_gr_suspend_all_sms(struct gk20a *g,
682 u32 global_esr_mask, bool check_errors);
683u32 gr_gk20a_get_tpc_count(struct gr_gk20a *gr, u32 gpc_index);
684int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
685 struct channel_gk20a *ch, u64 sms, bool enable);
686bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch);
687int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
688 struct zbc_entry *color_val, u32 index);
689int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
690 struct zbc_entry *depth_val, u32 index);
691int _gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
692 struct zbc_entry *zbc_val);
693void gr_gk20a_pmu_save_zbc(struct gk20a *g, u32 entries);
694int gr_gk20a_wait_idle(struct gk20a *g, unsigned long duration_ms,
695 u32 expect_delay);
696int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
697 bool *post_event, struct channel_gk20a *fault_ch,
698 u32 *hww_global_esr);
699int gr_gk20a_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
700 bool *post_event);
701int gr_gk20a_init_ctx_state(struct gk20a *g);
702int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
703 struct fecs_method_op_gk20a op,
704 bool sleepduringwait);
705int gr_gk20a_submit_fecs_method_op_locked(struct gk20a *g,
706 struct fecs_method_op_gk20a op,
707 bool sleepduringwait);
708int gr_gk20a_submit_fecs_sideband_method_op(struct gk20a *g,
709 struct fecs_method_op_gk20a op);
710int gr_gk20a_alloc_gr_ctx(struct gk20a *g,
711 struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm,
712 u32 class, u32 padding);
713void gr_gk20a_free_gr_ctx(struct gk20a *g,
714 struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx);
715int gr_gk20a_halt_pipe(struct gk20a *g);
716
717#if defined(CONFIG_GK20A_CYCLE_STATS)
718int gr_gk20a_css_attach(struct channel_gk20a *ch, /* in - main hw structure */
719		u32 perfmon_id_count,	/* in  - number of perfmons */
720		u32 *perfmon_id_start,	/* out - index of first pm */
721		/* in/out - pointer to client data used in later calls */
722 struct gk20a_cs_snapshot_client *css_client);
723
724int gr_gk20a_css_detach(struct channel_gk20a *ch,
725 struct gk20a_cs_snapshot_client *css_client);
726int gr_gk20a_css_flush(struct channel_gk20a *ch,
727 struct gk20a_cs_snapshot_client *css_client);
728
729void gr_gk20a_free_cyclestats_snapshot_data(struct gk20a *g);
730
731#else
732/* stub cleanup function used when cyclestats snapshots are not enabled */
733static inline void gr_gk20a_free_cyclestats_snapshot_data(struct gk20a *g)
734{
735 (void)g;
736}
737#endif
738
739void gr_gk20a_fecs_host_int_enable(struct gk20a *g);
740int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
741 struct gr_gk20a_isr_data *isr_data);
742int gk20a_gr_lock_down_sm(struct gk20a *g,
743 u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask,
744 bool check_errors);
745int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
746 u32 global_esr_mask, bool check_errors);
747int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
748 u32 *mailbox_ret, u32 opc_success,
749 u32 mailbox_ok, u32 opc_fail,
750 u32 mailbox_fail, bool sleepduringwait);
751
752int gr_gk20a_get_ctx_id(struct gk20a *g,
753 struct channel_gk20a *c,
754 u32 *ctx_id);
755
756u32 gk20a_gr_get_sm_hww_warp_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm);
757u32 gk20a_gr_get_sm_hww_global_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm);
758
759int gr_gk20a_wait_fe_idle(struct gk20a *g, unsigned long duration_ms,
760 u32 expect_delay);
761
762struct dbg_session_gk20a;
763
764bool gr_gk20a_suspend_context(struct channel_gk20a *ch);
765bool gr_gk20a_resume_context(struct channel_gk20a *ch);
766int gr_gk20a_suspend_contexts(struct gk20a *g,
767 struct dbg_session_gk20a *dbg_s,
768 int *ctx_resident_ch_fd);
769int gr_gk20a_resume_contexts(struct gk20a *g,
770 struct dbg_session_gk20a *dbg_s,
771 int *ctx_resident_ch_fd);
772void gk20a_gr_enable_gpc_exceptions(struct gk20a *g);
773void gk20a_gr_enable_exceptions(struct gk20a *g);
774int gr_gk20a_trigger_suspend(struct gk20a *g);
775int gr_gk20a_wait_for_pause(struct gk20a *g, struct nvgpu_warpstate *w_state);
776int gr_gk20a_resume_from_pause(struct gk20a *g);
777int gr_gk20a_clear_sm_errors(struct gk20a *g);
778u32 gr_gk20a_tpc_enabled_exceptions(struct gk20a *g);
779
780int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c);
781
782int gr_gk20a_init_sm_id_table(struct gk20a *g);
783
784int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
785
786void gr_gk20a_write_zcull_ptr(struct gk20a *g,
787 struct nvgpu_mem *mem, u64 gpu_va);
788
789void gr_gk20a_write_pm_ptr(struct gk20a *g,
790 struct nvgpu_mem *mem, u64 gpu_va);
791
792u32 gk20a_gr_gpc_offset(struct gk20a *g, u32 gpc);
793u32 gk20a_gr_tpc_offset(struct gk20a *g, u32 tpc);
794void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc,
795 u32 *esr_sm_sel);
796void gk20a_gr_init_ovr_sm_dsm_perf(void);
797void gk20a_gr_get_ovr_perf_regs(struct gk20a *g, u32 *num_ovr_perf_regs,
798 u32 **ovr_perf_regs);
799void gk20a_gr_init_ctxsw_hdr_data(struct gk20a *g,
800 struct nvgpu_mem *mem);
801u32 gr_gk20a_get_patch_slots(struct gk20a *g);
802int gk20a_gr_handle_notify_pending(struct gk20a *g,
803 struct gr_gk20a_isr_data *isr_data);
804
805int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
806int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
807 struct channel_gk20a *c);
808int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
809 struct channel_gk20a *c, bool patch);
810
811int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
812 struct channel_gk20a *c);
813u32 gk20a_init_sw_bundle(struct gk20a *g);
814int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type);
815int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
816 struct gr_gk20a_isr_data *isr_data);
817int gr_gk20a_add_ctxsw_reg_pm_fbpa(struct gk20a *g,
818 struct ctxsw_buf_offset_map_entry *map,
819 struct aiv_list_gk20a *regs,
820 u32 *count, u32 *offset,
821 u32 max_cnt, u32 base,
822 u32 num_fbpas, u32 stride, u32 mask);
823int gr_gk20a_add_ctxsw_reg_perf_pma(struct ctxsw_buf_offset_map_entry *map,
824 struct aiv_list_gk20a *regs,
825 u32 *count, u32 *offset,
826 u32 max_cnt, u32 base, u32 mask);
827int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
828 enum ctxsw_addr_type *addr_type,
829 u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
830 u32 *broadcast_flags);
831int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
832 u32 gpc_num,
833 u32 *priv_addr_table, u32 *t);
834int gr_gk20a_create_priv_addr_table(struct gk20a *g,
835 u32 addr,
836 u32 *priv_addr_table,
837 u32 *num_registers);
838void gr_gk20a_split_fbpa_broadcast_addr(struct gk20a *g, u32 addr,
839 u32 num_fbpas,
840 u32 *priv_addr_table, u32 *t);
841int gr_gk20a_get_offset_in_gpccs_segment(struct gk20a *g,
842 enum ctxsw_addr_type addr_type, u32 num_tpcs, u32 num_ppcs,
843 u32 reg_list_ppc_count, u32 *__offset_in_segment);
844
845void gk20a_gr_destroy_ctx_buffer(struct gk20a *g,
846 struct gr_ctx_buffer_desc *desc);
847int gk20a_gr_alloc_ctx_buffer(struct gk20a *g,
848 struct gr_ctx_buffer_desc *desc, size_t size);
849void gk20a_gr_flush_channel_tlb(struct gr_gk20a *gr);
850int gr_gk20a_set_fecs_watchdog_timeout(struct gk20a *g);
851#endif /*__GR_GK20A_H__*/
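Several of the declarations above come in disable/enable or suspend/resume pairs. A hedged sketch of the usual pairing for context-switch control is shown here; the work done between the two calls is a placeholder.

	/* Sketch: stop FECS context switching, do work that must not race
	 * with a context switch, then re-enable it. Error handling minimal. */
	static int example_with_ctxsw_disabled(struct gk20a *g)
	{
		int err = gr_gk20a_disable_ctxsw(g);

		if (err != 0)
			return err;

		/* ... placeholder: inspect or patch context state here ... */

		return gr_gk20a_enable_ctxsw(g);
	}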
diff --git a/include/gk20a/gr_pri_gk20a.h b/include/gk20a/gr_pri_gk20a.h
new file mode 100644
index 0000000..d832d90
--- /dev/null
+++ b/include/gk20a/gr_pri_gk20a.h
@@ -0,0 +1,261 @@
1/*
2 * GK20A Graphics Context Pri Register Addressing
3 *
4 * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24#ifndef GR_PRI_GK20A_H
25#define GR_PRI_GK20A_H
26
27/*
 28 * These convenience macros are generally for use in the management/modification
29 * of the context state store for gr/compute contexts.
30 */
31
32/*
33 * GPC pri addressing
34 */
35static inline u32 pri_gpccs_addr_width(void)
36{
37 return 15; /*from where?*/
38}
39static inline u32 pri_gpccs_addr_mask(u32 addr)
40{
41 return addr & ((1 << pri_gpccs_addr_width()) - 1);
42}
43static inline u32 pri_gpc_addr(struct gk20a *g, u32 addr, u32 gpc)
44{
45 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
46 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
47 return gpc_base + (gpc * gpc_stride) + addr;
48}
49static inline bool pri_is_gpc_addr_shared(struct gk20a *g, u32 addr)
50{
51 u32 gpc_shared_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_SHARED_BASE);
52 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
53 return (addr >= gpc_shared_base) &&
54 (addr < gpc_shared_base + gpc_stride);
55}
56static inline bool pri_is_gpc_addr(struct gk20a *g, u32 addr)
57{
58 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
59 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
60 u32 num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
61 return ((addr >= gpc_base) &&
62 (addr < gpc_base + num_gpcs * gpc_stride)) ||
63 pri_is_gpc_addr_shared(g, addr);
64}
65static inline u32 pri_get_gpc_num(struct gk20a *g, u32 addr)
66{
67 u32 i, start;
68 u32 num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
69 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
70 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
71 for (i = 0; i < num_gpcs; i++) {
72 start = gpc_base + (i * gpc_stride);
73 if ((addr >= start) && (addr < (start + gpc_stride)))
74 return i;
75 }
76 return 0;
77}
78
79/*
80 * PPC pri addressing
81 */
82static inline bool pri_is_ppc_addr_shared(struct gk20a *g, u32 addr)
83{
84 u32 ppc_in_gpc_shared_base = nvgpu_get_litter_value(g,
85 GPU_LIT_PPC_IN_GPC_SHARED_BASE);
86 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g,
87 GPU_LIT_PPC_IN_GPC_STRIDE);
88
89 return ((addr >= ppc_in_gpc_shared_base) &&
90 (addr < (ppc_in_gpc_shared_base + ppc_in_gpc_stride)));
91}
92
93static inline bool pri_is_ppc_addr(struct gk20a *g, u32 addr)
94{
95 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g,
96 GPU_LIT_PPC_IN_GPC_BASE);
97 u32 num_pes_per_gpc = nvgpu_get_litter_value(g,
98 GPU_LIT_NUM_PES_PER_GPC);
99 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g,
100 GPU_LIT_PPC_IN_GPC_STRIDE);
101
102 return ((addr >= ppc_in_gpc_base) &&
103 (addr < ppc_in_gpc_base + num_pes_per_gpc * ppc_in_gpc_stride))
104 || pri_is_ppc_addr_shared(g, addr);
105}
106
107/*
108 * TPC pri addressing
109 */
110static inline u32 pri_tpccs_addr_width(void)
111{
112 return 11; /* from where? */
113}
114static inline u32 pri_tpccs_addr_mask(u32 addr)
115{
116 return addr & ((1 << pri_tpccs_addr_width()) - 1);
117}
118static inline u32 pri_fbpa_addr_mask(struct gk20a *g, u32 addr)
119{
120 return addr & (nvgpu_get_litter_value(g, GPU_LIT_FBPA_STRIDE) - 1);
121}
122static inline u32 pri_tpc_addr(struct gk20a *g, u32 addr, u32 gpc, u32 tpc)
123{
124 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
125 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
126 u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
127 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
128 return gpc_base + (gpc * gpc_stride) +
129 tpc_in_gpc_base + (tpc * tpc_in_gpc_stride) +
130 addr;
131}
132static inline bool pri_is_tpc_addr_shared(struct gk20a *g, u32 addr)
133{
134 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
135 u32 tpc_in_gpc_shared_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_SHARED_BASE);
136 return (addr >= tpc_in_gpc_shared_base) &&
137 (addr < (tpc_in_gpc_shared_base +
138 tpc_in_gpc_stride));
139}
140static inline u32 pri_fbpa_addr(struct gk20a *g, u32 addr, u32 fbpa)
141{
142 return (nvgpu_get_litter_value(g, GPU_LIT_FBPA_BASE) + addr +
143 (fbpa * nvgpu_get_litter_value(g, GPU_LIT_FBPA_STRIDE)));
144}
145static inline bool pri_is_fbpa_addr_shared(struct gk20a *g, u32 addr)
146{
147 u32 fbpa_shared_base = nvgpu_get_litter_value(g, GPU_LIT_FBPA_SHARED_BASE);
148 u32 fbpa_stride = nvgpu_get_litter_value(g, GPU_LIT_FBPA_STRIDE);
149 return ((addr >= fbpa_shared_base) &&
150 (addr < (fbpa_shared_base + fbpa_stride)));
151}
152static inline bool pri_is_fbpa_addr(struct gk20a *g, u32 addr)
153{
154 u32 fbpa_base = nvgpu_get_litter_value(g, GPU_LIT_FBPA_BASE);
155 u32 fbpa_stride = nvgpu_get_litter_value(g, GPU_LIT_FBPA_STRIDE);
156 u32 num_fbpas = nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS);
157 return (((addr >= fbpa_base) &&
158 (addr < (fbpa_base + num_fbpas * fbpa_stride)))
159 || pri_is_fbpa_addr_shared(g, addr));
160}
161/*
162 * BE pri addressing
163 */
164static inline u32 pri_becs_addr_width(void)
165{
166 return 10;/* from where? */
167}
168static inline u32 pri_becs_addr_mask(u32 addr)
169{
170 return addr & ((1 << pri_becs_addr_width()) - 1);
171}
172static inline bool pri_is_be_addr_shared(struct gk20a *g, u32 addr)
173{
174 u32 rop_shared_base = nvgpu_get_litter_value(g, GPU_LIT_ROP_SHARED_BASE);
175 u32 rop_stride = nvgpu_get_litter_value(g, GPU_LIT_ROP_STRIDE);
176 return (addr >= rop_shared_base) &&
177 (addr < rop_shared_base + rop_stride);
178}
179static inline u32 pri_be_shared_addr(struct gk20a *g, u32 addr)
180{
181 u32 rop_shared_base = nvgpu_get_litter_value(g, GPU_LIT_ROP_SHARED_BASE);
182 return rop_shared_base + pri_becs_addr_mask(addr);
183}
184static inline bool pri_is_be_addr(struct gk20a *g, u32 addr)
185{
186 u32 rop_base = nvgpu_get_litter_value(g, GPU_LIT_ROP_BASE);
187 u32 rop_stride = nvgpu_get_litter_value(g, GPU_LIT_ROP_STRIDE);
188 return ((addr >= rop_base) &&
189 (addr < rop_base + g->ltc_count * rop_stride)) ||
190 pri_is_be_addr_shared(g, addr);
191}
192
193static inline u32 pri_get_be_num(struct gk20a *g, u32 addr)
194{
195 u32 i, start;
196 u32 num_fbps = nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPS);
197 u32 rop_base = nvgpu_get_litter_value(g, GPU_LIT_ROP_BASE);
198 u32 rop_stride = nvgpu_get_litter_value(g, GPU_LIT_ROP_STRIDE);
199 for (i = 0; i < num_fbps; i++) {
200 start = rop_base + (i * rop_stride);
201 if ((addr >= start) && (addr < (start + rop_stride)))
202 return i;
203 }
204 return 0;
205}
206
207/*
208 * PPC pri addressing
209 */
210static inline u32 pri_ppccs_addr_width(void)
211{
212 return 9; /* from where? */
213}
214static inline u32 pri_ppccs_addr_mask(u32 addr)
215{
216 return addr & ((1 << pri_ppccs_addr_width()) - 1);
217}
218static inline u32 pri_ppc_addr(struct gk20a *g, u32 addr, u32 gpc, u32 ppc)
219{
220 u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
221 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
222 u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE);
223 u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
224 return gpc_base + (gpc * gpc_stride) +
225 ppc_in_gpc_base + (ppc * ppc_in_gpc_stride) + addr;
226}
227
228enum ctxsw_addr_type {
229 CTXSW_ADDR_TYPE_SYS = 0,
230 CTXSW_ADDR_TYPE_GPC = 1,
231 CTXSW_ADDR_TYPE_TPC = 2,
232 CTXSW_ADDR_TYPE_BE = 3,
233 CTXSW_ADDR_TYPE_PPC = 4,
234 CTXSW_ADDR_TYPE_LTCS = 5,
235 CTXSW_ADDR_TYPE_FBPA = 6,
236 CTXSW_ADDR_TYPE_EGPC = 7,
237 CTXSW_ADDR_TYPE_ETPC = 8,
238 CTXSW_ADDR_TYPE_ROP = 9,
239 CTXSW_ADDR_TYPE_FBP = 10,
240};
241
242#define PRI_BROADCAST_FLAGS_NONE 0U
243#define PRI_BROADCAST_FLAGS_GPC BIT32(0)
244#define PRI_BROADCAST_FLAGS_TPC BIT32(1)
245#define PRI_BROADCAST_FLAGS_BE BIT32(2)
246#define PRI_BROADCAST_FLAGS_PPC BIT32(3)
247#define PRI_BROADCAST_FLAGS_LTCS BIT32(4)
248#define PRI_BROADCAST_FLAGS_LTSS BIT32(5)
249#define PRI_BROADCAST_FLAGS_FBPA BIT32(6)
250#define PRI_BROADCAST_FLAGS_EGPC BIT32(7)
251#define PRI_BROADCAST_FLAGS_ETPC BIT32(8)
252#define PRI_BROADCAST_FLAGS_PMMGPC BIT32(9)
253#define PRI_BROADCAST_FLAGS_PMM_GPCS BIT32(10)
254#define PRI_BROADCAST_FLAGS_PMM_GPCGS_GPCTPCA BIT32(11)
255#define PRI_BROADCAST_FLAGS_PMM_GPCGS_GPCTPCB BIT32(12)
256#define PRI_BROADCAST_FLAGS_PMMFBP BIT32(13)
257#define PRI_BROADCAST_FLAGS_PMM_FBPS BIT32(14)
258#define PRI_BROADCAST_FLAGS_PMM_FBPGS_LTC BIT32(15)
259#define PRI_BROADCAST_FLAGS_PMM_FBPGS_ROP BIT32(16)
260
261#endif /* GR_PRI_GK20A_H */
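As a brief illustration of how these pri-addressing helpers compose, the hedged sketch below builds a per-TPC register address from a TPC-relative offset and then recovers the owning GPC index; the offset value is a placeholder.

	/* Sketch: given an offset relative to a TPC's pri space, build the
	 * absolute address for (gpc 0, tpc 0) and map it back to its GPC. */
	static void example_pri_addressing(struct gk20a *g)
	{
		u32 rel_offset = 0x0;                       /* placeholder offset */
		u32 addr = pri_tpc_addr(g, pri_tpccs_addr_mask(rel_offset), 0U, 0U);

		if (pri_is_gpc_addr(g, addr)) {
			u32 gpc = pri_get_gpc_num(g, addr); /* 0 for this example */
			(void)gpc;
		}
	}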
diff --git a/include/gk20a/hw_bus_gk20a.h b/include/gk20a/hw_bus_gk20a.h
new file mode 100644
index 0000000..d3bb9e9
--- /dev/null
+++ b/include/gk20a/hw_bus_gk20a.h
@@ -0,0 +1,171 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_bus_gk20a_h_
57#define _hw_bus_gk20a_h_
58
59static inline u32 bus_bar0_window_r(void)
60{
61 return 0x00001700U;
62}
63static inline u32 bus_bar0_window_base_f(u32 v)
64{
65 return (v & 0xffffffU) << 0U;
66}
67static inline u32 bus_bar0_window_target_vid_mem_f(void)
68{
69 return 0x0U;
70}
71static inline u32 bus_bar0_window_target_sys_mem_coherent_f(void)
72{
73 return 0x2000000U;
74}
75static inline u32 bus_bar0_window_target_sys_mem_noncoherent_f(void)
76{
77 return 0x3000000U;
78}
79static inline u32 bus_bar0_window_target_bar0_window_base_shift_v(void)
80{
81 return 0x00000010U;
82}
83static inline u32 bus_bar1_block_r(void)
84{
85 return 0x00001704U;
86}
87static inline u32 bus_bar1_block_ptr_f(u32 v)
88{
89 return (v & 0xfffffffU) << 0U;
90}
91static inline u32 bus_bar1_block_target_vid_mem_f(void)
92{
93 return 0x0U;
94}
95static inline u32 bus_bar1_block_target_sys_mem_coh_f(void)
96{
97 return 0x20000000U;
98}
99static inline u32 bus_bar1_block_target_sys_mem_ncoh_f(void)
100{
101 return 0x30000000U;
102}
103static inline u32 bus_bar1_block_mode_virtual_f(void)
104{
105 return 0x80000000U;
106}
107static inline u32 bus_bar2_block_r(void)
108{
109 return 0x00001714U;
110}
111static inline u32 bus_bar2_block_ptr_f(u32 v)
112{
113 return (v & 0xfffffffU) << 0U;
114}
115static inline u32 bus_bar2_block_target_vid_mem_f(void)
116{
117 return 0x0U;
118}
119static inline u32 bus_bar2_block_target_sys_mem_coh_f(void)
120{
121 return 0x20000000U;
122}
123static inline u32 bus_bar2_block_target_sys_mem_ncoh_f(void)
124{
125 return 0x30000000U;
126}
127static inline u32 bus_bar2_block_mode_virtual_f(void)
128{
129 return 0x80000000U;
130}
131static inline u32 bus_bar1_block_ptr_shift_v(void)
132{
133 return 0x0000000cU;
134}
135static inline u32 bus_bar2_block_ptr_shift_v(void)
136{
137 return 0x0000000cU;
138}
139static inline u32 bus_intr_0_r(void)
140{
141 return 0x00001100U;
142}
143static inline u32 bus_intr_0_pri_squash_m(void)
144{
145 return 0x1U << 1U;
146}
147static inline u32 bus_intr_0_pri_fecserr_m(void)
148{
149 return 0x1U << 2U;
150}
151static inline u32 bus_intr_0_pri_timeout_m(void)
152{
153 return 0x1U << 3U;
154}
155static inline u32 bus_intr_en_0_r(void)
156{
157 return 0x00001140U;
158}
159static inline u32 bus_intr_en_0_pri_squash_m(void)
160{
161 return 0x1U << 1U;
162}
163static inline u32 bus_intr_en_0_pri_fecserr_m(void)
164{
165 return 0x1U << 2U;
166}
167static inline u32 bus_intr_en_0_pri_timeout_m(void)
168{
169 return 0x1U << 3U;
170}
171#endif
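The naming convention described at the top of this header is easiest to see in use. The hedged sketch below composes a BAR1 block value from an instance-block pointer and a target aperture; gk20a_writel() is assumed as the register-write helper and the pointer value is whatever the caller derived (instance block address shifted by bus_bar1_block_ptr_shift_v()).

	/* Sketch: program BAR1 to point at an instance block in vidmem,
	 * in virtual mode. inst_ptr_12b is the inst block address >> 12. */
	static void example_program_bar1(struct gk20a *g, u32 inst_ptr_12b)
	{
		u32 val = bus_bar1_block_target_vid_mem_f() |
			  bus_bar1_block_mode_virtual_f() |
			  bus_bar1_block_ptr_f(inst_ptr_12b);

		/* gk20a_writel() is assumed here as the register-write helper. */
		gk20a_writel(g, bus_bar1_block_r(), val);
	}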
diff --git a/include/gk20a/hw_ccsr_gk20a.h b/include/gk20a/hw_ccsr_gk20a.h
new file mode 100644
index 0000000..95151f6
--- /dev/null
+++ b/include/gk20a/hw_ccsr_gk20a.h
@@ -0,0 +1,163 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_ccsr_gk20a_h_
57#define _hw_ccsr_gk20a_h_
58
59static inline u32 ccsr_channel_inst_r(u32 i)
60{
61 return 0x00800000U + i*8U;
62}
63static inline u32 ccsr_channel_inst__size_1_v(void)
64{
65 return 0x00000080U;
66}
67static inline u32 ccsr_channel_inst_ptr_f(u32 v)
68{
69 return (v & 0xfffffffU) << 0U;
70}
71static inline u32 ccsr_channel_inst_target_vid_mem_f(void)
72{
73 return 0x0U;
74}
75static inline u32 ccsr_channel_inst_target_sys_mem_coh_f(void)
76{
77 return 0x20000000U;
78}
79static inline u32 ccsr_channel_inst_target_sys_mem_ncoh_f(void)
80{
81 return 0x30000000U;
82}
83static inline u32 ccsr_channel_inst_bind_false_f(void)
84{
85 return 0x0U;
86}
87static inline u32 ccsr_channel_inst_bind_true_f(void)
88{
89 return 0x80000000U;
90}
91static inline u32 ccsr_channel_r(u32 i)
92{
93 return 0x00800004U + i*8U;
94}
95static inline u32 ccsr_channel__size_1_v(void)
96{
97 return 0x00000080U;
98}
99static inline u32 ccsr_channel_enable_v(u32 r)
100{
101 return (r >> 0U) & 0x1U;
102}
103static inline u32 ccsr_channel_enable_set_f(u32 v)
104{
105 return (v & 0x1U) << 10U;
106}
107static inline u32 ccsr_channel_enable_set_true_f(void)
108{
109 return 0x400U;
110}
111static inline u32 ccsr_channel_enable_clr_true_f(void)
112{
113 return 0x800U;
114}
115static inline u32 ccsr_channel_runlist_f(u32 v)
116{
117 return (v & 0xfU) << 16U;
118}
119static inline u32 ccsr_channel_status_v(u32 r)
120{
121 return (r >> 24U) & 0xfU;
122}
123static inline u32 ccsr_channel_status_pending_ctx_reload_v(void)
124{
125 return 0x00000002U;
126}
127static inline u32 ccsr_channel_status_pending_acq_ctx_reload_v(void)
128{
129 return 0x00000004U;
130}
131static inline u32 ccsr_channel_status_on_pbdma_ctx_reload_v(void)
132{
133 return 0x0000000aU;
134}
135static inline u32 ccsr_channel_status_on_pbdma_and_eng_ctx_reload_v(void)
136{
137 return 0x0000000bU;
138}
139static inline u32 ccsr_channel_status_on_eng_ctx_reload_v(void)
140{
141 return 0x0000000cU;
142}
143static inline u32 ccsr_channel_status_on_eng_pending_ctx_reload_v(void)
144{
145 return 0x0000000dU;
146}
147static inline u32 ccsr_channel_status_on_eng_pending_acq_ctx_reload_v(void)
148{
149 return 0x0000000eU;
150}
151static inline u32 ccsr_channel_next_v(u32 r)
152{
153 return (r >> 1U) & 0x1U;
154}
155static inline u32 ccsr_channel_next_true_v(void)
156{
157 return 0x00000001U;
158}
159static inline u32 ccsr_channel_busy_v(u32 r)
160{
161 return (r >> 28U) & 0x1U;
162}
163#endif
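A short, hedged sketch of how these accessors decode a CCSR channel register follows; gk20a_readl() is assumed as the register-read helper.

	/* Sketch: check whether channel 'chid' is enabled and busy, and
	 * fetch its raw status field. */
	static void example_check_channel(struct gk20a *g, u32 chid)
	{
		u32 reg = gk20a_readl(g, ccsr_channel_r(chid));

		bool enabled = ccsr_channel_enable_v(reg) != 0U;
		bool busy    = ccsr_channel_busy_v(reg) != 0U;
		u32  status  = ccsr_channel_status_v(reg);

		(void)enabled; (void)busy; (void)status;
	}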
diff --git a/include/gk20a/hw_ce2_gk20a.h b/include/gk20a/hw_ce2_gk20a.h
new file mode 100644
index 0000000..87481cd
--- /dev/null
+++ b/include/gk20a/hw_ce2_gk20a.h
@@ -0,0 +1,87 @@
1/*
2 * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_ce2_gk20a_h_
57#define _hw_ce2_gk20a_h_
58
59static inline u32 ce2_intr_status_r(void)
60{
61 return 0x00106908U;
62}
63static inline u32 ce2_intr_status_blockpipe_pending_f(void)
64{
65 return 0x1U;
66}
67static inline u32 ce2_intr_status_blockpipe_reset_f(void)
68{
69 return 0x1U;
70}
71static inline u32 ce2_intr_status_nonblockpipe_pending_f(void)
72{
73 return 0x2U;
74}
75static inline u32 ce2_intr_status_nonblockpipe_reset_f(void)
76{
77 return 0x2U;
78}
79static inline u32 ce2_intr_status_launcherr_pending_f(void)
80{
81 return 0x4U;
82}
83static inline u32 ce2_intr_status_launcherr_reset_f(void)
84{
85 return 0x4U;
86}
87#endif
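To show how the _pending_f()/_reset_f() pairs are meant to be used together, here is a hedged interrupt-servicing sketch; gk20a_readl()/gk20a_writel() are assumed helpers and the clearing policy is illustrative.

	/* Sketch: acknowledge whichever CE2 interrupt causes are pending. */
	static void example_service_ce2_intr(struct gk20a *g)
	{
		u32 status = gk20a_readl(g, ce2_intr_status_r());
		u32 clear = 0U;

		if (status & ce2_intr_status_blockpipe_pending_f())
			clear |= ce2_intr_status_blockpipe_reset_f();
		if (status & ce2_intr_status_nonblockpipe_pending_f())
			clear |= ce2_intr_status_nonblockpipe_reset_f();
		if (status & ce2_intr_status_launcherr_pending_f())
			clear |= ce2_intr_status_launcherr_reset_f();

		if (clear != 0U)
			gk20a_writel(g, ce2_intr_status_r(), clear);
	}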
diff --git a/include/gk20a/hw_ctxsw_prog_gk20a.h b/include/gk20a/hw_ctxsw_prog_gk20a.h
new file mode 100644
index 0000000..131fd12
--- /dev/null
+++ b/include/gk20a/hw_ctxsw_prog_gk20a.h
@@ -0,0 +1,447 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_ctxsw_prog_gk20a_h_
57#define _hw_ctxsw_prog_gk20a_h_
58
59static inline u32 ctxsw_prog_fecs_header_v(void)
60{
61 return 0x00000100U;
62}
63static inline u32 ctxsw_prog_main_image_num_gpcs_o(void)
64{
65 return 0x00000008U;
66}
67static inline u32 ctxsw_prog_main_image_patch_count_o(void)
68{
69 return 0x00000010U;
70}
71static inline u32 ctxsw_prog_main_image_context_id_o(void)
72{
73 return 0x000000f0U;
74}
75static inline u32 ctxsw_prog_main_image_patch_adr_lo_o(void)
76{
77 return 0x00000014U;
78}
79static inline u32 ctxsw_prog_main_image_patch_adr_hi_o(void)
80{
81 return 0x00000018U;
82}
83static inline u32 ctxsw_prog_main_image_zcull_o(void)
84{
85 return 0x0000001cU;
86}
87static inline u32 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v(void)
88{
89 return 0x00000001U;
90}
91static inline u32 ctxsw_prog_main_image_zcull_mode_separate_buffer_v(void)
92{
93 return 0x00000002U;
94}
95static inline u32 ctxsw_prog_main_image_zcull_ptr_o(void)
96{
97 return 0x00000020U;
98}
99static inline u32 ctxsw_prog_main_image_pm_o(void)
100{
101 return 0x00000028U;
102}
103static inline u32 ctxsw_prog_main_image_pm_mode_m(void)
104{
105 return 0x7U << 0U;
106}
107static inline u32 ctxsw_prog_main_image_pm_mode_ctxsw_f(void)
108{
109 return 0x1U;
110}
111static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void)
112{
113 return 0x0U;
114}
115static inline u32 ctxsw_prog_main_image_pm_smpc_mode_m(void)
116{
117 return 0x7U << 3U;
118}
119static inline u32 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f(void)
120{
121 return 0x8U;
122}
123static inline u32 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f(void)
124{
125 return 0x0U;
126}
127static inline u32 ctxsw_prog_main_image_pm_ptr_o(void)
128{
129 return 0x0000002cU;
130}
131static inline u32 ctxsw_prog_main_image_num_save_ops_o(void)
132{
133 return 0x000000f4U;
134}
135static inline u32 ctxsw_prog_main_image_num_restore_ops_o(void)
136{
137 return 0x000000f8U;
138}
139static inline u32 ctxsw_prog_main_image_magic_value_o(void)
140{
141 return 0x000000fcU;
142}
143static inline u32 ctxsw_prog_main_image_magic_value_v_value_v(void)
144{
145 return 0x600dc0deU;
146}
147static inline u32 ctxsw_prog_local_priv_register_ctl_o(void)
148{
149 return 0x0000000cU;
150}
151static inline u32 ctxsw_prog_local_priv_register_ctl_offset_v(u32 r)
152{
153 return (r >> 0U) & 0xffffU;
154}
155static inline u32 ctxsw_prog_local_image_ppc_info_o(void)
156{
157 return 0x000000f4U;
158}
159static inline u32 ctxsw_prog_local_image_ppc_info_num_ppcs_v(u32 r)
160{
161 return (r >> 0U) & 0xffffU;
162}
163static inline u32 ctxsw_prog_local_image_ppc_info_ppc_mask_v(u32 r)
164{
165 return (r >> 16U) & 0xffffU;
166}
167static inline u32 ctxsw_prog_local_image_num_tpcs_o(void)
168{
169 return 0x000000f8U;
170}
171static inline u32 ctxsw_prog_local_magic_value_o(void)
172{
173 return 0x000000fcU;
174}
175static inline u32 ctxsw_prog_local_magic_value_v_value_v(void)
176{
177 return 0xad0becabU;
178}
179static inline u32 ctxsw_prog_main_extended_buffer_ctl_o(void)
180{
181 return 0x000000ecU;
182}
183static inline u32 ctxsw_prog_main_extended_buffer_ctl_offset_v(u32 r)
184{
185 return (r >> 0U) & 0xffffU;
186}
187static inline u32 ctxsw_prog_main_extended_buffer_ctl_size_v(u32 r)
188{
189 return (r >> 16U) & 0xffU;
190}
191static inline u32 ctxsw_prog_extended_buffer_segments_size_in_bytes_v(void)
192{
193 return 0x00000100U;
194}
195static inline u32 ctxsw_prog_extended_marker_size_in_bytes_v(void)
196{
197 return 0x00000004U;
198}
199static inline u32 ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v(void)
200{
201 return 0x00000005U;
202}
203static inline u32 ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v(void)
204{
205 return 0x00000004U;
206}
207static inline u32 ctxsw_prog_extended_num_smpc_quadrants_v(void)
208{
209 return 0x00000004U;
210}
211static inline u32 ctxsw_prog_main_image_priv_access_map_config_o(void)
212{
213 return 0x000000a0U;
214}
215static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_s(void)
216{
217 return 2U;
218}
219static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_f(u32 v)
220{
221 return (v & 0x3U) << 0U;
222}
223static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_m(void)
224{
225 return 0x3U << 0U;
226}
227static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_v(u32 r)
228{
229 return (r >> 0U) & 0x3U;
230}
231static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_allow_all_f(void)
232{
233 return 0x0U;
234}
235static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f(void)
236{
237 return 0x2U;
238}
239static inline u32 ctxsw_prog_main_image_priv_access_map_addr_lo_o(void)
240{
241 return 0x000000a4U;
242}
243static inline u32 ctxsw_prog_main_image_priv_access_map_addr_hi_o(void)
244{
245 return 0x000000a8U;
246}
247static inline u32 ctxsw_prog_main_image_misc_options_o(void)
248{
249 return 0x0000003cU;
250}
251static inline u32 ctxsw_prog_main_image_misc_options_verif_features_m(void)
252{
253 return 0x1U << 3U;
254}
255static inline u32 ctxsw_prog_main_image_misc_options_verif_features_disabled_f(void)
256{
257 return 0x0U;
258}
259static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_control_o(void)
260{
261 return 0x000000acU;
262}
263static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(u32 v)
264{
265 return (v & 0xffffU) << 0U;
266}
267static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(void)
268{
269 return 0x000000b0U;
270}
271static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_v_m(void)
272{
273 return 0xfffffffU << 0U;
274}
275static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_m(void)
276{
277 return 0x3U << 28U;
278}
279static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_vid_mem_f(void)
280{
281 return 0x0U;
282}
283static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_coherent_f(void)
284{
285 return 0x20000000U;
286}
287static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_noncoherent_f(void)
288{
289 return 0x30000000U;
290}
291static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(void)
292{
293 return 0x000000b4U;
294}
295static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(u32 v)
296{
297 return (v & 0xffffffffU) << 0U;
298}
299static inline u32 ctxsw_prog_record_timestamp_record_size_in_bytes_v(void)
300{
301 return 0x00000080U;
302}
303static inline u32 ctxsw_prog_record_timestamp_record_size_in_words_v(void)
304{
305 return 0x00000020U;
306}
307static inline u32 ctxsw_prog_record_timestamp_magic_value_lo_o(void)
308{
309 return 0x00000000U;
310}
311static inline u32 ctxsw_prog_record_timestamp_magic_value_lo_v_value_v(void)
312{
313 return 0x00000000U;
314}
315static inline u32 ctxsw_prog_record_timestamp_magic_value_hi_o(void)
316{
317 return 0x00000004U;
318}
319static inline u32 ctxsw_prog_record_timestamp_magic_value_hi_v_value_v(void)
320{
321 return 0x600dbeefU;
322}
323static inline u32 ctxsw_prog_record_timestamp_context_id_o(void)
324{
325 return 0x00000008U;
326}
327static inline u32 ctxsw_prog_record_timestamp_context_ptr_o(void)
328{
329 return 0x0000000cU;
330}
331static inline u32 ctxsw_prog_record_timestamp_new_context_id_o(void)
332{
333 return 0x00000010U;
334}
335static inline u32 ctxsw_prog_record_timestamp_new_context_ptr_o(void)
336{
337 return 0x00000014U;
338}
339static inline u32 ctxsw_prog_record_timestamp_timestamp_lo_o(void)
340{
341 return 0x00000018U;
342}
343static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_o(void)
344{
345 return 0x0000001cU;
346}
347static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_v_f(u32 v)
348{
349 return (v & 0xffffffU) << 0U;
350}
351static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_v_v(u32 r)
352{
353 return (r >> 0U) & 0xffffffU;
354}
355static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_f(u32 v)
356{
357 return (v & 0xffU) << 24U;
358}
359static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_m(void)
360{
361 return 0xffU << 24U;
362}
363static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_v(u32 r)
364{
365 return (r >> 24U) & 0xffU;
366}
367static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_ctxsw_req_by_host_v(void)
368{
369 return 0x00000001U;
370}
371static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_ctxsw_req_by_host_f(void)
372{
373 return 0x1000000U;
374}
375static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_v(void)
376{
377 return 0x00000002U;
378}
379static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_f(void)
380{
381 return 0x2000000U;
382}
383static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_wfi_v(void)
384{
385 return 0x0000000aU;
386}
387static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_wfi_f(void)
388{
389 return 0xa000000U;
390}
391static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_gfxp_v(void)
392{
393 return 0x0000000bU;
394}
395static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_gfxp_f(void)
396{
397 return 0xb000000U;
398}
399static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_ctap_v(void)
400{
401 return 0x0000000cU;
402}
403static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_ctap_f(void)
404{
405 return 0xc000000U;
406}
407static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_cilp_v(void)
408{
409 return 0x0000000dU;
410}
411static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_cilp_f(void)
412{
413 return 0xd000000U;
414}
415static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_save_end_v(void)
416{
417 return 0x00000003U;
418}
419static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_save_end_f(void)
420{
421 return 0x3000000U;
422}
423static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_restore_start_v(void)
424{
425 return 0x00000004U;
426}
427static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_restore_start_f(void)
428{
429 return 0x4000000U;
430}
431static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_context_start_v(void)
432{
433 return 0x00000005U;
434}
435static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_context_start_f(void)
436{
437 return 0x5000000U;
438}
439static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v(void)
440{
441 return 0x000000ffU;
442}
443static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_f(void)
444{
445 return 0xff000000U;
446}
447#endif
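The timestamp-record accessors near the end of this header decode the packed tag/timestamp words of a context-switch trace record. A hedged sketch of that decoding is given below; the field splitting follows the accessors above, but the surrounding buffer handling is left out and the word comes from an assumed caller.

	/* Sketch: split a record's timestamp_hi word into its tag and the
	 * upper 24 bits of the timestamp. */
	static void example_decode_ts_hi(u32 ts_hi_word)
	{
		u32 tag   = ctxsw_prog_record_timestamp_timestamp_hi_tag_v(ts_hi_word);
		u32 ts_hi = ctxsw_prog_record_timestamp_timestamp_hi_v_v(ts_hi_word);

		if (tag == ctxsw_prog_record_timestamp_timestamp_hi_tag_save_end_v()) {
			/* a context save completed at this timestamp */
		}

		(void)ts_hi;
	}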
diff --git a/include/gk20a/hw_falcon_gk20a.h b/include/gk20a/hw_falcon_gk20a.h
new file mode 100644
index 0000000..7b4d87b
--- /dev/null
+++ b/include/gk20a/hw_falcon_gk20a.h
@@ -0,0 +1,559 @@
1/*
2 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_falcon_gk20a_h_
57#define _hw_falcon_gk20a_h_
58
59static inline u32 falcon_falcon_irqsset_r(void)
60{
61 return 0x00000000U;
62}
63static inline u32 falcon_falcon_irqsset_swgen0_set_f(void)
64{
65 return 0x40U;
66}
67static inline u32 falcon_falcon_irqsclr_r(void)
68{
69 return 0x00000004U;
70}
71static inline u32 falcon_falcon_irqstat_r(void)
72{
73 return 0x00000008U;
74}
75static inline u32 falcon_falcon_irqstat_halt_true_f(void)
76{
77 return 0x10U;
78}
79static inline u32 falcon_falcon_irqstat_exterr_true_f(void)
80{
81 return 0x20U;
82}
83static inline u32 falcon_falcon_irqstat_swgen0_true_f(void)
84{
85 return 0x40U;
86}
87static inline u32 falcon_falcon_irqmode_r(void)
88{
89 return 0x0000000cU;
90}
91static inline u32 falcon_falcon_irqmset_r(void)
92{
93 return 0x00000010U;
94}
95static inline u32 falcon_falcon_irqmset_gptmr_f(u32 v)
96{
97 return (v & 0x1U) << 0U;
98}
99static inline u32 falcon_falcon_irqmset_wdtmr_f(u32 v)
100{
101 return (v & 0x1U) << 1U;
102}
103static inline u32 falcon_falcon_irqmset_mthd_f(u32 v)
104{
105 return (v & 0x1U) << 2U;
106}
107static inline u32 falcon_falcon_irqmset_ctxsw_f(u32 v)
108{
109 return (v & 0x1U) << 3U;
110}
111static inline u32 falcon_falcon_irqmset_halt_f(u32 v)
112{
113 return (v & 0x1U) << 4U;
114}
115static inline u32 falcon_falcon_irqmset_exterr_f(u32 v)
116{
117 return (v & 0x1U) << 5U;
118}
119static inline u32 falcon_falcon_irqmset_swgen0_f(u32 v)
120{
121 return (v & 0x1U) << 6U;
122}
123static inline u32 falcon_falcon_irqmset_swgen1_f(u32 v)
124{
125 return (v & 0x1U) << 7U;
126}
127static inline u32 falcon_falcon_irqmclr_r(void)
128{
129 return 0x00000014U;
130}
131static inline u32 falcon_falcon_irqmclr_gptmr_f(u32 v)
132{
133 return (v & 0x1U) << 0U;
134}
135static inline u32 falcon_falcon_irqmclr_wdtmr_f(u32 v)
136{
137 return (v & 0x1U) << 1U;
138}
139static inline u32 falcon_falcon_irqmclr_mthd_f(u32 v)
140{
141 return (v & 0x1U) << 2U;
142}
143static inline u32 falcon_falcon_irqmclr_ctxsw_f(u32 v)
144{
145 return (v & 0x1U) << 3U;
146}
147static inline u32 falcon_falcon_irqmclr_halt_f(u32 v)
148{
149 return (v & 0x1U) << 4U;
150}
151static inline u32 falcon_falcon_irqmclr_exterr_f(u32 v)
152{
153 return (v & 0x1U) << 5U;
154}
155static inline u32 falcon_falcon_irqmclr_swgen0_f(u32 v)
156{
157 return (v & 0x1U) << 6U;
158}
159static inline u32 falcon_falcon_irqmclr_swgen1_f(u32 v)
160{
161 return (v & 0x1U) << 7U;
162}
163static inline u32 falcon_falcon_irqmclr_ext_f(u32 v)
164{
165 return (v & 0xffU) << 8U;
166}
167static inline u32 falcon_falcon_irqmask_r(void)
168{
169 return 0x00000018U;
170}
171static inline u32 falcon_falcon_irqdest_r(void)
172{
173 return 0x0000001cU;
174}
175static inline u32 falcon_falcon_irqdest_host_gptmr_f(u32 v)
176{
177 return (v & 0x1U) << 0U;
178}
179static inline u32 falcon_falcon_irqdest_host_wdtmr_f(u32 v)
180{
181 return (v & 0x1U) << 1U;
182}
183static inline u32 falcon_falcon_irqdest_host_mthd_f(u32 v)
184{
185 return (v & 0x1U) << 2U;
186}
187static inline u32 falcon_falcon_irqdest_host_ctxsw_f(u32 v)
188{
189 return (v & 0x1U) << 3U;
190}
191static inline u32 falcon_falcon_irqdest_host_halt_f(u32 v)
192{
193 return (v & 0x1U) << 4U;
194}
195static inline u32 falcon_falcon_irqdest_host_exterr_f(u32 v)
196{
197 return (v & 0x1U) << 5U;
198}
199static inline u32 falcon_falcon_irqdest_host_swgen0_f(u32 v)
200{
201 return (v & 0x1U) << 6U;
202}
203static inline u32 falcon_falcon_irqdest_host_swgen1_f(u32 v)
204{
205 return (v & 0x1U) << 7U;
206}
207static inline u32 falcon_falcon_irqdest_host_ext_f(u32 v)
208{
209 return (v & 0xffU) << 8U;
210}
211static inline u32 falcon_falcon_irqdest_target_gptmr_f(u32 v)
212{
213 return (v & 0x1U) << 16U;
214}
215static inline u32 falcon_falcon_irqdest_target_wdtmr_f(u32 v)
216{
217 return (v & 0x1U) << 17U;
218}
219static inline u32 falcon_falcon_irqdest_target_mthd_f(u32 v)
220{
221 return (v & 0x1U) << 18U;
222}
223static inline u32 falcon_falcon_irqdest_target_ctxsw_f(u32 v)
224{
225 return (v & 0x1U) << 19U;
226}
227static inline u32 falcon_falcon_irqdest_target_halt_f(u32 v)
228{
229 return (v & 0x1U) << 20U;
230}
231static inline u32 falcon_falcon_irqdest_target_exterr_f(u32 v)
232{
233 return (v & 0x1U) << 21U;
234}
235static inline u32 falcon_falcon_irqdest_target_swgen0_f(u32 v)
236{
237 return (v & 0x1U) << 22U;
238}
239static inline u32 falcon_falcon_irqdest_target_swgen1_f(u32 v)
240{
241 return (v & 0x1U) << 23U;
242}
243static inline u32 falcon_falcon_irqdest_target_ext_f(u32 v)
244{
245 return (v & 0xffU) << 24U;
246}
247static inline u32 falcon_falcon_curctx_r(void)
248{
249 return 0x00000050U;
250}
251static inline u32 falcon_falcon_nxtctx_r(void)
252{
253 return 0x00000054U;
254}
255static inline u32 falcon_falcon_mailbox0_r(void)
256{
257 return 0x00000040U;
258}
259static inline u32 falcon_falcon_mailbox1_r(void)
260{
261 return 0x00000044U;
262}
263static inline u32 falcon_falcon_itfen_r(void)
264{
265 return 0x00000048U;
266}
267static inline u32 falcon_falcon_itfen_ctxen_enable_f(void)
268{
269 return 0x1U;
270}
271static inline u32 falcon_falcon_idlestate_r(void)
272{
273 return 0x0000004cU;
274}
275static inline u32 falcon_falcon_idlestate_falcon_busy_v(u32 r)
276{
277 return (r >> 0U) & 0x1U;
278}
279static inline u32 falcon_falcon_idlestate_ext_busy_v(u32 r)
280{
281 return (r >> 1U) & 0x7fffU;
282}
283static inline u32 falcon_falcon_os_r(void)
284{
285 return 0x00000080U;
286}
287static inline u32 falcon_falcon_engctl_r(void)
288{
289 return 0x000000a4U;
290}
291static inline u32 falcon_falcon_cpuctl_r(void)
292{
293 return 0x00000100U;
294}
295static inline u32 falcon_falcon_cpuctl_startcpu_f(u32 v)
296{
297 return (v & 0x1U) << 1U;
298}
299static inline u32 falcon_falcon_cpuctl_sreset_f(u32 v)
300{
301 return (v & 0x1U) << 2U;
302}
303static inline u32 falcon_falcon_cpuctl_hreset_f(u32 v)
304{
305 return (v & 0x1U) << 3U;
306}
307static inline u32 falcon_falcon_cpuctl_halt_intr_f(u32 v)
308{
309 return (v & 0x1U) << 4U;
310}
311static inline u32 falcon_falcon_cpuctl_halt_intr_m(void)
312{
313 return 0x1U << 4U;
314}
315static inline u32 falcon_falcon_cpuctl_halt_intr_v(u32 r)
316{
317 return (r >> 4U) & 0x1U;
318}
319static inline u32 falcon_falcon_cpuctl_stopped_m(void)
320{
321 return 0x1U << 5U;
322}
323static inline u32 falcon_falcon_imemc_r(u32 i)
324{
325 return 0x00000180U + i*16U;
326}
327static inline u32 falcon_falcon_imemc_offs_f(u32 v)
328{
329 return (v & 0x3fU) << 2U;
330}
331static inline u32 falcon_falcon_imemc_blk_f(u32 v)
332{
333 return (v & 0xffU) << 8U;
334}
335static inline u32 falcon_falcon_imemc_aincw_f(u32 v)
336{
337 return (v & 0x1U) << 24U;
338}
339static inline u32 falcon_falcon_imemc_secure_f(u32 v)
340{
341 return (v & 0x1U) << 28U;
342}
343static inline u32 falcon_falcon_imemd_r(u32 i)
344{
345 return 0x00000184U + i*16U;
346}
347static inline u32 falcon_falcon_imemt_r(u32 i)
348{
349 return 0x00000188U + i*16U;
350}
351static inline u32 falcon_falcon_bootvec_r(void)
352{
353 return 0x00000104U;
354}
355static inline u32 falcon_falcon_bootvec_vec_f(u32 v)
356{
357 return (v & 0xffffffffU) << 0U;
358}
359static inline u32 falcon_falcon_dmactl_r(void)
360{
361 return 0x0000010cU;
362}
363static inline u32 falcon_falcon_dmactl_dmem_scrubbing_m(void)
364{
365 return 0x1U << 1U;
366}
367static inline u32 falcon_falcon_dmactl_imem_scrubbing_m(void)
368{
369 return 0x1U << 2U;
370}
371static inline u32 falcon_falcon_dmactl_require_ctx_f(u32 v)
372{
373 return (v & 0x1U) << 0U;
374}
375static inline u32 falcon_falcon_hwcfg_r(void)
376{
377 return 0x00000108U;
378}
379static inline u32 falcon_falcon_hwcfg_imem_size_v(u32 r)
380{
381 return (r >> 0U) & 0x1ffU;
382}
383static inline u32 falcon_falcon_hwcfg_dmem_size_v(u32 r)
384{
385 return (r >> 9U) & 0x1ffU;
386}
387static inline u32 falcon_falcon_dmatrfbase_r(void)
388{
389 return 0x00000110U;
390}
391static inline u32 falcon_falcon_dmatrfmoffs_r(void)
392{
393 return 0x00000114U;
394}
395static inline u32 falcon_falcon_dmatrfcmd_r(void)
396{
397 return 0x00000118U;
398}
399static inline u32 falcon_falcon_dmatrfcmd_imem_f(u32 v)
400{
401 return (v & 0x1U) << 4U;
402}
403static inline u32 falcon_falcon_dmatrfcmd_write_f(u32 v)
404{
405 return (v & 0x1U) << 5U;
406}
407static inline u32 falcon_falcon_dmatrfcmd_size_f(u32 v)
408{
409 return (v & 0x7U) << 8U;
410}
411static inline u32 falcon_falcon_dmatrfcmd_ctxdma_f(u32 v)
412{
413 return (v & 0x7U) << 12U;
414}
415static inline u32 falcon_falcon_dmatrffboffs_r(void)
416{
417 return 0x0000011cU;
418}
419static inline u32 falcon_falcon_imstat_r(void)
420{
421 return 0x00000144U;
422}
423static inline u32 falcon_falcon_traceidx_r(void)
424{
425 return 0x00000148U;
426}
427static inline u32 falcon_falcon_traceidx_maxidx_v(u32 r)
428{
429 return (r >> 16U) & 0xffU;
430}
431static inline u32 falcon_falcon_traceidx_idx_v(u32 r)
432{
433 return (r >> 0U) & 0xffU;
434}
435static inline u32 falcon_falcon_tracepc_r(void)
436{
437 return 0x0000014cU;
438}
439static inline u32 falcon_falcon_tracepc_pc_v(u32 r)
440{
441 return (r >> 0U) & 0xffffffU;
442}
443static inline u32 falcon_falcon_exterraddr_r(void)
444{
445 return 0x00000168U;
446}
447static inline u32 falcon_falcon_exterrstat_r(void)
448{
449 return 0x0000016cU;
450}
451static inline u32 falcon_falcon_exterrstat_valid_m(void)
452{
453 return 0x1U << 31U;
454}
455static inline u32 falcon_falcon_exterrstat_valid_v(u32 r)
456{
457 return (r >> 31U) & 0x1U;
458}
459static inline u32 falcon_falcon_exterrstat_valid_true_v(void)
460{
461 return 0x00000001U;
462}
463static inline u32 falcon_falcon_icd_cmd_r(void)
464{
465 return 0x00000200U;
466}
467static inline u32 falcon_falcon_icd_cmd_opc_s(void)
468{
469 return 4U;
470}
471static inline u32 falcon_falcon_icd_cmd_opc_f(u32 v)
472{
473 return (v & 0xfU) << 0U;
474}
475static inline u32 falcon_falcon_icd_cmd_opc_m(void)
476{
477 return 0xfU << 0U;
478}
479static inline u32 falcon_falcon_icd_cmd_opc_v(u32 r)
480{
481 return (r >> 0U) & 0xfU;
482}
483static inline u32 falcon_falcon_icd_cmd_opc_rreg_f(void)
484{
485 return 0x8U;
486}
487static inline u32 falcon_falcon_icd_cmd_opc_rstat_f(void)
488{
489 return 0xeU;
490}
491static inline u32 falcon_falcon_icd_cmd_idx_f(u32 v)
492{
493 return (v & 0x1fU) << 8U;
494}
495static inline u32 falcon_falcon_icd_rdata_r(void)
496{
497 return 0x0000020cU;
498}
499static inline u32 falcon_falcon_dmemc_r(u32 i)
500{
501 return 0x000001c0U + i*8U;
502}
503static inline u32 falcon_falcon_dmemc_offs_f(u32 v)
504{
505 return (v & 0x3fU) << 2U;
506}
507static inline u32 falcon_falcon_dmemc_offs_m(void)
508{
509 return 0x3fU << 2U;
510}
511static inline u32 falcon_falcon_dmemc_blk_f(u32 v)
512{
513 return (v & 0xffU) << 8U;
514}
515static inline u32 falcon_falcon_dmemc_blk_m(void)
516{
517 return 0xffU << 8U;
518}
519static inline u32 falcon_falcon_dmemc_aincw_f(u32 v)
520{
521 return (v & 0x1U) << 24U;
522}
523static inline u32 falcon_falcon_dmemc_aincr_f(u32 v)
524{
525 return (v & 0x1U) << 25U;
526}
527static inline u32 falcon_falcon_dmemd_r(u32 i)
528{
529 return 0x000001c4U + i*8U;
530}
531static inline u32 falcon_falcon_debug1_r(void)
532{
533 return 0x00000090U;
534}
535static inline u32 falcon_falcon_debug1_ctxsw_mode_s(void)
536{
537 return 1U;
538}
539static inline u32 falcon_falcon_debug1_ctxsw_mode_f(u32 v)
540{
541 return (v & 0x1U) << 16U;
542}
543static inline u32 falcon_falcon_debug1_ctxsw_mode_m(void)
544{
545 return 0x1U << 16U;
546}
547static inline u32 falcon_falcon_debug1_ctxsw_mode_v(u32 r)
548{
549 return (r >> 16U) & 0x1U;
550}
551static inline u32 falcon_falcon_debug1_ctxsw_mode_init_f(void)
552{
553 return 0x0U;
554}
555static inline u32 falcon_falcon_debuginfo_r(void)
556{
557 return 0x00000094U;
558}
559#endif
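One common way the DMEM window registers defined above get used is an auto-increment read: program falcon_falcon_dmemc_r() once with the block, offset and aincr bit, then pull successive words out of falcon_falcon_dmemd_r(). A hedged sketch, assuming reg_read()/reg_write() MMIO helpers and a caller-supplied falcon unit base offset (neither is part of this header):

/* Illustrative only: reg_read/reg_write and 'base' are assumed helpers. */
extern u32 reg_read(u32 r);
extern void reg_write(u32 r, u32 v);

static void example_falcon_dmem_read(u32 base, u32 blk, u32 offs,
				     u32 *dst, u32 words)
{
	u32 i;

	/* port 0; offs is a word offset within the selected DMEM block */
	reg_write(base + falcon_falcon_dmemc_r(0),
		  falcon_falcon_dmemc_offs_f(offs) |
		  falcon_falcon_dmemc_blk_f(blk) |
		  falcon_falcon_dmemc_aincr_f(1));

	for (i = 0; i < words; i++) {
		/* each dmemd read advances the DMEM address automatically */
		dst[i] = reg_read(base + falcon_falcon_dmemd_r(0));
	}
}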
diff --git a/include/gk20a/hw_fb_gk20a.h b/include/gk20a/hw_fb_gk20a.h
new file mode 100644
index 0000000..42df4f5
--- /dev/null
+++ b/include/gk20a/hw_fb_gk20a.h
@@ -0,0 +1,263 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_fb_gk20a_h_
57#define _hw_fb_gk20a_h_
58
59static inline u32 fb_mmu_ctrl_r(void)
60{
61 return 0x00100c80U;
62}
63static inline u32 fb_mmu_ctrl_vm_pg_size_f(u32 v)
64{
65 return (v & 0x1U) << 0U;
66}
67static inline u32 fb_mmu_ctrl_vm_pg_size_128kb_f(void)
68{
69 return 0x0U;
70}
71static inline u32 fb_mmu_ctrl_vm_pg_size_64kb_f(void)
72{
73 return 0x1U;
74}
75static inline u32 fb_mmu_ctrl_pri_fifo_empty_v(u32 r)
76{
77 return (r >> 15U) & 0x1U;
78}
79static inline u32 fb_mmu_ctrl_pri_fifo_empty_false_f(void)
80{
81 return 0x0U;
82}
83static inline u32 fb_mmu_ctrl_pri_fifo_space_v(u32 r)
84{
85 return (r >> 16U) & 0xffU;
86}
87static inline u32 fb_mmu_invalidate_pdb_r(void)
88{
89 return 0x00100cb8U;
90}
91static inline u32 fb_mmu_invalidate_pdb_aperture_vid_mem_f(void)
92{
93 return 0x0U;
94}
95static inline u32 fb_mmu_invalidate_pdb_aperture_sys_mem_f(void)
96{
97 return 0x2U;
98}
99static inline u32 fb_mmu_invalidate_pdb_addr_f(u32 v)
100{
101 return (v & 0xfffffffU) << 4U;
102}
103static inline u32 fb_mmu_invalidate_r(void)
104{
105 return 0x00100cbcU;
106}
107static inline u32 fb_mmu_invalidate_all_va_true_f(void)
108{
109 return 0x1U;
110}
111static inline u32 fb_mmu_invalidate_all_pdb_true_f(void)
112{
113 return 0x2U;
114}
115static inline u32 fb_mmu_invalidate_trigger_s(void)
116{
117 return 1U;
118}
119static inline u32 fb_mmu_invalidate_trigger_f(u32 v)
120{
121 return (v & 0x1U) << 31U;
122}
123static inline u32 fb_mmu_invalidate_trigger_m(void)
124{
125 return 0x1U << 31U;
126}
127static inline u32 fb_mmu_invalidate_trigger_v(u32 r)
128{
129 return (r >> 31U) & 0x1U;
130}
131static inline u32 fb_mmu_invalidate_trigger_true_f(void)
132{
133 return 0x80000000U;
134}
135static inline u32 fb_mmu_debug_wr_r(void)
136{
137 return 0x00100cc8U;
138}
139static inline u32 fb_mmu_debug_wr_aperture_s(void)
140{
141 return 2U;
142}
143static inline u32 fb_mmu_debug_wr_aperture_f(u32 v)
144{
145 return (v & 0x3U) << 0U;
146}
147static inline u32 fb_mmu_debug_wr_aperture_m(void)
148{
149 return 0x3U << 0U;
150}
151static inline u32 fb_mmu_debug_wr_aperture_v(u32 r)
152{
153 return (r >> 0U) & 0x3U;
154}
155static inline u32 fb_mmu_debug_wr_aperture_vid_mem_f(void)
156{
157 return 0x0U;
158}
159static inline u32 fb_mmu_debug_wr_aperture_sys_mem_coh_f(void)
160{
161 return 0x2U;
162}
163static inline u32 fb_mmu_debug_wr_aperture_sys_mem_ncoh_f(void)
164{
165 return 0x3U;
166}
167static inline u32 fb_mmu_debug_wr_vol_false_f(void)
168{
169 return 0x0U;
170}
171static inline u32 fb_mmu_debug_wr_vol_true_v(void)
172{
173 return 0x00000001U;
174}
175static inline u32 fb_mmu_debug_wr_vol_true_f(void)
176{
177 return 0x4U;
178}
179static inline u32 fb_mmu_debug_wr_addr_f(u32 v)
180{
181 return (v & 0xfffffffU) << 4U;
182}
183static inline u32 fb_mmu_debug_wr_addr_alignment_v(void)
184{
185 return 0x0000000cU;
186}
187static inline u32 fb_mmu_debug_rd_r(void)
188{
189 return 0x00100cccU;
190}
191static inline u32 fb_mmu_debug_rd_aperture_vid_mem_f(void)
192{
193 return 0x0U;
194}
195static inline u32 fb_mmu_debug_rd_aperture_sys_mem_coh_f(void)
196{
197 return 0x2U;
198}
199static inline u32 fb_mmu_debug_rd_aperture_sys_mem_ncoh_f(void)
200{
201 return 0x3U;
202}
203static inline u32 fb_mmu_debug_rd_vol_false_f(void)
204{
205 return 0x0U;
206}
207static inline u32 fb_mmu_debug_rd_addr_f(u32 v)
208{
209 return (v & 0xfffffffU) << 4U;
210}
211static inline u32 fb_mmu_debug_rd_addr_alignment_v(void)
212{
213 return 0x0000000cU;
214}
215static inline u32 fb_mmu_debug_ctrl_r(void)
216{
217 return 0x00100cc4U;
218}
219static inline u32 fb_mmu_debug_ctrl_debug_v(u32 r)
220{
221 return (r >> 16U) & 0x1U;
222}
223static inline u32 fb_mmu_debug_ctrl_debug_m(void)
224{
225 return 0x1U << 16U;
226}
227static inline u32 fb_mmu_debug_ctrl_debug_enabled_v(void)
228{
229 return 0x00000001U;
230}
231static inline u32 fb_mmu_debug_ctrl_debug_enabled_f(void)
232{
233 return 0x10000U;
234}
235static inline u32 fb_mmu_debug_ctrl_debug_disabled_v(void)
236{
237 return 0x00000000U;
238}
239static inline u32 fb_mmu_debug_ctrl_debug_disabled_f(void)
240{
241 return 0x0U;
242}
243static inline u32 fb_mmu_vpr_info_r(void)
244{
245 return 0x00100cd0U;
246}
247static inline u32 fb_mmu_vpr_info_fetch_v(u32 r)
248{
249 return (r >> 2U) & 0x1U;
250}
251static inline u32 fb_mmu_vpr_info_fetch_false_v(void)
252{
253 return 0x00000000U;
254}
255static inline u32 fb_mmu_vpr_info_fetch_true_v(void)
256{
257 return 0x00000001U;
258}
259static inline u32 fb_niso_flush_sysmem_addr_r(void)
260{
261 return 0x00100c10U;
262}
263#endif
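A minimal sketch of the invalidate sequence the fb_mmu_* helpers above are built for: point the MMU at the PDB, request an all-VA/all-PDB invalidate, and poll the trigger bit until it drops. reg_read()/reg_write() and the 4KB PDB alignment are assumptions for illustration, not something this header defines:

/* Illustrative only: reg_read/reg_write are assumed MMIO helpers. */
extern u32 reg_read(u32 r);
extern void reg_write(u32 r, u32 v);

static void example_mmu_invalidate_all(u64 pdb_addr, int pdb_in_vidmem)
{
	u32 aperture = pdb_in_vidmem ?
		fb_mmu_invalidate_pdb_aperture_vid_mem_f() :
		fb_mmu_invalidate_pdb_aperture_sys_mem_f();

	/* the PDB pointer is programmed in 4KB units (assumed alignment) */
	reg_write(fb_mmu_invalidate_pdb_r(),
		  fb_mmu_invalidate_pdb_addr_f((u32)(pdb_addr >> 12)) | aperture);

	/* invalidate every VA and PDB entry, then poll the trigger bit */
	reg_write(fb_mmu_invalidate_r(),
		  fb_mmu_invalidate_all_va_true_f() |
		  fb_mmu_invalidate_all_pdb_true_f() |
		  fb_mmu_invalidate_trigger_true_f());

	while (fb_mmu_invalidate_trigger_v(reg_read(fb_mmu_invalidate_r())) != 0U) {
		/* a real caller would bound this loop with a timeout */
	}
}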
diff --git a/include/gk20a/hw_fifo_gk20a.h b/include/gk20a/hw_fifo_gk20a.h
new file mode 100644
index 0000000..e61e386
--- /dev/null
+++ b/include/gk20a/hw_fifo_gk20a.h
@@ -0,0 +1,619 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_fifo_gk20a_h_
57#define _hw_fifo_gk20a_h_
58
59static inline u32 fifo_bar1_base_r(void)
60{
61 return 0x00002254U;
62}
63static inline u32 fifo_bar1_base_ptr_f(u32 v)
64{
65 return (v & 0xfffffffU) << 0U;
66}
67static inline u32 fifo_bar1_base_ptr_align_shift_v(void)
68{
69 return 0x0000000cU;
70}
71static inline u32 fifo_bar1_base_valid_false_f(void)
72{
73 return 0x0U;
74}
75static inline u32 fifo_bar1_base_valid_true_f(void)
76{
77 return 0x10000000U;
78}
79static inline u32 fifo_runlist_base_r(void)
80{
81 return 0x00002270U;
82}
83static inline u32 fifo_runlist_base_ptr_f(u32 v)
84{
85 return (v & 0xfffffffU) << 0U;
86}
87static inline u32 fifo_runlist_base_target_vid_mem_f(void)
88{
89 return 0x0U;
90}
91static inline u32 fifo_runlist_base_target_sys_mem_coh_f(void)
92{
93 return 0x20000000U;
94}
95static inline u32 fifo_runlist_base_target_sys_mem_ncoh_f(void)
96{
97 return 0x30000000U;
98}
99static inline u32 fifo_runlist_r(void)
100{
101 return 0x00002274U;
102}
103static inline u32 fifo_runlist_engine_f(u32 v)
104{
105 return (v & 0xfU) << 20U;
106}
107static inline u32 fifo_eng_runlist_base_r(u32 i)
108{
109 return 0x00002280U + i*8U;
110}
111static inline u32 fifo_eng_runlist_base__size_1_v(void)
112{
113 return 0x00000001U;
114}
115static inline u32 fifo_eng_runlist_r(u32 i)
116{
117 return 0x00002284U + i*8U;
118}
119static inline u32 fifo_eng_runlist__size_1_v(void)
120{
121 return 0x00000001U;
122}
123static inline u32 fifo_eng_runlist_length_f(u32 v)
124{
125 return (v & 0xffffU) << 0U;
126}
127static inline u32 fifo_eng_runlist_length_max_v(void)
128{
129 return 0x0000ffffU;
130}
131static inline u32 fifo_eng_runlist_pending_true_f(void)
132{
133 return 0x100000U;
134}
135static inline u32 fifo_runlist_timeslice_r(u32 i)
136{
137 return 0x00002310U + i*4U;
138}
139static inline u32 fifo_runlist_timeslice_timeout_128_f(void)
140{
141 return 0x80U;
142}
143static inline u32 fifo_runlist_timeslice_timescale_3_f(void)
144{
145 return 0x3000U;
146}
147static inline u32 fifo_runlist_timeslice_enable_true_f(void)
148{
149 return 0x10000000U;
150}
151static inline u32 fifo_eng_timeout_r(void)
152{
153 return 0x00002a0cU;
154}
155static inline u32 fifo_eng_timeout_period_max_f(void)
156{
157 return 0x7fffffffU;
158}
159static inline u32 fifo_eng_timeout_detection_enabled_f(void)
160{
161 return 0x80000000U;
162}
163static inline u32 fifo_eng_timeout_detection_disabled_f(void)
164{
165 return 0x0U;
166}
167static inline u32 fifo_pb_timeslice_r(u32 i)
168{
169 return 0x00002350U + i*4U;
170}
171static inline u32 fifo_pb_timeslice_timeout_16_f(void)
172{
173 return 0x10U;
174}
175static inline u32 fifo_pb_timeslice_timescale_0_f(void)
176{
177 return 0x0U;
178}
179static inline u32 fifo_pb_timeslice_enable_true_f(void)
180{
181 return 0x10000000U;
182}
183static inline u32 fifo_pbdma_map_r(u32 i)
184{
185 return 0x00002390U + i*4U;
186}
187static inline u32 fifo_intr_0_r(void)
188{
189 return 0x00002100U;
190}
191static inline u32 fifo_intr_0_bind_error_pending_f(void)
192{
193 return 0x1U;
194}
195static inline u32 fifo_intr_0_bind_error_reset_f(void)
196{
197 return 0x1U;
198}
199static inline u32 fifo_intr_0_pio_error_pending_f(void)
200{
201 return 0x10U;
202}
203static inline u32 fifo_intr_0_pio_error_reset_f(void)
204{
205 return 0x10U;
206}
207static inline u32 fifo_intr_0_sched_error_pending_f(void)
208{
209 return 0x100U;
210}
211static inline u32 fifo_intr_0_sched_error_reset_f(void)
212{
213 return 0x100U;
214}
215static inline u32 fifo_intr_0_chsw_error_pending_f(void)
216{
217 return 0x10000U;
218}
219static inline u32 fifo_intr_0_chsw_error_reset_f(void)
220{
221 return 0x10000U;
222}
223static inline u32 fifo_intr_0_fb_flush_timeout_pending_f(void)
224{
225 return 0x800000U;
226}
227static inline u32 fifo_intr_0_fb_flush_timeout_reset_f(void)
228{
229 return 0x800000U;
230}
231static inline u32 fifo_intr_0_lb_error_pending_f(void)
232{
233 return 0x1000000U;
234}
235static inline u32 fifo_intr_0_lb_error_reset_f(void)
236{
237 return 0x1000000U;
238}
239static inline u32 fifo_intr_0_dropped_mmu_fault_pending_f(void)
240{
241 return 0x8000000U;
242}
243static inline u32 fifo_intr_0_dropped_mmu_fault_reset_f(void)
244{
245 return 0x8000000U;
246}
247static inline u32 fifo_intr_0_mmu_fault_pending_f(void)
248{
249 return 0x10000000U;
250}
251static inline u32 fifo_intr_0_pbdma_intr_pending_f(void)
252{
253 return 0x20000000U;
254}
255static inline u32 fifo_intr_0_runlist_event_pending_f(void)
256{
257 return 0x40000000U;
258}
259static inline u32 fifo_intr_0_channel_intr_pending_f(void)
260{
261 return 0x80000000U;
262}
263static inline u32 fifo_intr_en_0_r(void)
264{
265 return 0x00002140U;
266}
267static inline u32 fifo_intr_en_0_sched_error_f(u32 v)
268{
269 return (v & 0x1U) << 8U;
270}
271static inline u32 fifo_intr_en_0_sched_error_m(void)
272{
273 return 0x1U << 8U;
274}
275static inline u32 fifo_intr_en_0_mmu_fault_f(u32 v)
276{
277 return (v & 0x1U) << 28U;
278}
279static inline u32 fifo_intr_en_0_mmu_fault_m(void)
280{
281 return 0x1U << 28U;
282}
283static inline u32 fifo_intr_en_1_r(void)
284{
285 return 0x00002528U;
286}
287static inline u32 fifo_intr_bind_error_r(void)
288{
289 return 0x0000252cU;
290}
291static inline u32 fifo_intr_sched_error_r(void)
292{
293 return 0x0000254cU;
294}
295static inline u32 fifo_intr_sched_error_code_f(u32 v)
296{
297 return (v & 0xffU) << 0U;
298}
299static inline u32 fifo_intr_sched_error_code_ctxsw_timeout_v(void)
300{
301 return 0x0000000aU;
302}
303static inline u32 fifo_intr_chsw_error_r(void)
304{
305 return 0x0000256cU;
306}
307static inline u32 fifo_intr_mmu_fault_id_r(void)
308{
309 return 0x0000259cU;
310}
311static inline u32 fifo_intr_mmu_fault_eng_id_graphics_v(void)
312{
313 return 0x00000000U;
314}
315static inline u32 fifo_intr_mmu_fault_eng_id_graphics_f(void)
316{
317 return 0x0U;
318}
319static inline u32 fifo_intr_mmu_fault_inst_r(u32 i)
320{
321 return 0x00002800U + i*16U;
322}
323static inline u32 fifo_intr_mmu_fault_inst_ptr_v(u32 r)
324{
325 return (r >> 0U) & 0xfffffffU;
326}
327static inline u32 fifo_intr_mmu_fault_inst_ptr_align_shift_v(void)
328{
329 return 0x0000000cU;
330}
331static inline u32 fifo_intr_mmu_fault_lo_r(u32 i)
332{
333 return 0x00002804U + i*16U;
334}
335static inline u32 fifo_intr_mmu_fault_hi_r(u32 i)
336{
337 return 0x00002808U + i*16U;
338}
339static inline u32 fifo_intr_mmu_fault_info_r(u32 i)
340{
341 return 0x0000280cU + i*16U;
342}
343static inline u32 fifo_intr_mmu_fault_info_type_v(u32 r)
344{
345 return (r >> 0U) & 0xfU;
346}
347static inline u32 fifo_intr_mmu_fault_info_write_v(u32 r)
348{
349 return (r >> 7U) & 0x1U;
350}
351static inline u32 fifo_intr_mmu_fault_info_engine_subid_v(u32 r)
352{
353 return (r >> 6U) & 0x1U;
354}
355static inline u32 fifo_intr_mmu_fault_info_engine_subid_gpc_v(void)
356{
357 return 0x00000000U;
358}
359static inline u32 fifo_intr_mmu_fault_info_engine_subid_hub_v(void)
360{
361 return 0x00000001U;
362}
363static inline u32 fifo_intr_mmu_fault_info_client_v(u32 r)
364{
365 return (r >> 8U) & 0x1fU;
366}
367static inline u32 fifo_intr_pbdma_id_r(void)
368{
369 return 0x000025a0U;
370}
371static inline u32 fifo_intr_pbdma_id_status_f(u32 v, u32 i)
372{
373 return (v & 0x1U) << (0U + i*1U);
374}
375static inline u32 fifo_intr_pbdma_id_status_v(u32 r, u32 i)
376{
377 return (r >> (0U + i*1U)) & 0x1U;
378}
379static inline u32 fifo_intr_pbdma_id_status__size_1_v(void)
380{
381 return 0x00000001U;
382}
383static inline u32 fifo_intr_runlist_r(void)
384{
385 return 0x00002a00U;
386}
387static inline u32 fifo_fb_timeout_r(void)
388{
389 return 0x00002a04U;
390}
391static inline u32 fifo_fb_timeout_period_m(void)
392{
393 return 0x3fffffffU << 0U;
394}
395static inline u32 fifo_fb_timeout_period_max_f(void)
396{
397 return 0x3fffffffU;
398}
399static inline u32 fifo_pb_timeout_r(void)
400{
401 return 0x00002a08U;
402}
403static inline u32 fifo_pb_timeout_detection_enabled_f(void)
404{
405 return 0x80000000U;
406}
407static inline u32 fifo_error_sched_disable_r(void)
408{
409 return 0x0000262cU;
410}
411static inline u32 fifo_sched_disable_r(void)
412{
413 return 0x00002630U;
414}
415static inline u32 fifo_sched_disable_runlist_f(u32 v, u32 i)
416{
417 return (v & 0x1U) << (0U + i*1U);
418}
419static inline u32 fifo_sched_disable_runlist_m(u32 i)
420{
421 return 0x1U << (0U + i*1U);
422}
423static inline u32 fifo_sched_disable_true_v(void)
424{
425 return 0x00000001U;
426}
427static inline u32 fifo_preempt_r(void)
428{
429 return 0x00002634U;
430}
431static inline u32 fifo_preempt_pending_true_f(void)
432{
433 return 0x100000U;
434}
435static inline u32 fifo_preempt_type_channel_f(void)
436{
437 return 0x0U;
438}
439static inline u32 fifo_preempt_type_tsg_f(void)
440{
441 return 0x1000000U;
442}
443static inline u32 fifo_preempt_chid_f(u32 v)
444{
445 return (v & 0xfffU) << 0U;
446}
447static inline u32 fifo_preempt_id_f(u32 v)
448{
449 return (v & 0xfffU) << 0U;
450}
451static inline u32 fifo_trigger_mmu_fault_r(u32 i)
452{
453 return 0x00002a30U + i*4U;
454}
455static inline u32 fifo_trigger_mmu_fault_id_f(u32 v)
456{
457 return (v & 0x1fU) << 0U;
458}
459static inline u32 fifo_trigger_mmu_fault_enable_f(u32 v)
460{
461 return (v & 0x1U) << 8U;
462}
463static inline u32 fifo_engine_status_r(u32 i)
464{
465 return 0x00002640U + i*8U;
466}
467static inline u32 fifo_engine_status__size_1_v(void)
468{
469 return 0x00000002U;
470}
471static inline u32 fifo_engine_status_id_v(u32 r)
472{
473 return (r >> 0U) & 0xfffU;
474}
475static inline u32 fifo_engine_status_id_type_v(u32 r)
476{
477 return (r >> 12U) & 0x1U;
478}
479static inline u32 fifo_engine_status_id_type_chid_v(void)
480{
481 return 0x00000000U;
482}
483static inline u32 fifo_engine_status_id_type_tsgid_v(void)
484{
485 return 0x00000001U;
486}
487static inline u32 fifo_engine_status_ctx_status_v(u32 r)
488{
489 return (r >> 13U) & 0x7U;
490}
491static inline u32 fifo_engine_status_ctx_status_invalid_v(void)
492{
493 return 0x00000000U;
494}
495static inline u32 fifo_engine_status_ctx_status_valid_v(void)
496{
497 return 0x00000001U;
498}
499static inline u32 fifo_engine_status_ctx_status_ctxsw_load_v(void)
500{
501 return 0x00000005U;
502}
503static inline u32 fifo_engine_status_ctx_status_ctxsw_save_v(void)
504{
505 return 0x00000006U;
506}
507static inline u32 fifo_engine_status_ctx_status_ctxsw_switch_v(void)
508{
509 return 0x00000007U;
510}
511static inline u32 fifo_engine_status_next_id_v(u32 r)
512{
513 return (r >> 16U) & 0xfffU;
514}
515static inline u32 fifo_engine_status_next_id_type_v(u32 r)
516{
517 return (r >> 28U) & 0x1U;
518}
519static inline u32 fifo_engine_status_next_id_type_chid_v(void)
520{
521 return 0x00000000U;
522}
523static inline u32 fifo_engine_status_faulted_v(u32 r)
524{
525 return (r >> 30U) & 0x1U;
526}
527static inline u32 fifo_engine_status_faulted_true_v(void)
528{
529 return 0x00000001U;
530}
531static inline u32 fifo_engine_status_engine_v(u32 r)
532{
533 return (r >> 31U) & 0x1U;
534}
535static inline u32 fifo_engine_status_engine_idle_v(void)
536{
537 return 0x00000000U;
538}
539static inline u32 fifo_engine_status_engine_busy_v(void)
540{
541 return 0x00000001U;
542}
543static inline u32 fifo_engine_status_ctxsw_v(u32 r)
544{
545 return (r >> 15U) & 0x1U;
546}
547static inline u32 fifo_engine_status_ctxsw_in_progress_v(void)
548{
549 return 0x00000001U;
550}
551static inline u32 fifo_engine_status_ctxsw_in_progress_f(void)
552{
553 return 0x8000U;
554}
555static inline u32 fifo_pbdma_status_r(u32 i)
556{
557 return 0x00003080U + i*4U;
558}
559static inline u32 fifo_pbdma_status__size_1_v(void)
560{
561 return 0x00000001U;
562}
563static inline u32 fifo_pbdma_status_id_v(u32 r)
564{
565 return (r >> 0U) & 0xfffU;
566}
567static inline u32 fifo_pbdma_status_id_type_v(u32 r)
568{
569 return (r >> 12U) & 0x1U;
570}
571static inline u32 fifo_pbdma_status_id_type_chid_v(void)
572{
573 return 0x00000000U;
574}
575static inline u32 fifo_pbdma_status_id_type_tsgid_v(void)
576{
577 return 0x00000001U;
578}
579static inline u32 fifo_pbdma_status_chan_status_v(u32 r)
580{
581 return (r >> 13U) & 0x7U;
582}
583static inline u32 fifo_pbdma_status_chan_status_valid_v(void)
584{
585 return 0x00000001U;
586}
587static inline u32 fifo_pbdma_status_chan_status_chsw_load_v(void)
588{
589 return 0x00000005U;
590}
591static inline u32 fifo_pbdma_status_chan_status_chsw_save_v(void)
592{
593 return 0x00000006U;
594}
595static inline u32 fifo_pbdma_status_chan_status_chsw_switch_v(void)
596{
597 return 0x00000007U;
598}
599static inline u32 fifo_pbdma_status_next_id_v(u32 r)
600{
601 return (r >> 16U) & 0xfffU;
602}
603static inline u32 fifo_pbdma_status_next_id_type_v(u32 r)
604{
605 return (r >> 28U) & 0x1U;
606}
607static inline u32 fifo_pbdma_status_next_id_type_chid_v(void)
608{
609 return 0x00000000U;
610}
611static inline u32 fifo_pbdma_status_chsw_v(u32 r)
612{
613 return (r >> 15U) & 0x1U;
614}
615static inline u32 fifo_pbdma_status_chsw_in_progress_v(void)
616{
617 return 0x00000001U;
618}
619#endif
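As a usage sketch for the preempt registers above: issue a channel preempt and poll the pending bit until the hardware reports completion. reg_read()/reg_write() are assumed MMIO helpers, and a production caller would add a timeout:

/* Illustrative only: reg_read/reg_write are assumed MMIO helpers. */
extern u32 reg_read(u32 r);
extern void reg_write(u32 r, u32 v);

static void example_preempt_channel(u32 chid)
{
	/* request a channel (not TSG) preempt for this chid */
	reg_write(fifo_preempt_r(),
		  fifo_preempt_chid_f(chid) | fifo_preempt_type_channel_f());

	/* the pending bit stays set while the preempt is in flight */
	while ((reg_read(fifo_preempt_r()) & fifo_preempt_pending_true_f()) != 0U) {
		/* a real caller would bound this loop with a timeout */
	}
}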
diff --git a/include/gk20a/hw_flush_gk20a.h b/include/gk20a/hw_flush_gk20a.h
new file mode 100644
index 0000000..d270b5f
--- /dev/null
+++ b/include/gk20a/hw_flush_gk20a.h
@@ -0,0 +1,187 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_flush_gk20a_h_
57#define _hw_flush_gk20a_h_
58
59static inline u32 flush_l2_system_invalidate_r(void)
60{
61 return 0x00070004U;
62}
63static inline u32 flush_l2_system_invalidate_pending_v(u32 r)
64{
65 return (r >> 0U) & 0x1U;
66}
67static inline u32 flush_l2_system_invalidate_pending_busy_v(void)
68{
69 return 0x00000001U;
70}
71static inline u32 flush_l2_system_invalidate_pending_busy_f(void)
72{
73 return 0x1U;
74}
75static inline u32 flush_l2_system_invalidate_outstanding_v(u32 r)
76{
77 return (r >> 1U) & 0x1U;
78}
79static inline u32 flush_l2_system_invalidate_outstanding_true_v(void)
80{
81 return 0x00000001U;
82}
83static inline u32 flush_l2_flush_dirty_r(void)
84{
85 return 0x00070010U;
86}
87static inline u32 flush_l2_flush_dirty_pending_v(u32 r)
88{
89 return (r >> 0U) & 0x1U;
90}
91static inline u32 flush_l2_flush_dirty_pending_empty_v(void)
92{
93 return 0x00000000U;
94}
95static inline u32 flush_l2_flush_dirty_pending_empty_f(void)
96{
97 return 0x0U;
98}
99static inline u32 flush_l2_flush_dirty_pending_busy_v(void)
100{
101 return 0x00000001U;
102}
103static inline u32 flush_l2_flush_dirty_pending_busy_f(void)
104{
105 return 0x1U;
106}
107static inline u32 flush_l2_flush_dirty_outstanding_v(u32 r)
108{
109 return (r >> 1U) & 0x1U;
110}
111static inline u32 flush_l2_flush_dirty_outstanding_false_v(void)
112{
113 return 0x00000000U;
114}
115static inline u32 flush_l2_flush_dirty_outstanding_false_f(void)
116{
117 return 0x0U;
118}
119static inline u32 flush_l2_flush_dirty_outstanding_true_v(void)
120{
121 return 0x00000001U;
122}
123static inline u32 flush_l2_clean_comptags_r(void)
124{
125 return 0x0007000cU;
126}
127static inline u32 flush_l2_clean_comptags_pending_v(u32 r)
128{
129 return (r >> 0U) & 0x1U;
130}
131static inline u32 flush_l2_clean_comptags_pending_empty_v(void)
132{
133 return 0x00000000U;
134}
135static inline u32 flush_l2_clean_comptags_pending_empty_f(void)
136{
137 return 0x0U;
138}
139static inline u32 flush_l2_clean_comptags_pending_busy_v(void)
140{
141 return 0x00000001U;
142}
143static inline u32 flush_l2_clean_comptags_pending_busy_f(void)
144{
145 return 0x1U;
146}
147static inline u32 flush_l2_clean_comptags_outstanding_v(u32 r)
148{
149 return (r >> 1U) & 0x1U;
150}
151static inline u32 flush_l2_clean_comptags_outstanding_false_v(void)
152{
153 return 0x00000000U;
154}
155static inline u32 flush_l2_clean_comptags_outstanding_false_f(void)
156{
157 return 0x0U;
158}
159static inline u32 flush_l2_clean_comptags_outstanding_true_v(void)
160{
161 return 0x00000001U;
162}
163static inline u32 flush_fb_flush_r(void)
164{
165 return 0x00070000U;
166}
167static inline u32 flush_fb_flush_pending_v(u32 r)
168{
169 return (r >> 0U) & 0x1U;
170}
171static inline u32 flush_fb_flush_pending_busy_v(void)
172{
173 return 0x00000001U;
174}
175static inline u32 flush_fb_flush_pending_busy_f(void)
176{
177 return 0x1U;
178}
179static inline u32 flush_fb_flush_outstanding_v(u32 r)
180{
181 return (r >> 1U) & 0x1U;
182}
183static inline u32 flush_fb_flush_outstanding_true_v(void)
184{
185 return 0x00000001U;
186}
187#endif
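The flush registers above follow a kick-and-poll pattern; a hedged sketch of an FB flush, again with reg_read()/reg_write() as stand-in MMIO helpers:

/* Illustrative only: reg_read/reg_write are assumed MMIO helpers. */
extern u32 reg_read(u32 r);
extern void reg_write(u32 r, u32 v);

static void example_fb_flush(void)
{
	u32 r;

	/* writing the busy value kicks off the flush */
	reg_write(flush_fb_flush_r(), flush_fb_flush_pending_busy_f());

	/* poll until neither pending nor outstanding reads back as true */
	do {
		r = reg_read(flush_fb_flush_r());
	} while (flush_fb_flush_pending_v(r) ==
			flush_fb_flush_pending_busy_v() ||
		 flush_fb_flush_outstanding_v(r) ==
			flush_fb_flush_outstanding_true_v());
	/* a real caller would bound this loop with a timeout */
}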
diff --git a/include/gk20a/hw_gmmu_gk20a.h b/include/gk20a/hw_gmmu_gk20a.h
new file mode 100644
index 0000000..a788d1d
--- /dev/null
+++ b/include/gk20a/hw_gmmu_gk20a.h
@@ -0,0 +1,283 @@
1/*
2 * Copyright (c) 2012-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_gmmu_gk20a_h_
57#define _hw_gmmu_gk20a_h_
58
59static inline u32 gmmu_pde_aperture_big_w(void)
60{
61 return 0U;
62}
63static inline u32 gmmu_pde_aperture_big_invalid_f(void)
64{
65 return 0x0U;
66}
67static inline u32 gmmu_pde_aperture_big_video_memory_f(void)
68{
69 return 0x1U;
70}
71static inline u32 gmmu_pde_aperture_big_sys_mem_coh_f(void)
72{
73 return 0x2U;
74}
75static inline u32 gmmu_pde_aperture_big_sys_mem_ncoh_f(void)
76{
77 return 0x3U;
78}
79static inline u32 gmmu_pde_size_w(void)
80{
81 return 0U;
82}
83static inline u32 gmmu_pde_size_full_f(void)
84{
85 return 0x0U;
86}
87static inline u32 gmmu_pde_address_big_sys_f(u32 v)
88{
89 return (v & 0xfffffffU) << 4U;
90}
91static inline u32 gmmu_pde_address_big_sys_w(void)
92{
93 return 0U;
94}
95static inline u32 gmmu_pde_aperture_small_w(void)
96{
97 return 1U;
98}
99static inline u32 gmmu_pde_aperture_small_invalid_f(void)
100{
101 return 0x0U;
102}
103static inline u32 gmmu_pde_aperture_small_video_memory_f(void)
104{
105 return 0x1U;
106}
107static inline u32 gmmu_pde_aperture_small_sys_mem_coh_f(void)
108{
109 return 0x2U;
110}
111static inline u32 gmmu_pde_aperture_small_sys_mem_ncoh_f(void)
112{
113 return 0x3U;
114}
115static inline u32 gmmu_pde_vol_small_w(void)
116{
117 return 1U;
118}
119static inline u32 gmmu_pde_vol_small_true_f(void)
120{
121 return 0x4U;
122}
123static inline u32 gmmu_pde_vol_small_false_f(void)
124{
125 return 0x0U;
126}
127static inline u32 gmmu_pde_vol_big_w(void)
128{
129 return 1U;
130}
131static inline u32 gmmu_pde_vol_big_true_f(void)
132{
133 return 0x8U;
134}
135static inline u32 gmmu_pde_vol_big_false_f(void)
136{
137 return 0x0U;
138}
139static inline u32 gmmu_pde_address_small_sys_f(u32 v)
140{
141 return (v & 0xfffffffU) << 4U;
142}
143static inline u32 gmmu_pde_address_small_sys_w(void)
144{
145 return 1U;
146}
147static inline u32 gmmu_pde_address_shift_v(void)
148{
149 return 0x0000000cU;
150}
151static inline u32 gmmu_pde__size_v(void)
152{
153 return 0x00000008U;
154}
155static inline u32 gmmu_pte__size_v(void)
156{
157 return 0x00000008U;
158}
159static inline u32 gmmu_pte_valid_w(void)
160{
161 return 0U;
162}
163static inline u32 gmmu_pte_valid_true_f(void)
164{
165 return 0x1U;
166}
167static inline u32 gmmu_pte_valid_false_f(void)
168{
169 return 0x0U;
170}
171static inline u32 gmmu_pte_privilege_w(void)
172{
173 return 0U;
174}
175static inline u32 gmmu_pte_privilege_true_f(void)
176{
177 return 0x2U;
178}
179static inline u32 gmmu_pte_privilege_false_f(void)
180{
181 return 0x0U;
182}
183static inline u32 gmmu_pte_address_sys_f(u32 v)
184{
185 return (v & 0xfffffffU) << 4U;
186}
187static inline u32 gmmu_pte_address_sys_w(void)
188{
189 return 0U;
190}
191static inline u32 gmmu_pte_address_vid_f(u32 v)
192{
193 return (v & 0x1ffffffU) << 4U;
194}
195static inline u32 gmmu_pte_address_vid_w(void)
196{
197 return 0U;
198}
199static inline u32 gmmu_pte_vol_w(void)
200{
201 return 1U;
202}
203static inline u32 gmmu_pte_vol_true_f(void)
204{
205 return 0x1U;
206}
207static inline u32 gmmu_pte_vol_false_f(void)
208{
209 return 0x0U;
210}
211static inline u32 gmmu_pte_aperture_w(void)
212{
213 return 1U;
214}
215static inline u32 gmmu_pte_aperture_video_memory_f(void)
216{
217 return 0x0U;
218}
219static inline u32 gmmu_pte_aperture_sys_mem_coh_f(void)
220{
221 return 0x4U;
222}
223static inline u32 gmmu_pte_aperture_sys_mem_ncoh_f(void)
224{
225 return 0x6U;
226}
227static inline u32 gmmu_pte_read_only_w(void)
228{
229 return 0U;
230}
231static inline u32 gmmu_pte_read_only_true_f(void)
232{
233 return 0x4U;
234}
235static inline u32 gmmu_pte_write_disable_w(void)
236{
237 return 1U;
238}
239static inline u32 gmmu_pte_write_disable_true_f(void)
240{
241 return 0x80000000U;
242}
243static inline u32 gmmu_pte_read_disable_w(void)
244{
245 return 1U;
246}
247static inline u32 gmmu_pte_read_disable_true_f(void)
248{
249 return 0x40000000U;
250}
251static inline u32 gmmu_pte_comptagline_s(void)
252{
253 return 17U;
254}
255static inline u32 gmmu_pte_comptagline_f(u32 v)
256{
257 return (v & 0x1ffffU) << 12U;
258}
259static inline u32 gmmu_pte_comptagline_w(void)
260{
261 return 1U;
262}
263static inline u32 gmmu_pte_address_shift_v(void)
264{
265 return 0x0000000cU;
266}
267static inline u32 gmmu_pte_kind_f(u32 v)
268{
269 return (v & 0xffU) << 4U;
270}
271static inline u32 gmmu_pte_kind_w(void)
272{
273 return 1U;
274}
275static inline u32 gmmu_pte_kind_invalid_v(void)
276{
277 return 0x000000ffU;
278}
279static inline u32 gmmu_pte_kind_pitch_v(void)
280{
281 return 0x00000000U;
282}
283#endif
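Because the gmmu_pte_* helpers above return word indexes (_w) alongside field values (_f), a PTE is naturally built as a two-word array. Below is a sketch of filling a small-page, system-memory PTE; the choice of the non-coherent aperture, the pitch kind and the 4KB address shift are illustrative assumptions:

/* Illustrative only: the page-frame math and kind choice are assumptions. */
static void example_fill_sysmem_pte(u32 pte[2], u64 phys_addr,
				    int read_only, int cacheable)
{
	/* word 0: valid bit plus the shifted system-memory address */
	pte[gmmu_pte_valid_w()] =
		gmmu_pte_valid_true_f() |
		gmmu_pte_address_sys_f((u32)(phys_addr >>
					     gmmu_pte_address_shift_v()));

	if (read_only)
		pte[gmmu_pte_read_only_w()] |= gmmu_pte_read_only_true_f();

	/* word 1: aperture, volatility and kind live in the second word */
	pte[gmmu_pte_aperture_w()] =
		gmmu_pte_aperture_sys_mem_ncoh_f() |
		(cacheable ? gmmu_pte_vol_false_f() : gmmu_pte_vol_true_f()) |
		gmmu_pte_kind_f(gmmu_pte_kind_pitch_v());
}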
diff --git a/include/gk20a/hw_gr_gk20a.h b/include/gk20a/hw_gr_gk20a.h
new file mode 100644
index 0000000..826108f
--- /dev/null
+++ b/include/gk20a/hw_gr_gk20a.h
@@ -0,0 +1,3807 @@
1/*
2 * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
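The *_pending_f()/*_reset_f() pairs defined below follow a common idiom: test the pending bit in a read of the interrupt register, handle the condition, then write the matching reset value back to clear it. A hedged sketch, once more with reg_read()/reg_write() standing in for the real MMIO helpers:

/* Illustrative only: reg_read/reg_write are assumed MMIO helpers. */
extern u32 reg_read(u32 r);
extern void reg_write(u32 r, u32 v);

static void example_ack_gr_notify_intr(void)
{
	u32 gr_intr = reg_read(gr_intr_r());

	if ((gr_intr & gr_intr_notify_pending_f()) != 0U) {
		/* ... handle the notify interrupt here ... */

		/* writing the reset value clears just this pending bit */
		reg_write(gr_intr_r(), gr_intr_notify_reset_f());
	}
}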
56#ifndef _hw_gr_gk20a_h_
57#define _hw_gr_gk20a_h_
58
59static inline u32 gr_intr_r(void)
60{
61 return 0x00400100U;
62}
63static inline u32 gr_intr_notify_pending_f(void)
64{
65 return 0x1U;
66}
67static inline u32 gr_intr_notify_reset_f(void)
68{
69 return 0x1U;
70}
71static inline u32 gr_intr_semaphore_pending_f(void)
72{
73 return 0x2U;
74}
75static inline u32 gr_intr_semaphore_reset_f(void)
76{
77 return 0x2U;
78}
79static inline u32 gr_intr_semaphore_timeout_not_pending_f(void)
80{
81 return 0x0U;
82}
83static inline u32 gr_intr_semaphore_timeout_pending_f(void)
84{
85 return 0x4U;
86}
87static inline u32 gr_intr_semaphore_timeout_reset_f(void)
88{
89 return 0x4U;
90}
91static inline u32 gr_intr_illegal_method_pending_f(void)
92{
93 return 0x10U;
94}
95static inline u32 gr_intr_illegal_method_reset_f(void)
96{
97 return 0x10U;
98}
99static inline u32 gr_intr_illegal_notify_pending_f(void)
100{
101 return 0x40U;
102}
103static inline u32 gr_intr_illegal_notify_reset_f(void)
104{
105 return 0x40U;
106}
107static inline u32 gr_intr_firmware_method_f(u32 v)
108{
109 return (v & 0x1U) << 8U;
110}
111static inline u32 gr_intr_firmware_method_pending_f(void)
112{
113 return 0x100U;
114}
115static inline u32 gr_intr_firmware_method_reset_f(void)
116{
117 return 0x100U;
118}
119static inline u32 gr_intr_illegal_class_pending_f(void)
120{
121 return 0x20U;
122}
123static inline u32 gr_intr_illegal_class_reset_f(void)
124{
125 return 0x20U;
126}
127static inline u32 gr_intr_fecs_error_pending_f(void)
128{
129 return 0x80000U;
130}
131static inline u32 gr_intr_fecs_error_reset_f(void)
132{
133 return 0x80000U;
134}
135static inline u32 gr_intr_class_error_pending_f(void)
136{
137 return 0x100000U;
138}
139static inline u32 gr_intr_class_error_reset_f(void)
140{
141 return 0x100000U;
142}
143static inline u32 gr_intr_exception_pending_f(void)
144{
145 return 0x200000U;
146}
147static inline u32 gr_intr_exception_reset_f(void)
148{
149 return 0x200000U;
150}
151static inline u32 gr_fecs_intr_r(void)
152{
153 return 0x00400144U;
154}
155static inline u32 gr_class_error_r(void)
156{
157 return 0x00400110U;
158}
159static inline u32 gr_class_error_code_v(u32 r)
160{
161 return (r >> 0U) & 0xffffU;
162}
163static inline u32 gr_intr_nonstall_r(void)
164{
165 return 0x00400120U;
166}
167static inline u32 gr_intr_nonstall_trap_pending_f(void)
168{
169 return 0x2U;
170}
171static inline u32 gr_intr_en_r(void)
172{
173 return 0x0040013cU;
174}
175static inline u32 gr_exception_r(void)
176{
177 return 0x00400108U;
178}
179static inline u32 gr_exception_fe_m(void)
180{
181 return 0x1U << 0U;
182}
183static inline u32 gr_exception_gpc_m(void)
184{
185 return 0x1U << 24U;
186}
187static inline u32 gr_exception_memfmt_m(void)
188{
189 return 0x1U << 1U;
190}
191static inline u32 gr_exception_ds_m(void)
192{
193 return 0x1U << 4U;
194}
195static inline u32 gr_exception_sked_m(void)
196{
197 return 0x1U << 8U;
198}
199static inline u32 gr_exception_pd_m(void)
200{
201 return 0x1U << 2U;
202}
203static inline u32 gr_exception_scc_m(void)
204{
205 return 0x1U << 3U;
206}
207static inline u32 gr_exception_ssync_m(void)
208{
209 return 0x1U << 5U;
210}
211static inline u32 gr_exception_mme_m(void)
212{
213 return 0x1U << 7U;
214}
215static inline u32 gr_exception1_r(void)
216{
217 return 0x00400118U;
218}
219static inline u32 gr_exception1_gpc_0_pending_f(void)
220{
221 return 0x1U;
222}
223static inline u32 gr_exception2_r(void)
224{
225 return 0x0040011cU;
226}
227static inline u32 gr_exception_en_r(void)
228{
229 return 0x00400138U;
230}
231static inline u32 gr_exception_en_fe_m(void)
232{
233 return 0x1U << 0U;
234}
235static inline u32 gr_exception1_en_r(void)
236{
237 return 0x00400130U;
238}
239static inline u32 gr_exception2_en_r(void)
240{
241 return 0x00400134U;
242}
243static inline u32 gr_gpfifo_ctl_r(void)
244{
245 return 0x00400500U;
246}
247static inline u32 gr_gpfifo_ctl_access_f(u32 v)
248{
249 return (v & 0x1U) << 0U;
250}
251static inline u32 gr_gpfifo_ctl_access_disabled_f(void)
252{
253 return 0x0U;
254}
255static inline u32 gr_gpfifo_ctl_access_enabled_f(void)
256{
257 return 0x1U;
258}
259static inline u32 gr_gpfifo_ctl_semaphore_access_f(u32 v)
260{
261 return (v & 0x1U) << 16U;
262}
263static inline u32 gr_gpfifo_ctl_semaphore_access_enabled_v(void)
264{
265 return 0x00000001U;
266}
267static inline u32 gr_gpfifo_ctl_semaphore_access_enabled_f(void)
268{
269 return 0x10000U;
270}
271static inline u32 gr_gpfifo_status_r(void)
272{
273 return 0x00400504U;
274}
275static inline u32 gr_trapped_addr_r(void)
276{
277 return 0x00400704U;
278}
279static inline u32 gr_trapped_addr_mthd_v(u32 r)
280{
281 return (r >> 2U) & 0xfffU;
282}
283static inline u32 gr_trapped_addr_subch_v(u32 r)
284{
285 return (r >> 16U) & 0x7U;
286}
287static inline u32 gr_trapped_addr_mme_generated_v(u32 r)
288{
289 return (r >> 20U) & 0x1U;
290}
291static inline u32 gr_trapped_addr_datahigh_v(u32 r)
292{
293 return (r >> 24U) & 0x1U;
294}
295static inline u32 gr_trapped_addr_priv_v(u32 r)
296{
297 return (r >> 28U) & 0x1U;
298}
299static inline u32 gr_trapped_addr_status_v(u32 r)
300{
301 return (r >> 31U) & 0x1U;
302}
303static inline u32 gr_trapped_data_lo_r(void)
304{
305 return 0x00400708U;
306}
307static inline u32 gr_trapped_data_hi_r(void)
308{
309 return 0x0040070cU;
310}
311static inline u32 gr_trapped_data_mme_r(void)
312{
313 return 0x00400710U;
314}
315static inline u32 gr_trapped_data_mme_pc_v(u32 r)
316{
317 return (r >> 0U) & 0x7ffU;
318}
319static inline u32 gr_status_r(void)
320{
321 return 0x00400700U;
322}
323static inline u32 gr_status_fe_method_upper_v(u32 r)
324{
325 return (r >> 1U) & 0x1U;
326}
327static inline u32 gr_status_fe_method_lower_v(u32 r)
328{
329 return (r >> 2U) & 0x1U;
330}
331static inline u32 gr_status_fe_method_lower_idle_v(void)
332{
333 return 0x00000000U;
334}
335static inline u32 gr_status_fe_gi_v(u32 r)
336{
337 return (r >> 21U) & 0x1U;
338}
339static inline u32 gr_status_mask_r(void)
340{
341 return 0x00400610U;
342}
343static inline u32 gr_status_1_r(void)
344{
345 return 0x00400604U;
346}
347static inline u32 gr_status_2_r(void)
348{
349 return 0x00400608U;
350}
351static inline u32 gr_engine_status_r(void)
352{
353 return 0x0040060cU;
354}
355static inline u32 gr_engine_status_value_busy_f(void)
356{
357 return 0x1U;
358}
359static inline u32 gr_pri_be0_becs_be_exception_r(void)
360{
361 return 0x00410204U;
362}
363static inline u32 gr_pri_be0_becs_be_exception_en_r(void)
364{
365 return 0x00410208U;
366}
367static inline u32 gr_pri_gpc0_gpccs_gpc_exception_r(void)
368{
369 return 0x00502c90U;
370}
371static inline u32 gr_pri_gpc0_gpccs_gpc_exception_en_r(void)
372{
373 return 0x00502c94U;
374}
375static inline u32 gr_pri_gpc0_tpc0_tpccs_tpc_exception_r(void)
376{
377 return 0x00504508U;
378}
379static inline u32 gr_pri_gpc0_tpc0_tpccs_tpc_exception_en_r(void)
380{
381 return 0x0050450cU;
382}
383static inline u32 gr_activity_0_r(void)
384{
385 return 0x00400380U;
386}
387static inline u32 gr_activity_1_r(void)
388{
389 return 0x00400384U;
390}
391static inline u32 gr_activity_2_r(void)
392{
393 return 0x00400388U;
394}
395static inline u32 gr_activity_4_r(void)
396{
397 return 0x00400390U;
398}
399static inline u32 gr_pri_gpc0_gcc_dbg_r(void)
400{
401 return 0x00501000U;
402}
403static inline u32 gr_pri_gpcs_gcc_dbg_r(void)
404{
405 return 0x00419000U;
406}
407static inline u32 gr_pri_gpcs_gcc_dbg_invalidate_m(void)
408{
409 return 0x1U << 1U;
410}
411static inline u32 gr_pri_gpc0_tpc0_sm_cache_control_r(void)
412{
413 return 0x005046a4U;
414}
415static inline u32 gr_pri_gpcs_tpcs_sm_cache_control_r(void)
416{
417 return 0x00419ea4U;
418}
419static inline u32 gr_pri_gpcs_tpcs_sm_cache_control_invalidate_cache_m(void)
420{
421 return 0x1U << 0U;
422}
423static inline u32 gr_pri_sked_activity_r(void)
424{
425 return 0x00407054U;
426}
427static inline u32 gr_pri_gpc0_gpccs_gpc_activity0_r(void)
428{
429 return 0x00502c80U;
430}
431static inline u32 gr_pri_gpc0_gpccs_gpc_activity1_r(void)
432{
433 return 0x00502c84U;
434}
435static inline u32 gr_pri_gpc0_gpccs_gpc_activity2_r(void)
436{
437 return 0x00502c88U;
438}
439static inline u32 gr_pri_gpc0_gpccs_gpc_activity3_r(void)
440{
441 return 0x00502c8cU;
442}
443static inline u32 gr_pri_gpc0_tpc0_tpccs_tpc_activity_0_r(void)
444{
445 return 0x00504500U;
446}
447static inline u32 gr_pri_gpc0_tpcs_tpccs_tpc_activity_0_r(void)
448{
449 return 0x00501d00U;
450}
451static inline u32 gr_pri_gpcs_gpccs_gpc_activity_0_r(void)
452{
453 return 0x0041ac80U;
454}
455static inline u32 gr_pri_gpcs_gpccs_gpc_activity_1_r(void)
456{
457 return 0x0041ac84U;
458}
459static inline u32 gr_pri_gpcs_gpccs_gpc_activity_2_r(void)
460{
461 return 0x0041ac88U;
462}
463static inline u32 gr_pri_gpcs_gpccs_gpc_activity_3_r(void)
464{
465 return 0x0041ac8cU;
466}
467static inline u32 gr_pri_gpcs_tpc0_tpccs_tpc_activity_0_r(void)
468{
469 return 0x0041c500U;
470}
471static inline u32 gr_pri_gpcs_tpcs_tpccs_tpc_activity_0_r(void)
472{
473 return 0x00419d00U;
474}
475static inline u32 gr_pri_be0_becs_be_activity0_r(void)
476{
477 return 0x00410200U;
478}
479static inline u32 gr_pri_bes_becs_be_activity0_r(void)
480{
481 return 0x00408a00U;
482}
483static inline u32 gr_pri_ds_mpipe_status_r(void)
484{
485 return 0x00405858U;
486}
487static inline u32 gr_pri_fe_go_idle_on_status_r(void)
488{
489 return 0x00404150U;
490}
491static inline u32 gr_pri_fe_go_idle_check_r(void)
492{
493 return 0x00404158U;
494}
495static inline u32 gr_pri_fe_go_idle_info_r(void)
496{
497 return 0x00404194U;
498}
499static inline u32 gr_pri_gpc0_tpc0_tex_m_tex_subunits_status_r(void)
500{
501 return 0x00504238U;
502}
503static inline u32 gr_pri_be0_crop_status1_r(void)
504{
505 return 0x00410134U;
506}
507static inline u32 gr_pri_bes_crop_status1_r(void)
508{
509 return 0x00408934U;
510}
511static inline u32 gr_pri_be0_zrop_status_r(void)
512{
513 return 0x00410048U;
514}
515static inline u32 gr_pri_be0_zrop_status2_r(void)
516{
517 return 0x0041004cU;
518}
519static inline u32 gr_pri_bes_zrop_status_r(void)
520{
521 return 0x00408848U;
522}
523static inline u32 gr_pri_bes_zrop_status2_r(void)
524{
525 return 0x0040884cU;
526}
527static inline u32 gr_pipe_bundle_address_r(void)
528{
529 return 0x00400200U;
530}
531static inline u32 gr_pipe_bundle_address_value_v(u32 r)
532{
533 return (r >> 0U) & 0xffffU;
534}
535static inline u32 gr_pipe_bundle_data_r(void)
536{
537 return 0x00400204U;
538}
539static inline u32 gr_pipe_bundle_config_r(void)
540{
541 return 0x00400208U;
542}
543static inline u32 gr_pipe_bundle_config_override_pipe_mode_disabled_f(void)
544{
545 return 0x0U;
546}
547static inline u32 gr_pipe_bundle_config_override_pipe_mode_enabled_f(void)
548{
549 return 0x80000000U;
550}
551static inline u32 gr_fe_hww_esr_r(void)
552{
553 return 0x00404000U;
554}
555static inline u32 gr_fe_hww_esr_reset_active_f(void)
556{
557 return 0x40000000U;
558}
559static inline u32 gr_fe_hww_esr_en_enable_f(void)
560{
561 return 0x80000000U;
562}
563static inline u32 gr_fe_hww_esr_info_r(void)
564{
565 return 0x004041b0U;
566}
567static inline u32 gr_fe_go_idle_timeout_r(void)
568{
569 return 0x00404154U;
570}
571static inline u32 gr_fe_go_idle_timeout_count_f(u32 v)
572{
573 return (v & 0xffffffffU) << 0U;
574}
575static inline u32 gr_fe_go_idle_timeout_count_disabled_f(void)
576{
577 return 0x0U;
578}
579static inline u32 gr_fe_go_idle_timeout_count_prod_f(void)
580{
581 return 0x800U;
582}
583static inline u32 gr_fe_object_table_r(u32 i)
584{
585 return 0x00404200U + i*4U;
586}
587static inline u32 gr_fe_object_table_nvclass_v(u32 r)
588{
589 return (r >> 0U) & 0xffffU;
590}
591static inline u32 gr_pri_mme_shadow_raw_index_r(void)
592{
593 return 0x00404488U;
594}
595static inline u32 gr_pri_mme_shadow_raw_index_write_trigger_f(void)
596{
597 return 0x80000000U;
598}
599static inline u32 gr_pri_mme_shadow_raw_data_r(void)
600{
601 return 0x0040448cU;
602}
603static inline u32 gr_mme_hww_esr_r(void)
604{
605 return 0x00404490U;
606}
607static inline u32 gr_mme_hww_esr_reset_active_f(void)
608{
609 return 0x40000000U;
610}
611static inline u32 gr_mme_hww_esr_en_enable_f(void)
612{
613 return 0x80000000U;
614}
615static inline u32 gr_mme_hww_esr_info_r(void)
616{
617 return 0x00404494U;
618}
619static inline u32 gr_memfmt_hww_esr_r(void)
620{
621 return 0x00404600U;
622}
623static inline u32 gr_memfmt_hww_esr_reset_active_f(void)
624{
625 return 0x40000000U;
626}
627static inline u32 gr_memfmt_hww_esr_en_enable_f(void)
628{
629 return 0x80000000U;
630}
631static inline u32 gr_fecs_cpuctl_r(void)
632{
633 return 0x00409100U;
634}
635static inline u32 gr_fecs_cpuctl_startcpu_f(u32 v)
636{
637 return (v & 0x1U) << 1U;
638}
639static inline u32 gr_fecs_dmactl_r(void)
640{
641 return 0x0040910cU;
642}
643static inline u32 gr_fecs_dmactl_require_ctx_f(u32 v)
644{
645 return (v & 0x1U) << 0U;
646}
647static inline u32 gr_fecs_dmactl_dmem_scrubbing_m(void)
648{
649 return 0x1U << 1U;
650}
651static inline u32 gr_fecs_dmactl_imem_scrubbing_m(void)
652{
653 return 0x1U << 2U;
654}
655static inline u32 gr_fecs_os_r(void)
656{
657 return 0x00409080U;
658}
659static inline u32 gr_fecs_idlestate_r(void)
660{
661 return 0x0040904cU;
662}
663static inline u32 gr_fecs_mailbox0_r(void)
664{
665 return 0x00409040U;
666}
667static inline u32 gr_fecs_mailbox1_r(void)
668{
669 return 0x00409044U;
670}
671static inline u32 gr_fecs_irqstat_r(void)
672{
673 return 0x00409008U;
674}
675static inline u32 gr_fecs_irqmode_r(void)
676{
677 return 0x0040900cU;
678}
679static inline u32 gr_fecs_irqmask_r(void)
680{
681 return 0x00409018U;
682}
683static inline u32 gr_fecs_irqdest_r(void)
684{
685 return 0x0040901cU;
686}
687static inline u32 gr_fecs_curctx_r(void)
688{
689 return 0x00409050U;
690}
691static inline u32 gr_fecs_nxtctx_r(void)
692{
693 return 0x00409054U;
694}
695static inline u32 gr_fecs_engctl_r(void)
696{
697 return 0x004090a4U;
698}
699static inline u32 gr_fecs_debug1_r(void)
700{
701 return 0x00409090U;
702}
703static inline u32 gr_fecs_debuginfo_r(void)
704{
705 return 0x00409094U;
706}
707static inline u32 gr_fecs_icd_cmd_r(void)
708{
709 return 0x00409200U;
710}
711static inline u32 gr_fecs_icd_cmd_opc_s(void)
712{
713 return 4U;
714}
715static inline u32 gr_fecs_icd_cmd_opc_f(u32 v)
716{
717 return (v & 0xfU) << 0U;
718}
719static inline u32 gr_fecs_icd_cmd_opc_m(void)
720{
721 return 0xfU << 0U;
722}
723static inline u32 gr_fecs_icd_cmd_opc_v(u32 r)
724{
725 return (r >> 0U) & 0xfU;
726}
727static inline u32 gr_fecs_icd_cmd_opc_rreg_f(void)
728{
729 return 0x8U;
730}
731static inline u32 gr_fecs_icd_cmd_opc_rstat_f(void)
732{
733 return 0xeU;
734}
735static inline u32 gr_fecs_icd_cmd_idx_f(u32 v)
736{
737 return (v & 0x1fU) << 8U;
738}
739static inline u32 gr_fecs_icd_rdata_r(void)
740{
741 return 0x0040920cU;
742}
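/*
 * Usage sketch for the FECS ICD (internal command/debug) accessors above:
 * a falcon register read can be requested by writing an rreg opcode plus
 * register index to gr_fecs_icd_cmd_r() and then reading the result from
 * gr_fecs_icd_rdata_r(). gk20a_writel()/gk20a_readl() are assumed MMIO
 * helpers, not declared in this header.
 *
 *	gk20a_writel(g, gr_fecs_icd_cmd_r(),
 *		     gr_fecs_icd_cmd_opc_rreg_f() |
 *		     gr_fecs_icd_cmd_idx_f(idx));
 *	val = gk20a_readl(g, gr_fecs_icd_rdata_r());
 */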
743static inline u32 gr_fecs_imemc_r(u32 i)
744{
745 return 0x00409180U + i*16U;
746}
747static inline u32 gr_fecs_imemc_offs_f(u32 v)
748{
749 return (v & 0x3fU) << 2U;
750}
751static inline u32 gr_fecs_imemc_blk_f(u32 v)
752{
753 return (v & 0xffU) << 8U;
754}
755static inline u32 gr_fecs_imemc_aincw_f(u32 v)
756{
757 return (v & 0x1U) << 24U;
758}
759static inline u32 gr_fecs_imemd_r(u32 i)
760{
761 return 0x00409184U + i*16U;
762}
763static inline u32 gr_fecs_imemt_r(u32 i)
764{
765 return 0x00409188U + i*16U;
766}
767static inline u32 gr_fecs_imemt_tag_f(u32 v)
768{
769 return (v & 0xffffU) << 0U;
770}
771static inline u32 gr_fecs_dmemc_r(u32 i)
772{
773 return 0x004091c0U + i*8U;
774}
775static inline u32 gr_fecs_dmemc_offs_s(void)
776{
777 return 6U;
778}
779static inline u32 gr_fecs_dmemc_offs_f(u32 v)
780{
781 return (v & 0x3fU) << 2U;
782}
783static inline u32 gr_fecs_dmemc_offs_m(void)
784{
785 return 0x3fU << 2U;
786}
787static inline u32 gr_fecs_dmemc_offs_v(u32 r)
788{
789 return (r >> 2U) & 0x3fU;
790}
791static inline u32 gr_fecs_dmemc_blk_f(u32 v)
792{
793 return (v & 0xffU) << 8U;
794}
795static inline u32 gr_fecs_dmemc_aincw_f(u32 v)
796{
797 return (v & 0x1U) << 24U;
798}
799static inline u32 gr_fecs_dmemd_r(u32 i)
800{
801 return 0x004091c4U + i*8U;
802}
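/*
 * Sketch of the falcon DMEM window formed by gr_fecs_dmemc_r() and
 * gr_fecs_dmemd_r(): programming dmemc with a block/offset plus the
 * auto-increment-on-write bit lets consecutive writes to dmemd stream a
 * buffer into DMEM. gk20a_writel() is an assumed MMIO helper and port
 * index 0 is only an example.
 *
 *	gk20a_writel(g, gr_fecs_dmemc_r(0),
 *		     gr_fecs_dmemc_offs_f(0) |
 *		     gr_fecs_dmemc_blk_f(blk) |
 *		     gr_fecs_dmemc_aincw_f(1));
 *	for (i = 0; i < words; i++)
 *		gk20a_writel(g, gr_fecs_dmemd_r(0), src[i]);
 */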
803static inline u32 gr_fecs_dmatrfbase_r(void)
804{
805 return 0x00409110U;
806}
807static inline u32 gr_fecs_dmatrfmoffs_r(void)
808{
809 return 0x00409114U;
810}
811static inline u32 gr_fecs_dmatrffboffs_r(void)
812{
813 return 0x0040911cU;
814}
815static inline u32 gr_fecs_dmatrfcmd_r(void)
816{
817 return 0x00409118U;
818}
819static inline u32 gr_fecs_dmatrfcmd_imem_f(u32 v)
820{
821 return (v & 0x1U) << 4U;
822}
823static inline u32 gr_fecs_dmatrfcmd_write_f(u32 v)
824{
825 return (v & 0x1U) << 5U;
826}
827static inline u32 gr_fecs_dmatrfcmd_size_f(u32 v)
828{
829 return (v & 0x7U) << 8U;
830}
831static inline u32 gr_fecs_dmatrfcmd_ctxdma_f(u32 v)
832{
833 return (v & 0x7U) << 12U;
834}
835static inline u32 gr_fecs_bootvec_r(void)
836{
837 return 0x00409104U;
838}
839static inline u32 gr_fecs_bootvec_vec_f(u32 v)
840{
841 return (v & 0xffffffffU) << 0U;
842}
843static inline u32 gr_fecs_falcon_hwcfg_r(void)
844{
845 return 0x00409108U;
846}
847static inline u32 gr_gpcs_gpccs_falcon_hwcfg_r(void)
848{
849 return 0x0041a108U;
850}
851static inline u32 gr_fecs_falcon_rm_r(void)
852{
853 return 0x00409084U;
854}
855static inline u32 gr_fecs_current_ctx_r(void)
856{
857 return 0x00409b00U;
858}
859static inline u32 gr_fecs_current_ctx_ptr_f(u32 v)
860{
861 return (v & 0xfffffffU) << 0U;
862}
863static inline u32 gr_fecs_current_ctx_ptr_v(u32 r)
864{
865 return (r >> 0U) & 0xfffffffU;
866}
867static inline u32 gr_fecs_current_ctx_target_s(void)
868{
869 return 2U;
870}
871static inline u32 gr_fecs_current_ctx_target_f(u32 v)
872{
873 return (v & 0x3U) << 28U;
874}
875static inline u32 gr_fecs_current_ctx_target_m(void)
876{
877 return 0x3U << 28U;
878}
879static inline u32 gr_fecs_current_ctx_target_v(u32 r)
880{
881 return (r >> 28U) & 0x3U;
882}
883static inline u32 gr_fecs_current_ctx_target_vid_mem_f(void)
884{
885 return 0x0U;
886}
887static inline u32 gr_fecs_current_ctx_target_sys_mem_coh_f(void)
888{
889 return 0x20000000U;
890}
891static inline u32 gr_fecs_current_ctx_target_sys_mem_ncoh_f(void)
892{
893 return 0x30000000U;
894}
895static inline u32 gr_fecs_current_ctx_valid_s(void)
896{
897 return 1U;
898}
899static inline u32 gr_fecs_current_ctx_valid_f(u32 v)
900{
901 return (v & 0x1U) << 31U;
902}
903static inline u32 gr_fecs_current_ctx_valid_m(void)
904{
905 return 0x1U << 31U;
906}
907static inline u32 gr_fecs_current_ctx_valid_v(u32 r)
908{
909 return (r >> 31U) & 0x1U;
910}
911static inline u32 gr_fecs_current_ctx_valid_false_f(void)
912{
913 return 0x0U;
914}
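/*
 * The gr_fecs_current_ctx_* fields above describe a context pointer word:
 * a 28-bit pointer in bits 27:0, an aperture target in bits 29:28 and a
 * valid flag in bit 31. A minimal sketch of composing such a word,
 * assuming inst_ptr holds a 4 KB-aligned instance block address so only
 * its upper bits are stored in the ptr field:
 *
 *	u32 ctx = gr_fecs_current_ctx_ptr_f((u32)(inst_ptr >> 12)) |
 *		  gr_fecs_current_ctx_target_vid_mem_f() |
 *		  gr_fecs_current_ctx_valid_f(1);
 */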
915static inline u32 gr_fecs_method_data_r(void)
916{
917 return 0x00409500U;
918}
919static inline u32 gr_fecs_method_push_r(void)
920{
921 return 0x00409504U;
922}
923static inline u32 gr_fecs_method_push_adr_f(u32 v)
924{
925 return (v & 0xfffU) << 0U;
926}
927static inline u32 gr_fecs_method_push_adr_bind_pointer_v(void)
928{
929 return 0x00000003U;
930}
931static inline u32 gr_fecs_method_push_adr_bind_pointer_f(void)
932{
933 return 0x3U;
934}
935static inline u32 gr_fecs_method_push_adr_discover_image_size_v(void)
936{
937 return 0x00000010U;
938}
939static inline u32 gr_fecs_method_push_adr_wfi_golden_save_v(void)
940{
941 return 0x00000009U;
942}
943static inline u32 gr_fecs_method_push_adr_restore_golden_v(void)
944{
945 return 0x00000015U;
946}
947static inline u32 gr_fecs_method_push_adr_discover_zcull_image_size_v(void)
948{
949 return 0x00000016U;
950}
951static inline u32 gr_fecs_method_push_adr_discover_pm_image_size_v(void)
952{
953 return 0x00000025U;
954}
955static inline u32 gr_fecs_method_push_adr_discover_reglist_image_size_v(void)
956{
957 return 0x00000030U;
958}
959static inline u32 gr_fecs_method_push_adr_set_reglist_bind_instance_v(void)
960{
961 return 0x00000031U;
962}
963static inline u32 gr_fecs_method_push_adr_set_reglist_virtual_address_v(void)
964{
965 return 0x00000032U;
966}
967static inline u32 gr_fecs_method_push_adr_stop_ctxsw_v(void)
968{
969 return 0x00000038U;
970}
971static inline u32 gr_fecs_method_push_adr_start_ctxsw_v(void)
972{
973 return 0x00000039U;
974}
975static inline u32 gr_fecs_method_push_adr_set_watchdog_timeout_f(void)
976{
977 return 0x21U;
978}
979static inline u32 gr_fecs_method_push_adr_halt_pipeline_v(void)
980{
981 return 0x00000004U;
982}
983static inline u32 gr_fecs_host_int_status_r(void)
984{
985 return 0x00409c18U;
986}
987static inline u32 gr_fecs_host_int_status_fault_during_ctxsw_f(u32 v)
988{
989 return (v & 0x1U) << 16U;
990}
991static inline u32 gr_fecs_host_int_status_umimp_firmware_method_f(u32 v)
992{
993 return (v & 0x1U) << 17U;
994}
995static inline u32 gr_fecs_host_int_status_umimp_illegal_method_f(u32 v)
996{
997 return (v & 0x1U) << 18U;
998}
999static inline u32 gr_fecs_host_int_status_watchdog_active_f(void)
1000{
1001 return 0x80000U;
1002}
1003static inline u32 gr_fecs_host_int_status_ctxsw_intr_f(u32 v)
1004{
1005 return (v & 0xffffU) << 0U;
1006}
1007static inline u32 gr_fecs_host_int_clear_r(void)
1008{
1009 return 0x00409c20U;
1010}
1011static inline u32 gr_fecs_host_int_clear_ctxsw_intr1_f(u32 v)
1012{
1013 return (v & 0x1U) << 1U;
1014}
1015static inline u32 gr_fecs_host_int_clear_ctxsw_intr1_clear_f(void)
1016{
1017 return 0x2U;
1018}
1019static inline u32 gr_fecs_host_int_enable_r(void)
1020{
1021 return 0x00409c24U;
1022}
1023static inline u32 gr_fecs_host_int_enable_ctxsw_intr1_enable_f(void)
1024{
1025 return 0x2U;
1026}
1027static inline u32 gr_fecs_host_int_enable_fault_during_ctxsw_enable_f(void)
1028{
1029 return 0x10000U;
1030}
1031static inline u32 gr_fecs_host_int_enable_umimp_firmware_method_enable_f(void)
1032{
1033 return 0x20000U;
1034}
1035static inline u32 gr_fecs_host_int_enable_umimp_illegal_method_enable_f(void)
1036{
1037 return 0x40000U;
1038}
1039static inline u32 gr_fecs_host_int_enable_watchdog_enable_f(void)
1040{
1041 return 0x80000U;
1042}
1043static inline u32 gr_fecs_ctxsw_reset_ctl_r(void)
1044{
1045 return 0x00409614U;
1046}
1047static inline u32 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f(void)
1048{
1049 return 0x0U;
1050}
1051static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f(void)
1052{
1053 return 0x0U;
1054}
1055static inline u32 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f(void)
1056{
1057 return 0x0U;
1058}
1059static inline u32 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f(void)
1060{
1061 return 0x10U;
1062}
1063static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f(void)
1064{
1065 return 0x20U;
1066}
1067static inline u32 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f(void)
1068{
1069 return 0x40U;
1070}
1071static inline u32 gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f(void)
1072{
1073 return 0x0U;
1074}
1075static inline u32 gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f(void)
1076{
1077 return 0x100U;
1078}
1079static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f(void)
1080{
1081 return 0x0U;
1082}
1083static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f(void)
1084{
1085 return 0x200U;
1086}
1087static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_s(void)
1088{
1089 return 1U;
1090}
1091static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_f(u32 v)
1092{
1093 return (v & 0x1U) << 10U;
1094}
1095static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_m(void)
1096{
1097 return 0x1U << 10U;
1098}
1099static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_v(u32 r)
1100{
1101 return (r >> 10U) & 0x1U;
1102}
1103static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f(void)
1104{
1105 return 0x0U;
1106}
1107static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f(void)
1108{
1109 return 0x400U;
1110}
1111static inline u32 gr_fecs_ctx_state_store_major_rev_id_r(void)
1112{
1113 return 0x0040960cU;
1114}
1115static inline u32 gr_fecs_ctxsw_mailbox_r(u32 i)
1116{
1117 return 0x00409800U + i*4U;
1118}
1119static inline u32 gr_fecs_ctxsw_mailbox__size_1_v(void)
1120{
1121 return 0x00000008U;
1122}
1123static inline u32 gr_fecs_ctxsw_mailbox_value_f(u32 v)
1124{
1125 return (v & 0xffffffffU) << 0U;
1126}
1127static inline u32 gr_fecs_ctxsw_mailbox_value_pass_v(void)
1128{
1129 return 0x00000001U;
1130}
1131static inline u32 gr_fecs_ctxsw_mailbox_value_fail_v(void)
1132{
1133 return 0x00000002U;
1134}
1135static inline u32 gr_fecs_ctxsw_mailbox_set_r(u32 i)
1136{
1137 return 0x00409820U + i*4U;
1138}
1139static inline u32 gr_fecs_ctxsw_mailbox_set_value_f(u32 v)
1140{
1141 return (v & 0xffffffffU) << 0U;
1142}
1143static inline u32 gr_fecs_ctxsw_mailbox_clear_r(u32 i)
1144{
1145 return 0x00409840U + i*4U;
1146}
1147static inline u32 gr_fecs_ctxsw_mailbox_clear_value_f(u32 v)
1148{
1149 return (v & 0xffffffffU) << 0U;
1150}
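/*
 * The ctxsw mailbox registers above form the FECS ucode's completion
 * channel: mailbox 0 is commonly polled against the pass/fail values,
 * while the _set/_clear variants appear to set and clear mailbox bits by
 * write, avoiding a read-modify-write. A hedged polling sketch, with
 * gk20a_readl() as an assumed MMIO helper:
 *
 *	u32 reply = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(0));
 *	bool passed = (reply == gr_fecs_ctxsw_mailbox_value_pass_v());
 *	bool failed = (reply == gr_fecs_ctxsw_mailbox_value_fail_v());
 */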
1151static inline u32 gr_fecs_fs_r(void)
1152{
1153 return 0x00409604U;
1154}
1155static inline u32 gr_fecs_fs_num_available_gpcs_s(void)
1156{
1157 return 5U;
1158}
1159static inline u32 gr_fecs_fs_num_available_gpcs_f(u32 v)
1160{
1161 return (v & 0x1fU) << 0U;
1162}
1163static inline u32 gr_fecs_fs_num_available_gpcs_m(void)
1164{
1165 return 0x1fU << 0U;
1166}
1167static inline u32 gr_fecs_fs_num_available_gpcs_v(u32 r)
1168{
1169 return (r >> 0U) & 0x1fU;
1170}
1171static inline u32 gr_fecs_fs_num_available_fbps_s(void)
1172{
1173 return 5U;
1174}
1175static inline u32 gr_fecs_fs_num_available_fbps_f(u32 v)
1176{
1177 return (v & 0x1fU) << 16U;
1178}
1179static inline u32 gr_fecs_fs_num_available_fbps_m(void)
1180{
1181 return 0x1fU << 16U;
1182}
1183static inline u32 gr_fecs_fs_num_available_fbps_v(u32 r)
1184{
1185 return (r >> 16U) & 0x1fU;
1186}
1187static inline u32 gr_fecs_cfg_r(void)
1188{
1189 return 0x00409620U;
1190}
1191static inline u32 gr_fecs_cfg_imem_sz_v(u32 r)
1192{
1193 return (r >> 0U) & 0xffU;
1194}
1195static inline u32 gr_fecs_rc_lanes_r(void)
1196{
1197 return 0x00409880U;
1198}
1199static inline u32 gr_fecs_rc_lanes_num_chains_s(void)
1200{
1201 return 6U;
1202}
1203static inline u32 gr_fecs_rc_lanes_num_chains_f(u32 v)
1204{
1205 return (v & 0x3fU) << 0U;
1206}
1207static inline u32 gr_fecs_rc_lanes_num_chains_m(void)
1208{
1209 return 0x3fU << 0U;
1210}
1211static inline u32 gr_fecs_rc_lanes_num_chains_v(u32 r)
1212{
1213 return (r >> 0U) & 0x3fU;
1214}
1215static inline u32 gr_fecs_ctxsw_status_1_r(void)
1216{
1217 return 0x00409400U;
1218}
1219static inline u32 gr_fecs_ctxsw_status_1_arb_busy_s(void)
1220{
1221 return 1U;
1222}
1223static inline u32 gr_fecs_ctxsw_status_1_arb_busy_f(u32 v)
1224{
1225 return (v & 0x1U) << 12U;
1226}
1227static inline u32 gr_fecs_ctxsw_status_1_arb_busy_m(void)
1228{
1229 return 0x1U << 12U;
1230}
1231static inline u32 gr_fecs_ctxsw_status_1_arb_busy_v(u32 r)
1232{
1233 return (r >> 12U) & 0x1U;
1234}
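/*
 * The arb_busy accessors above show the full per-field quartet used
 * throughout this generated header: _s() returns the field width in bits,
 * _f(v) shifts a value into the field's position, _m() returns the
 * in-place mask, and _v(r) extracts the field from a full register value
 * (with _r() giving the register offset itself). For example, testing the
 * bit in a raw gr_fecs_ctxsw_status_1_r() read:
 *
 *	bool arb_busy =
 *		(status & gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U;
 *	// equivalently: gr_fecs_ctxsw_status_1_arb_busy_v(status) != 0U
 */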
1235static inline u32 gr_fecs_arb_ctx_adr_r(void)
1236{
1237 return 0x00409a24U;
1238}
1239static inline u32 gr_fecs_new_ctx_r(void)
1240{
1241 return 0x00409b04U;
1242}
1243static inline u32 gr_fecs_new_ctx_ptr_s(void)
1244{
1245 return 28U;
1246}
1247static inline u32 gr_fecs_new_ctx_ptr_f(u32 v)
1248{
1249 return (v & 0xfffffffU) << 0U;
1250}
1251static inline u32 gr_fecs_new_ctx_ptr_m(void)
1252{
1253 return 0xfffffffU << 0U;
1254}
1255static inline u32 gr_fecs_new_ctx_ptr_v(u32 r)
1256{
1257 return (r >> 0U) & 0xfffffffU;
1258}
1259static inline u32 gr_fecs_new_ctx_target_s(void)
1260{
1261 return 2U;
1262}
1263static inline u32 gr_fecs_new_ctx_target_f(u32 v)
1264{
1265 return (v & 0x3U) << 28U;
1266}
1267static inline u32 gr_fecs_new_ctx_target_m(void)
1268{
1269 return 0x3U << 28U;
1270}
1271static inline u32 gr_fecs_new_ctx_target_v(u32 r)
1272{
1273 return (r >> 28U) & 0x3U;
1274}
1275static inline u32 gr_fecs_new_ctx_target_vid_mem_f(void)
1276{
1277 return 0x0U;
1278}
1279static inline u32 gr_fecs_new_ctx_target_sys_mem_ncoh_f(void)
1280{
1281 return 0x30000000U;
1282}
1283static inline u32 gr_fecs_new_ctx_target_sys_mem_coh_f(void)
1284{
1285 return 0x20000000U;
1286}
1287static inline u32 gr_fecs_new_ctx_valid_s(void)
1288{
1289 return 1U;
1290}
1291static inline u32 gr_fecs_new_ctx_valid_f(u32 v)
1292{
1293 return (v & 0x1U) << 31U;
1294}
1295static inline u32 gr_fecs_new_ctx_valid_m(void)
1296{
1297 return 0x1U << 31U;
1298}
1299static inline u32 gr_fecs_new_ctx_valid_v(u32 r)
1300{
1301 return (r >> 31U) & 0x1U;
1302}
1303static inline u32 gr_fecs_arb_ctx_ptr_r(void)
1304{
1305 return 0x00409a0cU;
1306}
1307static inline u32 gr_fecs_arb_ctx_ptr_ptr_s(void)
1308{
1309 return 28U;
1310}
1311static inline u32 gr_fecs_arb_ctx_ptr_ptr_f(u32 v)
1312{
1313 return (v & 0xfffffffU) << 0U;
1314}
1315static inline u32 gr_fecs_arb_ctx_ptr_ptr_m(void)
1316{
1317 return 0xfffffffU << 0U;
1318}
1319static inline u32 gr_fecs_arb_ctx_ptr_ptr_v(u32 r)
1320{
1321 return (r >> 0U) & 0xfffffffU;
1322}
1323static inline u32 gr_fecs_arb_ctx_ptr_target_s(void)
1324{
1325 return 2U;
1326}
1327static inline u32 gr_fecs_arb_ctx_ptr_target_f(u32 v)
1328{
1329 return (v & 0x3U) << 28U;
1330}
1331static inline u32 gr_fecs_arb_ctx_ptr_target_m(void)
1332{
1333 return 0x3U << 28U;
1334}
1335static inline u32 gr_fecs_arb_ctx_ptr_target_v(u32 r)
1336{
1337 return (r >> 28U) & 0x3U;
1338}
1339static inline u32 gr_fecs_arb_ctx_ptr_target_vid_mem_f(void)
1340{
1341 return 0x0U;
1342}
1343static inline u32 gr_fecs_arb_ctx_ptr_target_sys_mem_ncoh_f(void)
1344{
1345 return 0x30000000U;
1346}
1347static inline u32 gr_fecs_arb_ctx_ptr_target_sys_mem_coh_f(void)
1348{
1349 return 0x20000000U;
1350}
1351static inline u32 gr_fecs_arb_ctx_cmd_r(void)
1352{
1353 return 0x00409a10U;
1354}
1355static inline u32 gr_fecs_arb_ctx_cmd_cmd_s(void)
1356{
1357 return 5U;
1358}
1359static inline u32 gr_fecs_arb_ctx_cmd_cmd_f(u32 v)
1360{
1361 return (v & 0x1fU) << 0U;
1362}
1363static inline u32 gr_fecs_arb_ctx_cmd_cmd_m(void)
1364{
1365 return 0x1fU << 0U;
1366}
1367static inline u32 gr_fecs_arb_ctx_cmd_cmd_v(u32 r)
1368{
1369 return (r >> 0U) & 0x1fU;
1370}
1371static inline u32 gr_fecs_ctxsw_status_fe_0_r(void)
1372{
1373 return 0x00409c00U;
1374}
1375static inline u32 gr_gpc0_gpccs_ctxsw_status_gpc_0_r(void)
1376{
1377 return 0x00502c04U;
1378}
1379static inline u32 gr_gpc0_gpccs_ctxsw_status_1_r(void)
1380{
1381 return 0x00502400U;
1382}
1383static inline u32 gr_fecs_ctxsw_idlestate_r(void)
1384{
1385 return 0x00409420U;
1386}
1387static inline u32 gr_gpc0_gpccs_ctxsw_idlestate_r(void)
1388{
1389 return 0x00502420U;
1390}
1391static inline u32 gr_rstr2d_gpc_map0_r(void)
1392{
1393 return 0x0040780cU;
1394}
1395static inline u32 gr_rstr2d_gpc_map1_r(void)
1396{
1397 return 0x00407810U;
1398}
1399static inline u32 gr_rstr2d_gpc_map2_r(void)
1400{
1401 return 0x00407814U;
1402}
1403static inline u32 gr_rstr2d_gpc_map3_r(void)
1404{
1405 return 0x00407818U;
1406}
1407static inline u32 gr_rstr2d_gpc_map4_r(void)
1408{
1409 return 0x0040781cU;
1410}
1411static inline u32 gr_rstr2d_gpc_map5_r(void)
1412{
1413 return 0x00407820U;
1414}
1415static inline u32 gr_rstr2d_map_table_cfg_r(void)
1416{
1417 return 0x004078bcU;
1418}
1419static inline u32 gr_rstr2d_map_table_cfg_row_offset_f(u32 v)
1420{
1421 return (v & 0xffU) << 0U;
1422}
1423static inline u32 gr_rstr2d_map_table_cfg_num_entries_f(u32 v)
1424{
1425 return (v & 0xffU) << 8U;
1426}
1427static inline u32 gr_pd_hww_esr_r(void)
1428{
1429 return 0x00406018U;
1430}
1431static inline u32 gr_pd_hww_esr_reset_active_f(void)
1432{
1433 return 0x40000000U;
1434}
1435static inline u32 gr_pd_hww_esr_en_enable_f(void)
1436{
1437 return 0x80000000U;
1438}
1439static inline u32 gr_pd_num_tpc_per_gpc_r(u32 i)
1440{
1441 return 0x00406028U + i*4U;
1442}
1443static inline u32 gr_pd_num_tpc_per_gpc__size_1_v(void)
1444{
1445 return 0x00000004U;
1446}
1447static inline u32 gr_pd_num_tpc_per_gpc_count0_f(u32 v)
1448{
1449 return (v & 0xfU) << 0U;
1450}
1451static inline u32 gr_pd_num_tpc_per_gpc_count1_f(u32 v)
1452{
1453 return (v & 0xfU) << 4U;
1454}
1455static inline u32 gr_pd_num_tpc_per_gpc_count2_f(u32 v)
1456{
1457 return (v & 0xfU) << 8U;
1458}
1459static inline u32 gr_pd_num_tpc_per_gpc_count3_f(u32 v)
1460{
1461 return (v & 0xfU) << 12U;
1462}
1463static inline u32 gr_pd_num_tpc_per_gpc_count4_f(u32 v)
1464{
1465 return (v & 0xfU) << 16U;
1466}
1467static inline u32 gr_pd_num_tpc_per_gpc_count5_f(u32 v)
1468{
1469 return (v & 0xfU) << 20U;
1470}
1471static inline u32 gr_pd_num_tpc_per_gpc_count6_f(u32 v)
1472{
1473 return (v & 0xfU) << 24U;
1474}
1475static inline u32 gr_pd_num_tpc_per_gpc_count7_f(u32 v)
1476{
1477 return (v & 0xfU) << 28U;
1478}
1479static inline u32 gr_pd_ab_dist_cfg0_r(void)
1480{
1481 return 0x004064c0U;
1482}
1483static inline u32 gr_pd_ab_dist_cfg0_timeslice_enable_en_f(void)
1484{
1485 return 0x80000000U;
1486}
1487static inline u32 gr_pd_ab_dist_cfg0_timeslice_enable_dis_f(void)
1488{
1489 return 0x0U;
1490}
1491static inline u32 gr_pd_ab_dist_cfg1_r(void)
1492{
1493 return 0x004064c4U;
1494}
1495static inline u32 gr_pd_ab_dist_cfg1_max_batches_init_f(void)
1496{
1497 return 0xffffU;
1498}
1499static inline u32 gr_pd_ab_dist_cfg1_max_output_f(u32 v)
1500{
1501 return (v & 0x7ffU) << 16U;
1502}
1503static inline u32 gr_pd_ab_dist_cfg1_max_output_granularity_v(void)
1504{
1505 return 0x00000080U;
1506}
1507static inline u32 gr_pd_ab_dist_cfg2_r(void)
1508{
1509 return 0x004064c8U;
1510}
1511static inline u32 gr_pd_ab_dist_cfg2_token_limit_f(u32 v)
1512{
1513 return (v & 0xfffU) << 0U;
1514}
1515static inline u32 gr_pd_ab_dist_cfg2_token_limit_init_v(void)
1516{
1517 return 0x00000100U;
1518}
1519static inline u32 gr_pd_ab_dist_cfg2_state_limit_f(u32 v)
1520{
1521 return (v & 0xfffU) << 16U;
1522}
1523static inline u32 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v(void)
1524{
1525 return 0x00000020U;
1526}
1527static inline u32 gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v(void)
1528{
1529 return 0x00000062U;
1530}
1531static inline u32 gr_pd_pagepool_r(void)
1532{
1533 return 0x004064ccU;
1534}
1535static inline u32 gr_pd_pagepool_total_pages_f(u32 v)
1536{
1537 return (v & 0xffU) << 0U;
1538}
1539static inline u32 gr_pd_pagepool_valid_true_f(void)
1540{
1541 return 0x80000000U;
1542}
1543static inline u32 gr_pd_dist_skip_table_r(u32 i)
1544{
1545 return 0x004064d0U + i*4U;
1546}
1547static inline u32 gr_pd_dist_skip_table__size_1_v(void)
1548{
1549 return 0x00000008U;
1550}
1551static inline u32 gr_pd_dist_skip_table_gpc_4n0_mask_f(u32 v)
1552{
1553 return (v & 0xffU) << 0U;
1554}
1555static inline u32 gr_pd_dist_skip_table_gpc_4n1_mask_f(u32 v)
1556{
1557 return (v & 0xffU) << 8U;
1558}
1559static inline u32 gr_pd_dist_skip_table_gpc_4n2_mask_f(u32 v)
1560{
1561 return (v & 0xffU) << 16U;
1562}
1563static inline u32 gr_pd_dist_skip_table_gpc_4n3_mask_f(u32 v)
1564{
1565 return (v & 0xffU) << 24U;
1566}
1567static inline u32 gr_pd_alpha_ratio_table_r(u32 i)
1568{
1569 return 0x00406800U + i*4U;
1570}
1571static inline u32 gr_pd_alpha_ratio_table__size_1_v(void)
1572{
1573 return 0x00000100U;
1574}
1575static inline u32 gr_pd_alpha_ratio_table_gpc_4n0_mask_f(u32 v)
1576{
1577 return (v & 0xffU) << 0U;
1578}
1579static inline u32 gr_pd_alpha_ratio_table_gpc_4n1_mask_f(u32 v)
1580{
1581 return (v & 0xffU) << 8U;
1582}
1583static inline u32 gr_pd_alpha_ratio_table_gpc_4n2_mask_f(u32 v)
1584{
1585 return (v & 0xffU) << 16U;
1586}
1587static inline u32 gr_pd_alpha_ratio_table_gpc_4n3_mask_f(u32 v)
1588{
1589 return (v & 0xffU) << 24U;
1590}
1591static inline u32 gr_pd_beta_ratio_table_r(u32 i)
1592{
1593 return 0x00406c00U + i*4U;
1594}
1595static inline u32 gr_pd_beta_ratio_table__size_1_v(void)
1596{
1597 return 0x00000100U;
1598}
1599static inline u32 gr_pd_beta_ratio_table_gpc_4n0_mask_f(u32 v)
1600{
1601 return (v & 0xffU) << 0U;
1602}
1603static inline u32 gr_pd_beta_ratio_table_gpc_4n1_mask_f(u32 v)
1604{
1605 return (v & 0xffU) << 8U;
1606}
1607static inline u32 gr_pd_beta_ratio_table_gpc_4n2_mask_f(u32 v)
1608{
1609 return (v & 0xffU) << 16U;
1610}
1611static inline u32 gr_pd_beta_ratio_table_gpc_4n3_mask_f(u32 v)
1612{
1613 return (v & 0xffU) << 24U;
1614}
1615static inline u32 gr_ds_debug_r(void)
1616{
1617 return 0x00405800U;
1618}
1619static inline u32 gr_ds_debug_timeslice_mode_disable_f(void)
1620{
1621 return 0x0U;
1622}
1623static inline u32 gr_ds_debug_timeslice_mode_enable_f(void)
1624{
1625 return 0x8000000U;
1626}
1627static inline u32 gr_ds_zbc_color_r_r(void)
1628{
1629 return 0x00405804U;
1630}
1631static inline u32 gr_ds_zbc_color_r_val_f(u32 v)
1632{
1633 return (v & 0xffffffffU) << 0U;
1634}
1635static inline u32 gr_ds_zbc_color_g_r(void)
1636{
1637 return 0x00405808U;
1638}
1639static inline u32 gr_ds_zbc_color_g_val_f(u32 v)
1640{
1641 return (v & 0xffffffffU) << 0U;
1642}
1643static inline u32 gr_ds_zbc_color_b_r(void)
1644{
1645 return 0x0040580cU;
1646}
1647static inline u32 gr_ds_zbc_color_b_val_f(u32 v)
1648{
1649 return (v & 0xffffffffU) << 0U;
1650}
1651static inline u32 gr_ds_zbc_color_a_r(void)
1652{
1653 return 0x00405810U;
1654}
1655static inline u32 gr_ds_zbc_color_a_val_f(u32 v)
1656{
1657 return (v & 0xffffffffU) << 0U;
1658}
1659static inline u32 gr_ds_zbc_color_fmt_r(void)
1660{
1661 return 0x00405814U;
1662}
1663static inline u32 gr_ds_zbc_color_fmt_val_f(u32 v)
1664{
1665 return (v & 0x7fU) << 0U;
1666}
1667static inline u32 gr_ds_zbc_color_fmt_val_invalid_f(void)
1668{
1669 return 0x0U;
1670}
1671static inline u32 gr_ds_zbc_color_fmt_val_zero_v(void)
1672{
1673 return 0x00000001U;
1674}
1675static inline u32 gr_ds_zbc_color_fmt_val_unorm_one_v(void)
1676{
1677 return 0x00000002U;
1678}
1679static inline u32 gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v(void)
1680{
1681 return 0x00000004U;
1682}
1683static inline u32 gr_ds_zbc_color_fmt_val_a8_b8_g8_r8_v(void)
1684{
1685 return 0x00000028U;
1686}
1687static inline u32 gr_ds_zbc_z_r(void)
1688{
1689 return 0x00405818U;
1690}
1691static inline u32 gr_ds_zbc_z_val_s(void)
1692{
1693 return 32U;
1694}
1695static inline u32 gr_ds_zbc_z_val_f(u32 v)
1696{
1697 return (v & 0xffffffffU) << 0U;
1698}
1699static inline u32 gr_ds_zbc_z_val_m(void)
1700{
1701 return 0xffffffffU << 0U;
1702}
1703static inline u32 gr_ds_zbc_z_val_v(u32 r)
1704{
1705 return (r >> 0U) & 0xffffffffU;
1706}
1707static inline u32 gr_ds_zbc_z_val__init_v(void)
1708{
1709 return 0x00000000U;
1710}
1711static inline u32 gr_ds_zbc_z_val__init_f(void)
1712{
1713 return 0x0U;
1714}
1715static inline u32 gr_ds_zbc_z_fmt_r(void)
1716{
1717 return 0x0040581cU;
1718}
1719static inline u32 gr_ds_zbc_z_fmt_val_f(u32 v)
1720{
1721 return (v & 0x1U) << 0U;
1722}
1723static inline u32 gr_ds_zbc_z_fmt_val_invalid_f(void)
1724{
1725 return 0x0U;
1726}
1727static inline u32 gr_ds_zbc_z_fmt_val_fp32_v(void)
1728{
1729 return 0x00000001U;
1730}
1731static inline u32 gr_ds_zbc_tbl_index_r(void)
1732{
1733 return 0x00405820U;
1734}
1735static inline u32 gr_ds_zbc_tbl_index_val_f(u32 v)
1736{
1737 return (v & 0xfU) << 0U;
1738}
1739static inline u32 gr_ds_zbc_tbl_ld_r(void)
1740{
1741 return 0x00405824U;
1742}
1743static inline u32 gr_ds_zbc_tbl_ld_select_c_f(void)
1744{
1745 return 0x0U;
1746}
1747static inline u32 gr_ds_zbc_tbl_ld_select_z_f(void)
1748{
1749 return 0x1U;
1750}
1751static inline u32 gr_ds_zbc_tbl_ld_action_write_f(void)
1752{
1753 return 0x0U;
1754}
1755static inline u32 gr_ds_zbc_tbl_ld_trigger_active_f(void)
1756{
1757 return 0x4U;
1758}
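/*
 * A hedged sketch of how the gr_ds_zbc_* registers above combine to load
 * one color entry into the ZBC table: the four color components and the
 * format are staged, the table index is selected, and the load is kicked
 * with the color select / write action / trigger bits. gk20a_writel() is
 * an assumed MMIO helper and the values shown are placeholders.
 *
 *	gk20a_writel(g, gr_ds_zbc_color_r_r(), gr_ds_zbc_color_r_val_f(red));
 *	gk20a_writel(g, gr_ds_zbc_color_g_r(), gr_ds_zbc_color_g_val_f(green));
 *	gk20a_writel(g, gr_ds_zbc_color_b_r(), gr_ds_zbc_color_b_val_f(blue));
 *	gk20a_writel(g, gr_ds_zbc_color_a_r(), gr_ds_zbc_color_a_val_f(alpha));
 *	gk20a_writel(g, gr_ds_zbc_color_fmt_r(), gr_ds_zbc_color_fmt_val_f(fmt));
 *	gk20a_writel(g, gr_ds_zbc_tbl_index_r(), gr_ds_zbc_tbl_index_val_f(idx));
 *	gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
 *		     gr_ds_zbc_tbl_ld_select_c_f() |
 *		     gr_ds_zbc_tbl_ld_action_write_f() |
 *		     gr_ds_zbc_tbl_ld_trigger_active_f());
 */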
1759static inline u32 gr_ds_tga_constraintlogic_r(void)
1760{
1761 return 0x00405830U;
1762}
1763static inline u32 gr_ds_tga_constraintlogic_beta_cbsize_f(u32 v)
1764{
1765 return (v & 0xfffU) << 16U;
1766}
1767static inline u32 gr_ds_tga_constraintlogic_alpha_cbsize_f(u32 v)
1768{
1769 return (v & 0xfffU) << 0U;
1770}
1771static inline u32 gr_ds_hww_esr_r(void)
1772{
1773 return 0x00405840U;
1774}
1775static inline u32 gr_ds_hww_esr_reset_s(void)
1776{
1777 return 1U;
1778}
1779static inline u32 gr_ds_hww_esr_reset_f(u32 v)
1780{
1781 return (v & 0x1U) << 30U;
1782}
1783static inline u32 gr_ds_hww_esr_reset_m(void)
1784{
1785 return 0x1U << 30U;
1786}
1787static inline u32 gr_ds_hww_esr_reset_v(u32 r)
1788{
1789 return (r >> 30U) & 0x1U;
1790}
1791static inline u32 gr_ds_hww_esr_reset_task_v(void)
1792{
1793 return 0x00000001U;
1794}
1795static inline u32 gr_ds_hww_esr_reset_task_f(void)
1796{
1797 return 0x40000000U;
1798}
1799static inline u32 gr_ds_hww_esr_en_enabled_f(void)
1800{
1801 return 0x80000000U;
1802}
1803static inline u32 gr_ds_hww_report_mask_r(void)
1804{
1805 return 0x00405844U;
1806}
1807static inline u32 gr_ds_hww_report_mask_sph0_err_report_f(void)
1808{
1809 return 0x1U;
1810}
1811static inline u32 gr_ds_hww_report_mask_sph1_err_report_f(void)
1812{
1813 return 0x2U;
1814}
1815static inline u32 gr_ds_hww_report_mask_sph2_err_report_f(void)
1816{
1817 return 0x4U;
1818}
1819static inline u32 gr_ds_hww_report_mask_sph3_err_report_f(void)
1820{
1821 return 0x8U;
1822}
1823static inline u32 gr_ds_hww_report_mask_sph4_err_report_f(void)
1824{
1825 return 0x10U;
1826}
1827static inline u32 gr_ds_hww_report_mask_sph5_err_report_f(void)
1828{
1829 return 0x20U;
1830}
1831static inline u32 gr_ds_hww_report_mask_sph6_err_report_f(void)
1832{
1833 return 0x40U;
1834}
1835static inline u32 gr_ds_hww_report_mask_sph7_err_report_f(void)
1836{
1837 return 0x80U;
1838}
1839static inline u32 gr_ds_hww_report_mask_sph8_err_report_f(void)
1840{
1841 return 0x100U;
1842}
1843static inline u32 gr_ds_hww_report_mask_sph9_err_report_f(void)
1844{
1845 return 0x200U;
1846}
1847static inline u32 gr_ds_hww_report_mask_sph10_err_report_f(void)
1848{
1849 return 0x400U;
1850}
1851static inline u32 gr_ds_hww_report_mask_sph11_err_report_f(void)
1852{
1853 return 0x800U;
1854}
1855static inline u32 gr_ds_hww_report_mask_sph12_err_report_f(void)
1856{
1857 return 0x1000U;
1858}
1859static inline u32 gr_ds_hww_report_mask_sph13_err_report_f(void)
1860{
1861 return 0x2000U;
1862}
1863static inline u32 gr_ds_hww_report_mask_sph14_err_report_f(void)
1864{
1865 return 0x4000U;
1866}
1867static inline u32 gr_ds_hww_report_mask_sph15_err_report_f(void)
1868{
1869 return 0x8000U;
1870}
1871static inline u32 gr_ds_hww_report_mask_sph16_err_report_f(void)
1872{
1873 return 0x10000U;
1874}
1875static inline u32 gr_ds_hww_report_mask_sph17_err_report_f(void)
1876{
1877 return 0x20000U;
1878}
1879static inline u32 gr_ds_hww_report_mask_sph18_err_report_f(void)
1880{
1881 return 0x40000U;
1882}
1883static inline u32 gr_ds_hww_report_mask_sph19_err_report_f(void)
1884{
1885 return 0x80000U;
1886}
1887static inline u32 gr_ds_hww_report_mask_sph20_err_report_f(void)
1888{
1889 return 0x100000U;
1890}
1891static inline u32 gr_ds_hww_report_mask_sph21_err_report_f(void)
1892{
1893 return 0x200000U;
1894}
1895static inline u32 gr_ds_hww_report_mask_sph22_err_report_f(void)
1896{
1897 return 0x400000U;
1898}
1899static inline u32 gr_ds_hww_report_mask_sph23_err_report_f(void)
1900{
1901 return 0x800000U;
1902}
1903static inline u32 gr_ds_num_tpc_per_gpc_r(u32 i)
1904{
1905 return 0x00405870U + i*4U;
1906}
1907static inline u32 gr_scc_bundle_cb_base_r(void)
1908{
1909 return 0x00408004U;
1910}
1911static inline u32 gr_scc_bundle_cb_base_addr_39_8_f(u32 v)
1912{
1913 return (v & 0xffffffffU) << 0U;
1914}
1915static inline u32 gr_scc_bundle_cb_base_addr_39_8_align_bits_v(void)
1916{
1917 return 0x00000008U;
1918}
1919static inline u32 gr_scc_bundle_cb_size_r(void)
1920{
1921 return 0x00408008U;
1922}
1923static inline u32 gr_scc_bundle_cb_size_div_256b_f(u32 v)
1924{
1925 return (v & 0x7ffU) << 0U;
1926}
1927static inline u32 gr_scc_bundle_cb_size_div_256b__prod_v(void)
1928{
1929 return 0x00000018U;
1930}
1931static inline u32 gr_scc_bundle_cb_size_div_256b_byte_granularity_v(void)
1932{
1933 return 0x00000100U;
1934}
1935static inline u32 gr_scc_bundle_cb_size_valid_false_v(void)
1936{
1937 return 0x00000000U;
1938}
1939static inline u32 gr_scc_bundle_cb_size_valid_false_f(void)
1940{
1941 return 0x0U;
1942}
1943static inline u32 gr_scc_bundle_cb_size_valid_true_f(void)
1944{
1945 return 0x80000000U;
1946}
1947static inline u32 gr_scc_pagepool_base_r(void)
1948{
1949 return 0x0040800cU;
1950}
1951static inline u32 gr_scc_pagepool_base_addr_39_8_f(u32 v)
1952{
1953 return (v & 0xffffffffU) << 0U;
1954}
1955static inline u32 gr_scc_pagepool_base_addr_39_8_align_bits_v(void)
1956{
1957 return 0x00000008U;
1958}
1959static inline u32 gr_scc_pagepool_r(void)
1960{
1961 return 0x00408010U;
1962}
1963static inline u32 gr_scc_pagepool_total_pages_f(u32 v)
1964{
1965 return (v & 0xffU) << 0U;
1966}
1967static inline u32 gr_scc_pagepool_total_pages_hwmax_v(void)
1968{
1969 return 0x00000000U;
1970}
1971static inline u32 gr_scc_pagepool_total_pages_hwmax_value_v(void)
1972{
1973 return 0x00000080U;
1974}
1975static inline u32 gr_scc_pagepool_total_pages_byte_granularity_v(void)
1976{
1977 return 0x00000100U;
1978}
1979static inline u32 gr_scc_pagepool_max_valid_pages_s(void)
1980{
1981 return 8U;
1982}
1983static inline u32 gr_scc_pagepool_max_valid_pages_f(u32 v)
1984{
1985 return (v & 0xffU) << 8U;
1986}
1987static inline u32 gr_scc_pagepool_max_valid_pages_m(void)
1988{
1989 return 0xffU << 8U;
1990}
1991static inline u32 gr_scc_pagepool_max_valid_pages_v(u32 r)
1992{
1993 return (r >> 8U) & 0xffU;
1994}
1995static inline u32 gr_scc_pagepool_valid_true_f(void)
1996{
1997 return 0x80000000U;
1998}
1999static inline u32 gr_scc_init_r(void)
2000{
2001 return 0x0040802cU;
2002}
2003static inline u32 gr_scc_init_ram_trigger_f(void)
2004{
2005 return 0x1U;
2006}
2007static inline u32 gr_scc_hww_esr_r(void)
2008{
2009 return 0x00408030U;
2010}
2011static inline u32 gr_scc_hww_esr_reset_active_f(void)
2012{
2013 return 0x40000000U;
2014}
2015static inline u32 gr_scc_hww_esr_en_enable_f(void)
2016{
2017 return 0x80000000U;
2018}
2019static inline u32 gr_sked_hww_esr_r(void)
2020{
2021 return 0x00407020U;
2022}
2023static inline u32 gr_sked_hww_esr_reset_active_f(void)
2024{
2025 return 0x40000000U;
2026}
2027static inline u32 gr_cwd_fs_r(void)
2028{
2029 return 0x00405b00U;
2030}
2031static inline u32 gr_cwd_fs_num_gpcs_f(u32 v)
2032{
2033 return (v & 0xffU) << 0U;
2034}
2035static inline u32 gr_cwd_fs_num_tpcs_f(u32 v)
2036{
2037 return (v & 0xffU) << 8U;
2038}
2039static inline u32 gr_gpc0_fs_gpc_r(void)
2040{
2041 return 0x00502608U;
2042}
2043static inline u32 gr_gpc0_fs_gpc_num_available_tpcs_v(u32 r)
2044{
2045 return (r >> 0U) & 0x1fU;
2046}
2047static inline u32 gr_gpc0_fs_gpc_num_available_zculls_v(u32 r)
2048{
2049 return (r >> 16U) & 0x1fU;
2050}
2051static inline u32 gr_gpc0_cfg_r(void)
2052{
2053 return 0x00502620U;
2054}
2055static inline u32 gr_gpc0_cfg_imem_sz_v(u32 r)
2056{
2057 return (r >> 0U) & 0xffU;
2058}
2059static inline u32 gr_gpccs_rc_lanes_r(void)
2060{
2061 return 0x00502880U;
2062}
2063static inline u32 gr_gpccs_rc_lanes_num_chains_s(void)
2064{
2065 return 6U;
2066}
2067static inline u32 gr_gpccs_rc_lanes_num_chains_f(u32 v)
2068{
2069 return (v & 0x3fU) << 0U;
2070}
2071static inline u32 gr_gpccs_rc_lanes_num_chains_m(void)
2072{
2073 return 0x3fU << 0U;
2074}
2075static inline u32 gr_gpccs_rc_lanes_num_chains_v(u32 r)
2076{
2077 return (r >> 0U) & 0x3fU;
2078}
2079static inline u32 gr_gpccs_rc_lane_size_r(u32 i)
2080{
2081 return 0x00502910U + i*0U;
2082}
2083static inline u32 gr_gpccs_rc_lane_size__size_1_v(void)
2084{
2085 return 0x00000010U;
2086}
2087static inline u32 gr_gpccs_rc_lane_size_v_s(void)
2088{
2089 return 24U;
2090}
2091static inline u32 gr_gpccs_rc_lane_size_v_f(u32 v)
2092{
2093 return (v & 0xffffffU) << 0U;
2094}
2095static inline u32 gr_gpccs_rc_lane_size_v_m(void)
2096{
2097 return 0xffffffU << 0U;
2098}
2099static inline u32 gr_gpccs_rc_lane_size_v_v(u32 r)
2100{
2101 return (r >> 0U) & 0xffffffU;
2102}
2103static inline u32 gr_gpccs_rc_lane_size_v_0_v(void)
2104{
2105 return 0x00000000U;
2106}
2107static inline u32 gr_gpccs_rc_lane_size_v_0_f(void)
2108{
2109 return 0x0U;
2110}
2111static inline u32 gr_gpc0_zcull_fs_r(void)
2112{
2113 return 0x00500910U;
2114}
2115static inline u32 gr_gpc0_zcull_fs_num_sms_f(u32 v)
2116{
2117 return (v & 0x1ffU) << 0U;
2118}
2119static inline u32 gr_gpc0_zcull_fs_num_active_banks_f(u32 v)
2120{
2121 return (v & 0xfU) << 16U;
2122}
2123static inline u32 gr_gpc0_zcull_ram_addr_r(void)
2124{
2125 return 0x00500914U;
2126}
2127static inline u32 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(u32 v)
2128{
2129 return (v & 0xfU) << 0U;
2130}
2131static inline u32 gr_gpc0_zcull_ram_addr_row_offset_f(u32 v)
2132{
2133 return (v & 0xfU) << 8U;
2134}
2135static inline u32 gr_gpc0_zcull_sm_num_rcp_r(void)
2136{
2137 return 0x00500918U;
2138}
2139static inline u32 gr_gpc0_zcull_sm_num_rcp_conservative_f(u32 v)
2140{
2141 return (v & 0xffffffU) << 0U;
2142}
2143static inline u32 gr_gpc0_zcull_sm_num_rcp_conservative__max_v(void)
2144{
2145 return 0x00800000U;
2146}
2147static inline u32 gr_gpc0_zcull_total_ram_size_r(void)
2148{
2149 return 0x00500920U;
2150}
2151static inline u32 gr_gpc0_zcull_total_ram_size_num_aliquots_f(u32 v)
2152{
2153 return (v & 0xffffU) << 0U;
2154}
2155static inline u32 gr_gpc0_zcull_zcsize_r(u32 i)
2156{
2157 return 0x00500a04U + i*32U;
2158}
2159static inline u32 gr_gpc0_zcull_zcsize_height_subregion__multiple_v(void)
2160{
2161 return 0x00000040U;
2162}
2163static inline u32 gr_gpc0_zcull_zcsize_width_subregion__multiple_v(void)
2164{
2165 return 0x00000010U;
2166}
2167static inline u32 gr_gpc0_gpm_pd_active_tpcs_r(void)
2168{
2169 return 0x00500c08U;
2170}
2171static inline u32 gr_gpc0_gpm_pd_active_tpcs_num_f(u32 v)
2172{
2173 return (v & 0x7U) << 0U;
2174}
2175static inline u32 gr_gpc0_gpm_pd_sm_id_r(u32 i)
2176{
2177 return 0x00500c10U + i*4U;
2178}
2179static inline u32 gr_gpc0_gpm_pd_sm_id_id_f(u32 v)
2180{
2181 return (v & 0xffU) << 0U;
2182}
2183static inline u32 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(u32 i)
2184{
2185 return 0x00500c30U + i*4U;
2186}
2187static inline u32 gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(u32 r)
2188{
2189 return (r >> 0U) & 0xffU;
2190}
2191static inline u32 gr_gpc0_gpm_sd_active_tpcs_r(void)
2192{
2193 return 0x00500c8cU;
2194}
2195static inline u32 gr_gpc0_gpm_sd_active_tpcs_num_f(u32 v)
2196{
2197 return (v & 0x7U) << 0U;
2198}
2199static inline u32 gr_gpc0_tpc0_pe_cfg_smid_r(void)
2200{
2201 return 0x00504088U;
2202}
2203static inline u32 gr_gpc0_tpc0_pe_cfg_smid_value_f(u32 v)
2204{
2205 return (v & 0xffffU) << 0U;
2206}
2207static inline u32 gr_gpc0_tpc0_l1c_cfg_smid_r(void)
2208{
2209 return 0x005044e8U;
2210}
2211static inline u32 gr_gpc0_tpc0_l1c_cfg_smid_value_f(u32 v)
2212{
2213 return (v & 0xffffU) << 0U;
2214}
2215static inline u32 gr_gpc0_tpc0_sm_cfg_r(void)
2216{
2217 return 0x00504698U;
2218}
2219static inline u32 gr_gpc0_tpc0_sm_cfg_sm_id_f(u32 v)
2220{
2221 return (v & 0xffffU) << 0U;
2222}
2223static inline u32 gr_gpc0_tpc0_sm_cfg_sm_id_v(u32 r)
2224{
2225 return (r >> 0U) & 0xffffU;
2226}
2227static inline u32 gr_gpc0_tpc0_sm_arch_r(void)
2228{
2229 return 0x0050469cU;
2230}
2231static inline u32 gr_gpc0_tpc0_sm_arch_warp_count_v(u32 r)
2232{
2233 return (r >> 0U) & 0xffU;
2234}
2235static inline u32 gr_gpc0_tpc0_sm_arch_spa_version_v(u32 r)
2236{
2237 return (r >> 8U) & 0xfU;
2238}
2239static inline u32 gr_gpc0_tpc0_sm_arch_spa_version_smkepler_lp_v(void)
2240{
2241 return 0x0000000cU;
2242}
2243static inline u32 gr_gpc0_ppc0_pes_vsc_strem_r(void)
2244{
2245 return 0x00503018U;
2246}
2247static inline u32 gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(void)
2248{
2249 return 0x1U << 0U;
2250}
2251static inline u32 gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f(void)
2252{
2253 return 0x1U;
2254}
2255static inline u32 gr_gpc0_ppc0_cbm_cfg_r(void)
2256{
2257 return 0x005030c0U;
2258}
2259static inline u32 gr_gpc0_ppc0_cbm_cfg_start_offset_f(u32 v)
2260{
2261 return (v & 0xffffU) << 0U;
2262}
2263static inline u32 gr_gpc0_ppc0_cbm_cfg_start_offset_m(void)
2264{
2265 return 0xffffU << 0U;
2266}
2267static inline u32 gr_gpc0_ppc0_cbm_cfg_start_offset_v(u32 r)
2268{
2269 return (r >> 0U) & 0xffffU;
2270}
2271static inline u32 gr_gpc0_ppc0_cbm_cfg_size_f(u32 v)
2272{
2273 return (v & 0xfffU) << 16U;
2274}
2275static inline u32 gr_gpc0_ppc0_cbm_cfg_size_m(void)
2276{
2277 return 0xfffU << 16U;
2278}
2279static inline u32 gr_gpc0_ppc0_cbm_cfg_size_v(u32 r)
2280{
2281 return (r >> 16U) & 0xfffU;
2282}
2283static inline u32 gr_gpc0_ppc0_cbm_cfg_size_default_v(void)
2284{
2285 return 0x00000240U;
2286}
2287static inline u32 gr_gpc0_ppc0_cbm_cfg_size_granularity_v(void)
2288{
2289 return 0x00000020U;
2290}
2291static inline u32 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(u32 v)
2292{
2293 return (v & 0x1U) << 28U;
2294}
2295static inline u32 gr_gpc0_ppc0_cbm_cfg2_r(void)
2296{
2297 return 0x005030e4U;
2298}
2299static inline u32 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(u32 v)
2300{
2301 return (v & 0xffffU) << 0U;
2302}
2303static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_f(u32 v)
2304{
2305 return (v & 0xfffU) << 16U;
2306}
2307static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_m(void)
2308{
2309 return 0xfffU << 16U;
2310}
2311static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_v(u32 r)
2312{
2313 return (r >> 16U) & 0xfffU;
2314}
2315static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_default_v(void)
2316{
2317 return 0x00000648U;
2318}
2319static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_granularity_v(void)
2320{
2321 return 0x00000020U;
2322}
2323static inline u32 gr_gpccs_falcon_addr_r(void)
2324{
2325 return 0x0041a0acU;
2326}
2327static inline u32 gr_gpccs_falcon_addr_lsb_s(void)
2328{
2329 return 6U;
2330}
2331static inline u32 gr_gpccs_falcon_addr_lsb_f(u32 v)
2332{
2333 return (v & 0x3fU) << 0U;
2334}
2335static inline u32 gr_gpccs_falcon_addr_lsb_m(void)
2336{
2337 return 0x3fU << 0U;
2338}
2339static inline u32 gr_gpccs_falcon_addr_lsb_v(u32 r)
2340{
2341 return (r >> 0U) & 0x3fU;
2342}
2343static inline u32 gr_gpccs_falcon_addr_lsb_init_v(void)
2344{
2345 return 0x00000000U;
2346}
2347static inline u32 gr_gpccs_falcon_addr_lsb_init_f(void)
2348{
2349 return 0x0U;
2350}
2351static inline u32 gr_gpccs_falcon_addr_msb_s(void)
2352{
2353 return 6U;
2354}
2355static inline u32 gr_gpccs_falcon_addr_msb_f(u32 v)
2356{
2357 return (v & 0x3fU) << 6U;
2358}
2359static inline u32 gr_gpccs_falcon_addr_msb_m(void)
2360{
2361 return 0x3fU << 6U;
2362}
2363static inline u32 gr_gpccs_falcon_addr_msb_v(u32 r)
2364{
2365 return (r >> 6U) & 0x3fU;
2366}
2367static inline u32 gr_gpccs_falcon_addr_msb_init_v(void)
2368{
2369 return 0x00000000U;
2370}
2371static inline u32 gr_gpccs_falcon_addr_msb_init_f(void)
2372{
2373 return 0x0U;
2374}
2375static inline u32 gr_gpccs_falcon_addr_ext_s(void)
2376{
2377 return 12U;
2378}
2379static inline u32 gr_gpccs_falcon_addr_ext_f(u32 v)
2380{
2381 return (v & 0xfffU) << 0U;
2382}
2383static inline u32 gr_gpccs_falcon_addr_ext_m(void)
2384{
2385 return 0xfffU << 0U;
2386}
2387static inline u32 gr_gpccs_falcon_addr_ext_v(u32 r)
2388{
2389 return (r >> 0U) & 0xfffU;
2390}
2391static inline u32 gr_gpccs_cpuctl_r(void)
2392{
2393 return 0x0041a100U;
2394}
2395static inline u32 gr_gpccs_cpuctl_startcpu_f(u32 v)
2396{
2397 return (v & 0x1U) << 1U;
2398}
2399static inline u32 gr_gpccs_dmactl_r(void)
2400{
2401 return 0x0041a10cU;
2402}
2403static inline u32 gr_gpccs_dmactl_require_ctx_f(u32 v)
2404{
2405 return (v & 0x1U) << 0U;
2406}
2407static inline u32 gr_gpccs_dmactl_dmem_scrubbing_m(void)
2408{
2409 return 0x1U << 1U;
2410}
2411static inline u32 gr_gpccs_dmactl_imem_scrubbing_m(void)
2412{
2413 return 0x1U << 2U;
2414}
2415static inline u32 gr_gpccs_imemc_r(u32 i)
2416{
2417 return 0x0041a180U + i*16U;
2418}
2419static inline u32 gr_gpccs_imemc_offs_f(u32 v)
2420{
2421 return (v & 0x3fU) << 2U;
2422}
2423static inline u32 gr_gpccs_imemc_blk_f(u32 v)
2424{
2425 return (v & 0xffU) << 8U;
2426}
2427static inline u32 gr_gpccs_imemc_aincw_f(u32 v)
2428{
2429 return (v & 0x1U) << 24U;
2430}
2431static inline u32 gr_gpccs_imemd_r(u32 i)
2432{
2433 return 0x0041a184U + i*16U;
2434}
2435static inline u32 gr_gpccs_imemt_r(u32 i)
2436{
2437 return 0x0041a188U + i*16U;
2438}
2439static inline u32 gr_gpccs_imemt__size_1_v(void)
2440{
2441 return 0x00000004U;
2442}
2443static inline u32 gr_gpccs_imemt_tag_f(u32 v)
2444{
2445 return (v & 0xffffU) << 0U;
2446}
2447static inline u32 gr_gpccs_dmemc_r(u32 i)
2448{
2449 return 0x0041a1c0U + i*8U;
2450}
2451static inline u32 gr_gpccs_dmemc_offs_f(u32 v)
2452{
2453 return (v & 0x3fU) << 2U;
2454}
2455static inline u32 gr_gpccs_dmemc_blk_f(u32 v)
2456{
2457 return (v & 0xffU) << 8U;
2458}
2459static inline u32 gr_gpccs_dmemc_aincw_f(u32 v)
2460{
2461 return (v & 0x1U) << 24U;
2462}
2463static inline u32 gr_gpccs_dmemd_r(u32 i)
2464{
2465 return 0x0041a1c4U + i*8U;
2466}
2467static inline u32 gr_gpccs_ctxsw_mailbox_r(u32 i)
2468{
2469 return 0x0041a800U + i*4U;
2470}
2471static inline u32 gr_gpccs_ctxsw_mailbox_value_f(u32 v)
2472{
2473 return (v & 0xffffffffU) << 0U;
2474}
2475static inline u32 gr_gpcs_setup_bundle_cb_base_r(void)
2476{
2477 return 0x00418808U;
2478}
2479static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_s(void)
2480{
2481 return 32U;
2482}
2483static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(u32 v)
2484{
2485 return (v & 0xffffffffU) << 0U;
2486}
2487static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_m(void)
2488{
2489 return 0xffffffffU << 0U;
2490}
2491static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_v(u32 r)
2492{
2493 return (r >> 0U) & 0xffffffffU;
2494}
2495static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_init_v(void)
2496{
2497 return 0x00000000U;
2498}
2499static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_init_f(void)
2500{
2501 return 0x0U;
2502}
2503static inline u32 gr_gpcs_setup_bundle_cb_size_r(void)
2504{
2505 return 0x0041880cU;
2506}
2507static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_s(void)
2508{
2509 return 11U;
2510}
2511static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_f(u32 v)
2512{
2513 return (v & 0x7ffU) << 0U;
2514}
2515static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_m(void)
2516{
2517 return 0x7ffU << 0U;
2518}
2519static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_v(u32 r)
2520{
2521 return (r >> 0U) & 0x7ffU;
2522}
2523static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_init_v(void)
2524{
2525 return 0x00000000U;
2526}
2527static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_init_f(void)
2528{
2529 return 0x0U;
2530}
2531static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b__prod_v(void)
2532{
2533 return 0x00000018U;
2534}
2535static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b__prod_f(void)
2536{
2537 return 0x18U;
2538}
2539static inline u32 gr_gpcs_setup_bundle_cb_size_valid_s(void)
2540{
2541 return 1U;
2542}
2543static inline u32 gr_gpcs_setup_bundle_cb_size_valid_f(u32 v)
2544{
2545 return (v & 0x1U) << 31U;
2546}
2547static inline u32 gr_gpcs_setup_bundle_cb_size_valid_m(void)
2548{
2549 return 0x1U << 31U;
2550}
2551static inline u32 gr_gpcs_setup_bundle_cb_size_valid_v(u32 r)
2552{
2553 return (r >> 31U) & 0x1U;
2554}
2555static inline u32 gr_gpcs_setup_bundle_cb_size_valid_false_v(void)
2556{
2557 return 0x00000000U;
2558}
2559static inline u32 gr_gpcs_setup_bundle_cb_size_valid_false_f(void)
2560{
2561 return 0x0U;
2562}
2563static inline u32 gr_gpcs_setup_bundle_cb_size_valid_true_v(void)
2564{
2565 return 0x00000001U;
2566}
2567static inline u32 gr_gpcs_setup_bundle_cb_size_valid_true_f(void)
2568{
2569 return 0x80000000U;
2570}
2571static inline u32 gr_gpcs_setup_attrib_cb_base_r(void)
2572{
2573 return 0x00418810U;
2574}
2575static inline u32 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(u32 v)
2576{
2577 return (v & 0xfffffffU) << 0U;
2578}
2579static inline u32 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v(void)
2580{
2581 return 0x0000000cU;
2582}
2583static inline u32 gr_gpcs_setup_attrib_cb_base_valid_true_f(void)
2584{
2585 return 0x80000000U;
2586}
2587static inline u32 gr_crstr_gpc_map0_r(void)
2588{
2589 return 0x00418b08U;
2590}
2591static inline u32 gr_crstr_gpc_map0_tile0_f(u32 v)
2592{
2593 return (v & 0x7U) << 0U;
2594}
2595static inline u32 gr_crstr_gpc_map0_tile1_f(u32 v)
2596{
2597 return (v & 0x7U) << 5U;
2598}
2599static inline u32 gr_crstr_gpc_map0_tile2_f(u32 v)
2600{
2601 return (v & 0x7U) << 10U;
2602}
2603static inline u32 gr_crstr_gpc_map0_tile3_f(u32 v)
2604{
2605 return (v & 0x7U) << 15U;
2606}
2607static inline u32 gr_crstr_gpc_map0_tile4_f(u32 v)
2608{
2609 return (v & 0x7U) << 20U;
2610}
2611static inline u32 gr_crstr_gpc_map0_tile5_f(u32 v)
2612{
2613 return (v & 0x7U) << 25U;
2614}
2615static inline u32 gr_crstr_gpc_map1_r(void)
2616{
2617 return 0x00418b0cU;
2618}
2619static inline u32 gr_crstr_gpc_map1_tile6_f(u32 v)
2620{
2621 return (v & 0x7U) << 0U;
2622}
2623static inline u32 gr_crstr_gpc_map1_tile7_f(u32 v)
2624{
2625 return (v & 0x7U) << 5U;
2626}
2627static inline u32 gr_crstr_gpc_map1_tile8_f(u32 v)
2628{
2629 return (v & 0x7U) << 10U;
2630}
2631static inline u32 gr_crstr_gpc_map1_tile9_f(u32 v)
2632{
2633 return (v & 0x7U) << 15U;
2634}
2635static inline u32 gr_crstr_gpc_map1_tile10_f(u32 v)
2636{
2637 return (v & 0x7U) << 20U;
2638}
2639static inline u32 gr_crstr_gpc_map1_tile11_f(u32 v)
2640{
2641 return (v & 0x7U) << 25U;
2642}
2643static inline u32 gr_crstr_gpc_map2_r(void)
2644{
2645 return 0x00418b10U;
2646}
2647static inline u32 gr_crstr_gpc_map2_tile12_f(u32 v)
2648{
2649 return (v & 0x7U) << 0U;
2650}
2651static inline u32 gr_crstr_gpc_map2_tile13_f(u32 v)
2652{
2653 return (v & 0x7U) << 5U;
2654}
2655static inline u32 gr_crstr_gpc_map2_tile14_f(u32 v)
2656{
2657 return (v & 0x7U) << 10U;
2658}
2659static inline u32 gr_crstr_gpc_map2_tile15_f(u32 v)
2660{
2661 return (v & 0x7U) << 15U;
2662}
2663static inline u32 gr_crstr_gpc_map2_tile16_f(u32 v)
2664{
2665 return (v & 0x7U) << 20U;
2666}
2667static inline u32 gr_crstr_gpc_map2_tile17_f(u32 v)
2668{
2669 return (v & 0x7U) << 25U;
2670}
2671static inline u32 gr_crstr_gpc_map3_r(void)
2672{
2673 return 0x00418b14U;
2674}
2675static inline u32 gr_crstr_gpc_map3_tile18_f(u32 v)
2676{
2677 return (v & 0x7U) << 0U;
2678}
2679static inline u32 gr_crstr_gpc_map3_tile19_f(u32 v)
2680{
2681 return (v & 0x7U) << 5U;
2682}
2683static inline u32 gr_crstr_gpc_map3_tile20_f(u32 v)
2684{
2685 return (v & 0x7U) << 10U;
2686}
2687static inline u32 gr_crstr_gpc_map3_tile21_f(u32 v)
2688{
2689 return (v & 0x7U) << 15U;
2690}
2691static inline u32 gr_crstr_gpc_map3_tile22_f(u32 v)
2692{
2693 return (v & 0x7U) << 20U;
2694}
2695static inline u32 gr_crstr_gpc_map3_tile23_f(u32 v)
2696{
2697 return (v & 0x7U) << 25U;
2698}
2699static inline u32 gr_crstr_gpc_map4_r(void)
2700{
2701 return 0x00418b18U;
2702}
2703static inline u32 gr_crstr_gpc_map4_tile24_f(u32 v)
2704{
2705 return (v & 0x7U) << 0U;
2706}
2707static inline u32 gr_crstr_gpc_map4_tile25_f(u32 v)
2708{
2709 return (v & 0x7U) << 5U;
2710}
2711static inline u32 gr_crstr_gpc_map4_tile26_f(u32 v)
2712{
2713 return (v & 0x7U) << 10U;
2714}
2715static inline u32 gr_crstr_gpc_map4_tile27_f(u32 v)
2716{
2717 return (v & 0x7U) << 15U;
2718}
2719static inline u32 gr_crstr_gpc_map4_tile28_f(u32 v)
2720{
2721 return (v & 0x7U) << 20U;
2722}
2723static inline u32 gr_crstr_gpc_map4_tile29_f(u32 v)
2724{
2725 return (v & 0x7U) << 25U;
2726}
2727static inline u32 gr_crstr_gpc_map5_r(void)
2728{
2729 return 0x00418b1cU;
2730}
2731static inline u32 gr_crstr_gpc_map5_tile30_f(u32 v)
2732{
2733 return (v & 0x7U) << 0U;
2734}
2735static inline u32 gr_crstr_gpc_map5_tile31_f(u32 v)
2736{
2737 return (v & 0x7U) << 5U;
2738}
2739static inline u32 gr_crstr_gpc_map5_tile32_f(u32 v)
2740{
2741 return (v & 0x7U) << 10U;
2742}
2743static inline u32 gr_crstr_gpc_map5_tile33_f(u32 v)
2744{
2745 return (v & 0x7U) << 15U;
2746}
2747static inline u32 gr_crstr_gpc_map5_tile34_f(u32 v)
2748{
2749 return (v & 0x7U) << 20U;
2750}
2751static inline u32 gr_crstr_gpc_map5_tile35_f(u32 v)
2752{
2753 return (v & 0x7U) << 25U;
2754}
2755static inline u32 gr_crstr_map_table_cfg_r(void)
2756{
2757 return 0x00418bb8U;
2758}
2759static inline u32 gr_crstr_map_table_cfg_row_offset_f(u32 v)
2760{
2761 return (v & 0xffU) << 0U;
2762}
2763static inline u32 gr_crstr_map_table_cfg_num_entries_f(u32 v)
2764{
2765 return (v & 0xffU) << 8U;
2766}
2767static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_r(void)
2768{
2769 return 0x00418980U;
2770}
2771static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(u32 v)
2772{
2773 return (v & 0x7U) << 0U;
2774}
2775static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(u32 v)
2776{
2777 return (v & 0x7U) << 4U;
2778}
2779static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(u32 v)
2780{
2781 return (v & 0x7U) << 8U;
2782}
2783static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(u32 v)
2784{
2785 return (v & 0x7U) << 12U;
2786}
2787static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(u32 v)
2788{
2789 return (v & 0x7U) << 16U;
2790}
2791static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(u32 v)
2792{
2793 return (v & 0x7U) << 20U;
2794}
2795static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(u32 v)
2796{
2797 return (v & 0x7U) << 24U;
2798}
2799static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(u32 v)
2800{
2801 return (v & 0x7U) << 28U;
2802}
2803static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_r(void)
2804{
2805 return 0x00418984U;
2806}
2807static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(u32 v)
2808{
2809 return (v & 0x7U) << 0U;
2810}
2811static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(u32 v)
2812{
2813 return (v & 0x7U) << 4U;
2814}
2815static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(u32 v)
2816{
2817 return (v & 0x7U) << 8U;
2818}
2819static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(u32 v)
2820{
2821 return (v & 0x7U) << 12U;
2822}
2823static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(u32 v)
2824{
2825 return (v & 0x7U) << 16U;
2826}
2827static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(u32 v)
2828{
2829 return (v & 0x7U) << 20U;
2830}
2831static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(u32 v)
2832{
2833 return (v & 0x7U) << 24U;
2834}
2835static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(u32 v)
2836{
2837 return (v & 0x7U) << 28U;
2838}
2839static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_r(void)
2840{
2841 return 0x00418988U;
2842}
2843static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(u32 v)
2844{
2845 return (v & 0x7U) << 0U;
2846}
2847static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(u32 v)
2848{
2849 return (v & 0x7U) << 4U;
2850}
2851static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(u32 v)
2852{
2853 return (v & 0x7U) << 8U;
2854}
2855static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(u32 v)
2856{
2857 return (v & 0x7U) << 12U;
2858}
2859static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(u32 v)
2860{
2861 return (v & 0x7U) << 16U;
2862}
2863static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(u32 v)
2864{
2865 return (v & 0x7U) << 20U;
2866}
2867static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(u32 v)
2868{
2869 return (v & 0x7U) << 24U;
2870}
2871static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_s(void)
2872{
2873 return 3U;
2874}
2875static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(u32 v)
2876{
2877 return (v & 0x7U) << 28U;
2878}
2879static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_m(void)
2880{
2881 return 0x7U << 28U;
2882}
2883static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_v(u32 r)
2884{
2885 return (r >> 28U) & 0x7U;
2886}
2887static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_r(void)
2888{
2889 return 0x0041898cU;
2890}
2891static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(u32 v)
2892{
2893 return (v & 0x7U) << 0U;
2894}
2895static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(u32 v)
2896{
2897 return (v & 0x7U) << 4U;
2898}
2899static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(u32 v)
2900{
2901 return (v & 0x7U) << 8U;
2902}
2903static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(u32 v)
2904{
2905 return (v & 0x7U) << 12U;
2906}
2907static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(u32 v)
2908{
2909 return (v & 0x7U) << 16U;
2910}
2911static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(u32 v)
2912{
2913 return (v & 0x7U) << 20U;
2914}
2915static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(u32 v)
2916{
2917 return (v & 0x7U) << 24U;
2918}
2919static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(u32 v)
2920{
2921 return (v & 0x7U) << 28U;
2922}
2923static inline u32 gr_gpcs_gpm_pd_cfg_r(void)
2924{
2925 return 0x00418c6cU;
2926}
2927static inline u32 gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f(void)
2928{
2929 return 0x0U;
2930}
2931static inline u32 gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f(void)
2932{
2933 return 0x1U;
2934}
2935static inline u32 gr_gpcs_gcc_pagepool_base_r(void)
2936{
2937 return 0x00419004U;
2938}
2939static inline u32 gr_gpcs_gcc_pagepool_base_addr_39_8_f(u32 v)
2940{
2941 return (v & 0xffffffffU) << 0U;
2942}
2943static inline u32 gr_gpcs_gcc_pagepool_r(void)
2944{
2945 return 0x00419008U;
2946}
2947static inline u32 gr_gpcs_gcc_pagepool_total_pages_f(u32 v)
2948{
2949 return (v & 0xffU) << 0U;
2950}
2951static inline u32 gr_gpcs_tpcs_pe_vaf_r(void)
2952{
2953 return 0x0041980cU;
2954}
2955static inline u32 gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f(void)
2956{
2957 return 0x10U;
2958}
2959static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(void)
2960{
2961 return 0x00419848U;
2962}
2963static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(u32 v)
2964{
2965 return (v & 0xfffffffU) << 0U;
2966}
2967static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_f(u32 v)
2968{
2969 return (v & 0x1U) << 28U;
2970}
2971static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(void)
2972{
2973 return 0x10000000U;
2974}
2975static inline u32 gr_gpcs_tpcs_mpc_vtg_debug_r(void)
2976{
2977 return 0x00419c00U;
2978}
2979static inline u32 gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f(void)
2980{
2981 return 0x0U;
2982}
2983static inline u32 gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f(void)
2984{
2985 return 0x8U;
2986}
2987static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(void)
2988{
2989 return 0x00419e44U;
2990}
2991static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f(void)
2992{
2993 return 0x2U;
2994}
2995static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f(void)
2996{
2997 return 0x4U;
2998}
2999static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f(void)
3000{
3001 return 0x8U;
3002}
3003static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f(void)
3004{
3005 return 0x10U;
3006}
3007static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f(void)
3008{
3009 return 0x20U;
3010}
3011static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f(void)
3012{
3013 return 0x40U;
3014}
3015static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f(void)
3016{
3017 return 0x80U;
3018}
3019static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f(void)
3020{
3021 return 0x100U;
3022}
3023static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f(void)
3024{
3025 return 0x200U;
3026}
3027static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f(void)
3028{
3029 return 0x400U;
3030}
3031static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f(void)
3032{
3033 return 0x800U;
3034}
3035static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f(void)
3036{
3037 return 0x1000U;
3038}
3039static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f(void)
3040{
3041 return 0x2000U;
3042}
3043static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f(void)
3044{
3045 return 0x4000U;
3046}
3047static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f(void)
3048{
3049 return 0x8000U;
3050}
3051static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f(void)
3052{
3053 return 0x10000U;
3054}
3055static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f(void)
3056{
3057 return 0x20000U;
3058}
3059static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f(void)
3060{
3061 return 0x40000U;
3062}
3063static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f(void)
3064{
3065 return 0x80000U;
3066}
3067static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f(void)
3068{
3069 return 0x100000U;
3070}
3071static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(void)
3072{
3073 return 0x00419e4cU;
3074}
3075static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f(void)
3076{
3077 return 0x1U;
3078}
3079static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f(void)
3080{
3081 return 0x2U;
3082}
3083static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f(void)
3084{
3085 return 0x4U;
3086}
3087static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f(void)
3088{
3089 return 0x8U;
3090}
3091static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f(void)
3092{
3093 return 0x10U;
3094}
3095static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f(void)
3096{
3097 return 0x20U;
3098}
3099static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f(void)
3100{
3101 return 0x40U;
3102}
3103static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_en_r(void)
3104{
3105 return 0x00419d0cU;
3106}
3107static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_en_sm_enabled_f(void)
3108{
3109 return 0x2U;
3110}
3111static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_en_tex_enabled_f(void)
3112{
3113 return 0x1U;
3114}
3115static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_en_r(void)
3116{
3117 return 0x0050450cU;
3118}
3119static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_v(u32 r)
3120{
3121 return (r >> 1U) & 0x1U;
3122}
3123static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f(void)
3124{
3125 return 0x2U;
3126}
3127static inline u32 gr_gpcs_gpccs_gpc_exception_en_r(void)
3128{
3129 return 0x0041ac94U;
3130}
3131static inline u32 gr_gpcs_gpccs_gpc_exception_en_tpc_f(u32 v)
3132{
3133 return (v & 0xffU) << 16U;
3134}
3135static inline u32 gr_gpc0_gpccs_gpc_exception_r(void)
3136{
3137 return 0x00502c90U;
3138}
3139static inline u32 gr_gpc0_gpccs_gpc_exception_gcc_v(u32 r)
3140{
3141 return (r >> 2U) & 0x1U;
3142}
3143static inline u32 gr_gpc0_gpccs_gpc_exception_tpc_v(u32 r)
3144{
3145 return (r >> 16U) & 0xffU;
3146}
3147static inline u32 gr_gpc0_gpccs_gpc_exception_tpc_0_pending_v(void)
3148{
3149 return 0x00000001U;
3150}
3151static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_r(void)
3152{
3153 return 0x00504508U;
3154}
3155static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_tex_v(u32 r)
3156{
3157 return (r >> 0U) & 0x1U;
3158}
3159static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_tex_pending_v(void)
3160{
3161 return 0x00000001U;
3162}
3163static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_sm_v(u32 r)
3164{
3165 return (r >> 1U) & 0x1U;
3166}
3167static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_sm_pending_v(void)
3168{
3169 return 0x00000001U;
3170}
3171static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_r(void)
3172{
3173 return 0x00504610U;
3174}
3175static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m(void)
3176{
3177 return 0x1U << 0U;
3178}
3179static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(u32 r)
3180{
3181 return (r >> 0U) & 0x1U;
3182}
3183static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v(void)
3184{
3185 return 0x00000001U;
3186}
3187static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_f(void)
3188{
3189 return 0x1U;
3190}
3191static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_off_v(void)
3192{
3193 return 0x00000000U;
3194}
3195static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_off_f(void)
3196{
3197 return 0x0U;
3198}
3199static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f(void)
3200{
3201 return 0x80000000U;
3202}
3203static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_disable_f(void)
3204{
3205 return 0x0U;
3206}
3207static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_single_step_mode_enable_f(void)
3208{
3209 return 0x8U;
3210}
3211static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_single_step_mode_disable_f(void)
3212{
3213 return 0x0U;
3214}
3215static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_run_trigger_task_f(void)
3216{
3217 return 0x40000000U;
3218}
3219static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_m(void)
3220{
3221 return 0x1U << 1U;
3222}
3223static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_v(u32 r)
3224{
3225 return (r >> 1U) & 0x1U;
3226}
3227static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_disable_f(void)
3228{
3229 return 0x0U;
3230}
3231static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_m(void)
3232{
3233 return 0x1U << 2U;
3234}
3235static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_v(u32 r)
3236{
3237 return (r >> 2U) & 0x1U;
3238}
3239static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_disable_f(void)
3240{
3241 return 0x0U;
3242}
3243static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_stop_on_any_warp_disable_v(void)
3244{
3245 return 0x00000000U;
3246}
3247static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_stop_on_any_sm_disable_v(void)
3248{
3249 return 0x00000000U;
3250}
3251static inline u32 gr_gpc0_tpc0_sm_warp_valid_mask_r(void)
3252{
3253 return 0x00504614U;
3254}
3255static inline u32 gr_gpc0_tpc0_sm_warp_valid_mask_1_r(void)
3256{
3257 return 0x00504618U;
3258}
3259static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r(void)
3260{
3261 return 0x00504624U;
3262}
3263static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_1_r(void)
3264{
3265 return 0x00504628U;
3266}
3267static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r(void)
3268{
3269 return 0x00504634U;
3270}
3271static inline u32 gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_1_r(void)
3272{
3273 return 0x00504638U;
3274}
3275static inline u32 gr_gpcs_tpcs_sm_dbgr_bpt_pause_mask_r(void)
3276{
3277 return 0x00419e24U;
3278}
3279static inline u32 gr_gpc0_tpc0_sm_dbgr_status0_r(void)
3280{
3281 return 0x0050460cU;
3282}
3283static inline u32 gr_gpc0_tpc0_sm_dbgr_status0_sm_in_trap_mode_v(u32 r)
3284{
3285 return (r >> 0U) & 0x1U;
3286}
3287static inline u32 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(u32 r)
3288{
3289 return (r >> 4U) & 0x1U;
3290}
3291static inline u32 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v(void)
3292{
3293 return 0x00000001U;
3294}
3295static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_r(void)
3296{
3297 return 0x00419e50U;
3298}
3299static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_bpt_int_pending_f(void)
3300{
3301 return 0x10U;
3302}
3303static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_bpt_pause_pending_f(void)
3304{
3305 return 0x20U;
3306}
3307static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_single_step_complete_pending_f(void)
3308{
3309 return 0x40U;
3310}
3311static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_sm_to_sm_fault_pending_f(void)
3312{
3313 return 0x1U;
3314}
3315static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_l1_error_pending_f(void)
3316{
3317 return 0x2U;
3318}
3319static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_multiple_warp_errors_pending_f(void)
3320{
3321 return 0x4U;
3322}
3323static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_physical_stack_overflow_error_pending_f(void)
3324{
3325 return 0x8U;
3326}
3327static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_timeout_error_pending_f(void)
3328{
3329 return 0x80000000U;
3330}
3331static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_r(void)
3332{
3333 return 0x00504650U;
3334}
3335static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f(void)
3336{
3337 return 0x10U;
3338}
3339static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f(void)
3340{
3341 return 0x20U;
3342}
3343static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f(void)
3344{
3345 return 0x40U;
3346}
3347static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_sm_to_sm_fault_pending_f(void)
3348{
3349 return 0x1U;
3350}
3351static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_l1_error_pending_f(void)
3352{
3353 return 0x2U;
3354}
3355static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_multiple_warp_errors_pending_f(void)
3356{
3357 return 0x4U;
3358}
3359static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_physical_stack_overflow_error_pending_f(void)
3360{
3361 return 0x8U;
3362}
3363static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_timeout_error_pending_f(void)
3364{
3365 return 0x80000000U;
3366}
3367static inline u32 gr_gpc0_tpc0_tex_m_hww_esr_r(void)
3368{
3369 return 0x00504224U;
3370}
3371static inline u32 gr_gpc0_tpc0_tex_m_hww_esr_intr_pending_f(void)
3372{
3373 return 0x1U;
3374}
3375static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_r(void)
3376{
3377 return 0x00504648U;
3378}
3379static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_error_v(u32 r)
3380{
3381 return (r >> 0U) & 0xffffU;
3382}
3383static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v(void)
3384{
3385 return 0x00000000U;
3386}
3387static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f(void)
3388{
3389 return 0x0U;
3390}
3391static inline u32 gr_gpc0_tpc0_sm_halfctl_ctrl_r(void)
3392{
3393 return 0x00504770U;
3394}
3395static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_r(void)
3396{
3397 return 0x00419f70U;
3398}
3399static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(void)
3400{
3401 return 0x1U << 4U;
3402}
3403static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_f(u32 v)
3404{
3405 return (v & 0x1U) << 4U;
3406}
3407static inline u32 gr_gpc0_tpc0_sm_debug_sfe_control_r(void)
3408{
3409 return 0x0050477cU;
3410}
3411static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_r(void)
3412{
3413 return 0x00419f7cU;
3414}
3415static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(void)
3416{
3417 return 0x1U << 0U;
3418}
3419static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_f(u32 v)
3420{
3421 return (v & 0x1U) << 0U;
3422}
3423static inline u32 gr_gpcs_tpcs_pes_vsc_vpc_r(void)
3424{
3425 return 0x0041be08U;
3426}
3427static inline u32 gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f(void)
3428{
3429 return 0x4U;
3430}
3431static inline u32 gr_ppcs_wwdx_map_gpc_map0_r(void)
3432{
3433 return 0x0041bf00U;
3434}
3435static inline u32 gr_ppcs_wwdx_map_gpc_map1_r(void)
3436{
3437 return 0x0041bf04U;
3438}
3439static inline u32 gr_ppcs_wwdx_map_gpc_map2_r(void)
3440{
3441 return 0x0041bf08U;
3442}
3443static inline u32 gr_ppcs_wwdx_map_gpc_map3_r(void)
3444{
3445 return 0x0041bf0cU;
3446}
3447static inline u32 gr_ppcs_wwdx_map_gpc_map4_r(void)
3448{
3449 return 0x0041bf10U;
3450}
3451static inline u32 gr_ppcs_wwdx_map_gpc_map5_r(void)
3452{
3453 return 0x0041bf14U;
3454}
3455static inline u32 gr_ppcs_wwdx_map_table_cfg_r(void)
3456{
3457 return 0x0041bfd0U;
3458}
3459static inline u32 gr_ppcs_wwdx_map_table_cfg_row_offset_f(u32 v)
3460{
3461 return (v & 0xffU) << 0U;
3462}
3463static inline u32 gr_ppcs_wwdx_map_table_cfg_num_entries_f(u32 v)
3464{
3465 return (v & 0xffU) << 8U;
3466}
3467static inline u32 gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(u32 v)
3468{
3469 return (v & 0x1fU) << 16U;
3470}
3471static inline u32 gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(u32 v)
3472{
3473 return (v & 0x7U) << 21U;
3474}
3475static inline u32 gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(u32 v)
3476{
3477 return (v & 0x1fU) << 24U;
3478}
3479static inline u32 gr_gpcs_ppcs_wwdx_sm_num_rcp_r(void)
3480{
3481 return 0x0041bfd4U;
3482}
3483static inline u32 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(u32 v)
3484{
3485 return (v & 0xffffffU) << 0U;
3486}
3487static inline u32 gr_ppcs_wwdx_map_table_cfg2_r(void)
3488{
3489 return 0x0041bfe4U;
3490}
3491static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(u32 v)
3492{
3493 return (v & 0x1fU) << 0U;
3494}
3495static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(u32 v)
3496{
3497 return (v & 0x1fU) << 5U;
3498}
3499static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(u32 v)
3500{
3501 return (v & 0x1fU) << 10U;
3502}
3503static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(u32 v)
3504{
3505 return (v & 0x1fU) << 15U;
3506}
3507static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(u32 v)
3508{
3509 return (v & 0x1fU) << 20U;
3510}
3511static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(u32 v)
3512{
3513 return (v & 0x1fU) << 25U;
3514}
3515static inline u32 gr_gpcs_ppcs_cbm_cfg_r(void)
3516{
3517 return 0x0041bec0U;
3518}
3519static inline u32 gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v(void)
3520{
3521 return 0x00000001U;
3522}
3523static inline u32 gr_bes_zrop_settings_r(void)
3524{
3525 return 0x00408850U;
3526}
3527static inline u32 gr_bes_zrop_settings_num_active_fbps_f(u32 v)
3528{
3529 return (v & 0xfU) << 0U;
3530}
3531static inline u32 gr_bes_crop_settings_r(void)
3532{
3533 return 0x00408958U;
3534}
3535static inline u32 gr_bes_crop_settings_num_active_fbps_f(u32 v)
3536{
3537 return (v & 0xfU) << 0U;
3538}
3539static inline u32 gr_zcull_bytes_per_aliquot_per_gpu_v(void)
3540{
3541 return 0x00000020U;
3542}
3543static inline u32 gr_zcull_save_restore_header_bytes_per_gpc_v(void)
3544{
3545 return 0x00000020U;
3546}
3547static inline u32 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v(void)
3548{
3549 return 0x000000c0U;
3550}
3551static inline u32 gr_zcull_subregion_qty_v(void)
3552{
3553 return 0x00000010U;
3554}
3555static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r(void)
3556{
3557 return 0x00504604U;
3558}
3559static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r(void)
3560{
3561 return 0x00504608U;
3562}
3563static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r(void)
3564{
3565 return 0x0050465cU;
3566}
3567static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r(void)
3568{
3569 return 0x00504660U;
3570}
3571static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r(void)
3572{
3573 return 0x00504664U;
3574}
3575static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r(void)
3576{
3577 return 0x00504668U;
3578}
3579static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r(void)
3580{
3581 return 0x0050466cU;
3582}
3583static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r(void)
3584{
3585 return 0x00504658U;
3586}
3587static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r(void)
3588{
3589 return 0x00504670U;
3590}
3591static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r(void)
3592{
3593 return 0x00504694U;
3594}
3595static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r(void)
3596{
3597 return 0x00504730U;
3598}
3599static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r(void)
3600{
3601 return 0x00504734U;
3602}
3603static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r(void)
3604{
3605 return 0x00504738U;
3606}
3607static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r(void)
3608{
3609 return 0x0050473cU;
3610}
3611static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r(void)
3612{
3613 return 0x00504740U;
3614}
3615static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r(void)
3616{
3617 return 0x00504744U;
3618}
3619static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r(void)
3620{
3621 return 0x00504748U;
3622}
3623static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r(void)
3624{
3625 return 0x0050474cU;
3626}
3627static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r(void)
3628{
3629 return 0x00504674U;
3630}
3631static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r(void)
3632{
3633 return 0x00504678U;
3634}
3635static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r(void)
3636{
3637 return 0x0050467cU;
3638}
3639static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r(void)
3640{
3641 return 0x00504680U;
3642}
3643static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r(void)
3644{
3645 return 0x00504684U;
3646}
3647static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r(void)
3648{
3649 return 0x00504688U;
3650}
3651static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r(void)
3652{
3653 return 0x0050468cU;
3654}
3655static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r(void)
3656{
3657 return 0x00504690U;
3658}
3659static inline u32 gr_fe_pwr_mode_r(void)
3660{
3661 return 0x00404170U;
3662}
3663static inline u32 gr_fe_pwr_mode_mode_auto_f(void)
3664{
3665 return 0x0U;
3666}
3667static inline u32 gr_fe_pwr_mode_mode_force_on_f(void)
3668{
3669 return 0x2U;
3670}
3671static inline u32 gr_fe_pwr_mode_req_v(u32 r)
3672{
3673 return (r >> 4U) & 0x1U;
3674}
3675static inline u32 gr_fe_pwr_mode_req_send_f(void)
3676{
3677 return 0x10U;
3678}
3679static inline u32 gr_fe_pwr_mode_req_done_v(void)
3680{
3681 return 0x00000000U;
3682}
3683static inline u32 gr_gpc0_tpc0_l1c_dbg_r(void)
3684{
3685 return 0x005044b0U;
3686}
3687static inline u32 gr_gpc0_tpc0_l1c_dbg_cya15_en_f(void)
3688{
3689 return 0x8000000U;
3690}
3691static inline u32 gr_gpcs_tpcs_sm_sch_texlock_r(void)
3692{
3693 return 0x00419ec8U;
3694}
3695static inline u32 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_m(void)
3696{
3697 return 0x1U << 0U;
3698}
3699static inline u32 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_disable_f(void)
3700{
3701 return 0x0U;
3702}
3703static inline u32 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_m(void)
3704{
3705 return 0x1U << 1U;
3706}
3707static inline u32 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tile_disable_f(void)
3708{
3709 return 0x0U;
3710}
3711static inline u32 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_m(void)
3712{
3713 return 0x1U << 2U;
3714}
3715static inline u32 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_phase_disable_f(void)
3716{
3717 return 0x0U;
3718}
3719static inline u32 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_m(void)
3720{
3721 return 0x1U << 3U;
3722}
3723static inline u32 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_tex_disable_f(void)
3724{
3725 return 0x0U;
3726}
3727static inline u32 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_m(void)
3728{
3729 return 0xffU << 4U;
3730}
3731static inline u32 gr_gpcs_tpcs_sm_sch_texlock_tex_hash_timeout_disable_f(void)
3732{
3733 return 0x0U;
3734}
3735static inline u32 gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_m(void)
3736{
3737 return 0x1U << 16U;
3738}
3739static inline u32 gr_gpcs_tpcs_sm_sch_texlock_dot_t_unlock_disable_f(void)
3740{
3741 return 0x0U;
3742}
3743static inline u32 gr_gpcs_tpcs_sm_sch_macro_sched_r(void)
3744{
3745 return 0x00419eacU;
3746}
3747static inline u32 gr_gpcs_tpcs_sm_sch_macro_sched_lockboost_size_f(u32 v)
3748{
3749 return (v & 0x1U) << 2U;
3750}
3751static inline u32 gr_gpcs_tpcs_sm_sch_macro_sched_lockboost_size_m(void)
3752{
3753 return 0x1U << 2U;
3754}
3755static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_r(void)
3756{
3757 return 0x00419e10U;
3758}
3759static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_debugger_mode_f(u32 v)
3760{
3761 return (v & 0x1U) << 0U;
3762}
3763static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_debugger_mode_on_v(void)
3764{
3765 return 0x00000001U;
3766}
3767static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_m(void)
3768{
3769 return 0x1U << 31U;
3770}
3771static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_v(u32 r)
3772{
3773 return (r >> 31U) & 0x1U;
3774}
3775static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f(void)
3776{
3777 return 0x80000000U;
3778}
3779static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_disable_f(void)
3780{
3781 return 0x0U;
3782}
3783static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_single_step_mode_m(void)
3784{
3785 return 0x1U << 3U;
3786}
3787static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_single_step_mode_enable_f(void)
3788{
3789 return 0x8U;
3790}
3791static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_single_step_mode_disable_f(void)
3792{
3793 return 0x0U;
3794}
3795static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_m(void)
3796{
3797 return 0x1U << 30U;
3798}
3799static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_v(u32 r)
3800{
3801 return (r >> 30U) & 0x1U;
3802}
3803static inline u32 gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f(void)
3804{
3805 return 0x40000000U;
3806}
3807#endif
diff --git a/include/gk20a/hw_ltc_gk20a.h b/include/gk20a/hw_ltc_gk20a.h
new file mode 100644
index 0000000..efe7f98
--- /dev/null
+++ b/include/gk20a/hw_ltc_gk20a.h
@@ -0,0 +1,455 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
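As a quick illustration of these suffix conventions (an editor's sketch, not part of the generated header; the example_ function name is invented for illustration), the _m()/_f() pair from the ZBC depth-clear register defined later in this file can be combined to update one field inside a previously read register value:

static inline u32 example_set_depth_clear(u32 reg, u32 depth)
{
	/* _m() clears the old field, _f() ORs in the new, shifted value. */
	reg &= ~ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_m();
	reg |= ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_f(depth);
	return reg;
}

The updated value would then be written back at the offset returned by ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(); the actual register read/write helpers are driver-specific and omitted here.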
56#ifndef _hw_ltc_gk20a_h_
57#define _hw_ltc_gk20a_h_
58
59static inline u32 ltc_pltcg_base_v(void)
60{
61 return 0x00140000U;
62}
63static inline u32 ltc_pltcg_extent_v(void)
64{
65 return 0x0017ffffU;
66}
67static inline u32 ltc_ltcs_lts0_cbc_ctrl1_r(void)
68{
69 return 0x001410c8U;
70}
71static inline u32 ltc_ltc0_lts0_dstg_cfg0_r(void)
72{
73 return 0x00141200U;
74}
75static inline u32 ltc_ltcs_ltss_dstg_cfg0_r(void)
76{
77 return 0x0017ea00U;
78}
79static inline u32 ltc_ltc0_lts0_tstg_cfg1_r(void)
80{
81 return 0x00141104U;
82}
83static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_ways_v(u32 r)
84{
85 return (r >> 0U) & 0xffffU;
86}
87static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_v(u32 r)
88{
89 return (r >> 16U) & 0x3U;
90}
91static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_all_v(void)
92{
93 return 0x00000000U;
94}
95static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_half_v(void)
96{
97 return 0x00000001U;
98}
99static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_quarter_v(void)
100{
101 return 0x00000002U;
102}
103static inline u32 ltc_ltcs_ltss_cbc_ctrl1_r(void)
104{
105 return 0x0017e8c8U;
106}
107static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clean_active_f(void)
108{
109 return 0x1U;
110}
111static inline u32 ltc_ltcs_ltss_cbc_ctrl1_invalidate_active_f(void)
112{
113 return 0x2U;
114}
115static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_v(u32 r)
116{
117 return (r >> 2U) & 0x1U;
118}
119static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_active_v(void)
120{
121 return 0x00000001U;
122}
123static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_active_f(void)
124{
125 return 0x4U;
126}
127static inline u32 ltc_ltc0_lts0_cbc_ctrl1_r(void)
128{
129 return 0x001410c8U;
130}
131static inline u32 ltc_ltcs_ltss_cbc_ctrl2_r(void)
132{
133 return 0x0017e8ccU;
134}
135static inline u32 ltc_ltcs_ltss_cbc_ctrl2_clear_lower_bound_f(u32 v)
136{
137 return (v & 0x1ffffU) << 0U;
138}
139static inline u32 ltc_ltcs_ltss_cbc_ctrl3_r(void)
140{
141 return 0x0017e8d0U;
142}
143static inline u32 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_f(u32 v)
144{
145 return (v & 0x1ffffU) << 0U;
146}
147static inline u32 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_init_v(void)
148{
149 return 0x0001ffffU;
150}
151static inline u32 ltc_ltcs_ltss_cbc_base_r(void)
152{
153 return 0x0017e8d4U;
154}
155static inline u32 ltc_ltcs_ltss_cbc_base_alignment_shift_v(void)
156{
157 return 0x0000000bU;
158}
159static inline u32 ltc_ltcs_ltss_cbc_base_address_v(u32 r)
160{
161 return (r >> 0U) & 0x3ffffffU;
162}
163static inline u32 ltc_ltcs_ltss_cbc_param_r(void)
164{
165 return 0x0017e8dcU;
166}
167static inline u32 ltc_ltcs_ltss_cbc_param_comptags_per_cache_line_v(u32 r)
168{
169 return (r >> 0U) & 0xffffU;
170}
171static inline u32 ltc_ltcs_ltss_cbc_param_cache_line_size_v(u32 r)
172{
173 return (r >> 24U) & 0xfU;
174}
175static inline u32 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(u32 r)
176{
177 return (r >> 28U) & 0xfU;
178}
179static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_r(void)
180{
181 return 0x0017e91cU;
182}
183static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(u32 v)
184{
185 return (v & 0x1fU) << 16U;
186}
187static inline u32 ltc_ltcs_ltss_dstg_zbc_index_r(void)
188{
189 return 0x0017ea44U;
190}
191static inline u32 ltc_ltcs_ltss_dstg_zbc_index_address_f(u32 v)
192{
193 return (v & 0xfU) << 0U;
194}
195static inline u32 ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(u32 i)
196{
197 return 0x0017ea48U + i*4U;
198}
199static inline u32 ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(void)
200{
201 return 0x00000004U;
202}
203static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(void)
204{
205 return 0x0017ea58U;
206}
207static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_s(void)
208{
209 return 32U;
210}
211static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_f(u32 v)
212{
213 return (v & 0xffffffffU) << 0U;
214}
215static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_m(void)
216{
217 return 0xffffffffU << 0U;
218}
219static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_v(u32 r)
220{
221 return (r >> 0U) & 0xffffffffU;
222}
223static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_2_r(void)
224{
225 return 0x0017e924U;
226}
227static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_2_l2_bypass_mode_enabled_f(void)
228{
229 return 0x10000000U;
230}
231static inline u32 ltc_ltcs_ltss_g_elpg_r(void)
232{
233 return 0x0017e828U;
234}
235static inline u32 ltc_ltcs_ltss_g_elpg_flush_v(u32 r)
236{
237 return (r >> 0U) & 0x1U;
238}
239static inline u32 ltc_ltcs_ltss_g_elpg_flush_pending_v(void)
240{
241 return 0x00000001U;
242}
243static inline u32 ltc_ltcs_ltss_g_elpg_flush_pending_f(void)
244{
245 return 0x1U;
246}
247static inline u32 ltc_ltc0_ltss_g_elpg_r(void)
248{
249 return 0x00140828U;
250}
251static inline u32 ltc_ltc0_ltss_g_elpg_flush_v(u32 r)
252{
253 return (r >> 0U) & 0x1U;
254}
255static inline u32 ltc_ltc0_ltss_g_elpg_flush_pending_v(void)
256{
257 return 0x00000001U;
258}
259static inline u32 ltc_ltc0_ltss_g_elpg_flush_pending_f(void)
260{
261 return 0x1U;
262}
263static inline u32 ltc_ltc0_ltss_intr_r(void)
264{
265 return 0x00140820U;
266}
267static inline u32 ltc_ltcs_ltss_intr_r(void)
268{
269 return 0x0017e820U;
270}
271static inline u32 ltc_ltcs_ltss_intr_en_evicted_cb_m(void)
272{
273 return 0x1U << 20U;
274}
275static inline u32 ltc_ltcs_ltss_intr_en_illegal_compstat_m(void)
276{
277 return 0x1U << 21U;
278}
279static inline u32 ltc_ltc0_lts0_intr_r(void)
280{
281 return 0x00141020U;
282}
283static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_r(void)
284{
285 return 0x0017e910U;
286}
287static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_v(u32 r)
288{
289 return (r >> 0U) & 0x1U;
290}
291static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_pending_v(void)
292{
293 return 0x00000001U;
294}
295static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_pending_f(void)
296{
297 return 0x1U;
298}
299static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_max_cycles_between_invalidates_v(u32 r)
300{
301 return (r >> 8U) & 0xfU;
302}
303static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_max_cycles_between_invalidates_3_v(void)
304{
305 return 0x00000003U;
306}
307static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_max_cycles_between_invalidates_3_f(void)
308{
309 return 0x300U;
310}
311static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_last_class_v(u32 r)
312{
313 return (r >> 28U) & 0x1U;
314}
315static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_last_class_true_v(void)
316{
317 return 0x00000001U;
318}
319static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_last_class_true_f(void)
320{
321 return 0x10000000U;
322}
323static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_normal_class_v(u32 r)
324{
325 return (r >> 29U) & 0x1U;
326}
327static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_normal_class_true_v(void)
328{
329 return 0x00000001U;
330}
331static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_normal_class_true_f(void)
332{
333 return 0x20000000U;
334}
335static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_first_class_v(u32 r)
336{
337 return (r >> 30U) & 0x1U;
338}
339static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_first_class_true_v(void)
340{
341 return 0x00000001U;
342}
343static inline u32 ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_first_class_true_f(void)
344{
345 return 0x40000000U;
346}
347static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_r(void)
348{
349 return 0x0017e914U;
350}
351static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_v(u32 r)
352{
353 return (r >> 0U) & 0x1U;
354}
355static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_pending_v(void)
356{
357 return 0x00000001U;
358}
359static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_pending_f(void)
360{
361 return 0x1U;
362}
363static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_max_cycles_between_cleans_v(u32 r)
364{
365 return (r >> 8U) & 0xfU;
366}
367static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_max_cycles_between_cleans_3_v(void)
368{
369 return 0x00000003U;
370}
371static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_max_cycles_between_cleans_3_f(void)
372{
373 return 0x300U;
374}
375static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_wait_for_fb_to_pull_v(u32 r)
376{
377 return (r >> 16U) & 0x1U;
378}
379static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_wait_for_fb_to_pull_true_v(void)
380{
381 return 0x00000001U;
382}
383static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_wait_for_fb_to_pull_true_f(void)
384{
385 return 0x10000U;
386}
387static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_last_class_v(u32 r)
388{
389 return (r >> 28U) & 0x1U;
390}
391static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_last_class_true_v(void)
392{
393 return 0x00000001U;
394}
395static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_last_class_true_f(void)
396{
397 return 0x10000000U;
398}
399static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_normal_class_v(u32 r)
400{
401 return (r >> 29U) & 0x1U;
402}
403static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_normal_class_true_v(void)
404{
405 return 0x00000001U;
406}
407static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_normal_class_true_f(void)
408{
409 return 0x20000000U;
410}
411static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_first_class_v(u32 r)
412{
413 return (r >> 30U) & 0x1U;
414}
415static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_first_class_true_v(void)
416{
417 return 0x00000001U;
418}
419static inline u32 ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_first_class_true_f(void)
420{
421 return 0x40000000U;
422}
423static inline u32 ltc_ltc0_ltss_tstg_cmgmt0_r(void)
424{
425 return 0x00140910U;
426}
427static inline u32 ltc_ltc0_ltss_tstg_cmgmt0_invalidate_v(u32 r)
428{
429 return (r >> 0U) & 0x1U;
430}
431static inline u32 ltc_ltc0_ltss_tstg_cmgmt0_invalidate_pending_v(void)
432{
433 return 0x00000001U;
434}
435static inline u32 ltc_ltc0_ltss_tstg_cmgmt0_invalidate_pending_f(void)
436{
437 return 0x1U;
438}
439static inline u32 ltc_ltc0_ltss_tstg_cmgmt1_r(void)
440{
441 return 0x00140914U;
442}
443static inline u32 ltc_ltc0_ltss_tstg_cmgmt1_clean_v(u32 r)
444{
445 return (r >> 0U) & 0x1U;
446}
447static inline u32 ltc_ltc0_ltss_tstg_cmgmt1_clean_pending_v(void)
448{
449 return 0x00000001U;
450}
451static inline u32 ltc_ltc0_ltss_tstg_cmgmt1_clean_pending_f(void)
452{
453 return 0x1U;
454}
455#endif
diff --git a/include/gk20a/hw_mc_gk20a.h b/include/gk20a/hw_mc_gk20a.h
new file mode 100644
index 0000000..3ca2a29
--- /dev/null
+++ b/include/gk20a/hw_mc_gk20a.h
@@ -0,0 +1,291 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
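To illustrate the read-side conventions (again an editor's sketch rather than part of the generated header; the example_ names are invented), the _f() constants act as test masks against a raw interrupt read, while _v() extracts identification fields from a raw mc_boot_0_r() value:

static inline u32 example_pfifo_intr_pending(u32 mc_intr_0_val)
{
	/* Non-zero when the PFIFO pending bit is set in the raw register value. */
	return mc_intr_0_val & mc_intr_0_pfifo_pending_f();
}

static inline u32 example_chip_architecture(u32 mc_boot_0_val)
{
	/* _v() shifts the architecture field down so its LSB sits at bit 0. */
	return mc_boot_0_architecture_v(mc_boot_0_val);
}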
56#ifndef _hw_mc_gk20a_h_
57#define _hw_mc_gk20a_h_
58
59static inline u32 mc_boot_0_r(void)
60{
61 return 0x00000000U;
62}
63static inline u32 mc_boot_0_architecture_v(u32 r)
64{
65 return (r >> 24U) & 0x1fU;
66}
67static inline u32 mc_boot_0_implementation_v(u32 r)
68{
69 return (r >> 20U) & 0xfU;
70}
71static inline u32 mc_boot_0_major_revision_v(u32 r)
72{
73 return (r >> 4U) & 0xfU;
74}
75static inline u32 mc_boot_0_minor_revision_v(u32 r)
76{
77 return (r >> 0U) & 0xfU;
78}
79static inline u32 mc_intr_0_r(void)
80{
81 return 0x00000100U;
82}
83static inline u32 mc_intr_0_pfifo_pending_f(void)
84{
85 return 0x100U;
86}
87static inline u32 mc_intr_0_pgraph_pending_f(void)
88{
89 return 0x1000U;
90}
91static inline u32 mc_intr_0_pmu_pending_f(void)
92{
93 return 0x1000000U;
94}
95static inline u32 mc_intr_0_ltc_pending_f(void)
96{
97 return 0x2000000U;
98}
99static inline u32 mc_intr_0_priv_ring_pending_f(void)
100{
101 return 0x40000000U;
102}
103static inline u32 mc_intr_0_pbus_pending_f(void)
104{
105 return 0x10000000U;
106}
107static inline u32 mc_intr_1_r(void)
108{
109 return 0x00000104U;
110}
111static inline u32 mc_intr_mask_0_r(void)
112{
113 return 0x00000640U;
114}
115static inline u32 mc_intr_mask_0_pmu_enabled_f(void)
116{
117 return 0x1000000U;
118}
119static inline u32 mc_intr_en_0_r(void)
120{
121 return 0x00000140U;
122}
123static inline u32 mc_intr_en_0_inta_disabled_f(void)
124{
125 return 0x0U;
126}
127static inline u32 mc_intr_en_0_inta_hardware_f(void)
128{
129 return 0x1U;
130}
131static inline u32 mc_intr_mask_1_r(void)
132{
133 return 0x00000644U;
134}
135static inline u32 mc_intr_mask_1_pmu_s(void)
136{
137 return 1U;
138}
139static inline u32 mc_intr_mask_1_pmu_f(u32 v)
140{
141 return (v & 0x1U) << 24U;
142}
143static inline u32 mc_intr_mask_1_pmu_m(void)
144{
145 return 0x1U << 24U;
146}
147static inline u32 mc_intr_mask_1_pmu_v(u32 r)
148{
149 return (r >> 24U) & 0x1U;
150}
151static inline u32 mc_intr_mask_1_pmu_enabled_f(void)
152{
153 return 0x1000000U;
154}
155static inline u32 mc_intr_en_1_r(void)
156{
157 return 0x00000144U;
158}
159static inline u32 mc_intr_en_1_inta_disabled_f(void)
160{
161 return 0x0U;
162}
163static inline u32 mc_intr_en_1_inta_hardware_f(void)
164{
165 return 0x1U;
166}
167static inline u32 mc_enable_r(void)
168{
169 return 0x00000200U;
170}
171static inline u32 mc_enable_xbar_enabled_f(void)
172{
173 return 0x4U;
174}
175static inline u32 mc_enable_l2_enabled_f(void)
176{
177 return 0x8U;
178}
179static inline u32 mc_enable_pmedia_s(void)
180{
181 return 1U;
182}
183static inline u32 mc_enable_pmedia_f(u32 v)
184{
185 return (v & 0x1U) << 4U;
186}
187static inline u32 mc_enable_pmedia_m(void)
188{
189 return 0x1U << 4U;
190}
191static inline u32 mc_enable_pmedia_v(u32 r)
192{
193 return (r >> 4U) & 0x1U;
194}
195static inline u32 mc_enable_priv_ring_enabled_f(void)
196{
197 return 0x20U;
198}
199static inline u32 mc_enable_ce0_m(void)
200{
201 return 0x1U << 6U;
202}
203static inline u32 mc_enable_pfifo_enabled_f(void)
204{
205 return 0x100U;
206}
207static inline u32 mc_enable_pgraph_enabled_f(void)
208{
209 return 0x1000U;
210}
211static inline u32 mc_enable_pwr_v(u32 r)
212{
213 return (r >> 13U) & 0x1U;
214}
215static inline u32 mc_enable_pwr_disabled_v(void)
216{
217 return 0x00000000U;
218}
219static inline u32 mc_enable_pwr_enabled_f(void)
220{
221 return 0x2000U;
222}
223static inline u32 mc_enable_pfb_enabled_f(void)
224{
225 return 0x100000U;
226}
227static inline u32 mc_enable_ce2_m(void)
228{
229 return 0x1U << 21U;
230}
231static inline u32 mc_enable_ce2_enabled_f(void)
232{
233 return 0x200000U;
234}
235static inline u32 mc_enable_blg_enabled_f(void)
236{
237 return 0x8000000U;
238}
239static inline u32 mc_enable_perfmon_enabled_f(void)
240{
241 return 0x10000000U;
242}
243static inline u32 mc_enable_hub_enabled_f(void)
244{
245 return 0x20000000U;
246}
247static inline u32 mc_enable_pb_r(void)
248{
249 return 0x00000204U;
250}
251static inline u32 mc_enable_pb_0_s(void)
252{
253 return 1U;
254}
255static inline u32 mc_enable_pb_0_f(u32 v)
256{
257 return (v & 0x1U) << 0U;
258}
259static inline u32 mc_enable_pb_0_m(void)
260{
261 return 0x1U << 0U;
262}
263static inline u32 mc_enable_pb_0_v(u32 r)
264{
265 return (r >> 0U) & 0x1U;
266}
267static inline u32 mc_enable_pb_0_enabled_v(void)
268{
269 return 0x00000001U;
270}
271static inline u32 mc_enable_pb_sel_f(u32 v, u32 i)
272{
273 return (v & 0x1U) << (0U + i*1U);
274}
275static inline u32 mc_elpg_enable_r(void)
276{
277 return 0x0000020cU;
278}
279static inline u32 mc_elpg_enable_xbar_enabled_f(void)
280{
281 return 0x4U;
282}
283static inline u32 mc_elpg_enable_pfb_enabled_f(void)
284{
285 return 0x100000U;
286}
287static inline u32 mc_elpg_enable_hub_enabled_f(void)
288{
289 return 0x20000000U;
290}
291#endif
diff --git a/include/gk20a/hw_pbdma_gk20a.h b/include/gk20a/hw_pbdma_gk20a.h
new file mode 100644
index 0000000..2c8f48d
--- /dev/null
+++ b/include/gk20a/hw_pbdma_gk20a.h
@@ -0,0 +1,575 @@
1/*
2 * Copyright (c) 2012-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
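One further sketch (editorial, not part of the generated header; the example_ name is invented): the PBDMA registers below are indexed per instance through _r(u32 i), and multi-field values are built by OR'ing _f() helpers together, for example the high word of the GPFIFO base:

static inline u32 example_gp_base_hi(u32 addr_hi, u32 gpfifo_log2_entries)
{
	/* Two _f() helpers OR'd together form the full value to program at
	 * pbdma_gp_base_hi_r(i), where i selects the PBDMA instance. */
	return pbdma_gp_base_hi_offset_f(addr_hi) |
	       pbdma_gp_base_hi_limit2_f(gpfifo_log2_entries);
}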
56#ifndef _hw_pbdma_gk20a_h_
57#define _hw_pbdma_gk20a_h_
58
59static inline u32 pbdma_gp_entry1_r(void)
60{
61 return 0x10000004U;
62}
63static inline u32 pbdma_gp_entry1_get_hi_v(u32 r)
64{
65 return (r >> 0U) & 0xffU;
66}
67static inline u32 pbdma_gp_entry1_length_f(u32 v)
68{
69 return (v & 0x1fffffU) << 10U;
70}
71static inline u32 pbdma_gp_entry1_length_v(u32 r)
72{
73 return (r >> 10U) & 0x1fffffU;
74}
75static inline u32 pbdma_gp_base_r(u32 i)
76{
77 return 0x00040048U + i*8192U;
78}
79static inline u32 pbdma_gp_base__size_1_v(void)
80{
81 return 0x00000001U;
82}
83static inline u32 pbdma_gp_base_offset_f(u32 v)
84{
85 return (v & 0x1fffffffU) << 3U;
86}
87static inline u32 pbdma_gp_base_rsvd_s(void)
88{
89 return 3U;
90}
91static inline u32 pbdma_gp_base_hi_r(u32 i)
92{
93 return 0x0004004cU + i*8192U;
94}
95static inline u32 pbdma_gp_base_hi_offset_f(u32 v)
96{
97 return (v & 0xffU) << 0U;
98}
99static inline u32 pbdma_gp_base_hi_limit2_f(u32 v)
100{
101 return (v & 0x1fU) << 16U;
102}
103static inline u32 pbdma_gp_fetch_r(u32 i)
104{
105 return 0x00040050U + i*8192U;
106}
107static inline u32 pbdma_gp_get_r(u32 i)
108{
109 return 0x00040014U + i*8192U;
110}
111static inline u32 pbdma_gp_put_r(u32 i)
112{
113 return 0x00040000U + i*8192U;
114}
115static inline u32 pbdma_timeout_r(u32 i)
116{
117 return 0x0004012cU + i*8192U;
118}
119static inline u32 pbdma_timeout__size_1_v(void)
120{
121 return 0x00000001U;
122}
123static inline u32 pbdma_timeout_period_m(void)
124{
125 return 0xffffffffU << 0U;
126}
127static inline u32 pbdma_timeout_period_max_f(void)
128{
129 return 0xffffffffU;
130}
131static inline u32 pbdma_pb_fetch_r(u32 i)
132{
133 return 0x00040054U + i*8192U;
134}
135static inline u32 pbdma_pb_fetch_hi_r(u32 i)
136{
137 return 0x00040058U + i*8192U;
138}
139static inline u32 pbdma_get_r(u32 i)
140{
141 return 0x00040018U + i*8192U;
142}
143static inline u32 pbdma_get_hi_r(u32 i)
144{
145 return 0x0004001cU + i*8192U;
146}
147static inline u32 pbdma_put_r(u32 i)
148{
149 return 0x0004005cU + i*8192U;
150}
151static inline u32 pbdma_put_hi_r(u32 i)
152{
153 return 0x00040060U + i*8192U;
154}
155static inline u32 pbdma_formats_r(u32 i)
156{
157 return 0x0004009cU + i*8192U;
158}
159static inline u32 pbdma_formats_gp_fermi0_f(void)
160{
161 return 0x0U;
162}
163static inline u32 pbdma_formats_pb_fermi1_f(void)
164{
165 return 0x100U;
166}
167static inline u32 pbdma_formats_mp_fermi0_f(void)
168{
169 return 0x0U;
170}
171static inline u32 pbdma_pb_header_r(u32 i)
172{
173 return 0x00040084U + i*8192U;
174}
175static inline u32 pbdma_pb_header_priv_user_f(void)
176{
177 return 0x0U;
178}
179static inline u32 pbdma_pb_header_method_zero_f(void)
180{
181 return 0x0U;
182}
183static inline u32 pbdma_pb_header_subchannel_zero_f(void)
184{
185 return 0x0U;
186}
187static inline u32 pbdma_pb_header_level_main_f(void)
188{
189 return 0x0U;
190}
191static inline u32 pbdma_pb_header_first_true_f(void)
192{
193 return 0x400000U;
194}
195static inline u32 pbdma_pb_header_type_inc_f(void)
196{
197 return 0x20000000U;
198}
199static inline u32 pbdma_pb_header_type_non_inc_f(void)
200{
201 return 0x60000000U;
202}
203static inline u32 pbdma_hdr_shadow_r(u32 i)
204{
205 return 0x00040118U + i*8192U;
206}
207static inline u32 pbdma_gp_shadow_0_r(u32 i)
208{
209 return 0x00040110U + i*8192U;
210}
211static inline u32 pbdma_gp_shadow_1_r(u32 i)
212{
213 return 0x00040114U + i*8192U;
214}
215static inline u32 pbdma_subdevice_r(u32 i)
216{
217 return 0x00040094U + i*8192U;
218}
219static inline u32 pbdma_subdevice_id_f(u32 v)
220{
221 return (v & 0xfffU) << 0U;
222}
223static inline u32 pbdma_subdevice_status_active_f(void)
224{
225 return 0x10000000U;
226}
227static inline u32 pbdma_subdevice_channel_dma_enable_f(void)
228{
229 return 0x20000000U;
230}
231static inline u32 pbdma_method0_r(u32 i)
232{
233 return 0x000400c0U + i*8192U;
234}
235static inline u32 pbdma_method0_addr_f(u32 v)
236{
237 return (v & 0xfffU) << 2U;
238}
239static inline u32 pbdma_method0_addr_v(u32 r)
240{
241 return (r >> 2U) & 0xfffU;
242}
243static inline u32 pbdma_method0_subch_v(u32 r)
244{
245 return (r >> 16U) & 0x7U;
246}
247static inline u32 pbdma_method0_first_true_f(void)
248{
249 return 0x400000U;
250}
251static inline u32 pbdma_method0_valid_true_f(void)
252{
253 return 0x80000000U;
254}
255static inline u32 pbdma_method1_r(u32 i)
256{
257 return 0x000400c8U + i*8192U;
258}
259static inline u32 pbdma_method2_r(u32 i)
260{
261 return 0x000400d0U + i*8192U;
262}
263static inline u32 pbdma_method3_r(u32 i)
264{
265 return 0x000400d8U + i*8192U;
266}
267static inline u32 pbdma_data0_r(u32 i)
268{
269 return 0x000400c4U + i*8192U;
270}
271static inline u32 pbdma_target_r(u32 i)
272{
273 return 0x000400acU + i*8192U;
274}
275static inline u32 pbdma_target_engine_sw_f(void)
276{
277 return 0x1fU;
278}
279static inline u32 pbdma_acquire_r(u32 i)
280{
281 return 0x00040030U + i*8192U;
282}
283static inline u32 pbdma_acquire_retry_man_2_f(void)
284{
285 return 0x2U;
286}
287static inline u32 pbdma_acquire_retry_exp_2_f(void)
288{
289 return 0x100U;
290}
291static inline u32 pbdma_acquire_timeout_exp_f(u32 v)
292{
293 return (v & 0xfU) << 11U;
294}
295static inline u32 pbdma_acquire_timeout_exp_max_v(void)
296{
297 return 0x0000000fU;
298}
299static inline u32 pbdma_acquire_timeout_exp_max_f(void)
300{
301 return 0x7800U;
302}
303static inline u32 pbdma_acquire_timeout_man_f(u32 v)
304{
305 return (v & 0xffffU) << 15U;
306}
307static inline u32 pbdma_acquire_timeout_man_max_v(void)
308{
309 return 0x0000ffffU;
310}
311static inline u32 pbdma_acquire_timeout_man_max_f(void)
312{
313 return 0x7fff8000U;
314}
315static inline u32 pbdma_acquire_timeout_en_enable_f(void)
316{
317 return 0x80000000U;
318}
319static inline u32 pbdma_acquire_timeout_en_disable_f(void)
320{
321 return 0x0U;
322}
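/*
 * Illustrative sketch (not part of the generated header): compose a full
 * PBDMA ACQUIRE value from the field helpers above.  The retry settings and
 * the way the timeout is enabled are example choices, not a recommended
 * programming.
 */
static inline u32 pbdma_acquire_example_val(u32 timeout_exp, u32 timeout_man,
					    u32 enable_timeout)
{
	u32 val = pbdma_acquire_retry_man_2_f() |
		  pbdma_acquire_retry_exp_2_f() |
		  pbdma_acquire_timeout_exp_f(timeout_exp) |
		  pbdma_acquire_timeout_man_f(timeout_man);

	return val | (enable_timeout != 0U ?
		      pbdma_acquire_timeout_en_enable_f() :
		      pbdma_acquire_timeout_en_disable_f());
}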
323static inline u32 pbdma_status_r(u32 i)
324{
325 return 0x00040100U + i*8192U;
326}
327static inline u32 pbdma_channel_r(u32 i)
328{
329 return 0x00040120U + i*8192U;
330}
331static inline u32 pbdma_signature_r(u32 i)
332{
333 return 0x00040010U + i*8192U;
334}
335static inline u32 pbdma_signature_hw_valid_f(void)
336{
337 return 0xfaceU;
338}
339static inline u32 pbdma_signature_sw_zero_f(void)
340{
341 return 0x0U;
342}
343static inline u32 pbdma_userd_r(u32 i)
344{
345 return 0x00040008U + i*8192U;
346}
347static inline u32 pbdma_userd_target_vid_mem_f(void)
348{
349 return 0x0U;
350}
351static inline u32 pbdma_userd_target_sys_mem_coh_f(void)
352{
353 return 0x2U;
354}
355static inline u32 pbdma_userd_target_sys_mem_ncoh_f(void)
356{
357 return 0x3U;
358}
359static inline u32 pbdma_userd_addr_f(u32 v)
360{
361 return (v & 0x7fffffU) << 9U;
362}
363static inline u32 pbdma_userd_hi_r(u32 i)
364{
365 return 0x0004000cU + i*8192U;
366}
367static inline u32 pbdma_userd_hi_addr_f(u32 v)
368{
369 return (v & 0xffU) << 0U;
370}
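/*
 * Illustrative sketch (not part of the generated header): split a
 * 512-byte-aligned USERD address into the USERD/USERD_HI register values.
 * Treating bits [39:32] as the high word and OR-ing in a vid-mem target is
 * an assumption made for this example.
 */
static inline u32 pbdma_userd_example_lo(u64 userd_addr)
{
	return pbdma_userd_target_vid_mem_f() |
	       pbdma_userd_addr_f((u32)(userd_addr >> 9));
}
static inline u32 pbdma_userd_example_hi(u64 userd_addr)
{
	return pbdma_userd_hi_addr_f((u32)(userd_addr >> 32));
}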
371static inline u32 pbdma_hce_ctrl_r(u32 i)
372{
373 return 0x000400e4U + i*8192U;
374}
375static inline u32 pbdma_hce_ctrl_hce_priv_mode_yes_f(void)
376{
377 return 0x20U;
378}
379static inline u32 pbdma_intr_0_r(u32 i)
380{
381 return 0x00040108U + i*8192U;
382}
383static inline u32 pbdma_intr_0_memreq_v(u32 r)
384{
385 return (r >> 0U) & 0x1U;
386}
387static inline u32 pbdma_intr_0_memreq_pending_f(void)
388{
389 return 0x1U;
390}
391static inline u32 pbdma_intr_0_memack_timeout_pending_f(void)
392{
393 return 0x2U;
394}
395static inline u32 pbdma_intr_0_memack_extra_pending_f(void)
396{
397 return 0x4U;
398}
399static inline u32 pbdma_intr_0_memdat_timeout_pending_f(void)
400{
401 return 0x8U;
402}
403static inline u32 pbdma_intr_0_memdat_extra_pending_f(void)
404{
405 return 0x10U;
406}
407static inline u32 pbdma_intr_0_memflush_pending_f(void)
408{
409 return 0x20U;
410}
411static inline u32 pbdma_intr_0_memop_pending_f(void)
412{
413 return 0x40U;
414}
415static inline u32 pbdma_intr_0_lbconnect_pending_f(void)
416{
417 return 0x80U;
418}
419static inline u32 pbdma_intr_0_lbreq_pending_f(void)
420{
421 return 0x100U;
422}
423static inline u32 pbdma_intr_0_lback_timeout_pending_f(void)
424{
425 return 0x200U;
426}
427static inline u32 pbdma_intr_0_lback_extra_pending_f(void)
428{
429 return 0x400U;
430}
431static inline u32 pbdma_intr_0_lbdat_timeout_pending_f(void)
432{
433 return 0x800U;
434}
435static inline u32 pbdma_intr_0_lbdat_extra_pending_f(void)
436{
437 return 0x1000U;
438}
439static inline u32 pbdma_intr_0_gpfifo_pending_f(void)
440{
441 return 0x2000U;
442}
443static inline u32 pbdma_intr_0_gpptr_pending_f(void)
444{
445 return 0x4000U;
446}
447static inline u32 pbdma_intr_0_gpentry_pending_f(void)
448{
449 return 0x8000U;
450}
451static inline u32 pbdma_intr_0_gpcrc_pending_f(void)
452{
453 return 0x10000U;
454}
455static inline u32 pbdma_intr_0_pbptr_pending_f(void)
456{
457 return 0x20000U;
458}
459static inline u32 pbdma_intr_0_pbentry_pending_f(void)
460{
461 return 0x40000U;
462}
463static inline u32 pbdma_intr_0_pbcrc_pending_f(void)
464{
465 return 0x80000U;
466}
467static inline u32 pbdma_intr_0_xbarconnect_pending_f(void)
468{
469 return 0x100000U;
470}
471static inline u32 pbdma_intr_0_method_pending_f(void)
472{
473 return 0x200000U;
474}
475static inline u32 pbdma_intr_0_methodcrc_pending_f(void)
476{
477 return 0x400000U;
478}
479static inline u32 pbdma_intr_0_device_pending_f(void)
480{
481 return 0x800000U;
482}
483static inline u32 pbdma_intr_0_semaphore_pending_f(void)
484{
485 return 0x2000000U;
486}
487static inline u32 pbdma_intr_0_acquire_pending_f(void)
488{
489 return 0x4000000U;
490}
491static inline u32 pbdma_intr_0_pri_pending_f(void)
492{
493 return 0x8000000U;
494}
495static inline u32 pbdma_intr_0_no_ctxsw_seg_pending_f(void)
496{
497 return 0x20000000U;
498}
499static inline u32 pbdma_intr_0_pbseg_pending_f(void)
500{
501 return 0x40000000U;
502}
503static inline u32 pbdma_intr_0_signature_pending_f(void)
504{
505 return 0x80000000U;
506}
507static inline u32 pbdma_intr_1_r(u32 i)
508{
509 return 0x00040148U + i*8192U;
510}
511static inline u32 pbdma_intr_en_0_r(u32 i)
512{
513 return 0x0004010cU + i*8192U;
514}
515static inline u32 pbdma_intr_en_0_lbreq_enabled_f(void)
516{
517 return 0x100U;
518}
519static inline u32 pbdma_intr_en_1_r(u32 i)
520{
521 return 0x0004014cU + i*8192U;
522}
523static inline u32 pbdma_intr_stall_r(u32 i)
524{
525 return 0x0004013cU + i*8192U;
526}
527static inline u32 pbdma_intr_stall_lbreq_enabled_f(void)
528{
529 return 0x100U;
530}
531static inline u32 pbdma_intr_stall_1_r(u32 i)
532{
533 return 0x00040140U + i*8192U;
534}
535static inline u32 pbdma_intr_stall_1_hce_illegal_op_enabled_f(void)
536{
537 return 0x1U;
538}
539static inline u32 pbdma_udma_nop_r(void)
540{
541 return 0x00000008U;
542}
543static inline u32 pbdma_syncpointa_r(u32 i)
544{
545 return 0x000400a4U + i*8192U;
546}
547static inline u32 pbdma_syncpointa_payload_v(u32 r)
548{
549 return (r >> 0U) & 0xffffffffU;
550}
551static inline u32 pbdma_syncpointb_r(u32 i)
552{
553 return 0x000400a8U + i*8192U;
554}
555static inline u32 pbdma_syncpointb_op_v(u32 r)
556{
557 return (r >> 0U) & 0x3U;
558}
559static inline u32 pbdma_syncpointb_op_wait_v(void)
560{
561 return 0x00000000U;
562}
563static inline u32 pbdma_syncpointb_wait_switch_v(u32 r)
564{
565 return (r >> 4U) & 0x1U;
566}
567static inline u32 pbdma_syncpointb_wait_switch_en_v(void)
568{
569 return 0x00000001U;
570}
571static inline u32 pbdma_syncpointb_syncpt_index_v(u32 r)
572{
573 return (r >> 8U) & 0xffU;
574}
575#endif
diff --git a/include/gk20a/hw_perf_gk20a.h b/include/gk20a/hw_perf_gk20a.h
new file mode 100644
index 0000000..a93560f
--- /dev/null
+++ b/include/gk20a/hw_perf_gk20a.h
@@ -0,0 +1,211 @@
1/*
2 * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_perf_gk20a_h_
57#define _hw_perf_gk20a_h_
58
59static inline u32 perf_pmasys_control_r(void)
60{
61 return 0x001b4000U;
62}
63static inline u32 perf_pmasys_control_membuf_status_v(u32 r)
64{
65 return (r >> 4U) & 0x1U;
66}
67static inline u32 perf_pmasys_control_membuf_status_overflowed_v(void)
68{
69 return 0x00000001U;
70}
71static inline u32 perf_pmasys_control_membuf_status_overflowed_f(void)
72{
73 return 0x10U;
74}
75static inline u32 perf_pmasys_control_membuf_clear_status_f(u32 v)
76{
77 return (v & 0x1U) << 5U;
78}
79static inline u32 perf_pmasys_control_membuf_clear_status_v(u32 r)
80{
81 return (r >> 5U) & 0x1U;
82}
83static inline u32 perf_pmasys_control_membuf_clear_status_doit_v(void)
84{
85 return 0x00000001U;
86}
87static inline u32 perf_pmasys_control_membuf_clear_status_doit_f(void)
88{
89 return 0x20U;
90}
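/*
 * Illustrative helpers (not part of the generated header): detect a PMA
 * membuf overflow in a read-back PMASYS_CONTROL value and build the write
 * value that clears the status.  Whether other control fields need to be
 * preserved on that write is not specified here.
 */
static inline u32 perf_pmasys_control_example_overflowed(u32 control)
{
	return perf_pmasys_control_membuf_status_v(control) ==
	       perf_pmasys_control_membuf_status_overflowed_v();
}
static inline u32 perf_pmasys_control_example_clear(u32 control)
{
	return control | perf_pmasys_control_membuf_clear_status_doit_f();
}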
91static inline u32 perf_pmasys_mem_block_r(void)
92{
93 return 0x001b4070U;
94}
95static inline u32 perf_pmasys_mem_block_base_f(u32 v)
96{
97 return (v & 0xfffffffU) << 0U;
98}
99static inline u32 perf_pmasys_mem_block_target_f(u32 v)
100{
101 return (v & 0x3U) << 28U;
102}
103static inline u32 perf_pmasys_mem_block_target_v(u32 r)
104{
105 return (r >> 28U) & 0x3U;
106}
107static inline u32 perf_pmasys_mem_block_target_lfb_v(void)
108{
109 return 0x00000000U;
110}
111static inline u32 perf_pmasys_mem_block_target_lfb_f(void)
112{
113 return 0x0U;
114}
115static inline u32 perf_pmasys_mem_block_target_sys_coh_v(void)
116{
117 return 0x00000002U;
118}
119static inline u32 perf_pmasys_mem_block_target_sys_coh_f(void)
120{
121 return 0x20000000U;
122}
123static inline u32 perf_pmasys_mem_block_target_sys_ncoh_v(void)
124{
125 return 0x00000003U;
126}
127static inline u32 perf_pmasys_mem_block_target_sys_ncoh_f(void)
128{
129 return 0x30000000U;
130}
131static inline u32 perf_pmasys_mem_block_valid_f(u32 v)
132{
133 return (v & 0x1U) << 31U;
134}
135static inline u32 perf_pmasys_mem_block_valid_v(u32 r)
136{
137 return (r >> 31U) & 0x1U;
138}
139static inline u32 perf_pmasys_mem_block_valid_true_v(void)
140{
141 return 0x00000001U;
142}
143static inline u32 perf_pmasys_mem_block_valid_true_f(void)
144{
145 return 0x80000000U;
146}
147static inline u32 perf_pmasys_mem_block_valid_false_v(void)
148{
149 return 0x00000000U;
150}
151static inline u32 perf_pmasys_mem_block_valid_false_f(void)
152{
153 return 0x0U;
154}
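/*
 * Illustrative sketch (not part of the generated header): compose a
 * PMASYS_MEM_BLOCK value that points the PMA unit at a buffer in
 * non-coherent system memory.  The unit of 'base' (the 28-bit base field)
 * is whatever the hardware expects and is not specified by this header.
 */
static inline u32 perf_pmasys_mem_block_example(u32 base)
{
	return perf_pmasys_mem_block_base_f(base) |
	       perf_pmasys_mem_block_target_sys_ncoh_f() |
	       perf_pmasys_mem_block_valid_true_f();
}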
155static inline u32 perf_pmasys_outbase_r(void)
156{
157 return 0x001b4074U;
158}
159static inline u32 perf_pmasys_outbase_ptr_f(u32 v)
160{
161 return (v & 0x7ffffffU) << 5U;
162}
163static inline u32 perf_pmasys_outbaseupper_r(void)
164{
165 return 0x001b4078U;
166}
167static inline u32 perf_pmasys_outbaseupper_ptr_f(u32 v)
168{
169 return (v & 0xffU) << 0U;
170}
171static inline u32 perf_pmasys_outsize_r(void)
172{
173 return 0x001b407cU;
174}
175static inline u32 perf_pmasys_outsize_numbytes_f(u32 v)
176{
177 return (v & 0x7ffffffU) << 5U;
178}
179static inline u32 perf_pmasys_mem_bytes_r(void)
180{
181 return 0x001b4084U;
182}
183static inline u32 perf_pmasys_mem_bytes_numbytes_f(u32 v)
184{
185 return (v & 0xfffffffU) << 4U;
186}
187static inline u32 perf_pmasys_mem_bump_r(void)
188{
189 return 0x001b4088U;
190}
191static inline u32 perf_pmasys_mem_bump_numbytes_f(u32 v)
192{
193 return (v & 0xfffffffU) << 4U;
194}
195static inline u32 perf_pmasys_enginestatus_r(void)
196{
197 return 0x001b40a4U;
198}
199static inline u32 perf_pmasys_enginestatus_rbufempty_f(u32 v)
200{
201 return (v & 0x1U) << 4U;
202}
203static inline u32 perf_pmasys_enginestatus_rbufempty_empty_v(void)
204{
205 return 0x00000001U;
206}
207static inline u32 perf_pmasys_enginestatus_rbufempty_empty_f(void)
208{
209 return 0x10U;
210}
211#endif
diff --git a/include/gk20a/hw_pram_gk20a.h b/include/gk20a/hw_pram_gk20a.h
new file mode 100644
index 0000000..10923e2
--- /dev/null
+++ b/include/gk20a/hw_pram_gk20a.h
@@ -0,0 +1,63 @@
1/*
2 * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_pram_gk20a_h_
57#define _hw_pram_gk20a_h_
58
59static inline u32 pram_data032_r(u32 i)
60{
61 return 0x00700000U + i*4U;
62}
63#endif
diff --git a/include/gk20a/hw_pri_ringmaster_gk20a.h b/include/gk20a/hw_pri_ringmaster_gk20a.h
new file mode 100644
index 0000000..ca2775e
--- /dev/null
+++ b/include/gk20a/hw_pri_ringmaster_gk20a.h
@@ -0,0 +1,159 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_pri_ringmaster_gk20a_h_
57#define _hw_pri_ringmaster_gk20a_h_
58
59static inline u32 pri_ringmaster_command_r(void)
60{
61 return 0x0012004cU;
62}
63static inline u32 pri_ringmaster_command_cmd_m(void)
64{
65 return 0x3fU << 0U;
66}
67static inline u32 pri_ringmaster_command_cmd_v(u32 r)
68{
69 return (r >> 0U) & 0x3fU;
70}
71static inline u32 pri_ringmaster_command_cmd_no_cmd_v(void)
72{
73 return 0x00000000U;
74}
75static inline u32 pri_ringmaster_command_cmd_start_ring_f(void)
76{
77 return 0x1U;
78}
79static inline u32 pri_ringmaster_command_cmd_ack_interrupt_f(void)
80{
81 return 0x2U;
82}
83static inline u32 pri_ringmaster_command_cmd_enumerate_stations_f(void)
84{
85 return 0x3U;
86}
87static inline u32 pri_ringmaster_command_cmd_enumerate_stations_bc_grp_all_f(void)
88{
89 return 0x0U;
90}
91static inline u32 pri_ringmaster_command_data_r(void)
92{
93 return 0x00120048U;
94}
95static inline u32 pri_ringmaster_start_results_r(void)
96{
97 return 0x00120050U;
98}
99static inline u32 pri_ringmaster_start_results_connectivity_v(u32 r)
100{
101 return (r >> 0U) & 0x1U;
102}
103static inline u32 pri_ringmaster_start_results_connectivity_pass_v(void)
104{
105 return 0x00000001U;
106}
107static inline u32 pri_ringmaster_intr_status0_r(void)
108{
109 return 0x00120058U;
110}
111static inline u32 pri_ringmaster_intr_status0_ring_start_conn_fault_v(u32 r)
112{
113 return (r >> 0U) & 0x1U;
114}
115static inline u32 pri_ringmaster_intr_status0_disconnect_fault_v(u32 r)
116{
117 return (r >> 1U) & 0x1U;
118}
119static inline u32 pri_ringmaster_intr_status0_overflow_fault_v(u32 r)
120{
121 return (r >> 2U) & 0x1U;
122}
123static inline u32 pri_ringmaster_intr_status0_gbl_write_error_sys_v(u32 r)
124{
125 return (r >> 8U) & 0x1U;
126}
127static inline u32 pri_ringmaster_intr_status1_r(void)
128{
129 return 0x0012005cU;
130}
131static inline u32 pri_ringmaster_global_ctl_r(void)
132{
133 return 0x00120060U;
134}
135static inline u32 pri_ringmaster_global_ctl_ring_reset_asserted_f(void)
136{
137 return 0x1U;
138}
139static inline u32 pri_ringmaster_global_ctl_ring_reset_deasserted_f(void)
140{
141 return 0x0U;
142}
143static inline u32 pri_ringmaster_enum_fbp_r(void)
144{
145 return 0x00120074U;
146}
147static inline u32 pri_ringmaster_enum_fbp_count_v(u32 r)
148{
149 return (r >> 0U) & 0x1fU;
150}
151static inline u32 pri_ringmaster_enum_gpc_r(void)
152{
153 return 0x00120078U;
154}
155static inline u32 pri_ringmaster_enum_gpc_count_v(u32 r)
156{
157 return (r >> 0U) & 0x1fU;
158}
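/*
 * Illustrative helpers (not part of the generated header): decode the
 * ring-start result and the FBP/GPC enumeration registers.  The arguments
 * are assumed to be raw register read values.
 */
static inline u32 pri_ringmaster_example_ring_connected(u32 start_results)
{
	return pri_ringmaster_start_results_connectivity_v(start_results) ==
	       pri_ringmaster_start_results_connectivity_pass_v();
}
static inline u32 pri_ringmaster_example_num_fbps(u32 enum_fbp)
{
	return pri_ringmaster_enum_fbp_count_v(enum_fbp);
}
static inline u32 pri_ringmaster_example_num_gpcs(u32 enum_gpc)
{
	return pri_ringmaster_enum_gpc_count_v(enum_gpc);
}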
159#endif
diff --git a/include/gk20a/hw_pri_ringstation_fbp_gk20a.h b/include/gk20a/hw_pri_ringstation_fbp_gk20a.h
new file mode 100644
index 0000000..06e08bd
--- /dev/null
+++ b/include/gk20a/hw_pri_ringstation_fbp_gk20a.h
@@ -0,0 +1,231 @@
1/*
2 * drivers/video/tegra/host/gk20a/hw_pri_ringstation_fbp_gk20a.h
3 *
4 * Copyright (c) 2012-2013, NVIDIA Corporation. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25 /*
26 * Function naming determines intended use:
27 *
28 * <x>_r(void) : Returns the offset for register <x>.
29 *
30 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
31 *
32 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
33 *
34 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
35 * and masked to place it at field <y> of register <x>. This value
36 * can be |'d with others to produce a full register value for
37 * register <x>.
38 *
39 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
40 * value can be ~'d and then &'d to clear the value of field <y> for
41 * register <x>.
42 *
43 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
44 * to place it at field <y> of register <x>. This value can be |'d
45 * with others to produce a full register value for <x>.
46 *
47 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
48 * <x> value 'r' after being shifted to place its LSB at bit 0.
49 * This value is suitable for direct comparison with other unshifted
50 * values appropriate for use in field <y> of register <x>.
51 *
52 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
53 * field <y> of register <x>. This value is suitable for direct
54 * comparison with unshifted values appropriate for use in field <y>
55 * of register <x>.
56 */
57
58#ifndef __hw_pri_ringstation_fbp_gk20a_h__
59#define __hw_pri_ringstation_fbp_gk20a_h__
61/* This file is autogenerated. Do not edit. */
61
62static inline u32 pri_ringstation_fbp_master_config_r(u32 i)
63{
64 return 0x00124300+((i)*4);
65}
66static inline u32 pri_ringstation_fbp_master_config__size_1_v(void)
67{
68 return 64;
69}
70static inline u32 pri_ringstation_fbp_master_config_timeout_s(void)
71{
72 return 18;
73}
74static inline u32 pri_ringstation_fbp_master_config_timeout_f(u32 v)
75{
76 return (v & 0x3ffff) << 0;
77}
78static inline u32 pri_ringstation_fbp_master_config_timeout_m(void)
79{
80 return 0x3ffff << 0;
81}
82static inline u32 pri_ringstation_fbp_master_config_timeout_v(u32 r)
83{
84 return (r >> 0) & 0x3ffff;
85}
86static inline u32 pri_ringstation_fbp_master_config_timeout_i_v(void)
87{
88 return 0x00000064;
89}
90static inline u32 pri_ringstation_fbp_master_config_timeout_i_f(void)
91{
92 return 0x64;
93}
94static inline u32 pri_ringstation_fbp_master_config_fs_action_s(void)
95{
96 return 1;
97}
98static inline u32 pri_ringstation_fbp_master_config_fs_action_f(u32 v)
99{
100 return (v & 0x1) << 30;
101}
102static inline u32 pri_ringstation_fbp_master_config_fs_action_m(void)
103{
104 return 0x1 << 30;
105}
106static inline u32 pri_ringstation_fbp_master_config_fs_action_v(u32 r)
107{
108 return (r >> 30) & 0x1;
109}
110static inline u32 pri_ringstation_fbp_master_config_fs_action_error_v(void)
111{
112 return 0x00000000;
113}
114static inline u32 pri_ringstation_fbp_master_config_fs_action_error_f(void)
115{
116 return 0x0;
117}
118static inline u32 pri_ringstation_fbp_master_config_fs_action_soldier_on_v(void)
119{
120 return 0x00000001;
121}
122static inline u32 pri_ringstation_fbp_master_config_fs_action_soldier_on_f(void)
123{
124 return 0x40000000;
125}
126static inline u32 pri_ringstation_fbp_master_config_reset_action_s(void)
127{
128 return 1;
129}
130static inline u32 pri_ringstation_fbp_master_config_reset_action_f(u32 v)
131{
132 return (v & 0x1) << 31;
133}
134static inline u32 pri_ringstation_fbp_master_config_reset_action_m(void)
135{
136 return 0x1 << 31;
137}
138static inline u32 pri_ringstation_fbp_master_config_reset_action_v(u32 r)
139{
140 return (r >> 31) & 0x1;
141}
142static inline u32 pri_ringstation_fbp_master_config_reset_action_error_v(void)
143{
144 return 0x00000000;
145}
146static inline u32 pri_ringstation_fbp_master_config_reset_action_error_f(void)
147{
148 return 0x0;
149}
150static inline u32 pri_ringstation_fbp_master_config_reset_action_soldier_on_v(void)
151{
152 return 0x00000001;
153}
154static inline u32 pri_ringstation_fbp_master_config_reset_action_soldier_on_f(void)
155{
156 return 0x80000000;
157}
158static inline u32 pri_ringstation_fbp_master_config_setup_clocks_s(void)
159{
160 return 3;
161}
162static inline u32 pri_ringstation_fbp_master_config_setup_clocks_f(u32 v)
163{
164 return (v & 0x7) << 20;
165}
166static inline u32 pri_ringstation_fbp_master_config_setup_clocks_m(void)
167{
168 return 0x7 << 20;
169}
170static inline u32 pri_ringstation_fbp_master_config_setup_clocks_v(u32 r)
171{
172 return (r >> 20) & 0x7;
173}
174static inline u32 pri_ringstation_fbp_master_config_setup_clocks_i_v(void)
175{
176 return 0x00000000;
177}
178static inline u32 pri_ringstation_fbp_master_config_setup_clocks_i_f(void)
179{
180 return 0x0;
181}
182static inline u32 pri_ringstation_fbp_master_config_wait_clocks_s(void)
183{
184 return 3;
185}
186static inline u32 pri_ringstation_fbp_master_config_wait_clocks_f(u32 v)
187{
188 return (v & 0x7) << 24;
189}
190static inline u32 pri_ringstation_fbp_master_config_wait_clocks_m(void)
191{
192 return 0x7 << 24;
193}
194static inline u32 pri_ringstation_fbp_master_config_wait_clocks_v(u32 r)
195{
196 return (r >> 24) & 0x7;
197}
198static inline u32 pri_ringstation_fbp_master_config_wait_clocks_i_v(void)
199{
200 return 0x00000000;
201}
202static inline u32 pri_ringstation_fbp_master_config_wait_clocks_i_f(void)
203{
204 return 0x0;
205}
206static inline u32 pri_ringstation_fbp_master_config_hold_clocks_s(void)
207{
208 return 3;
209}
210static inline u32 pri_ringstation_fbp_master_config_hold_clocks_f(u32 v)
211{
212 return (v & 0x7) << 27;
213}
214static inline u32 pri_ringstation_fbp_master_config_hold_clocks_m(void)
215{
216 return 0x7 << 27;
217}
218static inline u32 pri_ringstation_fbp_master_config_hold_clocks_v(u32 r)
219{
220 return (r >> 27) & 0x7;
221}
222static inline u32 pri_ringstation_fbp_master_config_hold_clocks_i_v(void)
223{
224 return 0x00000000;
225}
226static inline u32 pri_ringstation_fbp_master_config_hold_clocks_i_f(void)
227{
228 return 0x0;
229}
230
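/*
 * Illustrative sketch (not part of the generated header): compose a
 * master_config value from the field helpers above, combining the timeout_i
 * constant (presumably the init/default timeout) with "soldier on" for both
 * the fs and reset actions.  This is an example combination, not a
 * recommended programming.
 */
static inline u32 pri_ringstation_fbp_master_config_example(void)
{
	return pri_ringstation_fbp_master_config_timeout_i_f() |
	       pri_ringstation_fbp_master_config_fs_action_soldier_on_f() |
	       pri_ringstation_fbp_master_config_reset_action_soldier_on_f();
}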
231#endif /* __hw_pri_ringstation_fbp_gk20a_h__ */
diff --git a/include/gk20a/hw_pri_ringstation_gpc_gk20a.h b/include/gk20a/hw_pri_ringstation_gpc_gk20a.h
new file mode 100644
index 0000000..6b57429
--- /dev/null
+++ b/include/gk20a/hw_pri_ringstation_gpc_gk20a.h
@@ -0,0 +1,79 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_pri_ringstation_gpc_gk20a_h_
57#define _hw_pri_ringstation_gpc_gk20a_h_
58
59static inline u32 pri_ringstation_gpc_master_config_r(u32 i)
60{
61 return 0x00128300U + i*4U;
62}
63static inline u32 pri_ringstation_gpc_gpc0_priv_error_adr_r(void)
64{
65 return 0x00128120U;
66}
67static inline u32 pri_ringstation_gpc_gpc0_priv_error_wrdat_r(void)
68{
69 return 0x00128124U;
70}
71static inline u32 pri_ringstation_gpc_gpc0_priv_error_info_r(void)
72{
73 return 0x00128128U;
74}
75static inline u32 pri_ringstation_gpc_gpc0_priv_error_code_r(void)
76{
77 return 0x0012812cU;
78}
79#endif
diff --git a/include/gk20a/hw_pri_ringstation_sys_gk20a.h b/include/gk20a/hw_pri_ringstation_sys_gk20a.h
new file mode 100644
index 0000000..e4d5c3b
--- /dev/null
+++ b/include/gk20a/hw_pri_ringstation_sys_gk20a.h
@@ -0,0 +1,91 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_pri_ringstation_sys_gk20a_h_
57#define _hw_pri_ringstation_sys_gk20a_h_
58
59static inline u32 pri_ringstation_sys_master_config_r(u32 i)
60{
61 return 0x00122300U + i*4U;
62}
63static inline u32 pri_ringstation_sys_decode_config_r(void)
64{
65 return 0x00122204U;
66}
67static inline u32 pri_ringstation_sys_decode_config_ring_m(void)
68{
69 return 0x7U << 0U;
70}
71static inline u32 pri_ringstation_sys_decode_config_ring_drop_on_ring_not_started_f(void)
72{
73 return 0x1U;
74}
75static inline u32 pri_ringstation_sys_priv_error_adr_r(void)
76{
77 return 0x00122120U;
78}
79static inline u32 pri_ringstation_sys_priv_error_wrdat_r(void)
80{
81 return 0x00122124U;
82}
83static inline u32 pri_ringstation_sys_priv_error_info_r(void)
84{
85 return 0x00122128U;
86}
87static inline u32 pri_ringstation_sys_priv_error_code_r(void)
88{
89 return 0x0012212cU;
90}
91#endif
diff --git a/include/gk20a/hw_proj_gk20a.h b/include/gk20a/hw_proj_gk20a.h
new file mode 100644
index 0000000..10509ca
--- /dev/null
+++ b/include/gk20a/hw_proj_gk20a.h
@@ -0,0 +1,167 @@
1/*
2 * Copyright (c) 2012-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_proj_gk20a_h_
57#define _hw_proj_gk20a_h_
58
59static inline u32 proj_gpc_base_v(void)
60{
61 return 0x00500000U;
62}
63static inline u32 proj_gpc_shared_base_v(void)
64{
65 return 0x00418000U;
66}
67static inline u32 proj_gpc_stride_v(void)
68{
69 return 0x00008000U;
70}
71static inline u32 proj_gpc_priv_stride_v(void)
72{
73 return 0x00000800U;
74}
75static inline u32 proj_ltc_stride_v(void)
76{
77 return 0x00002000U;
78}
79static inline u32 proj_lts_stride_v(void)
80{
81 return 0x00000400U;
82}
83static inline u32 proj_fbpa_stride_v(void)
84{
85 return 0x00001000U;
86}
87static inline u32 proj_ppc_in_gpc_base_v(void)
88{
89 return 0x00003000U;
90}
91static inline u32 proj_ppc_in_gpc_shared_base_v(void)
92{
93 return 0x00003e00U;
94}
95static inline u32 proj_ppc_in_gpc_stride_v(void)
96{
97 return 0x00000200U;
98}
99static inline u32 proj_rop_base_v(void)
100{
101 return 0x00410000U;
102}
103static inline u32 proj_rop_shared_base_v(void)
104{
105 return 0x00408800U;
106}
107static inline u32 proj_rop_stride_v(void)
108{
109 return 0x00000400U;
110}
111static inline u32 proj_tpc_in_gpc_base_v(void)
112{
113 return 0x00004000U;
114}
115static inline u32 proj_tpc_in_gpc_stride_v(void)
116{
117 return 0x00000800U;
118}
119static inline u32 proj_tpc_in_gpc_shared_base_v(void)
120{
121 return 0x00001800U;
122}
123static inline u32 proj_host_num_engines_v(void)
124{
125 return 0x00000002U;
126}
127static inline u32 proj_host_num_pbdma_v(void)
128{
129 return 0x00000001U;
130}
131static inline u32 proj_scal_litter_num_tpc_per_gpc_v(void)
132{
133 return 0x00000001U;
134}
135static inline u32 proj_scal_litter_num_fbps_v(void)
136{
137 return 0x00000001U;
138}
139static inline u32 proj_scal_litter_num_fbpas_v(void)
140{
141 return 0x00000001U;
142}
143static inline u32 proj_scal_litter_num_gpcs_v(void)
144{
145 return 0x00000001U;
146}
147static inline u32 proj_scal_litter_num_pes_per_gpc_v(void)
148{
149 return 0x00000001U;
150}
151static inline u32 proj_scal_litter_num_tpcs_per_pes_v(void)
152{
153 return 0x00000001U;
154}
155static inline u32 proj_scal_litter_num_zcull_banks_v(void)
156{
157 return 0x00000004U;
158}
159static inline u32 proj_scal_max_gpcs_v(void)
160{
161 return 0x00000020U;
162}
163static inline u32 proj_scal_max_tpc_per_gpc_v(void)
164{
165 return 0x00000008U;
166}
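/*
 * Illustrative sketch (not part of the generated header): these project
 * constants are normally combined as base + index * stride.  Computing the
 * byte offset of a given GPC, and of a TPC within that GPC, this way is an
 * assumption based on how the base/stride constants are named.
 */
static inline u32 proj_example_gpc_offset(u32 gpc)
{
	return proj_gpc_base_v() + gpc * proj_gpc_stride_v();
}
static inline u32 proj_example_tpc_in_gpc_offset(u32 gpc, u32 tpc)
{
	return proj_example_gpc_offset(gpc) +
	       proj_tpc_in_gpc_base_v() + tpc * proj_tpc_in_gpc_stride_v();
}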
167#endif
diff --git a/include/gk20a/hw_pwr_gk20a.h b/include/gk20a/hw_pwr_gk20a.h
new file mode 100644
index 0000000..2845763
--- /dev/null
+++ b/include/gk20a/hw_pwr_gk20a.h
@@ -0,0 +1,823 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_pwr_gk20a_h_
57#define _hw_pwr_gk20a_h_
58
59static inline u32 pwr_falcon_irqsset_r(void)
60{
61 return 0x0010a000U;
62}
63static inline u32 pwr_falcon_irqsset_swgen0_set_f(void)
64{
65 return 0x40U;
66}
67static inline u32 pwr_falcon_irqsclr_r(void)
68{
69 return 0x0010a004U;
70}
71static inline u32 pwr_falcon_irqstat_r(void)
72{
73 return 0x0010a008U;
74}
75static inline u32 pwr_falcon_irqstat_halt_true_f(void)
76{
77 return 0x10U;
78}
79static inline u32 pwr_falcon_irqstat_exterr_true_f(void)
80{
81 return 0x20U;
82}
83static inline u32 pwr_falcon_irqstat_swgen0_true_f(void)
84{
85 return 0x40U;
86}
87static inline u32 pwr_falcon_irqmode_r(void)
88{
89 return 0x0010a00cU;
90}
91static inline u32 pwr_falcon_irqmset_r(void)
92{
93 return 0x0010a010U;
94}
95static inline u32 pwr_falcon_irqmset_gptmr_f(u32 v)
96{
97 return (v & 0x1U) << 0U;
98}
99static inline u32 pwr_falcon_irqmset_wdtmr_f(u32 v)
100{
101 return (v & 0x1U) << 1U;
102}
103static inline u32 pwr_falcon_irqmset_mthd_f(u32 v)
104{
105 return (v & 0x1U) << 2U;
106}
107static inline u32 pwr_falcon_irqmset_ctxsw_f(u32 v)
108{
109 return (v & 0x1U) << 3U;
110}
111static inline u32 pwr_falcon_irqmset_halt_f(u32 v)
112{
113 return (v & 0x1U) << 4U;
114}
115static inline u32 pwr_falcon_irqmset_exterr_f(u32 v)
116{
117 return (v & 0x1U) << 5U;
118}
119static inline u32 pwr_falcon_irqmset_swgen0_f(u32 v)
120{
121 return (v & 0x1U) << 6U;
122}
123static inline u32 pwr_falcon_irqmset_swgen1_f(u32 v)
124{
125 return (v & 0x1U) << 7U;
126}
127static inline u32 pwr_falcon_irqmclr_r(void)
128{
129 return 0x0010a014U;
130}
131static inline u32 pwr_falcon_irqmclr_gptmr_f(u32 v)
132{
133 return (v & 0x1U) << 0U;
134}
135static inline u32 pwr_falcon_irqmclr_wdtmr_f(u32 v)
136{
137 return (v & 0x1U) << 1U;
138}
139static inline u32 pwr_falcon_irqmclr_mthd_f(u32 v)
140{
141 return (v & 0x1U) << 2U;
142}
143static inline u32 pwr_falcon_irqmclr_ctxsw_f(u32 v)
144{
145 return (v & 0x1U) << 3U;
146}
147static inline u32 pwr_falcon_irqmclr_halt_f(u32 v)
148{
149 return (v & 0x1U) << 4U;
150}
151static inline u32 pwr_falcon_irqmclr_exterr_f(u32 v)
152{
153 return (v & 0x1U) << 5U;
154}
155static inline u32 pwr_falcon_irqmclr_swgen0_f(u32 v)
156{
157 return (v & 0x1U) << 6U;
158}
159static inline u32 pwr_falcon_irqmclr_swgen1_f(u32 v)
160{
161 return (v & 0x1U) << 7U;
162}
163static inline u32 pwr_falcon_irqmclr_ext_f(u32 v)
164{
165 return (v & 0xffU) << 8U;
166}
167static inline u32 pwr_falcon_irqmask_r(void)
168{
169 return 0x0010a018U;
170}
171static inline u32 pwr_falcon_irqdest_r(void)
172{
173 return 0x0010a01cU;
174}
175static inline u32 pwr_falcon_irqdest_host_gptmr_f(u32 v)
176{
177 return (v & 0x1U) << 0U;
178}
179static inline u32 pwr_falcon_irqdest_host_wdtmr_f(u32 v)
180{
181 return (v & 0x1U) << 1U;
182}
183static inline u32 pwr_falcon_irqdest_host_mthd_f(u32 v)
184{
185 return (v & 0x1U) << 2U;
186}
187static inline u32 pwr_falcon_irqdest_host_ctxsw_f(u32 v)
188{
189 return (v & 0x1U) << 3U;
190}
191static inline u32 pwr_falcon_irqdest_host_halt_f(u32 v)
192{
193 return (v & 0x1U) << 4U;
194}
195static inline u32 pwr_falcon_irqdest_host_exterr_f(u32 v)
196{
197 return (v & 0x1U) << 5U;
198}
199static inline u32 pwr_falcon_irqdest_host_swgen0_f(u32 v)
200{
201 return (v & 0x1U) << 6U;
202}
203static inline u32 pwr_falcon_irqdest_host_swgen1_f(u32 v)
204{
205 return (v & 0x1U) << 7U;
206}
207static inline u32 pwr_falcon_irqdest_host_ext_f(u32 v)
208{
209 return (v & 0xffU) << 8U;
210}
211static inline u32 pwr_falcon_irqdest_target_gptmr_f(u32 v)
212{
213 return (v & 0x1U) << 16U;
214}
215static inline u32 pwr_falcon_irqdest_target_wdtmr_f(u32 v)
216{
217 return (v & 0x1U) << 17U;
218}
219static inline u32 pwr_falcon_irqdest_target_mthd_f(u32 v)
220{
221 return (v & 0x1U) << 18U;
222}
223static inline u32 pwr_falcon_irqdest_target_ctxsw_f(u32 v)
224{
225 return (v & 0x1U) << 19U;
226}
227static inline u32 pwr_falcon_irqdest_target_halt_f(u32 v)
228{
229 return (v & 0x1U) << 20U;
230}
231static inline u32 pwr_falcon_irqdest_target_exterr_f(u32 v)
232{
233 return (v & 0x1U) << 21U;
234}
235static inline u32 pwr_falcon_irqdest_target_swgen0_f(u32 v)
236{
237 return (v & 0x1U) << 22U;
238}
239static inline u32 pwr_falcon_irqdest_target_swgen1_f(u32 v)
240{
241 return (v & 0x1U) << 23U;
242}
243static inline u32 pwr_falcon_irqdest_target_ext_f(u32 v)
244{
245 return (v & 0xffU) << 24U;
246}
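/*
 * Illustrative sketch (not part of the generated header): build an IRQMSET
 * value that unmasks the halt, exterr, swgen0 and swgen1 interrupts, and a
 * matching IRQDEST value that routes them to the host.  The particular set
 * of interrupts is an example, not the driver's actual PMU configuration.
 */
static inline u32 pwr_falcon_example_irqmset(void)
{
	return pwr_falcon_irqmset_halt_f(1U) |
	       pwr_falcon_irqmset_exterr_f(1U) |
	       pwr_falcon_irqmset_swgen0_f(1U) |
	       pwr_falcon_irqmset_swgen1_f(1U);
}
static inline u32 pwr_falcon_example_irqdest_host(void)
{
	return pwr_falcon_irqdest_host_halt_f(1U) |
	       pwr_falcon_irqdest_host_exterr_f(1U) |
	       pwr_falcon_irqdest_host_swgen0_f(1U) |
	       pwr_falcon_irqdest_host_swgen1_f(1U);
}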
247static inline u32 pwr_falcon_curctx_r(void)
248{
249 return 0x0010a050U;
250}
251static inline u32 pwr_falcon_nxtctx_r(void)
252{
253 return 0x0010a054U;
254}
255static inline u32 pwr_falcon_mailbox0_r(void)
256{
257 return 0x0010a040U;
258}
259static inline u32 pwr_falcon_mailbox1_r(void)
260{
261 return 0x0010a044U;
262}
263static inline u32 pwr_falcon_itfen_r(void)
264{
265 return 0x0010a048U;
266}
267static inline u32 pwr_falcon_itfen_ctxen_enable_f(void)
268{
269 return 0x1U;
270}
271static inline u32 pwr_falcon_idlestate_r(void)
272{
273 return 0x0010a04cU;
274}
275static inline u32 pwr_falcon_idlestate_falcon_busy_v(u32 r)
276{
277 return (r >> 0U) & 0x1U;
278}
279static inline u32 pwr_falcon_idlestate_ext_busy_v(u32 r)
280{
281 return (r >> 1U) & 0x7fffU;
282}
283static inline u32 pwr_falcon_os_r(void)
284{
285 return 0x0010a080U;
286}
287static inline u32 pwr_falcon_engctl_r(void)
288{
289 return 0x0010a0a4U;
290}
291static inline u32 pwr_falcon_cpuctl_r(void)
292{
293 return 0x0010a100U;
294}
295static inline u32 pwr_falcon_cpuctl_startcpu_f(u32 v)
296{
297 return (v & 0x1U) << 1U;
298}
299static inline u32 pwr_falcon_cpuctl_halt_intr_f(u32 v)
300{
301 return (v & 0x1U) << 4U;
302}
303static inline u32 pwr_falcon_cpuctl_halt_intr_m(void)
304{
305 return 0x1U << 4U;
306}
307static inline u32 pwr_falcon_cpuctl_halt_intr_v(u32 r)
308{
309 return (r >> 4U) & 0x1U;
310}
311static inline u32 pwr_falcon_imemc_r(u32 i)
312{
313 return 0x0010a180U + i*16U;
314}
315static inline u32 pwr_falcon_imemc_offs_f(u32 v)
316{
317 return (v & 0x3fU) << 2U;
318}
319static inline u32 pwr_falcon_imemc_blk_f(u32 v)
320{
321 return (v & 0xffU) << 8U;
322}
323static inline u32 pwr_falcon_imemc_aincw_f(u32 v)
324{
325 return (v & 0x1U) << 24U;
326}
327static inline u32 pwr_falcon_imemd_r(u32 i)
328{
329 return 0x0010a184U + i*16U;
330}
331static inline u32 pwr_falcon_imemt_r(u32 i)
332{
333 return 0x0010a188U + i*16U;
334}
335static inline u32 pwr_falcon_bootvec_r(void)
336{
337 return 0x0010a104U;
338}
339static inline u32 pwr_falcon_bootvec_vec_f(u32 v)
340{
341 return (v & 0xffffffffU) << 0U;
342}
343static inline u32 pwr_falcon_dmactl_r(void)
344{
345 return 0x0010a10cU;
346}
347static inline u32 pwr_falcon_dmactl_dmem_scrubbing_m(void)
348{
349 return 0x1U << 1U;
350}
351static inline u32 pwr_falcon_dmactl_imem_scrubbing_m(void)
352{
353 return 0x1U << 2U;
354}
355static inline u32 pwr_falcon_hwcfg_r(void)
356{
357 return 0x0010a108U;
358}
359static inline u32 pwr_falcon_hwcfg_imem_size_v(u32 r)
360{
361 return (r >> 0U) & 0x1ffU;
362}
363static inline u32 pwr_falcon_hwcfg_dmem_size_v(u32 r)
364{
365 return (r >> 9U) & 0x1ffU;
366}
367static inline u32 pwr_falcon_dmatrfbase_r(void)
368{
369 return 0x0010a110U;
370}
371static inline u32 pwr_falcon_dmatrfmoffs_r(void)
372{
373 return 0x0010a114U;
374}
375static inline u32 pwr_falcon_dmatrfcmd_r(void)
376{
377 return 0x0010a118U;
378}
379static inline u32 pwr_falcon_dmatrfcmd_imem_f(u32 v)
380{
381 return (v & 0x1U) << 4U;
382}
383static inline u32 pwr_falcon_dmatrfcmd_write_f(u32 v)
384{
385 return (v & 0x1U) << 5U;
386}
387static inline u32 pwr_falcon_dmatrfcmd_size_f(u32 v)
388{
389 return (v & 0x7U) << 8U;
390}
391static inline u32 pwr_falcon_dmatrfcmd_ctxdma_f(u32 v)
392{
393 return (v & 0x7U) << 12U;
394}
395static inline u32 pwr_falcon_dmatrffboffs_r(void)
396{
397 return 0x0010a11cU;
398}
399static inline u32 pwr_falcon_exterraddr_r(void)
400{
401 return 0x0010a168U;
402}
403static inline u32 pwr_falcon_exterrstat_r(void)
404{
405 return 0x0010a16cU;
406}
407static inline u32 pwr_falcon_exterrstat_valid_m(void)
408{
409 return 0x1U << 31U;
410}
411static inline u32 pwr_falcon_exterrstat_valid_v(u32 r)
412{
413 return (r >> 31U) & 0x1U;
414}
415static inline u32 pwr_falcon_exterrstat_valid_true_v(void)
416{
417 return 0x00000001U;
418}
419static inline u32 pwr_pmu_falcon_icd_cmd_r(void)
420{
421 return 0x0010a200U;
422}
423static inline u32 pwr_pmu_falcon_icd_cmd_opc_s(void)
424{
425 return 4U;
426}
427static inline u32 pwr_pmu_falcon_icd_cmd_opc_f(u32 v)
428{
429 return (v & 0xfU) << 0U;
430}
431static inline u32 pwr_pmu_falcon_icd_cmd_opc_m(void)
432{
433 return 0xfU << 0U;
434}
435static inline u32 pwr_pmu_falcon_icd_cmd_opc_v(u32 r)
436{
437 return (r >> 0U) & 0xfU;
438}
439static inline u32 pwr_pmu_falcon_icd_cmd_opc_rreg_f(void)
440{
441 return 0x8U;
442}
443static inline u32 pwr_pmu_falcon_icd_cmd_opc_rstat_f(void)
444{
445 return 0xeU;
446}
447static inline u32 pwr_pmu_falcon_icd_cmd_idx_f(u32 v)
448{
449 return (v & 0x1fU) << 8U;
450}
451static inline u32 pwr_pmu_falcon_icd_rdata_r(void)
452{
453 return 0x0010a20cU;
454}
455static inline u32 pwr_falcon_dmemc_r(u32 i)
456{
457 return 0x0010a1c0U + i*8U;
458}
459static inline u32 pwr_falcon_dmemc_offs_f(u32 v)
460{
461 return (v & 0x3fU) << 2U;
462}
463static inline u32 pwr_falcon_dmemc_offs_m(void)
464{
465 return 0x3fU << 2U;
466}
467static inline u32 pwr_falcon_dmemc_blk_f(u32 v)
468{
469 return (v & 0xffU) << 8U;
470}
471static inline u32 pwr_falcon_dmemc_blk_m(void)
472{
473 return 0xffU << 8U;
474}
475static inline u32 pwr_falcon_dmemc_aincw_f(u32 v)
476{
477 return (v & 0x1U) << 24U;
478}
479static inline u32 pwr_falcon_dmemc_aincr_f(u32 v)
480{
481 return (v & 0x1U) << 25U;
482}
483static inline u32 pwr_falcon_dmemd_r(u32 i)
484{
485 return 0x0010a1c4U + i*8U;
486}
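/*
 * Illustrative sketch (not part of the generated header): build the DMEMC
 * access word for a byte offset into falcon DMEM, with auto-increment
 * enabled for writes.  Splitting the offset into a 256-byte block number
 * and a word offset within the block follows from the field widths above,
 * but is an assumption made here.
 */
static inline u32 pwr_falcon_example_dmemc_wr(u32 byte_offset)
{
	return pwr_falcon_dmemc_offs_f(byte_offset >> 2U) |
	       pwr_falcon_dmemc_blk_f(byte_offset >> 8U) |
	       pwr_falcon_dmemc_aincw_f(1U);
}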
487static inline u32 pwr_pmu_new_instblk_r(void)
488{
489 return 0x0010a480U;
490}
491static inline u32 pwr_pmu_new_instblk_ptr_f(u32 v)
492{
493 return (v & 0xfffffffU) << 0U;
494}
495static inline u32 pwr_pmu_new_instblk_target_fb_f(void)
496{
497 return 0x0U;
498}
499static inline u32 pwr_pmu_new_instblk_target_sys_coh_f(void)
500{
501 return 0x20000000U;
502}
503static inline u32 pwr_pmu_new_instblk_target_sys_ncoh_f(void)
504{
505 return 0x30000000U;
506}
507static inline u32 pwr_pmu_new_instblk_valid_f(u32 v)
508{
509 return (v & 0x1U) << 30U;
510}
511static inline u32 pwr_pmu_mutex_id_r(void)
512{
513 return 0x0010a488U;
514}
515static inline u32 pwr_pmu_mutex_id_value_v(u32 r)
516{
517 return (r >> 0U) & 0xffU;
518}
519static inline u32 pwr_pmu_mutex_id_value_init_v(void)
520{
521 return 0x00000000U;
522}
523static inline u32 pwr_pmu_mutex_id_value_not_avail_v(void)
524{
525 return 0x000000ffU;
526}
527static inline u32 pwr_pmu_mutex_id_release_r(void)
528{
529 return 0x0010a48cU;
530}
531static inline u32 pwr_pmu_mutex_id_release_value_f(u32 v)
532{
533 return (v & 0xffU) << 0U;
534}
535static inline u32 pwr_pmu_mutex_id_release_value_m(void)
536{
537 return 0xffU << 0U;
538}
539static inline u32 pwr_pmu_mutex_id_release_value_init_v(void)
540{
541 return 0x00000000U;
542}
543static inline u32 pwr_pmu_mutex_id_release_value_init_f(void)
544{
545 return 0x0U;
546}
547static inline u32 pwr_pmu_mutex_r(u32 i)
548{
549 return 0x0010a580U + i*4U;
550}
551static inline u32 pwr_pmu_mutex__size_1_v(void)
552{
553 return 0x00000010U;
554}
555static inline u32 pwr_pmu_mutex_value_f(u32 v)
556{
557 return (v & 0xffU) << 0U;
558}
559static inline u32 pwr_pmu_mutex_value_v(u32 r)
560{
561 return (r >> 0U) & 0xffU;
562}
563static inline u32 pwr_pmu_mutex_value_initial_lock_f(void)
564{
565 return 0x0U;
566}
567static inline u32 pwr_pmu_queue_head_r(u32 i)
568{
569 return 0x0010a4a0U + i*4U;
570}
571static inline u32 pwr_pmu_queue_head__size_1_v(void)
572{
573 return 0x00000004U;
574}
575static inline u32 pwr_pmu_queue_head_address_f(u32 v)
576{
577 return (v & 0xffffffffU) << 0U;
578}
579static inline u32 pwr_pmu_queue_head_address_v(u32 r)
580{
581 return (r >> 0U) & 0xffffffffU;
582}
583static inline u32 pwr_pmu_queue_tail_r(u32 i)
584{
585 return 0x0010a4b0U + i*4U;
586}
587static inline u32 pwr_pmu_queue_tail__size_1_v(void)
588{
589 return 0x00000004U;
590}
591static inline u32 pwr_pmu_queue_tail_address_f(u32 v)
592{
593 return (v & 0xffffffffU) << 0U;
594}
595static inline u32 pwr_pmu_queue_tail_address_v(u32 r)
596{
597 return (r >> 0U) & 0xffffffffU;
598}
599static inline u32 pwr_pmu_msgq_head_r(void)
600{
601 return 0x0010a4c8U;
602}
603static inline u32 pwr_pmu_msgq_head_val_f(u32 v)
604{
605 return (v & 0xffffffffU) << 0U;
606}
607static inline u32 pwr_pmu_msgq_head_val_v(u32 r)
608{
609 return (r >> 0U) & 0xffffffffU;
610}
611static inline u32 pwr_pmu_msgq_tail_r(void)
612{
613 return 0x0010a4ccU;
614}
615static inline u32 pwr_pmu_msgq_tail_val_f(u32 v)
616{
617 return (v & 0xffffffffU) << 0U;
618}
619static inline u32 pwr_pmu_msgq_tail_val_v(u32 r)
620{
621 return (r >> 0U) & 0xffffffffU;
622}
623static inline u32 pwr_pmu_idle_mask_r(u32 i)
624{
625 return 0x0010a504U + i*16U;
626}
627static inline u32 pwr_pmu_idle_mask_gr_enabled_f(void)
628{
629 return 0x1U;
630}
631static inline u32 pwr_pmu_idle_mask_ce_2_enabled_f(void)
632{
633 return 0x200000U;
634}
635static inline u32 pwr_pmu_idle_count_r(u32 i)
636{
637 return 0x0010a508U + i*16U;
638}
639static inline u32 pwr_pmu_idle_count_value_f(u32 v)
640{
641 return (v & 0x7fffffffU) << 0U;
642}
643static inline u32 pwr_pmu_idle_count_value_v(u32 r)
644{
645 return (r >> 0U) & 0x7fffffffU;
646}
647static inline u32 pwr_pmu_idle_count_reset_f(u32 v)
648{
649 return (v & 0x1U) << 31U;
650}
651static inline u32 pwr_pmu_idle_ctrl_r(u32 i)
652{
653 return 0x0010a50cU + i*16U;
654}
655static inline u32 pwr_pmu_idle_ctrl_value_m(void)
656{
657 return 0x3U << 0U;
658}
659static inline u32 pwr_pmu_idle_ctrl_value_busy_f(void)
660{
661 return 0x2U;
662}
663static inline u32 pwr_pmu_idle_ctrl_value_always_f(void)
664{
665 return 0x3U;
666}
667static inline u32 pwr_pmu_idle_ctrl_filter_m(void)
668{
669 return 0x1U << 2U;
670}
671static inline u32 pwr_pmu_idle_ctrl_filter_disabled_f(void)
672{
673 return 0x0U;
674}
675static inline u32 pwr_pmu_idle_threshold_r(u32 i)
676{
677 return 0x0010a8a0U + i*4U;
678}
679static inline u32 pwr_pmu_idle_threshold_value_f(u32 v)
680{
681 return (v & 0x7fffffffU) << 0U;
682}
683static inline u32 pwr_pmu_idle_intr_r(void)
684{
685 return 0x0010a9e8U;
686}
687static inline u32 pwr_pmu_idle_intr_en_f(u32 v)
688{
689 return (v & 0x1U) << 0U;
690}
691static inline u32 pwr_pmu_idle_intr_en_disabled_v(void)
692{
693 return 0x00000000U;
694}
695static inline u32 pwr_pmu_idle_intr_en_enabled_v(void)
696{
697 return 0x00000001U;
698}
699static inline u32 pwr_pmu_idle_intr_status_r(void)
700{
701 return 0x0010a9ecU;
702}
703static inline u32 pwr_pmu_idle_intr_status_intr_f(u32 v)
704{
705 return (v & 0x1U) << 0U;
706}
707static inline u32 pwr_pmu_idle_intr_status_intr_m(void)
708{
709 return U32(0x1U) << 0U;
710}
711static inline u32 pwr_pmu_idle_intr_status_intr_v(u32 r)
712{
713 return (r >> 0U) & 0x1U;
714}
715static inline u32 pwr_pmu_idle_mask_supp_r(u32 i)
716{
717 return 0x0010a9f0U + i*8U;
718}
719static inline u32 pwr_pmu_idle_mask_1_supp_r(u32 i)
720{
721 return 0x0010a9f4U + i*8U;
722}
723static inline u32 pwr_pmu_idle_ctrl_supp_r(u32 i)
724{
725 return 0x0010aa30U + i*8U;
726}
727static inline u32 pwr_pmu_debug_r(u32 i)
728{
729 return 0x0010a5c0U + i*4U;
730}
731static inline u32 pwr_pmu_debug__size_1_v(void)
732{
733 return 0x00000004U;
734}
735static inline u32 pwr_pmu_mailbox_r(u32 i)
736{
737 return 0x0010a450U + i*4U;
738}
739static inline u32 pwr_pmu_mailbox__size_1_v(void)
740{
741 return 0x0000000cU;
742}
743static inline u32 pwr_pmu_bar0_addr_r(void)
744{
745 return 0x0010a7a0U;
746}
747static inline u32 pwr_pmu_bar0_data_r(void)
748{
749 return 0x0010a7a4U;
750}
751static inline u32 pwr_pmu_bar0_ctl_r(void)
752{
753 return 0x0010a7acU;
754}
755static inline u32 pwr_pmu_bar0_timeout_r(void)
756{
757 return 0x0010a7a8U;
758}
759static inline u32 pwr_pmu_bar0_fecs_error_r(void)
760{
761 return 0x0010a988U;
762}
763static inline u32 pwr_pmu_bar0_error_status_r(void)
764{
765 return 0x0010a7b0U;
766}
767static inline u32 pwr_pmu_pg_idlefilth_r(u32 i)
768{
769 return 0x0010a6c0U + i*4U;
770}
771static inline u32 pwr_pmu_pg_ppuidlefilth_r(u32 i)
772{
773 return 0x0010a6e8U + i*4U;
774}
775static inline u32 pwr_pmu_pg_idle_cnt_r(u32 i)
776{
777 return 0x0010a710U + i*4U;
778}
779static inline u32 pwr_pmu_pg_intren_r(u32 i)
780{
781 return 0x0010a760U + i*4U;
782}
783static inline u32 pwr_fbif_transcfg_r(u32 i)
784{
785 return 0x0010a600U + i*4U;
786}
787static inline u32 pwr_fbif_transcfg_target_local_fb_f(void)
788{
789 return 0x0U;
790}
791static inline u32 pwr_fbif_transcfg_target_coherent_sysmem_f(void)
792{
793 return 0x1U;
794}
795static inline u32 pwr_fbif_transcfg_target_noncoherent_sysmem_f(void)
796{
797 return 0x2U;
798}
799static inline u32 pwr_fbif_transcfg_mem_type_s(void)
800{
801 return 1U;
802}
803static inline u32 pwr_fbif_transcfg_mem_type_f(u32 v)
804{
805 return (v & 0x1U) << 2U;
806}
807static inline u32 pwr_fbif_transcfg_mem_type_m(void)
808{
809 return 0x1U << 2U;
810}
811static inline u32 pwr_fbif_transcfg_mem_type_v(u32 r)
812{
813 return (r >> 2U) & 0x1U;
814}
815static inline u32 pwr_fbif_transcfg_mem_type_virtual_f(void)
816{
817 return 0x0U;
818}
819static inline u32 pwr_fbif_transcfg_mem_type_physical_f(void)
820{
821 return 0x4U;
822}
823#endif
diff --git a/include/gk20a/hw_ram_gk20a.h b/include/gk20a/hw_ram_gk20a.h
new file mode 100644
index 0000000..ed385d9
--- /dev/null
+++ b/include/gk20a/hw_ram_gk20a.h
@@ -0,0 +1,443 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_ram_gk20a_h_
57#define _hw_ram_gk20a_h_
58
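/*
 * Usage sketch (illustrative only, not part of the generated header):
 * building the page-directory-base words of an instance block with the
 * helpers defined below. 'g', 'inst_block' and 'pdb_addr' are assumed to
 * come from the caller; the driver code that actually does this is
 * gk20a_mm_init_pdb() in mm_gk20a.c.
 *
 *   u32 pdb_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
 *
 *   nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(),
 *                  ram_in_page_dir_base_target_vid_mem_f() |
 *                  ram_in_page_dir_base_vol_true_f() |
 *                  ram_in_page_dir_base_lo_f(pdb_lo));
 *   nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_hi_w(),
 *                  ram_in_page_dir_base_hi_f(u64_hi32(pdb_addr)));
 */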
59static inline u32 ram_in_ramfc_s(void)
60{
61 return 4096U;
62}
63static inline u32 ram_in_ramfc_w(void)
64{
65 return 0U;
66}
67static inline u32 ram_in_page_dir_base_target_f(u32 v)
68{
69 return (v & 0x3U) << 0U;
70}
71static inline u32 ram_in_page_dir_base_target_w(void)
72{
73 return 128U;
74}
75static inline u32 ram_in_page_dir_base_target_vid_mem_f(void)
76{
77 return 0x0U;
78}
79static inline u32 ram_in_page_dir_base_target_sys_mem_coh_f(void)
80{
81 return 0x2U;
82}
83static inline u32 ram_in_page_dir_base_target_sys_mem_ncoh_f(void)
84{
85 return 0x3U;
86}
87static inline u32 ram_in_page_dir_base_vol_w(void)
88{
89 return 128U;
90}
91static inline u32 ram_in_page_dir_base_vol_true_f(void)
92{
93 return 0x4U;
94}
95static inline u32 ram_in_page_dir_base_lo_f(u32 v)
96{
97 return (v & 0xfffffU) << 12U;
98}
99static inline u32 ram_in_page_dir_base_lo_w(void)
100{
101 return 128U;
102}
103static inline u32 ram_in_page_dir_base_hi_f(u32 v)
104{
105 return (v & 0xffU) << 0U;
106}
107static inline u32 ram_in_page_dir_base_hi_w(void)
108{
109 return 129U;
110}
111static inline u32 ram_in_adr_limit_lo_f(u32 v)
112{
113 return (v & 0xfffffU) << 12U;
114}
115static inline u32 ram_in_adr_limit_lo_w(void)
116{
117 return 130U;
118}
119static inline u32 ram_in_adr_limit_hi_f(u32 v)
120{
121 return (v & 0xffU) << 0U;
122}
123static inline u32 ram_in_adr_limit_hi_w(void)
124{
125 return 131U;
126}
127static inline u32 ram_in_engine_cs_w(void)
128{
129 return 132U;
130}
131static inline u32 ram_in_engine_cs_wfi_v(void)
132{
133 return 0x00000000U;
134}
135static inline u32 ram_in_engine_cs_wfi_f(void)
136{
137 return 0x0U;
138}
139static inline u32 ram_in_engine_cs_fg_v(void)
140{
141 return 0x00000001U;
142}
143static inline u32 ram_in_engine_cs_fg_f(void)
144{
145 return 0x8U;
146}
147static inline u32 ram_in_gr_cs_w(void)
148{
149 return 132U;
150}
151static inline u32 ram_in_gr_cs_wfi_f(void)
152{
153 return 0x0U;
154}
155static inline u32 ram_in_gr_wfi_target_w(void)
156{
157 return 132U;
158}
159static inline u32 ram_in_gr_wfi_mode_w(void)
160{
161 return 132U;
162}
163static inline u32 ram_in_gr_wfi_mode_physical_v(void)
164{
165 return 0x00000000U;
166}
167static inline u32 ram_in_gr_wfi_mode_physical_f(void)
168{
169 return 0x0U;
170}
171static inline u32 ram_in_gr_wfi_mode_virtual_v(void)
172{
173 return 0x00000001U;
174}
175static inline u32 ram_in_gr_wfi_mode_virtual_f(void)
176{
177 return 0x4U;
178}
179static inline u32 ram_in_gr_wfi_ptr_lo_f(u32 v)
180{
181 return (v & 0xfffffU) << 12U;
182}
183static inline u32 ram_in_gr_wfi_ptr_lo_w(void)
184{
185 return 132U;
186}
187static inline u32 ram_in_gr_wfi_ptr_hi_f(u32 v)
188{
189 return (v & 0xffU) << 0U;
190}
191static inline u32 ram_in_gr_wfi_ptr_hi_w(void)
192{
193 return 133U;
194}
195static inline u32 ram_in_base_shift_v(void)
196{
197 return 0x0000000cU;
198}
199static inline u32 ram_in_alloc_size_v(void)
200{
201 return 0x00001000U;
202}
203static inline u32 ram_fc_size_val_v(void)
204{
205 return 0x00000200U;
206}
207static inline u32 ram_fc_gp_put_w(void)
208{
209 return 0U;
210}
211static inline u32 ram_fc_userd_w(void)
212{
213 return 2U;
214}
215static inline u32 ram_fc_userd_hi_w(void)
216{
217 return 3U;
218}
219static inline u32 ram_fc_signature_w(void)
220{
221 return 4U;
222}
223static inline u32 ram_fc_gp_get_w(void)
224{
225 return 5U;
226}
227static inline u32 ram_fc_pb_get_w(void)
228{
229 return 6U;
230}
231static inline u32 ram_fc_pb_get_hi_w(void)
232{
233 return 7U;
234}
235static inline u32 ram_fc_pb_top_level_get_w(void)
236{
237 return 8U;
238}
239static inline u32 ram_fc_pb_top_level_get_hi_w(void)
240{
241 return 9U;
242}
243static inline u32 ram_fc_acquire_w(void)
244{
245 return 12U;
246}
247static inline u32 ram_fc_semaphorea_w(void)
248{
249 return 14U;
250}
251static inline u32 ram_fc_semaphoreb_w(void)
252{
253 return 15U;
254}
255static inline u32 ram_fc_semaphorec_w(void)
256{
257 return 16U;
258}
259static inline u32 ram_fc_semaphored_w(void)
260{
261 return 17U;
262}
263static inline u32 ram_fc_gp_base_w(void)
264{
265 return 18U;
266}
267static inline u32 ram_fc_gp_base_hi_w(void)
268{
269 return 19U;
270}
271static inline u32 ram_fc_gp_fetch_w(void)
272{
273 return 20U;
274}
275static inline u32 ram_fc_pb_fetch_w(void)
276{
277 return 21U;
278}
279static inline u32 ram_fc_pb_fetch_hi_w(void)
280{
281 return 22U;
282}
283static inline u32 ram_fc_pb_put_w(void)
284{
285 return 23U;
286}
287static inline u32 ram_fc_pb_put_hi_w(void)
288{
289 return 24U;
290}
291static inline u32 ram_fc_pb_header_w(void)
292{
293 return 33U;
294}
295static inline u32 ram_fc_pb_count_w(void)
296{
297 return 34U;
298}
299static inline u32 ram_fc_subdevice_w(void)
300{
301 return 37U;
302}
303static inline u32 ram_fc_formats_w(void)
304{
305 return 39U;
306}
307static inline u32 ram_fc_syncpointa_w(void)
308{
309 return 41U;
310}
311static inline u32 ram_fc_syncpointb_w(void)
312{
313 return 42U;
314}
315static inline u32 ram_fc_target_w(void)
316{
317 return 43U;
318}
319static inline u32 ram_fc_hce_ctrl_w(void)
320{
321 return 57U;
322}
323static inline u32 ram_fc_chid_w(void)
324{
325 return 58U;
326}
327static inline u32 ram_fc_chid_id_f(u32 v)
328{
329 return (v & 0xfffU) << 0U;
330}
331static inline u32 ram_fc_chid_id_w(void)
332{
333 return 0U;
334}
335static inline u32 ram_fc_runlist_timeslice_w(void)
336{
337 return 62U;
338}
339static inline u32 ram_fc_pb_timeslice_w(void)
340{
341 return 63U;
342}
343static inline u32 ram_userd_base_shift_v(void)
344{
345 return 0x00000009U;
346}
347static inline u32 ram_userd_chan_size_v(void)
348{
349 return 0x00000200U;
350}
351static inline u32 ram_userd_put_w(void)
352{
353 return 16U;
354}
355static inline u32 ram_userd_get_w(void)
356{
357 return 17U;
358}
359static inline u32 ram_userd_ref_w(void)
360{
361 return 18U;
362}
363static inline u32 ram_userd_put_hi_w(void)
364{
365 return 19U;
366}
367static inline u32 ram_userd_ref_threshold_w(void)
368{
369 return 20U;
370}
371static inline u32 ram_userd_top_level_get_w(void)
372{
373 return 22U;
374}
375static inline u32 ram_userd_top_level_get_hi_w(void)
376{
377 return 23U;
378}
379static inline u32 ram_userd_get_hi_w(void)
380{
381 return 24U;
382}
383static inline u32 ram_userd_gp_get_w(void)
384{
385 return 34U;
386}
387static inline u32 ram_userd_gp_put_w(void)
388{
389 return 35U;
390}
391static inline u32 ram_userd_gp_top_level_get_w(void)
392{
393 return 22U;
394}
395static inline u32 ram_userd_gp_top_level_get_hi_w(void)
396{
397 return 23U;
398}
399static inline u32 ram_rl_entry_size_v(void)
400{
401 return 0x00000008U;
402}
403static inline u32 ram_rl_entry_chid_f(u32 v)
404{
405 return (v & 0xfffU) << 0U;
406}
407static inline u32 ram_rl_entry_id_f(u32 v)
408{
409 return (v & 0xfffU) << 0U;
410}
411static inline u32 ram_rl_entry_type_f(u32 v)
412{
413 return (v & 0x1U) << 13U;
414}
415static inline u32 ram_rl_entry_type_chid_f(void)
416{
417 return 0x0U;
418}
419static inline u32 ram_rl_entry_type_tsg_f(void)
420{
421 return 0x2000U;
422}
423static inline u32 ram_rl_entry_timeslice_scale_f(u32 v)
424{
425 return (v & 0xfU) << 14U;
426}
427static inline u32 ram_rl_entry_timeslice_scale_3_f(void)
428{
429 return 0xc000U;
430}
431static inline u32 ram_rl_entry_timeslice_timeout_f(u32 v)
432{
433 return (v & 0xffU) << 18U;
434}
435static inline u32 ram_rl_entry_timeslice_timeout_128_f(void)
436{
437 return 0x2000000U;
438}
439static inline u32 ram_rl_entry_tsg_length_f(u32 v)
440{
441 return (v & 0x3fU) << 26U;
442}
443#endif
diff --git a/include/gk20a/hw_therm_gk20a.h b/include/gk20a/hw_therm_gk20a.h
new file mode 100644
index 0000000..075c9bc
--- /dev/null
+++ b/include/gk20a/hw_therm_gk20a.h
@@ -0,0 +1,367 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_therm_gk20a_h_
57#define _hw_therm_gk20a_h_
58
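/*
 * Usage sketch (illustrative only): a read-modify-write of the per-engine
 * gate control register using the _m()/_f() helpers defined below, i.e.
 * clear the field with its mask, then OR in the new constant. 'g' and the
 * engine index 'i' are assumptions for the example.
 *
 *   u32 gate = gk20a_readl(g, therm_gate_ctrl_r(i));
 *
 *   gate &= ~therm_gate_ctrl_eng_clk_m();
 *   gate |= therm_gate_ctrl_eng_clk_auto_f();
 *   gk20a_writel(g, therm_gate_ctrl_r(i), gate);
 */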
59static inline u32 therm_use_a_r(void)
60{
61 return 0x00020798U;
62}
63static inline u32 therm_use_a_ext_therm_0_enable_f(void)
64{
65 return 0x1U;
66}
67static inline u32 therm_use_a_ext_therm_1_enable_f(void)
68{
69 return 0x2U;
70}
71static inline u32 therm_use_a_ext_therm_2_enable_f(void)
72{
73 return 0x4U;
74}
75static inline u32 therm_evt_ext_therm_0_r(void)
76{
77 return 0x00020700U;
78}
79static inline u32 therm_evt_ext_therm_0_slow_factor_f(u32 v)
80{
81 return (v & 0x3fU) << 8U;
82}
83static inline u32 therm_evt_ext_therm_0_slow_factor_init_v(void)
84{
85 return 0x00000000U;
86}
87static inline u32 therm_evt_ext_therm_0_priority_f(u32 v)
88{
89 return (v & 0x1fU) << 24U;
90}
91static inline u32 therm_evt_ext_therm_1_r(void)
92{
93 return 0x00020704U;
94}
95static inline u32 therm_evt_ext_therm_1_slow_factor_f(u32 v)
96{
97 return (v & 0x3fU) << 8U;
98}
99static inline u32 therm_evt_ext_therm_1_slow_factor_init_v(void)
100{
101 return 0x00000000U;
102}
103static inline u32 therm_evt_ext_therm_1_priority_f(u32 v)
104{
105 return (v & 0x1fU) << 24U;
106}
107static inline u32 therm_evt_ext_therm_2_r(void)
108{
109 return 0x00020708U;
110}
111static inline u32 therm_evt_ext_therm_2_slow_factor_f(u32 v)
112{
113 return (v & 0x3fU) << 8U;
114}
115static inline u32 therm_evt_ext_therm_2_slow_factor_init_v(void)
116{
117 return 0x00000000U;
118}
119static inline u32 therm_evt_ext_therm_2_priority_f(u32 v)
120{
121 return (v & 0x1fU) << 24U;
122}
123static inline u32 therm_weight_1_r(void)
124{
125 return 0x00020024U;
126}
127static inline u32 therm_config1_r(void)
128{
129 return 0x00020050U;
130}
131static inline u32 therm_config2_r(void)
132{
133 return 0x00020130U;
134}
135static inline u32 therm_config2_slowdown_factor_extended_f(u32 v)
136{
137 return (v & 0x1U) << 24U;
138}
139static inline u32 therm_config2_grad_enable_f(u32 v)
140{
141 return (v & 0x1U) << 31U;
142}
143static inline u32 therm_gate_ctrl_r(u32 i)
144{
145 return 0x00020200U + i*4U;
146}
147static inline u32 therm_gate_ctrl_eng_clk_m(void)
148{
149 return 0x3U << 0U;
150}
151static inline u32 therm_gate_ctrl_eng_clk_run_f(void)
152{
153 return 0x0U;
154}
155static inline u32 therm_gate_ctrl_eng_clk_auto_f(void)
156{
157 return 0x1U;
158}
159static inline u32 therm_gate_ctrl_eng_clk_stop_f(void)
160{
161 return 0x2U;
162}
163static inline u32 therm_gate_ctrl_blk_clk_m(void)
164{
165 return 0x3U << 2U;
166}
167static inline u32 therm_gate_ctrl_blk_clk_run_f(void)
168{
169 return 0x0U;
170}
171static inline u32 therm_gate_ctrl_blk_clk_auto_f(void)
172{
173 return 0x4U;
174}
175static inline u32 therm_gate_ctrl_eng_pwr_m(void)
176{
177 return 0x3U << 4U;
178}
179static inline u32 therm_gate_ctrl_eng_pwr_auto_f(void)
180{
181 return 0x10U;
182}
183static inline u32 therm_gate_ctrl_eng_pwr_off_v(void)
184{
185 return 0x00000002U;
186}
187static inline u32 therm_gate_ctrl_eng_pwr_off_f(void)
188{
189 return 0x20U;
190}
191static inline u32 therm_gate_ctrl_eng_idle_filt_exp_f(u32 v)
192{
193 return (v & 0x1fU) << 8U;
194}
195static inline u32 therm_gate_ctrl_eng_idle_filt_exp_m(void)
196{
197 return 0x1fU << 8U;
198}
199static inline u32 therm_gate_ctrl_eng_idle_filt_mant_f(u32 v)
200{
201 return (v & 0x7U) << 13U;
202}
203static inline u32 therm_gate_ctrl_eng_idle_filt_mant_m(void)
204{
205 return 0x7U << 13U;
206}
207static inline u32 therm_gate_ctrl_eng_delay_before_f(u32 v)
208{
209 return (v & 0xfU) << 16U;
210}
211static inline u32 therm_gate_ctrl_eng_delay_before_m(void)
212{
213 return 0xfU << 16U;
214}
215static inline u32 therm_gate_ctrl_eng_delay_after_f(u32 v)
216{
217 return (v & 0xfU) << 20U;
218}
219static inline u32 therm_gate_ctrl_eng_delay_after_m(void)
220{
221 return 0xfU << 20U;
222}
223static inline u32 therm_fecs_idle_filter_r(void)
224{
225 return 0x00020288U;
226}
227static inline u32 therm_fecs_idle_filter_value_m(void)
228{
229 return 0xffffffffU << 0U;
230}
231static inline u32 therm_hubmmu_idle_filter_r(void)
232{
233 return 0x0002028cU;
234}
235static inline u32 therm_hubmmu_idle_filter_value_m(void)
236{
237 return 0xffffffffU << 0U;
238}
239static inline u32 therm_clk_slowdown_r(u32 i)
240{
241 return 0x00020160U + i*4U;
242}
243static inline u32 therm_clk_slowdown_idle_factor_f(u32 v)
244{
245 return (v & 0x3fU) << 16U;
246}
247static inline u32 therm_clk_slowdown_idle_factor_m(void)
248{
249 return 0x3fU << 16U;
250}
251static inline u32 therm_clk_slowdown_idle_factor_v(u32 r)
252{
253 return (r >> 16U) & 0x3fU;
254}
255static inline u32 therm_clk_slowdown_idle_factor_disabled_f(void)
256{
257 return 0x0U;
258}
259static inline u32 therm_grad_stepping_table_r(u32 i)
260{
261 return 0x000202c8U + i*4U;
262}
263static inline u32 therm_grad_stepping_table_slowdown_factor0_f(u32 v)
264{
265 return (v & 0x3fU) << 0U;
266}
267static inline u32 therm_grad_stepping_table_slowdown_factor0_m(void)
268{
269 return 0x3fU << 0U;
270}
271static inline u32 therm_grad_stepping_table_slowdown_factor0_fpdiv_by1p5_f(void)
272{
273 return 0x1U;
274}
275static inline u32 therm_grad_stepping_table_slowdown_factor0_fpdiv_by2_f(void)
276{
277 return 0x2U;
278}
279static inline u32 therm_grad_stepping_table_slowdown_factor0_fpdiv_by4_f(void)
280{
281 return 0x6U;
282}
283static inline u32 therm_grad_stepping_table_slowdown_factor0_fpdiv_by8_f(void)
284{
285 return 0xeU;
286}
287static inline u32 therm_grad_stepping_table_slowdown_factor1_f(u32 v)
288{
289 return (v & 0x3fU) << 6U;
290}
291static inline u32 therm_grad_stepping_table_slowdown_factor1_m(void)
292{
293 return 0x3fU << 6U;
294}
295static inline u32 therm_grad_stepping_table_slowdown_factor2_f(u32 v)
296{
297 return (v & 0x3fU) << 12U;
298}
299static inline u32 therm_grad_stepping_table_slowdown_factor2_m(void)
300{
301 return 0x3fU << 12U;
302}
303static inline u32 therm_grad_stepping_table_slowdown_factor3_f(u32 v)
304{
305 return (v & 0x3fU) << 18U;
306}
307static inline u32 therm_grad_stepping_table_slowdown_factor3_m(void)
308{
309 return 0x3fU << 18U;
310}
311static inline u32 therm_grad_stepping_table_slowdown_factor4_f(u32 v)
312{
313 return (v & 0x3fU) << 24U;
314}
315static inline u32 therm_grad_stepping_table_slowdown_factor4_m(void)
316{
317 return 0x3fU << 24U;
318}
319static inline u32 therm_grad_stepping0_r(void)
320{
321 return 0x000202c0U;
322}
323static inline u32 therm_grad_stepping0_feature_s(void)
324{
325 return 1U;
326}
327static inline u32 therm_grad_stepping0_feature_f(u32 v)
328{
329 return (v & 0x1U) << 0U;
330}
331static inline u32 therm_grad_stepping0_feature_m(void)
332{
333 return 0x1U << 0U;
334}
335static inline u32 therm_grad_stepping0_feature_v(u32 r)
336{
337 return (r >> 0U) & 0x1U;
338}
339static inline u32 therm_grad_stepping0_feature_enable_f(void)
340{
341 return 0x1U;
342}
343static inline u32 therm_grad_stepping1_r(void)
344{
345 return 0x000202c4U;
346}
347static inline u32 therm_grad_stepping1_pdiv_duration_f(u32 v)
348{
349 return (v & 0x1ffffU) << 0U;
350}
351static inline u32 therm_clk_timing_r(u32 i)
352{
353 return 0x000203c0U + i*4U;
354}
355static inline u32 therm_clk_timing_grad_slowdown_f(u32 v)
356{
357 return (v & 0x1U) << 16U;
358}
359static inline u32 therm_clk_timing_grad_slowdown_m(void)
360{
361 return 0x1U << 16U;
362}
363static inline u32 therm_clk_timing_grad_slowdown_enabled_f(void)
364{
365 return 0x10000U;
366}
367#endif
diff --git a/include/gk20a/hw_timer_gk20a.h b/include/gk20a/hw_timer_gk20a.h
new file mode 100644
index 0000000..972d68a
--- /dev/null
+++ b/include/gk20a/hw_timer_gk20a.h
@@ -0,0 +1,127 @@
1/*
2 * Copyright (c) 2013-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_timer_gk20a_h_
57#define _hw_timer_gk20a_h_
58
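/*
 * Usage sketch (illustrative only): programming the PRI timeout period and
 * enable bit with the _f() helpers defined below. The period value and the
 * 'g' pointer are assumptions made for the example.
 *
 *   gk20a_writel(g, timer_pri_timeout_r(),
 *                timer_pri_timeout_period_f(0x186a0U) |
 *                timer_pri_timeout_en_en_enabled_f());
 */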
59static inline u32 timer_pri_timeout_r(void)
60{
61 return 0x00009080U;
62}
63static inline u32 timer_pri_timeout_period_f(u32 v)
64{
65 return (v & 0xffffffU) << 0U;
66}
67static inline u32 timer_pri_timeout_period_m(void)
68{
69 return 0xffffffU << 0U;
70}
71static inline u32 timer_pri_timeout_period_v(u32 r)
72{
73 return (r >> 0U) & 0xffffffU;
74}
75static inline u32 timer_pri_timeout_en_f(u32 v)
76{
77 return (v & 0x1U) << 31U;
78}
79static inline u32 timer_pri_timeout_en_m(void)
80{
81 return 0x1U << 31U;
82}
83static inline u32 timer_pri_timeout_en_v(u32 r)
84{
85 return (r >> 31U) & 0x1U;
86}
87static inline u32 timer_pri_timeout_en_en_enabled_f(void)
88{
89 return 0x80000000U;
90}
91static inline u32 timer_pri_timeout_en_en_disabled_f(void)
92{
93 return 0x0U;
94}
95static inline u32 timer_pri_timeout_save_0_r(void)
96{
97 return 0x00009084U;
98}
99static inline u32 timer_pri_timeout_save_0_fecs_tgt_v(u32 r)
100{
101 return (r >> 31U) & 0x1U;
102}
103static inline u32 timer_pri_timeout_save_0_addr_v(u32 r)
104{
105 return (r >> 2U) & 0x3fffffU;
106}
107static inline u32 timer_pri_timeout_save_0_write_v(u32 r)
108{
109 return (r >> 1U) & 0x1U;
110}
111static inline u32 timer_pri_timeout_save_1_r(void)
112{
113 return 0x00009088U;
114}
115static inline u32 timer_pri_timeout_fecs_errcode_r(void)
116{
117 return 0x0000908cU;
118}
119static inline u32 timer_time_0_r(void)
120{
121 return 0x00009400U;
122}
123static inline u32 timer_time_1_r(void)
124{
125 return 0x00009410U;
126}
127#endif
diff --git a/include/gk20a/hw_top_gk20a.h b/include/gk20a/hw_top_gk20a.h
new file mode 100644
index 0000000..be7fa4a
--- /dev/null
+++ b/include/gk20a/hw_top_gk20a.h
@@ -0,0 +1,211 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_top_gk20a_h_
57#define _hw_top_gk20a_h_
58
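/*
 * Usage sketch (illustrative only): walking the device info table with the
 * _v() extractors defined below, picking out enum-type entries. 'g',
 * 'i' and 'engine_id' are assumptions for the example.
 *
 *   for (i = 0U; i < top_device_info__size_1_v(); i++) {
 *           u32 entry = gk20a_readl(g, top_device_info_r(i));
 *
 *           if (top_device_info_entry_v(entry) ==
 *                           top_device_info_entry_enum_v())
 *                   engine_id = top_device_info_engine_enum_v(entry);
 *   }
 */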
59static inline u32 top_num_gpcs_r(void)
60{
61 return 0x00022430U;
62}
63static inline u32 top_num_gpcs_value_v(u32 r)
64{
65 return (r >> 0U) & 0x1fU;
66}
67static inline u32 top_tpc_per_gpc_r(void)
68{
69 return 0x00022434U;
70}
71static inline u32 top_tpc_per_gpc_value_v(u32 r)
72{
73 return (r >> 0U) & 0x1fU;
74}
75static inline u32 top_num_fbps_r(void)
76{
77 return 0x00022438U;
78}
79static inline u32 top_num_fbps_value_v(u32 r)
80{
81 return (r >> 0U) & 0x1fU;
82}
83static inline u32 top_device_info_r(u32 i)
84{
85 return 0x00022700U + i*4U;
86}
87static inline u32 top_device_info__size_1_v(void)
88{
89 return 0x00000040U;
90}
91static inline u32 top_device_info_chain_v(u32 r)
92{
93 return (r >> 31U) & 0x1U;
94}
95static inline u32 top_device_info_chain_enable_v(void)
96{
97 return 0x00000001U;
98}
99static inline u32 top_device_info_engine_enum_v(u32 r)
100{
101 return (r >> 26U) & 0xfU;
102}
103static inline u32 top_device_info_runlist_enum_v(u32 r)
104{
105 return (r >> 21U) & 0xfU;
106}
107static inline u32 top_device_info_intr_enum_v(u32 r)
108{
109 return (r >> 15U) & 0x1fU;
110}
111static inline u32 top_device_info_reset_enum_v(u32 r)
112{
113 return (r >> 9U) & 0x1fU;
114}
115static inline u32 top_device_info_type_enum_v(u32 r)
116{
117 return (r >> 2U) & 0x1fffffffU;
118}
119static inline u32 top_device_info_type_enum_graphics_v(void)
120{
121 return 0x00000000U;
122}
123static inline u32 top_device_info_type_enum_graphics_f(void)
124{
125 return 0x0U;
126}
127static inline u32 top_device_info_type_enum_copy0_v(void)
128{
129 return 0x00000001U;
130}
131static inline u32 top_device_info_type_enum_copy0_f(void)
132{
133 return 0x4U;
134}
135static inline u32 top_device_info_type_enum_copy1_v(void)
136{
137 return 0x00000002U;
138}
139static inline u32 top_device_info_type_enum_copy1_f(void)
140{
141 return 0x8U;
142}
143static inline u32 top_device_info_type_enum_copy2_v(void)
144{
145 return 0x00000003U;
146}
147static inline u32 top_device_info_type_enum_copy2_f(void)
148{
149 return 0xcU;
150}
151static inline u32 top_device_info_engine_v(u32 r)
152{
153 return (r >> 5U) & 0x1U;
154}
155static inline u32 top_device_info_runlist_v(u32 r)
156{
157 return (r >> 4U) & 0x1U;
158}
159static inline u32 top_device_info_intr_v(u32 r)
160{
161 return (r >> 3U) & 0x1U;
162}
163static inline u32 top_device_info_reset_v(u32 r)
164{
165 return (r >> 2U) & 0x1U;
166}
167static inline u32 top_device_info_entry_v(u32 r)
168{
169 return (r >> 0U) & 0x3U;
170}
171static inline u32 top_device_info_entry_not_valid_v(void)
172{
173 return 0x00000000U;
174}
175static inline u32 top_device_info_entry_enum_v(void)
176{
177 return 0x00000002U;
178}
179static inline u32 top_device_info_entry_engine_type_v(void)
180{
181 return 0x00000003U;
182}
183static inline u32 top_device_info_entry_data_v(void)
184{
185 return 0x00000001U;
186}
187static inline u32 top_fs_status_fbp_r(void)
188{
189 return 0x00022548U;
190}
191static inline u32 top_fs_status_fbp_cluster_v(u32 r)
192{
193 return (r >> 0U) & 0xffffU;
194}
195static inline u32 top_fs_status_fbp_cluster_enable_v(void)
196{
197 return 0x00000000U;
198}
199static inline u32 top_fs_status_fbp_cluster_enable_f(void)
200{
201 return 0x0U;
202}
203static inline u32 top_fs_status_fbp_cluster_disable_v(void)
204{
205 return 0x00000001U;
206}
207static inline u32 top_fs_status_fbp_cluster_disable_f(void)
208{
209 return 0x1U;
210}
211#endif
diff --git a/include/gk20a/hw_trim_gk20a.h b/include/gk20a/hw_trim_gk20a.h
new file mode 100644
index 0000000..f28c21f
--- /dev/null
+++ b/include/gk20a/hw_trim_gk20a.h
@@ -0,0 +1,315 @@
1/*
2 * Copyright (c) 2012-2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22/*
23 * Function naming determines intended use:
24 *
25 * <x>_r(void) : Returns the offset for register <x>.
26 *
27 * <x>_o(void) : Returns the offset for element <x>.
28 *
29 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
30 *
31 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
32 *
33 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
34 * and masked to place it at field <y> of register <x>. This value
35 * can be |'d with others to produce a full register value for
36 * register <x>.
37 *
38 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
39 * value can be ~'d and then &'d to clear the value of field <y> for
40 * register <x>.
41 *
42 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
43 * to place it at field <y> of register <x>. This value can be |'d
44 * with others to produce a full register value for <x>.
45 *
46 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
47 * <x> value 'r' after being shifted to place its LSB at bit 0.
48 * This value is suitable for direct comparison with other unshifted
49 * values appropriate for use in field <y> of register <x>.
50 *
51 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
52 * field <y> of register <x>. This value is suitable for direct
53 * comparison with unshifted values appropriate for use in field <y>
54 * of register <x>.
55 */
56#ifndef _hw_trim_gk20a_h_
57#define _hw_trim_gk20a_h_
58
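/*
 * Usage sketch (illustrative only): decoding the GPCPLL coefficients with
 * the _v() extractors defined below. 'g' is assumed to be the device
 * struct used by gk20a_readl().
 *
 *   u32 coeff = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
 *   u32 mdiv  = trim_sys_gpcpll_coeff_mdiv_v(coeff);
 *   u32 ndiv  = trim_sys_gpcpll_coeff_ndiv_v(coeff);
 *   u32 pldiv = trim_sys_gpcpll_coeff_pldiv_v(coeff);
 */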
59static inline u32 trim_sys_gpcpll_cfg_r(void)
60{
61 return 0x00137000U;
62}
63static inline u32 trim_sys_gpcpll_cfg_enable_m(void)
64{
65 return 0x1U << 0U;
66}
67static inline u32 trim_sys_gpcpll_cfg_enable_v(u32 r)
68{
69 return (r >> 0U) & 0x1U;
70}
71static inline u32 trim_sys_gpcpll_cfg_enable_no_f(void)
72{
73 return 0x0U;
74}
75static inline u32 trim_sys_gpcpll_cfg_enable_yes_f(void)
76{
77 return 0x1U;
78}
79static inline u32 trim_sys_gpcpll_cfg_iddq_m(void)
80{
81 return 0x1U << 1U;
82}
83static inline u32 trim_sys_gpcpll_cfg_iddq_v(u32 r)
84{
85 return (r >> 1U) & 0x1U;
86}
87static inline u32 trim_sys_gpcpll_cfg_iddq_power_on_v(void)
88{
89 return 0x00000000U;
90}
91static inline u32 trim_sys_gpcpll_cfg_enb_lckdet_m(void)
92{
93 return 0x1U << 4U;
94}
95static inline u32 trim_sys_gpcpll_cfg_enb_lckdet_power_on_f(void)
96{
97 return 0x0U;
98}
99static inline u32 trim_sys_gpcpll_cfg_enb_lckdet_power_off_f(void)
100{
101 return 0x10U;
102}
103static inline u32 trim_sys_gpcpll_cfg_pll_lock_v(u32 r)
104{
105 return (r >> 17U) & 0x1U;
106}
107static inline u32 trim_sys_gpcpll_cfg_pll_lock_true_f(void)
108{
109 return 0x20000U;
110}
111static inline u32 trim_sys_gpcpll_coeff_r(void)
112{
113 return 0x00137004U;
114}
115static inline u32 trim_sys_gpcpll_coeff_mdiv_f(u32 v)
116{
117 return (v & 0xffU) << 0U;
118}
119static inline u32 trim_sys_gpcpll_coeff_mdiv_m(void)
120{
121 return 0xffU << 0U;
122}
123static inline u32 trim_sys_gpcpll_coeff_mdiv_v(u32 r)
124{
125 return (r >> 0U) & 0xffU;
126}
127static inline u32 trim_sys_gpcpll_coeff_ndiv_f(u32 v)
128{
129 return (v & 0xffU) << 8U;
130}
131static inline u32 trim_sys_gpcpll_coeff_ndiv_m(void)
132{
133 return 0xffU << 8U;
134}
135static inline u32 trim_sys_gpcpll_coeff_ndiv_v(u32 r)
136{
137 return (r >> 8U) & 0xffU;
138}
139static inline u32 trim_sys_gpcpll_coeff_pldiv_f(u32 v)
140{
141 return (v & 0x3fU) << 16U;
142}
143static inline u32 trim_sys_gpcpll_coeff_pldiv_m(void)
144{
145 return 0x3fU << 16U;
146}
147static inline u32 trim_sys_gpcpll_coeff_pldiv_v(u32 r)
148{
149 return (r >> 16U) & 0x3fU;
150}
151static inline u32 trim_sys_sel_vco_r(void)
152{
153 return 0x00137100U;
154}
155static inline u32 trim_sys_sel_vco_gpc2clk_out_m(void)
156{
157 return 0x1U << 0U;
158}
159static inline u32 trim_sys_sel_vco_gpc2clk_out_init_v(void)
160{
161 return 0x00000000U;
162}
163static inline u32 trim_sys_sel_vco_gpc2clk_out_init_f(void)
164{
165 return 0x0U;
166}
167static inline u32 trim_sys_sel_vco_gpc2clk_out_bypass_f(void)
168{
169 return 0x0U;
170}
171static inline u32 trim_sys_sel_vco_gpc2clk_out_vco_f(void)
172{
173 return 0x1U;
174}
175static inline u32 trim_sys_gpc2clk_out_r(void)
176{
177 return 0x00137250U;
178}
179static inline u32 trim_sys_gpc2clk_out_bypdiv_s(void)
180{
181 return 6U;
182}
183static inline u32 trim_sys_gpc2clk_out_bypdiv_f(u32 v)
184{
185 return (v & 0x3fU) << 0U;
186}
187static inline u32 trim_sys_gpc2clk_out_bypdiv_m(void)
188{
189 return 0x3fU << 0U;
190}
191static inline u32 trim_sys_gpc2clk_out_bypdiv_v(u32 r)
192{
193 return (r >> 0U) & 0x3fU;
194}
195static inline u32 trim_sys_gpc2clk_out_bypdiv_by31_f(void)
196{
197 return 0x3cU;
198}
199static inline u32 trim_sys_gpc2clk_out_vcodiv_s(void)
200{
201 return 6U;
202}
203static inline u32 trim_sys_gpc2clk_out_vcodiv_f(u32 v)
204{
205 return (v & 0x3fU) << 8U;
206}
207static inline u32 trim_sys_gpc2clk_out_vcodiv_m(void)
208{
209 return 0x3fU << 8U;
210}
211static inline u32 trim_sys_gpc2clk_out_vcodiv_v(u32 r)
212{
213 return (r >> 8U) & 0x3fU;
214}
215static inline u32 trim_sys_gpc2clk_out_vcodiv_by1_f(void)
216{
217 return 0x0U;
218}
219static inline u32 trim_sys_gpc2clk_out_sdiv14_m(void)
220{
221 return 0x1U << 31U;
222}
223static inline u32 trim_sys_gpc2clk_out_sdiv14_indiv4_mode_f(void)
224{
225 return 0x80000000U;
226}
227static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_r(u32 i)
228{
229 return 0x00134124U + i*512U;
230}
231static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_noofipclks_f(u32 v)
232{
233 return (v & 0x3fffU) << 0U;
234}
235static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_write_en_asserted_f(void)
236{
237 return 0x10000U;
238}
239static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_enable_asserted_f(void)
240{
241 return 0x100000U;
242}
243static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_reset_asserted_f(void)
244{
245 return 0x1000000U;
246}
247static inline u32 trim_gpc_clk_cntr_ncgpcclk_cnt_r(u32 i)
248{
249 return 0x00134128U + i*512U;
250}
251static inline u32 trim_gpc_clk_cntr_ncgpcclk_cnt_value_v(u32 r)
252{
253 return (r >> 0U) & 0xfffffU;
254}
255static inline u32 trim_sys_gpcpll_cfg2_r(void)
256{
257 return 0x0013700cU;
258}
259static inline u32 trim_sys_gpcpll_cfg2_pll_stepa_f(u32 v)
260{
261 return (v & 0xffU) << 24U;
262}
263static inline u32 trim_sys_gpcpll_cfg2_pll_stepa_m(void)
264{
265 return 0xffU << 24U;
266}
267static inline u32 trim_sys_gpcpll_cfg3_r(void)
268{
269 return 0x00137018U;
270}
271static inline u32 trim_sys_gpcpll_cfg3_pll_stepb_f(u32 v)
272{
273 return (v & 0xffU) << 16U;
274}
275static inline u32 trim_sys_gpcpll_cfg3_pll_stepb_m(void)
276{
277 return 0xffU << 16U;
278}
279static inline u32 trim_sys_gpcpll_ndiv_slowdown_r(void)
280{
281 return 0x0013701cU;
282}
283static inline u32 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_m(void)
284{
285 return 0x1U << 22U;
286}
287static inline u32 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_yes_f(void)
288{
289 return 0x400000U;
290}
291static inline u32 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_no_f(void)
292{
293 return 0x0U;
294}
295static inline u32 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_m(void)
296{
297 return 0x1U << 31U;
298}
299static inline u32 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_yes_f(void)
300{
301 return 0x80000000U;
302}
303static inline u32 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_no_f(void)
304{
305 return 0x0U;
306}
307static inline u32 trim_gpc_bcast_gpcpll_ndiv_slowdown_debug_r(void)
308{
309 return 0x001328a0U;
310}
311static inline u32 trim_gpc_bcast_gpcpll_ndiv_slowdown_debug_pll_dynramp_done_synced_v(u32 r)
312{
313 return (r >> 24U) & 0x1U;
314}
315#endif
diff --git a/include/gk20a/mm_gk20a.c b/include/gk20a/mm_gk20a.c
new file mode 100644
index 0000000..10ca84d
--- /dev/null
+++ b/include/gk20a/mm_gk20a.c
@@ -0,0 +1,654 @@
1/*
2 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23#include <trace/events/gk20a.h>
24
25#include <nvgpu/mm.h>
26#include <nvgpu/vm.h>
27#include <nvgpu/vm_area.h>
28#include <nvgpu/dma.h>
29#include <nvgpu/kmem.h>
30#include <nvgpu/timers.h>
31#include <nvgpu/pramin.h>
32#include <nvgpu/list.h>
33#include <nvgpu/nvgpu_mem.h>
34#include <nvgpu/allocator.h>
35#include <nvgpu/semaphore.h>
36#include <nvgpu/page_allocator.h>
37#include <nvgpu/log.h>
38#include <nvgpu/bug.h>
39#include <nvgpu/log2.h>
40#include <nvgpu/enabled.h>
41#include <nvgpu/vidmem.h>
42#include <nvgpu/sizes.h>
43#include <nvgpu/io.h>
44#include <nvgpu/utils.h>
45#include <nvgpu/channel.h>
46
47#include "gk20a.h"
48#include "mm_gk20a.h"
49#include "fence_gk20a.h"
50
51#include <nvgpu/hw/gk20a/hw_gmmu_gk20a.h>
52#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
53#include <nvgpu/hw/gk20a/hw_pram_gk20a.h>
54#include <nvgpu/hw/gk20a/hw_flush_gk20a.h>
55
56/*
57 * GPU mapping life cycle
58 * ======================
59 *
60 * Kernel mappings
61 * ---------------
62 *
63 * Kernel mappings are created through vm.map(..., false):
64 *
65 * - Mappings to the same allocations are reused and refcounted.
 66 * - This path does not support deferred unmapping (i.e. the kernel must
 67 *   wait for all hw operations on the buffer to complete before unmapping).
68 * - References to dmabuf are owned and managed by the (kernel) clients of
69 * the gk20a_vm layer.
70 *
71 *
72 * User space mappings
73 * -------------------
74 *
75 * User space mappings are created through as.map_buffer -> vm.map(..., true):
76 *
77 * - Mappings to the same allocations are reused and refcounted.
78 * - This path supports deferred unmapping (i.e. we delay the actual unmapping
79 * until all hw operations have completed).
80 * - References to dmabuf are owned and managed by the vm_gk20a
81 * layer itself. vm.map acquires these refs, and sets
82 * mapped_buffer->own_mem_ref to record that we must release the refs when we
83 * actually unmap.
84 *
85 */
86
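/*
 * Illustrative sketch of the kernel-mapping path described above. The
 * helper names and signature details here are assumptions for the sake of
 * the example, not definitions from this file: a kernel client maps a
 * buffer, waits for all hw work touching it to finish, and only then
 * unmaps it (no deferred unmap on this path).
 *
 *   u64 gpu_va = nvgpu_gmmu_map(vm, mem, size, 0,
 *                               gk20a_mem_flag_none, false, mem->aperture);
 *   ...submit work and wait for all hw operations on 'mem' to complete...
 *   nvgpu_gmmu_unmap(vm, mem, gpu_va);
 */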
87/* make sure gk20a_init_mm_support is called before this function */
88int gk20a_init_mm_setup_hw(struct gk20a *g)
89{
90 struct mm_gk20a *mm = &g->mm;
91 int err;
92
93 nvgpu_log_fn(g, " ");
94
95 if (g->ops.fb.set_mmu_page_size) {
96 g->ops.fb.set_mmu_page_size(g);
97 }
98
99 if (g->ops.fb.set_use_full_comp_tag_line) {
100 mm->use_full_comp_tag_line =
101 g->ops.fb.set_use_full_comp_tag_line(g);
102 }
103
104 g->ops.fb.init_hw(g);
105
106 if (g->ops.bus.bar1_bind) {
107 g->ops.bus.bar1_bind(g, &mm->bar1.inst_block);
108 }
109
110 if (g->ops.bus.bar2_bind) {
111 err = g->ops.bus.bar2_bind(g, &mm->bar2.inst_block);
112 if (err) {
113 return err;
114 }
115 }
116
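	/* flush the FB; with ||, a second flush is attempted only if the
	   first one reports busy, and -EBUSY is returned only if both fail */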
117 if (gk20a_mm_fb_flush(g) || gk20a_mm_fb_flush(g)) {
118 return -EBUSY;
119 }
120
121 nvgpu_log_fn(g, "done");
122 return 0;
123}
124
125/* for gk20a the "video memory" apertures here are misnomers. */
126static inline u32 big_valid_pde0_bits(struct gk20a *g,
127 struct nvgpu_gmmu_pd *pd, u64 addr)
128{
129 u32 pde0_bits =
130 nvgpu_aperture_mask(g, pd->mem,
131 gmmu_pde_aperture_big_sys_mem_ncoh_f(),
132 gmmu_pde_aperture_big_sys_mem_coh_f(),
133 gmmu_pde_aperture_big_video_memory_f()) |
134 gmmu_pde_address_big_sys_f(
135 (u32)(addr >> gmmu_pde_address_shift_v()));
136
137 return pde0_bits;
138}
139
140static inline u32 small_valid_pde1_bits(struct gk20a *g,
141 struct nvgpu_gmmu_pd *pd, u64 addr)
142{
143 u32 pde1_bits =
144 nvgpu_aperture_mask(g, pd->mem,
145 gmmu_pde_aperture_small_sys_mem_ncoh_f(),
146 gmmu_pde_aperture_small_sys_mem_coh_f(),
147 gmmu_pde_aperture_small_video_memory_f()) |
148 gmmu_pde_vol_small_true_f() | /* tbd: why? */
149 gmmu_pde_address_small_sys_f(
150 (u32)(addr >> gmmu_pde_address_shift_v()));
151
152 return pde1_bits;
153}
154
155static void update_gmmu_pde_locked(struct vm_gk20a *vm,
156 const struct gk20a_mmu_level *l,
157 struct nvgpu_gmmu_pd *pd,
158 u32 pd_idx,
159 u64 virt_addr,
160 u64 phys_addr,
161 struct nvgpu_gmmu_attrs *attrs)
162{
163 struct gk20a *g = gk20a_from_vm(vm);
164 bool small_valid, big_valid;
165 u32 pd_offset = pd_offset_from_index(l, pd_idx);
166 u32 pde_v[2] = {0, 0};
167
168 small_valid = attrs->pgsz == GMMU_PAGE_SIZE_SMALL;
169 big_valid = attrs->pgsz == GMMU_PAGE_SIZE_BIG;
170
171 pde_v[0] = gmmu_pde_size_full_f();
172 pde_v[0] |= big_valid ?
173 big_valid_pde0_bits(g, pd, phys_addr) :
174 gmmu_pde_aperture_big_invalid_f();
175
176 pde_v[1] |= (small_valid ? small_valid_pde1_bits(g, pd, phys_addr) :
177 (gmmu_pde_aperture_small_invalid_f() |
178 gmmu_pde_vol_small_false_f()))
179 |
180 (big_valid ? (gmmu_pde_vol_big_true_f()) :
181 gmmu_pde_vol_big_false_f());
182
183 pte_dbg(g, attrs,
184 "PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | "
185 "GPU %#-12llx phys %#-12llx "
186 "[0x%08x, 0x%08x]",
187 pd_idx, l->entry_size, pd_offset,
188 small_valid ? 'S' : '-',
189 big_valid ? 'B' : '-',
190 virt_addr, phys_addr,
191 pde_v[1], pde_v[0]);
192
193 pd_write(g, &vm->pdb, pd_offset + 0, pde_v[0]);
194 pd_write(g, &vm->pdb, pd_offset + 1, pde_v[1]);
195}
196
197static void __update_pte_sparse(u32 *pte_w)
198{
199 pte_w[0] = gmmu_pte_valid_false_f();
200 pte_w[1] |= gmmu_pte_vol_true_f();
201}
202
203static void __update_pte(struct vm_gk20a *vm,
204 u32 *pte_w,
205 u64 phys_addr,
206 struct nvgpu_gmmu_attrs *attrs)
207{
208 struct gk20a *g = gk20a_from_vm(vm);
209 u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
210 u32 pte_valid = attrs->valid ?
211 gmmu_pte_valid_true_f() :
212 gmmu_pte_valid_false_f();
213 u32 phys_shifted = phys_addr >> gmmu_pte_address_shift_v();
214 u32 addr = attrs->aperture == APERTURE_SYSMEM ?
215 gmmu_pte_address_sys_f(phys_shifted) :
216 gmmu_pte_address_vid_f(phys_shifted);
217 int ctag_shift = ilog2(g->ops.fb.compression_page_size(g));
218
219 pte_w[0] = pte_valid | addr;
220
221 if (attrs->priv) {
222 pte_w[0] |= gmmu_pte_privilege_true_f();
223 }
224
225 pte_w[1] = nvgpu_aperture_mask_raw(g, attrs->aperture,
226 gmmu_pte_aperture_sys_mem_ncoh_f(),
227 gmmu_pte_aperture_sys_mem_coh_f(),
228 gmmu_pte_aperture_video_memory_f()) |
229 gmmu_pte_kind_f(attrs->kind_v) |
230 gmmu_pte_comptagline_f((u32)(attrs->ctag >> ctag_shift));
231
232 if (attrs->ctag && vm->mm->use_full_comp_tag_line &&
233 phys_addr & 0x10000) {
234 pte_w[1] |= gmmu_pte_comptagline_f(
235 1 << (gmmu_pte_comptagline_s() - 1));
236 }
237
238 if (attrs->rw_flag == gk20a_mem_flag_read_only) {
239 pte_w[0] |= gmmu_pte_read_only_true_f();
240 pte_w[1] |= gmmu_pte_write_disable_true_f();
241 } else if (attrs->rw_flag == gk20a_mem_flag_write_only) {
242 pte_w[1] |= gmmu_pte_read_disable_true_f();
243 }
244
245 if (!attrs->cacheable) {
246 pte_w[1] |= gmmu_pte_vol_true_f();
247 }
248
249 if (attrs->ctag) {
250 attrs->ctag += page_size;
251 }
252}
253
254static void update_gmmu_pte_locked(struct vm_gk20a *vm,
255 const struct gk20a_mmu_level *l,
256 struct nvgpu_gmmu_pd *pd,
257 u32 pd_idx,
258 u64 virt_addr,
259 u64 phys_addr,
260 struct nvgpu_gmmu_attrs *attrs)
261{
262 struct gk20a *g = gk20a_from_vm(vm);
263 u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
264 u32 pd_offset = pd_offset_from_index(l, pd_idx);
265 u32 pte_w[2] = {0, 0};
266 int ctag_shift = ilog2(g->ops.fb.compression_page_size(g));
267
268 if (phys_addr) {
269 __update_pte(vm, pte_w, phys_addr, attrs);
270 } else if (attrs->sparse) {
271 __update_pte_sparse(pte_w);
272 }
273
274 pte_dbg(g, attrs,
275 "PTE: i=%-4u size=%-2u offs=%-4u | "
276 "GPU %#-12llx phys %#-12llx "
277 "pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c "
278 "ctag=0x%08x "
279 "[0x%08x, 0x%08x]",
280 pd_idx, l->entry_size, pd_offset,
281 virt_addr, phys_addr,
282 page_size >> 10,
283 nvgpu_gmmu_perm_str(attrs->rw_flag),
284 attrs->kind_v,
285 nvgpu_aperture_str(g, attrs->aperture),
286 attrs->cacheable ? 'C' : '-',
287 attrs->sparse ? 'S' : '-',
288 attrs->priv ? 'P' : '-',
289 attrs->valid ? 'V' : '-',
290 (u32)attrs->ctag >> ctag_shift,
291 pte_w[1], pte_w[0]);
292
293 pd_write(g, pd, pd_offset + 0, pte_w[0]);
294 pd_write(g, pd, pd_offset + 1, pte_w[1]);
295}
296
297u32 gk20a_get_pde_pgsz(struct gk20a *g, const struct gk20a_mmu_level *l,
298 struct nvgpu_gmmu_pd *pd, u32 pd_idx)
299{
300 /*
301 * big and small page sizes are the same
302 */
303 return GMMU_PAGE_SIZE_SMALL;
304}
305
306u32 gk20a_get_pte_pgsz(struct gk20a *g, const struct gk20a_mmu_level *l,
307 struct nvgpu_gmmu_pd *pd, u32 pd_idx)
308{
309 /*
310	 * return the invalid page size marker (GMMU_NR_PAGE_SIZES)
311 */
312 return GMMU_NR_PAGE_SIZES;
313}
314
315const struct gk20a_mmu_level gk20a_mm_levels_64k[] = {
316 {.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
317 .lo_bit = {26, 26},
318 .update_entry = update_gmmu_pde_locked,
319 .entry_size = 8,
320 .get_pgsz = gk20a_get_pde_pgsz},
321 {.hi_bit = {25, 25},
322 .lo_bit = {12, 16},
323 .update_entry = update_gmmu_pte_locked,
324 .entry_size = 8,
325 .get_pgsz = gk20a_get_pte_pgsz},
326 {.update_entry = NULL}
327};
328
329const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
330 {.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
331 .lo_bit = {27, 27},
332 .update_entry = update_gmmu_pde_locked,
333 .entry_size = 8,
334 .get_pgsz = gk20a_get_pde_pgsz},
335 {.hi_bit = {26, 26},
336 .lo_bit = {12, 17},
337 .update_entry = update_gmmu_pte_locked,
338 .entry_size = 8,
339 .get_pgsz = gk20a_get_pte_pgsz},
340 {.update_entry = NULL}
341};
342
343int gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch)
344{
345 int err = 0;
346
347 nvgpu_log_fn(ch->g, " ");
348
349 nvgpu_vm_get(vm);
350 ch->vm = vm;
351 err = channel_gk20a_commit_va(ch);
352 if (err) {
353 ch->vm = NULL;
354 }
355
356 nvgpu_log(gk20a_from_vm(vm), gpu_dbg_map, "Binding ch=%d -> VM:%s",
357 ch->chid, vm->name);
358
359 return err;
360}
361
362void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
363 struct vm_gk20a *vm)
364{
365 u64 pdb_addr = nvgpu_mem_get_addr(g, vm->pdb.mem);
366 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
367 u32 pdb_addr_hi = u64_hi32(pdb_addr);
368
369 nvgpu_log_info(g, "pde pa=0x%llx", pdb_addr);
370
371 nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(),
372 nvgpu_aperture_mask(g, vm->pdb.mem,
373 ram_in_page_dir_base_target_sys_mem_ncoh_f(),
374 ram_in_page_dir_base_target_sys_mem_coh_f(),
375 ram_in_page_dir_base_target_vid_mem_f()) |
376 ram_in_page_dir_base_vol_true_f() |
377 ram_in_page_dir_base_lo_f(pdb_addr_lo));
378
379 nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_hi_w(),
380 ram_in_page_dir_base_hi_f(pdb_addr_hi));
381}
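
A minimal sketch of the address split done by gk20a_mm_init_pdb() above, assuming ram_in_base_shift_v() evaluates to 12 (4 KB alignment) on this chip; the helper and the sample address are illustrative, not part of the driver:

/* Hedged sketch: decompose a sample (hypothetical) PDB address the way
 * gk20a_mm_init_pdb() does, assuming ram_in_base_shift_v() == 12. */
static void pdb_addr_split_sketch(void)
{
	u64 pdb_addr    = 0x123456000ULL;            /* hypothetical, 4 KB aligned */
	u32 pdb_addr_lo = u64_lo32(pdb_addr >> 12);  /* 0x00123456 -> _lo field */
	u32 pdb_addr_hi = u64_hi32(pdb_addr);        /* 0x00000001 -> _hi field */

	(void)pdb_addr_lo;
	(void)pdb_addr_hi;
}
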
382
383void gk20a_init_inst_block(struct nvgpu_mem *inst_block, struct vm_gk20a *vm,
384 u32 big_page_size)
385{
386 struct gk20a *g = gk20a_from_vm(vm);
387
388 nvgpu_log_info(g, "inst block phys = 0x%llx, kv = 0x%p",
389 nvgpu_inst_block_addr(g, inst_block), inst_block->cpu_va);
390
391 g->ops.mm.init_pdb(g, inst_block, vm);
392
393 nvgpu_mem_wr32(g, inst_block, ram_in_adr_limit_lo_w(),
394 u64_lo32(vm->va_limit - 1) & ~0xfff);
395
396 nvgpu_mem_wr32(g, inst_block, ram_in_adr_limit_hi_w(),
397 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit - 1)));
398
399 if (big_page_size && g->ops.mm.set_big_page_size) {
400 g->ops.mm.set_big_page_size(g, inst_block, big_page_size);
401 }
402}
403
404int gk20a_alloc_inst_block(struct gk20a *g, struct nvgpu_mem *inst_block)
405{
406 int err;
407
408 nvgpu_log_fn(g, " ");
409
410 err = nvgpu_dma_alloc(g, ram_in_alloc_size_v(), inst_block);
411 if (err) {
412 nvgpu_err(g, "%s: memory allocation failed", __func__);
413 return err;
414 }
415
416 nvgpu_log_fn(g, "done");
417 return 0;
418}
419
420int gk20a_mm_fb_flush(struct gk20a *g)
421{
422 struct mm_gk20a *mm = &g->mm;
423 u32 data;
424 int ret = 0;
425 struct nvgpu_timeout timeout;
426 u32 retries;
427
428 nvgpu_log_fn(g, " ");
429
430 gk20a_busy_noresume(g);
431 if (!g->power_on) {
432 gk20a_idle_nosuspend(g);
433 return 0;
434 }
435
436 retries = 100;
437
438 if (g->ops.mm.get_flush_retries) {
439 retries = g->ops.mm.get_flush_retries(g, NVGPU_FLUSH_FB);
440 }
441
442 nvgpu_timeout_init(g, &timeout, retries, NVGPU_TIMER_RETRY_TIMER);
443
444 nvgpu_mutex_acquire(&mm->l2_op_lock);
445
446 /* Make sure all previous writes are committed to the L2. There's no
447 guarantee that writes are to DRAM. This will be a sysmembar internal
448 to the L2. */
449
450 trace_gk20a_mm_fb_flush(g->name);
451
452 gk20a_writel(g, flush_fb_flush_r(),
453 flush_fb_flush_pending_busy_f());
454
455 do {
456 data = gk20a_readl(g, flush_fb_flush_r());
457
458 if (flush_fb_flush_outstanding_v(data) ==
459 flush_fb_flush_outstanding_true_v() ||
460 flush_fb_flush_pending_v(data) ==
461 flush_fb_flush_pending_busy_v()) {
462 nvgpu_log_info(g, "fb_flush 0x%x", data);
463 nvgpu_udelay(5);
464 } else {
465 break;
466 }
467 } while (!nvgpu_timeout_expired(&timeout));
468
469 if (nvgpu_timeout_peek_expired(&timeout)) {
470 if (g->ops.fb.dump_vpr_info) {
471 g->ops.fb.dump_vpr_info(g);
472 }
473 if (g->ops.fb.dump_wpr_info) {
474 g->ops.fb.dump_wpr_info(g);
475 }
476 ret = -EBUSY;
477 }
478
479 trace_gk20a_mm_fb_flush_done(g->name);
480
481 nvgpu_mutex_release(&mm->l2_op_lock);
482
483 gk20a_idle_nosuspend(g);
484
485 return ret;
486}
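
gk20a_mm_fb_flush() and the L2/CBC maintenance routines below all share one shape: write the *_pending_busy_f() value to kick the operation, then poll the same register until neither the outstanding nor the pending field is set, giving up when the timeout expires. A minimal sketch of that shared loop; the helper name and the callback are illustrative, not an nvgpu API:

/* Hedged sketch of the poll loop used by the flush/invalidate paths in
 * this file. poll_done() stands in for the per-register decode of the
 * outstanding/pending fields; it is a hypothetical callback. */
static int flush_poll_sketch(struct gk20a *g, u32 reg,
			     bool (*poll_done)(u32 data),
			     struct nvgpu_timeout *timeout)
{
	u32 data;

	do {
		data = gk20a_readl(g, reg);
		if (poll_done(data)) {
			return 0;
		}
		nvgpu_udelay(5);
	} while (!nvgpu_timeout_expired(timeout));

	return -EBUSY;
}
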
487
488static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
489{
490 u32 data;
491 struct nvgpu_timeout timeout;
492 u32 retries = 200;
493
494 trace_gk20a_mm_l2_invalidate(g->name);
495
496 if (g->ops.mm.get_flush_retries) {
497 retries = g->ops.mm.get_flush_retries(g, NVGPU_FLUSH_L2_INV);
498 }
499
500 nvgpu_timeout_init(g, &timeout, retries, NVGPU_TIMER_RETRY_TIMER);
501
502 /* Invalidate any clean lines from the L2 so subsequent reads go to
503 DRAM. Dirty lines are not affected by this operation. */
504 gk20a_writel(g, flush_l2_system_invalidate_r(),
505 flush_l2_system_invalidate_pending_busy_f());
506
507 do {
508 data = gk20a_readl(g, flush_l2_system_invalidate_r());
509
510 if (flush_l2_system_invalidate_outstanding_v(data) ==
511 flush_l2_system_invalidate_outstanding_true_v() ||
512 flush_l2_system_invalidate_pending_v(data) ==
513 flush_l2_system_invalidate_pending_busy_v()) {
514 nvgpu_log_info(g, "l2_system_invalidate 0x%x",
515 data);
516 nvgpu_udelay(5);
517 } else {
518 break;
519 }
520 } while (!nvgpu_timeout_expired(&timeout));
521
522 if (nvgpu_timeout_peek_expired(&timeout)) {
523 nvgpu_warn(g, "l2_system_invalidate too many retries");
524 }
525
526 trace_gk20a_mm_l2_invalidate_done(g->name);
527}
528
529void gk20a_mm_l2_invalidate(struct gk20a *g)
530{
531 struct mm_gk20a *mm = &g->mm;
532 gk20a_busy_noresume(g);
533 if (g->power_on) {
534 nvgpu_mutex_acquire(&mm->l2_op_lock);
535 gk20a_mm_l2_invalidate_locked(g);
536 nvgpu_mutex_release(&mm->l2_op_lock);
537 }
538 gk20a_idle_nosuspend(g);
539}
540
541void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
542{
543 struct mm_gk20a *mm = &g->mm;
544 u32 data;
545 struct nvgpu_timeout timeout;
546 u32 retries = 2000;
547
548 nvgpu_log_fn(g, " ");
549
550 gk20a_busy_noresume(g);
551 if (!g->power_on) {
552 goto hw_was_off;
553 }
554
555 if (g->ops.mm.get_flush_retries) {
556 retries = g->ops.mm.get_flush_retries(g, NVGPU_FLUSH_L2_FLUSH);
557 }
558
559 nvgpu_timeout_init(g, &timeout, retries, NVGPU_TIMER_RETRY_TIMER);
560
561 nvgpu_mutex_acquire(&mm->l2_op_lock);
562
563 trace_gk20a_mm_l2_flush(g->name);
564
565 /* Flush all dirty lines from the L2 to DRAM. Lines are left in the L2
566 as clean, so subsequent reads might hit in the L2. */
567 gk20a_writel(g, flush_l2_flush_dirty_r(),
568 flush_l2_flush_dirty_pending_busy_f());
569
570 do {
571 data = gk20a_readl(g, flush_l2_flush_dirty_r());
572
573 if (flush_l2_flush_dirty_outstanding_v(data) ==
574 flush_l2_flush_dirty_outstanding_true_v() ||
575 flush_l2_flush_dirty_pending_v(data) ==
576 flush_l2_flush_dirty_pending_busy_v()) {
577 nvgpu_log_info(g, "l2_flush_dirty 0x%x", data);
578 nvgpu_udelay(5);
579 } else {
580 break;
581 }
582 } while (!nvgpu_timeout_expired_msg(&timeout,
583 "l2_flush_dirty too many retries"));
584
585 trace_gk20a_mm_l2_flush_done(g->name);
586
587 if (invalidate) {
588 gk20a_mm_l2_invalidate_locked(g);
589 }
590
591 nvgpu_mutex_release(&mm->l2_op_lock);
592
593hw_was_off:
594 gk20a_idle_nosuspend(g);
595}
596
597void gk20a_mm_cbc_clean(struct gk20a *g)
598{
599 struct mm_gk20a *mm = &g->mm;
600 u32 data;
601 struct nvgpu_timeout timeout;
602 u32 retries = 200;
603
604 nvgpu_log_fn(g, " ");
605
606 gk20a_busy_noresume(g);
607 if (!g->power_on) {
608 goto hw_was_off;
609 }
610
611 if (g->ops.mm.get_flush_retries) {
612 retries = g->ops.mm.get_flush_retries(g, NVGPU_FLUSH_CBC_CLEAN);
613 }
614
615 nvgpu_timeout_init(g, &timeout, retries, NVGPU_TIMER_RETRY_TIMER);
616
617 nvgpu_mutex_acquire(&mm->l2_op_lock);
618
619 /* Flush all dirty lines from the CBC to L2 */
620 gk20a_writel(g, flush_l2_clean_comptags_r(),
621 flush_l2_clean_comptags_pending_busy_f());
622
623 do {
624 data = gk20a_readl(g, flush_l2_clean_comptags_r());
625
626 if (flush_l2_clean_comptags_outstanding_v(data) ==
627 flush_l2_clean_comptags_outstanding_true_v() ||
628 flush_l2_clean_comptags_pending_v(data) ==
629 flush_l2_clean_comptags_pending_busy_v()) {
630 nvgpu_log_info(g, "l2_clean_comptags 0x%x", data);
631 nvgpu_udelay(5);
632 } else {
633 break;
634 }
635 } while (!nvgpu_timeout_expired_msg(&timeout,
636 "l2_clean_comptags too many retries"));
637
638 nvgpu_mutex_release(&mm->l2_op_lock);
639
640hw_was_off:
641 gk20a_idle_nosuspend(g);
642}
643
644u32 gk20a_mm_get_iommu_bit(struct gk20a *g)
645{
646 return 34;
647}
648
649const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
650 u32 big_page_size)
651{
652 return (big_page_size == SZ_64K) ?
653 gk20a_mm_levels_64k : gk20a_mm_levels_128k;
654}
diff --git a/include/gk20a/mm_gk20a.h b/include/gk20a/mm_gk20a.h
new file mode 100644
index 0000000..76a1621
--- /dev/null
+++ b/include/gk20a/mm_gk20a.h
@@ -0,0 +1,155 @@
1/*
2 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23#ifndef MM_GK20A_H
24#define MM_GK20A_H
25
26#include <nvgpu/nvgpu_mem.h>
27#include <nvgpu/allocator.h>
28#include <nvgpu/vm.h>
29#include <nvgpu/list.h>
30#include <nvgpu/rbtree.h>
31#include <nvgpu/kref.h>
32
33enum gk20a_mem_rw_flag;
34
35struct patch_desc {
36 struct nvgpu_mem mem;
37 u32 data_count;
38};
39
40struct zcull_ctx_desc {
41 u64 gpu_va;
42 u32 ctx_attr;
43 u32 ctx_sw_mode;
44};
45
46struct pm_ctx_desc {
47 struct nvgpu_mem mem;
48 u32 pm_mode;
49};
50
51struct compbit_store_desc {
52 struct nvgpu_mem mem;
53
54 /* The value that is written to the hardware. This depends on
55	 * the number of ltcs and is not an address. */
56 u64 base_hw;
57};
58
59struct gk20a_buffer_state {
60 struct nvgpu_list_node list;
61
62 /* The valid compbits and the fence must be changed atomically. */
63 struct nvgpu_mutex lock;
64
65 /* Offset of the surface within the dma-buf whose state is
66 * described by this struct (one dma-buf can contain multiple
67 * surfaces with different states). */
68 size_t offset;
69
70 /* A bitmask of valid sets of compbits (0 = uncompressed). */
71 u32 valid_compbits;
72
73 /* The ZBC color used on this buffer. */
74 u32 zbc_color;
75
76 /* This struct reflects the state of the buffer when this
77 * fence signals. */
78 struct gk20a_fence *fence;
79};
80
81static inline struct gk20a_buffer_state *
82gk20a_buffer_state_from_list(struct nvgpu_list_node *node)
83{
84 return (struct gk20a_buffer_state *)
85 ((uintptr_t)node - offsetof(struct gk20a_buffer_state, list));
86};
87
88struct gk20a;
89struct channel_gk20a;
90
91int gk20a_mm_fb_flush(struct gk20a *g);
92void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate);
93void gk20a_mm_cbc_clean(struct gk20a *g);
94void gk20a_mm_l2_invalidate(struct gk20a *g);
95
96#define dev_from_vm(vm) dev_from_gk20a(vm->mm->g)
97
98void gk20a_mm_ltc_isr(struct gk20a *g);
99
100bool gk20a_mm_mmu_debug_mode_enabled(struct gk20a *g);
101
102int gk20a_alloc_inst_block(struct gk20a *g, struct nvgpu_mem *inst_block);
103void gk20a_init_inst_block(struct nvgpu_mem *inst_block, struct vm_gk20a *vm,
104 u32 big_page_size);
105int gk20a_init_mm_setup_hw(struct gk20a *g);
106
107u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
108 u64 map_offset,
109 struct nvgpu_sgt *sgt,
110 u64 buffer_offset,
111 u64 size,
112 u32 pgsz_idx,
113 u8 kind_v,
114 u32 ctag_offset,
115 u32 flags,
116 enum gk20a_mem_rw_flag rw_flag,
117 bool clear_ctags,
118 bool sparse,
119 bool priv,
120 struct vm_gk20a_mapping_batch *batch,
121 enum nvgpu_aperture aperture);
122
123void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
124 u64 vaddr,
125 u64 size,
126 u32 pgsz_idx,
127 bool va_allocated,
128 enum gk20a_mem_rw_flag rw_flag,
129 bool sparse,
130 struct vm_gk20a_mapping_batch *batch);
131
132/* vm-as interface */
133struct nvgpu_as_alloc_space_args;
134struct nvgpu_as_free_space_args;
135int gk20a_vm_release_share(struct gk20a_as_share *as_share);
136int gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch);
137
138void pde_range_from_vaddr_range(struct vm_gk20a *vm,
139 u64 addr_lo, u64 addr_hi,
140 u32 *pde_lo, u32 *pde_hi);
141u32 gk20a_mm_get_iommu_bit(struct gk20a *g);
142
143const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
144 u32 big_page_size);
145void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *mem,
146 struct vm_gk20a *vm);
147
148extern const struct gk20a_mmu_level gk20a_mm_levels_64k[];
149extern const struct gk20a_mmu_level gk20a_mm_levels_128k[];
150
151u32 gk20a_get_pde_pgsz(struct gk20a *g, const struct gk20a_mmu_level *l,
152 struct nvgpu_gmmu_pd *pd, u32 pd_idx);
153u32 gk20a_get_pte_pgsz(struct gk20a *g, const struct gk20a_mmu_level *l,
154 struct nvgpu_gmmu_pd *pd, u32 pd_idx);
155#endif /* MM_GK20A_H */
diff --git a/include/gk20a/pmu_gk20a.c b/include/gk20a/pmu_gk20a.c
new file mode 100644
index 0000000..63a32f0
--- /dev/null
+++ b/include/gk20a/pmu_gk20a.c
@@ -0,0 +1,879 @@
1/*
2 * GK20A PMU (aka. gPMU outside gk20a context)
3 *
4 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include <nvgpu/nvgpu_common.h>
26#include <nvgpu/timers.h>
27#include <nvgpu/kmem.h>
28#include <nvgpu/dma.h>
29#include <nvgpu/log.h>
30#include <nvgpu/bug.h>
31#include <nvgpu/firmware.h>
32#include <nvgpu/falcon.h>
33#include <nvgpu/mm.h>
34#include <nvgpu/io.h>
35#include <nvgpu/clk_arb.h>
36#include <nvgpu/utils.h>
37#include <nvgpu/unit.h>
38
39#include "gk20a.h"
40#include "gr_gk20a.h"
41#include "pmu_gk20a.h"
42
43#include <nvgpu/hw/gk20a/hw_mc_gk20a.h>
44#include <nvgpu/hw/gk20a/hw_pwr_gk20a.h>
45#include <nvgpu/hw/gk20a/hw_top_gk20a.h>
46
47#define gk20a_dbg_pmu(g, fmt, arg...) \
48 nvgpu_log(g, gpu_dbg_pmu, fmt, ##arg)
49
50bool nvgpu_find_hex_in_string(char *strings, struct gk20a *g, u32 *hex_pos)
51{
52 u32 i = 0, j = strlen(strings);
53
54 for (; i < j; i++) {
55 if (strings[i] == '%') {
56 if (strings[i + 1] == 'x' || strings[i + 1] == 'X') {
57 *hex_pos = i;
58 return true;
59 }
60 }
61 }
62 *hex_pos = -1;
63 return false;
64}
65
66static void print_pmu_trace(struct nvgpu_pmu *pmu)
67{
68 struct gk20a *g = pmu->g;
69 u32 i = 0, j = 0, k, l, m, count;
70 char part_str[40], buf[0x40];
71 void *tracebuffer;
72 char *trace;
73 u32 *trace1;
74
75 /* allocate system memory to copy pmu trace buffer */
76 tracebuffer = nvgpu_kzalloc(g, GK20A_PMU_TRACE_BUFSIZE);
77 if (tracebuffer == NULL) {
78 return;
79 }
80
81 /* read pmu traces into system memory buffer */
82 nvgpu_mem_rd_n(g, &pmu->trace_buf, 0, tracebuffer,
83 GK20A_PMU_TRACE_BUFSIZE);
84
85 trace = (char *)tracebuffer;
86 trace1 = (u32 *)tracebuffer;
87
88 nvgpu_err(g, "dump PMU trace buffer");
89 for (i = 0; i < GK20A_PMU_TRACE_BUFSIZE; i += 0x40) {
90 for (j = 0; j < 0x40; j++) {
91 if (trace1[(i / 4) + j]) {
92 break;
93 }
94 }
95 if (j == 0x40) {
96 break;
97 }
98 count = scnprintf(buf, 0x40, "Index %x: ", trace1[(i / 4)]);
99 l = 0;
100 m = 0;
101 while (nvgpu_find_hex_in_string((trace+i+20+m), g, &k)) {
102 if (k >= 40) {
103 break;
104 }
105 strncpy(part_str, (trace+i+20+m), k);
106 part_str[k] = '\0';
107 count += scnprintf((buf + count), 0x40, "%s0x%x",
108 part_str, trace1[(i / 4) + 1 + l]);
109 l++;
110 m += k + 2;
111 }
112
113 scnprintf((buf + count), 0x40, "%s", (trace+i+20+m));
114 nvgpu_err(g, "%s", buf);
115 }
116
117 nvgpu_kfree(g, tracebuffer);
118}
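
nvgpu_find_hex_in_string() only reports where the first "%x"/"%X" sits in a PMU trace format string; print_pmu_trace() then splices the raw argument words in at that point. A small, hedged usage sketch (the format string is made up):

/* Hedged usage sketch; the trace string below is a made-up example. */
static void find_hex_sketch(struct gk20a *g)
{
	u32 pos = 0;

	if (nvgpu_find_hex_in_string("gr engine state %x", g, &pos)) {
		/* pos == 16: index of '%' in the string above */
	}
}
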
119
120u32 gk20a_pmu_get_irqdest(struct gk20a *g)
121{
122 u32 intr_dest;
123
124 /* dest 0=falcon, 1=host; level 0=irq0, 1=irq1 */
125 intr_dest = pwr_falcon_irqdest_host_gptmr_f(0) |
126 pwr_falcon_irqdest_host_wdtmr_f(1) |
127 pwr_falcon_irqdest_host_mthd_f(0) |
128 pwr_falcon_irqdest_host_ctxsw_f(0) |
129 pwr_falcon_irqdest_host_halt_f(1) |
130 pwr_falcon_irqdest_host_exterr_f(0) |
131 pwr_falcon_irqdest_host_swgen0_f(1) |
132 pwr_falcon_irqdest_host_swgen1_f(0) |
133 pwr_falcon_irqdest_host_ext_f(0xff) |
134 pwr_falcon_irqdest_target_gptmr_f(1) |
135 pwr_falcon_irqdest_target_wdtmr_f(0) |
136 pwr_falcon_irqdest_target_mthd_f(0) |
137 pwr_falcon_irqdest_target_ctxsw_f(0) |
138 pwr_falcon_irqdest_target_halt_f(0) |
139 pwr_falcon_irqdest_target_exterr_f(0) |
140 pwr_falcon_irqdest_target_swgen0_f(0) |
141 pwr_falcon_irqdest_target_swgen1_f(0) |
142 pwr_falcon_irqdest_target_ext_f(0xff);
143
144 return intr_dest;
145}
146
147void gk20a_pmu_enable_irq(struct nvgpu_pmu *pmu, bool enable)
148{
149 struct gk20a *g = gk20a_from_pmu(pmu);
150 u32 intr_mask;
151 u32 intr_dest;
152
153 nvgpu_log_fn(g, " ");
154
155 g->ops.mc.intr_unit_config(g, MC_INTR_UNIT_DISABLE, true,
156 mc_intr_mask_0_pmu_enabled_f());
157 g->ops.mc.intr_unit_config(g, MC_INTR_UNIT_DISABLE, false,
158 mc_intr_mask_1_pmu_enabled_f());
159
160 nvgpu_flcn_set_irq(pmu->flcn, false, 0x0, 0x0);
161
162 if (enable) {
163 intr_dest = g->ops.pmu.get_irqdest(g);
164 /* 0=disable, 1=enable */
165 intr_mask = pwr_falcon_irqmset_gptmr_f(1) |
166 pwr_falcon_irqmset_wdtmr_f(1) |
167 pwr_falcon_irqmset_mthd_f(0) |
168 pwr_falcon_irqmset_ctxsw_f(0) |
169 pwr_falcon_irqmset_halt_f(1) |
170 pwr_falcon_irqmset_exterr_f(1) |
171 pwr_falcon_irqmset_swgen0_f(1) |
172 pwr_falcon_irqmset_swgen1_f(1);
173
174 nvgpu_flcn_set_irq(pmu->flcn, true, intr_mask, intr_dest);
175
176 g->ops.mc.intr_unit_config(g, MC_INTR_UNIT_ENABLE, true,
177 mc_intr_mask_0_pmu_enabled_f());
178 }
179
180 nvgpu_log_fn(g, "done");
181}
182
183
184
185int pmu_bootstrap(struct nvgpu_pmu *pmu)
186{
187 struct gk20a *g = gk20a_from_pmu(pmu);
188 struct mm_gk20a *mm = &g->mm;
189 struct pmu_ucode_desc *desc = pmu->desc;
190 u64 addr_code, addr_data, addr_load;
191 u32 i, blocks, addr_args;
192
193 nvgpu_log_fn(g, " ");
194
195 gk20a_writel(g, pwr_falcon_itfen_r(),
196 gk20a_readl(g, pwr_falcon_itfen_r()) |
197 pwr_falcon_itfen_ctxen_enable_f());
198 gk20a_writel(g, pwr_pmu_new_instblk_r(),
199 pwr_pmu_new_instblk_ptr_f(
200 nvgpu_inst_block_addr(g, &mm->pmu.inst_block) >> 12) |
201 pwr_pmu_new_instblk_valid_f(1) |
202 pwr_pmu_new_instblk_target_sys_coh_f());
203
204 /* TBD: load all other surfaces */
205 g->ops.pmu_ver.set_pmu_cmdline_args_trace_size(
206 pmu, GK20A_PMU_TRACE_BUFSIZE);
207 g->ops.pmu_ver.set_pmu_cmdline_args_trace_dma_base(pmu);
208 g->ops.pmu_ver.set_pmu_cmdline_args_trace_dma_idx(
209 pmu, GK20A_PMU_DMAIDX_VIRT);
210
211 g->ops.pmu_ver.set_pmu_cmdline_args_cpu_freq(pmu,
212 g->ops.clk.get_rate(g, CTRL_CLK_DOMAIN_PWRCLK));
213
214 addr_args = (pwr_falcon_hwcfg_dmem_size_v(
215 gk20a_readl(g, pwr_falcon_hwcfg_r()))
216 << GK20A_PMU_DMEM_BLKSIZE2) -
217 g->ops.pmu_ver.get_pmu_cmdline_args_size(pmu);
218
219 nvgpu_flcn_copy_to_dmem(pmu->flcn, addr_args,
220 (u8 *)(g->ops.pmu_ver.get_pmu_cmdline_args_ptr(pmu)),
221 g->ops.pmu_ver.get_pmu_cmdline_args_size(pmu), 0);
222
223 gk20a_writel(g, pwr_falcon_dmemc_r(0),
224 pwr_falcon_dmemc_offs_f(0) |
225 pwr_falcon_dmemc_blk_f(0) |
226 pwr_falcon_dmemc_aincw_f(1));
227
228 addr_code = u64_lo32((pmu->ucode.gpu_va +
229 desc->app_start_offset +
230			desc->app_resident_code_offset) >> 8);
231 addr_data = u64_lo32((pmu->ucode.gpu_va +
232 desc->app_start_offset +
233 desc->app_resident_data_offset) >> 8);
234 addr_load = u64_lo32((pmu->ucode.gpu_va +
235 desc->bootloader_start_offset) >> 8);
236
237 gk20a_writel(g, pwr_falcon_dmemd_r(0), GK20A_PMU_DMAIDX_UCODE);
238 gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_code);
239 gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_size);
240 gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_resident_code_size);
241 gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_imem_entry);
242 gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_data);
243 gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_resident_data_size);
244 gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_code);
245 gk20a_writel(g, pwr_falcon_dmemd_r(0), 0x1);
246 gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_args);
247
248 g->ops.pmu.write_dmatrfbase(g,
249 addr_load - (desc->bootloader_imem_offset >> 8));
250
251 blocks = ((desc->bootloader_size + 0xFF) & ~0xFF) >> 8;
252
253 for (i = 0; i < blocks; i++) {
254 gk20a_writel(g, pwr_falcon_dmatrfmoffs_r(),
255 desc->bootloader_imem_offset + (i << 8));
256 gk20a_writel(g, pwr_falcon_dmatrffboffs_r(),
257 desc->bootloader_imem_offset + (i << 8));
258 gk20a_writel(g, pwr_falcon_dmatrfcmd_r(),
259 pwr_falcon_dmatrfcmd_imem_f(1) |
260 pwr_falcon_dmatrfcmd_write_f(0) |
261 pwr_falcon_dmatrfcmd_size_f(6) |
262 pwr_falcon_dmatrfcmd_ctxdma_f(GK20A_PMU_DMAIDX_UCODE));
263 }
264
265 nvgpu_flcn_bootstrap(g->pmu.flcn, desc->bootloader_entry_point);
266
267 gk20a_writel(g, pwr_falcon_os_r(), desc->app_version);
268
269 return 0;
270}
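
The addr_args computation in pmu_bootstrap() places the command-line argument block at the very top of PMU DMEM: the size field of pwr_falcon_hwcfg_r() counts DMEM blocks, which are shifted up by GK20A_PMU_DMEM_BLKSIZE2 to get bytes, and the args size is subtracted from that. A worked example, assuming 256-byte blocks (GK20A_PMU_DMEM_BLKSIZE2 == 8) and hypothetical sizes:

/* Hedged arithmetic sketch for addr_args, assuming 256-byte DMEM blocks
 * (GK20A_PMU_DMEM_BLKSIZE2 == 8); both input values are hypothetical. */
static u32 pmu_args_offset_sketch(void)
{
	u32 dmem_blocks = 96;   /* pwr_falcon_hwcfg_dmem_size_v(): 96 * 256 B = 24 KB */
	u32 args_size   = 64;   /* get_pmu_cmdline_args_size(): 64 B */

	return (dmem_blocks << 8) - args_size;   /* 0x6000 - 0x40 = 0x5fc0 */
}
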
271
272void gk20a_pmu_pg_idle_counter_config(struct gk20a *g, u32 pg_engine_id)
273{
274 gk20a_writel(g, pwr_pmu_pg_idlefilth_r(pg_engine_id),
275 PMU_PG_IDLE_THRESHOLD);
276 gk20a_writel(g, pwr_pmu_pg_ppuidlefilth_r(pg_engine_id),
277 PMU_PG_POST_POWERUP_IDLE_THRESHOLD);
278}
279
280int gk20a_pmu_mutex_acquire(struct nvgpu_pmu *pmu, u32 id, u32 *token)
281{
282 struct gk20a *g = gk20a_from_pmu(pmu);
283 struct pmu_mutex *mutex;
284 u32 data, owner, max_retry;
285
286 if (!pmu->initialized) {
287 return -EINVAL;
288 }
289
290 BUG_ON(!token);
291 BUG_ON(!PMU_MUTEX_ID_IS_VALID(id));
292 BUG_ON(id > pmu->mutex_cnt);
293
294 mutex = &pmu->mutex[id];
295
296 owner = pwr_pmu_mutex_value_v(
297 gk20a_readl(g, pwr_pmu_mutex_r(mutex->index)));
298
299 if (*token != PMU_INVALID_MUTEX_OWNER_ID && *token == owner) {
300 BUG_ON(mutex->ref_cnt == 0);
301 gk20a_dbg_pmu(g, "already acquired by owner : 0x%08x", *token);
302 mutex->ref_cnt++;
303 return 0;
304 }
305
306 max_retry = 40;
307 do {
308 data = pwr_pmu_mutex_id_value_v(
309 gk20a_readl(g, pwr_pmu_mutex_id_r()));
310 if (data == pwr_pmu_mutex_id_value_init_v() ||
311 data == pwr_pmu_mutex_id_value_not_avail_v()) {
312 nvgpu_warn(g,
313 "fail to generate mutex token: val 0x%08x",
314 owner);
315 nvgpu_usleep_range(20, 40);
316 continue;
317 }
318
319 owner = data;
320 gk20a_writel(g, pwr_pmu_mutex_r(mutex->index),
321 pwr_pmu_mutex_value_f(owner));
322
323 data = pwr_pmu_mutex_value_v(
324 gk20a_readl(g, pwr_pmu_mutex_r(mutex->index)));
325
326 if (owner == data) {
327 mutex->ref_cnt = 1;
328 gk20a_dbg_pmu(g, "mutex acquired: id=%d, token=0x%x",
329 mutex->index, *token);
330 *token = owner;
331 return 0;
332 } else {
333 nvgpu_log_info(g, "fail to acquire mutex idx=0x%08x",
334 mutex->index);
335
336 data = gk20a_readl(g, pwr_pmu_mutex_id_release_r());
337 data = set_field(data,
338 pwr_pmu_mutex_id_release_value_m(),
339 pwr_pmu_mutex_id_release_value_f(owner));
340 gk20a_writel(g, pwr_pmu_mutex_id_release_r(), data);
341
342 nvgpu_usleep_range(20, 40);
343 continue;
344 }
345 } while (max_retry-- > 0);
346
347 return -EBUSY;
348}
349
350int gk20a_pmu_mutex_release(struct nvgpu_pmu *pmu, u32 id, u32 *token)
351{
352 struct gk20a *g = gk20a_from_pmu(pmu);
353 struct pmu_mutex *mutex;
354 u32 owner, data;
355
356 if (!pmu->initialized) {
357 return -EINVAL;
358 }
359
360 BUG_ON(!token);
361 BUG_ON(!PMU_MUTEX_ID_IS_VALID(id));
362 BUG_ON(id > pmu->mutex_cnt);
363
364 mutex = &pmu->mutex[id];
365
366 owner = pwr_pmu_mutex_value_v(
367 gk20a_readl(g, pwr_pmu_mutex_r(mutex->index)));
368
369 if (*token != owner) {
370		nvgpu_err(g, "requester 0x%08x does NOT match owner 0x%08x",
371 *token, owner);
372 return -EINVAL;
373 }
374
375 if (--mutex->ref_cnt > 0) {
376 return -EBUSY;
377 }
378
379 gk20a_writel(g, pwr_pmu_mutex_r(mutex->index),
380 pwr_pmu_mutex_value_initial_lock_f());
381
382 data = gk20a_readl(g, pwr_pmu_mutex_id_release_r());
383 data = set_field(data, pwr_pmu_mutex_id_release_value_m(),
384 pwr_pmu_mutex_id_release_value_f(owner));
385 gk20a_writel(g, pwr_pmu_mutex_id_release_r(), data);
386
387 gk20a_dbg_pmu(g, "mutex released: id=%d, token=0x%x",
388 mutex->index, *token);
389
390 return 0;
391}
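
gk20a_pmu_mutex_acquire()/release() implement the PMU hardware mutex handshake: acquire obtains a token from pwr_pmu_mutex_id_r(), writes it into the mutex register and re-reads it to confirm ownership; release must be handed the same token back. A minimal usage sketch, where PMU_MUTEX_ID_FIFO stands in for whichever mutex id the caller actually holds:

/* Hedged usage sketch of the HW mutex pair above; the mutex id is only
 * a plausible example. */
static void pmu_mutex_usage_sketch(struct nvgpu_pmu *pmu)
{
	u32 token = PMU_INVALID_MUTEX_OWNER_ID;

	if (gk20a_pmu_mutex_acquire(pmu, PMU_MUTEX_ID_FIFO, &token) == 0) {
		/* ... touch the resource shared with the PMU ucode ... */
		gk20a_pmu_mutex_release(pmu, PMU_MUTEX_ID_FIFO, &token);
	}
}
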
392
393int gk20a_pmu_queue_head(struct gk20a *g, struct nvgpu_falcon_queue *queue,
394 u32 *head, bool set)
395{
396 u32 queue_head_size = 0;
397
398 if (g->ops.pmu.pmu_get_queue_head_size) {
399 queue_head_size = g->ops.pmu.pmu_get_queue_head_size();
400 }
401
402 BUG_ON(!head || !queue_head_size);
403
404 if (PMU_IS_COMMAND_QUEUE(queue->id)) {
405
406 if (queue->index >= queue_head_size) {
407 return -EINVAL;
408 }
409
410 if (!set) {
411 *head = pwr_pmu_queue_head_address_v(
412 gk20a_readl(g,
413 g->ops.pmu.pmu_get_queue_head(queue->index)));
414 } else {
415 gk20a_writel(g,
416 g->ops.pmu.pmu_get_queue_head(queue->index),
417 pwr_pmu_queue_head_address_f(*head));
418 }
419 } else {
420 if (!set) {
421 *head = pwr_pmu_msgq_head_val_v(
422 gk20a_readl(g, pwr_pmu_msgq_head_r()));
423 } else {
424 gk20a_writel(g,
425 pwr_pmu_msgq_head_r(),
426 pwr_pmu_msgq_head_val_f(*head));
427 }
428 }
429
430 return 0;
431}
432
433int gk20a_pmu_queue_tail(struct gk20a *g, struct nvgpu_falcon_queue *queue,
434 u32 *tail, bool set)
435{
436 u32 queue_tail_size = 0;
437
438 if (g->ops.pmu.pmu_get_queue_tail_size) {
439 queue_tail_size = g->ops.pmu.pmu_get_queue_tail_size();
440 }
441
442 BUG_ON(!tail || !queue_tail_size);
443
444 if (PMU_IS_COMMAND_QUEUE(queue->id)) {
445
446 if (queue->index >= queue_tail_size) {
447 return -EINVAL;
448 }
449
450 if (!set) {
451 *tail = pwr_pmu_queue_tail_address_v(gk20a_readl(g,
452 g->ops.pmu.pmu_get_queue_tail(queue->index)));
453 } else {
454 gk20a_writel(g,
455 g->ops.pmu.pmu_get_queue_tail(queue->index),
456 pwr_pmu_queue_tail_address_f(*tail));
457 }
458
459 } else {
460 if (!set) {
461 *tail = pwr_pmu_msgq_tail_val_v(
462 gk20a_readl(g, pwr_pmu_msgq_tail_r()));
463 } else {
464 gk20a_writel(g,
465 pwr_pmu_msgq_tail_r(),
466 pwr_pmu_msgq_tail_val_f(*tail));
467 }
468 }
469
470 return 0;
471}
472
473void gk20a_pmu_msgq_tail(struct nvgpu_pmu *pmu, u32 *tail, bool set)
474{
475 struct gk20a *g = gk20a_from_pmu(pmu);
476 u32 queue_tail_size = 0;
477
478 if (g->ops.pmu.pmu_get_queue_tail_size) {
479 queue_tail_size = g->ops.pmu.pmu_get_queue_tail_size();
480 }
481
482 BUG_ON(!tail || !queue_tail_size);
483
484 if (!set) {
485 *tail = pwr_pmu_msgq_tail_val_v(
486 gk20a_readl(g, pwr_pmu_msgq_tail_r()));
487 } else {
488 gk20a_writel(g,
489 pwr_pmu_msgq_tail_r(),
490 pwr_pmu_msgq_tail_val_f(*tail));
491 }
492}
493
494void gk20a_write_dmatrfbase(struct gk20a *g, u32 addr)
495{
496 gk20a_writel(g, pwr_falcon_dmatrfbase_r(), addr);
497}
498
499bool gk20a_pmu_is_engine_in_reset(struct gk20a *g)
500{
501 bool status = false;
502
503 status = g->ops.mc.is_enabled(g, NVGPU_UNIT_PWR);
504
505 return status;
506}
507
508int gk20a_pmu_engine_reset(struct gk20a *g, bool do_reset)
509{
510 u32 reset_mask = g->ops.mc.reset_mask(g, NVGPU_UNIT_PWR);
511
512 if (do_reset) {
513 g->ops.mc.enable(g, reset_mask);
514 } else {
515 g->ops.mc.disable(g, reset_mask);
516 }
517
518 return 0;
519}
520
521bool gk20a_is_pmu_supported(struct gk20a *g)
522{
523 return true;
524}
525
526u32 gk20a_pmu_pg_engines_list(struct gk20a *g)
527{
528 return BIT(PMU_PG_ELPG_ENGINE_ID_GRAPHICS);
529}
530
531u32 gk20a_pmu_pg_feature_list(struct gk20a *g, u32 pg_engine_id)
532{
533 if (pg_engine_id == PMU_PG_ELPG_ENGINE_ID_GRAPHICS) {
534 return NVGPU_PMU_GR_FEATURE_MASK_POWER_GATING;
535 }
536
537 return 0;
538}
539
540static void pmu_handle_zbc_msg(struct gk20a *g, struct pmu_msg *msg,
541 void *param, u32 handle, u32 status)
542{
543 struct nvgpu_pmu *pmu = param;
544 gk20a_dbg_pmu(g, "reply ZBC_TABLE_UPDATE");
545 pmu->zbc_save_done = 1;
546}
547
548void gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
549{
550 struct nvgpu_pmu *pmu = &g->pmu;
551 struct pmu_cmd cmd;
552 u32 seq;
553
554 if (!pmu->pmu_ready || !entries || !pmu->zbc_ready) {
555 return;
556 }
557
558 memset(&cmd, 0, sizeof(struct pmu_cmd));
559 cmd.hdr.unit_id = PMU_UNIT_PG;
560 cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_zbc_cmd);
561 cmd.cmd.zbc.cmd_type = g->pmu_ver_cmd_id_zbc_table_update;
562 cmd.cmd.zbc.entry_mask = ZBC_MASK(entries);
563
564 pmu->zbc_save_done = 0;
565
566 gk20a_dbg_pmu(g, "cmd post ZBC_TABLE_UPDATE");
567 nvgpu_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
568 pmu_handle_zbc_msg, pmu, &seq, ~0);
569 pmu_wait_message_cond(pmu, gk20a_get_gr_idle_timeout(g),
570 &pmu->zbc_save_done, 1);
571 if (!pmu->zbc_save_done) {
572 nvgpu_err(g, "ZBC save timeout");
573 }
574}
575
576int nvgpu_pmu_handle_therm_event(struct nvgpu_pmu *pmu,
577 struct nv_pmu_therm_msg *msg)
578{
579 struct gk20a *g = gk20a_from_pmu(pmu);
580
581 nvgpu_log_fn(g, " ");
582
583 switch (msg->msg_type) {
584 case NV_PMU_THERM_MSG_ID_EVENT_HW_SLOWDOWN_NOTIFICATION:
585 if (msg->hw_slct_msg.mask == BIT(NV_PMU_THERM_EVENT_THERMAL_1)) {
586 nvgpu_clk_arb_send_thermal_alarm(pmu->g);
587 } else {
588 gk20a_dbg_pmu(g, "Unwanted/Unregistered thermal event received %d",
589 msg->hw_slct_msg.mask);
590 }
591 break;
592 default:
593		gk20a_dbg_pmu(g, "unknown therm event received %d", msg->msg_type);
594 break;
595 }
596
597 return 0;
598}
599
600void gk20a_pmu_dump_elpg_stats(struct nvgpu_pmu *pmu)
601{
602 struct gk20a *g = gk20a_from_pmu(pmu);
603
604 gk20a_dbg_pmu(g, "pwr_pmu_idle_mask_supp_r(3): 0x%08x",
605 gk20a_readl(g, pwr_pmu_idle_mask_supp_r(3)));
606 gk20a_dbg_pmu(g, "pwr_pmu_idle_mask_1_supp_r(3): 0x%08x",
607 gk20a_readl(g, pwr_pmu_idle_mask_1_supp_r(3)));
608 gk20a_dbg_pmu(g, "pwr_pmu_idle_ctrl_supp_r(3): 0x%08x",
609 gk20a_readl(g, pwr_pmu_idle_ctrl_supp_r(3)));
610 gk20a_dbg_pmu(g, "pwr_pmu_pg_idle_cnt_r(0): 0x%08x",
611 gk20a_readl(g, pwr_pmu_pg_idle_cnt_r(0)));
612 gk20a_dbg_pmu(g, "pwr_pmu_pg_intren_r(0): 0x%08x",
613 gk20a_readl(g, pwr_pmu_pg_intren_r(0)));
614
615 gk20a_dbg_pmu(g, "pwr_pmu_idle_count_r(3): 0x%08x",
616 gk20a_readl(g, pwr_pmu_idle_count_r(3)));
617 gk20a_dbg_pmu(g, "pwr_pmu_idle_count_r(4): 0x%08x",
618 gk20a_readl(g, pwr_pmu_idle_count_r(4)));
619 gk20a_dbg_pmu(g, "pwr_pmu_idle_count_r(7): 0x%08x",
620 gk20a_readl(g, pwr_pmu_idle_count_r(7)));
621}
622
623void gk20a_pmu_dump_falcon_stats(struct nvgpu_pmu *pmu)
624{
625 struct gk20a *g = gk20a_from_pmu(pmu);
626 unsigned int i;
627
628 for (i = 0; i < pwr_pmu_mailbox__size_1_v(); i++) {
629 nvgpu_err(g, "pwr_pmu_mailbox_r(%d) : 0x%x",
630 i, gk20a_readl(g, pwr_pmu_mailbox_r(i)));
631 }
632
633 for (i = 0; i < pwr_pmu_debug__size_1_v(); i++) {
634 nvgpu_err(g, "pwr_pmu_debug_r(%d) : 0x%x",
635 i, gk20a_readl(g, pwr_pmu_debug_r(i)));
636 }
637
638 i = gk20a_readl(g, pwr_pmu_bar0_error_status_r());
639 nvgpu_err(g, "pwr_pmu_bar0_error_status_r : 0x%x", i);
640 if (i != 0) {
641 nvgpu_err(g, "pwr_pmu_bar0_addr_r : 0x%x",
642 gk20a_readl(g, pwr_pmu_bar0_addr_r()));
643 nvgpu_err(g, "pwr_pmu_bar0_data_r : 0x%x",
644 gk20a_readl(g, pwr_pmu_bar0_data_r()));
645 nvgpu_err(g, "pwr_pmu_bar0_timeout_r : 0x%x",
646 gk20a_readl(g, pwr_pmu_bar0_timeout_r()));
647 nvgpu_err(g, "pwr_pmu_bar0_ctl_r : 0x%x",
648 gk20a_readl(g, pwr_pmu_bar0_ctl_r()));
649 }
650
651 i = gk20a_readl(g, pwr_pmu_bar0_fecs_error_r());
652 nvgpu_err(g, "pwr_pmu_bar0_fecs_error_r : 0x%x", i);
653
654 i = gk20a_readl(g, pwr_falcon_exterrstat_r());
655 nvgpu_err(g, "pwr_falcon_exterrstat_r : 0x%x", i);
656 if (pwr_falcon_exterrstat_valid_v(i) ==
657 pwr_falcon_exterrstat_valid_true_v()) {
658 nvgpu_err(g, "pwr_falcon_exterraddr_r : 0x%x",
659 gk20a_readl(g, pwr_falcon_exterraddr_r()));
660 }
661
662 /* Print PMU F/W debug prints */
663 print_pmu_trace(pmu);
664}
665
666bool gk20a_pmu_is_interrupted(struct nvgpu_pmu *pmu)
667{
668 struct gk20a *g = gk20a_from_pmu(pmu);
669 u32 servicedpmuint;
670
671 servicedpmuint = pwr_falcon_irqstat_halt_true_f() |
672 pwr_falcon_irqstat_exterr_true_f() |
673 pwr_falcon_irqstat_swgen0_true_f();
674
675 if (gk20a_readl(g, pwr_falcon_irqstat_r()) & servicedpmuint) {
676 return true;
677 }
678
679 return false;
680}
681
682void gk20a_pmu_isr(struct gk20a *g)
683{
684 struct nvgpu_pmu *pmu = &g->pmu;
685 struct nvgpu_falcon_queue *queue;
686 u32 intr, mask;
687 bool recheck = false;
688
689 nvgpu_log_fn(g, " ");
690
691 nvgpu_mutex_acquire(&pmu->isr_mutex);
692 if (!pmu->isr_enabled) {
693 nvgpu_mutex_release(&pmu->isr_mutex);
694 return;
695 }
696
697 mask = gk20a_readl(g, pwr_falcon_irqmask_r()) &
698 gk20a_readl(g, pwr_falcon_irqdest_r());
699
700 intr = gk20a_readl(g, pwr_falcon_irqstat_r());
701
702 gk20a_dbg_pmu(g, "received falcon interrupt: 0x%08x", intr);
703
704 intr = gk20a_readl(g, pwr_falcon_irqstat_r()) & mask;
705 if (!intr || pmu->pmu_state == PMU_STATE_OFF) {
706 gk20a_writel(g, pwr_falcon_irqsclr_r(), intr);
707 nvgpu_mutex_release(&pmu->isr_mutex);
708 return;
709 }
710
711 if (intr & pwr_falcon_irqstat_halt_true_f()) {
712 nvgpu_err(g, "pmu halt intr not implemented");
713 nvgpu_pmu_dump_falcon_stats(pmu);
714 if (gk20a_readl(g, pwr_pmu_mailbox_r
715 (PMU_MODE_MISMATCH_STATUS_MAILBOX_R)) ==
716 PMU_MODE_MISMATCH_STATUS_VAL) {
717 if (g->ops.pmu.dump_secure_fuses) {
718 g->ops.pmu.dump_secure_fuses(g);
719 }
720 }
721 }
722 if (intr & pwr_falcon_irqstat_exterr_true_f()) {
723 nvgpu_err(g,
724 "pmu exterr intr not implemented. Clearing interrupt.");
725 nvgpu_pmu_dump_falcon_stats(pmu);
726
727 gk20a_writel(g, pwr_falcon_exterrstat_r(),
728 gk20a_readl(g, pwr_falcon_exterrstat_r()) &
729 ~pwr_falcon_exterrstat_valid_m());
730 }
731
732 if (g->ops.pmu.handle_ext_irq) {
733 g->ops.pmu.handle_ext_irq(g, intr);
734 }
735
736 if (intr & pwr_falcon_irqstat_swgen0_true_f()) {
737 nvgpu_pmu_process_message(pmu);
738 recheck = true;
739 }
740
741 gk20a_writel(g, pwr_falcon_irqsclr_r(), intr);
742
743 if (recheck) {
744 queue = &pmu->queue[PMU_MESSAGE_QUEUE];
745 if (!nvgpu_flcn_queue_is_empty(pmu->flcn, queue)) {
746 gk20a_writel(g, pwr_falcon_irqsset_r(),
747 pwr_falcon_irqsset_swgen0_set_f());
748 }
749 }
750
751 nvgpu_mutex_release(&pmu->isr_mutex);
752}
753
754void gk20a_pmu_init_perfmon_counter(struct gk20a *g)
755{
756 u32 data;
757
758 /* use counter #3 for GR && CE2 busy cycles */
759 gk20a_writel(g, pwr_pmu_idle_mask_r(3),
760 pwr_pmu_idle_mask_gr_enabled_f() |
761 pwr_pmu_idle_mask_ce_2_enabled_f());
762
763 /* assign same mask setting from GR ELPG to counter #3 */
764 data = gk20a_readl(g, pwr_pmu_idle_mask_1_supp_r(0));
765 gk20a_writel(g, pwr_pmu_idle_mask_1_r(3), data);
766
767 /* disable idle filtering for counters 3 and 6 */
768 data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(3));
769 data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
770 pwr_pmu_idle_ctrl_filter_m(),
771 pwr_pmu_idle_ctrl_value_busy_f() |
772 pwr_pmu_idle_ctrl_filter_disabled_f());
773 gk20a_writel(g, pwr_pmu_idle_ctrl_r(3), data);
774
775 /* use counter #6 for total cycles */
776 data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(6));
777 data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
778 pwr_pmu_idle_ctrl_filter_m(),
779 pwr_pmu_idle_ctrl_value_always_f() |
780 pwr_pmu_idle_ctrl_filter_disabled_f());
781 gk20a_writel(g, pwr_pmu_idle_ctrl_r(6), data);
782
783 /*
784 * We don't want to disturb counters #3 and #6, which are used by
785	 * perfmon, so we also wire up counters #1 and #2 to expose
786	 * raw counter readings.
787 */
788 gk20a_writel(g, pwr_pmu_idle_mask_r(1),
789 pwr_pmu_idle_mask_gr_enabled_f() |
790 pwr_pmu_idle_mask_ce_2_enabled_f());
791
792 data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(1));
793 data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
794 pwr_pmu_idle_ctrl_filter_m(),
795 pwr_pmu_idle_ctrl_value_busy_f() |
796 pwr_pmu_idle_ctrl_filter_disabled_f());
797 gk20a_writel(g, pwr_pmu_idle_ctrl_r(1), data);
798
799 data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(2));
800 data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
801 pwr_pmu_idle_ctrl_filter_m(),
802 pwr_pmu_idle_ctrl_value_always_f() |
803 pwr_pmu_idle_ctrl_filter_disabled_f());
804 gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data);
805
806 /*
807 * use counters 4 and 0 for perfmon to log busy cycles and total cycles
808 * counter #0 overflow sets pmu idle intr status bit
809 */
810 gk20a_writel(g, pwr_pmu_idle_intr_r(),
811 pwr_pmu_idle_intr_en_f(0));
812
813 gk20a_writel(g, pwr_pmu_idle_threshold_r(0),
814 pwr_pmu_idle_threshold_value_f(0x7FFFFFFF));
815
816 data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(0));
817 data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
818 pwr_pmu_idle_ctrl_filter_m(),
819 pwr_pmu_idle_ctrl_value_always_f() |
820 pwr_pmu_idle_ctrl_filter_disabled_f());
821 gk20a_writel(g, pwr_pmu_idle_ctrl_r(0), data);
822
823 gk20a_writel(g, pwr_pmu_idle_mask_r(4),
824 pwr_pmu_idle_mask_gr_enabled_f() |
825 pwr_pmu_idle_mask_ce_2_enabled_f());
826
827 data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(4));
828 data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
829 pwr_pmu_idle_ctrl_filter_m(),
830 pwr_pmu_idle_ctrl_value_busy_f() |
831 pwr_pmu_idle_ctrl_filter_disabled_f());
832 gk20a_writel(g, pwr_pmu_idle_ctrl_r(4), data);
833
834 gk20a_writel(g, pwr_pmu_idle_count_r(0), pwr_pmu_idle_count_reset_f(1));
835 gk20a_writel(g, pwr_pmu_idle_count_r(4), pwr_pmu_idle_count_reset_f(1));
836 gk20a_writel(g, pwr_pmu_idle_intr_status_r(),
837 pwr_pmu_idle_intr_status_intr_f(1));
838}
839
840u32 gk20a_pmu_read_idle_counter(struct gk20a *g, u32 counter_id)
841{
842 return pwr_pmu_idle_count_value_v(
843 gk20a_readl(g, pwr_pmu_idle_count_r(counter_id)));
844}
845
846void gk20a_pmu_reset_idle_counter(struct gk20a *g, u32 counter_id)
847{
848 gk20a_writel(g, pwr_pmu_idle_count_r(counter_id),
849 pwr_pmu_idle_count_reset_f(1));
850}
851
852u32 gk20a_pmu_read_idle_intr_status(struct gk20a *g)
853{
854 return pwr_pmu_idle_intr_status_intr_v(
855 gk20a_readl(g, pwr_pmu_idle_intr_status_r()));
856}
857
858void gk20a_pmu_clear_idle_intr_status(struct gk20a *g)
859{
860 gk20a_writel(g, pwr_pmu_idle_intr_status_r(),
861 pwr_pmu_idle_intr_status_intr_f(1));
862}
863
864void gk20a_pmu_elpg_statistics(struct gk20a *g, u32 pg_engine_id,
865 struct pmu_pg_stats_data *pg_stat_data)
866{
867 struct nvgpu_pmu *pmu = &g->pmu;
868 struct pmu_pg_stats stats;
869
870 nvgpu_flcn_copy_from_dmem(pmu->flcn,
871 pmu->stat_dmem_offset[pg_engine_id],
872 (u8 *)&stats, sizeof(struct pmu_pg_stats), 0);
873
874 pg_stat_data->ingating_time = stats.pg_ingating_time_us;
875 pg_stat_data->ungating_time = stats.pg_ungating_time_us;
876 pg_stat_data->gating_cnt = stats.pg_gating_cnt;
877 pg_stat_data->avg_entry_latency_us = stats.pg_avg_entry_time_us;
878 pg_stat_data->avg_exit_latency_us = stats.pg_avg_exit_time_us;
879}
diff --git a/include/gk20a/pmu_gk20a.h b/include/gk20a/pmu_gk20a.h
new file mode 100644
index 0000000..65ffd63
--- /dev/null
+++ b/include/gk20a/pmu_gk20a.h
@@ -0,0 +1,80 @@
1/*
2 * drivers/video/tegra/host/gk20a/pmu_gk20a.h
3 *
4 * GK20A PMU (aka. gPMU outside gk20a context)
5 *
6 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 * DEALINGS IN THE SOFTWARE.
25 */
26#ifndef NVGPU_GK20A_PMU_GK20A_H
27#define NVGPU_GK20A_PMU_GK20A_H
28
29#include <nvgpu/flcnif_cmn.h>
30#include <nvgpu/pmuif/nvgpu_gpmu_cmdif.h>
31#include <nvgpu/pmu.h>
32
33struct nvgpu_firmware;
34
35#define ZBC_MASK(i) (~(~(0) << ((i)+1)) & 0xfffe)
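
ZBC_MASK(i) builds a bitmask selecting ZBC table entries 1..i; the final & 0xfffe always drops entry 0. A quick worked expansion for i == 3, kept as a comment:

/*
 * Worked example (arithmetic only):
 *   ~(~0 << 4)          == 0x0000000f
 *   0x0000000f & 0xfffe == 0x0000000e   -> entries 1, 2 and 3 selected
 */
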
36
37bool gk20a_pmu_is_interrupted(struct nvgpu_pmu *pmu);
38void gk20a_pmu_isr(struct gk20a *g);
39
40u32 gk20a_pmu_pg_engines_list(struct gk20a *g);
41u32 gk20a_pmu_pg_feature_list(struct gk20a *g, u32 pg_engine_id);
42
43void gk20a_pmu_save_zbc(struct gk20a *g, u32 entries);
44
45void gk20a_pmu_init_perfmon_counter(struct gk20a *g);
46
47void gk20a_pmu_pg_idle_counter_config(struct gk20a *g, u32 pg_engine_id);
48
49int gk20a_pmu_mutex_acquire(struct nvgpu_pmu *pmu, u32 id, u32 *token);
50int gk20a_pmu_mutex_release(struct nvgpu_pmu *pmu, u32 id, u32 *token);
51
52int gk20a_pmu_queue_head(struct gk20a *g, struct nvgpu_falcon_queue *queue,
53 u32 *head, bool set);
54int gk20a_pmu_queue_tail(struct gk20a *g, struct nvgpu_falcon_queue *queue,
55 u32 *tail, bool set);
56void gk20a_pmu_msgq_tail(struct nvgpu_pmu *pmu, u32 *tail, bool set);
57
58u32 gk20a_pmu_read_idle_counter(struct gk20a *g, u32 counter_id);
59void gk20a_pmu_reset_idle_counter(struct gk20a *g, u32 counter_id);
60
61u32 gk20a_pmu_read_idle_intr_status(struct gk20a *g);
62void gk20a_pmu_clear_idle_intr_status(struct gk20a *g);
63
64void gk20a_write_dmatrfbase(struct gk20a *g, u32 addr);
65bool gk20a_is_pmu_supported(struct gk20a *g);
66
67int pmu_bootstrap(struct nvgpu_pmu *pmu);
68
69void gk20a_pmu_dump_elpg_stats(struct nvgpu_pmu *pmu);
70void gk20a_pmu_dump_falcon_stats(struct nvgpu_pmu *pmu);
71
72void gk20a_pmu_enable_irq(struct nvgpu_pmu *pmu, bool enable);
73void pmu_handle_fecs_boot_acr_msg(struct gk20a *g, struct pmu_msg *msg,
74 void *param, u32 handle, u32 status);
75void gk20a_pmu_elpg_statistics(struct gk20a *g, u32 pg_engine_id,
76 struct pmu_pg_stats_data *pg_stat_data);
77bool gk20a_pmu_is_engine_in_reset(struct gk20a *g);
78int gk20a_pmu_engine_reset(struct gk20a *g, bool do_reset);
79u32 gk20a_pmu_get_irqdest(struct gk20a *g);
80#endif /*NVGPU_GK20A_PMU_GK20A_H*/
diff --git a/include/gk20a/regops_gk20a.c b/include/gk20a/regops_gk20a.c
new file mode 100644
index 0000000..0aec4f8
--- /dev/null
+++ b/include/gk20a/regops_gk20a.c
@@ -0,0 +1,472 @@
1/*
2 * Tegra GK20A GPU Debugger Driver Register Ops
3 *
4 * Copyright (c) 2013-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25#include "gk20a.h"
26#include "gr_gk20a.h"
27#include "dbg_gpu_gk20a.h"
28#include "regops_gk20a.h"
29
30#include <nvgpu/log.h>
31#include <nvgpu/bsearch.h>
32#include <nvgpu/bug.h>
33#include <nvgpu/io.h>
34
35static int regop_bsearch_range_cmp(const void *pkey, const void *pelem)
36{
37 u32 key = *(u32 *)pkey;
38 struct regop_offset_range *prange = (struct regop_offset_range *)pelem;
39 if (key < prange->base) {
40 return -1;
41 } else if (prange->base <= key && key < (prange->base +
42 (prange->count * 4U))) {
43 return 0;
44 }
45 return 1;
46}
47
48static inline bool linear_search(u32 offset, const u32 *list, int size)
49{
50 int i;
51 for (i = 0; i < size; i++) {
52 if (list[i] == offset) {
53 return true;
54 }
55 }
56 return false;
57}
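
regop_bsearch_range_cmp() treats each whitelist entry as the half-open range [base, base + count*4), which lets bsearch() answer the offset check in O(log n); linear_search() covers the small flat lists. A hedged illustration of the comparator on a made-up range:

/* Hedged sketch of regop_bsearch_range_cmp() semantics; the range and
 * offsets are made up for illustration. */
static void regop_range_cmp_sketch(void)
{
	struct regop_offset_range r = { .base = 0x00419000, .count = 8 };
	u32 below = 0x00418ffcU, inside = 0x00419010U, above = 0x00419020U;

	(void)regop_bsearch_range_cmp(&below, &r);   /* -1: key precedes the range */
	(void)regop_bsearch_range_cmp(&inside, &r);  /*  0: within base..base+count*4 */
	(void)regop_bsearch_range_cmp(&above, &r);   /*  1: past the last register */
}
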
58
59/*
60 * In order to perform a context relative op the context has
61 * to be created already... which would imply that the
62 * context switch mechanism has already been put in place.
63 * So by the time we perform such an operation it should always
64 * be possible to query for the appropriate context offsets, etc.
65 *
66 * But note: while the dbg_gpu bind requires a channel fd,
67 * it doesn't require an allocated gr/compute obj at that point...
68 */
69static bool gr_context_info_available(struct gr_gk20a *gr)
70{
71 int err;
72
73 nvgpu_mutex_acquire(&gr->ctx_mutex);
74 err = !gr->ctx_vars.golden_image_initialized;
75 nvgpu_mutex_release(&gr->ctx_mutex);
76 if (err) {
77 return false;
78 }
79
80 return true;
81
82}
83
84static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s,
85 u32 *ctx_rd_count, u32 *ctx_wr_count,
86 struct nvgpu_dbg_reg_op *ops,
87 u32 op_count);
88
89
90int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s,
91 struct nvgpu_dbg_reg_op *ops,
92 u64 num_ops,
93 bool *is_current_ctx)
94{
95 int err = 0;
96 unsigned int i;
97 struct channel_gk20a *ch = NULL;
98 struct gk20a *g = dbg_s->g;
99 /*struct gr_gk20a *gr = &g->gr;*/
100 u32 data32_lo = 0, data32_hi = 0;
101 u32 ctx_rd_count = 0, ctx_wr_count = 0;
102 bool skip_read_lo, skip_read_hi;
103 bool ok;
104
105 nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
106
107 ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
108
109 /* For vgpu, the regops routines need to be handled in the
110 * context of the server and support for that does not exist.
111 *
112 * The two users of the regops interface are the compute driver
113 * and tools. The compute driver will work without a functional
114 * regops implementation, so we return -ENOSYS. This will allow
115 * compute apps to run with vgpu. Tools will not work in this
116 * configuration and are not required to work at this time. */
117 if (g->is_virtual) {
118 return -ENOSYS;
119 }
120
121 ok = validate_reg_ops(dbg_s,
122 &ctx_rd_count, &ctx_wr_count,
123 ops, num_ops);
124 if (!ok) {
125 nvgpu_err(g, "invalid op(s)");
126 err = -EINVAL;
127 /* each op has its own err/status */
128 goto clean_up;
129 }
130
131 /* be sure that ctx info is in place if there are ctx ops */
132 if (ctx_wr_count | ctx_rd_count) {
133 if (!gr_context_info_available(&g->gr)) {
134 nvgpu_err(g, "gr context data not available");
135 return -ENODEV;
136 }
137 }
138
139 for (i = 0; i < num_ops; i++) {
140 /* if it isn't global then it is done in the ctx ops... */
141 if (ops[i].type != REGOP(TYPE_GLOBAL)) {
142 continue;
143 }
144
145 switch (ops[i].op) {
146
147 case REGOP(READ_32):
148 ops[i].value_hi = 0;
149 ops[i].value_lo = gk20a_readl(g, ops[i].offset);
150 nvgpu_log(g, gpu_dbg_gpu_dbg, "read_32 0x%08x from 0x%08x",
151 ops[i].value_lo, ops[i].offset);
152
153 break;
154
155 case REGOP(READ_64):
156 ops[i].value_lo = gk20a_readl(g, ops[i].offset);
157 ops[i].value_hi =
158 gk20a_readl(g, ops[i].offset + 4);
159
160 nvgpu_log(g, gpu_dbg_gpu_dbg, "read_64 0x%08x:%08x from 0x%08x",
161 ops[i].value_hi, ops[i].value_lo,
162 ops[i].offset);
163 break;
164
165 case REGOP(WRITE_32):
166 case REGOP(WRITE_64):
167 /* some of this appears wonky/unnecessary but
168 we've kept it for compat with existing
169 debugger code. just in case... */
170 skip_read_lo = skip_read_hi = false;
171 if (ops[i].and_n_mask_lo == ~(u32)0) {
172 data32_lo = ops[i].value_lo;
173 skip_read_lo = true;
174 }
175
176 if ((ops[i].op == REGOP(WRITE_64)) &&
177 (ops[i].and_n_mask_hi == ~(u32)0)) {
178 data32_hi = ops[i].value_hi;
179 skip_read_hi = true;
180 }
181
182 /* read first 32bits */
183 if (skip_read_lo == false) {
184 data32_lo = gk20a_readl(g, ops[i].offset);
185 data32_lo &= ~ops[i].and_n_mask_lo;
186 data32_lo |= ops[i].value_lo;
187 }
188
189 /* if desired, read second 32bits */
190 if ((ops[i].op == REGOP(WRITE_64)) &&
191 !skip_read_hi) {
192 data32_hi = gk20a_readl(g, ops[i].offset + 4);
193 data32_hi &= ~ops[i].and_n_mask_hi;
194 data32_hi |= ops[i].value_hi;
195 }
196
197 /* now update first 32bits */
198 gk20a_writel(g, ops[i].offset, data32_lo);
199 nvgpu_log(g, gpu_dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ",
200 data32_lo, ops[i].offset);
201 /* if desired, update second 32bits */
202 if (ops[i].op == REGOP(WRITE_64)) {
203 gk20a_writel(g, ops[i].offset + 4, data32_hi);
204 nvgpu_log(g, gpu_dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ",
205 data32_hi, ops[i].offset + 4);
206
207 }
208
209
210 break;
211
212 /* shouldn't happen as we've already screened */
213 default:
214 BUG();
215 err = -EINVAL;
216 goto clean_up;
217 break;
218 }
219 }
220
221 if (ctx_wr_count | ctx_rd_count) {
222 err = gr_gk20a_exec_ctx_ops(ch, ops, num_ops,
223 ctx_wr_count, ctx_rd_count,
224 is_current_ctx);
225 if (err) {
226 nvgpu_warn(g, "failed to perform ctx ops\n");
227 goto clean_up;
228 }
229 }
230
231 clean_up:
232 nvgpu_log(g, gpu_dbg_gpu_dbg, "ret=%d", err);
233 return err;
234
235}
236
237
238static int validate_reg_op_info(struct dbg_session_gk20a *dbg_s,
239 struct nvgpu_dbg_reg_op *op)
240{
241 int err = 0;
242
243 op->status = REGOP(STATUS_SUCCESS);
244
245 switch (op->op) {
246 case REGOP(READ_32):
247 case REGOP(READ_64):
248 case REGOP(WRITE_32):
249 case REGOP(WRITE_64):
250 break;
251 default:
252 op->status |= REGOP(STATUS_UNSUPPORTED_OP);
253 err = -EINVAL;
254 break;
255 }
256
257 switch (op->type) {
258 case REGOP(TYPE_GLOBAL):
259 case REGOP(TYPE_GR_CTX):
260 case REGOP(TYPE_GR_CTX_TPC):
261 case REGOP(TYPE_GR_CTX_SM):
262 case REGOP(TYPE_GR_CTX_CROP):
263 case REGOP(TYPE_GR_CTX_ZROP):
264 case REGOP(TYPE_GR_CTX_QUAD):
265 break;
266 /*
267 case NVGPU_DBG_GPU_REG_OP_TYPE_FB:
268 */
269 default:
270 op->status |= REGOP(STATUS_INVALID_TYPE);
271 err = -EINVAL;
272 break;
273 }
274
275 return err;
276}
277
278static bool check_whitelists(struct dbg_session_gk20a *dbg_s,
279 struct nvgpu_dbg_reg_op *op, u32 offset)
280{
281 struct gk20a *g = dbg_s->g;
282 bool valid = false;
283 struct channel_gk20a *ch;
284
285 ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
286
287 if (op->type == REGOP(TYPE_GLOBAL)) {
288 /* search global list */
289 valid = g->ops.regops.get_global_whitelist_ranges &&
290 !!bsearch(&offset,
291 g->ops.regops.get_global_whitelist_ranges(),
292 g->ops.regops.get_global_whitelist_ranges_count(),
293 sizeof(*g->ops.regops.get_global_whitelist_ranges()),
294 regop_bsearch_range_cmp);
295
296 /* if debug session and channel is bound search context list */
297 if ((!valid) && (!dbg_s->is_profiler && ch)) {
298 /* binary search context list */
299 valid = g->ops.regops.get_context_whitelist_ranges &&
300 !!bsearch(&offset,
301 g->ops.regops.get_context_whitelist_ranges(),
302 g->ops.regops.get_context_whitelist_ranges_count(),
303 sizeof(*g->ops.regops.get_context_whitelist_ranges()),
304 regop_bsearch_range_cmp);
305 }
306
307 /* if debug session and channel is bound search runcontrol list */
308 if ((!valid) && (!dbg_s->is_profiler && ch)) {
309 valid = g->ops.regops.get_runcontrol_whitelist &&
310 linear_search(offset,
311 g->ops.regops.get_runcontrol_whitelist(),
312 g->ops.regops.get_runcontrol_whitelist_count());
313 }
314 } else if (op->type == REGOP(TYPE_GR_CTX)) {
315 /* it's a context-relative op */
316 if (!ch) {
317 nvgpu_err(dbg_s->g, "can't perform ctx regop unless bound");
318 op->status = REGOP(STATUS_UNSUPPORTED_OP);
319 return valid;
320 }
321
322 /* binary search context list */
323 valid = g->ops.regops.get_context_whitelist_ranges &&
324 !!bsearch(&offset,
325 g->ops.regops.get_context_whitelist_ranges(),
326 g->ops.regops.get_context_whitelist_ranges_count(),
327 sizeof(*g->ops.regops.get_context_whitelist_ranges()),
328 regop_bsearch_range_cmp);
329
330 /* if debug session and channel is bound search runcontrol list */
331 if ((!valid) && (!dbg_s->is_profiler && ch)) {
332 valid = g->ops.regops.get_runcontrol_whitelist &&
333 linear_search(offset,
334 g->ops.regops.get_runcontrol_whitelist(),
335 g->ops.regops.get_runcontrol_whitelist_count());
336 }
337
338 } else if (op->type == REGOP(TYPE_GR_CTX_QUAD)) {
339 valid = g->ops.regops.get_qctl_whitelist &&
340 linear_search(offset,
341 g->ops.regops.get_qctl_whitelist(),
342 g->ops.regops.get_qctl_whitelist_count());
343 }
344
345 return valid;
346}
347
348/* note: the op here has already been through validate_reg_op_info */
349static int validate_reg_op_offset(struct dbg_session_gk20a *dbg_s,
350 struct nvgpu_dbg_reg_op *op)
351{
352 int err;
353 u32 buf_offset_lo, buf_offset_addr, num_offsets, offset;
354 bool valid = false;
355
356 op->status = 0;
357 offset = op->offset;
358
359 /* support only 24-bit 4-byte aligned offsets */
360 if (offset & 0xFF000003) {
361 nvgpu_err(dbg_s->g, "invalid regop offset: 0x%x", offset);
362 op->status |= REGOP(STATUS_INVALID_OFFSET);
363 return -EINVAL;
364 }
365
366 valid = check_whitelists(dbg_s, op, offset);
367 if ((op->op == REGOP(READ_64) || op->op == REGOP(WRITE_64)) && valid) {
368 valid = check_whitelists(dbg_s, op, offset + 4);
369 }
370
371 if (valid && (op->type != REGOP(TYPE_GLOBAL))) {
372 err = gr_gk20a_get_ctx_buffer_offsets(dbg_s->g,
373 op->offset,
374 1,
375 &buf_offset_lo,
376 &buf_offset_addr,
377 &num_offsets,
378 op->type == REGOP(TYPE_GR_CTX_QUAD),
379 op->quad);
380 if (err) {
381 err = gr_gk20a_get_pm_ctx_buffer_offsets(dbg_s->g,
382 op->offset,
383 1,
384 &buf_offset_lo,
385 &buf_offset_addr,
386 &num_offsets);
387
388 if (err) {
389 op->status |= REGOP(STATUS_INVALID_OFFSET);
390 return -EINVAL;
391 }
392 }
393 if (!num_offsets) {
394 op->status |= REGOP(STATUS_INVALID_OFFSET);
395 return -EINVAL;
396 }
397 }
398
399 if (!valid) {
400 nvgpu_err(dbg_s->g, "invalid regop offset: 0x%x", offset);
401 op->status |= REGOP(STATUS_INVALID_OFFSET);
402 return -EINVAL;
403 }
404
405 return 0;
406}
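
The offset & 0xFF000003 test above folds both constraints into one mask: the offset has to fit in 24 bits and be 4-byte aligned. A few concrete values under that rule, as a comment (the offsets themselves are arbitrary examples):

/*
 * Examples for the 24-bit, 4-byte-aligned offset check:
 *   0x0041a084 & 0xFF000003 == 0x00000000  -> accepted
 *   0x0041a086 & 0xFF000003 == 0x00000002  -> rejected (misaligned)
 *   0x0141a084 & 0xFF000003 == 0x01000000  -> rejected (beyond 24 bits)
 */
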
407
408static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s,
409 u32 *ctx_rd_count, u32 *ctx_wr_count,
410 struct nvgpu_dbg_reg_op *ops,
411 u32 op_count)
412{
413 u32 i;
414 bool ok = true;
415 struct gk20a *g = dbg_s->g;
416
417 /* keep going until the end so every op can get
418 * a separate error code if needed */
419 for (i = 0; i < op_count; i++) {
420
421 if (validate_reg_op_info(dbg_s, &ops[i]) != 0) {
422 ok = false;
423 }
424
425 if (reg_op_is_gr_ctx(ops[i].type)) {
426 if (reg_op_is_read(ops[i].op)) {
427 (*ctx_rd_count)++;
428 } else {
429 (*ctx_wr_count)++;
430 }
431 }
432
433		/* if "allow_all" flag enabled, don't validate the offset */
434 if (!g->allow_all) {
435 if (validate_reg_op_offset(dbg_s, &ops[i]) != 0) {
436 ok = false;
437 }
438 }
439 }
440
441 nvgpu_log(g, gpu_dbg_gpu_dbg, "ctx_wrs:%d ctx_rds:%d",
442 *ctx_wr_count, *ctx_rd_count);
443
444 return ok;
445}
446
447/* exported for tools like cyclestats, etc */
448bool is_bar0_global_offset_whitelisted_gk20a(struct gk20a *g, u32 offset)
449{
450 bool valid = !!bsearch(&offset,
451 g->ops.regops.get_global_whitelist_ranges(),
452 g->ops.regops.get_global_whitelist_ranges_count(),
453 sizeof(*g->ops.regops.get_global_whitelist_ranges()),
454 regop_bsearch_range_cmp);
455 return valid;
456}
457
458bool reg_op_is_gr_ctx(u8 type)
459{
460 return type == REGOP(TYPE_GR_CTX) ||
461 type == REGOP(TYPE_GR_CTX_TPC) ||
462 type == REGOP(TYPE_GR_CTX_SM) ||
463 type == REGOP(TYPE_GR_CTX_CROP) ||
464 type == REGOP(TYPE_GR_CTX_ZROP) ||
465 type == REGOP(TYPE_GR_CTX_QUAD);
466}
467
468bool reg_op_is_read(u8 op)
469{
470 return op == REGOP(READ_32) ||
471 op == REGOP(READ_64);
472}
diff --git a/include/gk20a/regops_gk20a.h b/include/gk20a/regops_gk20a.h
new file mode 100644
index 0000000..9670587
--- /dev/null
+++ b/include/gk20a/regops_gk20a.h
@@ -0,0 +1,90 @@
1/*
2 * Tegra GK20A GPU Debugger Driver Register Ops
3 *
4 * Copyright (c) 2013-2018, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24#ifndef REGOPS_GK20A_H
25#define REGOPS_GK20A_H
26
27/*
28 * Register operations
29 * All operations target the first channel
30 * attached to the debug session
31 */
32/* valid op values */
33#define NVGPU_DBG_REG_OP_READ_32 (0x00000000)
34#define NVGPU_DBG_REG_OP_WRITE_32 (0x00000001)
35#define NVGPU_DBG_REG_OP_READ_64 (0x00000002)
36#define NVGPU_DBG_REG_OP_WRITE_64 (0x00000003)
37/* note: 8b ops are unsupported */
38#define NVGPU_DBG_REG_OP_READ_08 (0x00000004)
39#define NVGPU_DBG_REG_OP_WRITE_08 (0x00000005)
40
41/* valid type values */
42#define NVGPU_DBG_REG_OP_TYPE_GLOBAL (0x00000000)
43#define NVGPU_DBG_REG_OP_TYPE_GR_CTX (0x00000001)
44#define NVGPU_DBG_REG_OP_TYPE_GR_CTX_TPC (0x00000002)
45#define NVGPU_DBG_REG_OP_TYPE_GR_CTX_SM (0x00000004)
46#define NVGPU_DBG_REG_OP_TYPE_GR_CTX_CROP (0x00000008)
47#define NVGPU_DBG_REG_OP_TYPE_GR_CTX_ZROP (0x00000010)
48/*#define NVGPU_DBG_REG_OP_TYPE_FB (0x00000020)*/
49#define NVGPU_DBG_REG_OP_TYPE_GR_CTX_QUAD (0x00000040)
50
51/* valid status values */
52#define NVGPU_DBG_REG_OP_STATUS_SUCCESS (0x00000000)
53#define NVGPU_DBG_REG_OP_STATUS_INVALID_OP (0x00000001)
54#define NVGPU_DBG_REG_OP_STATUS_INVALID_TYPE (0x00000002)
55#define NVGPU_DBG_REG_OP_STATUS_INVALID_OFFSET (0x00000004)
56#define NVGPU_DBG_REG_OP_STATUS_UNSUPPORTED_OP (0x00000008)
57#define NVGPU_DBG_REG_OP_STATUS_INVALID_MASK (0x00000010)
58
59struct nvgpu_dbg_reg_op {
60 u8 op;
61 u8 type;
62 u8 status;
63 u8 quad;
64 u32 group_mask;
65 u32 sub_group_mask;
66 u32 offset;
67 u32 value_lo;
68 u32 value_hi;
69 u32 and_n_mask_lo;
70 u32 and_n_mask_hi;
71};
72
73struct regop_offset_range {
74 u32 base:24;
75 u32 count:8;
76};
77
78int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s,
79 struct nvgpu_dbg_reg_op *ops,
80 u64 num_ops,
81 bool *is_current_ctx);
82
83/* turn seriously unwieldy names -> something shorter */
84#define REGOP(x) NVGPU_DBG_REG_OP_##x
85
86bool reg_op_is_gr_ctx(u8 type);
87bool reg_op_is_read(u8 op);
88bool is_bar0_global_offset_whitelisted_gk20a(struct gk20a *g, u32 offset);
89
90#endif /* REGOPS_GK20A_H */