diff options
Diffstat (limited to 'drivers/gpu/nvgpu/common')
 drivers/gpu/nvgpu/common/linux/cde.c        | 1693 ++++++++++++++++++++
 drivers/gpu/nvgpu/common/linux/cde.h        |  309 ++++
 drivers/gpu/nvgpu/common/linux/debug_cde.c  |   14 +-
 drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c |    3 +-
 drivers/gpu/nvgpu/common/linux/module.c     |   14 +-
 drivers/gpu/nvgpu/common/linux/os_linux.h   |    2 +
 6 files changed, 2025 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/linux/cde.c b/drivers/gpu/nvgpu/common/linux/cde.c
new file mode 100644
index 00000000..5b0fb910
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/cde.c
@@ -0,0 +1,1693 @@
1 | /* | ||
2 | * Color decompression engine support | ||
3 | * | ||
4 | * Copyright (c) 2014-2017, NVIDIA Corporation. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms and conditions of the GNU General Public License, | ||
8 | * version 2, as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
13 | * more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
17 | */ | ||
18 | |||
19 | #include <linux/dma-mapping.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/dma-buf.h> | ||
22 | |||
23 | #include <trace/events/gk20a.h> | ||
24 | |||
25 | #include <nvgpu/dma.h> | ||
26 | #include <nvgpu/gmmu.h> | ||
27 | #include <nvgpu/timers.h> | ||
28 | #include <nvgpu/nvgpu_common.h> | ||
29 | #include <nvgpu/kmem.h> | ||
30 | #include <nvgpu/log.h> | ||
31 | #include <nvgpu/bug.h> | ||
32 | #include <nvgpu/firmware.h> | ||
33 | |||
34 | #include "gk20a/gk20a.h" | ||
35 | #include "gk20a/channel_gk20a.h" | ||
36 | #include "gk20a/mm_gk20a.h" | ||
37 | #include "gk20a/fence_gk20a.h" | ||
38 | #include "gk20a/gr_gk20a.h" | ||
39 | |||
40 | #include "cde.h" | ||
41 | #include "os_linux.h" | ||
42 | |||
43 | #include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h> | ||
44 | #include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h> | ||
45 | |||
46 | /* | ||
47 | * Currently this code uses nvgpu_vm_map() since it takes dmabuf FDs from the | ||
48 | * CDE ioctls. That has to change - instead this needs to take an nvgpu_mem. | ||
49 | */ | ||
50 | #include "common/linux/vm_priv.h" | ||
51 | |||
52 | static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx); | ||
53 | static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct nvgpu_os_linux *l); | ||
54 | |||
55 | #define CTX_DELETE_TIME 1000 | ||
56 | |||
57 | #define MAX_CTX_USE_COUNT 42 | ||
58 | #define MAX_CTX_RETRY_TIME 2000 | ||
59 | |||
60 | static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx) | ||
61 | { | ||
62 | unsigned int i; | ||
63 | |||
64 | for (i = 0; i < cde_ctx->num_bufs; i++) { | ||
65 | struct nvgpu_mem *mem = cde_ctx->mem + i; | ||
66 | nvgpu_dma_unmap_free(cde_ctx->vm, mem); | ||
67 | } | ||
68 | |||
69 | nvgpu_kfree(&cde_ctx->l->g, cde_ctx->init_convert_cmd); | ||
70 | |||
71 | cde_ctx->convert_cmd = NULL; | ||
72 | cde_ctx->init_convert_cmd = NULL; | ||
73 | cde_ctx->num_bufs = 0; | ||
74 | cde_ctx->num_params = 0; | ||
75 | cde_ctx->init_cmd_num_entries = 0; | ||
76 | cde_ctx->convert_cmd_num_entries = 0; | ||
77 | cde_ctx->init_cmd_executed = false; | ||
78 | } | ||
79 | |||
/*
 * Fully tear down one CDE context: free its firmware buffers, unmap the
 * compbit backing store from the channel's VM, close the channel, then
 * unlink the context from the app list and free it.
 *
 * Caller must hold cde_app->mutex (the list/ctx_count updates assume it).
 */
static void gk20a_cde_remove_ctx(struct gk20a_cde_ctx *cde_ctx)
__must_hold(&cde_app->mutex)
{
	struct nvgpu_os_linux *l = cde_ctx->l;
	struct gk20a *g = &l->g;
	struct channel_gk20a *ch = cde_ctx->ch;
	struct vm_gk20a *vm = ch->vm;

	trace_gk20a_cde_remove_ctx(cde_ctx);

	/* release mapped memory */
	gk20a_deinit_cde_img(cde_ctx);
	nvgpu_gmmu_unmap(vm, &g->gr.compbit_store.mem,
			cde_ctx->backing_store_vaddr);

	/* free the channel */
	gk20a_channel_close(ch);

	/* housekeeping on app */
	nvgpu_list_del(&cde_ctx->list);
	l->cde_app.ctx_count--;
	nvgpu_kfree(g, cde_ctx);
}
103 | |||
/*
 * Cancel a temporary context's delayed self-deletion work.
 *
 * Permanent contexts never schedule deleter work, so this is a no-op for
 * them. With wait_finish the app mutex is dropped while waiting for an
 * already-running deleter (the deleter itself takes the mutex, so waiting
 * with it held would deadlock) and re-acquired afterwards; callers must
 * not rely on app state staying unchanged across that window.
 */
static void gk20a_cde_cancel_deleter(struct gk20a_cde_ctx *cde_ctx,
		bool wait_finish)
__releases(&cde_app->mutex)
__acquires(&cde_app->mutex)
{
	struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app;

	/* permanent contexts do not have deleter works */
	if (!cde_ctx->is_temporary)
		return;

	if (wait_finish) {
		nvgpu_mutex_release(&cde_app->mutex);
		cancel_delayed_work_sync(&cde_ctx->ctx_deleter_work);
		nvgpu_mutex_acquire(&cde_app->mutex);
	} else {
		cancel_delayed_work(&cde_ctx->ctx_deleter_work);
	}
}
123 | |||
/*
 * Remove every context, free and used, from the app. Used during app
 * shutdown with cde_app->mutex held. The safe list iterator is required
 * because gk20a_cde_remove_ctx() unlinks each entry.
 */
static void gk20a_cde_remove_contexts(struct nvgpu_os_linux *l)
__must_hold(&l->cde_app->mutex)
{
	struct gk20a_cde_app *cde_app = &l->cde_app;
	struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save;

	/* safe to go off the mutex in cancel_deleter since app is
	 * deinitialised; no new jobs are started. deleter works may be only at
	 * waiting for the mutex or before, going to abort */

	nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save,
			&cde_app->free_contexts, gk20a_cde_ctx, list) {
		gk20a_cde_cancel_deleter(cde_ctx, true);
		gk20a_cde_remove_ctx(cde_ctx);
	}

	nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save,
			&cde_app->used_contexts, gk20a_cde_ctx, list) {
		gk20a_cde_cancel_deleter(cde_ctx, true);
		gk20a_cde_remove_ctx(cde_ctx);
	}
}
146 | |||
147 | static void gk20a_cde_stop(struct nvgpu_os_linux *l) | ||
148 | __must_hold(&l->cde_app->mutex) | ||
149 | { | ||
150 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
151 | |||
152 | /* prevent further conversions and delayed works from working */ | ||
153 | cde_app->initialised = false; | ||
154 | /* free all data, empty the list */ | ||
155 | gk20a_cde_remove_contexts(l); | ||
156 | } | ||
157 | |||
158 | void gk20a_cde_destroy(struct nvgpu_os_linux *l) | ||
159 | __acquires(&l->cde_app->mutex) | ||
160 | __releases(&l->cde_app->mutex) | ||
161 | { | ||
162 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
163 | |||
164 | if (!cde_app->initialised) | ||
165 | return; | ||
166 | |||
167 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
168 | gk20a_cde_stop(l); | ||
169 | nvgpu_mutex_release(&cde_app->mutex); | ||
170 | |||
171 | nvgpu_mutex_destroy(&cde_app->mutex); | ||
172 | } | ||
173 | |||
174 | void gk20a_cde_suspend(struct nvgpu_os_linux *l) | ||
175 | __acquires(&l->cde_app->mutex) | ||
176 | __releases(&l->cde_app->mutex) | ||
177 | { | ||
178 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
179 | struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save; | ||
180 | |||
181 | if (!cde_app->initialised) | ||
182 | return; | ||
183 | |||
184 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
185 | |||
186 | nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save, | ||
187 | &cde_app->free_contexts, gk20a_cde_ctx, list) { | ||
188 | gk20a_cde_cancel_deleter(cde_ctx, false); | ||
189 | } | ||
190 | |||
191 | nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save, | ||
192 | &cde_app->used_contexts, gk20a_cde_ctx, list) { | ||
193 | gk20a_cde_cancel_deleter(cde_ctx, false); | ||
194 | } | ||
195 | |||
196 | nvgpu_mutex_release(&cde_app->mutex); | ||
197 | |||
198 | } | ||
199 | |||
200 | static int gk20a_cde_create_context(struct nvgpu_os_linux *l) | ||
201 | __must_hold(&l->cde_app->mutex) | ||
202 | { | ||
203 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
204 | struct gk20a_cde_ctx *cde_ctx; | ||
205 | |||
206 | cde_ctx = gk20a_cde_allocate_context(l); | ||
207 | if (IS_ERR(cde_ctx)) | ||
208 | return PTR_ERR(cde_ctx); | ||
209 | |||
210 | nvgpu_list_add(&cde_ctx->list, &cde_app->free_contexts); | ||
211 | cde_app->ctx_count++; | ||
212 | if (cde_app->ctx_count > cde_app->ctx_count_top) | ||
213 | cde_app->ctx_count_top = cde_app->ctx_count; | ||
214 | |||
215 | return 0; | ||
216 | } | ||
217 | |||
218 | static int gk20a_cde_create_contexts(struct nvgpu_os_linux *l) | ||
219 | __must_hold(&l->cde_app->mutex) | ||
220 | { | ||
221 | int err; | ||
222 | int i; | ||
223 | |||
224 | for (i = 0; i < NUM_CDE_CONTEXTS; i++) { | ||
225 | err = gk20a_cde_create_context(l); | ||
226 | if (err) | ||
227 | goto out; | ||
228 | } | ||
229 | |||
230 | return 0; | ||
231 | out: | ||
232 | gk20a_cde_remove_contexts(l); | ||
233 | return err; | ||
234 | } | ||
235 | |||
236 | static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx, | ||
237 | struct nvgpu_firmware *img, | ||
238 | struct gk20a_cde_hdr_buf *buf) | ||
239 | { | ||
240 | struct nvgpu_mem *mem; | ||
241 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
242 | struct gk20a *g = &l->g; | ||
243 | int err; | ||
244 | |||
245 | /* check that the file can hold the buf */ | ||
246 | if (buf->data_byte_offset != 0 && | ||
247 | buf->data_byte_offset + buf->num_bytes > img->size) { | ||
248 | nvgpu_warn(g, "cde: invalid data section. buffer idx = %d", | ||
249 | cde_ctx->num_bufs); | ||
250 | return -EINVAL; | ||
251 | } | ||
252 | |||
253 | /* check that we have enough buf elems available */ | ||
254 | if (cde_ctx->num_bufs >= MAX_CDE_BUFS) { | ||
255 | nvgpu_warn(g, "cde: invalid data section. buffer idx = %d", | ||
256 | cde_ctx->num_bufs); | ||
257 | return -ENOMEM; | ||
258 | } | ||
259 | |||
260 | /* allocate buf */ | ||
261 | mem = cde_ctx->mem + cde_ctx->num_bufs; | ||
262 | err = nvgpu_dma_alloc_map_sys(cde_ctx->vm, buf->num_bytes, mem); | ||
263 | if (err) { | ||
264 | nvgpu_warn(g, "cde: could not allocate device memory. buffer idx = %d", | ||
265 | cde_ctx->num_bufs); | ||
266 | return -ENOMEM; | ||
267 | } | ||
268 | |||
269 | /* copy the content */ | ||
270 | if (buf->data_byte_offset != 0) | ||
271 | memcpy(mem->cpu_va, img->data + buf->data_byte_offset, | ||
272 | buf->num_bytes); | ||
273 | |||
274 | cde_ctx->num_bufs++; | ||
275 | |||
276 | return 0; | ||
277 | } | ||
278 | |||
279 | static int gk20a_replace_data(struct gk20a_cde_ctx *cde_ctx, void *target, | ||
280 | int type, s32 shift, u64 mask, u64 value) | ||
281 | { | ||
282 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
283 | struct gk20a *g = &l->g; | ||
284 | u32 *target_mem_ptr = target; | ||
285 | u64 *target_mem_ptr_u64 = target; | ||
286 | u64 current_value, new_value; | ||
287 | |||
288 | value = (shift >= 0) ? value << shift : value >> -shift; | ||
289 | value &= mask; | ||
290 | |||
291 | /* read current data from the location */ | ||
292 | current_value = 0; | ||
293 | if (type == TYPE_PARAM_TYPE_U32) { | ||
294 | if (mask != 0xfffffffful) | ||
295 | current_value = *target_mem_ptr; | ||
296 | } else if (type == TYPE_PARAM_TYPE_U64_LITTLE) { | ||
297 | if (mask != ~0ul) | ||
298 | current_value = *target_mem_ptr_u64; | ||
299 | } else if (type == TYPE_PARAM_TYPE_U64_BIG) { | ||
300 | current_value = *target_mem_ptr_u64; | ||
301 | current_value = (u64)(current_value >> 32) | | ||
302 | (u64)(current_value << 32); | ||
303 | } else { | ||
304 | nvgpu_warn(g, "cde: unknown type. type=%d", | ||
305 | type); | ||
306 | return -EINVAL; | ||
307 | } | ||
308 | |||
309 | current_value &= ~mask; | ||
310 | new_value = current_value | value; | ||
311 | |||
312 | /* store the element data back */ | ||
313 | if (type == TYPE_PARAM_TYPE_U32) | ||
314 | *target_mem_ptr = (u32)new_value; | ||
315 | else if (type == TYPE_PARAM_TYPE_U64_LITTLE) | ||
316 | *target_mem_ptr_u64 = new_value; | ||
317 | else { | ||
318 | new_value = (u64)(new_value >> 32) | | ||
319 | (u64)(new_value << 32); | ||
320 | *target_mem_ptr_u64 = new_value; | ||
321 | } | ||
322 | |||
323 | return 0; | ||
324 | } | ||
325 | |||
/*
 * Apply one TYPE_REPLACE element from the firmware image: patch a word
 * in the target buffer with the GPU virtual address of a location inside
 * the source buffer, using the element's type/shift/mask encoding.
 *
 * Returns 0 on success, -EINVAL for out-of-range buffer indices/offsets
 * or a failed data replace.
 */
static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx,
				  struct nvgpu_firmware *img,
				  struct gk20a_cde_hdr_replace *replace)
{
	struct nvgpu_mem *source_mem;
	struct nvgpu_mem *target_mem;
	struct nvgpu_os_linux *l = cde_ctx->l;
	struct gk20a *g = &l->g;
	u32 *target_mem_ptr;
	u64 vaddr;
	int err;

	if (replace->target_buf >= cde_ctx->num_bufs ||
	    replace->source_buf >= cde_ctx->num_bufs) {
		nvgpu_warn(g, "cde: invalid buffer. target_buf=%u, source_buf=%u, num_bufs=%d",
			   replace->target_buf, replace->source_buf,
			   cde_ctx->num_bufs);
		return -EINVAL;
	}

	source_mem = cde_ctx->mem + replace->source_buf;
	target_mem = cde_ctx->mem + replace->target_buf;
	target_mem_ptr = target_mem->cpu_va;

	/* the patched location is a 32-bit word: offset..offset+3 must lie
	 * inside the respective buffer */
	if (source_mem->size < (replace->source_byte_offset + 3) ||
	    target_mem->size < (replace->target_byte_offset + 3)) {
		nvgpu_warn(g, "cde: invalid buffer offsets. target_buf_offs=%lld, source_buf_offs=%lld, source_buf_size=%zu, dest_buf_size=%zu",
			   replace->target_byte_offset,
			   replace->source_byte_offset,
			   source_mem->size,
			   target_mem->size);
		return -EINVAL;
	}

	/* calculate the target pointer */
	target_mem_ptr += (replace->target_byte_offset / sizeof(u32));

	/* determine patch value */
	vaddr = source_mem->gpu_va + replace->source_byte_offset;
	err = gk20a_replace_data(cde_ctx, target_mem_ptr, replace->type,
				 replace->shift, replace->mask,
				 vaddr);
	if (err) {
		nvgpu_warn(g, "cde: replace failed. err=%d, target_buf=%u, target_buf_offs=%lld, source_buf=%u, source_buf_offs=%lld",
			   err, replace->target_buf,
			   replace->target_byte_offset,
			   replace->source_buf,
			   replace->source_byte_offset);
	}

	return err;
}
378 | |||
/*
 * Patch all registered parameters of the context into their target
 * buffers before a launch.
 *
 * Reserved parameter ids are filled from GPU compression state and the
 * per-launch context fields (surface/compbit/scatterbuffer addresses and
 * sizes); ids at or above NUM_RESERVED_PARAMS come from userspace via
 * user_param_values. Out-of-range user ids are silently skipped.
 *
 * Returns 0 on success, -EINVAL when the SMMU address lookup or a data
 * replace fails.
 */
static int gk20a_cde_patch_params(struct gk20a_cde_ctx *cde_ctx)
{
	struct nvgpu_os_linux *l = cde_ctx->l;
	struct gk20a *g = &l->g;
	struct nvgpu_mem *target_mem;
	u32 *target_mem_ptr;
	u64 new_data;
	int user_id = 0, err;
	unsigned int i;

	for (i = 0; i < cde_ctx->num_params; i++) {
		struct gk20a_cde_hdr_param *param = cde_ctx->params + i;
		target_mem = cde_ctx->mem + param->target_buf;
		target_mem_ptr = target_mem->cpu_va;
		target_mem_ptr += (param->target_byte_offset / sizeof(u32));

		switch (param->id) {
		case TYPE_PARAM_COMPTAGS_PER_CACHELINE:
			new_data = g->gr.comptags_per_cacheline;
			break;
		case TYPE_PARAM_GPU_CONFIGURATION:
			new_data = (u64)g->ltc_count * g->gr.slices_per_ltc *
				g->gr.cacheline_size;
			break;
		case TYPE_PARAM_FIRSTPAGEOFFSET:
			new_data = cde_ctx->surf_param_offset;
			break;
		case TYPE_PARAM_NUMPAGES:
			new_data = cde_ctx->surf_param_lines;
			break;
		case TYPE_PARAM_BACKINGSTORE:
			new_data = cde_ctx->backing_store_vaddr;
			break;
		case TYPE_PARAM_DESTINATION:
			new_data = cde_ctx->compbit_vaddr;
			break;
		case TYPE_PARAM_DESTINATION_SIZE:
			new_data = cde_ctx->compbit_size;
			break;
		case TYPE_PARAM_BACKINGSTORE_SIZE:
			new_data = g->gr.compbit_store.mem.size;
			break;
		case TYPE_PARAM_SOURCE_SMMU_ADDR:
			/* a zero IOVA means the translation failed */
			new_data = gk20a_mm_gpuva_to_iova_base(cde_ctx->vm,
							       cde_ctx->surf_vaddr);
			if (new_data == 0)
				return -EINVAL;
			break;
		case TYPE_PARAM_BACKINGSTORE_BASE_HW:
			new_data = g->gr.compbit_store.base_hw;
			break;
		case TYPE_PARAM_GOBS_PER_COMPTAGLINE_PER_SLICE:
			new_data = g->gr.gobs_per_comptagline_per_slice;
			break;
		case TYPE_PARAM_SCATTERBUFFER:
			new_data = cde_ctx->scatterbuffer_vaddr;
			break;
		case TYPE_PARAM_SCATTERBUFFER_SIZE:
			new_data = cde_ctx->scatterbuffer_size;
			break;
		default:
			/* user-supplied parameter; ignore ids outside the
			 * user parameter array */
			user_id = param->id - NUM_RESERVED_PARAMS;
			if (user_id < 0 || user_id >= MAX_CDE_USER_PARAMS)
				continue;
			new_data = cde_ctx->user_param_values[user_id];
		}

		gk20a_dbg(gpu_dbg_cde, "cde: patch: idx_in_file=%d param_id=%d target_buf=%u target_byte_offset=%lld data_value=0x%llx data_offset/data_diff=%lld data_type=%d data_shift=%d data_mask=0x%llx",
			  i, param->id, param->target_buf,
			  param->target_byte_offset, new_data,
			  param->data_offset, param->type, param->shift,
			  param->mask);

		new_data += param->data_offset;

		err = gk20a_replace_data(cde_ctx, target_mem_ptr, param->type,
					 param->shift, param->mask, new_data);

		if (err) {
			nvgpu_warn(g, "cde: patch failed. err=%d, idx=%d, id=%d, target_buf=%u, target_buf_offs=%lld, patch_value=%llu",
				   err, i, param->id, param->target_buf,
				   param->target_byte_offset, new_data);
			return err;
		}
	}

	return 0;
}
467 | |||
468 | static int gk20a_init_cde_param(struct gk20a_cde_ctx *cde_ctx, | ||
469 | struct nvgpu_firmware *img, | ||
470 | struct gk20a_cde_hdr_param *param) | ||
471 | { | ||
472 | struct nvgpu_mem *target_mem; | ||
473 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
474 | struct gk20a *g = &l->g; | ||
475 | |||
476 | if (param->target_buf >= cde_ctx->num_bufs) { | ||
477 | nvgpu_warn(g, "cde: invalid buffer parameter. param idx = %d, target_buf=%u, num_bufs=%u", | ||
478 | cde_ctx->num_params, param->target_buf, | ||
479 | cde_ctx->num_bufs); | ||
480 | return -EINVAL; | ||
481 | } | ||
482 | |||
483 | target_mem = cde_ctx->mem + param->target_buf; | ||
484 | if (target_mem->size < (param->target_byte_offset + 3)) { | ||
485 | nvgpu_warn(g, "cde: invalid buffer parameter. param idx = %d, target_buf_offs=%lld, target_buf_size=%zu", | ||
486 | cde_ctx->num_params, param->target_byte_offset, | ||
487 | target_mem->size); | ||
488 | return -EINVAL; | ||
489 | } | ||
490 | |||
491 | /* does this parameter fit into our parameter structure */ | ||
492 | if (cde_ctx->num_params >= MAX_CDE_PARAMS) { | ||
493 | nvgpu_warn(g, "cde: no room for new parameters param idx = %d", | ||
494 | cde_ctx->num_params); | ||
495 | return -ENOMEM; | ||
496 | } | ||
497 | |||
498 | /* is the given id valid? */ | ||
499 | if (param->id >= NUM_RESERVED_PARAMS + MAX_CDE_USER_PARAMS) { | ||
500 | nvgpu_warn(g, "cde: parameter id is not valid. param idx = %d, id=%u, max=%u", | ||
501 | param->id, cde_ctx->num_params, | ||
502 | NUM_RESERVED_PARAMS + MAX_CDE_USER_PARAMS); | ||
503 | return -EINVAL; | ||
504 | } | ||
505 | |||
506 | cde_ctx->params[cde_ctx->num_params] = *param; | ||
507 | cde_ctx->num_params++; | ||
508 | |||
509 | return 0; | ||
510 | } | ||
511 | |||
512 | static int gk20a_init_cde_required_class(struct gk20a_cde_ctx *cde_ctx, | ||
513 | struct nvgpu_firmware *img, | ||
514 | u32 required_class) | ||
515 | { | ||
516 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
517 | struct gk20a *g = &l->g; | ||
518 | struct nvgpu_alloc_obj_ctx_args alloc_obj_ctx; | ||
519 | int err; | ||
520 | |||
521 | alloc_obj_ctx.class_num = required_class; | ||
522 | alloc_obj_ctx.flags = 0; | ||
523 | |||
524 | /* CDE enabled */ | ||
525 | cde_ctx->ch->cde = true; | ||
526 | |||
527 | err = gk20a_alloc_obj_ctx(cde_ctx->ch, &alloc_obj_ctx); | ||
528 | if (err) { | ||
529 | nvgpu_warn(g, "cde: failed to allocate ctx. err=%d", | ||
530 | err); | ||
531 | return err; | ||
532 | } | ||
533 | |||
534 | return 0; | ||
535 | } | ||
536 | |||
537 | static int gk20a_init_cde_command(struct gk20a_cde_ctx *cde_ctx, | ||
538 | struct nvgpu_firmware *img, | ||
539 | u32 op, | ||
540 | struct gk20a_cde_cmd_elem *cmd_elem, | ||
541 | u32 num_elems) | ||
542 | { | ||
543 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
544 | struct gk20a *g = &l->g; | ||
545 | struct nvgpu_gpfifo **gpfifo, *gpfifo_elem; | ||
546 | u32 *num_entries; | ||
547 | unsigned int i; | ||
548 | |||
549 | /* check command type */ | ||
550 | if (op == TYPE_BUF_COMMAND_INIT) { | ||
551 | gpfifo = &cde_ctx->init_convert_cmd; | ||
552 | num_entries = &cde_ctx->init_cmd_num_entries; | ||
553 | } else if (op == TYPE_BUF_COMMAND_CONVERT) { | ||
554 | gpfifo = &cde_ctx->convert_cmd; | ||
555 | num_entries = &cde_ctx->convert_cmd_num_entries; | ||
556 | } else { | ||
557 | nvgpu_warn(g, "cde: unknown command. op=%u", | ||
558 | op); | ||
559 | return -EINVAL; | ||
560 | } | ||
561 | |||
562 | /* allocate gpfifo entries to be pushed */ | ||
563 | *gpfifo = nvgpu_kzalloc(g, | ||
564 | sizeof(struct nvgpu_gpfifo) * num_elems); | ||
565 | if (!*gpfifo) { | ||
566 | nvgpu_warn(g, "cde: could not allocate memory for gpfifo entries"); | ||
567 | return -ENOMEM; | ||
568 | } | ||
569 | |||
570 | gpfifo_elem = *gpfifo; | ||
571 | for (i = 0; i < num_elems; i++, cmd_elem++, gpfifo_elem++) { | ||
572 | struct nvgpu_mem *target_mem; | ||
573 | |||
574 | /* validate the current entry */ | ||
575 | if (cmd_elem->target_buf >= cde_ctx->num_bufs) { | ||
576 | nvgpu_warn(g, "cde: target buffer is not available (target=%u, num_bufs=%u)", | ||
577 | cmd_elem->target_buf, cde_ctx->num_bufs); | ||
578 | return -EINVAL; | ||
579 | } | ||
580 | |||
581 | target_mem = cde_ctx->mem + cmd_elem->target_buf; | ||
582 | if (target_mem->size< | ||
583 | cmd_elem->target_byte_offset + cmd_elem->num_bytes) { | ||
584 | nvgpu_warn(g, "cde: target buffer cannot hold all entries (target_size=%zu, target_byte_offset=%lld, num_bytes=%llu)", | ||
585 | target_mem->size, | ||
586 | cmd_elem->target_byte_offset, | ||
587 | cmd_elem->num_bytes); | ||
588 | return -EINVAL; | ||
589 | } | ||
590 | |||
591 | /* store the element into gpfifo */ | ||
592 | gpfifo_elem->entry0 = | ||
593 | u64_lo32(target_mem->gpu_va + | ||
594 | cmd_elem->target_byte_offset); | ||
595 | gpfifo_elem->entry1 = | ||
596 | u64_hi32(target_mem->gpu_va + | ||
597 | cmd_elem->target_byte_offset) | | ||
598 | pbdma_gp_entry1_length_f(cmd_elem->num_bytes / | ||
599 | sizeof(u32)); | ||
600 | } | ||
601 | |||
602 | *num_entries = num_elems; | ||
603 | return 0; | ||
604 | } | ||
605 | |||
606 | static int gk20a_cde_pack_cmdbufs(struct gk20a_cde_ctx *cde_ctx) | ||
607 | { | ||
608 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
609 | struct gk20a *g = &l->g; | ||
610 | unsigned long init_bytes = cde_ctx->init_cmd_num_entries * | ||
611 | sizeof(struct nvgpu_gpfifo); | ||
612 | unsigned long conv_bytes = cde_ctx->convert_cmd_num_entries * | ||
613 | sizeof(struct nvgpu_gpfifo); | ||
614 | unsigned long total_bytes = init_bytes + conv_bytes; | ||
615 | struct nvgpu_gpfifo *combined_cmd; | ||
616 | |||
617 | /* allocate buffer that has space for both */ | ||
618 | combined_cmd = nvgpu_kzalloc(g, total_bytes); | ||
619 | if (!combined_cmd) { | ||
620 | nvgpu_warn(g, | ||
621 | "cde: could not allocate memory for gpfifo entries"); | ||
622 | return -ENOMEM; | ||
623 | } | ||
624 | |||
625 | /* move the original init here and append convert */ | ||
626 | memcpy(combined_cmd, cde_ctx->init_convert_cmd, init_bytes); | ||
627 | memcpy(combined_cmd + cde_ctx->init_cmd_num_entries, | ||
628 | cde_ctx->convert_cmd, conv_bytes); | ||
629 | |||
630 | nvgpu_kfree(g, cde_ctx->init_convert_cmd); | ||
631 | nvgpu_kfree(g, cde_ctx->convert_cmd); | ||
632 | |||
633 | cde_ctx->init_convert_cmd = combined_cmd; | ||
634 | cde_ctx->convert_cmd = combined_cmd | ||
635 | + cde_ctx->init_cmd_num_entries; | ||
636 | |||
637 | return 0; | ||
638 | } | ||
639 | |||
640 | static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, | ||
641 | struct nvgpu_firmware *img) | ||
642 | { | ||
643 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
644 | struct gk20a *g = &l->g; | ||
645 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
646 | u32 *data = (u32 *)img->data; | ||
647 | u32 num_of_elems; | ||
648 | struct gk20a_cde_hdr_elem *elem; | ||
649 | u32 min_size = 0; | ||
650 | int err = 0; | ||
651 | unsigned int i; | ||
652 | |||
653 | min_size += 2 * sizeof(u32); | ||
654 | if (img->size < min_size) { | ||
655 | nvgpu_warn(g, "cde: invalid image header"); | ||
656 | return -EINVAL; | ||
657 | } | ||
658 | |||
659 | cde_app->firmware_version = data[0]; | ||
660 | num_of_elems = data[1]; | ||
661 | |||
662 | min_size += num_of_elems * sizeof(*elem); | ||
663 | if (img->size < min_size) { | ||
664 | nvgpu_warn(g, "cde: bad image"); | ||
665 | return -EINVAL; | ||
666 | } | ||
667 | |||
668 | elem = (struct gk20a_cde_hdr_elem *)&data[2]; | ||
669 | for (i = 0; i < num_of_elems; i++) { | ||
670 | int err = 0; | ||
671 | switch (elem->type) { | ||
672 | case TYPE_BUF: | ||
673 | err = gk20a_init_cde_buf(cde_ctx, img, &elem->buf); | ||
674 | break; | ||
675 | case TYPE_REPLACE: | ||
676 | err = gk20a_init_cde_replace(cde_ctx, img, | ||
677 | &elem->replace); | ||
678 | break; | ||
679 | case TYPE_PARAM: | ||
680 | err = gk20a_init_cde_param(cde_ctx, img, &elem->param); | ||
681 | break; | ||
682 | case TYPE_REQUIRED_CLASS: | ||
683 | err = gk20a_init_cde_required_class(cde_ctx, img, | ||
684 | elem->required_class); | ||
685 | break; | ||
686 | case TYPE_COMMAND: | ||
687 | { | ||
688 | struct gk20a_cde_cmd_elem *cmd = (void *) | ||
689 | &img->data[elem->command.data_byte_offset]; | ||
690 | err = gk20a_init_cde_command(cde_ctx, img, | ||
691 | elem->command.op, cmd, | ||
692 | elem->command.num_entries); | ||
693 | break; | ||
694 | } | ||
695 | case TYPE_ARRAY: | ||
696 | memcpy(&cde_app->arrays[elem->array.id][0], | ||
697 | elem->array.data, | ||
698 | MAX_CDE_ARRAY_ENTRIES*sizeof(u32)); | ||
699 | break; | ||
700 | default: | ||
701 | nvgpu_warn(g, "cde: unknown header element"); | ||
702 | err = -EINVAL; | ||
703 | } | ||
704 | |||
705 | if (err) | ||
706 | goto deinit_image; | ||
707 | |||
708 | elem++; | ||
709 | } | ||
710 | |||
711 | if (!cde_ctx->init_convert_cmd || !cde_ctx->init_cmd_num_entries) { | ||
712 | nvgpu_warn(g, "cde: convert command not defined"); | ||
713 | err = -EINVAL; | ||
714 | goto deinit_image; | ||
715 | } | ||
716 | |||
717 | if (!cde_ctx->convert_cmd || !cde_ctx->convert_cmd_num_entries) { | ||
718 | nvgpu_warn(g, "cde: convert command not defined"); | ||
719 | err = -EINVAL; | ||
720 | goto deinit_image; | ||
721 | } | ||
722 | |||
723 | err = gk20a_cde_pack_cmdbufs(cde_ctx); | ||
724 | if (err) | ||
725 | goto deinit_image; | ||
726 | |||
727 | return 0; | ||
728 | |||
729 | deinit_image: | ||
730 | gk20a_deinit_cde_img(cde_ctx); | ||
731 | return err; | ||
732 | } | ||
733 | |||
734 | static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx, | ||
735 | u32 op, struct nvgpu_fence *fence, | ||
736 | u32 flags, struct gk20a_fence **fence_out) | ||
737 | { | ||
738 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
739 | struct gk20a *g = &l->g; | ||
740 | struct nvgpu_gpfifo *gpfifo = NULL; | ||
741 | int num_entries = 0; | ||
742 | |||
743 | /* check command type */ | ||
744 | if (op == TYPE_BUF_COMMAND_INIT) { | ||
745 | /* both init and convert combined */ | ||
746 | gpfifo = cde_ctx->init_convert_cmd; | ||
747 | num_entries = cde_ctx->init_cmd_num_entries | ||
748 | + cde_ctx->convert_cmd_num_entries; | ||
749 | } else if (op == TYPE_BUF_COMMAND_CONVERT) { | ||
750 | gpfifo = cde_ctx->convert_cmd; | ||
751 | num_entries = cde_ctx->convert_cmd_num_entries; | ||
752 | } else { | ||
753 | nvgpu_warn(g, "cde: unknown buffer"); | ||
754 | return -EINVAL; | ||
755 | } | ||
756 | |||
757 | if (gpfifo == NULL || num_entries == 0) { | ||
758 | nvgpu_warn(g, "cde: buffer not available"); | ||
759 | return -ENOSYS; | ||
760 | } | ||
761 | |||
762 | return gk20a_submit_channel_gpfifo(cde_ctx->ch, gpfifo, NULL, | ||
763 | num_entries, flags, fence, fence_out, true, | ||
764 | NULL); | ||
765 | } | ||
766 | |||
767 | static void gk20a_cde_ctx_release(struct gk20a_cde_ctx *cde_ctx) | ||
768 | __acquires(&cde_app->mutex) | ||
769 | __releases(&cde_app->mutex) | ||
770 | { | ||
771 | struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app; | ||
772 | |||
773 | gk20a_dbg(gpu_dbg_cde_ctx, "releasing use on %p", cde_ctx); | ||
774 | trace_gk20a_cde_release(cde_ctx); | ||
775 | |||
776 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
777 | |||
778 | if (cde_ctx->in_use) { | ||
779 | cde_ctx->in_use = false; | ||
780 | nvgpu_list_move(&cde_ctx->list, &cde_app->free_contexts); | ||
781 | cde_app->ctx_usecount--; | ||
782 | } else { | ||
783 | gk20a_dbg_info("double release cde context %p", cde_ctx); | ||
784 | } | ||
785 | |||
786 | nvgpu_mutex_release(&cde_app->mutex); | ||
787 | } | ||
788 | |||
/*
 * Delayed work that deletes an idle temporary context.
 *
 * The work races with a new user taking the context: in_use (and app
 * shutdown) is checked once without the lock as a cheap early-out, and
 * again under cde_app->mutex before the context is actually removed.
 * If the GPU cannot be powered on, deletion is simply postponed — the
 * context remains usable later, so nothing is leaked.
 */
static void gk20a_cde_ctx_deleter_fn(struct work_struct *work)
__acquires(&cde_app->mutex)
__releases(&cde_app->mutex)
{
	struct delayed_work *delay_work = to_delayed_work(work);
	struct gk20a_cde_ctx *cde_ctx = container_of(delay_work,
			struct gk20a_cde_ctx, ctx_deleter_work);
	struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app;
	struct nvgpu_os_linux *l = cde_ctx->l;
	struct gk20a *g = &l->g;
	int err;

	/* someone has just taken it? engine deletion started? */
	if (cde_ctx->in_use || !cde_app->initialised)
		return;

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
			"cde: attempting to delete temporary %p", cde_ctx);

	err = gk20a_busy(g);
	if (err) {
		/* this context would find new use anyway later, so not freeing
		 * here does not leak anything */
		nvgpu_warn(g, "cde: cannot set gk20a on, postponing"
				" temp ctx deletion");
		return;
	}

	nvgpu_mutex_acquire(&cde_app->mutex);
	/* recheck under the lock: the unlocked check above may have raced */
	if (cde_ctx->in_use || !cde_app->initialised) {
		gk20a_dbg(gpu_dbg_cde_ctx,
				"cde: context use raced, not deleting %p",
				cde_ctx);
		goto out;
	}

	WARN(delayed_work_pending(&cde_ctx->ctx_deleter_work),
			"double pending %p", cde_ctx);

	gk20a_cde_remove_ctx(cde_ctx);
	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
			"cde: destroyed %p count=%d use=%d max=%d",
			cde_ctx, cde_app->ctx_count, cde_app->ctx_usecount,
			cde_app->ctx_count_top);

out:
	nvgpu_mutex_release(&cde_app->mutex);
	gk20a_idle(g);
}
838 | |||
/*
 * Hand out a context for one conversion, with cde_app->mutex held.
 *
 * Preference order: fail with -EAGAIN when the use count is saturated,
 * otherwise take an idle context from the free list, otherwise allocate
 * a fresh temporary context. The returned context is moved to the used
 * list with in_use set, so a scheduled deleter work cannot free it.
 *
 * Returns a context pointer or an ERR_PTR on failure.
 */
static struct gk20a_cde_ctx *gk20a_cde_do_get_context(struct nvgpu_os_linux *l)
__must_hold(&cde_app->mutex)
{
	struct gk20a *g = &l->g;
	struct gk20a_cde_app *cde_app = &l->cde_app;
	struct gk20a_cde_ctx *cde_ctx;

	/* exhausted? */

	if (cde_app->ctx_usecount >= MAX_CTX_USE_COUNT)
		return ERR_PTR(-EAGAIN);

	/* idle context available? */

	if (!nvgpu_list_empty(&cde_app->free_contexts)) {
		cde_ctx = nvgpu_list_first_entry(&cde_app->free_contexts,
				gk20a_cde_ctx, list);
		gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
				"cde: got free %p count=%d use=%d max=%d",
				cde_ctx, cde_app->ctx_count,
				cde_app->ctx_usecount,
				cde_app->ctx_count_top);
		trace_gk20a_cde_get_context(cde_ctx);

		/* deleter work may be scheduled, but in_use prevents it */
		cde_ctx->in_use = true;
		nvgpu_list_move(&cde_ctx->list, &cde_app->used_contexts);
		cde_app->ctx_usecount++;

		/* cancel any deletions now that ctx is in use */
		gk20a_cde_cancel_deleter(cde_ctx, true);
		return cde_ctx;
	}

	/* no free contexts, get a temporary one */

	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
			"cde: no free contexts, count=%d",
			cde_app->ctx_count);

	cde_ctx = gk20a_cde_allocate_context(l);
	if (IS_ERR(cde_ctx)) {
		nvgpu_warn(g, "cde: cannot allocate context: %ld",
				PTR_ERR(cde_ctx));
		return cde_ctx;
	}

	trace_gk20a_cde_get_context(cde_ctx);
	cde_ctx->in_use = true;
	cde_ctx->is_temporary = true;
	cde_app->ctx_usecount++;
	cde_app->ctx_count++;
	if (cde_app->ctx_count > cde_app->ctx_count_top)
		cde_app->ctx_count_top = cde_app->ctx_count;
	nvgpu_list_add(&cde_ctx->list, &cde_app->used_contexts);

	return cde_ctx;
}
897 | |||
898 | static struct gk20a_cde_ctx *gk20a_cde_get_context(struct nvgpu_os_linux *l) | ||
899 | __releases(&cde_app->mutex) | ||
900 | __acquires(&cde_app->mutex) | ||
901 | { | ||
902 | struct gk20a *g = &l->g; | ||
903 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
904 | struct gk20a_cde_ctx *cde_ctx = NULL; | ||
905 | struct nvgpu_timeout timeout; | ||
906 | |||
907 | nvgpu_timeout_init(g, &timeout, MAX_CTX_RETRY_TIME, | ||
908 | NVGPU_TIMER_CPU_TIMER); | ||
909 | |||
910 | do { | ||
911 | cde_ctx = gk20a_cde_do_get_context(l); | ||
912 | if (PTR_ERR(cde_ctx) != -EAGAIN) | ||
913 | break; | ||
914 | |||
915 | /* exhausted, retry */ | ||
916 | nvgpu_mutex_release(&cde_app->mutex); | ||
917 | cond_resched(); | ||
918 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
919 | } while (!nvgpu_timeout_expired(&timeout)); | ||
920 | |||
921 | return cde_ctx; | ||
922 | } | ||
923 | |||
924 | static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct nvgpu_os_linux *l) | ||
925 | { | ||
926 | struct gk20a *g = &l->g; | ||
927 | struct gk20a_cde_ctx *cde_ctx; | ||
928 | int ret; | ||
929 | |||
930 | cde_ctx = nvgpu_kzalloc(g, sizeof(*cde_ctx)); | ||
931 | if (!cde_ctx) | ||
932 | return ERR_PTR(-ENOMEM); | ||
933 | |||
934 | cde_ctx->l = l; | ||
935 | cde_ctx->dev = dev_from_gk20a(g); | ||
936 | |||
937 | ret = gk20a_cde_load(cde_ctx); | ||
938 | if (ret) { | ||
939 | nvgpu_kfree(g, cde_ctx); | ||
940 | return ERR_PTR(ret); | ||
941 | } | ||
942 | |||
943 | nvgpu_init_list_node(&cde_ctx->list); | ||
944 | cde_ctx->is_temporary = false; | ||
945 | cde_ctx->in_use = false; | ||
946 | INIT_DELAYED_WORK(&cde_ctx->ctx_deleter_work, | ||
947 | gk20a_cde_ctx_deleter_fn); | ||
948 | |||
949 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: allocated %p", cde_ctx); | ||
950 | trace_gk20a_cde_allocate_context(cde_ctx); | ||
951 | return cde_ctx; | ||
952 | } | ||
953 | |||
954 | int gk20a_cde_convert(struct nvgpu_os_linux *l, | ||
955 | struct dma_buf *compbits_scatter_buf, | ||
956 | u64 compbits_byte_offset, | ||
957 | u64 scatterbuffer_byte_offset, | ||
958 | struct nvgpu_fence *fence, | ||
959 | u32 __flags, struct gk20a_cde_param *params, | ||
960 | int num_params, struct gk20a_fence **fence_out) | ||
961 | __acquires(&l->cde_app->mutex) | ||
962 | __releases(&l->cde_app->mutex) | ||
963 | { | ||
964 | struct gk20a *g = &l->g; | ||
965 | struct gk20a_cde_ctx *cde_ctx = NULL; | ||
966 | struct gk20a_comptags comptags; | ||
967 | u64 mapped_compbits_offset = 0; | ||
968 | u64 compbits_size = 0; | ||
969 | u64 mapped_scatterbuffer_offset = 0; | ||
970 | u64 scatterbuffer_size = 0; | ||
971 | u64 map_vaddr = 0; | ||
972 | u64 map_offset = 0; | ||
973 | u64 map_size = 0; | ||
974 | u8 *surface = NULL; | ||
975 | u64 big_page_mask = 0; | ||
976 | u32 flags; | ||
977 | int err, i; | ||
978 | const s32 compbits_kind = 0; | ||
979 | |||
980 | gk20a_dbg(gpu_dbg_cde, "compbits_byte_offset=%llu scatterbuffer_byte_offset=%llu", | ||
981 | compbits_byte_offset, scatterbuffer_byte_offset); | ||
982 | |||
983 | /* scatter buffer must be after compbits buffer */ | ||
984 | if (scatterbuffer_byte_offset && | ||
985 | scatterbuffer_byte_offset < compbits_byte_offset) | ||
986 | return -EINVAL; | ||
987 | |||
988 | err = gk20a_busy(g); | ||
989 | if (err) | ||
990 | return err; | ||
991 | |||
992 | nvgpu_mutex_acquire(&l->cde_app.mutex); | ||
993 | cde_ctx = gk20a_cde_get_context(l); | ||
994 | nvgpu_mutex_release(&l->cde_app.mutex); | ||
995 | if (IS_ERR(cde_ctx)) { | ||
996 | err = PTR_ERR(cde_ctx); | ||
997 | goto exit_idle; | ||
998 | } | ||
999 | |||
1000 | /* First, map the buffer to local va */ | ||
1001 | |||
1002 | /* ensure that the compbits buffer has drvdata */ | ||
1003 | err = gk20a_dmabuf_alloc_drvdata(compbits_scatter_buf, | ||
1004 | dev_from_gk20a(g)); | ||
1005 | if (err) | ||
1006 | goto exit_idle; | ||
1007 | |||
1008 | /* compbits don't start at page aligned offset, so we need to align | ||
1009 | the region to be mapped */ | ||
1010 | big_page_mask = cde_ctx->vm->big_page_size - 1; | ||
1011 | map_offset = compbits_byte_offset & ~big_page_mask; | ||
1012 | map_size = compbits_scatter_buf->size - map_offset; | ||
1013 | |||
1014 | |||
1015 | /* compute compbit start offset from the beginning of the mapped | ||
1016 | area */ | ||
1017 | mapped_compbits_offset = compbits_byte_offset - map_offset; | ||
1018 | if (scatterbuffer_byte_offset) { | ||
1019 | compbits_size = scatterbuffer_byte_offset - | ||
1020 | compbits_byte_offset; | ||
1021 | mapped_scatterbuffer_offset = scatterbuffer_byte_offset - | ||
1022 | map_offset; | ||
1023 | scatterbuffer_size = compbits_scatter_buf->size - | ||
1024 | scatterbuffer_byte_offset; | ||
1025 | } else { | ||
1026 | compbits_size = compbits_scatter_buf->size - | ||
1027 | compbits_byte_offset; | ||
1028 | } | ||
1029 | |||
1030 | gk20a_dbg(gpu_dbg_cde, "map_offset=%llu map_size=%llu", | ||
1031 | map_offset, map_size); | ||
1032 | gk20a_dbg(gpu_dbg_cde, "mapped_compbits_offset=%llu compbits_size=%llu", | ||
1033 | mapped_compbits_offset, compbits_size); | ||
1034 | gk20a_dbg(gpu_dbg_cde, "mapped_scatterbuffer_offset=%llu scatterbuffer_size=%llu", | ||
1035 | mapped_scatterbuffer_offset, scatterbuffer_size); | ||
1036 | |||
1037 | |||
1038 | /* map the destination buffer */ | ||
1039 | get_dma_buf(compbits_scatter_buf); /* a ref for nvgpu_vm_map */ | ||
1040 | map_vaddr = nvgpu_vm_map(cde_ctx->vm, compbits_scatter_buf, 0, | ||
1041 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, | ||
1042 | compbits_kind, true, | ||
1043 | gk20a_mem_flag_none, | ||
1044 | map_offset, map_size, | ||
1045 | NULL); | ||
1046 | if (!map_vaddr) { | ||
1047 | dma_buf_put(compbits_scatter_buf); | ||
1048 | err = -EINVAL; | ||
1049 | goto exit_idle; | ||
1050 | } | ||
1051 | |||
1052 | if (scatterbuffer_byte_offset && | ||
1053 | g->ops.cde.need_scatter_buffer && | ||
1054 | g->ops.cde.need_scatter_buffer(g)) { | ||
1055 | struct sg_table *sgt; | ||
1056 | void *scatter_buffer; | ||
1057 | |||
1058 | surface = dma_buf_vmap(compbits_scatter_buf); | ||
1059 | if (IS_ERR(surface)) { | ||
1060 | nvgpu_warn(g, | ||
1061 | "dma_buf_vmap failed"); | ||
1062 | err = -EINVAL; | ||
1063 | goto exit_unmap_vaddr; | ||
1064 | } | ||
1065 | |||
1066 | scatter_buffer = surface + scatterbuffer_byte_offset; | ||
1067 | |||
1068 | gk20a_dbg(gpu_dbg_cde, "surface=0x%p scatterBuffer=0x%p", | ||
1069 | surface, scatter_buffer); | ||
1070 | sgt = gk20a_mm_pin(dev_from_gk20a(g), compbits_scatter_buf); | ||
1071 | if (IS_ERR(sgt)) { | ||
1072 | nvgpu_warn(g, | ||
1073 | "mm_pin failed"); | ||
1074 | err = -EINVAL; | ||
1075 | goto exit_unmap_surface; | ||
1076 | } else { | ||
1077 | err = g->ops.cde.populate_scatter_buffer(g, sgt, | ||
1078 | compbits_byte_offset, scatter_buffer, | ||
1079 | scatterbuffer_size); | ||
1080 | WARN_ON(err); | ||
1081 | |||
1082 | gk20a_mm_unpin(dev_from_gk20a(g), compbits_scatter_buf, | ||
1083 | sgt); | ||
1084 | if (err) | ||
1085 | goto exit_unmap_surface; | ||
1086 | } | ||
1087 | |||
1088 | __cpuc_flush_dcache_area(scatter_buffer, scatterbuffer_size); | ||
1089 | dma_buf_vunmap(compbits_scatter_buf, surface); | ||
1090 | surface = NULL; | ||
1091 | } | ||
1092 | |||
1093 | /* store source buffer compression tags */ | ||
1094 | gk20a_get_comptags(dev_from_gk20a(g), compbits_scatter_buf, &comptags); | ||
1095 | cde_ctx->surf_param_offset = comptags.offset; | ||
1096 | cde_ctx->surf_param_lines = comptags.lines; | ||
1097 | |||
1098 | /* store surface vaddr. This is actually compbit vaddr, but since | ||
1099 | compbits live in the same surface, and we can get the alloc base | ||
1100 | address by using gk20a_mm_gpuva_to_iova_base, this will do */ | ||
1101 | cde_ctx->surf_vaddr = map_vaddr; | ||
1102 | |||
1103 | /* store information about destination */ | ||
1104 | cde_ctx->compbit_vaddr = map_vaddr + mapped_compbits_offset; | ||
1105 | cde_ctx->compbit_size = compbits_size; | ||
1106 | |||
1107 | cde_ctx->scatterbuffer_vaddr = map_vaddr + mapped_scatterbuffer_offset; | ||
1108 | cde_ctx->scatterbuffer_size = scatterbuffer_size; | ||
1109 | |||
1110 | /* remove existing argument data */ | ||
1111 | memset(cde_ctx->user_param_values, 0, | ||
1112 | sizeof(cde_ctx->user_param_values)); | ||
1113 | |||
1114 | /* read user space arguments for the conversion */ | ||
1115 | for (i = 0; i < num_params; i++) { | ||
1116 | struct gk20a_cde_param *param = params + i; | ||
1117 | int id = param->id - NUM_RESERVED_PARAMS; | ||
1118 | |||
1119 | if (id < 0 || id >= MAX_CDE_USER_PARAMS) { | ||
1120 | nvgpu_warn(g, "cde: unknown user parameter"); | ||
1121 | err = -EINVAL; | ||
1122 | goto exit_unmap_surface; | ||
1123 | } | ||
1124 | cde_ctx->user_param_values[id] = param->value; | ||
1125 | } | ||
1126 | |||
1127 | /* patch data */ | ||
1128 | err = gk20a_cde_patch_params(cde_ctx); | ||
1129 | if (err) { | ||
1130 | nvgpu_warn(g, "cde: failed to patch parameters"); | ||
1131 | goto exit_unmap_surface; | ||
1132 | } | ||
1133 | |||
1134 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=cbc, size=%zu, gpuva=%llx\n", | ||
1135 | g->gr.compbit_store.mem.size, cde_ctx->backing_store_vaddr); | ||
1136 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=compbits, size=%llu, gpuva=%llx\n", | ||
1137 | cde_ctx->compbit_size, cde_ctx->compbit_vaddr); | ||
1138 | gk20a_dbg(gpu_dbg_cde, "cde: buffer=scatterbuffer, size=%llu, gpuva=%llx\n", | ||
1139 | cde_ctx->scatterbuffer_size, cde_ctx->scatterbuffer_vaddr); | ||
1140 | |||
1141 | /* take always the postfence as it is needed for protecting the | ||
1142 | * cde context */ | ||
1143 | flags = __flags | NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET; | ||
1144 | |||
1145 | /* gk20a_cde_execute_buffer() will grab a power reference of it's own */ | ||
1146 | gk20a_idle(g); | ||
1147 | |||
1148 | /* execute the conversion buffer, combined with init first if it's the | ||
1149 | * first time */ | ||
1150 | err = gk20a_cde_execute_buffer(cde_ctx, | ||
1151 | cde_ctx->init_cmd_executed | ||
1152 | ? TYPE_BUF_COMMAND_CONVERT | ||
1153 | : TYPE_BUF_COMMAND_INIT, | ||
1154 | fence, flags, fence_out); | ||
1155 | |||
1156 | cde_ctx->init_cmd_executed = true; | ||
1157 | |||
1158 | /* unmap the buffers - channel holds references to them now */ | ||
1159 | nvgpu_vm_unmap(cde_ctx->vm, map_vaddr); | ||
1160 | |||
1161 | return err; | ||
1162 | |||
1163 | exit_unmap_surface: | ||
1164 | if (surface) | ||
1165 | dma_buf_vunmap(compbits_scatter_buf, surface); | ||
1166 | exit_unmap_vaddr: | ||
1167 | nvgpu_vm_unmap(cde_ctx->vm, map_vaddr); | ||
1168 | exit_idle: | ||
1169 | gk20a_idle(g); | ||
1170 | return err; | ||
1171 | } | ||
1172 | |||
/*
 * Channel job-completion callback for cde contexts.
 *
 * Invoked for the channel backing @data (a struct gk20a_cde_ctx). Does
 * nothing while the channel still has queued jobs. Once idle:
 *  - on channel timeout, a non-temporary context is flagged temporary (so it
 *    gets deleted) and a replacement context is created;
 *  - temporary contexts get their deferred deletion work scheduled;
 *  - non-timed-out contexts are released back to the free list.
 */
static void gk20a_cde_finished_ctx_cb(struct channel_gk20a *ch, void *data)
__acquires(&cde_app->mutex)
__releases(&cde_app->mutex)
{
	struct gk20a_cde_ctx *cde_ctx = data;
	struct nvgpu_os_linux *l = cde_ctx->l;
	struct gk20a *g = &l->g;
	struct gk20a_cde_app *cde_app = &l->cde_app;
	bool channel_idle;

	channel_gk20a_joblist_lock(ch);
	channel_idle = channel_gk20a_joblist_is_empty(ch);
	channel_gk20a_joblist_unlock(ch);

	if (!channel_idle)
		return;

	trace_gk20a_cde_finished_ctx_cb(cde_ctx);
	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: finished %p", cde_ctx);
	if (!cde_ctx->in_use)
		gk20a_dbg_info("double finish cde context %p on channel %p",
			       cde_ctx, ch);

	if (ch->has_timedout) {
		if (cde_ctx->is_temporary) {
			nvgpu_warn(g,
					"cde: channel had timed out"
					" (temporary channel)");
			/* going to be deleted anyway */
		} else {
			nvgpu_warn(g,
					"cde: channel had timed out"
					", reloading");
			/* mark it to be deleted, replace with a new one */
			nvgpu_mutex_acquire(&cde_app->mutex);
			cde_ctx->is_temporary = true;
			if (gk20a_cde_create_context(l)) {
				nvgpu_err(g, "cde: can't replace context");
			}
			nvgpu_mutex_release(&cde_app->mutex);
		}
	}

	/* delete temporary contexts later (watch for doubles) */
	if (cde_ctx->is_temporary && cde_ctx->in_use) {
		WARN_ON(delayed_work_pending(&cde_ctx->ctx_deleter_work));
		schedule_delayed_work(&cde_ctx->ctx_deleter_work,
			msecs_to_jiffies(CTX_DELETE_TIME));
	}

	/* a timed-out context is intentionally not released here; it stays
	 * marked in_use until the deleter work removes it */
	if (!ch->has_timedout)
		gk20a_cde_ctx_release(cde_ctx);
}
1226 | |||
/*
 * Load the cde firmware ("gpu2cde.bin") and set up the channel used by
 * @cde_ctx: open a channel with gk20a_cde_finished_ctx_cb() as completion
 * callback, bind it to the cde vm, allocate its gpfifo, map the compbit
 * backing store read-only into the vm, and run the firmware image
 * initialisation.
 *
 * On success, cde_ctx->ch, cde_ctx->vm and cde_ctx->backing_store_vaddr
 * are populated and 0 is returned. On failure, partially acquired
 * resources are unwound via the cascading error labels and a negative
 * error code is returned.
 *
 * NOTE(review): the opened channel is not explicitly closed on the error
 * paths here — presumably torn down via the vm reference drop or by the
 * caller; confirm there is no channel leak when e.g. gpfifo allocation
 * fails.
 */
static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
{
	struct nvgpu_os_linux *l = cde_ctx->l;
	struct gk20a *g = &l->g;
	struct nvgpu_firmware *img;
	struct channel_gk20a *ch;
	struct gr_gk20a *gr = &g->gr;
	int err = 0;
	u64 vaddr;

	img = nvgpu_request_firmware(g, "gpu2cde.bin", 0);
	if (!img) {
		nvgpu_err(g, "cde: could not fetch the firmware");
		return -ENOSYS;
	}

	ch = gk20a_open_new_channel_with_cb(g, gk20a_cde_finished_ctx_cb,
			cde_ctx,
			-1,
			false);
	if (!ch) {
		nvgpu_warn(g, "cde: gk20a channel not available");
		err = -ENOMEM;
		goto err_get_gk20a_channel;
	}

	/* bind the channel to the vm */
	err = __gk20a_vm_bind_channel(g->mm.cde.vm, ch);
	if (err) {
		nvgpu_warn(g, "cde: could not bind vm");
		goto err_commit_va;
	}

	/* allocate gpfifo (1024 should be more than enough) */
	err = gk20a_channel_alloc_gpfifo(ch, 1024, 0, 0);
	if (err) {
		nvgpu_warn(g, "cde: unable to allocate gpfifo");
		goto err_alloc_gpfifo;
	}

	/* map backing store to gpu virtual space */
	vaddr = nvgpu_gmmu_map(ch->vm, &gr->compbit_store.mem,
			       g->gr.compbit_store.mem.size,
			       NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
			       gk20a_mem_flag_read_only,
			       false,
			       gr->compbit_store.mem.aperture);

	if (!vaddr) {
		nvgpu_warn(g, "cde: cannot map compression bit backing store");
		err = -ENOMEM;
		goto err_map_backingstore;
	}

	/* store initialisation data */
	cde_ctx->ch = ch;
	cde_ctx->vm = ch->vm;
	cde_ctx->backing_store_vaddr = vaddr;

	/* initialise the firmware */
	err = gk20a_init_cde_img(cde_ctx, img);
	if (err) {
		nvgpu_warn(g, "cde: image initialisation failed");
		goto err_init_cde_img;
	}

	/* initialisation done */
	nvgpu_release_firmware(g, img);

	return 0;

	/* unwind in reverse order of acquisition; labels fall through */
err_init_cde_img:
	nvgpu_gmmu_unmap(ch->vm, &g->gr.compbit_store.mem, vaddr);
err_map_backingstore:
err_alloc_gpfifo:
	nvgpu_vm_put(ch->vm);
err_commit_va:
err_get_gk20a_channel:
	nvgpu_release_firmware(g, img);
	nvgpu_err(g, "cde: couldn't initialise buffer converter: %d", err);
	return err;
}
1309 | |||
1310 | int gk20a_cde_reload(struct nvgpu_os_linux *l) | ||
1311 | __acquires(&l->cde_app->mutex) | ||
1312 | __releases(&l->cde_app->mutex) | ||
1313 | { | ||
1314 | struct gk20a *g = &l->g; | ||
1315 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
1316 | int err; | ||
1317 | |||
1318 | if (!cde_app->initialised) | ||
1319 | return -ENOSYS; | ||
1320 | |||
1321 | err = gk20a_busy(g); | ||
1322 | if (err) | ||
1323 | return err; | ||
1324 | |||
1325 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
1326 | |||
1327 | gk20a_cde_stop(l); | ||
1328 | |||
1329 | err = gk20a_cde_create_contexts(l); | ||
1330 | if (!err) | ||
1331 | cde_app->initialised = true; | ||
1332 | |||
1333 | nvgpu_mutex_release(&cde_app->mutex); | ||
1334 | |||
1335 | gk20a_idle(g); | ||
1336 | return err; | ||
1337 | } | ||
1338 | |||
1339 | int gk20a_init_cde_support(struct nvgpu_os_linux *l) | ||
1340 | __acquires(&cde_app->mutex) | ||
1341 | __releases(&cde_app->mutex) | ||
1342 | { | ||
1343 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
1344 | int err; | ||
1345 | |||
1346 | if (cde_app->initialised) | ||
1347 | return 0; | ||
1348 | |||
1349 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: init"); | ||
1350 | |||
1351 | err = nvgpu_mutex_init(&cde_app->mutex); | ||
1352 | if (err) | ||
1353 | return err; | ||
1354 | |||
1355 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
1356 | |||
1357 | nvgpu_init_list_node(&cde_app->free_contexts); | ||
1358 | nvgpu_init_list_node(&cde_app->used_contexts); | ||
1359 | cde_app->ctx_count = 0; | ||
1360 | cde_app->ctx_count_top = 0; | ||
1361 | cde_app->ctx_usecount = 0; | ||
1362 | |||
1363 | err = gk20a_cde_create_contexts(l); | ||
1364 | if (!err) | ||
1365 | cde_app->initialised = true; | ||
1366 | |||
1367 | nvgpu_mutex_release(&cde_app->mutex); | ||
1368 | gk20a_dbg(gpu_dbg_cde_ctx, "cde: init finished: %d", err); | ||
1369 | |||
1370 | if (err) | ||
1371 | nvgpu_mutex_destroy(&cde_app->mutex); | ||
1372 | |||
1373 | return err; | ||
1374 | } | ||
1375 | |||
/*
 * Identifiers for the launch parameters patched into the cde firmware
 * command buffers (see gk20a_buffer_convert_gpu_to_cde_v1()). H_/V_
 * prefixes refer to the horizontal and vertical conversion passes.
 *
 * NOTE: these values form the interface with the cde firmware image and
 * must not be renumbered.
 */
enum cde_launch_patch_id {
	PATCH_H_QMD_CTA_RASTER_WIDTH_ID     = 1024,
	PATCH_H_QMD_CTA_RASTER_HEIGHT_ID    = 1025,
	PATCH_QMD_CTA_RASTER_DEPTH_ID       = 1026, /* for firmware v0 only */
	PATCH_QMD_CTA_THREAD_DIMENSION0_ID  = 1027,
	PATCH_QMD_CTA_THREAD_DIMENSION1_ID  = 1028,
	PATCH_QMD_CTA_THREAD_DIMENSION2_ID  = 1029, /* for firmware v0 only */
	PATCH_USER_CONST_XTILES_ID          = 1030, /* for firmware v0 only */
	PATCH_USER_CONST_YTILES_ID          = 1031, /* for firmware v0 only */
	PATCH_USER_CONST_BLOCKHEIGHTLOG2_ID = 1032,
	PATCH_USER_CONST_DSTPITCH_ID        = 1033, /* for firmware v0 only */
	PATCH_H_USER_CONST_FLAGS_ID         = 1034, /* for firmware v0 only */
	PATCH_H_VPC_CURRENT_GRID_SIZE_X_ID  = 1035,
	PATCH_H_VPC_CURRENT_GRID_SIZE_Y_ID  = 1036,
	PATCH_H_VPC_CURRENT_GRID_SIZE_Z_ID  = 1037,
	PATCH_VPC_CURRENT_GROUP_SIZE_X_ID   = 1038,
	PATCH_VPC_CURRENT_GROUP_SIZE_Y_ID   = 1039,
	PATCH_VPC_CURRENT_GROUP_SIZE_Z_ID   = 1040,
	PATCH_USER_CONST_XBLOCKS_ID         = 1041,
	PATCH_H_USER_CONST_DSTOFFSET_ID     = 1042,
	PATCH_V_QMD_CTA_RASTER_WIDTH_ID     = 1043,
	PATCH_V_QMD_CTA_RASTER_HEIGHT_ID    = 1044,
	PATCH_V_USER_CONST_DSTOFFSET_ID     = 1045,
	PATCH_V_VPC_CURRENT_GRID_SIZE_X_ID  = 1046,
	PATCH_V_VPC_CURRENT_GRID_SIZE_Y_ID  = 1047,
	PATCH_V_VPC_CURRENT_GRID_SIZE_Z_ID  = 1048,
	PATCH_H_LAUNCH_WORD1_ID             = 1049,
	PATCH_H_LAUNCH_WORD2_ID             = 1050,
	PATCH_V_LAUNCH_WORD1_ID             = 1051,
	PATCH_V_LAUNCH_WORD2_ID             = 1052,
	PATCH_H_QMD_PROGRAM_OFFSET_ID       = 1053,
	PATCH_H_QMD_REGISTER_COUNT_ID       = 1054,
	PATCH_V_QMD_PROGRAM_OFFSET_ID       = 1055,
	PATCH_V_QMD_REGISTER_COUNT_ID       = 1056,
};

/* maximum number of WRITE_PATCHes in the below function */
#define MAX_CDE_LAUNCH_PATCHES		32
1414 | |||
/*
 * Build the patch-parameter list for a v1-firmware compbit conversion and
 * submit it via gk20a_cde_convert().
 *
 * Computes the grid dimensions for the horizontal and vertical conversion
 * passes from the surface size (8x8-pixel tiles, 8x8 workgroups, 4 compbit
 * pairs per byte), selects the shader programs for this chip via
 * g->ops.cde.get_program_numbers, and writes the launch words depending on
 * which compbit sets (@consumer) are requested. On success the new fence
 * replaces @state->fence and the produced compbit kinds are added to
 * @state->valid_compbits.
 *
 * Returns 0 on success, -ENOSYS when the chip/programs are unsupported, or
 * the gk20a_cde_convert() error.
 */
static int gk20a_buffer_convert_gpu_to_cde_v1(
		struct nvgpu_os_linux *l,
		struct dma_buf *dmabuf, u32 consumer,
		u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
		u64 scatterbuffer_offset,
		u32 width, u32 height, u32 block_height_log2,
		u32 submit_flags, struct nvgpu_fence *fence_in,
		struct gk20a_buffer_state *state)
{
	struct gk20a *g = &l->g;
	struct gk20a_cde_param params[MAX_CDE_LAUNCH_PATCHES];
	int param = 0;
	int err = 0;
	struct gk20a_fence *new_fence = NULL;
	const int wgx = 8;
	const int wgy = 8;
	const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */
	const int xalign = compbits_per_byte * wgx;
	const int yalign = wgy;

	/* Compute per launch parameters */
	const int xtiles = (width + 7) >> 3;
	const int ytiles = (height + 7) >> 3;
	const int gridw_h = roundup(xtiles, xalign) / xalign;
	const int gridh_h = roundup(ytiles, yalign) / yalign;
	const int gridw_v = roundup(ytiles, xalign) / xalign;
	const int gridh_v = roundup(xtiles, yalign) / yalign;
	const int xblocks = (xtiles + 1) >> 1;
	/* NOTE(review): u64 difference truncated to int — assumes the
	 * voffset/hoffset delta fits in 31 bits; confirm with callers */
	const int voffset = compbits_voffset - compbits_hoffset;

	int hprog = -1;
	int vprog = -1;

	if (g->ops.cde.get_program_numbers)
		g->ops.cde.get_program_numbers(g, block_height_log2,
					       l->cde_app.shader_parameter,
					       &hprog, &vprog);
	else {
		nvgpu_warn(g, "cde: chip not supported");
		return -ENOSYS;
	}

	if (hprog < 0 || vprog < 0) {
		nvgpu_warn(g, "cde: could not determine programs");
		return -ENOSYS;
	}

	if (xtiles > 8192 / 8 || ytiles > 8192 / 8)
		nvgpu_warn(g, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)",
			   xtiles, ytiles);

	gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_hoffset=0x%llx, compbits_voffset=0x%llx, scatterbuffer_offset=0x%llx",
		  width, height, block_height_log2,
		  compbits_hoffset, compbits_voffset, scatterbuffer_offset);
	gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d)",
		  width, height, xtiles, ytiles);
	gk20a_dbg(gpu_dbg_cde, "group (%d, %d) gridH (%d, %d) gridV (%d, %d)",
		  wgx, wgy, gridw_h, gridh_h, gridw_v, gridh_v);
	gk20a_dbg(gpu_dbg_cde, "hprog=%d, offset=0x%x, regs=%d, vprog=%d, offset=0x%x, regs=%d",
		  hprog,
		  l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog],
		  l->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog],
		  vprog,
		  l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog],
		  l->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);

	/* Write parameters */
#define WRITE_PATCH(NAME, VALUE) \
	params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE}
	WRITE_PATCH(PATCH_USER_CONST_XBLOCKS, xblocks);
	WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2,
		block_height_log2);
	WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx);
	WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy);
	WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx);
	WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy);
	WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1);

	/* horizontal pass grid */
	WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_WIDTH, gridw_h);
	WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_HEIGHT, gridh_h);
	WRITE_PATCH(PATCH_H_USER_CONST_DSTOFFSET, 0);
	WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_X, gridw_h);
	WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Y, gridh_h);
	WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Z, 1);

	/* vertical pass grid */
	WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_WIDTH, gridw_v);
	WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_HEIGHT, gridh_v);
	WRITE_PATCH(PATCH_V_USER_CONST_DSTOFFSET, voffset);
	WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_X, gridw_v);
	WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Y, gridh_v);
	WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Z, 1);

	WRITE_PATCH(PATCH_H_QMD_PROGRAM_OFFSET,
		l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog]);
	WRITE_PATCH(PATCH_H_QMD_REGISTER_COUNT,
		l->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog]);
	WRITE_PATCH(PATCH_V_QMD_PROGRAM_OFFSET,
		l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog]);
	WRITE_PATCH(PATCH_V_QMD_REGISTER_COUNT,
		l->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);

	/* launch words 0/1 run the pass, 2/3 are the no-op variant for the
	 * compbit sets that were not requested */
	if (consumer & NVGPU_GPU_COMPBITS_CDEH) {
		WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
		WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
	} else {
		WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
		WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
	}

	if (consumer & NVGPU_GPU_COMPBITS_CDEV) {
		WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
		WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
	} else {
		WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
		WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
	}
#undef WRITE_PATCH

	err = gk20a_cde_convert(l, dmabuf,
				compbits_hoffset,
				scatterbuffer_offset,
				fence_in, submit_flags,
				params, param, &new_fence);
	if (err)
		goto out;

	/* compbits generated, update state & fence */
	gk20a_fence_put(state->fence);
	state->fence = new_fence;
	state->valid_compbits |= consumer &
		(NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV);
out:
	return err;
}
1557 | |||
1558 | static int gk20a_buffer_convert_gpu_to_cde( | ||
1559 | struct nvgpu_os_linux *l, struct dma_buf *dmabuf, u32 consumer, | ||
1560 | u64 offset, u64 compbits_hoffset, u64 compbits_voffset, | ||
1561 | u64 scatterbuffer_offset, | ||
1562 | u32 width, u32 height, u32 block_height_log2, | ||
1563 | u32 submit_flags, struct nvgpu_fence *fence_in, | ||
1564 | struct gk20a_buffer_state *state) | ||
1565 | { | ||
1566 | struct gk20a *g = &l->g; | ||
1567 | int err = 0; | ||
1568 | |||
1569 | if (!l->cde_app.initialised) | ||
1570 | return -ENOSYS; | ||
1571 | |||
1572 | gk20a_dbg(gpu_dbg_cde, "firmware version = %d\n", | ||
1573 | l->cde_app.firmware_version); | ||
1574 | |||
1575 | if (l->cde_app.firmware_version == 1) { | ||
1576 | err = gk20a_buffer_convert_gpu_to_cde_v1( | ||
1577 | l, dmabuf, consumer, offset, compbits_hoffset, | ||
1578 | compbits_voffset, scatterbuffer_offset, | ||
1579 | width, height, block_height_log2, | ||
1580 | submit_flags, fence_in, state); | ||
1581 | } else { | ||
1582 | nvgpu_err(g, "unsupported CDE firmware version %d", | ||
1583 | l->cde_app.firmware_version); | ||
1584 | err = -EINVAL; | ||
1585 | } | ||
1586 | |||
1587 | return err; | ||
1588 | } | ||
1589 | |||
1590 | int gk20a_prepare_compressible_read( | ||
1591 | struct nvgpu_os_linux *l, u32 buffer_fd, u32 request, u64 offset, | ||
1592 | u64 compbits_hoffset, u64 compbits_voffset, | ||
1593 | u64 scatterbuffer_offset, | ||
1594 | u32 width, u32 height, u32 block_height_log2, | ||
1595 | u32 submit_flags, struct nvgpu_fence *fence, | ||
1596 | u32 *valid_compbits, u32 *zbc_color, | ||
1597 | struct gk20a_fence **fence_out) | ||
1598 | { | ||
1599 | struct gk20a *g = &l->g; | ||
1600 | int err = 0; | ||
1601 | struct gk20a_buffer_state *state; | ||
1602 | struct dma_buf *dmabuf; | ||
1603 | u32 missing_bits; | ||
1604 | |||
1605 | dmabuf = dma_buf_get(buffer_fd); | ||
1606 | if (IS_ERR(dmabuf)) | ||
1607 | return -EINVAL; | ||
1608 | |||
1609 | err = gk20a_dmabuf_get_state(dmabuf, g, offset, &state); | ||
1610 | if (err) { | ||
1611 | dma_buf_put(dmabuf); | ||
1612 | return err; | ||
1613 | } | ||
1614 | |||
1615 | missing_bits = (state->valid_compbits ^ request) & request; | ||
1616 | |||
1617 | nvgpu_mutex_acquire(&state->lock); | ||
1618 | |||
1619 | if (state->valid_compbits && request == NVGPU_GPU_COMPBITS_NONE) { | ||
1620 | |||
1621 | gk20a_fence_put(state->fence); | ||
1622 | state->fence = NULL; | ||
1623 | /* state->fence = decompress(); | ||
1624 | state->valid_compbits = 0; */ | ||
1625 | err = -EINVAL; | ||
1626 | goto out; | ||
1627 | } else if (missing_bits) { | ||
1628 | u32 missing_cde_bits = missing_bits & | ||
1629 | (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV); | ||
1630 | if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) && | ||
1631 | missing_cde_bits) { | ||
1632 | err = gk20a_buffer_convert_gpu_to_cde( | ||
1633 | l, dmabuf, | ||
1634 | missing_cde_bits, | ||
1635 | offset, compbits_hoffset, | ||
1636 | compbits_voffset, scatterbuffer_offset, | ||
1637 | width, height, block_height_log2, | ||
1638 | submit_flags, fence, | ||
1639 | state); | ||
1640 | if (err) | ||
1641 | goto out; | ||
1642 | } | ||
1643 | } | ||
1644 | |||
1645 | if (state->fence && fence_out) | ||
1646 | *fence_out = gk20a_fence_get(state->fence); | ||
1647 | |||
1648 | if (valid_compbits) | ||
1649 | *valid_compbits = state->valid_compbits; | ||
1650 | |||
1651 | if (zbc_color) | ||
1652 | *zbc_color = state->zbc_color; | ||
1653 | |||
1654 | out: | ||
1655 | nvgpu_mutex_release(&state->lock); | ||
1656 | dma_buf_put(dmabuf); | ||
1657 | return err; | ||
1658 | } | ||
1659 | |||
1660 | int gk20a_mark_compressible_write(struct gk20a *g, u32 buffer_fd, | ||
1661 | u32 valid_compbits, u64 offset, u32 zbc_color) | ||
1662 | { | ||
1663 | int err; | ||
1664 | struct gk20a_buffer_state *state; | ||
1665 | struct dma_buf *dmabuf; | ||
1666 | |||
1667 | dmabuf = dma_buf_get(buffer_fd); | ||
1668 | if (IS_ERR(dmabuf)) { | ||
1669 | nvgpu_err(g, "invalid dmabuf"); | ||
1670 | return -EINVAL; | ||
1671 | } | ||
1672 | |||
1673 | err = gk20a_dmabuf_get_state(dmabuf, g, offset, &state); | ||
1674 | if (err) { | ||
1675 | nvgpu_err(g, "could not get state from dmabuf"); | ||
1676 | dma_buf_put(dmabuf); | ||
1677 | return err; | ||
1678 | } | ||
1679 | |||
1680 | nvgpu_mutex_acquire(&state->lock); | ||
1681 | |||
1682 | /* Update the compbits state. */ | ||
1683 | state->valid_compbits = valid_compbits; | ||
1684 | state->zbc_color = zbc_color; | ||
1685 | |||
1686 | /* Discard previous compbit job fence. */ | ||
1687 | gk20a_fence_put(state->fence); | ||
1688 | state->fence = NULL; | ||
1689 | |||
1690 | nvgpu_mutex_release(&state->lock); | ||
1691 | dma_buf_put(dmabuf); | ||
1692 | return 0; | ||
1693 | } | ||
diff --git a/drivers/gpu/nvgpu/common/linux/cde.h b/drivers/gpu/nvgpu/common/linux/cde.h new file mode 100644 index 00000000..22732a2a --- /dev/null +++ b/drivers/gpu/nvgpu/common/linux/cde.h | |||
@@ -0,0 +1,309 @@ | |||
1 | /* | ||
2 | * GK20A color decompression engine support | ||
3 | * | ||
4 | * Copyright (c) 2014-2017, NVIDIA Corporation. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms and conditions of the GNU General Public License, | ||
8 | * version 2, as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
13 | * more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
17 | */ | ||
18 | |||
19 | #ifndef _CDE_GK20A_H_ | ||
20 | #define _CDE_GK20A_H_ | ||
21 | |||
/*
 * Static limits for the CDE firmware descriptor: how many buffers, patch
 * parameters, user-supplied parameters and array entries a firmware image
 * may declare.
 */
#define MAX_CDE_BUFS 10
#define MAX_CDE_PARAMS 64
#define MAX_CDE_USER_PARAMS 40
#define MAX_CDE_ARRAY_ENTRIES 9

/*
 * The size of the context ring buffer that is dedicated for handling cde
 * jobs. Re-using a context (=channel) for a different cde job forces a cpu
 * wait on the previous job to that channel, so increasing this value
 * reduces the likelihood of stalls.
 */
#define NUM_CDE_CONTEXTS 4

struct dma_buf;
struct gk20a;
37 | |||
/*
 * This element defines a buffer that is allocated and mapped into gpu address
 * space. data_byte_offset defines the beginning of the buffer inside the
 * firmware. num_bytes defines how many bytes the firmware contains.
 *
 * If data_byte_offset is zero, we allocate an empty buffer.
 *
 * NOTE: this layout is parsed straight from the firmware image; do not
 * reorder or resize fields.
 */

struct gk20a_cde_hdr_buf {
	u64 data_byte_offset;
	u64 num_bytes;
};
50 | |||
/*
 * This element defines a constant patching in buffers. It basically
 * computes the physical address of <source_buf>+source_byte_offset. The
 * address is then modified into the patch value as per:
 *    value = (current_value & ~mask) | (address << shift) & mask .
 *
 * The type field defines the register size as:
 *   0=u32,
 *   1=u64 (little endian),
 *   2=u64 (big endian)
 *
 * NOTE: firmware-defined layout; do not reorder or resize fields.
 */

struct gk20a_cde_hdr_replace {
	u32 target_buf;
	u32 source_buf;
	s32 shift;
	u32 type;
	u64 target_byte_offset;
	u64 source_byte_offset;
	u64 mask;
};
72 | |||
/* Register size selector for the 'type' field of replace/param elements. */
enum {
	TYPE_PARAM_TYPE_U32 = 0,
	TYPE_PARAM_TYPE_U64_LITTLE,
	TYPE_PARAM_TYPE_U64_BIG
};
78 | |||
/*
 * This element defines a runtime patching in buffers. Parameters with id from
 * 0 to 1024 are reserved for special usage as follows:
 *   0 = comptags_per_cacheline,
 *   1 = slices_per_fbp,
 *   2 = num_fbps
 *   3 = source buffer first page offset
 *   4 = source buffer block height log2
 *   5 = backing store memory address
 *   6 = destination memory address
 *   7 = destination size (bytes)
 *   8 = backing store size (bytes)
 *   9 = cache line size
 *
 * Parameters above id 1024 are user-specified. I.e. they determine where
 * parameters from user space should be placed in buffers, what their
 * type is, etc.
 *
 * Once the value is available, we add data_offset to the value.
 *
 * The value address is then modified into the patch value as per:
 *    value = (current_value & ~mask) | (address << shift) & mask .
 *
 * The type field defines the register size as:
 *   0=u32,
 *   1=u64 (little endian),
 *   2=u64 (big endian)
 *
 * NOTE: firmware-defined layout; do not reorder or resize fields.
 */

struct gk20a_cde_hdr_param {
	u32 id;
	u32 target_buf;
	s32 shift;
	u32 type;
	s64 data_offset;
	u64 target_byte_offset;
	u64 mask;
};
117 | |||
/*
 * Reserved parameter ids (the 'id' field of gk20a_cde_hdr_param).
 * Ids below NUM_RESERVED_PARAMS are filled in by the driver; user-space
 * parameters start at NUM_RESERVED_PARAMS.
 */
enum {
	TYPE_PARAM_COMPTAGS_PER_CACHELINE = 0,
	TYPE_PARAM_GPU_CONFIGURATION,
	TYPE_PARAM_FIRSTPAGEOFFSET,
	TYPE_PARAM_NUMPAGES,
	TYPE_PARAM_BACKINGSTORE,
	TYPE_PARAM_DESTINATION,
	TYPE_PARAM_DESTINATION_SIZE,
	TYPE_PARAM_BACKINGSTORE_SIZE,
	TYPE_PARAM_SOURCE_SMMU_ADDR,
	TYPE_PARAM_BACKINGSTORE_BASE_HW,
	TYPE_PARAM_GOBS_PER_COMPTAGLINE_PER_SLICE,
	TYPE_PARAM_SCATTERBUFFER,
	TYPE_PARAM_SCATTERBUFFER_SIZE,
	NUM_RESERVED_PARAMS = 1024,
};
134 | |||
/*
 * This header element defines a command. The op field determines whether the
 * element is defining an init (0) or convert command (1). data_byte_offset
 * denotes the beginning address of command elements in the file.
 *
 * NOTE: firmware-defined layout; do not reorder or resize fields.
 */

struct gk20a_cde_hdr_command {
	u32 op;
	u32 num_entries;
	u64 data_byte_offset;
};
146 | |||
/* Values for gk20a_cde_hdr_command.op. */
enum {
	TYPE_BUF_COMMAND_INIT = 0,
	TYPE_BUF_COMMAND_CONVERT
};
151 | |||
/*
 * This command element defines one entry inside a push buffer. target_buf
 * defines the buffer containing the pushbuffer entries, target_byte_offset
 * the offset inside the buffer and num_bytes the number of words in the
 * buffer.
 *
 * NOTE: firmware-defined layout; do not reorder or resize fields.
 */

struct gk20a_cde_cmd_elem {
	u32 target_buf;
	u32 padding;
	u64 target_byte_offset;
	u64 num_bytes;
};
164 | |||
/*
 * This element is used for storing a small array of data.
 * Ids for gk20a_cde_hdr_array.id below.
 */

enum {
	ARRAY_PROGRAM_OFFSET = 0,
	ARRAY_REGISTER_COUNT,
	ARRAY_LAUNCH_COMMAND,
	NUM_CDE_ARRAYS
};
175 | |||
/* A small firmware-provided data array, keyed by one of the ARRAY_* ids. */
struct gk20a_cde_hdr_array {
	u32 id;
	u32 data[MAX_CDE_ARRAY_ENTRIES];
};
180 | |||
/*
 * The following defines a single header element. Each element has a type
 * (one of the TYPE_* values below) selecting which union member is valid.
 *
 * NOTE: firmware-defined layout; do not reorder or resize fields.
 */

struct gk20a_cde_hdr_elem {
	u32 type;
	u32 padding;
	union {
		struct gk20a_cde_hdr_buf buf;
		struct gk20a_cde_hdr_replace replace;
		struct gk20a_cde_hdr_param param;
		u32 required_class;
		struct gk20a_cde_hdr_command command;
		struct gk20a_cde_hdr_array array;
	};
};
198 | |||
/* Values for gk20a_cde_hdr_elem.type. */
enum {
	TYPE_BUF = 0,
	TYPE_REPLACE,
	TYPE_PARAM,
	TYPE_REQUIRED_CLASS,
	TYPE_COMMAND,
	TYPE_ARRAY
};
207 | |||
/* One user-supplied parameter value, matched by id against the firmware's
 * gk20a_cde_hdr_param entries. */
struct gk20a_cde_param {
	u32 id;
	u32 padding;
	u64 value;
};
213 | |||
/*
 * Runtime state of one CDE conversion context. Each context owns a GPU
 * channel; contexts are kept on the free/used lists of gk20a_cde_app.
 */
struct gk20a_cde_ctx {
	struct nvgpu_os_linux *l;
	struct device *dev;

	/* channel related data */
	struct channel_gk20a *ch;
	struct vm_gk20a *vm;

	/* buf converter configuration */
	struct nvgpu_mem mem[MAX_CDE_BUFS];
	unsigned int num_bufs;

	/* buffer patching params (where should patching be done) */
	struct gk20a_cde_hdr_param params[MAX_CDE_PARAMS];
	unsigned int num_params;

	/* storage for user space parameter values */
	u32 user_param_values[MAX_CDE_USER_PARAMS];

	/* current job's source surface parameters */
	u32 surf_param_offset;
	u32 surf_param_lines;
	u64 surf_vaddr;

	/* GPU VA and size of the compbits destination mapping */
	u64 compbit_vaddr;
	u64 compbit_size;

	/* GPU VA and size of the scatter buffer mapping */
	u64 scatterbuffer_vaddr;
	u64 scatterbuffer_size;

	u64 backing_store_vaddr;

	/* gpfifo entries for the one-time init and per-job convert commands */
	struct nvgpu_gpfifo *init_convert_cmd;
	int init_cmd_num_entries;

	struct nvgpu_gpfifo *convert_cmd;
	int convert_cmd_num_entries;

	struct kobj_attribute attr;

	/* true once the init command has been submitted on this channel */
	bool init_cmd_executed;

	/* linkage on gk20a_cde_app free/used context lists */
	struct nvgpu_list_node list;
	bool is_temporary;
	bool in_use;
	/* deferred teardown of temporary contexts */
	struct delayed_work ctx_deleter_work;
};
260 | |||
261 | static inline struct gk20a_cde_ctx * | ||
262 | gk20a_cde_ctx_from_list(struct nvgpu_list_node *node) | ||
263 | { | ||
264 | return (struct gk20a_cde_ctx *) | ||
265 | ((uintptr_t)node - offsetof(struct gk20a_cde_ctx, list)); | ||
266 | }; | ||
267 | |||
/*
 * Per-device CDE state: the context pools, firmware-derived data arrays and
 * counters. All list/counter accesses are serialized by 'mutex'.
 */
struct gk20a_cde_app {
	bool initialised;
	struct nvgpu_mutex mutex;

	/* idle and in-flight conversion contexts */
	struct nvgpu_list_node free_contexts;
	struct nvgpu_list_node used_contexts;
	unsigned int ctx_count;
	unsigned int ctx_usecount;
	/* high-water mark of ctx_count (exposed via debugfs) */
	unsigned int ctx_count_top;

	u32 firmware_version;

	/* data arrays read from the firmware (ARRAY_* ids) */
	u32 arrays[NUM_CDE_ARRAYS][MAX_CDE_ARRAY_ENTRIES];

	/* tunable shader parameter (exposed via debugfs) */
	u32 shader_parameter;
};
284 | |||
/* CDE entry points, implemented in cde.c. */
void gk20a_cde_destroy(struct nvgpu_os_linux *l);
void gk20a_cde_suspend(struct nvgpu_os_linux *l);
int gk20a_init_cde_support(struct nvgpu_os_linux *l);
int gk20a_cde_reload(struct nvgpu_os_linux *l);
/* Submit a compbits conversion job; *fence_out (if non-NULL) receives a
 * reference the caller must put. */
int gk20a_cde_convert(struct nvgpu_os_linux *l,
		struct dma_buf *compbits_buf,
		u64 compbits_byte_offset,
		u64 scatterbuffer_byte_offset,
		struct nvgpu_fence *fence,
		u32 __flags, struct gk20a_cde_param *params,
		int num_params, struct gk20a_fence **fence_out);

int gk20a_prepare_compressible_read(
		struct nvgpu_os_linux *l, u32 buffer_fd, u32 request, u64 offset,
		u64 compbits_hoffset, u64 compbits_voffset,
		u64 scatterbuffer_offset,
		u32 width, u32 height, u32 block_height_log2,
		u32 submit_flags, struct nvgpu_fence *fence,
		u32 *valid_compbits, u32 *zbc_color,
		struct gk20a_fence **fence_out);
int gk20a_mark_compressible_write(
		struct gk20a *g, u32 buffer_fd, u32 valid_compbits, u64 offset,
		u32 zbc_color);
309 | #endif | ||
diff --git a/drivers/gpu/nvgpu/common/linux/debug_cde.c b/drivers/gpu/nvgpu/common/linux/debug_cde.c index 40cc64a4..cbea83b9 100644 --- a/drivers/gpu/nvgpu/common/linux/debug_cde.c +++ b/drivers/gpu/nvgpu/common/linux/debug_cde.c | |||
@@ -22,8 +22,8 @@ | |||
22 | static ssize_t gk20a_cde_reload_write(struct file *file, | 22 | static ssize_t gk20a_cde_reload_write(struct file *file, |
23 | const char __user *userbuf, size_t count, loff_t *ppos) | 23 | const char __user *userbuf, size_t count, loff_t *ppos) |
24 | { | 24 | { |
25 | struct gk20a *g = file->private_data; | 25 | struct nvgpu_os_linux *l = file->private_data; |
26 | gk20a_cde_reload(g); | 26 | gk20a_cde_reload(l); |
27 | return count; | 27 | return count; |
28 | } | 28 | } |
29 | 29 | ||
@@ -41,13 +41,13 @@ void gk20a_cde_debugfs_init(struct gk20a *g) | |||
41 | return; | 41 | return; |
42 | 42 | ||
43 | debugfs_create_u32("cde_parameter", S_IWUSR | S_IRUGO, | 43 | debugfs_create_u32("cde_parameter", S_IWUSR | S_IRUGO, |
44 | l->debugfs, &g->cde_app.shader_parameter); | 44 | l->debugfs, &l->cde_app.shader_parameter); |
45 | debugfs_create_u32("cde_ctx_count", S_IWUSR | S_IRUGO, | 45 | debugfs_create_u32("cde_ctx_count", S_IWUSR | S_IRUGO, |
46 | l->debugfs, &g->cde_app.ctx_count); | 46 | l->debugfs, &l->cde_app.ctx_count); |
47 | debugfs_create_u32("cde_ctx_usecount", S_IWUSR | S_IRUGO, | 47 | debugfs_create_u32("cde_ctx_usecount", S_IWUSR | S_IRUGO, |
48 | l->debugfs, &g->cde_app.ctx_usecount); | 48 | l->debugfs, &l->cde_app.ctx_usecount); |
49 | debugfs_create_u32("cde_ctx_count_top", S_IWUSR | S_IRUGO, | 49 | debugfs_create_u32("cde_ctx_count_top", S_IWUSR | S_IRUGO, |
50 | l->debugfs, &g->cde_app.ctx_count_top); | 50 | l->debugfs, &l->cde_app.ctx_count_top); |
51 | debugfs_create_file("reload_cde_firmware", S_IWUSR, l->debugfs, | 51 | debugfs_create_file("reload_cde_firmware", S_IWUSR, l->debugfs, |
52 | g, &gk20a_cde_reload_fops); | 52 | l, &gk20a_cde_reload_fops); |
53 | } | 53 | } |
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c index 0d79b143..0357f098 100644 --- a/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c +++ b/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c | |||
@@ -138,6 +138,7 @@ static int gk20a_ctrl_prepare_compressible_read( | |||
138 | struct gk20a *g, | 138 | struct gk20a *g, |
139 | struct nvgpu_gpu_prepare_compressible_read_args *args) | 139 | struct nvgpu_gpu_prepare_compressible_read_args *args) |
140 | { | 140 | { |
141 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
141 | struct nvgpu_fence fence; | 142 | struct nvgpu_fence fence; |
142 | struct gk20a_fence *fence_out = NULL; | 143 | struct gk20a_fence *fence_out = NULL; |
143 | int ret = 0; | 144 | int ret = 0; |
@@ -146,7 +147,7 @@ static int gk20a_ctrl_prepare_compressible_read( | |||
146 | fence.id = args->fence.syncpt_id; | 147 | fence.id = args->fence.syncpt_id; |
147 | fence.value = args->fence.syncpt_value; | 148 | fence.value = args->fence.syncpt_value; |
148 | 149 | ||
149 | ret = gk20a_prepare_compressible_read(g, args->handle, | 150 | ret = gk20a_prepare_compressible_read(l, args->handle, |
150 | args->request_compbits, args->offset, | 151 | args->request_compbits, args->offset, |
151 | args->compbits_hoffset, args->compbits_voffset, | 152 | args->compbits_hoffset, args->compbits_voffset, |
152 | args->scatterbuffer_offset, | 153 | args->scatterbuffer_offset, |
diff --git a/drivers/gpu/nvgpu/common/linux/module.c b/drivers/gpu/nvgpu/common/linux/module.c index 6a590baa..509930c7 100644 --- a/drivers/gpu/nvgpu/common/linux/module.c +++ b/drivers/gpu/nvgpu/common/linux/module.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include "pci.h" | 39 | #include "pci.h" |
40 | #include "module.h" | 40 | #include "module.h" |
41 | #include "intr.h" | 41 | #include "intr.h" |
42 | #include "cde.h" | ||
42 | #ifdef CONFIG_TEGRA_19x_GPU | 43 | #ifdef CONFIG_TEGRA_19x_GPU |
43 | #include "nvgpu_gpuid_t19x.h" | 44 | #include "nvgpu_gpuid_t19x.h" |
44 | #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION | 45 | #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION |
@@ -185,7 +186,7 @@ int gk20a_pm_finalize_poweron(struct device *dev) | |||
185 | gk20a_scale_resume(dev_from_gk20a(g)); | 186 | gk20a_scale_resume(dev_from_gk20a(g)); |
186 | 187 | ||
187 | if (platform->has_cde) | 188 | if (platform->has_cde) |
188 | gk20a_init_cde_support(g); | 189 | gk20a_init_cde_support(l); |
189 | 190 | ||
190 | done: | 191 | done: |
191 | if (err) | 192 | if (err) |
@@ -197,6 +198,7 @@ done: | |||
197 | static int gk20a_pm_prepare_poweroff(struct device *dev) | 198 | static int gk20a_pm_prepare_poweroff(struct device *dev) |
198 | { | 199 | { |
199 | struct gk20a *g = get_gk20a(dev); | 200 | struct gk20a *g = get_gk20a(dev); |
201 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
200 | int ret = 0; | 202 | int ret = 0; |
201 | struct gk20a_platform *platform = gk20a_get_platform(dev); | 203 | struct gk20a_platform *platform = gk20a_get_platform(dev); |
202 | 204 | ||
@@ -207,8 +209,15 @@ static int gk20a_pm_prepare_poweroff(struct device *dev) | |||
207 | if (!g->power_on) | 209 | if (!g->power_on) |
208 | goto done; | 210 | goto done; |
209 | 211 | ||
212 | if (gk20a_fifo_is_engine_busy(g)) { | ||
213 | ret = -EBUSY; | ||
214 | goto done; | ||
215 | } | ||
216 | |||
210 | gk20a_scale_suspend(dev); | 217 | gk20a_scale_suspend(dev); |
211 | 218 | ||
219 | gk20a_cde_suspend(l); | ||
220 | |||
212 | ret = gk20a_prepare_poweroff(g); | 221 | ret = gk20a_prepare_poweroff(g); |
213 | if (ret) | 222 | if (ret) |
214 | goto error; | 223 | goto error; |
@@ -974,6 +983,7 @@ static int __exit gk20a_remove(struct platform_device *pdev) | |||
974 | { | 983 | { |
975 | struct device *dev = &pdev->dev; | 984 | struct device *dev = &pdev->dev; |
976 | struct gk20a *g = get_gk20a(dev); | 985 | struct gk20a *g = get_gk20a(dev); |
986 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | ||
977 | struct gk20a_platform *platform = gk20a_get_platform(dev); | 987 | struct gk20a_platform *platform = gk20a_get_platform(dev); |
978 | 988 | ||
979 | gk20a_dbg_fn(""); | 989 | gk20a_dbg_fn(""); |
@@ -982,7 +992,7 @@ static int __exit gk20a_remove(struct platform_device *pdev) | |||
982 | return vgpu_remove(pdev); | 992 | return vgpu_remove(pdev); |
983 | 993 | ||
984 | if (platform->has_cde) | 994 | if (platform->has_cde) |
985 | gk20a_cde_destroy(g); | 995 | gk20a_cde_destroy(l); |
986 | 996 | ||
987 | gk20a_ctxsw_trace_cleanup(g); | 997 | gk20a_ctxsw_trace_cleanup(g); |
988 | 998 | ||
diff --git a/drivers/gpu/nvgpu/common/linux/os_linux.h b/drivers/gpu/nvgpu/common/linux/os_linux.h index ed8364a9..160a5738 100644 --- a/drivers/gpu/nvgpu/common/linux/os_linux.h +++ b/drivers/gpu/nvgpu/common/linux/os_linux.h | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/cdev.h> | 19 | #include <linux/cdev.h> |
20 | 20 | ||
21 | #include "gk20a/gk20a.h" | 21 | #include "gk20a/gk20a.h" |
22 | #include "cde.h" | ||
22 | 23 | ||
23 | struct nvgpu_os_linux { | 24 | struct nvgpu_os_linux { |
24 | struct gk20a g; | 25 | struct gk20a g; |
@@ -108,6 +109,7 @@ struct nvgpu_os_linux { | |||
108 | struct dentry *debugfs_force_preemption_gfxp; | 109 | struct dentry *debugfs_force_preemption_gfxp; |
109 | struct dentry *debugfs_dump_ctxsw_stats; | 110 | struct dentry *debugfs_dump_ctxsw_stats; |
110 | #endif | 111 | #endif |
112 | struct gk20a_cde_app cde_app; | ||
111 | }; | 113 | }; |
112 | 114 | ||
113 | static inline struct nvgpu_os_linux *nvgpu_os_linux_from_gk20a(struct gk20a *g) | 115 | static inline struct nvgpu_os_linux *nvgpu_os_linux_from_gk20a(struct gk20a *g) |