13 files changed, 171 insertions, 136 deletions
diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu
index 25545f29..87199316 100644
--- a/drivers/gpu/nvgpu/Makefile.nvgpu
+++ b/drivers/gpu/nvgpu/Makefile.nvgpu
@@ -45,6 +45,7 @@ nvgpu-y := \
        common/linux/vm.o \
        common/linux/intr.o \
        common/linux/sysfs.o \
+        common/linux/cde.o \
        common/mm/nvgpu_allocator.o \
        common/mm/bitmap_allocator.o \
        common/mm/buddy_allocator.o \
@@ -92,7 +93,6 @@ nvgpu-y := \
        gk20a/ltc_gk20a.o \
        gk20a/fb_gk20a.o \
        gk20a/hal.o \
-        gk20a/cde_gk20a.o \
        gk20a/tsg_gk20a.o \
        gk20a/ctxsw_trace_gk20a.o \
        gk20a/fecs_trace_gk20a.o \
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/common/linux/cde.c
index 506207f2..5b0fb910 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/common/linux/cde.c
@@ -31,13 +31,14 @@
 #include <nvgpu/bug.h>
 #include <nvgpu/firmware.h>
-#include "gk20a.h"
+#include "gk20a/gk20a.h"
-#include "channel_gk20a.h"
+#include "gk20a/channel_gk20a.h"
-#include "mm_gk20a.h"
+#include "gk20a/mm_gk20a.h"
-#include "cde_gk20a.h"
+#include "gk20a/fence_gk20a.h"
-#include "fence_gk20a.h"
+#include "gk20a/gr_gk20a.h"
-#include "gr_gk20a.h"
-#include "common/linux/os_linux.h"
+#include "cde.h"
+#include "os_linux.h"
 #include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
 #include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
@@ -49,7 +50,7 @@
 #include "common/linux/vm_priv.h"
 static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx);
-static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g);
+static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct nvgpu_os_linux *l);
 #define CTX_DELETE_TIME 1000
@@ -65,7 +66,7 @@ static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
                nvgpu_dma_unmap_free(cde_ctx->vm, mem);
        }
-        nvgpu_kfree(cde_ctx->g, cde_ctx->init_convert_cmd);
+        nvgpu_kfree(&cde_ctx->l->g, cde_ctx->init_convert_cmd);
        cde_ctx->convert_cmd = NULL;
        cde_ctx->init_convert_cmd = NULL;
@@ -79,7 +80,8 @@ static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
 static void gk20a_cde_remove_ctx(struct gk20a_cde_ctx *cde_ctx)
 __must_hold(&cde_app->mutex)
 {
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        struct channel_gk20a *ch = cde_ctx->ch;
        struct vm_gk20a *vm = ch->vm;
@@ -95,7 +97,7 @@ __must_hold(&cde_app->mutex)
        /* housekeeping on app */
        nvgpu_list_del(&cde_ctx->list);
-        cde_ctx->g->cde_app.ctx_count--;
+        l->cde_app.ctx_count--;
        nvgpu_kfree(g, cde_ctx);
 }
@@ -104,7 +106,7 @@ static void gk20a_cde_cancel_deleter(struct gk20a_cde_ctx *cde_ctx,
 __releases(&cde_app->mutex)
 __acquires(&cde_app->mutex)
 {
-        struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+        struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app;
        /* permanent contexts do not have deleter works */
        if (!cde_ctx->is_temporary)
@@ -119,10 +121,10 @@ __acquires(&cde_app->mutex)
        }
 }
-static void gk20a_cde_remove_contexts(struct gk20a *g)
+static void gk20a_cde_remove_contexts(struct nvgpu_os_linux *l)
-__must_hold(&cde_app->mutex)
+__must_hold(&l->cde_app.mutex)
 {
-        struct gk20a_cde_app *cde_app = &g->cde_app;
+        struct gk20a_cde_app *cde_app = &l->cde_app;
        struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save;
        /* safe to go off the mutex in cancel_deleter since app is
@@ -142,38 +144,38 @@ __must_hold(&cde_app->mutex)
        }
 }
-static void gk20a_cde_stop(struct gk20a *g)
+static void gk20a_cde_stop(struct nvgpu_os_linux *l)
-__must_hold(&cde_app->mutex)
+__must_hold(&l->cde_app.mutex)
 {
-        struct gk20a_cde_app *cde_app = &g->cde_app;
+        struct gk20a_cde_app *cde_app = &l->cde_app;
        /* prevent further conversions and delayed works from working */
        cde_app->initialised = false;
        /* free all data, empty the list */
-        gk20a_cde_remove_contexts(g);
+        gk20a_cde_remove_contexts(l);
 }
-void gk20a_cde_destroy(struct gk20a *g)
+void gk20a_cde_destroy(struct nvgpu_os_linux *l)
-__acquires(&cde_app->mutex)
+__acquires(&l->cde_app.mutex)
-__releases(&cde_app->mutex)
+__releases(&l->cde_app.mutex)
 {
-        struct gk20a_cde_app *cde_app = &g->cde_app;
+        struct gk20a_cde_app *cde_app = &l->cde_app;
        if (!cde_app->initialised)
                return;
        nvgpu_mutex_acquire(&cde_app->mutex);
-        gk20a_cde_stop(g);
+        gk20a_cde_stop(l);
        nvgpu_mutex_release(&cde_app->mutex);
        nvgpu_mutex_destroy(&cde_app->mutex);
 }
-void gk20a_cde_suspend(struct gk20a *g)
+void gk20a_cde_suspend(struct nvgpu_os_linux *l)
-__acquires(&cde_app->mutex)
+__acquires(&l->cde_app.mutex)
-__releases(&cde_app->mutex)
+__releases(&l->cde_app.mutex)
 {
-        struct gk20a_cde_app *cde_app = &g->cde_app;
+        struct gk20a_cde_app *cde_app = &l->cde_app;
        struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save;
        if (!cde_app->initialised)
@@ -195,13 +197,13 @@ __releases(&cde_app->mutex)
 }
-static int gk20a_cde_create_context(struct gk20a *g)
+static int gk20a_cde_create_context(struct nvgpu_os_linux *l)
-__must_hold(&cde_app->mutex)
+__must_hold(&l->cde_app.mutex)
 {
-        struct gk20a_cde_app *cde_app = &g->cde_app;
+        struct gk20a_cde_app *cde_app = &l->cde_app;
        struct gk20a_cde_ctx *cde_ctx;
-        cde_ctx = gk20a_cde_allocate_context(g);
+        cde_ctx = gk20a_cde_allocate_context(l);
        if (IS_ERR(cde_ctx))
                return PTR_ERR(cde_ctx);
@@ -213,21 +215,21 @@ __must_hold(&cde_app->mutex)
        return 0;
 }
-static int gk20a_cde_create_contexts(struct gk20a *g)
+static int gk20a_cde_create_contexts(struct nvgpu_os_linux *l)
-__must_hold(&g->cde_app->mutex)
+__must_hold(&l->cde_app.mutex)
 {
        int err;
        int i;
        for (i = 0; i < NUM_CDE_CONTEXTS; i++) {
-                err = gk20a_cde_create_context(g);
+                err = gk20a_cde_create_context(l);
                if (err)
                        goto out;
        }
        return 0;
 out:
-        gk20a_cde_remove_contexts(g);
+        gk20a_cde_remove_contexts(l);
        return err;
 }
@@ -236,7 +238,8 @@ static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx,
                              struct gk20a_cde_hdr_buf *buf)
 {
        struct nvgpu_mem *mem;
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        int err;
        /* check that the file can hold the buf */
@@ -276,7 +279,8 @@ static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx,
 static int gk20a_replace_data(struct gk20a_cde_ctx *cde_ctx, void *target,
                              int type, s32 shift, u64 mask, u64 value)
 {
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        u32 *target_mem_ptr = target;
        u64 *target_mem_ptr_u64 = target;
        u64 current_value, new_value;
@@ -325,7 +329,8 @@ static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx,
 {
        struct nvgpu_mem *source_mem;
        struct nvgpu_mem *target_mem;
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        u32 *target_mem_ptr;
        u64 vaddr;
        int err;
@@ -373,7 +378,8 @@ static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx,
 static int gk20a_cde_patch_params(struct gk20a_cde_ctx *cde_ctx)
 {
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        struct nvgpu_mem *target_mem;
        u32 *target_mem_ptr;
        u64 new_data;
@@ -464,7 +470,8 @@ static int gk20a_init_cde_param(struct gk20a_cde_ctx *cde_ctx,
                                struct gk20a_cde_hdr_param *param)
 {
        struct nvgpu_mem *target_mem;
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        if (param->target_buf >= cde_ctx->num_bufs) {
                nvgpu_warn(g, "cde: invalid buffer parameter. param idx = %d, target_buf=%u, num_bufs=%u",
@@ -506,7 +513,8 @@ static int gk20a_init_cde_required_class(struct gk20a_cde_ctx *cde_ctx,
                                         struct nvgpu_firmware *img,
                                         u32 required_class)
 {
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        struct nvgpu_alloc_obj_ctx_args alloc_obj_ctx;
        int err;
@@ -532,7 +540,8 @@ static int gk20a_init_cde_command(struct gk20a_cde_ctx *cde_ctx,
                                  struct gk20a_cde_cmd_elem *cmd_elem,
                                  u32 num_elems)
 {
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        struct nvgpu_gpfifo **gpfifo, *gpfifo_elem;
        u32 *num_entries;
        unsigned int i;
@@ -551,7 +560,7 @@ static int gk20a_init_cde_command(struct gk20a_cde_ctx *cde_ctx,
        }
        /* allocate gpfifo entries to be pushed */
-        *gpfifo = nvgpu_kzalloc(cde_ctx->g,
+        *gpfifo = nvgpu_kzalloc(g,
                                sizeof(struct nvgpu_gpfifo) * num_elems);
        if (!*gpfifo) {
                nvgpu_warn(g, "cde: could not allocate memory for gpfifo entries");
@@ -596,7 +605,8 @@ static int gk20a_init_cde_command(struct gk20a_cde_ctx *cde_ctx,
 static int gk20a_cde_pack_cmdbufs(struct gk20a_cde_ctx *cde_ctx)
 {
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        unsigned long init_bytes = cde_ctx->init_cmd_num_entries *
                sizeof(struct nvgpu_gpfifo);
        unsigned long conv_bytes = cde_ctx->convert_cmd_num_entries *
@@ -605,7 +615,7 @@ static int gk20a_cde_pack_cmdbufs(struct gk20a_cde_ctx *cde_ctx)
        struct nvgpu_gpfifo *combined_cmd;
        /* allocate buffer that has space for both */
-        combined_cmd = nvgpu_kzalloc(cde_ctx->g, total_bytes);
+        combined_cmd = nvgpu_kzalloc(g, total_bytes);
        if (!combined_cmd) {
                nvgpu_warn(g,
                        "cde: could not allocate memory for gpfifo entries");
@@ -617,8 +627,8 @@ static int gk20a_cde_pack_cmdbufs(struct gk20a_cde_ctx *cde_ctx)
        memcpy(combined_cmd + cde_ctx->init_cmd_num_entries,
                        cde_ctx->convert_cmd, conv_bytes);
-        nvgpu_kfree(cde_ctx->g, cde_ctx->init_convert_cmd);
+        nvgpu_kfree(g, cde_ctx->init_convert_cmd);
-        nvgpu_kfree(cde_ctx->g, cde_ctx->convert_cmd);
+        nvgpu_kfree(g, cde_ctx->convert_cmd);
        cde_ctx->init_convert_cmd = combined_cmd;
        cde_ctx->convert_cmd = combined_cmd
@@ -630,8 +640,9 @@ static int gk20a_cde_pack_cmdbufs(struct gk20a_cde_ctx *cde_ctx)
 static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx,
                              struct nvgpu_firmware *img)
 {
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
-        struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+        struct gk20a *g = &l->g;
+        struct gk20a_cde_app *cde_app = &l->cde_app;
        u32 *data = (u32 *)img->data;
        u32 num_of_elems;
        struct gk20a_cde_hdr_elem *elem;
@@ -724,7 +735,8 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
                                    u32 op, struct nvgpu_fence *fence,
                                    u32 flags, struct gk20a_fence **fence_out)
 {
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        struct nvgpu_gpfifo *gpfifo = NULL;
        int num_entries = 0;
@@ -756,7 +768,7 @@ static void gk20a_cde_ctx_release(struct gk20a_cde_ctx *cde_ctx)
 __acquires(&cde_app->mutex)
 __releases(&cde_app->mutex)
 {
-        struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+        struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app;
        gk20a_dbg(gpu_dbg_cde_ctx, "releasing use on %p", cde_ctx);
        trace_gk20a_cde_release(cde_ctx);
@@ -781,8 +793,9 @@ __releases(&cde_app->mutex)
        struct delayed_work *delay_work = to_delayed_work(work);
        struct gk20a_cde_ctx *cde_ctx = container_of(delay_work,
                        struct gk20a_cde_ctx, ctx_deleter_work);
-        struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+        struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app;
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        int err;
        /* someone has just taken it? engine deletion started? */
@@ -823,10 +836,11 @@ out:
        gk20a_idle(g);
 }
-static struct gk20a_cde_ctx *gk20a_cde_do_get_context(struct gk20a *g)
+static struct gk20a_cde_ctx *gk20a_cde_do_get_context(struct nvgpu_os_linux *l)
 __must_hold(&cde_app->mutex)
 {
-        struct gk20a_cde_app *cde_app = &g->cde_app;
+        struct gk20a *g = &l->g;
+        struct gk20a_cde_app *cde_app = &l->cde_app;
        struct gk20a_cde_ctx *cde_ctx;
        /* exhausted? */
@@ -862,7 +876,7 @@ __must_hold(&cde_app->mutex)
                        "cde: no free contexts, count=%d",
                        cde_app->ctx_count);
-        cde_ctx = gk20a_cde_allocate_context(g);
+        cde_ctx = gk20a_cde_allocate_context(l);
        if (IS_ERR(cde_ctx)) {
                nvgpu_warn(g, "cde: cannot allocate context: %ld",
                                PTR_ERR(cde_ctx));
@@ -881,11 +895,12 @@ __must_hold(&cde_app->mutex)
        return cde_ctx;
 }
-static struct gk20a_cde_ctx *gk20a_cde_get_context(struct gk20a *g)
+static struct gk20a_cde_ctx *gk20a_cde_get_context(struct nvgpu_os_linux *l)
 __releases(&cde_app->mutex)
 __acquires(&cde_app->mutex)
 {
-        struct gk20a_cde_app *cde_app = &g->cde_app;
+        struct gk20a *g = &l->g;
+        struct gk20a_cde_app *cde_app = &l->cde_app;
        struct gk20a_cde_ctx *cde_ctx = NULL;
        struct nvgpu_timeout timeout;
@@ -893,7 +908,7 @@ __acquires(&cde_app->mutex)
                           NVGPU_TIMER_CPU_TIMER);
        do {
-                cde_ctx = gk20a_cde_do_get_context(g);
+                cde_ctx = gk20a_cde_do_get_context(l);
                if (PTR_ERR(cde_ctx) != -EAGAIN)
                        break;
@@ -906,8 +921,9 @@ __acquires(&cde_app->mutex)
        return cde_ctx;
 }
-static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g)
+static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct nvgpu_os_linux *l)
 {
+        struct gk20a *g = &l->g;
        struct gk20a_cde_ctx *cde_ctx;
        int ret;
@@ -915,7 +931,7 @@ static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g)
        if (!cde_ctx)
                return ERR_PTR(-ENOMEM);
-        cde_ctx->g = g;
+        cde_ctx->l = l;
        cde_ctx->dev = dev_from_gk20a(g);
        ret = gk20a_cde_load(cde_ctx);
@@ -935,16 +951,17 @@ static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g)
        return cde_ctx;
 }
-int gk20a_cde_convert(struct gk20a *g,
+int gk20a_cde_convert(struct nvgpu_os_linux *l,
                      struct dma_buf *compbits_scatter_buf,
                      u64 compbits_byte_offset,
                      u64 scatterbuffer_byte_offset,
                      struct nvgpu_fence *fence,
                      u32 __flags, struct gk20a_cde_param *params,
                      int num_params, struct gk20a_fence **fence_out)
-__acquires(&cde_app->mutex)
+__acquires(&l->cde_app.mutex)
-__releases(&cde_app->mutex)
+__releases(&l->cde_app.mutex)
 {
+        struct gk20a *g = &l->g;
        struct gk20a_cde_ctx *cde_ctx = NULL;
        struct gk20a_comptags comptags;
        u64 mapped_compbits_offset = 0;
@@ -972,9 +989,9 @@ __releases(&cde_app->mutex)
        if (err)
                return err;
-        nvgpu_mutex_acquire(&g->cde_app.mutex);
+        nvgpu_mutex_acquire(&l->cde_app.mutex);
-        cde_ctx = gk20a_cde_get_context(g);
+        cde_ctx = gk20a_cde_get_context(l);
-        nvgpu_mutex_release(&g->cde_app.mutex);
+        nvgpu_mutex_release(&l->cde_app.mutex);
        if (IS_ERR(cde_ctx)) {
                err = PTR_ERR(cde_ctx);
                goto exit_idle;
@@ -1158,8 +1175,9 @@ __acquires(&cde_app->mutex)
 __releases(&cde_app->mutex)
 {
        struct gk20a_cde_ctx *cde_ctx = data;
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
-        struct gk20a_cde_app *cde_app = &g->cde_app;
+        struct gk20a *g = &l->g;
+        struct gk20a_cde_app *cde_app = &l->cde_app;
        bool channel_idle;
        channel_gk20a_joblist_lock(ch);
@@ -1188,7 +1206,7 @@ __releases(&cde_app->mutex)
                        /* mark it to be deleted, replace with a new one */
                        nvgpu_mutex_acquire(&cde_app->mutex);
                        cde_ctx->is_temporary = true;
-                        if (gk20a_cde_create_context(g)) {
+                        if (gk20a_cde_create_context(l)) {
                                nvgpu_err(g, "cde: can't replace context");
                        }
                        nvgpu_mutex_release(&cde_app->mutex);
@@ -1208,7 +1226,8 @@ __releases(&cde_app->mutex)
 static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 {
-        struct gk20a *g = cde_ctx->g;
+        struct nvgpu_os_linux *l = cde_ctx->l;
+        struct gk20a *g = &l->g;
        struct nvgpu_firmware *img;
        struct channel_gk20a *ch;
        struct gr_gk20a *gr = &g->gr;
@@ -1288,11 +1307,12 @@ err_get_gk20a_channel:
        return err;
 }
-int gk20a_cde_reload(struct gk20a *g)
+int gk20a_cde_reload(struct nvgpu_os_linux *l)
-__acquires(&cde_app->mutex)
+__acquires(&l->cde_app.mutex)
-__releases(&cde_app->mutex)
+__releases(&l->cde_app.mutex)
 {
-        struct gk20a_cde_app *cde_app = &g->cde_app;
+        struct gk20a *g = &l->g;
+        struct gk20a_cde_app *cde_app = &l->cde_app;
        int err;
        if (!cde_app->initialised)
@@ -1304,9 +1324,9 @@ __releases(&cde_app->mutex)
        nvgpu_mutex_acquire(&cde_app->mutex);
-        gk20a_cde_stop(g);
+        gk20a_cde_stop(l);
-        err = gk20a_cde_create_contexts(g);
+        err = gk20a_cde_create_contexts(l);
        if (!err)
                cde_app->initialised = true;
@@ -1316,11 +1336,11 @@ __releases(&cde_app->mutex)
        return err;
 }
-int gk20a_init_cde_support(struct gk20a *g)
+int gk20a_init_cde_support(struct nvgpu_os_linux *l)
 __acquires(&cde_app->mutex)
 __releases(&cde_app->mutex)
 {
-        struct gk20a_cde_app *cde_app = &g->cde_app;
+        struct gk20a_cde_app *cde_app = &l->cde_app;
        int err;
        if (cde_app->initialised)
@@ -1340,7 +1360,7 @@ __releases(&cde_app->mutex)
        cde_app->ctx_count_top = 0;
        cde_app->ctx_usecount = 0;
-        err = gk20a_cde_create_contexts(g);
+        err = gk20a_cde_create_contexts(l);
        if (!err)
                cde_app->initialised = true;
@@ -1393,7 +1413,7 @@ enum cde_launch_patch_id {
 #define MAX_CDE_LAUNCH_PATCHES            32
 static int gk20a_buffer_convert_gpu_to_cde_v1(
-                struct gk20a *g,
+                struct nvgpu_os_linux *l,
                struct dma_buf *dmabuf, u32 consumer,
                u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
                u64 scatterbuffer_offset,
@@ -1401,6 +1421,7 @@ static int gk20a_buffer_convert_gpu_to_cde_v1(
                u32 submit_flags, struct nvgpu_fence *fence_in,
                struct gk20a_buffer_state *state)
 {
+        struct gk20a *g = &l->g;
        struct gk20a_cde_param params[MAX_CDE_LAUNCH_PATCHES];
        int param = 0;
        int err = 0;
@@ -1426,6 +1447,7 @@ static int gk20a_buffer_convert_gpu_to_cde_v1(
        if (g->ops.cde.get_program_numbers)
                g->ops.cde.get_program_numbers(g, block_height_log2,
+                                               l->cde_app.shader_parameter,
                                               &hprog, &vprog);
        else {
                nvgpu_warn(g, "cde: chip not supported");
@@ -1450,11 +1472,11 @@ static int gk20a_buffer_convert_gpu_to_cde_v1(
                  wgx, wgy, gridw_h, gridh_h, gridw_v, gridh_v);
        gk20a_dbg(gpu_dbg_cde, "hprog=%d, offset=0x%x, regs=%d, vprog=%d, offset=0x%x, regs=%d",
                  hprog,
-                  g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog],
+                  l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog],
-                  g->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog],
+                  l->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog],
                  vprog,
-                  g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog],
+                  l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog],
-                  g->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);
+                  l->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);
        /* Write parameters */
 #define WRITE_PATCH(NAME, VALUE) \
@@ -1483,40 +1505,40 @@ static int gk20a_buffer_convert_gpu_to_cde_v1(
        WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Z, 1);
        WRITE_PATCH(PATCH_H_QMD_PROGRAM_OFFSET,
-                g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog]);
+                l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog]);
        WRITE_PATCH(PATCH_H_QMD_REGISTER_COUNT,
-                g->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog]);
+                l->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog]);
        WRITE_PATCH(PATCH_V_QMD_PROGRAM_OFFSET,
-                g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog]);
+                l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog]);
        WRITE_PATCH(PATCH_V_QMD_REGISTER_COUNT,
-                g->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);
+                l->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);
        if (consumer & NVGPU_GPU_COMPBITS_CDEH) {
                WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
-                        g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
+                        l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
                WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
-                        g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
+                        l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
        } else {
                WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
-                        g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
+                        l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
                WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
-                        g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
+                        l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
        }
        if (consumer & NVGPU_GPU_COMPBITS_CDEV) {
                WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
-                        g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
+                        l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
                WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
-                        g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
+                        l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
        } else {
                WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
-                        g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
+                        l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
                WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
-                        g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
+                        l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
        }
 #undef WRITE_PATCH
-        err = gk20a_cde_convert(g, dmabuf,
+        err = gk20a_cde_convert(l, dmabuf,
                                compbits_hoffset,
                                scatterbuffer_offset,
                                fence_in, submit_flags,
@@ -1534,30 +1556,31 @@ out:
 }
 static int gk20a_buffer_convert_gpu_to_cde(
-                struct gk20a *g, struct dma_buf *dmabuf, u32 consumer,
+                struct nvgpu_os_linux *l, struct dma_buf *dmabuf, u32 consumer,
                u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
                u64 scatterbuffer_offset,
                u32 width, u32 height, u32 block_height_log2,
                u32 submit_flags, struct nvgpu_fence *fence_in,
                struct gk20a_buffer_state *state)
 {
+        struct gk20a *g = &l->g;
        int err = 0;
-        if (!g->cde_app.initialised)
+        if (!l->cde_app.initialised)
                return -ENOSYS;
        gk20a_dbg(gpu_dbg_cde, "firmware version = %d\n",
-                g->cde_app.firmware_version);
+                l->cde_app.firmware_version);
-        if (g->cde_app.firmware_version == 1) {
+        if (l->cde_app.firmware_version == 1) {
                err = gk20a_buffer_convert_gpu_to_cde_v1(
-                    g, dmabuf, consumer, offset, compbits_hoffset,
+                    l, dmabuf, consumer, offset, compbits_hoffset,
                    compbits_voffset, scatterbuffer_offset,
                    width, height, block_height_log2,
                    submit_flags, fence_in, state);
        } else {
                nvgpu_err(g, "unsupported CDE firmware version %d",
-                        g->cde_app.firmware_version);
+                        l->cde_app.firmware_version);
                err = -EINVAL;
        }
@@ -1565,7 +1588,7 @@ static int gk20a_buffer_convert_gpu_to_cde(
 }
 int gk20a_prepare_compressible_read(
-                struct gk20a *g, u32 buffer_fd, u32 request, u64 offset,
+                struct nvgpu_os_linux *l, u32 buffer_fd, u32 request, u64 offset,
                u64 compbits_hoffset, u64 compbits_voffset,
                u64 scatterbuffer_offset,
                u32 width, u32 height, u32 block_height_log2,
@@ -1573,6 +1596,7 @@ int gk20a_prepare_compressible_read(
                u32 *valid_compbits, u32 *zbc_color,
                struct gk20a_fence **fence_out)
 {
+        struct gk20a *g = &l->g;
        int err = 0;
        struct gk20a_buffer_state *state;
        struct dma_buf *dmabuf;
@@ -1606,7 +1630,7 @@ int gk20a_prepare_compressible_read(
                if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) &&
                    missing_cde_bits) {
                        err = gk20a_buffer_convert_gpu_to_cde(
-                                        g, dmabuf,
+                                        l, dmabuf,
                                        missing_cde_bits,
                                        offset, compbits_hoffset,
                                        compbits_voffset, scatterbuffer_offset,
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/common/linux/cde.h
index 4f400bf3..22732a2a 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
+++ b/drivers/gpu/nvgpu/common/linux/cde.h
@@ -19,8 +19,6 @@
 #ifndef _CDE_GK20A_H_
 #define _CDE_GK20A_H_
-#include "mm_gk20a.h"
 #define MAX_CDE_BUFS            10
 #define MAX_CDE_PARAMS          64
 #define MAX_CDE_USER_PARAMS     40
@@ -214,7 +212,7 @@ struct gk20a_cde_param {
 };
 struct gk20a_cde_ctx {
-        struct gk20a *g;
+        struct nvgpu_os_linux *l;
        struct device *dev;
        /* channel related data */
@@ -284,11 +282,11 @@ struct gk20a_cde_app {
        u32 shader_parameter;
 };
-void gk20a_cde_destroy(struct gk20a *g);
+void gk20a_cde_destroy(struct nvgpu_os_linux *l);
-void gk20a_cde_suspend(struct gk20a *g);
+void gk20a_cde_suspend(struct nvgpu_os_linux *l);
-int gk20a_init_cde_support(struct gk20a *g);
+int gk20a_init_cde_support(struct nvgpu_os_linux *l);
-int gk20a_cde_reload(struct gk20a *g);
+int gk20a_cde_reload(struct nvgpu_os_linux *l);
-int gk20a_cde_convert(struct gk20a *g,
+int gk20a_cde_convert(struct nvgpu_os_linux *l,
                struct dma_buf *compbits_buf,
                u64 compbits_byte_offset,
                u64 scatterbuffer_byte_offset,
@@ -297,7 +295,7 @@ int gk20a_cde_convert(struct gk20a *g,
                int num_params, struct gk20a_fence **fence_out);
 int gk20a_prepare_compressible_read(
-                struct gk20a *g, u32 buffer_fd, u32 request, u64 offset,
+                struct nvgpu_os_linux *l, u32 buffer_fd, u32 request, u64 offset,
                u64 compbits_hoffset, u64 compbits_voffset,
                u64 scatterbuffer_offset,
                u32 width, u32 height, u32 block_height_log2,
diff --git a/drivers/gpu/nvgpu/common/linux/debug_cde.c b/drivers/gpu/nvgpu/common/linux/debug_cde.c
index 40cc64a4..cbea83b9 100644
--- a/drivers/gpu/nvgpu/common/linux/debug_cde.c
+++ b/drivers/gpu/nvgpu/common/linux/debug_cde.c
@@ -22,8 +22,8 @@
 static ssize_t gk20a_cde_reload_write(struct file *file,
        const char __user *userbuf, size_t count, loff_t *ppos)
 {
-        struct gk20a *g = file->private_data;
+        struct nvgpu_os_linux *l = file->private_data;
-        gk20a_cde_reload(g);
+        gk20a_cde_reload(l);
        return count;
 }
@@ -41,13 +41,13 @@ void gk20a_cde_debugfs_init(struct gk20a *g)
                return;
        debugfs_create_u32("cde_parameter", S_IWUSR | S_IRUGO,
-                           l->debugfs, &g->cde_app.shader_parameter);
+                           l->debugfs, &l->cde_app.shader_parameter);
        debugfs_create_u32("cde_ctx_count", S_IWUSR | S_IRUGO,
-                           l->debugfs, &g->cde_app.ctx_count);
+                           l->debugfs, &l->cde_app.ctx_count);
        debugfs_create_u32("cde_ctx_usecount", S_IWUSR | S_IRUGO,
-                           l->debugfs, &g->cde_app.ctx_usecount);
+                           l->debugfs, &l->cde_app.ctx_usecount);
        debugfs_create_u32("cde_ctx_count_top", S_IWUSR | S_IRUGO,
-                           l->debugfs, &g->cde_app.ctx_count_top);
+                           l->debugfs, &l->cde_app.ctx_count_top);
        debugfs_create_file("reload_cde_firmware", S_IWUSR, l->debugfs,
-                            g, &gk20a_cde_reload_fops);
+                            l, &gk20a_cde_reload_fops);
 }
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
index 0d79b143..0357f098 100644
--- a/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
@@ -138,6 +138,7 @@ static int gk20a_ctrl_prepare_compressible_read(
                struct gk20a *g,
                struct nvgpu_gpu_prepare_compressible_read_args *args)
 {
+        struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
        struct nvgpu_fence fence;
        struct gk20a_fence *fence_out = NULL;
        int ret = 0;
@@ -146,7 +147,7 @@ static int gk20a_ctrl_prepare_compressible_read(
        fence.id = args->fence.syncpt_id;
        fence.value = args->fence.syncpt_value;
-        ret = gk20a_prepare_compressible_read(g, args->handle,
+        ret = gk20a_prepare_compressible_read(l, args->handle,
                        args->request_compbits, args->offset,
                        args->compbits_hoffset, args->compbits_voffset,
                        args->scatterbuffer_offset,
diff --git a/drivers/gpu/nvgpu/common/linux/module.c b/drivers/gpu/nvgpu/common/linux/module.c
index 6a590baa..509930c7 100644
--- a/drivers/gpu/nvgpu/common/linux/module.c
+++ b/drivers/gpu/nvgpu/common/linux/module.c
@@ -39,6 +39,7 @@
 #include "pci.h"
 #include "module.h"
 #include "intr.h"
+#include "cde.h"
 #ifdef CONFIG_TEGRA_19x_GPU
 #include "nvgpu_gpuid_t19x.h"
 #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
@@ -185,7 +186,7 @@ int gk20a_pm_finalize_poweron(struct device *dev)
        gk20a_scale_resume(dev_from_gk20a(g));
        if (platform->has_cde)
-                gk20a_init_cde_support(g);
+                gk20a_init_cde_support(l);
 done:
        if (err)
@@ -197,6 +198,7 @@ done:
 static int gk20a_pm_prepare_poweroff(struct device *dev)
 {
        struct gk20a *g = get_gk20a(dev);
+        struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
        int ret = 0;
        struct gk20a_platform *platform = gk20a_get_platform(dev);
@@ -207,8 +209,15 @@ static int gk20a_pm_prepare_poweroff(struct device *dev)
        if (!g->power_on)
                goto done;
+        if (gk20a_fifo_is_engine_busy(g)) {
+                ret = -EBUSY;
+                goto done;
+        }
        gk20a_scale_suspend(dev);
+        gk20a_cde_suspend(l);
        ret = gk20a_prepare_poweroff(g);
        if (ret)
                goto error;
@@ -974,6 +983,7 @@ static int __exit gk20a_remove(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
        struct gk20a *g = get_gk20a(dev);
+        struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
        struct gk20a_platform *platform = gk20a_get_platform(dev);
        gk20a_dbg_fn("");
@@ -982,7 +992,7 @@ static int __exit gk20a_remove(struct platform_device *pdev)
                return vgpu_remove(pdev);
        if (platform->has_cde)
-                gk20a_cde_destroy(g);
+                gk20a_cde_destroy(l);
        gk20a_ctxsw_trace_cleanup(g);
diff --git a/drivers/gpu/nvgpu/common/linux/os_linux.h b/drivers/gpu/nvgpu/common/linux/os_linux.h
index ed8364a9..160a5738 100644
--- a/drivers/gpu/nvgpu/common/linux/os_linux.h
+++ b/drivers/gpu/nvgpu/common/linux/os_linux.h
@@ -19,6 +19,7 @@
 #include <linux/cdev.h>
 #include "gk20a/gk20a.h"
+#include "cde.h"
 struct nvgpu_os_linux {
        struct gk20a g;
@@ -108,6 +109,7 @@ struct nvgpu_os_linux {
        struct dentry *debugfs_force_preemption_gfxp;
        struct dentry *debugfs_dump_ctxsw_stats;
 #endif
+        struct gk20a_cde_app cde_app;
 };
 static inline struct nvgpu_os_linux *nvgpu_os_linux_from_gk20a(struct gk20a *g)
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 0cd314d6..63ea5bc4 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -117,9 +117,6 @@ int gk20a_prepare_poweroff(struct gk20a *g)
        if (gk20a_fifo_is_engine_busy(g))
                return -EBUSY;
-        /* cancel any pending cde work */
-        gk20a_cde_suspend(g);
        gk20a_ce_suspend(g);
        ret = gk20a_channel_suspend(g);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index ab715bdc..69cb2253 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -60,7 +60,6 @@ struct nvgpu_cpu_time_correlation_sample;
 #include "pmu_gk20a.h"
 #include "priv_ring_gk20a.h"
 #include "therm_gk20a.h"
-#include "cde_gk20a.h"
 #include "sched_gk20a.h"
 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 #include "clk/clk.h"
@@ -928,6 +927,7 @@ struct gpu_ops {
        struct {
                void (*get_program_numbers)(struct gk20a *g,
                                            u32 block_height_log2,
+                                            u32 shader_parameter,
                                            int *hprog, int *vprog);
                bool (*need_scatter_buffer)(struct gk20a *g);
                int (*populate_scatter_buffer)(struct gk20a *g,
@@ -1217,7 +1217,6 @@ struct gk20a {
        struct gk20a_sched_ctrl sched_ctrl;
-        struct gk20a_cde_app cde_app;
        bool mmu_debug_ctrl;
        u32 tpc_fs_mask_user;
diff --git a/drivers/gpu/nvgpu/gm20b/cde_gm20b.c b/drivers/gpu/nvgpu/gm20b/cde_gm20b.c
index f8267d1d..de7cf872 100644
--- a/drivers/gpu/nvgpu/gm20b/cde_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/cde_gm20b.c
@@ -28,15 +28,16 @@ enum programs {
 void gm20b_cde_get_program_numbers(struct gk20a *g,
                                          u32 block_height_log2,
+                                          u32 shader_parameter,
                                          int *hprog_out, int *vprog_out)
 {
        int hprog = PROG_HPASS;
        int vprog = (block_height_log2 >= 2) ?
                PROG_VPASS_LARGE : PROG_VPASS_SMALL;
-        if (g->cde_app.shader_parameter == 1) {
+        if (shader_parameter == 1) {
                hprog = PROG_PASSTHROUGH;
                vprog = PROG_PASSTHROUGH;
-        } else if (g->cde_app.shader_parameter == 2) {
+        } else if (shader_parameter == 2) {
                hprog = PROG_HPASS_DEBUG;
                vprog = (block_height_log2 >= 2) ?
                        PROG_VPASS_LARGE_DEBUG :
diff --git a/drivers/gpu/nvgpu/gm20b/cde_gm20b.h b/drivers/gpu/nvgpu/gm20b/cde_gm20b.h
index f2ea20a0..0ea423ad 100644
--- a/drivers/gpu/nvgpu/gm20b/cde_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/cde_gm20b.h
@@ -20,6 +20,7 @@ struct gk20a;
 void gm20b_cde_get_program_numbers(struct gk20a *g,
                                          u32 block_height_log2,
+                                          u32 shader_parameter,
                                          int *hprog_out, int *vprog_out);
 #endif
diff --git a/drivers/gpu/nvgpu/gp10b/cde_gp10b.c b/drivers/gpu/nvgpu/gp10b/cde_gp10b.c
index 685ddbc4..1ddbcba6 100644
--- a/drivers/gpu/nvgpu/gp10b/cde_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/cde_gp10b.c
@@ -32,17 +32,18 @@ enum gp10b_programs {
 void gp10b_cde_get_program_numbers(struct gk20a *g,
                                          u32 block_height_log2,
+                                          u32 shader_parameter,
                                          int *hprog_out, int *vprog_out)
 {
        int hprog, vprog;
-        if (g->cde_app.shader_parameter == 1) {
+        if (shader_parameter == 1) {
                hprog = GP10B_PROG_PASSTHROUGH;
                vprog = GP10B_PROG_PASSTHROUGH;
        } else {
                hprog = GP10B_PROG_HPASS;
                vprog = GP10B_PROG_VPASS;
-                if (g->cde_app.shader_parameter == 2) {
+                if (shader_parameter == 2) {
                        hprog = GP10B_PROG_HPASS_DEBUG;
                        vprog = GP10B_PROG_VPASS_DEBUG;
                }
diff --git a/drivers/gpu/nvgpu/gp10b/cde_gp10b.h b/drivers/gpu/nvgpu/gp10b/cde_gp10b.h
index 3ee6027c..7ccfe560 100644
--- a/drivers/gpu/nvgpu/gp10b/cde_gp10b.h
+++ b/drivers/gpu/nvgpu/gp10b/cde_gp10b.h
@@ -21,6 +21,7 @@ struct sg_table;
 void gp10b_cde_get_program_numbers(struct gk20a *g,
                                          u32 block_height_log2,
+                                          u32 shader_parameter,
                                          int *hprog_out, int *vprog_out);
 bool gp10b_need_scatter_buffer(struct gk20a *g);
 int gp10b_populate_scatter_buffer(struct gk20a *g,