path: root/drivers
Diffstat (limited to 'drivers')
-rw-r--r--   drivers/gpu/nvgpu/gk20a/cde_gk20a.c            |   4
-rw-r--r--   drivers/gpu/nvgpu/gk20a/ce2_gk20a.c            | 617
-rw-r--r--   drivers/gpu/nvgpu/gk20a/ce2_gk20a.h            | 124
-rw-r--r--   drivers/gpu/nvgpu/gk20a/channel_gk20a.c        |   8
-rw-r--r--   drivers/gpu/nvgpu/gk20a/channel_gk20a.h        |   6
-rw-r--r--   drivers/gpu/nvgpu/gk20a/fifo_gk20a.c           |  27
-rw-r--r--   drivers/gpu/nvgpu/gk20a/fifo_gk20a.h           |   2
-rw-r--r--   drivers/gpu/nvgpu/gk20a/gk20a.c                |  13
-rw-r--r--   drivers/gpu/nvgpu/gk20a/gk20a.h                |   2
-rw-r--r--   drivers/gpu/nvgpu/gk20a/mm_gk20a.c             |  86
-rw-r--r--   drivers/gpu/nvgpu/gk20a/mm_gk20a.h             |   6
-rw-r--r--   drivers/gpu/nvgpu/gk20a/platform_gk20a.h       |   2
-rw-r--r--   drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c |   4
-rw-r--r--   drivers/gpu/nvgpu/pci.c                        |   2
14 files changed, 896 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index 4b84dc69..f5b68e72 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -1186,7 +1186,9 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 	}
 
 	ch = gk20a_open_new_channel_with_cb(g, gk20a_cde_finished_ctx_cb,
-			cde_ctx);
+			cde_ctx,
+			-1,
+			false);
 	if (!ch) {
 		gk20a_warn(cde_ctx->dev, "cde: gk20a channel not available");
 		err = -ENOMEM;
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index 96d38b11..e2f2d9e9 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -24,6 +24,7 @@
 #include <trace/events/gk20a.h>
 #include <linux/dma-mapping.h>
 #include <linux/nvhost.h>
+#include <linux/debugfs.h>
 
 #include "gk20a.h"
 #include "debug_gk20a.h"
@@ -96,3 +97,619 @@ void gk20a_init_ce2(struct gpu_ops *gops)
 	gops->ce2.isr_stall = gk20a_ce2_isr;
 	gops->ce2.isr_nonstall = gk20a_ce2_nonstall_isr;
 }
100
101/* static CE app api */
102static void gk20a_ce_notify_all_user(struct gk20a *g, u32 event)
103{
104 struct gk20a_ce_app *ce_app = &g->ce_app;
105 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
106
107 if (!ce_app->initialised)
108 return;
109
110 mutex_lock(&ce_app->app_mutex);
111
112 list_for_each_entry_safe(ce_ctx, ce_ctx_save,
113 &ce_app->allocated_contexts, list) {
114 if (ce_ctx->user_event_callback) {
115 ce_ctx->user_event_callback(ce_ctx->ctx_id,
116 event);
117 }
118 }
119
120 mutex_unlock(&ce_app->app_mutex);
121}
122
123static void gk20a_ce_finished_ctx_cb(struct channel_gk20a *ch, void *data)
124{
125 struct gk20a_gpu_ctx *ce_ctx = data;
126 bool channel_idle;
127 u32 event;
128
129 mutex_lock(&ch->jobs_lock);
130 channel_idle = list_empty(&ch->jobs);
131 mutex_unlock(&ch->jobs_lock);
132
133 if (!channel_idle)
134 return;
135
136 gk20a_dbg(gpu_dbg_fn, "ce: finished %p", ce_ctx);
137
138 if (ch->has_timedout)
139 event = NVGPU_CE_CONTEXT_JOB_TIMEDOUT;
140 else
141 event = NVGPU_CE_CONTEXT_JOB_COMPLETED;
142
143 if (ce_ctx->user_event_callback)
144 ce_ctx->user_event_callback(ce_ctx->ctx_id,
145 event);
146
147 ++ce_ctx->completed_seq_number;
148}
149
150static void gk20a_ce_free_command_buffer_stored_fence(struct gk20a_gpu_ctx *ce_ctx)
151{
152 u32 cmd_buf_index;
153 u32 cmd_buf_read_offset;
154 u32 fence_index;
155 u32 *cmd_buf_cpu_va;
156
157 for (cmd_buf_index = 0;
158 cmd_buf_index < ce_ctx->cmd_buf_end_queue_offset;
159 cmd_buf_index++) {
160 cmd_buf_read_offset = (cmd_buf_index *
161 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
162
163		/* the end of each command buffer slot holds a gk20a_fence pointer for command buffer sync */
164 fence_index = (cmd_buf_read_offset +
165 ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
166 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
167
168 cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
169
170 /* 0 is treated as invalid pre-sync */
171 if (cmd_buf_cpu_va[fence_index]) {
172 struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
173
174 memcpy((void *)&ce_cmd_buf_fence_in,
175 (void *)(cmd_buf_cpu_va + fence_index),
176 sizeof(struct gk20a_fence *));
177 gk20a_fence_put(ce_cmd_buf_fence_in);
178 /* Reset the stored last pre-sync */
179 memset((void *)(cmd_buf_cpu_va + fence_index),
180 0,
181 NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
182 }
183 }
184}
185
186/* this function must be called with ce_app->app_mutex held */
187static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
188{
189 ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED;
190
191 mutex_lock(&ce_ctx->gpu_ctx_mutex);
192
193 gk20a_ce_free_command_buffer_stored_fence(ce_ctx);
194
195 gk20a_gmmu_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
196
197 /* free the channel */
198 if (ce_ctx->ch)
199 gk20a_channel_close(ce_ctx->ch);
200
201 /* housekeeping on app */
202 list_del(&ce_ctx->list);
203
204 mutex_unlock(&ce_ctx->gpu_ctx_mutex);
205 mutex_destroy(&ce_ctx->gpu_ctx_mutex);
206
207 kfree(ce_ctx);
208}
209
210static inline int gk20a_ce_get_method_size(int request_operation)
211{
212 /* failure size */
213 int methodsize = ~0;
214
215 if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER)
216 methodsize = 10 * 2 * sizeof(u32);
217 else if (request_operation & NVGPU_CE_MEMSET)
218 methodsize = 9 * 2 * sizeof(u32);
219
220 return methodsize;
221}
222
223static inline int gk20a_get_valid_launch_flags(struct gk20a *g, int launch_flags)
224{
225	/* if no local memory (vidmem) is available,
226	   don't allow local-memory-related CE flags */
227 if (!g->mm.vidmem_size) {
228 launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
229 NVGPU_CE_DST_LOCATION_LOCAL_FB);
230 }
231 return launch_flags;
232}
233
234static int gk20a_ce_prepare_submit(u64 src_buf,
235 u64 dst_buf,
236 u64 size,
237 u32 *cmd_buf_cpu_va,
238 u32 max_cmd_buf_size,
239 unsigned int payload,
240 int launch_flags,
241 int request_operation,
242 u32 dma_copy_class,
243 struct gk20a_fence *gk20a_fence_in)
244{
245 u32 launch = 0;
246 u32 methodSize = 0;
247
248 /* failure case handling */
249 if ((gk20a_ce_get_method_size(request_operation) > max_cmd_buf_size) ||
250 (!size) ||
251 (request_operation > NVGPU_CE_MEMSET))
252 return 0;
253
254 /* set the channel object */
255 cmd_buf_cpu_va[methodSize++] = 0x20018000;
256 cmd_buf_cpu_va[methodSize++] = dma_copy_class;
257
258 if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) {
259 /* setup the source */
260 cmd_buf_cpu_va[methodSize++] = 0x20018101;
261 cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf) &
262 NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
263
264 cmd_buf_cpu_va[methodSize++] = 0x20018100;
265 cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf) &
266 NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
267
268 cmd_buf_cpu_va[methodSize++] = 0x20018098;
269 if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) {
270 cmd_buf_cpu_va[methodSize++] = 0x00000000;
271 } else if (launch_flags & NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) {
272 cmd_buf_cpu_va[methodSize++] = 0x00000002;
273 } else {
274 cmd_buf_cpu_va[methodSize++] = 0x00000001;
275 }
276
277 launch |= 0x00001000;
278 } else if (request_operation & NVGPU_CE_MEMSET) {
279 cmd_buf_cpu_va[methodSize++] = 0x200181c2;
280 cmd_buf_cpu_va[methodSize++] = 0x00030004;
281
282 cmd_buf_cpu_va[methodSize++] = 0x200181c0;
283 cmd_buf_cpu_va[methodSize++] = payload;
284
285 launch |= 0x00000400;
286
287 /* converted into number of words */
288 size /= sizeof(u32);
289 }
290
291 /* setup the destination/output */
292 cmd_buf_cpu_va[methodSize++] = 0x20018103;
293 cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
294
295 cmd_buf_cpu_va[methodSize++] = 0x20018102;
296 cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
297
298 cmd_buf_cpu_va[methodSize++] = 0x20018099;
299 if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) {
300 cmd_buf_cpu_va[methodSize++] = 0x00000000;
301 } else if (launch_flags & NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) {
302 cmd_buf_cpu_va[methodSize++] = 0x00000002;
303 } else {
304 cmd_buf_cpu_va[methodSize++] = 0x00000001;
305 }
306
307 launch |= 0x00002000;
308
309 /* setup the format */
310 cmd_buf_cpu_va[methodSize++] = 0x20018107;
311 cmd_buf_cpu_va[methodSize++] = 1;
312 cmd_buf_cpu_va[methodSize++] = 0x20018106;
313 cmd_buf_cpu_va[methodSize++] = u64_lo32(size);
314
315 launch |= 0x00000004;
316
317 if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR)
318 launch |= 0x00000000;
319 else
320 launch |= 0x00000080;
321
322 if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR)
323 launch |= 0x00000000;
324 else
325 launch |= 0x00000100;
326
327 if (launch_flags & NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED)
328 launch |= 0x00000002;
329 else
330 launch |= 0x00000001;
331
332 cmd_buf_cpu_va[methodSize++] = 0x200180c0;
333 cmd_buf_cpu_va[methodSize++] = launch;
334
335 return methodSize;
336}
337
338/* global CE app related apis */
339int gk20a_init_ce_support(struct gk20a *g)
340{
341 struct gk20a_ce_app *ce_app = &g->ce_app;
342
343 if (ce_app->initialised) {
344		/* this can happen during the GPU poweroff/poweron sequence */
345 ce_app->app_state = NVGPU_CE_ACTIVE;
346 gk20a_ce_notify_all_user(g, NVGPU_CE_CONTEXT_RESUME);
347 return 0;
348 }
349
350 gk20a_dbg(gpu_dbg_fn, "ce: init");
351
352 mutex_init(&ce_app->app_mutex);
353 mutex_lock(&ce_app->app_mutex);
354
355 INIT_LIST_HEAD(&ce_app->allocated_contexts);
356 ce_app->ctx_count = 0;
357 ce_app->next_ctx_id = 0;
358 ce_app->initialised = true;
359 ce_app->app_state = NVGPU_CE_ACTIVE;
360
361 mutex_unlock(&ce_app->app_mutex);
362 gk20a_dbg(gpu_dbg_cde_ctx, "ce: init finished");
363
364 return 0;
365}
366
367void gk20a_ce_destroy(struct gk20a *g)
368{
369 struct gk20a_ce_app *ce_app = &g->ce_app;
370 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
371
372 if (!ce_app->initialised)
373 return;
374
375 ce_app->app_state = NVGPU_CE_SUSPEND;
376 ce_app->initialised = false;
377
378 mutex_lock(&ce_app->app_mutex);
379
380 list_for_each_entry_safe(ce_ctx, ce_ctx_save,
381 &ce_app->allocated_contexts, list) {
382 gk20a_ce_delete_gpu_context(ce_ctx);
383 }
384
385 INIT_LIST_HEAD(&ce_app->allocated_contexts);
386 ce_app->ctx_count = 0;
387 ce_app->next_ctx_id = 0;
388
389 mutex_unlock(&ce_app->app_mutex);
390 mutex_destroy(&ce_app->app_mutex);
391}
392
393void gk20a_ce_suspend(struct gk20a *g)
394{
395 struct gk20a_ce_app *ce_app = &g->ce_app;
396
397 if (!ce_app->initialised)
398 return;
399
400 ce_app->app_state = NVGPU_CE_SUSPEND;
401 gk20a_ce_notify_all_user(g, NVGPU_CE_CONTEXT_SUSPEND);
402
403 return;
404}
405
406/* CE app utility functions */
407u32 gk20a_ce_create_context_with_cb(struct device *dev,
408 int runlist_id,
409 int priority,
410 int timeslice,
411 int runlist_level,
412 ce_event_callback user_event_callback)
413{
414 struct gk20a_gpu_ctx *ce_ctx;
415 struct gk20a *g = gk20a_from_dev(dev);
416 struct gk20a_ce_app *ce_app = &g->ce_app;
417 u32 ctx_id = ~0;
418 int err = 0;
419
420 if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE)
421 return ctx_id;
422
423 ce_ctx = kzalloc(sizeof(*ce_ctx), GFP_KERNEL);
424 if (!ce_ctx)
425 return ctx_id;
426
427 mutex_init(&ce_ctx->gpu_ctx_mutex);
428
429 ce_ctx->g = g;
430 ce_ctx->dev = g->dev;
431 ce_ctx->user_event_callback = user_event_callback;
432
433 ce_ctx->cmd_buf_read_queue_offset = 0;
434 ce_ctx->cmd_buf_end_queue_offset =
435 (NVGPU_CE_COMMAND_BUF_SIZE / NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF);
436
437 ce_ctx->submitted_seq_number = 0;
438 ce_ctx->completed_seq_number = 0;
439
440	/* kernel clients always need a privileged channel */
441 ce_ctx->ch = gk20a_open_new_channel_with_cb(g, gk20a_ce_finished_ctx_cb,
442 ce_ctx,
443 runlist_id,
444 true);
445 if (!ce_ctx->ch) {
446 gk20a_err(ce_ctx->dev, "ce: gk20a channel not available");
447 goto end;
448 }
449
450 /* bind the channel to the vm */
451 gk20a_vm_get(&g->mm.ce.vm);
452 ce_ctx->vm = ce_ctx->ch->vm = &g->mm.ce.vm;
453 err = channel_gk20a_commit_va(ce_ctx->ch);
454 if (err) {
455 gk20a_err(ce_ctx->dev, "ce: could not bind vm");
456 goto end;
457 }
458
459 /* allocate gpfifo (1024 should be more than enough) */
460 err = gk20a_alloc_channel_gpfifo(ce_ctx->ch,
461 &(struct nvgpu_alloc_gpfifo_args){1024, 0});
462 if (err) {
463 gk20a_err(ce_ctx->dev, "ce: unable to allocate gpfifo");
464 goto end;
465 }
466
467	/* allocate command buffer (4096 bytes should be more than enough) from sysmem */
468 err = gk20a_gmmu_alloc_map_sys(ce_ctx->vm, NVGPU_CE_COMMAND_BUF_SIZE, &ce_ctx->cmd_buf_mem);
469 if (err) {
470 gk20a_err(ce_ctx->dev,
471 "ce: could not allocate command buffer for CE context");
472 goto end;
473 }
474
475 memset(ce_ctx->cmd_buf_mem.cpu_va, 0x00, ce_ctx->cmd_buf_mem.size);
476
477 /* -1 means default channel priority */
478 if (priority != -1) {
479 err = gk20a_channel_set_priority(ce_ctx->ch, priority);
480 if (err) {
481 gk20a_err(ce_ctx->dev,
482 "ce: could not set the channel priority for CE context");
483 goto end;
484 }
485 }
486
487 /* -1 means default channel timeslice value */
488 if (timeslice != -1) {
489 err = gk20a_channel_set_timeslice(ce_ctx->ch, timeslice);
490 if (err) {
491 gk20a_err(ce_ctx->dev,
492 "ce: could not set the channel timeslice value for CE context");
493 goto end;
494 }
495 }
496
497 /* -1 means default channel runlist level */
498 if (runlist_level != -1) {
499 err = gk20a_channel_set_runlist_interleave(ce_ctx->ch, runlist_level);
500 if (err) {
501 gk20a_err(ce_ctx->dev,
502 "ce: could not set the runlist interleave for CE context");
503 goto end;
504 }
505 }
506
507 mutex_lock(&ce_app->app_mutex);
508 ctx_id = ce_ctx->ctx_id = ce_app->next_ctx_id;
509 list_add(&ce_ctx->list, &ce_app->allocated_contexts);
510 ++ce_app->next_ctx_id;
511 ++ce_app->ctx_count;
512 mutex_unlock(&ce_app->app_mutex);
513
514 ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_ALLOCATED;
515
516end:
517 if (ctx_id == ~0) {
518 mutex_lock(&ce_app->app_mutex);
519 gk20a_ce_delete_gpu_context(ce_ctx);
520 mutex_unlock(&ce_app->app_mutex);
521 }
522 return ctx_id;
523
524}
525EXPORT_SYMBOL(gk20a_ce_create_context_with_cb);
526
527int gk20a_ce_execute_ops(struct device *dev,
528 u32 ce_ctx_id,
529 u64 src_buf,
530 u64 dst_buf,
531 u64 size,
532 unsigned int payload,
533 int launch_flags,
534 int request_operation,
535 struct gk20a_fence *gk20a_fence_in,
536 u32 submit_flags,
537 struct gk20a_fence **gk20a_fence_out)
538{
539 int ret = -EPERM;
540 struct gk20a *g = gk20a_from_dev(dev);
541 struct gk20a_ce_app *ce_app = &g->ce_app;
542 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
543 bool found = false;
544 u32 *cmd_buf_cpu_va;
545 u64 cmd_buf_gpu_va = 0;
546 u32 methodSize;
547 u32 cmd_buf_read_offset;
548 u32 fence_index;
549 struct nvgpu_gpfifo gpfifo;
550 struct nvgpu_fence fence = {0,0};
551 struct gk20a_fence *ce_cmd_buf_fence_out = NULL;
552 struct nvgpu_gpu_characteristics *gpu_capability = &g->gpu_characteristics;
553
554	if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE)
555 goto end;
556
557 mutex_lock(&ce_app->app_mutex);
558
559 list_for_each_entry_safe(ce_ctx, ce_ctx_save,
560 &ce_app->allocated_contexts, list) {
561 if (ce_ctx->ctx_id == ce_ctx_id) {
562 found = true;
563 break;
564 }
565 }
566
567 mutex_unlock(&ce_app->app_mutex);
568
569 if (!found) {
570 ret = -EINVAL;
571 goto end;
572 }
573
574 if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
575 ret = -ENODEV;
576 goto end;
577 }
578
579 mutex_lock(&ce_ctx->gpu_ctx_mutex);
580
581 ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset;
582
583 cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
584 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
585
586	/* the end of each command buffer slot holds a gk20a_fence pointer for command buffer sync */
587 fence_index = (cmd_buf_read_offset +
588 ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
589 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
590
591 if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) {
592 ret = -ENOMEM;
593 goto noop;
594 }
595
596 cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
597
598 /* 0 is treated as invalid pre-sync */
599 if (cmd_buf_cpu_va[fence_index]) {
600 struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
601
602 memcpy((void *)&ce_cmd_buf_fence_in,
603 (void *)(cmd_buf_cpu_va + fence_index),
604 sizeof(struct gk20a_fence *));
605 ret = gk20a_fence_wait(ce_cmd_buf_fence_in, gk20a_get_gr_idle_timeout(g));
606
607 gk20a_fence_put(ce_cmd_buf_fence_in);
608 /* Reset the stored last pre-sync */
609 memset((void *)(cmd_buf_cpu_va + fence_index),
610 0,
611 NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
612 if (ret)
613 goto noop;
614 }
615
616 cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va + (u64)(cmd_buf_read_offset *sizeof(u32)));
617
618 methodSize = gk20a_ce_prepare_submit(src_buf,
619 dst_buf,
620 size,
621 &cmd_buf_cpu_va[cmd_buf_read_offset],
622 NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF,
623 payload,
624 gk20a_get_valid_launch_flags(g, launch_flags),
625 request_operation,
626 gpu_capability->dma_copy_class,
627 gk20a_fence_in);
628
629 if (methodSize) {
630 /* TODO: Remove CPU pre-fence wait */
631 if (gk20a_fence_in) {
632 ret = gk20a_fence_wait(gk20a_fence_in, gk20a_get_gr_idle_timeout(g));
633 gk20a_fence_put(gk20a_fence_in);
634 if (ret)
635 goto noop;
636 }
637
638 /* store the element into gpfifo */
639 gpfifo.entry0 =
640 u64_lo32(cmd_buf_gpu_va);
641 gpfifo.entry1 =
642 (u64_hi32(cmd_buf_gpu_va) |
643 pbdma_gp_entry1_length_f(methodSize));
644
645		/* always take the postfence, as it is needed to protect the CE context */
646 submit_flags |= NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET;
647
648 wmb();
649
650 ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL,
651 1, submit_flags, &fence, &ce_cmd_buf_fence_out, true);
652
653 if (!ret) {
654 memcpy((void *)(cmd_buf_cpu_va + fence_index),
655 (void *)&ce_cmd_buf_fence_out,
656 sizeof(struct gk20a_fence *));
657
658 if (gk20a_fence_out) {
659 gk20a_fence_get(ce_cmd_buf_fence_out);
660 *gk20a_fence_out = ce_cmd_buf_fence_out;
661 }
662
663 /* Next available command buffer queue Index */
664 ++ce_ctx->cmd_buf_read_queue_offset;
665 ++ce_ctx->submitted_seq_number;
666 }
667 } else
668 ret = -ENOMEM;
669noop:
670 mutex_unlock(&ce_ctx->gpu_ctx_mutex);
671end:
672 return ret;
673}
674EXPORT_SYMBOL(gk20a_ce_execute_ops);
675
676void gk20a_ce_delete_context(struct device *dev,
677 u32 ce_ctx_id)
678{
679 struct gk20a *g = gk20a_from_dev(dev);
680 struct gk20a_ce_app *ce_app = &g->ce_app;
681 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
682
683	if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE)
684 return;
685
686 mutex_lock(&ce_app->app_mutex);
687
688 list_for_each_entry_safe(ce_ctx, ce_ctx_save,
689 &ce_app->allocated_contexts, list) {
690 if (ce_ctx->ctx_id == ce_ctx_id) {
691 gk20a_ce_delete_gpu_context(ce_ctx);
692 --ce_app->ctx_count;
693 break;
694 }
695 }
696
697 mutex_unlock(&ce_app->app_mutex);
698 return;
699}
700EXPORT_SYMBOL(gk20a_ce_delete_context);
701
702#ifdef CONFIG_DEBUG_FS
703void gk20a_ce_debugfs_init(struct device *dev)
704{
705 struct gk20a_platform *platform = dev_get_drvdata(dev);
706 struct gk20a *g = get_gk20a(dev);
707
708 debugfs_create_u32("ce_app_ctx_count", S_IWUSR | S_IRUGO,
709 platform->debugfs, &g->ce_app.ctx_count);
710 debugfs_create_u32("ce_app_state", S_IWUSR | S_IRUGO,
711 platform->debugfs, &g->ce_app.app_state);
712 debugfs_create_u32("ce_app_next_ctx_id", S_IWUSR | S_IRUGO,
713 platform->debugfs, &g->ce_app.next_ctx_id);
714}
715#endif
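
Note: the CE support added above manages its command buffer as a small ring. NVGPU_CE_COMMAND_BUF_SIZE (4096 bytes) is split into NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF (128-byte) slots, and the last NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING (8) bytes of each slot hold the gk20a_fence pointer stored by the previous submit that used the slot, which gk20a_ce_execute_ops() later waits on, puts, and clears. The standalone sketch below only illustrates that offset arithmetic; the helper name is hypothetical and the code is not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* values taken from ce2_gk20a.h in this patch */
#define CE_COMMAND_BUF_SIZE              4096u
#define CE_CMD_BUF_SIZE_PER_KICKOFF       128u
#define CE_CMD_BUF_SIZE_FOR_TRACING         8u

/* hypothetical helper: word offsets for a given ring slot, mirroring the
 * arithmetic in gk20a_ce_execute_ops() and
 * gk20a_ce_free_command_buffer_stored_fence() */
static void ce_slot_offsets(uint32_t slot, uint32_t *cmd_off, uint32_t *fence_off)
{
	uint32_t words_per_slot = CE_CMD_BUF_SIZE_PER_KICKOFF / sizeof(uint32_t);

	*cmd_off = slot * words_per_slot;
	/* the fence pointer lives in the last 8 bytes (2 words) of the slot */
	*fence_off = *cmd_off + words_per_slot -
		     CE_CMD_BUF_SIZE_FOR_TRACING / sizeof(uint32_t);
}

int main(void)
{
	uint32_t nr_slots = CE_COMMAND_BUF_SIZE / CE_CMD_BUF_SIZE_PER_KICKOFF;
	uint32_t cmd_off, fence_off;

	for (uint32_t slot = 0; slot < nr_slots; slot++) {
		ce_slot_offsets(slot, &cmd_off, &fence_off);
		printf("slot %2u: cmd word %4u, fence word %4u\n",
		       slot, cmd_off, fence_off);
	}
	return 0;
}

A zero value at the fence word is treated as "no stored pre-sync", which is why the driver memsets those bytes after putting the fence.
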
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
index 5ceb69e1..3b53834d 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
@@ -28,4 +28,128 @@ void gk20a_init_ce2(struct gpu_ops *gops);
 void gk20a_ce2_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
 void gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
 
31/* CE command utility macros */
32#define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff
33#define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff
34
35#define NVGPU_CE_COMMAND_BUF_SIZE 4096
36#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 128
37#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING 8
38
39typedef void (*ce_event_callback)(u32 ce_ctx_id, u32 ce_event_flag);
40
41/* dma launch_flags */
42enum {
43 /* location */
44 NVGPU_CE_SRC_LOCATION_COHERENT_SYSMEM = (1 << 0),
45 NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM = (1 << 1),
46 NVGPU_CE_SRC_LOCATION_LOCAL_FB = (1 << 2),
47 NVGPU_CE_DST_LOCATION_COHERENT_SYSMEM = (1 << 3),
48 NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM = (1 << 4),
49 NVGPU_CE_DST_LOCATION_LOCAL_FB = (1 << 5),
50
51 /* memory layout */
52 NVGPU_CE_SRC_MEMORY_LAYOUT_PITCH = (1 << 6),
53 NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR = (1 << 7),
54 NVGPU_CE_DST_MEMORY_LAYOUT_PITCH = (1 << 8),
55 NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR = (1 << 9),
56
57 /* transfer type */
58 NVGPU_CE_DATA_TRANSFER_TYPE_PIPELINED = (1 << 10),
59 NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED = (1 << 11),
60};
61
62/* CE operation mode */
63enum {
64 NVGPU_CE_PHYS_MODE_TRANSFER = (1 << 0),
65 NVGPU_CE_MEMSET = (1 << 1),
66};
67
68/* CE event flags */
69enum {
70 NVGPU_CE_CONTEXT_JOB_COMPLETED = (1 << 0),
71 NVGPU_CE_CONTEXT_JOB_TIMEDOUT = (1 << 1),
72 NVGPU_CE_CONTEXT_SUSPEND = (1 << 2),
73 NVGPU_CE_CONTEXT_RESUME = (1 << 3),
74};
75
76/* CE app state machine flags */
77enum {
78 NVGPU_CE_ACTIVE = (1 << 0),
79 NVGPU_CE_SUSPEND = (1 << 1),
80};
81
82/* gpu context state machine flags */
83enum {
84 NVGPU_CE_GPU_CTX_ALLOCATED = (1 << 0),
85 NVGPU_CE_GPU_CTX_DELETED = (1 << 1),
86};
87
88/* global ce app db */
89struct gk20a_ce_app {
90 bool initialised;
91 struct mutex app_mutex;
92 int app_state;
93
94 struct list_head allocated_contexts;
95 u32 ctx_count;
96 u32 next_ctx_id;
97};
98
99/* ce context db */
100struct gk20a_gpu_ctx {
101 struct gk20a *g;
102 struct device *dev;
103 u32 ctx_id;
104 struct mutex gpu_ctx_mutex;
105 int gpu_ctx_state;
106 ce_event_callback user_event_callback;
107
108 /* channel related data */
109 struct channel_gk20a *ch;
110 struct vm_gk20a *vm;
111
112 /* cmd buf mem_desc */
113 struct mem_desc cmd_buf_mem;
114
115 struct list_head list;
116
117 u64 submitted_seq_number;
118 u64 completed_seq_number;
119
120 u32 cmd_buf_read_queue_offset;
121 u32 cmd_buf_end_queue_offset;
122};
123
124/* global CE app related apis */
125int gk20a_init_ce_support(struct gk20a *g);
126void gk20a_ce_suspend(struct gk20a *g);
127void gk20a_ce_destroy(struct gk20a *g);
128
129/* CE app utility functions */
130u32 gk20a_ce_create_context_with_cb(struct device *dev,
131 int runlist_id,
132 int priority,
133 int timeslice,
134 int runlist_level,
135 ce_event_callback user_event_callback);
136int gk20a_ce_execute_ops(struct device *dev,
137 u32 ce_ctx_id,
138 u64 src_buf,
139 u64 dst_buf,
140 u64 size,
141 unsigned int payload,
142 int launch_flags,
143 int request_operation,
144 struct gk20a_fence *gk20a_fence_in,
145 u32 submit_flags,
146 struct gk20a_fence **gk20a_fence_out);
147void gk20a_ce_delete_context(struct device *dev,
148 u32 ce_ctx_id);
149
150#ifdef CONFIG_DEBUG_FS
151/* CE app debugfs api */
152void gk20a_ce_debugfs_init(struct device *dev);
153#endif
154
 #endif /*__CE2_GK20A_H__*/
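
Note: taken together, the declarations above suggest the intended kernel-internal flow: create a CE context on the fast copy-engine runlist, submit memset or copy operations that hand back a gk20a_fence, and delete the context on teardown. The vidmem page-clearing path added to mm_gk20a.c below follows exactly this pattern with NVGPU_CE_MEMSET. The sketch below shows a hypothetical caller using the physical-mode copy path instead; the function name, buffer addresses and error handling are placeholders and not part of the patch, and the addresses are assumed to be physical/IOVA addresses of the kind mm_gk20a.c obtains via g->ops.mm.get_iova_addr().

/*
 * Sketch only. Assumes the nvgpu headers below and a valid GPU
 * struct device *dev; src_pa/dst_pa are physical (IOVA) addresses,
 * as used by the NVGPU_CE_PHYS_MODE_TRANSFER path.
 */
#include "gk20a.h"
#include "ce2_gk20a.h"
#include "fifo_gk20a.h"

static int example_ce_copy(struct device *dev, u64 src_pa, u64 dst_pa, u64 size)
{
	struct gk20a *g = gk20a_from_dev(dev);
	struct gk20a_fence *fence_out = NULL;
	u32 ctx_id;
	int err;

	/* one CE context per client; -1 keeps the default priority,
	 * timeslice and runlist interleave level */
	ctx_id = gk20a_ce_create_context_with_cb(dev,
			gk20a_fifo_get_fast_ce_runlist_id(g),
			-1, -1, -1, NULL);
	if (ctx_id == (u32)~0)
		return -ENOMEM;

	/* pipelined physical-mode copy, with a postfence returned */
	err = gk20a_ce_execute_ops(dev, ctx_id,
			src_pa, dst_pa, size,
			0x00000000,	/* payload: only used for memset */
			NVGPU_CE_DATA_TRANSFER_TYPE_PIPELINED,
			NVGPU_CE_PHYS_MODE_TRANSFER,
			NULL, 0, &fence_out);
	if (!err && fence_out) {
		err = gk20a_fence_wait(fence_out, gk20a_get_gr_idle_timeout(g));
		gk20a_fence_put(fence_out);
	}

	gk20a_ce_delete_context(dev, ctx_id);
	return err;
}
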
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index d5457d10..447fe86a 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -702,7 +702,7 @@ static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch,
 	return 0;
 }
 
-static int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
+int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
 		u32 level)
 {
 	struct gk20a *g = ch->g;
@@ -1113,9 +1113,11 @@ static void gk20a_channel_update_runcb_fn(struct work_struct *work)
 
 struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g,
 		void (*update_fn)(struct channel_gk20a *, void *),
-		void *update_fn_data)
+		void *update_fn_data,
+		int runlist_id,
+		bool is_privileged_channel)
 {
-	struct channel_gk20a *ch = gk20a_open_new_channel(g, -1, false);
+	struct channel_gk20a *ch = gk20a_open_new_channel(g, runlist_id, is_privileged_channel);
 
 	if (ch) {
 		spin_lock(&ch->update_fn_lock);
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 4b5fe1b3..971175f2 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -265,7 +265,9 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
 		bool is_privileged_channel);
 struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g,
 		void (*update_fn)(struct channel_gk20a *, void *),
-		void *update_fn_data);
+		void *update_fn_data,
+		int runlist_id,
+		bool is_privileged_channel);
 void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a);
 
 int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
@@ -295,6 +297,8 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g,
 		int *__timeslice_timeout, int *__timeslice_scale);
 int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority);
 int gk20a_channel_set_timeslice(struct channel_gk20a *ch, u32 timeslice);
+int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
+		u32 level);
 void gk20a_channel_event_id_post_event(struct channel_gk20a *ch,
 		int event_id);
 
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 5133f86a..3dd7cb02 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -165,6 +165,33 @@ u32 gk20a_fifo_get_all_ce_engine_reset_mask(struct gk20a *g)
 	return reset_mask;
 }
 
+u32 gk20a_fifo_get_fast_ce_runlist_id(struct gk20a *g)
+{
+	u32 ce_runlist_id = gk20a_fifo_get_gr_runlist_id(g);
+	u32 engine_enum = ENGINE_INVAL_GK20A;
+	struct fifo_gk20a *f = NULL;
+	u32 engine_id_idx;
+	struct fifo_engine_info_gk20a *engine_info;
+	u32 active_engine_id = 0;
+
+	if (!g)
+		return ce_runlist_id;
+
+	f = &g->fifo;
+
+	for (engine_id_idx = 0; engine_id_idx < f->num_engines; ++engine_id_idx) {
+		active_engine_id = f->active_engines_list[engine_id_idx];
+		engine_info = &f->engine_info[active_engine_id];
+		engine_enum = engine_info->engine_enum;
+
+		/* select the last available ASYNC_CE engine, if any */
+		if (engine_enum == ENGINE_ASYNC_CE_GK20A)
+			ce_runlist_id = engine_info->runlist_id;
+	}
+
+	return ce_runlist_id;
+}
+
 u32 gk20a_fifo_get_gr_runlist_id(struct gk20a *g)
 {
 	u32 gr_engine_cnt = 0;
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 3473bc78..33d6d39c 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -244,6 +244,8 @@ u32 gk20a_fifo_get_gr_engine_id(struct gk20a *g);
 
 u32 gk20a_fifo_get_all_ce_engine_reset_mask(struct gk20a *g);
 
+u32 gk20a_fifo_get_fast_ce_runlist_id(struct gk20a *g);
+
 u32 gk20a_fifo_get_gr_runlist_id(struct gk20a *g);
 
 bool gk20a_fifo_is_valid_runlist_id(struct gk20a *g, u32 runlist_id);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 50f67262..04f82033 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -773,6 +773,7 @@ static int gk20a_pm_prepare_poweroff(struct device *dev)
 {
 	struct gk20a *g = get_gk20a(dev);
 	int ret = 0;
+	struct gk20a_platform *platform = gk20a_get_platform(dev);
 
 	gk20a_dbg_fn("");
 
@@ -786,6 +787,9 @@ static int gk20a_pm_prepare_poweroff(struct device *dev)
 	/* cancel any pending cde work */
 	gk20a_cde_suspend(g);
 
+	if (platform->has_ce)
+		gk20a_ce_suspend(g);
+
 	ret = gk20a_channel_suspend(g);
 	if (ret)
 		goto done;
@@ -996,6 +1000,11 @@ int gk20a_pm_finalize_poweron(struct device *dev)
 	if (platform->has_cde)
 		gk20a_init_cde_support(g);
 
+	if (platform->has_ce)
+		gk20a_init_ce_support(g);
+
+	gk20a_init_mm_ce_context(g);
+
 	enable_irq(g->irq_stall);
 	if (g->irq_stall != g->irq_nonstall)
 		enable_irq(g->irq_nonstall);
@@ -1658,6 +1667,7 @@ static int gk20a_probe(struct platform_device *dev)
 	gk20a_pmu_debugfs_init(&dev->dev);
 	gk20a_railgating_debugfs_init(&dev->dev);
 	gk20a_cde_debugfs_init(&dev->dev);
+	gk20a_ce_debugfs_init(&dev->dev);
 	gk20a_alloc_debugfs_init(dev);
 	gk20a_mm_debugfs_init(&dev->dev);
 	gk20a_fifo_debugfs_init(&dev->dev);
@@ -1693,6 +1703,9 @@ static int __exit gk20a_remove(struct platform_device *pdev)
 	if (g->remove_support)
 		g->remove_support(dev);
 
+	if (platform->has_ce)
+		gk20a_ce_destroy(g);
+
 	gk20a_user_deinit(dev, &nvgpu_class);
 
 	debugfs_remove_recursive(platform->debugfs);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 8aa8689b..03a698dc 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -864,6 +864,8 @@ struct gk20a {
 
 	struct nvgpu_bios bios;
 	struct debugfs_blob_wrapper bios_blob;
+
+	struct gk20a_ce_app ce_app;
 };
 
 static inline unsigned long gk20a_get_gr_idle_timeout(struct gk20a *g)
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 750ce10c..7b2174bc 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -393,7 +393,7 @@ static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm);
 static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm);
 static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm);
 static int __must_check gk20a_init_cde_vm(struct mm_gk20a *mm);
-
+static int __must_check gk20a_init_ce_vm(struct mm_gk20a *mm);
 
 struct gk20a_dmabuf_priv {
 	struct mutex lock;
@@ -702,6 +702,7 @@ void gk20a_remove_vm(struct vm_gk20a *vm, struct mem_desc *inst_block)
 static void gk20a_remove_mm_support(struct mm_gk20a *mm)
 {
 	struct gk20a *g = gk20a_from_mm(mm);
+	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
 
 	if (g->ops.mm.remove_bar2_vm)
 		g->ops.mm.remove_bar2_vm(g);
@@ -709,6 +710,14 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm)
 	gk20a_remove_vm(&mm->pmu.vm, &mm->pmu.inst_block);
 	gk20a_free_inst_block(gk20a_from_mm(mm), &mm->hwpm.inst_block);
 	gk20a_vm_remove_support_nofree(&mm->cde.vm);
+
+	if (mm->ce_vidmem_ctx_id != ~0)
+		gk20a_ce_delete_context(g->dev, mm->ce_vidmem_ctx_id);
+
+	mm->ce_vidmem_ctx_id = ~0;
+
+	if (platform->has_ce)
+		gk20a_vm_remove_support_nofree(&mm->ce.vm);
 }
 
 static int gk20a_alloc_sysmem_flush(struct gk20a *g)
@@ -754,6 +763,7 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
 {
 	struct mm_gk20a *mm = &g->mm;
 	int err;
+	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
 
 	gk20a_dbg_fn("");
 
@@ -775,6 +785,8 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
 
 	gk20a_init_pramin(mm);
 
+	mm->ce_vidmem_ctx_id = ~0;
+
 	err = gk20a_init_vidmem(mm);
 	if (err)
 		return err;
@@ -804,6 +816,12 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
 	if (err)
 		return err;
 
+	if (platform->has_ce) {
+		err = gk20a_init_ce_vm(mm);
+		if (err)
+			return err;
+	}
+
 	/* set vm_alloc_share op here as gk20a_as_alloc_share needs it */
 	g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share;
 	mm->remove_support = gk20a_remove_mm_support;
@@ -881,6 +899,25 @@ int gk20a_init_mm_support(struct gk20a *g)
 	return err;
 }
 
+void gk20a_init_mm_ce_context(struct gk20a *g)
+{
+#if defined(CONFIG_GK20A_VIDMEM)
+	if (g->mm.vidmem_size && (g->mm.ce_vidmem_ctx_id == ~0)) {
+		g->mm.ce_vidmem_ctx_id =
+			gk20a_ce_create_context_with_cb(g->dev,
+				gk20a_fifo_get_fast_ce_runlist_id(g),
+				-1,
+				-1,
+				-1,
+				NULL);
+
+		if (g->mm.ce_vidmem_ctx_id == ~0)
+			gk20a_err(g->dev,
+				"Failed to allocate CE context for vidmem page clearing support");
+	}
+#endif
+}
+
 static int alloc_gmmu_phys_pages(struct vm_gk20a *vm, u32 order,
 			struct gk20a_mm_entry *entry)
 {
@@ -2484,6 +2521,7 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 	struct device *d = &g->mm.vidmem_dev;
 	int err;
 	dma_addr_t iova;
+	bool need_pramin_access = true;
 	DEFINE_DMA_ATTRS(attrs);
 
 	gk20a_dbg_fn("");
@@ -2519,7 +2557,38 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 	mem->size = size;
 	mem->aperture = APERTURE_VIDMEM;
 
-	gk20a_memset(g, mem, 0, 0, size);
+	if (g->mm.ce_vidmem_ctx_id != ~0) {
+		struct gk20a_fence *gk20a_fence_out = NULL;
+		u64 dst_bufbase = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
+
+		err = gk20a_ce_execute_ops(g->dev,
+				g->mm.ce_vidmem_ctx_id,
+				0,
+				dst_bufbase,
+				(u64)size,
+				0x00000000,
+				NVGPU_CE_DST_LOCATION_LOCAL_FB,
+				NVGPU_CE_MEMSET,
+				NULL,
+				0,
+				&gk20a_fence_out);
+
+		if (!err) {
+			if (gk20a_fence_out) {
+				err = gk20a_fence_wait(gk20a_fence_out, gk20a_get_gr_idle_timeout(g));
+				gk20a_fence_put(gk20a_fence_out);
+				if (err)
+					gk20a_err(g->dev,
+						"Failed to get the fence_out from CE execute ops");
+				else
+					need_pramin_access = false;
+			}
+		} else
+			gk20a_err(g->dev, "Failed gk20a_ce_execute_ops[%d]", err);
+	}
+
+	if (need_pramin_access)
+		gk20a_memset(g, mem, 0, 0, size);
 
 	gk20a_dbg_fn("done");
 
@@ -4125,6 +4194,19 @@ static int gk20a_init_cde_vm(struct mm_gk20a *mm)
 			false, false, "cde");
 }
 
+static int gk20a_init_ce_vm(struct mm_gk20a *mm)
+{
+	struct vm_gk20a *vm = &mm->ce.vm;
+	struct gk20a *g = gk20a_from_mm(mm);
+	u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;
+
+	return gk20a_init_vm(mm, vm, big_page_size,
+			SZ_4K * 16,
+			NV_MM_DEFAULT_KERNEL_SIZE,
+			NV_MM_DEFAULT_KERNEL_SIZE + NV_MM_DEFAULT_USER_SIZE,
+			false, false, "ce");
+}
+
 void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *inst_block,
 		struct vm_gk20a *vm)
 {
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 66e46480..184c1f71 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -307,6 +307,7 @@ int gk20a_init_mm_support(struct gk20a *g);
 int gk20a_init_mm_setup_sw(struct gk20a *g);
 int gk20a_init_mm_setup_hw(struct gk20a *g);
 void gk20a_mm_debugfs_init(struct device *dev);
+void gk20a_init_mm_ce_context(struct gk20a *g);
 
 int gk20a_mm_fb_flush(struct gk20a *g);
 void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate);
@@ -349,6 +350,10 @@ struct mm_gk20a {
 		struct vm_gk20a vm;
 	} cde;
 
+	struct {
+		struct vm_gk20a vm;
+	} ce;
+
 	struct mutex l2_op_lock;
 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 	struct mem_desc bar2_desc;
@@ -388,6 +393,7 @@ struct mm_gk20a {
 
 	size_t vidmem_size;
 	struct device vidmem_dev;
+	u32 ce_vidmem_ctx_id;
 };
 
 int gk20a_mm_init(struct mm_gk20a *mm);
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
index 543f9873..5bde3439 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
@@ -209,6 +209,8 @@ struct gk20a_platform {
 
 	bool has_cde;
 
+	bool has_ce;
+
 	/* soc name for finding firmware files */
 	const char *soc_name;
 
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
index 2ed6df43..745d963c 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
@@ -900,6 +900,8 @@ struct gk20a_platform gk20a_tegra_platform = {
 	.secure_page_alloc = gk20a_tegra_secure_page_alloc,
 	.dump_platform_dependencies = gk20a_tegra_debug_dump,
 
+	.has_ce = true,
+
 	.soc_name = "tegra12x",
 
 	.vidmem_is_vidmem = false,
@@ -962,6 +964,8 @@ struct gk20a_platform gm20b_tegra_platform = {
 
 	.has_cde = true,
 
+	.has_ce = true,
+
 	.soc_name = "tegra21x",
 
 	.vidmem_is_vidmem = false,
diff --git a/drivers/gpu/nvgpu/pci.c b/drivers/gpu/nvgpu/pci.c
index ea6f3b4c..fcf63ddc 100644
--- a/drivers/gpu/nvgpu/pci.c
+++ b/drivers/gpu/nvgpu/pci.c
@@ -56,6 +56,8 @@ static struct gk20a_platform nvgpu_pci_device = {
 
 	.ch_wdt_timeout_ms = 7000,
 	.disable_bigpage = true,
+
+	.has_ce = true,
 };
 
 static struct pci_device_id nvgpu_pci_table[] = {