From 89aecd1202b49727e940069f2a6feb5c3cf4c927 Mon Sep 17 00:00:00 2001 From: Lakshmanan M Date: Wed, 29 Jun 2016 16:06:39 +0530 Subject: gpu: nvgpu: Add nvgpu infra to allow kernel to create privileged CE channels Added interface to allow kernel to create privileged CE channels for page migration and clearing support between sysmem and videmem. JIRA DNVGPU-53 Change-Id: I3e18d18403809c9e64fa45d40b6c4e3844992506 Signed-off-by: Lakshmanan M Reviewed-on: http://git-master/r/1173085 GVS: Gerrit_Virtual_Submit Reviewed-by: Vijayakumar Subbu --- drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 4 +- drivers/gpu/nvgpu/gk20a/ce2_gk20a.c | 617 +++++++++++++++++++++++++ drivers/gpu/nvgpu/gk20a/ce2_gk20a.h | 124 +++++ drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 8 +- drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 6 +- drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 27 ++ drivers/gpu/nvgpu/gk20a/fifo_gk20a.h | 2 + drivers/gpu/nvgpu/gk20a/gk20a.c | 13 + drivers/gpu/nvgpu/gk20a/gk20a.h | 2 + drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 86 +++- drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 6 + drivers/gpu/nvgpu/gk20a/platform_gk20a.h | 2 + drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c | 4 + drivers/gpu/nvgpu/pci.c | 2 + 14 files changed, 896 insertions(+), 7 deletions(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c index 4b84dc69..f5b68e72 100644 --- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c @@ -1186,7 +1186,9 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx) } ch = gk20a_open_new_channel_with_cb(g, gk20a_cde_finished_ctx_cb, - cde_ctx); + cde_ctx, + -1, + false); if (!ch) { gk20a_warn(cde_ctx->dev, "cde: gk20a channel not available"); err = -ENOMEM; diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c index 96d38b11..e2f2d9e9 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "gk20a.h" #include "debug_gk20a.h" @@ -96,3 +97,619 @@ void gk20a_init_ce2(struct gpu_ops *gops) gops->ce2.isr_stall = gk20a_ce2_isr; gops->ce2.isr_nonstall = gk20a_ce2_nonstall_isr; } + +/* static CE app api */ +static void gk20a_ce_notify_all_user(struct gk20a *g, u32 event) +{ + struct gk20a_ce_app *ce_app = &g->ce_app; + struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save; + + if (!ce_app->initialised) + return; + + mutex_lock(&ce_app->app_mutex); + + list_for_each_entry_safe(ce_ctx, ce_ctx_save, + &ce_app->allocated_contexts, list) { + if (ce_ctx->user_event_callback) { + ce_ctx->user_event_callback(ce_ctx->ctx_id, + event); + } + } + + mutex_unlock(&ce_app->app_mutex); +} + +static void gk20a_ce_finished_ctx_cb(struct channel_gk20a *ch, void *data) +{ + struct gk20a_gpu_ctx *ce_ctx = data; + bool channel_idle; + u32 event; + + mutex_lock(&ch->jobs_lock); + channel_idle = list_empty(&ch->jobs); + mutex_unlock(&ch->jobs_lock); + + if (!channel_idle) + return; + + gk20a_dbg(gpu_dbg_fn, "ce: finished %p", ce_ctx); + + if (ch->has_timedout) + event = NVGPU_CE_CONTEXT_JOB_TIMEDOUT; + else + event = NVGPU_CE_CONTEXT_JOB_COMPLETED; + + if (ce_ctx->user_event_callback) + ce_ctx->user_event_callback(ce_ctx->ctx_id, + event); + + ++ce_ctx->completed_seq_number; +} + +static void gk20a_ce_free_command_buffer_stored_fence(struct gk20a_gpu_ctx *ce_ctx) +{ + u32 cmd_buf_index; + u32 cmd_buf_read_offset; + u32 fence_index; + u32 *cmd_buf_cpu_va; + + for (cmd_buf_index = 0; + cmd_buf_index < ce_ctx->cmd_buf_end_queue_offset; + cmd_buf_index++) { 
+ cmd_buf_read_offset = (cmd_buf_index * + (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32))); + + /* at end of command buffer has gk20a_fence for command buffer sync */ + fence_index = (cmd_buf_read_offset + + ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) - + (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32)))); + + cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va; + + /* 0 is treated as invalid pre-sync */ + if (cmd_buf_cpu_va[fence_index]) { + struct gk20a_fence * ce_cmd_buf_fence_in = NULL; + + memcpy((void *)&ce_cmd_buf_fence_in, + (void *)(cmd_buf_cpu_va + fence_index), + sizeof(struct gk20a_fence *)); + gk20a_fence_put(ce_cmd_buf_fence_in); + /* Reset the stored last pre-sync */ + memset((void *)(cmd_buf_cpu_va + fence_index), + 0, + NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING); + } + } +} + +/* assume this api should need to call under mutex_lock(&ce_app->app_mutex) */ +static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx) +{ + ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED; + + mutex_lock(&ce_ctx->gpu_ctx_mutex); + + gk20a_ce_free_command_buffer_stored_fence(ce_ctx); + + gk20a_gmmu_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem); + + /* free the channel */ + if (ce_ctx->ch) + gk20a_channel_close(ce_ctx->ch); + + /* housekeeping on app */ + list_del(&ce_ctx->list); + + mutex_unlock(&ce_ctx->gpu_ctx_mutex); + mutex_destroy(&ce_ctx->gpu_ctx_mutex); + + kfree(ce_ctx); +} + +static inline int gk20a_ce_get_method_size(int request_operation) +{ + /* failure size */ + int methodsize = ~0; + + if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) + methodsize = 10 * 2 * sizeof(u32); + else if (request_operation & NVGPU_CE_MEMSET) + methodsize = 9 * 2 * sizeof(u32); + + return methodsize; +} + +static inline int gk20a_get_valid_launch_flags(struct gk20a *g, int launch_flags) +{ + /* there is no local memory available, + don't allow local memory related CE flags */ + if (!g->mm.vidmem_size) { + launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB | + NVGPU_CE_DST_LOCATION_LOCAL_FB); + } + return launch_flags; +} + +static int gk20a_ce_prepare_submit(u64 src_buf, + u64 dst_buf, + u64 size, + u32 *cmd_buf_cpu_va, + u32 max_cmd_buf_size, + unsigned int payload, + int launch_flags, + int request_operation, + u32 dma_copy_class, + struct gk20a_fence *gk20a_fence_in) +{ + u32 launch = 0; + u32 methodSize = 0; + + /* failure case handling */ + if ((gk20a_ce_get_method_size(request_operation) > max_cmd_buf_size) || + (!size) || + (request_operation > NVGPU_CE_MEMSET)) + return 0; + + /* set the channel object */ + cmd_buf_cpu_va[methodSize++] = 0x20018000; + cmd_buf_cpu_va[methodSize++] = dma_copy_class; + + if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) { + /* setup the source */ + cmd_buf_cpu_va[methodSize++] = 0x20018101; + cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf) & + NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); + + cmd_buf_cpu_va[methodSize++] = 0x20018100; + cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf) & + NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); + + cmd_buf_cpu_va[methodSize++] = 0x20018098; + if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) { + cmd_buf_cpu_va[methodSize++] = 0x00000000; + } else if (launch_flags & NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) { + cmd_buf_cpu_va[methodSize++] = 0x00000002; + } else { + cmd_buf_cpu_va[methodSize++] = 0x00000001; + } + + launch |= 0x00001000; + } else if (request_operation & NVGPU_CE_MEMSET) { + cmd_buf_cpu_va[methodSize++] = 0x200181c2; + cmd_buf_cpu_va[methodSize++] = 0x00030004; + + 
cmd_buf_cpu_va[methodSize++] = 0x200181c0; + cmd_buf_cpu_va[methodSize++] = payload; + + launch |= 0x00000400; + + /* converted into number of words */ + size /= sizeof(u32); + } + + /* setup the destination/output */ + cmd_buf_cpu_va[methodSize++] = 0x20018103; + cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); + + cmd_buf_cpu_va[methodSize++] = 0x20018102; + cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); + + cmd_buf_cpu_va[methodSize++] = 0x20018099; + if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) { + cmd_buf_cpu_va[methodSize++] = 0x00000000; + } else if (launch_flags & NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) { + cmd_buf_cpu_va[methodSize++] = 0x00000002; + } else { + cmd_buf_cpu_va[methodSize++] = 0x00000001; + } + + launch |= 0x00002000; + + /* setup the format */ + cmd_buf_cpu_va[methodSize++] = 0x20018107; + cmd_buf_cpu_va[methodSize++] = 1; + cmd_buf_cpu_va[methodSize++] = 0x20018106; + cmd_buf_cpu_va[methodSize++] = u64_lo32(size); + + launch |= 0x00000004; + + if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) + launch |= 0x00000000; + else + launch |= 0x00000080; + + if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) + launch |= 0x00000000; + else + launch |= 0x00000100; + + if (launch_flags & NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED) + launch |= 0x00000002; + else + launch |= 0x00000001; + + cmd_buf_cpu_va[methodSize++] = 0x200180c0; + cmd_buf_cpu_va[methodSize++] = launch; + + return methodSize; +} + +/* global CE app related apis */ +int gk20a_init_ce_support(struct gk20a *g) +{ + struct gk20a_ce_app *ce_app = &g->ce_app; + + if (ce_app->initialised) { + /* assume this happen during poweron/poweroff GPU sequence */ + ce_app->app_state = NVGPU_CE_ACTIVE; + gk20a_ce_notify_all_user(g, NVGPU_CE_CONTEXT_RESUME); + return 0; + } + + gk20a_dbg(gpu_dbg_fn, "ce: init"); + + mutex_init(&ce_app->app_mutex); + mutex_lock(&ce_app->app_mutex); + + INIT_LIST_HEAD(&ce_app->allocated_contexts); + ce_app->ctx_count = 0; + ce_app->next_ctx_id = 0; + ce_app->initialised = true; + ce_app->app_state = NVGPU_CE_ACTIVE; + + mutex_unlock(&ce_app->app_mutex); + gk20a_dbg(gpu_dbg_cde_ctx, "ce: init finished"); + + return 0; +} + +void gk20a_ce_destroy(struct gk20a *g) +{ + struct gk20a_ce_app *ce_app = &g->ce_app; + struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save; + + if (!ce_app->initialised) + return; + + ce_app->app_state = NVGPU_CE_SUSPEND; + ce_app->initialised = false; + + mutex_lock(&ce_app->app_mutex); + + list_for_each_entry_safe(ce_ctx, ce_ctx_save, + &ce_app->allocated_contexts, list) { + gk20a_ce_delete_gpu_context(ce_ctx); + } + + INIT_LIST_HEAD(&ce_app->allocated_contexts); + ce_app->ctx_count = 0; + ce_app->next_ctx_id = 0; + + mutex_unlock(&ce_app->app_mutex); + mutex_destroy(&ce_app->app_mutex); +} + +void gk20a_ce_suspend(struct gk20a *g) +{ + struct gk20a_ce_app *ce_app = &g->ce_app; + + if (!ce_app->initialised) + return; + + ce_app->app_state = NVGPU_CE_SUSPEND; + gk20a_ce_notify_all_user(g, NVGPU_CE_CONTEXT_SUSPEND); + + return; +} + +/* CE app utility functions */ +u32 gk20a_ce_create_context_with_cb(struct device *dev, + int runlist_id, + int priority, + int timeslice, + int runlist_level, + ce_event_callback user_event_callback) +{ + struct gk20a_gpu_ctx *ce_ctx; + struct gk20a *g = gk20a_from_dev(dev); + struct gk20a_ce_app *ce_app = &g->ce_app; + u32 ctx_id = ~0; + int err = 0; + + if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE) + return ctx_id; + + ce_ctx = 
kzalloc(sizeof(*ce_ctx), GFP_KERNEL); + if (!ce_ctx) + return ctx_id; + + mutex_init(&ce_ctx->gpu_ctx_mutex); + + ce_ctx->g = g; + ce_ctx->dev = g->dev; + ce_ctx->user_event_callback = user_event_callback; + + ce_ctx->cmd_buf_read_queue_offset = 0; + ce_ctx->cmd_buf_end_queue_offset = + (NVGPU_CE_COMMAND_BUF_SIZE / NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF); + + ce_ctx->submitted_seq_number = 0; + ce_ctx->completed_seq_number = 0; + + /* always kernel client needs privileged channel */ + ce_ctx->ch = gk20a_open_new_channel_with_cb(g, gk20a_ce_finished_ctx_cb, + ce_ctx, + runlist_id, + true); + if (!ce_ctx->ch) { + gk20a_err(ce_ctx->dev, "ce: gk20a channel not available"); + goto end; + } + + /* bind the channel to the vm */ + gk20a_vm_get(&g->mm.ce.vm); + ce_ctx->vm = ce_ctx->ch->vm = &g->mm.ce.vm; + err = channel_gk20a_commit_va(ce_ctx->ch); + if (err) { + gk20a_err(ce_ctx->dev, "ce: could not bind vm"); + goto end; + } + + /* allocate gpfifo (1024 should be more than enough) */ + err = gk20a_alloc_channel_gpfifo(ce_ctx->ch, + &(struct nvgpu_alloc_gpfifo_args){1024, 0}); + if (err) { + gk20a_err(ce_ctx->dev, "ce: unable to allocate gpfifo"); + goto end; + } + + /* allocate command buffer (4096 should be more than enough) from sysmem*/ + err = gk20a_gmmu_alloc_map_sys(ce_ctx->vm, NVGPU_CE_COMMAND_BUF_SIZE, &ce_ctx->cmd_buf_mem); + if (err) { + gk20a_err(ce_ctx->dev, + "ce: could not allocate command buffer for CE context"); + goto end; + } + + memset(ce_ctx->cmd_buf_mem.cpu_va, 0x00, ce_ctx->cmd_buf_mem.size); + + /* -1 means default channel priority */ + if (priority != -1) { + err = gk20a_channel_set_priority(ce_ctx->ch, priority); + if (err) { + gk20a_err(ce_ctx->dev, + "ce: could not set the channel priority for CE context"); + goto end; + } + } + + /* -1 means default channel timeslice value */ + if (timeslice != -1) { + err = gk20a_channel_set_timeslice(ce_ctx->ch, timeslice); + if (err) { + gk20a_err(ce_ctx->dev, + "ce: could not set the channel timeslice value for CE context"); + goto end; + } + } + + /* -1 means default channel runlist level */ + if (runlist_level != -1) { + err = gk20a_channel_set_runlist_interleave(ce_ctx->ch, runlist_level); + if (err) { + gk20a_err(ce_ctx->dev, + "ce: could not set the runlist interleave for CE context"); + goto end; + } + } + + mutex_lock(&ce_app->app_mutex); + ctx_id = ce_ctx->ctx_id = ce_app->next_ctx_id; + list_add(&ce_ctx->list, &ce_app->allocated_contexts); + ++ce_app->next_ctx_id; + ++ce_app->ctx_count; + mutex_unlock(&ce_app->app_mutex); + + ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_ALLOCATED; + +end: + if (ctx_id == ~0) { + mutex_lock(&ce_app->app_mutex); + gk20a_ce_delete_gpu_context(ce_ctx); + mutex_unlock(&ce_app->app_mutex); + } + return ctx_id; + +} +EXPORT_SYMBOL(gk20a_ce_create_context_with_cb); + +int gk20a_ce_execute_ops(struct device *dev, + u32 ce_ctx_id, + u64 src_buf, + u64 dst_buf, + u64 size, + unsigned int payload, + int launch_flags, + int request_operation, + struct gk20a_fence *gk20a_fence_in, + u32 submit_flags, + struct gk20a_fence **gk20a_fence_out) +{ + int ret = -EPERM; + struct gk20a *g = gk20a_from_dev(dev); + struct gk20a_ce_app *ce_app = &g->ce_app; + struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save; + bool found = false; + u32 *cmd_buf_cpu_va; + u64 cmd_buf_gpu_va = 0; + u32 methodSize; + u32 cmd_buf_read_offset; + u32 fence_index; + struct nvgpu_gpfifo gpfifo; + struct nvgpu_fence fence = {0,0}; + struct gk20a_fence *ce_cmd_buf_fence_out = NULL; + struct nvgpu_gpu_characteristics *gpu_capability = 
&g->gpu_characteristics; + + if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE) + goto end; + + mutex_lock(&ce_app->app_mutex); + + list_for_each_entry_safe(ce_ctx, ce_ctx_save, + &ce_app->allocated_contexts, list) { + if (ce_ctx->ctx_id == ce_ctx_id) { + found = true; + break; + } + } + + mutex_unlock(&ce_app->app_mutex); + + if (!found) { + ret = -EINVAL; + goto end; + } + + if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) { + ret = -ENODEV; + goto end; + } + + mutex_lock(&ce_ctx->gpu_ctx_mutex); + + ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset; + + cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset * + (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32))); + + /* at end of command buffer has gk20a_fence for command buffer sync */ + fence_index = (cmd_buf_read_offset + + ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) - + (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32)))); + + if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) { + ret = -ENOMEM; + goto noop; + } + + cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va; + + /* 0 is treated as invalid pre-sync */ + if (cmd_buf_cpu_va[fence_index]) { + struct gk20a_fence * ce_cmd_buf_fence_in = NULL; + + memcpy((void *)&ce_cmd_buf_fence_in, + (void *)(cmd_buf_cpu_va + fence_index), + sizeof(struct gk20a_fence *)); + ret = gk20a_fence_wait(ce_cmd_buf_fence_in, gk20a_get_gr_idle_timeout(g)); + + gk20a_fence_put(ce_cmd_buf_fence_in); + /* Reset the stored last pre-sync */ + memset((void *)(cmd_buf_cpu_va + fence_index), + 0, + NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING); + if (ret) + goto noop; + } + + cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va + (u64)(cmd_buf_read_offset *sizeof(u32))); + + methodSize = gk20a_ce_prepare_submit(src_buf, + dst_buf, + size, + &cmd_buf_cpu_va[cmd_buf_read_offset], + NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF, + payload, + gk20a_get_valid_launch_flags(g, launch_flags), + request_operation, + gpu_capability->dma_copy_class, + gk20a_fence_in); + + if (methodSize) { + /* TODO: Remove CPU pre-fence wait */ + if (gk20a_fence_in) { + ret = gk20a_fence_wait(gk20a_fence_in, gk20a_get_gr_idle_timeout(g)); + gk20a_fence_put(gk20a_fence_in); + if (ret) + goto noop; + } + + /* store the element into gpfifo */ + gpfifo.entry0 = + u64_lo32(cmd_buf_gpu_va); + gpfifo.entry1 = + (u64_hi32(cmd_buf_gpu_va) | + pbdma_gp_entry1_length_f(methodSize)); + + /* take always the postfence as it is needed for protecting the ce context */ + submit_flags |= NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET; + + wmb(); + + ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL, + 1, submit_flags, &fence, &ce_cmd_buf_fence_out, true); + + if (!ret) { + memcpy((void *)(cmd_buf_cpu_va + fence_index), + (void *)&ce_cmd_buf_fence_out, + sizeof(struct gk20a_fence *)); + + if (gk20a_fence_out) { + gk20a_fence_get(ce_cmd_buf_fence_out); + *gk20a_fence_out = ce_cmd_buf_fence_out; + } + + /* Next available command buffer queue Index */ + ++ce_ctx->cmd_buf_read_queue_offset; + ++ce_ctx->submitted_seq_number; + } + } else + ret = -ENOMEM; +noop: + mutex_unlock(&ce_ctx->gpu_ctx_mutex); +end: + return ret; +} +EXPORT_SYMBOL(gk20a_ce_execute_ops); + +void gk20a_ce_delete_context(struct device *dev, + u32 ce_ctx_id) +{ + struct gk20a *g = gk20a_from_dev(dev); + struct gk20a_ce_app *ce_app = &g->ce_app; + struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save; + + if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE) + return; + + mutex_lock(&ce_app->app_mutex); + 
+ list_for_each_entry_safe(ce_ctx, ce_ctx_save, + &ce_app->allocated_contexts, list) { + if (ce_ctx->ctx_id == ce_ctx_id) { + gk20a_ce_delete_gpu_context(ce_ctx); + --ce_app->ctx_count; + break; + } + } + + mutex_unlock(&ce_app->app_mutex); + return; +} +EXPORT_SYMBOL(gk20a_ce_delete_context); + +#ifdef CONFIG_DEBUG_FS +void gk20a_ce_debugfs_init(struct device *dev) +{ + struct gk20a_platform *platform = dev_get_drvdata(dev); + struct gk20a *g = get_gk20a(dev); + + debugfs_create_u32("ce_app_ctx_count", S_IWUSR | S_IRUGO, + platform->debugfs, &g->ce_app.ctx_count); + debugfs_create_u32("ce_app_state", S_IWUSR | S_IRUGO, + platform->debugfs, &g->ce_app.app_state); + debugfs_create_u32("ce_app_next_ctx_id", S_IWUSR | S_IRUGO, + platform->debugfs, &g->ce_app.next_ctx_id); +} +#endif diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h index 5ceb69e1..3b53834d 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h @@ -28,4 +28,128 @@ void gk20a_init_ce2(struct gpu_ops *gops); void gk20a_ce2_isr(struct gk20a *g, u32 inst_id, u32 pri_base); void gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base); +/* CE command utility macros */ +#define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff +#define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff + +#define NVGPU_CE_COMMAND_BUF_SIZE 4096 +#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 128 +#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING 8 + +typedef void (*ce_event_callback)(u32 ce_ctx_id, u32 ce_event_flag); + +/* dma launch_flags */ +enum { + /* location */ + NVGPU_CE_SRC_LOCATION_COHERENT_SYSMEM = (1 << 0), + NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM = (1 << 1), + NVGPU_CE_SRC_LOCATION_LOCAL_FB = (1 << 2), + NVGPU_CE_DST_LOCATION_COHERENT_SYSMEM = (1 << 3), + NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM = (1 << 4), + NVGPU_CE_DST_LOCATION_LOCAL_FB = (1 << 5), + + /* memory layout */ + NVGPU_CE_SRC_MEMORY_LAYOUT_PITCH = (1 << 6), + NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR = (1 << 7), + NVGPU_CE_DST_MEMORY_LAYOUT_PITCH = (1 << 8), + NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR = (1 << 9), + + /* transfer type */ + NVGPU_CE_DATA_TRANSFER_TYPE_PIPELINED = (1 << 10), + NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED = (1 << 11), +}; + +/* CE operation mode */ +enum { + NVGPU_CE_PHYS_MODE_TRANSFER = (1 << 0), + NVGPU_CE_MEMSET = (1 << 1), +}; + +/* CE event flags */ +enum { + NVGPU_CE_CONTEXT_JOB_COMPLETED = (1 << 0), + NVGPU_CE_CONTEXT_JOB_TIMEDOUT = (1 << 1), + NVGPU_CE_CONTEXT_SUSPEND = (1 << 2), + NVGPU_CE_CONTEXT_RESUME = (1 << 3), +}; + +/* CE app state machine flags */ +enum { + NVGPU_CE_ACTIVE = (1 << 0), + NVGPU_CE_SUSPEND = (1 << 1), +}; + +/* gpu context state machine flags */ +enum { + NVGPU_CE_GPU_CTX_ALLOCATED = (1 << 0), + NVGPU_CE_GPU_CTX_DELETED = (1 << 1), +}; + +/* global ce app db */ +struct gk20a_ce_app { + bool initialised; + struct mutex app_mutex; + int app_state; + + struct list_head allocated_contexts; + u32 ctx_count; + u32 next_ctx_id; +}; + +/* ce context db */ +struct gk20a_gpu_ctx { + struct gk20a *g; + struct device *dev; + u32 ctx_id; + struct mutex gpu_ctx_mutex; + int gpu_ctx_state; + ce_event_callback user_event_callback; + + /* channel related data */ + struct channel_gk20a *ch; + struct vm_gk20a *vm; + + /* cmd buf mem_desc */ + struct mem_desc cmd_buf_mem; + + struct list_head list; + + u64 submitted_seq_number; + u64 completed_seq_number; + + u32 cmd_buf_read_queue_offset; + u32 cmd_buf_end_queue_offset; +}; + +/* global CE app related apis */ +int 
gk20a_init_ce_support(struct gk20a *g); +void gk20a_ce_suspend(struct gk20a *g); +void gk20a_ce_destroy(struct gk20a *g); + +/* CE app utility functions */ +u32 gk20a_ce_create_context_with_cb(struct device *dev, + int runlist_id, + int priority, + int timeslice, + int runlist_level, + ce_event_callback user_event_callback); +int gk20a_ce_execute_ops(struct device *dev, + u32 ce_ctx_id, + u64 src_buf, + u64 dst_buf, + u64 size, + unsigned int payload, + int launch_flags, + int request_operation, + struct gk20a_fence *gk20a_fence_in, + u32 submit_flags, + struct gk20a_fence **gk20a_fence_out); +void gk20a_ce_delete_context(struct device *dev, + u32 ce_ctx_id); + +#ifdef CONFIG_DEBUG_FS +/* CE app debugfs api */ +void gk20a_ce_debugfs_init(struct device *dev); +#endif + #endif /*__CE2_GK20A_H__*/ diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index d5457d10..447fe86a 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -702,7 +702,7 @@ static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch, return 0; } -static int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch, +int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch, u32 level) { struct gk20a *g = ch->g; @@ -1113,9 +1113,11 @@ static void gk20a_channel_update_runcb_fn(struct work_struct *work) struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g, void (*update_fn)(struct channel_gk20a *, void *), - void *update_fn_data) + void *update_fn_data, + int runlist_id, + bool is_privileged_channel) { - struct channel_gk20a *ch = gk20a_open_new_channel(g, -1, false); + struct channel_gk20a *ch = gk20a_open_new_channel(g, runlist_id, is_privileged_channel); if (ch) { spin_lock(&ch->update_fn_lock); diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 4b5fe1b3..971175f2 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -265,7 +265,9 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g, bool is_privileged_channel); struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g, void (*update_fn)(struct channel_gk20a *, void *), - void *update_fn_data); + void *update_fn_data, + int runlist_id, + bool is_privileged_channel); void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a); int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, @@ -295,6 +297,8 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g, int *__timeslice_timeout, int *__timeslice_scale); int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority); int gk20a_channel_set_timeslice(struct channel_gk20a *ch, u32 timeslice); +int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch, + u32 level); void gk20a_channel_event_id_post_event(struct channel_gk20a *ch, int event_id); diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index 5133f86a..3dd7cb02 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -165,6 +165,33 @@ u32 gk20a_fifo_get_all_ce_engine_reset_mask(struct gk20a *g) return reset_mask; } +u32 gk20a_fifo_get_fast_ce_runlist_id(struct gk20a *g) +{ + u32 ce_runlist_id = gk20a_fifo_get_gr_runlist_id(g); + u32 engine_enum = ENGINE_INVAL_GK20A; + struct fifo_gk20a *f = NULL; + u32 engine_id_idx; + struct fifo_engine_info_gk20a *engine_info; + u32 active_engine_id = 0; + + if (!g) + return ce_runlist_id; + + f 
= &g->fifo; + + for (engine_id_idx = 0; engine_id_idx < f->num_engines; ++engine_id_idx) { + active_engine_id = f->active_engines_list[engine_id_idx]; + engine_info = &f->engine_info[active_engine_id]; + engine_enum = engine_info->engine_enum; + + /* selecet last available ASYNC_CE if available */ + if (engine_enum == ENGINE_ASYNC_CE_GK20A) + ce_runlist_id = engine_info->runlist_id; + } + + return ce_runlist_id; +} + u32 gk20a_fifo_get_gr_runlist_id(struct gk20a *g) { u32 gr_engine_cnt = 0; diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h index 3473bc78..33d6d39c 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h @@ -244,6 +244,8 @@ u32 gk20a_fifo_get_gr_engine_id(struct gk20a *g); u32 gk20a_fifo_get_all_ce_engine_reset_mask(struct gk20a *g); +u32 gk20a_fifo_get_fast_ce_runlist_id(struct gk20a *g); + u32 gk20a_fifo_get_gr_runlist_id(struct gk20a *g); bool gk20a_fifo_is_valid_runlist_id(struct gk20a *g, u32 runlist_id); diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 50f67262..04f82033 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -773,6 +773,7 @@ static int gk20a_pm_prepare_poweroff(struct device *dev) { struct gk20a *g = get_gk20a(dev); int ret = 0; + struct gk20a_platform *platform = gk20a_get_platform(dev); gk20a_dbg_fn(""); @@ -786,6 +787,9 @@ static int gk20a_pm_prepare_poweroff(struct device *dev) /* cancel any pending cde work */ gk20a_cde_suspend(g); + if (platform->has_ce) + gk20a_ce_suspend(g); + ret = gk20a_channel_suspend(g); if (ret) goto done; @@ -996,6 +1000,11 @@ int gk20a_pm_finalize_poweron(struct device *dev) if (platform->has_cde) gk20a_init_cde_support(g); + if (platform->has_ce) + gk20a_init_ce_support(g); + + gk20a_init_mm_ce_context(g); + enable_irq(g->irq_stall); if (g->irq_stall != g->irq_nonstall) enable_irq(g->irq_nonstall); @@ -1658,6 +1667,7 @@ static int gk20a_probe(struct platform_device *dev) gk20a_pmu_debugfs_init(&dev->dev); gk20a_railgating_debugfs_init(&dev->dev); gk20a_cde_debugfs_init(&dev->dev); + gk20a_ce_debugfs_init(&dev->dev); gk20a_alloc_debugfs_init(dev); gk20a_mm_debugfs_init(&dev->dev); gk20a_fifo_debugfs_init(&dev->dev); @@ -1693,6 +1703,9 @@ static int __exit gk20a_remove(struct platform_device *pdev) if (g->remove_support) g->remove_support(dev); + if (platform->has_ce) + gk20a_ce_destroy(g); + gk20a_user_deinit(dev, &nvgpu_class); debugfs_remove_recursive(platform->debugfs); diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 8aa8689b..03a698dc 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -864,6 +864,8 @@ struct gk20a { struct nvgpu_bios bios; struct debugfs_blob_wrapper bios_blob; + + struct gk20a_ce_app ce_app; }; static inline unsigned long gk20a_get_gr_idle_timeout(struct gk20a *g) diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 750ce10c..7b2174bc 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -393,7 +393,7 @@ static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm); static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm); static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm); static int __must_check gk20a_init_cde_vm(struct mm_gk20a *mm); - +static int __must_check gk20a_init_ce_vm(struct mm_gk20a *mm); struct gk20a_dmabuf_priv { struct mutex lock; @@ -702,6 +702,7 @@ void gk20a_remove_vm(struct 
vm_gk20a *vm, struct mem_desc *inst_block) static void gk20a_remove_mm_support(struct mm_gk20a *mm) { struct gk20a *g = gk20a_from_mm(mm); + struct gk20a_platform *platform = gk20a_get_platform(g->dev); if (g->ops.mm.remove_bar2_vm) g->ops.mm.remove_bar2_vm(g); @@ -709,6 +710,14 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm) gk20a_remove_vm(&mm->pmu.vm, &mm->pmu.inst_block); gk20a_free_inst_block(gk20a_from_mm(mm), &mm->hwpm.inst_block); gk20a_vm_remove_support_nofree(&mm->cde.vm); + + if (mm->ce_vidmem_ctx_id != ~0) + gk20a_ce_delete_context(g->dev, mm->ce_vidmem_ctx_id ); + + mm->ce_vidmem_ctx_id = ~0; + + if (platform->has_ce) + gk20a_vm_remove_support_nofree(&mm->ce.vm); } static int gk20a_alloc_sysmem_flush(struct gk20a *g) @@ -754,6 +763,7 @@ int gk20a_init_mm_setup_sw(struct gk20a *g) { struct mm_gk20a *mm = &g->mm; int err; + struct gk20a_platform *platform = gk20a_get_platform(g->dev); gk20a_dbg_fn(""); @@ -775,6 +785,8 @@ int gk20a_init_mm_setup_sw(struct gk20a *g) gk20a_init_pramin(mm); + mm->ce_vidmem_ctx_id = ~0; + err = gk20a_init_vidmem(mm); if (err) return err; @@ -804,6 +816,12 @@ int gk20a_init_mm_setup_sw(struct gk20a *g) if (err) return err; + if (platform->has_ce) { + err = gk20a_init_ce_vm(mm); + if (err) + return err; + } + /* set vm_alloc_share op here as gk20a_as_alloc_share needs it */ g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share; mm->remove_support = gk20a_remove_mm_support; @@ -881,6 +899,25 @@ int gk20a_init_mm_support(struct gk20a *g) return err; } +void gk20a_init_mm_ce_context(struct gk20a *g) +{ +#if defined(CONFIG_GK20A_VIDMEM) + if (g->mm.vidmem_size && (g->mm.ce_vidmem_ctx_id == ~0)) { + g->mm.ce_vidmem_ctx_id = + gk20a_ce_create_context_with_cb(g->dev, + gk20a_fifo_get_fast_ce_runlist_id(g), + -1, + -1, + -1, + NULL); + + if (g->mm.ce_vidmem_ctx_id == ~0) + gk20a_err(g->dev, + "Failed to allocate CE context for vidmem page clearing support"); + } +#endif +} + static int alloc_gmmu_phys_pages(struct vm_gk20a *vm, u32 order, struct gk20a_mm_entry *entry) { @@ -2484,6 +2521,7 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr, struct device *d = &g->mm.vidmem_dev; int err; dma_addr_t iova; + bool need_pramin_access = true; DEFINE_DMA_ATTRS(attrs); gk20a_dbg_fn(""); @@ -2519,7 +2557,38 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr, mem->size = size; mem->aperture = APERTURE_VIDMEM; - gk20a_memset(g, mem, 0, 0, size); + if (g->mm.ce_vidmem_ctx_id != ~0) { + struct gk20a_fence *gk20a_fence_out = NULL; + u64 dst_bufbase = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0); + + err = gk20a_ce_execute_ops(g->dev, + g->mm.ce_vidmem_ctx_id, + 0, + dst_bufbase, + (u64)size, + 0x00000000, + NVGPU_CE_DST_LOCATION_LOCAL_FB, + NVGPU_CE_MEMSET, + NULL, + 0, + &gk20a_fence_out); + + if (!err) { + if (gk20a_fence_out) { + err = gk20a_fence_wait(gk20a_fence_out, gk20a_get_gr_idle_timeout(g)); + gk20a_fence_put(gk20a_fence_out); + if (err) + gk20a_err(g->dev, + "Failed to get the fence_out from CE execute ops"); + else + need_pramin_access = false; + } + } else + gk20a_err(g->dev, "Failed gk20a_ce_execute_ops[%d]",err); + } + + if (need_pramin_access) + gk20a_memset(g, mem, 0, 0, size); gk20a_dbg_fn("done"); @@ -4125,6 +4194,19 @@ static int gk20a_init_cde_vm(struct mm_gk20a *mm) false, false, "cde"); } +static int gk20a_init_ce_vm(struct mm_gk20a *mm) +{ + struct vm_gk20a *vm = &mm->ce.vm; + struct gk20a *g = gk20a_from_mm(mm); + u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size; + + return 
gk20a_init_vm(mm, vm, big_page_size, + SZ_4K * 16, + NV_MM_DEFAULT_KERNEL_SIZE, + NV_MM_DEFAULT_KERNEL_SIZE + NV_MM_DEFAULT_USER_SIZE, + false, false, "ce"); +} + void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *inst_block, struct vm_gk20a *vm) { diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index 66e46480..184c1f71 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -307,6 +307,7 @@ int gk20a_init_mm_support(struct gk20a *g); int gk20a_init_mm_setup_sw(struct gk20a *g); int gk20a_init_mm_setup_hw(struct gk20a *g); void gk20a_mm_debugfs_init(struct device *dev); +void gk20a_init_mm_ce_context(struct gk20a *g); int gk20a_mm_fb_flush(struct gk20a *g); void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate); @@ -349,6 +350,10 @@ struct mm_gk20a { struct vm_gk20a vm; } cde; + struct { + struct vm_gk20a vm; + } ce; + struct mutex l2_op_lock; #ifdef CONFIG_ARCH_TEGRA_18x_SOC struct mem_desc bar2_desc; @@ -388,6 +393,7 @@ struct mm_gk20a { size_t vidmem_size; struct device vidmem_dev; + u32 ce_vidmem_ctx_id; }; int gk20a_mm_init(struct mm_gk20a *mm); diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h index 543f9873..5bde3439 100644 --- a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h @@ -209,6 +209,8 @@ struct gk20a_platform { bool has_cde; + bool has_ce; + /* soc name for finding firmware files */ const char *soc_name; diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c index 2ed6df43..745d963c 100644 --- a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c +++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c @@ -900,6 +900,8 @@ struct gk20a_platform gk20a_tegra_platform = { .secure_page_alloc = gk20a_tegra_secure_page_alloc, .dump_platform_dependencies = gk20a_tegra_debug_dump, + .has_ce = true, + .soc_name = "tegra12x", .vidmem_is_vidmem = false, @@ -962,6 +964,8 @@ struct gk20a_platform gm20b_tegra_platform = { .has_cde = true, + .has_ce = true, + .soc_name = "tegra21x", .vidmem_is_vidmem = false, diff --git a/drivers/gpu/nvgpu/pci.c b/drivers/gpu/nvgpu/pci.c index ea6f3b4c..fcf63ddc 100644 --- a/drivers/gpu/nvgpu/pci.c +++ b/drivers/gpu/nvgpu/pci.c @@ -56,6 +56,8 @@ static struct gk20a_platform nvgpu_pci_device = { .ch_wdt_timeout_ms = 7000, .disable_bigpage = true, + + .has_ce = true, }; static struct pci_device_id nvgpu_pci_table[] = { -- cgit v1.2.2
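
For reference, below is a minimal sketch of how a kernel-side client would drive the new CE interface, modelled on the vidmem page-clearing path this patch adds to mm_gk20a.c. It is not part of the commit: the function name ce_clear_vidmem_example and the dst_gpu_va/size parameters are illustrative stand-ins, while every API and flag it calls (gk20a_ce_create_context_with_cb, gk20a_ce_execute_ops, gk20a_ce_delete_context, gk20a_fifo_get_fast_ce_runlist_id, the NVGPU_CE_* enums) is introduced by the patch itself.

    /* Illustrative sketch only -- not part of the commit. */
    #include "gk20a.h"
    #include "ce2_gk20a.h"
    #include "fifo_gk20a.h"

    static int ce_clear_vidmem_example(struct gk20a *g, u64 dst_gpu_va, u64 size)
    {
            struct gk20a_fence *fence_out = NULL;
            u32 ctx_id;
            int err;

            /*
             * Open a privileged CE context on the fastest (async, if present)
             * CE runlist; -1 keeps the default priority, timeslice and
             * runlist interleave, and no user event callback is registered.
             */
            ctx_id = gk20a_ce_create_context_with_cb(g->dev,
                            gk20a_fifo_get_fast_ce_runlist_id(g),
                            -1, -1, -1, NULL);
            if (ctx_id == (u32)~0)
                    return -ENOMEM;

            /* Memset the destination buffer in local FB to the zero payload. */
            err = gk20a_ce_execute_ops(g->dev, ctx_id,
                            0,                              /* src_buf unused for memset */
                            dst_gpu_va,
                            size,
                            0x00000000,                     /* payload pattern */
                            NVGPU_CE_DST_LOCATION_LOCAL_FB,
                            NVGPU_CE_MEMSET,
                            NULL,                           /* no pre-fence */
                            0,                              /* submit_flags */
                            &fence_out);
            if (!err && fence_out) {
                    /* CPU-wait on the post-fence the CE app always requests. */
                    err = gk20a_fence_wait(fence_out,
                                    gk20a_get_gr_idle_timeout(g));
                    gk20a_fence_put(fence_out);
            }

            gk20a_ce_delete_context(g->dev, ctx_id);
            return err;
    }

Because gk20a_ce_execute_ops internally forces NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET, a post-fence is available whenever the submit succeeds; the synchronous CPU wait above mirrors the in-tree use in gk20a_gmmu_alloc_attr_vid_at, whereas callers that can tolerate asynchronous completion may instead hold the returned fence and wait later.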