Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/ce2_gk20a.c')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ce2_gk20a.c  617
1 file changed, 617 insertions, 0 deletions

diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index 96d38b11..e2f2d9e9 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -24,6 +24,7 @@
 #include <trace/events/gk20a.h>
 #include <linux/dma-mapping.h>
 #include <linux/nvhost.h>
+#include <linux/debugfs.h>
 
 #include "gk20a.h"
 #include "debug_gk20a.h"
@@ -96,3 +97,619 @@ void gk20a_init_ce2(struct gpu_ops *gops)
         gops->ce2.isr_stall = gk20a_ce2_isr;
         gops->ce2.isr_nonstall = gk20a_ce2_nonstall_isr;
 }
+
+/* static CE app api */
+static void gk20a_ce_notify_all_user(struct gk20a *g, u32 event)
+{
+        struct gk20a_ce_app *ce_app = &g->ce_app;
+        struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
+
+        if (!ce_app->initialised)
+                return;
+
+        mutex_lock(&ce_app->app_mutex);
+
+        list_for_each_entry_safe(ce_ctx, ce_ctx_save,
+                        &ce_app->allocated_contexts, list) {
+                if (ce_ctx->user_event_callback) {
+                        ce_ctx->user_event_callback(ce_ctx->ctx_id,
+                                event);
+                }
+        }
+
+        mutex_unlock(&ce_app->app_mutex);
+}
+
+static void gk20a_ce_finished_ctx_cb(struct channel_gk20a *ch, void *data)
+{
+        struct gk20a_gpu_ctx *ce_ctx = data;
+        bool channel_idle;
+        u32 event;
+
+        mutex_lock(&ch->jobs_lock);
+        channel_idle = list_empty(&ch->jobs);
+        mutex_unlock(&ch->jobs_lock);
+
+        if (!channel_idle)
+                return;
+
+        gk20a_dbg(gpu_dbg_fn, "ce: finished %p", ce_ctx);
+
+        if (ch->has_timedout)
+                event = NVGPU_CE_CONTEXT_JOB_TIMEDOUT;
+        else
+                event = NVGPU_CE_CONTEXT_JOB_COMPLETED;
+
+        if (ce_ctx->user_event_callback)
+                ce_ctx->user_event_callback(ce_ctx->ctx_id, event);
+
+        ++ce_ctx->completed_seq_number;
+}
+
+static void gk20a_ce_free_command_buffer_stored_fence(struct gk20a_gpu_ctx *ce_ctx)
+{
+        u32 cmd_buf_index;
+        u32 cmd_buf_read_offset;
+        u32 fence_index;
+        u32 *cmd_buf_cpu_va;
+
+        for (cmd_buf_index = 0;
+                cmd_buf_index < ce_ctx->cmd_buf_end_queue_offset;
+                cmd_buf_index++) {
+                cmd_buf_read_offset = (cmd_buf_index *
+                        (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
+
+                /* the tail of each kickoff slot stores a gk20a_fence
+                 * pointer used for command buffer sync */
+                fence_index = (cmd_buf_read_offset +
+                        ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
+                         (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
+
+                cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
+
+                /* 0 is treated as an invalid pre-sync */
+                if (cmd_buf_cpu_va[fence_index]) {
+                        struct gk20a_fence *ce_cmd_buf_fence_in = NULL;
+
+                        memcpy((void *)&ce_cmd_buf_fence_in,
+                                (void *)(cmd_buf_cpu_va + fence_index),
+                                sizeof(struct gk20a_fence *));
+                        gk20a_fence_put(ce_cmd_buf_fence_in);
+                        /* reset the stored last pre-sync */
+                        memset((void *)(cmd_buf_cpu_va + fence_index),
+                                0,
+                                NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
+                }
+        }
+}
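
Each kickoff slot in the command buffer therefore ends with a stored
gk20a_fence pointer. A minimal host-side sketch of that layout (not driver
code; the three sizes are illustrative assumptions, the real values live in
the nvgpu headers):

        #include <stdio.h>

        /* assumed values, for illustration only */
        #define CE_COMMAND_BUF_SIZE             4096
        #define CE_CMD_BUF_SIZE_PER_KICKOFF      256
        #define CE_CMD_BUF_SIZE_FOR_TRACING        8

        int main(void)
        {
                unsigned slot_words = CE_CMD_BUF_SIZE_PER_KICKOFF / 4;
                unsigned trace_words = CE_CMD_BUF_SIZE_FOR_TRACING / 4;
                unsigned slots = CE_COMMAND_BUF_SIZE / CE_CMD_BUF_SIZE_PER_KICKOFF;
                unsigned i;

                /* each slot holds methods, then the fence pointer at its tail */
                for (i = 0; i < slots; i++)
                        printf("slot %2u: methods at word %4u, fence ptr at word %4u\n",
                                i, i * slot_words,
                                i * slot_words + slot_words - trace_words);
                return 0;
        }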
+
+/* the caller must hold ce_app->app_mutex */
+static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
+{
+        ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED;
+
+        mutex_lock(&ce_ctx->gpu_ctx_mutex);
+
+        gk20a_ce_free_command_buffer_stored_fence(ce_ctx);
+
+        gk20a_gmmu_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
+
+        /* free the channel */
+        if (ce_ctx->ch)
+                gk20a_channel_close(ce_ctx->ch);
+
+        /* housekeeping on app */
+        list_del(&ce_ctx->list);
+
+        mutex_unlock(&ce_ctx->gpu_ctx_mutex);
+        mutex_destroy(&ce_ctx->gpu_ctx_mutex);
+
+        kfree(ce_ctx);
+}
+
+static inline int gk20a_ce_get_method_size(int request_operation)
+{
+        /* failure size */
+        int methodsize = ~0;
+
+        if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER)
+                methodsize = 10 * 2 * sizeof(u32);
+        else if (request_operation & NVGPU_CE_MEMSET)
+                methodsize = 9 * 2 * sizeof(u32);
+
+        return methodsize;
+}
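
Every method emitted below is a (header, data) word pair, so a physical-mode
transfer needs 10 pairs (10 * 2 * 4 = 80 bytes) and a memset 9 pairs
(9 * 2 * 4 = 72 bytes); any other operation falls through to the ~0 failure
size.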
+
+static inline int gk20a_get_valid_launch_flags(struct gk20a *g, int launch_flags)
+{
+        /* when no local memory is available, strip the local-memory
+         * related CE flags */
+        if (!g->mm.vidmem_size) {
+                launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
+                        NVGPU_CE_DST_LOCATION_LOCAL_FB);
+        }
+        return launch_flags;
+}
+
+static int gk20a_ce_prepare_submit(u64 src_buf,
+                u64 dst_buf,
+                u64 size,
+                u32 *cmd_buf_cpu_va,
+                u32 max_cmd_buf_size,
+                unsigned int payload,
+                int launch_flags,
+                int request_operation,
+                u32 dma_copy_class,
+                struct gk20a_fence *gk20a_fence_in)
+{
+        u32 launch = 0;
+        u32 methodSize = 0;
+
+        /* failure case handling */
+        if ((gk20a_ce_get_method_size(request_operation) > max_cmd_buf_size) ||
+                (!size) ||
+                (request_operation > NVGPU_CE_MEMSET))
+                return 0;
+
+        /* set the channel object */
+        cmd_buf_cpu_va[methodSize++] = 0x20018000;
+        cmd_buf_cpu_va[methodSize++] = dma_copy_class;
+
+        if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) {
+                /* set up the source */
+                cmd_buf_cpu_va[methodSize++] = 0x20018101;
+                cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf) &
+                        NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
+
+                cmd_buf_cpu_va[methodSize++] = 0x20018100;
+                cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf) &
+                        NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
+
+                cmd_buf_cpu_va[methodSize++] = 0x20018098;
+                if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) {
+                        cmd_buf_cpu_va[methodSize++] = 0x00000000;
+                } else if (launch_flags & NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) {
+                        cmd_buf_cpu_va[methodSize++] = 0x00000002;
+                } else {
+                        cmd_buf_cpu_va[methodSize++] = 0x00000001;
+                }
+
+                launch |= 0x00001000;
+        } else if (request_operation & NVGPU_CE_MEMSET) {
+                cmd_buf_cpu_va[methodSize++] = 0x200181c2;
+                cmd_buf_cpu_va[methodSize++] = 0x00030004;
+
+                cmd_buf_cpu_va[methodSize++] = 0x200181c0;
+                cmd_buf_cpu_va[methodSize++] = payload;
+
+                launch |= 0x00000400;
+
+                /* convert size into the number of words */
+                size /= sizeof(u32);
+        }
+
+        /* set up the destination/output */
+        cmd_buf_cpu_va[methodSize++] = 0x20018103;
+        cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf) &
+                NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
+
+        cmd_buf_cpu_va[methodSize++] = 0x20018102;
+        cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf) &
+                NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
+
+        cmd_buf_cpu_va[methodSize++] = 0x20018099;
+        if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) {
+                cmd_buf_cpu_va[methodSize++] = 0x00000000;
+        } else if (launch_flags & NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) {
+                cmd_buf_cpu_va[methodSize++] = 0x00000002;
+        } else {
+                cmd_buf_cpu_va[methodSize++] = 0x00000001;
+        }
+
+        launch |= 0x00002000;
+
+        /* set up the format */
+        cmd_buf_cpu_va[methodSize++] = 0x20018107;
+        cmd_buf_cpu_va[methodSize++] = 1;
+        cmd_buf_cpu_va[methodSize++] = 0x20018106;
+        cmd_buf_cpu_va[methodSize++] = u64_lo32(size);
+
+        launch |= 0x00000004;
+
+        if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR)
+                launch |= 0x00000000;
+        else
+                launch |= 0x00000080;
+
+        if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR)
+                launch |= 0x00000000;
+        else
+                launch |= 0x00000100;
+
+        if (launch_flags & NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED)
+                launch |= 0x00000002;
+        else
+                launch |= 0x00000001;
+
+        cmd_buf_cpu_va[methodSize++] = 0x200180c0;
+        cmd_buf_cpu_va[methodSize++] = launch;
+
+        return methodSize;
+}
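
The final launch word is assembled bit by bit from the request type and the
launch flags. A minimal host-side sketch mirroring that composition (not
driver code; the flag values here are illustrative assumptions, the real
ones live in the nvgpu UAPI headers):

        #include <stdio.h>

        /* assumed flag values, for illustration only */
        #define CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR    (1 << 0)
        #define CE_DST_MEMORY_LAYOUT_BLOCKLINEAR    (1 << 1)
        #define CE_DATA_TRANSFER_TYPE_NON_PIPELINED (1 << 2)

        static unsigned ce_launch_word(int flags, int phys_mode_transfer)
        {
                unsigned launch = 0;

                if (phys_mode_transfer)
                        launch |= 0x00001000;   /* copy branch above */
                else
                        launch |= 0x00000400;   /* memset branch above */

                launch |= 0x00002000;           /* destination setup */
                launch |= 0x00000004;           /* format setup */

                if (!(flags & CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR))
                        launch |= 0x00000080;   /* source pitch-linear */
                if (!(flags & CE_DST_MEMORY_LAYOUT_BLOCKLINEAR))
                        launch |= 0x00000100;   /* destination pitch-linear */

                if (flags & CE_DATA_TRANSFER_TYPE_NON_PIPELINED)
                        launch |= 0x00000002;
                else
                        launch |= 0x00000001;

                return launch;
        }

        int main(void)
        {
                /* pitch-linear, pipelined, physical-mode copy -> 0x3185 */
                printf("launch = 0x%08x\n", ce_launch_word(0, 1));
                return 0;
        }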
+
+/* global CE app related apis */
+int gk20a_init_ce_support(struct gk20a *g)
+{
+        struct gk20a_ce_app *ce_app = &g->ce_app;
+
+        if (ce_app->initialised) {
+                /* this happens during the GPU power-on/power-off sequence */
+                ce_app->app_state = NVGPU_CE_ACTIVE;
+                gk20a_ce_notify_all_user(g, NVGPU_CE_CONTEXT_RESUME);
+                return 0;
+        }
+
+        gk20a_dbg(gpu_dbg_fn, "ce: init");
+
+        mutex_init(&ce_app->app_mutex);
+        mutex_lock(&ce_app->app_mutex);
+
+        INIT_LIST_HEAD(&ce_app->allocated_contexts);
+        ce_app->ctx_count = 0;
+        ce_app->next_ctx_id = 0;
+        ce_app->initialised = true;
+        ce_app->app_state = NVGPU_CE_ACTIVE;
+
+        mutex_unlock(&ce_app->app_mutex);
+        gk20a_dbg(gpu_dbg_cde_ctx, "ce: init finished");
+
+        return 0;
+}
+
+void gk20a_ce_destroy(struct gk20a *g)
+{
+        struct gk20a_ce_app *ce_app = &g->ce_app;
+        struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
+
+        if (!ce_app->initialised)
+                return;
+
+        ce_app->app_state = NVGPU_CE_SUSPEND;
+        ce_app->initialised = false;
+
+        mutex_lock(&ce_app->app_mutex);
+
+        list_for_each_entry_safe(ce_ctx, ce_ctx_save,
+                        &ce_app->allocated_contexts, list) {
+                gk20a_ce_delete_gpu_context(ce_ctx);
+        }
+
+        INIT_LIST_HEAD(&ce_app->allocated_contexts);
+        ce_app->ctx_count = 0;
+        ce_app->next_ctx_id = 0;
+
+        mutex_unlock(&ce_app->app_mutex);
+        mutex_destroy(&ce_app->app_mutex);
+}
+
+void gk20a_ce_suspend(struct gk20a *g)
+{
+        struct gk20a_ce_app *ce_app = &g->ce_app;
+
+        if (!ce_app->initialised)
+                return;
+
+        ce_app->app_state = NVGPU_CE_SUSPEND;
+        gk20a_ce_notify_all_user(g, NVGPU_CE_CONTEXT_SUSPEND);
+}
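
gk20a_ce_suspend() and gk20a_init_ce_support() appear intended to be paired
by the power-management path, so registered callbacks see a SUSPEND followed
by a RESUME event. A minimal sketch of a hypothetical caller:

        /* on the way down */
        gk20a_ce_suspend(g);       /* notifies NVGPU_CE_CONTEXT_SUSPEND */

        /* on the way back up: on an already-initialised app this only
         * flips the state back to NVGPU_CE_ACTIVE and notifies
         * NVGPU_CE_CONTEXT_RESUME */
        gk20a_init_ce_support(g);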
+
+/* CE app utility functions */
+u32 gk20a_ce_create_context_with_cb(struct device *dev,
+                int runlist_id,
+                int priority,
+                int timeslice,
+                int runlist_level,
+                ce_event_callback user_event_callback)
+{
+        struct gk20a_gpu_ctx *ce_ctx;
+        struct gk20a *g = gk20a_from_dev(dev);
+        struct gk20a_ce_app *ce_app = &g->ce_app;
+        u32 ctx_id = ~0;
+        int err = 0;
+
+        if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE)
+                return ctx_id;
+
+        ce_ctx = kzalloc(sizeof(*ce_ctx), GFP_KERNEL);
+        if (!ce_ctx)
+                return ctx_id;
+
+        mutex_init(&ce_ctx->gpu_ctx_mutex);
+
+        ce_ctx->g = g;
+        ce_ctx->dev = g->dev;
+        ce_ctx->user_event_callback = user_event_callback;
+
+        ce_ctx->cmd_buf_read_queue_offset = 0;
+        ce_ctx->cmd_buf_end_queue_offset =
+                (NVGPU_CE_COMMAND_BUF_SIZE / NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF);
+
+        ce_ctx->submitted_seq_number = 0;
+        ce_ctx->completed_seq_number = 0;
+
+        /* kernel clients always need a privileged channel */
+        ce_ctx->ch = gk20a_open_new_channel_with_cb(g, gk20a_ce_finished_ctx_cb,
+                        ce_ctx,
+                        runlist_id,
+                        true);
+        if (!ce_ctx->ch) {
+                gk20a_err(ce_ctx->dev, "ce: gk20a channel not available");
+                goto end;
+        }
+
+        /* bind the channel to the vm */
+        gk20a_vm_get(&g->mm.ce.vm);
+        ce_ctx->vm = ce_ctx->ch->vm = &g->mm.ce.vm;
+        err = channel_gk20a_commit_va(ce_ctx->ch);
+        if (err) {
+                gk20a_err(ce_ctx->dev, "ce: could not bind vm");
+                goto end;
+        }
+
+        /* allocate gpfifo (1024 should be more than enough) */
+        err = gk20a_alloc_channel_gpfifo(ce_ctx->ch,
+                        &(struct nvgpu_alloc_gpfifo_args){1024, 0});
+        if (err) {
+                gk20a_err(ce_ctx->dev, "ce: unable to allocate gpfifo");
+                goto end;
+        }
+
+        /* allocate the command buffer (4096 should be more than enough) from sysmem */
+        err = gk20a_gmmu_alloc_map_sys(ce_ctx->vm, NVGPU_CE_COMMAND_BUF_SIZE,
+                        &ce_ctx->cmd_buf_mem);
+        if (err) {
+                gk20a_err(ce_ctx->dev,
+                        "ce: could not allocate command buffer for CE context");
+                goto end;
+        }
+
+        memset(ce_ctx->cmd_buf_mem.cpu_va, 0x00, ce_ctx->cmd_buf_mem.size);
+
+        /* -1 means default channel priority */
+        if (priority != -1) {
+                err = gk20a_channel_set_priority(ce_ctx->ch, priority);
+                if (err) {
+                        gk20a_err(ce_ctx->dev,
+                                "ce: could not set the channel priority for CE context");
+                        goto end;
+                }
+        }
+
+        /* -1 means default channel timeslice value */
+        if (timeslice != -1) {
+                err = gk20a_channel_set_timeslice(ce_ctx->ch, timeslice);
+                if (err) {
+                        gk20a_err(ce_ctx->dev,
+                                "ce: could not set the channel timeslice value for CE context");
+                        goto end;
+                }
+        }
+
+        /* -1 means default channel runlist level */
+        if (runlist_level != -1) {
+                err = gk20a_channel_set_runlist_interleave(ce_ctx->ch, runlist_level);
+                if (err) {
+                        gk20a_err(ce_ctx->dev,
+                                "ce: could not set the runlist interleave for CE context");
+                        goto end;
+                }
+        }
+
+        mutex_lock(&ce_app->app_mutex);
+        ctx_id = ce_ctx->ctx_id = ce_app->next_ctx_id;
+        list_add(&ce_ctx->list, &ce_app->allocated_contexts);
+        ++ce_app->next_ctx_id;
+        ++ce_app->ctx_count;
+        mutex_unlock(&ce_app->app_mutex);
+
+        ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_ALLOCATED;
+
+end:
+        if (ctx_id == (u32)~0) {
+                mutex_lock(&ce_app->app_mutex);
+                gk20a_ce_delete_gpu_context(ce_ctx);
+                mutex_unlock(&ce_app->app_mutex);
+        }
+        return ctx_id;
+}
+EXPORT_SYMBOL(gk20a_ce_create_context_with_cb);
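
A minimal sketch of a client-side event callback, assuming the
ce_event_callback type is void (*)(u32 ctx_id, u32 event) as the call sites
above suggest (my_ce_event_cb and ce_runlist_id are hypothetical):

        static void my_ce_event_cb(u32 ctx_id, u32 event)
        {
                switch (event) {
                case NVGPU_CE_CONTEXT_JOB_COMPLETED:
                        pr_debug("ce ctx %u: job completed\n", ctx_id);
                        break;
                case NVGPU_CE_CONTEXT_JOB_TIMEDOUT:
                        pr_warn("ce ctx %u: job timed out\n", ctx_id);
                        break;
                case NVGPU_CE_CONTEXT_SUSPEND:
                case NVGPU_CE_CONTEXT_RESUME:
                        /* pause or resume client-side submissions */
                        break;
                }
        }

        /* -1 selects the default priority/timeslice/interleave level */
        u32 ctx_id = gk20a_ce_create_context_with_cb(dev, ce_runlist_id,
                        -1, -1, -1, my_ce_event_cb);
        if (ctx_id == (u32)~0)
                pr_err("ce context creation failed\n");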
+
+int gk20a_ce_execute_ops(struct device *dev,
+                u32 ce_ctx_id,
+                u64 src_buf,
+                u64 dst_buf,
+                u64 size,
+                unsigned int payload,
+                int launch_flags,
+                int request_operation,
+                struct gk20a_fence *gk20a_fence_in,
+                u32 submit_flags,
+                struct gk20a_fence **gk20a_fence_out)
+{
+        int ret = -EPERM;
+        struct gk20a *g = gk20a_from_dev(dev);
+        struct gk20a_ce_app *ce_app = &g->ce_app;
+        struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
+        bool found = false;
+        u32 *cmd_buf_cpu_va;
+        u64 cmd_buf_gpu_va = 0;
+        u32 methodSize;
+        u32 cmd_buf_read_offset;
+        u32 fence_index;
+        struct nvgpu_gpfifo gpfifo;
+        struct nvgpu_fence fence = {0, 0};
+        struct gk20a_fence *ce_cmd_buf_fence_out = NULL;
+        struct nvgpu_gpu_characteristics *gpu_capability = &g->gpu_characteristics;
+
+        if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE)
+                goto end;
+
+        mutex_lock(&ce_app->app_mutex);
+
+        list_for_each_entry_safe(ce_ctx, ce_ctx_save,
+                        &ce_app->allocated_contexts, list) {
+                if (ce_ctx->ctx_id == ce_ctx_id) {
+                        found = true;
+                        break;
+                }
+        }
+
+        mutex_unlock(&ce_app->app_mutex);
+
+        if (!found) {
+                ret = -EINVAL;
+                goto end;
+        }
+
+        if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
+                ret = -ENODEV;
+                goto end;
+        }
+
+        mutex_lock(&ce_ctx->gpu_ctx_mutex);
+
+        ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset;
+
+        cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
+                (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
+
+        /* the tail of each kickoff slot stores a gk20a_fence pointer
+         * used for command buffer sync */
+        fence_index = (cmd_buf_read_offset +
+                ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
+                 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
+
+        if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) {
+                ret = -ENOMEM;
+                goto noop;
+        }
+
+        cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
+
+        /* 0 is treated as an invalid pre-sync */
+        if (cmd_buf_cpu_va[fence_index]) {
+                struct gk20a_fence *ce_cmd_buf_fence_in = NULL;
+
+                memcpy((void *)&ce_cmd_buf_fence_in,
+                        (void *)(cmd_buf_cpu_va + fence_index),
+                        sizeof(struct gk20a_fence *));
+                ret = gk20a_fence_wait(ce_cmd_buf_fence_in,
+                        gk20a_get_gr_idle_timeout(g));
+
+                gk20a_fence_put(ce_cmd_buf_fence_in);
+                /* reset the stored last pre-sync */
+                memset((void *)(cmd_buf_cpu_va + fence_index),
+                        0,
+                        NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
+                if (ret)
+                        goto noop;
+        }
+
+        cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va +
+                (u64)(cmd_buf_read_offset * sizeof(u32)));
+
+        methodSize = gk20a_ce_prepare_submit(src_buf,
+                        dst_buf,
+                        size,
+                        &cmd_buf_cpu_va[cmd_buf_read_offset],
+                        NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF,
+                        payload,
+                        gk20a_get_valid_launch_flags(g, launch_flags),
+                        request_operation,
+                        gpu_capability->dma_copy_class,
+                        gk20a_fence_in);
+
+        if (methodSize) {
+                /* TODO: remove the CPU pre-fence wait */
+                if (gk20a_fence_in) {
+                        ret = gk20a_fence_wait(gk20a_fence_in,
+                                gk20a_get_gr_idle_timeout(g));
+                        gk20a_fence_put(gk20a_fence_in);
+                        if (ret)
+                                goto noop;
+                }
+
+                /* store the element into the gpfifo */
+                gpfifo.entry0 = u64_lo32(cmd_buf_gpu_va);
+                gpfifo.entry1 = (u64_hi32(cmd_buf_gpu_va) |
+                        pbdma_gp_entry1_length_f(methodSize));
+
+                /* always take the post-fence; it is needed to protect the
+                 * ce context */
+                submit_flags |= NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET;
+
+                wmb();
+
+                ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL,
+                        1, submit_flags, &fence, &ce_cmd_buf_fence_out, true);
+
+                if (!ret) {
+                        memcpy((void *)(cmd_buf_cpu_va + fence_index),
+                                (void *)&ce_cmd_buf_fence_out,
+                                sizeof(struct gk20a_fence *));
+
+                        if (gk20a_fence_out) {
+                                gk20a_fence_get(ce_cmd_buf_fence_out);
+                                *gk20a_fence_out = ce_cmd_buf_fence_out;
+                        }
+
+                        /* next available command buffer queue index */
+                        ++ce_ctx->cmd_buf_read_queue_offset;
+                        ++ce_ctx->submitted_seq_number;
+                }
+        } else {
+                ret = -ENOMEM;
+        }
+noop:
+        mutex_unlock(&ce_ctx->gpu_ctx_mutex);
+end:
+        return ret;
+}
+EXPORT_SYMBOL(gk20a_ce_execute_ops);
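
A minimal usage sketch: a 4 KiB memset through the new API, followed by a
CPU wait on the returned post-fence (dev, ce_ctx_id and dst_gpu_va are
hypothetical and come from the caller):

        struct gk20a *g = gk20a_from_dev(dev);
        struct gk20a_fence *fence_out = NULL;
        int err;

        err = gk20a_ce_execute_ops(dev, ce_ctx_id,
                        0,              /* src_buf is unused for a memset */
                        dst_gpu_va,     /* destination GPU VA */
                        4096,           /* size in bytes */
                        0,              /* payload: the fill pattern */
                        0,              /* launch_flags */
                        NVGPU_CE_MEMSET,
                        NULL,           /* no pre-fence */
                        0,              /* submit_flags; FENCE_GET is forced */
                        &fence_out);
        if (!err && fence_out) {
                err = gk20a_fence_wait(fence_out, gk20a_get_gr_idle_timeout(g));
                gk20a_fence_put(fence_out);
        }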
+
+void gk20a_ce_delete_context(struct device *dev,
+                u32 ce_ctx_id)
+{
+        struct gk20a *g = gk20a_from_dev(dev);
+        struct gk20a_ce_app *ce_app = &g->ce_app;
+        struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
+
+        if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE)
+                return;
+
+        mutex_lock(&ce_app->app_mutex);
+
+        list_for_each_entry_safe(ce_ctx, ce_ctx_save,
+                        &ce_app->allocated_contexts, list) {
+                if (ce_ctx->ctx_id == ce_ctx_id) {
+                        gk20a_ce_delete_gpu_context(ce_ctx);
+                        --ce_app->ctx_count;
+                        break;
+                }
+        }
+
+        mutex_unlock(&ce_app->app_mutex);
+}
+EXPORT_SYMBOL(gk20a_ce_delete_context);
+
+#ifdef CONFIG_DEBUG_FS
+void gk20a_ce_debugfs_init(struct device *dev)
+{
+        struct gk20a_platform *platform = dev_get_drvdata(dev);
+        struct gk20a *g = get_gk20a(dev);
+
+        debugfs_create_u32("ce_app_ctx_count", S_IWUSR | S_IRUGO,
+                platform->debugfs, &g->ce_app.ctx_count);
+        debugfs_create_u32("ce_app_state", S_IWUSR | S_IRUGO,
+                platform->debugfs, &g->ce_app.app_state);
+        debugfs_create_u32("ce_app_next_ctx_id", S_IWUSR | S_IRUGO,
+                platform->debugfs, &g->ce_app.next_ctx_id);
+}
+#endif