author    Lakshmanan M <lm@nvidia.com>            2016-06-29 06:36:39 -0400
committer Vijayakumar Subbu <vsubbu@nvidia.com>   2016-07-20 06:09:28 -0400
commit    89aecd1202b49727e940069f2a6feb5c3cf4c927 (patch)
tree      8a0d3a493b389167ce1d93e55f23e114ec2cbd38 /drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
parent    f6ebdc5f2916706f7a61983567420e0985faeeb1 (diff)
gpu: nvgpu: Add nvgpu infra to allow kernel to create privileged CE channels
Added interface to allow the kernel to create privileged CE channels for
page migration and clearing support between sysmem and vidmem.

JIRA DNVGPU-53

Change-Id: I3e18d18403809c9e64fa45d40b6c4e3844992506
Signed-off-by: Lakshmanan M <lm@nvidia.com>
Reviewed-on: http://git-master/r/1173085
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/ce2_gk20a.c')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ce2_gk20a.c  617
1 file changed, 617 insertions, 0 deletions
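
For readers coming to this patch from elsewhere in the kernel, the sketch below illustrates how an in-kernel client might drive the new CE interface exported here (gk20a_ce_create_context_with_cb, gk20a_ce_execute_ops, gk20a_ce_delete_context). It is a hypothetical usage sketch, not part of the commit: the callback signature is inferred from the call sites in the diff, the include paths, runlist id, payload and buffer address are placeholders, and error handling is trimmed to the essentials.

```c
/*
 * Hypothetical in-kernel user of the new CE interface (not part of this
 * commit). Assumes ce2_gk20a.h declares the symbols used below and that
 * the callback type matches the (ctx_id, event) call sites in the patch.
 */
#include <linux/types.h>
#include <linux/printk.h>
#include <linux/device.h>

#include "gk20a.h"
#include "ce2_gk20a.h"

static void my_ce_event_cb(u32 ce_ctx_id, u32 event)
{
	/* e.g. complete a waiter on JOB_COMPLETED, tear down on JOB_TIMEDOUT */
	pr_info("ce ctx %u: event 0x%x\n", ce_ctx_id, event);
}

static int my_clear_buffer(struct device *dev, u64 dst_gpu_va, u64 size)
{
	struct gk20a *g = gk20a_from_dev(dev);
	struct gk20a_fence *fence_out = NULL;
	u32 ctx_id;
	int err;

	/* 0 = assumed CE runlist id; -1 keeps default priority/timeslice/level */
	ctx_id = gk20a_ce_create_context_with_cb(dev, 0, -1, -1, -1,
						 my_ce_event_cb);
	if (ctx_id == (u32)~0)
		return -ENOMEM;

	/* memset dst_gpu_va (assumed already mapped for CE) with a zero payload */
	err = gk20a_ce_execute_ops(dev, ctx_id,
				   0 /* src unused for memset */,
				   dst_gpu_va, size,
				   0x00000000,
				   NVGPU_CE_DST_LOCATION_LOCAL_FB,
				   NVGPU_CE_MEMSET,
				   NULL /* no pre-fence */,
				   0, &fence_out);
	if (!err && fence_out) {
		/* wait on the post-fence the CE app always requests */
		err = gk20a_fence_wait(fence_out, gk20a_get_gr_idle_timeout(g));
		gk20a_fence_put(fence_out);
	}

	gk20a_ce_delete_context(dev, ctx_id);
	return err;
}
```

A longer-lived client would keep the context around and reuse it across submits; creating and deleting a context per operation is shown here only to keep the sketch compact.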
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index 96d38b11..e2f2d9e9 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -24,6 +24,7 @@
 #include <trace/events/gk20a.h>
 #include <linux/dma-mapping.h>
 #include <linux/nvhost.h>
+#include <linux/debugfs.h>
 
 #include "gk20a.h"
 #include "debug_gk20a.h"
@@ -96,3 +97,619 @@ void gk20a_init_ce2(struct gpu_ops *gops)
 	gops->ce2.isr_stall = gk20a_ce2_isr;
 	gops->ce2.isr_nonstall = gk20a_ce2_nonstall_isr;
 }
+
+/* static CE app api */
+static void gk20a_ce_notify_all_user(struct gk20a *g, u32 event)
+{
+	struct gk20a_ce_app *ce_app = &g->ce_app;
+	struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
+
+	if (!ce_app->initialised)
+		return;
+
+	mutex_lock(&ce_app->app_mutex);
+
+	list_for_each_entry_safe(ce_ctx, ce_ctx_save,
+			&ce_app->allocated_contexts, list) {
+		if (ce_ctx->user_event_callback) {
+			ce_ctx->user_event_callback(ce_ctx->ctx_id,
+				event);
+		}
+	}
+
+	mutex_unlock(&ce_app->app_mutex);
+}
+
+static void gk20a_ce_finished_ctx_cb(struct channel_gk20a *ch, void *data)
+{
+	struct gk20a_gpu_ctx *ce_ctx = data;
+	bool channel_idle;
+	u32 event;
+
+	mutex_lock(&ch->jobs_lock);
+	channel_idle = list_empty(&ch->jobs);
+	mutex_unlock(&ch->jobs_lock);
+
+	if (!channel_idle)
+		return;
+
+	gk20a_dbg(gpu_dbg_fn, "ce: finished %p", ce_ctx);
+
+	if (ch->has_timedout)
+		event = NVGPU_CE_CONTEXT_JOB_TIMEDOUT;
+	else
+		event = NVGPU_CE_CONTEXT_JOB_COMPLETED;
+
+	if (ce_ctx->user_event_callback)
+		ce_ctx->user_event_callback(ce_ctx->ctx_id,
+			event);
+
+	++ce_ctx->completed_seq_number;
+}
+
+static void gk20a_ce_free_command_buffer_stored_fence(struct gk20a_gpu_ctx *ce_ctx)
+{
+	u32 cmd_buf_index;
+	u32 cmd_buf_read_offset;
+	u32 fence_index;
+	u32 *cmd_buf_cpu_va;
+
+	for (cmd_buf_index = 0;
+		cmd_buf_index < ce_ctx->cmd_buf_end_queue_offset;
+		cmd_buf_index++) {
+		cmd_buf_read_offset = (cmd_buf_index *
+			(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
+
+		/* at end of command buffer has gk20a_fence for command buffer sync */
+		fence_index = (cmd_buf_read_offset +
+			((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
+			(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
+
+		cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
+
+		/* 0 is treated as invalid pre-sync */
+		if (cmd_buf_cpu_va[fence_index]) {
+			struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
+
+			memcpy((void *)&ce_cmd_buf_fence_in,
+				(void *)(cmd_buf_cpu_va + fence_index),
+				sizeof(struct gk20a_fence *));
+			gk20a_fence_put(ce_cmd_buf_fence_in);
+			/* Reset the stored last pre-sync */
+			memset((void *)(cmd_buf_cpu_va + fence_index),
+				0,
+				NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
+		}
+	}
+}
+
+/* assume this api should need to call under mutex_lock(&ce_app->app_mutex) */
+static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
+{
+	ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED;
+
+	mutex_lock(&ce_ctx->gpu_ctx_mutex);
+
+	gk20a_ce_free_command_buffer_stored_fence(ce_ctx);
+
+	gk20a_gmmu_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
+
+	/* free the channel */
+	if (ce_ctx->ch)
+		gk20a_channel_close(ce_ctx->ch);
+
+	/* housekeeping on app */
+	list_del(&ce_ctx->list);
+
+	mutex_unlock(&ce_ctx->gpu_ctx_mutex);
+	mutex_destroy(&ce_ctx->gpu_ctx_mutex);
+
+	kfree(ce_ctx);
+}
+
+static inline int gk20a_ce_get_method_size(int request_operation)
+{
+	/* failure size */
+	int methodsize = ~0;
+
+	if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER)
+		methodsize = 10 * 2 * sizeof(u32);
+	else if (request_operation & NVGPU_CE_MEMSET)
+		methodsize = 9 * 2 * sizeof(u32);
+
+	return methodsize;
+}
+
+static inline int gk20a_get_valid_launch_flags(struct gk20a *g, int launch_flags)
+{
+	/* there is no local memory available,
+	   don't allow local memory related CE flags */
+	if (!g->mm.vidmem_size) {
+		launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
+			NVGPU_CE_DST_LOCATION_LOCAL_FB);
+	}
+	return launch_flags;
+}
+
+static int gk20a_ce_prepare_submit(u64 src_buf,
+		u64 dst_buf,
+		u64 size,
+		u32 *cmd_buf_cpu_va,
+		u32 max_cmd_buf_size,
+		unsigned int payload,
+		int launch_flags,
+		int request_operation,
+		u32 dma_copy_class,
+		struct gk20a_fence *gk20a_fence_in)
+{
+	u32 launch = 0;
+	u32 methodSize = 0;
+
+	/* failure case handling */
+	if ((gk20a_ce_get_method_size(request_operation) > max_cmd_buf_size) ||
+		(!size) ||
+		(request_operation > NVGPU_CE_MEMSET))
+		return 0;
+
+	/* set the channel object */
+	cmd_buf_cpu_va[methodSize++] = 0x20018000;
+	cmd_buf_cpu_va[methodSize++] = dma_copy_class;
+
+	if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) {
+		/* setup the source */
+		cmd_buf_cpu_va[methodSize++] = 0x20018101;
+		cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf) &
+			NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
+
+		cmd_buf_cpu_va[methodSize++] = 0x20018100;
+		cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf) &
+			NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
+
+		cmd_buf_cpu_va[methodSize++] = 0x20018098;
+		if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) {
+			cmd_buf_cpu_va[methodSize++] = 0x00000000;
+		} else if (launch_flags & NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) {
+			cmd_buf_cpu_va[methodSize++] = 0x00000002;
+		} else {
+			cmd_buf_cpu_va[methodSize++] = 0x00000001;
+		}
+
+		launch |= 0x00001000;
+	} else if (request_operation & NVGPU_CE_MEMSET) {
+		cmd_buf_cpu_va[methodSize++] = 0x200181c2;
+		cmd_buf_cpu_va[methodSize++] = 0x00030004;
+
+		cmd_buf_cpu_va[methodSize++] = 0x200181c0;
+		cmd_buf_cpu_va[methodSize++] = payload;
+
+		launch |= 0x00000400;
+
+		/* converted into number of words */
+		size /= sizeof(u32);
+	}
+
+	/* setup the destination/output */
+	cmd_buf_cpu_va[methodSize++] = 0x20018103;
+	cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
+
+	cmd_buf_cpu_va[methodSize++] = 0x20018102;
+	cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
+
+	cmd_buf_cpu_va[methodSize++] = 0x20018099;
+	if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) {
+		cmd_buf_cpu_va[methodSize++] = 0x00000000;
+	} else if (launch_flags & NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) {
+		cmd_buf_cpu_va[methodSize++] = 0x00000002;
+	} else {
+		cmd_buf_cpu_va[methodSize++] = 0x00000001;
+	}
+
+	launch |= 0x00002000;
+
+	/* setup the format */
+	cmd_buf_cpu_va[methodSize++] = 0x20018107;
+	cmd_buf_cpu_va[methodSize++] = 1;
+	cmd_buf_cpu_va[methodSize++] = 0x20018106;
+	cmd_buf_cpu_va[methodSize++] = u64_lo32(size);
+
+	launch |= 0x00000004;
+
+	if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR)
+		launch |= 0x00000000;
+	else
+		launch |= 0x00000080;
+
+	if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR)
+		launch |= 0x00000000;
+	else
+		launch |= 0x00000100;
+
+	if (launch_flags & NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED)
+		launch |= 0x00000002;
+	else
+		launch |= 0x00000001;
+
+	cmd_buf_cpu_va[methodSize++] = 0x200180c0;
+	cmd_buf_cpu_va[methodSize++] = launch;
+
+	return methodSize;
+}
+
+/* global CE app related apis */
+int gk20a_init_ce_support(struct gk20a *g)
+{
+	struct gk20a_ce_app *ce_app = &g->ce_app;
+
+	if (ce_app->initialised) {
+		/* assume this happen during poweron/poweroff GPU sequence */
+		ce_app->app_state = NVGPU_CE_ACTIVE;
+		gk20a_ce_notify_all_user(g, NVGPU_CE_CONTEXT_RESUME);
+		return 0;
+	}
+
+	gk20a_dbg(gpu_dbg_fn, "ce: init");
+
+	mutex_init(&ce_app->app_mutex);
+	mutex_lock(&ce_app->app_mutex);
+
+	INIT_LIST_HEAD(&ce_app->allocated_contexts);
+	ce_app->ctx_count = 0;
+	ce_app->next_ctx_id = 0;
+	ce_app->initialised = true;
+	ce_app->app_state = NVGPU_CE_ACTIVE;
+
+	mutex_unlock(&ce_app->app_mutex);
+	gk20a_dbg(gpu_dbg_cde_ctx, "ce: init finished");
+
+	return 0;
+}
+
+void gk20a_ce_destroy(struct gk20a *g)
+{
+	struct gk20a_ce_app *ce_app = &g->ce_app;
+	struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
+
+	if (!ce_app->initialised)
+		return;
+
+	ce_app->app_state = NVGPU_CE_SUSPEND;
+	ce_app->initialised = false;
+
+	mutex_lock(&ce_app->app_mutex);
+
+	list_for_each_entry_safe(ce_ctx, ce_ctx_save,
+			&ce_app->allocated_contexts, list) {
+		gk20a_ce_delete_gpu_context(ce_ctx);
+	}
+
+	INIT_LIST_HEAD(&ce_app->allocated_contexts);
+	ce_app->ctx_count = 0;
+	ce_app->next_ctx_id = 0;
+
+	mutex_unlock(&ce_app->app_mutex);
+	mutex_destroy(&ce_app->app_mutex);
+}
+
+void gk20a_ce_suspend(struct gk20a *g)
+{
+	struct gk20a_ce_app *ce_app = &g->ce_app;
+
+	if (!ce_app->initialised)
+		return;
+
+	ce_app->app_state = NVGPU_CE_SUSPEND;
+	gk20a_ce_notify_all_user(g, NVGPU_CE_CONTEXT_SUSPEND);
+
+	return;
+}
+
+/* CE app utility functions */
+u32 gk20a_ce_create_context_with_cb(struct device *dev,
+		int runlist_id,
+		int priority,
+		int timeslice,
+		int runlist_level,
+		ce_event_callback user_event_callback)
+{
+	struct gk20a_gpu_ctx *ce_ctx;
+	struct gk20a *g = gk20a_from_dev(dev);
+	struct gk20a_ce_app *ce_app = &g->ce_app;
+	u32 ctx_id = ~0;
+	int err = 0;
+
+	if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE)
+		return ctx_id;
+
+	ce_ctx = kzalloc(sizeof(*ce_ctx), GFP_KERNEL);
+	if (!ce_ctx)
+		return ctx_id;
+
+	mutex_init(&ce_ctx->gpu_ctx_mutex);
+
+	ce_ctx->g = g;
+	ce_ctx->dev = g->dev;
+	ce_ctx->user_event_callback = user_event_callback;
+
+	ce_ctx->cmd_buf_read_queue_offset = 0;
+	ce_ctx->cmd_buf_end_queue_offset =
+		(NVGPU_CE_COMMAND_BUF_SIZE / NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF);
+
+	ce_ctx->submitted_seq_number = 0;
+	ce_ctx->completed_seq_number = 0;
+
+	/* always kernel client needs privileged channel */
+	ce_ctx->ch = gk20a_open_new_channel_with_cb(g, gk20a_ce_finished_ctx_cb,
+			ce_ctx,
+			runlist_id,
+			true);
+	if (!ce_ctx->ch) {
+		gk20a_err(ce_ctx->dev, "ce: gk20a channel not available");
+		goto end;
+	}
+
+	/* bind the channel to the vm */
+	gk20a_vm_get(&g->mm.ce.vm);
+	ce_ctx->vm = ce_ctx->ch->vm = &g->mm.ce.vm;
+	err = channel_gk20a_commit_va(ce_ctx->ch);
+	if (err) {
+		gk20a_err(ce_ctx->dev, "ce: could not bind vm");
+		goto end;
+	}
+
+	/* allocate gpfifo (1024 should be more than enough) */
+	err = gk20a_alloc_channel_gpfifo(ce_ctx->ch,
+		&(struct nvgpu_alloc_gpfifo_args){1024, 0});
+	if (err) {
+		gk20a_err(ce_ctx->dev, "ce: unable to allocate gpfifo");
+		goto end;
+	}
+
+	/* allocate command buffer (4096 should be more than enough) from sysmem*/
+	err = gk20a_gmmu_alloc_map_sys(ce_ctx->vm, NVGPU_CE_COMMAND_BUF_SIZE, &ce_ctx->cmd_buf_mem);
+	if (err) {
+		gk20a_err(ce_ctx->dev,
+			"ce: could not allocate command buffer for CE context");
+		goto end;
+	}
+
+	memset(ce_ctx->cmd_buf_mem.cpu_va, 0x00, ce_ctx->cmd_buf_mem.size);
+
+	/* -1 means default channel priority */
+	if (priority != -1) {
+		err = gk20a_channel_set_priority(ce_ctx->ch, priority);
+		if (err) {
+			gk20a_err(ce_ctx->dev,
+				"ce: could not set the channel priority for CE context");
+			goto end;
+		}
+	}
+
+	/* -1 means default channel timeslice value */
+	if (timeslice != -1) {
+		err = gk20a_channel_set_timeslice(ce_ctx->ch, timeslice);
+		if (err) {
+			gk20a_err(ce_ctx->dev,
+				"ce: could not set the channel timeslice value for CE context");
+			goto end;
+		}
+	}
+
+	/* -1 means default channel runlist level */
+	if (runlist_level != -1) {
+		err = gk20a_channel_set_runlist_interleave(ce_ctx->ch, runlist_level);
+		if (err) {
+			gk20a_err(ce_ctx->dev,
+				"ce: could not set the runlist interleave for CE context");
+			goto end;
+		}
+	}
+
+	mutex_lock(&ce_app->app_mutex);
+	ctx_id = ce_ctx->ctx_id = ce_app->next_ctx_id;
+	list_add(&ce_ctx->list, &ce_app->allocated_contexts);
+	++ce_app->next_ctx_id;
+	++ce_app->ctx_count;
+	mutex_unlock(&ce_app->app_mutex);
+
+	ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_ALLOCATED;
+
+end:
+	if (ctx_id == ~0) {
+		mutex_lock(&ce_app->app_mutex);
+		gk20a_ce_delete_gpu_context(ce_ctx);
+		mutex_unlock(&ce_app->app_mutex);
+	}
+	return ctx_id;
+
+}
+EXPORT_SYMBOL(gk20a_ce_create_context_with_cb);
+
+int gk20a_ce_execute_ops(struct device *dev,
+		u32 ce_ctx_id,
+		u64 src_buf,
+		u64 dst_buf,
+		u64 size,
+		unsigned int payload,
+		int launch_flags,
+		int request_operation,
+		struct gk20a_fence *gk20a_fence_in,
+		u32 submit_flags,
+		struct gk20a_fence **gk20a_fence_out)
+{
+	int ret = -EPERM;
+	struct gk20a *g = gk20a_from_dev(dev);
+	struct gk20a_ce_app *ce_app = &g->ce_app;
+	struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
+	bool found = false;
+	u32 *cmd_buf_cpu_va;
+	u64 cmd_buf_gpu_va = 0;
+	u32 methodSize;
+	u32 cmd_buf_read_offset;
+	u32 fence_index;
+	struct nvgpu_gpfifo gpfifo;
+	struct nvgpu_fence fence = {0,0};
+	struct gk20a_fence *ce_cmd_buf_fence_out = NULL;
+	struct nvgpu_gpu_characteristics *gpu_capability = &g->gpu_characteristics;
+
+	if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE)
+		goto end;
+
+	mutex_lock(&ce_app->app_mutex);
+
+	list_for_each_entry_safe(ce_ctx, ce_ctx_save,
+			&ce_app->allocated_contexts, list) {
+		if (ce_ctx->ctx_id == ce_ctx_id) {
+			found = true;
+			break;
+		}
+	}
+
+	mutex_unlock(&ce_app->app_mutex);
+
+	if (!found) {
+		ret = -EINVAL;
+		goto end;
+	}
+
+	if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
+		ret = -ENODEV;
+		goto end;
+	}
+
+	mutex_lock(&ce_ctx->gpu_ctx_mutex);
+
+	ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset;
+
+	cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
+			(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
+
+	/* at end of command buffer has gk20a_fence for command buffer sync */
+	fence_index = (cmd_buf_read_offset +
+			((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
+			(NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
+
+	if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) {
+		ret = -ENOMEM;
+		goto noop;
+	}
+
+	cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
+
+	/* 0 is treated as invalid pre-sync */
+	if (cmd_buf_cpu_va[fence_index]) {
+		struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
+
+		memcpy((void *)&ce_cmd_buf_fence_in,
+			(void *)(cmd_buf_cpu_va + fence_index),
+			sizeof(struct gk20a_fence *));
+		ret = gk20a_fence_wait(ce_cmd_buf_fence_in, gk20a_get_gr_idle_timeout(g));
+
+		gk20a_fence_put(ce_cmd_buf_fence_in);
+		/* Reset the stored last pre-sync */
+		memset((void *)(cmd_buf_cpu_va + fence_index),
+			0,
+			NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
+		if (ret)
+			goto noop;
+	}
+
+	cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va + (u64)(cmd_buf_read_offset *sizeof(u32)));
+
+	methodSize = gk20a_ce_prepare_submit(src_buf,
+			dst_buf,
+			size,
+			&cmd_buf_cpu_va[cmd_buf_read_offset],
+			NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF,
+			payload,
+			gk20a_get_valid_launch_flags(g, launch_flags),
+			request_operation,
+			gpu_capability->dma_copy_class,
+			gk20a_fence_in);
+
+	if (methodSize) {
+		/* TODO: Remove CPU pre-fence wait */
+		if (gk20a_fence_in) {
+			ret = gk20a_fence_wait(gk20a_fence_in, gk20a_get_gr_idle_timeout(g));
+			gk20a_fence_put(gk20a_fence_in);
+			if (ret)
+				goto noop;
+		}
+
+		/* store the element into gpfifo */
+		gpfifo.entry0 =
+			u64_lo32(cmd_buf_gpu_va);
+		gpfifo.entry1 =
+			(u64_hi32(cmd_buf_gpu_va) |
+			pbdma_gp_entry1_length_f(methodSize));
+
+		/* take always the postfence as it is needed for protecting the ce context */
+		submit_flags |= NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET;
+
+		wmb();
+
+		ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL,
+				1, submit_flags, &fence, &ce_cmd_buf_fence_out, true);
+
+		if (!ret) {
+			memcpy((void *)(cmd_buf_cpu_va + fence_index),
+				(void *)&ce_cmd_buf_fence_out,
+				sizeof(struct gk20a_fence *));
+
+			if (gk20a_fence_out) {
+				gk20a_fence_get(ce_cmd_buf_fence_out);
+				*gk20a_fence_out = ce_cmd_buf_fence_out;
+			}
+
+			/* Next available command buffer queue Index */
+			++ce_ctx->cmd_buf_read_queue_offset;
+			++ce_ctx->submitted_seq_number;
+		}
+	} else
+		ret = -ENOMEM;
+noop:
+	mutex_unlock(&ce_ctx->gpu_ctx_mutex);
+end:
+	return ret;
+}
+EXPORT_SYMBOL(gk20a_ce_execute_ops);
+
+void gk20a_ce_delete_context(struct device *dev,
+		u32 ce_ctx_id)
+{
+	struct gk20a *g = gk20a_from_dev(dev);
+	struct gk20a_ce_app *ce_app = &g->ce_app;
+	struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
+
+	if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE)
+		return;
+
+	mutex_lock(&ce_app->app_mutex);
+
+	list_for_each_entry_safe(ce_ctx, ce_ctx_save,
+			&ce_app->allocated_contexts, list) {
+		if (ce_ctx->ctx_id == ce_ctx_id) {
+			gk20a_ce_delete_gpu_context(ce_ctx);
+			--ce_app->ctx_count;
+			break;
+		}
+	}
+
+	mutex_unlock(&ce_app->app_mutex);
+	return;
+}
+EXPORT_SYMBOL(gk20a_ce_delete_context);
+
+#ifdef CONFIG_DEBUG_FS
+void gk20a_ce_debugfs_init(struct device *dev)
+{
+	struct gk20a_platform *platform = dev_get_drvdata(dev);
+	struct gk20a *g = get_gk20a(dev);
+
+	debugfs_create_u32("ce_app_ctx_count", S_IWUSR | S_IRUGO,
+			platform->debugfs, &g->ce_app.ctx_count);
+	debugfs_create_u32("ce_app_state", S_IWUSR | S_IRUGO,
+			platform->debugfs, &g->ce_app.app_state);
+	debugfs_create_u32("ce_app_next_ctx_id", S_IWUSR | S_IRUGO,
+			platform->debugfs, &g->ce_app.next_ctx_id);
+}
+#endif
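
One detail worth calling out from the patch: gk20a_ce_execute_ops and gk20a_ce_free_command_buffer_stored_fence both rely on the convention that the last NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING bytes of each per-kickoff slot in the command buffer ring hold the struct gk20a_fence pointer of the most recent submit that used that slot. The helper below is a hypothetical restatement of that index arithmetic, not part of the commit; the actual constant values live in ce2_gk20a.h and are not shown in this diff.

```c
/*
 * Hypothetical helper restating the slot/fence layout used by this patch;
 * not part of the commit. Each per-kickoff slot of the command buffer ring
 * ends with a small tracing area that stores a struct gk20a_fence pointer.
 */
#include <linux/types.h>

static inline u32 ce_slot_fence_word_index(u32 slot,
					   u32 per_kickoff_bytes,
					   u32 tracing_bytes)
{
	u32 slot_words = per_kickoff_bytes / sizeof(u32);

	/* word offset of the stored fence pointer for this slot */
	return slot * slot_words + slot_words - tracing_bytes / sizeof(u32);
}
```

With NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF and NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING as the two byte sizes, this reproduces the fence_index computation that appears in both functions above; a stored value of 0 is treated as "no pre-sync fence" for the slot.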