path: root/drivers/gpu/nvgpu/common/linux/channel.c
author	Deepak Nibade <dnibade@nvidia.com>	2017-10-26 11:29:56 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-11-02 08:09:59 -0400
commit	23c7903eff6ee1ab184dfcc62c054de1557e5b1d (patch)
tree	a5122028e181e5c6009f9f8b66bfbf00f69a9290 /drivers/gpu/nvgpu/common/linux/channel.c
parent	5f8cfaa250f08499f587da0097f6accaa5eedf15 (diff)
gpu: nvgpu: move submit path to linux
Nvgpu's submit path has a lot of dependencies on the Linux framework, e.g. use of copy_from_user(), use of structures defined in the uapi/nvgpu headers, dma_buf_*() calls for trace support, etc. Hence, to keep common code independent of Linux code, move the submit path to the Linux directory.

Move the below APIs to common/linux/channel.c:
  trace_write_pushbuffer()
  trace_write_pushbuffer_range()
  gk20a_submit_prepare_syncs()
  gk20a_submit_append_priv_cmdbuf()
  gk20a_submit_append_gpfifo()
  gk20a_submit_channel_gpfifo()

Move the below API to common/linux/ce2.c:
  gk20a_ce_execute_ops()

Define gk20a_ce_execute_ops() in common/linux/ce2.c and declare it in gk20a/ce2_gk20a.h, since it is needed in common/mm code too. Each OS needs to implement this API separately.

gk20a_channel_alloc_gpfifo() uses sizeof(struct nvgpu_gpfifo) to get the size of one gpfifo entry, but struct nvgpu_gpfifo is Linux-specific. Define a new nvgpu_get_gpfifo_entry_size() in Linux-specific code and use it in gk20a_channel_alloc_gpfifo() to get the gpfifo entry size. Each OS needs to implement this API separately.

Export some APIs from gk20a/ce2_gk20a.h and gk20a/channel_gk20a.h that are needed in Linux code.

Jira NVGPU-259
Jira NVGPU-313

Change-Id: I360c6cb8ce4494b1e50c66af334a2a379f0d2dc4
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1586277
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
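As a side note on the nvgpu_get_gpfifo_entry_size() hook described above (its Linux implementation appears at the top of the new channel.c below): it lets common code size gpfifo allocations without referencing the Linux-only struct nvgpu_gpfifo. The following is a minimal, hypothetical sketch of the intended call pattern; example_ring_bytes() is not part of this patch, and the u32 type is assumed from the driver's nvgpu/types.h:

/* Per-OS hook: returns the size in bytes of one gpfifo entry. */
u32 nvgpu_get_gpfifo_entry_size(void);

/*
 * Hypothetical helper showing how OS-agnostic code could compute the size
 * of a gpfifo ring without touching the Linux-specific struct nvgpu_gpfifo.
 */
static u32 example_ring_bytes(u32 num_entries)
{
	return num_entries * nvgpu_get_gpfifo_entry_size();
}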
Diffstat (limited to 'drivers/gpu/nvgpu/common/linux/channel.c')
-rw-r--r--	drivers/gpu/nvgpu/common/linux/channel.c	648
1 file changed, 648 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/common/linux/channel.c b/drivers/gpu/nvgpu/common/linux/channel.c
new file mode 100644
index 00000000..716c5820
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/channel.c
@@ -0,0 +1,648 @@
1/*
2 * Copyright (c) 2017, NVIDIA Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#include <nvgpu/enabled.h>
18#include <nvgpu/debug.h>
19#include <nvgpu/ltc.h>
20
21/*
22 * This is required for nvgpu_vm_find_buf() which is used in the tracing
23 * code. Once we can get and access userspace buffers without requiring
24 * direct dma_buf usage this can be removed.
25 */
26#include <nvgpu/linux/vm.h>
27
28#include "gk20a/gk20a.h"
29
30#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
31
32#include <linux/uaccess.h>
33#include <linux/dma-buf.h>
34#include <trace/events/gk20a.h>
35
36u32 nvgpu_get_gpfifo_entry_size(void)
37{
38 return sizeof(struct nvgpu_gpfifo);
39}
40
41#ifdef CONFIG_DEBUG_FS
42static void trace_write_pushbuffer(struct channel_gk20a *c,
43 struct nvgpu_gpfifo *g)
44{
45 void *mem = NULL;
46 unsigned int words;
47 u64 offset;
48 struct dma_buf *dmabuf = NULL;
49
50 if (gk20a_debug_trace_cmdbuf) {
51 u64 gpu_va = (u64)g->entry0 |
52 (u64)((u64)pbdma_gp_entry1_get_hi_v(g->entry1) << 32);
53 int err;
54
55 words = pbdma_gp_entry1_length_v(g->entry1);
56 err = nvgpu_vm_find_buf(c->vm, gpu_va, &dmabuf, &offset);
57 if (!err)
58 mem = dma_buf_vmap(dmabuf);
59 }
60
61 if (mem) {
62 u32 i;
63 /*
64 * Write in batches of 128 as there seems to be a limit
65 * of how much you can output to ftrace at once.
66 */
67 for (i = 0; i < words; i += 128U) {
68 trace_gk20a_push_cmdbuf(
69 c->g->name,
70 0,
71 min(words - i, 128U),
72 offset + i * sizeof(u32),
73 mem);
74 }
75 dma_buf_vunmap(dmabuf, mem);
76 }
77}
78#endif
79
80static void trace_write_pushbuffer_range(struct channel_gk20a *c,
81 struct nvgpu_gpfifo *g,
82 struct nvgpu_gpfifo __user *user_gpfifo,
83 int offset,
84 int count)
85{
86#ifdef CONFIG_DEBUG_FS
87 u32 size;
88 int i;
89 struct nvgpu_gpfifo *gp;
90 bool gpfifo_allocated = false;
91
92 if (!gk20a_debug_trace_cmdbuf)
93 return;
94
95 if (!g && !user_gpfifo)
96 return;
97
98 if (!g) {
99 size = count * sizeof(struct nvgpu_gpfifo);
100 if (size) {
101 g = nvgpu_big_malloc(c->g, size);
102 if (!g)
103 return;
104
105 if (copy_from_user(g, user_gpfifo, size)) {
106 nvgpu_big_free(c->g, g);
107 return;
108 }
109 }
110 gpfifo_allocated = true;
111 }
112
113 gp = g + offset;
114 for (i = 0; i < count; i++, gp++)
115 trace_write_pushbuffer(c, gp);
116
117 if (gpfifo_allocated)
118 nvgpu_big_free(c->g, g);
119#endif
120}
121
122/*
123 * Handle the submit synchronization - pre-fences and post-fences.
124 */
125static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
126 struct nvgpu_fence *fence,
127 struct channel_gk20a_job *job,
128 struct priv_cmd_entry **wait_cmd,
129 struct priv_cmd_entry **incr_cmd,
130 struct gk20a_fence **pre_fence,
131 struct gk20a_fence **post_fence,
132 bool force_need_sync_fence,
133 bool register_irq,
134 u32 flags)
135{
136 struct gk20a *g = c->g;
137 bool need_sync_fence = false;
138 bool new_sync_created = false;
139 int wait_fence_fd = -1;
140 int err = 0;
141 bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI);
142 bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
143
144 /*
145 * If user wants to always allocate sync_fence_fds then respect that;
146 * otherwise, allocate sync_fence_fd based on user flags.
147 */
148 if (force_need_sync_fence)
149 need_sync_fence = true;
150
151 if (g->aggressive_sync_destroy_thresh) {
152 nvgpu_mutex_acquire(&c->sync_lock);
153 if (!c->sync) {
154 c->sync = gk20a_channel_sync_create(c);
155 if (!c->sync) {
156 err = -ENOMEM;
157 nvgpu_mutex_release(&c->sync_lock);
158 goto fail;
159 }
160 new_sync_created = true;
161 }
162 nvgpu_atomic_inc(&c->sync->refcount);
163 nvgpu_mutex_release(&c->sync_lock);
164 }
165
166 if (g->ops.fifo.resetup_ramfc && new_sync_created) {
167 err = g->ops.fifo.resetup_ramfc(c);
168 if (err)
169 goto fail;
170 }
171
172 /*
173 * Optionally insert syncpt wait in the beginning of gpfifo submission
174 * when user requested and the wait hasn't expired. Validate that the id
175 * makes sense, elide if not. The only reason this isn't being
176 * unceremoniously killed is to keep running some tests which trigger
177 * this condition.
178 */
179 if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
180 job->pre_fence = gk20a_alloc_fence(c);
181 if (!job->pre_fence) {
182 err = -ENOMEM;
183 goto fail;
184 }
185
186 if (!pre_alloc_enabled)
187 job->wait_cmd = nvgpu_kzalloc(g,
188 sizeof(struct priv_cmd_entry));
189
190 if (!job->wait_cmd) {
191 err = -ENOMEM;
192 goto clean_up_pre_fence;
193 }
194
195 if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) {
196 wait_fence_fd = fence->id;
197 err = c->sync->wait_fd(c->sync, wait_fence_fd,
198 job->wait_cmd, job->pre_fence);
199 } else {
200 err = c->sync->wait_syncpt(c->sync, fence->id,
201 fence->value, job->wait_cmd,
202 job->pre_fence);
203 }
204
205 if (!err) {
206 if (job->wait_cmd->valid)
207 *wait_cmd = job->wait_cmd;
208 *pre_fence = job->pre_fence;
209 } else
210 goto clean_up_wait_cmd;
211 }
212
213 if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) &&
214 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE))
215 need_sync_fence = true;
216
217 /*
218 * Always generate an increment at the end of a GPFIFO submission. This
219 * is used to keep track of method completion for idle railgating. The
220 * sync_pt/semaphore PB is added to the GPFIFO later on in submit.
221 */
222 job->post_fence = gk20a_alloc_fence(c);
223 if (!job->post_fence) {
224 err = -ENOMEM;
225 goto clean_up_wait_cmd;
226 }
227 if (!pre_alloc_enabled)
228 job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry));
229
230 if (!job->incr_cmd) {
231 err = -ENOMEM;
232 goto clean_up_post_fence;
233 }
234
235 if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
236 err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd,
237 job->post_fence, need_wfi, need_sync_fence,
238 register_irq);
239 else
240 err = c->sync->incr(c->sync, job->incr_cmd,
241 job->post_fence, need_sync_fence,
242 register_irq);
243 if (!err) {
244 *incr_cmd = job->incr_cmd;
245 *post_fence = job->post_fence;
246 } else
247 goto clean_up_incr_cmd;
248
249 return 0;
250
251clean_up_incr_cmd:
252 free_priv_cmdbuf(c, job->incr_cmd);
253 if (!pre_alloc_enabled)
254 job->incr_cmd = NULL;
255clean_up_post_fence:
256 gk20a_fence_put(job->post_fence);
257 job->post_fence = NULL;
258clean_up_wait_cmd:
259 free_priv_cmdbuf(c, job->wait_cmd);
260 if (!pre_alloc_enabled)
261 job->wait_cmd = NULL;
262clean_up_pre_fence:
263 gk20a_fence_put(job->pre_fence);
264 job->pre_fence = NULL;
265fail:
266 *wait_cmd = NULL;
267 *pre_fence = NULL;
268 return err;
269}
270
271static void gk20a_submit_append_priv_cmdbuf(struct channel_gk20a *c,
272 struct priv_cmd_entry *cmd)
273{
274 struct gk20a *g = c->g;
275 struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
276 struct nvgpu_gpfifo x = {
277 .entry0 = u64_lo32(cmd->gva),
278 .entry1 = u64_hi32(cmd->gva) |
279 pbdma_gp_entry1_length_f(cmd->size)
280 };
281
282 nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x),
283 &x, sizeof(x));
284
285 if (cmd->mem->aperture == APERTURE_SYSMEM)
286 trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0,
287 cmd->mem->cpu_va + cmd->off * sizeof(u32));
288
289 c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
290}
291
292/*
293 * Copy source gpfifo entries into the gpfifo ring buffer, potentially
294 * splitting into two memcpys to handle wrap-around.
295 */
296static int gk20a_submit_append_gpfifo(struct channel_gk20a *c,
297 struct nvgpu_gpfifo *kern_gpfifo,
298 struct nvgpu_gpfifo __user *user_gpfifo,
299 u32 num_entries)
300{
301 /* byte offsets */
302 u32 gpfifo_size = c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo);
303 u32 len = num_entries * sizeof(struct nvgpu_gpfifo);
304 u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo);
305 u32 end = start + len; /* exclusive */
306 struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
307 struct nvgpu_gpfifo *cpu_src;
308 int err;
309
310 if (user_gpfifo && !c->gpfifo.pipe) {
311 /*
312 * This path (from userspace to sysmem) is special in order to
313 * avoid two copies unnecessarily (from user to pipe, then from
314 * pipe to gpu sysmem buffer).
315 *
316 * As a special case, the pipe buffer exists if PRAMIN writes
317 * are forced, although the buffers may not be in vidmem in
318 * that case.
319 */
320 if (end > gpfifo_size) {
321 /* wrap-around */
322 int length0 = gpfifo_size - start;
323 int length1 = len - length0;
324 void __user *user2 = (u8 __user *)user_gpfifo + length0;
325
326 err = copy_from_user(gpfifo_mem->cpu_va + start,
327 user_gpfifo, length0);
328 if (err)
329 return err;
330
331 err = copy_from_user(gpfifo_mem->cpu_va,
332 user2, length1);
333 if (err)
334 return err;
335 } else {
336 err = copy_from_user(gpfifo_mem->cpu_va + start,
337 user_gpfifo, len);
338 if (err)
339 return err;
340 }
341
342 trace_write_pushbuffer_range(c, NULL, user_gpfifo,
343 0, num_entries);
344 goto out;
345 } else if (user_gpfifo) {
346 /* from userspace to vidmem or sysmem when pramin forced, use
347 * the common copy path below */
348 err = copy_from_user(c->gpfifo.pipe, user_gpfifo, len);
349 if (err)
350 return err;
351
352 cpu_src = c->gpfifo.pipe;
353 } else {
354 /* from kernel to either sysmem or vidmem, don't need
355 * copy_from_user so use the common path below */
356 cpu_src = kern_gpfifo;
357 }
358
359 if (end > gpfifo_size) {
360 /* wrap-around */
361 int length0 = gpfifo_size - start;
362 int length1 = len - length0;
363 void *src2 = (u8 *)cpu_src + length0;
364
365 nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, length0);
366 nvgpu_mem_wr_n(c->g, gpfifo_mem, 0, src2, length1);
367 } else {
368 nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, len);
369
370 }
371
372 trace_write_pushbuffer_range(c, cpu_src, NULL, 0, num_entries);
373
374out:
375 c->gpfifo.put = (c->gpfifo.put + num_entries) &
376 (c->gpfifo.entry_num - 1);
377
378 return 0;
379}
380
381int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
382 struct nvgpu_gpfifo *gpfifo,
383 struct nvgpu_submit_gpfifo_args *args,
384 u32 num_entries,
385 u32 flags,
386 struct nvgpu_fence *fence,
387 struct gk20a_fence **fence_out,
388 bool force_need_sync_fence,
389 struct fifo_profile_gk20a *profile)
390{
391 struct gk20a *g = c->g;
392 struct priv_cmd_entry *wait_cmd = NULL;
393 struct priv_cmd_entry *incr_cmd = NULL;
394 struct gk20a_fence *pre_fence = NULL;
395 struct gk20a_fence *post_fence = NULL;
396 struct channel_gk20a_job *job = NULL;
397 /* we might need two extra gpfifo entries - one for pre fence
398 * and one for post fence. */
399 const int extra_entries = 2;
400 bool skip_buffer_refcounting = (flags &
401 NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING);
402 int err = 0;
403 bool need_job_tracking;
404 bool need_deferred_cleanup = false;
405 struct nvgpu_gpfifo __user *user_gpfifo = args ?
406 (struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo : NULL;
407
408 if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING))
409 return -ENODEV;
410
411 if (c->has_timedout)
412 return -ETIMEDOUT;
413
414 if (!nvgpu_mem_is_valid(&c->gpfifo.mem))
415 return -ENOMEM;
416
417 /* fifo not large enough for request. Return error immediately.
418 * Kernel can insert gpfifo entries before and after user gpfifos.
419 * So, add extra_entries in user request. Also, HW with fifo size N
420	 * can accept only N-1 entries, hence the check below */
421 if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) {
422 nvgpu_err(g, "not enough gpfifo space allocated");
423 return -ENOMEM;
424 }
425
426 if (!gpfifo && !args)
427 return -EINVAL;
428
429 if ((flags & (NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT |
430 NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) &&
431 !fence)
432 return -EINVAL;
433
434 /* an address space needs to have been bound at this point. */
435 if (!gk20a_channel_as_bound(c)) {
436 nvgpu_err(g,
437 "not bound to an address space at time of gpfifo"
438 " submission.");
439 return -EINVAL;
440 }
441
442 if (profile)
443 profile->timestamp[PROFILE_ENTRY] = sched_clock();
444
445 /* update debug settings */
446 nvgpu_ltc_sync_enabled(g);
447
448 gk20a_dbg_info("channel %d", c->chid);
449
450 /*
451 * Job tracking is necessary for any of the following conditions:
452 * - pre- or post-fence functionality
453 * - channel wdt
454 * - GPU rail-gating with non-deterministic channels
455 * - buffer refcounting
456 *
457 * If none of the conditions are met, then job tracking is not
458 * required and a fast submit can be done (ie. only need to write
459 * out userspace GPFIFO entries and update GP_PUT).
460 */
461 need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
462 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
463 c->wdt_enabled ||
464 (g->can_railgate && !c->deterministic) ||
465 !skip_buffer_refcounting;
466
467 if (need_job_tracking) {
468 bool need_sync_framework = false;
469
470 /*
471 * If the channel is to have deterministic latency and
472 * job tracking is required, the channel must have
473 * pre-allocated resources. Otherwise, we fail the submit here
474 */
475 if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c))
476 return -EINVAL;
477
478 need_sync_framework = force_need_sync_fence ||
479 gk20a_channel_sync_needs_sync_framework(g) ||
480 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE &&
481 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT ||
482 flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET));
483
484 /*
485 * Deferred clean-up is necessary for any of the following
486 * conditions:
487 * - channel's deterministic flag is not set
488 * - dependency on sync framework, which could make the
489 * behavior of the clean-up operation non-deterministic
490 * (should not be performed in the submit path)
491 * - channel wdt
492 * - GPU rail-gating with non-deterministic channels
493 * - buffer refcounting
494 *
495 * If none of the conditions are met, then deferred clean-up
496 * is not required, and we clean-up one job-tracking
497 * resource in the submit path.
498 */
499 need_deferred_cleanup = !c->deterministic ||
500 need_sync_framework ||
501 c->wdt_enabled ||
502 (g->can_railgate &&
503 !c->deterministic) ||
504 !skip_buffer_refcounting;
505
506 /*
507 * For deterministic channels, we don't allow deferred clean_up
508 * processing to occur. In cases we hit this, we fail the submit
509 */
510 if (c->deterministic && need_deferred_cleanup)
511 return -EINVAL;
512
513 if (!c->deterministic) {
514 /*
515 * Get a power ref unless this is a deterministic
516 * channel that holds them during the channel lifetime.
517 * This one is released by gk20a_channel_clean_up_jobs,
518 * via syncpt or sema interrupt, whichever is used.
519 */
520 err = gk20a_busy(g);
521 if (err) {
522 nvgpu_err(g,
523 "failed to host gk20a to submit gpfifo, process %s",
524 current->comm);
525 return err;
526 }
527 }
528
529 if (!need_deferred_cleanup) {
530 /* clean up a single job */
531 gk20a_channel_clean_up_jobs(c, false);
532 }
533 }
534
535
536 /* Grab access to HW to deal with do_idle */
537 if (c->deterministic)
538 nvgpu_rwsem_down_read(&g->deterministic_busy);
539
540 trace_gk20a_channel_submit_gpfifo(g->name,
541 c->chid,
542 num_entries,
543 flags,
544 fence ? fence->id : 0,
545 fence ? fence->value : 0);
546
547 gk20a_dbg_info("pre-submit put %d, get %d, size %d",
548 c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
549
550 /*
551 * Make sure we have enough space for gpfifo entries. Check cached
552 * values first and then read from HW. If no space, return EAGAIN
553	 * and let userspace decide whether to re-try the request or not.
554 */
555 if (nvgpu_gp_free_count(c) < num_entries + extra_entries) {
556 if (nvgpu_get_gp_free_count(c) < num_entries + extra_entries) {
557 err = -EAGAIN;
558 goto clean_up;
559 }
560 }
561
562 if (c->has_timedout) {
563 err = -ETIMEDOUT;
564 goto clean_up;
565 }
566
567 if (need_job_tracking) {
568 err = channel_gk20a_alloc_job(c, &job);
569 if (err)
570 goto clean_up;
571
572 err = gk20a_submit_prepare_syncs(c, fence, job,
573 &wait_cmd, &incr_cmd,
574 &pre_fence, &post_fence,
575 force_need_sync_fence,
576 need_deferred_cleanup,
577 flags);
578 if (err)
579 goto clean_up_job;
580 }
581
582 if (profile)
583 profile->timestamp[PROFILE_JOB_TRACKING] = sched_clock();
584
585 if (wait_cmd)
586 gk20a_submit_append_priv_cmdbuf(c, wait_cmd);
587
588 if (gpfifo || user_gpfifo)
589 err = gk20a_submit_append_gpfifo(c, gpfifo, user_gpfifo,
590 num_entries);
591 if (err)
592 goto clean_up_job;
593
594 /*
595 * And here's where we add the incr_cmd we generated earlier. It should
596 * always run!
597 */
598 if (incr_cmd)
599 gk20a_submit_append_priv_cmdbuf(c, incr_cmd);
600
601 if (fence_out)
602 *fence_out = gk20a_fence_get(post_fence);
603
604 if (need_job_tracking)
605 /* TODO! Check for errors... */
606 gk20a_channel_add_job(c, job, skip_buffer_refcounting);
607 if (profile)
608 profile->timestamp[PROFILE_APPEND] = sched_clock();
609
610 g->ops.fifo.userd_gp_put(g, c);
611
612 if ((NVGPU_SUBMIT_GPFIFO_FLAGS_RESCHEDULE_RUNLIST & flags) &&
613 g->ops.fifo.reschedule_runlist)
614 g->ops.fifo.reschedule_runlist(g, c->runlist_id);
615
616 /* No hw access beyond this point */
617 if (c->deterministic)
618 nvgpu_rwsem_up_read(&g->deterministic_busy);
619
620 trace_gk20a_channel_submitted_gpfifo(g->name,
621 c->chid,
622 num_entries,
623 flags,
624 post_fence ? post_fence->syncpt_id : 0,
625 post_fence ? post_fence->syncpt_value : 0);
626
627 gk20a_dbg_info("post-submit put %d, get %d, size %d",
628 c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
629
630 if (profile)
631 profile->timestamp[PROFILE_END] = sched_clock();
632 gk20a_dbg_fn("done");
633 return err;
634
635clean_up_job:
636 channel_gk20a_free_job(c, job);
637clean_up:
638 gk20a_dbg_fn("fail");
639 gk20a_fence_put(pre_fence);
640 gk20a_fence_put(post_fence);
641 if (c->deterministic)
642 nvgpu_rwsem_up_read(&g->deterministic_busy);
643 else if (need_deferred_cleanup)
644 gk20a_idle(g);
645
646 return err;
647}
648
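A closing note on the ring arithmetic in gk20a_submit_append_gpfifo() and gk20a_submit_append_priv_cmdbuf() above: c->gpfifo.put is advanced with (put + n) & (entry_num - 1), which only behaves as a modulo when entry_num is a power of two, and a copy that would run past the end of the ring is split in two. The standalone sketch below illustrates that pattern; the names ring_append, ring, entry_num, put and the 32-bit-word payload are hypothetical, not taken from the driver:

#include <stdint.h>
#include <string.h>

/*
 * Append count 32-bit words to a ring of entry_num slots (entry_num must be
 * a power of two), splitting the copy at the wrap point and advancing put
 * with a mask, mirroring how the submit path updates c->gpfifo.put.
 */
static uint32_t ring_append(uint32_t *ring, uint32_t entry_num, uint32_t put,
			    const uint32_t *src, uint32_t count)
{
	uint32_t space_to_end = entry_num - put;

	if (count > space_to_end) {
		/* wrap-around: copy up to the end, then continue at slot 0 */
		memcpy(&ring[put], src, space_to_end * sizeof(*ring));
		memcpy(&ring[0], src + space_to_end,
		       (count - space_to_end) * sizeof(*ring));
	} else {
		memcpy(&ring[put], src, count * sizeof(*ring));
	}

	/* the mask acts as a cheap modulo because entry_num is a power of two */
	return (put + count) & (entry_num - 1);
}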