author     Konsta Holtta <kholtta@nvidia.com>  2018-06-25 05:35:42 -0400
committer  mobile promotions <svcmobile_promotions@nvidia.com>  2018-06-27 21:40:16 -0400
commit     7998233b77a343d002b699d5f348bbeb243e16f5 (patch)
tree       aa24afcc414be8fbccf6991804f69946e2b72525 /drivers/gpu/nvgpu/common/fifo/submit.c
parent     2ac6fb4253fa815ed17f09a01141b938c826dac9 (diff)
gpu: nvgpu: move submit code to common
To finish OS unification of the submit path, move the
gk20a_submit_channel_gpfifo* functions to a file that's accessible also
outside Linux code. Also change the prefix of the submit functions from
gk20a_ to nvgpu_.

Jira NVGPU-705
Change-Id: I8ca355d1eb69771fb016c7a21fc7f102ca7967d7
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1760421
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
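For reference, callers see the prefix change roughly as follows (illustrative
sketch only; the old argument list is abbreviated, the new prototypes are the
ones added in the file below):

	/* before: Linux-only entry point */
	err = gk20a_submit_channel_gpfifo(c, ...);

	/* after: common entry points, also usable outside the Linux code */
	err = nvgpu_submit_channel_gpfifo_user(c, userdata, num_entries,
			flags, fence, &fence_out, profile);
	err = nvgpu_submit_channel_gpfifo_kernel(c, gpfifo, num_entries,
			flags, fence, &fence_out);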
Diffstat (limited to 'drivers/gpu/nvgpu/common/fifo/submit.c')
-rw-r--r--  drivers/gpu/nvgpu/common/fifo/submit.c  577
1 files changed, 577 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/common/fifo/submit.c b/drivers/gpu/nvgpu/common/fifo/submit.c
new file mode 100644
index 00000000..daeee608
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/fifo/submit.c
@@ -0,0 +1,577 @@
/*
 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <nvgpu/channel.h>
#include <nvgpu/ltc.h>
#include <nvgpu/os_sched.h>

#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>

#include "gk20a/gk20a.h"
#include "gk20a/channel_gk20a.h"

#include <trace/events/gk20a.h>

/*
 * Handle the submit synchronization - pre-fences and post-fences.
 */
static int nvgpu_submit_prepare_syncs(struct channel_gk20a *c,
				      struct nvgpu_channel_fence *fence,
				      struct channel_gk20a_job *job,
				      struct priv_cmd_entry **wait_cmd,
				      struct priv_cmd_entry **incr_cmd,
				      struct gk20a_fence **post_fence,
				      bool register_irq,
				      u32 flags)
{
	struct gk20a *g = c->g;
	bool need_sync_fence = false;
	bool new_sync_created = false;
	int wait_fence_fd = -1;
	int err = 0;
	bool need_wfi = !(flags & NVGPU_SUBMIT_FLAGS_SUPPRESS_WFI);
	bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);

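	/*
	 * With aggressive sync destroy enabled, the sync object is created on
	 * demand and reference-counted per submit so that it can be torn down
	 * again once the channel goes idle.
	 */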
	if (g->aggressive_sync_destroy_thresh) {
		nvgpu_mutex_acquire(&c->sync_lock);
		if (!c->sync) {
			c->sync = gk20a_channel_sync_create(c, false);
			if (!c->sync) {
				err = -ENOMEM;
				nvgpu_mutex_release(&c->sync_lock);
				goto fail;
			}
			new_sync_created = true;
		}
		nvgpu_atomic_inc(&c->sync->refcount);
		nvgpu_mutex_release(&c->sync_lock);
	}

	if (g->ops.fifo.resetup_ramfc && new_sync_created) {
		err = g->ops.fifo.resetup_ramfc(c);
		if (err)
			goto fail;
	}

	/*
	 * Optionally insert syncpt/semaphore wait in the beginning of gpfifo
	 * submission when user requested and the wait hasn't expired.
	 */
	if (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) {
		int max_wait_cmds = c->deterministic ? 1 : 0;

		if (!pre_alloc_enabled)
			job->wait_cmd = nvgpu_kzalloc(g,
				sizeof(struct priv_cmd_entry));

		if (!job->wait_cmd) {
			err = -ENOMEM;
			goto fail;
		}

		if (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE) {
			wait_fence_fd = fence->id;
			err = c->sync->wait_fd(c->sync, wait_fence_fd,
					job->wait_cmd, max_wait_cmds);
		} else {
			err = c->sync->wait_syncpt(c->sync, fence->id,
					fence->value,
					job->wait_cmd);
		}

		if (err)
			goto clean_up_wait_cmd;

		if (job->wait_cmd->valid)
			*wait_cmd = job->wait_cmd;
	}

	if ((flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) &&
	    (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE))
		need_sync_fence = true;

	/*
	 * Always generate an increment at the end of a GPFIFO submission. This
	 * is used to keep track of method completion for idle railgating. The
	 * sync_pt/semaphore PB is added to the GPFIFO later on in submit.
	 */
	job->post_fence = gk20a_alloc_fence(c);
	if (!job->post_fence) {
		err = -ENOMEM;
		goto clean_up_wait_cmd;
	}
	if (!pre_alloc_enabled)
		job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry));

	if (!job->incr_cmd) {
		err = -ENOMEM;
		goto clean_up_post_fence;
	}

	if (flags & NVGPU_SUBMIT_FLAGS_FENCE_GET)
		err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd,
				job->post_fence, need_wfi, need_sync_fence,
				register_irq);
	else
		err = c->sync->incr(c->sync, job->incr_cmd,
				job->post_fence, need_sync_fence,
				register_irq);
	if (!err) {
		*incr_cmd = job->incr_cmd;
		*post_fence = job->post_fence;
	} else
		goto clean_up_incr_cmd;

	return 0;

clean_up_incr_cmd:
	free_priv_cmdbuf(c, job->incr_cmd);
	if (!pre_alloc_enabled)
		job->incr_cmd = NULL;
clean_up_post_fence:
	gk20a_fence_put(job->post_fence);
	job->post_fence = NULL;
clean_up_wait_cmd:
	if (job->wait_cmd)
		free_priv_cmdbuf(c, job->wait_cmd);
	if (!pre_alloc_enabled)
		job->wait_cmd = NULL;
fail:
	*wait_cmd = NULL;
	return err;
}

static void nvgpu_submit_append_priv_cmdbuf(struct channel_gk20a *c,
		struct priv_cmd_entry *cmd)
{
	struct gk20a *g = c->g;
	struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
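	/*
	 * A gpfifo entry carries the GPU virtual address of the pushbuffer
	 * segment split across entry0/entry1, plus its length in the entry1
	 * length field.
	 */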
	struct nvgpu_gpfifo_entry x = {
		.entry0 = u64_lo32(cmd->gva),
		.entry1 = u64_hi32(cmd->gva) |
			pbdma_gp_entry1_length_f(cmd->size)
	};

	nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x),
			&x, sizeof(x));

	if (cmd->mem->aperture == APERTURE_SYSMEM)
		trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0,
				(u32 *)cmd->mem->cpu_va + cmd->off);

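	/* entry_num is a power of two; the mask wraps put around the ring */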
	c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
}

static int nvgpu_submit_append_gpfifo_user_direct(struct channel_gk20a *c,
		struct nvgpu_gpfifo_userdata userdata,
		u32 num_entries)
{
	struct gk20a *g = c->g;
	struct nvgpu_gpfifo_entry *gpfifo_cpu = c->gpfifo.mem.cpu_va;
	u32 gpfifo_size = c->gpfifo.entry_num;
	u32 len = num_entries;
	u32 start = c->gpfifo.put;
	u32 end = start + len; /* exclusive */
	int err;

	if (end > gpfifo_size) {
		/* wrap-around */
		int length0 = gpfifo_size - start;
		int length1 = len - length0;

		err = g->os_channel.copy_user_gpfifo(
				gpfifo_cpu + start, userdata,
				0, length0);
		if (err)
			return err;

		err = g->os_channel.copy_user_gpfifo(
				gpfifo_cpu, userdata,
				length0, length1);
		if (err)
			return err;
	} else {
		err = g->os_channel.copy_user_gpfifo(
				gpfifo_cpu + start, userdata,
				0, len);
		if (err)
			return err;
	}

	return 0;
}

static void nvgpu_submit_append_gpfifo_common(struct channel_gk20a *c,
		struct nvgpu_gpfifo_entry *src, u32 num_entries)
{
	struct gk20a *g = c->g;
	struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
	/* in bytes */
	u32 gpfifo_size =
		c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo_entry);
	u32 len = num_entries * sizeof(struct nvgpu_gpfifo_entry);
	u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo_entry);
	u32 end = start + len; /* exclusive */

	if (end > gpfifo_size) {
		/* wrap-around */
		int length0 = gpfifo_size - start;
		int length1 = len - length0;
		struct nvgpu_gpfifo_entry *src2 = src + length0;

		nvgpu_mem_wr_n(g, gpfifo_mem, start, src, length0);
		nvgpu_mem_wr_n(g, gpfifo_mem, 0, src2, length1);
	} else {
		nvgpu_mem_wr_n(g, gpfifo_mem, start, src, len);
	}
}

/*
 * Copy source gpfifo entries into the gpfifo ring buffer, potentially
 * splitting into two memcpys to handle wrap-around.
 */
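/*
 * For example (illustrative): with a 128-entry ring, put == 126 and five
 * entries to append, the first copy fills slots 126..127 and the second
 * fills slots 0..2.
 */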
static int nvgpu_submit_append_gpfifo(struct channel_gk20a *c,
		struct nvgpu_gpfifo_entry *kern_gpfifo,
		struct nvgpu_gpfifo_userdata userdata,
		u32 num_entries)
{
	struct gk20a *g = c->g;
	int err;

	if (!kern_gpfifo && !c->gpfifo.pipe) {
		/*
		 * This path (from userspace to sysmem) is special in order to
		 * avoid two copies unnecessarily (from user to pipe, then from
		 * pipe to gpu sysmem buffer).
		 */
		err = nvgpu_submit_append_gpfifo_user_direct(c, userdata,
				num_entries);
		if (err)
			return err;
	} else if (!kern_gpfifo) {
		/* from userspace to vidmem, use the common path */
		err = g->os_channel.copy_user_gpfifo(c->gpfifo.pipe, userdata,
				0, num_entries);
		if (err)
			return err;

		nvgpu_submit_append_gpfifo_common(c, c->gpfifo.pipe,
				num_entries);
	} else {
		/* from kernel to either sysmem or vidmem, don't need
		 * copy_user_gpfifo so use the common path */
		nvgpu_submit_append_gpfifo_common(c, kern_gpfifo, num_entries);
	}

	trace_write_pushbuffers(c, num_entries);

	c->gpfifo.put = (c->gpfifo.put + num_entries) &
		(c->gpfifo.entry_num - 1);

	return 0;
}

static int nvgpu_submit_channel_gpfifo(struct channel_gk20a *c,
				struct nvgpu_gpfifo_entry *gpfifo,
				struct nvgpu_gpfifo_userdata userdata,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out,
				struct fifo_profile_gk20a *profile)
{
	struct gk20a *g = c->g;
	struct priv_cmd_entry *wait_cmd = NULL;
	struct priv_cmd_entry *incr_cmd = NULL;
	struct gk20a_fence *post_fence = NULL;
	struct channel_gk20a_job *job = NULL;
	/* we might need two extra gpfifo entries - one for pre fence
	 * and one for post fence. */
	const int extra_entries = 2;
	bool skip_buffer_refcounting = (flags &
			NVGPU_SUBMIT_FLAGS_SKIP_BUFFER_REFCOUNTING);
	int err = 0;
	bool need_job_tracking;
	bool need_deferred_cleanup = false;

	if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING))
		return -ENODEV;

	if (c->has_timedout)
		return -ETIMEDOUT;

	if (!nvgpu_mem_is_valid(&c->gpfifo.mem))
		return -ENOMEM;

	/* fifo not large enough for request. Return error immediately.
	 * Kernel can insert gpfifo entries before and after user gpfifos.
	 * So, add extra_entries in user request. Also, HW with fifo size N
	 * can accept only N-1 entries and so the below condition */
	if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) {
		nvgpu_err(g, "not enough gpfifo space allocated");
		return -ENOMEM;
	}

	if ((flags & (NVGPU_SUBMIT_FLAGS_FENCE_WAIT |
		      NVGPU_SUBMIT_FLAGS_FENCE_GET)) &&
	    !fence)
		return -EINVAL;

	/* an address space needs to have been bound at this point. */
	if (!gk20a_channel_as_bound(c)) {
		nvgpu_err(g,
			"not bound to an address space at time of gpfifo"
			" submission.");
		return -EINVAL;
	}

	gk20a_fifo_profile_snapshot(profile, PROFILE_ENTRY);

	/* update debug settings */
	nvgpu_ltc_sync_enabled(g);

	nvgpu_log_info(g, "channel %d", c->chid);

	/*
	 * Job tracking is necessary for any of the following conditions:
	 *  - pre- or post-fence functionality
	 *  - channel wdt
	 *  - GPU rail-gating with non-deterministic channels
	 *  - buffer refcounting
	 *
	 * If none of the conditions are met, then job tracking is not
	 * required and a fast submit can be done (ie. only need to write
	 * out userspace GPFIFO entries and update GP_PUT).
	 */
	need_job_tracking = (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) ||
			(flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) ||
			c->timeout.enabled ||
			(g->can_railgate && !c->deterministic) ||
			!skip_buffer_refcounting;

	if (need_job_tracking) {
		bool need_sync_framework = false;

		/*
		 * If the channel is to have deterministic latency and
		 * job tracking is required, the channel must have
		 * pre-allocated resources. Otherwise, we fail the submit here
		 */
		if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c))
			return -EINVAL;

		need_sync_framework =
			gk20a_channel_sync_needs_sync_framework(g) ||
			(flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE &&
			 flags & NVGPU_SUBMIT_FLAGS_FENCE_GET);

		/*
		 * Deferred clean-up is necessary for any of the following
		 * conditions:
		 * - channel's deterministic flag is not set
		 * - dependency on sync framework, which could make the
		 *   behavior of the clean-up operation non-deterministic
		 *   (should not be performed in the submit path)
		 * - channel wdt
		 * - GPU rail-gating with non-deterministic channels
		 * - buffer refcounting
		 *
		 * If none of the conditions are met, then deferred clean-up
		 * is not required, and we clean-up one job-tracking
		 * resource in the submit path.
		 */
		need_deferred_cleanup = !c->deterministic ||
					need_sync_framework ||
					c->timeout.enabled ||
					(g->can_railgate &&
					 !c->deterministic) ||
					!skip_buffer_refcounting;

		/*
		 * For deterministic channels, we don't allow deferred clean_up
		 * processing to occur. In cases we hit this, we fail the submit
		 */
		if (c->deterministic && need_deferred_cleanup)
			return -EINVAL;

		if (!c->deterministic) {
			/*
			 * Get a power ref unless this is a deterministic
			 * channel that holds them during the channel lifetime.
			 * This one is released by gk20a_channel_clean_up_jobs,
			 * via syncpt or sema interrupt, whichever is used.
			 */
			err = gk20a_busy(g);
			if (err) {
				nvgpu_err(g,
					"failed to host gk20a to submit gpfifo");
				nvgpu_print_current(g, NULL, NVGPU_ERROR);
				return err;
			}
		}

		if (!need_deferred_cleanup) {
			/* clean up a single job */
			gk20a_channel_clean_up_jobs(c, false);
		}
	}


	/* Grab access to HW to deal with do_idle */
	if (c->deterministic)
		nvgpu_rwsem_down_read(&g->deterministic_busy);

	if (c->deterministic && c->deterministic_railgate_allowed) {
		/*
		 * Nope - this channel has dropped its own power ref. As
		 * deterministic submits don't hold power on per each submitted
		 * job like normal ones do, the GPU might railgate any time now
		 * and thus submit is disallowed.
		 */
		err = -EINVAL;
		goto clean_up;
	}

	trace_gk20a_channel_submit_gpfifo(g->name,
					  c->chid,
					  num_entries,
					  flags,
					  fence ? fence->id : 0,
					  fence ? fence->value : 0);

	nvgpu_log_info(g, "pre-submit put %d, get %d, size %d",
		c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);

	/*
	 * Make sure we have enough space for gpfifo entries. Check cached
	 * values first and then read from HW. If no space, return EAGAIN
	 * and let userspace decide to re-try request or not.
	 */
	if (nvgpu_gp_free_count(c) < num_entries + extra_entries) {
		if (nvgpu_get_gp_free_count(c) < num_entries + extra_entries) {
			err = -EAGAIN;
			goto clean_up;
		}
	}

	if (c->has_timedout) {
		err = -ETIMEDOUT;
		goto clean_up;
	}

	if (need_job_tracking) {
		err = channel_gk20a_alloc_job(c, &job);
		if (err)
			goto clean_up;

		err = nvgpu_submit_prepare_syncs(c, fence, job,
						 &wait_cmd, &incr_cmd,
						 &post_fence,
						 need_deferred_cleanup,
						 flags);
		if (err)
			goto clean_up_job;
	}

	gk20a_fifo_profile_snapshot(profile, PROFILE_JOB_TRACKING);

	if (wait_cmd)
		nvgpu_submit_append_priv_cmdbuf(c, wait_cmd);

	err = nvgpu_submit_append_gpfifo(c, gpfifo, userdata,
			num_entries);
	if (err)
		goto clean_up_job;

	/*
	 * And here's where we add the incr_cmd we generated earlier. It should
	 * always run!
	 */
	if (incr_cmd)
		nvgpu_submit_append_priv_cmdbuf(c, incr_cmd);

	if (fence_out)
		*fence_out = gk20a_fence_get(post_fence);

	if (need_job_tracking)
		/* TODO! Check for errors... */
		gk20a_channel_add_job(c, job, skip_buffer_refcounting);
	gk20a_fifo_profile_snapshot(profile, PROFILE_APPEND);

	g->ops.fifo.userd_gp_put(g, c);

	/* No hw access beyond this point */
	if (c->deterministic)
		nvgpu_rwsem_up_read(&g->deterministic_busy);

	trace_gk20a_channel_submitted_gpfifo(g->name,
				c->chid,
				num_entries,
				flags,
				post_fence ? post_fence->syncpt_id : 0,
				post_fence ? post_fence->syncpt_value : 0);

	nvgpu_log_info(g, "post-submit put %d, get %d, size %d",
		c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);

	gk20a_fifo_profile_snapshot(profile, PROFILE_END);

	nvgpu_log_fn(g, "done");
	return err;

clean_up_job:
	channel_gk20a_free_job(c, job);
clean_up:
	nvgpu_log_fn(g, "fail");
	gk20a_fence_put(post_fence);
	if (c->deterministic)
		nvgpu_rwsem_up_read(&g->deterministic_busy);
	else if (need_deferred_cleanup)
		gk20a_idle(g);

	return err;
}

int nvgpu_submit_channel_gpfifo_user(struct channel_gk20a *c,
				struct nvgpu_gpfifo_userdata userdata,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out,
				struct fifo_profile_gk20a *profile)
{
	return nvgpu_submit_channel_gpfifo(c, NULL, userdata, num_entries,
			flags, fence, fence_out, profile);
}

int nvgpu_submit_channel_gpfifo_kernel(struct channel_gk20a *c,
				struct nvgpu_gpfifo_entry *gpfifo,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out)
{
	struct nvgpu_gpfifo_userdata userdata = { NULL, NULL };

	return nvgpu_submit_channel_gpfifo(c, gpfifo, userdata, num_entries,
			flags, fence, fence_out, NULL);
}
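As a rough usage sketch of the new kernel-internal entry point (hypothetical
caller; channel setup, gpfifo entry encoding and error handling are omitted,
and a non-NULL fence argument is required with NVGPU_SUBMIT_FLAGS_FENCE_GET
per the check above):

	struct nvgpu_gpfifo_entry entry = {0}; /* encode pushbuf GPU VA and length here */
	struct nvgpu_channel_fence fence = {0};
	struct gk20a_fence *post_fence = NULL;
	int err;

	err = nvgpu_submit_channel_gpfifo_kernel(c, &entry, 1,
			NVGPU_SUBMIT_FLAGS_FENCE_GET, &fence, &post_fence);
	if (err == 0 && post_fence != NULL) {
		/* wait on or export the post-fence, then drop the reference */
		gk20a_fence_put(post_fence);
	}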