From 7998233b77a343d002b699d5f348bbeb243e16f5 Mon Sep 17 00:00:00 2001 From: Konsta Holtta Date: Mon, 25 Jun 2018 12:35:42 +0300 Subject: gpu: nvgpu: move submit code to common To finish OS unification of the submit path, move the gk20a_submit_channel_gpfifo* functions to a file that's accessible also outside Linux code. Also change the prefix of the submit functions from gk20a_ to nvgpu_. Jira NVGPU-705 Change-Id: I8ca355d1eb69771fb016c7a21fc7f102ca7967d7 Signed-off-by: Konsta Holtta Reviewed-on: https://git-master.nvidia.com/r/1760421 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/Makefile | 1 + drivers/gpu/nvgpu/Makefile.sources | 1 + drivers/gpu/nvgpu/common/fifo/submit.c | 577 +++++++++++++++++++++++++++++ drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 2 +- drivers/gpu/nvgpu/gk20a/gk20a.h | 1 + drivers/gpu/nvgpu/include/nvgpu/channel.h | 52 +++ drivers/gpu/nvgpu/os/linux/cde.c | 3 +- drivers/gpu/nvgpu/os/linux/ce2.c | 3 +- drivers/gpu/nvgpu/os/linux/channel.c | 551 +-------------------------- drivers/gpu/nvgpu/os/linux/channel.h | 15 - drivers/gpu/nvgpu/os/linux/ioctl_channel.c | 5 +- 11 files changed, 642 insertions(+), 569 deletions(-) create mode 100644 drivers/gpu/nvgpu/common/fifo/submit.c create mode 100644 drivers/gpu/nvgpu/include/nvgpu/channel.h (limited to 'drivers/gpu/nvgpu') diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index faf17a91..61636ff5 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -177,6 +177,7 @@ nvgpu-y += \ common/clock_gating/gv11b_gating_reglist.o \ common/sim.o \ common/sim_pci.o \ + common/fifo/submit.o \ gk20a/gk20a.o \ gk20a/ce2_gk20a.o \ gk20a/fifo_gk20a.o \ diff --git a/drivers/gpu/nvgpu/Makefile.sources b/drivers/gpu/nvgpu/Makefile.sources index cad9c1e3..942fddea 100644 --- a/drivers/gpu/nvgpu/Makefile.sources +++ b/drivers/gpu/nvgpu/Makefile.sources @@ -80,6 +80,7 @@ srcs := common/mm/nvgpu_allocator.c \ common/clock_gating/gv11b_gating_reglist.c \ common/clock_gating/gp106_gating_reglist.c \ common/clock_gating/gv100_gating_reglist.c \ + common/fifo/submit.c \ boardobj/boardobj.c \ boardobj/boardobjgrp.c \ boardobj/boardobjgrpmask.c \ diff --git a/drivers/gpu/nvgpu/common/fifo/submit.c b/drivers/gpu/nvgpu/common/fifo/submit.c new file mode 100644 index 00000000..daeee608 --- /dev/null +++ b/drivers/gpu/nvgpu/common/fifo/submit.c @@ -0,0 +1,577 @@ +/* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +#include + +#include "gk20a/gk20a.h" +#include "gk20a/channel_gk20a.h" + +#include + +/* + * Handle the submit synchronization - pre-fences and post-fences. + */ +static int nvgpu_submit_prepare_syncs(struct channel_gk20a *c, + struct nvgpu_channel_fence *fence, + struct channel_gk20a_job *job, + struct priv_cmd_entry **wait_cmd, + struct priv_cmd_entry **incr_cmd, + struct gk20a_fence **post_fence, + bool register_irq, + u32 flags) +{ + struct gk20a *g = c->g; + bool need_sync_fence = false; + bool new_sync_created = false; + int wait_fence_fd = -1; + int err = 0; + bool need_wfi = !(flags & NVGPU_SUBMIT_FLAGS_SUPPRESS_WFI); + bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c); + + if (g->aggressive_sync_destroy_thresh) { + nvgpu_mutex_acquire(&c->sync_lock); + if (!c->sync) { + c->sync = gk20a_channel_sync_create(c, false); + if (!c->sync) { + err = -ENOMEM; + nvgpu_mutex_release(&c->sync_lock); + goto fail; + } + new_sync_created = true; + } + nvgpu_atomic_inc(&c->sync->refcount); + nvgpu_mutex_release(&c->sync_lock); + } + + if (g->ops.fifo.resetup_ramfc && new_sync_created) { + err = g->ops.fifo.resetup_ramfc(c); + if (err) + goto fail; + } + + /* + * Optionally insert syncpt/semaphore wait in the beginning of gpfifo + * submission when user requested and the wait hasn't expired. + */ + if (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) { + int max_wait_cmds = c->deterministic ? 1 : 0; + + if (!pre_alloc_enabled) + job->wait_cmd = nvgpu_kzalloc(g, + sizeof(struct priv_cmd_entry)); + + if (!job->wait_cmd) { + err = -ENOMEM; + goto fail; + } + + if (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE) { + wait_fence_fd = fence->id; + err = c->sync->wait_fd(c->sync, wait_fence_fd, + job->wait_cmd, max_wait_cmds); + } else { + err = c->sync->wait_syncpt(c->sync, fence->id, + fence->value, + job->wait_cmd); + } + + if (err) + goto clean_up_wait_cmd; + + if (job->wait_cmd->valid) + *wait_cmd = job->wait_cmd; + } + + if ((flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) && + (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE)) + need_sync_fence = true; + + /* + * Always generate an increment at the end of a GPFIFO submission. This + * is used to keep track of method completion for idle railgating. The + * sync_pt/semaphore PB is added to the GPFIFO later on in submit. 
+ */ + job->post_fence = gk20a_alloc_fence(c); + if (!job->post_fence) { + err = -ENOMEM; + goto clean_up_wait_cmd; + } + if (!pre_alloc_enabled) + job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry)); + + if (!job->incr_cmd) { + err = -ENOMEM; + goto clean_up_post_fence; + } + + if (flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) + err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd, + job->post_fence, need_wfi, need_sync_fence, + register_irq); + else + err = c->sync->incr(c->sync, job->incr_cmd, + job->post_fence, need_sync_fence, + register_irq); + if (!err) { + *incr_cmd = job->incr_cmd; + *post_fence = job->post_fence; + } else + goto clean_up_incr_cmd; + + return 0; + +clean_up_incr_cmd: + free_priv_cmdbuf(c, job->incr_cmd); + if (!pre_alloc_enabled) + job->incr_cmd = NULL; +clean_up_post_fence: + gk20a_fence_put(job->post_fence); + job->post_fence = NULL; +clean_up_wait_cmd: + if (job->wait_cmd) + free_priv_cmdbuf(c, job->wait_cmd); + if (!pre_alloc_enabled) + job->wait_cmd = NULL; +fail: + *wait_cmd = NULL; + return err; +} + +static void nvgpu_submit_append_priv_cmdbuf(struct channel_gk20a *c, + struct priv_cmd_entry *cmd) +{ + struct gk20a *g = c->g; + struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem; + struct nvgpu_gpfifo_entry x = { + .entry0 = u64_lo32(cmd->gva), + .entry1 = u64_hi32(cmd->gva) | + pbdma_gp_entry1_length_f(cmd->size) + }; + + nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x), + &x, sizeof(x)); + + if (cmd->mem->aperture == APERTURE_SYSMEM) + trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0, + (u32 *)cmd->mem->cpu_va + cmd->off); + + c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1); +} + +static int nvgpu_submit_append_gpfifo_user_direct(struct channel_gk20a *c, + struct nvgpu_gpfifo_userdata userdata, + u32 num_entries) +{ + struct gk20a *g = c->g; + struct nvgpu_gpfifo_entry *gpfifo_cpu = c->gpfifo.mem.cpu_va; + u32 gpfifo_size = c->gpfifo.entry_num; + u32 len = num_entries; + u32 start = c->gpfifo.put; + u32 end = start + len; /* exclusive */ + int err; + + if (end > gpfifo_size) { + /* wrap-around */ + int length0 = gpfifo_size - start; + int length1 = len - length0; + + err = g->os_channel.copy_user_gpfifo( + gpfifo_cpu + start, userdata, + 0, length0); + if (err) + return err; + + err = g->os_channel.copy_user_gpfifo( + gpfifo_cpu, userdata, + length0, length1); + if (err) + return err; + } else { + err = g->os_channel.copy_user_gpfifo( + gpfifo_cpu + start, userdata, + 0, len); + if (err) + return err; + } + + return 0; +} + +static void nvgpu_submit_append_gpfifo_common(struct channel_gk20a *c, + struct nvgpu_gpfifo_entry *src, u32 num_entries) +{ + struct gk20a *g = c->g; + struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem; + /* in bytes */ + u32 gpfifo_size = + c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo_entry); + u32 len = num_entries * sizeof(struct nvgpu_gpfifo_entry); + u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo_entry); + u32 end = start + len; /* exclusive */ + + if (end > gpfifo_size) { + /* wrap-around */ + int length0 = gpfifo_size - start; + int length1 = len - length0; + struct nvgpu_gpfifo_entry *src2 = src + length0; + + nvgpu_mem_wr_n(g, gpfifo_mem, start, src, length0); + nvgpu_mem_wr_n(g, gpfifo_mem, 0, src2, length1); + } else { + nvgpu_mem_wr_n(g, gpfifo_mem, start, src, len); + } +} + +/* + * Copy source gpfifo entries into the gpfifo ring buffer, potentially + * splitting into two memcpys to handle wrap-around. 
+ */ +static int nvgpu_submit_append_gpfifo(struct channel_gk20a *c, + struct nvgpu_gpfifo_entry *kern_gpfifo, + struct nvgpu_gpfifo_userdata userdata, + u32 num_entries) +{ + struct gk20a *g = c->g; + int err; + + if (!kern_gpfifo && !c->gpfifo.pipe) { + /* + * This path (from userspace to sysmem) is special in order to + * avoid two copies unnecessarily (from user to pipe, then from + * pipe to gpu sysmem buffer). + */ + err = nvgpu_submit_append_gpfifo_user_direct(c, userdata, + num_entries); + if (err) + return err; + } else if (!kern_gpfifo) { + /* from userspace to vidmem, use the common path */ + err = g->os_channel.copy_user_gpfifo(c->gpfifo.pipe, userdata, + 0, num_entries); + if (err) + return err; + + nvgpu_submit_append_gpfifo_common(c, c->gpfifo.pipe, + num_entries); + } else { + /* from kernel to either sysmem or vidmem, don't need + * copy_user_gpfifo so use the common path */ + nvgpu_submit_append_gpfifo_common(c, kern_gpfifo, num_entries); + } + + trace_write_pushbuffers(c, num_entries); + + c->gpfifo.put = (c->gpfifo.put + num_entries) & + (c->gpfifo.entry_num - 1); + + return 0; +} + +static int nvgpu_submit_channel_gpfifo(struct channel_gk20a *c, + struct nvgpu_gpfifo_entry *gpfifo, + struct nvgpu_gpfifo_userdata userdata, + u32 num_entries, + u32 flags, + struct nvgpu_channel_fence *fence, + struct gk20a_fence **fence_out, + struct fifo_profile_gk20a *profile) +{ + struct gk20a *g = c->g; + struct priv_cmd_entry *wait_cmd = NULL; + struct priv_cmd_entry *incr_cmd = NULL; + struct gk20a_fence *post_fence = NULL; + struct channel_gk20a_job *job = NULL; + /* we might need two extra gpfifo entries - one for pre fence + * and one for post fence. */ + const int extra_entries = 2; + bool skip_buffer_refcounting = (flags & + NVGPU_SUBMIT_FLAGS_SKIP_BUFFER_REFCOUNTING); + int err = 0; + bool need_job_tracking; + bool need_deferred_cleanup = false; + + if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) + return -ENODEV; + + if (c->has_timedout) + return -ETIMEDOUT; + + if (!nvgpu_mem_is_valid(&c->gpfifo.mem)) + return -ENOMEM; + + /* fifo not large enough for request. Return error immediately. + * Kernel can insert gpfifo entries before and after user gpfifos. + * So, add extra_entries in user request. Also, HW with fifo size N + * can accept only N-1 entreis and so the below condition */ + if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) { + nvgpu_err(g, "not enough gpfifo space allocated"); + return -ENOMEM; + } + + if ((flags & (NVGPU_SUBMIT_FLAGS_FENCE_WAIT | + NVGPU_SUBMIT_FLAGS_FENCE_GET)) && + !fence) + return -EINVAL; + + /* an address space needs to have been bound at this point. */ + if (!gk20a_channel_as_bound(c)) { + nvgpu_err(g, + "not bound to an address space at time of gpfifo" + " submission."); + return -EINVAL; + } + + gk20a_fifo_profile_snapshot(profile, PROFILE_ENTRY); + + /* update debug settings */ + nvgpu_ltc_sync_enabled(g); + + nvgpu_log_info(g, "channel %d", c->chid); + + /* + * Job tracking is necessary for any of the following conditions: + * - pre- or post-fence functionality + * - channel wdt + * - GPU rail-gating with non-deterministic channels + * - buffer refcounting + * + * If none of the conditions are met, then job tracking is not + * required and a fast submit can be done (ie. only need to write + * out userspace GPFIFO entries and update GP_PUT). 
+ */ + need_job_tracking = (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) || + (flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) || + c->timeout.enabled || + (g->can_railgate && !c->deterministic) || + !skip_buffer_refcounting; + + if (need_job_tracking) { + bool need_sync_framework = false; + + /* + * If the channel is to have deterministic latency and + * job tracking is required, the channel must have + * pre-allocated resources. Otherwise, we fail the submit here + */ + if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c)) + return -EINVAL; + + need_sync_framework = + gk20a_channel_sync_needs_sync_framework(g) || + (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE && + flags & NVGPU_SUBMIT_FLAGS_FENCE_GET); + + /* + * Deferred clean-up is necessary for any of the following + * conditions: + * - channel's deterministic flag is not set + * - dependency on sync framework, which could make the + * behavior of the clean-up operation non-deterministic + * (should not be performed in the submit path) + * - channel wdt + * - GPU rail-gating with non-deterministic channels + * - buffer refcounting + * + * If none of the conditions are met, then deferred clean-up + * is not required, and we clean-up one job-tracking + * resource in the submit path. + */ + need_deferred_cleanup = !c->deterministic || + need_sync_framework || + c->timeout.enabled || + (g->can_railgate && + !c->deterministic) || + !skip_buffer_refcounting; + + /* + * For deterministic channels, we don't allow deferred clean_up + * processing to occur. In cases we hit this, we fail the submit + */ + if (c->deterministic && need_deferred_cleanup) + return -EINVAL; + + if (!c->deterministic) { + /* + * Get a power ref unless this is a deterministic + * channel that holds them during the channel lifetime. + * This one is released by gk20a_channel_clean_up_jobs, + * via syncpt or sema interrupt, whichever is used. + */ + err = gk20a_busy(g); + if (err) { + nvgpu_err(g, + "failed to host gk20a to submit gpfifo"); + nvgpu_print_current(g, NULL, NVGPU_ERROR); + return err; + } + } + + if (!need_deferred_cleanup) { + /* clean up a single job */ + gk20a_channel_clean_up_jobs(c, false); + } + } + + + /* Grab access to HW to deal with do_idle */ + if (c->deterministic) + nvgpu_rwsem_down_read(&g->deterministic_busy); + + if (c->deterministic && c->deterministic_railgate_allowed) { + /* + * Nope - this channel has dropped its own power ref. As + * deterministic submits don't hold power on per each submitted + * job like normal ones do, the GPU might railgate any time now + * and thus submit is disallowed. + */ + err = -EINVAL; + goto clean_up; + } + + trace_gk20a_channel_submit_gpfifo(g->name, + c->chid, + num_entries, + flags, + fence ? fence->id : 0, + fence ? fence->value : 0); + + nvgpu_log_info(g, "pre-submit put %d, get %d, size %d", + c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num); + + /* + * Make sure we have enough space for gpfifo entries. Check cached + * values first and then read from HW. If no space, return EAGAIN + * and let userpace decide to re-try request or not. 
+ */ + if (nvgpu_gp_free_count(c) < num_entries + extra_entries) { + if (nvgpu_get_gp_free_count(c) < num_entries + extra_entries) { + err = -EAGAIN; + goto clean_up; + } + } + + if (c->has_timedout) { + err = -ETIMEDOUT; + goto clean_up; + } + + if (need_job_tracking) { + err = channel_gk20a_alloc_job(c, &job); + if (err) + goto clean_up; + + err = nvgpu_submit_prepare_syncs(c, fence, job, + &wait_cmd, &incr_cmd, + &post_fence, + need_deferred_cleanup, + flags); + if (err) + goto clean_up_job; + } + + gk20a_fifo_profile_snapshot(profile, PROFILE_JOB_TRACKING); + + if (wait_cmd) + nvgpu_submit_append_priv_cmdbuf(c, wait_cmd); + + err = nvgpu_submit_append_gpfifo(c, gpfifo, userdata, + num_entries); + if (err) + goto clean_up_job; + + /* + * And here's where we add the incr_cmd we generated earlier. It should + * always run! + */ + if (incr_cmd) + nvgpu_submit_append_priv_cmdbuf(c, incr_cmd); + + if (fence_out) + *fence_out = gk20a_fence_get(post_fence); + + if (need_job_tracking) + /* TODO! Check for errors... */ + gk20a_channel_add_job(c, job, skip_buffer_refcounting); + gk20a_fifo_profile_snapshot(profile, PROFILE_APPEND); + + g->ops.fifo.userd_gp_put(g, c); + + /* No hw access beyond this point */ + if (c->deterministic) + nvgpu_rwsem_up_read(&g->deterministic_busy); + + trace_gk20a_channel_submitted_gpfifo(g->name, + c->chid, + num_entries, + flags, + post_fence ? post_fence->syncpt_id : 0, + post_fence ? post_fence->syncpt_value : 0); + + nvgpu_log_info(g, "post-submit put %d, get %d, size %d", + c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num); + + gk20a_fifo_profile_snapshot(profile, PROFILE_END); + + nvgpu_log_fn(g, "done"); + return err; + +clean_up_job: + channel_gk20a_free_job(c, job); +clean_up: + nvgpu_log_fn(g, "fail"); + gk20a_fence_put(post_fence); + if (c->deterministic) + nvgpu_rwsem_up_read(&g->deterministic_busy); + else if (need_deferred_cleanup) + gk20a_idle(g); + + return err; +} + +int nvgpu_submit_channel_gpfifo_user(struct channel_gk20a *c, + struct nvgpu_gpfifo_userdata userdata, + u32 num_entries, + u32 flags, + struct nvgpu_channel_fence *fence, + struct gk20a_fence **fence_out, + struct fifo_profile_gk20a *profile) +{ + return nvgpu_submit_channel_gpfifo(c, NULL, userdata, num_entries, + flags, fence, fence_out, profile); +} + +int nvgpu_submit_channel_gpfifo_kernel(struct channel_gk20a *c, + struct nvgpu_gpfifo_entry *gpfifo, + u32 num_entries, + u32 flags, + struct nvgpu_channel_fence *fence, + struct gk20a_fence **fence_out) +{ + struct nvgpu_gpfifo_userdata userdata = { NULL, NULL }; + + return nvgpu_submit_channel_gpfifo(c, gpfifo, userdata, num_entries, + flags, fence, fence_out, NULL); +} diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index aa37db62..78325019 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -47,7 +47,7 @@ struct fifo_profile_gk20a; #define NVGPU_GPFIFO_FLAGS_REPLAYABLE_FAULTS_ENABLE (1 << 2) #define NVGPU_GPFIFO_FLAGS_USERMODE_SUPPORT (1 << 3) -/* Flags to be passed to gk20a_submit_channel_gpfifo() */ +/* Flags to be passed to nvgpu_submit_channel_gpfifo() */ #define NVGPU_SUBMIT_FLAGS_FENCE_WAIT (1 << 0) #define NVGPU_SUBMIT_FLAGS_FENCE_GET (1 << 1) #define NVGPU_SUBMIT_FLAGS_HW_FORMAT (1 << 2) diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 9061236e..3c25f8fb 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -150,6 +150,7 @@ enum nvgpu_unit; enum nvgpu_flush_op; 
struct _resmgr_context; +struct nvgpu_gpfifo_entry; struct nvgpu_gpfifo_userdata { struct nvgpu_gpfifo_entry __user *entries; diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h new file mode 100644 index 00000000..604083d4 --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include + +#include "gk20a/gk20a.h" + +struct nvgpu_channel_fence; +struct gk20a_fence; +struct fifo_profile_gk20a; + +int nvgpu_submit_channel_gpfifo_user(struct channel_gk20a *c, + struct nvgpu_gpfifo_userdata userdata, + u32 num_entries, + u32 flags, + struct nvgpu_channel_fence *fence, + struct gk20a_fence **fence_out, + struct fifo_profile_gk20a *profile); + +int nvgpu_submit_channel_gpfifo_kernel(struct channel_gk20a *c, + struct nvgpu_gpfifo_entry *gpfifo, + u32 num_entries, + u32 flags, + struct nvgpu_channel_fence *fence, + struct gk20a_fence **fence_out); + +#ifdef CONFIG_DEBUG_FS +void trace_write_pushbuffers(struct channel_gk20a *c, int count); +#else +static inline void trace_write_pushbuffers(struct channel_gk20a *c, int count) +{ +} +#endif diff --git a/drivers/gpu/nvgpu/os/linux/cde.c b/drivers/gpu/nvgpu/os/linux/cde.c index 052a1d21..39b7d1f5 100644 --- a/drivers/gpu/nvgpu/os/linux/cde.c +++ b/drivers/gpu/nvgpu/os/linux/cde.c @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -783,7 +784,7 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx, return -ENOSYS; } - return gk20a_submit_channel_gpfifo_kernel(cde_ctx->ch, gpfifo, + return nvgpu_submit_channel_gpfifo_kernel(cde_ctx->ch, gpfifo, num_entries, flags, fence, fence_out); } diff --git a/drivers/gpu/nvgpu/os/linux/ce2.c b/drivers/gpu/nvgpu/os/linux/ce2.c index 8f20091b..0b43c0d1 100644 --- a/drivers/gpu/nvgpu/os/linux/ce2.c +++ b/drivers/gpu/nvgpu/os/linux/ce2.c @@ -15,6 +15,7 @@ */ #include +#include #include @@ -130,7 +131,7 @@ int gk20a_ce_execute_ops(struct gk20a *g, nvgpu_smp_wmb(); - ret = gk20a_submit_channel_gpfifo_kernel(ce_ctx->ch, &gpfifo, + ret = nvgpu_submit_channel_gpfifo_kernel(ce_ctx->ch, &gpfifo, 1, submit_flags, &fence, &ce_cmd_buf_fence_out); if (!ret) { diff --git a/drivers/gpu/nvgpu/os/linux/channel.c b/drivers/gpu/nvgpu/os/linux/channel.c index 391950af..fef44f2b 100644 --- a/drivers/gpu/nvgpu/os/linux/channel.c +++ b/drivers/gpu/nvgpu/os/linux/channel.c @@ -16,7 +16,6 @@ 
#include #include -#include #include #include @@ -489,11 +488,9 @@ static void trace_write_pushbuffer(struct channel_gk20a *c, dma_buf_vunmap(dmabuf, mem); } } -#endif -static void trace_write_pushbuffers(struct channel_gk20a *c, u32 count) +void trace_write_pushbuffers(struct channel_gk20a *c, u32 count) { -#ifdef CONFIG_DEBUG_FS struct nvgpu_gpfifo_entry *gp = c->gpfifo.mem.cpu_va; u32 n = c->gpfifo.entry_num; u32 start = c->gpfifo.put; @@ -507,549 +504,5 @@ static void trace_write_pushbuffers(struct channel_gk20a *c, u32 count) for (i = 0; i < count; i++) trace_write_pushbuffer(c, &gp[(start + i) % n]); -#endif -} - -/* - * Handle the submit synchronization - pre-fences and post-fences. - */ -static int gk20a_submit_prepare_syncs(struct channel_gk20a *c, - struct nvgpu_channel_fence *fence, - struct channel_gk20a_job *job, - struct priv_cmd_entry **wait_cmd, - struct priv_cmd_entry **incr_cmd, - struct gk20a_fence **post_fence, - bool register_irq, - u32 flags) -{ - struct gk20a *g = c->g; - bool need_sync_fence = false; - bool new_sync_created = false; - int wait_fence_fd = -1; - int err = 0; - bool need_wfi = !(flags & NVGPU_SUBMIT_FLAGS_SUPPRESS_WFI); - bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c); - - if (g->aggressive_sync_destroy_thresh) { - nvgpu_mutex_acquire(&c->sync_lock); - if (!c->sync) { - c->sync = gk20a_channel_sync_create(c, false); - if (!c->sync) { - err = -ENOMEM; - nvgpu_mutex_release(&c->sync_lock); - goto fail; - } - new_sync_created = true; - } - nvgpu_atomic_inc(&c->sync->refcount); - nvgpu_mutex_release(&c->sync_lock); - } - - if (g->ops.fifo.resetup_ramfc && new_sync_created) { - err = g->ops.fifo.resetup_ramfc(c); - if (err) - goto fail; - } - - /* - * Optionally insert syncpt/semaphore wait in the beginning of gpfifo - * submission when user requested and the wait hasn't expired. - */ - if (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) { - int max_wait_cmds = c->deterministic ? 1 : 0; - - if (!pre_alloc_enabled) - job->wait_cmd = nvgpu_kzalloc(g, - sizeof(struct priv_cmd_entry)); - - if (!job->wait_cmd) { - err = -ENOMEM; - goto fail; - } - - if (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE) { - wait_fence_fd = fence->id; - err = c->sync->wait_fd(c->sync, wait_fence_fd, - job->wait_cmd, max_wait_cmds); - } else { - err = c->sync->wait_syncpt(c->sync, fence->id, - fence->value, - job->wait_cmd); - } - - if (err) - goto clean_up_wait_cmd; - - if (job->wait_cmd->valid) - *wait_cmd = job->wait_cmd; - } - - if ((flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) && - (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE)) - need_sync_fence = true; - - /* - * Always generate an increment at the end of a GPFIFO submission. This - * is used to keep track of method completion for idle railgating. The - * sync_pt/semaphore PB is added to the GPFIFO later on in submit. 
- */ - job->post_fence = gk20a_alloc_fence(c); - if (!job->post_fence) { - err = -ENOMEM; - goto clean_up_wait_cmd; - } - if (!pre_alloc_enabled) - job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry)); - - if (!job->incr_cmd) { - err = -ENOMEM; - goto clean_up_post_fence; - } - - if (flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) - err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd, - job->post_fence, need_wfi, need_sync_fence, - register_irq); - else - err = c->sync->incr(c->sync, job->incr_cmd, - job->post_fence, need_sync_fence, - register_irq); - if (!err) { - *incr_cmd = job->incr_cmd; - *post_fence = job->post_fence; - } else - goto clean_up_incr_cmd; - - return 0; - -clean_up_incr_cmd: - free_priv_cmdbuf(c, job->incr_cmd); - if (!pre_alloc_enabled) - job->incr_cmd = NULL; -clean_up_post_fence: - gk20a_fence_put(job->post_fence); - job->post_fence = NULL; -clean_up_wait_cmd: - if (job->wait_cmd) - free_priv_cmdbuf(c, job->wait_cmd); - if (!pre_alloc_enabled) - job->wait_cmd = NULL; -fail: - *wait_cmd = NULL; - return err; -} - -static void gk20a_submit_append_priv_cmdbuf(struct channel_gk20a *c, - struct priv_cmd_entry *cmd) -{ - struct gk20a *g = c->g; - struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem; - struct nvgpu_gpfifo_entry x = { - .entry0 = u64_lo32(cmd->gva), - .entry1 = u64_hi32(cmd->gva) | - pbdma_gp_entry1_length_f(cmd->size) - }; - - nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x), - &x, sizeof(x)); - - if (cmd->mem->aperture == APERTURE_SYSMEM) - trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0, - (u32 *)cmd->mem->cpu_va + cmd->off); - - c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1); -} - -static int nvgpu_submit_append_gpfifo_user_direct(struct channel_gk20a *c, - struct nvgpu_gpfifo_userdata userdata, - u32 num_entries) -{ - struct gk20a *g = c->g; - struct nvgpu_gpfifo_entry *gpfifo_cpu = c->gpfifo.mem.cpu_va; - u32 gpfifo_size = c->gpfifo.entry_num; - u32 len = num_entries; - u32 start = c->gpfifo.put; - u32 end = start + len; /* exclusive */ - int err; - - if (end > gpfifo_size) { - /* wrap-around */ - int length0 = gpfifo_size - start; - int length1 = len - length0; - - err = g->os_channel.copy_user_gpfifo( - gpfifo_cpu + start, userdata, - 0, length0); - if (err) - return err; - - err = g->os_channel.copy_user_gpfifo( - gpfifo_cpu, userdata, - length0, length1); - if (err) - return err; - } else { - err = g->os_channel.copy_user_gpfifo( - gpfifo_cpu + start, userdata, - 0, len); - if (err) - return err; - } - - return 0; -} - -static void nvgpu_submit_append_gpfifo_common(struct channel_gk20a *c, - struct nvgpu_gpfifo_entry *src, u32 num_entries) -{ - struct gk20a *g = c->g; - struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem; - /* in bytes */ - u32 gpfifo_size = - c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo_entry); - u32 len = num_entries * sizeof(struct nvgpu_gpfifo_entry); - u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo_entry); - u32 end = start + len; /* exclusive */ - - if (end > gpfifo_size) { - /* wrap-around */ - int length0 = gpfifo_size - start; - int length1 = len - length0; - struct nvgpu_gpfifo_entry *src2 = src + length0; - - nvgpu_mem_wr_n(g, gpfifo_mem, start, src, length0); - nvgpu_mem_wr_n(g, gpfifo_mem, 0, src2, length1); - } else { - nvgpu_mem_wr_n(g, gpfifo_mem, start, src, len); - } -} - -/* - * Copy source gpfifo entries into the gpfifo ring buffer, potentially - * splitting into two memcpys to handle wrap-around. 
- */ -static int nvgpu_submit_append_gpfifo(struct channel_gk20a *c, - struct nvgpu_gpfifo_entry *kern_gpfifo, - struct nvgpu_gpfifo_userdata userdata, - u32 num_entries) -{ - struct gk20a *g = c->g; - int err; - - if (!kern_gpfifo && !c->gpfifo.pipe) { - /* - * This path (from userspace to sysmem) is special in order to - * avoid two copies unnecessarily (from user to pipe, then from - * pipe to gpu sysmem buffer). - */ - err = nvgpu_submit_append_gpfifo_user_direct(c, userdata, - num_entries); - if (err) - return err; - } else if (!kern_gpfifo) { - /* from userspace to vidmem, use the common path */ - err = g->os_channel.copy_user_gpfifo(c->gpfifo.pipe, userdata, - 0, num_entries); - if (err) - return err; - - nvgpu_submit_append_gpfifo_common(c, c->gpfifo.pipe, - num_entries); - } else { - /* from kernel to either sysmem or vidmem, don't need - * copy_user_gpfifo so use the common path */ - nvgpu_submit_append_gpfifo_common(c, kern_gpfifo, num_entries); - } - - trace_write_pushbuffers(c, num_entries); - - c->gpfifo.put = (c->gpfifo.put + num_entries) & - (c->gpfifo.entry_num - 1); - - return 0; -} - -static int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, - struct nvgpu_gpfifo_entry *gpfifo, - struct nvgpu_gpfifo_userdata userdata, - u32 num_entries, - u32 flags, - struct nvgpu_channel_fence *fence, - struct gk20a_fence **fence_out, - struct fifo_profile_gk20a *profile) -{ - struct gk20a *g = c->g; - struct priv_cmd_entry *wait_cmd = NULL; - struct priv_cmd_entry *incr_cmd = NULL; - struct gk20a_fence *post_fence = NULL; - struct channel_gk20a_job *job = NULL; - /* we might need two extra gpfifo entries - one for pre fence - * and one for post fence. */ - const int extra_entries = 2; - bool skip_buffer_refcounting = (flags & - NVGPU_SUBMIT_FLAGS_SKIP_BUFFER_REFCOUNTING); - int err = 0; - bool need_job_tracking; - bool need_deferred_cleanup = false; - - if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) - return -ENODEV; - - if (c->has_timedout) - return -ETIMEDOUT; - - if (!nvgpu_mem_is_valid(&c->gpfifo.mem)) - return -ENOMEM; - - /* fifo not large enough for request. Return error immediately. - * Kernel can insert gpfifo entries before and after user gpfifos. - * So, add extra_entries in user request. Also, HW with fifo size N - * can accept only N-1 entreis and so the below condition */ - if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) { - nvgpu_err(g, "not enough gpfifo space allocated"); - return -ENOMEM; - } - - if ((flags & (NVGPU_SUBMIT_FLAGS_FENCE_WAIT | - NVGPU_SUBMIT_FLAGS_FENCE_GET)) && - !fence) - return -EINVAL; - - /* an address space needs to have been bound at this point. */ - if (!gk20a_channel_as_bound(c)) { - nvgpu_err(g, - "not bound to an address space at time of gpfifo" - " submission."); - return -EINVAL; - } - - gk20a_fifo_profile_snapshot(profile, PROFILE_ENTRY); - - /* update debug settings */ - nvgpu_ltc_sync_enabled(g); - - nvgpu_log_info(g, "channel %d", c->chid); - - /* - * Job tracking is necessary for any of the following conditions: - * - pre- or post-fence functionality - * - channel wdt - * - GPU rail-gating with non-deterministic channels - * - buffer refcounting - * - * If none of the conditions are met, then job tracking is not - * required and a fast submit can be done (ie. only need to write - * out userspace GPFIFO entries and update GP_PUT). 
- */ - need_job_tracking = (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) || - (flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) || - c->timeout.enabled || - (g->can_railgate && !c->deterministic) || - !skip_buffer_refcounting; - - if (need_job_tracking) { - bool need_sync_framework = false; - - /* - * If the channel is to have deterministic latency and - * job tracking is required, the channel must have - * pre-allocated resources. Otherwise, we fail the submit here - */ - if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c)) - return -EINVAL; - - need_sync_framework = - gk20a_channel_sync_needs_sync_framework(g) || - (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE && - flags & NVGPU_SUBMIT_FLAGS_FENCE_GET); - - /* - * Deferred clean-up is necessary for any of the following - * conditions: - * - channel's deterministic flag is not set - * - dependency on sync framework, which could make the - * behavior of the clean-up operation non-deterministic - * (should not be performed in the submit path) - * - channel wdt - * - GPU rail-gating with non-deterministic channels - * - buffer refcounting - * - * If none of the conditions are met, then deferred clean-up - * is not required, and we clean-up one job-tracking - * resource in the submit path. - */ - need_deferred_cleanup = !c->deterministic || - need_sync_framework || - c->timeout.enabled || - (g->can_railgate && - !c->deterministic) || - !skip_buffer_refcounting; - - /* - * For deterministic channels, we don't allow deferred clean_up - * processing to occur. In cases we hit this, we fail the submit - */ - if (c->deterministic && need_deferred_cleanup) - return -EINVAL; - - if (!c->deterministic) { - /* - * Get a power ref unless this is a deterministic - * channel that holds them during the channel lifetime. - * This one is released by gk20a_channel_clean_up_jobs, - * via syncpt or sema interrupt, whichever is used. - */ - err = gk20a_busy(g); - if (err) { - nvgpu_err(g, - "failed to host gk20a to submit gpfifo"); - nvgpu_print_current(g, NULL, NVGPU_ERROR); - return err; - } - } - - if (!need_deferred_cleanup) { - /* clean up a single job */ - gk20a_channel_clean_up_jobs(c, false); - } - } - - - /* Grab access to HW to deal with do_idle */ - if (c->deterministic) - nvgpu_rwsem_down_read(&g->deterministic_busy); - - if (c->deterministic && c->deterministic_railgate_allowed) { - /* - * Nope - this channel has dropped its own power ref. As - * deterministic submits don't hold power on per each submitted - * job like normal ones do, the GPU might railgate any time now - * and thus submit is disallowed. - */ - err = -EINVAL; - goto clean_up; - } - - trace_gk20a_channel_submit_gpfifo(g->name, - c->chid, - num_entries, - flags, - fence ? fence->id : 0, - fence ? fence->value : 0); - - nvgpu_log_info(g, "pre-submit put %d, get %d, size %d", - c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num); - - /* - * Make sure we have enough space for gpfifo entries. Check cached - * values first and then read from HW. If no space, return EAGAIN - * and let userpace decide to re-try request or not. 
- */ - if (nvgpu_gp_free_count(c) < num_entries + extra_entries) { - if (nvgpu_get_gp_free_count(c) < num_entries + extra_entries) { - err = -EAGAIN; - goto clean_up; - } - } - - if (c->has_timedout) { - err = -ETIMEDOUT; - goto clean_up; - } - - if (need_job_tracking) { - err = channel_gk20a_alloc_job(c, &job); - if (err) - goto clean_up; - - err = gk20a_submit_prepare_syncs(c, fence, job, - &wait_cmd, &incr_cmd, - &post_fence, - need_deferred_cleanup, - flags); - if (err) - goto clean_up_job; - } - - gk20a_fifo_profile_snapshot(profile, PROFILE_JOB_TRACKING); - - if (wait_cmd) - gk20a_submit_append_priv_cmdbuf(c, wait_cmd); - - err = nvgpu_submit_append_gpfifo(c, gpfifo, userdata, - num_entries); - if (err) - goto clean_up_job; - - /* - * And here's where we add the incr_cmd we generated earlier. It should - * always run! - */ - if (incr_cmd) - gk20a_submit_append_priv_cmdbuf(c, incr_cmd); - - if (fence_out) - *fence_out = gk20a_fence_get(post_fence); - - if (need_job_tracking) - /* TODO! Check for errors... */ - gk20a_channel_add_job(c, job, skip_buffer_refcounting); - gk20a_fifo_profile_snapshot(profile, PROFILE_APPEND); - - g->ops.fifo.userd_gp_put(g, c); - - /* No hw access beyond this point */ - if (c->deterministic) - nvgpu_rwsem_up_read(&g->deterministic_busy); - - trace_gk20a_channel_submitted_gpfifo(g->name, - c->chid, - num_entries, - flags, - post_fence ? post_fence->syncpt_id : 0, - post_fence ? post_fence->syncpt_value : 0); - - nvgpu_log_info(g, "post-submit put %d, get %d, size %d", - c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num); - - gk20a_fifo_profile_snapshot(profile, PROFILE_END); - - nvgpu_log_fn(g, "done"); - return err; - -clean_up_job: - channel_gk20a_free_job(c, job); -clean_up: - nvgpu_log_fn(g, "fail"); - gk20a_fence_put(post_fence); - if (c->deterministic) - nvgpu_rwsem_up_read(&g->deterministic_busy); - else if (need_deferred_cleanup) - gk20a_idle(g); - - return err; -} - -int gk20a_submit_channel_gpfifo_user(struct channel_gk20a *c, - struct nvgpu_gpfifo_userdata userdata, - u32 num_entries, - u32 flags, - struct nvgpu_channel_fence *fence, - struct gk20a_fence **fence_out, - struct fifo_profile_gk20a *profile) -{ - return gk20a_submit_channel_gpfifo(c, NULL, userdata, num_entries, - flags, fence, fence_out, profile); -} - -int gk20a_submit_channel_gpfifo_kernel(struct channel_gk20a *c, - struct nvgpu_gpfifo_entry *gpfifo, - u32 num_entries, - u32 flags, - struct nvgpu_channel_fence *fence, - struct gk20a_fence **fence_out) -{ - struct nvgpu_gpfifo_userdata userdata = { NULL, NULL }; - return gk20a_submit_channel_gpfifo(c, gpfifo, userdata, num_entries, - flags, fence, fence_out, NULL); } +#endif diff --git a/drivers/gpu/nvgpu/os/linux/channel.h b/drivers/gpu/nvgpu/os/linux/channel.h index 43fa492b..87231a79 100644 --- a/drivers/gpu/nvgpu/os/linux/channel.h +++ b/drivers/gpu/nvgpu/os/linux/channel.h @@ -84,19 +84,4 @@ struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g, int runlist_id, bool is_privileged_channel); -int gk20a_submit_channel_gpfifo_user(struct channel_gk20a *c, - struct nvgpu_gpfifo_userdata userdata, - u32 num_entries, - u32 flags, - struct nvgpu_channel_fence *fence, - struct gk20a_fence **fence_out, - struct fifo_profile_gk20a *profile); - -int gk20a_submit_channel_gpfifo_kernel(struct channel_gk20a *c, - struct nvgpu_gpfifo_entry *gpfifo, - u32 num_entries, - u32 flags, - struct nvgpu_channel_fence *fence, - struct gk20a_fence **fence_out); - #endif /* __NVGPU_CHANNEL_H__ */ diff --git 
a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
index fa6a02d6..7b003b76 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 
 #include "gk20a/gk20a.h"
 #include "gk20a/dbg_gpu_gk20a.h"
@@ -799,11 +800,11 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 		return fd;
 	}
 
-	userdata.entries = (struct nvgpu_gpfifo_entry __user*)
+	userdata.entries = (struct nvgpu_gpfifo_entry __user *)
 			(uintptr_t)args->gpfifo;
 	userdata.context = NULL;
 
-	ret = gk20a_submit_channel_gpfifo_user(ch,
+	ret = nvgpu_submit_channel_gpfifo_user(ch,
 			userdata, args->num_entries,
 			submit_flags, &fence,
 			&fence_out, profile);
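
Editor's note: to make the relocated entry points concrete, below is a minimal sketch of a kernel-internal submit after this change, modeled on the ce2.c call site above. It is illustrative only, not part of the patch: example_kernel_submit(), pushbuf_gva and pushbuf_words are hypothetical names, and it assumes the new header is pulled in as <nvgpu/channel.h> and that the pbdma HW header providing pbdma_gp_entry1_length_f() is available, as in submit.c.

/*
 * Illustrative sketch (not part of the patch): an in-kernel caller,
 * similar to the ce2.c/cde.c paths, using the renamed submit API.
 */
#include <nvgpu/channel.h>

static int example_kernel_submit(struct channel_gk20a *ch,
				 u64 pushbuf_gva, u32 pushbuf_words)
{
	struct nvgpu_gpfifo_entry gpfifo;
	struct nvgpu_channel_fence fence = { 0 };
	struct gk20a_fence *fence_out = NULL;
	int err;

	/* One gpfifo entry pointing at an already-mapped pushbuffer. */
	gpfifo.entry0 = u64_lo32(pushbuf_gva);
	gpfifo.entry1 = u64_hi32(pushbuf_gva) |
			pbdma_gp_entry1_length_f(pushbuf_words);

	/* Request a post-fence so completion can be tracked. */
	err = nvgpu_submit_channel_gpfifo_kernel(ch, &gpfifo, 1,
			NVGPU_SUBMIT_FLAGS_FENCE_GET,
			&fence, &fence_out);
	if (err)
		return err;

	/* The caller owns a reference on the returned post-fence. */
	gk20a_fence_put(fence_out);

	return 0;
}

The user-facing path in ioctl_channel.c is analogous but calls nvgpu_submit_channel_gpfifo_user() with a struct nvgpu_gpfifo_userdata describing the userspace entries, so the copy into the gpfifo ring buffer is performed through g->os_channel.copy_user_gpfifo() inside the common submit code.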