From 23c7903eff6ee1ab184dfcc62c054de1557e5b1d Mon Sep 17 00:00:00 2001 From: Deepak Nibade Date: Thu, 26 Oct 2017 08:29:56 -0700 Subject: gpu: nvgpu: move submit path to linux Nvgpu submit path has a lot of dependency on Linux framework e.g. use of copy_from_user, use of structures defined in uapi/nvgpu headers, dma_buf_* calls for trace support etc Hence to keep common code independent of Linux code, move submit path to Linux directory Move below APIs to common/linux/channel.c trace_write_pushbuffer() trace_write_pushbuffer_range() gk20a_submit_prepare_syncs() gk20a_submit_append_priv_cmdbuf() gk20a_submit_append_gpfifo() gk20a_submit_channel_gpfifo() Move below APIs to common/linux/ce2.c gk20a_ce_execute_ops() Define gk20a_ce_execute_ops() in common/linux/ce2.c, and declare it in gk20a/ce2_gk20a.h since it is needed in common/mm code too Each OS needs to implement this API separately gk20a_channel_alloc_gpfifo() use sizeof(nvgpu_gpfifo) to get size of one gpfifo entry, but structure nvgpu_gpfifo is linux specific Define new nvgpu_get_gpfifo_entry_size() in linux specific code and use it in gk20a_channel_alloc_gpfifo() to get gpfifo entry size Each OS needs to implement this API separately Export some APIs from gk20a/ce2_gk20a.h and gk20a/channel_gk20a.h that are needed in linux code Jira NVGPU-259 Jira NVGPU-313 Change-Id: I360c6cb8ce4494b1e50c66af334a2a379f0d2dc4 Signed-off-by: Deepak Nibade Reviewed-on: https://git-master.nvidia.com/r/1586277 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/Makefile | 2 + drivers/gpu/nvgpu/common/linux/cde.c | 1 + drivers/gpu/nvgpu/common/linux/ce2.c | 185 +++++++ drivers/gpu/nvgpu/common/linux/channel.c | 648 ++++++++++++++++++++++++ drivers/gpu/nvgpu/common/linux/channel.h | 38 ++ drivers/gpu/nvgpu/common/linux/ioctl_channel.c | 1 + drivers/gpu/nvgpu/gk20a/ce2_gk20a.c | 164 +----- drivers/gpu/nvgpu/gk20a/ce2_gk20a.h | 10 + drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 666 +------------------------ drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 29 +- 10 files changed, 917 insertions(+), 827 deletions(-) create mode 100644 drivers/gpu/nvgpu/common/linux/ce2.c create mode 100644 drivers/gpu/nvgpu/common/linux/channel.c create mode 100644 drivers/gpu/nvgpu/common/linux/channel.h (limited to 'drivers/gpu') diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 06d3dedb..9c6c59f2 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -54,6 +54,8 @@ nvgpu-y := \ common/linux/comptags.o \ common/linux/dmabuf.o \ common/linux/sched.o \ + common/linux/channel.o \ + common/linux/ce2.o \ common/mm/nvgpu_allocator.o \ common/mm/bitmap_allocator.o \ common/mm/buddy_allocator.o \ diff --git a/drivers/gpu/nvgpu/common/linux/cde.c b/drivers/gpu/nvgpu/common/linux/cde.c index 6600fe42..f6020d9a 100644 --- a/drivers/gpu/nvgpu/common/linux/cde.c +++ b/drivers/gpu/nvgpu/common/linux/cde.c @@ -42,6 +42,7 @@ #include "cde.h" #include "os_linux.h" #include "dmabuf.h" +#include "channel.h" #include #include diff --git a/drivers/gpu/nvgpu/common/linux/ce2.c b/drivers/gpu/nvgpu/common/linux/ce2.c new file mode 100644 index 00000000..3fee23e5 --- /dev/null +++ b/drivers/gpu/nvgpu/common/linux/ce2.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2017, NVIDIA Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include + +#include + +#include "gk20a/ce2_gk20a.h" +#include "gk20a/gk20a.h" +#include "channel.h" + +static inline int gk20a_get_valid_launch_flags(struct gk20a *g, int launch_flags) +{ + /* there is no local memory available, + don't allow local memory related CE flags */ + if (!g->mm.vidmem.size) { + launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB | + NVGPU_CE_DST_LOCATION_LOCAL_FB); + } + return launch_flags; +} + +int gk20a_ce_execute_ops(struct gk20a *g, + u32 ce_ctx_id, + u64 src_buf, + u64 dst_buf, + u64 size, + unsigned int payload, + int launch_flags, + int request_operation, + struct gk20a_fence *gk20a_fence_in, + u32 submit_flags, + struct gk20a_fence **gk20a_fence_out) +{ + int ret = -EPERM; + struct gk20a_ce_app *ce_app = &g->ce_app; + struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save; + bool found = false; + u32 *cmd_buf_cpu_va; + u64 cmd_buf_gpu_va = 0; + u32 methodSize; + u32 cmd_buf_read_offset; + u32 fence_index; + struct nvgpu_gpfifo gpfifo; + struct nvgpu_fence fence = {0,0}; + struct gk20a_fence *ce_cmd_buf_fence_out = NULL; + struct nvgpu_gpu_characteristics *gpu_capability = &g->gpu_characteristics; + + if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE) + goto end; + + nvgpu_mutex_acquire(&ce_app->app_mutex); + + nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save, + &ce_app->allocated_contexts, gk20a_gpu_ctx, list) { + if (ce_ctx->ctx_id == ce_ctx_id) { + found = true; + break; + } + } + + nvgpu_mutex_release(&ce_app->app_mutex); + + if (!found) { + ret = -EINVAL; + goto end; + } + + if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) { + ret = -ENODEV; + goto end; + } + + nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex); + + ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset; + + cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset * + (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32))); + + /* at end of command buffer has gk20a_fence for command buffer sync */ + fence_index = (cmd_buf_read_offset + + ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) - + (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32)))); + + if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) { + ret = -ENOMEM; + goto noop; + } + + cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va; + + /* 0 is treated as invalid pre-sync */ + if (cmd_buf_cpu_va[fence_index]) { + struct gk20a_fence * ce_cmd_buf_fence_in = NULL; + + memcpy((void *)&ce_cmd_buf_fence_in, + (void *)(cmd_buf_cpu_va + fence_index), + sizeof(struct gk20a_fence *)); + ret = gk20a_fence_wait(g, ce_cmd_buf_fence_in, + gk20a_get_gr_idle_timeout(g)); + + gk20a_fence_put(ce_cmd_buf_fence_in); + /* Reset the stored last pre-sync */ + memset((void *)(cmd_buf_cpu_va + fence_index), + 0, + NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING); + if (ret) + goto noop; + } + + cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va + (u64)(cmd_buf_read_offset *sizeof(u32))); + + methodSize = gk20a_ce_prepare_submit(src_buf, + dst_buf, + size, + &cmd_buf_cpu_va[cmd_buf_read_offset], + NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF, + payload, + gk20a_get_valid_launch_flags(g, launch_flags), + request_operation, + 
gpu_capability->dma_copy_class, + gk20a_fence_in); + + if (methodSize) { + /* TODO: Remove CPU pre-fence wait */ + if (gk20a_fence_in) { + ret = gk20a_fence_wait(g, gk20a_fence_in, + gk20a_get_gr_idle_timeout(g)); + gk20a_fence_put(gk20a_fence_in); + if (ret) + goto noop; + } + + /* store the element into gpfifo */ + gpfifo.entry0 = + u64_lo32(cmd_buf_gpu_va); + gpfifo.entry1 = + (u64_hi32(cmd_buf_gpu_va) | + pbdma_gp_entry1_length_f(methodSize)); + + /* take always the postfence as it is needed for protecting the ce context */ + submit_flags |= NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET; + + nvgpu_smp_wmb(); + + ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL, + 1, submit_flags, &fence, + &ce_cmd_buf_fence_out, false, NULL); + + if (!ret) { + memcpy((void *)(cmd_buf_cpu_va + fence_index), + (void *)&ce_cmd_buf_fence_out, + sizeof(struct gk20a_fence *)); + + if (gk20a_fence_out) { + gk20a_fence_get(ce_cmd_buf_fence_out); + *gk20a_fence_out = ce_cmd_buf_fence_out; + } + + /* Next available command buffer queue Index */ + ++ce_ctx->cmd_buf_read_queue_offset; + ++ce_ctx->submitted_seq_number; + } + } else { + ret = -ENOMEM; + } +noop: + nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex); +end: + return ret; +} diff --git a/drivers/gpu/nvgpu/common/linux/channel.c b/drivers/gpu/nvgpu/common/linux/channel.c new file mode 100644 index 00000000..716c5820 --- /dev/null +++ b/drivers/gpu/nvgpu/common/linux/channel.c @@ -0,0 +1,648 @@ +/* + * Copyright (c) 2017, NVIDIA Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +/* + * This is required for nvgpu_vm_find_buf() which is used in the tracing + * code. Once we can get and access userspace buffers without requiring + * direct dma_buf usage this can be removed. + */ +#include + +#include "gk20a/gk20a.h" + +#include + +#include +#include +#include + +u32 nvgpu_get_gpfifo_entry_size(void) +{ + return sizeof(struct nvgpu_gpfifo); +} + +#ifdef CONFIG_DEBUG_FS +static void trace_write_pushbuffer(struct channel_gk20a *c, + struct nvgpu_gpfifo *g) +{ + void *mem = NULL; + unsigned int words; + u64 offset; + struct dma_buf *dmabuf = NULL; + + if (gk20a_debug_trace_cmdbuf) { + u64 gpu_va = (u64)g->entry0 | + (u64)((u64)pbdma_gp_entry1_get_hi_v(g->entry1) << 32); + int err; + + words = pbdma_gp_entry1_length_v(g->entry1); + err = nvgpu_vm_find_buf(c->vm, gpu_va, &dmabuf, &offset); + if (!err) + mem = dma_buf_vmap(dmabuf); + } + + if (mem) { + u32 i; + /* + * Write in batches of 128 as there seems to be a limit + * of how much you can output to ftrace at once. 
+ */ + for (i = 0; i < words; i += 128U) { + trace_gk20a_push_cmdbuf( + c->g->name, + 0, + min(words - i, 128U), + offset + i * sizeof(u32), + mem); + } + dma_buf_vunmap(dmabuf, mem); + } +} +#endif + +static void trace_write_pushbuffer_range(struct channel_gk20a *c, + struct nvgpu_gpfifo *g, + struct nvgpu_gpfifo __user *user_gpfifo, + int offset, + int count) +{ +#ifdef CONFIG_DEBUG_FS + u32 size; + int i; + struct nvgpu_gpfifo *gp; + bool gpfifo_allocated = false; + + if (!gk20a_debug_trace_cmdbuf) + return; + + if (!g && !user_gpfifo) + return; + + if (!g) { + size = count * sizeof(struct nvgpu_gpfifo); + if (size) { + g = nvgpu_big_malloc(c->g, size); + if (!g) + return; + + if (copy_from_user(g, user_gpfifo, size)) { + nvgpu_big_free(c->g, g); + return; + } + } + gpfifo_allocated = true; + } + + gp = g + offset; + for (i = 0; i < count; i++, gp++) + trace_write_pushbuffer(c, gp); + + if (gpfifo_allocated) + nvgpu_big_free(c->g, g); +#endif +} + +/* + * Handle the submit synchronization - pre-fences and post-fences. + */ +static int gk20a_submit_prepare_syncs(struct channel_gk20a *c, + struct nvgpu_fence *fence, + struct channel_gk20a_job *job, + struct priv_cmd_entry **wait_cmd, + struct priv_cmd_entry **incr_cmd, + struct gk20a_fence **pre_fence, + struct gk20a_fence **post_fence, + bool force_need_sync_fence, + bool register_irq, + u32 flags) +{ + struct gk20a *g = c->g; + bool need_sync_fence = false; + bool new_sync_created = false; + int wait_fence_fd = -1; + int err = 0; + bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI); + bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c); + + /* + * If user wants to always allocate sync_fence_fds then respect that; + * otherwise, allocate sync_fence_fd based on user flags. + */ + if (force_need_sync_fence) + need_sync_fence = true; + + if (g->aggressive_sync_destroy_thresh) { + nvgpu_mutex_acquire(&c->sync_lock); + if (!c->sync) { + c->sync = gk20a_channel_sync_create(c); + if (!c->sync) { + err = -ENOMEM; + nvgpu_mutex_release(&c->sync_lock); + goto fail; + } + new_sync_created = true; + } + nvgpu_atomic_inc(&c->sync->refcount); + nvgpu_mutex_release(&c->sync_lock); + } + + if (g->ops.fifo.resetup_ramfc && new_sync_created) { + err = g->ops.fifo.resetup_ramfc(c); + if (err) + goto fail; + } + + /* + * Optionally insert syncpt wait in the beginning of gpfifo submission + * when user requested and the wait hasn't expired. Validate that the id + * makes sense, elide if not. The only reason this isn't being + * unceremoniously killed is to keep running some tests which trigger + * this condition. 
+ */ + if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) { + job->pre_fence = gk20a_alloc_fence(c); + if (!job->pre_fence) { + err = -ENOMEM; + goto fail; + } + + if (!pre_alloc_enabled) + job->wait_cmd = nvgpu_kzalloc(g, + sizeof(struct priv_cmd_entry)); + + if (!job->wait_cmd) { + err = -ENOMEM; + goto clean_up_pre_fence; + } + + if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) { + wait_fence_fd = fence->id; + err = c->sync->wait_fd(c->sync, wait_fence_fd, + job->wait_cmd, job->pre_fence); + } else { + err = c->sync->wait_syncpt(c->sync, fence->id, + fence->value, job->wait_cmd, + job->pre_fence); + } + + if (!err) { + if (job->wait_cmd->valid) + *wait_cmd = job->wait_cmd; + *pre_fence = job->pre_fence; + } else + goto clean_up_wait_cmd; + } + + if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) && + (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE)) + need_sync_fence = true; + + /* + * Always generate an increment at the end of a GPFIFO submission. This + * is used to keep track of method completion for idle railgating. The + * sync_pt/semaphore PB is added to the GPFIFO later on in submit. + */ + job->post_fence = gk20a_alloc_fence(c); + if (!job->post_fence) { + err = -ENOMEM; + goto clean_up_wait_cmd; + } + if (!pre_alloc_enabled) + job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry)); + + if (!job->incr_cmd) { + err = -ENOMEM; + goto clean_up_post_fence; + } + + if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) + err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd, + job->post_fence, need_wfi, need_sync_fence, + register_irq); + else + err = c->sync->incr(c->sync, job->incr_cmd, + job->post_fence, need_sync_fence, + register_irq); + if (!err) { + *incr_cmd = job->incr_cmd; + *post_fence = job->post_fence; + } else + goto clean_up_incr_cmd; + + return 0; + +clean_up_incr_cmd: + free_priv_cmdbuf(c, job->incr_cmd); + if (!pre_alloc_enabled) + job->incr_cmd = NULL; +clean_up_post_fence: + gk20a_fence_put(job->post_fence); + job->post_fence = NULL; +clean_up_wait_cmd: + free_priv_cmdbuf(c, job->wait_cmd); + if (!pre_alloc_enabled) + job->wait_cmd = NULL; +clean_up_pre_fence: + gk20a_fence_put(job->pre_fence); + job->pre_fence = NULL; +fail: + *wait_cmd = NULL; + *pre_fence = NULL; + return err; +} + +static void gk20a_submit_append_priv_cmdbuf(struct channel_gk20a *c, + struct priv_cmd_entry *cmd) +{ + struct gk20a *g = c->g; + struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem; + struct nvgpu_gpfifo x = { + .entry0 = u64_lo32(cmd->gva), + .entry1 = u64_hi32(cmd->gva) | + pbdma_gp_entry1_length_f(cmd->size) + }; + + nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x), + &x, sizeof(x)); + + if (cmd->mem->aperture == APERTURE_SYSMEM) + trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0, + cmd->mem->cpu_va + cmd->off * sizeof(u32)); + + c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1); +} + +/* + * Copy source gpfifo entries into the gpfifo ring buffer, potentially + * splitting into two memcpys to handle wrap-around. 
+ */ +static int gk20a_submit_append_gpfifo(struct channel_gk20a *c, + struct nvgpu_gpfifo *kern_gpfifo, + struct nvgpu_gpfifo __user *user_gpfifo, + u32 num_entries) +{ + /* byte offsets */ + u32 gpfifo_size = c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo); + u32 len = num_entries * sizeof(struct nvgpu_gpfifo); + u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo); + u32 end = start + len; /* exclusive */ + struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem; + struct nvgpu_gpfifo *cpu_src; + int err; + + if (user_gpfifo && !c->gpfifo.pipe) { + /* + * This path (from userspace to sysmem) is special in order to + * avoid two copies unnecessarily (from user to pipe, then from + * pipe to gpu sysmem buffer). + * + * As a special case, the pipe buffer exists if PRAMIN writes + * are forced, although the buffers may not be in vidmem in + * that case. + */ + if (end > gpfifo_size) { + /* wrap-around */ + int length0 = gpfifo_size - start; + int length1 = len - length0; + void __user *user2 = (u8 __user *)user_gpfifo + length0; + + err = copy_from_user(gpfifo_mem->cpu_va + start, + user_gpfifo, length0); + if (err) + return err; + + err = copy_from_user(gpfifo_mem->cpu_va, + user2, length1); + if (err) + return err; + } else { + err = copy_from_user(gpfifo_mem->cpu_va + start, + user_gpfifo, len); + if (err) + return err; + } + + trace_write_pushbuffer_range(c, NULL, user_gpfifo, + 0, num_entries); + goto out; + } else if (user_gpfifo) { + /* from userspace to vidmem or sysmem when pramin forced, use + * the common copy path below */ + err = copy_from_user(c->gpfifo.pipe, user_gpfifo, len); + if (err) + return err; + + cpu_src = c->gpfifo.pipe; + } else { + /* from kernel to either sysmem or vidmem, don't need + * copy_from_user so use the common path below */ + cpu_src = kern_gpfifo; + } + + if (end > gpfifo_size) { + /* wrap-around */ + int length0 = gpfifo_size - start; + int length1 = len - length0; + void *src2 = (u8 *)cpu_src + length0; + + nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, length0); + nvgpu_mem_wr_n(c->g, gpfifo_mem, 0, src2, length1); + } else { + nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, len); + + } + + trace_write_pushbuffer_range(c, cpu_src, NULL, 0, num_entries); + +out: + c->gpfifo.put = (c->gpfifo.put + num_entries) & + (c->gpfifo.entry_num - 1); + + return 0; +} + +int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, + struct nvgpu_gpfifo *gpfifo, + struct nvgpu_submit_gpfifo_args *args, + u32 num_entries, + u32 flags, + struct nvgpu_fence *fence, + struct gk20a_fence **fence_out, + bool force_need_sync_fence, + struct fifo_profile_gk20a *profile) +{ + struct gk20a *g = c->g; + struct priv_cmd_entry *wait_cmd = NULL; + struct priv_cmd_entry *incr_cmd = NULL; + struct gk20a_fence *pre_fence = NULL; + struct gk20a_fence *post_fence = NULL; + struct channel_gk20a_job *job = NULL; + /* we might need two extra gpfifo entries - one for pre fence + * and one for post fence. */ + const int extra_entries = 2; + bool skip_buffer_refcounting = (flags & + NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING); + int err = 0; + bool need_job_tracking; + bool need_deferred_cleanup = false; + struct nvgpu_gpfifo __user *user_gpfifo = args ? + (struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo : NULL; + + if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) + return -ENODEV; + + if (c->has_timedout) + return -ETIMEDOUT; + + if (!nvgpu_mem_is_valid(&c->gpfifo.mem)) + return -ENOMEM; + + /* fifo not large enough for request. Return error immediately. 
+ * Kernel can insert gpfifo entries before and after user gpfifos. + * So, add extra_entries in user request. Also, HW with fifo size N + * can accept only N-1 entreis and so the below condition */ + if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) { + nvgpu_err(g, "not enough gpfifo space allocated"); + return -ENOMEM; + } + + if (!gpfifo && !args) + return -EINVAL; + + if ((flags & (NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT | + NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) && + !fence) + return -EINVAL; + + /* an address space needs to have been bound at this point. */ + if (!gk20a_channel_as_bound(c)) { + nvgpu_err(g, + "not bound to an address space at time of gpfifo" + " submission."); + return -EINVAL; + } + + if (profile) + profile->timestamp[PROFILE_ENTRY] = sched_clock(); + + /* update debug settings */ + nvgpu_ltc_sync_enabled(g); + + gk20a_dbg_info("channel %d", c->chid); + + /* + * Job tracking is necessary for any of the following conditions: + * - pre- or post-fence functionality + * - channel wdt + * - GPU rail-gating with non-deterministic channels + * - buffer refcounting + * + * If none of the conditions are met, then job tracking is not + * required and a fast submit can be done (ie. only need to write + * out userspace GPFIFO entries and update GP_PUT). + */ + need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) || + (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) || + c->wdt_enabled || + (g->can_railgate && !c->deterministic) || + !skip_buffer_refcounting; + + if (need_job_tracking) { + bool need_sync_framework = false; + + /* + * If the channel is to have deterministic latency and + * job tracking is required, the channel must have + * pre-allocated resources. Otherwise, we fail the submit here + */ + if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c)) + return -EINVAL; + + need_sync_framework = force_need_sync_fence || + gk20a_channel_sync_needs_sync_framework(g) || + (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE && + (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT || + flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)); + + /* + * Deferred clean-up is necessary for any of the following + * conditions: + * - channel's deterministic flag is not set + * - dependency on sync framework, which could make the + * behavior of the clean-up operation non-deterministic + * (should not be performed in the submit path) + * - channel wdt + * - GPU rail-gating with non-deterministic channels + * - buffer refcounting + * + * If none of the conditions are met, then deferred clean-up + * is not required, and we clean-up one job-tracking + * resource in the submit path. + */ + need_deferred_cleanup = !c->deterministic || + need_sync_framework || + c->wdt_enabled || + (g->can_railgate && + !c->deterministic) || + !skip_buffer_refcounting; + + /* + * For deterministic channels, we don't allow deferred clean_up + * processing to occur. In cases we hit this, we fail the submit + */ + if (c->deterministic && need_deferred_cleanup) + return -EINVAL; + + if (!c->deterministic) { + /* + * Get a power ref unless this is a deterministic + * channel that holds them during the channel lifetime. + * This one is released by gk20a_channel_clean_up_jobs, + * via syncpt or sema interrupt, whichever is used. 
+ */ + err = gk20a_busy(g); + if (err) { + nvgpu_err(g, + "failed to host gk20a to submit gpfifo, process %s", + current->comm); + return err; + } + } + + if (!need_deferred_cleanup) { + /* clean up a single job */ + gk20a_channel_clean_up_jobs(c, false); + } + } + + + /* Grab access to HW to deal with do_idle */ + if (c->deterministic) + nvgpu_rwsem_down_read(&g->deterministic_busy); + + trace_gk20a_channel_submit_gpfifo(g->name, + c->chid, + num_entries, + flags, + fence ? fence->id : 0, + fence ? fence->value : 0); + + gk20a_dbg_info("pre-submit put %d, get %d, size %d", + c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num); + + /* + * Make sure we have enough space for gpfifo entries. Check cached + * values first and then read from HW. If no space, return EAGAIN + * and let userpace decide to re-try request or not. + */ + if (nvgpu_gp_free_count(c) < num_entries + extra_entries) { + if (nvgpu_get_gp_free_count(c) < num_entries + extra_entries) { + err = -EAGAIN; + goto clean_up; + } + } + + if (c->has_timedout) { + err = -ETIMEDOUT; + goto clean_up; + } + + if (need_job_tracking) { + err = channel_gk20a_alloc_job(c, &job); + if (err) + goto clean_up; + + err = gk20a_submit_prepare_syncs(c, fence, job, + &wait_cmd, &incr_cmd, + &pre_fence, &post_fence, + force_need_sync_fence, + need_deferred_cleanup, + flags); + if (err) + goto clean_up_job; + } + + if (profile) + profile->timestamp[PROFILE_JOB_TRACKING] = sched_clock(); + + if (wait_cmd) + gk20a_submit_append_priv_cmdbuf(c, wait_cmd); + + if (gpfifo || user_gpfifo) + err = gk20a_submit_append_gpfifo(c, gpfifo, user_gpfifo, + num_entries); + if (err) + goto clean_up_job; + + /* + * And here's where we add the incr_cmd we generated earlier. It should + * always run! + */ + if (incr_cmd) + gk20a_submit_append_priv_cmdbuf(c, incr_cmd); + + if (fence_out) + *fence_out = gk20a_fence_get(post_fence); + + if (need_job_tracking) + /* TODO! Check for errors... */ + gk20a_channel_add_job(c, job, skip_buffer_refcounting); + if (profile) + profile->timestamp[PROFILE_APPEND] = sched_clock(); + + g->ops.fifo.userd_gp_put(g, c); + + if ((NVGPU_SUBMIT_GPFIFO_FLAGS_RESCHEDULE_RUNLIST & flags) && + g->ops.fifo.reschedule_runlist) + g->ops.fifo.reschedule_runlist(g, c->runlist_id); + + /* No hw access beyond this point */ + if (c->deterministic) + nvgpu_rwsem_up_read(&g->deterministic_busy); + + trace_gk20a_channel_submitted_gpfifo(g->name, + c->chid, + num_entries, + flags, + post_fence ? post_fence->syncpt_id : 0, + post_fence ? post_fence->syncpt_value : 0); + + gk20a_dbg_info("post-submit put %d, get %d, size %d", + c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num); + + if (profile) + profile->timestamp[PROFILE_END] = sched_clock(); + gk20a_dbg_fn("done"); + return err; + +clean_up_job: + channel_gk20a_free_job(c, job); +clean_up: + gk20a_dbg_fn("fail"); + gk20a_fence_put(pre_fence); + gk20a_fence_put(post_fence); + if (c->deterministic) + nvgpu_rwsem_up_read(&g->deterministic_busy); + else if (need_deferred_cleanup) + gk20a_idle(g); + + return err; +} + diff --git a/drivers/gpu/nvgpu/common/linux/channel.h b/drivers/gpu/nvgpu/common/linux/channel.h new file mode 100644 index 00000000..785c03d6 --- /dev/null +++ b/drivers/gpu/nvgpu/common/linux/channel.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#ifndef __NVGPU_CHANNEL_H__ +#define __NVGPU_CHANNEL_H__ + +#include + +struct channel_gk20a; +struct nvgpu_gpfifo; +struct nvgpu_submit_gpfifo_args; +struct nvgpu_fence; +struct gk20a_fence; +struct fifo_profile_gk20a; + +int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, + struct nvgpu_gpfifo *gpfifo, + struct nvgpu_submit_gpfifo_args *args, + u32 num_entries, + u32 flags, + struct nvgpu_fence *fence, + struct gk20a_fence **fence_out, + bool force_need_sync_fence, + struct fifo_profile_gk20a *profile); + +#endif /* __NVGPU_CHANNEL_H__ */ diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c index 91dfc630..5b0c4a50 100644 --- a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c +++ b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c @@ -36,6 +36,7 @@ #include "gk20a/platform_gk20a.h" #include "ioctl_channel.h" +#include "channel.h" #include "os_linux.h" #include "ctxsw_trace.h" diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c index 5314a1be..9ff6c792 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c @@ -249,18 +249,7 @@ static inline unsigned int gk20a_ce_get_method_size(int request_operation, return methodsize; } -static inline int gk20a_get_valid_launch_flags(struct gk20a *g, int launch_flags) -{ - /* there is no local memory available, - don't allow local memory related CE flags */ - if (!g->mm.vidmem.size) { - launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB | - NVGPU_CE_DST_LOCATION_LOCAL_FB); - } - return launch_flags; -} - -static int gk20a_ce_prepare_submit(u64 src_buf, +int gk20a_ce_prepare_submit(u64 src_buf, u64 dst_buf, u64 size, u32 *cmd_buf_cpu_va, @@ -626,157 +615,6 @@ end: } EXPORT_SYMBOL(gk20a_ce_create_context_with_cb); -int gk20a_ce_execute_ops(struct gk20a *g, - u32 ce_ctx_id, - u64 src_buf, - u64 dst_buf, - u64 size, - unsigned int payload, - int launch_flags, - int request_operation, - struct gk20a_fence *gk20a_fence_in, - u32 submit_flags, - struct gk20a_fence **gk20a_fence_out) -{ - int ret = -EPERM; - struct gk20a_ce_app *ce_app = &g->ce_app; - struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save; - bool found = false; - u32 *cmd_buf_cpu_va; - u64 cmd_buf_gpu_va = 0; - u32 methodSize; - u32 cmd_buf_read_offset; - u32 fence_index; - struct nvgpu_gpfifo gpfifo; - struct nvgpu_fence fence = {0,0}; - struct gk20a_fence *ce_cmd_buf_fence_out = NULL; - struct nvgpu_gpu_characteristics *gpu_capability = &g->gpu_characteristics; - - if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE) - goto end; - - nvgpu_mutex_acquire(&ce_app->app_mutex); - - nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save, - &ce_app->allocated_contexts, gk20a_gpu_ctx, list) { - if (ce_ctx->ctx_id == ce_ctx_id) { - found = true; - break; - } - } - - nvgpu_mutex_release(&ce_app->app_mutex); - - if (!found) { - ret = -EINVAL; - goto end; - } - - if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) { - ret = -ENODEV; - goto end; - } - - nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex); - - ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset; - - cmd_buf_read_offset = 
(ce_ctx->cmd_buf_read_queue_offset * - (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32))); - - /* at end of command buffer has gk20a_fence for command buffer sync */ - fence_index = (cmd_buf_read_offset + - ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) - - (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32)))); - - if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) { - ret = -ENOMEM; - goto noop; - } - - cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va; - - /* 0 is treated as invalid pre-sync */ - if (cmd_buf_cpu_va[fence_index]) { - struct gk20a_fence * ce_cmd_buf_fence_in = NULL; - - memcpy((void *)&ce_cmd_buf_fence_in, - (void *)(cmd_buf_cpu_va + fence_index), - sizeof(struct gk20a_fence *)); - ret = gk20a_fence_wait(g, ce_cmd_buf_fence_in, - gk20a_get_gr_idle_timeout(g)); - - gk20a_fence_put(ce_cmd_buf_fence_in); - /* Reset the stored last pre-sync */ - memset((void *)(cmd_buf_cpu_va + fence_index), - 0, - NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING); - if (ret) - goto noop; - } - - cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va + (u64)(cmd_buf_read_offset *sizeof(u32))); - - methodSize = gk20a_ce_prepare_submit(src_buf, - dst_buf, - size, - &cmd_buf_cpu_va[cmd_buf_read_offset], - NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF, - payload, - gk20a_get_valid_launch_flags(g, launch_flags), - request_operation, - gpu_capability->dma_copy_class, - gk20a_fence_in); - - if (methodSize) { - /* TODO: Remove CPU pre-fence wait */ - if (gk20a_fence_in) { - ret = gk20a_fence_wait(g, gk20a_fence_in, - gk20a_get_gr_idle_timeout(g)); - gk20a_fence_put(gk20a_fence_in); - if (ret) - goto noop; - } - - /* store the element into gpfifo */ - gpfifo.entry0 = - u64_lo32(cmd_buf_gpu_va); - gpfifo.entry1 = - (u64_hi32(cmd_buf_gpu_va) | - pbdma_gp_entry1_length_f(methodSize)); - - /* take always the postfence as it is needed for protecting the ce context */ - submit_flags |= NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET; - - nvgpu_smp_wmb(); - - ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL, - 1, submit_flags, &fence, - &ce_cmd_buf_fence_out, false, NULL); - - if (!ret) { - memcpy((void *)(cmd_buf_cpu_va + fence_index), - (void *)&ce_cmd_buf_fence_out, - sizeof(struct gk20a_fence *)); - - if (gk20a_fence_out) { - gk20a_fence_get(ce_cmd_buf_fence_out); - *gk20a_fence_out = ce_cmd_buf_fence_out; - } - - /* Next available command buffer queue Index */ - ++ce_ctx->cmd_buf_read_queue_offset; - ++ce_ctx->submitted_seq_number; - } - } else - ret = -ENOMEM; -noop: - nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex); -end: - return ret; -} -EXPORT_SYMBOL(gk20a_ce_execute_ops); - void gk20a_ce_delete_context(struct gk20a *g, u32 ce_ctx_id) { diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h index 1dad8952..8d3a4ca3 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h @@ -161,5 +161,15 @@ void gk20a_ce_delete_context_priv(struct gk20a *g, u32 ce_ctx_id); void gk20a_ce_delete_context(struct gk20a *g, u32 ce_ctx_id); +int gk20a_ce_prepare_submit(u64 src_buf, + u64 dst_buf, + u64 size, + u32 *cmd_buf_cpu_va, + u32 max_cmd_buf_size, + unsigned int payload, + int launch_flags, + int request_operation, + u32 dma_copy_class, + struct gk20a_fence *gk20a_fence_in); #endif /*__CE2_GK20A_H__*/ diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 00d20357..c938ba6b 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c 
@@ -44,45 +44,13 @@ #include #include -/* - * This is required for nvgpu_vm_find_buf() which is used in the tracing - * code. Once we can get and access userspace buffers without requiring - * direct dma_buf usage this can be removed. - */ -#include - #include "gk20a.h" #include "dbg_gpu_gk20a.h" #include "fence_gk20a.h" -#include - -/* - * Note - * This is added for all the copy_from_user methods in this file which needs to - * be moved lated to reduce depenedency on Linux - */ -#include - -/* - * Although channels do have pointers back to the gk20a struct that they were - * created under in cases where the driver is killed that pointer can be bad. - * The channel memory can be freed before the release() function for a given - * channel is called. This happens when the driver dies and userspace doesn't - * get a chance to call release() until after the entire gk20a driver data is - * unloaded and freed. - */ -struct channel_priv { - struct gk20a *g; - struct channel_gk20a *c; -}; - static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c); static void gk20a_channel_dump_ref_actions(struct channel_gk20a *c); -static void free_priv_cmdbuf(struct channel_gk20a *c, - struct priv_cmd_entry *e); - static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c); static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c); @@ -97,9 +65,6 @@ static struct channel_gk20a_job *channel_gk20a_joblist_peek( static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch); -static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c, - bool clean_all); - /* allocate GPU channel */ static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f) { @@ -1038,7 +1003,7 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size, /* Don't call this to free an explict cmd entry. 
* It doesn't update priv_cmd_queue get/put */ -static void free_priv_cmdbuf(struct channel_gk20a *c, +void free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e) { if (channel_gk20a_is_prealloc_enabled(c)) @@ -1047,7 +1012,7 @@ static void free_priv_cmdbuf(struct channel_gk20a *c, nvgpu_kfree(c->g, e); } -static int channel_gk20a_alloc_job(struct channel_gk20a *c, +int channel_gk20a_alloc_job(struct channel_gk20a *c, struct channel_gk20a_job **job_out) { int err = 0; @@ -1080,7 +1045,7 @@ static int channel_gk20a_alloc_job(struct channel_gk20a *c, return err; } -static void channel_gk20a_free_job(struct channel_gk20a *c, +void channel_gk20a_free_job(struct channel_gk20a *c, struct channel_gk20a_job *job) { /* @@ -1267,11 +1232,12 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c, { struct gk20a *g = c->g; struct vm_gk20a *ch_vm; - u32 gpfifo_size; + u32 gpfifo_size, gpfifo_entry_size; int err = 0; unsigned long acquire_timeout; gpfifo_size = num_entries; + gpfifo_entry_size = nvgpu_get_gpfifo_entry_size(); if (flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_VPR_ENABLED) c->vpr = true; @@ -1315,7 +1281,7 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c, } err = nvgpu_dma_alloc_map_sys(ch_vm, - gpfifo_size * sizeof(struct nvgpu_gpfifo), + gpfifo_size * gpfifo_entry_size, &c->gpfifo.mem); if (err) { nvgpu_err(g, "%s: memory allocation failed", __func__); @@ -1324,7 +1290,7 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c, if (c->gpfifo.mem.aperture == APERTURE_VIDMEM || g->mm.force_pramin) { c->gpfifo.pipe = nvgpu_big_malloc(g, - gpfifo_size * sizeof(struct nvgpu_gpfifo)); + gpfifo_size * gpfifo_entry_size); if (!c->gpfifo.pipe) { err = -ENOMEM; goto clean_up_unmap; @@ -1427,7 +1393,7 @@ static inline u32 update_gp_get(struct gk20a *g, return new_get; } -static inline u32 gp_free_count(struct channel_gk20a *c) +u32 nvgpu_gp_free_count(struct channel_gk20a *c) { return (c->gpfifo.entry_num - (c->gpfifo.put - c->gpfifo.get) - 1) % c->gpfifo.entry_num; @@ -1460,91 +1426,10 @@ static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch) return ch->g->ch_wdt_timeout_ms; } -static u32 get_gp_free_count(struct channel_gk20a *c) +u32 nvgpu_get_gp_free_count(struct channel_gk20a *c) { update_gp_get(c->g, c); - return gp_free_count(c); -} - -#ifdef CONFIG_DEBUG_FS -static void trace_write_pushbuffer(struct channel_gk20a *c, - struct nvgpu_gpfifo *g) -{ - void *mem = NULL; - unsigned int words; - u64 offset; - struct dma_buf *dmabuf = NULL; - - if (gk20a_debug_trace_cmdbuf) { - u64 gpu_va = (u64)g->entry0 | - (u64)((u64)pbdma_gp_entry1_get_hi_v(g->entry1) << 32); - int err; - - words = pbdma_gp_entry1_length_v(g->entry1); - err = nvgpu_vm_find_buf(c->vm, gpu_va, &dmabuf, &offset); - if (!err) - mem = dma_buf_vmap(dmabuf); - } - - if (mem) { - u32 i; - /* - * Write in batches of 128 as there seems to be a limit - * of how much you can output to ftrace at once. 
- */ - for (i = 0; i < words; i += 128U) { - trace_gk20a_push_cmdbuf( - c->g->name, - 0, - min(words - i, 128U), - offset + i * sizeof(u32), - mem); - } - dma_buf_vunmap(dmabuf, mem); - } -} -#endif - -static void trace_write_pushbuffer_range(struct channel_gk20a *c, - struct nvgpu_gpfifo *g, - struct nvgpu_gpfifo __user *user_gpfifo, - int offset, - int count) -{ -#ifdef CONFIG_DEBUG_FS - u32 size; - int i; - struct nvgpu_gpfifo *gp; - bool gpfifo_allocated = false; - - if (!gk20a_debug_trace_cmdbuf) - return; - - if (!g && !user_gpfifo) - return; - - if (!g) { - size = count * sizeof(struct nvgpu_gpfifo); - if (size) { - g = nvgpu_big_malloc(c->g, size); - if (!g) - return; - - if (copy_from_user(g, user_gpfifo, size)) { - nvgpu_big_free(c->g, g); - return; - } - } - gpfifo_allocated = true; - } - - gp = g + offset; - for (i = 0; i < count; i++, gp++) - trace_write_pushbuffer(c, gp); - - if (gpfifo_allocated) - nvgpu_big_free(c->g, g); -#endif + return nvgpu_gp_free_count(c); } static void __gk20a_channel_timeout_start(struct channel_gk20a *ch) @@ -2032,7 +1917,7 @@ int gk20a_free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e) return 0; } -static int gk20a_channel_add_job(struct channel_gk20a *c, +int gk20a_channel_add_job(struct channel_gk20a *c, struct channel_gk20a_job *job, bool skip_buffer_refcounting) { @@ -2097,7 +1982,7 @@ err_put_buffers: * per-job memory for completed jobs; in case of preallocated resources, this * opens up slots for new jobs to be submitted. */ -static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c, +void gk20a_channel_clean_up_jobs(struct channel_gk20a *c, bool clean_all) { struct vm_gk20a *vm; @@ -2257,533 +2142,6 @@ void gk20a_channel_update(struct channel_gk20a *c) gk20a_channel_worker_enqueue(c); } -static void gk20a_submit_append_priv_cmdbuf(struct channel_gk20a *c, - struct priv_cmd_entry *cmd) -{ - struct gk20a *g = c->g; - struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem; - struct nvgpu_gpfifo x = { - .entry0 = u64_lo32(cmd->gva), - .entry1 = u64_hi32(cmd->gva) | - pbdma_gp_entry1_length_f(cmd->size) - }; - - nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x), - &x, sizeof(x)); - - if (cmd->mem->aperture == APERTURE_SYSMEM) - trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0, - cmd->mem->cpu_va + cmd->off * sizeof(u32)); - - c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1); -} - -/* - * Copy source gpfifo entries into the gpfifo ring buffer, potentially - * splitting into two memcpys to handle wrap-around. - */ -static int gk20a_submit_append_gpfifo(struct channel_gk20a *c, - struct nvgpu_gpfifo *kern_gpfifo, - struct nvgpu_gpfifo __user *user_gpfifo, - u32 num_entries) -{ - /* byte offsets */ - u32 gpfifo_size = c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo); - u32 len = num_entries * sizeof(struct nvgpu_gpfifo); - u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo); - u32 end = start + len; /* exclusive */ - struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem; - struct nvgpu_gpfifo *cpu_src; - int err; - - if (user_gpfifo && !c->gpfifo.pipe) { - /* - * This path (from userspace to sysmem) is special in order to - * avoid two copies unnecessarily (from user to pipe, then from - * pipe to gpu sysmem buffer). - * - * As a special case, the pipe buffer exists if PRAMIN writes - * are forced, although the buffers may not be in vidmem in - * that case. 
- */ - if (end > gpfifo_size) { - /* wrap-around */ - int length0 = gpfifo_size - start; - int length1 = len - length0; - void __user *user2 = (u8 __user *)user_gpfifo + length0; - - err = copy_from_user(gpfifo_mem->cpu_va + start, - user_gpfifo, length0); - if (err) - return err; - - err = copy_from_user(gpfifo_mem->cpu_va, - user2, length1); - if (err) - return err; - } else { - err = copy_from_user(gpfifo_mem->cpu_va + start, - user_gpfifo, len); - if (err) - return err; - } - - trace_write_pushbuffer_range(c, NULL, user_gpfifo, - 0, num_entries); - goto out; - } else if (user_gpfifo) { - /* from userspace to vidmem or sysmem when pramin forced, use - * the common copy path below */ - err = copy_from_user(c->gpfifo.pipe, user_gpfifo, len); - if (err) - return err; - - cpu_src = c->gpfifo.pipe; - } else { - /* from kernel to either sysmem or vidmem, don't need - * copy_from_user so use the common path below */ - cpu_src = kern_gpfifo; - } - - if (end > gpfifo_size) { - /* wrap-around */ - int length0 = gpfifo_size - start; - int length1 = len - length0; - void *src2 = (u8 *)cpu_src + length0; - - nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, length0); - nvgpu_mem_wr_n(c->g, gpfifo_mem, 0, src2, length1); - } else { - nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, len); - - } - - trace_write_pushbuffer_range(c, cpu_src, NULL, 0, num_entries); - -out: - c->gpfifo.put = (c->gpfifo.put + num_entries) & - (c->gpfifo.entry_num - 1); - - return 0; -} - -/* - * Handle the submit synchronization - pre-fences and post-fences. - */ -static int gk20a_submit_prepare_syncs(struct channel_gk20a *c, - struct nvgpu_fence *fence, - struct channel_gk20a_job *job, - struct priv_cmd_entry **wait_cmd, - struct priv_cmd_entry **incr_cmd, - struct gk20a_fence **pre_fence, - struct gk20a_fence **post_fence, - bool force_need_sync_fence, - bool register_irq, - u32 flags) -{ - struct gk20a *g = c->g; - bool need_sync_fence = false; - bool new_sync_created = false; - int wait_fence_fd = -1; - int err = 0; - bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI); - bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c); - - /* - * If user wants to always allocate sync_fence_fds then respect that; - * otherwise, allocate sync_fence_fd based on user flags. - */ - if (force_need_sync_fence) - need_sync_fence = true; - - if (g->aggressive_sync_destroy_thresh) { - nvgpu_mutex_acquire(&c->sync_lock); - if (!c->sync) { - c->sync = gk20a_channel_sync_create(c); - if (!c->sync) { - err = -ENOMEM; - nvgpu_mutex_release(&c->sync_lock); - goto fail; - } - new_sync_created = true; - } - nvgpu_atomic_inc(&c->sync->refcount); - nvgpu_mutex_release(&c->sync_lock); - } - - if (g->ops.fifo.resetup_ramfc && new_sync_created) { - err = g->ops.fifo.resetup_ramfc(c); - if (err) - goto fail; - } - - /* - * Optionally insert syncpt wait in the beginning of gpfifo submission - * when user requested and the wait hasn't expired. Validate that the id - * makes sense, elide if not. The only reason this isn't being - * unceremoniously killed is to keep running some tests which trigger - * this condition. 
- */ - if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) { - job->pre_fence = gk20a_alloc_fence(c); - if (!job->pre_fence) { - err = -ENOMEM; - goto fail; - } - - if (!pre_alloc_enabled) - job->wait_cmd = nvgpu_kzalloc(g, - sizeof(struct priv_cmd_entry)); - - if (!job->wait_cmd) { - err = -ENOMEM; - goto clean_up_pre_fence; - } - - if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) { - wait_fence_fd = fence->id; - err = c->sync->wait_fd(c->sync, wait_fence_fd, - job->wait_cmd, job->pre_fence); - } else { - err = c->sync->wait_syncpt(c->sync, fence->id, - fence->value, job->wait_cmd, - job->pre_fence); - } - - if (!err) { - if (job->wait_cmd->valid) - *wait_cmd = job->wait_cmd; - *pre_fence = job->pre_fence; - } else - goto clean_up_wait_cmd; - } - - if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) && - (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE)) - need_sync_fence = true; - - /* - * Always generate an increment at the end of a GPFIFO submission. This - * is used to keep track of method completion for idle railgating. The - * sync_pt/semaphore PB is added to the GPFIFO later on in submit. - */ - job->post_fence = gk20a_alloc_fence(c); - if (!job->post_fence) { - err = -ENOMEM; - goto clean_up_wait_cmd; - } - if (!pre_alloc_enabled) - job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry)); - - if (!job->incr_cmd) { - err = -ENOMEM; - goto clean_up_post_fence; - } - - if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) - err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd, - job->post_fence, need_wfi, need_sync_fence, - register_irq); - else - err = c->sync->incr(c->sync, job->incr_cmd, - job->post_fence, need_sync_fence, - register_irq); - if (!err) { - *incr_cmd = job->incr_cmd; - *post_fence = job->post_fence; - } else - goto clean_up_incr_cmd; - - return 0; - -clean_up_incr_cmd: - free_priv_cmdbuf(c, job->incr_cmd); - if (!pre_alloc_enabled) - job->incr_cmd = NULL; -clean_up_post_fence: - gk20a_fence_put(job->post_fence); - job->post_fence = NULL; -clean_up_wait_cmd: - free_priv_cmdbuf(c, job->wait_cmd); - if (!pre_alloc_enabled) - job->wait_cmd = NULL; -clean_up_pre_fence: - gk20a_fence_put(job->pre_fence); - job->pre_fence = NULL; -fail: - *wait_cmd = NULL; - *pre_fence = NULL; - return err; -} - -int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, - struct nvgpu_gpfifo *gpfifo, - struct nvgpu_submit_gpfifo_args *args, - u32 num_entries, - u32 flags, - struct nvgpu_fence *fence, - struct gk20a_fence **fence_out, - bool force_need_sync_fence, - struct fifo_profile_gk20a *profile) -{ - struct gk20a *g = c->g; - struct priv_cmd_entry *wait_cmd = NULL; - struct priv_cmd_entry *incr_cmd = NULL; - struct gk20a_fence *pre_fence = NULL; - struct gk20a_fence *post_fence = NULL; - struct channel_gk20a_job *job = NULL; - /* we might need two extra gpfifo entries - one for pre fence - * and one for post fence. */ - const int extra_entries = 2; - bool skip_buffer_refcounting = (flags & - NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING); - int err = 0; - bool need_job_tracking; - bool need_deferred_cleanup = false; - struct nvgpu_gpfifo __user *user_gpfifo = args ? - (struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo : NULL; - - if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) - return -ENODEV; - - if (c->has_timedout) - return -ETIMEDOUT; - - if (!nvgpu_mem_is_valid(&c->gpfifo.mem)) - return -ENOMEM; - - /* fifo not large enough for request. Return error immediately. - * Kernel can insert gpfifo entries before and after user gpfifos. 
- * So, add extra_entries in user request. Also, HW with fifo size N - * can accept only N-1 entreis and so the below condition */ - if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) { - nvgpu_err(g, "not enough gpfifo space allocated"); - return -ENOMEM; - } - - if (!gpfifo && !args) - return -EINVAL; - - if ((flags & (NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT | - NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) && - !fence) - return -EINVAL; - - /* an address space needs to have been bound at this point. */ - if (!gk20a_channel_as_bound(c)) { - nvgpu_err(g, - "not bound to an address space at time of gpfifo" - " submission."); - return -EINVAL; - } - - if (profile) - profile->timestamp[PROFILE_ENTRY] = sched_clock(); - - /* update debug settings */ - nvgpu_ltc_sync_enabled(g); - - gk20a_dbg_info("channel %d", c->chid); - - /* - * Job tracking is necessary for any of the following conditions: - * - pre- or post-fence functionality - * - channel wdt - * - GPU rail-gating with non-deterministic channels - * - buffer refcounting - * - * If none of the conditions are met, then job tracking is not - * required and a fast submit can be done (ie. only need to write - * out userspace GPFIFO entries and update GP_PUT). - */ - need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) || - (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) || - c->wdt_enabled || - (g->can_railgate && !c->deterministic) || - !skip_buffer_refcounting; - - if (need_job_tracking) { - bool need_sync_framework = false; - - /* - * If the channel is to have deterministic latency and - * job tracking is required, the channel must have - * pre-allocated resources. Otherwise, we fail the submit here - */ - if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c)) - return -EINVAL; - - need_sync_framework = force_need_sync_fence || - gk20a_channel_sync_needs_sync_framework(g) || - (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE && - (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT || - flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)); - - /* - * Deferred clean-up is necessary for any of the following - * conditions: - * - channel's deterministic flag is not set - * - dependency on sync framework, which could make the - * behavior of the clean-up operation non-deterministic - * (should not be performed in the submit path) - * - channel wdt - * - GPU rail-gating with non-deterministic channels - * - buffer refcounting - * - * If none of the conditions are met, then deferred clean-up - * is not required, and we clean-up one job-tracking - * resource in the submit path. - */ - need_deferred_cleanup = !c->deterministic || - need_sync_framework || - c->wdt_enabled || - (g->can_railgate && - !c->deterministic) || - !skip_buffer_refcounting; - - /* - * For deterministic channels, we don't allow deferred clean_up - * processing to occur. In cases we hit this, we fail the submit - */ - if (c->deterministic && need_deferred_cleanup) - return -EINVAL; - - if (!c->deterministic) { - /* - * Get a power ref unless this is a deterministic - * channel that holds them during the channel lifetime. - * This one is released by gk20a_channel_clean_up_jobs, - * via syncpt or sema interrupt, whichever is used. 
- */ - err = gk20a_busy(g); - if (err) { - nvgpu_err(g, - "failed to host gk20a to submit gpfifo, process %s", - current->comm); - return err; - } - } - - if (!need_deferred_cleanup) { - /* clean up a single job */ - gk20a_channel_clean_up_jobs(c, false); - } - } - - - /* Grab access to HW to deal with do_idle */ - if (c->deterministic) - nvgpu_rwsem_down_read(&g->deterministic_busy); - - trace_gk20a_channel_submit_gpfifo(g->name, - c->chid, - num_entries, - flags, - fence ? fence->id : 0, - fence ? fence->value : 0); - - gk20a_dbg_info("pre-submit put %d, get %d, size %d", - c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num); - - /* - * Make sure we have enough space for gpfifo entries. Check cached - * values first and then read from HW. If no space, return EAGAIN - * and let userpace decide to re-try request or not. - */ - if (gp_free_count(c) < num_entries + extra_entries) { - if (get_gp_free_count(c) < num_entries + extra_entries) { - err = -EAGAIN; - goto clean_up; - } - } - - if (c->has_timedout) { - err = -ETIMEDOUT; - goto clean_up; - } - - if (need_job_tracking) { - err = channel_gk20a_alloc_job(c, &job); - if (err) - goto clean_up; - - err = gk20a_submit_prepare_syncs(c, fence, job, - &wait_cmd, &incr_cmd, - &pre_fence, &post_fence, - force_need_sync_fence, - need_deferred_cleanup, - flags); - if (err) - goto clean_up_job; - } - - if (profile) - profile->timestamp[PROFILE_JOB_TRACKING] = sched_clock(); - - if (wait_cmd) - gk20a_submit_append_priv_cmdbuf(c, wait_cmd); - - if (gpfifo || user_gpfifo) - err = gk20a_submit_append_gpfifo(c, gpfifo, user_gpfifo, - num_entries); - if (err) - goto clean_up_job; - - /* - * And here's where we add the incr_cmd we generated earlier. It should - * always run! - */ - if (incr_cmd) - gk20a_submit_append_priv_cmdbuf(c, incr_cmd); - - if (fence_out) - *fence_out = gk20a_fence_get(post_fence); - - if (need_job_tracking) - /* TODO! Check for errors... */ - gk20a_channel_add_job(c, job, skip_buffer_refcounting); - if (profile) - profile->timestamp[PROFILE_APPEND] = sched_clock(); - - g->ops.fifo.userd_gp_put(g, c); - - if ((NVGPU_SUBMIT_GPFIFO_FLAGS_RESCHEDULE_RUNLIST & flags) && - g->ops.fifo.reschedule_runlist) - g->ops.fifo.reschedule_runlist(g, c->runlist_id); - - /* No hw access beyond this point */ - if (c->deterministic) - nvgpu_rwsem_up_read(&g->deterministic_busy); - - trace_gk20a_channel_submitted_gpfifo(g->name, - c->chid, - num_entries, - flags, - post_fence ? post_fence->syncpt_id : 0, - post_fence ? 
post_fence->syncpt_value : 0); - - gk20a_dbg_info("post-submit put %d, get %d, size %d", - c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num); - - if (profile) - profile->timestamp[PROFILE_END] = sched_clock(); - gk20a_dbg_fn("done"); - return err; - -clean_up_job: - channel_gk20a_free_job(c, job); -clean_up: - gk20a_dbg_fn("fail"); - gk20a_fence_put(pre_fence); - gk20a_fence_put(post_fence); - if (c->deterministic) - nvgpu_rwsem_up_read(&g->deterministic_busy); - else if (need_deferred_cleanup) - gk20a_idle(g); - - return err; -} - /* * Stop deterministic channel activity for do_idle() when power needs to go off * momentarily but deterministic channels keep power refs for potentially a diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 4b1cb351..cdf75a9a 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -24,6 +24,9 @@ #ifndef CHANNEL_GK20A_H #define CHANNEL_GK20A_H +/* TODO: To be removed when work_struct update_fn_work is moved out of common code */ +#include + #include #include @@ -374,16 +377,6 @@ struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g, int runlist_id, bool is_privileged_channel); -int gk20a_submit_channel_gpfifo(struct channel_gk20a *c, - struct nvgpu_gpfifo *gpfifo, - struct nvgpu_submit_gpfifo_args *args, - u32 num_entries, - u32 flags, - struct nvgpu_fence *fence, - struct gk20a_fence **fence_out, - bool force_need_sync_fence, - struct fifo_profile_gk20a *profile); - int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c, unsigned int num_entries, unsigned int num_inflight_jobs, @@ -408,4 +401,20 @@ int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch, void gk20a_channel_event_id_post_event(struct channel_gk20a *ch, u32 event_id); +int channel_gk20a_alloc_job(struct channel_gk20a *c, + struct channel_gk20a_job **job_out); +void channel_gk20a_free_job(struct channel_gk20a *c, + struct channel_gk20a_job *job); +u32 nvgpu_get_gp_free_count(struct channel_gk20a *c); +u32 nvgpu_gp_free_count(struct channel_gk20a *c); +int gk20a_channel_add_job(struct channel_gk20a *c, + struct channel_gk20a_job *job, + bool skip_buffer_refcounting); +void free_priv_cmdbuf(struct channel_gk20a *c, + struct priv_cmd_entry *e); +void gk20a_channel_clean_up_jobs(struct channel_gk20a *c, + bool clean_all); + +u32 nvgpu_get_gpfifo_entry_size(void); + #endif /* CHANNEL_GK20A_H */ -- cgit v1.2.2
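
For illustration of the per-OS contract this change introduces: a non-Linux port of nvgpu would have to supply its own definitions of the two hooks made OS-specific here, nvgpu_get_gpfifo_entry_size() and gk20a_ce_execute_ops(), matching the declarations exported from gk20a/channel_gk20a.h and gk20a/ce2_gk20a.h (the Linux definitions live in common/linux/channel.c and common/linux/ce2.c). Below is a minimal sketch of such a port; struct my_os_gpfifo and the stubbed-out body are hypothetical placeholders, not code from this patch.

    /* Minimal sketch of the OS-specific hooks for a hypothetical "my_os" port. */
    #include "gk20a/gk20a.h"
    #include "gk20a/ce2_gk20a.h"
    #include "gk20a/channel_gk20a.h"

    /*
     * Hypothetical gpfifo entry layout for this OS; like the Linux
     * struct nvgpu_gpfifo it is two 32-bit words per entry.
     */
    struct my_os_gpfifo {
    	u32 entry0;
    	u32 entry1;
    };

    /*
     * Used by gk20a_channel_alloc_gpfifo() to size the gpfifo ring and
     * pipe buffer, replacing the Linux-only sizeof(struct nvgpu_gpfifo).
     */
    u32 nvgpu_get_gpfifo_entry_size(void)
    {
    	return sizeof(struct my_os_gpfifo);
    }

    /*
     * Copy-engine submit entry point needed by common/mm code. The Linux
     * version in common/linux/ce2.c builds a gpfifo entry for the CE
     * command buffer and pushes it through gk20a_submit_channel_gpfifo();
     * another OS would do the same through its own submit path.
     */
    int gk20a_ce_execute_ops(struct gk20a *g,
    		u32 ce_ctx_id,
    		u64 src_buf,
    		u64 dst_buf,
    		u64 size,
    		unsigned int payload,
    		int launch_flags,
    		int request_operation,
    		struct gk20a_fence *gk20a_fence_in,
    		u32 submit_flags,
    		struct gk20a_fence **gk20a_fence_out)
    {
    	/* OS-specific implementation goes here; not provided in this sketch. */
    	return -ENOSYS;
    }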