/*
 * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include
#include
#include
#include
#include
#include
#include
#include

#include "gk20a/fence_gk20a.h"

#include

/*
 * Handle the submit synchronization - pre-fences and post-fences.
 */
static int nvgpu_submit_prepare_syncs(struct channel_gk20a *c,
				      struct nvgpu_channel_fence *fence,
				      struct channel_gk20a_job *job,
				      struct priv_cmd_entry **wait_cmd,
				      struct priv_cmd_entry **incr_cmd,
				      struct gk20a_fence **post_fence,
				      bool register_irq,
				      u32 flags)
{
	struct gk20a *g = c->g;
	bool need_sync_fence = false;
	bool new_sync_created = false;
	int wait_fence_fd = -1;
	int err = 0;
	bool need_wfi = !(flags & NVGPU_SUBMIT_FLAGS_SUPPRESS_WFI);
	bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);

	if (g->aggressive_sync_destroy_thresh) {
		nvgpu_mutex_acquire(&c->sync_lock);
		if (!c->sync) {
			c->sync = nvgpu_channel_sync_create(c, false);
			if (!c->sync) {
				err = -ENOMEM;
				goto fail;
			}
			new_sync_created = true;
		}
		nvgpu_atomic_inc(&c->sync->refcount);
	}

	if (g->ops.fifo.resetup_ramfc && new_sync_created) {
		err = g->ops.fifo.resetup_ramfc(c);
		if (err) {
			goto fail;
		}
	}

	/*
	 * Optionally insert syncpt/semaphore wait in the beginning of gpfifo
	 * submission when user requested and the wait hasn't expired.
	 */
	if (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) {
		int max_wait_cmds = c->deterministic ? 1 : 0;

		if (!pre_alloc_enabled) {
			job->wait_cmd = nvgpu_kzalloc(g,
					sizeof(struct priv_cmd_entry));
		}

		if (!job->wait_cmd) {
			err = -ENOMEM;
			goto fail;
		}

		if (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE) {
			wait_fence_fd = fence->id;
			err = c->sync->wait_fd(c->sync, wait_fence_fd,
					job->wait_cmd, max_wait_cmds);
		} else {
			err = c->sync->wait_syncpt(c->sync, fence->id,
					fence->value, job->wait_cmd);
		}

		if (err) {
			goto clean_up_wait_cmd;
		}

		if (job->wait_cmd->valid) {
			*wait_cmd = job->wait_cmd;
		}
	}

	if ((flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) &&
	    (flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE)) {
		need_sync_fence = true;
	}

	/*
	 * Always generate an increment at the end of a GPFIFO submission. This
	 * is used to keep track of method completion for idle railgating. The
	 * sync_pt/semaphore PB is added to the GPFIFO later on in submit.
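	 *
	 * When FENCE_GET was requested, the same increment also backs the
	 * post-fence returned to the submitter via incr_user() below;
	 * otherwise a plain incr() is used purely for internal job tracking.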
	 */
	job->post_fence = gk20a_alloc_fence(c);
	if (!job->post_fence) {
		err = -ENOMEM;
		goto clean_up_wait_cmd;
	}

	if (!pre_alloc_enabled) {
		job->incr_cmd = nvgpu_kzalloc(g,
				sizeof(struct priv_cmd_entry));
	}

	if (!job->incr_cmd) {
		err = -ENOMEM;
		goto clean_up_post_fence;
	}

	if (flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) {
		err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd,
				job->post_fence, need_wfi, need_sync_fence,
				register_irq);
	} else {
		err = c->sync->incr(c->sync, job->incr_cmd, job->post_fence,
				need_sync_fence, register_irq);
	}
	if (!err) {
		*incr_cmd = job->incr_cmd;
		*post_fence = job->post_fence;
	} else {
		goto clean_up_incr_cmd;
	}

	if (g->aggressive_sync_destroy_thresh) {
		nvgpu_mutex_release(&c->sync_lock);
	}

	return 0;

clean_up_incr_cmd:
	free_priv_cmdbuf(c, job->incr_cmd);
	if (!pre_alloc_enabled) {
		job->incr_cmd = NULL;
	}
clean_up_post_fence:
	gk20a_fence_put(job->post_fence);
	job->post_fence = NULL;
clean_up_wait_cmd:
	if (job->wait_cmd) {
		free_priv_cmdbuf(c, job->wait_cmd);
	}
	if (!pre_alloc_enabled) {
		job->wait_cmd = NULL;
	}
fail:
	if (g->aggressive_sync_destroy_thresh) {
		nvgpu_mutex_release(&c->sync_lock);
	}
	*wait_cmd = NULL;
	return err;
}

static void nvgpu_submit_append_priv_cmdbuf(struct channel_gk20a *c,
		struct priv_cmd_entry *cmd)
{
	struct gk20a *g = c->g;
	struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
	struct nvgpu_gpfifo_entry x = {
		.entry0 = u64_lo32(cmd->gva),
		.entry1 = u64_hi32(cmd->gva) |
			pbdma_gp_entry1_length_f(cmd->size)
	};

	nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x),
			&x, sizeof(x));

	if (cmd->mem->aperture == APERTURE_SYSMEM) {
		trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0,
				(u32 *)cmd->mem->cpu_va + cmd->off);
	}

	c->gpfifo.put = (c->gpfifo.put + 1U) & (c->gpfifo.entry_num - 1U);
}

static int nvgpu_submit_append_gpfifo_user_direct(struct channel_gk20a *c,
		struct nvgpu_gpfifo_userdata userdata,
		u32 num_entries)
{
	struct gk20a *g = c->g;
	struct nvgpu_gpfifo_entry *gpfifo_cpu = c->gpfifo.mem.cpu_va;
	u32 gpfifo_size = c->gpfifo.entry_num;
	u32 len = num_entries;
	u32 start = c->gpfifo.put;
	u32 end = start + len; /* exclusive */
	int err;

	nvgpu_speculation_barrier();
	if (end > gpfifo_size) {
		/* wrap-around */
		int length0 = gpfifo_size - start;
		int length1 = len - length0;

		err = g->os_channel.copy_user_gpfifo(
				gpfifo_cpu + start, userdata,
				0, length0);
		if (err) {
			return err;
		}

		err = g->os_channel.copy_user_gpfifo(
				gpfifo_cpu, userdata,
				length0, length1);
		if (err) {
			return err;
		}
	} else {
		err = g->os_channel.copy_user_gpfifo(
				gpfifo_cpu + start, userdata,
				0, len);
		if (err) {
			return err;
		}
	}

	return 0;
}

static void nvgpu_submit_append_gpfifo_common(struct channel_gk20a *c,
		struct nvgpu_gpfifo_entry *src, u32 num_entries)
{
	struct gk20a *g = c->g;
	struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
	/* in bytes */
	u32 gpfifo_size =
		c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo_entry);
	u32 len = num_entries * sizeof(struct nvgpu_gpfifo_entry);
	u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo_entry);
	u32 end = start + len; /* exclusive */

	if (end > gpfifo_size) {
		/* wrap-around */
		int length0 = gpfifo_size - start;
		int length1 = len - length0;
		/*
		 * length0 is a byte count, so advance the source pointer by
		 * bytes rather than by gpfifo entries.
		 */
		struct nvgpu_gpfifo_entry *src2 =
			(struct nvgpu_gpfifo_entry *)((u8 *)src + length0);

		nvgpu_mem_wr_n(g, gpfifo_mem, start, src, length0);
		nvgpu_mem_wr_n(g, gpfifo_mem, 0, src2, length1);
	} else {
		nvgpu_mem_wr_n(g, gpfifo_mem, start, src, len);
	}
}

/*
 * Copy source gpfifo entries into the gpfifo ring buffer, potentially
 * splitting into two memcpys to handle wrap-around.
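 *
 * For example, with a 1024-entry ring and put == 1020, an 8-entry copy lands
 * as entries 1020..1023 followed by entries 0..3; entry_num is a power of
 * two, so the put pointer simply wraps with a mask.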
 */
static int nvgpu_submit_append_gpfifo(struct channel_gk20a *c,
		struct nvgpu_gpfifo_entry *kern_gpfifo,
		struct nvgpu_gpfifo_userdata userdata,
		u32 num_entries)
{
	struct gk20a *g = c->g;
	int err;

	if (!kern_gpfifo && !c->gpfifo.pipe) {
		/*
		 * This path (from userspace to sysmem) is special in order to
		 * avoid two copies unnecessarily (from user to pipe, then from
		 * pipe to gpu sysmem buffer).
		 */
		err = nvgpu_submit_append_gpfifo_user_direct(c, userdata,
				num_entries);
		if (err) {
			return err;
		}
	} else if (!kern_gpfifo) {
		/* from userspace to vidmem, use the common path */
		err = g->os_channel.copy_user_gpfifo(c->gpfifo.pipe, userdata,
				0, num_entries);
		if (err) {
			return err;
		}

		nvgpu_submit_append_gpfifo_common(c, c->gpfifo.pipe,
				num_entries);
	} else {
		/*
		 * From kernel to either sysmem or vidmem; copy_user_gpfifo is
		 * not needed, so use the common path.
		 */
		nvgpu_submit_append_gpfifo_common(c, kern_gpfifo, num_entries);
	}

	trace_write_pushbuffers(c, num_entries);

	c->gpfifo.put = (c->gpfifo.put + num_entries) &
		(c->gpfifo.entry_num - 1U);

	return 0;
}

static int nvgpu_submit_channel_gpfifo(struct channel_gk20a *c,
				struct nvgpu_gpfifo_entry *gpfifo,
				struct nvgpu_gpfifo_userdata userdata,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out,
				struct fifo_profile_gk20a *profile)
{
	struct gk20a *g = c->g;
	struct priv_cmd_entry *wait_cmd = NULL;
	struct priv_cmd_entry *incr_cmd = NULL;
	struct gk20a_fence *post_fence = NULL;
	struct channel_gk20a_job *job = NULL;
	/*
	 * We might need two extra gpfifo entries - one for the pre-fence
	 * and one for the post-fence.
	 */
	const u32 extra_entries = 2U;
	bool skip_buffer_refcounting = (flags &
			NVGPU_SUBMIT_FLAGS_SKIP_BUFFER_REFCOUNTING);
	int err = 0;
	bool need_job_tracking;
	bool need_deferred_cleanup = false;

	if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) {
		return -ENODEV;
	}

	if (gk20a_channel_check_timedout(c)) {
		return -ETIMEDOUT;
	}

	if (c->usermode_submit_enabled) {
		return -EINVAL;
	}

	if (!nvgpu_mem_is_valid(&c->gpfifo.mem)) {
		return -ENOMEM;
	}

	/*
	 * The fifo is not large enough for this request; return an error
	 * immediately. The kernel can insert gpfifo entries before and after
	 * the user gpfifos, so add extra_entries on top of the user request.
	 * Also, HW with a fifo size of N can accept only N-1 entries, hence
	 * the condition below.
	 */
	if (c->gpfifo.entry_num - 1U < num_entries + extra_entries) {
		nvgpu_err(g, "not enough gpfifo space allocated");
		return -ENOMEM;
	}

	if ((flags & (NVGPU_SUBMIT_FLAGS_FENCE_WAIT |
		      NVGPU_SUBMIT_FLAGS_FENCE_GET)) &&
	    !fence) {
		return -EINVAL;
	}

	/* an address space needs to have been bound at this point. */
	if (!gk20a_channel_as_bound(c)) {
		nvgpu_err(g,
			"not bound to an address space at time of gpfifo"
			" submission.");
		return -EINVAL;
	}

	gk20a_fifo_profile_snapshot(profile, PROFILE_ENTRY);

	/* update debug settings */
	nvgpu_ltc_sync_enabled(g);

	nvgpu_log_info(g, "channel %d", c->chid);

	/*
	 * Job tracking is necessary for any of the following conditions:
	 * - pre- or post-fence functionality
	 * - channel wdt
	 * - GPU rail-gating with non-deterministic channels
	 * - VPR resize enabled with non-deterministic channels
	 * - buffer refcounting
	 *
	 * If none of the conditions are met, then job tracking is not
	 * required and a fast submit can be done (i.e. only need to write
	 * out userspace GPFIFO entries and update GP_PUT).
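	 *
	 * In the fast submit case no job struct, priv cmdbuf or post-fence is
	 * allocated; the submit reduces to copying the entries into the ring
	 * and bumping GP_PUT.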
	 */
	need_job_tracking = (flags & NVGPU_SUBMIT_FLAGS_FENCE_WAIT) ||
			(flags & NVGPU_SUBMIT_FLAGS_FENCE_GET) ||
			c->timeout.enabled ||
			((nvgpu_is_enabled(g, NVGPU_CAN_RAILGATE) ||
			  nvgpu_is_vpr_resize_enabled()) &&
			 !c->deterministic) ||
			!skip_buffer_refcounting;

	if (need_job_tracking) {
		bool need_sync_framework = false;

		/*
		 * If the channel is to have deterministic latency and
		 * job tracking is required, the channel must have
		 * pre-allocated resources. Otherwise, we fail the submit here.
		 */
		if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c)) {
			return -EINVAL;
		}

		need_sync_framework =
			nvgpu_channel_sync_needs_os_fence_framework(g) ||
			(flags & NVGPU_SUBMIT_FLAGS_SYNC_FENCE &&
			 flags & NVGPU_SUBMIT_FLAGS_FENCE_GET);

		/*
		 * Deferred clean-up is necessary for any of the following
		 * conditions:
		 * - channel's deterministic flag is not set
		 * - dependency on sync framework, which could make the
		 *   behavior of the clean-up operation non-deterministic
		 *   (should not be performed in the submit path)
		 * - channel wdt
		 * - GPU rail-gating with non-deterministic channels
		 * - buffer refcounting
		 *
		 * If none of the conditions are met, then deferred clean-up
		 * is not required, and we clean up one job-tracking
		 * resource in the submit path.
		 */
		need_deferred_cleanup = !c->deterministic ||
					need_sync_framework ||
					c->timeout.enabled ||
					(nvgpu_is_enabled(g,
							NVGPU_CAN_RAILGATE) &&
					 !c->deterministic) ||
					!skip_buffer_refcounting;

		/*
		 * For deterministic channels, we don't allow deferred clean-up
		 * processing to occur. If we hit this case, we fail the
		 * submit.
		 */
		if (c->deterministic && need_deferred_cleanup) {
			return -EINVAL;
		}

		if (!c->deterministic) {
			/*
			 * Get a power ref unless this is a deterministic
			 * channel that holds them during the channel lifetime.
			 * This one is released by gk20a_channel_clean_up_jobs,
			 * via syncpt or sema interrupt, whichever is used.
			 */
			err = gk20a_busy(g);
			if (err) {
				nvgpu_err(g,
					"failed to host gk20a to submit gpfifo");
				nvgpu_print_current(g, NULL, NVGPU_ERROR);
				return err;
			}
		}

		if (!need_deferred_cleanup) {
			/* clean up a single job */
			gk20a_channel_clean_up_jobs(c, false);
		}
	}

	/* Grab access to HW to deal with do_idle */
	if (c->deterministic) {
		nvgpu_rwsem_down_read(&g->deterministic_busy);
	}

	if (c->deterministic && c->deterministic_railgate_allowed) {
		/*
		 * Nope - this channel has dropped its own power ref. As
		 * deterministic submits don't hold power on per each submitted
		 * job like normal ones do, the GPU might railgate any time now
		 * and thus submit is disallowed.
		 */
		err = -EINVAL;
		goto clean_up;
	}

	trace_gk20a_channel_submit_gpfifo(g->name,
					  c->chid,
					  num_entries,
					  flags,
					  fence ? fence->id : 0,
					  fence ? fence->value : 0);

	nvgpu_log_info(g, "pre-submit put %d, get %d, size %d",
		c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);

	/*
	 * Make sure we have enough space for gpfifo entries. Check cached
	 * values first and then read from HW. If there is no space, return
	 * -EAGAIN and let userspace decide whether to re-try the request.
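	 *
	 * nvgpu_gp_free_count() below works on the cached GET pointer; only
	 * when that looks too small is nvgpu_get_gp_free_count() used, which
	 * is expected to refresh GET from the hardware before recomputing the
	 * free space.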
	 */
	if (nvgpu_gp_free_count(c) < num_entries + extra_entries) {
		if (nvgpu_get_gp_free_count(c) <
				num_entries + extra_entries) {
			err = -EAGAIN;
			goto clean_up;
		}
	}

	if (gk20a_channel_check_timedout(c)) {
		err = -ETIMEDOUT;
		goto clean_up;
	}

	if (need_job_tracking) {
		err = channel_gk20a_alloc_job(c, &job);
		if (err) {
			goto clean_up;
		}

		err = nvgpu_submit_prepare_syncs(c, fence, job,
						 &wait_cmd, &incr_cmd,
						 &post_fence,
						 need_deferred_cleanup,
						 flags);
		if (err) {
			goto clean_up_job;
		}
	}

	gk20a_fifo_profile_snapshot(profile, PROFILE_JOB_TRACKING);

	if (wait_cmd) {
		nvgpu_submit_append_priv_cmdbuf(c, wait_cmd);
	}

	err = nvgpu_submit_append_gpfifo(c, gpfifo, userdata, num_entries);
	if (err) {
		goto clean_up_job;
	}

	/*
	 * And here's where we add the incr_cmd we generated earlier. It should
	 * always run!
	 */
	if (incr_cmd) {
		nvgpu_submit_append_priv_cmdbuf(c, incr_cmd);
	}

	if (fence_out) {
		*fence_out = gk20a_fence_get(post_fence);
	}

	if (need_job_tracking) {
		/* TODO! Check for errors... */
		gk20a_channel_add_job(c, job, skip_buffer_refcounting);
	}
	gk20a_fifo_profile_snapshot(profile, PROFILE_APPEND);

	g->ops.fifo.userd_gp_put(g, c);

	/* No hw access beyond this point */
	if (c->deterministic) {
		nvgpu_rwsem_up_read(&g->deterministic_busy);
	}

	trace_gk20a_channel_submitted_gpfifo(g->name,
				c->chid,
				num_entries,
				flags,
				post_fence ? post_fence->syncpt_id : 0,
				post_fence ? post_fence->syncpt_value : 0);

	nvgpu_log_info(g, "post-submit put %d, get %d, size %d",
		c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);

	gk20a_fifo_profile_snapshot(profile, PROFILE_END);

	nvgpu_log_fn(g, "done");
	return err;

clean_up_job:
	channel_gk20a_free_job(c, job);
clean_up:
	nvgpu_log_fn(g, "fail");
	gk20a_fence_put(post_fence);
	if (c->deterministic) {
		nvgpu_rwsem_up_read(&g->deterministic_busy);
	} else if (need_deferred_cleanup) {
		gk20a_idle(g);
	}

	return err;
}

int nvgpu_submit_channel_gpfifo_user(struct channel_gk20a *c,
				struct nvgpu_gpfifo_userdata userdata,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out,
				struct fifo_profile_gk20a *profile)
{
	return nvgpu_submit_channel_gpfifo(c, NULL, userdata, num_entries,
			flags, fence, fence_out, profile);
}

int nvgpu_submit_channel_gpfifo_kernel(struct channel_gk20a *c,
				struct nvgpu_gpfifo_entry *gpfifo,
				u32 num_entries,
				u32 flags,
				struct nvgpu_channel_fence *fence,
				struct gk20a_fence **fence_out)
{
	struct nvgpu_gpfifo_userdata userdata = { NULL, NULL };

	return nvgpu_submit_channel_gpfifo(c, gpfifo, userdata, num_entries,
			flags, fence, fence_out, NULL);
}
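
/*
 * Illustrative sketch only (not part of the driver): a kernel-internal caller
 * with a bound, non-usermode channel could submit a gpfifo segment and take a
 * reference on the resulting post-fence roughly as below. The channel ch, the
 * entries array and its length n are hypothetical; the returned post-fence
 * reference must eventually be dropped with gk20a_fence_put().
 *
 *	struct nvgpu_channel_fence fence = { 0 };
 *	struct gk20a_fence *post_fence = NULL;
 *	int err;
 *
 *	err = nvgpu_submit_channel_gpfifo_kernel(ch, entries, n,
 *			NVGPU_SUBMIT_FLAGS_FENCE_GET, &fence, &post_fence);
 *	if (err == 0) {
 *		... wait on or otherwise consume post_fence ...
 *		gk20a_fence_put(post_fence);
 *	}
 */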