author		Deepak Nibade <dnibade@nvidia.com>	2017-10-26 11:29:56 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-11-02 08:09:59 -0400
commit		23c7903eff6ee1ab184dfcc62c054de1557e5b1d (patch)
tree		a5122028e181e5c6009f9f8b66bfbf00f69a9290
parent		5f8cfaa250f08499f587da0097f6accaa5eedf15 (diff)
gpu: nvgpu: move submit path to linux
The nvgpu submit path has a lot of dependency on the Linux framework,
e.g. use of copy_from_user, use of structures defined in uapi/nvgpu
headers, dma_buf_* calls for trace support, etc.

Hence, to keep common code independent of Linux code, move the submit
path to the Linux directory.

Move below APIs to common/linux/channel.c
  trace_write_pushbuffer()
  trace_write_pushbuffer_range()
  gk20a_submit_prepare_syncs()
  gk20a_submit_append_priv_cmdbuf()
  gk20a_submit_append_gpfifo()
  gk20a_submit_channel_gpfifo()

Move below APIs to common/linux/ce2.c
  gk20a_ce_execute_ops()

Define gk20a_ce_execute_ops() in common/linux/ce2.c and declare it in
gk20a/ce2_gk20a.h, since it is needed in common/mm code too.
Each OS needs to implement this API separately.

gk20a_channel_alloc_gpfifo() uses sizeof(nvgpu_gpfifo) to get the size
of one gpfifo entry, but the structure nvgpu_gpfifo is Linux specific.
Define a new nvgpu_get_gpfifo_entry_size() in Linux specific code and
use it in gk20a_channel_alloc_gpfifo() to get the gpfifo entry size.
Each OS needs to implement this API separately.

Export some APIs from gk20a/ce2_gk20a.h and gk20a/channel_gk20a.h that
are needed in Linux code.

Jira NVGPU-259
Jira NVGPU-313

Change-Id: I360c6cb8ce4494b1e50c66af334a2a379f0d2dc4
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1586277
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
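To illustrate the per-OS split described above: the Linux side defines nvgpu_get_gpfifo_entry_size() in the new common/linux/channel.c (shown in the diff below) as a thin wrapper around sizeof(struct nvgpu_gpfifo). A minimal sketch of the pattern, with the non-Linux half purely hypothetical, is:

/* Linux definition, as added in common/linux/channel.c in this change. */
u32 nvgpu_get_gpfifo_entry_size(void)
{
	return sizeof(struct nvgpu_gpfifo);
}

/*
 * Hypothetical sketch of a non-Linux port: it would supply its own
 * definition returning the size of that OS's gpfifo entry type, so that
 * common code such as gk20a_channel_alloc_gpfifo() never references the
 * Linux-specific struct nvgpu_gpfifo directly.
 */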
-rw-r--r--	drivers/gpu/nvgpu/Makefile                     |   2
-rw-r--r--	drivers/gpu/nvgpu/common/linux/cde.c           |   1
-rw-r--r--	drivers/gpu/nvgpu/common/linux/ce2.c           | 185
-rw-r--r--	drivers/gpu/nvgpu/common/linux/channel.c       | 648
-rw-r--r--	drivers/gpu/nvgpu/common/linux/channel.h       |  38
-rw-r--r--	drivers/gpu/nvgpu/common/linux/ioctl_channel.c |   1
-rw-r--r--	drivers/gpu/nvgpu/gk20a/ce2_gk20a.c            | 164
-rw-r--r--	drivers/gpu/nvgpu/gk20a/ce2_gk20a.h            |  10
-rw-r--r--	drivers/gpu/nvgpu/gk20a/channel_gk20a.c        | 666
-rw-r--r--	drivers/gpu/nvgpu/gk20a/channel_gk20a.h        |  29
10 files changed, 917 insertions, 827 deletions
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index 06d3dedb..9c6c59f2 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -54,6 +54,8 @@ nvgpu-y := \
 	common/linux/comptags.o \
 	common/linux/dmabuf.o \
 	common/linux/sched.o \
+	common/linux/channel.o \
+	common/linux/ce2.o \
 	common/mm/nvgpu_allocator.o \
 	common/mm/bitmap_allocator.o \
 	common/mm/buddy_allocator.o \
diff --git a/drivers/gpu/nvgpu/common/linux/cde.c b/drivers/gpu/nvgpu/common/linux/cde.c
index 6600fe42..f6020d9a 100644
--- a/drivers/gpu/nvgpu/common/linux/cde.c
+++ b/drivers/gpu/nvgpu/common/linux/cde.c
@@ -42,6 +42,7 @@
 #include "cde.h"
 #include "os_linux.h"
 #include "dmabuf.h"
+#include "channel.h"
 
 #include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
 #include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
diff --git a/drivers/gpu/nvgpu/common/linux/ce2.c b/drivers/gpu/nvgpu/common/linux/ce2.c
new file mode 100644
index 00000000..3fee23e5
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/ce2.c
@@ -0,0 +1,185 @@
1/*
2 * Copyright (c) 2017, NVIDIA Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#include <nvgpu/types.h>
18
19#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
20
21#include "gk20a/ce2_gk20a.h"
22#include "gk20a/gk20a.h"
23#include "channel.h"
24
25static inline int gk20a_get_valid_launch_flags(struct gk20a *g, int launch_flags)
26{
27 /* there is no local memory available,
28 don't allow local memory related CE flags */
29 if (!g->mm.vidmem.size) {
30 launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
31 NVGPU_CE_DST_LOCATION_LOCAL_FB);
32 }
33 return launch_flags;
34}
35
36int gk20a_ce_execute_ops(struct gk20a *g,
37 u32 ce_ctx_id,
38 u64 src_buf,
39 u64 dst_buf,
40 u64 size,
41 unsigned int payload,
42 int launch_flags,
43 int request_operation,
44 struct gk20a_fence *gk20a_fence_in,
45 u32 submit_flags,
46 struct gk20a_fence **gk20a_fence_out)
47{
48 int ret = -EPERM;
49 struct gk20a_ce_app *ce_app = &g->ce_app;
50 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
51 bool found = false;
52 u32 *cmd_buf_cpu_va;
53 u64 cmd_buf_gpu_va = 0;
54 u32 methodSize;
55 u32 cmd_buf_read_offset;
56 u32 fence_index;
57 struct nvgpu_gpfifo gpfifo;
58 struct nvgpu_fence fence = {0,0};
59 struct gk20a_fence *ce_cmd_buf_fence_out = NULL;
60 struct nvgpu_gpu_characteristics *gpu_capability = &g->gpu_characteristics;
61
62 if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE)
63 goto end;
64
65 nvgpu_mutex_acquire(&ce_app->app_mutex);
66
67 nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
68 &ce_app->allocated_contexts, gk20a_gpu_ctx, list) {
69 if (ce_ctx->ctx_id == ce_ctx_id) {
70 found = true;
71 break;
72 }
73 }
74
75 nvgpu_mutex_release(&ce_app->app_mutex);
76
77 if (!found) {
78 ret = -EINVAL;
79 goto end;
80 }
81
82 if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
83 ret = -ENODEV;
84 goto end;
85 }
86
87 nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
88
89 ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset;
90
91 cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
92 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
93
94 /* at end of command buffer has gk20a_fence for command buffer sync */
95 fence_index = (cmd_buf_read_offset +
96 ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
97 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
98
99 if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) {
100 ret = -ENOMEM;
101 goto noop;
102 }
103
104 cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
105
106 /* 0 is treated as invalid pre-sync */
107 if (cmd_buf_cpu_va[fence_index]) {
108 struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
109
110 memcpy((void *)&ce_cmd_buf_fence_in,
111 (void *)(cmd_buf_cpu_va + fence_index),
112 sizeof(struct gk20a_fence *));
113 ret = gk20a_fence_wait(g, ce_cmd_buf_fence_in,
114 gk20a_get_gr_idle_timeout(g));
115
116 gk20a_fence_put(ce_cmd_buf_fence_in);
117 /* Reset the stored last pre-sync */
118 memset((void *)(cmd_buf_cpu_va + fence_index),
119 0,
120 NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
121 if (ret)
122 goto noop;
123 }
124
125 cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va + (u64)(cmd_buf_read_offset *sizeof(u32)));
126
127 methodSize = gk20a_ce_prepare_submit(src_buf,
128 dst_buf,
129 size,
130 &cmd_buf_cpu_va[cmd_buf_read_offset],
131 NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF,
132 payload,
133 gk20a_get_valid_launch_flags(g, launch_flags),
134 request_operation,
135 gpu_capability->dma_copy_class,
136 gk20a_fence_in);
137
138 if (methodSize) {
139 /* TODO: Remove CPU pre-fence wait */
140 if (gk20a_fence_in) {
141 ret = gk20a_fence_wait(g, gk20a_fence_in,
142 gk20a_get_gr_idle_timeout(g));
143 gk20a_fence_put(gk20a_fence_in);
144 if (ret)
145 goto noop;
146 }
147
148 /* store the element into gpfifo */
149 gpfifo.entry0 =
150 u64_lo32(cmd_buf_gpu_va);
151 gpfifo.entry1 =
152 (u64_hi32(cmd_buf_gpu_va) |
153 pbdma_gp_entry1_length_f(methodSize));
154
155 /* take always the postfence as it is needed for protecting the ce context */
156 submit_flags |= NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET;
157
158 nvgpu_smp_wmb();
159
160 ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL,
161 1, submit_flags, &fence,
162 &ce_cmd_buf_fence_out, false, NULL);
163
164 if (!ret) {
165 memcpy((void *)(cmd_buf_cpu_va + fence_index),
166 (void *)&ce_cmd_buf_fence_out,
167 sizeof(struct gk20a_fence *));
168
169 if (gk20a_fence_out) {
170 gk20a_fence_get(ce_cmd_buf_fence_out);
171 *gk20a_fence_out = ce_cmd_buf_fence_out;
172 }
173
174 /* Next available command buffer queue Index */
175 ++ce_ctx->cmd_buf_read_queue_offset;
176 ++ce_ctx->submitted_seq_number;
177 }
178 } else {
179 ret = -ENOMEM;
180 }
181noop:
182 nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
183end:
184 return ret;
185}
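For context, a purely hypothetical caller of gk20a_ce_execute_ops() (e.g. from common/mm code, which the commit message cites as a user) could look like the sketch below. The functions it calls are taken from this diff; example_ce_copy and its placeholder arguments are illustrative only:

/*
 * Hypothetical usage sketch: shows the argument order of
 * gk20a_ce_execute_ops() as defined above. The context id, buffer
 * addresses, payload and flag values are placeholders.
 */
static int example_ce_copy(struct gk20a *g, u32 ce_ctx_id,
		u64 src_gpu_va, u64 dst_gpu_va, u64 size,
		int launch_flags, int request_operation)
{
	struct gk20a_fence *fence_out = NULL;
	int err;

	err = gk20a_ce_execute_ops(g, ce_ctx_id,
			src_gpu_va, dst_gpu_va, size,
			0 /* payload */,
			launch_flags, request_operation,
			NULL /* no pre-fence */,
			0 /* submit_flags; FENCE_GET is forced internally */,
			&fence_out);
	if (err)
		return err;

	/* Wait for the copy to finish, then drop the fence reference. */
	err = gk20a_fence_wait(g, fence_out, gk20a_get_gr_idle_timeout(g));
	gk20a_fence_put(fence_out);
	return err;
}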
diff --git a/drivers/gpu/nvgpu/common/linux/channel.c b/drivers/gpu/nvgpu/common/linux/channel.c
new file mode 100644
index 00000000..716c5820
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/channel.c
@@ -0,0 +1,648 @@
1/*
2 * Copyright (c) 2017, NVIDIA Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#include <nvgpu/enabled.h>
18#include <nvgpu/debug.h>
19#include <nvgpu/ltc.h>
20
21/*
22 * This is required for nvgpu_vm_find_buf() which is used in the tracing
23 * code. Once we can get and access userspace buffers without requiring
24 * direct dma_buf usage this can be removed.
25 */
26#include <nvgpu/linux/vm.h>
27
28#include "gk20a/gk20a.h"
29
30#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
31
32#include <linux/uaccess.h>
33#include <linux/dma-buf.h>
34#include <trace/events/gk20a.h>
35
36u32 nvgpu_get_gpfifo_entry_size(void)
37{
38 return sizeof(struct nvgpu_gpfifo);
39}
40
41#ifdef CONFIG_DEBUG_FS
42static void trace_write_pushbuffer(struct channel_gk20a *c,
43 struct nvgpu_gpfifo *g)
44{
45 void *mem = NULL;
46 unsigned int words;
47 u64 offset;
48 struct dma_buf *dmabuf = NULL;
49
50 if (gk20a_debug_trace_cmdbuf) {
51 u64 gpu_va = (u64)g->entry0 |
52 (u64)((u64)pbdma_gp_entry1_get_hi_v(g->entry1) << 32);
53 int err;
54
55 words = pbdma_gp_entry1_length_v(g->entry1);
56 err = nvgpu_vm_find_buf(c->vm, gpu_va, &dmabuf, &offset);
57 if (!err)
58 mem = dma_buf_vmap(dmabuf);
59 }
60
61 if (mem) {
62 u32 i;
63 /*
64 * Write in batches of 128 as there seems to be a limit
65 * of how much you can output to ftrace at once.
66 */
67 for (i = 0; i < words; i += 128U) {
68 trace_gk20a_push_cmdbuf(
69 c->g->name,
70 0,
71 min(words - i, 128U),
72 offset + i * sizeof(u32),
73 mem);
74 }
75 dma_buf_vunmap(dmabuf, mem);
76 }
77}
78#endif
79
80static void trace_write_pushbuffer_range(struct channel_gk20a *c,
81 struct nvgpu_gpfifo *g,
82 struct nvgpu_gpfifo __user *user_gpfifo,
83 int offset,
84 int count)
85{
86#ifdef CONFIG_DEBUG_FS
87 u32 size;
88 int i;
89 struct nvgpu_gpfifo *gp;
90 bool gpfifo_allocated = false;
91
92 if (!gk20a_debug_trace_cmdbuf)
93 return;
94
95 if (!g && !user_gpfifo)
96 return;
97
98 if (!g) {
99 size = count * sizeof(struct nvgpu_gpfifo);
100 if (size) {
101 g = nvgpu_big_malloc(c->g, size);
102 if (!g)
103 return;
104
105 if (copy_from_user(g, user_gpfifo, size)) {
106 nvgpu_big_free(c->g, g);
107 return;
108 }
109 }
110 gpfifo_allocated = true;
111 }
112
113 gp = g + offset;
114 for (i = 0; i < count; i++, gp++)
115 trace_write_pushbuffer(c, gp);
116
117 if (gpfifo_allocated)
118 nvgpu_big_free(c->g, g);
119#endif
120}
121
122/*
123 * Handle the submit synchronization - pre-fences and post-fences.
124 */
125static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
126 struct nvgpu_fence *fence,
127 struct channel_gk20a_job *job,
128 struct priv_cmd_entry **wait_cmd,
129 struct priv_cmd_entry **incr_cmd,
130 struct gk20a_fence **pre_fence,
131 struct gk20a_fence **post_fence,
132 bool force_need_sync_fence,
133 bool register_irq,
134 u32 flags)
135{
136 struct gk20a *g = c->g;
137 bool need_sync_fence = false;
138 bool new_sync_created = false;
139 int wait_fence_fd = -1;
140 int err = 0;
141 bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI);
142 bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
143
144 /*
145 * If user wants to always allocate sync_fence_fds then respect that;
146 * otherwise, allocate sync_fence_fd based on user flags.
147 */
148 if (force_need_sync_fence)
149 need_sync_fence = true;
150
151 if (g->aggressive_sync_destroy_thresh) {
152 nvgpu_mutex_acquire(&c->sync_lock);
153 if (!c->sync) {
154 c->sync = gk20a_channel_sync_create(c);
155 if (!c->sync) {
156 err = -ENOMEM;
157 nvgpu_mutex_release(&c->sync_lock);
158 goto fail;
159 }
160 new_sync_created = true;
161 }
162 nvgpu_atomic_inc(&c->sync->refcount);
163 nvgpu_mutex_release(&c->sync_lock);
164 }
165
166 if (g->ops.fifo.resetup_ramfc && new_sync_created) {
167 err = g->ops.fifo.resetup_ramfc(c);
168 if (err)
169 goto fail;
170 }
171
172 /*
173 * Optionally insert syncpt wait in the beginning of gpfifo submission
174 * when user requested and the wait hasn't expired. Validate that the id
175 * makes sense, elide if not. The only reason this isn't being
176 * unceremoniously killed is to keep running some tests which trigger
177 * this condition.
178 */
179 if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
180 job->pre_fence = gk20a_alloc_fence(c);
181 if (!job->pre_fence) {
182 err = -ENOMEM;
183 goto fail;
184 }
185
186 if (!pre_alloc_enabled)
187 job->wait_cmd = nvgpu_kzalloc(g,
188 sizeof(struct priv_cmd_entry));
189
190 if (!job->wait_cmd) {
191 err = -ENOMEM;
192 goto clean_up_pre_fence;
193 }
194
195 if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) {
196 wait_fence_fd = fence->id;
197 err = c->sync->wait_fd(c->sync, wait_fence_fd,
198 job->wait_cmd, job->pre_fence);
199 } else {
200 err = c->sync->wait_syncpt(c->sync, fence->id,
201 fence->value, job->wait_cmd,
202 job->pre_fence);
203 }
204
205 if (!err) {
206 if (job->wait_cmd->valid)
207 *wait_cmd = job->wait_cmd;
208 *pre_fence = job->pre_fence;
209 } else
210 goto clean_up_wait_cmd;
211 }
212
213 if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) &&
214 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE))
215 need_sync_fence = true;
216
217 /*
218 * Always generate an increment at the end of a GPFIFO submission. This
219 * is used to keep track of method completion for idle railgating. The
220 * sync_pt/semaphore PB is added to the GPFIFO later on in submit.
221 */
222 job->post_fence = gk20a_alloc_fence(c);
223 if (!job->post_fence) {
224 err = -ENOMEM;
225 goto clean_up_wait_cmd;
226 }
227 if (!pre_alloc_enabled)
228 job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry));
229
230 if (!job->incr_cmd) {
231 err = -ENOMEM;
232 goto clean_up_post_fence;
233 }
234
235 if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
236 err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd,
237 job->post_fence, need_wfi, need_sync_fence,
238 register_irq);
239 else
240 err = c->sync->incr(c->sync, job->incr_cmd,
241 job->post_fence, need_sync_fence,
242 register_irq);
243 if (!err) {
244 *incr_cmd = job->incr_cmd;
245 *post_fence = job->post_fence;
246 } else
247 goto clean_up_incr_cmd;
248
249 return 0;
250
251clean_up_incr_cmd:
252 free_priv_cmdbuf(c, job->incr_cmd);
253 if (!pre_alloc_enabled)
254 job->incr_cmd = NULL;
255clean_up_post_fence:
256 gk20a_fence_put(job->post_fence);
257 job->post_fence = NULL;
258clean_up_wait_cmd:
259 free_priv_cmdbuf(c, job->wait_cmd);
260 if (!pre_alloc_enabled)
261 job->wait_cmd = NULL;
262clean_up_pre_fence:
263 gk20a_fence_put(job->pre_fence);
264 job->pre_fence = NULL;
265fail:
266 *wait_cmd = NULL;
267 *pre_fence = NULL;
268 return err;
269}
270
271static void gk20a_submit_append_priv_cmdbuf(struct channel_gk20a *c,
272 struct priv_cmd_entry *cmd)
273{
274 struct gk20a *g = c->g;
275 struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
276 struct nvgpu_gpfifo x = {
277 .entry0 = u64_lo32(cmd->gva),
278 .entry1 = u64_hi32(cmd->gva) |
279 pbdma_gp_entry1_length_f(cmd->size)
280 };
281
282 nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x),
283 &x, sizeof(x));
284
285 if (cmd->mem->aperture == APERTURE_SYSMEM)
286 trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0,
287 cmd->mem->cpu_va + cmd->off * sizeof(u32));
288
289 c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
290}
291
292/*
293 * Copy source gpfifo entries into the gpfifo ring buffer, potentially
294 * splitting into two memcpys to handle wrap-around.
295 */
296static int gk20a_submit_append_gpfifo(struct channel_gk20a *c,
297 struct nvgpu_gpfifo *kern_gpfifo,
298 struct nvgpu_gpfifo __user *user_gpfifo,
299 u32 num_entries)
300{
301 /* byte offsets */
302 u32 gpfifo_size = c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo);
303 u32 len = num_entries * sizeof(struct nvgpu_gpfifo);
304 u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo);
305 u32 end = start + len; /* exclusive */
306 struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
307 struct nvgpu_gpfifo *cpu_src;
308 int err;
309
310 if (user_gpfifo && !c->gpfifo.pipe) {
311 /*
312 * This path (from userspace to sysmem) is special in order to
313 * avoid two copies unnecessarily (from user to pipe, then from
314 * pipe to gpu sysmem buffer).
315 *
316 * As a special case, the pipe buffer exists if PRAMIN writes
317 * are forced, although the buffers may not be in vidmem in
318 * that case.
319 */
320 if (end > gpfifo_size) {
321 /* wrap-around */
322 int length0 = gpfifo_size - start;
323 int length1 = len - length0;
324 void __user *user2 = (u8 __user *)user_gpfifo + length0;
325
326 err = copy_from_user(gpfifo_mem->cpu_va + start,
327 user_gpfifo, length0);
328 if (err)
329 return err;
330
331 err = copy_from_user(gpfifo_mem->cpu_va,
332 user2, length1);
333 if (err)
334 return err;
335 } else {
336 err = copy_from_user(gpfifo_mem->cpu_va + start,
337 user_gpfifo, len);
338 if (err)
339 return err;
340 }
341
342 trace_write_pushbuffer_range(c, NULL, user_gpfifo,
343 0, num_entries);
344 goto out;
345 } else if (user_gpfifo) {
346 /* from userspace to vidmem or sysmem when pramin forced, use
347 * the common copy path below */
348 err = copy_from_user(c->gpfifo.pipe, user_gpfifo, len);
349 if (err)
350 return err;
351
352 cpu_src = c->gpfifo.pipe;
353 } else {
354 /* from kernel to either sysmem or vidmem, don't need
355 * copy_from_user so use the common path below */
356 cpu_src = kern_gpfifo;
357 }
358
359 if (end > gpfifo_size) {
360 /* wrap-around */
361 int length0 = gpfifo_size - start;
362 int length1 = len - length0;
363 void *src2 = (u8 *)cpu_src + length0;
364
365 nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, length0);
366 nvgpu_mem_wr_n(c->g, gpfifo_mem, 0, src2, length1);
367 } else {
368 nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, len);
369
370 }
371
372 trace_write_pushbuffer_range(c, cpu_src, NULL, 0, num_entries);
373
374out:
375 c->gpfifo.put = (c->gpfifo.put + num_entries) &
376 (c->gpfifo.entry_num - 1);
377
378 return 0;
379}
380
381int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
382 struct nvgpu_gpfifo *gpfifo,
383 struct nvgpu_submit_gpfifo_args *args,
384 u32 num_entries,
385 u32 flags,
386 struct nvgpu_fence *fence,
387 struct gk20a_fence **fence_out,
388 bool force_need_sync_fence,
389 struct fifo_profile_gk20a *profile)
390{
391 struct gk20a *g = c->g;
392 struct priv_cmd_entry *wait_cmd = NULL;
393 struct priv_cmd_entry *incr_cmd = NULL;
394 struct gk20a_fence *pre_fence = NULL;
395 struct gk20a_fence *post_fence = NULL;
396 struct channel_gk20a_job *job = NULL;
397 /* we might need two extra gpfifo entries - one for pre fence
398 * and one for post fence. */
399 const int extra_entries = 2;
400 bool skip_buffer_refcounting = (flags &
401 NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING);
402 int err = 0;
403 bool need_job_tracking;
404 bool need_deferred_cleanup = false;
405 struct nvgpu_gpfifo __user *user_gpfifo = args ?
406 (struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo : NULL;
407
408 if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING))
409 return -ENODEV;
410
411 if (c->has_timedout)
412 return -ETIMEDOUT;
413
414 if (!nvgpu_mem_is_valid(&c->gpfifo.mem))
415 return -ENOMEM;
416
417 /* fifo not large enough for request. Return error immediately.
418 * Kernel can insert gpfifo entries before and after user gpfifos.
419 * So, add extra_entries in user request. Also, HW with fifo size N
420 * can accept only N-1 entreis and so the below condition */
421 if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) {
422 nvgpu_err(g, "not enough gpfifo space allocated");
423 return -ENOMEM;
424 }
425
426 if (!gpfifo && !args)
427 return -EINVAL;
428
429 if ((flags & (NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT |
430 NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) &&
431 !fence)
432 return -EINVAL;
433
434 /* an address space needs to have been bound at this point. */
435 if (!gk20a_channel_as_bound(c)) {
436 nvgpu_err(g,
437 "not bound to an address space at time of gpfifo"
438 " submission.");
439 return -EINVAL;
440 }
441
442 if (profile)
443 profile->timestamp[PROFILE_ENTRY] = sched_clock();
444
445 /* update debug settings */
446 nvgpu_ltc_sync_enabled(g);
447
448 gk20a_dbg_info("channel %d", c->chid);
449
450 /*
451 * Job tracking is necessary for any of the following conditions:
452 * - pre- or post-fence functionality
453 * - channel wdt
454 * - GPU rail-gating with non-deterministic channels
455 * - buffer refcounting
456 *
457 * If none of the conditions are met, then job tracking is not
458 * required and a fast submit can be done (ie. only need to write
459 * out userspace GPFIFO entries and update GP_PUT).
460 */
461 need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
462 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
463 c->wdt_enabled ||
464 (g->can_railgate && !c->deterministic) ||
465 !skip_buffer_refcounting;
466
467 if (need_job_tracking) {
468 bool need_sync_framework = false;
469
470 /*
471 * If the channel is to have deterministic latency and
472 * job tracking is required, the channel must have
473 * pre-allocated resources. Otherwise, we fail the submit here
474 */
475 if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c))
476 return -EINVAL;
477
478 need_sync_framework = force_need_sync_fence ||
479 gk20a_channel_sync_needs_sync_framework(g) ||
480 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE &&
481 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT ||
482 flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET));
483
484 /*
485 * Deferred clean-up is necessary for any of the following
486 * conditions:
487 * - channel's deterministic flag is not set
488 * - dependency on sync framework, which could make the
489 * behavior of the clean-up operation non-deterministic
490 * (should not be performed in the submit path)
491 * - channel wdt
492 * - GPU rail-gating with non-deterministic channels
493 * - buffer refcounting
494 *
495 * If none of the conditions are met, then deferred clean-up
496 * is not required, and we clean-up one job-tracking
497 * resource in the submit path.
498 */
499 need_deferred_cleanup = !c->deterministic ||
500 need_sync_framework ||
501 c->wdt_enabled ||
502 (g->can_railgate &&
503 !c->deterministic) ||
504 !skip_buffer_refcounting;
505
506 /*
507 * For deterministic channels, we don't allow deferred clean_up
508 * processing to occur. In cases we hit this, we fail the submit
509 */
510 if (c->deterministic && need_deferred_cleanup)
511 return -EINVAL;
512
513 if (!c->deterministic) {
514 /*
515 * Get a power ref unless this is a deterministic
516 * channel that holds them during the channel lifetime.
517 * This one is released by gk20a_channel_clean_up_jobs,
518 * via syncpt or sema interrupt, whichever is used.
519 */
520 err = gk20a_busy(g);
521 if (err) {
522 nvgpu_err(g,
523 "failed to host gk20a to submit gpfifo, process %s",
524 current->comm);
525 return err;
526 }
527 }
528
529 if (!need_deferred_cleanup) {
530 /* clean up a single job */
531 gk20a_channel_clean_up_jobs(c, false);
532 }
533 }
534
535
536 /* Grab access to HW to deal with do_idle */
537 if (c->deterministic)
538 nvgpu_rwsem_down_read(&g->deterministic_busy);
539
540 trace_gk20a_channel_submit_gpfifo(g->name,
541 c->chid,
542 num_entries,
543 flags,
544 fence ? fence->id : 0,
545 fence ? fence->value : 0);
546
547 gk20a_dbg_info("pre-submit put %d, get %d, size %d",
548 c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
549
550 /*
551 * Make sure we have enough space for gpfifo entries. Check cached
552 * values first and then read from HW. If no space, return EAGAIN
553 * and let userpace decide to re-try request or not.
554 */
555 if (nvgpu_gp_free_count(c) < num_entries + extra_entries) {
556 if (nvgpu_get_gp_free_count(c) < num_entries + extra_entries) {
557 err = -EAGAIN;
558 goto clean_up;
559 }
560 }
561
562 if (c->has_timedout) {
563 err = -ETIMEDOUT;
564 goto clean_up;
565 }
566
567 if (need_job_tracking) {
568 err = channel_gk20a_alloc_job(c, &job);
569 if (err)
570 goto clean_up;
571
572 err = gk20a_submit_prepare_syncs(c, fence, job,
573 &wait_cmd, &incr_cmd,
574 &pre_fence, &post_fence,
575 force_need_sync_fence,
576 need_deferred_cleanup,
577 flags);
578 if (err)
579 goto clean_up_job;
580 }
581
582 if (profile)
583 profile->timestamp[PROFILE_JOB_TRACKING] = sched_clock();
584
585 if (wait_cmd)
586 gk20a_submit_append_priv_cmdbuf(c, wait_cmd);
587
588 if (gpfifo || user_gpfifo)
589 err = gk20a_submit_append_gpfifo(c, gpfifo, user_gpfifo,
590 num_entries);
591 if (err)
592 goto clean_up_job;
593
594 /*
595 * And here's where we add the incr_cmd we generated earlier. It should
596 * always run!
597 */
598 if (incr_cmd)
599 gk20a_submit_append_priv_cmdbuf(c, incr_cmd);
600
601 if (fence_out)
602 *fence_out = gk20a_fence_get(post_fence);
603
604 if (need_job_tracking)
605 /* TODO! Check for errors... */
606 gk20a_channel_add_job(c, job, skip_buffer_refcounting);
607 if (profile)
608 profile->timestamp[PROFILE_APPEND] = sched_clock();
609
610 g->ops.fifo.userd_gp_put(g, c);
611
612 if ((NVGPU_SUBMIT_GPFIFO_FLAGS_RESCHEDULE_RUNLIST & flags) &&
613 g->ops.fifo.reschedule_runlist)
614 g->ops.fifo.reschedule_runlist(g, c->runlist_id);
615
616 /* No hw access beyond this point */
617 if (c->deterministic)
618 nvgpu_rwsem_up_read(&g->deterministic_busy);
619
620 trace_gk20a_channel_submitted_gpfifo(g->name,
621 c->chid,
622 num_entries,
623 flags,
624 post_fence ? post_fence->syncpt_id : 0,
625 post_fence ? post_fence->syncpt_value : 0);
626
627 gk20a_dbg_info("post-submit put %d, get %d, size %d",
628 c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
629
630 if (profile)
631 profile->timestamp[PROFILE_END] = sched_clock();
632 gk20a_dbg_fn("done");
633 return err;
634
635clean_up_job:
636 channel_gk20a_free_job(c, job);
637clean_up:
638 gk20a_dbg_fn("fail");
639 gk20a_fence_put(pre_fence);
640 gk20a_fence_put(post_fence);
641 if (c->deterministic)
642 nvgpu_rwsem_up_read(&g->deterministic_busy);
643 else if (need_deferred_cleanup)
644 gk20a_idle(g);
645
646 return err;
647}
648
diff --git a/drivers/gpu/nvgpu/common/linux/channel.h b/drivers/gpu/nvgpu/common/linux/channel.h
new file mode 100644
index 00000000..785c03d6
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/channel.h
@@ -0,0 +1,38 @@
1/*
2 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16#ifndef __NVGPU_CHANNEL_H__
17#define __NVGPU_CHANNEL_H__
18
19#include <nvgpu/types.h>
20
21struct channel_gk20a;
22struct nvgpu_gpfifo;
23struct nvgpu_submit_gpfifo_args;
24struct nvgpu_fence;
25struct gk20a_fence;
26struct fifo_profile_gk20a;
27
28int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
29 struct nvgpu_gpfifo *gpfifo,
30 struct nvgpu_submit_gpfifo_args *args,
31 u32 num_entries,
32 u32 flags,
33 struct nvgpu_fence *fence,
34 struct gk20a_fence **fence_out,
35 bool force_need_sync_fence,
36 struct fifo_profile_gk20a *profile);
37
38#endif /* __NVGPU_CHANNEL_H__ */
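In-kernel users include this header to drive the submit path directly. A condensed sketch of that pattern, mirroring what common/linux/ce2.c above does, with the gpfifo entry contents as placeholders, is:

#include "channel.h"

/*
 * Sketch only, condensed from the gk20a_ce_execute_ops() usage above;
 * cmdbuf_gpu_va and method_size are placeholders.
 */
static int example_kernel_submit(struct channel_gk20a *ch, u64 cmdbuf_gpu_va,
		u32 method_size, u32 submit_flags)
{
	struct nvgpu_gpfifo gpfifo = {
		.entry0 = u64_lo32(cmdbuf_gpu_va),
		.entry1 = u64_hi32(cmdbuf_gpu_va) |
			pbdma_gp_entry1_length_f(method_size),
	};
	struct nvgpu_fence fence = {0, 0};
	struct gk20a_fence *fence_out = NULL;
	int err;

	/* One kernel-built entry, no user args, always request a post-fence. */
	err = gk20a_submit_channel_gpfifo(ch, &gpfifo, NULL, 1,
			submit_flags | NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET,
			&fence, &fence_out, false, NULL);
	if (!err)
		/* Drop the post-fence reference once it is no longer needed. */
		gk20a_fence_put(fence_out);
	return err;
}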
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
index 91dfc630..5b0c4a50 100644
--- a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c
@@ -36,6 +36,7 @@
 #include "gk20a/platform_gk20a.h"
 
 #include "ioctl_channel.h"
+#include "channel.h"
 #include "os_linux.h"
 #include "ctxsw_trace.h"
 
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index 5314a1be..9ff6c792 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -249,18 +249,7 @@ static inline unsigned int gk20a_ce_get_method_size(int request_operation,
 	return methodsize;
 }
 
-static inline int gk20a_get_valid_launch_flags(struct gk20a *g, int launch_flags)
-{
-	/* there is no local memory available,
-	don't allow local memory related CE flags */
-	if (!g->mm.vidmem.size) {
-		launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
-			NVGPU_CE_DST_LOCATION_LOCAL_FB);
-	}
-	return launch_flags;
-}
-
-static int gk20a_ce_prepare_submit(u64 src_buf,
+int gk20a_ce_prepare_submit(u64 src_buf,
 		u64 dst_buf,
 		u64 size,
 		u32 *cmd_buf_cpu_va,
@@ -626,157 +615,6 @@ end:
 }
 EXPORT_SYMBOL(gk20a_ce_create_context_with_cb);
 
629int gk20a_ce_execute_ops(struct gk20a *g,
630 u32 ce_ctx_id,
631 u64 src_buf,
632 u64 dst_buf,
633 u64 size,
634 unsigned int payload,
635 int launch_flags,
636 int request_operation,
637 struct gk20a_fence *gk20a_fence_in,
638 u32 submit_flags,
639 struct gk20a_fence **gk20a_fence_out)
640{
641 int ret = -EPERM;
642 struct gk20a_ce_app *ce_app = &g->ce_app;
643 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
644 bool found = false;
645 u32 *cmd_buf_cpu_va;
646 u64 cmd_buf_gpu_va = 0;
647 u32 methodSize;
648 u32 cmd_buf_read_offset;
649 u32 fence_index;
650 struct nvgpu_gpfifo gpfifo;
651 struct nvgpu_fence fence = {0,0};
652 struct gk20a_fence *ce_cmd_buf_fence_out = NULL;
653 struct nvgpu_gpu_characteristics *gpu_capability = &g->gpu_characteristics;
654
655 if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE)
656 goto end;
657
658 nvgpu_mutex_acquire(&ce_app->app_mutex);
659
660 nvgpu_list_for_each_entry_safe(ce_ctx, ce_ctx_save,
661 &ce_app->allocated_contexts, gk20a_gpu_ctx, list) {
662 if (ce_ctx->ctx_id == ce_ctx_id) {
663 found = true;
664 break;
665 }
666 }
667
668 nvgpu_mutex_release(&ce_app->app_mutex);
669
670 if (!found) {
671 ret = -EINVAL;
672 goto end;
673 }
674
675 if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
676 ret = -ENODEV;
677 goto end;
678 }
679
680 nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex);
681
682 ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset;
683
684 cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
685 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
686
687 /* at end of command buffer has gk20a_fence for command buffer sync */
688 fence_index = (cmd_buf_read_offset +
689 ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
690 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
691
692 if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) {
693 ret = -ENOMEM;
694 goto noop;
695 }
696
697 cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
698
699 /* 0 is treated as invalid pre-sync */
700 if (cmd_buf_cpu_va[fence_index]) {
701 struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
702
703 memcpy((void *)&ce_cmd_buf_fence_in,
704 (void *)(cmd_buf_cpu_va + fence_index),
705 sizeof(struct gk20a_fence *));
706 ret = gk20a_fence_wait(g, ce_cmd_buf_fence_in,
707 gk20a_get_gr_idle_timeout(g));
708
709 gk20a_fence_put(ce_cmd_buf_fence_in);
710 /* Reset the stored last pre-sync */
711 memset((void *)(cmd_buf_cpu_va + fence_index),
712 0,
713 NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
714 if (ret)
715 goto noop;
716 }
717
718 cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va + (u64)(cmd_buf_read_offset *sizeof(u32)));
719
720 methodSize = gk20a_ce_prepare_submit(src_buf,
721 dst_buf,
722 size,
723 &cmd_buf_cpu_va[cmd_buf_read_offset],
724 NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF,
725 payload,
726 gk20a_get_valid_launch_flags(g, launch_flags),
727 request_operation,
728 gpu_capability->dma_copy_class,
729 gk20a_fence_in);
730
731 if (methodSize) {
732 /* TODO: Remove CPU pre-fence wait */
733 if (gk20a_fence_in) {
734 ret = gk20a_fence_wait(g, gk20a_fence_in,
735 gk20a_get_gr_idle_timeout(g));
736 gk20a_fence_put(gk20a_fence_in);
737 if (ret)
738 goto noop;
739 }
740
741 /* store the element into gpfifo */
742 gpfifo.entry0 =
743 u64_lo32(cmd_buf_gpu_va);
744 gpfifo.entry1 =
745 (u64_hi32(cmd_buf_gpu_va) |
746 pbdma_gp_entry1_length_f(methodSize));
747
748 /* take always the postfence as it is needed for protecting the ce context */
749 submit_flags |= NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET;
750
751 nvgpu_smp_wmb();
752
753 ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL,
754 1, submit_flags, &fence,
755 &ce_cmd_buf_fence_out, false, NULL);
756
757 if (!ret) {
758 memcpy((void *)(cmd_buf_cpu_va + fence_index),
759 (void *)&ce_cmd_buf_fence_out,
760 sizeof(struct gk20a_fence *));
761
762 if (gk20a_fence_out) {
763 gk20a_fence_get(ce_cmd_buf_fence_out);
764 *gk20a_fence_out = ce_cmd_buf_fence_out;
765 }
766
767 /* Next available command buffer queue Index */
768 ++ce_ctx->cmd_buf_read_queue_offset;
769 ++ce_ctx->submitted_seq_number;
770 }
771 } else
772 ret = -ENOMEM;
773noop:
774 nvgpu_mutex_release(&ce_ctx->gpu_ctx_mutex);
775end:
776 return ret;
777}
778EXPORT_SYMBOL(gk20a_ce_execute_ops);
779
 void gk20a_ce_delete_context(struct gk20a *g,
 		u32 ce_ctx_id)
 {
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
index 1dad8952..8d3a4ca3 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
@@ -161,5 +161,15 @@ void gk20a_ce_delete_context_priv(struct gk20a *g,
 		u32 ce_ctx_id);
 void gk20a_ce_delete_context(struct gk20a *g,
 		u32 ce_ctx_id);
+int gk20a_ce_prepare_submit(u64 src_buf,
+		u64 dst_buf,
+		u64 size,
+		u32 *cmd_buf_cpu_va,
+		u32 max_cmd_buf_size,
+		unsigned int payload,
+		int launch_flags,
+		int request_operation,
+		u32 dma_copy_class,
+		struct gk20a_fence *gk20a_fence_in);
 
 #endif /*__CE2_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 00d20357..c938ba6b 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -44,45 +44,13 @@
 #include <nvgpu/barrier.h>
 #include <nvgpu/ctxsw_trace.h>
 
-/*
- * This is required for nvgpu_vm_find_buf() which is used in the tracing
- * code. Once we can get and access userspace buffers without requiring
- * direct dma_buf usage this can be removed.
- */
-#include <nvgpu/linux/vm.h>
-
 #include "gk20a.h"
 #include "dbg_gpu_gk20a.h"
 #include "fence_gk20a.h"
 
-#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
-
-/*
- * Note
- * This is added for all the copy_from_user methods in this file which needs to
- * be moved lated to reduce depenedency on Linux
- */
-#include <linux/uaccess.h>
-
-/*
- * Although channels do have pointers back to the gk20a struct that they were
- * created under in cases where the driver is killed that pointer can be bad.
- * The channel memory can be freed before the release() function for a given
- * channel is called. This happens when the driver dies and userspace doesn't
- * get a chance to call release() until after the entire gk20a driver data is
- * unloaded and freed.
- */
-struct channel_priv {
-	struct gk20a *g;
-	struct channel_gk20a *c;
-};
-
 static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c);
 static void gk20a_channel_dump_ref_actions(struct channel_gk20a *c);
 
-static void free_priv_cmdbuf(struct channel_gk20a *c,
-		struct priv_cmd_entry *e);
-
 static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c);
 static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c);
 
@@ -97,9 +65,6 @@ static struct channel_gk20a_job *channel_gk20a_joblist_peek(
 
 static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch);
 
-static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
-		bool clean_all);
-
 /* allocate GPU channel */
 static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f)
 {
@@ -1038,7 +1003,7 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
 
 /* Don't call this to free an explict cmd entry.
  * It doesn't update priv_cmd_queue get/put */
-static void free_priv_cmdbuf(struct channel_gk20a *c,
+void free_priv_cmdbuf(struct channel_gk20a *c,
 		struct priv_cmd_entry *e)
 {
 	if (channel_gk20a_is_prealloc_enabled(c))
@@ -1047,7 +1012,7 @@ static void free_priv_cmdbuf(struct channel_gk20a *c,
 		nvgpu_kfree(c->g, e);
 }
 
-static int channel_gk20a_alloc_job(struct channel_gk20a *c,
+int channel_gk20a_alloc_job(struct channel_gk20a *c,
 		struct channel_gk20a_job **job_out)
 {
 	int err = 0;
@@ -1080,7 +1045,7 @@ static int channel_gk20a_alloc_job(struct channel_gk20a *c,
 	return err;
 }
 
-static void channel_gk20a_free_job(struct channel_gk20a *c,
+void channel_gk20a_free_job(struct channel_gk20a *c,
 		struct channel_gk20a_job *job)
 {
 	/*
@@ -1267,11 +1232,12 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
 {
 	struct gk20a *g = c->g;
 	struct vm_gk20a *ch_vm;
-	u32 gpfifo_size;
+	u32 gpfifo_size, gpfifo_entry_size;
 	int err = 0;
 	unsigned long acquire_timeout;
 
 	gpfifo_size = num_entries;
+	gpfifo_entry_size = nvgpu_get_gpfifo_entry_size();
 
 	if (flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_VPR_ENABLED)
 		c->vpr = true;
@@ -1315,7 +1281,7 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
 	}
 
 	err = nvgpu_dma_alloc_map_sys(ch_vm,
-			gpfifo_size * sizeof(struct nvgpu_gpfifo),
+			gpfifo_size * gpfifo_entry_size,
 			&c->gpfifo.mem);
 	if (err) {
 		nvgpu_err(g, "%s: memory allocation failed", __func__);
@@ -1324,7 +1290,7 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
 
 	if (c->gpfifo.mem.aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
 		c->gpfifo.pipe = nvgpu_big_malloc(g,
-				gpfifo_size * sizeof(struct nvgpu_gpfifo));
+				gpfifo_size * gpfifo_entry_size);
 		if (!c->gpfifo.pipe) {
 			err = -ENOMEM;
 			goto clean_up_unmap;
@@ -1427,7 +1393,7 @@ static inline u32 update_gp_get(struct gk20a *g,
 	return new_get;
 }
 
-static inline u32 gp_free_count(struct channel_gk20a *c)
+u32 nvgpu_gp_free_count(struct channel_gk20a *c)
 {
 	return (c->gpfifo.entry_num - (c->gpfifo.put - c->gpfifo.get) - 1) %
 		c->gpfifo.entry_num;
@@ -1460,91 +1426,10 @@ static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch)
 	return ch->g->ch_wdt_timeout_ms;
 }
 
-static u32 get_gp_free_count(struct channel_gk20a *c)
+u32 nvgpu_get_gp_free_count(struct channel_gk20a *c)
 {
 	update_gp_get(c->g, c);
-	return gp_free_count(c);
+	return nvgpu_gp_free_count(c);
1467}
1468
1469#ifdef CONFIG_DEBUG_FS
1470static void trace_write_pushbuffer(struct channel_gk20a *c,
1471 struct nvgpu_gpfifo *g)
1472{
1473 void *mem = NULL;
1474 unsigned int words;
1475 u64 offset;
1476 struct dma_buf *dmabuf = NULL;
1477
1478 if (gk20a_debug_trace_cmdbuf) {
1479 u64 gpu_va = (u64)g->entry0 |
1480 (u64)((u64)pbdma_gp_entry1_get_hi_v(g->entry1) << 32);
1481 int err;
1482
1483 words = pbdma_gp_entry1_length_v(g->entry1);
1484 err = nvgpu_vm_find_buf(c->vm, gpu_va, &dmabuf, &offset);
1485 if (!err)
1486 mem = dma_buf_vmap(dmabuf);
1487 }
1488
1489 if (mem) {
1490 u32 i;
1491 /*
1492 * Write in batches of 128 as there seems to be a limit
1493 * of how much you can output to ftrace at once.
1494 */
1495 for (i = 0; i < words; i += 128U) {
1496 trace_gk20a_push_cmdbuf(
1497 c->g->name,
1498 0,
1499 min(words - i, 128U),
1500 offset + i * sizeof(u32),
1501 mem);
1502 }
1503 dma_buf_vunmap(dmabuf, mem);
1504 }
1505}
1506#endif
1507
1508static void trace_write_pushbuffer_range(struct channel_gk20a *c,
1509 struct nvgpu_gpfifo *g,
1510 struct nvgpu_gpfifo __user *user_gpfifo,
1511 int offset,
1512 int count)
1513{
1514#ifdef CONFIG_DEBUG_FS
1515 u32 size;
1516 int i;
1517 struct nvgpu_gpfifo *gp;
1518 bool gpfifo_allocated = false;
1519
1520 if (!gk20a_debug_trace_cmdbuf)
1521 return;
1522
1523 if (!g && !user_gpfifo)
1524 return;
1525
1526 if (!g) {
1527 size = count * sizeof(struct nvgpu_gpfifo);
1528 if (size) {
1529 g = nvgpu_big_malloc(c->g, size);
1530 if (!g)
1531 return;
1532
1533 if (copy_from_user(g, user_gpfifo, size)) {
1534 nvgpu_big_free(c->g, g);
1535 return;
1536 }
1537 }
1538 gpfifo_allocated = true;
1539 }
1540
1541 gp = g + offset;
1542 for (i = 0; i < count; i++, gp++)
1543 trace_write_pushbuffer(c, gp);
1544
1545 if (gpfifo_allocated)
1546 nvgpu_big_free(c->g, g);
1547#endif
 }
 
 static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
@@ -2032,7 +1917,7 @@ int gk20a_free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e)
 	return 0;
 }
 
-static int gk20a_channel_add_job(struct channel_gk20a *c,
+int gk20a_channel_add_job(struct channel_gk20a *c,
 		struct channel_gk20a_job *job,
 		bool skip_buffer_refcounting)
 {
@@ -2097,7 +1982,7 @@ err_put_buffers:
  * per-job memory for completed jobs; in case of preallocated resources, this
  * opens up slots for new jobs to be submitted.
  */
-static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
+void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
 		bool clean_all)
 {
 	struct vm_gk20a *vm;
@@ -2257,533 +2142,6 @@ void gk20a_channel_update(struct channel_gk20a *c)
 	gk20a_channel_worker_enqueue(c);
 }
 
2260static void gk20a_submit_append_priv_cmdbuf(struct channel_gk20a *c,
2261 struct priv_cmd_entry *cmd)
2262{
2263 struct gk20a *g = c->g;
2264 struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
2265 struct nvgpu_gpfifo x = {
2266 .entry0 = u64_lo32(cmd->gva),
2267 .entry1 = u64_hi32(cmd->gva) |
2268 pbdma_gp_entry1_length_f(cmd->size)
2269 };
2270
2271 nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x),
2272 &x, sizeof(x));
2273
2274 if (cmd->mem->aperture == APERTURE_SYSMEM)
2275 trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0,
2276 cmd->mem->cpu_va + cmd->off * sizeof(u32));
2277
2278 c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
2279}
2280
2281/*
2282 * Copy source gpfifo entries into the gpfifo ring buffer, potentially
2283 * splitting into two memcpys to handle wrap-around.
2284 */
2285static int gk20a_submit_append_gpfifo(struct channel_gk20a *c,
2286 struct nvgpu_gpfifo *kern_gpfifo,
2287 struct nvgpu_gpfifo __user *user_gpfifo,
2288 u32 num_entries)
2289{
2290 /* byte offsets */
2291 u32 gpfifo_size = c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo);
2292 u32 len = num_entries * sizeof(struct nvgpu_gpfifo);
2293 u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo);
2294 u32 end = start + len; /* exclusive */
2295 struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
2296 struct nvgpu_gpfifo *cpu_src;
2297 int err;
2298
2299 if (user_gpfifo && !c->gpfifo.pipe) {
2300 /*
2301 * This path (from userspace to sysmem) is special in order to
2302 * avoid two copies unnecessarily (from user to pipe, then from
2303 * pipe to gpu sysmem buffer).
2304 *
2305 * As a special case, the pipe buffer exists if PRAMIN writes
2306 * are forced, although the buffers may not be in vidmem in
2307 * that case.
2308 */
2309 if (end > gpfifo_size) {
2310 /* wrap-around */
2311 int length0 = gpfifo_size - start;
2312 int length1 = len - length0;
2313 void __user *user2 = (u8 __user *)user_gpfifo + length0;
2314
2315 err = copy_from_user(gpfifo_mem->cpu_va + start,
2316 user_gpfifo, length0);
2317 if (err)
2318 return err;
2319
2320 err = copy_from_user(gpfifo_mem->cpu_va,
2321 user2, length1);
2322 if (err)
2323 return err;
2324 } else {
2325 err = copy_from_user(gpfifo_mem->cpu_va + start,
2326 user_gpfifo, len);
2327 if (err)
2328 return err;
2329 }
2330
2331 trace_write_pushbuffer_range(c, NULL, user_gpfifo,
2332 0, num_entries);
2333 goto out;
2334 } else if (user_gpfifo) {
2335 /* from userspace to vidmem or sysmem when pramin forced, use
2336 * the common copy path below */
2337 err = copy_from_user(c->gpfifo.pipe, user_gpfifo, len);
2338 if (err)
2339 return err;
2340
2341 cpu_src = c->gpfifo.pipe;
2342 } else {
2343 /* from kernel to either sysmem or vidmem, don't need
2344 * copy_from_user so use the common path below */
2345 cpu_src = kern_gpfifo;
2346 }
2347
2348 if (end > gpfifo_size) {
2349 /* wrap-around */
2350 int length0 = gpfifo_size - start;
2351 int length1 = len - length0;
2352 void *src2 = (u8 *)cpu_src + length0;
2353
2354 nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, length0);
2355 nvgpu_mem_wr_n(c->g, gpfifo_mem, 0, src2, length1);
2356 } else {
2357 nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, len);
2358
2359 }
2360
2361 trace_write_pushbuffer_range(c, cpu_src, NULL, 0, num_entries);
2362
2363out:
2364 c->gpfifo.put = (c->gpfifo.put + num_entries) &
2365 (c->gpfifo.entry_num - 1);
2366
2367 return 0;
2368}
2369
2370/*
2371 * Handle the submit synchronization - pre-fences and post-fences.
2372 */
2373static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
2374 struct nvgpu_fence *fence,
2375 struct channel_gk20a_job *job,
2376 struct priv_cmd_entry **wait_cmd,
2377 struct priv_cmd_entry **incr_cmd,
2378 struct gk20a_fence **pre_fence,
2379 struct gk20a_fence **post_fence,
2380 bool force_need_sync_fence,
2381 bool register_irq,
2382 u32 flags)
2383{
2384 struct gk20a *g = c->g;
2385 bool need_sync_fence = false;
2386 bool new_sync_created = false;
2387 int wait_fence_fd = -1;
2388 int err = 0;
2389 bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI);
2390 bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
2391
2392 /*
2393 * If user wants to always allocate sync_fence_fds then respect that;
2394 * otherwise, allocate sync_fence_fd based on user flags.
2395 */
2396 if (force_need_sync_fence)
2397 need_sync_fence = true;
2398
2399 if (g->aggressive_sync_destroy_thresh) {
2400 nvgpu_mutex_acquire(&c->sync_lock);
2401 if (!c->sync) {
2402 c->sync = gk20a_channel_sync_create(c);
2403 if (!c->sync) {
2404 err = -ENOMEM;
2405 nvgpu_mutex_release(&c->sync_lock);
2406 goto fail;
2407 }
2408 new_sync_created = true;
2409 }
2410 nvgpu_atomic_inc(&c->sync->refcount);
2411 nvgpu_mutex_release(&c->sync_lock);
2412 }
2413
2414 if (g->ops.fifo.resetup_ramfc && new_sync_created) {
2415 err = g->ops.fifo.resetup_ramfc(c);
2416 if (err)
2417 goto fail;
2418 }
2419
2420 /*
2421 * Optionally insert syncpt wait in the beginning of gpfifo submission
2422 * when user requested and the wait hasn't expired. Validate that the id
2423 * makes sense, elide if not. The only reason this isn't being
2424 * unceremoniously killed is to keep running some tests which trigger
2425 * this condition.
2426 */
2427 if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
2428 job->pre_fence = gk20a_alloc_fence(c);
2429 if (!job->pre_fence) {
2430 err = -ENOMEM;
2431 goto fail;
2432 }
2433
2434 if (!pre_alloc_enabled)
2435 job->wait_cmd = nvgpu_kzalloc(g,
2436 sizeof(struct priv_cmd_entry));
2437
2438 if (!job->wait_cmd) {
2439 err = -ENOMEM;
2440 goto clean_up_pre_fence;
2441 }
2442
2443 if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) {
2444 wait_fence_fd = fence->id;
2445 err = c->sync->wait_fd(c->sync, wait_fence_fd,
2446 job->wait_cmd, job->pre_fence);
2447 } else {
2448 err = c->sync->wait_syncpt(c->sync, fence->id,
2449 fence->value, job->wait_cmd,
2450 job->pre_fence);
2451 }
2452
2453 if (!err) {
2454 if (job->wait_cmd->valid)
2455 *wait_cmd = job->wait_cmd;
2456 *pre_fence = job->pre_fence;
2457 } else
2458 goto clean_up_wait_cmd;
2459 }
2460
2461 if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) &&
2462 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE))
2463 need_sync_fence = true;
2464
2465 /*
2466 * Always generate an increment at the end of a GPFIFO submission. This
2467 * is used to keep track of method completion for idle railgating. The
2468 * sync_pt/semaphore PB is added to the GPFIFO later on in submit.
2469 */
2470 job->post_fence = gk20a_alloc_fence(c);
2471 if (!job->post_fence) {
2472 err = -ENOMEM;
2473 goto clean_up_wait_cmd;
2474 }
2475 if (!pre_alloc_enabled)
2476 job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry));
2477
2478 if (!job->incr_cmd) {
2479 err = -ENOMEM;
2480 goto clean_up_post_fence;
2481 }
2482
2483 if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
2484 err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd,
2485 job->post_fence, need_wfi, need_sync_fence,
2486 register_irq);
2487 else
2488 err = c->sync->incr(c->sync, job->incr_cmd,
2489 job->post_fence, need_sync_fence,
2490 register_irq);
2491 if (!err) {
2492 *incr_cmd = job->incr_cmd;
2493 *post_fence = job->post_fence;
2494 } else
2495 goto clean_up_incr_cmd;
2496
2497 return 0;
2498
2499clean_up_incr_cmd:
2500 free_priv_cmdbuf(c, job->incr_cmd);
2501 if (!pre_alloc_enabled)
2502 job->incr_cmd = NULL;
2503clean_up_post_fence:
2504 gk20a_fence_put(job->post_fence);
2505 job->post_fence = NULL;
2506clean_up_wait_cmd:
2507 free_priv_cmdbuf(c, job->wait_cmd);
2508 if (!pre_alloc_enabled)
2509 job->wait_cmd = NULL;
2510clean_up_pre_fence:
2511 gk20a_fence_put(job->pre_fence);
2512 job->pre_fence = NULL;
2513fail:
2514 *wait_cmd = NULL;
2515 *pre_fence = NULL;
2516 return err;
2517}
2518
2519int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
2520 struct nvgpu_gpfifo *gpfifo,
2521 struct nvgpu_submit_gpfifo_args *args,
2522 u32 num_entries,
2523 u32 flags,
2524 struct nvgpu_fence *fence,
2525 struct gk20a_fence **fence_out,
2526 bool force_need_sync_fence,
2527 struct fifo_profile_gk20a *profile)
2528{
2529 struct gk20a *g = c->g;
2530 struct priv_cmd_entry *wait_cmd = NULL;
2531 struct priv_cmd_entry *incr_cmd = NULL;
2532 struct gk20a_fence *pre_fence = NULL;
2533 struct gk20a_fence *post_fence = NULL;
2534 struct channel_gk20a_job *job = NULL;
2535 /* we might need two extra gpfifo entries - one for pre fence
2536 * and one for post fence. */
2537 const int extra_entries = 2;
2538 bool skip_buffer_refcounting = (flags &
2539 NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING);
2540 int err = 0;
2541 bool need_job_tracking;
2542 bool need_deferred_cleanup = false;
2543 struct nvgpu_gpfifo __user *user_gpfifo = args ?
2544 (struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo : NULL;
2545
2546 if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING))
2547 return -ENODEV;
2548
2549 if (c->has_timedout)
2550 return -ETIMEDOUT;
2551
2552 if (!nvgpu_mem_is_valid(&c->gpfifo.mem))
2553 return -ENOMEM;
2554
2555 /* fifo not large enough for request. Return error immediately.
2556 * Kernel can insert gpfifo entries before and after user gpfifos.
2557 * So, add extra_entries in user request. Also, HW with fifo size N
2558 * can accept only N-1 entreis and so the below condition */
2559 if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) {
2560 nvgpu_err(g, "not enough gpfifo space allocated");
2561 return -ENOMEM;
2562 }
2563
2564 if (!gpfifo && !args)
2565 return -EINVAL;
2566
2567 if ((flags & (NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT |
2568 NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) &&
2569 !fence)
2570 return -EINVAL;
2571
2572 /* an address space needs to have been bound at this point. */
2573 if (!gk20a_channel_as_bound(c)) {
2574 nvgpu_err(g,
2575 "not bound to an address space at time of gpfifo"
2576 " submission.");
2577 return -EINVAL;
2578 }
2579
2580 if (profile)
2581 profile->timestamp[PROFILE_ENTRY] = sched_clock();
2582
2583 /* update debug settings */
2584 nvgpu_ltc_sync_enabled(g);
2585
2586 gk20a_dbg_info("channel %d", c->chid);
2587
2588 /*
2589 * Job tracking is necessary for any of the following conditions:
2590 * - pre- or post-fence functionality
2591 * - channel wdt
2592 * - GPU rail-gating with non-deterministic channels
2593 * - buffer refcounting
2594 *
2595 * If none of the conditions are met, then job tracking is not
2596 * required and a fast submit can be done (ie. only need to write
2597 * out userspace GPFIFO entries and update GP_PUT).
2598 */
2599 need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
2600 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
2601 c->wdt_enabled ||
2602 (g->can_railgate && !c->deterministic) ||
2603 !skip_buffer_refcounting;
2604
2605 if (need_job_tracking) {
2606 bool need_sync_framework = false;
2607
2608 /*
2609 * If the channel is to have deterministic latency and
2610 * job tracking is required, the channel must have
2611 	 * pre-allocated resources. Otherwise, we fail the submit here.
2612 */
2613 if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c))
2614 return -EINVAL;
2615
2616 need_sync_framework = force_need_sync_fence ||
2617 gk20a_channel_sync_needs_sync_framework(g) ||
2618 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE &&
2619 (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT ||
2620 flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET));
2621
2622 /*
2623 * Deferred clean-up is necessary for any of the following
2624 * conditions:
2625 * - channel's deterministic flag is not set
2626 * - dependency on sync framework, which could make the
2627 * behavior of the clean-up operation non-deterministic
2628 * (should not be performed in the submit path)
2629 * - channel wdt
2630 * - GPU rail-gating with non-deterministic channels
2631 * - buffer refcounting
2632 *
2633 * If none of the conditions are met, then deferred clean-up
2634 	 * is not required, and we clean up one job-tracking
2635 * resource in the submit path.
2636 */
2637 need_deferred_cleanup = !c->deterministic ||
2638 need_sync_framework ||
2639 c->wdt_enabled ||
2640 (g->can_railgate &&
2641 !c->deterministic) ||
2642 !skip_buffer_refcounting;
2643
2644 /*
2645 	 * For deterministic channels, we don't allow deferred clean-up
2646 	 * processing to occur. If it would be needed, fail the submit.
2647 */
2648 if (c->deterministic && need_deferred_cleanup)
2649 return -EINVAL;
2650
2651 if (!c->deterministic) {
2652 /*
2653 * Get a power ref unless this is a deterministic
2654 * channel that holds them during the channel lifetime.
2655 * This one is released by gk20a_channel_clean_up_jobs,
2656 * via syncpt or sema interrupt, whichever is used.
2657 */
2658 err = gk20a_busy(g);
2659 if (err) {
2660 nvgpu_err(g,
2661 "failed to host gk20a to submit gpfifo, process %s",
2662 current->comm);
2663 return err;
2664 }
2665 }
2666
2667 if (!need_deferred_cleanup) {
2668 /* clean up a single job */
2669 gk20a_channel_clean_up_jobs(c, false);
2670 }
2671 }
2672
2673
2674 /* Grab access to HW to deal with do_idle */
2675 if (c->deterministic)
2676 nvgpu_rwsem_down_read(&g->deterministic_busy);
2677
2678 trace_gk20a_channel_submit_gpfifo(g->name,
2679 c->chid,
2680 num_entries,
2681 flags,
2682 fence ? fence->id : 0,
2683 fence ? fence->value : 0);
2684
2685 gk20a_dbg_info("pre-submit put %d, get %d, size %d",
2686 c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
2687
2688 /*
2689 * Make sure we have enough space for gpfifo entries. Check cached
2690 * values first and then read from HW. If no space, return EAGAIN
2691 	 * and let userspace decide whether to retry the request.
2692 */
2693 if (gp_free_count(c) < num_entries + extra_entries) {
2694 if (get_gp_free_count(c) < num_entries + extra_entries) {
2695 err = -EAGAIN;
2696 goto clean_up;
2697 }
2698 }
2699
2700 if (c->has_timedout) {
2701 err = -ETIMEDOUT;
2702 goto clean_up;
2703 }
2704
2705 if (need_job_tracking) {
2706 err = channel_gk20a_alloc_job(c, &job);
2707 if (err)
2708 goto clean_up;
2709
2710 err = gk20a_submit_prepare_syncs(c, fence, job,
2711 &wait_cmd, &incr_cmd,
2712 &pre_fence, &post_fence,
2713 force_need_sync_fence,
2714 need_deferred_cleanup,
2715 flags);
2716 if (err)
2717 goto clean_up_job;
2718 }
2719
2720 if (profile)
2721 profile->timestamp[PROFILE_JOB_TRACKING] = sched_clock();
2722
2723 if (wait_cmd)
2724 gk20a_submit_append_priv_cmdbuf(c, wait_cmd);
2725
2726 if (gpfifo || user_gpfifo)
2727 err = gk20a_submit_append_gpfifo(c, gpfifo, user_gpfifo,
2728 num_entries);
2729 if (err)
2730 goto clean_up_job;
2731
2732 /*
2733 * And here's where we add the incr_cmd we generated earlier. It should
2734 * always run!
2735 */
2736 if (incr_cmd)
2737 gk20a_submit_append_priv_cmdbuf(c, incr_cmd);
2738
2739 if (fence_out)
2740 *fence_out = gk20a_fence_get(post_fence);
2741
2742 if (need_job_tracking)
2743 /* TODO! Check for errors... */
2744 gk20a_channel_add_job(c, job, skip_buffer_refcounting);
2745 if (profile)
2746 profile->timestamp[PROFILE_APPEND] = sched_clock();
2747
2748 g->ops.fifo.userd_gp_put(g, c);
2749
2750 if ((NVGPU_SUBMIT_GPFIFO_FLAGS_RESCHEDULE_RUNLIST & flags) &&
2751 g->ops.fifo.reschedule_runlist)
2752 g->ops.fifo.reschedule_runlist(g, c->runlist_id);
2753
2754 /* No hw access beyond this point */
2755 if (c->deterministic)
2756 nvgpu_rwsem_up_read(&g->deterministic_busy);
2757
2758 trace_gk20a_channel_submitted_gpfifo(g->name,
2759 c->chid,
2760 num_entries,
2761 flags,
2762 post_fence ? post_fence->syncpt_id : 0,
2763 post_fence ? post_fence->syncpt_value : 0);
2764
2765 gk20a_dbg_info("post-submit put %d, get %d, size %d",
2766 c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
2767
2768 if (profile)
2769 profile->timestamp[PROFILE_END] = sched_clock();
2770 gk20a_dbg_fn("done");
2771 return err;
2772
2773clean_up_job:
2774 channel_gk20a_free_job(c, job);
2775clean_up:
2776 gk20a_dbg_fn("fail");
2777 gk20a_fence_put(pre_fence);
2778 gk20a_fence_put(post_fence);
2779 if (c->deterministic)
2780 nvgpu_rwsem_up_read(&g->deterministic_busy);
2781 else if (need_deferred_cleanup)
2782 gk20a_idle(g);
2783
2784 return err;
2785}
2786
2787/* 2145/*
2788 * Stop deterministic channel activity for do_idle() when power needs to go off 2146 * Stop deterministic channel activity for do_idle() when power needs to go off
2789 * momentarily but deterministic channels keep power refs for potentially a 2147 * momentarily but deterministic channels keep power refs for potentially a
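For readers following the submit path above: the "HW with fifo size N can accept only N-1 entries" comment and the gp_free_count()/get_gp_free_count() test are plain ring-buffer arithmetic on the cached put/get indices. The sketch below is illustrative only and assumes entry_num is a power of two; the struct and field names simply mirror the c->gpfifo usage visible in the diff and are not the driver's implementation verbatim.

/*
 * Free slots in a gpfifo ring of entry_num entries, given the cached
 * put (next slot the kernel writes) and get (next slot HW consumes)
 * indices.  One slot is always left unused so that put == get
 * unambiguously means "empty", which is why a ring of size N accepts
 * at most N-1 entries.
 */
struct gpfifo_ring {
	unsigned int put;
	unsigned int get;
	unsigned int entry_num;	/* total slots; assumed a power of two */
};

static unsigned int gpfifo_free_count(const struct gpfifo_ring *r)
{
	return (r->entry_num - (r->put - r->get) - 1) % r->entry_num;
}

/*
 * A submit of num_entries user entries plus extra_entries of kernel
 * priv cmdbufs (wait_cmd/incr_cmd) proceeds only if the free count
 * covers them; otherwise the submit returns -EAGAIN and userspace
 * may retry.
 */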
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 4b1cb351..cdf75a9a 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -24,6 +24,9 @@
24#ifndef CHANNEL_GK20A_H 24#ifndef CHANNEL_GK20A_H
25#define CHANNEL_GK20A_H 25#define CHANNEL_GK20A_H
26 26
27/* TODO: To be removed when work_struct update_fn_work is moved out of common code */
28#include <linux/workqueue.h>
29
27#include <linux/stacktrace.h> 30#include <linux/stacktrace.h>
28#include <nvgpu/list.h> 31#include <nvgpu/list.h>
29 32
@@ -374,16 +377,6 @@ struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g,
374 int runlist_id, 377 int runlist_id,
375 bool is_privileged_channel); 378 bool is_privileged_channel);
376 379
377int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
378 struct nvgpu_gpfifo *gpfifo,
379 struct nvgpu_submit_gpfifo_args *args,
380 u32 num_entries,
381 u32 flags,
382 struct nvgpu_fence *fence,
383 struct gk20a_fence **fence_out,
384 bool force_need_sync_fence,
385 struct fifo_profile_gk20a *profile);
386
387int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c, 380int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
388 unsigned int num_entries, 381 unsigned int num_entries,
389 unsigned int num_inflight_jobs, 382 unsigned int num_inflight_jobs,
@@ -408,4 +401,20 @@ int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
408void gk20a_channel_event_id_post_event(struct channel_gk20a *ch, 401void gk20a_channel_event_id_post_event(struct channel_gk20a *ch,
409 u32 event_id); 402 u32 event_id);
410 403
404int channel_gk20a_alloc_job(struct channel_gk20a *c,
405 struct channel_gk20a_job **job_out);
406void channel_gk20a_free_job(struct channel_gk20a *c,
407 struct channel_gk20a_job *job);
408u32 nvgpu_get_gp_free_count(struct channel_gk20a *c);
409u32 nvgpu_gp_free_count(struct channel_gk20a *c);
410int gk20a_channel_add_job(struct channel_gk20a *c,
411 struct channel_gk20a_job *job,
412 bool skip_buffer_refcounting);
413void free_priv_cmdbuf(struct channel_gk20a *c,
414 struct priv_cmd_entry *e);
415void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
416 bool clean_all);
417
418u32 nvgpu_get_gpfifo_entry_size(void);
419
411#endif /* CHANNEL_GK20A_H */ 420#endif /* CHANNEL_GK20A_H */
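The nvgpu_get_gpfifo_entry_size() hook declared at the end of this header is defined per OS rather than in common code. A minimal sketch of what a Linux-side definition could look like, assuming struct nvgpu_gpfifo is the per-entry layout on Linux and that <uapi/linux/nvgpu.h> is where it is defined (both are assumptions for illustration, not the committed code):

/*
 * Sketch of a per-OS definition on Linux: report the size of one
 * gpfifo entry so that OS-independent callers such as
 * gk20a_channel_alloc_gpfifo() need not know the entry layout.
 */
#include <linux/types.h>
#include <uapi/linux/nvgpu.h>	/* assumed home of struct nvgpu_gpfifo */

u32 nvgpu_get_gpfifo_entry_size(void)
{
	return sizeof(struct nvgpu_gpfifo);
}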