author     Lakshmanan M <lm@nvidia.com>            2016-06-29 06:36:39 -0400
committer  Vijayakumar Subbu <vsubbu@nvidia.com>   2016-07-20 06:09:28 -0400
commit     89aecd1202b49727e940069f2a6feb5c3cf4c927 (patch)
tree       8a0d3a493b389167ce1d93e55f23e114ec2cbd38
parent     f6ebdc5f2916706f7a61983567420e0985faeeb1 (diff)
gpu: nvgpu: Add nvgpu infra to allow kernel to create privileged CE channels
Added interface to allow kernel to create privileged CE channels for page
migration and clearing support between sysmem and vidmem.

JIRA DNVGPU-53

Change-Id: I3e18d18403809c9e64fa45d40b6c4e3844992506
Signed-off-by: Lakshmanan M <lm@nvidia.com>
Reviewed-on: http://git-master/r/1173085
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
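For reference, below is a minimal sketch of how an in-kernel client might drive the new CE interface, mirroring the vidmem page-clearing usage this change adds to mm_gk20a.c. The function name example_clear_vidmem_buf, its parameters and the error handling are illustrative assumptions and are not part of this change; the gk20a_ce_* calls and flags are the ones introduced in ce2_gk20a.h below.

/*
 * Illustrative sketch (not part of this change): clear a vidmem buffer
 * through the privileged kernel CE channel. The helper name, parameters
 * and locals are assumptions for the example.
 */
static int example_clear_vidmem_buf(struct gk20a *g, u64 dst_gpu_va, u64 size)
{
	struct gk20a_fence *fence_out = NULL;
	u32 ctx_id;
	int err;

	/* -1 keeps the channel defaults for priority, timeslice and
	 * runlist interleave level; the fast CE runlist helper picks the
	 * last available async copy engine. */
	ctx_id = gk20a_ce_create_context_with_cb(g->dev,
			gk20a_fifo_get_fast_ce_runlist_id(g),
			-1, -1, -1, NULL);
	if (ctx_id == (u32)~0)
		return -ENOMEM;

	/* Memset the destination in vidmem; payload is the fill pattern. */
	err = gk20a_ce_execute_ops(g->dev, ctx_id,
			0,			/* src_buf unused for memset */
			dst_gpu_va,
			size,
			0x00000000,		/* payload */
			NVGPU_CE_DST_LOCATION_LOCAL_FB,
			NVGPU_CE_MEMSET,
			NULL,			/* no pre-fence */
			0,			/* submit_flags */
			&fence_out);
	if (!err && fence_out) {
		err = gk20a_fence_wait(fence_out, gk20a_get_gr_idle_timeout(g));
		gk20a_fence_put(fence_out);
	}

	gk20a_ce_delete_context(g->dev, ctx_id);
	return err;
}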
-rw-r--r--  drivers/gpu/nvgpu/gk20a/cde_gk20a.c                4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ce2_gk20a.c              617
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ce2_gk20a.h              124
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c            8
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.h            6
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.c              27
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.h               2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.c                   13
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h                    2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c                86
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h                 6
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a.h           2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c     4
-rw-r--r--  drivers/gpu/nvgpu/pci.c                            2
14 files changed, 896 insertions, 7 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index 4b84dc69..f5b68e72 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -1186,7 +1186,9 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 	}
 
 	ch = gk20a_open_new_channel_with_cb(g, gk20a_cde_finished_ctx_cb,
-			cde_ctx);
+			cde_ctx,
+			-1,
+			false);
 	if (!ch) {
 		gk20a_warn(cde_ctx->dev, "cde: gk20a channel not available");
 		err = -ENOMEM;
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index 96d38b11..e2f2d9e9 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -24,6 +24,7 @@
24#include <trace/events/gk20a.h>
25#include <linux/dma-mapping.h>
26#include <linux/nvhost.h>
27#include <linux/debugfs.h>
28
29#include "gk20a.h"
30#include "debug_gk20a.h"
@@ -96,3 +97,619 @@ void gk20a_init_ce2(struct gpu_ops *gops)
97	gops->ce2.isr_stall = gk20a_ce2_isr;
98	gops->ce2.isr_nonstall = gk20a_ce2_nonstall_isr;
99}
100
101/* static CE app api */
102static void gk20a_ce_notify_all_user(struct gk20a *g, u32 event)
103{
104 struct gk20a_ce_app *ce_app = &g->ce_app;
105 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
106
107 if (!ce_app->initialised)
108 return;
109
110 mutex_lock(&ce_app->app_mutex);
111
112 list_for_each_entry_safe(ce_ctx, ce_ctx_save,
113 &ce_app->allocated_contexts, list) {
114 if (ce_ctx->user_event_callback) {
115 ce_ctx->user_event_callback(ce_ctx->ctx_id,
116 event);
117 }
118 }
119
120 mutex_unlock(&ce_app->app_mutex);
121}
122
123static void gk20a_ce_finished_ctx_cb(struct channel_gk20a *ch, void *data)
124{
125 struct gk20a_gpu_ctx *ce_ctx = data;
126 bool channel_idle;
127 u32 event;
128
129 mutex_lock(&ch->jobs_lock);
130 channel_idle = list_empty(&ch->jobs);
131 mutex_unlock(&ch->jobs_lock);
132
133 if (!channel_idle)
134 return;
135
136 gk20a_dbg(gpu_dbg_fn, "ce: finished %p", ce_ctx);
137
138 if (ch->has_timedout)
139 event = NVGPU_CE_CONTEXT_JOB_TIMEDOUT;
140 else
141 event = NVGPU_CE_CONTEXT_JOB_COMPLETED;
142
143 if (ce_ctx->user_event_callback)
144 ce_ctx->user_event_callback(ce_ctx->ctx_id,
145 event);
146
147 ++ce_ctx->completed_seq_number;
148}
149
150static void gk20a_ce_free_command_buffer_stored_fence(struct gk20a_gpu_ctx *ce_ctx)
151{
152 u32 cmd_buf_index;
153 u32 cmd_buf_read_offset;
154 u32 fence_index;
155 u32 *cmd_buf_cpu_va;
156
157 for (cmd_buf_index = 0;
158 cmd_buf_index < ce_ctx->cmd_buf_end_queue_offset;
159 cmd_buf_index++) {
160 cmd_buf_read_offset = (cmd_buf_index *
161 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
162
163 /* at end of command buffer has gk20a_fence for command buffer sync */
164 fence_index = (cmd_buf_read_offset +
165 ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
166 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
167
168 cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
169
170 /* 0 is treated as invalid pre-sync */
171 if (cmd_buf_cpu_va[fence_index]) {
172 struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
173
174 memcpy((void *)&ce_cmd_buf_fence_in,
175 (void *)(cmd_buf_cpu_va + fence_index),
176 sizeof(struct gk20a_fence *));
177 gk20a_fence_put(ce_cmd_buf_fence_in);
178 /* Reset the stored last pre-sync */
179 memset((void *)(cmd_buf_cpu_va + fence_index),
180 0,
181 NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
182 }
183 }
184}
185
186/* assume this api should need to call under mutex_lock(&ce_app->app_mutex) */
187static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
188{
189 ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_DELETED;
190
191 mutex_lock(&ce_ctx->gpu_ctx_mutex);
192
193 gk20a_ce_free_command_buffer_stored_fence(ce_ctx);
194
195 gk20a_gmmu_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
196
197 /* free the channel */
198 if (ce_ctx->ch)
199 gk20a_channel_close(ce_ctx->ch);
200
201 /* housekeeping on app */
202 list_del(&ce_ctx->list);
203
204 mutex_unlock(&ce_ctx->gpu_ctx_mutex);
205 mutex_destroy(&ce_ctx->gpu_ctx_mutex);
206
207 kfree(ce_ctx);
208}
209
210static inline int gk20a_ce_get_method_size(int request_operation)
211{
212 /* failure size */
213 int methodsize = ~0;
214
215 if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER)
216 methodsize = 10 * 2 * sizeof(u32);
217 else if (request_operation & NVGPU_CE_MEMSET)
218 methodsize = 9 * 2 * sizeof(u32);
219
220 return methodsize;
221}
222
223static inline int gk20a_get_valid_launch_flags(struct gk20a *g, int launch_flags)
224{
225 /* there is no local memory available,
226 don't allow local memory related CE flags */
227 if (!g->mm.vidmem_size) {
228 launch_flags &= ~(NVGPU_CE_SRC_LOCATION_LOCAL_FB |
229 NVGPU_CE_DST_LOCATION_LOCAL_FB);
230 }
231 return launch_flags;
232}
233
234static int gk20a_ce_prepare_submit(u64 src_buf,
235 u64 dst_buf,
236 u64 size,
237 u32 *cmd_buf_cpu_va,
238 u32 max_cmd_buf_size,
239 unsigned int payload,
240 int launch_flags,
241 int request_operation,
242 u32 dma_copy_class,
243 struct gk20a_fence *gk20a_fence_in)
244{
245 u32 launch = 0;
246 u32 methodSize = 0;
247
248 /* failure case handling */
249 if ((gk20a_ce_get_method_size(request_operation) > max_cmd_buf_size) ||
250 (!size) ||
251 (request_operation > NVGPU_CE_MEMSET))
252 return 0;
253
254 /* set the channel object */
255 cmd_buf_cpu_va[methodSize++] = 0x20018000;
256 cmd_buf_cpu_va[methodSize++] = dma_copy_class;
257
258 if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) {
259 /* setup the source */
260 cmd_buf_cpu_va[methodSize++] = 0x20018101;
261 cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf) &
262 NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
263
264 cmd_buf_cpu_va[methodSize++] = 0x20018100;
265 cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf) &
266 NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
267
268 cmd_buf_cpu_va[methodSize++] = 0x20018098;
269 if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) {
270 cmd_buf_cpu_va[methodSize++] = 0x00000000;
271 } else if (launch_flags & NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) {
272 cmd_buf_cpu_va[methodSize++] = 0x00000002;
273 } else {
274 cmd_buf_cpu_va[methodSize++] = 0x00000001;
275 }
276
277 launch |= 0x00001000;
278 } else if (request_operation & NVGPU_CE_MEMSET) {
279 cmd_buf_cpu_va[methodSize++] = 0x200181c2;
280 cmd_buf_cpu_va[methodSize++] = 0x00030004;
281
282 cmd_buf_cpu_va[methodSize++] = 0x200181c0;
283 cmd_buf_cpu_va[methodSize++] = payload;
284
285 launch |= 0x00000400;
286
287 /* converted into number of words */
288 size /= sizeof(u32);
289 }
290
291 /* setup the destination/output */
292 cmd_buf_cpu_va[methodSize++] = 0x20018103;
293 cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
294
295 cmd_buf_cpu_va[methodSize++] = 0x20018102;
296 cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
297
298 cmd_buf_cpu_va[methodSize++] = 0x20018099;
299 if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) {
300 cmd_buf_cpu_va[methodSize++] = 0x00000000;
301 } else if (launch_flags & NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) {
302 cmd_buf_cpu_va[methodSize++] = 0x00000002;
303 } else {
304 cmd_buf_cpu_va[methodSize++] = 0x00000001;
305 }
306
307 launch |= 0x00002000;
308
309 /* setup the format */
310 cmd_buf_cpu_va[methodSize++] = 0x20018107;
311 cmd_buf_cpu_va[methodSize++] = 1;
312 cmd_buf_cpu_va[methodSize++] = 0x20018106;
313 cmd_buf_cpu_va[methodSize++] = u64_lo32(size);
314
315 launch |= 0x00000004;
316
317 if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR)
318 launch |= 0x00000000;
319 else
320 launch |= 0x00000080;
321
322 if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR)
323 launch |= 0x00000000;
324 else
325 launch |= 0x00000100;
326
327 if (launch_flags & NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED)
328 launch |= 0x00000002;
329 else
330 launch |= 0x00000001;
331
332 cmd_buf_cpu_va[methodSize++] = 0x200180c0;
333 cmd_buf_cpu_va[methodSize++] = launch;
334
335 return methodSize;
336}
337
338/* global CE app related apis */
339int gk20a_init_ce_support(struct gk20a *g)
340{
341 struct gk20a_ce_app *ce_app = &g->ce_app;
342
343 if (ce_app->initialised) {
344 /* assume this happen during poweron/poweroff GPU sequence */
345 ce_app->app_state = NVGPU_CE_ACTIVE;
346 gk20a_ce_notify_all_user(g, NVGPU_CE_CONTEXT_RESUME);
347 return 0;
348 }
349
350 gk20a_dbg(gpu_dbg_fn, "ce: init");
351
352 mutex_init(&ce_app->app_mutex);
353 mutex_lock(&ce_app->app_mutex);
354
355 INIT_LIST_HEAD(&ce_app->allocated_contexts);
356 ce_app->ctx_count = 0;
357 ce_app->next_ctx_id = 0;
358 ce_app->initialised = true;
359 ce_app->app_state = NVGPU_CE_ACTIVE;
360
361 mutex_unlock(&ce_app->app_mutex);
362 gk20a_dbg(gpu_dbg_cde_ctx, "ce: init finished");
363
364 return 0;
365}
366
367void gk20a_ce_destroy(struct gk20a *g)
368{
369 struct gk20a_ce_app *ce_app = &g->ce_app;
370 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
371
372 if (!ce_app->initialised)
373 return;
374
375 ce_app->app_state = NVGPU_CE_SUSPEND;
376 ce_app->initialised = false;
377
378 mutex_lock(&ce_app->app_mutex);
379
380 list_for_each_entry_safe(ce_ctx, ce_ctx_save,
381 &ce_app->allocated_contexts, list) {
382 gk20a_ce_delete_gpu_context(ce_ctx);
383 }
384
385 INIT_LIST_HEAD(&ce_app->allocated_contexts);
386 ce_app->ctx_count = 0;
387 ce_app->next_ctx_id = 0;
388
389 mutex_unlock(&ce_app->app_mutex);
390 mutex_destroy(&ce_app->app_mutex);
391}
392
393void gk20a_ce_suspend(struct gk20a *g)
394{
395 struct gk20a_ce_app *ce_app = &g->ce_app;
396
397 if (!ce_app->initialised)
398 return;
399
400 ce_app->app_state = NVGPU_CE_SUSPEND;
401 gk20a_ce_notify_all_user(g, NVGPU_CE_CONTEXT_SUSPEND);
402
403 return;
404}
405
406/* CE app utility functions */
407u32 gk20a_ce_create_context_with_cb(struct device *dev,
408 int runlist_id,
409 int priority,
410 int timeslice,
411 int runlist_level,
412 ce_event_callback user_event_callback)
413{
414 struct gk20a_gpu_ctx *ce_ctx;
415 struct gk20a *g = gk20a_from_dev(dev);
416 struct gk20a_ce_app *ce_app = &g->ce_app;
417 u32 ctx_id = ~0;
418 int err = 0;
419
420 if (!ce_app->initialised || ce_app->app_state != NVGPU_CE_ACTIVE)
421 return ctx_id;
422
423 ce_ctx = kzalloc(sizeof(*ce_ctx), GFP_KERNEL);
424 if (!ce_ctx)
425 return ctx_id;
426
427 mutex_init(&ce_ctx->gpu_ctx_mutex);
428
429 ce_ctx->g = g;
430 ce_ctx->dev = g->dev;
431 ce_ctx->user_event_callback = user_event_callback;
432
433 ce_ctx->cmd_buf_read_queue_offset = 0;
434 ce_ctx->cmd_buf_end_queue_offset =
435 (NVGPU_CE_COMMAND_BUF_SIZE / NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF);
436
437 ce_ctx->submitted_seq_number = 0;
438 ce_ctx->completed_seq_number = 0;
439
440 /* always kernel client needs privileged channel */
441 ce_ctx->ch = gk20a_open_new_channel_with_cb(g, gk20a_ce_finished_ctx_cb,
442 ce_ctx,
443 runlist_id,
444 true);
445 if (!ce_ctx->ch) {
446 gk20a_err(ce_ctx->dev, "ce: gk20a channel not available");
447 goto end;
448 }
449
450 /* bind the channel to the vm */
451 gk20a_vm_get(&g->mm.ce.vm);
452 ce_ctx->vm = ce_ctx->ch->vm = &g->mm.ce.vm;
453 err = channel_gk20a_commit_va(ce_ctx->ch);
454 if (err) {
455 gk20a_err(ce_ctx->dev, "ce: could not bind vm");
456 goto end;
457 }
458
459 /* allocate gpfifo (1024 should be more than enough) */
460 err = gk20a_alloc_channel_gpfifo(ce_ctx->ch,
461 &(struct nvgpu_alloc_gpfifo_args){1024, 0});
462 if (err) {
463 gk20a_err(ce_ctx->dev, "ce: unable to allocate gpfifo");
464 goto end;
465 }
466
467 /* allocate command buffer (4096 should be more than enough) from sysmem*/
468 err = gk20a_gmmu_alloc_map_sys(ce_ctx->vm, NVGPU_CE_COMMAND_BUF_SIZE, &ce_ctx->cmd_buf_mem);
469 if (err) {
470 gk20a_err(ce_ctx->dev,
471 "ce: could not allocate command buffer for CE context");
472 goto end;
473 }
474
475 memset(ce_ctx->cmd_buf_mem.cpu_va, 0x00, ce_ctx->cmd_buf_mem.size);
476
477 /* -1 means default channel priority */
478 if (priority != -1) {
479 err = gk20a_channel_set_priority(ce_ctx->ch, priority);
480 if (err) {
481 gk20a_err(ce_ctx->dev,
482 "ce: could not set the channel priority for CE context");
483 goto end;
484 }
485 }
486
487 /* -1 means default channel timeslice value */
488 if (timeslice != -1) {
489 err = gk20a_channel_set_timeslice(ce_ctx->ch, timeslice);
490 if (err) {
491 gk20a_err(ce_ctx->dev,
492 "ce: could not set the channel timeslice value for CE context");
493 goto end;
494 }
495 }
496
497 /* -1 means default channel runlist level */
498 if (runlist_level != -1) {
499 err = gk20a_channel_set_runlist_interleave(ce_ctx->ch, runlist_level);
500 if (err) {
501 gk20a_err(ce_ctx->dev,
502 "ce: could not set the runlist interleave for CE context");
503 goto end;
504 }
505 }
506
507 mutex_lock(&ce_app->app_mutex);
508 ctx_id = ce_ctx->ctx_id = ce_app->next_ctx_id;
509 list_add(&ce_ctx->list, &ce_app->allocated_contexts);
510 ++ce_app->next_ctx_id;
511 ++ce_app->ctx_count;
512 mutex_unlock(&ce_app->app_mutex);
513
514 ce_ctx->gpu_ctx_state = NVGPU_CE_GPU_CTX_ALLOCATED;
515
516end:
517 if (ctx_id == ~0) {
518 mutex_lock(&ce_app->app_mutex);
519 gk20a_ce_delete_gpu_context(ce_ctx);
520 mutex_unlock(&ce_app->app_mutex);
521 }
522 return ctx_id;
523
524}
525EXPORT_SYMBOL(gk20a_ce_create_context_with_cb);
526
527int gk20a_ce_execute_ops(struct device *dev,
528 u32 ce_ctx_id,
529 u64 src_buf,
530 u64 dst_buf,
531 u64 size,
532 unsigned int payload,
533 int launch_flags,
534 int request_operation,
535 struct gk20a_fence *gk20a_fence_in,
536 u32 submit_flags,
537 struct gk20a_fence **gk20a_fence_out)
538{
539 int ret = -EPERM;
540 struct gk20a *g = gk20a_from_dev(dev);
541 struct gk20a_ce_app *ce_app = &g->ce_app;
542 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
543 bool found = false;
544 u32 *cmd_buf_cpu_va;
545 u64 cmd_buf_gpu_va = 0;
546 u32 methodSize;
547 u32 cmd_buf_read_offset;
548 u32 fence_index;
549 struct nvgpu_gpfifo gpfifo;
550 struct nvgpu_fence fence = {0,0};
551 struct gk20a_fence *ce_cmd_buf_fence_out = NULL;
552 struct nvgpu_gpu_characteristics *gpu_capability = &g->gpu_characteristics;
553
554 if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE)
555 goto end;
556
557 mutex_lock(&ce_app->app_mutex);
558
559 list_for_each_entry_safe(ce_ctx, ce_ctx_save,
560 &ce_app->allocated_contexts, list) {
561 if (ce_ctx->ctx_id == ce_ctx_id) {
562 found = true;
563 break;
564 }
565 }
566
567 mutex_unlock(&ce_app->app_mutex);
568
569 if (!found) {
570 ret = -EINVAL;
571 goto end;
572 }
573
574 if (ce_ctx->gpu_ctx_state != NVGPU_CE_GPU_CTX_ALLOCATED) {
575 ret = -ENODEV;
576 goto end;
577 }
578
579 mutex_lock(&ce_ctx->gpu_ctx_mutex);
580
581 ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset;
582
583 cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset *
584 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)));
585
586 /* at end of command buffer has gk20a_fence for command buffer sync */
587 fence_index = (cmd_buf_read_offset +
588 ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) -
589 (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32))));
590
591 if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) {
592 ret = -ENOMEM;
593 goto noop;
594 }
595
596 cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va;
597
598 /* 0 is treated as invalid pre-sync */
599 if (cmd_buf_cpu_va[fence_index]) {
600 struct gk20a_fence * ce_cmd_buf_fence_in = NULL;
601
602 memcpy((void *)&ce_cmd_buf_fence_in,
603 (void *)(cmd_buf_cpu_va + fence_index),
604 sizeof(struct gk20a_fence *));
605 ret = gk20a_fence_wait(ce_cmd_buf_fence_in, gk20a_get_gr_idle_timeout(g));
606
607 gk20a_fence_put(ce_cmd_buf_fence_in);
608 /* Reset the stored last pre-sync */
609 memset((void *)(cmd_buf_cpu_va + fence_index),
610 0,
611 NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING);
612 if (ret)
613 goto noop;
614 }
615
616 cmd_buf_gpu_va = (ce_ctx->cmd_buf_mem.gpu_va + (u64)(cmd_buf_read_offset *sizeof(u32)));
617
618 methodSize = gk20a_ce_prepare_submit(src_buf,
619 dst_buf,
620 size,
621 &cmd_buf_cpu_va[cmd_buf_read_offset],
622 NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF,
623 payload,
624 gk20a_get_valid_launch_flags(g, launch_flags),
625 request_operation,
626 gpu_capability->dma_copy_class,
627 gk20a_fence_in);
628
629 if (methodSize) {
630 /* TODO: Remove CPU pre-fence wait */
631 if (gk20a_fence_in) {
632 ret = gk20a_fence_wait(gk20a_fence_in, gk20a_get_gr_idle_timeout(g));
633 gk20a_fence_put(gk20a_fence_in);
634 if (ret)
635 goto noop;
636 }
637
638 /* store the element into gpfifo */
639 gpfifo.entry0 =
640 u64_lo32(cmd_buf_gpu_va);
641 gpfifo.entry1 =
642 (u64_hi32(cmd_buf_gpu_va) |
643 pbdma_gp_entry1_length_f(methodSize));
644
645 /* take always the postfence as it is needed for protecting the ce context */
646 submit_flags |= NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET;
647
648 wmb();
649
650 ret = gk20a_submit_channel_gpfifo(ce_ctx->ch, &gpfifo, NULL,
651 1, submit_flags, &fence, &ce_cmd_buf_fence_out, true);
652
653 if (!ret) {
654 memcpy((void *)(cmd_buf_cpu_va + fence_index),
655 (void *)&ce_cmd_buf_fence_out,
656 sizeof(struct gk20a_fence *));
657
658 if (gk20a_fence_out) {
659 gk20a_fence_get(ce_cmd_buf_fence_out);
660 *gk20a_fence_out = ce_cmd_buf_fence_out;
661 }
662
663 /* Next available command buffer queue Index */
664 ++ce_ctx->cmd_buf_read_queue_offset;
665 ++ce_ctx->submitted_seq_number;
666 }
667 } else
668 ret = -ENOMEM;
669noop:
670 mutex_unlock(&ce_ctx->gpu_ctx_mutex);
671end:
672 return ret;
673}
674EXPORT_SYMBOL(gk20a_ce_execute_ops);
675
676void gk20a_ce_delete_context(struct device *dev,
677 u32 ce_ctx_id)
678{
679 struct gk20a *g = gk20a_from_dev(dev);
680 struct gk20a_ce_app *ce_app = &g->ce_app;
681 struct gk20a_gpu_ctx *ce_ctx, *ce_ctx_save;
682
683 if (!ce_app->initialised ||ce_app->app_state != NVGPU_CE_ACTIVE)
684 return;
685
686 mutex_lock(&ce_app->app_mutex);
687
688 list_for_each_entry_safe(ce_ctx, ce_ctx_save,
689 &ce_app->allocated_contexts, list) {
690 if (ce_ctx->ctx_id == ce_ctx_id) {
691 gk20a_ce_delete_gpu_context(ce_ctx);
692 --ce_app->ctx_count;
693 break;
694 }
695 }
696
697 mutex_unlock(&ce_app->app_mutex);
698 return;
699}
700EXPORT_SYMBOL(gk20a_ce_delete_context);
701
702#ifdef CONFIG_DEBUG_FS
703void gk20a_ce_debugfs_init(struct device *dev)
704{
705 struct gk20a_platform *platform = dev_get_drvdata(dev);
706 struct gk20a *g = get_gk20a(dev);
707
708 debugfs_create_u32("ce_app_ctx_count", S_IWUSR | S_IRUGO,
709 platform->debugfs, &g->ce_app.ctx_count);
710 debugfs_create_u32("ce_app_state", S_IWUSR | S_IRUGO,
711 platform->debugfs, &g->ce_app.app_state);
712 debugfs_create_u32("ce_app_next_ctx_id", S_IWUSR | S_IRUGO,
713 platform->debugfs, &g->ce_app.next_ctx_id);
714}
715#endif
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
index 5ceb69e1..3b53834d 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h
@@ -28,4 +28,128 @@ void gk20a_init_ce2(struct gpu_ops *gops);
28void gk20a_ce2_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
29void gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
30
31/* CE command utility macros */
32#define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff
33#define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff
34
35#define NVGPU_CE_COMMAND_BUF_SIZE 4096
36#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 128
37#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING 8
38
39typedef void (*ce_event_callback)(u32 ce_ctx_id, u32 ce_event_flag);
40
41/* dma launch_flags */
42enum {
43 /* location */
44 NVGPU_CE_SRC_LOCATION_COHERENT_SYSMEM = (1 << 0),
45 NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM = (1 << 1),
46 NVGPU_CE_SRC_LOCATION_LOCAL_FB = (1 << 2),
47 NVGPU_CE_DST_LOCATION_COHERENT_SYSMEM = (1 << 3),
48 NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM = (1 << 4),
49 NVGPU_CE_DST_LOCATION_LOCAL_FB = (1 << 5),
50
51 /* memory layout */
52 NVGPU_CE_SRC_MEMORY_LAYOUT_PITCH = (1 << 6),
53 NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR = (1 << 7),
54 NVGPU_CE_DST_MEMORY_LAYOUT_PITCH = (1 << 8),
55 NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR = (1 << 9),
56
57 /* transfer type */
58 NVGPU_CE_DATA_TRANSFER_TYPE_PIPELINED = (1 << 10),
59 NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED = (1 << 11),
60};
61
62/* CE operation mode */
63enum {
64 NVGPU_CE_PHYS_MODE_TRANSFER = (1 << 0),
65 NVGPU_CE_MEMSET = (1 << 1),
66};
67
68/* CE event flags */
69enum {
70 NVGPU_CE_CONTEXT_JOB_COMPLETED = (1 << 0),
71 NVGPU_CE_CONTEXT_JOB_TIMEDOUT = (1 << 1),
72 NVGPU_CE_CONTEXT_SUSPEND = (1 << 2),
73 NVGPU_CE_CONTEXT_RESUME = (1 << 3),
74};
75
76/* CE app state machine flags */
77enum {
78 NVGPU_CE_ACTIVE = (1 << 0),
79 NVGPU_CE_SUSPEND = (1 << 1),
80};
81
82/* gpu context state machine flags */
83enum {
84 NVGPU_CE_GPU_CTX_ALLOCATED = (1 << 0),
85 NVGPU_CE_GPU_CTX_DELETED = (1 << 1),
86};
87
88/* global ce app db */
89struct gk20a_ce_app {
90 bool initialised;
91 struct mutex app_mutex;
92 int app_state;
93
94 struct list_head allocated_contexts;
95 u32 ctx_count;
96 u32 next_ctx_id;
97};
98
99/* ce context db */
100struct gk20a_gpu_ctx {
101 struct gk20a *g;
102 struct device *dev;
103 u32 ctx_id;
104 struct mutex gpu_ctx_mutex;
105 int gpu_ctx_state;
106 ce_event_callback user_event_callback;
107
108 /* channel related data */
109 struct channel_gk20a *ch;
110 struct vm_gk20a *vm;
111
112 /* cmd buf mem_desc */
113 struct mem_desc cmd_buf_mem;
114
115 struct list_head list;
116
117 u64 submitted_seq_number;
118 u64 completed_seq_number;
119
120 u32 cmd_buf_read_queue_offset;
121 u32 cmd_buf_end_queue_offset;
122};
123
124/* global CE app related apis */
125int gk20a_init_ce_support(struct gk20a *g);
126void gk20a_ce_suspend(struct gk20a *g);
127void gk20a_ce_destroy(struct gk20a *g);
128
129/* CE app utility functions */
130u32 gk20a_ce_create_context_with_cb(struct device *dev,
131 int runlist_id,
132 int priority,
133 int timeslice,
134 int runlist_level,
135 ce_event_callback user_event_callback);
136int gk20a_ce_execute_ops(struct device *dev,
137 u32 ce_ctx_id,
138 u64 src_buf,
139 u64 dst_buf,
140 u64 size,
141 unsigned int payload,
142 int launch_flags,
143 int request_operation,
144 struct gk20a_fence *gk20a_fence_in,
145 u32 submit_flags,
146 struct gk20a_fence **gk20a_fence_out);
147void gk20a_ce_delete_context(struct device *dev,
148 u32 ce_ctx_id);
149
150#ifdef CONFIG_DEBUG_FS
151/* CE app debugfs api */
152void gk20a_ce_debugfs_init(struct device *dev);
153#endif
154
155#endif /*__CE2_GK20A_H__*/
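As a further illustration of the launch flags and operation modes declared in this header, a hypothetical physical-mode copy from non-coherent sysmem into vidmem could compose them as sketched below. The function name example_migrate_page_to_vidmem and the migrate_ctx_id, src_pa, dst_pa and size parameters are assumptions for the example, not part of this change.

/* Illustrative sketch only (not part of this change): a physical-mode copy
 * from non-coherent sysmem into vidmem on an already-created kernel CE
 * context. Names and parameters are assumptions for the example. */
static int example_migrate_page_to_vidmem(struct gk20a *g, u32 migrate_ctx_id,
		u64 src_pa, u64 dst_pa, u64 size)
{
	struct gk20a_fence *copy_done = NULL;
	int launch_flags = NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM |
			NVGPU_CE_DST_LOCATION_LOCAL_FB |
			NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED;
	int err;

	err = gk20a_ce_execute_ops(g->dev, migrate_ctx_id,
			src_pa, dst_pa, size,
			0x00000000,		/* payload is only used for memset */
			launch_flags,
			NVGPU_CE_PHYS_MODE_TRANSFER,
			NULL,			/* no pre-fence */
			0,			/* submit_flags */
			&copy_done);
	if (err)
		return err;

	/* gk20a_ce_execute_ops always requests a post-fence internally. */
	if (copy_done) {
		err = gk20a_fence_wait(copy_done, gk20a_get_gr_idle_timeout(g));
		gk20a_fence_put(copy_done);
	}
	return err;
}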
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index d5457d10..447fe86a 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -702,7 +702,7 @@ static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch,
 	return 0;
 }
 
-static int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
+int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
 		u32 level)
 {
 	struct gk20a *g = ch->g;
@@ -1113,9 +1113,11 @@ static void gk20a_channel_update_runcb_fn(struct work_struct *work)
 
 struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g,
 		void (*update_fn)(struct channel_gk20a *, void *),
-		void *update_fn_data)
+		void *update_fn_data,
+		int runlist_id,
+		bool is_privileged_channel)
 {
-	struct channel_gk20a *ch = gk20a_open_new_channel(g, -1, false);
+	struct channel_gk20a *ch = gk20a_open_new_channel(g, runlist_id, is_privileged_channel);
 
 	if (ch) {
 		spin_lock(&ch->update_fn_lock);
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 4b5fe1b3..971175f2 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -265,7 +265,9 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
 		bool is_privileged_channel);
 struct channel_gk20a *gk20a_open_new_channel_with_cb(struct gk20a *g,
 		void (*update_fn)(struct channel_gk20a *, void *),
-		void *update_fn_data);
+		void *update_fn_data,
+		int runlist_id,
+		bool is_privileged_channel);
 void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a);
 
 int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
@@ -295,6 +297,8 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g,
 		int *__timeslice_timeout, int *__timeslice_scale);
 int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority);
 int gk20a_channel_set_timeslice(struct channel_gk20a *ch, u32 timeslice);
+int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
+		u32 level);
 void gk20a_channel_event_id_post_event(struct channel_gk20a *ch,
 		int event_id);
 
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 5133f86a..3dd7cb02 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -165,6 +165,33 @@ u32 gk20a_fifo_get_all_ce_engine_reset_mask(struct gk20a *g)
 	return reset_mask;
 }
 
+u32 gk20a_fifo_get_fast_ce_runlist_id(struct gk20a *g)
+{
+	u32 ce_runlist_id = gk20a_fifo_get_gr_runlist_id(g);
+	u32 engine_enum = ENGINE_INVAL_GK20A;
+	struct fifo_gk20a *f = NULL;
+	u32 engine_id_idx;
+	struct fifo_engine_info_gk20a *engine_info;
+	u32 active_engine_id = 0;
+
+	if (!g)
+		return ce_runlist_id;
+
+	f = &g->fifo;
+
+	for (engine_id_idx = 0; engine_id_idx < f->num_engines; ++engine_id_idx) {
+		active_engine_id = f->active_engines_list[engine_id_idx];
+		engine_info = &f->engine_info[active_engine_id];
+		engine_enum = engine_info->engine_enum;
+
+		/* selecet last available ASYNC_CE if available */
+		if (engine_enum == ENGINE_ASYNC_CE_GK20A)
+			ce_runlist_id = engine_info->runlist_id;
+	}
+
+	return ce_runlist_id;
+}
+
 u32 gk20a_fifo_get_gr_runlist_id(struct gk20a *g)
 {
 	u32 gr_engine_cnt = 0;
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 3473bc78..33d6d39c 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -244,6 +244,8 @@ u32 gk20a_fifo_get_gr_engine_id(struct gk20a *g);
 
 u32 gk20a_fifo_get_all_ce_engine_reset_mask(struct gk20a *g);
 
+u32 gk20a_fifo_get_fast_ce_runlist_id(struct gk20a *g);
+
 u32 gk20a_fifo_get_gr_runlist_id(struct gk20a *g);
 
 bool gk20a_fifo_is_valid_runlist_id(struct gk20a *g, u32 runlist_id);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 50f67262..04f82033 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -773,6 +773,7 @@ static int gk20a_pm_prepare_poweroff(struct device *dev)
 {
 	struct gk20a *g = get_gk20a(dev);
 	int ret = 0;
+	struct gk20a_platform *platform = gk20a_get_platform(dev);
 
 	gk20a_dbg_fn("");
 
@@ -786,6 +787,9 @@ static int gk20a_pm_prepare_poweroff(struct device *dev)
 	/* cancel any pending cde work */
 	gk20a_cde_suspend(g);
 
+	if (platform->has_ce)
+		gk20a_ce_suspend(g);
+
 	ret = gk20a_channel_suspend(g);
 	if (ret)
 		goto done;
@@ -996,6 +1000,11 @@ int gk20a_pm_finalize_poweron(struct device *dev)
 	if (platform->has_cde)
 		gk20a_init_cde_support(g);
 
+	if (platform->has_ce)
+		gk20a_init_ce_support(g);
+
+	gk20a_init_mm_ce_context(g);
+
 	enable_irq(g->irq_stall);
 	if (g->irq_stall != g->irq_nonstall)
 		enable_irq(g->irq_nonstall);
@@ -1658,6 +1667,7 @@ static int gk20a_probe(struct platform_device *dev)
 	gk20a_pmu_debugfs_init(&dev->dev);
 	gk20a_railgating_debugfs_init(&dev->dev);
 	gk20a_cde_debugfs_init(&dev->dev);
+	gk20a_ce_debugfs_init(&dev->dev);
 	gk20a_alloc_debugfs_init(dev);
 	gk20a_mm_debugfs_init(&dev->dev);
 	gk20a_fifo_debugfs_init(&dev->dev);
@@ -1693,6 +1703,9 @@ static int __exit gk20a_remove(struct platform_device *pdev)
 	if (g->remove_support)
 		g->remove_support(dev);
 
+	if (platform->has_ce)
+		gk20a_ce_destroy(g);
+
 	gk20a_user_deinit(dev, &nvgpu_class);
 
 	debugfs_remove_recursive(platform->debugfs);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 8aa8689b..03a698dc 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -864,6 +864,8 @@ struct gk20a {
 
 	struct nvgpu_bios bios;
 	struct debugfs_blob_wrapper bios_blob;
+
+	struct gk20a_ce_app ce_app;
 };
 
 static inline unsigned long gk20a_get_gr_idle_timeout(struct gk20a *g)
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 750ce10c..7b2174bc 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -393,7 +393,7 @@ static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm);
 static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm);
 static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm);
 static int __must_check gk20a_init_cde_vm(struct mm_gk20a *mm);
-
+static int __must_check gk20a_init_ce_vm(struct mm_gk20a *mm);
 
 struct gk20a_dmabuf_priv {
 	struct mutex lock;
@@ -702,6 +702,7 @@ void gk20a_remove_vm(struct vm_gk20a *vm, struct mem_desc *inst_block)
 static void gk20a_remove_mm_support(struct mm_gk20a *mm)
 {
 	struct gk20a *g = gk20a_from_mm(mm);
+	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
 
 	if (g->ops.mm.remove_bar2_vm)
 		g->ops.mm.remove_bar2_vm(g);
@@ -709,6 +710,14 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm)
 	gk20a_remove_vm(&mm->pmu.vm, &mm->pmu.inst_block);
 	gk20a_free_inst_block(gk20a_from_mm(mm), &mm->hwpm.inst_block);
 	gk20a_vm_remove_support_nofree(&mm->cde.vm);
+
+	if (mm->ce_vidmem_ctx_id != ~0)
+		gk20a_ce_delete_context(g->dev, mm->ce_vidmem_ctx_id );
+
+	mm->ce_vidmem_ctx_id = ~0;
+
+	if (platform->has_ce)
+		gk20a_vm_remove_support_nofree(&mm->ce.vm);
 }
 
 static int gk20a_alloc_sysmem_flush(struct gk20a *g)
@@ -754,6 +763,7 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
 {
 	struct mm_gk20a *mm = &g->mm;
 	int err;
+	struct gk20a_platform *platform = gk20a_get_platform(g->dev);
 
 	gk20a_dbg_fn("");
 
@@ -775,6 +785,8 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
 
 	gk20a_init_pramin(mm);
 
+	mm->ce_vidmem_ctx_id = ~0;
+
 	err = gk20a_init_vidmem(mm);
 	if (err)
 		return err;
@@ -804,6 +816,12 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
 	if (err)
 		return err;
 
+	if (platform->has_ce) {
+		err = gk20a_init_ce_vm(mm);
+		if (err)
+			return err;
+	}
+
 	/* set vm_alloc_share op here as gk20a_as_alloc_share needs it */
 	g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share;
 	mm->remove_support = gk20a_remove_mm_support;
@@ -881,6 +899,25 @@ int gk20a_init_mm_support(struct gk20a *g)
 	return err;
 }
 
+void gk20a_init_mm_ce_context(struct gk20a *g)
+{
+#if defined(CONFIG_GK20A_VIDMEM)
+	if (g->mm.vidmem_size && (g->mm.ce_vidmem_ctx_id == ~0)) {
+		g->mm.ce_vidmem_ctx_id =
+			gk20a_ce_create_context_with_cb(g->dev,
+				gk20a_fifo_get_fast_ce_runlist_id(g),
+				-1,
+				-1,
+				-1,
+				NULL);
+
+		if (g->mm.ce_vidmem_ctx_id == ~0)
+			gk20a_err(g->dev,
+				"Failed to allocate CE context for vidmem page clearing support");
+	}
+#endif
+}
+
 static int alloc_gmmu_phys_pages(struct vm_gk20a *vm, u32 order,
 			    struct gk20a_mm_entry *entry)
 {
@@ -2484,6 +2521,7 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 	struct device *d = &g->mm.vidmem_dev;
 	int err;
 	dma_addr_t iova;
+	bool need_pramin_access = true;
 	DEFINE_DMA_ATTRS(attrs);
 
 	gk20a_dbg_fn("");
@@ -2519,7 +2557,38 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
 	mem->size = size;
 	mem->aperture = APERTURE_VIDMEM;
 
-	gk20a_memset(g, mem, 0, 0, size);
+	if (g->mm.ce_vidmem_ctx_id != ~0) {
+		struct gk20a_fence *gk20a_fence_out = NULL;
+		u64 dst_bufbase = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, 0);
+
+		err = gk20a_ce_execute_ops(g->dev,
+				g->mm.ce_vidmem_ctx_id,
+				0,
+				dst_bufbase,
+				(u64)size,
+				0x00000000,
+				NVGPU_CE_DST_LOCATION_LOCAL_FB,
+				NVGPU_CE_MEMSET,
+				NULL,
+				0,
+				&gk20a_fence_out);
+
+		if (!err) {
+			if (gk20a_fence_out) {
+				err = gk20a_fence_wait(gk20a_fence_out, gk20a_get_gr_idle_timeout(g));
+				gk20a_fence_put(gk20a_fence_out);
+				if (err)
+					gk20a_err(g->dev,
+						"Failed to get the fence_out from CE execute ops");
+				else
+					need_pramin_access = false;
+			}
+		} else
+			gk20a_err(g->dev, "Failed gk20a_ce_execute_ops[%d]",err);
+	}
+
+	if (need_pramin_access)
+		gk20a_memset(g, mem, 0, 0, size);
 
 	gk20a_dbg_fn("done");
 
@@ -4125,6 +4194,19 @@ static int gk20a_init_cde_vm(struct mm_gk20a *mm)
 			false, false, "cde");
 }
 
+static int gk20a_init_ce_vm(struct mm_gk20a *mm)
+{
+	struct vm_gk20a *vm = &mm->ce.vm;
+	struct gk20a *g = gk20a_from_mm(mm);
+	u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;
+
+	return gk20a_init_vm(mm, vm, big_page_size,
+			SZ_4K * 16,
+			NV_MM_DEFAULT_KERNEL_SIZE,
+			NV_MM_DEFAULT_KERNEL_SIZE + NV_MM_DEFAULT_USER_SIZE,
+			false, false, "ce");
+}
+
 void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *inst_block,
 		struct vm_gk20a *vm)
 {
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 66e46480..184c1f71 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -307,6 +307,7 @@ int gk20a_init_mm_support(struct gk20a *g);
 int gk20a_init_mm_setup_sw(struct gk20a *g);
 int gk20a_init_mm_setup_hw(struct gk20a *g);
 void gk20a_mm_debugfs_init(struct device *dev);
+void gk20a_init_mm_ce_context(struct gk20a *g);
 
 int gk20a_mm_fb_flush(struct gk20a *g);
 void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate);
@@ -349,6 +350,10 @@ struct mm_gk20a {
 		struct vm_gk20a vm;
 	} cde;
 
+	struct {
+		struct vm_gk20a vm;
+	} ce;
+
 	struct mutex l2_op_lock;
 #ifdef CONFIG_ARCH_TEGRA_18x_SOC
 	struct mem_desc bar2_desc;
@@ -388,6 +393,7 @@ struct mm_gk20a {
 
 	size_t vidmem_size;
 	struct device vidmem_dev;
+	u32 ce_vidmem_ctx_id;
 };
 
 int gk20a_mm_init(struct mm_gk20a *mm);
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
index 543f9873..5bde3439 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
@@ -209,6 +209,8 @@ struct gk20a_platform {
 
 	bool has_cde;
 
+	bool has_ce;
+
 	/* soc name for finding firmware files */
 	const char *soc_name;
 
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
index 2ed6df43..745d963c 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
@@ -900,6 +900,8 @@ struct gk20a_platform gk20a_tegra_platform = {
 	.secure_page_alloc = gk20a_tegra_secure_page_alloc,
 	.dump_platform_dependencies = gk20a_tegra_debug_dump,
 
+	.has_ce = true,
+
 	.soc_name = "tegra12x",
 
 	.vidmem_is_vidmem = false,
@@ -962,6 +964,8 @@ struct gk20a_platform gm20b_tegra_platform = {
 
 	.has_cde = true,
 
+	.has_ce = true,
+
 	.soc_name = "tegra21x",
 
 	.vidmem_is_vidmem = false,
diff --git a/drivers/gpu/nvgpu/pci.c b/drivers/gpu/nvgpu/pci.c
index ea6f3b4c..fcf63ddc 100644
--- a/drivers/gpu/nvgpu/pci.c
+++ b/drivers/gpu/nvgpu/pci.c
@@ -56,6 +56,8 @@ static struct gk20a_platform nvgpu_pci_device = {
 
 	.ch_wdt_timeout_ms = 7000,
 	.disable_bigpage = true,
+
+	.has_ce = true,
 };
 
 static struct pci_device_id nvgpu_pci_table[] = {