Diffstat (limited to 'drivers/gpu/nvgpu/common/linux/cde.c')
-rw-r--r--  drivers/gpu/nvgpu/common/linux/cde.c | 1693
1 files changed, 1693 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/common/linux/cde.c b/drivers/gpu/nvgpu/common/linux/cde.c
new file mode 100644
index 00000000..5b0fb910
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/linux/cde.c
@@ -0,0 +1,1693 @@
1/*
2 * Color decompression engine support
3 *
4 * Copyright (c) 2014-2017, NVIDIA Corporation. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19#include <linux/dma-mapping.h>
20#include <linux/fs.h>
21#include <linux/dma-buf.h>
22
23#include <trace/events/gk20a.h>
24
25#include <nvgpu/dma.h>
26#include <nvgpu/gmmu.h>
27#include <nvgpu/timers.h>
28#include <nvgpu/nvgpu_common.h>
29#include <nvgpu/kmem.h>
30#include <nvgpu/log.h>
31#include <nvgpu/bug.h>
32#include <nvgpu/firmware.h>
33
34#include "gk20a/gk20a.h"
35#include "gk20a/channel_gk20a.h"
36#include "gk20a/mm_gk20a.h"
37#include "gk20a/fence_gk20a.h"
38#include "gk20a/gr_gk20a.h"
39
40#include "cde.h"
41#include "os_linux.h"
42
43#include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h>
44#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
45
46/*
47 * Currently this code uses nvgpu_vm_map() since it takes dmabuf FDs from the
48 * CDE ioctls. That has to change - instead this needs to take an nvgpu_mem.
49 */
50#include "common/linux/vm_priv.h"
51
52static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx);
53static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct nvgpu_os_linux *l);
54
55#define CTX_DELETE_TIME 1000
56
57#define MAX_CTX_USE_COUNT 42
58#define MAX_CTX_RETRY_TIME 2000
59
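/*
 * Unmap and free every buffer allocated from the firmware image and
 * reset the context back to its unloaded state (no buffers, parameters
 * or command lists).
 */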
60static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
61{
62 unsigned int i;
63
64 for (i = 0; i < cde_ctx->num_bufs; i++) {
65 struct nvgpu_mem *mem = cde_ctx->mem + i;
66 nvgpu_dma_unmap_free(cde_ctx->vm, mem);
67 }
68
69 nvgpu_kfree(&cde_ctx->l->g, cde_ctx->init_convert_cmd);
70
71 cde_ctx->convert_cmd = NULL;
72 cde_ctx->init_convert_cmd = NULL;
73 cde_ctx->num_bufs = 0;
74 cde_ctx->num_params = 0;
75 cde_ctx->init_cmd_num_entries = 0;
76 cde_ctx->convert_cmd_num_entries = 0;
77 cde_ctx->init_cmd_executed = false;
78}
79
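/*
 * Tear down one context: release the firmware buffers, unmap the
 * compbit backing store, close the channel and drop the context from
 * the app's bookkeeping. Caller holds cde_app->mutex.
 */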
80static void gk20a_cde_remove_ctx(struct gk20a_cde_ctx *cde_ctx)
81__must_hold(&cde_app->mutex)
82{
83 struct nvgpu_os_linux *l = cde_ctx->l;
84 struct gk20a *g = &l->g;
85 struct channel_gk20a *ch = cde_ctx->ch;
86 struct vm_gk20a *vm = ch->vm;
87
88 trace_gk20a_cde_remove_ctx(cde_ctx);
89
90 /* release mapped memory */
91 gk20a_deinit_cde_img(cde_ctx);
92 nvgpu_gmmu_unmap(vm, &g->gr.compbit_store.mem,
93 cde_ctx->backing_store_vaddr);
94
95 /* free the channel */
96 gk20a_channel_close(ch);
97
98 /* housekeeping on app */
99 nvgpu_list_del(&cde_ctx->list);
100 l->cde_app.ctx_count--;
101 nvgpu_kfree(g, cde_ctx);
102}
103
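/*
 * Cancel the delayed deleter work of a temporary context. With
 * wait_finish set, the app mutex is dropped while waiting so that a
 * running work item can complete.
 */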
104static void gk20a_cde_cancel_deleter(struct gk20a_cde_ctx *cde_ctx,
105 bool wait_finish)
106__releases(&cde_app->mutex)
107__acquires(&cde_app->mutex)
108{
109 struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app;
110
111 /* permanent contexts do not have a deleter work */
112 if (!cde_ctx->is_temporary)
113 return;
114
115 if (wait_finish) {
116 nvgpu_mutex_release(&cde_app->mutex);
117 cancel_delayed_work_sync(&cde_ctx->ctx_deleter_work);
118 nvgpu_mutex_acquire(&cde_app->mutex);
119 } else {
120 cancel_delayed_work(&cde_ctx->ctx_deleter_work);
121 }
122}
123
124static void gk20a_cde_remove_contexts(struct nvgpu_os_linux *l)
125__must_hold(&l->cde_app->mutex)
126{
127 struct gk20a_cde_app *cde_app = &l->cde_app;
128 struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save;
129
130 /* it is safe to drop the mutex in cancel_deleter since the app is
131 * deinitialised and no new jobs are started; a pending deleter work
132 * is either waiting for the mutex or about to abort */
133
134 nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save,
135 &cde_app->free_contexts, gk20a_cde_ctx, list) {
136 gk20a_cde_cancel_deleter(cde_ctx, true);
137 gk20a_cde_remove_ctx(cde_ctx);
138 }
139
140 nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save,
141 &cde_app->used_contexts, gk20a_cde_ctx, list) {
142 gk20a_cde_cancel_deleter(cde_ctx, true);
143 gk20a_cde_remove_ctx(cde_ctx);
144 }
145}
146
147static void gk20a_cde_stop(struct nvgpu_os_linux *l)
148__must_hold(&l->cde_app->mutex)
149{
150 struct gk20a_cde_app *cde_app = &l->cde_app;
151
152 /* prevent further conversions and delayed works from working */
153 cde_app->initialised = false;
154 /* free all data, empty the list */
155 gk20a_cde_remove_contexts(l);
156}
157
158void gk20a_cde_destroy(struct nvgpu_os_linux *l)
159__acquires(&l->cde_app->mutex)
160__releases(&l->cde_app->mutex)
161{
162 struct gk20a_cde_app *cde_app = &l->cde_app;
163
164 if (!cde_app->initialised)
165 return;
166
167 nvgpu_mutex_acquire(&cde_app->mutex);
168 gk20a_cde_stop(l);
169 nvgpu_mutex_release(&cde_app->mutex);
170
171 nvgpu_mutex_destroy(&cde_app->mutex);
172}
173
174void gk20a_cde_suspend(struct nvgpu_os_linux *l)
175__acquires(&l->cde_app->mutex)
176__releases(&l->cde_app->mutex)
177{
178 struct gk20a_cde_app *cde_app = &l->cde_app;
179 struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save;
180
181 if (!cde_app->initialised)
182 return;
183
184 nvgpu_mutex_acquire(&cde_app->mutex);
185
186 nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save,
187 &cde_app->free_contexts, gk20a_cde_ctx, list) {
188 gk20a_cde_cancel_deleter(cde_ctx, false);
189 }
190
191 nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save,
192 &cde_app->used_contexts, gk20a_cde_ctx, list) {
193 gk20a_cde_cancel_deleter(cde_ctx, false);
194 }
195
196 nvgpu_mutex_release(&cde_app->mutex);
197
198}
199
200static int gk20a_cde_create_context(struct nvgpu_os_linux *l)
201__must_hold(&l->cde_app->mutex)
202{
203 struct gk20a_cde_app *cde_app = &l->cde_app;
204 struct gk20a_cde_ctx *cde_ctx;
205
206 cde_ctx = gk20a_cde_allocate_context(l);
207 if (IS_ERR(cde_ctx))
208 return PTR_ERR(cde_ctx);
209
210 nvgpu_list_add(&cde_ctx->list, &cde_app->free_contexts);
211 cde_app->ctx_count++;
212 if (cde_app->ctx_count > cde_app->ctx_count_top)
213 cde_app->ctx_count_top = cde_app->ctx_count;
214
215 return 0;
216}
217
218static int gk20a_cde_create_contexts(struct nvgpu_os_linux *l)
219__must_hold(&l->cde_app->mutex)
220{
221 int err;
222 int i;
223
224 for (i = 0; i < NUM_CDE_CONTEXTS; i++) {
225 err = gk20a_cde_create_context(l);
226 if (err)
227 goto out;
228 }
229
230 return 0;
231out:
232 gk20a_cde_remove_contexts(l);
233 return err;
234}
235
236static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx,
237 struct nvgpu_firmware *img,
238 struct gk20a_cde_hdr_buf *buf)
239{
240 struct nvgpu_mem *mem;
241 struct nvgpu_os_linux *l = cde_ctx->l;
242 struct gk20a *g = &l->g;
243 int err;
244
245 /* check that the file can hold the buf */
246 if (buf->data_byte_offset != 0 &&
247 buf->data_byte_offset + buf->num_bytes > img->size) {
248 nvgpu_warn(g, "cde: invalid data section. buffer idx = %d",
249 cde_ctx->num_bufs);
250 return -EINVAL;
251 }
252
253 /* check that we have enough buf elems available */
254 if (cde_ctx->num_bufs >= MAX_CDE_BUFS) {
255 nvgpu_warn(g, "cde: invalid data section. buffer idx = %d",
256 cde_ctx->num_bufs);
257 return -ENOMEM;
258 }
259
260 /* allocate buf */
261 mem = cde_ctx->mem + cde_ctx->num_bufs;
262 err = nvgpu_dma_alloc_map_sys(cde_ctx->vm, buf->num_bytes, mem);
263 if (err) {
264 nvgpu_warn(g, "cde: could not allocate device memory. buffer idx = %d",
265 cde_ctx->num_bufs);
266 return -ENOMEM;
267 }
268
269 /* copy the content */
270 if (buf->data_byte_offset != 0)
271 memcpy(mem->cpu_va, img->data + buf->data_byte_offset,
272 buf->num_bytes);
273
274 cde_ctx->num_bufs++;
275
276 return 0;
277}
278
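/*
 * Patch one value into target memory: shift and mask the value, then
 * merge it with the bits outside the mask that are already stored.
 * Illustrative example for TYPE_PARAM_TYPE_U32: value=0x12, shift=8,
 * mask=0x0000ff00 and a current word of 0xdeadbeef yield
 * (0xdeadbeef & ~0x0000ff00) | 0x1200 == 0xdead12ef.
 */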
279static int gk20a_replace_data(struct gk20a_cde_ctx *cde_ctx, void *target,
280 int type, s32 shift, u64 mask, u64 value)
281{
282 struct nvgpu_os_linux *l = cde_ctx->l;
283 struct gk20a *g = &l->g;
284 u32 *target_mem_ptr = target;
285 u64 *target_mem_ptr_u64 = target;
286 u64 current_value, new_value;
287
288 value = (shift >= 0) ? value << shift : value >> -shift;
289 value &= mask;
290
291 /* read current data from the location */
292 current_value = 0;
293 if (type == TYPE_PARAM_TYPE_U32) {
294 if (mask != 0xfffffffful)
295 current_value = *target_mem_ptr;
296 } else if (type == TYPE_PARAM_TYPE_U64_LITTLE) {
297 if (mask != ~0ul)
298 current_value = *target_mem_ptr_u64;
299 } else if (type == TYPE_PARAM_TYPE_U64_BIG) {
300 current_value = *target_mem_ptr_u64;
301 current_value = (u64)(current_value >> 32) |
302 (u64)(current_value << 32);
303 } else {
304 nvgpu_warn(g, "cde: unknown type. type=%d",
305 type);
306 return -EINVAL;
307 }
308
309 current_value &= ~mask;
310 new_value = current_value | value;
311
312 /* store the element data back */
313 if (type == TYPE_PARAM_TYPE_U32)
314 *target_mem_ptr = (u32)new_value;
315 else if (type == TYPE_PARAM_TYPE_U64_LITTLE)
316 *target_mem_ptr_u64 = new_value;
317 else {
318 new_value = (u64)(new_value >> 32) |
319 (u64)(new_value << 32);
320 *target_mem_ptr_u64 = new_value;
321 }
322
323 return 0;
324}
325
326static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx,
327 struct nvgpu_firmware *img,
328 struct gk20a_cde_hdr_replace *replace)
329{
330 struct nvgpu_mem *source_mem;
331 struct nvgpu_mem *target_mem;
332 struct nvgpu_os_linux *l = cde_ctx->l;
333 struct gk20a *g = &l->g;
334 u32 *target_mem_ptr;
335 u64 vaddr;
336 int err;
337
338 if (replace->target_buf >= cde_ctx->num_bufs ||
339 replace->source_buf >= cde_ctx->num_bufs) {
340 nvgpu_warn(g, "cde: invalid buffer. target_buf=%u, source_buf=%u, num_bufs=%d",
341 replace->target_buf, replace->source_buf,
342 cde_ctx->num_bufs);
343 return -EINVAL;
344 }
345
346 source_mem = cde_ctx->mem + replace->source_buf;
347 target_mem = cde_ctx->mem + replace->target_buf;
348 target_mem_ptr = target_mem->cpu_va;
349
350 if (source_mem->size < (replace->source_byte_offset + 3) ||
351 target_mem->size < (replace->target_byte_offset + 3)) {
352 nvgpu_warn(g, "cde: invalid buffer offsets. target_buf_offs=%lld, source_buf_offs=%lld, source_buf_size=%zu, dest_buf_size=%zu",
353 replace->target_byte_offset,
354 replace->source_byte_offset,
355 source_mem->size,
356 target_mem->size);
357 return -EINVAL;
358 }
359
360 /* calculate the target pointer */
361 target_mem_ptr += (replace->target_byte_offset / sizeof(u32));
362
363 /* determine patch value */
364 vaddr = source_mem->gpu_va + replace->source_byte_offset;
365 err = gk20a_replace_data(cde_ctx, target_mem_ptr, replace->type,
366 replace->shift, replace->mask,
367 vaddr);
368 if (err) {
369 nvgpu_warn(g, "cde: replace failed. err=%d, target_buf=%u, target_buf_offs=%lld, source_buf=%u, source_buf_offs=%lld",
370 err, replace->target_buf,
371 replace->target_byte_offset,
372 replace->source_buf,
373 replace->source_byte_offset);
374 }
375
376 return err;
377}
378
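/*
 * Write the current parameter values (GPU configuration, buffer
 * addresses/sizes and user-supplied values) into the target buffers
 * described by the firmware image.
 */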
379static int gk20a_cde_patch_params(struct gk20a_cde_ctx *cde_ctx)
380{
381 struct nvgpu_os_linux *l = cde_ctx->l;
382 struct gk20a *g = &l->g;
383 struct nvgpu_mem *target_mem;
384 u32 *target_mem_ptr;
385 u64 new_data;
386 int user_id = 0, err;
387 unsigned int i;
388
389 for (i = 0; i < cde_ctx->num_params; i++) {
390 struct gk20a_cde_hdr_param *param = cde_ctx->params + i;
391 target_mem = cde_ctx->mem + param->target_buf;
392 target_mem_ptr = target_mem->cpu_va;
393 target_mem_ptr += (param->target_byte_offset / sizeof(u32));
394
395 switch (param->id) {
396 case TYPE_PARAM_COMPTAGS_PER_CACHELINE:
397 new_data = g->gr.comptags_per_cacheline;
398 break;
399 case TYPE_PARAM_GPU_CONFIGURATION:
400 new_data = (u64)g->ltc_count * g->gr.slices_per_ltc *
401 g->gr.cacheline_size;
402 break;
403 case TYPE_PARAM_FIRSTPAGEOFFSET:
404 new_data = cde_ctx->surf_param_offset;
405 break;
406 case TYPE_PARAM_NUMPAGES:
407 new_data = cde_ctx->surf_param_lines;
408 break;
409 case TYPE_PARAM_BACKINGSTORE:
410 new_data = cde_ctx->backing_store_vaddr;
411 break;
412 case TYPE_PARAM_DESTINATION:
413 new_data = cde_ctx->compbit_vaddr;
414 break;
415 case TYPE_PARAM_DESTINATION_SIZE:
416 new_data = cde_ctx->compbit_size;
417 break;
418 case TYPE_PARAM_BACKINGSTORE_SIZE:
419 new_data = g->gr.compbit_store.mem.size;
420 break;
421 case TYPE_PARAM_SOURCE_SMMU_ADDR:
422 new_data = gk20a_mm_gpuva_to_iova_base(cde_ctx->vm,
423 cde_ctx->surf_vaddr);
424 if (new_data == 0)
425 return -EINVAL;
426 break;
427 case TYPE_PARAM_BACKINGSTORE_BASE_HW:
428 new_data = g->gr.compbit_store.base_hw;
429 break;
430 case TYPE_PARAM_GOBS_PER_COMPTAGLINE_PER_SLICE:
431 new_data = g->gr.gobs_per_comptagline_per_slice;
432 break;
433 case TYPE_PARAM_SCATTERBUFFER:
434 new_data = cde_ctx->scatterbuffer_vaddr;
435 break;
436 case TYPE_PARAM_SCATTERBUFFER_SIZE:
437 new_data = cde_ctx->scatterbuffer_size;
438 break;
439 default:
440 user_id = param->id - NUM_RESERVED_PARAMS;
441 if (user_id < 0 || user_id >= MAX_CDE_USER_PARAMS)
442 continue;
443 new_data = cde_ctx->user_param_values[user_id];
444 }
445
446 gk20a_dbg(gpu_dbg_cde, "cde: patch: idx_in_file=%d param_id=%d target_buf=%u target_byte_offset=%lld data_value=0x%llx data_offset/data_diff=%lld data_type=%d data_shift=%d data_mask=0x%llx",
447 i, param->id, param->target_buf,
448 param->target_byte_offset, new_data,
449 param->data_offset, param->type, param->shift,
450 param->mask);
451
452 new_data += param->data_offset;
453
454 err = gk20a_replace_data(cde_ctx, target_mem_ptr, param->type,
455 param->shift, param->mask, new_data);
456
457 if (err) {
458 nvgpu_warn(g, "cde: patch failed. err=%d, idx=%d, id=%d, target_buf=%u, target_buf_offs=%lld, patch_value=%llu",
459 err, i, param->id, param->target_buf,
460 param->target_byte_offset, new_data);
461 return err;
462 }
463 }
464
465 return 0;
466}
467
468static int gk20a_init_cde_param(struct gk20a_cde_ctx *cde_ctx,
469 struct nvgpu_firmware *img,
470 struct gk20a_cde_hdr_param *param)
471{
472 struct nvgpu_mem *target_mem;
473 struct nvgpu_os_linux *l = cde_ctx->l;
474 struct gk20a *g = &l->g;
475
476 if (param->target_buf >= cde_ctx->num_bufs) {
477 nvgpu_warn(g, "cde: invalid buffer parameter. param idx = %d, target_buf=%u, num_bufs=%u",
478 cde_ctx->num_params, param->target_buf,
479 cde_ctx->num_bufs);
480 return -EINVAL;
481 }
482
483 target_mem = cde_ctx->mem + param->target_buf;
484 if (target_mem->size < (param->target_byte_offset + 3)) {
485 nvgpu_warn(g, "cde: invalid buffer parameter. param idx = %d, target_buf_offs=%lld, target_buf_size=%zu",
486 cde_ctx->num_params, param->target_byte_offset,
487 target_mem->size);
488 return -EINVAL;
489 }
490
491 /* does this parameter fit into our parameter structure */
492 if (cde_ctx->num_params >= MAX_CDE_PARAMS) {
493 nvgpu_warn(g, "cde: no room for new parameters param idx = %d",
494 cde_ctx->num_params);
495 return -ENOMEM;
496 }
497
498 /* is the given id valid? */
499 if (param->id >= NUM_RESERVED_PARAMS + MAX_CDE_USER_PARAMS) {
500 nvgpu_warn(g, "cde: parameter id is not valid. param idx = %d, id=%u, max=%u",
501 cde_ctx->num_params, param->id,
502 NUM_RESERVED_PARAMS + MAX_CDE_USER_PARAMS);
503 return -EINVAL;
504 }
505
506 cde_ctx->params[cde_ctx->num_params] = *param;
507 cde_ctx->num_params++;
508
509 return 0;
510}
511
512static int gk20a_init_cde_required_class(struct gk20a_cde_ctx *cde_ctx,
513 struct nvgpu_firmware *img,
514 u32 required_class)
515{
516 struct nvgpu_os_linux *l = cde_ctx->l;
517 struct gk20a *g = &l->g;
518 struct nvgpu_alloc_obj_ctx_args alloc_obj_ctx;
519 int err;
520
521 alloc_obj_ctx.class_num = required_class;
522 alloc_obj_ctx.flags = 0;
523
524 /* CDE enabled */
525 cde_ctx->ch->cde = true;
526
527 err = gk20a_alloc_obj_ctx(cde_ctx->ch, &alloc_obj_ctx);
528 if (err) {
529 nvgpu_warn(g, "cde: failed to allocate ctx. err=%d",
530 err);
531 return err;
532 }
533
534 return 0;
535}
536
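/*
 * Build the gpfifo for either the init or the convert command; each
 * element points to a pushbuffer segment inside one of the buffers
 * allocated earlier from the image.
 */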
537static int gk20a_init_cde_command(struct gk20a_cde_ctx *cde_ctx,
538 struct nvgpu_firmware *img,
539 u32 op,
540 struct gk20a_cde_cmd_elem *cmd_elem,
541 u32 num_elems)
542{
543 struct nvgpu_os_linux *l = cde_ctx->l;
544 struct gk20a *g = &l->g;
545 struct nvgpu_gpfifo **gpfifo, *gpfifo_elem;
546 u32 *num_entries;
547 unsigned int i;
548
549 /* check command type */
550 if (op == TYPE_BUF_COMMAND_INIT) {
551 gpfifo = &cde_ctx->init_convert_cmd;
552 num_entries = &cde_ctx->init_cmd_num_entries;
553 } else if (op == TYPE_BUF_COMMAND_CONVERT) {
554 gpfifo = &cde_ctx->convert_cmd;
555 num_entries = &cde_ctx->convert_cmd_num_entries;
556 } else {
557 nvgpu_warn(g, "cde: unknown command. op=%u",
558 op);
559 return -EINVAL;
560 }
561
562 /* allocate gpfifo entries to be pushed */
563 *gpfifo = nvgpu_kzalloc(g,
564 sizeof(struct nvgpu_gpfifo) * num_elems);
565 if (!*gpfifo) {
566 nvgpu_warn(g, "cde: could not allocate memory for gpfifo entries");
567 return -ENOMEM;
568 }
569
570 gpfifo_elem = *gpfifo;
571 for (i = 0; i < num_elems; i++, cmd_elem++, gpfifo_elem++) {
572 struct nvgpu_mem *target_mem;
573
574 /* validate the current entry */
575 if (cmd_elem->target_buf >= cde_ctx->num_bufs) {
576 nvgpu_warn(g, "cde: target buffer is not available (target=%u, num_bufs=%u)",
577 cmd_elem->target_buf, cde_ctx->num_bufs);
578 return -EINVAL;
579 }
580
581 target_mem = cde_ctx->mem + cmd_elem->target_buf;
582 if (target_mem->size <
583 cmd_elem->target_byte_offset + cmd_elem->num_bytes) {
584 nvgpu_warn(g, "cde: target buffer cannot hold all entries (target_size=%zu, target_byte_offset=%lld, num_bytes=%llu)",
585 target_mem->size,
586 cmd_elem->target_byte_offset,
587 cmd_elem->num_bytes);
588 return -EINVAL;
589 }
590
591 /* store the element into gpfifo */
592 gpfifo_elem->entry0 =
593 u64_lo32(target_mem->gpu_va +
594 cmd_elem->target_byte_offset);
595 gpfifo_elem->entry1 =
596 u64_hi32(target_mem->gpu_va +
597 cmd_elem->target_byte_offset) |
598 pbdma_gp_entry1_length_f(cmd_elem->num_bytes /
599 sizeof(u32));
600 }
601
602 *num_entries = num_elems;
603 return 0;
604}
605
606static int gk20a_cde_pack_cmdbufs(struct gk20a_cde_ctx *cde_ctx)
607{
608 struct nvgpu_os_linux *l = cde_ctx->l;
609 struct gk20a *g = &l->g;
610 unsigned long init_bytes = cde_ctx->init_cmd_num_entries *
611 sizeof(struct nvgpu_gpfifo);
612 unsigned long conv_bytes = cde_ctx->convert_cmd_num_entries *
613 sizeof(struct nvgpu_gpfifo);
614 unsigned long total_bytes = init_bytes + conv_bytes;
615 struct nvgpu_gpfifo *combined_cmd;
616
617 /* allocate buffer that has space for both */
618 combined_cmd = nvgpu_kzalloc(g, total_bytes);
619 if (!combined_cmd) {
620 nvgpu_warn(g,
621 "cde: could not allocate memory for gpfifo entries");
622 return -ENOMEM;
623 }
624
625 /* move the original init here and append convert */
626 memcpy(combined_cmd, cde_ctx->init_convert_cmd, init_bytes);
627 memcpy(combined_cmd + cde_ctx->init_cmd_num_entries,
628 cde_ctx->convert_cmd, conv_bytes);
629
630 nvgpu_kfree(g, cde_ctx->init_convert_cmd);
631 nvgpu_kfree(g, cde_ctx->convert_cmd);
632
633 cde_ctx->init_convert_cmd = combined_cmd;
634 cde_ctx->convert_cmd = combined_cmd
635 + cde_ctx->init_cmd_num_entries;
636
637 return 0;
638}
639
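/*
 * Parse the firmware image: the first two words hold the firmware
 * version and the element count, followed by the header elements
 * (buffers, replacements, parameters, required classes, commands and
 * arrays). Finally the init and convert commands are packed into one
 * contiguous gpfifo.
 */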
640static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx,
641 struct nvgpu_firmware *img)
642{
643 struct nvgpu_os_linux *l = cde_ctx->l;
644 struct gk20a *g = &l->g;
645 struct gk20a_cde_app *cde_app = &l->cde_app;
646 u32 *data = (u32 *)img->data;
647 u32 num_of_elems;
648 struct gk20a_cde_hdr_elem *elem;
649 u32 min_size = 0;
650 int err = 0;
651 unsigned int i;
652
653 min_size += 2 * sizeof(u32);
654 if (img->size < min_size) {
655 nvgpu_warn(g, "cde: invalid image header");
656 return -EINVAL;
657 }
658
659 cde_app->firmware_version = data[0];
660 num_of_elems = data[1];
661
662 min_size += num_of_elems * sizeof(*elem);
663 if (img->size < min_size) {
664 nvgpu_warn(g, "cde: bad image");
665 return -EINVAL;
666 }
667
668 elem = (struct gk20a_cde_hdr_elem *)&data[2];
669 for (i = 0; i < num_of_elems; i++) {
670 int err = 0;
671 switch (elem->type) {
672 case TYPE_BUF:
673 err = gk20a_init_cde_buf(cde_ctx, img, &elem->buf);
674 break;
675 case TYPE_REPLACE:
676 err = gk20a_init_cde_replace(cde_ctx, img,
677 &elem->replace);
678 break;
679 case TYPE_PARAM:
680 err = gk20a_init_cde_param(cde_ctx, img, &elem->param);
681 break;
682 case TYPE_REQUIRED_CLASS:
683 err = gk20a_init_cde_required_class(cde_ctx, img,
684 elem->required_class);
685 break;
686 case TYPE_COMMAND:
687 {
688 struct gk20a_cde_cmd_elem *cmd = (void *)
689 &img->data[elem->command.data_byte_offset];
690 err = gk20a_init_cde_command(cde_ctx, img,
691 elem->command.op, cmd,
692 elem->command.num_entries);
693 break;
694 }
695 case TYPE_ARRAY:
696 memcpy(&cde_app->arrays[elem->array.id][0],
697 elem->array.data,
698 MAX_CDE_ARRAY_ENTRIES*sizeof(u32));
699 break;
700 default:
701 nvgpu_warn(g, "cde: unknown header element");
702 err = -EINVAL;
703 }
704
705 if (err)
706 goto deinit_image;
707
708 elem++;
709 }
710
711 if (!cde_ctx->init_convert_cmd || !cde_ctx->init_cmd_num_entries) {
712 nvgpu_warn(g, "cde: init command not defined");
713 err = -EINVAL;
714 goto deinit_image;
715 }
716
717 if (!cde_ctx->convert_cmd || !cde_ctx->convert_cmd_num_entries) {
718 nvgpu_warn(g, "cde: convert command not defined");
719 err = -EINVAL;
720 goto deinit_image;
721 }
722
723 err = gk20a_cde_pack_cmdbufs(cde_ctx);
724 if (err)
725 goto deinit_image;
726
727 return 0;
728
729deinit_image:
730 gk20a_deinit_cde_img(cde_ctx);
731 return err;
732}
733
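/*
 * Submit either the combined init+convert gpfifo (first launch on this
 * context) or the convert gpfifo alone to the CDE channel.
 */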
734static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
735 u32 op, struct nvgpu_fence *fence,
736 u32 flags, struct gk20a_fence **fence_out)
737{
738 struct nvgpu_os_linux *l = cde_ctx->l;
739 struct gk20a *g = &l->g;
740 struct nvgpu_gpfifo *gpfifo = NULL;
741 int num_entries = 0;
742
743 /* check command type */
744 if (op == TYPE_BUF_COMMAND_INIT) {
745 /* both init and convert combined */
746 gpfifo = cde_ctx->init_convert_cmd;
747 num_entries = cde_ctx->init_cmd_num_entries
748 + cde_ctx->convert_cmd_num_entries;
749 } else if (op == TYPE_BUF_COMMAND_CONVERT) {
750 gpfifo = cde_ctx->convert_cmd;
751 num_entries = cde_ctx->convert_cmd_num_entries;
752 } else {
753 nvgpu_warn(g, "cde: unknown buffer");
754 return -EINVAL;
755 }
756
757 if (gpfifo == NULL || num_entries == 0) {
758 nvgpu_warn(g, "cde: buffer not available");
759 return -ENOSYS;
760 }
761
762 return gk20a_submit_channel_gpfifo(cde_ctx->ch, gpfifo, NULL,
763 num_entries, flags, fence, fence_out, true,
764 NULL);
765}
766
767static void gk20a_cde_ctx_release(struct gk20a_cde_ctx *cde_ctx)
768__acquires(&cde_app->mutex)
769__releases(&cde_app->mutex)
770{
771 struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app;
772
773 gk20a_dbg(gpu_dbg_cde_ctx, "releasing use on %p", cde_ctx);
774 trace_gk20a_cde_release(cde_ctx);
775
776 nvgpu_mutex_acquire(&cde_app->mutex);
777
778 if (cde_ctx->in_use) {
779 cde_ctx->in_use = false;
780 nvgpu_list_move(&cde_ctx->list, &cde_app->free_contexts);
781 cde_app->ctx_usecount--;
782 } else {
783 gk20a_dbg_info("double release cde context %p", cde_ctx);
784 }
785
786 nvgpu_mutex_release(&cde_app->mutex);
787}
788
789static void gk20a_cde_ctx_deleter_fn(struct work_struct *work)
790__acquires(&cde_app->mutex)
791__releases(&cde_app->mutex)
792{
793 struct delayed_work *delay_work = to_delayed_work(work);
794 struct gk20a_cde_ctx *cde_ctx = container_of(delay_work,
795 struct gk20a_cde_ctx, ctx_deleter_work);
796 struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app;
797 struct nvgpu_os_linux *l = cde_ctx->l;
798 struct gk20a *g = &l->g;
799 int err;
800
801 /* someone has just taken it? engine deletion started? */
802 if (cde_ctx->in_use || !cde_app->initialised)
803 return;
804
805 gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
806 "cde: attempting to delete temporary %p", cde_ctx);
807
808 err = gk20a_busy(g);
809 if (err) {
810 /* this context would find new use anyway later, so not freeing
811 * here does not leak anything */
812 nvgpu_warn(g, "cde: cannot set gk20a on, postponing"
813 " temp ctx deletion");
814 return;
815 }
816
817 nvgpu_mutex_acquire(&cde_app->mutex);
818 if (cde_ctx->in_use || !cde_app->initialised) {
819 gk20a_dbg(gpu_dbg_cde_ctx,
820 "cde: context use raced, not deleting %p",
821 cde_ctx);
822 goto out;
823 }
824
825 WARN(delayed_work_pending(&cde_ctx->ctx_deleter_work),
826 "double pending %p", cde_ctx);
827
828 gk20a_cde_remove_ctx(cde_ctx);
829 gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
830 "cde: destroyed %p count=%d use=%d max=%d",
831 cde_ctx, cde_app->ctx_count, cde_app->ctx_usecount,
832 cde_app->ctx_count_top);
833
834out:
835 nvgpu_mutex_release(&cde_app->mutex);
836 gk20a_idle(g);
837}
838
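/*
 * Pick a free context if one is available, otherwise allocate a
 * temporary context that will be deleted after use. Returns
 * ERR_PTR(-EAGAIN) when the use count limit is reached.
 */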
839static struct gk20a_cde_ctx *gk20a_cde_do_get_context(struct nvgpu_os_linux *l)
840__must_hold(&cde_app->mutex)
841{
842 struct gk20a *g = &l->g;
843 struct gk20a_cde_app *cde_app = &l->cde_app;
844 struct gk20a_cde_ctx *cde_ctx;
845
846 /* exhausted? */
847
848 if (cde_app->ctx_usecount >= MAX_CTX_USE_COUNT)
849 return ERR_PTR(-EAGAIN);
850
851 /* idle context available? */
852
853 if (!nvgpu_list_empty(&cde_app->free_contexts)) {
854 cde_ctx = nvgpu_list_first_entry(&cde_app->free_contexts,
855 gk20a_cde_ctx, list);
856 gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
857 "cde: got free %p count=%d use=%d max=%d",
858 cde_ctx, cde_app->ctx_count,
859 cde_app->ctx_usecount,
860 cde_app->ctx_count_top);
861 trace_gk20a_cde_get_context(cde_ctx);
862
863 /* deleter work may be scheduled, but in_use prevents it */
864 cde_ctx->in_use = true;
865 nvgpu_list_move(&cde_ctx->list, &cde_app->used_contexts);
866 cde_app->ctx_usecount++;
867
868 /* cancel any deletions now that ctx is in use */
869 gk20a_cde_cancel_deleter(cde_ctx, true);
870 return cde_ctx;
871 }
872
873 /* no free contexts, get a temporary one */
874
875 gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
876 "cde: no free contexts, count=%d",
877 cde_app->ctx_count);
878
879 cde_ctx = gk20a_cde_allocate_context(l);
880 if (IS_ERR(cde_ctx)) {
881 nvgpu_warn(g, "cde: cannot allocate context: %ld",
882 PTR_ERR(cde_ctx));
883 return cde_ctx;
884 }
885
886 trace_gk20a_cde_get_context(cde_ctx);
887 cde_ctx->in_use = true;
888 cde_ctx->is_temporary = true;
889 cde_app->ctx_usecount++;
890 cde_app->ctx_count++;
891 if (cde_app->ctx_count > cde_app->ctx_count_top)
892 cde_app->ctx_count_top = cde_app->ctx_count;
893 nvgpu_list_add(&cde_ctx->list, &cde_app->used_contexts);
894
895 return cde_ctx;
896}
897
898static struct gk20a_cde_ctx *gk20a_cde_get_context(struct nvgpu_os_linux *l)
899__releases(&cde_app->mutex)
900__acquires(&cde_app->mutex)
901{
902 struct gk20a *g = &l->g;
903 struct gk20a_cde_app *cde_app = &l->cde_app;
904 struct gk20a_cde_ctx *cde_ctx = NULL;
905 struct nvgpu_timeout timeout;
906
907 nvgpu_timeout_init(g, &timeout, MAX_CTX_RETRY_TIME,
908 NVGPU_TIMER_CPU_TIMER);
909
910 do {
911 cde_ctx = gk20a_cde_do_get_context(l);
912 if (PTR_ERR(cde_ctx) != -EAGAIN)
913 break;
914
915 /* exhausted, retry */
916 nvgpu_mutex_release(&cde_app->mutex);
917 cond_resched();
918 nvgpu_mutex_acquire(&cde_app->mutex);
919 } while (!nvgpu_timeout_expired(&timeout));
920
921 return cde_ctx;
922}
923
924static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct nvgpu_os_linux *l)
925{
926 struct gk20a *g = &l->g;
927 struct gk20a_cde_ctx *cde_ctx;
928 int ret;
929
930 cde_ctx = nvgpu_kzalloc(g, sizeof(*cde_ctx));
931 if (!cde_ctx)
932 return ERR_PTR(-ENOMEM);
933
934 cde_ctx->l = l;
935 cde_ctx->dev = dev_from_gk20a(g);
936
937 ret = gk20a_cde_load(cde_ctx);
938 if (ret) {
939 nvgpu_kfree(g, cde_ctx);
940 return ERR_PTR(ret);
941 }
942
943 nvgpu_init_list_node(&cde_ctx->list);
944 cde_ctx->is_temporary = false;
945 cde_ctx->in_use = false;
946 INIT_DELAYED_WORK(&cde_ctx->ctx_deleter_work,
947 gk20a_cde_ctx_deleter_fn);
948
949 gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: allocated %p", cde_ctx);
950 trace_gk20a_cde_allocate_context(cde_ctx);
951 return cde_ctx;
952}
953
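/*
 * Run one decompression pass: map the compbits/scatter buffer into the
 * CDE VM (aligned to the big page size), populate the scatter buffer if
 * the chip needs one, patch the launch parameters and submit the
 * conversion on a CDE context.
 */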
954int gk20a_cde_convert(struct nvgpu_os_linux *l,
955 struct dma_buf *compbits_scatter_buf,
956 u64 compbits_byte_offset,
957 u64 scatterbuffer_byte_offset,
958 struct nvgpu_fence *fence,
959 u32 __flags, struct gk20a_cde_param *params,
960 int num_params, struct gk20a_fence **fence_out)
961__acquires(&l->cde_app->mutex)
962__releases(&l->cde_app->mutex)
963{
964 struct gk20a *g = &l->g;
965 struct gk20a_cde_ctx *cde_ctx = NULL;
966 struct gk20a_comptags comptags;
967 u64 mapped_compbits_offset = 0;
968 u64 compbits_size = 0;
969 u64 mapped_scatterbuffer_offset = 0;
970 u64 scatterbuffer_size = 0;
971 u64 map_vaddr = 0;
972 u64 map_offset = 0;
973 u64 map_size = 0;
974 u8 *surface = NULL;
975 u64 big_page_mask = 0;
976 u32 flags;
977 int err, i;
978 const s32 compbits_kind = 0;
979
980 gk20a_dbg(gpu_dbg_cde, "compbits_byte_offset=%llu scatterbuffer_byte_offset=%llu",
981 compbits_byte_offset, scatterbuffer_byte_offset);
982
983 /* scatter buffer must be after compbits buffer */
984 if (scatterbuffer_byte_offset &&
985 scatterbuffer_byte_offset < compbits_byte_offset)
986 return -EINVAL;
987
988 err = gk20a_busy(g);
989 if (err)
990 return err;
991
992 nvgpu_mutex_acquire(&l->cde_app.mutex);
993 cde_ctx = gk20a_cde_get_context(l);
994 nvgpu_mutex_release(&l->cde_app.mutex);
995 if (IS_ERR(cde_ctx)) {
996 err = PTR_ERR(cde_ctx);
997 goto exit_idle;
998 }
999
1000 /* First, map the buffer to local va */
1001
1002 /* ensure that the compbits buffer has drvdata */
1003 err = gk20a_dmabuf_alloc_drvdata(compbits_scatter_buf,
1004 dev_from_gk20a(g));
1005 if (err)
1006 goto exit_idle;
1007
1008 /* compbits don't start at a page-aligned offset, so we need to align
1009 the region to be mapped */
1010 big_page_mask = cde_ctx->vm->big_page_size - 1;
1011 map_offset = compbits_byte_offset & ~big_page_mask;
1012 map_size = compbits_scatter_buf->size - map_offset;
1013
1014
1015 /* compute compbit start offset from the beginning of the mapped
1016 area */
1017 mapped_compbits_offset = compbits_byte_offset - map_offset;
1018 if (scatterbuffer_byte_offset) {
1019 compbits_size = scatterbuffer_byte_offset -
1020 compbits_byte_offset;
1021 mapped_scatterbuffer_offset = scatterbuffer_byte_offset -
1022 map_offset;
1023 scatterbuffer_size = compbits_scatter_buf->size -
1024 scatterbuffer_byte_offset;
1025 } else {
1026 compbits_size = compbits_scatter_buf->size -
1027 compbits_byte_offset;
1028 }
1029
1030 gk20a_dbg(gpu_dbg_cde, "map_offset=%llu map_size=%llu",
1031 map_offset, map_size);
1032 gk20a_dbg(gpu_dbg_cde, "mapped_compbits_offset=%llu compbits_size=%llu",
1033 mapped_compbits_offset, compbits_size);
1034 gk20a_dbg(gpu_dbg_cde, "mapped_scatterbuffer_offset=%llu scatterbuffer_size=%llu",
1035 mapped_scatterbuffer_offset, scatterbuffer_size);
1036
1037
1038 /* map the destination buffer */
1039 get_dma_buf(compbits_scatter_buf); /* a ref for nvgpu_vm_map */
1040 map_vaddr = nvgpu_vm_map(cde_ctx->vm, compbits_scatter_buf, 0,
1041 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
1042 compbits_kind, true,
1043 gk20a_mem_flag_none,
1044 map_offset, map_size,
1045 NULL);
1046 if (!map_vaddr) {
1047 dma_buf_put(compbits_scatter_buf);
1048 err = -EINVAL;
1049 goto exit_idle;
1050 }
1051
1052 if (scatterbuffer_byte_offset &&
1053 g->ops.cde.need_scatter_buffer &&
1054 g->ops.cde.need_scatter_buffer(g)) {
1055 struct sg_table *sgt;
1056 void *scatter_buffer;
1057
1058 surface = dma_buf_vmap(compbits_scatter_buf);
1059 if (!surface) {
1060 nvgpu_warn(g,
1061 "dma_buf_vmap failed");
1062 err = -EINVAL;
1063 goto exit_unmap_vaddr;
1064 }
1065
1066 scatter_buffer = surface + scatterbuffer_byte_offset;
1067
1068 gk20a_dbg(gpu_dbg_cde, "surface=0x%p scatterBuffer=0x%p",
1069 surface, scatter_buffer);
1070 sgt = gk20a_mm_pin(dev_from_gk20a(g), compbits_scatter_buf);
1071 if (IS_ERR(sgt)) {
1072 nvgpu_warn(g,
1073 "mm_pin failed");
1074 err = -EINVAL;
1075 goto exit_unmap_surface;
1076 } else {
1077 err = g->ops.cde.populate_scatter_buffer(g, sgt,
1078 compbits_byte_offset, scatter_buffer,
1079 scatterbuffer_size);
1080 WARN_ON(err);
1081
1082 gk20a_mm_unpin(dev_from_gk20a(g), compbits_scatter_buf,
1083 sgt);
1084 if (err)
1085 goto exit_unmap_surface;
1086 }
1087
1088 __cpuc_flush_dcache_area(scatter_buffer, scatterbuffer_size);
1089 dma_buf_vunmap(compbits_scatter_buf, surface);
1090 surface = NULL;
1091 }
1092
1093 /* store source buffer compression tags */
1094 gk20a_get_comptags(dev_from_gk20a(g), compbits_scatter_buf, &comptags);
1095 cde_ctx->surf_param_offset = comptags.offset;
1096 cde_ctx->surf_param_lines = comptags.lines;
1097
1098 /* store surface vaddr. This is actually compbit vaddr, but since
1099 compbits live in the same surface, and we can get the alloc base
1100 address by using gk20a_mm_gpuva_to_iova_base, this will do */
1101 cde_ctx->surf_vaddr = map_vaddr;
1102
1103 /* store information about destination */
1104 cde_ctx->compbit_vaddr = map_vaddr + mapped_compbits_offset;
1105 cde_ctx->compbit_size = compbits_size;
1106
1107 cde_ctx->scatterbuffer_vaddr = map_vaddr + mapped_scatterbuffer_offset;
1108 cde_ctx->scatterbuffer_size = scatterbuffer_size;
1109
1110 /* remove existing argument data */
1111 memset(cde_ctx->user_param_values, 0,
1112 sizeof(cde_ctx->user_param_values));
1113
1114 /* read user space arguments for the conversion */
1115 for (i = 0; i < num_params; i++) {
1116 struct gk20a_cde_param *param = params + i;
1117 int id = param->id - NUM_RESERVED_PARAMS;
1118
1119 if (id < 0 || id >= MAX_CDE_USER_PARAMS) {
1120 nvgpu_warn(g, "cde: unknown user parameter");
1121 err = -EINVAL;
1122 goto exit_unmap_surface;
1123 }
1124 cde_ctx->user_param_values[id] = param->value;
1125 }
1126
1127 /* patch data */
1128 err = gk20a_cde_patch_params(cde_ctx);
1129 if (err) {
1130 nvgpu_warn(g, "cde: failed to patch parameters");
1131 goto exit_unmap_surface;
1132 }
1133
1134 gk20a_dbg(gpu_dbg_cde, "cde: buffer=cbc, size=%zu, gpuva=%llx\n",
1135 g->gr.compbit_store.mem.size, cde_ctx->backing_store_vaddr);
1136 gk20a_dbg(gpu_dbg_cde, "cde: buffer=compbits, size=%llu, gpuva=%llx\n",
1137 cde_ctx->compbit_size, cde_ctx->compbit_vaddr);
1138 gk20a_dbg(gpu_dbg_cde, "cde: buffer=scatterbuffer, size=%llu, gpuva=%llx\n",
1139 cde_ctx->scatterbuffer_size, cde_ctx->scatterbuffer_vaddr);
1140
1141 /* always take the postfence as it is needed for protecting the
1142 * cde context */
1143 flags = __flags | NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET;
1144
1145 /* gk20a_cde_execute_buffer() will grab a power reference of its own */
1146 gk20a_idle(g);
1147
1148 /* execute the conversion buffer, combined with init first if it's the
1149 * first time */
1150 err = gk20a_cde_execute_buffer(cde_ctx,
1151 cde_ctx->init_cmd_executed
1152 ? TYPE_BUF_COMMAND_CONVERT
1153 : TYPE_BUF_COMMAND_INIT,
1154 fence, flags, fence_out);
1155
1156 cde_ctx->init_cmd_executed = true;
1157
1158 /* unmap the buffers - channel holds references to them now */
1159 nvgpu_vm_unmap(cde_ctx->vm, map_vaddr);
1160
1161 return err;
1162
1163exit_unmap_surface:
1164 if (surface)
1165 dma_buf_vunmap(compbits_scatter_buf, surface);
1166exit_unmap_vaddr:
1167 nvgpu_vm_unmap(cde_ctx->vm, map_vaddr);
1168exit_idle:
1169 gk20a_idle(g);
1170 return err;
1171}
1172
1173static void gk20a_cde_finished_ctx_cb(struct channel_gk20a *ch, void *data)
1174__acquires(&cde_app->mutex)
1175__releases(&cde_app->mutex)
1176{
1177 struct gk20a_cde_ctx *cde_ctx = data;
1178 struct nvgpu_os_linux *l = cde_ctx->l;
1179 struct gk20a *g = &l->g;
1180 struct gk20a_cde_app *cde_app = &l->cde_app;
1181 bool channel_idle;
1182
1183 channel_gk20a_joblist_lock(ch);
1184 channel_idle = channel_gk20a_joblist_is_empty(ch);
1185 channel_gk20a_joblist_unlock(ch);
1186
1187 if (!channel_idle)
1188 return;
1189
1190 trace_gk20a_cde_finished_ctx_cb(cde_ctx);
1191 gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: finished %p", cde_ctx);
1192 if (!cde_ctx->in_use)
1193 gk20a_dbg_info("double finish cde context %p on channel %p",
1194 cde_ctx, ch);
1195
1196 if (ch->has_timedout) {
1197 if (cde_ctx->is_temporary) {
1198 nvgpu_warn(g,
1199 "cde: channel had timed out"
1200 " (temporary channel)");
1201 /* going to be deleted anyway */
1202 } else {
1203 nvgpu_warn(g,
1204 "cde: channel had timed out"
1205 ", reloading");
1206 /* mark it to be deleted, replace with a new one */
1207 nvgpu_mutex_acquire(&cde_app->mutex);
1208 cde_ctx->is_temporary = true;
1209 if (gk20a_cde_create_context(l)) {
1210 nvgpu_err(g, "cde: can't replace context");
1211 }
1212 nvgpu_mutex_release(&cde_app->mutex);
1213 }
1214 }
1215
1216 /* delete temporary contexts later (watch for doubles) */
1217 if (cde_ctx->is_temporary && cde_ctx->in_use) {
1218 WARN_ON(delayed_work_pending(&cde_ctx->ctx_deleter_work));
1219 schedule_delayed_work(&cde_ctx->ctx_deleter_work,
1220 msecs_to_jiffies(CTX_DELETE_TIME));
1221 }
1222
1223 if (!ch->has_timedout)
1224 gk20a_cde_ctx_release(cde_ctx);
1225}
1226
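/*
 * Load the gpu2cde firmware, open a dedicated channel bound to the CDE
 * VM, map the compbit backing store and initialise the command buffers
 * from the firmware image.
 */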
1227static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
1228{
1229 struct nvgpu_os_linux *l = cde_ctx->l;
1230 struct gk20a *g = &l->g;
1231 struct nvgpu_firmware *img;
1232 struct channel_gk20a *ch;
1233 struct gr_gk20a *gr = &g->gr;
1234 int err = 0;
1235 u64 vaddr;
1236
1237 img = nvgpu_request_firmware(g, "gpu2cde.bin", 0);
1238 if (!img) {
1239 nvgpu_err(g, "cde: could not fetch the firmware");
1240 return -ENOSYS;
1241 }
1242
1243 ch = gk20a_open_new_channel_with_cb(g, gk20a_cde_finished_ctx_cb,
1244 cde_ctx,
1245 -1,
1246 false);
1247 if (!ch) {
1248 nvgpu_warn(g, "cde: gk20a channel not available");
1249 err = -ENOMEM;
1250 goto err_get_gk20a_channel;
1251 }
1252
1253 /* bind the channel to the vm */
1254 err = __gk20a_vm_bind_channel(g->mm.cde.vm, ch);
1255 if (err) {
1256 nvgpu_warn(g, "cde: could not bind vm");
1257 goto err_commit_va;
1258 }
1259
1260 /* allocate gpfifo (1024 should be more than enough) */
1261 err = gk20a_channel_alloc_gpfifo(ch, 1024, 0, 0);
1262 if (err) {
1263 nvgpu_warn(g, "cde: unable to allocate gpfifo");
1264 goto err_alloc_gpfifo;
1265 }
1266
1267 /* map backing store to gpu virtual space */
1268 vaddr = nvgpu_gmmu_map(ch->vm, &gr->compbit_store.mem,
1269 g->gr.compbit_store.mem.size,
1270 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
1271 gk20a_mem_flag_read_only,
1272 false,
1273 gr->compbit_store.mem.aperture);
1274
1275 if (!vaddr) {
1276 nvgpu_warn(g, "cde: cannot map compression bit backing store");
1277 err = -ENOMEM;
1278 goto err_map_backingstore;
1279 }
1280
1281 /* store initialisation data */
1282 cde_ctx->ch = ch;
1283 cde_ctx->vm = ch->vm;
1284 cde_ctx->backing_store_vaddr = vaddr;
1285
1286 /* initialise the firmware */
1287 err = gk20a_init_cde_img(cde_ctx, img);
1288 if (err) {
1289 nvgpu_warn(g, "cde: image initialisation failed");
1290 goto err_init_cde_img;
1291 }
1292
1293 /* initialisation done */
1294 nvgpu_release_firmware(g, img);
1295
1296 return 0;
1297
1298err_init_cde_img:
1299 nvgpu_gmmu_unmap(ch->vm, &g->gr.compbit_store.mem, vaddr);
1300err_map_backingstore:
1301err_alloc_gpfifo:
1302 nvgpu_vm_put(ch->vm);
1303err_commit_va:
1304err_get_gk20a_channel:
1305 nvgpu_release_firmware(g, img);
1306 nvgpu_err(g, "cde: couldn't initialise buffer converter: %d", err);
1307 return err;
1308}
1309
1310int gk20a_cde_reload(struct nvgpu_os_linux *l)
1311__acquires(&l->cde_app->mutex)
1312__releases(&l->cde_app->mutex)
1313{
1314 struct gk20a *g = &l->g;
1315 struct gk20a_cde_app *cde_app = &l->cde_app;
1316 int err;
1317
1318 if (!cde_app->initialised)
1319 return -ENOSYS;
1320
1321 err = gk20a_busy(g);
1322 if (err)
1323 return err;
1324
1325 nvgpu_mutex_acquire(&cde_app->mutex);
1326
1327 gk20a_cde_stop(l);
1328
1329 err = gk20a_cde_create_contexts(l);
1330 if (!err)
1331 cde_app->initialised = true;
1332
1333 nvgpu_mutex_release(&cde_app->mutex);
1334
1335 gk20a_idle(g);
1336 return err;
1337}
1338
1339int gk20a_init_cde_support(struct nvgpu_os_linux *l)
1340__acquires(&cde_app->mutex)
1341__releases(&cde_app->mutex)
1342{
1343 struct gk20a_cde_app *cde_app = &l->cde_app;
1344 int err;
1345
1346 if (cde_app->initialised)
1347 return 0;
1348
1349 gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: init");
1350
1351 err = nvgpu_mutex_init(&cde_app->mutex);
1352 if (err)
1353 return err;
1354
1355 nvgpu_mutex_acquire(&cde_app->mutex);
1356
1357 nvgpu_init_list_node(&cde_app->free_contexts);
1358 nvgpu_init_list_node(&cde_app->used_contexts);
1359 cde_app->ctx_count = 0;
1360 cde_app->ctx_count_top = 0;
1361 cde_app->ctx_usecount = 0;
1362
1363 err = gk20a_cde_create_contexts(l);
1364 if (!err)
1365 cde_app->initialised = true;
1366
1367 nvgpu_mutex_release(&cde_app->mutex);
1368 gk20a_dbg(gpu_dbg_cde_ctx, "cde: init finished: %d", err);
1369
1370 if (err)
1371 nvgpu_mutex_destroy(&cde_app->mutex);
1372
1373 return err;
1374}
1375
1376enum cde_launch_patch_id {
1377 PATCH_H_QMD_CTA_RASTER_WIDTH_ID = 1024,
1378 PATCH_H_QMD_CTA_RASTER_HEIGHT_ID = 1025,
1379 PATCH_QMD_CTA_RASTER_DEPTH_ID = 1026, /* for firmware v0 only */
1380 PATCH_QMD_CTA_THREAD_DIMENSION0_ID = 1027,
1381 PATCH_QMD_CTA_THREAD_DIMENSION1_ID = 1028,
1382 PATCH_QMD_CTA_THREAD_DIMENSION2_ID = 1029, /* for firmware v0 only */
1383 PATCH_USER_CONST_XTILES_ID = 1030, /* for firmware v0 only */
1384 PATCH_USER_CONST_YTILES_ID = 1031, /* for firmware v0 only */
1385 PATCH_USER_CONST_BLOCKHEIGHTLOG2_ID = 1032,
1386 PATCH_USER_CONST_DSTPITCH_ID = 1033, /* for firmware v0 only */
1387 PATCH_H_USER_CONST_FLAGS_ID = 1034, /* for firmware v0 only */
1388 PATCH_H_VPC_CURRENT_GRID_SIZE_X_ID = 1035,
1389 PATCH_H_VPC_CURRENT_GRID_SIZE_Y_ID = 1036,
1390 PATCH_H_VPC_CURRENT_GRID_SIZE_Z_ID = 1037,
1391 PATCH_VPC_CURRENT_GROUP_SIZE_X_ID = 1038,
1392 PATCH_VPC_CURRENT_GROUP_SIZE_Y_ID = 1039,
1393 PATCH_VPC_CURRENT_GROUP_SIZE_Z_ID = 1040,
1394 PATCH_USER_CONST_XBLOCKS_ID = 1041,
1395 PATCH_H_USER_CONST_DSTOFFSET_ID = 1042,
1396 PATCH_V_QMD_CTA_RASTER_WIDTH_ID = 1043,
1397 PATCH_V_QMD_CTA_RASTER_HEIGHT_ID = 1044,
1398 PATCH_V_USER_CONST_DSTOFFSET_ID = 1045,
1399 PATCH_V_VPC_CURRENT_GRID_SIZE_X_ID = 1046,
1400 PATCH_V_VPC_CURRENT_GRID_SIZE_Y_ID = 1047,
1401 PATCH_V_VPC_CURRENT_GRID_SIZE_Z_ID = 1048,
1402 PATCH_H_LAUNCH_WORD1_ID = 1049,
1403 PATCH_H_LAUNCH_WORD2_ID = 1050,
1404 PATCH_V_LAUNCH_WORD1_ID = 1051,
1405 PATCH_V_LAUNCH_WORD2_ID = 1052,
1406 PATCH_H_QMD_PROGRAM_OFFSET_ID = 1053,
1407 PATCH_H_QMD_REGISTER_COUNT_ID = 1054,
1408 PATCH_V_QMD_PROGRAM_OFFSET_ID = 1055,
1409 PATCH_V_QMD_REGISTER_COUNT_ID = 1056,
1410};
1411
1412/* maximum number of WRITE_PATCHes in the below function */
1413#define MAX_CDE_LAUNCH_PATCHES 32
1414
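/*
 * Firmware v1 conversion: compute the tile and grid dimensions for the
 * horizontal and vertical passes, emit the launch patches and hand the
 * work to gk20a_cde_convert().
 */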
1415static int gk20a_buffer_convert_gpu_to_cde_v1(
1416 struct nvgpu_os_linux *l,
1417 struct dma_buf *dmabuf, u32 consumer,
1418 u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
1419 u64 scatterbuffer_offset,
1420 u32 width, u32 height, u32 block_height_log2,
1421 u32 submit_flags, struct nvgpu_fence *fence_in,
1422 struct gk20a_buffer_state *state)
1423{
1424 struct gk20a *g = &l->g;
1425 struct gk20a_cde_param params[MAX_CDE_LAUNCH_PATCHES];
1426 int param = 0;
1427 int err = 0;
1428 struct gk20a_fence *new_fence = NULL;
1429 const int wgx = 8;
1430 const int wgy = 8;
1431 const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */
1432 const int xalign = compbits_per_byte * wgx;
1433 const int yalign = wgy;
1434
1435 /* Compute per launch parameters */
1436 const int xtiles = (width + 7) >> 3;
1437 const int ytiles = (height + 7) >> 3;
1438 const int gridw_h = roundup(xtiles, xalign) / xalign;
1439 const int gridh_h = roundup(ytiles, yalign) / yalign;
1440 const int gridw_v = roundup(ytiles, xalign) / xalign;
1441 const int gridh_v = roundup(xtiles, yalign) / yalign;
1442 const int xblocks = (xtiles + 1) >> 1;
1443 const int voffset = compbits_voffset - compbits_hoffset;
1444
1445 int hprog = -1;
1446 int vprog = -1;
1447
1448 if (g->ops.cde.get_program_numbers) {
1449 g->ops.cde.get_program_numbers(g, block_height_log2,
1450 l->cde_app.shader_parameter,
1451 &hprog, &vprog);
1452 } else {
1453 nvgpu_warn(g, "cde: chip not supported");
1454 return -ENOSYS;
1455 }
1456
1457 if (hprog < 0 || vprog < 0) {
1458 nvgpu_warn(g, "cde: could not determine programs");
1459 return -ENOSYS;
1460 }
1461
1462 if (xtiles > 8192 / 8 || ytiles > 8192 / 8)
1463 nvgpu_warn(g, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)",
1464 xtiles, ytiles);
1465
1466 gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_hoffset=0x%llx, compbits_voffset=0x%llx, scatterbuffer_offset=0x%llx",
1467 width, height, block_height_log2,
1468 compbits_hoffset, compbits_voffset, scatterbuffer_offset);
1469 gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d)",
1470 width, height, xtiles, ytiles);
1471 gk20a_dbg(gpu_dbg_cde, "group (%d, %d) gridH (%d, %d) gridV (%d, %d)",
1472 wgx, wgy, gridw_h, gridh_h, gridw_v, gridh_v);
1473 gk20a_dbg(gpu_dbg_cde, "hprog=%d, offset=0x%x, regs=%d, vprog=%d, offset=0x%x, regs=%d",
1474 hprog,
1475 l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog],
1476 l->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog],
1477 vprog,
1478 l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog],
1479 l->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);
1480
1481 /* Write parameters */
1482#define WRITE_PATCH(NAME, VALUE) \
1483 params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE}
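/* e.g. WRITE_PATCH(PATCH_H_LAUNCH_WORD1, x) expands to
 * params[param++] = (struct gk20a_cde_param){PATCH_H_LAUNCH_WORD1_ID, 0, x} */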
1484 WRITE_PATCH(PATCH_USER_CONST_XBLOCKS, xblocks);
1485 WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2,
1486 block_height_log2);
1487 WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx);
1488 WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy);
1489 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx);
1490 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy);
1491 WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1);
1492
1493 WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_WIDTH, gridw_h);
1494 WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_HEIGHT, gridh_h);
1495 WRITE_PATCH(PATCH_H_USER_CONST_DSTOFFSET, 0);
1496 WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_X, gridw_h);
1497 WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Y, gridh_h);
1498 WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Z, 1);
1499
1500 WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_WIDTH, gridw_v);
1501 WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_HEIGHT, gridh_v);
1502 WRITE_PATCH(PATCH_V_USER_CONST_DSTOFFSET, voffset);
1503 WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_X, gridw_v);
1504 WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Y, gridh_v);
1505 WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Z, 1);
1506
1507 WRITE_PATCH(PATCH_H_QMD_PROGRAM_OFFSET,
1508 l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog]);
1509 WRITE_PATCH(PATCH_H_QMD_REGISTER_COUNT,
1510 l->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog]);
1511 WRITE_PATCH(PATCH_V_QMD_PROGRAM_OFFSET,
1512 l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog]);
1513 WRITE_PATCH(PATCH_V_QMD_REGISTER_COUNT,
1514 l->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);
1515
1516 if (consumer & NVGPU_GPU_COMPBITS_CDEH) {
1517 WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
1518 l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
1519 WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
1520 l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
1521 } else {
1522 WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
1523 l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
1524 WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
1525 l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
1526 }
1527
1528 if (consumer & NVGPU_GPU_COMPBITS_CDEV) {
1529 WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
1530 l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
1531 WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
1532 l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
1533 } else {
1534 WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
1535 l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
1536 WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
1537 l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
1538 }
1539#undef WRITE_PATCH
1540
1541 err = gk20a_cde_convert(l, dmabuf,
1542 compbits_hoffset,
1543 scatterbuffer_offset,
1544 fence_in, submit_flags,
1545 params, param, &new_fence);
1546 if (err)
1547 goto out;
1548
1549 /* compbits generated, update state & fence */
1550 gk20a_fence_put(state->fence);
1551 state->fence = new_fence;
1552 state->valid_compbits |= consumer &
1553 (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV);
1554out:
1555 return err;
1556}
1557
1558static int gk20a_buffer_convert_gpu_to_cde(
1559 struct nvgpu_os_linux *l, struct dma_buf *dmabuf, u32 consumer,
1560 u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
1561 u64 scatterbuffer_offset,
1562 u32 width, u32 height, u32 block_height_log2,
1563 u32 submit_flags, struct nvgpu_fence *fence_in,
1564 struct gk20a_buffer_state *state)
1565{
1566 struct gk20a *g = &l->g;
1567 int err = 0;
1568
1569 if (!l->cde_app.initialised)
1570 return -ENOSYS;
1571
1572 gk20a_dbg(gpu_dbg_cde, "firmware version = %d\n",
1573 l->cde_app.firmware_version);
1574
1575 if (l->cde_app.firmware_version == 1) {
1576 err = gk20a_buffer_convert_gpu_to_cde_v1(
1577 l, dmabuf, consumer, offset, compbits_hoffset,
1578 compbits_voffset, scatterbuffer_offset,
1579 width, height, block_height_log2,
1580 submit_flags, fence_in, state);
1581 } else {
1582 nvgpu_err(g, "unsupported CDE firmware version %d",
1583 l->cde_app.firmware_version);
1584 err = -EINVAL;
1585 }
1586
1587 return err;
1588}
1589
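/*
 * Entry point for the compressible-read path: look up the buffer state
 * for this dmabuf and, if CDEH/CDEV compbits are requested but missing
 * while GPU compbits exist, run the CDE conversion; return the current
 * fence and compbit state.
 */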
1590int gk20a_prepare_compressible_read(
1591 struct nvgpu_os_linux *l, u32 buffer_fd, u32 request, u64 offset,
1592 u64 compbits_hoffset, u64 compbits_voffset,
1593 u64 scatterbuffer_offset,
1594 u32 width, u32 height, u32 block_height_log2,
1595 u32 submit_flags, struct nvgpu_fence *fence,
1596 u32 *valid_compbits, u32 *zbc_color,
1597 struct gk20a_fence **fence_out)
1598{
1599 struct gk20a *g = &l->g;
1600 int err = 0;
1601 struct gk20a_buffer_state *state;
1602 struct dma_buf *dmabuf;
1603 u32 missing_bits;
1604
1605 dmabuf = dma_buf_get(buffer_fd);
1606 if (IS_ERR(dmabuf))
1607 return -EINVAL;
1608
1609 err = gk20a_dmabuf_get_state(dmabuf, g, offset, &state);
1610 if (err) {
1611 dma_buf_put(dmabuf);
1612 return err;
1613 }
1614
1615 missing_bits = (state->valid_compbits ^ request) & request;
1616
1617 nvgpu_mutex_acquire(&state->lock);
1618
1619 if (state->valid_compbits && request == NVGPU_GPU_COMPBITS_NONE) {
1620
1621 gk20a_fence_put(state->fence);
1622 state->fence = NULL;
1623 /* state->fence = decompress();
1624 state->valid_compbits = 0; */
1625 err = -EINVAL;
1626 goto out;
1627 } else if (missing_bits) {
1628 u32 missing_cde_bits = missing_bits &
1629 (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV);
1630 if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) &&
1631 missing_cde_bits) {
1632 err = gk20a_buffer_convert_gpu_to_cde(
1633 l, dmabuf,
1634 missing_cde_bits,
1635 offset, compbits_hoffset,
1636 compbits_voffset, scatterbuffer_offset,
1637 width, height, block_height_log2,
1638 submit_flags, fence,
1639 state);
1640 if (err)
1641 goto out;
1642 }
1643 }
1644
1645 if (state->fence && fence_out)
1646 *fence_out = gk20a_fence_get(state->fence);
1647
1648 if (valid_compbits)
1649 *valid_compbits = state->valid_compbits;
1650
1651 if (zbc_color)
1652 *zbc_color = state->zbc_color;
1653
1654out:
1655 nvgpu_mutex_release(&state->lock);
1656 dma_buf_put(dmabuf);
1657 return err;
1658}
1659
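/*
 * Record which compbits are valid after a write and drop the fence of
 * any previous compbit job.
 */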
1660int gk20a_mark_compressible_write(struct gk20a *g, u32 buffer_fd,
1661 u32 valid_compbits, u64 offset, u32 zbc_color)
1662{
1663 int err;
1664 struct gk20a_buffer_state *state;
1665 struct dma_buf *dmabuf;
1666
1667 dmabuf = dma_buf_get(buffer_fd);
1668 if (IS_ERR(dmabuf)) {
1669 nvgpu_err(g, "invalid dmabuf");
1670 return -EINVAL;
1671 }
1672
1673 err = gk20a_dmabuf_get_state(dmabuf, g, offset, &state);
1674 if (err) {
1675 nvgpu_err(g, "could not get state from dmabuf");
1676 dma_buf_put(dmabuf);
1677 return err;
1678 }
1679
1680 nvgpu_mutex_acquire(&state->lock);
1681
1682 /* Update the compbits state. */
1683 state->valid_compbits = valid_compbits;
1684 state->zbc_color = zbc_color;
1685
1686 /* Discard previous compbit job fence. */
1687 gk20a_fence_put(state->fence);
1688 state->fence = NULL;
1689
1690 nvgpu_mutex_release(&state->lock);
1691 dma_buf_put(dmabuf);
1692 return 0;
1693}