Diffstat (limited to 'include/os/linux/cde.c')
-rw-r--r-- | include/os/linux/cde.c | 1794 |
1 files changed, 1794 insertions, 0 deletions
diff --git a/include/os/linux/cde.c b/include/os/linux/cde.c
new file mode 100644
index 0000000..715513c
--- /dev/null
+++ b/include/os/linux/cde.c
@@ -0,0 +1,1794 @@ | |||
1 | /* | ||
2 | * Color decompression engine support | ||
3 | * | ||
4 | * Copyright (c) 2014-2018, NVIDIA Corporation. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms and conditions of the GNU General Public License, | ||
8 | * version 2, as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
13 | * more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
17 | */ | ||
18 | |||
19 | #include <linux/dma-mapping.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/dma-buf.h> | ||
22 | #include <uapi/linux/nvgpu.h> | ||
23 | |||
24 | #include <trace/events/gk20a.h> | ||
25 | |||
26 | #include <nvgpu/dma.h> | ||
27 | #include <nvgpu/gmmu.h> | ||
28 | #include <nvgpu/timers.h> | ||
29 | #include <nvgpu/nvgpu_common.h> | ||
30 | #include <nvgpu/kmem.h> | ||
31 | #include <nvgpu/log.h> | ||
32 | #include <nvgpu/bug.h> | ||
33 | #include <nvgpu/firmware.h> | ||
34 | #include <nvgpu/os_sched.h> | ||
35 | #include <nvgpu/channel.h> | ||
36 | #include <nvgpu/utils.h> | ||
37 | #include <nvgpu/gk20a.h> | ||
38 | |||
39 | #include <nvgpu/linux/vm.h> | ||
40 | |||
41 | #include "gk20a/mm_gk20a.h" | ||
42 | #include "gk20a/fence_gk20a.h" | ||
43 | #include "gk20a/gr_gk20a.h" | ||
44 | |||
45 | #include "cde.h" | ||
46 | #include "os_linux.h" | ||
47 | #include "dmabuf.h" | ||
48 | #include "channel.h" | ||
49 | #include "cde_gm20b.h" | ||
50 | #include "cde_gp10b.h" | ||
51 | |||
52 | #include <nvgpu/hw/gk20a/hw_ccsr_gk20a.h> | ||
53 | #include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h> | ||
54 | |||
55 | static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx); | ||
56 | static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct nvgpu_os_linux *l); | ||
57 | |||
58 | #define CTX_DELETE_TIME 1000 | ||
59 | |||
60 | #define MAX_CTX_USE_COUNT 42 | ||
61 | #define MAX_CTX_RETRY_TIME 2000 | ||
62 | |||
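/* Look up the base IO address (first scatterlist entry) of the buffer that
 * backs a GPU virtual address; returns 0 if the address is not mapped in
 * this VM. */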
63 | static dma_addr_t gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr) | ||
64 | { | ||
65 | struct nvgpu_mapped_buf *buffer; | ||
66 | dma_addr_t addr = 0; | ||
67 | struct gk20a *g = gk20a_from_vm(vm); | ||
68 | |||
69 | nvgpu_mutex_acquire(&vm->update_gmmu_lock); | ||
70 | buffer = __nvgpu_vm_find_mapped_buf(vm, gpu_vaddr); | ||
71 | if (buffer) | ||
72 | addr = nvgpu_mem_get_addr_sgl(g, buffer->os_priv.sgt->sgl); | ||
73 | nvgpu_mutex_release(&vm->update_gmmu_lock); | ||
74 | |||
75 | return addr; | ||
76 | } | ||
77 | |||
78 | static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx) | ||
79 | { | ||
80 | unsigned int i; | ||
81 | |||
82 | for (i = 0; i < cde_ctx->num_bufs; i++) { | ||
83 | struct nvgpu_mem *mem = cde_ctx->mem + i; | ||
84 | nvgpu_dma_unmap_free(cde_ctx->vm, mem); | ||
85 | } | ||
86 | |||
87 | nvgpu_kfree(&cde_ctx->l->g, cde_ctx->init_convert_cmd); | ||
88 | |||
89 | cde_ctx->convert_cmd = NULL; | ||
90 | cde_ctx->init_convert_cmd = NULL; | ||
91 | cde_ctx->num_bufs = 0; | ||
92 | cde_ctx->num_params = 0; | ||
93 | cde_ctx->init_cmd_num_entries = 0; | ||
94 | cde_ctx->convert_cmd_num_entries = 0; | ||
95 | cde_ctx->init_cmd_executed = false; | ||
96 | } | ||
97 | |||
98 | static void gk20a_cde_remove_ctx(struct gk20a_cde_ctx *cde_ctx) | ||
99 | __must_hold(&cde_app->mutex) | ||
100 | { | ||
101 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
102 | struct gk20a *g = &l->g; | ||
103 | struct channel_gk20a *ch = cde_ctx->ch; | ||
104 | struct vm_gk20a *vm = ch->vm; | ||
105 | |||
106 | trace_gk20a_cde_remove_ctx(cde_ctx); | ||
107 | |||
108 | /* release mapped memory */ | ||
109 | gk20a_deinit_cde_img(cde_ctx); | ||
110 | nvgpu_gmmu_unmap(vm, &g->gr.compbit_store.mem, | ||
111 | cde_ctx->backing_store_vaddr); | ||
112 | |||
113 | /* | ||
114 | * free the channel | ||
115 | * gk20a_channel_close() will also unbind the channel from TSG | ||
116 | */ | ||
117 | gk20a_channel_close(ch); | ||
118 | nvgpu_ref_put(&cde_ctx->tsg->refcount, gk20a_tsg_release); | ||
119 | |||
120 | /* housekeeping on app */ | ||
121 | nvgpu_list_del(&cde_ctx->list); | ||
122 | l->cde_app.ctx_count--; | ||
123 | nvgpu_kfree(g, cde_ctx); | ||
124 | } | ||
125 | |||
126 | static void gk20a_cde_cancel_deleter(struct gk20a_cde_ctx *cde_ctx, | ||
127 | bool wait_finish) | ||
128 | __releases(&cde_app->mutex) | ||
129 | __acquires(&cde_app->mutex) | ||
130 | { | ||
131 | struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app; | ||
132 | |||
133 | /* permanent contexts do not have a deleter work to cancel */ | ||
134 | if (!cde_ctx->is_temporary) | ||
135 | return; | ||
136 | |||
137 | if (wait_finish) { | ||
138 | nvgpu_mutex_release(&cde_app->mutex); | ||
139 | cancel_delayed_work_sync(&cde_ctx->ctx_deleter_work); | ||
140 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
141 | } else { | ||
142 | cancel_delayed_work(&cde_ctx->ctx_deleter_work); | ||
143 | } | ||
144 | } | ||
145 | |||
146 | static void gk20a_cde_remove_contexts(struct nvgpu_os_linux *l) | ||
147 | __must_hold(&l->cde_app->mutex) | ||
148 | { | ||
149 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
150 | struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save; | ||
151 | |||
152 | /* it is safe to drop the mutex in cancel_deleter since the app is | ||
153 | * deinitialised and no new jobs are started; any pending deleter work | ||
154 | * is either waiting for the mutex or will abort before taking it */ | ||
155 | |||
156 | nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save, | ||
157 | &cde_app->free_contexts, gk20a_cde_ctx, list) { | ||
158 | gk20a_cde_cancel_deleter(cde_ctx, true); | ||
159 | gk20a_cde_remove_ctx(cde_ctx); | ||
160 | } | ||
161 | |||
162 | nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save, | ||
163 | &cde_app->used_contexts, gk20a_cde_ctx, list) { | ||
164 | gk20a_cde_cancel_deleter(cde_ctx, true); | ||
165 | gk20a_cde_remove_ctx(cde_ctx); | ||
166 | } | ||
167 | } | ||
168 | |||
169 | static void gk20a_cde_stop(struct nvgpu_os_linux *l) | ||
170 | __must_hold(&l->cde_app->mutex) | ||
171 | { | ||
172 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
173 | |||
174 | /* prevent further conversions and delayed works from running */ | ||
175 | cde_app->initialised = false; | ||
176 | /* free all data, empty the list */ | ||
177 | gk20a_cde_remove_contexts(l); | ||
178 | } | ||
179 | |||
180 | void gk20a_cde_destroy(struct nvgpu_os_linux *l) | ||
181 | __acquires(&l->cde_app->mutex) | ||
182 | __releases(&l->cde_app->mutex) | ||
183 | { | ||
184 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
185 | |||
186 | if (!cde_app->initialised) | ||
187 | return; | ||
188 | |||
189 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
190 | gk20a_cde_stop(l); | ||
191 | nvgpu_mutex_release(&cde_app->mutex); | ||
192 | |||
193 | nvgpu_mutex_destroy(&cde_app->mutex); | ||
194 | } | ||
195 | |||
196 | void gk20a_cde_suspend(struct nvgpu_os_linux *l) | ||
197 | __acquires(&l->cde_app->mutex) | ||
198 | __releases(&l->cde_app->mutex) | ||
199 | { | ||
200 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
201 | struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save; | ||
202 | |||
203 | if (!cde_app->initialised) | ||
204 | return; | ||
205 | |||
206 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
207 | |||
208 | nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save, | ||
209 | &cde_app->free_contexts, gk20a_cde_ctx, list) { | ||
210 | gk20a_cde_cancel_deleter(cde_ctx, false); | ||
211 | } | ||
212 | |||
213 | nvgpu_list_for_each_entry_safe(cde_ctx, cde_ctx_save, | ||
214 | &cde_app->used_contexts, gk20a_cde_ctx, list) { | ||
215 | gk20a_cde_cancel_deleter(cde_ctx, false); | ||
216 | } | ||
217 | |||
218 | nvgpu_mutex_release(&cde_app->mutex); | ||
219 | |||
220 | } | ||
221 | |||
222 | static int gk20a_cde_create_context(struct nvgpu_os_linux *l) | ||
223 | __must_hold(&l->cde_app->mutex) | ||
224 | { | ||
225 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
226 | struct gk20a_cde_ctx *cde_ctx; | ||
227 | |||
228 | cde_ctx = gk20a_cde_allocate_context(l); | ||
229 | if (IS_ERR(cde_ctx)) | ||
230 | return PTR_ERR(cde_ctx); | ||
231 | |||
232 | nvgpu_list_add(&cde_ctx->list, &cde_app->free_contexts); | ||
233 | cde_app->ctx_count++; | ||
234 | if (cde_app->ctx_count > cde_app->ctx_count_top) | ||
235 | cde_app->ctx_count_top = cde_app->ctx_count; | ||
236 | |||
237 | return 0; | ||
238 | } | ||
239 | |||
240 | static int gk20a_cde_create_contexts(struct nvgpu_os_linux *l) | ||
241 | __must_hold(&l->cde_app->mutex) | ||
242 | { | ||
243 | int err; | ||
244 | int i; | ||
245 | |||
246 | for (i = 0; i < NUM_CDE_CONTEXTS; i++) { | ||
247 | err = gk20a_cde_create_context(l); | ||
248 | if (err) | ||
249 | goto out; | ||
250 | } | ||
251 | |||
252 | return 0; | ||
253 | out: | ||
254 | gk20a_cde_remove_contexts(l); | ||
255 | return err; | ||
256 | } | ||
257 | |||
258 | static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx, | ||
259 | struct nvgpu_firmware *img, | ||
260 | struct gk20a_cde_hdr_buf *buf) | ||
261 | { | ||
262 | struct nvgpu_mem *mem; | ||
263 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
264 | struct gk20a *g = &l->g; | ||
265 | int err; | ||
266 | |||
267 | /* check that the file can hold the buf */ | ||
268 | if (buf->data_byte_offset != 0 && | ||
269 | buf->data_byte_offset + buf->num_bytes > img->size) { | ||
270 | nvgpu_warn(g, "cde: invalid data section. buffer idx = %d", | ||
271 | cde_ctx->num_bufs); | ||
272 | return -EINVAL; | ||
273 | } | ||
274 | |||
275 | /* check that we have enough buf elems available */ | ||
276 | if (cde_ctx->num_bufs >= MAX_CDE_BUFS) { | ||
277 | nvgpu_warn(g, "cde: invalid data section. buffer idx = %d", | ||
278 | cde_ctx->num_bufs); | ||
279 | return -ENOMEM; | ||
280 | } | ||
281 | |||
282 | /* allocate buf */ | ||
283 | mem = cde_ctx->mem + cde_ctx->num_bufs; | ||
284 | err = nvgpu_dma_alloc_map_sys(cde_ctx->vm, buf->num_bytes, mem); | ||
285 | if (err) { | ||
286 | nvgpu_warn(g, "cde: could not allocate device memory. buffer idx = %d", | ||
287 | cde_ctx->num_bufs); | ||
288 | return -ENOMEM; | ||
289 | } | ||
290 | |||
291 | /* copy the content */ | ||
292 | if (buf->data_byte_offset != 0) | ||
293 | memcpy(mem->cpu_va, img->data + buf->data_byte_offset, | ||
294 | buf->num_bytes); | ||
295 | |||
296 | cde_ctx->num_bufs++; | ||
297 | |||
298 | return 0; | ||
299 | } | ||
300 | |||
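/* Patch one value into a command buffer word: shift and mask the new value,
 * merge it with the data already at the target (honouring the u32/u64
 * little/big parameter type, the big variant swapping 32-bit halves) and
 * write the result back. */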
301 | static int gk20a_replace_data(struct gk20a_cde_ctx *cde_ctx, void *target, | ||
302 | int type, s32 shift, u64 mask, u64 value) | ||
303 | { | ||
304 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
305 | struct gk20a *g = &l->g; | ||
306 | u32 *target_mem_ptr = target; | ||
307 | u64 *target_mem_ptr_u64 = target; | ||
308 | u64 current_value, new_value; | ||
309 | |||
310 | value = (shift >= 0) ? value << shift : value >> -shift; | ||
311 | value &= mask; | ||
312 | |||
313 | /* read current data from the location */ | ||
314 | current_value = 0; | ||
315 | if (type == TYPE_PARAM_TYPE_U32) { | ||
316 | if (mask != 0xfffffffful) | ||
317 | current_value = *target_mem_ptr; | ||
318 | } else if (type == TYPE_PARAM_TYPE_U64_LITTLE) { | ||
319 | if (mask != ~0ul) | ||
320 | current_value = *target_mem_ptr_u64; | ||
321 | } else if (type == TYPE_PARAM_TYPE_U64_BIG) { | ||
322 | current_value = *target_mem_ptr_u64; | ||
323 | current_value = (u64)(current_value >> 32) | | ||
324 | (u64)(current_value << 32); | ||
325 | } else { | ||
326 | nvgpu_warn(g, "cde: unknown type. type=%d", | ||
327 | type); | ||
328 | return -EINVAL; | ||
329 | } | ||
330 | |||
331 | current_value &= ~mask; | ||
332 | new_value = current_value | value; | ||
333 | |||
334 | /* store the element data back */ | ||
335 | if (type == TYPE_PARAM_TYPE_U32) | ||
336 | *target_mem_ptr = (u32)new_value; | ||
337 | else if (type == TYPE_PARAM_TYPE_U64_LITTLE) | ||
338 | *target_mem_ptr_u64 = new_value; | ||
339 | else { | ||
340 | new_value = (u64)(new_value >> 32) | | ||
341 | (u64)(new_value << 32); | ||
342 | *target_mem_ptr_u64 = new_value; | ||
343 | } | ||
344 | |||
345 | return 0; | ||
346 | } | ||
347 | |||
348 | static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx, | ||
349 | struct nvgpu_firmware *img, | ||
350 | struct gk20a_cde_hdr_replace *replace) | ||
351 | { | ||
352 | struct nvgpu_mem *source_mem; | ||
353 | struct nvgpu_mem *target_mem; | ||
354 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
355 | struct gk20a *g = &l->g; | ||
356 | u32 *target_mem_ptr; | ||
357 | u64 vaddr; | ||
358 | int err; | ||
359 | |||
360 | if (replace->target_buf >= cde_ctx->num_bufs || | ||
361 | replace->source_buf >= cde_ctx->num_bufs) { | ||
362 | nvgpu_warn(g, "cde: invalid buffer. target_buf=%u, source_buf=%u, num_bufs=%d", | ||
363 | replace->target_buf, replace->source_buf, | ||
364 | cde_ctx->num_bufs); | ||
365 | return -EINVAL; | ||
366 | } | ||
367 | |||
368 | source_mem = cde_ctx->mem + replace->source_buf; | ||
369 | target_mem = cde_ctx->mem + replace->target_buf; | ||
370 | target_mem_ptr = target_mem->cpu_va; | ||
371 | |||
372 | if (source_mem->size < (replace->source_byte_offset + 3) || | ||
373 | target_mem->size < (replace->target_byte_offset + 3)) { | ||
374 | nvgpu_warn(g, "cde: invalid buffer offsets. target_buf_offs=%lld, source_buf_offs=%lld, source_buf_size=%zu, dest_buf_size=%zu", | ||
375 | replace->target_byte_offset, | ||
376 | replace->source_byte_offset, | ||
377 | source_mem->size, | ||
378 | target_mem->size); | ||
379 | return -EINVAL; | ||
380 | } | ||
381 | |||
382 | /* calculate the target pointer */ | ||
383 | target_mem_ptr += (replace->target_byte_offset / sizeof(u32)); | ||
384 | |||
385 | /* determine patch value */ | ||
386 | vaddr = source_mem->gpu_va + replace->source_byte_offset; | ||
387 | err = gk20a_replace_data(cde_ctx, target_mem_ptr, replace->type, | ||
388 | replace->shift, replace->mask, | ||
389 | vaddr); | ||
390 | if (err) { | ||
391 | nvgpu_warn(g, "cde: replace failed. err=%d, target_buf=%u, target_buf_offs=%lld, source_buf=%u, source_buf_offs=%lld", | ||
392 | err, replace->target_buf, | ||
393 | replace->target_byte_offset, | ||
394 | replace->source_buf, | ||
395 | replace->source_byte_offset); | ||
396 | } | ||
397 | |||
398 | return err; | ||
399 | } | ||
400 | |||
401 | static int gk20a_cde_patch_params(struct gk20a_cde_ctx *cde_ctx) | ||
402 | { | ||
403 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
404 | struct gk20a *g = &l->g; | ||
405 | struct nvgpu_mem *target_mem; | ||
406 | u32 *target_mem_ptr; | ||
407 | u64 new_data; | ||
408 | int user_id = 0, err; | ||
409 | unsigned int i; | ||
410 | |||
411 | for (i = 0; i < cde_ctx->num_params; i++) { | ||
412 | struct gk20a_cde_hdr_param *param = cde_ctx->params + i; | ||
413 | target_mem = cde_ctx->mem + param->target_buf; | ||
414 | target_mem_ptr = target_mem->cpu_va; | ||
415 | target_mem_ptr += (param->target_byte_offset / sizeof(u32)); | ||
416 | |||
417 | switch (param->id) { | ||
418 | case TYPE_PARAM_COMPTAGS_PER_CACHELINE: | ||
419 | new_data = g->gr.comptags_per_cacheline; | ||
420 | break; | ||
421 | case TYPE_PARAM_GPU_CONFIGURATION: | ||
422 | new_data = (u64)g->ltc_count * g->gr.slices_per_ltc * | ||
423 | g->gr.cacheline_size; | ||
424 | break; | ||
425 | case TYPE_PARAM_FIRSTPAGEOFFSET: | ||
426 | new_data = cde_ctx->surf_param_offset; | ||
427 | break; | ||
428 | case TYPE_PARAM_NUMPAGES: | ||
429 | new_data = cde_ctx->surf_param_lines; | ||
430 | break; | ||
431 | case TYPE_PARAM_BACKINGSTORE: | ||
432 | new_data = cde_ctx->backing_store_vaddr; | ||
433 | break; | ||
434 | case TYPE_PARAM_DESTINATION: | ||
435 | new_data = cde_ctx->compbit_vaddr; | ||
436 | break; | ||
437 | case TYPE_PARAM_DESTINATION_SIZE: | ||
438 | new_data = cde_ctx->compbit_size; | ||
439 | break; | ||
440 | case TYPE_PARAM_BACKINGSTORE_SIZE: | ||
441 | new_data = g->gr.compbit_store.mem.size; | ||
442 | break; | ||
443 | case TYPE_PARAM_SOURCE_SMMU_ADDR: | ||
444 | new_data = gpuva_to_iova_base(cde_ctx->vm, | ||
445 | cde_ctx->surf_vaddr); | ||
446 | if (new_data == 0) { | ||
447 | nvgpu_warn(g, "cde: failed to find 0x%llx", | ||
448 | cde_ctx->surf_vaddr); | ||
449 | return -EINVAL; | ||
450 | } | ||
451 | break; | ||
452 | case TYPE_PARAM_BACKINGSTORE_BASE_HW: | ||
453 | new_data = g->gr.compbit_store.base_hw; | ||
454 | break; | ||
455 | case TYPE_PARAM_GOBS_PER_COMPTAGLINE_PER_SLICE: | ||
456 | new_data = g->gr.gobs_per_comptagline_per_slice; | ||
457 | break; | ||
458 | case TYPE_PARAM_SCATTERBUFFER: | ||
459 | new_data = cde_ctx->scatterbuffer_vaddr; | ||
460 | break; | ||
461 | case TYPE_PARAM_SCATTERBUFFER_SIZE: | ||
462 | new_data = cde_ctx->scatterbuffer_size; | ||
463 | break; | ||
464 | default: | ||
465 | user_id = param->id - NUM_RESERVED_PARAMS; | ||
466 | if (user_id < 0 || user_id >= MAX_CDE_USER_PARAMS) | ||
467 | continue; | ||
468 | new_data = cde_ctx->user_param_values[user_id]; | ||
469 | } | ||
470 | |||
471 | nvgpu_log(g, gpu_dbg_cde, "cde: patch: idx_in_file=%d param_id=%d target_buf=%u target_byte_offset=%lld data_value=0x%llx data_offset/data_diff=%lld data_type=%d data_shift=%d data_mask=0x%llx", | ||
472 | i, param->id, param->target_buf, | ||
473 | param->target_byte_offset, new_data, | ||
474 | param->data_offset, param->type, param->shift, | ||
475 | param->mask); | ||
476 | |||
477 | new_data += param->data_offset; | ||
478 | |||
479 | err = gk20a_replace_data(cde_ctx, target_mem_ptr, param->type, | ||
480 | param->shift, param->mask, new_data); | ||
481 | |||
482 | if (err) { | ||
483 | nvgpu_warn(g, "cde: patch failed. err=%d, idx=%d, id=%d, target_buf=%u, target_buf_offs=%lld, patch_value=%llu", | ||
484 | err, i, param->id, param->target_buf, | ||
485 | param->target_byte_offset, new_data); | ||
486 | return err; | ||
487 | } | ||
488 | } | ||
489 | |||
490 | return 0; | ||
491 | } | ||
492 | |||
493 | static int gk20a_init_cde_param(struct gk20a_cde_ctx *cde_ctx, | ||
494 | struct nvgpu_firmware *img, | ||
495 | struct gk20a_cde_hdr_param *param) | ||
496 | { | ||
497 | struct nvgpu_mem *target_mem; | ||
498 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
499 | struct gk20a *g = &l->g; | ||
500 | |||
501 | if (param->target_buf >= cde_ctx->num_bufs) { | ||
502 | nvgpu_warn(g, "cde: invalid buffer parameter. param idx = %d, target_buf=%u, num_bufs=%u", | ||
503 | cde_ctx->num_params, param->target_buf, | ||
504 | cde_ctx->num_bufs); | ||
505 | return -EINVAL; | ||
506 | } | ||
507 | |||
508 | target_mem = cde_ctx->mem + param->target_buf; | ||
509 | if (target_mem->size < (param->target_byte_offset + 3)) { | ||
510 | nvgpu_warn(g, "cde: invalid buffer parameter. param idx = %d, target_buf_offs=%lld, target_buf_size=%zu", | ||
511 | cde_ctx->num_params, param->target_byte_offset, | ||
512 | target_mem->size); | ||
513 | return -EINVAL; | ||
514 | } | ||
515 | |||
516 | /* does this parameter fit into our parameter structure */ | ||
517 | if (cde_ctx->num_params >= MAX_CDE_PARAMS) { | ||
518 | nvgpu_warn(g, "cde: no room for new parameters param idx = %d", | ||
519 | cde_ctx->num_params); | ||
520 | return -ENOMEM; | ||
521 | } | ||
522 | |||
523 | /* is the given id valid? */ | ||
524 | if (param->id >= NUM_RESERVED_PARAMS + MAX_CDE_USER_PARAMS) { | ||
525 | nvgpu_warn(g, "cde: parameter id is not valid. param idx = %d, id=%u, max=%u", | ||
526 | cde_ctx->num_params, param->id, | ||
527 | NUM_RESERVED_PARAMS + MAX_CDE_USER_PARAMS); | ||
528 | return -EINVAL; | ||
529 | } | ||
530 | |||
531 | cde_ctx->params[cde_ctx->num_params] = *param; | ||
532 | cde_ctx->num_params++; | ||
533 | |||
534 | return 0; | ||
535 | } | ||
536 | |||
537 | static int gk20a_init_cde_required_class(struct gk20a_cde_ctx *cde_ctx, | ||
538 | struct nvgpu_firmware *img, | ||
539 | u32 required_class) | ||
540 | { | ||
541 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
542 | struct gk20a *g = &l->g; | ||
543 | int err; | ||
544 | |||
545 | /* CDE enabled */ | ||
546 | cde_ctx->ch->cde = true; | ||
547 | |||
548 | err = gk20a_alloc_obj_ctx(cde_ctx->ch, required_class, 0); | ||
549 | if (err) { | ||
550 | nvgpu_warn(g, "cde: failed to allocate ctx. err=%d", | ||
551 | err); | ||
552 | return err; | ||
553 | } | ||
554 | |||
555 | return 0; | ||
556 | } | ||
557 | |||
558 | static int gk20a_init_cde_command(struct gk20a_cde_ctx *cde_ctx, | ||
559 | struct nvgpu_firmware *img, | ||
560 | u32 op, | ||
561 | struct gk20a_cde_cmd_elem *cmd_elem, | ||
562 | u32 num_elems) | ||
563 | { | ||
564 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
565 | struct gk20a *g = &l->g; | ||
566 | struct nvgpu_gpfifo_entry **gpfifo, *gpfifo_elem; | ||
567 | u32 *num_entries; | ||
568 | unsigned int i; | ||
569 | |||
570 | /* check command type */ | ||
571 | if (op == TYPE_BUF_COMMAND_INIT) { | ||
572 | gpfifo = &cde_ctx->init_convert_cmd; | ||
573 | num_entries = &cde_ctx->init_cmd_num_entries; | ||
574 | } else if (op == TYPE_BUF_COMMAND_CONVERT) { | ||
575 | gpfifo = &cde_ctx->convert_cmd; | ||
576 | num_entries = &cde_ctx->convert_cmd_num_entries; | ||
577 | } else { | ||
578 | nvgpu_warn(g, "cde: unknown command. op=%u", | ||
579 | op); | ||
580 | return -EINVAL; | ||
581 | } | ||
582 | |||
583 | /* allocate gpfifo entries to be pushed */ | ||
584 | *gpfifo = nvgpu_kzalloc(g, | ||
585 | sizeof(struct nvgpu_gpfifo_entry) * num_elems); | ||
586 | if (!*gpfifo) { | ||
587 | nvgpu_warn(g, "cde: could not allocate memory for gpfifo entries"); | ||
588 | return -ENOMEM; | ||
589 | } | ||
590 | |||
591 | gpfifo_elem = *gpfifo; | ||
592 | for (i = 0; i < num_elems; i++, cmd_elem++, gpfifo_elem++) { | ||
593 | struct nvgpu_mem *target_mem; | ||
594 | |||
595 | /* validate the current entry */ | ||
596 | if (cmd_elem->target_buf >= cde_ctx->num_bufs) { | ||
597 | nvgpu_warn(g, "cde: target buffer is not available (target=%u, num_bufs=%u)", | ||
598 | cmd_elem->target_buf, cde_ctx->num_bufs); | ||
599 | return -EINVAL; | ||
600 | } | ||
601 | |||
602 | target_mem = cde_ctx->mem + cmd_elem->target_buf; | ||
603 | if (target_mem->size < | ||
604 | cmd_elem->target_byte_offset + cmd_elem->num_bytes) { | ||
605 | nvgpu_warn(g, "cde: target buffer cannot hold all entries (target_size=%zu, target_byte_offset=%lld, num_bytes=%llu)", | ||
606 | target_mem->size, | ||
607 | cmd_elem->target_byte_offset, | ||
608 | cmd_elem->num_bytes); | ||
609 | return -EINVAL; | ||
610 | } | ||
611 | |||
612 | /* store the element into gpfifo */ | ||
613 | gpfifo_elem->entry0 = | ||
614 | u64_lo32(target_mem->gpu_va + | ||
615 | cmd_elem->target_byte_offset); | ||
616 | gpfifo_elem->entry1 = | ||
617 | u64_hi32(target_mem->gpu_va + | ||
618 | cmd_elem->target_byte_offset) | | ||
619 | pbdma_gp_entry1_length_f(cmd_elem->num_bytes / | ||
620 | sizeof(u32)); | ||
621 | } | ||
622 | |||
623 | *num_entries = num_elems; | ||
624 | return 0; | ||
625 | } | ||
626 | |||
627 | static int gk20a_cde_pack_cmdbufs(struct gk20a_cde_ctx *cde_ctx) | ||
628 | { | ||
629 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
630 | struct gk20a *g = &l->g; | ||
631 | unsigned long init_bytes = cde_ctx->init_cmd_num_entries * | ||
632 | sizeof(struct nvgpu_gpfifo_entry); | ||
633 | unsigned long conv_bytes = cde_ctx->convert_cmd_num_entries * | ||
634 | sizeof(struct nvgpu_gpfifo_entry); | ||
635 | unsigned long total_bytes = init_bytes + conv_bytes; | ||
636 | struct nvgpu_gpfifo_entry *combined_cmd; | ||
637 | |||
638 | /* allocate buffer that has space for both */ | ||
639 | combined_cmd = nvgpu_kzalloc(g, total_bytes); | ||
640 | if (!combined_cmd) { | ||
641 | nvgpu_warn(g, | ||
642 | "cde: could not allocate memory for gpfifo entries"); | ||
643 | return -ENOMEM; | ||
644 | } | ||
645 | |||
646 | /* move the original init here and append convert */ | ||
647 | memcpy(combined_cmd, cde_ctx->init_convert_cmd, init_bytes); | ||
648 | memcpy(combined_cmd + cde_ctx->init_cmd_num_entries, | ||
649 | cde_ctx->convert_cmd, conv_bytes); | ||
650 | |||
651 | nvgpu_kfree(g, cde_ctx->init_convert_cmd); | ||
652 | nvgpu_kfree(g, cde_ctx->convert_cmd); | ||
653 | |||
654 | cde_ctx->init_convert_cmd = combined_cmd; | ||
655 | cde_ctx->convert_cmd = combined_cmd | ||
656 | + cde_ctx->init_cmd_num_entries; | ||
657 | |||
658 | return 0; | ||
659 | } | ||
660 | |||
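/* Parse the firmware image: word 0 holds the firmware version, word 1 the
 * number of header elements, followed by the elements themselves (buffers,
 * replaces, parameters, required classes, commands and arrays). */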
661 | static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, | ||
662 | struct nvgpu_firmware *img) | ||
663 | { | ||
664 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
665 | struct gk20a *g = &l->g; | ||
666 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
667 | u32 *data = (u32 *)img->data; | ||
668 | u32 num_of_elems; | ||
669 | struct gk20a_cde_hdr_elem *elem; | ||
670 | u32 min_size = 0; | ||
671 | int err = 0; | ||
672 | unsigned int i; | ||
673 | |||
674 | min_size += 2 * sizeof(u32); | ||
675 | if (img->size < min_size) { | ||
676 | nvgpu_warn(g, "cde: invalid image header"); | ||
677 | return -EINVAL; | ||
678 | } | ||
679 | |||
680 | cde_app->firmware_version = data[0]; | ||
681 | num_of_elems = data[1]; | ||
682 | |||
683 | min_size += num_of_elems * sizeof(*elem); | ||
684 | if (img->size < min_size) { | ||
685 | nvgpu_warn(g, "cde: bad image"); | ||
686 | return -EINVAL; | ||
687 | } | ||
688 | |||
689 | elem = (struct gk20a_cde_hdr_elem *)&data[2]; | ||
690 | for (i = 0; i < num_of_elems; i++) { | ||
691 | int err = 0; | ||
692 | switch (elem->type) { | ||
693 | case TYPE_BUF: | ||
694 | err = gk20a_init_cde_buf(cde_ctx, img, &elem->buf); | ||
695 | break; | ||
696 | case TYPE_REPLACE: | ||
697 | err = gk20a_init_cde_replace(cde_ctx, img, | ||
698 | &elem->replace); | ||
699 | break; | ||
700 | case TYPE_PARAM: | ||
701 | err = gk20a_init_cde_param(cde_ctx, img, &elem->param); | ||
702 | break; | ||
703 | case TYPE_REQUIRED_CLASS: | ||
704 | err = gk20a_init_cde_required_class(cde_ctx, img, | ||
705 | elem->required_class); | ||
706 | break; | ||
707 | case TYPE_COMMAND: | ||
708 | { | ||
709 | struct gk20a_cde_cmd_elem *cmd = (void *) | ||
710 | &img->data[elem->command.data_byte_offset]; | ||
711 | err = gk20a_init_cde_command(cde_ctx, img, | ||
712 | elem->command.op, cmd, | ||
713 | elem->command.num_entries); | ||
714 | break; | ||
715 | } | ||
716 | case TYPE_ARRAY: | ||
717 | memcpy(&cde_app->arrays[elem->array.id][0], | ||
718 | elem->array.data, | ||
719 | MAX_CDE_ARRAY_ENTRIES*sizeof(u32)); | ||
720 | break; | ||
721 | default: | ||
722 | nvgpu_warn(g, "cde: unknown header element"); | ||
723 | err = -EINVAL; | ||
724 | } | ||
725 | |||
726 | if (err) | ||
727 | goto deinit_image; | ||
728 | |||
729 | elem++; | ||
730 | } | ||
731 | |||
732 | if (!cde_ctx->init_convert_cmd || !cde_ctx->init_cmd_num_entries) { | ||
733 | nvgpu_warn(g, "cde: init command not defined"); | ||
734 | err = -EINVAL; | ||
735 | goto deinit_image; | ||
736 | } | ||
737 | |||
738 | if (!cde_ctx->convert_cmd || !cde_ctx->convert_cmd_num_entries) { | ||
739 | nvgpu_warn(g, "cde: convert command not defined"); | ||
740 | err = -EINVAL; | ||
741 | goto deinit_image; | ||
742 | } | ||
743 | |||
744 | err = gk20a_cde_pack_cmdbufs(cde_ctx); | ||
745 | if (err) | ||
746 | goto deinit_image; | ||
747 | |||
748 | return 0; | ||
749 | |||
750 | deinit_image: | ||
751 | gk20a_deinit_cde_img(cde_ctx); | ||
752 | return err; | ||
753 | } | ||
754 | |||
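/* Submit one of the prebuilt gpfifo command buffers to the CDE channel:
 * the combined init+convert buffer, the convert buffer alone, or a zero
 * entry no-op kickoff used purely for fence management. */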
755 | static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx, | ||
756 | u32 op, struct nvgpu_channel_fence *fence, | ||
757 | u32 flags, struct gk20a_fence **fence_out) | ||
758 | { | ||
759 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
760 | struct gk20a *g = &l->g; | ||
761 | struct nvgpu_gpfifo_entry *gpfifo = NULL; | ||
762 | int num_entries = 0; | ||
763 | |||
764 | /* check command type */ | ||
765 | if (op == TYPE_BUF_COMMAND_INIT) { | ||
766 | /* both init and convert combined */ | ||
767 | gpfifo = cde_ctx->init_convert_cmd; | ||
768 | num_entries = cde_ctx->init_cmd_num_entries | ||
769 | + cde_ctx->convert_cmd_num_entries; | ||
770 | } else if (op == TYPE_BUF_COMMAND_CONVERT) { | ||
771 | gpfifo = cde_ctx->convert_cmd; | ||
772 | num_entries = cde_ctx->convert_cmd_num_entries; | ||
773 | } else if (op == TYPE_BUF_COMMAND_NOOP) { | ||
774 | /* Any non-null gpfifo will suffice with 0 num_entries */ | ||
775 | gpfifo = cde_ctx->init_convert_cmd; | ||
776 | num_entries = 0; | ||
777 | } else { | ||
778 | nvgpu_warn(g, "cde: unknown buffer"); | ||
779 | return -EINVAL; | ||
780 | } | ||
781 | |||
782 | if (gpfifo == NULL) { | ||
783 | nvgpu_warn(g, "cde: buffer not available"); | ||
784 | return -ENOSYS; | ||
785 | } | ||
786 | |||
787 | return nvgpu_submit_channel_gpfifo_kernel(cde_ctx->ch, gpfifo, | ||
788 | num_entries, flags, fence, fence_out); | ||
789 | } | ||
790 | |||
791 | static void gk20a_cde_ctx_release(struct gk20a_cde_ctx *cde_ctx) | ||
792 | __acquires(&cde_app->mutex) | ||
793 | __releases(&cde_app->mutex) | ||
794 | { | ||
795 | struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app; | ||
796 | struct gk20a *g = &cde_ctx->l->g; | ||
797 | |||
798 | nvgpu_log(g, gpu_dbg_cde_ctx, "releasing use on %p", cde_ctx); | ||
799 | trace_gk20a_cde_release(cde_ctx); | ||
800 | |||
801 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
802 | |||
803 | if (cde_ctx->in_use) { | ||
804 | cde_ctx->in_use = false; | ||
805 | nvgpu_list_move(&cde_ctx->list, &cde_app->free_contexts); | ||
806 | cde_app->ctx_usecount--; | ||
807 | } else { | ||
808 | nvgpu_log_info(g, "double release cde context %p", cde_ctx); | ||
809 | } | ||
810 | |||
811 | nvgpu_mutex_release(&cde_app->mutex); | ||
812 | } | ||
813 | |||
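/* Delayed work that frees an idle temporary context after its grace period
 * (CTX_DELETE_TIME), unless it has been taken back into use or the app is
 * being torn down. */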
814 | static void gk20a_cde_ctx_deleter_fn(struct work_struct *work) | ||
815 | __acquires(&cde_app->mutex) | ||
816 | __releases(&cde_app->mutex) | ||
817 | { | ||
818 | struct delayed_work *delay_work = to_delayed_work(work); | ||
819 | struct gk20a_cde_ctx *cde_ctx = container_of(delay_work, | ||
820 | struct gk20a_cde_ctx, ctx_deleter_work); | ||
821 | struct gk20a_cde_app *cde_app = &cde_ctx->l->cde_app; | ||
822 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
823 | struct gk20a *g = &l->g; | ||
824 | int err; | ||
825 | |||
826 | /* someone has just taken it? engine deletion started? */ | ||
827 | if (cde_ctx->in_use || !cde_app->initialised) | ||
828 | return; | ||
829 | |||
830 | nvgpu_log(g, gpu_dbg_fn | gpu_dbg_cde_ctx, | ||
831 | "cde: attempting to delete temporary %p", cde_ctx); | ||
832 | |||
833 | err = gk20a_busy(g); | ||
834 | if (err) { | ||
835 | /* this context would find new use anyway later, so not freeing | ||
836 | * here does not leak anything */ | ||
837 | nvgpu_warn(g, "cde: cannot set gk20a on, postponing" | ||
838 | " temp ctx deletion"); | ||
839 | return; | ||
840 | } | ||
841 | |||
842 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
843 | if (cde_ctx->in_use || !cde_app->initialised) { | ||
844 | nvgpu_log(g, gpu_dbg_cde_ctx, | ||
845 | "cde: context use raced, not deleting %p", | ||
846 | cde_ctx); | ||
847 | goto out; | ||
848 | } | ||
849 | |||
850 | WARN(delayed_work_pending(&cde_ctx->ctx_deleter_work), | ||
851 | "double pending %p", cde_ctx); | ||
852 | |||
853 | gk20a_cde_remove_ctx(cde_ctx); | ||
854 | nvgpu_log(g, gpu_dbg_fn | gpu_dbg_cde_ctx, | ||
855 | "cde: destroyed %p count=%d use=%d max=%d", | ||
856 | cde_ctx, cde_app->ctx_count, cde_app->ctx_usecount, | ||
857 | cde_app->ctx_count_top); | ||
858 | |||
859 | out: | ||
860 | nvgpu_mutex_release(&cde_app->mutex); | ||
861 | gk20a_idle(g); | ||
862 | } | ||
863 | |||
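/* Grab a free context from the free list, or allocate a temporary one when
 * none are idle; returns -EAGAIN once MAX_CTX_USE_COUNT contexts are in
 * flight so the caller can retry. */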
864 | static struct gk20a_cde_ctx *gk20a_cde_do_get_context(struct nvgpu_os_linux *l) | ||
865 | __must_hold(&cde_app->mutex) | ||
866 | { | ||
867 | struct gk20a *g = &l->g; | ||
868 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
869 | struct gk20a_cde_ctx *cde_ctx; | ||
870 | |||
871 | /* exhausted? */ | ||
872 | |||
873 | if (cde_app->ctx_usecount >= MAX_CTX_USE_COUNT) | ||
874 | return ERR_PTR(-EAGAIN); | ||
875 | |||
876 | /* idle context available? */ | ||
877 | |||
878 | if (!nvgpu_list_empty(&cde_app->free_contexts)) { | ||
879 | cde_ctx = nvgpu_list_first_entry(&cde_app->free_contexts, | ||
880 | gk20a_cde_ctx, list); | ||
881 | nvgpu_log(g, gpu_dbg_fn | gpu_dbg_cde_ctx, | ||
882 | "cde: got free %p count=%d use=%d max=%d", | ||
883 | cde_ctx, cde_app->ctx_count, | ||
884 | cde_app->ctx_usecount, | ||
885 | cde_app->ctx_count_top); | ||
886 | trace_gk20a_cde_get_context(cde_ctx); | ||
887 | |||
888 | /* deleter work may be scheduled, but in_use prevents it */ | ||
889 | cde_ctx->in_use = true; | ||
890 | nvgpu_list_move(&cde_ctx->list, &cde_app->used_contexts); | ||
891 | cde_app->ctx_usecount++; | ||
892 | |||
893 | /* cancel any deletions now that ctx is in use */ | ||
894 | gk20a_cde_cancel_deleter(cde_ctx, true); | ||
895 | return cde_ctx; | ||
896 | } | ||
897 | |||
898 | /* no free contexts, get a temporary one */ | ||
899 | |||
900 | nvgpu_log(g, gpu_dbg_fn | gpu_dbg_cde_ctx, | ||
901 | "cde: no free contexts, count=%d", | ||
902 | cde_app->ctx_count); | ||
903 | |||
904 | cde_ctx = gk20a_cde_allocate_context(l); | ||
905 | if (IS_ERR(cde_ctx)) { | ||
906 | nvgpu_warn(g, "cde: cannot allocate context: %ld", | ||
907 | PTR_ERR(cde_ctx)); | ||
908 | return cde_ctx; | ||
909 | } | ||
910 | |||
911 | trace_gk20a_cde_get_context(cde_ctx); | ||
912 | cde_ctx->in_use = true; | ||
913 | cde_ctx->is_temporary = true; | ||
914 | cde_app->ctx_usecount++; | ||
915 | cde_app->ctx_count++; | ||
916 | if (cde_app->ctx_count > cde_app->ctx_count_top) | ||
917 | cde_app->ctx_count_top = cde_app->ctx_count; | ||
918 | nvgpu_list_add(&cde_ctx->list, &cde_app->used_contexts); | ||
919 | |||
920 | return cde_ctx; | ||
921 | } | ||
922 | |||
923 | static struct gk20a_cde_ctx *gk20a_cde_get_context(struct nvgpu_os_linux *l) | ||
924 | __releases(&cde_app->mutex) | ||
925 | __acquires(&cde_app->mutex) | ||
926 | { | ||
927 | struct gk20a *g = &l->g; | ||
928 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
929 | struct gk20a_cde_ctx *cde_ctx = NULL; | ||
930 | struct nvgpu_timeout timeout; | ||
931 | |||
932 | nvgpu_timeout_init(g, &timeout, MAX_CTX_RETRY_TIME, | ||
933 | NVGPU_TIMER_CPU_TIMER); | ||
934 | |||
935 | do { | ||
936 | cde_ctx = gk20a_cde_do_get_context(l); | ||
937 | if (PTR_ERR(cde_ctx) != -EAGAIN) | ||
938 | break; | ||
939 | |||
940 | /* exhausted, retry */ | ||
941 | nvgpu_mutex_release(&cde_app->mutex); | ||
942 | cond_resched(); | ||
943 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
944 | } while (!nvgpu_timeout_expired(&timeout)); | ||
945 | |||
946 | return cde_ctx; | ||
947 | } | ||
948 | |||
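/* Allocate a new context structure, load its channel/firmware state via
 * gk20a_cde_load() and set up the delayed deleter work. */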
949 | static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct nvgpu_os_linux *l) | ||
950 | { | ||
951 | struct gk20a *g = &l->g; | ||
952 | struct gk20a_cde_ctx *cde_ctx; | ||
953 | int ret; | ||
954 | |||
955 | cde_ctx = nvgpu_kzalloc(g, sizeof(*cde_ctx)); | ||
956 | if (!cde_ctx) | ||
957 | return ERR_PTR(-ENOMEM); | ||
958 | |||
959 | cde_ctx->l = l; | ||
960 | cde_ctx->dev = dev_from_gk20a(g); | ||
961 | |||
962 | ret = gk20a_cde_load(cde_ctx); | ||
963 | if (ret) { | ||
964 | nvgpu_kfree(g, cde_ctx); | ||
965 | return ERR_PTR(ret); | ||
966 | } | ||
967 | |||
968 | nvgpu_init_list_node(&cde_ctx->list); | ||
969 | cde_ctx->is_temporary = false; | ||
970 | cde_ctx->in_use = false; | ||
971 | INIT_DELAYED_WORK(&cde_ctx->ctx_deleter_work, | ||
972 | gk20a_cde_ctx_deleter_fn); | ||
973 | |||
974 | nvgpu_log(g, gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: allocated %p", cde_ctx); | ||
975 | trace_gk20a_cde_allocate_context(cde_ctx); | ||
976 | return cde_ctx; | ||
977 | } | ||
978 | |||
979 | static u32 gk20a_cde_mapping_page_size(struct vm_gk20a *vm, | ||
980 | u32 map_offset, u32 map_size) | ||
981 | { | ||
982 | struct gk20a *g = gk20a_from_vm(vm); | ||
983 | |||
984 | /* | ||
985 | * To be simple we will just make the map size depend on the | ||
986 | * iommu'ability of the driver. If there's an IOMMU we can rely on | ||
987 | * buffers being contiguous. If not, then we'll use 4k pages since we | ||
988 | * know that will work for any buffer. | ||
989 | */ | ||
990 | if (!nvgpu_iommuable(g)) | ||
991 | return SZ_4K; | ||
992 | |||
993 | /* | ||
994 | * If map size or offset is not 64K aligned then use small pages. | ||
995 | */ | ||
996 | if (map_size & (vm->big_page_size - 1) || | ||
997 | map_offset & (vm->big_page_size - 1)) | ||
998 | return SZ_4K; | ||
999 | |||
1000 | return vm->big_page_size; | ||
1001 | } | ||
1002 | |||
1003 | int gk20a_cde_convert(struct nvgpu_os_linux *l, | ||
1004 | struct dma_buf *compbits_scatter_buf, | ||
1005 | u64 compbits_byte_offset, | ||
1006 | u64 scatterbuffer_byte_offset, | ||
1007 | struct nvgpu_channel_fence *fence, | ||
1008 | u32 __flags, struct gk20a_cde_param *params, | ||
1009 | int num_params, struct gk20a_fence **fence_out) | ||
1010 | __acquires(&l->cde_app->mutex) | ||
1011 | __releases(&l->cde_app->mutex) | ||
1012 | { | ||
1013 | struct gk20a *g = &l->g; | ||
1014 | struct gk20a_cde_ctx *cde_ctx = NULL; | ||
1015 | struct gk20a_comptags comptags; | ||
1016 | struct nvgpu_os_buffer os_buf = { | ||
1017 | compbits_scatter_buf, | ||
1018 | NULL, | ||
1019 | dev_from_gk20a(g) | ||
1020 | }; | ||
1021 | u64 mapped_compbits_offset = 0; | ||
1022 | u64 compbits_size = 0; | ||
1023 | u64 mapped_scatterbuffer_offset = 0; | ||
1024 | u64 scatterbuffer_size = 0; | ||
1025 | u64 map_vaddr = 0; | ||
1026 | u64 map_offset = 0; | ||
1027 | u64 map_size = 0; | ||
1028 | u8 *surface = NULL; | ||
1029 | u64 big_page_mask = 0; | ||
1030 | u32 flags; | ||
1031 | int err, i; | ||
1032 | const s16 compbits_kind = 0; | ||
1033 | u32 submit_op; | ||
1034 | struct dma_buf_attachment *attachment; | ||
1035 | |||
1036 | nvgpu_log(g, gpu_dbg_cde, "compbits_byte_offset=%llu scatterbuffer_byte_offset=%llu", | ||
1037 | compbits_byte_offset, scatterbuffer_byte_offset); | ||
1038 | |||
1039 | /* scatter buffer must be after compbits buffer */ | ||
1040 | if (scatterbuffer_byte_offset && | ||
1041 | scatterbuffer_byte_offset < compbits_byte_offset) | ||
1042 | return -EINVAL; | ||
1043 | |||
1044 | err = gk20a_busy(g); | ||
1045 | if (err) | ||
1046 | return err; | ||
1047 | |||
1048 | nvgpu_mutex_acquire(&l->cde_app.mutex); | ||
1049 | cde_ctx = gk20a_cde_get_context(l); | ||
1050 | nvgpu_mutex_release(&l->cde_app.mutex); | ||
1051 | if (IS_ERR(cde_ctx)) { | ||
1052 | err = PTR_ERR(cde_ctx); | ||
1053 | goto exit_idle; | ||
1054 | } | ||
1055 | |||
1056 | /* First, map the buffer to local va */ | ||
1057 | |||
1058 | /* ensure that the compbits buffer has drvdata */ | ||
1059 | err = gk20a_dmabuf_alloc_drvdata(compbits_scatter_buf, | ||
1060 | dev_from_gk20a(g)); | ||
1061 | if (err) | ||
1062 | goto exit_idle; | ||
1063 | |||
1064 | /* compbits don't start at page aligned offset, so we need to align | ||
1065 | the region to be mapped */ | ||
1066 | big_page_mask = cde_ctx->vm->big_page_size - 1; | ||
1067 | map_offset = compbits_byte_offset & ~big_page_mask; | ||
1068 | map_size = compbits_scatter_buf->size - map_offset; | ||
1069 | |||
1070 | |||
1071 | /* compute compbit start offset from the beginning of the mapped | ||
1072 | area */ | ||
1073 | mapped_compbits_offset = compbits_byte_offset - map_offset; | ||
1074 | if (scatterbuffer_byte_offset) { | ||
1075 | compbits_size = scatterbuffer_byte_offset - | ||
1076 | compbits_byte_offset; | ||
1077 | mapped_scatterbuffer_offset = scatterbuffer_byte_offset - | ||
1078 | map_offset; | ||
1079 | scatterbuffer_size = compbits_scatter_buf->size - | ||
1080 | scatterbuffer_byte_offset; | ||
1081 | } else { | ||
1082 | compbits_size = compbits_scatter_buf->size - | ||
1083 | compbits_byte_offset; | ||
1084 | } | ||
1085 | |||
1086 | nvgpu_log(g, gpu_dbg_cde, "map_offset=%llu map_size=%llu", | ||
1087 | map_offset, map_size); | ||
1088 | nvgpu_log(g, gpu_dbg_cde, "mapped_compbits_offset=%llu compbits_size=%llu", | ||
1089 | mapped_compbits_offset, compbits_size); | ||
1090 | nvgpu_log(g, gpu_dbg_cde, "mapped_scatterbuffer_offset=%llu scatterbuffer_size=%llu", | ||
1091 | mapped_scatterbuffer_offset, scatterbuffer_size); | ||
1092 | |||
1093 | |||
1094 | /* map the destination buffer */ | ||
1095 | get_dma_buf(compbits_scatter_buf); /* a ref for nvgpu_vm_map_linux */ | ||
1096 | err = nvgpu_vm_map_linux(cde_ctx->vm, compbits_scatter_buf, 0, | ||
1097 | NVGPU_VM_MAP_CACHEABLE | | ||
1098 | NVGPU_VM_MAP_DIRECT_KIND_CTRL, | ||
1099 | gk20a_cde_mapping_page_size(cde_ctx->vm, | ||
1100 | map_offset, | ||
1101 | map_size), | ||
1102 | NV_KIND_INVALID, | ||
1103 | compbits_kind, /* incompressible kind */ | ||
1104 | gk20a_mem_flag_none, | ||
1105 | map_offset, map_size, | ||
1106 | NULL, | ||
1107 | &map_vaddr); | ||
1108 | if (err) { | ||
1109 | nvgpu_warn(g, "cde: failed to map compbits scatter buf at %lld size %lld", | ||
1110 | map_offset, map_size); | ||
1111 | dma_buf_put(compbits_scatter_buf); | ||
1112 | err = -EINVAL; | ||
1113 | goto exit_idle; | ||
1114 | } | ||
1115 | |||
1116 | if (scatterbuffer_byte_offset && | ||
1117 | l->ops.cde.need_scatter_buffer && | ||
1118 | l->ops.cde.need_scatter_buffer(g)) { | ||
1119 | struct sg_table *sgt; | ||
1120 | void *scatter_buffer; | ||
1121 | |||
1122 | surface = dma_buf_vmap(compbits_scatter_buf); | ||
1123 | if (IS_ERR(surface)) { | ||
1124 | nvgpu_warn(g, | ||
1125 | "dma_buf_vmap failed"); | ||
1126 | err = -EINVAL; | ||
1127 | goto exit_unmap_vaddr; | ||
1128 | } | ||
1129 | |||
1130 | scatter_buffer = surface + scatterbuffer_byte_offset; | ||
1131 | |||
1132 | nvgpu_log(g, gpu_dbg_cde, "surface=0x%p scatterBuffer=0x%p", | ||
1133 | surface, scatter_buffer); | ||
1134 | sgt = gk20a_mm_pin(dev_from_gk20a(g), compbits_scatter_buf, | ||
1135 | &attachment); | ||
1136 | if (IS_ERR(sgt)) { | ||
1137 | nvgpu_warn(g, | ||
1138 | "mm_pin failed"); | ||
1139 | err = -EINVAL; | ||
1140 | goto exit_unmap_surface; | ||
1141 | } else { | ||
1142 | err = l->ops.cde.populate_scatter_buffer(g, sgt, | ||
1143 | compbits_byte_offset, scatter_buffer, | ||
1144 | scatterbuffer_size); | ||
1145 | WARN_ON(err); | ||
1146 | |||
1147 | gk20a_mm_unpin(dev_from_gk20a(g), compbits_scatter_buf, | ||
1148 | attachment, sgt); | ||
1149 | if (err) | ||
1150 | goto exit_unmap_surface; | ||
1151 | } | ||
1152 | |||
1153 | __cpuc_flush_dcache_area(scatter_buffer, scatterbuffer_size); | ||
1154 | dma_buf_vunmap(compbits_scatter_buf, surface); | ||
1155 | surface = NULL; | ||
1156 | } | ||
1157 | |||
1158 | /* store source buffer compression tags */ | ||
1159 | gk20a_get_comptags(&os_buf, &comptags); | ||
1160 | cde_ctx->surf_param_offset = comptags.offset; | ||
1161 | cde_ctx->surf_param_lines = comptags.lines; | ||
1162 | |||
1163 | /* store surface vaddr. This is actually compbit vaddr, but since | ||
1164 | compbits live in the same surface, and we can get the alloc base | ||
1165 | address by using gpuva_to_iova_base, this will do */ | ||
1166 | cde_ctx->surf_vaddr = map_vaddr; | ||
1167 | |||
1168 | /* store information about destination */ | ||
1169 | cde_ctx->compbit_vaddr = map_vaddr + mapped_compbits_offset; | ||
1170 | cde_ctx->compbit_size = compbits_size; | ||
1171 | |||
1172 | cde_ctx->scatterbuffer_vaddr = map_vaddr + mapped_scatterbuffer_offset; | ||
1173 | cde_ctx->scatterbuffer_size = scatterbuffer_size; | ||
1174 | |||
1175 | /* remove existing argument data */ | ||
1176 | memset(cde_ctx->user_param_values, 0, | ||
1177 | sizeof(cde_ctx->user_param_values)); | ||
1178 | |||
1179 | /* read user space arguments for the conversion */ | ||
1180 | for (i = 0; i < num_params; i++) { | ||
1181 | struct gk20a_cde_param *param = params + i; | ||
1182 | int id = param->id - NUM_RESERVED_PARAMS; | ||
1183 | |||
1184 | if (id < 0 || id >= MAX_CDE_USER_PARAMS) { | ||
1185 | nvgpu_warn(g, "cde: unknown user parameter"); | ||
1186 | err = -EINVAL; | ||
1187 | goto exit_unmap_surface; | ||
1188 | } | ||
1189 | cde_ctx->user_param_values[id] = param->value; | ||
1190 | } | ||
1191 | |||
1192 | /* patch data */ | ||
1193 | err = gk20a_cde_patch_params(cde_ctx); | ||
1194 | if (err) { | ||
1195 | nvgpu_warn(g, "cde: failed to patch parameters"); | ||
1196 | goto exit_unmap_surface; | ||
1197 | } | ||
1198 | |||
1199 | nvgpu_log(g, gpu_dbg_cde, "cde: buffer=cbc, size=%zu, gpuva=%llx\n", | ||
1200 | g->gr.compbit_store.mem.size, cde_ctx->backing_store_vaddr); | ||
1201 | nvgpu_log(g, gpu_dbg_cde, "cde: buffer=compbits, size=%llu, gpuva=%llx\n", | ||
1202 | cde_ctx->compbit_size, cde_ctx->compbit_vaddr); | ||
1203 | nvgpu_log(g, gpu_dbg_cde, "cde: buffer=scatterbuffer, size=%llu, gpuva=%llx\n", | ||
1204 | cde_ctx->scatterbuffer_size, cde_ctx->scatterbuffer_vaddr); | ||
1205 | |||
1206 | /* always take the postfence as it is needed for protecting the | ||
1207 | * cde context */ | ||
1208 | flags = __flags | NVGPU_SUBMIT_FLAGS_FENCE_GET; | ||
1209 | |||
1210 | /* gk20a_cde_execute_buffer() will grab a power reference of its own */ | ||
1211 | gk20a_idle(g); | ||
1212 | |||
1213 | if (comptags.lines == 0) { | ||
1214 | /* | ||
1215 | * Nothing to do on the buffer, but do a null kickoff for | ||
1216 | * managing the pre and post fences. | ||
1217 | */ | ||
1218 | submit_op = TYPE_BUF_COMMAND_NOOP; | ||
1219 | } else if (!cde_ctx->init_cmd_executed) { | ||
1220 | /* | ||
1221 | * First time, so include the init pushbuf too in addition to | ||
1222 | * the conversion code. | ||
1223 | */ | ||
1224 | submit_op = TYPE_BUF_COMMAND_INIT; | ||
1225 | } else { | ||
1226 | /* | ||
1227 | * The usual condition: execute just the conversion. | ||
1228 | */ | ||
1229 | submit_op = TYPE_BUF_COMMAND_CONVERT; | ||
1230 | } | ||
1231 | err = gk20a_cde_execute_buffer(cde_ctx, submit_op, | ||
1232 | fence, flags, fence_out); | ||
1233 | |||
1234 | if (comptags.lines != 0 && !err) | ||
1235 | cde_ctx->init_cmd_executed = true; | ||
1236 | |||
1237 | /* unmap the buffers - channel holds references to them now */ | ||
1238 | nvgpu_vm_unmap(cde_ctx->vm, map_vaddr, NULL); | ||
1239 | |||
1240 | return err; | ||
1241 | |||
1242 | exit_unmap_surface: | ||
1243 | if (surface) | ||
1244 | dma_buf_vunmap(compbits_scatter_buf, surface); | ||
1245 | exit_unmap_vaddr: | ||
1246 | nvgpu_vm_unmap(cde_ctx->vm, map_vaddr, NULL); | ||
1247 | exit_idle: | ||
1248 | gk20a_idle(g); | ||
1249 | return err; | ||
1250 | } | ||
1251 | |||
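/* Channel completion callback: once the job list is empty, handle timed-out
 * channels, schedule deletion of temporary contexts and move the context
 * back to the free list. */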
1252 | static void gk20a_cde_finished_ctx_cb(struct channel_gk20a *ch, void *data) | ||
1253 | __acquires(&cde_app->mutex) | ||
1254 | __releases(&cde_app->mutex) | ||
1255 | { | ||
1256 | struct gk20a_cde_ctx *cde_ctx = data; | ||
1257 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
1258 | struct gk20a *g = &l->g; | ||
1259 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
1260 | bool channel_idle; | ||
1261 | |||
1262 | channel_gk20a_joblist_lock(ch); | ||
1263 | channel_idle = channel_gk20a_joblist_is_empty(ch); | ||
1264 | channel_gk20a_joblist_unlock(ch); | ||
1265 | |||
1266 | if (!channel_idle) | ||
1267 | return; | ||
1268 | |||
1269 | trace_gk20a_cde_finished_ctx_cb(cde_ctx); | ||
1270 | nvgpu_log(g, gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: finished %p", cde_ctx); | ||
1271 | if (!cde_ctx->in_use) | ||
1272 | nvgpu_log_info(g, "double finish cde context %p on channel %p", | ||
1273 | cde_ctx, ch); | ||
1274 | |||
1275 | if (gk20a_channel_check_timedout(ch)) { | ||
1276 | if (cde_ctx->is_temporary) { | ||
1277 | nvgpu_warn(g, | ||
1278 | "cde: channel had timed out" | ||
1279 | " (temporary channel)"); | ||
1280 | /* going to be deleted anyway */ | ||
1281 | } else { | ||
1282 | nvgpu_warn(g, | ||
1283 | "cde: channel had timed out" | ||
1284 | ", reloading"); | ||
1285 | /* mark it to be deleted, replace with a new one */ | ||
1286 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
1287 | cde_ctx->is_temporary = true; | ||
1288 | if (gk20a_cde_create_context(l)) { | ||
1289 | nvgpu_err(g, "cde: can't replace context"); | ||
1290 | } | ||
1291 | nvgpu_mutex_release(&cde_app->mutex); | ||
1292 | } | ||
1293 | } | ||
1294 | |||
1295 | /* delete temporary contexts later (watch for doubles) */ | ||
1296 | if (cde_ctx->is_temporary && cde_ctx->in_use) { | ||
1297 | WARN_ON(delayed_work_pending(&cde_ctx->ctx_deleter_work)); | ||
1298 | schedule_delayed_work(&cde_ctx->ctx_deleter_work, | ||
1299 | msecs_to_jiffies(CTX_DELETE_TIME)); | ||
1300 | } | ||
1301 | |||
1302 | if (!gk20a_channel_check_timedout(ch)) { | ||
1303 | gk20a_cde_ctx_release(cde_ctx); | ||
1304 | } | ||
1305 | } | ||
1306 | |||
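/* Set up a CDE context: fetch the gpu2cde firmware, create a TSG and a
 * channel with the completion callback, bind them to the CDE VM, map the
 * compbit backing store and initialise the firmware image. */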
1307 | static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx) | ||
1308 | { | ||
1309 | struct nvgpu_os_linux *l = cde_ctx->l; | ||
1310 | struct gk20a *g = &l->g; | ||
1311 | struct nvgpu_firmware *img; | ||
1312 | struct channel_gk20a *ch; | ||
1313 | struct tsg_gk20a *tsg; | ||
1314 | struct gr_gk20a *gr = &g->gr; | ||
1315 | struct nvgpu_setup_bind_args setup_bind_args; | ||
1316 | int err = 0; | ||
1317 | u64 vaddr; | ||
1318 | |||
1319 | img = nvgpu_request_firmware(g, "gpu2cde.bin", 0); | ||
1320 | if (!img) { | ||
1321 | nvgpu_err(g, "cde: could not fetch the firmware"); | ||
1322 | return -ENOSYS; | ||
1323 | } | ||
1324 | |||
1325 | tsg = gk20a_tsg_open(g, nvgpu_current_pid(g)); | ||
1326 | if (!tsg) { | ||
1327 | nvgpu_err(g, "cde: could not create TSG"); | ||
1328 | err = -ENOMEM; | ||
1329 | goto err_get_gk20a_channel; | ||
1330 | } | ||
1331 | |||
1332 | ch = gk20a_open_new_channel_with_cb(g, gk20a_cde_finished_ctx_cb, | ||
1333 | cde_ctx, | ||
1334 | -1, | ||
1335 | false); | ||
1336 | if (!ch) { | ||
1337 | nvgpu_warn(g, "cde: gk20a channel not available"); | ||
1338 | err = -ENOMEM; | ||
1339 | goto err_get_gk20a_channel; | ||
1340 | } | ||
1341 | |||
1342 | ch->timeout.enabled = false; | ||
1343 | |||
1344 | /* bind the channel to the vm */ | ||
1345 | err = g->ops.mm.vm_bind_channel(g->mm.cde.vm, ch); | ||
1346 | if (err) { | ||
1347 | nvgpu_warn(g, "cde: could not bind vm"); | ||
1348 | goto err_commit_va; | ||
1349 | } | ||
1350 | |||
1351 | err = gk20a_tsg_bind_channel(tsg, ch); | ||
1352 | if (err) { | ||
1353 | nvgpu_err(g, "cde: unable to bind to tsg"); | ||
1354 | goto err_setup_bind; | ||
1355 | } | ||
1356 | |||
1357 | setup_bind_args.num_gpfifo_entries = 1024; | ||
1358 | setup_bind_args.num_inflight_jobs = 0; | ||
1359 | setup_bind_args.flags = 0; | ||
1360 | err = nvgpu_channel_setup_bind(ch, &setup_bind_args); | ||
1361 | if (err) { | ||
1362 | nvgpu_warn(g, "cde: unable to setup channel"); | ||
1363 | goto err_setup_bind; | ||
1364 | } | ||
1365 | |||
1366 | /* map backing store to gpu virtual space */ | ||
1367 | vaddr = nvgpu_gmmu_map(ch->vm, &gr->compbit_store.mem, | ||
1368 | g->gr.compbit_store.mem.size, | ||
1369 | NVGPU_VM_MAP_CACHEABLE, | ||
1370 | gk20a_mem_flag_read_only, | ||
1371 | false, | ||
1372 | gr->compbit_store.mem.aperture); | ||
1373 | |||
1374 | if (!vaddr) { | ||
1375 | nvgpu_warn(g, "cde: cannot map compression bit backing store"); | ||
1376 | err = -ENOMEM; | ||
1377 | goto err_map_backingstore; | ||
1378 | } | ||
1379 | |||
1380 | /* store initialisation data */ | ||
1381 | cde_ctx->ch = ch; | ||
1382 | cde_ctx->tsg = tsg; | ||
1383 | cde_ctx->vm = ch->vm; | ||
1384 | cde_ctx->backing_store_vaddr = vaddr; | ||
1385 | |||
1386 | /* initialise the firmware */ | ||
1387 | err = gk20a_init_cde_img(cde_ctx, img); | ||
1388 | if (err) { | ||
1389 | nvgpu_warn(g, "cde: image initialisation failed"); | ||
1390 | goto err_init_cde_img; | ||
1391 | } | ||
1392 | |||
1393 | /* initialisation done */ | ||
1394 | nvgpu_release_firmware(g, img); | ||
1395 | |||
1396 | return 0; | ||
1397 | |||
1398 | err_init_cde_img: | ||
1399 | nvgpu_gmmu_unmap(ch->vm, &g->gr.compbit_store.mem, vaddr); | ||
1400 | err_map_backingstore: | ||
1401 | err_setup_bind: | ||
1402 | nvgpu_vm_put(ch->vm); | ||
1403 | err_commit_va: | ||
1404 | err_get_gk20a_channel: | ||
1405 | nvgpu_release_firmware(g, img); | ||
1406 | nvgpu_err(g, "cde: couldn't initialise buffer converter: %d", err); | ||
1407 | return err; | ||
1408 | } | ||
1409 | |||
1410 | int gk20a_cde_reload(struct nvgpu_os_linux *l) | ||
1411 | __acquires(&l->cde_app->mutex) | ||
1412 | __releases(&l->cde_app->mutex) | ||
1413 | { | ||
1414 | struct gk20a *g = &l->g; | ||
1415 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
1416 | int err; | ||
1417 | |||
1418 | if (!cde_app->initialised) | ||
1419 | return -ENOSYS; | ||
1420 | |||
1421 | err = gk20a_busy(g); | ||
1422 | if (err) | ||
1423 | return err; | ||
1424 | |||
1425 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
1426 | |||
1427 | gk20a_cde_stop(l); | ||
1428 | |||
1429 | err = gk20a_cde_create_contexts(l); | ||
1430 | if (!err) | ||
1431 | cde_app->initialised = true; | ||
1432 | |||
1433 | nvgpu_mutex_release(&cde_app->mutex); | ||
1434 | |||
1435 | gk20a_idle(g); | ||
1436 | return err; | ||
1437 | } | ||
1438 | |||
1439 | int gk20a_init_cde_support(struct nvgpu_os_linux *l) | ||
1440 | __acquires(&cde_app->mutex) | ||
1441 | __releases(&cde_app->mutex) | ||
1442 | { | ||
1443 | struct gk20a_cde_app *cde_app = &l->cde_app; | ||
1444 | struct gk20a *g = &l->g; | ||
1445 | int err; | ||
1446 | |||
1447 | if (cde_app->initialised) | ||
1448 | return 0; | ||
1449 | |||
1450 | nvgpu_log(g, gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: init"); | ||
1451 | |||
1452 | err = nvgpu_mutex_init(&cde_app->mutex); | ||
1453 | if (err) | ||
1454 | return err; | ||
1455 | |||
1456 | nvgpu_mutex_acquire(&cde_app->mutex); | ||
1457 | |||
1458 | nvgpu_init_list_node(&cde_app->free_contexts); | ||
1459 | nvgpu_init_list_node(&cde_app->used_contexts); | ||
1460 | cde_app->ctx_count = 0; | ||
1461 | cde_app->ctx_count_top = 0; | ||
1462 | cde_app->ctx_usecount = 0; | ||
1463 | |||
1464 | err = gk20a_cde_create_contexts(l); | ||
1465 | if (!err) | ||
1466 | cde_app->initialised = true; | ||
1467 | |||
1468 | nvgpu_mutex_release(&cde_app->mutex); | ||
1469 | nvgpu_log(g, gpu_dbg_cde_ctx, "cde: init finished: %d", err); | ||
1470 | |||
1471 | if (err) | ||
1472 | nvgpu_mutex_destroy(&cde_app->mutex); | ||
1473 | |||
1474 | return err; | ||
1475 | } | ||
1476 | |||
1477 | enum cde_launch_patch_id { | ||
1478 | PATCH_H_QMD_CTA_RASTER_WIDTH_ID = 1024, | ||
1479 | PATCH_H_QMD_CTA_RASTER_HEIGHT_ID = 1025, | ||
1480 | PATCH_QMD_CTA_RASTER_DEPTH_ID = 1026, /* for firmware v0 only */ | ||
1481 | PATCH_QMD_CTA_THREAD_DIMENSION0_ID = 1027, | ||
1482 | PATCH_QMD_CTA_THREAD_DIMENSION1_ID = 1028, | ||
1483 | PATCH_QMD_CTA_THREAD_DIMENSION2_ID = 1029, /* for firmware v0 only */ | ||
1484 | PATCH_USER_CONST_XTILES_ID = 1030, /* for firmware v0 only */ | ||
1485 | PATCH_USER_CONST_YTILES_ID = 1031, /* for firmware v0 only */ | ||
1486 | PATCH_USER_CONST_BLOCKHEIGHTLOG2_ID = 1032, | ||
1487 | PATCH_USER_CONST_DSTPITCH_ID = 1033, /* for firmware v0 only */ | ||
1488 | PATCH_H_USER_CONST_FLAGS_ID = 1034, /* for firmware v0 only */ | ||
1489 | PATCH_H_VPC_CURRENT_GRID_SIZE_X_ID = 1035, | ||
1490 | PATCH_H_VPC_CURRENT_GRID_SIZE_Y_ID = 1036, | ||
1491 | PATCH_H_VPC_CURRENT_GRID_SIZE_Z_ID = 1037, | ||
1492 | PATCH_VPC_CURRENT_GROUP_SIZE_X_ID = 1038, | ||
1493 | PATCH_VPC_CURRENT_GROUP_SIZE_Y_ID = 1039, | ||
1494 | PATCH_VPC_CURRENT_GROUP_SIZE_Z_ID = 1040, | ||
1495 | PATCH_USER_CONST_XBLOCKS_ID = 1041, | ||
1496 | PATCH_H_USER_CONST_DSTOFFSET_ID = 1042, | ||
1497 | PATCH_V_QMD_CTA_RASTER_WIDTH_ID = 1043, | ||
1498 | PATCH_V_QMD_CTA_RASTER_HEIGHT_ID = 1044, | ||
1499 | PATCH_V_USER_CONST_DSTOFFSET_ID = 1045, | ||
1500 | PATCH_V_VPC_CURRENT_GRID_SIZE_X_ID = 1046, | ||
1501 | PATCH_V_VPC_CURRENT_GRID_SIZE_Y_ID = 1047, | ||
1502 | PATCH_V_VPC_CURRENT_GRID_SIZE_Z_ID = 1048, | ||
1503 | PATCH_H_LAUNCH_WORD1_ID = 1049, | ||
1504 | PATCH_H_LAUNCH_WORD2_ID = 1050, | ||
1505 | PATCH_V_LAUNCH_WORD1_ID = 1051, | ||
1506 | PATCH_V_LAUNCH_WORD2_ID = 1052, | ||
1507 | PATCH_H_QMD_PROGRAM_OFFSET_ID = 1053, | ||
1508 | PATCH_H_QMD_REGISTER_COUNT_ID = 1054, | ||
1509 | PATCH_V_QMD_PROGRAM_OFFSET_ID = 1055, | ||
1510 | PATCH_V_QMD_REGISTER_COUNT_ID = 1056, | ||
1511 | }; | ||
1512 | |||
1513 | /* maximum number of WRITE_PATCHes in the below function */ | ||
1514 | #define MAX_CDE_LAUNCH_PATCHES 32 | ||
1515 | |||
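/* Firmware v1 conversion launcher: derive tile counts, grid/group sizes and
 * program numbers for the horizontal and vertical passes and encode them as
 * WRITE_PATCH parameters for the conversion submit. */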
static int gk20a_buffer_convert_gpu_to_cde_v1(
		struct nvgpu_os_linux *l,
		struct dma_buf *dmabuf, u32 consumer,
		u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
		u64 scatterbuffer_offset,
		u32 width, u32 height, u32 block_height_log2,
		u32 submit_flags, struct nvgpu_channel_fence *fence_in,
		struct gk20a_buffer_state *state)
{
	struct gk20a *g = &l->g;
	struct gk20a_cde_param params[MAX_CDE_LAUNCH_PATCHES];
	int param = 0;
	int err = 0;
	struct gk20a_fence *new_fence = NULL;
	const int wgx = 8;
	const int wgy = 8;
	const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */
	const int xalign = compbits_per_byte * wgx;
	const int yalign = wgy;

	/* Compute per launch parameters */
	const int xtiles = (width + 7) >> 3;
	const int ytiles = (height + 7) >> 3;
	const int gridw_h = roundup(xtiles, xalign) / xalign;
	const int gridh_h = roundup(ytiles, yalign) / yalign;
	const int gridw_v = roundup(ytiles, xalign) / xalign;
	const int gridh_v = roundup(xtiles, yalign) / yalign;
	const int xblocks = (xtiles + 1) >> 1;
	const int voffset = compbits_voffset - compbits_hoffset;
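	/*
	 * Worked example (illustrative only): for a 1920x1080 surface,
	 * xtiles = 240 and ytiles = 135, so with xalign = 32 and yalign = 8
	 * the horizontal pass uses a roundup(240, 32)/32 x roundup(135, 8)/8
	 * = 8x17 grid, the vertical pass a roundup(135, 32)/32 x
	 * roundup(240, 8)/8 = 5x30 grid, and xblocks = (240 + 1) >> 1 = 120.
	 */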

	int hprog = -1;
	int vprog = -1;

	if (l->ops.cde.get_program_numbers) {
		l->ops.cde.get_program_numbers(g, block_height_log2,
				l->cde_app.shader_parameter,
				&hprog, &vprog);
	} else {
		nvgpu_warn(g, "cde: chip not supported");
		return -ENOSYS;
	}

	if (hprog < 0 || vprog < 0) {
		nvgpu_warn(g, "cde: could not determine programs");
		return -ENOSYS;
	}

	if (xtiles > 8192 / 8 || ytiles > 8192 / 8)
		nvgpu_warn(g, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)",
			   xtiles, ytiles);

	nvgpu_log(g, gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_hoffset=0x%llx, compbits_voffset=0x%llx, scatterbuffer_offset=0x%llx",
		  width, height, block_height_log2,
		  compbits_hoffset, compbits_voffset, scatterbuffer_offset);
	nvgpu_log(g, gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d)",
		  width, height, xtiles, ytiles);
	nvgpu_log(g, gpu_dbg_cde, "group (%d, %d) gridH (%d, %d) gridV (%d, %d)",
		  wgx, wgy, gridw_h, gridh_h, gridw_v, gridh_v);
	nvgpu_log(g, gpu_dbg_cde, "hprog=%d, offset=0x%x, regs=%d, vprog=%d, offset=0x%x, regs=%d",
		  hprog,
		  l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog],
		  l->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog],
		  vprog,
		  l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog],
		  l->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);

	/* Write parameters */
#define WRITE_PATCH(NAME, VALUE) \
	params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE}
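	/* Append one patch entry to params[]; the number of WRITE_PATCH uses
	 * below must stay within MAX_CDE_LAUNCH_PATCHES. */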
	WRITE_PATCH(PATCH_USER_CONST_XBLOCKS, xblocks);
	WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2,
		block_height_log2);
	WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx);
	WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy);
	WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx);
	WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy);
	WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1);

	WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_WIDTH, gridw_h);
	WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_HEIGHT, gridh_h);
	WRITE_PATCH(PATCH_H_USER_CONST_DSTOFFSET, 0);
	WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_X, gridw_h);
	WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Y, gridh_h);
	WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Z, 1);

	WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_WIDTH, gridw_v);
	WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_HEIGHT, gridh_v);
	WRITE_PATCH(PATCH_V_USER_CONST_DSTOFFSET, voffset);
	WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_X, gridw_v);
	WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Y, gridh_v);
	WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Z, 1);

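	/* Point each pass at its selected shader program and register count. */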
	WRITE_PATCH(PATCH_H_QMD_PROGRAM_OFFSET,
		l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog]);
	WRITE_PATCH(PATCH_H_QMD_REGISTER_COUNT,
		l->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog]);
	WRITE_PATCH(PATCH_V_QMD_PROGRAM_OFFSET,
		l->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog]);
	WRITE_PATCH(PATCH_V_QMD_REGISTER_COUNT,
		l->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);

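	/*
	 * Select the launch words for each pass: when the consumer actually
	 * requests the pass, the first word pair from ARRAY_LAUNCH_COMMAND
	 * is patched in; otherwise the second pair is used, which
	 * presumably leaves that pass disabled.
	 */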
	if (consumer & NVGPU_GPU_COMPBITS_CDEH) {
		WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
		WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
	} else {
		WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
		WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
	}

	if (consumer & NVGPU_GPU_COMPBITS_CDEV) {
		WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
		WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
	} else {
		WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
		WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
			l->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
	}
#undef WRITE_PATCH

	err = gk20a_cde_convert(l, dmabuf,
				compbits_hoffset,
				scatterbuffer_offset,
				fence_in, submit_flags,
				params, param, &new_fence);
	if (err)
		goto out;

	/* compbits generated, update state & fence */
	gk20a_fence_put(state->fence);
	state->fence = new_fence;
	state->valid_compbits |= consumer &
		(NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV);
out:
	return err;
}

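/*
 * Dispatch a GPU-to-CDE compbit conversion according to the loaded CDE
 * firmware version; only version 1 is currently handled.
 */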
static int gk20a_buffer_convert_gpu_to_cde(
		struct nvgpu_os_linux *l, struct dma_buf *dmabuf, u32 consumer,
		u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
		u64 scatterbuffer_offset,
		u32 width, u32 height, u32 block_height_log2,
		u32 submit_flags, struct nvgpu_channel_fence *fence_in,
		struct gk20a_buffer_state *state)
{
	struct gk20a *g = &l->g;
	int err = 0;

	if (!l->cde_app.initialised)
		return -ENOSYS;

	nvgpu_log(g, gpu_dbg_cde, "firmware version = %d\n",
		  l->cde_app.firmware_version);

	if (l->cde_app.firmware_version == 1) {
		err = gk20a_buffer_convert_gpu_to_cde_v1(
			l, dmabuf, consumer, offset, compbits_hoffset,
			compbits_voffset, scatterbuffer_offset,
			width, height, block_height_log2,
			submit_flags, fence_in, state);
	} else {
		nvgpu_err(g, "unsupported CDE firmware version %d",
			  l->cde_app.firmware_version);
		err = -EINVAL;
	}

	return err;
}

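/*
 * Ensure the compbit types named in @request are available for the buffer
 * before a compressible read. Missing CDEH/CDEV compbits are generated
 * from valid GPU compbits by a CDE conversion; the caller gets back the
 * buffer's current compbit mask, ZBC color and, if a job is pending, a
 * fence to wait on.
 */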
int gk20a_prepare_compressible_read(
		struct nvgpu_os_linux *l, u32 buffer_fd, u32 request, u64 offset,
		u64 compbits_hoffset, u64 compbits_voffset,
		u64 scatterbuffer_offset,
		u32 width, u32 height, u32 block_height_log2,
		u32 submit_flags, struct nvgpu_channel_fence *fence,
		u32 *valid_compbits, u32 *zbc_color,
		struct gk20a_fence **fence_out)
{
	struct gk20a *g = &l->g;
	int err = 0;
	struct gk20a_buffer_state *state;
	struct dma_buf *dmabuf;
	u32 missing_bits;

	dmabuf = dma_buf_get(buffer_fd);
	if (IS_ERR(dmabuf))
		return -EINVAL;

	err = gk20a_dmabuf_get_state(dmabuf, g, offset, &state);
	if (err) {
		dma_buf_put(dmabuf);
		return err;
	}

	nvgpu_mutex_acquire(&state->lock);

	missing_bits = (state->valid_compbits ^ request) & request;

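	/*
	 * missing_bits is computed under the state lock so it stays
	 * consistent with valid_compbits. Three outcomes follow: a request
	 * for no compbits would mean decompression, which is not
	 * implemented; missing CDEH/CDEV bits are generated from valid GPU
	 * compbits by a CDE conversion job; otherwise the existing state is
	 * returned as-is.
	 */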
	if (state->valid_compbits && request == NVGPU_GPU_COMPBITS_NONE) {
		gk20a_fence_put(state->fence);
		state->fence = NULL;
		/*
		 * Decompression back to NONE is not implemented; this is
		 * where a decompress job would be submitted and
		 * valid_compbits cleared.
		 */
		err = -EINVAL;
		goto out;
	} else if (missing_bits) {
		u32 missing_cde_bits = missing_bits &
			 (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV);
		if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) &&
			missing_cde_bits) {
			err = gk20a_buffer_convert_gpu_to_cde(
					l, dmabuf,
					missing_cde_bits,
					offset, compbits_hoffset,
					compbits_voffset, scatterbuffer_offset,
					width, height, block_height_log2,
					submit_flags, fence,
					state);
			if (err)
				goto out;
		}
	}

	if (state->fence && fence_out)
		*fence_out = gk20a_fence_get(state->fence);

	if (valid_compbits)
		*valid_compbits = state->valid_compbits;

	if (zbc_color)
		*zbc_color = state->zbc_color;

out:
	nvgpu_mutex_release(&state->lock);
	dma_buf_put(dmabuf);
	return err;
}

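/*
 * Record the compbit state and ZBC color of a buffer after it has been
 * written, and drop any fence from a previous compbit conversion job.
 */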
int gk20a_mark_compressible_write(struct gk20a *g, u32 buffer_fd,
				  u32 valid_compbits, u64 offset, u32 zbc_color)
{
	int err;
	struct gk20a_buffer_state *state;
	struct dma_buf *dmabuf;

	dmabuf = dma_buf_get(buffer_fd);
	if (IS_ERR(dmabuf)) {
		nvgpu_err(g, "invalid dmabuf");
		return -EINVAL;
	}

	err = gk20a_dmabuf_get_state(dmabuf, g, offset, &state);
	if (err) {
		nvgpu_err(g, "could not get state from dmabuf");
		dma_buf_put(dmabuf);
		return err;
	}

	nvgpu_mutex_acquire(&state->lock);

	/* Update the compbits state. */
	state->valid_compbits = valid_compbits;
	state->zbc_color = zbc_color;

	/* Discard previous compbit job fence. */
	gk20a_fence_put(state->fence);
	state->fence = NULL;

	nvgpu_mutex_release(&state->lock);
	dma_buf_put(dmabuf);
	return 0;
}