diff options
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/ce2_gk20a.c | 228 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/ce2_gk20a.h | 7 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/platform_gk20a.h | 3 |
3 files changed, 164 insertions, 74 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c index fdc1ac61..5314a1be 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include "gk20a.h" | 28 | #include "gk20a.h" |
29 | 29 | ||
30 | #include <nvgpu/log.h> | 30 | #include <nvgpu/log.h> |
31 | #include <nvgpu/enabled.h> | ||
31 | 32 | ||
32 | #include <nvgpu/hw/gk20a/hw_ce2_gk20a.h> | 33 | #include <nvgpu/hw/gk20a/hw_ce2_gk20a.h> |
33 | #include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h> | 34 | #include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h> |
@@ -38,6 +39,14 @@ | |||
38 | #include <nvgpu/hw/gk20a/hw_gr_gk20a.h> | 39 | #include <nvgpu/hw/gk20a/hw_gr_gk20a.h> |
39 | #include <nvgpu/barrier.h> | 40 | #include <nvgpu/barrier.h> |
40 | 41 | ||
42 | /* | ||
43 | * Copy engine defines line size in pixels | ||
44 | */ | ||
45 | #define MAX_CE_SHIFT 31 /* 4Gpixels -1 */ | ||
46 | #define MAX_CE_MASK ((u32) (~(~0 << MAX_CE_SHIFT))) | ||
47 | #define MAX_CE_ALIGN(a) (a & MAX_CE_MASK) | ||
48 | |||
49 | |||
41 | static u32 ce2_nonblockpipe_isr(struct gk20a *g, u32 fifo_intr) | 50 | static u32 ce2_nonblockpipe_isr(struct gk20a *g, u32 fifo_intr) |
42 | { | 51 | { |
43 | gk20a_dbg(gpu_dbg_intr, "ce2 non-blocking pipe interrupt\n"); | 52 | gk20a_dbg(gpu_dbg_intr, "ce2 non-blocking pipe interrupt\n"); |
@@ -192,6 +201,10 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx) | |||
192 | nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem); | 201 | nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem); |
193 | } | 202 | } |
194 | 203 | ||
204 | /* unbind tsg */ | ||
205 | if (ce_ctx->tsg && ce_ctx->ch) | ||
206 | gk20a_tsg_unbind_channel(ce_ctx->ch); | ||
207 | |||
195 | /* free the channel */ | 208 | /* free the channel */ |
196 | if (ce_ctx->ch) | 209 | if (ce_ctx->ch) |
197 | gk20a_channel_close(ce_ctx->ch); | 210 | gk20a_channel_close(ce_ctx->ch); |
@@ -206,15 +219,32 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx) | |||
206 | nvgpu_kfree(ce_ctx->g, ce_ctx); | 219 | nvgpu_kfree(ce_ctx->g, ce_ctx); |
207 | } | 220 | } |
208 | 221 | ||
209 | static inline unsigned int gk20a_ce_get_method_size(int request_operation) | 222 | static inline unsigned int gk20a_ce_get_method_size(int request_operation, |
223 | u64 size) | ||
210 | { | 224 | { |
211 | /* failure size */ | 225 | /* failure size */ |
212 | unsigned int methodsize = UINT_MAX; | 226 | unsigned int methodsize = UINT_MAX; |
227 | unsigned int iterations = 0; | ||
228 | u32 shift; | ||
229 | u64 chunk = size; | ||
230 | u32 height, width; | ||
231 | |||
232 | while (chunk) { | ||
233 | iterations++; | ||
234 | |||
235 | shift = MAX_CE_ALIGN(chunk) ? __ffs(MAX_CE_ALIGN(chunk)) : | ||
236 | MAX_CE_SHIFT; | ||
237 | width = chunk >> shift; | ||
238 | height = 1 << shift; | ||
239 | width = MAX_CE_ALIGN(width); | ||
240 | |||
241 | chunk -= (u64) height * width; | ||
242 | } | ||
213 | 243 | ||
214 | if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) | 244 | if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) |
215 | methodsize = 10 * 2 * sizeof(u32); | 245 | methodsize = (2 + (16 * iterations)) * sizeof(u32); |
216 | else if (request_operation & NVGPU_CE_MEMSET) | 246 | else if (request_operation & NVGPU_CE_MEMSET) |
217 | methodsize = 9 * 2 * sizeof(u32); | 247 | methodsize = (2 + (15 * iterations)) * sizeof(u32); |
218 | 248 | ||
219 | return methodsize; | 249 | return methodsize; |
220 | } | 250 | } |
@@ -243,10 +273,13 @@ static int gk20a_ce_prepare_submit(u64 src_buf, | |||
243 | { | 273 | { |
244 | u32 launch = 0; | 274 | u32 launch = 0; |
245 | u32 methodSize = 0; | 275 | u32 methodSize = 0; |
276 | u64 offset = 0; | ||
277 | u64 chunk_size = 0; | ||
278 | u64 chunk = size; | ||
246 | 279 | ||
247 | /* failure case handling */ | 280 | /* failure case handling */ |
248 | if ((gk20a_ce_get_method_size(request_operation) > max_cmd_buf_size) || | 281 | if ((gk20a_ce_get_method_size(request_operation, size) > |
249 | (!size) || | 282 | max_cmd_buf_size) || (!size) || |
250 | (request_operation > NVGPU_CE_MEMSET)) | 283 | (request_operation > NVGPU_CE_MEMSET)) |
251 | return 0; | 284 | return 0; |
252 | 285 | ||
@@ -254,83 +287,116 @@ static int gk20a_ce_prepare_submit(u64 src_buf, | |||
254 | cmd_buf_cpu_va[methodSize++] = 0x20018000; | 287 | cmd_buf_cpu_va[methodSize++] = 0x20018000; |
255 | cmd_buf_cpu_va[methodSize++] = dma_copy_class; | 288 | cmd_buf_cpu_va[methodSize++] = dma_copy_class; |
256 | 289 | ||
257 | if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) { | 290 | /* |
258 | /* setup the source */ | 291 | * The purpose is to clear the memory in 2D rectangles. We get the ffs to |
259 | cmd_buf_cpu_va[methodSize++] = 0x20018101; | 292 | * determine the number of lines to copy. The only constraint is that the |
260 | cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf) & | 293 | * maximum number of pixels per line is 4Gpix - 1, which is awkward for |
261 | NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); | 294 | * calculation, so we settle for 2Gpix per line to make the calculation |
262 | 295 | * more agreeable. |
263 | cmd_buf_cpu_va[methodSize++] = 0x20018100; | 296 | */ |
264 | cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf) & | 297 | |
265 | NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); | 298 | /* The copy engine in 2D mode can have (2^32 - 1) x (2^32 - 1) pixels in |
299 | * a single submit; we are going to try to clear a range of up to 2Gpix | |
300 | * per line over multiple lines. Because we want byte-aligned copies we | |
301 | * will be setting 1-byte pixels */ | |
302 | |||
303 | /* | ||
304 | * per iteration | ||
305 | * <------------------------- 40 bits ------------------------------> | ||
306 | * 1 <------ ffs -------> | ||
307 | * <-----------up to 30 bits-----------> | ||
308 | */ | ||
309 | while (chunk) { | ||
310 | u32 width, height, shift; | ||
311 | |||
312 | /* | ||
313 | * We will be aligning to bytes, making the maximum number of | ||
314 | * pix per line 2Gb | ||
315 | */ | ||
316 | |||
317 | shift = MAX_CE_ALIGN(chunk) ? __ffs(MAX_CE_ALIGN(chunk)) : | ||
318 | MAX_CE_SHIFT; | ||
319 | height = chunk >> shift; | ||
320 | width = 1 << shift; | ||
321 | height = MAX_CE_ALIGN(height); | ||
322 | |||
323 | chunk_size = (u64) height * width; | ||
324 | |||
325 | /* reset launch flag */ | ||
326 | launch = 0; | ||
327 | |||
328 | if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) { | ||
329 | /* setup the source */ | ||
330 | cmd_buf_cpu_va[methodSize++] = 0x20028100; | ||
331 | cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf + | ||
332 | offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); | ||
333 | cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf + | ||
334 | offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); | ||
335 | |||
336 | cmd_buf_cpu_va[methodSize++] = 0x20018098; | ||
337 | if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) | ||
338 | cmd_buf_cpu_va[methodSize++] = 0x00000000; | ||
339 | else if (launch_flags & | ||
340 | NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) | ||
341 | cmd_buf_cpu_va[methodSize++] = 0x00000002; | ||
342 | else | ||
343 | cmd_buf_cpu_va[methodSize++] = 0x00000001; | ||
344 | |||
345 | launch |= 0x00001000; | ||
346 | } else if (request_operation & NVGPU_CE_MEMSET) { | ||
347 | /* Remap from component A on 1 byte wide pixels */ | ||
348 | cmd_buf_cpu_va[methodSize++] = 0x200181c2; | ||
349 | cmd_buf_cpu_va[methodSize++] = 0x00000004; | ||
350 | |||
351 | cmd_buf_cpu_va[methodSize++] = 0x200181c0; | ||
352 | cmd_buf_cpu_va[methodSize++] = payload; | ||
353 | |||
354 | launch |= 0x00000400; | ||
355 | } else { | ||
356 | /* Illegal operation */ | |
357 | return 0; | ||
358 | } | ||
266 | 359 | ||
267 | cmd_buf_cpu_va[methodSize++] = 0x20018098; | 360 | /* setup the destination/output */ |
268 | if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) { | 361 | cmd_buf_cpu_va[methodSize++] = 0x20068102; |
362 | cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf + | ||
363 | offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); | ||
364 | cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf + | ||
365 | offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); | ||
366 | /* Pitch in/out */ | ||
367 | cmd_buf_cpu_va[methodSize++] = width; | ||
368 | cmd_buf_cpu_va[methodSize++] = width; | ||
369 | /* width and line count */ | ||
370 | cmd_buf_cpu_va[methodSize++] = width; | ||
371 | cmd_buf_cpu_va[methodSize++] = height; | ||
372 | |||
373 | cmd_buf_cpu_va[methodSize++] = 0x20018099; | ||
374 | if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) | ||
269 | cmd_buf_cpu_va[methodSize++] = 0x00000000; | 375 | cmd_buf_cpu_va[methodSize++] = 0x00000000; |
270 | } else if (launch_flags & NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) { | 376 | else if (launch_flags & |
377 | NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) | ||
271 | cmd_buf_cpu_va[methodSize++] = 0x00000002; | 378 | cmd_buf_cpu_va[methodSize++] = 0x00000002; |
272 | } else { | 379 | else |
273 | cmd_buf_cpu_va[methodSize++] = 0x00000001; | 380 | cmd_buf_cpu_va[methodSize++] = 0x00000001; |
274 | } | ||
275 | 381 | ||
276 | launch |= 0x00001000; | 382 | launch |= 0x00002005; |
277 | } else if (request_operation & NVGPU_CE_MEMSET) { | ||
278 | cmd_buf_cpu_va[methodSize++] = 0x200181c2; | ||
279 | cmd_buf_cpu_va[methodSize++] = 0x00030004; | ||
280 | 383 | ||
281 | cmd_buf_cpu_va[methodSize++] = 0x200181c0; | 384 | if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) |
282 | cmd_buf_cpu_va[methodSize++] = payload; | 385 | launch |= 0x00000000; |
386 | else | ||
387 | launch |= 0x00000080; | ||
283 | 388 | ||
284 | launch |= 0x00000400; | 389 | if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) |
390 | launch |= 0x00000000; | ||
391 | else | ||
392 | launch |= 0x00000100; | ||
285 | 393 | ||
286 | /* converted into number of words */ | 394 | cmd_buf_cpu_va[methodSize++] = 0x200180c0; |
287 | size /= sizeof(u32); | 395 | cmd_buf_cpu_va[methodSize++] = launch; |
396 | offset += chunk_size; | ||
397 | chunk -= chunk_size; | ||
288 | } | 398 | } |
289 | 399 | ||
290 | /* setup the destination/output */ | ||
291 | cmd_buf_cpu_va[methodSize++] = 0x20018103; | ||
292 | cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); | ||
293 | |||
294 | cmd_buf_cpu_va[methodSize++] = 0x20018102; | ||
295 | cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); | ||
296 | |||
297 | cmd_buf_cpu_va[methodSize++] = 0x20018099; | ||
298 | if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) { | ||
299 | cmd_buf_cpu_va[methodSize++] = 0x00000000; | ||
300 | } else if (launch_flags & NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) { | ||
301 | cmd_buf_cpu_va[methodSize++] = 0x00000002; | ||
302 | } else { | ||
303 | cmd_buf_cpu_va[methodSize++] = 0x00000001; | ||
304 | } | ||
305 | |||
306 | launch |= 0x00002000; | ||
307 | |||
308 | /* setup the format */ | ||
309 | cmd_buf_cpu_va[methodSize++] = 0x20018107; | ||
310 | cmd_buf_cpu_va[methodSize++] = 1; | ||
311 | cmd_buf_cpu_va[methodSize++] = 0x20018106; | ||
312 | cmd_buf_cpu_va[methodSize++] = u64_lo32(size); | ||
313 | |||
314 | launch |= 0x00000004; | ||
315 | |||
316 | if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) | ||
317 | launch |= 0x00000000; | ||
318 | else | ||
319 | launch |= 0x00000080; | ||
320 | |||
321 | if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) | ||
322 | launch |= 0x00000000; | ||
323 | else | ||
324 | launch |= 0x00000100; | ||
325 | |||
326 | if (launch_flags & NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED) | ||
327 | launch |= 0x00000002; | ||
328 | else | ||
329 | launch |= 0x00000001; | ||
330 | |||
331 | cmd_buf_cpu_va[methodSize++] = 0x200180c0; | ||
332 | cmd_buf_cpu_va[methodSize++] = launch; | ||
333 | |||
334 | return methodSize; | 400 | return methodSize; |
335 | } | 401 | } |
336 | 402 | ||
@@ -457,6 +523,16 @@ u32 gk20a_ce_create_context_with_cb(struct gk20a *g, | |||
457 | 523 | ||
458 | ce_ctx->vm = g->mm.ce.vm; | 524 | ce_ctx->vm = g->mm.ce.vm; |
459 | 525 | ||
526 | if (nvgpu_is_enabled(g, NVGPU_MM_CE_TSG_REQUIRED)) { | ||
527 | /* allocate a tsg if needed */ | ||
528 | ce_ctx->tsg = gk20a_tsg_open(g); | ||
529 | |||
530 | if (!ce_ctx->tsg) { | ||
531 | nvgpu_err(g, "ce: gk20a tsg not available"); | ||
532 | goto end; | ||
533 | } | ||
534 | } | ||
535 | |||
460 | /* always kernel client needs privileged channel */ | 536 | /* always kernel client needs privileged channel */ |
461 | ce_ctx->ch = gk20a_open_new_channel_with_cb(g, gk20a_ce_finished_ctx_cb, | 537 | ce_ctx->ch = gk20a_open_new_channel_with_cb(g, gk20a_ce_finished_ctx_cb, |
462 | ce_ctx, | 538 | ce_ctx, |
@@ -475,6 +551,14 @@ u32 gk20a_ce_create_context_with_cb(struct gk20a *g, | |||
475 | goto end; | 551 | goto end; |
476 | } | 552 | } |
477 | 553 | ||
554 | if (nvgpu_is_enabled(g, NVGPU_MM_CE_TSG_REQUIRED)) { | ||
555 | err = gk20a_tsg_bind_channel(ce_ctx->tsg, ce_ctx->ch); | ||
556 | if (err) { | ||
557 | nvgpu_err(g, "ce: unable to bind to tsg"); | ||
558 | goto end; | ||
559 | } | ||
560 | } | ||
561 | |||
478 | /* allocate gpfifo (1024 should be more than enough) */ | 562 | /* allocate gpfifo (1024 should be more than enough) */ |
479 | err = gk20a_channel_alloc_gpfifo(ce_ctx->ch, 1024, 0, 0); | 563 | err = gk20a_channel_alloc_gpfifo(ce_ctx->ch, 1024, 0, 0); |
480 | if (err) { | 564 | if (err) { |
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h index f1f9e260..1dad8952 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h | |||
@@ -36,8 +36,8 @@ int gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base); | |||
36 | #define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff | 36 | #define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff |
37 | #define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff | 37 | #define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff |
38 | 38 | ||
39 | #define NVGPU_CE_COMMAND_BUF_SIZE 4096 | 39 | #define NVGPU_CE_COMMAND_BUF_SIZE 8192 |
40 | #define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 128 | 40 | #define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 256 |
41 | #define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING 8 | 41 | #define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING 8 |
42 | 42 | ||
43 | typedef void (*ce_event_callback)(u32 ce_ctx_id, u32 ce_event_flag); | 43 | typedef void (*ce_event_callback)(u32 ce_ctx_id, u32 ce_event_flag); |
@@ -108,6 +108,9 @@ struct gk20a_gpu_ctx { | |||
108 | int gpu_ctx_state; | 108 | int gpu_ctx_state; |
109 | ce_event_callback user_event_callback; | 109 | ce_event_callback user_event_callback; |
110 | 110 | ||
111 | /* tsg related data */ | ||
112 | struct tsg_gk20a *tsg; | ||
113 | |||
111 | /* channel related data */ | 114 | /* channel related data */ |
112 | struct channel_gk20a *ch; | 115 | struct channel_gk20a *ch; |
113 | struct vm_gk20a *vm; | 116 | struct vm_gk20a *vm; |
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h index c2c73b9c..d4ff17f3 100644 --- a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h | |||
@@ -233,6 +233,9 @@ struct gk20a_platform { | |||
233 | /* unified or split memory with separate vidmem? */ | 233 | /* unified or split memory with separate vidmem? */ |
234 | bool unified_memory; | 234 | bool unified_memory; |
235 | 235 | ||
236 | /* true if all channels must be in TSG */ | ||
237 | bool tsg_required; | ||
238 | |||
236 | /* minimum supported VBIOS version */ | 239 | /* minimum supported VBIOS version */ |
237 | u32 vbios_min_version; | 240 | u32 vbios_min_version; |
238 | 241 | ||