path: root/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
author	David Nieto <dmartineznie@nvidia.com>	2017-10-04 13:44:40 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-10-13 16:42:30 -0400
commit	e02d14e7542aed80c8f37c12a1d5df127146fbd3 (patch)
tree	2d5ee7974648921491a782bf8fde0d0fd3624348 /drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
parent	036e4ea2442d27cdbce6d67683ea629ed82ed208 (diff)
gpu: nvgpu: ce: tsg and large vidmem support
Some GPUs require all channels to be on a TSG, and also have vidmem sizes
larger than 4GB, neither of which was supported by the previous CE2 code.

This change creates a new property to track whether the copy engine needs
to encapsulate its kernel context in a TSG, and also modifies the copy
engine code to support much larger copies without dramatically increasing
the PB size.

JIRA: EVLR-1990

Change-Id: Ieb4acba0c787eb96cb9c7cd97f884d2119d445aa
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1573216
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Nirav Patel <nipatel@nvidia.com>
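The core of the large-copy support is decomposing an arbitrary byte count into a small number of 2D rectangles of 1-byte pixels, each at most 2Gpix wide. Below is a minimal standalone userspace sketch of that chunking loop (not part of the commit): __builtin_ffsll stands in for the kernel's __ffs, and the MAX_CE_* names mirror the defines added in the diff that follows.

#include <stdio.h>
#include <stdint.h>

#define MAX_CE_SHIFT	31	/* 4Gpixels - 1 */
#define MAX_CE_MASK	((uint32_t)(~(~0U << MAX_CE_SHIFT)))
#define MAX_CE_ALIGN(a)	((a) & MAX_CE_MASK)

/*
 * Split 'size' bytes into width x height rectangles of 1-byte pixels.
 * Each pass peels off the largest rectangle whose width is the power of
 * two given by the lowest set bit of the remaining size (capped at
 * 2^31), matching what gk20a_ce_prepare_submit() emits per iteration.
 */
static unsigned int ce_count_rectangles(uint64_t size)
{
	uint64_t chunk = size;
	unsigned int iterations = 0;

	while (chunk) {
		/* __builtin_ffsll is 1-based; the kernel's __ffs is 0-based */
		uint32_t shift = MAX_CE_ALIGN(chunk) ?
			(uint32_t)__builtin_ffsll(MAX_CE_ALIGN(chunk)) - 1 :
			MAX_CE_SHIFT;
		uint32_t height = MAX_CE_ALIGN(chunk >> shift);
		uint32_t width = 1U << shift;

		chunk -= (uint64_t)height * width;
		iterations++;
	}
	return iterations;
}

int main(void)
{
	/* e.g. a 12 GB vidmem clear needs only one 2Gpix-wide rectangle */
	printf("12 GB -> %u rectangle(s)\n", ce_count_rectangles(12ULL << 30));
	printf("4 GB + 1 -> %u rectangle(s)\n",
	       ce_count_rectangles((4ULL << 30) + 1));
	return 0;
}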
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/ce2_gk20a.c')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/ce2_gk20a.c	228
1 file changed, 156 insertions(+), 72 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index fdc1ac61..5314a1be 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -28,6 +28,7 @@
 #include "gk20a.h"
 
 #include <nvgpu/log.h>
+#include <nvgpu/enabled.h>
 
 #include <nvgpu/hw/gk20a/hw_ce2_gk20a.h>
 #include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
@@ -38,6 +39,14 @@
 #include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
 #include <nvgpu/barrier.h>
 
+/*
+ * Copy engine defines line size in pixels
+ */
+#define MAX_CE_SHIFT	31	/* 4Gpixels - 1 */
+#define MAX_CE_MASK	((u32) (~(~0 << MAX_CE_SHIFT)))
+#define MAX_CE_ALIGN(a)	(a & MAX_CE_MASK)
+
+
 static u32 ce2_nonblockpipe_isr(struct gk20a *g, u32 fifo_intr)
 {
 	gk20a_dbg(gpu_dbg_intr, "ce2 non-blocking pipe interrupt\n");
@@ -192,6 +201,10 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
 		nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
 	}
 
+	/* unbind tsg */
+	if (ce_ctx->tsg && ce_ctx->ch)
+		gk20a_tsg_unbind_channel(ce_ctx->ch);
+
 	/* free the channel */
 	if (ce_ctx->ch)
 		gk20a_channel_close(ce_ctx->ch);
@@ -206,15 +219,32 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
 	nvgpu_kfree(ce_ctx->g, ce_ctx);
 }
 
-static inline unsigned int gk20a_ce_get_method_size(int request_operation)
+static inline unsigned int gk20a_ce_get_method_size(int request_operation,
+		u64 size)
 {
 	/* failure size */
 	unsigned int methodsize = UINT_MAX;
+	unsigned int iterations = 0;
+	u32 shift;
+	u64 chunk = size;
+	u32 height, width;
+
+	while (chunk) {
+		iterations++;
+
+		shift = MAX_CE_ALIGN(chunk) ? __ffs(MAX_CE_ALIGN(chunk)) :
+				MAX_CE_SHIFT;
+		width = chunk >> shift;
+		height = 1 << shift;
+		width = MAX_CE_ALIGN(width);
+
+		chunk -= (u64) height * width;
+	}
 
 	if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER)
-		methodsize = 10 * 2 * sizeof(u32);
+		methodsize = (2 + (16 * iterations)) * sizeof(u32);
 	else if (request_operation & NVGPU_CE_MEMSET)
-		methodsize = 9 * 2 * sizeof(u32);
+		methodsize = (2 + (15 * iterations)) * sizeof(u32);
 
 	return methodsize;
 }
@@ -243,10 +273,13 @@ static int gk20a_ce_prepare_submit(u64 src_buf,
 {
 	u32 launch = 0;
 	u32 methodSize = 0;
+	u64 offset = 0;
+	u64 chunk_size = 0;
+	u64 chunk = size;
 
 	/* failure case handling */
-	if ((gk20a_ce_get_method_size(request_operation) > max_cmd_buf_size) ||
-		(!size) ||
+	if ((gk20a_ce_get_method_size(request_operation, size) >
+		max_cmd_buf_size) || (!size) ||
 		(request_operation > NVGPU_CE_MEMSET))
 		return 0;
 
@@ -254,83 +287,116 @@ static int gk20a_ce_prepare_submit(u64 src_buf,
 	cmd_buf_cpu_va[methodSize++] = 0x20018000;
 	cmd_buf_cpu_va[methodSize++] = dma_copy_class;
 
-	if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) {
-		/* setup the source */
-		cmd_buf_cpu_va[methodSize++] = 0x20018101;
-		cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf) &
-			NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
-
-		cmd_buf_cpu_va[methodSize++] = 0x20018100;
-		cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf) &
-			NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
-
-		cmd_buf_cpu_va[methodSize++] = 0x20018098;
-		if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) {
-			cmd_buf_cpu_va[methodSize++] = 0x00000000;
-		} else if (launch_flags & NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) {
-			cmd_buf_cpu_va[methodSize++] = 0x00000002;
-		} else {
-			cmd_buf_cpu_va[methodSize++] = 0x00000001;
-		}
-
-		launch |= 0x00001000;
-	} else if (request_operation & NVGPU_CE_MEMSET) {
-		cmd_buf_cpu_va[methodSize++] = 0x200181c2;
-		cmd_buf_cpu_va[methodSize++] = 0x00030004;
-
-		cmd_buf_cpu_va[methodSize++] = 0x200181c0;
-		cmd_buf_cpu_va[methodSize++] = payload;
-
-		launch |= 0x00000400;
-
-		/* converted into number of words */
-		size /= sizeof(u32);
-	}
-
-	/* setup the destination/output */
-	cmd_buf_cpu_va[methodSize++] = 0x20018103;
-	cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
-
-	cmd_buf_cpu_va[methodSize++] = 0x20018102;
-	cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
-
-	cmd_buf_cpu_va[methodSize++] = 0x20018099;
-	if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) {
-		cmd_buf_cpu_va[methodSize++] = 0x00000000;
-	} else if (launch_flags & NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) {
-		cmd_buf_cpu_va[methodSize++] = 0x00000002;
-	} else {
-		cmd_buf_cpu_va[methodSize++] = 0x00000001;
-	}
-
-	launch |= 0x00002000;
-
-	/* setup the format */
-	cmd_buf_cpu_va[methodSize++] = 0x20018107;
-	cmd_buf_cpu_va[methodSize++] = 1;
-	cmd_buf_cpu_va[methodSize++] = 0x20018106;
-	cmd_buf_cpu_va[methodSize++] = u64_lo32(size);
-
-	launch |= 0x00000004;
-
-	if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR)
-		launch |= 0x00000000;
-	else
-		launch |= 0x00000080;
-
-	if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR)
-		launch |= 0x00000000;
-	else
-		launch |= 0x00000100;
-
-	if (launch_flags & NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED)
-		launch |= 0x00000002;
-	else
-		launch |= 0x00000001;
-
-	cmd_buf_cpu_va[methodSize++] = 0x200180c0;
-	cmd_buf_cpu_va[methodSize++] = launch;
-
+	/*
+	 * The purpose is to clear the memory in 2D rectangles. We use ffs to
+	 * determine the number of lines to copy. The only constraint is that
+	 * the maximum number of pixels per line is 4Gpix - 1, which is
+	 * awkward for calculation, so we settle on 2Gpix per line to make
+	 * the calculation more agreeable.
+	 */
+
+	/* The copy engine in 2D mode can have (2^32 - 1) x (2^32 - 1) pixels
+	 * in a single submit; we are going to try to clear a range of up to
+	 * 2Gpix in multiple lines. Because we want the copy byte aligned we
+	 * will be setting 1-byte pixels */
+
+	/*
+	 * per iteration
+	 * <------------------------- 40 bits ------------------------------>
+	 * 1 <------ ffs ------->
+	 * <-----------up to 30 bits----------->
+	 */
+	while (chunk) {
+		u32 width, height, shift;
+
+		/*
+		 * We will be aligning to bytes, making the maximum number of
+		 * pix per line 2Gpix
+		 */
+
+		shift = MAX_CE_ALIGN(chunk) ? __ffs(MAX_CE_ALIGN(chunk)) :
+				MAX_CE_SHIFT;
+		height = chunk >> shift;
+		width = 1 << shift;
+		height = MAX_CE_ALIGN(height);
+
+		chunk_size = (u64) height * width;
+
+		/* reset launch flag */
+		launch = 0;
+
+		if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) {
+			/* setup the source */
+			cmd_buf_cpu_va[methodSize++] = 0x20028100;
+			cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf +
+				offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
+			cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf +
+				offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
+
+			cmd_buf_cpu_va[methodSize++] = 0x20018098;
+			if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB)
+				cmd_buf_cpu_va[methodSize++] = 0x00000000;
+			else if (launch_flags &
+				NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM)
+				cmd_buf_cpu_va[methodSize++] = 0x00000002;
+			else
+				cmd_buf_cpu_va[methodSize++] = 0x00000001;
+
+			launch |= 0x00001000;
+		} else if (request_operation & NVGPU_CE_MEMSET) {
+			/* Remap from component A on 1 byte wide pixels */
+			cmd_buf_cpu_va[methodSize++] = 0x200181c2;
+			cmd_buf_cpu_va[methodSize++] = 0x00000004;
+
+			cmd_buf_cpu_va[methodSize++] = 0x200181c0;
+			cmd_buf_cpu_va[methodSize++] = payload;
+
+			launch |= 0x00000400;
+		} else {
+			/* Illegal size */
+			return 0;
+		}
+
+		/* setup the destination/output */
+		cmd_buf_cpu_va[methodSize++] = 0x20068102;
+		cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf +
+			offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
+		cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf +
+			offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
+		/* Pitch in/out */
+		cmd_buf_cpu_va[methodSize++] = width;
+		cmd_buf_cpu_va[methodSize++] = width;
+		/* width and line count */
+		cmd_buf_cpu_va[methodSize++] = width;
+		cmd_buf_cpu_va[methodSize++] = height;
+
+		cmd_buf_cpu_va[methodSize++] = 0x20018099;
+		if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB)
+			cmd_buf_cpu_va[methodSize++] = 0x00000000;
+		else if (launch_flags &
+			NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM)
+			cmd_buf_cpu_va[methodSize++] = 0x00000002;
+		else
+			cmd_buf_cpu_va[methodSize++] = 0x00000001;
+
+		launch |= 0x00002005;
+
+		if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR)
+			launch |= 0x00000000;
+		else
+			launch |= 0x00000080;
+
+		if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR)
+			launch |= 0x00000000;
+		else
+			launch |= 0x00000100;
+
+		cmd_buf_cpu_va[methodSize++] = 0x200180c0;
+		cmd_buf_cpu_va[methodSize++] = launch;
+		offset += chunk_size;
+		chunk -= chunk_size;
+	}
+
 	return methodSize;
 }
 
@@ -457,6 +523,16 @@ u32 gk20a_ce_create_context_with_cb(struct gk20a *g,
 
 	ce_ctx->vm = g->mm.ce.vm;
 
+	if (nvgpu_is_enabled(g, NVGPU_MM_CE_TSG_REQUIRED)) {
+		/* allocate a tsg if needed */
+		ce_ctx->tsg = gk20a_tsg_open(g);
+
+		if (!ce_ctx->tsg) {
+			nvgpu_err(g, "ce: gk20a tsg not available");
+			goto end;
+		}
+	}
+
 	/* always kernel client needs privileged channel */
 	ce_ctx->ch = gk20a_open_new_channel_with_cb(g, gk20a_ce_finished_ctx_cb,
 			ce_ctx,
@@ -475,6 +551,14 @@ u32 gk20a_ce_create_context_with_cb(struct gk20a *g,
 		goto end;
 	}
 
+	if (nvgpu_is_enabled(g, NVGPU_MM_CE_TSG_REQUIRED)) {
+		err = gk20a_tsg_bind_channel(ce_ctx->tsg, ce_ctx->ch);
+		if (err) {
+			nvgpu_err(g, "ce: unable to bind to tsg");
+			goto end;
+		}
+	}
+
 	/* allocate gpfifo (1024 should be more than enough) */
 	err = gk20a_channel_alloc_gpfifo(ce_ctx->ch, 1024, 0, 0);
 	if (err) {
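As a quick check of the new sizing formula in gk20a_ce_get_method_size(), the word counts can be read off the submit loop above:

Per NVGPU_CE_PHYS_MODE_TRANSFER iteration:
  3 words  source address setup (0x20028100 + hi/lo)
+ 2 words  source location (0x20018098)
+ 7 words  destination address, pitch, width, line count (0x20068102)
+ 2 words  destination location (0x20018099)
+ 2 words  launch (0x200180c0)
= 16 words

Per NVGPU_CE_MEMSET iteration, the 5 source words are replaced by 4 remap/payload words (0x200181c2, 0x200181c0), giving 15 words.

Adding the one-time 2-word class bind (0x20018000 + dma_copy_class) yields exactly (2 + (16 * iterations)) and (2 + (15 * iterations)), times sizeof(u32).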