path: root/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
author	David Nieto <dmartineznie@nvidia.com>	2017-10-04 13:44:40 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-10-13 16:42:30 -0400
commit	e02d14e7542aed80c8f37c12a1d5df127146fbd3 (patch)
tree	2d5ee7974648921491a782bf8fde0d0fd3624348 /drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
parent	036e4ea2442d27cdbce6d67683ea629ed82ed208 (diff)
gpu: nvgpu: ce: tsg and large vidmem support
Some GPUs require all channels to be on a TSG, and also have vidmem sizes
larger than 4GB, neither of which was supported by the previous CE2 code.

This change creates a new property to track whether the copy engine needs
to encapsulate its kernel context in a TSG, and also modifies the copy
engine code to support much larger copies without dramatically increasing
the PB size.

JIRA: EVLR-1990

Change-Id: Ieb4acba0c787eb96cb9c7cd97f884d2119d445aa
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1573216
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Nirav Patel <nipatel@nvidia.com>
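The core of the large-copy support is decomposing an arbitrary byte count into a small number of 2D rectangles of 1-byte pixels, each at most 2Gpix wide. Below is a minimal standalone userspace sketch of that chunking loop (not part of the commit): __builtin_ffsll stands in for the kernel's __ffs, and the MAX_CE_* names mirror the defines added in the diff that follows.

#include <stdio.h>
#include <stdint.h>

#define MAX_CE_SHIFT	31	/* 4Gpixels - 1 */
#define MAX_CE_MASK	((uint32_t)(~(~0U << MAX_CE_SHIFT)))
#define MAX_CE_ALIGN(a)	((a) & MAX_CE_MASK)

/*
 * Split 'size' bytes into width x height rectangles of 1-byte pixels.
 * Each pass peels off the largest rectangle whose width is the power of
 * two given by the lowest set bit of the remaining size (capped at
 * 2^31), matching what gk20a_ce_prepare_submit() emits per iteration.
 */
static unsigned int ce_count_rectangles(uint64_t size)
{
	uint64_t chunk = size;
	unsigned int iterations = 0;

	while (chunk) {
		/* __builtin_ffsll is 1-based; the kernel's __ffs is 0-based */
		uint32_t shift = MAX_CE_ALIGN(chunk) ?
			(uint32_t)__builtin_ffsll(MAX_CE_ALIGN(chunk)) - 1 :
			MAX_CE_SHIFT;
		uint32_t height = MAX_CE_ALIGN(chunk >> shift);
		uint32_t width = 1U << shift;

		chunk -= (uint64_t)height * width;
		iterations++;
	}
	return iterations;
}

int main(void)
{
	/* e.g. a 12 GB vidmem clear needs only one 2Gpix-wide rectangle */
	printf("12 GB -> %u rectangle(s)\n", ce_count_rectangles(12ULL << 30));
	printf("4 GB + 1 -> %u rectangle(s)\n",
	       ce_count_rectangles((4ULL << 30) + 1));
	return 0;
}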
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/ce2_gk20a.c')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/ce2_gk20a.c	228
1 file changed, 156 insertions(+), 72 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
index fdc1ac61..5314a1be 100644
--- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c
@@ -28,6 +28,7 @@
 #include "gk20a.h"
 
 #include <nvgpu/log.h>
+#include <nvgpu/enabled.h>
 
 #include <nvgpu/hw/gk20a/hw_ce2_gk20a.h>
 #include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
@@ -38,6 +39,14 @@
 #include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
 #include <nvgpu/barrier.h>
 
+/*
+ * Copy engine defines line size in pixels
+ */
+#define MAX_CE_SHIFT	31	/* 4Gpixels - 1 */
+#define MAX_CE_MASK	((u32) (~(~0 << MAX_CE_SHIFT)))
+#define MAX_CE_ALIGN(a)	(a & MAX_CE_MASK)
+
+
 static u32 ce2_nonblockpipe_isr(struct gk20a *g, u32 fifo_intr)
 {
 	gk20a_dbg(gpu_dbg_intr, "ce2 non-blocking pipe interrupt\n");
@@ -192,6 +201,10 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
 		nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem);
 	}
 
+	/* unbind tsg */
+	if (ce_ctx->tsg && ce_ctx->ch)
+		gk20a_tsg_unbind_channel(ce_ctx->ch);
+
 	/* free the channel */
 	if (ce_ctx->ch)
 		gk20a_channel_close(ce_ctx->ch);
@@ -206,15 +219,32 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx)
 	nvgpu_kfree(ce_ctx->g, ce_ctx);
 }
 
-static inline unsigned int gk20a_ce_get_method_size(int request_operation)
+static inline unsigned int gk20a_ce_get_method_size(int request_operation,
+		u64 size)
 {
 	/* failure size */
 	unsigned int methodsize = UINT_MAX;
+	unsigned int iterations = 0;
+	u32 shift;
+	u64 chunk = size;
+	u32 height, width;
+
+	while (chunk) {
+		iterations++;
+
+		shift = MAX_CE_ALIGN(chunk) ? __ffs(MAX_CE_ALIGN(chunk)) :
+				MAX_CE_SHIFT;
+		width = chunk >> shift;
+		height = 1 << shift;
+		width = MAX_CE_ALIGN(width);
+
+		chunk -= (u64) height * width;
+	}
 
 	if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER)
-		methodsize = 10 * 2 * sizeof(u32);
+		methodsize = (2 + (16 * iterations)) * sizeof(u32);
 	else if (request_operation & NVGPU_CE_MEMSET)
-		methodsize = 9 * 2 * sizeof(u32);
+		methodsize = (2 + (15 * iterations)) * sizeof(u32);
 
 	return methodsize;
 }
@@ -243,10 +273,13 @@ static int gk20a_ce_prepare_submit(u64 src_buf,
 {
 	u32 launch = 0;
 	u32 methodSize = 0;
+	u64 offset = 0;
+	u64 chunk_size = 0;
+	u64 chunk = size;
 
 	/* failure case handling */
-	if ((gk20a_ce_get_method_size(request_operation) > max_cmd_buf_size) ||
-		(!size) ||
+	if ((gk20a_ce_get_method_size(request_operation, size) >
+		max_cmd_buf_size) || (!size) ||
 		(request_operation > NVGPU_CE_MEMSET))
 		return 0;
 
@@ -254,83 +287,116 @@ static int gk20a_ce_prepare_submit(u64 src_buf,
 	cmd_buf_cpu_va[methodSize++] = 0x20018000;
 	cmd_buf_cpu_va[methodSize++] = dma_copy_class;
 
-	if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) {
-		/* setup the source */
-		cmd_buf_cpu_va[methodSize++] = 0x20018101;
-		cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf) &
-			NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
-
-		cmd_buf_cpu_va[methodSize++] = 0x20018100;
-		cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf) &
-			NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
-
-		cmd_buf_cpu_va[methodSize++] = 0x20018098;
-		if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) {
-			cmd_buf_cpu_va[methodSize++] = 0x00000000;
-		} else if (launch_flags & NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) {
-			cmd_buf_cpu_va[methodSize++] = 0x00000002;
-		} else {
-			cmd_buf_cpu_va[methodSize++] = 0x00000001;
-		}
-
-		launch |= 0x00001000;
-	} else if (request_operation & NVGPU_CE_MEMSET) {
-		cmd_buf_cpu_va[methodSize++] = 0x200181c2;
-		cmd_buf_cpu_va[methodSize++] = 0x00030004;
-
-		cmd_buf_cpu_va[methodSize++] = 0x200181c0;
-		cmd_buf_cpu_va[methodSize++] = payload;
-
-		launch |= 0x00000400;
-
-		/* converted into number of words */
-		size /= sizeof(u32);
-	}
-
-	/* setup the destination/output */
-	cmd_buf_cpu_va[methodSize++] = 0x20018103;
-	cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
-
-	cmd_buf_cpu_va[methodSize++] = 0x20018102;
-	cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
-
-	cmd_buf_cpu_va[methodSize++] = 0x20018099;
-	if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) {
-		cmd_buf_cpu_va[methodSize++] = 0x00000000;
-	} else if (launch_flags & NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) {
-		cmd_buf_cpu_va[methodSize++] = 0x00000002;
-	} else {
-		cmd_buf_cpu_va[methodSize++] = 0x00000001;
-	}
-
-	launch |= 0x00002000;
-
-	/* setup the format */
-	cmd_buf_cpu_va[methodSize++] = 0x20018107;
-	cmd_buf_cpu_va[methodSize++] = 1;
-	cmd_buf_cpu_va[methodSize++] = 0x20018106;
-	cmd_buf_cpu_va[methodSize++] = u64_lo32(size);
-
-	launch |= 0x00000004;
-
-	if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR)
-		launch |= 0x00000000;
-	else
-		launch |= 0x00000080;
-
-	if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR)
-		launch |= 0x00000000;
-	else
-		launch |= 0x00000100;
-
-	if (launch_flags & NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED)
-		launch |= 0x00000002;
-	else
-		launch |= 0x00000001;
-
-	cmd_buf_cpu_va[methodSize++] = 0x200180c0;
-	cmd_buf_cpu_va[methodSize++] = launch;
-
+	/*
+	 * The purpose is to clear the memory in 2D rectangles. We use ffs to
+	 * determine the number of lines to copy. The only constraint is that
+	 * the maximum number of pixels per line is 4Gpix - 1, which is
+	 * awkward for calculation, so we settle on 2Gpix per line to make
+	 * the calculation more agreeable.
+	 */
+
+	/* The copy engine in 2D mode can have (2^32 - 1) x (2^32 - 1) pixels
+	 * in a single submit; we are going to try to clear a range of up to
+	 * 2Gpix in multiple lines. Because we want the copy byte aligned we
+	 * will be setting 1-byte pixels */
+
+	/*
+	 * per iteration
+	 * <------------------------- 40 bits ------------------------------>
+	 * 1 <------ ffs ------->
+	 * <-----------up to 30 bits----------->
+	 */
+	while (chunk) {
+		u32 width, height, shift;
+
+		/*
+		 * We will be aligning to bytes, making the maximum number of
+		 * pix per line 2Gpix
+		 */
+
+		shift = MAX_CE_ALIGN(chunk) ? __ffs(MAX_CE_ALIGN(chunk)) :
+				MAX_CE_SHIFT;
+		height = chunk >> shift;
+		width = 1 << shift;
+		height = MAX_CE_ALIGN(height);
+
+		chunk_size = (u64) height * width;
+
+		/* reset launch flag */
+		launch = 0;
+
+		if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) {
+			/* setup the source */
+			cmd_buf_cpu_va[methodSize++] = 0x20028100;
+			cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf +
+				offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
+			cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf +
+				offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
+
+			cmd_buf_cpu_va[methodSize++] = 0x20018098;
+			if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB)
+				cmd_buf_cpu_va[methodSize++] = 0x00000000;
+			else if (launch_flags &
+				NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM)
+				cmd_buf_cpu_va[methodSize++] = 0x00000002;
+			else
+				cmd_buf_cpu_va[methodSize++] = 0x00000001;
+
+			launch |= 0x00001000;
+		} else if (request_operation & NVGPU_CE_MEMSET) {
+			/* Remap from component A on 1 byte wide pixels */
+			cmd_buf_cpu_va[methodSize++] = 0x200181c2;
+			cmd_buf_cpu_va[methodSize++] = 0x00000004;
+
+			cmd_buf_cpu_va[methodSize++] = 0x200181c0;
+			cmd_buf_cpu_va[methodSize++] = payload;
+
+			launch |= 0x00000400;
+		} else {
+			/* Illegal size */
+			return 0;
+		}
+
+		/* setup the destination/output */
+		cmd_buf_cpu_va[methodSize++] = 0x20068102;
+		cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf +
+			offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK);
+		cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf +
+			offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK);
+		/* Pitch in/out */
+		cmd_buf_cpu_va[methodSize++] = width;
+		cmd_buf_cpu_va[methodSize++] = width;
+		/* width and line count */
+		cmd_buf_cpu_va[methodSize++] = width;
+		cmd_buf_cpu_va[methodSize++] = height;
+
+		cmd_buf_cpu_va[methodSize++] = 0x20018099;
+		if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB)
+			cmd_buf_cpu_va[methodSize++] = 0x00000000;
+		else if (launch_flags &
+			NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM)
+			cmd_buf_cpu_va[methodSize++] = 0x00000002;
+		else
+			cmd_buf_cpu_va[methodSize++] = 0x00000001;
+
+		launch |= 0x00002005;
+
+		if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR)
+			launch |= 0x00000000;
+		else
+			launch |= 0x00000080;
+
+		if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR)
+			launch |= 0x00000000;
+		else
+			launch |= 0x00000100;
+
+		cmd_buf_cpu_va[methodSize++] = 0x200180c0;
+		cmd_buf_cpu_va[methodSize++] = launch;
+		offset += chunk_size;
+		chunk -= chunk_size;
+	}
+
 	return methodSize;
 }
 
@@ -457,6 +523,16 @@ u32 gk20a_ce_create_context_with_cb(struct gk20a *g,
 
 	ce_ctx->vm = g->mm.ce.vm;
 
+	if (nvgpu_is_enabled(g, NVGPU_MM_CE_TSG_REQUIRED)) {
+		/* allocate a tsg if needed */
+		ce_ctx->tsg = gk20a_tsg_open(g);
+
+		if (!ce_ctx->tsg) {
+			nvgpu_err(g, "ce: gk20a tsg not available");
+			goto end;
+		}
+	}
+
 	/* always kernel client needs privileged channel */
 	ce_ctx->ch = gk20a_open_new_channel_with_cb(g, gk20a_ce_finished_ctx_cb,
 			ce_ctx,
@@ -475,6 +551,14 @@ u32 gk20a_ce_create_context_with_cb(struct gk20a *g,
 		goto end;
 	}
 
+	if (nvgpu_is_enabled(g, NVGPU_MM_CE_TSG_REQUIRED)) {
+		err = gk20a_tsg_bind_channel(ce_ctx->tsg, ce_ctx->ch);
+		if (err) {
+			nvgpu_err(g, "ce: unable to bind to tsg");
+			goto end;
+		}
+	}
+
 	/* allocate gpfifo (1024 should be more than enough) */
 	err = gk20a_channel_alloc_gpfifo(ce_ctx->ch, 1024, 0, 0);
 	if (err) {
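As a quick check of the new sizing formula in gk20a_ce_get_method_size(), the word counts can be read off the submit loop above:

Per NVGPU_CE_PHYS_MODE_TRANSFER iteration:
  3 words  source address setup (0x20028100 + hi/lo)
+ 2 words  source location (0x20018098)
+ 7 words  destination address, pitch, width, line count (0x20068102)
+ 2 words  destination location (0x20018099)
+ 2 words  launch (0x200180c0)
= 16 words

Per NVGPU_CE_MEMSET iteration, the 5 source words are replaced by 4 remap/payload words (0x200181c2, 0x200181c0), giving 15 words.

Adding the one-time 2-word class bind (0x20018000 + dma_copy_class) yields exactly (2 + (16 * iterations)) and (2 + (15 * iterations)), times sizeof(u32).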