From e02d14e7542aed80c8f37c12a1d5df127146fbd3 Mon Sep 17 00:00:00 2001 From: David Nieto Date: Wed, 4 Oct 2017 10:44:40 -0700 Subject: gpu: nvgpu: ce: tsg and large vidmem support Some GPUs require all channels to be on TSG and also have larger than 4GB vidmem sizes which were not supported on the previous CE2 code. This change creates a new property to track if the copy engine needs to encapsulate its kernel context on tsg and also modifies the copy engine code to support much larger copies without dramatically increasing the PB size. JIRA: EVLR-1990 Change-Id: Ieb4acba0c787eb96cb9c7cd97f884d2119d445aa Signed-off-by: David Nieto Reviewed-on: https://git-master.nvidia.com/r/1573216 Reviewed-by: Automatic_Commit_Validation_User Reviewed-by: svc-mobile-coverity Reviewed-by: Terje Bergstrom Reviewed-by: Alex Waterman GVS: Gerrit_Virtual_Submit Reviewed-by: Nirav Patel --- drivers/gpu/nvgpu/common/linux/driver_common.c | 2 + drivers/gpu/nvgpu/common/linux/pci.c | 2 + drivers/gpu/nvgpu/gk20a/ce2_gk20a.c | 228 +++++++++++++++++-------- drivers/gpu/nvgpu/gk20a/ce2_gk20a.h | 7 +- drivers/gpu/nvgpu/gk20a/platform_gk20a.h | 3 + drivers/gpu/nvgpu/include/nvgpu/enabled.h | 2 + 6 files changed, 170 insertions(+), 74 deletions(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/nvgpu/common/linux/driver_common.c b/drivers/gpu/nvgpu/common/linux/driver_common.c index b4e6a02c..7c4645a8 100644 --- a/drivers/gpu/nvgpu/common/linux/driver_common.c +++ b/drivers/gpu/nvgpu/common/linux/driver_common.c @@ -163,6 +163,8 @@ static void nvgpu_init_mm_vars(struct gk20a *g) platform->unified_memory); __nvgpu_set_enabled(g, NVGPU_MM_UNIFY_ADDRESS_SPACES, platform->unify_address_spaces); + __nvgpu_set_enabled(g, NVGPU_MM_CE_TSG_REQUIRED, + platform->tsg_required); nvgpu_mutex_init(&g->mm.tlb_lock); nvgpu_mutex_init(&g->mm.priv_lock); diff --git a/drivers/gpu/nvgpu/common/linux/pci.c b/drivers/gpu/nvgpu/common/linux/pci.c index 92764c21..401080ed 100644 --- 
a/drivers/gpu/nvgpu/common/linux/pci.c +++ b/drivers/gpu/nvgpu/common/linux/pci.c @@ -240,6 +240,7 @@ static struct gk20a_platform nvgpu_pci_device[] = { .vbios_min_version = 0x88001e00, .hardcode_sw_threshold = false, .run_preos = true, + .tsg_required = true, }, { /* DEVICE=PG503 SKU 200 ES */ /* ptimer src frequency in hz */ @@ -274,6 +275,7 @@ static struct gk20a_platform nvgpu_pci_device[] = { .vbios_min_version = 0x88001e00, .hardcode_sw_threshold = false, .run_preos = true, + .tsg_required = true, } }; diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c index fdc1ac61..5314a1be 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c @@ -28,6 +28,7 @@ #include "gk20a.h" #include +#include #include #include @@ -38,6 +39,14 @@ #include #include +/* + * Copy engine defines line size in pixels + */ +#define MAX_CE_SHIFT 31 /* 4Gpixels -1 */ +#define MAX_CE_MASK ((u32) (~(~0 << MAX_CE_SHIFT))) +#define MAX_CE_ALIGN(a) (a & MAX_CE_MASK) + + static u32 ce2_nonblockpipe_isr(struct gk20a *g, u32 fifo_intr) { gk20a_dbg(gpu_dbg_intr, "ce2 non-blocking pipe interrupt\n"); @@ -192,6 +201,10 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx) nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem); } + /* unbind tsg */ + if (ce_ctx->tsg && ce_ctx->ch) + gk20a_tsg_unbind_channel(ce_ctx->ch); + /* free the channel */ if (ce_ctx->ch) gk20a_channel_close(ce_ctx->ch); @@ -206,15 +219,32 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx) nvgpu_kfree(ce_ctx->g, ce_ctx); } -static inline unsigned int gk20a_ce_get_method_size(int request_operation) +static inline unsigned int gk20a_ce_get_method_size(int request_operation, + u64 size) { /* failure size */ unsigned int methodsize = UINT_MAX; + unsigned int iterations = 0; + u32 shift; + u64 chunk = size; + u32 height, width; + + while (chunk) { + iterations++; + + shift = MAX_CE_ALIGN(chunk) ? 
__ffs(MAX_CE_ALIGN(chunk)) : + MAX_CE_SHIFT; + width = chunk >> shift; + height = 1 << shift; + width = MAX_CE_ALIGN(width); + + chunk -= (u64) height * width; + } if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) - methodsize = 10 * 2 * sizeof(u32); + methodsize = (2 + (16 * iterations)) * sizeof(u32); else if (request_operation & NVGPU_CE_MEMSET) - methodsize = 9 * 2 * sizeof(u32); + methodsize = (2 + (15 * iterations)) * sizeof(u32); return methodsize; } @@ -243,10 +273,13 @@ static int gk20a_ce_prepare_submit(u64 src_buf, { u32 launch = 0; u32 methodSize = 0; + u64 offset = 0; + u64 chunk_size = 0; + u64 chunk = size; /* failure case handling */ - if ((gk20a_ce_get_method_size(request_operation) > max_cmd_buf_size) || - (!size) || + if ((gk20a_ce_get_method_size(request_operation, size) > + max_cmd_buf_size) || (!size) || (request_operation > NVGPU_CE_MEMSET)) return 0; @@ -254,83 +287,116 @@ cmd_buf_cpu_va[methodSize++] = 0x20018000; cmd_buf_cpu_va[methodSize++] = dma_copy_class; - if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) { - /* setup the source */ - cmd_buf_cpu_va[methodSize++] = 0x20018101; - cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf) & - NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); - - cmd_buf_cpu_va[methodSize++] = 0x20018100; - cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf) & - NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); + /* + * The purpose is to clear the memory in 2D rectangles. We get the ffs to + * determine the number of lines to copy. The only constraint is that + * maximum number of pixels per line is 4Gpix - 1, which is awkward for + * calculation, so we settle to 2Gpix per line to make calculation + * more agreeable + */ + + /* The copy engine in 2D mode can have (2^32 - 1) x (2^32 - 1) pixels in + * a single submit, we are going to try to clear a range of up to 2Gpix + * multiple lines. 
Because we want to copy byte aligned we will be + * setting 1 byte pixels */ + + /* + * per iteration + * <------------------------- 40 bits ------------------------------> + * 1 <------ ffs -------> + * <-----------up to 30 bits-----------> + */ + while (chunk) { + u32 width, height, shift; + + /* + * We will be aligning to bytes, making the maximum number of + * pix per line 2Gb + */ + + shift = MAX_CE_ALIGN(chunk) ? __ffs(MAX_CE_ALIGN(chunk)) : + MAX_CE_SHIFT; + height = chunk >> shift; + width = 1 << shift; + height = MAX_CE_ALIGN(height); + + chunk_size = (u64) height * width; + + /* reset launch flag */ + launch = 0; + + if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) { + /* setup the source */ + cmd_buf_cpu_va[methodSize++] = 0x20028100; + cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf + + offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); + cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf + + offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); + + cmd_buf_cpu_va[methodSize++] = 0x20018098; + if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) + cmd_buf_cpu_va[methodSize++] = 0x00000000; + else if (launch_flags & + NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) + cmd_buf_cpu_va[methodSize++] = 0x00000002; + else + cmd_buf_cpu_va[methodSize++] = 0x00000001; + + launch |= 0x00001000; + } else if (request_operation & NVGPU_CE_MEMSET) { + /* Remap from component A on 1 byte wide pixels */ + cmd_buf_cpu_va[methodSize++] = 0x200181c2; + cmd_buf_cpu_va[methodSize++] = 0x00000004; + + cmd_buf_cpu_va[methodSize++] = 0x200181c0; + cmd_buf_cpu_va[methodSize++] = payload; + + launch |= 0x00000400; + } else { + /* Illegal size */ + return 0; + } - cmd_buf_cpu_va[methodSize++] = 0x20018098; - if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) { + /* setup the destination/output */ + cmd_buf_cpu_va[methodSize++] = 0x20068102; + cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf + + offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); + cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf + + 
offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); + /* Pitch in/out */ + cmd_buf_cpu_va[methodSize++] = width; + cmd_buf_cpu_va[methodSize++] = width; + /* width and line count */ + cmd_buf_cpu_va[methodSize++] = width; + cmd_buf_cpu_va[methodSize++] = height; + + cmd_buf_cpu_va[methodSize++] = 0x20018099; + if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) cmd_buf_cpu_va[methodSize++] = 0x00000000; - } else if (launch_flags & NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) { + else if (launch_flags & + NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) cmd_buf_cpu_va[methodSize++] = 0x00000002; - } else { + else cmd_buf_cpu_va[methodSize++] = 0x00000001; - } - launch |= 0x00001000; - } else if (request_operation & NVGPU_CE_MEMSET) { - cmd_buf_cpu_va[methodSize++] = 0x200181c2; - cmd_buf_cpu_va[methodSize++] = 0x00030004; + launch |= 0x00002005; - cmd_buf_cpu_va[methodSize++] = 0x200181c0; - cmd_buf_cpu_va[methodSize++] = payload; + if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) + launch |= 0x00000000; + else + launch |= 0x00000080; - launch |= 0x00000400; + if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) + launch |= 0x00000000; + else + launch |= 0x00000100; - /* converted into number of words */ - size /= sizeof(u32); + cmd_buf_cpu_va[methodSize++] = 0x200180c0; + cmd_buf_cpu_va[methodSize++] = launch; + offset += chunk_size; + chunk -= chunk_size; } - /* setup the destination/output */ - cmd_buf_cpu_va[methodSize++] = 0x20018103; - cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); - - cmd_buf_cpu_va[methodSize++] = 0x20018102; - cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); - - cmd_buf_cpu_va[methodSize++] = 0x20018099; - if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) { - cmd_buf_cpu_va[methodSize++] = 0x00000000; - } else if (launch_flags & NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) { - cmd_buf_cpu_va[methodSize++] = 0x00000002; - } else { - cmd_buf_cpu_va[methodSize++] = 
0x00000001; - } - - launch |= 0x00002000; - - /* setup the format */ - cmd_buf_cpu_va[methodSize++] = 0x20018107; - cmd_buf_cpu_va[methodSize++] = 1; - cmd_buf_cpu_va[methodSize++] = 0x20018106; - cmd_buf_cpu_va[methodSize++] = u64_lo32(size); - - launch |= 0x00000004; - - if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) - launch |= 0x00000000; - else - launch |= 0x00000080; - - if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) - launch |= 0x00000000; - else - launch |= 0x00000100; - - if (launch_flags & NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED) - launch |= 0x00000002; - else - launch |= 0x00000001; - - cmd_buf_cpu_va[methodSize++] = 0x200180c0; - cmd_buf_cpu_va[methodSize++] = launch; - return methodSize; } @@ -457,6 +523,16 @@ u32 gk20a_ce_create_context_with_cb(struct gk20a *g, ce_ctx->vm = g->mm.ce.vm; + if (nvgpu_is_enabled(g, NVGPU_MM_CE_TSG_REQUIRED)) { + /* allocate a tsg if needed */ + ce_ctx->tsg = gk20a_tsg_open(g); + + if (!ce_ctx->tsg) { + nvgpu_err(g, "ce: gk20a tsg not available"); + goto end; + } + } + /* always kernel client needs privileged channel */ ce_ctx->ch = gk20a_open_new_channel_with_cb(g, gk20a_ce_finished_ctx_cb, ce_ctx, @@ -475,6 +551,14 @@ u32 gk20a_ce_create_context_with_cb(struct gk20a *g, goto end; } + if (nvgpu_is_enabled(g, NVGPU_MM_CE_TSG_REQUIRED)) { + err = gk20a_tsg_bind_channel(ce_ctx->tsg, ce_ctx->ch); + if (err) { + nvgpu_err(g, "ce: unable to bind to tsg"); + goto end; + } + } + /* allocate gpfifo (1024 should be more than enough) */ err = gk20a_channel_alloc_gpfifo(ce_ctx->ch, 1024, 0, 0); if (err) { diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h index f1f9e260..1dad8952 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h @@ -36,8 +36,8 @@ int gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base); #define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff #define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff -#define 
NVGPU_CE_COMMAND_BUF_SIZE 4096 -#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 128 +#define NVGPU_CE_COMMAND_BUF_SIZE 8192 +#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 256 #define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING 8 typedef void (*ce_event_callback)(u32 ce_ctx_id, u32 ce_event_flag); @@ -108,6 +108,9 @@ struct gk20a_gpu_ctx { int gpu_ctx_state; ce_event_callback user_event_callback; + /* tsg related data */ + struct tsg_gk20a *tsg; + /* channel related data */ struct channel_gk20a *ch; struct vm_gk20a *vm; diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h index c2c73b9c..d4ff17f3 100644 --- a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h @@ -233,6 +233,9 @@ struct gk20a_platform { /* unified or split memory with separate vidmem? */ bool unified_memory; + /* true if all channels must be in TSG */ + bool tsg_required; + /* minimum supported VBIOS version */ u32 vbios_min_version; diff --git a/drivers/gpu/nvgpu/include/nvgpu/enabled.h b/drivers/gpu/nvgpu/include/nvgpu/enabled.h index 41758fe7..8c0bb9d3 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/enabled.h +++ b/drivers/gpu/nvgpu/include/nvgpu/enabled.h @@ -42,6 +42,8 @@ struct gk20a; #define NVGPU_MM_HONORS_APERTURE 17 /* unified or split memory with separate vidmem? */ #define NVGPU_MM_UNIFIED_MEMORY 18 +/* kernel mode ce vidmem clearing channels need to be in a tsg */ +#define NVGPU_MM_CE_TSG_REQUIRED 19 /* * Security flags -- cgit v1.2.2