From 67fe5f6d738a9b1ba2d8f5dd0726790d965c8883 Mon Sep 17 00:00:00 2001
From: Deepak Nibade <dnibade@nvidia.com>
Date: Mon, 26 Oct 2015 18:47:55 +0530
Subject: gpu: nvgpu: remove temporary gpfifo allocation in submit path

In GPU job submit path gk20a_ioctl_channel_submit_gpfifo(),
we currently allocate a temporary gpfifo, copy user space
gpfifo content into this temporary buffer, and then copy
temp buffer content into channel's gpfifo.

Allocation/copy/free of the temporary buffer adds additional
overhead.

Rewrite this sequence such that gk20a_submit_channel_gpfifo()
can receive either a pre-filled gpfifo or a pointer to
user-provided args.
We can then directly copy the user-provided gpfifo
into the channel's gpfifo.

Also, if command buffer tracing is enabled, we still need
to copy the user-provided gpfifo into a temporary buffer for reading.
But that should not cause overhead in real-world use cases.

Bug 200141116

Change-Id: I7166c9271da2694059da9853ab8839e98457b941
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/823386
(cherry picked from commit 3e0702db006c262dd8737a567b8e06f7ff005e2c)
Reviewed-on: http://git-master/r/835799
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
---
 drivers/gpu/nvgpu/gk20a/cde_gk20a.c     |   2 +-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 132 +++++++++++++++++++++++---------
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h |   1 +
 3 files changed, 97 insertions(+), 38 deletions(-)

(limited to 'drivers/gpu')

diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index 01fca058..e1edec2a 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -723,7 +723,7 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
 		return -ENOSYS;
 	}
 
-	return gk20a_submit_channel_gpfifo(cde_ctx->ch, gpfifo,
+	return gk20a_submit_channel_gpfifo(cde_ctx->ch, gpfifo, NULL,
 					   num_entries, flags, fence, fence_out);
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 4cf3beec..0b84b7da 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1551,14 +1551,42 @@ static void trace_write_pushbuffer(struct channel_gk20a *c,
 
 static void trace_write_pushbuffer_range(struct channel_gk20a *c,
 					 struct nvgpu_gpfifo *g,
+					 struct nvgpu_submit_gpfifo_args *args,
+					 int offset,
 					 int count)
 {
-	if (gk20a_debug_trace_cmdbuf) {
-		int i;
-		struct nvgpu_gpfifo *gp = g;
-		for (i = 0; i < count; i++, gp++)
-			trace_write_pushbuffer(c, gp);
+	u32 size;
+	int i;
+	struct nvgpu_gpfifo *gp;
+	bool gpfifo_allocated = false;
+
+	if (!gk20a_debug_trace_cmdbuf)
+		return;
+
+	if (!g && !args)
+		return;
+
+	if (!g) {
+		size = args->num_entries * sizeof(struct nvgpu_gpfifo);
+		if (size) {
+			g = nvgpu_alloc(size, false);
+			if (!g)
+				return;
+
+			if (copy_from_user(g,
+				(void __user *)(uintptr_t)args->gpfifo,	size)) {
+				return;
+			}
+		}
+		gpfifo_allocated = true;
 	}
+
+	gp = g + offset;
+	for (i = 0; i < count; i++, gp++)
+		trace_write_pushbuffer(c, gp);
+
+	if (gpfifo_allocated)
+		nvgpu_free(g);
 }
 
 static void gk20a_channel_timeout_start(struct channel_gk20a *ch,
@@ -1810,6 +1838,7 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
 
 int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 				struct nvgpu_gpfifo *gpfifo,
+				struct nvgpu_submit_gpfifo_args *args,
 				u32 num_entries,
 				u32 flags,
 				struct nvgpu_fence *fence,
@@ -1842,6 +1871,9 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		return -ENOMEM;
 	}
 
+	if (!gpfifo && !args)
+		return -EINVAL;
+
 	if ((flags & (NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT |
 		      NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) &&
 	    !fence)
@@ -1986,24 +2018,69 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	start = c->gpfifo.put;
 	end = start + num_entries;
 
-	if (end > c->gpfifo.entry_num) {
-		int length0 = c->gpfifo.entry_num - start;
-		int length1 = num_entries - length0;
+	if (gpfifo) {
+		if (end > c->gpfifo.entry_num) {
+			int length0 = c->gpfifo.entry_num - start;
+			int length1 = num_entries - length0;
 
-		memcpy(gpfifo_mem + start, gpfifo,
-		       length0 * sizeof(*gpfifo));
+			memcpy(gpfifo_mem + start, gpfifo,
+			       length0 * sizeof(*gpfifo));
 
-		memcpy(gpfifo_mem, gpfifo + length0,
-		       length1 * sizeof(*gpfifo));
+			memcpy(gpfifo_mem, gpfifo + length0,
+			       length1 * sizeof(*gpfifo));
+
+			trace_write_pushbuffer_range(c, gpfifo, NULL,
+					0, length0);
+			trace_write_pushbuffer_range(c, gpfifo, NULL,
+					length0, length1);
+		} else {
+			memcpy(gpfifo_mem + start, gpfifo,
+			       num_entries * sizeof(*gpfifo));
 
-		trace_write_pushbuffer_range(c, gpfifo, length0);
-		trace_write_pushbuffer_range(c, gpfifo + length0, length1);
+			trace_write_pushbuffer_range(c, gpfifo, NULL,
+					0, num_entries);
+		}
 	} else {
-		memcpy(gpfifo_mem + start, gpfifo,
-		       num_entries * sizeof(*gpfifo));
+		struct nvgpu_gpfifo __user *user_gpfifo =
+			(struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo;
+		if (end > c->gpfifo.entry_num) {
+			int length0 = c->gpfifo.entry_num - start;
+			int length1 = num_entries - length0;
+
+			err = copy_from_user(gpfifo_mem + start,
+				user_gpfifo,
+				length0 * sizeof(*user_gpfifo));
+			if (err) {
+				mutex_unlock(&c->submit_lock);
+				goto clean_up;
+			}
+
+			err = copy_from_user(gpfifo_mem,
+				user_gpfifo + length0,
+				length1 * sizeof(*user_gpfifo));
+			if (err) {
+				mutex_unlock(&c->submit_lock);
+				goto clean_up;
+			}
 
-		trace_write_pushbuffer_range(c, gpfifo, num_entries);
+			trace_write_pushbuffer_range(c, NULL, args,
+					0, length0);
+			trace_write_pushbuffer_range(c, NULL, args,
+					length0, length1);
+		} else {
+			err = copy_from_user(gpfifo_mem + start,
+				user_gpfifo,
+				num_entries * sizeof(*user_gpfifo));
+			if (err) {
+				mutex_unlock(&c->submit_lock);
+				goto clean_up;
+			}
+
+			trace_write_pushbuffer_range(c, NULL, args,
+					0, num_entries);
+		}
 	}
+
 	c->gpfifo.put = (c->gpfifo.put + num_entries) &
 		(c->gpfifo.entry_num - 1);
 
@@ -2501,8 +2578,6 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 	struct nvgpu_submit_gpfifo_args *args)
 {
 	struct gk20a_fence *fence_out;
-	void *gpfifo = NULL;
-	u32 size;
 	int ret = 0;
 
 	gk20a_dbg_fn("");
@@ -2510,23 +2585,7 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 	if (ch->has_timedout)
 		return -ETIMEDOUT;
 
-	/* zero-sized submits are allowed, since they can be used for
-	 * synchronization; we might still wait and do an increment */
-	size = args->num_entries * sizeof(struct nvgpu_gpfifo);
-	if (size) {
-		gpfifo = nvgpu_alloc(size, false);
-		if (!gpfifo)
-			return -ENOMEM;
-
-		if (copy_from_user(gpfifo,
-					(void __user *)(uintptr_t)args->gpfifo,
-					size)) {
-			ret = -EINVAL;
-			goto clean_up;
-		}
-	}
-
-	ret = gk20a_submit_channel_gpfifo(ch, gpfifo, args->num_entries,
+	ret = gk20a_submit_channel_gpfifo(ch, NULL, args, args->num_entries,
 					  args->flags, &args->fence,
 					  &fence_out);
 
@@ -2549,7 +2608,6 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 	gk20a_fence_put(fence_out);
 
 clean_up:
-	nvgpu_free(gpfifo);
 	return ret;
 }
 
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index e7809daa..d5f5e6a2 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -237,6 +237,7 @@ void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a);
 
 int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 				struct nvgpu_gpfifo *gpfifo,
+				struct nvgpu_submit_gpfifo_args *args,
 				u32 num_entries,
 				u32 flags,
 				struct nvgpu_fence *fence,
-- 
cgit v1.2.2