From 52753b51f1dbf51221d7856a9288aad1ab2d351a Mon Sep 17 00:00:00 2001
From: Deepak Nibade
Date: Wed, 7 Oct 2015 16:20:07 +0530
Subject: gpu: nvgpu: create sync_fence only if needed

Currently, we create a sync_fence (via nvhost_sync_create_fence()) for
every submit, but not all submits request a sync_fence. The
nvhost_sync_create_fence() call also accounts for about a third of the
total submit path.

Hence, as an optimization, allocate a sync_fence only when userspace
explicitly asks for it by setting both
NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET and
NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE.

Also, the CDE path from gk20a_prepare_compressible_read() reuses the
existing fence stored in "state", which can result in no sync_fence_fd
being returned even though the user asked for one. Hence, force
allocation of the sync_fence when the job submission comes from the
CDE path.

Bug 200141116

Change-Id: Ia921701bf0e2432d6b8a5e8b7d91160e7f52db1e
Signed-off-by: Deepak Nibade
Reviewed-on: http://git-master/r/812845
(cherry picked from commit 5fd47015eeed00352cc8473eff969a66c94fee98)
Reviewed-on: http://git-master/r/837662
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Sachin Nikam
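
In effect, the submit path below now decides whether to allocate a
sync_fence according to this sketch (a distillation of the diff; the
flag values shown are illustrative, the real definitions live in the
nvgpu UAPI header):

	#include <stdbool.h>
	#include <stdint.h>

	/* Illustrative values; see the nvgpu UAPI header for the real ones. */
	#define NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET	(1u << 1)
	#define NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE	(1u << 3)

	/*
	 * A sync_fence is allocated only when the caller forces it (the
	 * CDE path passes force_need_sync_fence = true) or when userspace
	 * asks for a post-fence back (FENCE_GET) and wants it delivered as
	 * a sync_fence_fd (SYNC_FENCE) rather than as a raw syncpoint
	 * id/value pair.
	 */
	static bool submit_needs_sync_fence(uint32_t flags,
					    bool force_need_sync_fence)
	{
		if (force_need_sync_fence)
			return true;

		return (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) &&
		       (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE);
	}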
---
 drivers/gpu/nvgpu/gk20a/cde_gk20a.c          |  2 +-
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c      | 20 +++++++++++++----
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h      |  3 ++-
 drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c | 32 +++++++++++++++++-----------
 drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h |  8 ++++---
 drivers/gpu/nvgpu/gk20a/fence_gk20a.c        | 13 ++++++-----
 drivers/gpu/nvgpu/gk20a/fence_gk20a.h        |  5 +++--
 7 files changed, 54 insertions(+), 29 deletions(-)

(limited to 'drivers/gpu/nvgpu/gk20a')

diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index e1edec2a..a2f7e7a4 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -724,7 +724,7 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
 	}
 
 	return gk20a_submit_channel_gpfifo(cde_ctx->ch, gpfifo, NULL,
-					   num_entries, flags, fence, fence_out);
+					   num_entries, flags, fence, fence_out, true);
 }
 
 static void gk20a_cde_ctx_release(struct gk20a_cde_ctx *cde_ctx)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 59c3e31d..98c8760e 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1830,7 +1830,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 				u32 num_entries,
 				u32 flags,
 				struct nvgpu_fence *fence,
-				struct gk20a_fence **fence_out)
+				struct gk20a_fence **fence_out,
+				bool force_need_sync_fence)
 {
 	struct gk20a *g = c->g;
 	struct device *d = dev_from_gk20a(g);
@@ -1848,6 +1849,14 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 	struct nvgpu_gpfifo *gpfifo_mem = c->gpfifo.mem.cpu_va;
 	bool skip_buffer_refcounting = (flags &
 			NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING);
+	bool need_sync_fence = false;
+
+	/*
+	 * If user wants to allocate sync_fence_fd always, then respect that;
+	 * otherwise, allocate sync_fence_fd based on user flags only
+	 */
+	if (force_need_sync_fence)
+		need_sync_fence = true;
 
 	if (c->has_timedout)
 		return -ETIMEDOUT;
@@ -1970,15 +1979,18 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 		goto clean_up;
 	}
 
+	if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) &&
+			(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE))
+		need_sync_fence = true;
 	/* always insert syncpt increment at end of gpfifo submission
 	   to keep track of method completion for idle railgating */
 	if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
 		err = c->sync->incr_user(c->sync, wait_fence_fd, &incr_cmd,
-				&post_fence, need_wfi);
+				&post_fence, need_wfi, need_sync_fence);
 	else
 		err = c->sync->incr(c->sync, &incr_cmd,
-				&post_fence);
+				&post_fence, need_sync_fence);
 	if (err) {
 		mutex_unlock(&c->submit_lock);
 		goto clean_up;
 	}
@@ -2578,7 +2590,7 @@ static int gk20a_ioctl_channel_submit_gpfifo(
 
 	ret = gk20a_submit_channel_gpfifo(ch, NULL, args,
 					  args->num_entries,
 					  args->flags, &args->fence,
-					  &fence_out);
+					  &fence_out, false);
 	if (ret)
 		goto clean_up;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 55528dd9..d3428788 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -244,7 +244,8 @@ int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
 				u32 num_entries,
 				u32 flags,
 				struct nvgpu_fence *fence,
-				struct gk20a_fence **fence_out);
+				struct gk20a_fence **fence_out,
+				bool force_need_sync_fence);
 
 int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
 			       struct nvgpu_alloc_gpfifo_args *args);
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index 95647774..c0c8ec6d 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -166,7 +166,8 @@ static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
 				       bool wfi_cmd,
 				       bool register_irq,
 				       struct priv_cmd_entry **entry,
-				       struct gk20a_fence **fence)
+				       struct gk20a_fence **fence,
+				       bool need_sync_fence)
 {
 	u32 thresh;
 	int incr_cmd_size;
@@ -239,7 +240,7 @@ static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
 	}
 
 	*fence = gk20a_fence_from_syncpt(sp->host1x_pdev, sp->id, thresh,
-					 wfi_cmd);
+					 wfi_cmd, need_sync_fence);
 	*entry = incr_cmd;
 	return 0;
 }
@@ -251,33 +252,35 @@ static int gk20a_channel_syncpt_incr_wfi(struct gk20a_channel_sync *s,
 	return __gk20a_channel_syncpt_incr(s,
 			true /* wfi */,
 			false /* no irq handler */,
-			entry, fence);
+			entry, fence, true);
 }
 
 static int gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
 			      struct priv_cmd_entry **entry,
-			      struct gk20a_fence **fence)
+			      struct gk20a_fence **fence,
+			      bool need_sync_fence)
 {
 	/* Don't put wfi cmd to this one since we're not returning
 	 * a fence to user space. */
 	return __gk20a_channel_syncpt_incr(s,
 			false /* no wfi */,
 			true /* register irq */,
-			entry, fence);
+			entry, fence, need_sync_fence);
 }
 
 static int gk20a_channel_syncpt_incr_user(struct gk20a_channel_sync *s,
 				  int wait_fence_fd,
 				  struct priv_cmd_entry **entry,
 				  struct gk20a_fence **fence,
-				  bool wfi)
+				  bool wfi,
+				  bool need_sync_fence)
 {
 	/* Need to do 'wfi + host incr' since we return the fence
 	 * to user space.
 	 */
 	return __gk20a_channel_syncpt_incr(s,
 			wfi,
 			true /* register irq */,
-			entry, fence);
+			entry, fence, need_sync_fence);
 }
 
 static void gk20a_channel_syncpt_set_min_eq_max(struct gk20a_channel_sync *s)
@@ -513,7 +516,8 @@ static int __gk20a_channel_semaphore_incr(
 		struct gk20a_channel_sync *s, bool wfi_cmd,
 		struct sync_fence *dependency,
 		struct priv_cmd_entry **entry,
-		struct gk20a_fence **fence)
+		struct gk20a_fence **fence,
+		bool need_sync_fence)
 {
 	u64 va;
 	int incr_cmd_size;
@@ -560,18 +564,19 @@ static int gk20a_channel_semaphore_incr_wfi(
 	return __gk20a_channel_semaphore_incr(s,
 			true /* wfi */,
 			NULL,
-			entry, fence);
+			entry, fence, true);
 }
 
 static int gk20a_channel_semaphore_incr(
 		struct gk20a_channel_sync *s,
 		struct priv_cmd_entry **entry,
-		struct gk20a_fence **fence)
+		struct gk20a_fence **fence,
+		bool need_sync_fence)
 {
 	/* Don't put wfi cmd to this one since we're not returning
 	 * a fence to user space. */
 	return __gk20a_channel_semaphore_incr(s, false /* no wfi */,
-			NULL, entry, fence);
+			NULL, entry, fence, need_sync_fence);
 }
 
 static int gk20a_channel_semaphore_incr_user(
@@ -579,7 +584,8 @@ static int gk20a_channel_semaphore_incr_user(
 		int wait_fence_fd,
 		struct priv_cmd_entry **entry,
 		struct gk20a_fence **fence,
-		bool wfi)
+		bool wfi,
+		bool need_sync_fence)
 {
 #ifdef CONFIG_SYNC
 	struct sync_fence *dependency = NULL;
@@ -592,7 +598,7 @@ static int gk20a_channel_semaphore_incr_user(
 	}
 
 	err = __gk20a_channel_semaphore_incr(s, wfi, dependency,
-					     entry, fence);
+					     entry, fence, need_sync_fence);
 	if (err) {
 		if (dependency)
 			sync_fence_put(dependency);
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h
index a347cbab..618e1b26 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h
@@ -3,7 +3,7 @@
  *
  * GK20A Channel Synchronization Abstraction
  *
- * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -54,7 +54,8 @@ struct gk20a_channel_sync {
 	 */
 	int (*incr)(struct gk20a_channel_sync *s,
 		    struct priv_cmd_entry **entry,
-		    struct gk20a_fence **fence);
+		    struct gk20a_fence **fence,
+		    bool need_sync_fence);
 
 	/* Increment syncpoint/semaphore, preceded by a wfi.
 	 * Returns
@@ -76,7 +77,8 @@ struct gk20a_channel_sync {
 			 int wait_fence_fd,
 			 struct priv_cmd_entry **entry,
 			 struct gk20a_fence **fence,
-			 bool wfi);
+			 bool wfi,
+			 bool need_sync_fence);
 
 	/* Reset the channel syncpoint/semaphore.
 	 */
 	void (*set_min_eq_max)(struct gk20a_channel_sync *s);
diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
index 54a288cd..ae19d36f 100644
--- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
@@ -194,7 +194,8 @@ static const struct gk20a_fence_ops gk20a_syncpt_fence_ops = {
 };
 
 struct gk20a_fence *gk20a_fence_from_syncpt(struct platform_device *host1x_pdev,
-					    u32 id, u32 value, bool wfi)
+					    u32 id, u32 value, bool wfi,
+					    bool need_sync_fence)
 {
 	struct gk20a_fence *f;
 	struct sync_fence *sync_fence = NULL;
@@ -205,10 +206,12 @@ struct gk20a_fence *gk20a_fence_from_syncpt(struct platform_device *host1x_pdev,
 		.thresh = value
 	};
 
-	sync_fence = nvhost_sync_create_fence(host1x_pdev, &pt, 1,
-					      "fence");
-	if (IS_ERR(sync_fence))
-		return NULL;
+	if (need_sync_fence) {
+		sync_fence = nvhost_sync_create_fence(host1x_pdev, &pt, 1,
+						      "fence");
+		if (IS_ERR(sync_fence))
+			return NULL;
+	}
 #endif
 
 	f = alloc_fence(&gk20a_syncpt_fence_ops, sync_fence, wfi);
diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.h b/drivers/gpu/nvgpu/gk20a/fence_gk20a.h
index 629dc694..75e135e9 100644
--- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.h
@@ -3,7 +3,7 @@
  *
  * GK20A Fences
  *
- * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -56,7 +56,8 @@ struct gk20a_fence *gk20a_fence_from_semaphore(
 
 struct gk20a_fence *gk20a_fence_from_syncpt(
 		struct platform_device *host1x_pdev,
-		u32 id, u32 value, bool wfi);
+		u32 id, u32 value, bool wfi,
+		bool need_sync_fence);
 
 /* Fence operations */
 void gk20a_fence_put(struct gk20a_fence *f);
--
cgit v1.2.2
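
For reference, a user-space submission that actually gets a
sync_fence_fd back would look roughly like the sketch below. This is an
illustration against the nvgpu UAPI of this era, not part of the change:
the header name, struct layout, and the convention that the installed fd
comes back in args.fence.id are assumptions to verify against the
matching linux/nvgpu.h.

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/nvgpu.h>	/* assumed UAPI header for this tree */

	/*
	 * Submit 'num_entries' gpfifo entries on an open channel fd and
	 * request the post-fence as an Android sync_fence fd by setting
	 * both FENCE_GET and SYNC_FENCE. Returns the fd, or -1 on error.
	 */
	static int submit_and_get_sync_fd(int ch_fd, uint64_t gpfifo_va,
					  uint32_t num_entries)
	{
		struct nvgpu_submit_gpfifo_args args = {
			.gpfifo = gpfifo_va,
			.num_entries = num_entries,
			.flags = NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET |
				 NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE,
		};

		if (ioctl(ch_fd, NVGPU_IOCTL_CHANNEL_SUBMIT_GPFIFO, &args) < 0)
			return -1;

		/* assumed: fence.id carries the sync_fence_fd on return */
		return (int)args.fence.id;
	}

Without FLAGS_SYNC_FENCE, the same submit would return a raw syncpoint
id/value pair in args.fence instead, and no sync_fence would be
allocated in the kernel, which is exactly the cost this patch avoids.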