From dfd5ec53fcce4ebae27f78242e6b788350337095 Mon Sep 17 00:00:00 2001
From: Alex Waterman
Date: Wed, 27 Apr 2016 12:27:36 -0700
Subject: gpu: nvgpu: Revamp semaphore support

Revamp the support the nvgpu driver has for semaphores.

The original problem with nvgpu's semaphore support is that it required
a SW-based wait for every semaphore release. This was because, for
every fence that gk20a_channel_semaphore_wait_fd() waited on, a new
semaphore was created. This semaphore would then get released by SW
when the fence signaled. This meant that for every release there was
necessarily a sync_fence_wait_async() call which could block. The
latency of this SW wait was enough to cause massive degradation in
performance.

To fix this a fast path was implemented. When a fence that is backed by
a GPU semaphore is passed to gk20a_channel_semaphore_wait_fd(), a
semaphore acquire is directly used to block the GPU. No longer is a
sync_fence_wait_async() performed, nor is there an extra semaphore
created.

To implement this fast path the semaphore memory had to be shared
between channels. Previously, since a new semaphore was created every
time through gk20a_channel_semaphore_wait_fd(), the address space a
semaphore was mapped into was irrelevant. However, when using the fast
path a semaphore may be released in one address space but acquired in
another.

Sharing the semaphore memory was done by making a fixed GPU mapping in
all channels. This mapping points to the semaphore memory (the
so-called semaphore sea). This global fixed mapping is read-only to
make sure no semaphores can be incremented (i.e. released) by a
malicious channel. Each channel then gets an RW mapping of its own
semaphore. This way a channel may only acquire other channels'
semaphores but may both acquire and release its own semaphore.

The gk20a fence code was updated to allow introspection of the GPU
backed fences. This allows detection of when the fast path can be
taken. If the fast path cannot be used (for example when a fence is
sync-pt backed) the original slow path is still present. This gets used
when the GPU needs to wait on an event from something which only
understands how to use sync-pts.

Bug 1732449
JIRA DNVGPU-12

Change-Id: Ic0fea74994da5819a771deac726bb0d47a33c2de
Signed-off-by: Alex Waterman
Reviewed-on: http://git-master/r/1133792
Reviewed-by: Terje Bergstrom
Tested-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 82 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 80 insertions(+), 2 deletions(-)

(limited to 'drivers/gpu/nvgpu/gk20a/mm_gk20a.c')

diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 3b21e843..9299266f 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -3213,6 +3213,17 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
 	struct rb_node *node;
 
 	gk20a_dbg_fn("");
+
+	/*
+	 * Do this outside of the update_gmmu_lock since unmapping the semaphore
+	 * pool involves unmapping a GMMU mapping which means acquiring the
+	 * update_gmmu_lock.
+	 */
+	if (!gk20a_platform_has_syncpoints(gk20a_from_vm(vm)->dev)) {
+		gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
+		gk20a_semaphore_pool_put(vm->sema_pool);
+	}
+
 	mutex_lock(&vm->update_gmmu_lock);
 
 	/* TBD: add a flag here for the unmap code to recognize teardown
@@ -3286,6 +3297,64 @@ const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
 	{.update_entry = NULL}
 };
 
+/*
+ * Initialize a semaphore pool. Just return successfully if we do not need
+ * semaphores (i.e. when sync-pts are active).
+ */
+int gk20a_init_sema_pool(struct vm_gk20a *vm)
+{
+	struct gk20a_semaphore_sea *sema_sea;
+	struct mm_gk20a *mm = vm->mm;
+	struct gk20a *g = mm->g;
+	int err;
+
+	/*
+	 * Don't waste the memory on semaphores if we don't need them.
+	 */
+	if (gk20a_platform_has_syncpoints(g->dev))
+		return 0;
+
+	if (vm->sema_pool)
+		return 0;
+
+	sema_sea = gk20a_semaphore_sea_create(g);
+	if (!sema_sea)
+		return -ENOMEM;
+
+	vm->sema_pool = gk20a_semaphore_pool_alloc(sema_sea);
+	if (!vm->sema_pool) {
+		gk20a_vm_put(vm);
+		return -ENOMEM;
+	}
+
+	/*
+	 * Allocate a chunk of GPU VA space for mapping the semaphores. We will
+	 * do a fixed alloc in the kernel VM so that all channels have the same
+	 * RO address range for the semaphores.
+	 *
+	 * !!! TODO: cleanup.
+	 */
+	sema_sea->gpu_va = gk20a_balloc_fixed(&vm->vma[gmmu_page_size_kernel],
+					      vm->va_limit -
+					      mm->channel.kernel_size,
+					      512 * PAGE_SIZE);
+	if (!sema_sea->gpu_va) {
+		gk20a_bfree(&vm->vma[gmmu_page_size_small], sema_sea->gpu_va);
+		gk20a_vm_put(vm);
+		return -ENOMEM;
+	}
+
+	err = gk20a_semaphore_pool_map(vm->sema_pool, vm);
+	if (err) {
+		gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
+		gk20a_bfree(&vm->vma[gmmu_page_size_small],
+			    vm->sema_pool->gpu_va);
+		gk20a_vm_put(vm);
+	}
+
+	return 0;
+}
+
 int gk20a_init_vm(struct mm_gk20a *mm,
 		struct vm_gk20a *vm,
 		u32 big_page_size,
@@ -3317,9 +3386,7 @@ int gk20a_init_vm(struct mm_gk20a *mm,
 	vm->big_pages = big_pages;
 
 	vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
-
 	vm->userspace_managed = userspace_managed;
-
 	vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g,
 			vm->big_page_size);
 
@@ -3465,6 +3532,17 @@ int gk20a_init_vm(struct mm_gk20a *mm,
 	kref_init(&vm->ref);
 	INIT_LIST_HEAD(&vm->reserved_va_list);
 
+	/*
+	 * This is only necessary for channel address spaces. The best way to
+	 * distinguish channel address spaces from other address spaces is by
+	 * size - if the address space is 4GB or less, it's not a channel.
+	 */
+	if (vm->va_limit > SZ_4G) {
+		err = gk20a_init_sema_pool(vm);
+		if (err)
+			goto clean_up_big_allocator;
+	}
+
	return 0;
 
 clean_up_big_allocator:
-- 
cgit v1.2.2
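
As a rough illustration of the wait path described in the commit message, the
following standalone C sketch models the decision between the new fast path
and the original slow path: a fence backed by a GPU semaphore turns directly
into a HW semaphore acquire, anything else falls back to the
sync_fence_wait_async()-style SW wait. None of the types or helpers below
(struct sketch_fence, emit_hw_semaphore_acquire, ...) are nvgpu code; they are
hypothetical names used only to make the control flow concrete.

/*
 * Hypothetical sketch, not nvgpu code: models the fast/slow path split in
 * gk20a_channel_semaphore_wait_fd() as described in the commit message.
 */
#include <stdio.h>

enum fence_backing { FENCE_SYNCPT, FENCE_GPU_SEMAPHORE };

struct sketch_fence {
	enum fence_backing backing;
	unsigned long sema_gpu_va;	/* RO address inside the semaphore sea */
	unsigned int threshold;		/* value the acquire waits for */
};

/* Fast path: no CPU wait, the GPU stalls until the producer releases. */
static void emit_hw_semaphore_acquire(const struct sketch_fence *f)
{
	printf("ACQUIRE sema @0x%lx until value >= %u\n",
	       f->sema_gpu_va, f->threshold);
}

/* Slow path: CPU waits on the fence, then releases a fresh semaphore. */
static void queue_sw_wait_and_release(const struct sketch_fence *f)
{
	(void)f;
	printf("sync_fence_wait_async() + SW semaphore release\n");
}

static void wait_fd_sketch(const struct sketch_fence *f)
{
	if (f->backing == FENCE_GPU_SEMAPHORE)
		emit_hw_semaphore_acquire(f);	/* new fast path */
	else
		queue_sw_wait_and_release(f);	/* original slow path */
}

int main(void)
{
	struct sketch_fence gpu_backed = {
		.backing = FENCE_GPU_SEMAPHORE,
		.sema_gpu_va = 0x1f000000,
		.threshold = 42,
	};
	struct sketch_fence syncpt_backed = { .backing = FENCE_SYNCPT };

	wait_fd_sketch(&gpu_backed);
	wait_fd_sketch(&syncpt_backed);
	return 0;
}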
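
The fixed allocation performed by gk20a_balloc_fixed() in the diff is what
makes the fast path safe across address spaces: every channel VM reserves the
same 512-page range at va_limit - kernel_size for the read-only semaphore sea.
The standalone sketch below shows only that arithmetic, with purely
illustrative constants and types (nothing here is taken from nvgpu or real
hardware): a semaphore at a given offset in the sea resolves to the identical
GPU VA in any channel VM, so an acquire programmed in one channel can wait on
a release done in another.

/*
 * Hypothetical sketch, not nvgpu code: the fixed semaphore-sea address is the
 * same in every channel VM because it is derived from the same layout values.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE	4096ULL
#define SEA_PAGES		512ULL

struct sketch_vm {
	uint64_t va_limit;	/* top of the channel's GPU address space */
	uint64_t kernel_size;	/* kernel-reserved VA carved out at the top */
};

/* Fixed, identical base address of the RO semaphore sea in a channel VM. */
static uint64_t sema_sea_base(const struct sketch_vm *vm)
{
	return vm->va_limit - vm->kernel_size;
}

int main(void)
{
	/* Two different channel VMs that share the same layout parameters. */
	struct sketch_vm chan_a = { .va_limit = 1ULL << 37,
				    .kernel_size = 1ULL << 32 };
	struct sketch_vm chan_b = chan_a;
	uint64_t offset = 3 * SKETCH_PAGE_SIZE;	/* some semaphore's offset */

	printf("sea size: %llu bytes\n", SEA_PAGES * SKETCH_PAGE_SIZE);
	printf("chan A sees the semaphore at 0x%llx\n",
	       (unsigned long long)(sema_sea_base(&chan_a) + offset));
	printf("chan B sees the semaphore at 0x%llx\n",
	       (unsigned long long)(sema_sea_base(&chan_b) + offset));
	return 0;
}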