path: root/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
author     Alex Waterman <alexw@nvidia.com>        2016-04-27 15:27:36 -0400
committer  Terje Bergstrom <tbergstrom@nvidia.com> 2016-06-28 18:49:11 -0400
commit     dfd5ec53fcce4ebae27f78242e6b788350337095 (patch)
tree       073ea380b9ee4734391d381745f57600c3525be5 /drivers/gpu/nvgpu/gk20a/mm_gk20a.c
parent     b30990ea6db564e885d5aee7a1a5ea87a1e5e8ee (diff)
gpu: nvgpu: Revamp semaphore support
Revamp the support the nvgpu driver has for semaphores.

The original problem with nvgpu's semaphore support is that it required a SW based wait for every semaphore release. This was because for every fence that gk20a_channel_semaphore_wait_fd() waited on, a new semaphore was created. This semaphore would then get released by SW when the fence signaled. This meant that for every release there was necessarily a sync_fence_wait_async() call which could block. The latency of this SW wait was enough to cause massive degradation in performance.

To fix this a fast path was implemented. When a fence passed to gk20a_channel_semaphore_wait_fd() is backed by a GPU semaphore, a semaphore acquire is used directly to block the GPU. No sync_fence_wait_async() is performed, nor is an extra semaphore created.

To implement this fast path the semaphore memory had to be shared between channels. Previously, since a new semaphore was created every time through gk20a_channel_semaphore_wait_fd(), the address space a semaphore was mapped into was irrelevant. However, when using the fast path a semaphore may be released in one address space but acquired in another.

Sharing the semaphore memory was done by making a fixed GPU mapping in all channels. This mapping points to the semaphore memory (the so-called semaphore sea). This global fixed mapping is read-only to make sure no semaphores can be incremented (i.e. released) by a malicious channel. Each channel then gets a RW mapping of its own semaphore. This way a channel may only acquire other channels' semaphores but may both acquire and release its own semaphore.

The gk20a fence code was updated to allow introspection of GPU backed fences. This allows detection of when the fast path can be taken. If the fast path cannot be used (for example when a fence is sync-pt backed) the original slow path is still present. This gets used when the GPU needs to wait on an event from something which only understands how to use sync-pts.

Bug 1732449
JIRA DNVGPU-12

Change-Id: Ic0fea74994da5819a771deac726bb0d47a33c2de
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/1133792
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
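As a rough illustration of the fast-path/slow-path split described above, here is a minimal, self-contained userspace sketch. All of the names in it (fence_info, emit_semaphore_acquire(), software_wait_then_release(), wait_on_fence()) are hypothetical stand-ins for illustration only; the real decision lives in gk20a_channel_semaphore_wait_fd() and the gk20a fence code. The point being shown is only the branch: a GPU-semaphore-backed fence becomes a hardware ACQUIRE on the waiting channel, anything else falls back to the software wait.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical, simplified model of the wait-side decision described in the
 * commit message. None of these names are real nvgpu API.
 */
struct fence_info {
        bool backed_by_gpu_semaphore;   /* can the fast path be taken?                */
        uint64_t sema_ro_gpu_va;        /* semaphore address in the shared RO mapping */
        uint32_t release_value;         /* value the releasing channel will write     */
};

/* Fast path: emit a hardware semaphore ACQUIRE; the GPU stalls, no SW wait. */
static int emit_semaphore_acquire(const struct fence_info *f)
{
        printf("ACQUIRE sema @0x%llx until value %u\n",
               (unsigned long long)f->sema_ro_gpu_va, f->release_value);
        return 0;
}

/* Slow path: SW wait on the fence, then release a newly created semaphore. */
static int software_wait_then_release(const struct fence_info *f)
{
        (void)f;
        printf("sync_fence_wait_async() + SW semaphore release\n");
        return 0;
}

static int wait_on_fence(const struct fence_info *f)
{
        if (f->backed_by_gpu_semaphore)
                return emit_semaphore_acquire(f);       /* new fast path      */
        return software_wait_then_release(f);           /* original slow path */
}

int main(void)
{
        struct fence_info gpu_backed = { true, 0x1f000000ull, 42 };
        struct fence_info syncpt_backed = { false, 0, 0 };

        wait_on_fence(&gpu_backed);
        wait_on_fence(&syncpt_backed);
        return 0;
}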
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/mm_gk20a.c')
-rw-r--r--   drivers/gpu/nvgpu/gk20a/mm_gk20a.c   82
1 file changed, 80 insertions, 2 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 3b21e843..9299266f 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -3213,6 +3213,17 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
         struct rb_node *node;
 
         gk20a_dbg_fn("");
+
+        /*
+         * Do this outside of the update_gmmu_lock since unmapping the semaphore
+         * pool involves unmapping a GMMU mapping which means acquiring the
+         * update_gmmu_lock.
+         */
+        if (!gk20a_platform_has_syncpoints(gk20a_from_vm(vm)->dev)) {
+                gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
+                gk20a_semaphore_pool_put(vm->sema_pool);
+        }
+
         mutex_lock(&vm->update_gmmu_lock);
 
         /* TBD: add a flag here for the unmap code to recognize teardown
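The ordering in the hunk above matters because, per the comment, unmapping the semaphore pool itself ends up taking update_gmmu_lock; doing it after the mutex_lock() below would relock the same mutex on the same thread. A minimal userspace sketch of that hazard (plain pthreads, nothing nvgpu-specific; the function name is illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for the pool-unmap path, which needs update_lock itself. */
static void unmap_sema_pool(void)
{
        pthread_mutex_lock(&update_lock);
        /* ... tear down the GMMU mapping ... */
        pthread_mutex_unlock(&update_lock);
}

int main(void)
{
        /* Correct ordering: unmap the pool before taking the lock. */
        unmap_sema_pool();

        pthread_mutex_lock(&update_lock);
        /*
         * Calling unmap_sema_pool() here instead would relock update_lock
         * on the same thread; with a non-recursive mutex (as in the kernel)
         * that is a self-deadlock.
         */
        pthread_mutex_unlock(&update_lock);

        printf("teardown ordering ok\n");
        return 0;
}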
@@ -3286,6 +3297,64 @@ const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
         {.update_entry = NULL}
 };
 
+/*
+ * Initialize a semaphore pool. Just return successfully if we do not need
+ * semaphores (i.e. when sync-pts are active).
+ */
+int gk20a_init_sema_pool(struct vm_gk20a *vm)
+{
+        struct gk20a_semaphore_sea *sema_sea;
+        struct mm_gk20a *mm = vm->mm;
+        struct gk20a *g = mm->g;
+        int err;
+
+        /*
+         * Don't waste the memory on semaphores if we don't need them.
+         */
+        if (gk20a_platform_has_syncpoints(g->dev))
+                return 0;
+
+        if (vm->sema_pool)
+                return 0;
+
+        sema_sea = gk20a_semaphore_sea_create(g);
+        if (!sema_sea)
+                return -ENOMEM;
+
+        vm->sema_pool = gk20a_semaphore_pool_alloc(sema_sea);
+        if (!vm->sema_pool) {
+                gk20a_vm_put(vm);
+                return -ENOMEM;
+        }
+
+        /*
+         * Allocate a chunk of GPU VA space for mapping the semaphores. We will
+         * do a fixed alloc in the kernel VM so that all channels have the same
+         * RO address range for the semaphores.
+         *
+         * !!! TODO: cleanup.
+         */
+        sema_sea->gpu_va = gk20a_balloc_fixed(&vm->vma[gmmu_page_size_kernel],
+                                              vm->va_limit -
+                                              mm->channel.kernel_size,
+                                              512 * PAGE_SIZE);
+        if (!sema_sea->gpu_va) {
+                gk20a_bfree(&vm->vma[gmmu_page_size_small], sema_sea->gpu_va);
+                gk20a_vm_put(vm);
+                return -ENOMEM;
+        }
+
+        err = gk20a_semaphore_pool_map(vm->sema_pool, vm);
+        if (err) {
+                gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
+                gk20a_bfree(&vm->vma[gmmu_page_size_small],
+                            vm->sema_pool->gpu_va);
+                gk20a_vm_put(vm);
+        }
+
+        return 0;
+}
+
 int gk20a_init_vm(struct mm_gk20a *mm,
                   struct vm_gk20a *vm,
                   u32 big_page_size,
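Because the VA window above is reserved at a fixed location in every channel's address space, a semaphore can be named by its offset into the sea, and the resulting read-only address is the same in every channel. A toy, self-contained model of that addressing follows; the constant, struct, and helper are purely illustrative, not nvgpu definitions:

#include <stdint.h>
#include <stdio.h>

/*
 * Toy model of the "semaphore sea" addressing scheme: every VM reserves the
 * same fixed GPU-VA window (read-only) for the shared semaphore memory, so
 * an offset into the sea names the same semaphore in every channel.
 */
#define SEMA_SEA_FIXED_GPU_VA 0x0000007ffff00000ull    /* hypothetical window */

struct sema {
        uint64_t offset;        /* byte offset of this semaphore within the sea */
};

/* RO address any channel can use to acquire (wait on) the semaphore. */
static uint64_t sema_ro_gpu_va(const struct sema *s)
{
        return SEMA_SEA_FIXED_GPU_VA + s->offset;
}

int main(void)
{
        struct sema s = { .offset = 0x40 };

        /* Identical in every channel's address space, by construction. */
        printf("acquire address: 0x%llx\n",
               (unsigned long long)sema_ro_gpu_va(&s));
        return 0;
}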
@@ -3317,9 +3386,7 @@ int gk20a_init_vm(struct mm_gk20a *mm,
         vm->big_pages = big_pages;
 
         vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
-
         vm->userspace_managed = userspace_managed;
-
         vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g,
                                                 vm->big_page_size);
 
@@ -3465,6 +3532,17 @@ int gk20a_init_vm(struct mm_gk20a *mm,
         kref_init(&vm->ref);
         INIT_LIST_HEAD(&vm->reserved_va_list);
 
+        /*
+         * This is only necessary for channel address spaces. The best way to
+         * distinguish channel address spaces from other address spaces is by
+         * size - if the address space is 4GB or less, it's not a channel.
+         */
+        if (vm->va_limit > SZ_4G) {
+                err = gk20a_init_sema_pool(vm);
+                if (err)
+                        goto clean_up_big_allocator;
+        }
+
         return 0;
 
 clean_up_big_allocator: