path: root/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
author     Alex Waterman <alexw@nvidia.com>        2016-04-27 15:27:36 -0400
committer  Terje Bergstrom <tbergstrom@nvidia.com> 2016-06-28 18:49:11 -0400
commit     dfd5ec53fcce4ebae27f78242e6b788350337095 (patch)
tree       073ea380b9ee4734391d381745f57600c3525be5 /drivers/gpu/nvgpu/gk20a/mm_gk20a.c
parent     b30990ea6db564e885d5aee7a1a5ea87a1e5e8ee (diff)
gpu: nvgpu: Revamp semaphore support
Revamp the support the nvgpu driver has for semaphores.

The original problem with nvgpu's semaphore support is that it required a SW based wait for every semaphore release. This was because for every fence that gk20a_channel_semaphore_wait_fd() waited on, a new semaphore was created. This semaphore would then get released by SW when the fence signaled. This meant that for every release there was necessarily a sync_fence_wait_async() call which could block. The latency of this SW wait was enough to cause massive degradation in performance.

To fix this a fast path was implemented. When a fence passed to gk20a_channel_semaphore_wait_fd() is backed by a GPU semaphore, a semaphore acquire is used directly to block the GPU. No sync_fence_wait_async() is performed, nor is an extra semaphore created.

To implement this fast path the semaphore memory had to be shared between channels. Previously, since a new semaphore was created every time through gk20a_channel_semaphore_wait_fd(), the address space a semaphore was mapped into was irrelevant. However, when using the fast path a semaphore may be released in one address space but acquired in another.

Sharing the semaphore memory was done by making a fixed GPU mapping in all channels. This mapping points to the semaphore memory (the so-called semaphore sea). This global fixed mapping is read-only to make sure no semaphores can be incremented (i.e. released) by a malicious channel. Each channel then gets a RW mapping of its own semaphore. This way a channel may only acquire other channels' semaphores but may both acquire and release its own semaphore.

The gk20a fence code was updated to allow introspection of GPU backed fences. This allows detection of when the fast path can be taken. If the fast path cannot be used (for example when a fence is sync-pt backed) the original slow path is still present. This gets used when the GPU needs to wait on an event from something which only understands how to use sync-pts.

Bug 1732449
JIRA DNVGPU-12

Change-Id: Ic0fea74994da5819a771deac726bb0d47a33c2de
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/1133792
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
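As a rough illustration of the fast-path/slow-path split described above, here is a minimal, self-contained userspace sketch. All of the names in it (fence_info, emit_semaphore_acquire(), software_wait_then_release(), wait_on_fence()) are hypothetical stand-ins for illustration only; the real decision lives in gk20a_channel_semaphore_wait_fd() and the gk20a fence code. The point being shown is only the branch: a GPU-semaphore-backed fence becomes a hardware ACQUIRE on the waiting channel, anything else falls back to the software wait.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical, simplified model of the wait-side decision described in the
 * commit message. None of these names are real nvgpu API.
 */
struct fence_info {
        bool backed_by_gpu_semaphore;   /* can the fast path be taken?                */
        uint64_t sema_ro_gpu_va;        /* semaphore address in the shared RO mapping */
        uint32_t release_value;         /* value the releasing channel will write     */
};

/* Fast path: emit a hardware semaphore ACQUIRE; the GPU stalls, no SW wait. */
static int emit_semaphore_acquire(const struct fence_info *f)
{
        printf("ACQUIRE sema @0x%llx until value %u\n",
               (unsigned long long)f->sema_ro_gpu_va, f->release_value);
        return 0;
}

/* Slow path: SW wait on the fence, then release a newly created semaphore. */
static int software_wait_then_release(const struct fence_info *f)
{
        (void)f;
        printf("sync_fence_wait_async() + SW semaphore release\n");
        return 0;
}

static int wait_on_fence(const struct fence_info *f)
{
        if (f->backed_by_gpu_semaphore)
                return emit_semaphore_acquire(f);       /* new fast path      */
        return software_wait_then_release(f);           /* original slow path */
}

int main(void)
{
        struct fence_info gpu_backed = { true, 0x1f000000ull, 42 };
        struct fence_info syncpt_backed = { false, 0, 0 };

        wait_on_fence(&gpu_backed);
        wait_on_fence(&syncpt_backed);
        return 0;
}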
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/mm_gk20a.c')
-rw-r--r--   drivers/gpu/nvgpu/gk20a/mm_gk20a.c   82
1 file changed, 80 insertions, 2 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 3b21e843..9299266f 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -3213,6 +3213,17 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
         struct rb_node *node;
 
         gk20a_dbg_fn("");
+
+        /*
+         * Do this outside of the update_gmmu_lock since unmapping the semaphore
+         * pool involves unmapping a GMMU mapping which means acquiring the
+         * update_gmmu_lock.
+         */
+        if (!gk20a_platform_has_syncpoints(gk20a_from_vm(vm)->dev)) {
+                gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
+                gk20a_semaphore_pool_put(vm->sema_pool);
+        }
+
         mutex_lock(&vm->update_gmmu_lock);
 
         /* TBD: add a flag here for the unmap code to recognize teardown
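The ordering in the hunk above matters because, per the comment, unmapping the semaphore pool itself ends up taking update_gmmu_lock; doing it after the mutex_lock() below would relock the same mutex on the same thread. A minimal userspace sketch of that hazard (plain pthreads, nothing nvgpu-specific; the function name is illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for the pool-unmap path, which needs update_lock itself. */
static void unmap_sema_pool(void)
{
        pthread_mutex_lock(&update_lock);
        /* ... tear down the GMMU mapping ... */
        pthread_mutex_unlock(&update_lock);
}

int main(void)
{
        /* Correct ordering: unmap the pool before taking the lock. */
        unmap_sema_pool();

        pthread_mutex_lock(&update_lock);
        /*
         * Calling unmap_sema_pool() here instead would relock update_lock
         * on the same thread; with a non-recursive mutex (as in the kernel)
         * that is a self-deadlock.
         */
        pthread_mutex_unlock(&update_lock);

        printf("teardown ordering ok\n");
        return 0;
}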
@@ -3286,6 +3297,64 @@ const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
         {.update_entry = NULL}
 };
 
+/*
+ * Initialize a semaphore pool. Just return successfully if we do not need
+ * semaphores (i.e. when sync-pts are active).
+ */
+int gk20a_init_sema_pool(struct vm_gk20a *vm)
+{
+        struct gk20a_semaphore_sea *sema_sea;
+        struct mm_gk20a *mm = vm->mm;
+        struct gk20a *g = mm->g;
+        int err;
+
+        /*
+         * Don't waste the memory on semaphores if we don't need them.
+         */
+        if (gk20a_platform_has_syncpoints(g->dev))
+                return 0;
+
+        if (vm->sema_pool)
+                return 0;
+
+        sema_sea = gk20a_semaphore_sea_create(g);
+        if (!sema_sea)
+                return -ENOMEM;
+
+        vm->sema_pool = gk20a_semaphore_pool_alloc(sema_sea);
+        if (!vm->sema_pool) {
+                gk20a_vm_put(vm);
+                return -ENOMEM;
+        }
+
+        /*
+         * Allocate a chunk of GPU VA space for mapping the semaphores. We will
+         * do a fixed alloc in the kernel VM so that all channels have the same
+         * RO address range for the semaphores.
+         *
+         * !!! TODO: cleanup.
+         */
+        sema_sea->gpu_va = gk20a_balloc_fixed(&vm->vma[gmmu_page_size_kernel],
+                                              vm->va_limit -
+                                              mm->channel.kernel_size,
+                                              512 * PAGE_SIZE);
+        if (!sema_sea->gpu_va) {
+                gk20a_bfree(&vm->vma[gmmu_page_size_small], sema_sea->gpu_va);
+                gk20a_vm_put(vm);
+                return -ENOMEM;
+        }
+
+        err = gk20a_semaphore_pool_map(vm->sema_pool, vm);
+        if (err) {
+                gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
+                gk20a_bfree(&vm->vma[gmmu_page_size_small],
+                            vm->sema_pool->gpu_va);
+                gk20a_vm_put(vm);
+        }
+
+        return 0;
+}
+
 int gk20a_init_vm(struct mm_gk20a *mm,
                   struct vm_gk20a *vm,
                   u32 big_page_size,
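Because the VA window above is reserved at a fixed location in every channel's address space, a semaphore can be named by its offset into the sea, and the resulting read-only address is the same in every channel. A toy, self-contained model of that addressing follows; the constant, struct, and helper are purely illustrative, not nvgpu definitions:

#include <stdint.h>
#include <stdio.h>

/*
 * Toy model of the "semaphore sea" addressing scheme: every VM reserves the
 * same fixed GPU-VA window (read-only) for the shared semaphore memory, so
 * an offset into the sea names the same semaphore in every channel.
 */
#define SEMA_SEA_FIXED_GPU_VA 0x0000007ffff00000ull    /* hypothetical window */

struct sema {
        uint64_t offset;        /* byte offset of this semaphore within the sea */
};

/* RO address any channel can use to acquire (wait on) the semaphore. */
static uint64_t sema_ro_gpu_va(const struct sema *s)
{
        return SEMA_SEA_FIXED_GPU_VA + s->offset;
}

int main(void)
{
        struct sema s = { .offset = 0x40 };

        /* Identical in every channel's address space, by construction. */
        printf("acquire address: 0x%llx\n",
               (unsigned long long)sema_ro_gpu_va(&s));
        return 0;
}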
@@ -3317,9 +3386,7 @@ int gk20a_init_vm(struct mm_gk20a *mm,
         vm->big_pages = big_pages;
 
         vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
-
         vm->userspace_managed = userspace_managed;
-
         vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g,
                                                 vm->big_page_size);
 
@@ -3465,6 +3532,17 @@ int gk20a_init_vm(struct mm_gk20a *mm,
         kref_init(&vm->ref);
         INIT_LIST_HEAD(&vm->reserved_va_list);
 
+        /*
+         * This is only necessary for channel address spaces. The best way to
+         * distinguish channel address spaces from other address spaces is by
+         * size - if the address space is 4GB or less, it's not a channel.
+         */
+        if (vm->va_limit > SZ_4G) {
+                err = gk20a_init_sema_pool(vm);
+                if (err)
+                        goto clean_up_big_allocator;
+        }
+
         return 0;
 
 clean_up_big_allocator: