author    Alex Waterman <alexw@nvidia.com>    2016-04-27 15:27:36 -0400
committer Terje Bergstrom <tbergstrom@nvidia.com>    2016-06-28 18:49:11 -0400
commit    dfd5ec53fcce4ebae27f78242e6b788350337095 (patch)
tree      073ea380b9ee4734391d381745f57600c3525be5 /drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
parent    b30990ea6db564e885d5aee7a1a5ea87a1e5e8ee (diff)
gpu: nvgpu: Revamp semaphore support
Revamp the nvgpu driver's semaphore support.

The original problem with nvgpu's semaphore support is that it required a SW based wait for every semaphore release. This was because for every fence that gk20a_channel_semaphore_wait_fd() waited on, a new semaphore was created. This semaphore would then get released by SW when the fence signaled. This meant that for every release there was necessarily a sync_fence_wait_async() call which could block. The latency of this SW wait was enough to cause massive degradation in performance.

To fix this a fast path was implemented. When a fence passed to gk20a_channel_semaphore_wait_fd() is backed by a GPU semaphore, a semaphore acquire is used directly to block the GPU. No sync_fence_wait_async() is performed, nor is an extra semaphore created.

To implement this fast path the semaphore memory had to be shared between channels. Previously, since a new semaphore was created every time through gk20a_channel_semaphore_wait_fd(), it was irrelevant which address space a semaphore was mapped into. However, when using the fast path a semaphore may be released in one address space but acquired in another.

Sharing the semaphore memory was done by making a fixed GPU mapping in all channels. This mapping points to the semaphore memory (the so-called semaphore sea). This global fixed mapping is read-only to make sure no semaphores can be incremented (i.e. released) by a malicious channel. Each channel then gets a RW mapping of its own semaphore. This way a channel may only acquire other channels' semaphores but may both acquire and release its own semaphore.

The gk20a fence code was updated to allow introspection of GPU backed fences. This allows detection of when the fast path can be taken. If the fast path cannot be used (for example when a fence is sync-pt backed), the original slow path is still present. This gets used when the GPU needs to wait on an event from something which only understands how to use sync-pts.

Bug 1732449
JIRA DNVGPU-12

Change-Id: Ic0fea74994da5819a771deac726bb0d47a33c2de
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/1133792
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
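The fast path described above boils down to a simple decision: if the incoming fence is already backed by a GPU semaphore, push a semaphore acquire against that semaphore's read-only mapping; otherwise fall back to the SW wait. The sketch below is a minimal, self-contained user-space model of that decision only, not driver code: the model_* types and wait_fd_fast_path() are hypothetical stand-ins for the sync_fence, gk20a_semaphore and __semaphore_wait_fd_fast_path() seen in the diff below.

/* Hypothetical user-space model of the fast-path decision; the real driver
 * path is __semaphore_wait_fd_fast_path() in the diff below. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for a GPU semaphore living in the shared "semaphore sea". */
struct model_sema {
        uint32_t value;      /* current value; read-only mapping in all channels */
        uint32_t next_value; /* value the owning channel will release to */
};

/* Stand-in for a sync fence that may or may not be semaphore backed. */
struct model_fence {
        struct model_sema *sema; /* NULL when the fence is sync-pt backed */
};

/*
 * Fast path: if the fence is backed by a GPU semaphore, the waiter only needs
 * an acquire of value >= next_value on that semaphore (no SW wait, no extra
 * semaphore). Returns -ENODEV when the slow path must be used instead.
 */
static int wait_fd_fast_path(const struct model_fence *f, uint32_t *acq_payload)
{
        if (!f->sema)
                return -ENODEV; /* sync-pt backed: fall back to the SW wait */

        *acq_payload = f->sema->next_value; /* GPU blocks until value >= payload */
        return 0;
}

int main(void)
{
        struct model_sema s = { .value = 0, .next_value = 1 };
        struct model_fence gpu_backed = { .sema = &s };
        struct model_fence syncpt_backed = { .sema = NULL };
        uint32_t payload;

        if (wait_fd_fast_path(&gpu_backed, &payload) == 0)
                printf("fast path: GPU acquires semaphore >= %u\n", payload);
        if (wait_fd_fast_path(&syncpt_backed, &payload) == -ENODEV)
                printf("slow path: SW wait + separate semaphore release\n");
        return 0;
}

In the real driver the -ENODEV return routes gk20a_channel_semaphore_wait_fd() to the existing sync_fence_wait_async() slow path, as shown in the diff below.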
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c  233
1 file changed, 176 insertions(+), 57 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index d2d8c094..9c8911e9 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -424,28 +424,52 @@ static void gk20a_channel_semaphore_launcher(
 }
 #endif
 
-static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd,
-                u64 sema, u32 payload, bool acquire, bool wfi)
+static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
+                struct gk20a_semaphore *s, struct priv_cmd_entry *cmd,
+                int cmd_size, bool acquire, bool wfi)
 {
         u32 off = cmd->off;
+        u64 va;
+
+        /*
+         * RO for acquire (since we just need to read the mem) and RW for
+         * release since we will need to write back to the semaphore memory.
+         */
+        va = acquire ? gk20a_semaphore_gpu_ro_va(s) :
+                       gk20a_semaphore_gpu_rw_va(s);
+
+        /*
+         * If the op is not an acquire (so therefor a release) we should
+         * incr the underlying sema next_value.
+         */
+        if (!acquire)
+                gk20a_semaphore_incr(s);
+
         /* semaphore_a */
         gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004);
         /* offset_upper */
-        gk20a_mem_wr32(g, cmd->mem, off++, (sema >> 32) & 0xff);
+        gk20a_mem_wr32(g, cmd->mem, off++, (va >> 32) & 0xff);
         /* semaphore_b */
         gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005);
         /* offset */
-        gk20a_mem_wr32(g, cmd->mem, off++, sema & 0xffffffff);
-        /* semaphore_c */
-        gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
-        /* payload */
-        gk20a_mem_wr32(g, cmd->mem, off++, payload);
+        gk20a_mem_wr32(g, cmd->mem, off++, va & 0xffffffff);
+
         if (acquire) {
+                /* semaphore_c */
+                gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
+                /* payload */
+                gk20a_mem_wr32(g, cmd->mem, off++,
+                               gk20a_semaphore_get_value(s));
                 /* semaphore_d */
                 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
                 /* operation: acq_geq, switch_en */
                 gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12));
         } else {
+                /* semaphore_c */
+                gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
+                /* payload */
+                gk20a_mem_wr32(g, cmd->mem, off++,
+                               gk20a_semaphore_get_value(s));
                 /* semaphore_d */
                 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
                 /* operation: release, wfi */
@@ -456,7 +480,6 @@ static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd,
                 /* ignored */
                 gk20a_mem_wr32(g, cmd->mem, off++, 0);
         }
-        return off - cmd->off;
 }
 
 static int gk20a_channel_semaphore_wait_syncpt(
@@ -471,6 +494,76 @@ static int gk20a_channel_semaphore_wait_syncpt(
         return -ENODEV;
 }
 
+/*
+ * UGHHH - the sync_fence underlying implementation changes from 3.10 to 3.18.
+ * But since there's no API for getting the underlying sync_pts we have to do
+ * some conditional compilation.
+ */
+#ifdef CONFIG_SYNC
+static struct gk20a_semaphore *sema_from_sync_fence(struct sync_fence *f)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
+        struct sync_pt *pt;
+
+        pt = list_first_entry(&f->pt_list_head, struct sync_pt, pt_list);
+        return gk20a_sync_pt_inst_get_sema(pt);
+#else
+        return gk20a_sync_pt_inst_get_sema(f->cbs[0].sync_pt);
+#endif
+}
+
+/*
+ * Attempt a fast path for waiting on a sync_fence. Basically if the passed
+ * sync_fence is backed by a gk20a_semaphore then there's no reason to go
+ * through the rigmarole of setting up a separate semaphore which waits on an
+ * interrupt from the GPU and then triggers a worker thread to execute a SW
+ * based semaphore release. Instead just have the GPU wait on the same semaphore
+ * that is going to be incremented by the GPU.
+ *
+ * This function returns 2 possible values: -ENODEV or 0 on success. In the case
+ * of -ENODEV the fastpath cannot be taken due to the fence not being backed by
+ * a GPU semaphore.
+ */
+static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
+                struct sync_fence *fence,
+                struct priv_cmd_entry **wait_cmd,
+                struct gk20a_semaphore **fp_sema)
+{
+        struct gk20a_semaphore *sema;
+        int err;
+
+        if (!gk20a_is_sema_backed_sync_fence(fence))
+                return -ENODEV;
+
+        sema = sema_from_sync_fence(fence);
+
+        /*
+         * If there's no underlying sema then that means the underlying sema has
+         * already signaled.
+         */
+        if (!sema) {
+                *fp_sema = NULL;
+                return 0;
+        }
+
+        err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
+        if (err)
+                return err;
+
+        gk20a_semaphore_get(sema);
+        BUG_ON(!atomic_read(&sema->value));
+        add_sema_cmd(c->g, c, sema, *wait_cmd, 8, true, false);
+
+        /*
+         * Make sure that gk20a_channel_semaphore_wait_fd() can create another
+         * fence with the underlying semaphore.
+         */
+        *fp_sema = sema;
+
+        return 0;
+}
+#endif
+
 static int gk20a_channel_semaphore_wait_fd(
                 struct gk20a_channel_sync *s, int fd,
                 struct priv_cmd_entry **entry,
@@ -480,69 +573,107 @@ static int gk20a_channel_semaphore_wait_fd(
                 container_of(s, struct gk20a_channel_semaphore, ops);
         struct channel_gk20a *c = sema->c;
 #ifdef CONFIG_SYNC
+        struct gk20a_semaphore *fp_sema;
         struct sync_fence *sync_fence;
         struct priv_cmd_entry *wait_cmd = NULL;
-        struct wait_fence_work *w;
-        int written;
-        int err, ret;
-        u64 va;
+        struct wait_fence_work *w = NULL;
+        int err, ret, status;
 
         sync_fence = gk20a_sync_fence_fdget(fd);
         if (!sync_fence)
                 return -EINVAL;
 
+        ret = __semaphore_wait_fd_fast_path(c, sync_fence, &wait_cmd, &fp_sema);
+        if (ret == 0) {
+                if (fp_sema)
+                        *fence = gk20a_fence_from_semaphore(sema->timeline,
+                                        fp_sema,
+                                        &c->semaphore_wq,
+                                        NULL, false);
+                else
+                        /*
+                         * Allocate an empty fence. It will instantly return
+                         * from gk20a_fence_wait().
+                         */
+                        *fence = gk20a_alloc_fence(NULL, NULL, false);
+
+                sync_fence_put(sync_fence);
+                goto skip_slow_path;
+        }
+
+        /* If the fence has signaled there is no reason to wait on it. */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
+        status = sync_fence->status;
+#else
+        status = atomic_read(&sync_fence->status);
+#endif
+        if (status) {
+                sync_fence_put(sync_fence);
+                goto skip_slow_path;
+        }
+
+        err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
+        if (err) {
+                gk20a_err(dev_from_gk20a(c->g),
+                          "not enough priv cmd buffer space");
+                sync_fence_put(sync_fence);
+                return -ENOMEM;
+        }
+
         w = kzalloc(sizeof(*w), GFP_KERNEL);
         if (!w) {
                 err = -ENOMEM;
-                goto fail;
+                goto fail_free_cmdbuf;
         }
+
         sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
         w->ch = c;
-        w->sema = gk20a_semaphore_alloc(sema->pool);
+        w->sema = gk20a_semaphore_alloc(c);
         if (!w->sema) {
                 gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");
                 err = -ENOMEM;
-                goto fail;
+                goto fail_free_worker;
         }
 
         /* worker takes one reference */
         gk20a_semaphore_get(w->sema);
+        gk20a_semaphore_incr(w->sema);
 
-        err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
-        if (err) {
-                gk20a_err(dev_from_gk20a(c->g),
-                        "not enough priv cmd buffer space");
-                goto fail;
-        }
-
-        va = gk20a_semaphore_gpu_va(w->sema, c->vm);
-        /* GPU unblocked when when the semaphore value becomes 1. */
-        written = add_sema_cmd(c->g, wait_cmd, va, 1, true, false);
+        /* GPU unblocked when the semaphore value increments. */
+        add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
 
-        WARN_ON(written != wait_cmd->size);
         ret = sync_fence_wait_async(sync_fence, &w->waiter);
 
         /*
          * If the sync_fence has already signaled then the above async_wait
          * will never trigger. This causes the semaphore release op to never
          * happen which, in turn, hangs the GPU. That's bad. So let's just
-         * do the semaphore_release right now.
+         * do the gk20a_semaphore_release() right now.
          */
-        if (ret == 1)
+        if (ret == 1) {
+                sync_fence_put(sync_fence);
                 gk20a_semaphore_release(w->sema);
+                gk20a_semaphore_put(w->sema);
+        }
 
         /* XXX - this fixes an actual bug, we need to hold a ref to this
            semaphore while the job is in flight. */
         *fence = gk20a_fence_from_semaphore(sema->timeline, w->sema,
                                            &c->semaphore_wq,
                                            NULL, false);
+
+skip_slow_path:
         *entry = wait_cmd;
         return 0;
-fail:
+
+fail_free_worker:
         if (w && w->sema)
                 gk20a_semaphore_put(w->sema);
         kfree(w);
         sync_fence_put(sync_fence);
+fail_free_cmdbuf:
+        if (wait_cmd)
+                gk20a_free_priv_cmdbuf(c, wait_cmd);
         return err;
 #else
         gk20a_err(dev_from_gk20a(c->g),
@@ -558,9 +689,7 @@ static int __gk20a_channel_semaphore_incr(
                 struct gk20a_fence **fence,
                 bool need_sync_fence)
 {
-        u64 va;
         int incr_cmd_size;
-        int written;
         struct priv_cmd_entry *incr_cmd = NULL;
         struct gk20a_channel_semaphore *sp =
                 container_of(s, struct gk20a_channel_semaphore, ops);
@@ -568,7 +697,7 @@ static int __gk20a_channel_semaphore_incr(
         struct gk20a_semaphore *semaphore;
         int err = 0;
 
-        semaphore = gk20a_semaphore_alloc(sp->pool);
+        semaphore = gk20a_semaphore_alloc(c);
         if (!semaphore) {
                 gk20a_err(dev_from_gk20a(c->g),
                         "ran out of semaphores");
@@ -585,9 +714,7 @@ static int __gk20a_channel_semaphore_incr(
         }
 
         /* Release the completion semaphore. */
-        va = gk20a_semaphore_gpu_va(semaphore, c->vm);
-        written = add_sema_cmd(c->g, incr_cmd, va, 1, false, wfi_cmd);
-        WARN_ON(written != incr_cmd_size);
+        add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
 
         *fence = gk20a_fence_from_semaphore(sp->timeline, semaphore,
                                         &c->semaphore_wq,
@@ -615,8 +742,10 @@ static int gk20a_channel_semaphore_incr(
 {
         /* Don't put wfi cmd to this one since we're not returning
          * a fence to user space. */
-        return __gk20a_channel_semaphore_incr(s, false /* no wfi */,
-                        NULL, entry, fence, need_sync_fence);
+        return __gk20a_channel_semaphore_incr(s,
+                        false /* no wfi */,
+                        NULL,
+                        entry, fence, need_sync_fence);
 }
 
 static int gk20a_channel_semaphore_incr_user(
@@ -679,17 +808,16 @@ static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s)
                 container_of(s, struct gk20a_channel_semaphore, ops);
         if (sema->timeline)
                 gk20a_sync_timeline_destroy(sema->timeline);
-        if (sema->pool) {
-                gk20a_semaphore_pool_unmap(sema->pool, sema->c->vm);
-                gk20a_semaphore_pool_put(sema->pool);
-        }
+
+        /* The sema pool is cleaned up by the VM destroy. */
+        sema->pool = NULL;
+
         kfree(sema);
 }
 
 static struct gk20a_channel_sync *
 gk20a_channel_semaphore_create(struct channel_gk20a *c)
 {
-        int err;
         int asid = -1;
         struct gk20a_channel_semaphore *sema;
         char pool_name[20];
@@ -706,21 +834,15 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
                 asid = c->vm->as_share->id;
 
         sprintf(pool_name, "semaphore_pool-%d", c->hw_chid);
-        sema->pool = gk20a_semaphore_pool_alloc(c->g, pool_name, 1024);
-        if (!sema->pool)
-                goto clean_up;
-
-        /* Map the semaphore pool to the channel vm. Map as read-write to the
-         * owner channel (all other channels should map as read only!). */
-        err = gk20a_semaphore_pool_map(sema->pool, c->vm, gk20a_mem_flag_none);
-        if (err)
-                goto clean_up;
+        sema->pool = c->vm->sema_pool;
 
 #ifdef CONFIG_SYNC
         sema->timeline = gk20a_sync_timeline_create(
                         "gk20a_ch%d_as%d", c->hw_chid, asid);
-        if (!sema->timeline)
-                goto clean_up;
+        if (!sema->timeline) {
+                gk20a_channel_semaphore_destroy(&sema->ops);
+                return NULL;
+        }
 #endif
         atomic_set(&sema->ops.refcount, 0);
         sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt;
@@ -734,9 +856,6 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
         sema->ops.destroy = gk20a_channel_semaphore_destroy;
 
         return &sema->ops;
-clean_up:
-        gk20a_channel_semaphore_destroy(&sema->ops);
-        return NULL;
 }
 
 void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync)