author     Alex Waterman <alexw@nvidia.com>         2016-04-27 15:27:36 -0400
committer  Terje Bergstrom <tbergstrom@nvidia.com>  2016-06-28 18:49:11 -0400
commit     dfd5ec53fcce4ebae27f78242e6b788350337095 (patch)
tree       073ea380b9ee4734391d381745f57600c3525be5 /drivers/gpu
parent     b30990ea6db564e885d5aee7a1a5ea87a1e5e8ee (diff)
gpu: nvgpu: Revamp semaphore support
Revamp the support the nvgpu driver has for semaphores.

The original problem with nvgpu's semaphore support is that it required a SW-based wait for every semaphore release. This was because for every fence that gk20a_channel_semaphore_wait_fd() waited on, a new semaphore was created. This semaphore would then get released by SW when the fence signaled. This meant that every release necessarily involved a sync_fence_wait_async() call, which could block. The latency of this SW wait was enough to cause massive degradation in performance.

To fix this a fast path was implemented. When a fence that is backed by a GPU semaphore is passed to gk20a_channel_semaphore_wait_fd(), a semaphore acquire is used directly to block the GPU. No sync_fence_wait_async() is performed and no extra semaphore is created.

To implement this fast path the semaphore memory had to be shared between channels. Previously, since a new semaphore was created on every call to gk20a_channel_semaphore_wait_fd(), the address space a semaphore was mapped into was irrelevant. However, when using the fast path a semaphore may be released in one address space but acquired in another.

Sharing the semaphore memory was done by making a fixed GPU mapping in all channels. This mapping points to the semaphore memory (the so-called semaphore sea). The global fixed mapping is read-only to make sure no semaphores can be incremented (i.e. released) by a malicious channel. Each channel then gets a RW mapping of its own semaphore. This way a channel may only acquire other channels' semaphores but may both acquire and release its own semaphore.

The gk20a fence code was updated to allow introspection of GPU-backed fences. This allows detection of when the fast path can be taken. If the fast path cannot be used (for example, when a fence is sync-pt backed) the original slow path is still present. It is used when the GPU needs to wait on an event from something that only understands sync-pts.

Bug 1732449
JIRA DNVGPU-12

Change-Id: Ic0fea74994da5819a771deac726bb0d47a33c2de
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/1133792
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
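To make the new scheme concrete, below is a minimal user-space model of the threshold mechanism the patch introduces (a sketch under simplifying assumptions, not driver code): each channel owns one monotonically increasing HW semaphore, and each gk20a_semaphore only records the threshold value it waits for (acquire, acq_geq) or writes back (release). The names hw_sema_model, sema_model, sema_incr(), sema_is_released() and sema_release() are hypothetical stand-ins, and plain heap memory stands in for the semaphore-sea page; the logic mirrors gk20a_semaphore_incr(), gk20a_semaphore_is_released() and gk20a_semaphore_release() from the patch.

#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

/* Models the per-channel gk20a_semaphore_int: one shared, growing value. */
struct hw_sema_model {
	_Atomic uint32_t value;      /* last released value (GPU/CPU visible) */
	_Atomic uint32_t next_value; /* next threshold to hand out */
};

/* Models gk20a_semaphore: a threshold against the shared HW value. */
struct sema_model {
	struct hw_sema_model *hw;
	uint32_t threshold;
};

/* Mirrors gk20a_semaphore_incr(): reserve the next threshold for a release. */
static void sema_incr(struct sema_model *s)
{
	s->threshold = atomic_fetch_add(&s->hw->next_value, 1) + 1;
}

/* Mirrors gk20a_semaphore_is_released(): the acq_geq comparison. */
static int sema_is_released(const struct sema_model *s)
{
	return atomic_load(&s->hw->value) >= s->threshold;
}

/* Mirrors gk20a_semaphore_release(): publish this submit's threshold. */
static void sema_release(struct sema_model *s)
{
	atomic_store(&s->hw->value, s->threshold);
}

int main(void)
{
	struct hw_sema_model hw = { 0, 0 };
	struct sema_model job1 = { &hw, 0 }, job2 = { &hw, 0 };

	sema_incr(&job1);                  /* job1 will release value 1 */
	sema_incr(&job2);                  /* job2 will release value 2 */

	printf("job1 done? %d\n", sema_is_released(&job1)); /* 0 */
	sema_release(&job1);
	printf("job1 done? %d\n", sema_is_released(&job1)); /* 1 */
	printf("job2 done? %d\n", sema_is_released(&job2)); /* 0 */
	sema_release(&job2);
	printf("job2 done? %d\n", sema_is_released(&job2)); /* 1 */
	return 0;
}

Because the shared value only ever grows, releasing a later threshold implicitly satisfies every earlier acquire on the same HW semaphore, which is what lets a single 16-byte semaphore location back an arbitrary number of submits and lets other channels acquire it through the read-only global mapping.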
Diffstat (limited to 'drivers/gpu')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c         3
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.h         2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c  233
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fence_gk20a.c           4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h                 5
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c             82
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h              5
-rw-r--r--  drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c     435
-rw-r--r--  drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h     303
9 files changed, 847 insertions, 225 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 3f9b0432..6c7ff551 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1002,6 +1002,9 @@ unbind:
1002 1002
1003 mutex_unlock(&g->dbg_sessions_lock); 1003 mutex_unlock(&g->dbg_sessions_lock);
1004 1004
1005 /* Make sure that when the ch is re-opened it will get a new HW sema. */
1006 ch->hw_sema = NULL;
1007
1005 /* make sure we catch accesses of unopened channels in case 1008 /* make sure we catch accesses of unopened channels in case
1006 * there's non-refcounted channel pointers hanging around */ 1009 * there's non-refcounted channel pointers hanging around */
1007 ch->g = NULL; 1010 ch->g = NULL;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index acd272b4..c5a1bd24 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -108,6 +108,8 @@ struct channel_gk20a {
108 atomic_t ref_count; 108 atomic_t ref_count;
109 wait_queue_head_t ref_count_dec_wq; 109 wait_queue_head_t ref_count_dec_wq;
110 110
111 struct gk20a_semaphore_int *hw_sema;
112
111 int hw_chid; 113 int hw_chid;
112 bool wdt_enabled; 114 bool wdt_enabled;
113 bool bound; 115 bool bound;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index d2d8c094..9c8911e9 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -424,28 +424,52 @@ static void gk20a_channel_semaphore_launcher(
424} 424}
425#endif 425#endif
426 426
427static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd, 427static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
428 u64 sema, u32 payload, bool acquire, bool wfi) 428 struct gk20a_semaphore *s, struct priv_cmd_entry *cmd,
429 int cmd_size, bool acquire, bool wfi)
429{ 430{
430 u32 off = cmd->off; 431 u32 off = cmd->off;
432 u64 va;
433
434 /*
435 * RO for acquire (since we just need to read the mem) and RW for
436 * release since we will need to write back to the semaphore memory.
437 */
438 va = acquire ? gk20a_semaphore_gpu_ro_va(s) :
439 gk20a_semaphore_gpu_rw_va(s);
440
441 /*
442 * If the op is not an acquire (and therefore a release) we should
443 * incr the underlying sema next_value.
444 */
445 if (!acquire)
446 gk20a_semaphore_incr(s);
447
431 /* semaphore_a */ 448 /* semaphore_a */
432 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004); 449 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004);
433 /* offset_upper */ 450 /* offset_upper */
434 gk20a_mem_wr32(g, cmd->mem, off++, (sema >> 32) & 0xff); 451 gk20a_mem_wr32(g, cmd->mem, off++, (va >> 32) & 0xff);
435 /* semaphore_b */ 452 /* semaphore_b */
436 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005); 453 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005);
437 /* offset */ 454 /* offset */
438 gk20a_mem_wr32(g, cmd->mem, off++, sema & 0xffffffff); 455 gk20a_mem_wr32(g, cmd->mem, off++, va & 0xffffffff);
439 /* semaphore_c */ 456
440 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
441 /* payload */
442 gk20a_mem_wr32(g, cmd->mem, off++, payload);
443 if (acquire) { 457 if (acquire) {
458 /* semaphore_c */
459 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
460 /* payload */
461 gk20a_mem_wr32(g, cmd->mem, off++,
462 gk20a_semaphore_get_value(s));
444 /* semaphore_d */ 463 /* semaphore_d */
445 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007); 464 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
446 /* operation: acq_geq, switch_en */ 465 /* operation: acq_geq, switch_en */
447 gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12)); 466 gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12));
448 } else { 467 } else {
468 /* semaphore_c */
469 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
470 /* payload */
471 gk20a_mem_wr32(g, cmd->mem, off++,
472 gk20a_semaphore_get_value(s));
449 /* semaphore_d */ 473 /* semaphore_d */
450 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007); 474 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
451 /* operation: release, wfi */ 475 /* operation: release, wfi */
@@ -456,7 +480,6 @@ static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd,
456 /* ignored */ 480 /* ignored */
457 gk20a_mem_wr32(g, cmd->mem, off++, 0); 481 gk20a_mem_wr32(g, cmd->mem, off++, 0);
458 } 482 }
459 return off - cmd->off;
460} 483}
461 484
462static int gk20a_channel_semaphore_wait_syncpt( 485static int gk20a_channel_semaphore_wait_syncpt(
@@ -471,6 +494,76 @@ static int gk20a_channel_semaphore_wait_syncpt(
471 return -ENODEV; 494 return -ENODEV;
472} 495}
473 496
497/*
498 * UGHHH - the sync_fence underlying implementation changes from 3.10 to 3.18.
499 * But since there's no API for getting the underlying sync_pts we have to do
500 * some conditional compilation.
501 */
502#ifdef CONFIG_SYNC
503static struct gk20a_semaphore *sema_from_sync_fence(struct sync_fence *f)
504{
505#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
506 struct sync_pt *pt;
507
508 pt = list_first_entry(&f->pt_list_head, struct sync_pt, pt_list);
509 return gk20a_sync_pt_inst_get_sema(pt);
510#else
511 return gk20a_sync_pt_inst_get_sema(f->cbs[0].sync_pt);
512#endif
513}
514
515/*
516 * Attempt a fast path for waiting on a sync_fence. Basically if the passed
517 * sync_fence is backed by a gk20a_semaphore then there's no reason to go
518 * through the rigmarole of setting up a separate semaphore which waits on an
519 * interrupt from the GPU and then triggers a worker thread to execute a SW
520 * based semaphore release. Instead just have the GPU wait on the same semaphore
521 * that is going to be incremented by the GPU.
522 *
523 * This function returns 2 possible values: -ENODEV or 0 on success. In the case
524 * of -ENODEV the fastpath cannot be taken due to the fence not being backed by
525 * a GPU semaphore.
526 */
527static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
528 struct sync_fence *fence,
529 struct priv_cmd_entry **wait_cmd,
530 struct gk20a_semaphore **fp_sema)
531{
532 struct gk20a_semaphore *sema;
533 int err;
534
535 if (!gk20a_is_sema_backed_sync_fence(fence))
536 return -ENODEV;
537
538 sema = sema_from_sync_fence(fence);
539
540 /*
541 * If there's no underlying sema then the fence has already signaled
542 * and there is nothing for the GPU to wait on.
543 */
544 if (!sema) {
545 *fp_sema = NULL;
546 return 0;
547 }
548
549 err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
550 if (err)
551 return err;
552
553 gk20a_semaphore_get(sema);
554 BUG_ON(!atomic_read(&sema->value));
555 add_sema_cmd(c->g, c, sema, *wait_cmd, 8, true, false);
556
557 /*
558 * Make sure that gk20a_channel_semaphore_wait_fd() can create another
559 * fence with the underlying semaphore.
560 */
561 *fp_sema = sema;
562
563 return 0;
564}
565#endif
566
474static int gk20a_channel_semaphore_wait_fd( 567static int gk20a_channel_semaphore_wait_fd(
475 struct gk20a_channel_sync *s, int fd, 568 struct gk20a_channel_sync *s, int fd,
476 struct priv_cmd_entry **entry, 569 struct priv_cmd_entry **entry,
@@ -480,69 +573,107 @@ static int gk20a_channel_semaphore_wait_fd(
480 container_of(s, struct gk20a_channel_semaphore, ops); 573 container_of(s, struct gk20a_channel_semaphore, ops);
481 struct channel_gk20a *c = sema->c; 574 struct channel_gk20a *c = sema->c;
482#ifdef CONFIG_SYNC 575#ifdef CONFIG_SYNC
576 struct gk20a_semaphore *fp_sema;
483 struct sync_fence *sync_fence; 577 struct sync_fence *sync_fence;
484 struct priv_cmd_entry *wait_cmd = NULL; 578 struct priv_cmd_entry *wait_cmd = NULL;
485 struct wait_fence_work *w; 579 struct wait_fence_work *w = NULL;
486 int written; 580 int err, ret, status;
487 int err, ret;
488 u64 va;
489 581
490 sync_fence = gk20a_sync_fence_fdget(fd); 582 sync_fence = gk20a_sync_fence_fdget(fd);
491 if (!sync_fence) 583 if (!sync_fence)
492 return -EINVAL; 584 return -EINVAL;
493 585
586 ret = __semaphore_wait_fd_fast_path(c, sync_fence, &wait_cmd, &fp_sema);
587 if (ret == 0) {
588 if (fp_sema)
589 *fence = gk20a_fence_from_semaphore(sema->timeline,
590 fp_sema,
591 &c->semaphore_wq,
592 NULL, false);
593 else
594 /*
595 * Allocate an empty fence. It will instantly return
596 * from gk20a_fence_wait().
597 */
598 *fence = gk20a_alloc_fence(NULL, NULL, false);
599
600 sync_fence_put(sync_fence);
601 goto skip_slow_path;
602 }
603
604 /* If the fence has signaled there is no reason to wait on it. */
605#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
606 status = sync_fence->status;
607#else
608 status = atomic_read(&sync_fence->status);
609#endif
610 if (status) {
611 sync_fence_put(sync_fence);
612 goto skip_slow_path;
613 }
614
615 err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
616 if (err) {
617 gk20a_err(dev_from_gk20a(c->g),
618 "not enough priv cmd buffer space");
619 sync_fence_put(sync_fence);
620 return -ENOMEM;
621 }
622
494 w = kzalloc(sizeof(*w), GFP_KERNEL); 623 w = kzalloc(sizeof(*w), GFP_KERNEL);
495 if (!w) { 624 if (!w) {
496 err = -ENOMEM; 625 err = -ENOMEM;
497 goto fail; 626 goto fail_free_cmdbuf;
498 } 627 }
628
499 sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher); 629 sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
500 w->ch = c; 630 w->ch = c;
501 w->sema = gk20a_semaphore_alloc(sema->pool); 631 w->sema = gk20a_semaphore_alloc(c);
502 if (!w->sema) { 632 if (!w->sema) {
503 gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores"); 633 gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");
504 err = -ENOMEM; 634 err = -ENOMEM;
505 goto fail; 635 goto fail_free_worker;
506 } 636 }
507 637
508 /* worker takes one reference */ 638 /* worker takes one reference */
509 gk20a_semaphore_get(w->sema); 639 gk20a_semaphore_get(w->sema);
640 gk20a_semaphore_incr(w->sema);
510 641
511 err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd); 642 /* GPU unblocked when the semaphore value increments. */
512 if (err) { 643 add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
513 gk20a_err(dev_from_gk20a(c->g),
514 "not enough priv cmd buffer space");
515 goto fail;
516 }
517
518 va = gk20a_semaphore_gpu_va(w->sema, c->vm);
519 /* GPU unblocked when when the semaphore value becomes 1. */
520 written = add_sema_cmd(c->g, wait_cmd, va, 1, true, false);
521 644
522 WARN_ON(written != wait_cmd->size);
523 ret = sync_fence_wait_async(sync_fence, &w->waiter); 645 ret = sync_fence_wait_async(sync_fence, &w->waiter);
524 646
525 /* 647 /*
526 * If the sync_fence has already signaled then the above async_wait 648 * If the sync_fence has already signaled then the above async_wait
527 * will never trigger. This causes the semaphore release op to never 649 * will never trigger. This causes the semaphore release op to never
528 * happen which, in turn, hangs the GPU. That's bad. So let's just 650 * happen which, in turn, hangs the GPU. That's bad. So let's just
529 * do the semaphore_release right now. 651 * do the gk20a_semaphore_release() right now.
530 */ 652 */
531 if (ret == 1) 653 if (ret == 1) {
654 sync_fence_put(sync_fence);
532 gk20a_semaphore_release(w->sema); 655 gk20a_semaphore_release(w->sema);
656 gk20a_semaphore_put(w->sema);
657 }
533 658
534 /* XXX - this fixes an actual bug, we need to hold a ref to this 659 /* XXX - this fixes an actual bug, we need to hold a ref to this
535 semaphore while the job is in flight. */ 660 semaphore while the job is in flight. */
536 *fence = gk20a_fence_from_semaphore(sema->timeline, w->sema, 661 *fence = gk20a_fence_from_semaphore(sema->timeline, w->sema,
537 &c->semaphore_wq, 662 &c->semaphore_wq,
538 NULL, false); 663 NULL, false);
664
665skip_slow_path:
539 *entry = wait_cmd; 666 *entry = wait_cmd;
540 return 0; 667 return 0;
541fail: 668
669fail_free_worker:
542 if (w && w->sema) 670 if (w && w->sema)
543 gk20a_semaphore_put(w->sema); 671 gk20a_semaphore_put(w->sema);
544 kfree(w); 672 kfree(w);
545 sync_fence_put(sync_fence); 673 sync_fence_put(sync_fence);
674fail_free_cmdbuf:
675 if (wait_cmd)
676 gk20a_free_priv_cmdbuf(c, wait_cmd);
546 return err; 677 return err;
547#else 678#else
548 gk20a_err(dev_from_gk20a(c->g), 679 gk20a_err(dev_from_gk20a(c->g),
@@ -558,9 +689,7 @@ static int __gk20a_channel_semaphore_incr(
558 struct gk20a_fence **fence, 689 struct gk20a_fence **fence,
559 bool need_sync_fence) 690 bool need_sync_fence)
560{ 691{
561 u64 va;
562 int incr_cmd_size; 692 int incr_cmd_size;
563 int written;
564 struct priv_cmd_entry *incr_cmd = NULL; 693 struct priv_cmd_entry *incr_cmd = NULL;
565 struct gk20a_channel_semaphore *sp = 694 struct gk20a_channel_semaphore *sp =
566 container_of(s, struct gk20a_channel_semaphore, ops); 695 container_of(s, struct gk20a_channel_semaphore, ops);
@@ -568,7 +697,7 @@ static int __gk20a_channel_semaphore_incr(
568 struct gk20a_semaphore *semaphore; 697 struct gk20a_semaphore *semaphore;
569 int err = 0; 698 int err = 0;
570 699
571 semaphore = gk20a_semaphore_alloc(sp->pool); 700 semaphore = gk20a_semaphore_alloc(c);
572 if (!semaphore) { 701 if (!semaphore) {
573 gk20a_err(dev_from_gk20a(c->g), 702 gk20a_err(dev_from_gk20a(c->g),
574 "ran out of semaphores"); 703 "ran out of semaphores");
@@ -585,9 +714,7 @@ static int __gk20a_channel_semaphore_incr(
585 } 714 }
586 715
587 /* Release the completion semaphore. */ 716 /* Release the completion semaphore. */
588 va = gk20a_semaphore_gpu_va(semaphore, c->vm); 717 add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
589 written = add_sema_cmd(c->g, incr_cmd, va, 1, false, wfi_cmd);
590 WARN_ON(written != incr_cmd_size);
591 718
592 *fence = gk20a_fence_from_semaphore(sp->timeline, semaphore, 719 *fence = gk20a_fence_from_semaphore(sp->timeline, semaphore,
593 &c->semaphore_wq, 720 &c->semaphore_wq,
@@ -615,8 +742,10 @@ static int gk20a_channel_semaphore_incr(
615{ 742{
616 /* Don't put wfi cmd to this one since we're not returning 743 /* Don't put wfi cmd to this one since we're not returning
617 * a fence to user space. */ 744 * a fence to user space. */
618 return __gk20a_channel_semaphore_incr(s, false /* no wfi */, 745 return __gk20a_channel_semaphore_incr(s,
619 NULL, entry, fence, need_sync_fence); 746 false /* no wfi */,
747 NULL,
748 entry, fence, need_sync_fence);
620} 749}
621 750
622static int gk20a_channel_semaphore_incr_user( 751static int gk20a_channel_semaphore_incr_user(
@@ -679,17 +808,16 @@ static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s)
679 container_of(s, struct gk20a_channel_semaphore, ops); 808 container_of(s, struct gk20a_channel_semaphore, ops);
680 if (sema->timeline) 809 if (sema->timeline)
681 gk20a_sync_timeline_destroy(sema->timeline); 810 gk20a_sync_timeline_destroy(sema->timeline);
682 if (sema->pool) { 811
683 gk20a_semaphore_pool_unmap(sema->pool, sema->c->vm); 812 /* The sema pool is cleaned up by the VM destroy. */
684 gk20a_semaphore_pool_put(sema->pool); 813 sema->pool = NULL;
685 } 814
686 kfree(sema); 815 kfree(sema);
687} 816}
688 817
689static struct gk20a_channel_sync * 818static struct gk20a_channel_sync *
690gk20a_channel_semaphore_create(struct channel_gk20a *c) 819gk20a_channel_semaphore_create(struct channel_gk20a *c)
691{ 820{
692 int err;
693 int asid = -1; 821 int asid = -1;
694 struct gk20a_channel_semaphore *sema; 822 struct gk20a_channel_semaphore *sema;
695 char pool_name[20]; 823 char pool_name[20];
@@ -706,21 +834,15 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
706 asid = c->vm->as_share->id; 834 asid = c->vm->as_share->id;
707 835
708 sprintf(pool_name, "semaphore_pool-%d", c->hw_chid); 836 sprintf(pool_name, "semaphore_pool-%d", c->hw_chid);
709 sema->pool = gk20a_semaphore_pool_alloc(c->g, pool_name, 1024); 837 sema->pool = c->vm->sema_pool;
710 if (!sema->pool)
711 goto clean_up;
712
713 /* Map the semaphore pool to the channel vm. Map as read-write to the
714 * owner channel (all other channels should map as read only!). */
715 err = gk20a_semaphore_pool_map(sema->pool, c->vm, gk20a_mem_flag_none);
716 if (err)
717 goto clean_up;
718 838
719#ifdef CONFIG_SYNC 839#ifdef CONFIG_SYNC
720 sema->timeline = gk20a_sync_timeline_create( 840 sema->timeline = gk20a_sync_timeline_create(
721 "gk20a_ch%d_as%d", c->hw_chid, asid); 841 "gk20a_ch%d_as%d", c->hw_chid, asid);
722 if (!sema->timeline) 842 if (!sema->timeline) {
723 goto clean_up; 843 gk20a_channel_semaphore_destroy(&sema->ops);
844 return NULL;
845 }
724#endif 846#endif
725 atomic_set(&sema->ops.refcount, 0); 847 atomic_set(&sema->ops.refcount, 0);
726 sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt; 848 sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt;
@@ -734,9 +856,6 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
734 sema->ops.destroy = gk20a_channel_semaphore_destroy; 856 sema->ops.destroy = gk20a_channel_semaphore_destroy;
735 857
736 return &sema->ops; 858 return &sema->ops;
737clean_up:
738 gk20a_channel_semaphore_destroy(&sema->ops);
739 return NULL;
740} 859}
741 860
742void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync) 861void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync)
diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
index 23522882..fbbaa2a7 100644
--- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
@@ -155,8 +155,8 @@ struct gk20a_fence *gk20a_fence_from_semaphore(
155 155
156#ifdef CONFIG_SYNC 156#ifdef CONFIG_SYNC
157 sync_fence = gk20a_sync_fence_create(timeline, semaphore, 157 sync_fence = gk20a_sync_fence_create(timeline, semaphore,
158 dependency, "f-gk20a-0x%04x", 158 dependency, "f-gk20a-0x%04x",
159 semaphore->offset & 0xffff); 159 gk20a_semaphore_gpu_ro_va(semaphore));
160 if (!sync_fence) 160 if (!sync_fence)
161 return NULL; 161 return NULL;
162#endif 162#endif
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 5ab09ac3..7bd9775e 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -738,6 +738,11 @@ struct gk20a {
738#endif 738#endif
739 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info; 739 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info;
740 740
741 /*
742 * A group of semaphore pools. One for each channel.
743 */
744 struct gk20a_semaphore_sea *sema_sea;
745
741 /* held while manipulating # of debug/profiler sessions present */ 746 /* held while manipulating # of debug/profiler sessions present */
742 /* also prevents debug sessions from attaching until released */ 747 /* also prevents debug sessions from attaching until released */
743 struct mutex dbg_sessions_lock; 748 struct mutex dbg_sessions_lock;
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 3b21e843..9299266f 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -3213,6 +3213,17 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
3213 struct rb_node *node; 3213 struct rb_node *node;
3214 3214
3215 gk20a_dbg_fn(""); 3215 gk20a_dbg_fn("");
3216
3217 /*
3218 * Do this outside of the update_gmmu_lock since unmapping the semaphore
3219 * pool involves unmapping a GMMU mapping, which means acquiring the
3220 * update_gmmu_lock.
3221 */
3222 if (!gk20a_platform_has_syncpoints(gk20a_from_vm(vm)->dev)) {
3223 gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
3224 gk20a_semaphore_pool_put(vm->sema_pool);
3225 }
3226
3216 mutex_lock(&vm->update_gmmu_lock); 3227 mutex_lock(&vm->update_gmmu_lock);
3217 3228
3218 /* TBD: add a flag here for the unmap code to recognize teardown 3229 /* TBD: add a flag here for the unmap code to recognize teardown
@@ -3286,6 +3297,64 @@ const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
3286 {.update_entry = NULL} 3297 {.update_entry = NULL}
3287}; 3298};
3288 3299
3300/*
3301 * Initialize a semaphore pool. Just return successfully if we do not need
3302 * semaphores (i.e when sync-pts are active).
3303 */
3304int gk20a_init_sema_pool(struct vm_gk20a *vm)
3305{
3306 struct gk20a_semaphore_sea *sema_sea;
3307 struct mm_gk20a *mm = vm->mm;
3308 struct gk20a *g = mm->g;
3309 int err;
3310
3311 /*
3312 * Don't waste the memory on semaphores if we don't need them.
3313 */
3314 if (gk20a_platform_has_syncpoints(g->dev))
3315 return 0;
3316
3317 if (vm->sema_pool)
3318 return 0;
3319
3320 sema_sea = gk20a_semaphore_sea_create(g);
3321 if (!sema_sea)
3322 return -ENOMEM;
3323
3324 vm->sema_pool = gk20a_semaphore_pool_alloc(sema_sea);
3325 if (!vm->sema_pool) {
3326 gk20a_vm_put(vm);
3327 return -ENOMEM;
3328 }
3329
3330 /*
3331 * Allocate a chunk of GPU VA space for mapping the semaphores. We will
3332 * do a fixed alloc in the kernel VM so that all channels have the same
3333 * RO address range for the semaphores.
3334 *
3335 * !!! TODO: cleanup.
3336 */
3337 sema_sea->gpu_va = gk20a_balloc_fixed(&vm->vma[gmmu_page_size_kernel],
3338 vm->va_limit -
3339 mm->channel.kernel_size,
3340 512 * PAGE_SIZE);
3341 if (!sema_sea->gpu_va) {
3342 gk20a_bfree(&vm->vma[gmmu_page_size_small], sema_sea->gpu_va);
3343 gk20a_vm_put(vm);
3344 return -ENOMEM;
3345 }
3346
3347 err = gk20a_semaphore_pool_map(vm->sema_pool, vm);
3348 if (err) {
3349 gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
3350 gk20a_bfree(&vm->vma[gmmu_page_size_small],
3351 vm->sema_pool->gpu_va);
3352 gk20a_vm_put(vm);
3353 }
3354
3355 return 0;
3356}
3357
3289int gk20a_init_vm(struct mm_gk20a *mm, 3358int gk20a_init_vm(struct mm_gk20a *mm,
3290 struct vm_gk20a *vm, 3359 struct vm_gk20a *vm,
3291 u32 big_page_size, 3360 u32 big_page_size,
@@ -3317,9 +3386,7 @@ int gk20a_init_vm(struct mm_gk20a *mm,
3317 vm->big_pages = big_pages; 3386 vm->big_pages = big_pages;
3318 3387
3319 vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big]; 3388 vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
3320
3321 vm->userspace_managed = userspace_managed; 3389 vm->userspace_managed = userspace_managed;
3322
3323 vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g, 3390 vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g,
3324 vm->big_page_size); 3391 vm->big_page_size);
3325 3392
@@ -3465,6 +3532,17 @@ int gk20a_init_vm(struct mm_gk20a *mm,
3465 kref_init(&vm->ref); 3532 kref_init(&vm->ref);
3466 INIT_LIST_HEAD(&vm->reserved_va_list); 3533 INIT_LIST_HEAD(&vm->reserved_va_list);
3467 3534
3535 /*
3536 * This is only necessary for channel address spaces. The best way to
3537 * distinguish channel address spaces from other address spaces is by
3538 * size - if the address space is 4GB or less, it's not a channel.
3539 */
3540 if (vm->va_limit > SZ_4G) {
3541 err = gk20a_init_sema_pool(vm);
3542 if (err)
3543 goto clean_up_big_allocator;
3544 }
3545
3468 return 0; 3546 return 0;
3469 3547
3470clean_up_big_allocator: 3548clean_up_big_allocator:
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index db74a5ca..7bb4d011 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -287,6 +287,11 @@ struct vm_gk20a {
287 /* if non-NULL, kref_put will use this batch when 287 /* if non-NULL, kref_put will use this batch when
288 unmapping. Must hold vm->update_gmmu_lock. */ 288 unmapping. Must hold vm->update_gmmu_lock. */
289 struct vm_gk20a_mapping_batch *kref_put_batch; 289 struct vm_gk20a_mapping_batch *kref_put_batch;
290
291 /*
292 * Each address space needs to have a semaphore pool.
293 */
294 struct gk20a_semaphore_pool *sema_pool;
290}; 295};
291 296
292struct gk20a; 297struct gk20a;
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
index 3b17bfcb..aa375b24 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
@@ -15,63 +15,284 @@
15 * more details. 15 * more details.
16 */ 16 */
17 17
18#include "semaphore_gk20a.h" 18#define pr_fmt(fmt) "gpu_sema: " fmt
19
19#include <linux/dma-mapping.h> 20#include <linux/dma-mapping.h>
21#include <linux/highmem.h>
20#include <linux/slab.h> 22#include <linux/slab.h>
23
24#include <asm/pgtable.h>
25
21#include "gk20a.h" 26#include "gk20a.h"
22#include "mm_gk20a.h" 27#include "mm_gk20a.h"
28#include "semaphore_gk20a.h"
29
30#define __lock_sema_sea(s) \
31 do { \
32 mutex_lock(&s->sea_lock); \
33 } while (0)
23 34
24static const int SEMAPHORE_SIZE = 16; 35#define __unlock_sema_sea(s) \
36 do { \
37 mutex_unlock(&s->sea_lock); \
38 } while (0)
25 39
26struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct gk20a *g, 40/*
27 const char *unique_name, size_t capacity) 41 * Return the sema_sea pointer.
42 */
43struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g)
44{
45 return g->sema_sea;
46}
47
48static int __gk20a_semaphore_sea_grow(struct gk20a_semaphore_sea *sea)
49{
50 int ret = 0;
51 struct gk20a *gk20a = sea->gk20a;
52
53 __lock_sema_sea(sea);
54
55 ret = gk20a_gmmu_alloc_attr(gk20a, DMA_ATTR_NO_KERNEL_MAPPING,
56 PAGE_SIZE * SEMAPHORE_POOL_COUNT,
57 &sea->sea_mem);
58 if (ret)
59 goto out;
60
61 sea->ro_sg_table = sea->sea_mem.sgt;
62 sea->size = SEMAPHORE_POOL_COUNT;
63 sea->map_size = SEMAPHORE_POOL_COUNT * PAGE_SIZE;
64
65out:
66 __unlock_sema_sea(sea);
67 return ret;
68}
69
70/*
71 * Create the semaphore sea. Only create it once - subsequent calls to this will
72 * return the originally created sea pointer.
73 */
74struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *g)
75{
76 if (g->sema_sea)
77 return g->sema_sea;
78
79 g->sema_sea = kzalloc(sizeof(*g->sema_sea), GFP_KERNEL);
80 if (!g->sema_sea)
81 return NULL;
82
83 g->sema_sea->size = 0;
84 g->sema_sea->page_count = 0;
85 g->sema_sea->gk20a = g;
86 INIT_LIST_HEAD(&g->sema_sea->pool_list);
87 mutex_init(&g->sema_sea->sea_lock);
88
89 if (__gk20a_semaphore_sea_grow(g->sema_sea))
90 goto cleanup;
91
92 return g->sema_sea;
93
94cleanup:
95 kfree(g->sema_sea);
96 g->sema_sea = NULL;
97 return NULL;
98}
99
100static int __semaphore_bitmap_alloc(unsigned long *bitmap, unsigned long len)
101{
102 unsigned long idx = find_first_zero_bit(bitmap, len);
103
104 if (idx == len)
105 return -ENOSPC;
106
107 set_bit(idx, bitmap);
108
109 return (int)idx;
110}
111
112/*
113 * Allocate a pool from the sea.
114 */
115struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
116 struct gk20a_semaphore_sea *sea)
28{ 117{
29 struct gk20a_semaphore_pool *p; 118 struct gk20a_semaphore_pool *p;
119 unsigned long page_idx;
120 int err = 0;
121
30 p = kzalloc(sizeof(*p), GFP_KERNEL); 122 p = kzalloc(sizeof(*p), GFP_KERNEL);
31 if (!p) 123 if (!p)
32 return NULL; 124 return ERR_PTR(-ENOMEM);
125
126 __lock_sema_sea(sea);
127
128 page_idx = __semaphore_bitmap_alloc(sea->pools_alloced,
129 SEMAPHORE_POOL_COUNT);
130 if (page_idx < 0) {
131 err = page_idx;
132 goto fail;
133 }
33 134
135 p->page = sea->sea_mem.pages[page_idx];
136 p->ro_sg_table = sea->ro_sg_table;
137 p->page_idx = page_idx;
138 p->sema_sea = sea;
139 INIT_LIST_HEAD(&p->hw_semas);
34 kref_init(&p->ref); 140 kref_init(&p->ref);
35 INIT_LIST_HEAD(&p->maps); 141 mutex_init(&p->pool_lock);
36 mutex_init(&p->maps_mutex); 142
37 p->g = g; 143 sea->page_count++;
38 144 list_add(&p->pool_list_entry, &sea->pool_list);
39 /* Alloc one 4k page of semaphore per channel. */ 145 __unlock_sema_sea(sea);
40 if (gk20a_gmmu_alloc(g, roundup(capacity * SEMAPHORE_SIZE, PAGE_SIZE), 146
41 &p->mem))
42 goto clean_up;
43
44 /* Sacrifice one semaphore in the name of returning error codes. */
45 if (gk20a_allocator_init(&p->alloc, unique_name,
46 SEMAPHORE_SIZE, p->mem.size - SEMAPHORE_SIZE,
47 SEMAPHORE_SIZE))
48 goto clean_up;
49
50 gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->mem.cpu_va,
51 (u64)sg_dma_address(p->mem.sgt->sgl),
52 (u64)sg_phys(p->mem.sgt->sgl));
53 return p; 147 return p;
54 148
55clean_up: 149fail:
56 if (p->mem.size) 150 __unlock_sema_sea(sea);
57 gk20a_gmmu_free(p->g, &p->mem);
58 kfree(p); 151 kfree(p);
59 return NULL; 152 return ERR_PTR(err);
153}
154
155/*
156 * Map a pool into the passed vm's address space. This handles both the fixed
157 * global RO mapping and the non-fixed private RW mapping.
158 */
159int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p,
160 struct vm_gk20a *vm)
161{
162 int ents, err = 0;
163 u64 addr;
164
165 p->cpu_va = vmap(&p->page, 1, 0,
166 pgprot_writecombine(PAGE_KERNEL));
167
168 /* First do the RW mapping. */
169 p->rw_sg_table = kzalloc(sizeof(*p->rw_sg_table), GFP_KERNEL);
170 if (!p->rw_sg_table)
171 return -ENOMEM;
172
173 err = sg_alloc_table_from_pages(p->rw_sg_table, &p->page, 1, 0,
174 PAGE_SIZE, GFP_KERNEL);
175 if (err) {
176 err = -ENOMEM;
177 goto fail;
178 }
179
180 /* Add IOMMU mapping... */
181 ents = dma_map_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
182 DMA_BIDIRECTIONAL);
183 if (ents != 1) {
184 err = -ENOMEM;
185 goto fail_free_sgt;
186 }
187
188 /* Map into the GPU... Doesn't need to be fixed. */
189 p->gpu_va = gk20a_gmmu_map(vm, &p->rw_sg_table, PAGE_SIZE,
190 0, gk20a_mem_flag_none, false);
191 if (!p->gpu_va) {
192 err = -ENOMEM;
193 goto fail_unmap_sgt;
194 }
195
196 /*
197 * And now the global mapping. Take the sea lock so that we don't race
198 * with a concurrent remap.
199 */
200 __lock_sema_sea(p->sema_sea);
201
202 BUG_ON(p->mapped);
203 addr = gk20a_gmmu_fixed_map(vm, &p->sema_sea->ro_sg_table,
204 p->sema_sea->gpu_va, p->sema_sea->map_size,
205 0,
206 gk20a_mem_flag_read_only,
207 false);
208 if (!addr) {
209 err = -ENOMEM;
210 BUG();
211 goto fail_unlock;
212 }
213 p->gpu_va_ro = addr;
214 p->mapped = 1;
215
216 __unlock_sema_sea(p->sema_sea);
217
218 return 0;
219
220fail_unlock:
221 __unlock_sema_sea(p->sema_sea);
222fail_unmap_sgt:
223 dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
224 DMA_BIDIRECTIONAL);
225fail_free_sgt:
226 sg_free_table(p->rw_sg_table);
227fail:
228 kfree(p->rw_sg_table);
229 p->rw_sg_table = NULL;
230 return err;
60} 231}
61 232
233/*
234 * Unmap a semaphore_pool.
235 */
236void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p,
237 struct vm_gk20a *vm)
238{
239 struct gk20a_semaphore_int *hw_sema;
240
241 kunmap(p->cpu_va);
242
243 /* First the global RO mapping... */
244 __lock_sema_sea(p->sema_sea);
245 gk20a_gmmu_unmap(vm, p->gpu_va_ro,
246 p->sema_sea->map_size, gk20a_mem_flag_none);
247 p->ro_sg_table = NULL;
248 __unlock_sema_sea(p->sema_sea);
249
250 /* And now the private RW mapping. */
251 gk20a_gmmu_unmap(vm, p->gpu_va, PAGE_SIZE, gk20a_mem_flag_none);
252 p->gpu_va = 0;
253
254 dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
255 DMA_BIDIRECTIONAL);
256
257 sg_free_table(p->rw_sg_table);
258 kfree(p->rw_sg_table);
259 p->rw_sg_table = NULL;
260
261 gk20a_dbg_info("Unmapped sema-pool: idx = %d", p->page_idx);
262 list_for_each_entry(hw_sema, &p->hw_semas, hw_sema_list)
263 /*
264 * Make sure the mem addresses are all NULL so if this gets
265 * reused we will fault.
266 */
267 hw_sema->value = NULL;
268}
269
270/*
271 * Completely free a semaphore_pool. You should make sure this pool is not
272 * mapped otherwise there's going to be a memory leak.
273 */
62static void gk20a_semaphore_pool_free(struct kref *ref) 274static void gk20a_semaphore_pool_free(struct kref *ref)
63{ 275{
64 struct gk20a_semaphore_pool *p = 276 struct gk20a_semaphore_pool *p =
65 container_of(ref, struct gk20a_semaphore_pool, ref); 277 container_of(ref, struct gk20a_semaphore_pool, ref);
66 mutex_lock(&p->maps_mutex); 278 struct gk20a_semaphore_sea *s = p->sema_sea;
67 WARN_ON(!list_empty(&p->maps)); 279 struct gk20a_semaphore_int *hw_sema, *tmp;
68 mutex_unlock(&p->maps_mutex); 280
69 gk20a_gmmu_free(p->g, &p->mem); 281 WARN_ON(p->gpu_va || p->rw_sg_table || p->ro_sg_table);
70 gk20a_allocator_destroy(&p->alloc); 282
283 __lock_sema_sea(s);
284 list_del(&p->pool_list_entry);
285 clear_bit(p->page_idx, s->pools_alloced);
286 s->page_count--;
287 __unlock_sema_sea(s);
288
289 list_for_each_entry_safe(hw_sema, tmp, &p->hw_semas, hw_sema_list)
290 kfree(hw_sema);
291
71 kfree(p); 292 kfree(p);
72} 293}
73 294
74static void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p) 295void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p)
75{ 296{
76 kref_get(&p->ref); 297 kref_get(&p->ref);
77} 298}
@@ -81,104 +302,96 @@ void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p)
81 kref_put(&p->ref, gk20a_semaphore_pool_free); 302 kref_put(&p->ref, gk20a_semaphore_pool_free);
82} 303}
83 304
84static struct gk20a_semaphore_pool_map * 305/*
85gk20a_semaphore_pool_find_map_locked(struct gk20a_semaphore_pool *p, 306 * Get the address for a semaphore_pool - if global is true then return the
86 struct vm_gk20a *vm) 307 * global RO address instead of the RW address owned by the semaphore's VM.
308 */
309u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global)
87{ 310{
88 struct gk20a_semaphore_pool_map *map, *found = NULL; 311 if (!global)
89 list_for_each_entry(map, &p->maps, list) { 312 return p->gpu_va;
90 if (map->vm == vm) { 313
91 found = map; 314 return p->gpu_va_ro + (PAGE_SIZE * p->page_idx);
92 break;
93 }
94 }
95 return found;
96} 315}
97 316
98int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p, 317static int __gk20a_init_hw_sema(struct channel_gk20a *ch)
99 struct vm_gk20a *vm,
100 enum gk20a_mem_rw_flag rw_flag)
101{ 318{
102 struct gk20a_semaphore_pool_map *map; 319 int hw_sema_idx;
320 int ret = 0;
321 struct gk20a_semaphore_int *hw_sema;
322 struct gk20a_semaphore_pool *p = ch->vm->sema_pool;
103 323
104 map = kzalloc(sizeof(*map), GFP_KERNEL); 324 BUG_ON(!p);
105 if (!map)
106 return -ENOMEM;
107 map->vm = vm;
108 map->rw_flag = rw_flag;
109 map->gpu_va = gk20a_gmmu_map(vm, &p->mem.sgt, p->mem.size,
110 0/*uncached*/, rw_flag,
111 false);
112 if (!map->gpu_va) {
113 kfree(map);
114 return -ENOMEM;
115 }
116 gk20a_vm_get(vm);
117 325
118 mutex_lock(&p->maps_mutex); 326 mutex_lock(&p->pool_lock);
119 WARN_ON(gk20a_semaphore_pool_find_map_locked(p, vm));
120 list_add(&map->list, &p->maps);
121 mutex_unlock(&p->maps_mutex);
122 return 0;
123}
124 327
125void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p, 328 /* Find an available HW semaphore. */
126 struct vm_gk20a *vm) 329 hw_sema_idx = __semaphore_bitmap_alloc(p->semas_alloced,
127{ 330 PAGE_SIZE / SEMAPHORE_SIZE);
128 struct gk20a_semaphore_pool_map *map; 331 if (hw_sema_idx < 0) {
129 WARN_ON(!vm); 332 ret = hw_sema_idx;
130 333 goto fail;
131 mutex_lock(&p->maps_mutex);
132 map = gk20a_semaphore_pool_find_map_locked(p, vm);
133 if (map) {
134 gk20a_gmmu_unmap(vm, map->gpu_va, p->mem.size, map->rw_flag);
135 gk20a_vm_put(vm);
136 list_del(&map->list);
137 kfree(map);
138 } 334 }
139 mutex_unlock(&p->maps_mutex);
140}
141 335
142u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, 336 hw_sema = kzalloc(sizeof(struct gk20a_semaphore_int), GFP_KERNEL);
143 struct vm_gk20a *vm) 337 if (!hw_sema) {
144{ 338 ret = -ENOMEM;
145 struct gk20a_semaphore_pool_map *map; 339 goto fail_free_idx;
146 u64 gpu_va = 0; 340 }
147 341
148 mutex_lock(&p->maps_mutex); 342 ch->hw_sema = hw_sema;
149 map = gk20a_semaphore_pool_find_map_locked(p, vm); 343 hw_sema->ch = ch;
150 if (map) 344 hw_sema->p = p;
151 gpu_va = map->gpu_va; 345 hw_sema->idx = hw_sema_idx;
152 mutex_unlock(&p->maps_mutex); 346 hw_sema->offset = SEMAPHORE_SIZE * hw_sema_idx;
347 atomic_set(&hw_sema->next_value, 0);
348 hw_sema->value = p->cpu_va + hw_sema->offset;
349 writel(0, hw_sema->value);
153 350
154 return gpu_va; 351 list_add(&hw_sema->hw_sema_list, &p->hw_semas);
352
353 mutex_unlock(&p->pool_lock);
354
355 return 0;
356
357fail_free_idx:
358 clear_bit(hw_sema_idx, p->semas_alloced);
359fail:
360 mutex_unlock(&p->pool_lock);
361 return ret;
155} 362}
156 363
157struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool) 364/*
365 * Allocate a semaphore from the passed pool.
366 *
367 * Since semaphores are ref-counted there's no explicit free for external code
368 * to use. When the ref-count hits 0 the internal free will happen.
369 */
370struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch)
158{ 371{
159 struct gk20a_semaphore *s; 372 struct gk20a_semaphore *s;
373 int ret;
374
375 if (!ch->hw_sema) {
376 ret = __gk20a_init_hw_sema(ch);
377 if (ret)
378 return ERR_PTR(ret);
379 }
160 380
161 s = kzalloc(sizeof(*s), GFP_KERNEL); 381 s = kzalloc(sizeof(*s), GFP_KERNEL);
162 if (!s) 382 if (!s)
163 return NULL; 383 return NULL;
164 384
165 s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE); 385 kref_init(&s->ref);
166 if (!s->offset) { 386 s->hw_sema = ch->hw_sema;
167 gk20a_err(dev_from_gk20a(pool->g), 387 atomic_set(&s->value, 0);
168 "failed to allocate semaphore");
169 kfree(s);
170 return NULL;
171 }
172 388
173 gk20a_semaphore_pool_get(pool); 389 /*
174 s->pool = pool; 390 * Take a ref on the pool so that we can keep this pool alive for
391 * as long as this semaphore is alive.
392 */
393 gk20a_semaphore_pool_get(s->hw_sema->p);
175 394
176 kref_init(&s->ref);
177 /* Initially acquired. */
178 gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 0);
179 gk20a_dbg_info("created semaphore offset=%d, value=%d",
180 s->offset,
181 gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset));
182 return s; 395 return s;
183} 396}
184 397
@@ -187,8 +400,8 @@ static void gk20a_semaphore_free(struct kref *ref)
187 struct gk20a_semaphore *s = 400 struct gk20a_semaphore *s =
188 container_of(ref, struct gk20a_semaphore, ref); 401 container_of(ref, struct gk20a_semaphore, ref);
189 402
190 gk20a_bfree(&s->pool->alloc, s->offset); 403 gk20a_semaphore_pool_put(s->hw_sema->p);
191 gk20a_semaphore_pool_put(s->pool); 404
192 kfree(s); 405 kfree(s);
193} 406}
194 407
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
index 1f12e262..58081b56 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
@@ -15,17 +15,128 @@
15#define SEMAPHORE_GK20A_H 15#define SEMAPHORE_GK20A_H
16 16
17#include <linux/kref.h> 17#include <linux/kref.h>
18#include "gk20a_allocator.h" 18#include <linux/list.h>
19#include <linux/delay.h>
20
21#include "gk20a.h"
19#include "mm_gk20a.h" 22#include "mm_gk20a.h"
23#include "channel_gk20a.h"
24
25/*
26 * Max number of channels that can be used is 512. This of course needs to be
27 * fixed to be dynamic but still fast.
28 */
29#define SEMAPHORE_POOL_COUNT 512
30#define SEMAPHORE_SIZE 16
31#define SEMAPHORE_SEA_GROWTH_RATE 32
32
33struct gk20a_semaphore_sea;
34
35/*
36 * Underlying semaphore data structure. This semaphore can be shared amongst
37 * other semaphore instances.
38 */
39struct gk20a_semaphore_int {
40 int idx; /* Semaphore index. */
41 u32 offset; /* Offset into the pool. */
42 atomic_t next_value; /* Next available value. */
43 u32 *value; /* Current value (access w/ readl()). */
44 u32 nr_incrs; /* Number of increments programmed. */
45 struct gk20a_semaphore_pool *p; /* Pool that owns this sema. */
46 struct channel_gk20a *ch; /* Channel that owns this sema. */
47 struct list_head hw_sema_list; /* List of HW semaphores. */
48};
49
50/*
51 * A semaphore which the rest of the driver actually uses. This consists of a
52 * pointer to a real semaphore and a value to wait for. This allows one physical
53 * semaphore to be shared among an essentially infinite number of submits.
54 */
55struct gk20a_semaphore {
56 struct gk20a_semaphore_int *hw_sema;
20 57
21/* A memory pool for holding semaphores. */ 58 atomic_t value;
59 int incremented;
60
61 struct kref ref;
62};
63
64/*
65 * A semaphore pool. Each address space will own exactly one of these.
66 */
22struct gk20a_semaphore_pool { 67struct gk20a_semaphore_pool {
23 struct mem_desc mem; 68 struct page *page; /* This pool's page of memory */
24 struct gk20a *g; 69 struct list_head pool_list_entry; /* Node for list of pools. */
25 struct list_head maps; 70 void *cpu_va; /* CPU access to the pool. */
26 struct mutex maps_mutex; 71 u64 gpu_va; /* GPU access to the pool. */
72 u64 gpu_va_ro; /* GPU access to the pool. */
73 int page_idx; /* Index into sea bitmap. */
74
75 struct list_head hw_semas; /* List of HW semas. */
76 DECLARE_BITMAP(semas_alloced, PAGE_SIZE / SEMAPHORE_SIZE);
77
78 struct gk20a_semaphore_sea *sema_sea; /* Sea that owns this pool. */
79
80 struct mutex pool_lock;
81
82 /*
83 * This is the address space's personal RW table. Other channels will
84 * ultimately map this page as RO.
85 */
86 struct sg_table *rw_sg_table;
87
88 /*
89 * This is to keep track of whether the pool has had its sg_table
90 * updated during sea resizing.
91 */
92 struct sg_table *ro_sg_table;
93
94 int mapped;
95
96 /*
97 * Sometimes a channel can be released before other channels are
98 * done waiting on it. This ref count ensures that the pool doesn't
99 * go away until all semaphores using this pool are cleaned up first.
100 */
27 struct kref ref; 101 struct kref ref;
28 struct gk20a_allocator alloc; 102};
103
104/*
105 * A sea of semaphore pools. Each pool is owned by a single VM. Since multiple
106 * channels can share a VM, each channel gets its own HW semaphore from the
107 * pool. Channels then allocate regular semaphores - basically just a value that
108 * signifies when a particular job is done.
109 */
110struct gk20a_semaphore_sea {
111 struct list_head pool_list; /* List of pools in this sea. */
112 struct gk20a *gk20a;
113
114 size_t size; /* Number of pages available. */
115 u64 gpu_va; /* GPU virtual address of sema sea. */
116 u64 map_size; /* Size of the mapping. */
117
118 /*
119 * TODO:
120 * List of pages that we use to back the pools. The number of pages
121 * can grow dynamically since allocating 512 pages for all channels at
122 * once would be a tremendous waste.
123 */
124 int page_count; /* Pages allocated to pools. */
125
126 struct sg_table *ro_sg_table;
127 /*
128 struct page *pages[SEMAPHORE_POOL_COUNT];
129 */
130
131 struct mem_desc sea_mem;
132
133 /*
134 * Can't use a regular allocator here since the full range of pools are
135 * not always allocated. Instead just use a bitmap.
136 */
137 DECLARE_BITMAP(pools_alloced, SEMAPHORE_POOL_COUNT);
138
139 struct mutex sea_lock; /* Lock alloc/free calls. */
29}; 140};
30 141
31enum gk20a_mem_rw_flag { 142enum gk20a_mem_rw_flag {
@@ -34,64 +145,150 @@ enum gk20a_mem_rw_flag {
34 gk20a_mem_flag_write_only = 2, 145 gk20a_mem_flag_write_only = 2,
35}; 146};
36 147
37/* A semaphore pool can be mapped to multiple GPU address spaces. */ 148/*
38struct gk20a_semaphore_pool_map { 149 * Semaphore sea functions.
39 u64 gpu_va; 150 */
40 enum gk20a_mem_rw_flag rw_flag; 151struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *gk20a);
41 struct vm_gk20a *vm; 152int gk20a_semaphore_sea_map(struct gk20a_semaphore_pool *sea,
42 struct list_head list; 153 struct vm_gk20a *vm);
43}; 154void gk20a_semaphore_sea_unmap(struct gk20a_semaphore_pool *sea,
155 struct vm_gk20a *vm);
156struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g);
157
158/*
159 * Semaphore pool functions.
160 */
161struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
162 struct gk20a_semaphore_sea *sea);
163int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *pool,
164 struct vm_gk20a *vm);
165void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *pool,
166 struct vm_gk20a *vm);
167u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global);
168void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p);
169void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p);
170
171/*
172 * Semaphore functions.
173 */
174struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch);
175void gk20a_semaphore_put(struct gk20a_semaphore *s);
176void gk20a_semaphore_get(struct gk20a_semaphore *s);
177
178/*
179 * Return the address of a specific semaphore.
180 *
181 * Don't call this on a semaphore you don't own - the VA returned will make no
182 * sense in your specific channel's VM.
183 */
184static inline u64 gk20a_semaphore_gpu_rw_va(struct gk20a_semaphore *s)
185{
186 return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, false) +
187 s->hw_sema->offset;
188}
189
190/*
191 * Get the global RO address for the semaphore. Can be called on any semaphore
192 * regardless of whether you own it.
193 */
194static inline u64 gk20a_semaphore_gpu_ro_va(struct gk20a_semaphore *s)
195{
196 return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, true) +
197 s->hw_sema->offset;
198}
199
200static inline u64 gk20a_hw_sema_addr(struct gk20a_semaphore_int *hw_sema)
201{
202 return __gk20a_semaphore_pool_gpu_va(hw_sema->p, true) +
203 hw_sema->offset;
204}
205
206/*
207 * TODO: handle wrap around... Hmm, how to do this?
208 */
209static inline bool gk20a_semaphore_is_released(struct gk20a_semaphore *s)
210{
211 u32 sema_val = readl(s->hw_sema->value);
44 212
45/* A semaphore that lives inside a semaphore pool. */
46struct gk20a_semaphore {
47 struct gk20a_semaphore_pool *pool;
48 /* 213 /*
49 * value exists within the pool's memory at the specified offset. 214 * If the underlying semaphore value is greater than or equal to
50 * 0=acquired, 1=released. 215 * the value of the semaphore then the semaphore has been signaled
216 * (a.k.a. released).
51 */ 217 */
52 u32 offset; /* byte offset within pool */ 218 return sema_val >= atomic_read(&s->value);
53 struct kref ref; 219}
54};
55 220
56/* Create a semaphore pool that can hold at most 'capacity' semaphores. */ 221static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s)
57struct gk20a_semaphore_pool *
58gk20a_semaphore_pool_alloc(struct gk20a *, const char *unique_name,
59 size_t capacity);
60void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *);
61int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *,
62 struct vm_gk20a *,
63 enum gk20a_mem_rw_flag);
64void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *,
65 struct vm_gk20a *);
66u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *,
67 struct vm_gk20a *);
68
69/* Allocate a semaphore from the semaphore pool. The newly allocated
70 * semaphore will be in acquired state (value=0). */
71struct gk20a_semaphore *
72gk20a_semaphore_alloc(struct gk20a_semaphore_pool *);
73void gk20a_semaphore_put(struct gk20a_semaphore *);
74void gk20a_semaphore_get(struct gk20a_semaphore *);
75
76static inline u64 gk20a_semaphore_gpu_va(struct gk20a_semaphore *s,
77 struct vm_gk20a *vm)
78{ 222{
79 return gk20a_semaphore_pool_gpu_va(s->pool, vm) + s->offset; 223 return !gk20a_semaphore_is_released(s);
80} 224}
81 225
82static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s) 226/*
227 * Read the underlying value from a semaphore.
228 */
229static inline u32 gk20a_semaphore_read(struct gk20a_semaphore *s)
83{ 230{
84 u32 v = gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset); 231 return readl(s->hw_sema->value);
232}
85 233
86 /* When often block on value reaching a certain threshold. We must make 234static inline u32 gk20a_semaphore_get_value(struct gk20a_semaphore *s)
87 * sure that if we get unblocked, we haven't read anything too early. */ 235{
88 smp_rmb(); 236 return atomic_read(&s->value);
89 return v == 0;
90} 237}
91 238
239static inline u32 gk20a_semaphore_next_value(struct gk20a_semaphore *s)
240{
241 return atomic_read(&s->hw_sema->next_value);
242}
243
244/*
245 * Note - if you call this then any prior semaphores will also be released.
246 */
92static inline void gk20a_semaphore_release(struct gk20a_semaphore *s) 247static inline void gk20a_semaphore_release(struct gk20a_semaphore *s)
93{ 248{
94 smp_wmb(); 249 u32 current_val;
95 gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 1); 250 u32 val = gk20a_semaphore_get_value(s);
251 int attempts = 0;
252
253 /*
254 * Wait until the sema value is 1 less than the write value. That
255 * way this function is essentially an increment.
256 *
257 * TODO: tune the wait a little better.
258 */
259 while ((current_val = gk20a_semaphore_read(s)) < (val - 1)) {
260 msleep(100);
261 attempts += 1;
262 if (attempts > 100) {
263 WARN(1, "Stall on sema release!");
264 return;
265 }
266 }
267
268 /*
269 * If the semaphore has already passed the value we would write then
270 * this is really just a NO-OP.
271 */
272 if (current_val >= val)
273 return;
274
275 writel(val, s->hw_sema->value);
276}
277
278/*
279 * Configure a software based increment on this semaphore. This is useful for
280 * when we want the GPU to wait on a SW event before processing a channel.
281 * Another way to describe this is when the GPU needs to wait on a SW pre-fence.
282 * The pre-fence signals SW which in turn calls gk20a_semaphore_release() which
283 * then allows the GPU to continue.
284 *
285 * Also used to prep a semaphore for an INCR by the GPU.
286 */
287static inline void gk20a_semaphore_incr(struct gk20a_semaphore *s)
288{
289 BUG_ON(s->incremented);
290
291 atomic_set(&s->value, atomic_add_return(1, &s->hw_sema->next_value));
292 s->incremented = 1;
96} 293}
97#endif 294#endif