-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c         3
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.h         2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c  233
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fence_gk20a.c           4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h                 5
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c             82
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h              5
-rw-r--r--  drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c     435
-rw-r--r--  drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h     303
9 files changed, 847 insertions, 225 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 3f9b0432..6c7ff551 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1002,6 +1002,9 @@ unbind:
1002 1002
1003 mutex_unlock(&g->dbg_sessions_lock); 1003 mutex_unlock(&g->dbg_sessions_lock);
1004 1004
1005 /* Make sure that when the ch is re-opened it will get a new HW sema. */
1006 ch->hw_sema = NULL;
1007
1005 /* make sure we catch accesses of unopened channels in case 1008 /* make sure we catch accesses of unopened channels in case
1006 * there's non-refcounted channel pointers hanging around */ 1009 * there's non-refcounted channel pointers hanging around */
1007 ch->g = NULL; 1010 ch->g = NULL;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index acd272b4..c5a1bd24 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -108,6 +108,8 @@ struct channel_gk20a {
108 atomic_t ref_count; 108 atomic_t ref_count;
109 wait_queue_head_t ref_count_dec_wq; 109 wait_queue_head_t ref_count_dec_wq;
110 110
111 struct gk20a_semaphore_int *hw_sema;
112
111 int hw_chid; 113 int hw_chid;
112 bool wdt_enabled; 114 bool wdt_enabled;
113 bool bound; 115 bool bound;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index d2d8c094..9c8911e9 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -424,28 +424,52 @@ static void gk20a_channel_semaphore_launcher(
424} 424}
425#endif 425#endif
426 426
427static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd, 427static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
428 u64 sema, u32 payload, bool acquire, bool wfi) 428 struct gk20a_semaphore *s, struct priv_cmd_entry *cmd,
429 int cmd_size, bool acquire, bool wfi)
429{ 430{
430 u32 off = cmd->off; 431 u32 off = cmd->off;
432 u64 va;
433
434 /*
435 * RO for acquire (since we just need to read the mem) and RW for
436 * release since we will need to write back to the semaphore memory.
437 */
438 va = acquire ? gk20a_semaphore_gpu_ro_va(s) :
439 gk20a_semaphore_gpu_rw_va(s);
440
441 /*
442 * If the op is not an acquire (so therefore a release) we should
443 * incr the underlying sema next_value.
444 */
445 if (!acquire)
446 gk20a_semaphore_incr(s);
447
431 /* semaphore_a */ 448 /* semaphore_a */
432 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004); 449 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004);
433 /* offset_upper */ 450 /* offset_upper */
434 gk20a_mem_wr32(g, cmd->mem, off++, (sema >> 32) & 0xff); 451 gk20a_mem_wr32(g, cmd->mem, off++, (va >> 32) & 0xff);
435 /* semaphore_b */ 452 /* semaphore_b */
436 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005); 453 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005);
437 /* offset */ 454 /* offset */
438 gk20a_mem_wr32(g, cmd->mem, off++, sema & 0xffffffff); 455 gk20a_mem_wr32(g, cmd->mem, off++, va & 0xffffffff);
439 /* semaphore_c */ 456
440 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
441 /* payload */
442 gk20a_mem_wr32(g, cmd->mem, off++, payload);
443 if (acquire) { 457 if (acquire) {
458 /* semaphore_c */
459 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
460 /* payload */
461 gk20a_mem_wr32(g, cmd->mem, off++,
462 gk20a_semaphore_get_value(s));
444 /* semaphore_d */ 463 /* semaphore_d */
445 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007); 464 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
446 /* operation: acq_geq, switch_en */ 465 /* operation: acq_geq, switch_en */
447 gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12)); 466 gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12));
448 } else { 467 } else {
468 /* semaphore_c */
469 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
470 /* payload */
471 gk20a_mem_wr32(g, cmd->mem, off++,
472 gk20a_semaphore_get_value(s));
449 /* semaphore_d */ 473 /* semaphore_d */
450 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007); 474 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
451 /* operation: release, wfi */ 475 /* operation: release, wfi */
@@ -456,7 +480,6 @@ static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd,
456 /* ignored */ 480 /* ignored */
457 gk20a_mem_wr32(g, cmd->mem, off++, 0); 481 gk20a_mem_wr32(g, cmd->mem, off++, 0);
458 } 482 }
459 return off - cmd->off;
460} 483}
461 484
462static int gk20a_channel_semaphore_wait_syncpt( 485static int gk20a_channel_semaphore_wait_syncpt(
@@ -471,6 +494,76 @@ static int gk20a_channel_semaphore_wait_syncpt(
471 return -ENODEV; 494 return -ENODEV;
472} 495}
473 496
497/*
498 * UGHHH - the underlying sync_fence implementation changes from 3.10 to 3.18.
499 * But since there's no API for getting the underlying sync_pts we have to do
500 * some conditional compilation.
501 */
502#ifdef CONFIG_SYNC
503static struct gk20a_semaphore *sema_from_sync_fence(struct sync_fence *f)
504{
505#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
506 struct sync_pt *pt;
507
508 pt = list_first_entry(&f->pt_list_head, struct sync_pt, pt_list);
509 return gk20a_sync_pt_inst_get_sema(pt);
510#else
511 return gk20a_sync_pt_inst_get_sema(f->cbs[0].sync_pt);
512#endif
513}
514
515/*
516 * Attempt a fast path for waiting on a sync_fence. Basically if the passed
517 * sync_fence is backed by a gk20a_semaphore then there's no reason to go
518 * through the rigmarole of setting up a separate semaphore which waits on an
519 * interrupt from the GPU and then triggers a worker thread to execute a SW
520 * based semaphore release. Instead just have the GPU wait on the same semaphore
521 * that is going to be incremented by the GPU.
522 *
523 * This function returns one of two values: 0 on success, or -ENODEV if the
524 * fast path cannot be taken because the fence is not backed by a GPU
525 * semaphore.
526 */
527static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
528 struct sync_fence *fence,
529 struct priv_cmd_entry **wait_cmd,
530 struct gk20a_semaphore **fp_sema)
531{
532 struct gk20a_semaphore *sema;
533 int err;
534
535 if (!gk20a_is_sema_backed_sync_fence(fence))
536 return -ENODEV;
537
538 sema = sema_from_sync_fence(fence);
539
540 /*
541 * If there's no underlying sema then that means the fence has already
542 * signaled.
543 */
544 if (!sema) {
545 *fp_sema = NULL;
546 return 0;
547 }
548
549 err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
550 if (err)
551 return err;
552
553 gk20a_semaphore_get(sema);
554 BUG_ON(!atomic_read(&sema->value));
555 add_sema_cmd(c->g, c, sema, *wait_cmd, 8, true, false);
556
557 /*
558 * Make sure that gk20a_channel_semaphore_wait_fd() can create another
559 * fence with the underlying semaphore.
560 */
561 *fp_sema = sema;
562
563 return 0;
564}
565#endif
566
474static int gk20a_channel_semaphore_wait_fd( 567static int gk20a_channel_semaphore_wait_fd(
475 struct gk20a_channel_sync *s, int fd, 568 struct gk20a_channel_sync *s, int fd,
476 struct priv_cmd_entry **entry, 569 struct priv_cmd_entry **entry,
@@ -480,69 +573,107 @@ static int gk20a_channel_semaphore_wait_fd(
480 container_of(s, struct gk20a_channel_semaphore, ops); 573 container_of(s, struct gk20a_channel_semaphore, ops);
481 struct channel_gk20a *c = sema->c; 574 struct channel_gk20a *c = sema->c;
482#ifdef CONFIG_SYNC 575#ifdef CONFIG_SYNC
576 struct gk20a_semaphore *fp_sema;
483 struct sync_fence *sync_fence; 577 struct sync_fence *sync_fence;
484 struct priv_cmd_entry *wait_cmd = NULL; 578 struct priv_cmd_entry *wait_cmd = NULL;
485 struct wait_fence_work *w; 579 struct wait_fence_work *w = NULL;
486 int written; 580 int err, ret, status;
487 int err, ret;
488 u64 va;
489 581
490 sync_fence = gk20a_sync_fence_fdget(fd); 582 sync_fence = gk20a_sync_fence_fdget(fd);
491 if (!sync_fence) 583 if (!sync_fence)
492 return -EINVAL; 584 return -EINVAL;
493 585
586 ret = __semaphore_wait_fd_fast_path(c, sync_fence, &wait_cmd, &fp_sema);
587 if (ret == 0) {
588 if (fp_sema)
589 *fence = gk20a_fence_from_semaphore(sema->timeline,
590 fp_sema,
591 &c->semaphore_wq,
592 NULL, false);
593 else
594 /*
595 * Allocate an empty fence. It will instantly return
596 * from gk20a_fence_wait().
597 */
598 *fence = gk20a_alloc_fence(NULL, NULL, false);
599
600 sync_fence_put(sync_fence);
601 goto skip_slow_path;
602 }
603
604 /* If the fence has signaled there is no reason to wait on it. */
605#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
606 status = sync_fence->status;
607#else
608 status = atomic_read(&sync_fence->status);
609#endif
610 if (status) {
611 sync_fence_put(sync_fence);
612 goto skip_slow_path;
613 }
614
615 err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
616 if (err) {
617 gk20a_err(dev_from_gk20a(c->g),
618 "not enough priv cmd buffer space");
619 sync_fence_put(sync_fence);
620 return -ENOMEM;
621 }
622
494 w = kzalloc(sizeof(*w), GFP_KERNEL); 623 w = kzalloc(sizeof(*w), GFP_KERNEL);
495 if (!w) { 624 if (!w) {
496 err = -ENOMEM; 625 err = -ENOMEM;
497 goto fail; 626 goto fail_free_cmdbuf;
498 } 627 }
628
499 sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher); 629 sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
500 w->ch = c; 630 w->ch = c;
501 w->sema = gk20a_semaphore_alloc(sema->pool); 631 w->sema = gk20a_semaphore_alloc(c);
502 if (!w->sema) { 632 if (!w->sema) {
503 gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores"); 633 gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");
504 err = -ENOMEM; 634 err = -ENOMEM;
505 goto fail; 635 goto fail_free_worker;
506 } 636 }
507 637
508 /* worker takes one reference */ 638 /* worker takes one reference */
509 gk20a_semaphore_get(w->sema); 639 gk20a_semaphore_get(w->sema);
640 gk20a_semaphore_incr(w->sema);
510 641
511 err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd); 642 /* GPU unblocked when the semaphore value increments. */
512 if (err) { 643 add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
513 gk20a_err(dev_from_gk20a(c->g),
514 "not enough priv cmd buffer space");
515 goto fail;
516 }
517
518 va = gk20a_semaphore_gpu_va(w->sema, c->vm);
519 /* GPU unblocked when when the semaphore value becomes 1. */
520 written = add_sema_cmd(c->g, wait_cmd, va, 1, true, false);
521 644
522 WARN_ON(written != wait_cmd->size);
523 ret = sync_fence_wait_async(sync_fence, &w->waiter); 645 ret = sync_fence_wait_async(sync_fence, &w->waiter);
524 646
525 /* 647 /*
526 * If the sync_fence has already signaled then the above async_wait 648 * If the sync_fence has already signaled then the above async_wait
527 * will never trigger. This causes the semaphore release op to never 649 * will never trigger. This causes the semaphore release op to never
528 * happen which, in turn, hangs the GPU. That's bad. So let's just 650 * happen which, in turn, hangs the GPU. That's bad. So let's just
529 * do the semaphore_release right now. 651 * do the gk20a_semaphore_release() right now.
530 */ 652 */
531 if (ret == 1) 653 if (ret == 1) {
654 sync_fence_put(sync_fence);
532 gk20a_semaphore_release(w->sema); 655 gk20a_semaphore_release(w->sema);
656 gk20a_semaphore_put(w->sema);
657 }
533 658
534 /* XXX - this fixes an actual bug, we need to hold a ref to this 659 /* XXX - this fixes an actual bug, we need to hold a ref to this
535 semaphore while the job is in flight. */ 660 semaphore while the job is in flight. */
536 *fence = gk20a_fence_from_semaphore(sema->timeline, w->sema, 661 *fence = gk20a_fence_from_semaphore(sema->timeline, w->sema,
537 &c->semaphore_wq, 662 &c->semaphore_wq,
538 NULL, false); 663 NULL, false);
664
665skip_slow_path:
539 *entry = wait_cmd; 666 *entry = wait_cmd;
540 return 0; 667 return 0;
541fail: 668
669fail_free_worker:
542 if (w && w->sema) 670 if (w && w->sema)
543 gk20a_semaphore_put(w->sema); 671 gk20a_semaphore_put(w->sema);
544 kfree(w); 672 kfree(w);
545 sync_fence_put(sync_fence); 673 sync_fence_put(sync_fence);
674fail_free_cmdbuf:
675 if (wait_cmd)
676 gk20a_free_priv_cmdbuf(c, wait_cmd);
546 return err; 677 return err;
547#else 678#else
548 gk20a_err(dev_from_gk20a(c->g), 679 gk20a_err(dev_from_gk20a(c->g),
@@ -558,9 +689,7 @@ static int __gk20a_channel_semaphore_incr(
558 struct gk20a_fence **fence, 689 struct gk20a_fence **fence,
559 bool need_sync_fence) 690 bool need_sync_fence)
560{ 691{
561 u64 va;
562 int incr_cmd_size; 692 int incr_cmd_size;
563 int written;
564 struct priv_cmd_entry *incr_cmd = NULL; 693 struct priv_cmd_entry *incr_cmd = NULL;
565 struct gk20a_channel_semaphore *sp = 694 struct gk20a_channel_semaphore *sp =
566 container_of(s, struct gk20a_channel_semaphore, ops); 695 container_of(s, struct gk20a_channel_semaphore, ops);
@@ -568,7 +697,7 @@ static int __gk20a_channel_semaphore_incr(
568 struct gk20a_semaphore *semaphore; 697 struct gk20a_semaphore *semaphore;
569 int err = 0; 698 int err = 0;
570 699
571 semaphore = gk20a_semaphore_alloc(sp->pool); 700 semaphore = gk20a_semaphore_alloc(c);
572 if (!semaphore) { 701 if (!semaphore) {
573 gk20a_err(dev_from_gk20a(c->g), 702 gk20a_err(dev_from_gk20a(c->g),
574 "ran out of semaphores"); 703 "ran out of semaphores");
@@ -585,9 +714,7 @@ static int __gk20a_channel_semaphore_incr(
585 } 714 }
586 715
587 /* Release the completion semaphore. */ 716 /* Release the completion semaphore. */
588 va = gk20a_semaphore_gpu_va(semaphore, c->vm); 717 add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
589 written = add_sema_cmd(c->g, incr_cmd, va, 1, false, wfi_cmd);
590 WARN_ON(written != incr_cmd_size);
591 718
592 *fence = gk20a_fence_from_semaphore(sp->timeline, semaphore, 719 *fence = gk20a_fence_from_semaphore(sp->timeline, semaphore,
593 &c->semaphore_wq, 720 &c->semaphore_wq,
@@ -615,8 +742,10 @@ static int gk20a_channel_semaphore_incr(
615{ 742{
616 /* Don't put wfi cmd to this one since we're not returning 743 /* Don't put wfi cmd to this one since we're not returning
617 * a fence to user space. */ 744 * a fence to user space. */
618 return __gk20a_channel_semaphore_incr(s, false /* no wfi */, 745 return __gk20a_channel_semaphore_incr(s,
619 NULL, entry, fence, need_sync_fence); 746 false /* no wfi */,
747 NULL,
748 entry, fence, need_sync_fence);
620} 749}
621 750
622static int gk20a_channel_semaphore_incr_user( 751static int gk20a_channel_semaphore_incr_user(
@@ -679,17 +808,16 @@ static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s)
679 container_of(s, struct gk20a_channel_semaphore, ops); 808 container_of(s, struct gk20a_channel_semaphore, ops);
680 if (sema->timeline) 809 if (sema->timeline)
681 gk20a_sync_timeline_destroy(sema->timeline); 810 gk20a_sync_timeline_destroy(sema->timeline);
682 if (sema->pool) { 811
683 gk20a_semaphore_pool_unmap(sema->pool, sema->c->vm); 812 /* The sema pool is cleaned up by the VM destroy. */
684 gk20a_semaphore_pool_put(sema->pool); 813 sema->pool = NULL;
685 } 814
686 kfree(sema); 815 kfree(sema);
687} 816}
688 817
689static struct gk20a_channel_sync * 818static struct gk20a_channel_sync *
690gk20a_channel_semaphore_create(struct channel_gk20a *c) 819gk20a_channel_semaphore_create(struct channel_gk20a *c)
691{ 820{
692 int err;
693 int asid = -1; 821 int asid = -1;
694 struct gk20a_channel_semaphore *sema; 822 struct gk20a_channel_semaphore *sema;
695 char pool_name[20]; 823 char pool_name[20];
@@ -706,21 +834,15 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
706 asid = c->vm->as_share->id; 834 asid = c->vm->as_share->id;
707 835
708 sprintf(pool_name, "semaphore_pool-%d", c->hw_chid); 836 sprintf(pool_name, "semaphore_pool-%d", c->hw_chid);
709 sema->pool = gk20a_semaphore_pool_alloc(c->g, pool_name, 1024); 837 sema->pool = c->vm->sema_pool;
710 if (!sema->pool)
711 goto clean_up;
712
713 /* Map the semaphore pool to the channel vm. Map as read-write to the
714 * owner channel (all other channels should map as read only!). */
715 err = gk20a_semaphore_pool_map(sema->pool, c->vm, gk20a_mem_flag_none);
716 if (err)
717 goto clean_up;
718 838
719#ifdef CONFIG_SYNC 839#ifdef CONFIG_SYNC
720 sema->timeline = gk20a_sync_timeline_create( 840 sema->timeline = gk20a_sync_timeline_create(
721 "gk20a_ch%d_as%d", c->hw_chid, asid); 841 "gk20a_ch%d_as%d", c->hw_chid, asid);
722 if (!sema->timeline) 842 if (!sema->timeline) {
723 goto clean_up; 843 gk20a_channel_semaphore_destroy(&sema->ops);
844 return NULL;
845 }
724#endif 846#endif
725 atomic_set(&sema->ops.refcount, 0); 847 atomic_set(&sema->ops.refcount, 0);
726 sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt; 848 sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt;
@@ -734,9 +856,6 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
734 sema->ops.destroy = gk20a_channel_semaphore_destroy; 856 sema->ops.destroy = gk20a_channel_semaphore_destroy;
735 857
736 return &sema->ops; 858 return &sema->ops;
737clean_up:
738 gk20a_channel_semaphore_destroy(&sema->ops);
739 return NULL;
740} 859}
741 860
742void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync) 861void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync)
diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
index 23522882..fbbaa2a7 100644
--- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
@@ -155,8 +155,8 @@ struct gk20a_fence *gk20a_fence_from_semaphore(
155 155
156#ifdef CONFIG_SYNC 156#ifdef CONFIG_SYNC
157 sync_fence = gk20a_sync_fence_create(timeline, semaphore, 157 sync_fence = gk20a_sync_fence_create(timeline, semaphore,
158 dependency, "f-gk20a-0x%04x", 158 dependency, "f-gk20a-0x%04x",
159 semaphore->offset & 0xffff); 159 gk20a_semaphore_gpu_ro_va(semaphore));
160 if (!sync_fence) 160 if (!sync_fence)
161 return NULL; 161 return NULL;
162#endif 162#endif
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 5ab09ac3..7bd9775e 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -738,6 +738,11 @@ struct gk20a {
738#endif 738#endif
739 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info; 739 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info;
740 740
741 /*
742 * A group of semaphore pools. One for each channel.
743 */
744 struct gk20a_semaphore_sea *sema_sea;
745
741 /* held while manipulating # of debug/profiler sessions present */ 746 /* held while manipulating # of debug/profiler sessions present */
742 /* also prevents debug sessions from attaching until released */ 747 /* also prevents debug sessions from attaching until released */
743 struct mutex dbg_sessions_lock; 748 struct mutex dbg_sessions_lock;
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 3b21e843..9299266f 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -3213,6 +3213,17 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
3213 struct rb_node *node; 3213 struct rb_node *node;
3214 3214
3215 gk20a_dbg_fn(""); 3215 gk20a_dbg_fn("");
3216
3217 /*
3218 * Do this outside of the update_gmmu_lock since unmapping the semaphore
3219 * pool involves unmapping a GMMU mapping which means acquiring the
3220 * update_gmmu_lock.
3221 */
3222 if (!gk20a_platform_has_syncpoints(gk20a_from_vm(vm)->dev)) {
3223 gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
3224 gk20a_semaphore_pool_put(vm->sema_pool);
3225 }
3226
3216 mutex_lock(&vm->update_gmmu_lock); 3227 mutex_lock(&vm->update_gmmu_lock);
3217 3228
3218 /* TBD: add a flag here for the unmap code to recognize teardown 3229 /* TBD: add a flag here for the unmap code to recognize teardown
@@ -3286,6 +3297,64 @@ const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
3286 {.update_entry = NULL} 3297 {.update_entry = NULL}
3287}; 3298};
3288 3299
3300/*
3301 * Initialize a semaphore pool. Just return successfully if we do not need
3302 * semaphores (i.e. when sync-pts are active).
3303 */
3304int gk20a_init_sema_pool(struct vm_gk20a *vm)
3305{
3306 struct gk20a_semaphore_sea *sema_sea;
3307 struct mm_gk20a *mm = vm->mm;
3308 struct gk20a *g = mm->g;
3309 int err;
3310
3311 /*
3312 * Don't waste the memory on semaphores if we don't need them.
3313 */
3314 if (gk20a_platform_has_syncpoints(g->dev))
3315 return 0;
3316
3317 if (vm->sema_pool)
3318 return 0;
3319
3320 sema_sea = gk20a_semaphore_sea_create(g);
3321 if (!sema_sea)
3322 return -ENOMEM;
3323
3324 vm->sema_pool = gk20a_semaphore_pool_alloc(sema_sea);
3325 if (!vm->sema_pool) {
3326 gk20a_vm_put(vm);
3327 return -ENOMEM;
3328 }
3329
3330 /*
3331 * Allocate a chunk of GPU VA space for mapping the semaphores. We will
3332 * do a fixed alloc in the kernel VM so that all channels have the same
3333 * RO address range for the semaphores.
3334 *
3335 * !!! TODO: cleanup.
3336 */
3337 sema_sea->gpu_va = gk20a_balloc_fixed(&vm->vma[gmmu_page_size_kernel],
3338 vm->va_limit -
3339 mm->channel.kernel_size,
3340 512 * PAGE_SIZE);
3341 if (!sema_sea->gpu_va) {
3342 gk20a_bfree(&vm->vma[gmmu_page_size_small], sema_sea->gpu_va);
3343 gk20a_vm_put(vm);
3344 return -ENOMEM;
3345 }
3346
3347 err = gk20a_semaphore_pool_map(vm->sema_pool, vm);
3348 if (err) {
3349 gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
3350 gk20a_bfree(&vm->vma[gmmu_page_size_small],
3351 vm->sema_pool->gpu_va);
3352 gk20a_vm_put(vm);
3353 }
3354
3355 return 0;
3356}
3357
3289int gk20a_init_vm(struct mm_gk20a *mm, 3358int gk20a_init_vm(struct mm_gk20a *mm,
3290 struct vm_gk20a *vm, 3359 struct vm_gk20a *vm,
3291 u32 big_page_size, 3360 u32 big_page_size,
@@ -3317,9 +3386,7 @@ int gk20a_init_vm(struct mm_gk20a *mm,
3317 vm->big_pages = big_pages; 3386 vm->big_pages = big_pages;
3318 3387
3319 vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big]; 3388 vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
3320
3321 vm->userspace_managed = userspace_managed; 3389 vm->userspace_managed = userspace_managed;
3322
3323 vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g, 3390 vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g,
3324 vm->big_page_size); 3391 vm->big_page_size);
3325 3392
@@ -3465,6 +3532,17 @@ int gk20a_init_vm(struct mm_gk20a *mm,
3465 kref_init(&vm->ref); 3532 kref_init(&vm->ref);
3466 INIT_LIST_HEAD(&vm->reserved_va_list); 3533 INIT_LIST_HEAD(&vm->reserved_va_list);
3467 3534
3535 /*
3536 * This is only necessary for channel address spaces. The best way to
3537 * distinguish channel address spaces from other address spaces is by
3538 * size - if the address space is 4GB or less, it's not a channel.
3539 */
3540 if (vm->va_limit > SZ_4G) {
3541 err = gk20a_init_sema_pool(vm);
3542 if (err)
3543 goto clean_up_big_allocator;
3544 }
3545
3468 return 0; 3546 return 0;
3469 3547
3470clean_up_big_allocator: 3548clean_up_big_allocator:
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index db74a5ca..7bb4d011 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -287,6 +287,11 @@ struct vm_gk20a {
287 /* if non-NULL, kref_put will use this batch when 287 /* if non-NULL, kref_put will use this batch when
288 unmapping. Must hold vm->update_gmmu_lock. */ 288 unmapping. Must hold vm->update_gmmu_lock. */
289 struct vm_gk20a_mapping_batch *kref_put_batch; 289 struct vm_gk20a_mapping_batch *kref_put_batch;
290
291 /*
292 * Each address space needs to have a semaphore pool.
293 */
294 struct gk20a_semaphore_pool *sema_pool;
290}; 295};
291 296
292struct gk20a; 297struct gk20a;
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
index 3b17bfcb..aa375b24 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
@@ -15,63 +15,284 @@
15 * more details. 15 * more details.
16 */ 16 */
17 17
18#include "semaphore_gk20a.h" 18#define pr_fmt(fmt) "gpu_sema: " fmt
19
19#include <linux/dma-mapping.h> 20#include <linux/dma-mapping.h>
21#include <linux/highmem.h>
20#include <linux/slab.h> 22#include <linux/slab.h>
23
24#include <asm/pgtable.h>
25
21#include "gk20a.h" 26#include "gk20a.h"
22#include "mm_gk20a.h" 27#include "mm_gk20a.h"
28#include "semaphore_gk20a.h"
29
30#define __lock_sema_sea(s) \
31 do { \
32 mutex_lock(&s->sea_lock); \
33 } while (0)
23 34
24static const int SEMAPHORE_SIZE = 16; 35#define __unlock_sema_sea(s) \
36 do { \
37 mutex_unlock(&s->sea_lock); \
38 } while (0)
25 39
26struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct gk20a *g, 40/*
27 const char *unique_name, size_t capacity) 41 * Return the sema_sea pointer.
42 */
43struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g)
44{
45 return g->sema_sea;
46}
47
48static int __gk20a_semaphore_sea_grow(struct gk20a_semaphore_sea *sea)
49{
50 int ret = 0;
51 struct gk20a *gk20a = sea->gk20a;
52
53 __lock_sema_sea(sea);
54
55 ret = gk20a_gmmu_alloc_attr(gk20a, DMA_ATTR_NO_KERNEL_MAPPING,
56 PAGE_SIZE * SEMAPHORE_POOL_COUNT,
57 &sea->sea_mem);
58 if (ret)
59 goto out;
60
61 sea->ro_sg_table = sea->sea_mem.sgt;
62 sea->size = SEMAPHORE_POOL_COUNT;
63 sea->map_size = SEMAPHORE_POOL_COUNT * PAGE_SIZE;
64
65out:
66 __unlock_sema_sea(sea);
67 return ret;
68}
69
70/*
71 * Create the semaphore sea. Only create it once - subsequent calls to this will
72 * return the originally created sea pointer.
73 */
74struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *g)
75{
76 if (g->sema_sea)
77 return g->sema_sea;
78
79 g->sema_sea = kzalloc(sizeof(*g->sema_sea), GFP_KERNEL);
80 if (!g->sema_sea)
81 return NULL;
82
83 g->sema_sea->size = 0;
84 g->sema_sea->page_count = 0;
85 g->sema_sea->gk20a = g;
86 INIT_LIST_HEAD(&g->sema_sea->pool_list);
87 mutex_init(&g->sema_sea->sea_lock);
88
89 if (__gk20a_semaphore_sea_grow(g->sema_sea))
90 goto cleanup;
91
92 return g->sema_sea;
93
94cleanup:
95 kfree(g->sema_sea);
96 g->sema_sea = NULL;
97 return NULL;
98}
99
100static int __semaphore_bitmap_alloc(unsigned long *bitmap, unsigned long len)
101{
102 unsigned long idx = find_first_zero_bit(bitmap, len);
103
104 if (idx == len)
105 return -ENOSPC;
106
107 set_bit(idx, bitmap);
108
109 return (int)idx;
110}
111
112/*
113 * Allocate a pool from the sea.
114 */
115struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
116 struct gk20a_semaphore_sea *sea)
28{ 117{
29 struct gk20a_semaphore_pool *p; 118 struct gk20a_semaphore_pool *p;
119 unsigned long page_idx;
120 int err = 0;
121
30 p = kzalloc(sizeof(*p), GFP_KERNEL); 122 p = kzalloc(sizeof(*p), GFP_KERNEL);
31 if (!p) 123 if (!p)
32 return NULL; 124 return ERR_PTR(-ENOMEM);
125
126 __lock_sema_sea(sea);
127
128 page_idx = __semaphore_bitmap_alloc(sea->pools_alloced,
129 SEMAPHORE_POOL_COUNT);
130 if (page_idx < 0) {
131 err = page_idx;
132 goto fail;
133 }
33 134
135 p->page = sea->sea_mem.pages[page_idx];
136 p->ro_sg_table = sea->ro_sg_table;
137 p->page_idx = page_idx;
138 p->sema_sea = sea;
139 INIT_LIST_HEAD(&p->hw_semas);
34 kref_init(&p->ref); 140 kref_init(&p->ref);
35 INIT_LIST_HEAD(&p->maps); 141 mutex_init(&p->pool_lock);
36 mutex_init(&p->maps_mutex); 142
37 p->g = g; 143 sea->page_count++;
38 144 list_add(&p->pool_list_entry, &sea->pool_list);
39 /* Alloc one 4k page of semaphore per channel. */ 145 __unlock_sema_sea(sea);
40 if (gk20a_gmmu_alloc(g, roundup(capacity * SEMAPHORE_SIZE, PAGE_SIZE), 146
41 &p->mem))
42 goto clean_up;
43
44 /* Sacrifice one semaphore in the name of returning error codes. */
45 if (gk20a_allocator_init(&p->alloc, unique_name,
46 SEMAPHORE_SIZE, p->mem.size - SEMAPHORE_SIZE,
47 SEMAPHORE_SIZE))
48 goto clean_up;
49
50 gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->mem.cpu_va,
51 (u64)sg_dma_address(p->mem.sgt->sgl),
52 (u64)sg_phys(p->mem.sgt->sgl));
53 return p; 147 return p;
54 148
55clean_up: 149fail:
56 if (p->mem.size) 150 __unlock_sema_sea(sea);
57 gk20a_gmmu_free(p->g, &p->mem);
58 kfree(p); 151 kfree(p);
59 return NULL; 152 return ERR_PTR(err);
153}
154
155/*
156 * Map a pool into the passed vm's address space. This handles both the fixed
157 * global RO mapping and the non-fixed private RW mapping.
158 */
159int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p,
160 struct vm_gk20a *vm)
161{
162 int ents, err = 0;
163 u64 addr;
164
165 p->cpu_va = vmap(&p->page, 1, 0,
166 pgprot_writecombine(PAGE_KERNEL));
167
168 /* First do the RW mapping. */
169 p->rw_sg_table = kzalloc(sizeof(*p->rw_sg_table), GFP_KERNEL);
170 if (!p->rw_sg_table)
171 return -ENOMEM;
172
173 err = sg_alloc_table_from_pages(p->rw_sg_table, &p->page, 1, 0,
174 PAGE_SIZE, GFP_KERNEL);
175 if (err) {
176 err = -ENOMEM;
177 goto fail;
178 }
179
180 /* Add IOMMU mapping... */
181 ents = dma_map_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
182 DMA_BIDIRECTIONAL);
183 if (ents != 1) {
184 err = -ENOMEM;
185 goto fail_free_sgt;
186 }
187
188 /* Map into the GPU... Doesn't need to be fixed. */
189 p->gpu_va = gk20a_gmmu_map(vm, &p->rw_sg_table, PAGE_SIZE,
190 0, gk20a_mem_flag_none, false);
191 if (!p->gpu_va) {
192 err = -ENOMEM;
193 goto fail_unmap_sgt;
194 }
195
196 /*
197 * And now the global mapping. Take the sea lock so that we don't race
198 * with a concurrent remap.
199 */
200 __lock_sema_sea(p->sema_sea);
201
202 BUG_ON(p->mapped);
203 addr = gk20a_gmmu_fixed_map(vm, &p->sema_sea->ro_sg_table,
204 p->sema_sea->gpu_va, p->sema_sea->map_size,
205 0,
206 gk20a_mem_flag_read_only,
207 false);
208 if (!addr) {
209 err = -ENOMEM;
210 BUG();
211 goto fail_unlock;
212 }
213 p->gpu_va_ro = addr;
214 p->mapped = 1;
215
216 __unlock_sema_sea(p->sema_sea);
217
218 return 0;
219
220fail_unlock:
221 __unlock_sema_sea(p->sema_sea);
222fail_unmap_sgt:
223 dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
224 DMA_BIDIRECTIONAL);
225fail_free_sgt:
226 sg_free_table(p->rw_sg_table);
227fail:
228 kfree(p->rw_sg_table);
229 p->rw_sg_table = NULL;
230 return err;
60} 231}
61 232
233/*
234 * Unmap a semaphore_pool.
235 */
236void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p,
237 struct vm_gk20a *vm)
238{
239 struct gk20a_semaphore_int *hw_sema;
240
241 kunmap(p->cpu_va);
242
243 /* First the global RO mapping... */
244 __lock_sema_sea(p->sema_sea);
245 gk20a_gmmu_unmap(vm, p->gpu_va_ro,
246 p->sema_sea->map_size, gk20a_mem_flag_none);
247 p->ro_sg_table = NULL;
248 __unlock_sema_sea(p->sema_sea);
249
250 /* And now the private RW mapping. */
251 gk20a_gmmu_unmap(vm, p->gpu_va, PAGE_SIZE, gk20a_mem_flag_none);
252 p->gpu_va = 0;
253
254 dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
255 DMA_BIDIRECTIONAL);
256
257 sg_free_table(p->rw_sg_table);
258 kfree(p->rw_sg_table);
259 p->rw_sg_table = NULL;
260
261 gk20a_dbg_info("Unmapped sema-pool: idx = %d", p->page_idx);
262 list_for_each_entry(hw_sema, &p->hw_semas, hw_sema_list)
263 /*
264 * Make sure the mem addresses are all NULL so if this gets
265 * reused we will fault.
266 */
267 hw_sema->value = NULL;
268}
269
270/*
271 * Completely free a semaphore_pool. You should make sure this pool is not
272 * mapped, otherwise there's going to be a memory leak.
273 */
62static void gk20a_semaphore_pool_free(struct kref *ref) 274static void gk20a_semaphore_pool_free(struct kref *ref)
63{ 275{
64 struct gk20a_semaphore_pool *p = 276 struct gk20a_semaphore_pool *p =
65 container_of(ref, struct gk20a_semaphore_pool, ref); 277 container_of(ref, struct gk20a_semaphore_pool, ref);
66 mutex_lock(&p->maps_mutex); 278 struct gk20a_semaphore_sea *s = p->sema_sea;
67 WARN_ON(!list_empty(&p->maps)); 279 struct gk20a_semaphore_int *hw_sema, *tmp;
68 mutex_unlock(&p->maps_mutex); 280
69 gk20a_gmmu_free(p->g, &p->mem); 281 WARN_ON(p->gpu_va || p->rw_sg_table || p->ro_sg_table);
70 gk20a_allocator_destroy(&p->alloc); 282
283 __lock_sema_sea(s);
284 list_del(&p->pool_list_entry);
285 clear_bit(p->page_idx, s->pools_alloced);
286 s->page_count--;
287 __unlock_sema_sea(s);
288
289 list_for_each_entry_safe(hw_sema, tmp, &p->hw_semas, hw_sema_list)
290 kfree(hw_sema);
291
71 kfree(p); 292 kfree(p);
72} 293}
73 294
74static void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p) 295void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p)
75{ 296{
76 kref_get(&p->ref); 297 kref_get(&p->ref);
77} 298}
@@ -81,104 +302,96 @@ void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p)
81 kref_put(&p->ref, gk20a_semaphore_pool_free); 302 kref_put(&p->ref, gk20a_semaphore_pool_free);
82} 303}
83 304
84static struct gk20a_semaphore_pool_map * 305/*
85gk20a_semaphore_pool_find_map_locked(struct gk20a_semaphore_pool *p, 306 * Get the address for a semaphore_pool - if global is true then return the
86 struct vm_gk20a *vm) 307 * global RO address instead of the RW address owned by the semaphore's VM.
308 */
309u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global)
87{ 310{
88 struct gk20a_semaphore_pool_map *map, *found = NULL; 311 if (!global)
89 list_for_each_entry(map, &p->maps, list) { 312 return p->gpu_va;
90 if (map->vm == vm) { 313
91 found = map; 314 return p->gpu_va_ro + (PAGE_SIZE * p->page_idx);
92 break;
93 }
94 }
95 return found;
96} 315}
97 316
98int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p, 317static int __gk20a_init_hw_sema(struct channel_gk20a *ch)
99 struct vm_gk20a *vm,
100 enum gk20a_mem_rw_flag rw_flag)
101{ 318{
102 struct gk20a_semaphore_pool_map *map; 319 int hw_sema_idx;
320 int ret = 0;
321 struct gk20a_semaphore_int *hw_sema;
322 struct gk20a_semaphore_pool *p = ch->vm->sema_pool;
103 323
104 map = kzalloc(sizeof(*map), GFP_KERNEL); 324 BUG_ON(!p);
105 if (!map)
106 return -ENOMEM;
107 map->vm = vm;
108 map->rw_flag = rw_flag;
109 map->gpu_va = gk20a_gmmu_map(vm, &p->mem.sgt, p->mem.size,
110 0/*uncached*/, rw_flag,
111 false);
112 if (!map->gpu_va) {
113 kfree(map);
114 return -ENOMEM;
115 }
116 gk20a_vm_get(vm);
117 325
118 mutex_lock(&p->maps_mutex); 326 mutex_lock(&p->pool_lock);
119 WARN_ON(gk20a_semaphore_pool_find_map_locked(p, vm));
120 list_add(&map->list, &p->maps);
121 mutex_unlock(&p->maps_mutex);
122 return 0;
123}
124 327
125void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p, 328 /* Find an available HW semaphore. */
126 struct vm_gk20a *vm) 329 hw_sema_idx = __semaphore_bitmap_alloc(p->semas_alloced,
127{ 330 PAGE_SIZE / SEMAPHORE_SIZE);
128 struct gk20a_semaphore_pool_map *map; 331 if (hw_sema_idx < 0) {
129 WARN_ON(!vm); 332 ret = hw_sema_idx;
130 333 goto fail;
131 mutex_lock(&p->maps_mutex);
132 map = gk20a_semaphore_pool_find_map_locked(p, vm);
133 if (map) {
134 gk20a_gmmu_unmap(vm, map->gpu_va, p->mem.size, map->rw_flag);
135 gk20a_vm_put(vm);
136 list_del(&map->list);
137 kfree(map);
138 } 334 }
139 mutex_unlock(&p->maps_mutex);
140}
141 335
142u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, 336 hw_sema = kzalloc(sizeof(struct gk20a_semaphore_int), GFP_KERNEL);
143 struct vm_gk20a *vm) 337 if (!hw_sema) {
144{ 338 ret = -ENOMEM;
145 struct gk20a_semaphore_pool_map *map; 339 goto fail_free_idx;
146 u64 gpu_va = 0; 340 }
147 341
148 mutex_lock(&p->maps_mutex); 342 ch->hw_sema = hw_sema;
149 map = gk20a_semaphore_pool_find_map_locked(p, vm); 343 hw_sema->ch = ch;
150 if (map) 344 hw_sema->p = p;
151 gpu_va = map->gpu_va; 345 hw_sema->idx = hw_sema_idx;
152 mutex_unlock(&p->maps_mutex); 346 hw_sema->offset = SEMAPHORE_SIZE * hw_sema_idx;
347 atomic_set(&hw_sema->next_value, 0);
348 hw_sema->value = p->cpu_va + hw_sema->offset;
349 writel(0, hw_sema->value);
153 350
154 return gpu_va; 351 list_add(&hw_sema->hw_sema_list, &p->hw_semas);
352
353 mutex_unlock(&p->pool_lock);
354
355 return 0;
356
357fail_free_idx:
358 clear_bit(hw_sema_idx, p->semas_alloced);
359fail:
360 mutex_unlock(&p->pool_lock);
361 return ret;
155} 362}
156 363
157struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool) 364/*
365 * Allocate a semaphore from the passed pool.
366 *
367 * Since semaphores are ref-counted there's no explicit free for external code
368 * to use. When the ref-count hits 0 the internal free will happen.
369 */
370struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch)
158{ 371{
159 struct gk20a_semaphore *s; 372 struct gk20a_semaphore *s;
373 int ret;
374
375 if (!ch->hw_sema) {
376 ret = __gk20a_init_hw_sema(ch);
377 if (ret)
378 return ERR_PTR(ret);
379 }
160 380
161 s = kzalloc(sizeof(*s), GFP_KERNEL); 381 s = kzalloc(sizeof(*s), GFP_KERNEL);
162 if (!s) 382 if (!s)
163 return NULL; 383 return NULL;
164 384
165 s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE); 385 kref_init(&s->ref);
166 if (!s->offset) { 386 s->hw_sema = ch->hw_sema;
167 gk20a_err(dev_from_gk20a(pool->g), 387 atomic_set(&s->value, 0);
168 "failed to allocate semaphore");
169 kfree(s);
170 return NULL;
171 }
172 388
173 gk20a_semaphore_pool_get(pool); 389 /*
174 s->pool = pool; 390 * Take a ref on the pool so that we can keep this pool alive for
391 * as long as this semaphore is alive.
392 */
393 gk20a_semaphore_pool_get(s->hw_sema->p);
175 394
176 kref_init(&s->ref);
177 /* Initially acquired. */
178 gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 0);
179 gk20a_dbg_info("created semaphore offset=%d, value=%d",
180 s->offset,
181 gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset));
182 return s; 395 return s;
183} 396}
184 397
@@ -187,8 +400,8 @@ static void gk20a_semaphore_free(struct kref *ref)
187 struct gk20a_semaphore *s = 400 struct gk20a_semaphore *s =
188 container_of(ref, struct gk20a_semaphore, ref); 401 container_of(ref, struct gk20a_semaphore, ref);
189 402
190 gk20a_bfree(&s->pool->alloc, s->offset); 403 gk20a_semaphore_pool_put(s->hw_sema->p);
191 gk20a_semaphore_pool_put(s->pool); 404
192 kfree(s); 405 kfree(s);
193} 406}
194 407
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
index 1f12e262..58081b56 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
@@ -15,17 +15,128 @@
15#define SEMAPHORE_GK20A_H 15#define SEMAPHORE_GK20A_H
16 16
17#include <linux/kref.h> 17#include <linux/kref.h>
18#include "gk20a_allocator.h" 18#include <linux/list.h>
19#include <linux/delay.h>
20
21#include "gk20a.h"
19#include "mm_gk20a.h" 22#include "mm_gk20a.h"
23#include "channel_gk20a.h"
24
25/*
26 * Max number of channels that can be used is 512. This of course needs to be
27 * fixed to be dynamic but still fast.
28 */
29#define SEMAPHORE_POOL_COUNT 512
30#define SEMAPHORE_SIZE 16
31#define SEMAPHORE_SEA_GROWTH_RATE 32
32
33struct gk20a_semaphore_sea;
34
35/*
36 * Underlying semaphore data structure. This semaphore can be shared amongst
37 * other semaphore instances.
38 */
39struct gk20a_semaphore_int {
40 int idx; /* Semaphore index. */
41 u32 offset; /* Offset into the pool. */
42 atomic_t next_value; /* Next available value. */
43 u32 *value; /* Current value (access w/ readl()). */
44 u32 nr_incrs; /* Number of increments programmed. */
45 struct gk20a_semaphore_pool *p; /* Pool that owns this sema. */
46 struct channel_gk20a *ch; /* Channel that owns this sema. */
47 struct list_head hw_sema_list; /* List of HW semaphores. */
48};
49
50/*
51 * A semaphore which the rest of the driver actually uses. This consists of a
52 * pointer to a real semaphore and a value to wait for. This allows one physical
53 * semaphore to be shared among an essentially infinite number of submits.
54 */
55struct gk20a_semaphore {
56 struct gk20a_semaphore_int *hw_sema;
20 57
21/* A memory pool for holding semaphores. */ 58 atomic_t value;
59 int incremented;
60
61 struct kref ref;
62};
63
64/*
65 * A semaphore pool. Each address space will own exactly one of these.
66 */
22struct gk20a_semaphore_pool { 67struct gk20a_semaphore_pool {
23 struct mem_desc mem; 68 struct page *page; /* This pool's page of memory */
24 struct gk20a *g; 69 struct list_head pool_list_entry; /* Node for list of pools. */
25 struct list_head maps; 70 void *cpu_va; /* CPU access to the pool. */
26 struct mutex maps_mutex; 71 u64 gpu_va; /* GPU access to the pool. */
72 u64 gpu_va_ro; /* GPU access to the pool. */
73 int page_idx; /* Index into sea bitmap. */
74
75 struct list_head hw_semas; /* List of HW semas. */
76 DECLARE_BITMAP(semas_alloced, PAGE_SIZE / SEMAPHORE_SIZE);
77
78 struct gk20a_semaphore_sea *sema_sea; /* Sea that owns this pool. */
79
80 struct mutex pool_lock;
81
82 /*
83 * This is the address spaces's personal RW table. Other channels will
84 * ultimately map this page as RO.
85 */
86 struct sg_table *rw_sg_table;
87
88 /*
89 * This is to keep track of whether the pool has had its sg_table
90 * updated during sea resizing.
91 */
92 struct sg_table *ro_sg_table;
93
94 int mapped;
95
96 /*
97 * Sometimes a channel can be released before other channels are
98 * done waiting on it. This ref count ensures that the pool doesn't
99 * go away until all semaphores using this pool are cleaned up first.
100 */
27 struct kref ref; 101 struct kref ref;
28 struct gk20a_allocator alloc; 102};
103
104/*
105 * A sea of semaphore pools. Each pool is owned by a single VM. Since multiple
106 * channels can share a VM, each channel gets its own HW semaphore from the
107 * pool. Channels then allocate regular semaphores - basically just a value that
108 * signifies when a particular job is done.
109 */
110struct gk20a_semaphore_sea {
111 struct list_head pool_list; /* List of pools in this sea. */
112 struct gk20a *gk20a;
113
114 size_t size; /* Number of pages available. */
115 u64 gpu_va; /* GPU virtual address of sema sea. */
116 u64 map_size; /* Size of the mapping. */
117
118 /*
119 * TODO:
120 * List of pages that we use to back the pools. The number of pages
121 * can grow dynamically since allocating 512 pages for all channels at
122 * once would be a tremendous waste.
123 */
124 int page_count; /* Pages allocated to pools. */
125
126 struct sg_table *ro_sg_table;
127 /*
128 struct page *pages[SEMAPHORE_POOL_COUNT];
129 */
130
131 struct mem_desc sea_mem;
132
133 /*
134 * Can't use a regular allocator here since the full range of pools are
135 * not always allocated. Instead just use a bitmap.
136 */
137 DECLARE_BITMAP(pools_alloced, SEMAPHORE_POOL_COUNT);
138
139 struct mutex sea_lock; /* Lock alloc/free calls. */
29}; 140};
30 141
31enum gk20a_mem_rw_flag { 142enum gk20a_mem_rw_flag {
@@ -34,64 +145,150 @@ enum gk20a_mem_rw_flag {
34 gk20a_mem_flag_write_only = 2, 145 gk20a_mem_flag_write_only = 2,
35}; 146};
36 147
37/* A semaphore pool can be mapped to multiple GPU address spaces. */ 148/*
38struct gk20a_semaphore_pool_map { 149 * Semaphore sea functions.
39 u64 gpu_va; 150 */
40 enum gk20a_mem_rw_flag rw_flag; 151struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *gk20a);
41 struct vm_gk20a *vm; 152int gk20a_semaphore_sea_map(struct gk20a_semaphore_pool *sea,
42 struct list_head list; 153 struct vm_gk20a *vm);
43}; 154void gk20a_semaphore_sea_unmap(struct gk20a_semaphore_pool *sea,
155 struct vm_gk20a *vm);
156struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g);
157
158/*
159 * Semaphore pool functions.
160 */
161struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
162 struct gk20a_semaphore_sea *sea);
163int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *pool,
164 struct vm_gk20a *vm);
165void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *pool,
166 struct vm_gk20a *vm);
167u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global);
168void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p);
169void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p);
170
171/*
172 * Semaphore functions.
173 */
174struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch);
175void gk20a_semaphore_put(struct gk20a_semaphore *s);
176void gk20a_semaphore_get(struct gk20a_semaphore *s);
177
178/*
179 * Return the address of a specific semaphore.
180 *
181 * Don't call this on a semaphore you don't own - the VA returned will make no
182 * sense in your specific channel's VM.
183 */
184static inline u64 gk20a_semaphore_gpu_rw_va(struct gk20a_semaphore *s)
185{
186 return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, false) +
187 s->hw_sema->offset;
188}
189
190/*
191 * Get the global RO address for the semaphore. Can be called on any semaphore
192 * regardless of whether you own it.
193 */
194static inline u64 gk20a_semaphore_gpu_ro_va(struct gk20a_semaphore *s)
195{
196 return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, true) +
197 s->hw_sema->offset;
198}
199
200static inline u64 gk20a_hw_sema_addr(struct gk20a_semaphore_int *hw_sema)
201{
202 return __gk20a_semaphore_pool_gpu_va(hw_sema->p, true) +
203 hw_sema->offset;
204}
205
206/*
207 * TODO: handle wrap around... Hmm, how to do this?
208 */
209static inline bool gk20a_semaphore_is_released(struct gk20a_semaphore *s)
210{
211 u32 sema_val = readl(s->hw_sema->value);
44 212
45/* A semaphore that lives inside a semaphore pool. */
46struct gk20a_semaphore {
47 struct gk20a_semaphore_pool *pool;
48 /* 213 /*
49 * value exists within the pool's memory at the specified offset. 214 * If the underlying semaphore value is greater than or equal to
50 * 0=acquired, 1=released. 215 * the value of the semaphore then the semaphore has been signaled
216 * (a.k.a. released).
51 */ 217 */
52 u32 offset; /* byte offset within pool */ 218 return sema_val >= atomic_read(&s->value);
53 struct kref ref; 219}
54};
55 220
56/* Create a semaphore pool that can hold at most 'capacity' semaphores. */ 221static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s)
57struct gk20a_semaphore_pool *
58gk20a_semaphore_pool_alloc(struct gk20a *, const char *unique_name,
59 size_t capacity);
60void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *);
61int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *,
62 struct vm_gk20a *,
63 enum gk20a_mem_rw_flag);
64void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *,
65 struct vm_gk20a *);
66u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *,
67 struct vm_gk20a *);
68
69/* Allocate a semaphore from the semaphore pool. The newly allocated
70 * semaphore will be in acquired state (value=0). */
71struct gk20a_semaphore *
72gk20a_semaphore_alloc(struct gk20a_semaphore_pool *);
73void gk20a_semaphore_put(struct gk20a_semaphore *);
74void gk20a_semaphore_get(struct gk20a_semaphore *);
75
76static inline u64 gk20a_semaphore_gpu_va(struct gk20a_semaphore *s,
77 struct vm_gk20a *vm)
78{ 222{
79 return gk20a_semaphore_pool_gpu_va(s->pool, vm) + s->offset; 223 return !gk20a_semaphore_is_released(s);
80} 224}
81 225
82static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s) 226/*
227 * Read the underlying value from a semaphore.
228 */
229static inline u32 gk20a_semaphore_read(struct gk20a_semaphore *s)
83{ 230{
84 u32 v = gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset); 231 return readl(s->hw_sema->value);
232}
85 233
86 /* When often block on value reaching a certain threshold. We must make 234static inline u32 gk20a_semaphore_get_value(struct gk20a_semaphore *s)
87 * sure that if we get unblocked, we haven't read anything too early. */ 235{
88 smp_rmb(); 236 return atomic_read(&s->value);
89 return v == 0;
90} 237}
91 238
239static inline u32 gk20a_semaphore_next_value(struct gk20a_semaphore *s)
240{
241 return atomic_read(&s->hw_sema->next_value);
242}
243
244/*
245 * Note - if you call this then any prior semaphores will also be released.
246 */
92static inline void gk20a_semaphore_release(struct gk20a_semaphore *s) 247static inline void gk20a_semaphore_release(struct gk20a_semaphore *s)
93{ 248{
94 smp_wmb(); 249 u32 current_val;
95 gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 1); 250 u32 val = gk20a_semaphore_get_value(s);
251 int attempts = 0;
252
253 /*
254 * Wait until the sema value is 1 less than the write value. That
255 * way this function is essentially an increment.
256 *
257 * TODO: tune the wait a little better.
258 */
259 while ((current_val = gk20a_semaphore_read(s)) < (val - 1)) {
260 msleep(100);
261 attempts += 1;
262 if (attempts > 100) {
263 WARN(1, "Stall on sema release!");
264 return;
265 }
266 }
267
268 /*
269 * If the semaphore has already passed the value we would write then
270 * this is really just a NO-OP.
271 */
272 if (current_val >= val)
273 return;
274
275 writel(val, s->hw_sema->value);
276}
277
278/*
279 * Configure a software based increment on this semaphore. This is useful for
280 * when we want the GPU to wait on a SW event before processing a channel.
281 * Another way to describe this is when the GPU needs to wait on a SW pre-fence.
282 * The pre-fence signals SW which in turn calls gk20a_semaphore_release() which
283 * then allows the GPU to continue.
284 *
285 * Also used to prep a semaphore for an INCR by the GPU.
286 */
287static inline void gk20a_semaphore_incr(struct gk20a_semaphore *s)
288{
289 BUG_ON(s->incremented);
290
291 atomic_set(&s->value, atomic_add_return(1, &s->hw_sema->next_value));
292 s->incremented = 1;
96} 293}
97#endif 294#endif