-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c         3
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.h         2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c  233
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fence_gk20a.c           4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h                 5
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c             82
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h              5
-rw-r--r--  drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c     435
-rw-r--r--  drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h     303
9 files changed, 847 insertions, 225 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 3f9b0432..6c7ff551 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -1002,6 +1002,9 @@ unbind:
1002 1002
1003 mutex_unlock(&g->dbg_sessions_lock); 1003 mutex_unlock(&g->dbg_sessions_lock);
1004 1004
1005 /* Make sure that when the ch is re-opened it will get a new HW sema. */
1006 ch->hw_sema = NULL;
1007
1005 /* make sure we catch accesses of unopened channels in case 1008 /* make sure we catch accesses of unopened channels in case
1006 * there's non-refcounted channel pointers hanging around */ 1009 * there's non-refcounted channel pointers hanging around */
1007 ch->g = NULL; 1010 ch->g = NULL;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index acd272b4..c5a1bd24 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -108,6 +108,8 @@ struct channel_gk20a {
108 atomic_t ref_count; 108 atomic_t ref_count;
109 wait_queue_head_t ref_count_dec_wq; 109 wait_queue_head_t ref_count_dec_wq;
110 110
111 struct gk20a_semaphore_int *hw_sema;
112
111 int hw_chid; 113 int hw_chid;
112 bool wdt_enabled; 114 bool wdt_enabled;
113 bool bound; 115 bool bound;
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index d2d8c094..9c8911e9 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -424,28 +424,52 @@ static void gk20a_channel_semaphore_launcher(
424} 424}
425#endif 425#endif
426 426
427static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd, 427static void add_sema_cmd(struct gk20a *g, struct channel_gk20a *c,
428 u64 sema, u32 payload, bool acquire, bool wfi) 428 struct gk20a_semaphore *s, struct priv_cmd_entry *cmd,
429 int cmd_size, bool acquire, bool wfi)
429{ 430{
430 u32 off = cmd->off; 431 u32 off = cmd->off;
432 u64 va;
433
434 /*
435 * RO for acquire (since we just need to read the mem) and RW for
436 * release since we will need to write back to the semaphore memory.
437 */
438 va = acquire ? gk20a_semaphore_gpu_ro_va(s) :
439 gk20a_semaphore_gpu_rw_va(s);
440
441 /*
442 * If the op is not an acquire (so therefore a release) we should
443 * incr the underlying sema next_value.
444 */
445 if (!acquire)
446 gk20a_semaphore_incr(s);
447
431 /* semaphore_a */ 448 /* semaphore_a */
432 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004); 449 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010004);
433 /* offset_upper */ 450 /* offset_upper */
434 gk20a_mem_wr32(g, cmd->mem, off++, (sema >> 32) & 0xff); 451 gk20a_mem_wr32(g, cmd->mem, off++, (va >> 32) & 0xff);
435 /* semaphore_b */ 452 /* semaphore_b */
436 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005); 453 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010005);
437 /* offset */ 454 /* offset */
438 gk20a_mem_wr32(g, cmd->mem, off++, sema & 0xffffffff); 455 gk20a_mem_wr32(g, cmd->mem, off++, va & 0xffffffff);
439 /* semaphore_c */ 456
440 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
441 /* payload */
442 gk20a_mem_wr32(g, cmd->mem, off++, payload);
443 if (acquire) { 457 if (acquire) {
458 /* semaphore_c */
459 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
460 /* payload */
461 gk20a_mem_wr32(g, cmd->mem, off++,
462 gk20a_semaphore_get_value(s));
444 /* semaphore_d */ 463 /* semaphore_d */
445 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007); 464 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
446 /* operation: acq_geq, switch_en */ 465 /* operation: acq_geq, switch_en */
447 gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12)); 466 gk20a_mem_wr32(g, cmd->mem, off++, 0x4 | (0x1 << 12));
448 } else { 467 } else {
468 /* semaphore_c */
469 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010006);
470 /* payload */
471 gk20a_mem_wr32(g, cmd->mem, off++,
472 gk20a_semaphore_get_value(s));
449 /* semaphore_d */ 473 /* semaphore_d */
450 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007); 474 gk20a_mem_wr32(g, cmd->mem, off++, 0x20010007);
451 /* operation: release, wfi */ 475 /* operation: release, wfi */
@@ -456,7 +480,6 @@ static int add_sema_cmd(struct gk20a *g, struct priv_cmd_entry *cmd,
456 /* ignored */ 480 /* ignored */
457 gk20a_mem_wr32(g, cmd->mem, off++, 0); 481 gk20a_mem_wr32(g, cmd->mem, off++, 0);
458 } 482 }
459 return off - cmd->off;
460} 483}
461 484
462static int gk20a_channel_semaphore_wait_syncpt( 485static int gk20a_channel_semaphore_wait_syncpt(
@@ -471,6 +494,76 @@ static int gk20a_channel_semaphore_wait_syncpt(
471 return -ENODEV; 494 return -ENODEV;
472} 495}
473 496
497/*
498 * UGHHH - the underlying sync_fence implementation changes from 3.10 to 3.18.
499 * But since there's no API for getting the underlying sync_pts we have to do
500 * some conditional compilation.
501 */
502#ifdef CONFIG_SYNC
503static struct gk20a_semaphore *sema_from_sync_fence(struct sync_fence *f)
504{
505#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
506 struct sync_pt *pt;
507
508 pt = list_first_entry(&f->pt_list_head, struct sync_pt, pt_list);
509 return gk20a_sync_pt_inst_get_sema(pt);
510#else
511 return gk20a_sync_pt_inst_get_sema(f->cbs[0].sync_pt);
512#endif
513}
514
515/*
516 * Attempt a fast path for waiting on a sync_fence. Basically if the passed
517 * sync_fence is backed by a gk20a_semaphore then there's no reason to go
518 * through the rigmarole of setting up a separate semaphore which waits on an
519 * interrupt from the GPU and then triggers a worker thread to execute a SW
520 * based semaphore release. Instead just have the GPU wait on the same semaphore
521 * that is going to be incremented by the GPU.
522 *
523 * This function returns one of two values: 0 on success, or -ENODEV if the
524 * fast path cannot be taken because the fence is not backed by a GPU
525 * semaphore.
526 */
527static int __semaphore_wait_fd_fast_path(struct channel_gk20a *c,
528 struct sync_fence *fence,
529 struct priv_cmd_entry **wait_cmd,
530 struct gk20a_semaphore **fp_sema)
531{
532 struct gk20a_semaphore *sema;
533 int err;
534
535 if (!gk20a_is_sema_backed_sync_fence(fence))
536 return -ENODEV;
537
538 sema = sema_from_sync_fence(fence);
539
540 /*
541 * If there's no underlying sema then that means the fence has already
542 * signaled.
543 */
544 if (!sema) {
545 *fp_sema = NULL;
546 return 0;
547 }
548
549 err = gk20a_channel_alloc_priv_cmdbuf(c, 8, wait_cmd);
550 if (err)
551 return err;
552
553 gk20a_semaphore_get(sema);
554 BUG_ON(!atomic_read(&sema->value));
555 add_sema_cmd(c->g, c, sema, *wait_cmd, 8, true, false);
556
557 /*
558 * Make sure that gk20a_channel_semaphore_wait_fd() can create another
559 * fence with the underlying semaphore.
560 */
561 *fp_sema = sema;
562
563 return 0;
564}
565#endif
566
474static int gk20a_channel_semaphore_wait_fd( 567static int gk20a_channel_semaphore_wait_fd(
475 struct gk20a_channel_sync *s, int fd, 568 struct gk20a_channel_sync *s, int fd,
476 struct priv_cmd_entry **entry, 569 struct priv_cmd_entry **entry,
@@ -480,69 +573,107 @@ static int gk20a_channel_semaphore_wait_fd(
480 container_of(s, struct gk20a_channel_semaphore, ops); 573 container_of(s, struct gk20a_channel_semaphore, ops);
481 struct channel_gk20a *c = sema->c; 574 struct channel_gk20a *c = sema->c;
482#ifdef CONFIG_SYNC 575#ifdef CONFIG_SYNC
576 struct gk20a_semaphore *fp_sema;
483 struct sync_fence *sync_fence; 577 struct sync_fence *sync_fence;
484 struct priv_cmd_entry *wait_cmd = NULL; 578 struct priv_cmd_entry *wait_cmd = NULL;
485 struct wait_fence_work *w; 579 struct wait_fence_work *w = NULL;
486 int written; 580 int err, ret, status;
487 int err, ret;
488 u64 va;
489 581
490 sync_fence = gk20a_sync_fence_fdget(fd); 582 sync_fence = gk20a_sync_fence_fdget(fd);
491 if (!sync_fence) 583 if (!sync_fence)
492 return -EINVAL; 584 return -EINVAL;
493 585
586 ret = __semaphore_wait_fd_fast_path(c, sync_fence, &wait_cmd, &fp_sema);
587 if (ret == 0) {
588 if (fp_sema)
589 *fence = gk20a_fence_from_semaphore(sema->timeline,
590 fp_sema,
591 &c->semaphore_wq,
592 NULL, false);
593 else
594 /*
595 * Allocate an empty fence. It will instantly return
596 * from gk20a_fence_wait().
597 */
598 *fence = gk20a_alloc_fence(NULL, NULL, false);
599
600 sync_fence_put(sync_fence);
601 goto skip_slow_path;
602 }
603
604 /* If the fence has signaled there is no reason to wait on it. */
605#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
606 status = sync_fence->status;
607#else
608 status = atomic_read(&sync_fence->status);
609#endif
610 if (status) {
611 sync_fence_put(sync_fence);
612 goto skip_slow_path;
613 }
614
615 err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd);
616 if (err) {
617 gk20a_err(dev_from_gk20a(c->g),
618 "not enough priv cmd buffer space");
619 sync_fence_put(sync_fence);
620 return -ENOMEM;
621 }
622
494 w = kzalloc(sizeof(*w), GFP_KERNEL); 623 w = kzalloc(sizeof(*w), GFP_KERNEL);
495 if (!w) { 624 if (!w) {
496 err = -ENOMEM; 625 err = -ENOMEM;
497 goto fail; 626 goto fail_free_cmdbuf;
498 } 627 }
628
499 sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher); 629 sync_fence_waiter_init(&w->waiter, gk20a_channel_semaphore_launcher);
500 w->ch = c; 630 w->ch = c;
501 w->sema = gk20a_semaphore_alloc(sema->pool); 631 w->sema = gk20a_semaphore_alloc(c);
502 if (!w->sema) { 632 if (!w->sema) {
503 gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores"); 633 gk20a_err(dev_from_gk20a(c->g), "ran out of semaphores");
504 err = -ENOMEM; 634 err = -ENOMEM;
505 goto fail; 635 goto fail_free_worker;
506 } 636 }
507 637
508 /* worker takes one reference */ 638 /* worker takes one reference */
509 gk20a_semaphore_get(w->sema); 639 gk20a_semaphore_get(w->sema);
640 gk20a_semaphore_incr(w->sema);
510 641
511 err = gk20a_channel_alloc_priv_cmdbuf(c, 8, &wait_cmd); 642 /* GPU unblocked when the semaphore value increments. */
512 if (err) { 643 add_sema_cmd(c->g, c, w->sema, wait_cmd, 8, true, false);
513 gk20a_err(dev_from_gk20a(c->g),
514 "not enough priv cmd buffer space");
515 goto fail;
516 }
517
518 va = gk20a_semaphore_gpu_va(w->sema, c->vm);
519 /* GPU unblocked when when the semaphore value becomes 1. */
520 written = add_sema_cmd(c->g, wait_cmd, va, 1, true, false);
521 644
522 WARN_ON(written != wait_cmd->size);
523 ret = sync_fence_wait_async(sync_fence, &w->waiter); 645 ret = sync_fence_wait_async(sync_fence, &w->waiter);
524 646
525 /* 647 /*
526 * If the sync_fence has already signaled then the above async_wait 648 * If the sync_fence has already signaled then the above async_wait
527 * will never trigger. This causes the semaphore release op to never 649 * will never trigger. This causes the semaphore release op to never
528 * happen which, in turn, hangs the GPU. That's bad. So let's just 650 * happen which, in turn, hangs the GPU. That's bad. So let's just
529 * do the semaphore_release right now. 651 * do the gk20a_semaphore_release() right now.
530 */ 652 */
531 if (ret == 1) 653 if (ret == 1) {
654 sync_fence_put(sync_fence);
532 gk20a_semaphore_release(w->sema); 655 gk20a_semaphore_release(w->sema);
656 gk20a_semaphore_put(w->sema);
657 }
533 658
534 /* XXX - this fixes an actual bug, we need to hold a ref to this 659 /* XXX - this fixes an actual bug, we need to hold a ref to this
535 semaphore while the job is in flight. */ 660 semaphore while the job is in flight. */
536 *fence = gk20a_fence_from_semaphore(sema->timeline, w->sema, 661 *fence = gk20a_fence_from_semaphore(sema->timeline, w->sema,
537 &c->semaphore_wq, 662 &c->semaphore_wq,
538 NULL, false); 663 NULL, false);
664
665skip_slow_path:
539 *entry = wait_cmd; 666 *entry = wait_cmd;
540 return 0; 667 return 0;
541fail: 668
669fail_free_worker:
542 if (w && w->sema) 670 if (w && w->sema)
543 gk20a_semaphore_put(w->sema); 671 gk20a_semaphore_put(w->sema);
544 kfree(w); 672 kfree(w);
545 sync_fence_put(sync_fence); 673 sync_fence_put(sync_fence);
674fail_free_cmdbuf:
675 if (wait_cmd)
676 gk20a_free_priv_cmdbuf(c, wait_cmd);
546 return err; 677 return err;
547#else 678#else
548 gk20a_err(dev_from_gk20a(c->g), 679 gk20a_err(dev_from_gk20a(c->g),
@@ -558,9 +689,7 @@ static int __gk20a_channel_semaphore_incr(
558 struct gk20a_fence **fence, 689 struct gk20a_fence **fence,
559 bool need_sync_fence) 690 bool need_sync_fence)
560{ 691{
561 u64 va;
562 int incr_cmd_size; 692 int incr_cmd_size;
563 int written;
564 struct priv_cmd_entry *incr_cmd = NULL; 693 struct priv_cmd_entry *incr_cmd = NULL;
565 struct gk20a_channel_semaphore *sp = 694 struct gk20a_channel_semaphore *sp =
566 container_of(s, struct gk20a_channel_semaphore, ops); 695 container_of(s, struct gk20a_channel_semaphore, ops);
@@ -568,7 +697,7 @@ static int __gk20a_channel_semaphore_incr(
568 struct gk20a_semaphore *semaphore; 697 struct gk20a_semaphore *semaphore;
569 int err = 0; 698 int err = 0;
570 699
571 semaphore = gk20a_semaphore_alloc(sp->pool); 700 semaphore = gk20a_semaphore_alloc(c);
572 if (!semaphore) { 701 if (!semaphore) {
573 gk20a_err(dev_from_gk20a(c->g), 702 gk20a_err(dev_from_gk20a(c->g),
574 "ran out of semaphores"); 703 "ran out of semaphores");
@@ -585,9 +714,7 @@ static int __gk20a_channel_semaphore_incr(
585 } 714 }
586 715
587 /* Release the completion semaphore. */ 716 /* Release the completion semaphore. */
588 va = gk20a_semaphore_gpu_va(semaphore, c->vm); 717 add_sema_cmd(c->g, c, semaphore, incr_cmd, 14, false, wfi_cmd);
589 written = add_sema_cmd(c->g, incr_cmd, va, 1, false, wfi_cmd);
590 WARN_ON(written != incr_cmd_size);
591 718
592 *fence = gk20a_fence_from_semaphore(sp->timeline, semaphore, 719 *fence = gk20a_fence_from_semaphore(sp->timeline, semaphore,
593 &c->semaphore_wq, 720 &c->semaphore_wq,
@@ -615,8 +742,10 @@ static int gk20a_channel_semaphore_incr(
615{ 742{
616 /* Don't put wfi cmd to this one since we're not returning 743 /* Don't put wfi cmd to this one since we're not returning
617 * a fence to user space. */ 744 * a fence to user space. */
618 return __gk20a_channel_semaphore_incr(s, false /* no wfi */, 745 return __gk20a_channel_semaphore_incr(s,
619 NULL, entry, fence, need_sync_fence); 746 false /* no wfi */,
747 NULL,
748 entry, fence, need_sync_fence);
620} 749}
621 750
622static int gk20a_channel_semaphore_incr_user( 751static int gk20a_channel_semaphore_incr_user(
@@ -679,17 +808,16 @@ static void gk20a_channel_semaphore_destroy(struct gk20a_channel_sync *s)
679 container_of(s, struct gk20a_channel_semaphore, ops); 808 container_of(s, struct gk20a_channel_semaphore, ops);
680 if (sema->timeline) 809 if (sema->timeline)
681 gk20a_sync_timeline_destroy(sema->timeline); 810 gk20a_sync_timeline_destroy(sema->timeline);
682 if (sema->pool) { 811
683 gk20a_semaphore_pool_unmap(sema->pool, sema->c->vm); 812 /* The sema pool is cleaned up by the VM destroy. */
684 gk20a_semaphore_pool_put(sema->pool); 813 sema->pool = NULL;
685 } 814
686 kfree(sema); 815 kfree(sema);
687} 816}
688 817
689static struct gk20a_channel_sync * 818static struct gk20a_channel_sync *
690gk20a_channel_semaphore_create(struct channel_gk20a *c) 819gk20a_channel_semaphore_create(struct channel_gk20a *c)
691{ 820{
692 int err;
693 int asid = -1; 821 int asid = -1;
694 struct gk20a_channel_semaphore *sema; 822 struct gk20a_channel_semaphore *sema;
695 char pool_name[20]; 823 char pool_name[20];
@@ -706,21 +834,15 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
706 asid = c->vm->as_share->id; 834 asid = c->vm->as_share->id;
707 835
708 sprintf(pool_name, "semaphore_pool-%d", c->hw_chid); 836 sprintf(pool_name, "semaphore_pool-%d", c->hw_chid);
709 sema->pool = gk20a_semaphore_pool_alloc(c->g, pool_name, 1024); 837 sema->pool = c->vm->sema_pool;
710 if (!sema->pool)
711 goto clean_up;
712
713 /* Map the semaphore pool to the channel vm. Map as read-write to the
714 * owner channel (all other channels should map as read only!). */
715 err = gk20a_semaphore_pool_map(sema->pool, c->vm, gk20a_mem_flag_none);
716 if (err)
717 goto clean_up;
718 838
719#ifdef CONFIG_SYNC 839#ifdef CONFIG_SYNC
720 sema->timeline = gk20a_sync_timeline_create( 840 sema->timeline = gk20a_sync_timeline_create(
721 "gk20a_ch%d_as%d", c->hw_chid, asid); 841 "gk20a_ch%d_as%d", c->hw_chid, asid);
722 if (!sema->timeline) 842 if (!sema->timeline) {
723 goto clean_up; 843 gk20a_channel_semaphore_destroy(&sema->ops);
844 return NULL;
845 }
724#endif 846#endif
725 atomic_set(&sema->ops.refcount, 0); 847 atomic_set(&sema->ops.refcount, 0);
726 sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt; 848 sema->ops.wait_syncpt = gk20a_channel_semaphore_wait_syncpt;
@@ -734,9 +856,6 @@ gk20a_channel_semaphore_create(struct channel_gk20a *c)
734 sema->ops.destroy = gk20a_channel_semaphore_destroy; 856 sema->ops.destroy = gk20a_channel_semaphore_destroy;
735 857
736 return &sema->ops; 858 return &sema->ops;
737clean_up:
738 gk20a_channel_semaphore_destroy(&sema->ops);
739 return NULL;
740} 859}
741 860
742void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync) 861void gk20a_channel_sync_destroy(struct gk20a_channel_sync *sync)
diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
index 23522882..fbbaa2a7 100644
--- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c
@@ -155,8 +155,8 @@ struct gk20a_fence *gk20a_fence_from_semaphore(
155 155
156#ifdef CONFIG_SYNC 156#ifdef CONFIG_SYNC
157 sync_fence = gk20a_sync_fence_create(timeline, semaphore, 157 sync_fence = gk20a_sync_fence_create(timeline, semaphore,
158 dependency, "f-gk20a-0x%04x", 158 dependency, "f-gk20a-0x%04x",
159 semaphore->offset & 0xffff); 159 gk20a_semaphore_gpu_ro_va(semaphore));
160 if (!sync_fence) 160 if (!sync_fence)
161 return NULL; 161 return NULL;
162#endif 162#endif
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 5ab09ac3..7bd9775e 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -738,6 +738,11 @@ struct gk20a {
738#endif 738#endif
739 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info; 739 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info;
740 740
741 /*
742 * A group of semaphore pools. One for each channel.
743 */
744 struct gk20a_semaphore_sea *sema_sea;
745
741 /* held while manipulating # of debug/profiler sessions present */ 746 /* held while manipulating # of debug/profiler sessions present */
742 /* also prevents debug sessions from attaching until released */ 747 /* also prevents debug sessions from attaching until released */
743 struct mutex dbg_sessions_lock; 748 struct mutex dbg_sessions_lock;
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 3b21e843..9299266f 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -3213,6 +3213,17 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
3213 struct rb_node *node; 3213 struct rb_node *node;
3214 3214
3215 gk20a_dbg_fn(""); 3215 gk20a_dbg_fn("");
3216
3217 /*
3218 * Do this outside of the update_gmmu_lock since unmapping the semaphore
3219 * pool involves unmapping a GMMU mapping which means acquiring the
3220 * update_gmmu_lock.
3221 */
3222 if (!gk20a_platform_has_syncpoints(gk20a_from_vm(vm)->dev)) {
3223 gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
3224 gk20a_semaphore_pool_put(vm->sema_pool);
3225 }
3226
3216 mutex_lock(&vm->update_gmmu_lock); 3227 mutex_lock(&vm->update_gmmu_lock);
3217 3228
3218 /* TBD: add a flag here for the unmap code to recognize teardown 3229 /* TBD: add a flag here for the unmap code to recognize teardown
@@ -3286,6 +3297,64 @@ const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
3286 {.update_entry = NULL} 3297 {.update_entry = NULL}
3287}; 3298};
3288 3299
3300/*
3301 * Initialize a semaphore pool. Just return successfully if we do not need
3302 * semaphores (i.e. when sync-pts are active).
3303 */
3304int gk20a_init_sema_pool(struct vm_gk20a *vm)
3305{
3306 struct gk20a_semaphore_sea *sema_sea;
3307 struct mm_gk20a *mm = vm->mm;
3308 struct gk20a *g = mm->g;
3309 int err;
3310
3311 /*
3312 * Don't waste the memory on semaphores if we don't need them.
3313 */
3314 if (gk20a_platform_has_syncpoints(g->dev))
3315 return 0;
3316
3317 if (vm->sema_pool)
3318 return 0;
3319
3320 sema_sea = gk20a_semaphore_sea_create(g);
3321 if (!sema_sea)
3322 return -ENOMEM;
3323
3324 vm->sema_pool = gk20a_semaphore_pool_alloc(sema_sea);
3325 if (!vm->sema_pool) {
3326 gk20a_vm_put(vm);
3327 return -ENOMEM;
3328 }
3329
3330 /*
3331 * Allocate a chunk of GPU VA space for mapping the semaphores. We will
3332 * do a fixed alloc in the kernel VM so that all channels have the same
3333 * RO address range for the semaphores.
3334 *
3335 * !!! TODO: cleanup.
3336 */
3337 sema_sea->gpu_va = gk20a_balloc_fixed(&vm->vma[gmmu_page_size_kernel],
3338 vm->va_limit -
3339 mm->channel.kernel_size,
3340 512 * PAGE_SIZE);
3341 if (!sema_sea->gpu_va) {
3342 gk20a_bfree(&vm->vma[gmmu_page_size_small], sema_sea->gpu_va);
3343 gk20a_vm_put(vm);
3344 return -ENOMEM;
3345 }
3346
3347 err = gk20a_semaphore_pool_map(vm->sema_pool, vm);
3348 if (err) {
3349 gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
3350 gk20a_bfree(&vm->vma[gmmu_page_size_small],
3351 vm->sema_pool->gpu_va);
3352 gk20a_vm_put(vm);
3353 }
3354
3355 return 0;
3356}
3357
3289int gk20a_init_vm(struct mm_gk20a *mm, 3358int gk20a_init_vm(struct mm_gk20a *mm,
3290 struct vm_gk20a *vm, 3359 struct vm_gk20a *vm,
3291 u32 big_page_size, 3360 u32 big_page_size,
@@ -3317,9 +3386,7 @@ int gk20a_init_vm(struct mm_gk20a *mm,
3317 vm->big_pages = big_pages; 3386 vm->big_pages = big_pages;
3318 3387
3319 vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big]; 3388 vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
3320
3321 vm->userspace_managed = userspace_managed; 3389 vm->userspace_managed = userspace_managed;
3322
3323 vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g, 3390 vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g,
3324 vm->big_page_size); 3391 vm->big_page_size);
3325 3392
@@ -3465,6 +3532,17 @@ int gk20a_init_vm(struct mm_gk20a *mm,
3465 kref_init(&vm->ref); 3532 kref_init(&vm->ref);
3466 INIT_LIST_HEAD(&vm->reserved_va_list); 3533 INIT_LIST_HEAD(&vm->reserved_va_list);
3467 3534
3535 /*
3536 * This is only necessary for channel address spaces. The best way to
3537 * distinguish channel address spaces from other address spaces is by
3538 * size - if the address space is 4GB or less, it's not a channel.
3539 */
3540 if (vm->va_limit > SZ_4G) {
3541 err = gk20a_init_sema_pool(vm);
3542 if (err)
3543 goto clean_up_big_allocator;
3544 }
3545
3468 return 0; 3546 return 0;
3469 3547
3470clean_up_big_allocator: 3548clean_up_big_allocator:
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index db74a5ca..7bb4d011 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -287,6 +287,11 @@ struct vm_gk20a {
287 /* if non-NULL, kref_put will use this batch when 287 /* if non-NULL, kref_put will use this batch when
288 unmapping. Must hold vm->update_gmmu_lock. */ 288 unmapping. Must hold vm->update_gmmu_lock. */
289 struct vm_gk20a_mapping_batch *kref_put_batch; 289 struct vm_gk20a_mapping_batch *kref_put_batch;
290
291 /*
292 * Each address space needs to have a semaphore pool.
293 */
294 struct gk20a_semaphore_pool *sema_pool;
290}; 295};
291 296
292struct gk20a; 297struct gk20a;
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
index 3b17bfcb..aa375b24 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
@@ -15,63 +15,284 @@
15 * more details. 15 * more details.
16 */ 16 */
17 17
18#include "semaphore_gk20a.h" 18#define pr_fmt(fmt) "gpu_sema: " fmt
19
19#include <linux/dma-mapping.h> 20#include <linux/dma-mapping.h>
21#include <linux/highmem.h>
20#include <linux/slab.h> 22#include <linux/slab.h>
23
24#include <asm/pgtable.h>
25
21#include "gk20a.h" 26#include "gk20a.h"
22#include "mm_gk20a.h" 27#include "mm_gk20a.h"
28#include "semaphore_gk20a.h"
29
30#define __lock_sema_sea(s) \
31 do { \
32 mutex_lock(&s->sea_lock); \
33 } while (0)
23 34
24static const int SEMAPHORE_SIZE = 16; 35#define __unlock_sema_sea(s) \
36 do { \
37 mutex_unlock(&s->sea_lock); \
38 } while (0)
25 39
26struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct gk20a *g, 40/*
27 const char *unique_name, size_t capacity) 41 * Return the sema_sea pointer.
42 */
43struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g)
44{
45 return g->sema_sea;
46}
47
48static int __gk20a_semaphore_sea_grow(struct gk20a_semaphore_sea *sea)
49{
50 int ret = 0;
51 struct gk20a *gk20a = sea->gk20a;
52
53 __lock_sema_sea(sea);
54
55 ret = gk20a_gmmu_alloc_attr(gk20a, DMA_ATTR_NO_KERNEL_MAPPING,
56 PAGE_SIZE * SEMAPHORE_POOL_COUNT,
57 &sea->sea_mem);
58 if (ret)
59 goto out;
60
61 sea->ro_sg_table = sea->sea_mem.sgt;
62 sea->size = SEMAPHORE_POOL_COUNT;
63 sea->map_size = SEMAPHORE_POOL_COUNT * PAGE_SIZE;
64
65out:
66 __unlock_sema_sea(sea);
67 return ret;
68}
69
70/*
71 * Create the semaphore sea. Only create it once - subsequent calls to this will
72 * return the originally created sea pointer.
73 */
74struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *g)
75{
76 if (g->sema_sea)
77 return g->sema_sea;
78
79 g->sema_sea = kzalloc(sizeof(*g->sema_sea), GFP_KERNEL);
80 if (!g->sema_sea)
81 return NULL;
82
83 g->sema_sea->size = 0;
84 g->sema_sea->page_count = 0;
85 g->sema_sea->gk20a = g;
86 INIT_LIST_HEAD(&g->sema_sea->pool_list);
87 mutex_init(&g->sema_sea->sea_lock);
88
89 if (__gk20a_semaphore_sea_grow(g->sema_sea))
90 goto cleanup;
91
92 return g->sema_sea;
93
94cleanup:
95 kfree(g->sema_sea);
96 g->sema_sea = NULL;
97 return NULL;
98}
99
100static int __semaphore_bitmap_alloc(unsigned long *bitmap, unsigned long len)
101{
102 unsigned long idx = find_first_zero_bit(bitmap, len);
103
104 if (idx == len)
105 return -ENOSPC;
106
107 set_bit(idx, bitmap);
108
109 return (int)idx;
110}
111
112/*
113 * Allocate a pool from the sea.
114 */
115struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
116 struct gk20a_semaphore_sea *sea)
28{ 117{
29 struct gk20a_semaphore_pool *p; 118 struct gk20a_semaphore_pool *p;
119 unsigned long page_idx;
120 int err = 0;
121
30 p = kzalloc(sizeof(*p), GFP_KERNEL); 122 p = kzalloc(sizeof(*p), GFP_KERNEL);
31 if (!p) 123 if (!p)
32 return NULL; 124 return ERR_PTR(-ENOMEM);
125
126 __lock_sema_sea(sea);
127
128 page_idx = __semaphore_bitmap_alloc(sea->pools_alloced,
129 SEMAPHORE_POOL_COUNT);
130 if (page_idx < 0) {
131 err = page_idx;
132 goto fail;
133 }
33 134
135 p->page = sea->sea_mem.pages[page_idx];
136 p->ro_sg_table = sea->ro_sg_table;
137 p->page_idx = page_idx;
138 p->sema_sea = sea;
139 INIT_LIST_HEAD(&p->hw_semas);
34 kref_init(&p->ref); 140 kref_init(&p->ref);
35 INIT_LIST_HEAD(&p->maps); 141 mutex_init(&p->pool_lock);
36 mutex_init(&p->maps_mutex); 142
37 p->g = g; 143 sea->page_count++;
38 144 list_add(&p->pool_list_entry, &sea->pool_list);
39 /* Alloc one 4k page of semaphore per channel. */ 145 __unlock_sema_sea(sea);
40 if (gk20a_gmmu_alloc(g, roundup(capacity * SEMAPHORE_SIZE, PAGE_SIZE), 146
41 &p->mem))
42 goto clean_up;
43
44 /* Sacrifice one semaphore in the name of returning error codes. */
45 if (gk20a_allocator_init(&p->alloc, unique_name,
46 SEMAPHORE_SIZE, p->mem.size - SEMAPHORE_SIZE,
47 SEMAPHORE_SIZE))
48 goto clean_up;
49
50 gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->mem.cpu_va,
51 (u64)sg_dma_address(p->mem.sgt->sgl),
52 (u64)sg_phys(p->mem.sgt->sgl));
53 return p; 147 return p;
54 148
55clean_up: 149fail:
56 if (p->mem.size) 150 __unlock_sema_sea(sea);
57 gk20a_gmmu_free(p->g, &p->mem);
58 kfree(p); 151 kfree(p);
59 return NULL; 152 return ERR_PTR(err);
153}
154
155/*
156 * Map a pool into the passed vm's address space. This handles both the fixed
157 * global RO mapping and the non-fixed private RW mapping.
158 */
159int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p,
160 struct vm_gk20a *vm)
161{
162 int ents, err = 0;
163 u64 addr;
164
165 p->cpu_va = vmap(&p->page, 1, 0,
166 pgprot_writecombine(PAGE_KERNEL));
167
168 /* First do the RW mapping. */
169 p->rw_sg_table = kzalloc(sizeof(*p->rw_sg_table), GFP_KERNEL);
170 if (!p->rw_sg_table)
171 return -ENOMEM;
172
173 err = sg_alloc_table_from_pages(p->rw_sg_table, &p->page, 1, 0,
174 PAGE_SIZE, GFP_KERNEL);
175 if (err) {
176 err = -ENOMEM;
177 goto fail;
178 }
179
180 /* Add IOMMU mapping... */
181 ents = dma_map_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
182 DMA_BIDIRECTIONAL);
183 if (ents != 1) {
184 err = -ENOMEM;
185 goto fail_free_sgt;
186 }
187
188 /* Map into the GPU... Doesn't need to be fixed. */
189 p->gpu_va = gk20a_gmmu_map(vm, &p->rw_sg_table, PAGE_SIZE,
190 0, gk20a_mem_flag_none, false);
191 if (!p->gpu_va) {
192 err = -ENOMEM;
193 goto fail_unmap_sgt;
194 }
195
196 /*
197 * And now the global mapping. Take the sea lock so that we don't race
198 * with a concurrent remap.
199 */
200 __lock_sema_sea(p->sema_sea);
201
202 BUG_ON(p->mapped);
203 addr = gk20a_gmmu_fixed_map(vm, &p->sema_sea->ro_sg_table,
204 p->sema_sea->gpu_va, p->sema_sea->map_size,
205 0,
206 gk20a_mem_flag_read_only,
207 false);
208 if (!addr) {
209 err = -ENOMEM;
210 BUG();
211 goto fail_unlock;
212 }
213 p->gpu_va_ro = addr;
214 p->mapped = 1;
215
216 __unlock_sema_sea(p->sema_sea);
217
218 return 0;
219
220fail_unlock:
221 __unlock_sema_sea(p->sema_sea);
222fail_unmap_sgt:
223 dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
224 DMA_BIDIRECTIONAL);
225fail_free_sgt:
226 sg_free_table(p->rw_sg_table);
227fail:
228 kfree(p->rw_sg_table);
229 p->rw_sg_table = NULL;
230 return err;
60} 231}
61 232
233/*
234 * Unmap a semaphore_pool.
235 */
236void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p,
237 struct vm_gk20a *vm)
238{
239 struct gk20a_semaphore_int *hw_sema;
240
241 kunmap(p->cpu_va);
242
243 /* First the global RO mapping... */
244 __lock_sema_sea(p->sema_sea);
245 gk20a_gmmu_unmap(vm, p->gpu_va_ro,
246 p->sema_sea->map_size, gk20a_mem_flag_none);
247 p->ro_sg_table = NULL;
248 __unlock_sema_sea(p->sema_sea);
249
250 /* And now the private RW mapping. */
251 gk20a_gmmu_unmap(vm, p->gpu_va, PAGE_SIZE, gk20a_mem_flag_none);
252 p->gpu_va = 0;
253
254 dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
255 DMA_BIDIRECTIONAL);
256
257 sg_free_table(p->rw_sg_table);
258 kfree(p->rw_sg_table);
259 p->rw_sg_table = NULL;
260
261 gk20a_dbg_info("Unmapped sema-pool: idx = %d", p->page_idx);
262 list_for_each_entry(hw_sema, &p->hw_semas, hw_sema_list)
263 /*
264 * Make sure the mem addresses are all NULL so if this gets
265 * reused we will fault.
266 */
267 hw_sema->value = NULL;
268}
269
270/*
271 * Completely free a semaphore_pool. You should make sure this pool is not
272 * mapped, otherwise there's going to be a memory leak.
273 */
62static void gk20a_semaphore_pool_free(struct kref *ref) 274static void gk20a_semaphore_pool_free(struct kref *ref)
63{ 275{
64 struct gk20a_semaphore_pool *p = 276 struct gk20a_semaphore_pool *p =
65 container_of(ref, struct gk20a_semaphore_pool, ref); 277 container_of(ref, struct gk20a_semaphore_pool, ref);
66 mutex_lock(&p->maps_mutex); 278 struct gk20a_semaphore_sea *s = p->sema_sea;
67 WARN_ON(!list_empty(&p->maps)); 279 struct gk20a_semaphore_int *hw_sema, *tmp;
68 mutex_unlock(&p->maps_mutex); 280
69 gk20a_gmmu_free(p->g, &p->mem); 281 WARN_ON(p->gpu_va || p->rw_sg_table || p->ro_sg_table);
70 gk20a_allocator_destroy(&p->alloc); 282
283 __lock_sema_sea(s);
284 list_del(&p->pool_list_entry);
285 clear_bit(p->page_idx, s->pools_alloced);
286 s->page_count--;
287 __unlock_sema_sea(s);
288
289 list_for_each_entry_safe(hw_sema, tmp, &p->hw_semas, hw_sema_list)
290 kfree(hw_sema);
291
71 kfree(p); 292 kfree(p);
72} 293}
73 294
74static void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p) 295void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p)
75{ 296{
76 kref_get(&p->ref); 297 kref_get(&p->ref);
77} 298}
@@ -81,104 +302,96 @@ void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p)
81 kref_put(&p->ref, gk20a_semaphore_pool_free); 302 kref_put(&p->ref, gk20a_semaphore_pool_free);
82} 303}
83 304
84static struct gk20a_semaphore_pool_map * 305/*
85gk20a_semaphore_pool_find_map_locked(struct gk20a_semaphore_pool *p, 306 * Get the address for a semaphore_pool - if global is true then return the
86 struct vm_gk20a *vm) 307 * global RO address instead of the RW address owned by the semaphore's VM.
308 */
309u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global)
87{ 310{
88 struct gk20a_semaphore_pool_map *map, *found = NULL; 311 if (!global)
89 list_for_each_entry(map, &p->maps, list) { 312 return p->gpu_va;
90 if (map->vm == vm) { 313
91 found = map; 314 return p->gpu_va_ro + (PAGE_SIZE * p->page_idx);
92 break;
93 }
94 }
95 return found;
96} 315}
97 316
98int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p, 317static int __gk20a_init_hw_sema(struct channel_gk20a *ch)
99 struct vm_gk20a *vm,
100 enum gk20a_mem_rw_flag rw_flag)
101{ 318{
102 struct gk20a_semaphore_pool_map *map; 319 int hw_sema_idx;
320 int ret = 0;
321 struct gk20a_semaphore_int *hw_sema;
322 struct gk20a_semaphore_pool *p = ch->vm->sema_pool;
103 323
104 map = kzalloc(sizeof(*map), GFP_KERNEL); 324 BUG_ON(!p);
105 if (!map)
106 return -ENOMEM;
107 map->vm = vm;
108 map->rw_flag = rw_flag;
109 map->gpu_va = gk20a_gmmu_map(vm, &p->mem.sgt, p->mem.size,
110 0/*uncached*/, rw_flag,
111 false);
112 if (!map->gpu_va) {
113 kfree(map);
114 return -ENOMEM;
115 }
116 gk20a_vm_get(vm);
117 325
118 mutex_lock(&p->maps_mutex); 326 mutex_lock(&p->pool_lock);
119 WARN_ON(gk20a_semaphore_pool_find_map_locked(p, vm));
120 list_add(&map->list, &p->maps);
121 mutex_unlock(&p->maps_mutex);
122 return 0;
123}
124 327
125void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p, 328 /* Find an available HW semaphore. */
126 struct vm_gk20a *vm) 329 hw_sema_idx = __semaphore_bitmap_alloc(p->semas_alloced,
127{ 330 PAGE_SIZE / SEMAPHORE_SIZE);
128 struct gk20a_semaphore_pool_map *map; 331 if (hw_sema_idx < 0) {
129 WARN_ON(!vm); 332 ret = hw_sema_idx;
130 333 goto fail;
131 mutex_lock(&p->maps_mutex);
132 map = gk20a_semaphore_pool_find_map_locked(p, vm);
133 if (map) {
134 gk20a_gmmu_unmap(vm, map->gpu_va, p->mem.size, map->rw_flag);
135 gk20a_vm_put(vm);
136 list_del(&map->list);
137 kfree(map);
138 } 334 }
139 mutex_unlock(&p->maps_mutex);
140}
141 335
142u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, 336 hw_sema = kzalloc(sizeof(struct gk20a_semaphore_int), GFP_KERNEL);
143 struct vm_gk20a *vm) 337 if (!hw_sema) {
144{ 338 ret = -ENOMEM;
145 struct gk20a_semaphore_pool_map *map; 339 goto fail_free_idx;
146 u64 gpu_va = 0; 340 }
147 341
148 mutex_lock(&p->maps_mutex); 342 ch->hw_sema = hw_sema;
149 map = gk20a_semaphore_pool_find_map_locked(p, vm); 343 hw_sema->ch = ch;
150 if (map) 344 hw_sema->p = p;
151 gpu_va = map->gpu_va; 345 hw_sema->idx = hw_sema_idx;
152 mutex_unlock(&p->maps_mutex); 346 hw_sema->offset = SEMAPHORE_SIZE * hw_sema_idx;
347 atomic_set(&hw_sema->next_value, 0);
348 hw_sema->value = p->cpu_va + hw_sema->offset;
349 writel(0, hw_sema->value);
153 350
154 return gpu_va; 351 list_add(&hw_sema->hw_sema_list, &p->hw_semas);
352
353 mutex_unlock(&p->pool_lock);
354
355 return 0;
356
357fail_free_idx:
358 clear_bit(hw_sema_idx, p->semas_alloced);
359fail:
360 mutex_unlock(&p->pool_lock);
361 return ret;
155} 362}
156 363
157struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool) 364/*
365 * Allocate a semaphore from the passed pool.
366 *
367 * Since semaphores are ref-counted there's no explicit free for external code
368 * to use. When the ref-count hits 0 the internal free will happen.
369 */
370struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch)
158{ 371{
159 struct gk20a_semaphore *s; 372 struct gk20a_semaphore *s;
373 int ret;
374
375 if (!ch->hw_sema) {
376 ret = __gk20a_init_hw_sema(ch);
377 if (ret)
378 return ERR_PTR(ret);
379 }
160 380
161 s = kzalloc(sizeof(*s), GFP_KERNEL); 381 s = kzalloc(sizeof(*s), GFP_KERNEL);
162 if (!s) 382 if (!s)
163 return NULL; 383 return NULL;
164 384
165 s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE); 385 kref_init(&s->ref);
166 if (!s->offset) { 386 s->hw_sema = ch->hw_sema;
167 gk20a_err(dev_from_gk20a(pool->g), 387 atomic_set(&s->value, 0);
168 "failed to allocate semaphore");
169 kfree(s);
170 return NULL;
171 }
172 388
173 gk20a_semaphore_pool_get(pool); 389 /*
174 s->pool = pool; 390 * Take a ref on the pool so that we can keep this pool alive for
391 * as long as this semaphore is alive.
392 */
393 gk20a_semaphore_pool_get(s->hw_sema->p);
175 394
176 kref_init(&s->ref);
177 /* Initially acquired. */
178 gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 0);
179 gk20a_dbg_info("created semaphore offset=%d, value=%d",
180 s->offset,
181 gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset));
182 return s; 395 return s;
183} 396}
184 397
@@ -187,8 +400,8 @@ static void gk20a_semaphore_free(struct kref *ref)
187 struct gk20a_semaphore *s = 400 struct gk20a_semaphore *s =
188 container_of(ref, struct gk20a_semaphore, ref); 401 container_of(ref, struct gk20a_semaphore, ref);
189 402
190 gk20a_bfree(&s->pool->alloc, s->offset); 403 gk20a_semaphore_pool_put(s->hw_sema->p);
191 gk20a_semaphore_pool_put(s->pool); 404
192 kfree(s); 405 kfree(s);
193} 406}
194 407
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
index 1f12e262..58081b56 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
@@ -15,17 +15,128 @@
15#define SEMAPHORE_GK20A_H 15#define SEMAPHORE_GK20A_H
16 16
17#include <linux/kref.h> 17#include <linux/kref.h>
18#include "gk20a_allocator.h" 18#include <linux/list.h>
19#include <linux/delay.h>
20
21#include "gk20a.h"
19#include "mm_gk20a.h" 22#include "mm_gk20a.h"
23#include "channel_gk20a.h"
24
25/*
26 * Max number of channels that can be used is 512. This of course needs to be
27 * fixed to be dynamic but still fast.
28 */
29#define SEMAPHORE_POOL_COUNT 512
30#define SEMAPHORE_SIZE 16
31#define SEMAPHORE_SEA_GROWTH_RATE 32
32
33struct gk20a_semaphore_sea;
34
35/*
36 * Underlying semaphore data structure. This semaphore can be shared amongst
37 * other semaphore instances.
38 */
39struct gk20a_semaphore_int {
40 int idx; /* Semaphore index. */
41 u32 offset; /* Offset into the pool. */
42 atomic_t next_value; /* Next available value. */
43 u32 *value; /* Current value (access w/ readl()). */
44 u32 nr_incrs; /* Number of increments programmed. */
45 struct gk20a_semaphore_pool *p; /* Pool that owns this sema. */
46 struct channel_gk20a *ch; /* Channel that owns this sema. */
47 struct list_head hw_sema_list; /* List of HW semaphores. */
48};
49
50/*
51 * A semaphore which the rest of the driver actually uses. This consists of a
52 * pointer to a real semaphore and a value to wait for. This allows one physical
53 * semaphore to be shared among an essentially infinite number of submits.
54 */
55struct gk20a_semaphore {
56 struct gk20a_semaphore_int *hw_sema;
20 57
21/* A memory pool for holding semaphores. */ 58 atomic_t value;
59 int incremented;
60
61 struct kref ref;
62};
63
64/*
65 * A semaphore pool. Each address space will own exactly one of these.
66 */
22struct gk20a_semaphore_pool { 67struct gk20a_semaphore_pool {
23 struct mem_desc mem; 68 struct page *page; /* This pool's page of memory */
24 struct gk20a *g; 69 struct list_head pool_list_entry; /* Node for list of pools. */
25 struct list_head maps; 70 void *cpu_va; /* CPU access to the pool. */
26 struct mutex maps_mutex; 71 u64 gpu_va; /* GPU access to the pool. */
72 u64 gpu_va_ro; /* GPU access to the pool. */
73 int page_idx; /* Index into sea bitmap. */
74
75 struct list_head hw_semas; /* List of HW semas. */
76 DECLARE_BITMAP(semas_alloced, PAGE_SIZE / SEMAPHORE_SIZE);
77
78 struct gk20a_semaphore_sea *sema_sea; /* Sea that owns this pool. */
79
80 struct mutex pool_lock;
81
82 /*
83 * This is the address spaces's personal RW table. Other channels will
84 * ultimately map this page as RO.
85 */
86 struct sg_table *rw_sg_table;
87
88 /*
89 * This is to keep track of whether the pool has had its sg_table
90 * updated during sea resizing.
91 */
92 struct sg_table *ro_sg_table;
93
94 int mapped;
95
96 /*
97 * Sometimes a channel can be released before other channels are
98 * done waiting on it. This ref count ensures that the pool doesn't
99 * go away until all semaphores using this pool are cleaned up first.
100 */
27 struct kref ref; 101 struct kref ref;
28 struct gk20a_allocator alloc; 102};
103
104/*
105 * A sea of semaphore pools. Each pool is owned by a single VM. Since multiple
106 * channels can share a VM, each channel gets its own HW semaphore from the
107 * pool. Channels then allocate regular semaphores - basically just a value that
108 * signifies when a particular job is done.
109 */
110struct gk20a_semaphore_sea {
111 struct list_head pool_list; /* List of pools in this sea. */
112 struct gk20a *gk20a;
113
114 size_t size; /* Number of pages available. */
115 u64 gpu_va; /* GPU virtual address of sema sea. */
116 u64 map_size; /* Size of the mapping. */
117
118 /*
119 * TODO:
120 * List of pages that we use to back the pools. The number of pages
121 * can grow dynamically since allocating 512 pages for all channels at
122 * once would be a tremendous waste.
123 */
124 int page_count; /* Pages allocated to pools. */
125
126 struct sg_table *ro_sg_table;
127 /*
128 struct page *pages[SEMAPHORE_POOL_COUNT];
129 */
130
131 struct mem_desc sea_mem;
132
133 /*
134 * Can't use a regular allocator here since the full range of pools are
135 * not always allocated. Instead just use a bitmap.
136 */
137 DECLARE_BITMAP(pools_alloced, SEMAPHORE_POOL_COUNT);
138
139 struct mutex sea_lock; /* Lock alloc/free calls. */
29}; 140};
30 141
31enum gk20a_mem_rw_flag { 142enum gk20a_mem_rw_flag {
@@ -34,64 +145,150 @@ enum gk20a_mem_rw_flag {
34 gk20a_mem_flag_write_only = 2, 145 gk20a_mem_flag_write_only = 2,
35}; 146};
36 147
37/* A semaphore pool can be mapped to multiple GPU address spaces. */ 148/*
38struct gk20a_semaphore_pool_map { 149 * Semaphore sea functions.
39 u64 gpu_va; 150 */
40 enum gk20a_mem_rw_flag rw_flag; 151struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *gk20a);
41 struct vm_gk20a *vm; 152int gk20a_semaphore_sea_map(struct gk20a_semaphore_pool *sea,
42 struct list_head list; 153 struct vm_gk20a *vm);
43}; 154void gk20a_semaphore_sea_unmap(struct gk20a_semaphore_pool *sea,
155 struct vm_gk20a *vm);
156struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g);
157
158/*
159 * Semaphore pool functions.
160 */
161struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
162 struct gk20a_semaphore_sea *sea);
163int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *pool,
164 struct vm_gk20a *vm);
165void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *pool,
166 struct vm_gk20a *vm);
167u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global);
168void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p);
169void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p);
170
171/*
172 * Semaphore functions.
173 */
174struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch);
175void gk20a_semaphore_put(struct gk20a_semaphore *s);
176void gk20a_semaphore_get(struct gk20a_semaphore *s);
177
178/*
179 * Return the address of a specific semaphore.
180 *
181 * Don't call this on a semaphore you don't own - the VA returned will make no
182 * sense in your specific channel's VM.
183 */
184static inline u64 gk20a_semaphore_gpu_rw_va(struct gk20a_semaphore *s)
185{
186 return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, false) +
187 s->hw_sema->offset;
188}
189
190/*
191 * Get the global RO address for the semaphore. Can be called on any semaphore
192 * regardless of whether you own it.
193 */
194static inline u64 gk20a_semaphore_gpu_ro_va(struct gk20a_semaphore *s)
195{
196 return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, true) +
197 s->hw_sema->offset;
198}
199
200static inline u64 gk20a_hw_sema_addr(struct gk20a_semaphore_int *hw_sema)
201{
202 return __gk20a_semaphore_pool_gpu_va(hw_sema->p, true) +
203 hw_sema->offset;
204}
205
206/*
207 * TODO: handle wrap around... Hmm, how to do this?
208 */
209static inline bool gk20a_semaphore_is_released(struct gk20a_semaphore *s)
210{
211 u32 sema_val = readl(s->hw_sema->value);
44 212
45/* A semaphore that lives inside a semaphore pool. */
46struct gk20a_semaphore {
47 struct gk20a_semaphore_pool *pool;
48 /* 213 /*
49 * value exists within the pool's memory at the specified offset. 214 * If the underlying semaphore value is greater than or equal to
50 * 0=acquired, 1=released. 215 * the value of the semaphore then the semaphore has been signaled
216 * (a.k.a. released).
51 */ 217 */
52 u32 offset; /* byte offset within pool */ 218 return sema_val >= atomic_read(&s->value);
53 struct kref ref; 219}
54};
55 220
56/* Create a semaphore pool that can hold at most 'capacity' semaphores. */ 221static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s)
57struct gk20a_semaphore_pool *
58gk20a_semaphore_pool_alloc(struct gk20a *, const char *unique_name,
59 size_t capacity);
60void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *);
61int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *,
62 struct vm_gk20a *,
63 enum gk20a_mem_rw_flag);
64void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *,
65 struct vm_gk20a *);
66u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *,
67 struct vm_gk20a *);
68
69/* Allocate a semaphore from the semaphore pool. The newly allocated
70 * semaphore will be in acquired state (value=0). */
71struct gk20a_semaphore *
72gk20a_semaphore_alloc(struct gk20a_semaphore_pool *);
73void gk20a_semaphore_put(struct gk20a_semaphore *);
74void gk20a_semaphore_get(struct gk20a_semaphore *);
75
76static inline u64 gk20a_semaphore_gpu_va(struct gk20a_semaphore *s,
77 struct vm_gk20a *vm)
78{ 222{
79 return gk20a_semaphore_pool_gpu_va(s->pool, vm) + s->offset; 223 return !gk20a_semaphore_is_released(s);
80} 224}
81 225
82static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s) 226/*
227 * Read the underlying value from a semaphore.
228 */
229static inline u32 gk20a_semaphore_read(struct gk20a_semaphore *s)
83{ 230{
84 u32 v = gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset); 231 return readl(s->hw_sema->value);
232}
85 233
86 /* When often block on value reaching a certain threshold. We must make 234static inline u32 gk20a_semaphore_get_value(struct gk20a_semaphore *s)
87 * sure that if we get unblocked, we haven't read anything too early. */ 235{
88 smp_rmb(); 236 return atomic_read(&s->value);
89 return v == 0;
90} 237}
91 238
239static inline u32 gk20a_semaphore_next_value(struct gk20a_semaphore *s)
240{
241 return atomic_read(&s->hw_sema->next_value);
242}
243
244/*
245 * Note - if you call this then any prior semaphores will also be released.
246 */
92static inline void gk20a_semaphore_release(struct gk20a_semaphore *s) 247static inline void gk20a_semaphore_release(struct gk20a_semaphore *s)
93{ 248{
94 smp_wmb(); 249 u32 current_val;
95 gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 1); 250 u32 val = gk20a_semaphore_get_value(s);
251 int attempts = 0;
252
253 /*
254 * Wait until the sema value is 1 less than the write value. That
255 * way this function is essentially an increment.
256 *
257 * TODO: tune the wait a little better.
258 */
259 while ((current_val = gk20a_semaphore_read(s)) < (val - 1)) {
260 msleep(100);
261 attempts += 1;
262 if (attempts > 100) {
263 WARN(1, "Stall on sema release!");
264 return;
265 }
266 }
267
268 /*
269 * If the semaphore has already passed the value we would write then
270 * this is really just a NO-OP.
271 */
272 if (current_val >= val)
273 return;
274
275 writel(val, s->hw_sema->value);
276}
277
278/*
279 * Configure a software based increment on this semaphore. This is useful for
280 * when we want the GPU to wait on a SW event before processing a channel.
281 * Another way to describe this is when the GPU needs to wait on a SW pre-fence.
282 * The pre-fence signals SW which in turn calls gk20a_semaphore_release() which
283 * then allows the GPU to continue.
284 *
285 * Also used to prep a semaphore for an INCR by the GPU.
286 */
287static inline void gk20a_semaphore_incr(struct gk20a_semaphore *s)
288{
289 BUG_ON(s->incremented);
290
291 atomic_set(&s->value, atomic_add_return(1, &s->hw_sema->next_value));
292 s->incremented = 1;
96} 293}
97#endif 294#endif