diff options
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/as_gk20a.c | 15 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gk20a.c | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gk20a_allocator.c | 1167 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gk20a_allocator.h | 213 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/ltc_gk20a.c | 5 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 202 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 1 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/pmu_gk20a.c | 68 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/pmu_gk20a.h | 4 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c | 15 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gm20b/ltc_gm20b.c | 5 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/vgpu/ltc_vgpu.c | 5 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/vgpu/mm_vgpu.c | 36 |
13 files changed, 1385 insertions, 353 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.c b/drivers/gpu/nvgpu/gk20a/as_gk20a.c index 63569008..eb18fa65 100644 --- a/drivers/gpu/nvgpu/gk20a/as_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.c | |||
@@ -199,21 +199,14 @@ static int gk20a_as_ioctl_get_va_regions( | |||
199 | 199 | ||
200 | for (i = 0; i < write_entries; ++i) { | 200 | for (i = 0; i < write_entries; ++i) { |
201 | struct nvgpu_as_va_region region; | 201 | struct nvgpu_as_va_region region; |
202 | u32 base, limit; | ||
203 | 202 | ||
204 | memset(®ion, 0, sizeof(struct nvgpu_as_va_region)); | 203 | memset(®ion, 0, sizeof(struct nvgpu_as_va_region)); |
205 | 204 | ||
206 | if (!vm->vma[i].constraint.enable) { | ||
207 | base = vm->vma[i].base; | ||
208 | limit = vm->vma[i].limit; | ||
209 | } else { | ||
210 | base = vm->vma[i].constraint.base; | ||
211 | limit = vm->vma[i].constraint.limit; | ||
212 | } | ||
213 | |||
214 | region.page_size = vm->gmmu_page_sizes[i]; | 205 | region.page_size = vm->gmmu_page_sizes[i]; |
215 | region.offset = (u64)base * region.page_size; | 206 | region.offset = vm->vma[i].base; |
216 | region.pages = limit - base; /* NOTE: limit is exclusive */ | 207 | /* No __aeabi_uldivmod() on some platforms... */ |
208 | region.pages = (vm->vma[i].end - vm->vma[i].start) >> | ||
209 | ilog2(region.page_size); | ||
217 | 210 | ||
218 | if (copy_to_user(user_region_ptr + i, ®ion, sizeof(region))) | 211 | if (copy_to_user(user_region_ptr + i, ®ion, sizeof(region))) |
219 | return -EFAULT; | 212 | return -EFAULT; |
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index f3b5544f..2e88726a 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c | |||
@@ -59,6 +59,7 @@ | |||
59 | #include "hw_fb_gk20a.h" | 59 | #include "hw_fb_gk20a.h" |
60 | #include "gk20a_scale.h" | 60 | #include "gk20a_scale.h" |
61 | #include "dbg_gpu_gk20a.h" | 61 | #include "dbg_gpu_gk20a.h" |
62 | #include "gk20a_allocator.h" | ||
62 | #include "hal.h" | 63 | #include "hal.h" |
63 | #include "vgpu/vgpu.h" | 64 | #include "vgpu/vgpu.h" |
64 | 65 | ||
@@ -1532,6 +1533,7 @@ static int gk20a_probe(struct platform_device *dev) | |||
1532 | gr_gk20a_debugfs_init(gk20a); | 1533 | gr_gk20a_debugfs_init(gk20a); |
1533 | gk20a_pmu_debugfs_init(dev); | 1534 | gk20a_pmu_debugfs_init(dev); |
1534 | gk20a_cde_debugfs_init(dev); | 1535 | gk20a_cde_debugfs_init(dev); |
1536 | gk20a_alloc_debugfs_init(dev); | ||
1535 | #endif | 1537 | #endif |
1536 | 1538 | ||
1537 | gk20a_init_gr(gk20a); | 1539 | gk20a_init_gr(gk20a); |
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c index 675a98a2..56fb22df 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * gk20a allocator | 2 | * gk20a allocator |
3 | * | 3 | * |
4 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. | 4 | * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify it | 6 | * This program is free software; you can redistribute it and/or modify it |
7 | * under the terms and conditions of the GNU General Public License, | 7 | * under the terms and conditions of the GNU General Public License, |
@@ -16,112 +16,1149 @@ | |||
16 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | 16 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/seq_file.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/debugfs.h> | ||
23 | |||
24 | #include "platform_gk20a.h" | ||
19 | #include "gk20a_allocator.h" | 25 | #include "gk20a_allocator.h" |
20 | #include <linux/vmalloc.h> | ||
21 | 26 | ||
22 | /* init allocator struct */ | 27 | #include "mm_gk20a.h" |
23 | int gk20a_allocator_init(struct gk20a_allocator *allocator, | 28 | |
24 | const char *name, u32 start, u32 len) | 29 | static struct dentry *balloc_debugfs_root; |
30 | |||
31 | static struct kmem_cache *buddy_cache; /* slab cache for meta data. */ | ||
32 | |||
33 | static u32 balloc_tracing_on; | ||
34 | |||
35 | #define balloc_trace_func() \ | ||
36 | do { \ | ||
37 | if (balloc_tracing_on) \ | ||
38 | trace_printk("%s\n", __func__); \ | ||
39 | } while (0) | ||
40 | |||
41 | #define balloc_trace_func_done() \ | ||
42 | do { \ | ||
43 | if (balloc_tracing_on) \ | ||
44 | trace_printk("%s_done\n", __func__); \ | ||
45 | } while (0) | ||
46 | |||
47 | |||
48 | static void balloc_init_alloc_debug(struct gk20a_allocator *a); | ||
49 | static void balloc_print_stats(struct gk20a_allocator *a, struct seq_file *s, | ||
50 | int lock); | ||
51 | static struct gk20a_buddy *balloc_free_buddy(struct gk20a_allocator *a, | ||
52 | u64 addr); | ||
53 | static void balloc_coalesce(struct gk20a_allocator *a, struct gk20a_buddy *b); | ||
54 | static void __balloc_do_free_fixed(struct gk20a_allocator *a, | ||
55 | struct gk20a_fixed_alloc *falloc); | ||
56 | |||
57 | /* | ||
58 | * This function is not present in older kernel's list.h code. | ||
59 | */ | ||
60 | #ifndef list_last_entry | ||
61 | #define list_last_entry(ptr, type, member) \ | ||
62 | list_entry((ptr)->prev, type, member) | ||
63 | #endif | ||
64 | |||
65 | /* | ||
66 | * GPU buddy allocator for various address spaces. | ||
67 | * | ||
68 | * Current limitations: | ||
69 | * o A fixed allocation could potentially be made that borders PDEs with | ||
70 | * different PTE sizes. This would require that fixed buffer to have | ||
71 | * different sized PTEs for different parts of the allocation. Probably | ||
72 | * best to just require PDE alignment for fixed address allocs. | ||
73 | * | ||
74 | * o It is currently possible to make an allocator that has a buddy alignment | ||
75 | * out of sync with the PDE block size alignment. A simple example is a | ||
76 | * 32GB address space starting at byte 1. Every buddy is shifted off by 1 | ||
77 | * which means each buddy corresponf to more than one actual GPU page. The | ||
78 | * best way to fix this is probably just require PDE blocksize alignment | ||
79 | * for the start of the address space. At the moment all allocators are | ||
80 | * easily PDE aligned so this hasn't been a problem. | ||
81 | */ | ||
82 | |||
83 | /* | ||
84 | * Pick a suitable maximum order for this allocator. | ||
85 | * | ||
86 | * Hueristic: Just guessing that the best max order is the largest single | ||
87 | * block that will fit in the address space. | ||
88 | */ | ||
89 | static void balloc_compute_max_order(struct gk20a_allocator *a) | ||
90 | { | ||
91 | u64 true_max_order = ilog2(a->blks); | ||
92 | |||
93 | if (a->max_order > true_max_order) | ||
94 | a->max_order = true_max_order; | ||
95 | if (a->max_order > GPU_BALLOC_MAX_ORDER) | ||
96 | a->max_order = GPU_BALLOC_MAX_ORDER; | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | * Since we can only allocate in chucks of a->blk_size we need to trim off | ||
101 | * any excess data that is not aligned to a->blk_size. | ||
102 | */ | ||
103 | static void balloc_allocator_align(struct gk20a_allocator *a) | ||
104 | { | ||
105 | a->start = ALIGN(a->base, a->blk_size); | ||
106 | a->end = (a->base + a->length) & ~(a->blk_size - 1); | ||
107 | a->count = a->end - a->start; | ||
108 | a->blks = a->count >> a->blk_shift; | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * Pass NULL for parent if you want a top level buddy. | ||
113 | */ | ||
114 | static struct gk20a_buddy *balloc_new_buddy(struct gk20a_allocator *a, | ||
115 | struct gk20a_buddy *parent, | ||
116 | u64 start, u64 order) | ||
117 | { | ||
118 | struct gk20a_buddy *new_buddy; | ||
119 | |||
120 | new_buddy = kmem_cache_alloc(buddy_cache, GFP_KERNEL); | ||
121 | if (!new_buddy) | ||
122 | return NULL; | ||
123 | |||
124 | memset(new_buddy, 0, sizeof(struct gk20a_buddy)); | ||
125 | |||
126 | new_buddy->parent = parent; | ||
127 | new_buddy->start = start; | ||
128 | new_buddy->order = order; | ||
129 | new_buddy->end = start + (1 << order) * a->blk_size; | ||
130 | |||
131 | return new_buddy; | ||
132 | } | ||
133 | |||
134 | static void __balloc_buddy_list_add(struct gk20a_allocator *a, | ||
135 | struct gk20a_buddy *b, | ||
136 | struct list_head *list) | ||
137 | { | ||
138 | if (buddy_is_in_list(b)) { | ||
139 | balloc_dbg(a, "Oops: adding added buddy (%llu:0x%llx)\n", | ||
140 | b->order, b->start); | ||
141 | BUG(); | ||
142 | } | ||
143 | |||
144 | /* | ||
145 | * Add big PTE blocks to the tail, small to the head for GVA spaces. | ||
146 | * This lets the code that checks if there are available blocks check | ||
147 | * without cycling through the entire list. | ||
148 | */ | ||
149 | if (a->flags & GPU_BALLOC_GVA_SPACE && | ||
150 | b->pte_size == BALLOC_PTE_SIZE_BIG) | ||
151 | list_add_tail(&b->buddy_entry, list); | ||
152 | else | ||
153 | list_add(&b->buddy_entry, list); | ||
154 | |||
155 | buddy_set_in_list(b); | ||
156 | } | ||
157 | |||
158 | static void __balloc_buddy_list_rem(struct gk20a_allocator *a, | ||
159 | struct gk20a_buddy *b) | ||
160 | { | ||
161 | if (!buddy_is_in_list(b)) { | ||
162 | balloc_dbg(a, "Oops: removing removed buddy (%llu:0x%llx)\n", | ||
163 | b->order, b->start); | ||
164 | BUG(); | ||
165 | } | ||
166 | |||
167 | list_del_init(&b->buddy_entry); | ||
168 | buddy_clr_in_list(b); | ||
169 | } | ||
170 | |||
171 | /* | ||
172 | * Add a buddy to one of the buddy lists and deal with the necessary | ||
173 | * book keeping. Adds the buddy to the list specified by the buddy's order. | ||
174 | */ | ||
175 | static void balloc_blist_add(struct gk20a_allocator *a, struct gk20a_buddy *b) | ||
176 | { | ||
177 | __balloc_buddy_list_add(a, b, balloc_get_order_list(a, b->order)); | ||
178 | a->buddy_list_len[b->order]++; | ||
179 | } | ||
180 | |||
181 | static void balloc_blist_rem(struct gk20a_allocator *a, struct gk20a_buddy *b) | ||
182 | { | ||
183 | __balloc_buddy_list_rem(a, b); | ||
184 | a->buddy_list_len[b->order]--; | ||
185 | } | ||
186 | |||
187 | static u64 balloc_get_order(struct gk20a_allocator *a, u64 len) | ||
188 | { | ||
189 | if (len == 0) | ||
190 | return 0; | ||
191 | |||
192 | len--; | ||
193 | len >>= a->blk_shift; | ||
194 | |||
195 | return fls(len); | ||
196 | } | ||
197 | |||
198 | static u64 __balloc_max_order_in(struct gk20a_allocator *a, u64 start, u64 end) | ||
199 | { | ||
200 | u64 size = (end - start) >> a->blk_shift; | ||
201 | |||
202 | if (size > 0) | ||
203 | return min_t(u64, ilog2(size), a->max_order); | ||
204 | else | ||
205 | return GPU_BALLOC_MAX_ORDER; | ||
206 | } | ||
207 | |||
208 | /* | ||
209 | * Initialize the buddy lists. | ||
210 | */ | ||
211 | static int balloc_init_lists(struct gk20a_allocator *a) | ||
212 | { | ||
213 | int i; | ||
214 | u64 bstart, bend, order; | ||
215 | struct gk20a_buddy *buddy; | ||
216 | |||
217 | bstart = a->start; | ||
218 | bend = a->end; | ||
219 | |||
220 | /* First make sure the LLs are valid. */ | ||
221 | for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) | ||
222 | INIT_LIST_HEAD(balloc_get_order_list(a, i)); | ||
223 | |||
224 | while (bstart < bend) { | ||
225 | order = __balloc_max_order_in(a, bstart, bend); | ||
226 | |||
227 | buddy = balloc_new_buddy(a, NULL, bstart, order); | ||
228 | if (!buddy) | ||
229 | goto cleanup; | ||
230 | |||
231 | balloc_blist_add(a, buddy); | ||
232 | bstart += balloc_order_to_len(a, order); | ||
233 | } | ||
234 | |||
235 | return 0; | ||
236 | |||
237 | cleanup: | ||
238 | for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) { | ||
239 | if (!list_empty(balloc_get_order_list(a, i))) { | ||
240 | buddy = list_first_entry(balloc_get_order_list(a, i), | ||
241 | struct gk20a_buddy, buddy_entry); | ||
242 | balloc_blist_rem(a, buddy); | ||
243 | kmem_cache_free(buddy_cache, buddy); | ||
244 | } | ||
245 | } | ||
246 | |||
247 | return -ENOMEM; | ||
248 | } | ||
249 | |||
250 | /* | ||
251 | * Initialize a buddy allocator. Returns 0 on success. This allocator does | ||
252 | * not necessarily manage bytes. It manages distinct ranges of resources. This | ||
253 | * allows the allocator to work for things like comp_tags, semaphores, etc. | ||
254 | * | ||
255 | * @allocator: Ptr to an allocator struct to init. | ||
256 | * @vm: GPU VM to associate this allocator with. Can be NULL. Will be used to | ||
257 | * get PTE size for GVA spaces. | ||
258 | * @name: Name of the allocator. Doesn't have to be static storage. | ||
259 | * @base: The base address of the resource pool being managed. | ||
260 | * @size: Number of resources in the pool. | ||
261 | * @blk_size: Minimum number of resources to allocate at once. For things like | ||
262 | * semaphores this is 1. For GVA this might be as much as 64k. This | ||
263 | * corresponds to order 0. Must be power of 2. | ||
264 | * @max_order: Pick a maximum order. If you leave this as 0, the buddy allocator | ||
265 | * will try and pick a reasonable max order. | ||
266 | * @flags: Extra flags necessary. See GPU_BALLOC_*. | ||
267 | */ | ||
268 | int __gk20a_allocator_init(struct gk20a_allocator *a, | ||
269 | struct vm_gk20a *vm, const char *name, | ||
270 | u64 base, u64 size, u64 blk_size, u64 max_order, | ||
271 | u64 flags) | ||
25 | { | 272 | { |
26 | memset(allocator, 0, sizeof(struct gk20a_allocator)); | 273 | int err; |
274 | |||
275 | memset(a, 0, sizeof(struct gk20a_allocator)); | ||
276 | strncpy(a->name, name, 32); | ||
277 | |||
278 | a->base = base; | ||
279 | a->length = size; | ||
280 | a->blk_size = blk_size; | ||
281 | a->blk_shift = __ffs(blk_size); | ||
282 | |||
283 | /* blk_size must be greater than 0 and a power of 2. */ | ||
284 | if (blk_size == 0) | ||
285 | return -EINVAL; | ||
286 | if (blk_size & (blk_size - 1)) | ||
287 | return -EINVAL; | ||
288 | |||
289 | if (max_order > GPU_BALLOC_MAX_ORDER) | ||
290 | return -EINVAL; | ||
291 | |||
292 | /* If this is to manage a GVA space we need a VM. */ | ||
293 | if (flags & GPU_BALLOC_GVA_SPACE && !vm) | ||
294 | return -EINVAL; | ||
295 | |||
296 | a->vm = vm; | ||
297 | if (flags & GPU_BALLOC_GVA_SPACE) | ||
298 | a->pte_blk_order = balloc_get_order(a, vm->big_page_size << 10); | ||
27 | 299 | ||
28 | strncpy(allocator->name, name, 32); | 300 | a->flags = flags; |
301 | a->max_order = max_order; | ||
29 | 302 | ||
30 | allocator->base = start; | 303 | balloc_allocator_align(a); |
31 | allocator->limit = start + len - 1; | 304 | balloc_compute_max_order(a); |
32 | 305 | ||
33 | allocator->bitmap = vzalloc(BITS_TO_LONGS(len) * sizeof(long)); | 306 | /* Shared buddy kmem_cache for all allocators. */ |
34 | if (!allocator->bitmap) | 307 | if (!buddy_cache) |
308 | buddy_cache = KMEM_CACHE(gk20a_buddy, 0); | ||
309 | if (!buddy_cache) | ||
35 | return -ENOMEM; | 310 | return -ENOMEM; |
36 | 311 | ||
37 | allocator_dbg(allocator, "%s : base %d, limit %d", | 312 | a->alloced_buddies = RB_ROOT; |
38 | allocator->name, allocator->base, allocator->limit); | 313 | err = balloc_init_lists(a); |
314 | if (err) | ||
315 | return err; | ||
39 | 316 | ||
40 | init_rwsem(&allocator->rw_sema); | 317 | mutex_init(&a->lock); |
41 | 318 | ||
42 | allocator->alloc = gk20a_allocator_block_alloc; | 319 | a->init = 1; |
43 | allocator->free = gk20a_allocator_block_free; | 320 | |
321 | balloc_init_alloc_debug(a); | ||
322 | balloc_dbg(a, "New allocator: base 0x%llx\n", a->base); | ||
323 | balloc_dbg(a, " size 0x%llx\n", a->length); | ||
324 | balloc_dbg(a, " blk_size 0x%llx\n", a->blk_size); | ||
325 | balloc_dbg(a, " max_order %llu\n", a->max_order); | ||
326 | balloc_dbg(a, " flags 0x%llx\n", a->flags); | ||
44 | 327 | ||
45 | return 0; | 328 | return 0; |
46 | } | 329 | } |
47 | 330 | ||
48 | /* destroy allocator, free all remaining blocks if any */ | 331 | int gk20a_allocator_init(struct gk20a_allocator *a, const char *name, |
49 | void gk20a_allocator_destroy(struct gk20a_allocator *allocator) | 332 | u64 base, u64 size, u64 blk_size) |
333 | { | ||
334 | return __gk20a_allocator_init(a, NULL, name, | ||
335 | base, size, blk_size, 0, 0); | ||
336 | } | ||
337 | |||
338 | /* | ||
339 | * Clean up and destroy the passed allocator. | ||
340 | */ | ||
341 | void gk20a_allocator_destroy(struct gk20a_allocator *a) | ||
50 | { | 342 | { |
51 | down_write(&allocator->rw_sema); | 343 | struct rb_node *node; |
344 | struct gk20a_buddy *bud; | ||
345 | struct gk20a_fixed_alloc *falloc; | ||
346 | int i; | ||
347 | |||
348 | balloc_lock(a); | ||
349 | |||
350 | if (!IS_ERR_OR_NULL(a->debugfs_entry)) | ||
351 | debugfs_remove(a->debugfs_entry); | ||
352 | |||
353 | /* | ||
354 | * Free the fixed allocs first. | ||
355 | */ | ||
356 | while ((node = rb_first(&a->fixed_allocs)) != NULL) { | ||
357 | falloc = container_of(node, | ||
358 | struct gk20a_fixed_alloc, alloced_entry); | ||
359 | |||
360 | __balloc_do_free_fixed(a, falloc); | ||
361 | rb_erase(node, &a->fixed_allocs); | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * And now free all outstanding allocations. | ||
366 | */ | ||
367 | while ((node = rb_first(&a->alloced_buddies)) != NULL) { | ||
368 | bud = container_of(node, struct gk20a_buddy, alloced_entry); | ||
369 | balloc_free_buddy(a, bud->start); | ||
370 | balloc_blist_add(a, bud); | ||
371 | balloc_coalesce(a, bud); | ||
372 | } | ||
52 | 373 | ||
53 | vfree(allocator->bitmap); | 374 | /* |
375 | * Now clean up the unallocated buddies. | ||
376 | */ | ||
377 | for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) { | ||
378 | BUG_ON(a->buddy_list_alloced[i] != 0); | ||
379 | |||
380 | while (!list_empty(balloc_get_order_list(a, i))) { | ||
381 | bud = list_first_entry(balloc_get_order_list(a, i), | ||
382 | struct gk20a_buddy, buddy_entry); | ||
383 | balloc_blist_rem(a, bud); | ||
384 | kmem_cache_free(buddy_cache, bud); | ||
385 | } | ||
386 | |||
387 | if (a->buddy_list_len[i] != 0) { | ||
388 | pr_info("Excess buddies!!! (%d: %llu)\n", | ||
389 | i, a->buddy_list_len[i]); | ||
390 | BUG(); | ||
391 | } | ||
392 | if (a->buddy_list_split[i] != 0) { | ||
393 | pr_info("Excess split nodes!!! (%d: %llu)\n", | ||
394 | i, a->buddy_list_split[i]); | ||
395 | BUG(); | ||
396 | } | ||
397 | if (a->buddy_list_alloced[i] != 0) { | ||
398 | pr_info("Excess alloced nodes!!! (%d: %llu)\n", | ||
399 | i, a->buddy_list_alloced[i]); | ||
400 | BUG(); | ||
401 | } | ||
402 | } | ||
54 | 403 | ||
55 | memset(allocator, 0, sizeof(struct gk20a_allocator)); | 404 | a->init = 0; |
405 | |||
406 | balloc_unlock(a); | ||
407 | |||
408 | /* | ||
409 | * We cant unlock an allocator after memsetting it. That wipes the | ||
410 | * state of the mutex. Hopefully no one uses the allocator after | ||
411 | * destroying it... | ||
412 | */ | ||
413 | memset(a, 0, sizeof(struct gk20a_allocator)); | ||
56 | } | 414 | } |
57 | 415 | ||
58 | /* | 416 | /* |
59 | * *addr != ~0 for fixed address allocation. if *addr == 0, base addr is | 417 | * Combine the passed buddy if possible. The pointer in @b may not be valid |
60 | * returned to caller in *addr. | 418 | * after this as the buddy may be freed. |
61 | * | 419 | * |
62 | * contiguous allocation, which allocates one block of | 420 | * @a must be locked. |
63 | * contiguous address. | 421 | */ |
64 | */ | 422 | static void balloc_coalesce(struct gk20a_allocator *a, struct gk20a_buddy *b) |
65 | int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator, | ||
66 | u32 *addr, u32 len, u32 align) | ||
67 | { | 423 | { |
68 | unsigned long _addr; | 424 | struct gk20a_buddy *parent; |
69 | 425 | ||
70 | allocator_dbg(allocator, "[in] addr %d, len %d", *addr, len); | 426 | if (buddy_is_alloced(b) || buddy_is_split(b)) |
427 | return; | ||
71 | 428 | ||
72 | if ((*addr != 0 && *addr < allocator->base) || /* check addr range */ | 429 | /* |
73 | *addr + len > allocator->limit || /* check addr range */ | 430 | * If both our buddy and I are both not allocated and not split then |
74 | *addr & (align - 1) || /* check addr alignment */ | 431 | * we can coalesce ourselves. |
75 | len == 0) /* check len */ | 432 | */ |
76 | return -EINVAL; | 433 | if (!b->buddy) |
434 | return; | ||
435 | if (buddy_is_alloced(b->buddy) || buddy_is_split(b->buddy)) | ||
436 | return; | ||
437 | |||
438 | parent = b->parent; | ||
439 | |||
440 | balloc_blist_rem(a, b); | ||
441 | balloc_blist_rem(a, b->buddy); | ||
442 | |||
443 | buddy_clr_split(parent); | ||
444 | a->buddy_list_split[parent->order]--; | ||
445 | balloc_blist_add(a, parent); | ||
446 | |||
447 | /* | ||
448 | * Recursively coalesce as far as we can go. | ||
449 | */ | ||
450 | balloc_coalesce(a, parent); | ||
451 | |||
452 | /* Clean up the remains. */ | ||
453 | kmem_cache_free(buddy_cache, b->buddy); | ||
454 | kmem_cache_free(buddy_cache, b); | ||
455 | } | ||
456 | |||
457 | /* | ||
458 | * Split a buddy into two new buddies who are 1/2 the size of the parent buddy. | ||
459 | * | ||
460 | * @a must be locked. | ||
461 | */ | ||
462 | static int balloc_split_buddy(struct gk20a_allocator *a, struct gk20a_buddy *b, | ||
463 | int pte_size) | ||
464 | { | ||
465 | struct gk20a_buddy *left, *right; | ||
466 | u64 half; | ||
77 | 467 | ||
78 | len = ALIGN(len, align); | 468 | left = balloc_new_buddy(a, b, b->start, b->order - 1); |
79 | if (!len) | 469 | if (!left) |
80 | return -ENOMEM; | 470 | return -ENOMEM; |
81 | 471 | ||
82 | down_write(&allocator->rw_sema); | 472 | half = (b->end - b->start) / 2; |
83 | 473 | ||
84 | _addr = bitmap_find_next_zero_area(allocator->bitmap, | 474 | right = balloc_new_buddy(a, b, b->start + half, b->order - 1); |
85 | allocator->limit - allocator->base + 1, | 475 | if (!right) { |
86 | *addr ? (*addr - allocator->base) : 0, | 476 | kmem_cache_free(buddy_cache, left); |
87 | len, | ||
88 | align - 1); | ||
89 | if ((_addr > allocator->limit - allocator->base + 1) || | ||
90 | (*addr && *addr != (_addr + allocator->base))) { | ||
91 | up_write(&allocator->rw_sema); | ||
92 | return -ENOMEM; | 477 | return -ENOMEM; |
93 | } | 478 | } |
94 | 479 | ||
95 | bitmap_set(allocator->bitmap, _addr, len); | 480 | buddy_set_split(b); |
96 | *addr = allocator->base + _addr; | 481 | a->buddy_list_split[b->order]++; |
97 | 482 | ||
98 | up_write(&allocator->rw_sema); | 483 | b->left = left; |
484 | b->right = right; | ||
485 | left->buddy = right; | ||
486 | right->buddy = left; | ||
487 | left->parent = b; | ||
488 | right->parent = b; | ||
99 | 489 | ||
100 | allocator_dbg(allocator, "[out] addr %d, len %d", *addr, len); | 490 | /* PTE considerations. */ |
491 | if (a->flags & GPU_BALLOC_GVA_SPACE && | ||
492 | left->order <= a->pte_blk_order) { | ||
493 | left->pte_size = pte_size; | ||
494 | right->pte_size = pte_size; | ||
495 | } | ||
496 | |||
497 | balloc_blist_rem(a, b); | ||
498 | balloc_blist_add(a, left); | ||
499 | balloc_blist_add(a, right); | ||
101 | 500 | ||
102 | return 0; | 501 | return 0; |
103 | } | 502 | } |
104 | 503 | ||
105 | /* free all blocks between start and end */ | 504 | /* |
106 | int gk20a_allocator_block_free(struct gk20a_allocator *allocator, | 505 | * Place the passed buddy into the RB tree for allocated buddies. Never fails |
107 | u32 addr, u32 len, u32 align) | 506 | * unless the passed entry is a duplicate which is a bug. |
507 | * | ||
508 | * @a must be locked. | ||
509 | */ | ||
510 | void balloc_alloc_buddy(struct gk20a_allocator *a, struct gk20a_buddy *b) | ||
108 | { | 511 | { |
109 | allocator_dbg(allocator, "[in] addr %d, len %d", addr, len); | 512 | struct rb_node **new = &(a->alloced_buddies.rb_node); |
513 | struct rb_node *parent = NULL; | ||
110 | 514 | ||
111 | if (addr + len > allocator->limit || /* check addr range */ | 515 | while (*new) { |
112 | addr < allocator->base || | 516 | struct gk20a_buddy *bud = container_of(*new, struct gk20a_buddy, |
113 | addr & (align - 1)) /* check addr alignment */ | 517 | alloced_entry); |
114 | return -EINVAL; | ||
115 | 518 | ||
116 | len = ALIGN(len, align); | 519 | parent = *new; |
117 | if (!len) | 520 | if (b->start < bud->start) |
118 | return -EINVAL; | 521 | new = &((*new)->rb_left); |
522 | else if (b->start > bud->start) | ||
523 | new = &((*new)->rb_right); | ||
524 | else | ||
525 | BUG_ON("Duplicate entries in allocated list!\n"); | ||
526 | } | ||
527 | |||
528 | rb_link_node(&b->alloced_entry, parent, new); | ||
529 | rb_insert_color(&b->alloced_entry, &a->alloced_buddies); | ||
530 | |||
531 | buddy_set_alloced(b); | ||
532 | a->buddy_list_alloced[b->order]++; | ||
533 | } | ||
534 | |||
535 | /* | ||
536 | * Remove the passed buddy from the allocated buddy RB tree. Returns the | ||
537 | * deallocated buddy for further processing. | ||
538 | * | ||
539 | * @a must be locked. | ||
540 | */ | ||
541 | static struct gk20a_buddy *balloc_free_buddy(struct gk20a_allocator *a, | ||
542 | u64 addr) | ||
543 | { | ||
544 | struct rb_node *node = a->alloced_buddies.rb_node; | ||
545 | struct gk20a_buddy *bud; | ||
546 | |||
547 | while (node) { | ||
548 | bud = container_of(node, struct gk20a_buddy, alloced_entry); | ||
549 | |||
550 | if (addr < bud->start) | ||
551 | node = node->rb_left; | ||
552 | else if (addr > bud->start) | ||
553 | node = node->rb_right; | ||
554 | else | ||
555 | break; | ||
556 | } | ||
557 | |||
558 | if (!node) | ||
559 | return NULL; | ||
560 | |||
561 | rb_erase(node, &a->alloced_buddies); | ||
562 | buddy_clr_alloced(bud); | ||
563 | a->buddy_list_alloced[bud->order]--; | ||
564 | |||
565 | return bud; | ||
566 | } | ||
567 | |||
568 | /* | ||
569 | * Find a suitable buddy for the given order and PTE type (big or little). | ||
570 | */ | ||
571 | static struct gk20a_buddy *__balloc_find_buddy(struct gk20a_allocator *a, | ||
572 | u64 order, int pte_size) | ||
573 | { | ||
574 | struct gk20a_buddy *bud; | ||
575 | |||
576 | if (list_empty(balloc_get_order_list(a, order))) | ||
577 | return NULL; | ||
578 | |||
579 | if (a->flags & GPU_BALLOC_GVA_SPACE && | ||
580 | pte_size == BALLOC_PTE_SIZE_BIG) | ||
581 | bud = list_last_entry(balloc_get_order_list(a, order), | ||
582 | struct gk20a_buddy, buddy_entry); | ||
583 | else | ||
584 | bud = list_first_entry(balloc_get_order_list(a, order), | ||
585 | struct gk20a_buddy, buddy_entry); | ||
586 | |||
587 | if (bud->pte_size != BALLOC_PTE_SIZE_ANY && | ||
588 | bud->pte_size != pte_size) | ||
589 | return NULL; | ||
590 | |||
591 | return bud; | ||
592 | } | ||
593 | |||
594 | /* | ||
595 | * Allocate a suitably sized buddy. If no suitable buddy exists split higher | ||
596 | * order buddies until we have a suitable buddy to allocate. | ||
597 | * | ||
598 | * For PDE grouping add an extra check to see if a buddy is suitable: that the | ||
599 | * buddy exists in a PDE who's PTE size is reasonable | ||
600 | * | ||
601 | * @a must be locked. | ||
602 | */ | ||
603 | static u64 __balloc_do_alloc(struct gk20a_allocator *a, u64 order, int pte_size) | ||
604 | { | ||
605 | u64 split_order; | ||
606 | struct gk20a_buddy *bud; | ||
607 | |||
608 | split_order = order; | ||
609 | while (!(bud = __balloc_find_buddy(a, split_order, pte_size))) | ||
610 | split_order++; | ||
611 | |||
612 | while (bud->order != order) { | ||
613 | if (balloc_split_buddy(a, bud, pte_size)) | ||
614 | return 0; /* No mem... */ | ||
615 | bud = bud->left; | ||
616 | } | ||
617 | |||
618 | balloc_blist_rem(a, bud); | ||
619 | balloc_alloc_buddy(a, bud); | ||
119 | 620 | ||
120 | down_write(&allocator->rw_sema); | 621 | return bud->start; |
121 | bitmap_clear(allocator->bitmap, addr - allocator->base, len); | 622 | } |
122 | up_write(&allocator->rw_sema); | 623 | |
624 | /* | ||
625 | * Allocate memory from the passed allocator. | ||
626 | */ | ||
627 | u64 gk20a_balloc(struct gk20a_allocator *a, u64 len) | ||
628 | { | ||
629 | u64 order, addr; | ||
630 | int pte_size; | ||
631 | |||
632 | balloc_trace_func(); | ||
633 | |||
634 | balloc_lock(a); | ||
635 | |||
636 | order = balloc_get_order(a, len); | ||
637 | |||
638 | if (order > a->max_order) { | ||
639 | balloc_unlock(a); | ||
640 | balloc_dbg(a, "Alloc fail\n"); | ||
641 | balloc_trace_func_done(); | ||
642 | return 0; | ||
643 | } | ||
644 | |||
645 | /* | ||
646 | * For now pass the base address of the allocator's region to | ||
647 | * __get_pte_size(). This ensures we get the right page size for | ||
648 | * the alloc but we don't have to know what the real address is | ||
649 | * going to be quite yet. | ||
650 | * | ||
651 | * TODO: once userspace supports a unified address space pass 0 for | ||
652 | * the base. This will make only 'len' affect the PTE size. | ||
653 | */ | ||
654 | if (a->flags & GPU_BALLOC_GVA_SPACE) | ||
655 | pte_size = __get_pte_size(a->vm, a->base, len); | ||
656 | else | ||
657 | pte_size = BALLOC_PTE_SIZE_ANY; | ||
658 | |||
659 | addr = __balloc_do_alloc(a, order, pte_size); | ||
660 | |||
661 | a->bytes_alloced += len; | ||
662 | a->bytes_alloced_real += balloc_order_to_len(a, order); | ||
663 | |||
664 | balloc_unlock(a); | ||
665 | balloc_dbg(a, "Alloc 0x%-10llx %3lld:0x%-10llx pte_size=%s\n", | ||
666 | addr, order, len, | ||
667 | pte_size == gmmu_page_size_big ? "big" : | ||
668 | pte_size == gmmu_page_size_small ? "small" : | ||
669 | "NA/any"); | ||
670 | |||
671 | balloc_trace_func_done(); | ||
672 | return addr; | ||
673 | } | ||
674 | |||
675 | /* | ||
676 | * See if the passed range is actually available for allocation. If so, then | ||
677 | * return 1, otherwise return 0. | ||
678 | * | ||
679 | * TODO: Right now this uses the unoptimal approach of going through all | ||
680 | * outstanding allocations and checking their base/ends. This could be better. | ||
681 | */ | ||
682 | static int balloc_is_range_free(struct gk20a_allocator *a, u64 base, u64 end) | ||
683 | { | ||
684 | struct rb_node *node; | ||
685 | struct gk20a_buddy *bud; | ||
686 | |||
687 | node = rb_first(&a->alloced_buddies); | ||
688 | if (!node) | ||
689 | return 1; /* No allocs yet. */ | ||
690 | |||
691 | bud = container_of(node, struct gk20a_buddy, alloced_entry); | ||
692 | |||
693 | while (bud->start < end) { | ||
694 | if ((bud->start > base && bud->start < end) || | ||
695 | (bud->end > base && bud->end < end)) | ||
696 | return 0; | ||
697 | |||
698 | node = rb_next(node); | ||
699 | if (!node) | ||
700 | break; | ||
701 | bud = container_of(node, struct gk20a_buddy, alloced_entry); | ||
702 | } | ||
703 | |||
704 | return 1; | ||
705 | } | ||
706 | |||
707 | static void balloc_alloc_fixed(struct gk20a_allocator *a, | ||
708 | struct gk20a_fixed_alloc *f) | ||
709 | { | ||
710 | struct rb_node **new = &(a->fixed_allocs.rb_node); | ||
711 | struct rb_node *parent = NULL; | ||
712 | |||
713 | while (*new) { | ||
714 | struct gk20a_fixed_alloc *falloc = | ||
715 | container_of(*new, struct gk20a_fixed_alloc, | ||
716 | alloced_entry); | ||
717 | |||
718 | parent = *new; | ||
719 | if (f->start < falloc->start) | ||
720 | new = &((*new)->rb_left); | ||
721 | else if (f->start > falloc->start) | ||
722 | new = &((*new)->rb_right); | ||
723 | else | ||
724 | BUG_ON("Duplicate entries in allocated list!\n"); | ||
725 | } | ||
726 | |||
727 | rb_link_node(&f->alloced_entry, parent, new); | ||
728 | rb_insert_color(&f->alloced_entry, &a->fixed_allocs); | ||
729 | } | ||
730 | |||
731 | /* | ||
732 | * Remove the passed buddy from the allocated buddy RB tree. Returns the | ||
733 | * deallocated buddy for further processing. | ||
734 | * | ||
735 | * @a must be locked. | ||
736 | */ | ||
737 | static struct gk20a_fixed_alloc *balloc_free_fixed(struct gk20a_allocator *a, | ||
738 | u64 addr) | ||
739 | { | ||
740 | struct rb_node *node = a->fixed_allocs.rb_node; | ||
741 | struct gk20a_fixed_alloc *falloc; | ||
742 | |||
743 | while (node) { | ||
744 | falloc = container_of(node, | ||
745 | struct gk20a_fixed_alloc, alloced_entry); | ||
746 | |||
747 | if (addr < falloc->start) | ||
748 | node = node->rb_left; | ||
749 | else if (addr > falloc->start) | ||
750 | node = node->rb_right; | ||
751 | else | ||
752 | break; | ||
753 | } | ||
754 | |||
755 | if (!node) | ||
756 | return NULL; | ||
757 | |||
758 | rb_erase(node, &a->fixed_allocs); | ||
759 | |||
760 | return falloc; | ||
761 | } | ||
762 | |||
763 | /* | ||
764 | * Find the parent range - doesn't necessarily need the parent to actually exist | ||
765 | * as a buddy. Finding an existing parent comes later... | ||
766 | */ | ||
767 | static void __balloc_get_parent_range(struct gk20a_allocator *a, | ||
768 | u64 base, u64 order, | ||
769 | u64 *pbase, u64 *porder) | ||
770 | { | ||
771 | u64 base_mask; | ||
772 | u64 shifted_base = balloc_base_shift(a, base); | ||
773 | |||
774 | order++; | ||
775 | base_mask = ~((a->blk_size << order) - 1); | ||
776 | |||
777 | shifted_base &= base_mask; | ||
778 | |||
779 | *pbase = balloc_base_unshift(a, shifted_base); | ||
780 | *porder = order; | ||
781 | } | ||
782 | |||
783 | /* | ||
784 | * Makes a buddy at the passed address. This will make all parent buddies | ||
785 | * necessary for this buddy to exist as well. | ||
786 | */ | ||
787 | static struct gk20a_buddy *__balloc_make_fixed_buddy(struct gk20a_allocator *a, | ||
788 | u64 base, u64 order) | ||
789 | { | ||
790 | struct gk20a_buddy *bud = NULL; | ||
791 | struct list_head *order_list; | ||
792 | u64 cur_order = order, cur_base = base; | ||
793 | |||
794 | /* | ||
795 | * Algo: | ||
796 | * 1. Keep jumping up a buddy order until we find the real buddy that | ||
797 | * this buddy exists in. | ||
798 | * 2. Then work our way down through the buddy tree until we hit a dead | ||
799 | * end. | ||
800 | * 3. Start splitting buddies until we split to the one we need to | ||
801 | * make. | ||
802 | */ | ||
803 | while (cur_order <= a->max_order) { | ||
804 | int found = 0; | ||
805 | |||
806 | order_list = balloc_get_order_list(a, cur_order); | ||
807 | list_for_each_entry(bud, order_list, buddy_entry) { | ||
808 | if (bud->start == cur_base) { | ||
809 | found = 1; | ||
810 | break; | ||
811 | } | ||
812 | } | ||
813 | |||
814 | if (found) | ||
815 | break; | ||
816 | |||
817 | __balloc_get_parent_range(a, cur_base, cur_order, | ||
818 | &cur_base, &cur_order); | ||
819 | } | ||
820 | |||
821 | if (cur_order > a->max_order) { | ||
822 | balloc_dbg(a, "No buddy for range ???\n"); | ||
823 | return NULL; | ||
824 | } | ||
825 | |||
826 | /* Split this buddy as necessary until we get the target buddy. */ | ||
827 | while (bud->start != base || bud->order != order) { | ||
828 | if (balloc_split_buddy(a, bud, BALLOC_PTE_SIZE_ANY)) { | ||
829 | balloc_coalesce(a, bud); | ||
830 | return NULL; | ||
831 | } | ||
832 | |||
833 | if (base < bud->right->start) | ||
834 | bud = bud->left; | ||
835 | else | ||
836 | bud = bud->right; | ||
837 | |||
838 | } | ||
839 | |||
840 | return bud; | ||
841 | } | ||
842 | |||
843 | static u64 __balloc_do_alloc_fixed(struct gk20a_allocator *a, | ||
844 | struct gk20a_fixed_alloc *falloc, | ||
845 | u64 base, u64 len) | ||
846 | { | ||
847 | u64 shifted_base, inc_base; | ||
848 | u64 align_order; | ||
849 | |||
850 | shifted_base = balloc_base_shift(a, base); | ||
851 | if (shifted_base == 0) | ||
852 | align_order = __fls(len >> a->blk_shift); | ||
853 | else | ||
854 | align_order = min_t(u64, | ||
855 | __ffs(shifted_base >> a->blk_shift), | ||
856 | __fls(len >> a->blk_shift)); | ||
857 | |||
858 | if (align_order > a->max_order) { | ||
859 | balloc_dbg(a, "Align order too big: %llu > %llu\n", | ||
860 | align_order, a->max_order); | ||
861 | return 0; | ||
862 | } | ||
863 | |||
864 | /* | ||
865 | * Generate a list of buddies that satisfy this allocation. | ||
866 | */ | ||
867 | inc_base = shifted_base; | ||
868 | while (inc_base < (shifted_base + len)) { | ||
869 | u64 order_len = balloc_order_to_len(a, align_order); | ||
870 | u64 remaining; | ||
871 | struct gk20a_buddy *bud; | ||
872 | |||
873 | bud = __balloc_make_fixed_buddy(a, | ||
874 | balloc_base_unshift(a, inc_base), | ||
875 | align_order); | ||
876 | if (!bud) { | ||
877 | balloc_dbg(a, "Fixed buddy failed: {0x%llx, %llu}!\n", | ||
878 | balloc_base_unshift(a, inc_base), | ||
879 | align_order); | ||
880 | goto err_and_cleanup; | ||
881 | } | ||
882 | |||
883 | balloc_blist_rem(a, bud); | ||
884 | balloc_alloc_buddy(a, bud); | ||
885 | __balloc_buddy_list_add(a, bud, &falloc->buddies); | ||
886 | |||
887 | /* Book keeping. */ | ||
888 | inc_base += order_len; | ||
889 | remaining = (shifted_base + len) - inc_base; | ||
890 | align_order = __ffs(inc_base >> a->blk_shift); | ||
891 | |||
892 | /* If we don't have much left - trim down align_order. */ | ||
893 | if (balloc_order_to_len(a, align_order) > remaining) | ||
894 | align_order = __balloc_max_order_in(a, inc_base, | ||
895 | inc_base + remaining); | ||
896 | } | ||
897 | |||
898 | return base; | ||
123 | 899 | ||
124 | allocator_dbg(allocator, "[out] addr %d, len %d", addr, len); | 900 | err_and_cleanup: |
901 | while (!list_empty(&falloc->buddies)) { | ||
902 | struct gk20a_buddy *bud = list_first_entry(&falloc->buddies, | ||
903 | struct gk20a_buddy, | ||
904 | buddy_entry); | ||
905 | |||
906 | __balloc_buddy_list_rem(a, bud); | ||
907 | balloc_free_buddy(a, bud->start); | ||
908 | kmem_cache_free(buddy_cache, bud); | ||
909 | } | ||
910 | |||
911 | return 0; | ||
912 | } | ||
913 | |||
914 | /* | ||
915 | * Allocate a fixed address allocation. The address of the allocation is @base | ||
916 | * and the length is @len. This is not a typical buddy allocator operation and | ||
917 | * as such has a high possibility of failure if the address space is heavily in | ||
918 | * use. | ||
919 | * | ||
920 | * Please do not use this function unless _absolutely_ necessary. | ||
921 | */ | ||
922 | u64 gk20a_balloc_fixed(struct gk20a_allocator *a, u64 base, u64 len) | ||
923 | { | ||
924 | struct gk20a_fixed_alloc *falloc = NULL; | ||
925 | struct gk20a_buddy *bud; | ||
926 | u64 ret, real_bytes = 0; | ||
927 | |||
928 | balloc_trace_func(); | ||
929 | |||
930 | /* If base isn't aligned to an order 0 block, fail. */ | ||
931 | if (base & (a->blk_size - 1)) | ||
932 | goto fail; | ||
933 | |||
934 | if (len == 0) | ||
935 | goto fail; | ||
936 | |||
937 | falloc = kmalloc(sizeof(*falloc), GFP_KERNEL); | ||
938 | if (!falloc) | ||
939 | goto fail; | ||
940 | |||
941 | INIT_LIST_HEAD(&falloc->buddies); | ||
942 | falloc->start = base; | ||
943 | falloc->end = base + len; | ||
944 | |||
945 | balloc_lock(a); | ||
946 | if (!balloc_is_range_free(a, base, base + len)) { | ||
947 | balloc_dbg(a, "Range not free: 0x%llx -> 0x%llx\n", | ||
948 | base, base + len); | ||
949 | goto fail_unlock; | ||
950 | } | ||
951 | |||
952 | ret = __balloc_do_alloc_fixed(a, falloc, base, len); | ||
953 | if (!ret) { | ||
954 | balloc_dbg(a, "Alloc-fixed failed ?? 0x%llx -> 0x%llx\n", | ||
955 | base, base + len); | ||
956 | goto fail_unlock; | ||
957 | } | ||
958 | |||
959 | balloc_alloc_fixed(a, falloc); | ||
960 | |||
961 | list_for_each_entry(bud, &falloc->buddies, buddy_entry) | ||
962 | real_bytes += (bud->end - bud->start); | ||
963 | |||
964 | a->bytes_alloced += len; | ||
965 | a->bytes_alloced_real += real_bytes; | ||
966 | |||
967 | balloc_unlock(a); | ||
968 | balloc_dbg(a, "Alloc (fixed) 0x%llx\n", base); | ||
969 | |||
970 | balloc_trace_func_done(); | ||
971 | return base; | ||
972 | |||
973 | fail_unlock: | ||
974 | balloc_unlock(a); | ||
975 | fail: | ||
976 | kfree(falloc); | ||
977 | balloc_trace_func_done(); | ||
978 | return 0; | ||
979 | } | ||
980 | |||
981 | static void __balloc_do_free_fixed(struct gk20a_allocator *a, | ||
982 | struct gk20a_fixed_alloc *falloc) | ||
983 | { | ||
984 | struct gk20a_buddy *bud; | ||
985 | |||
986 | while (!list_empty(&falloc->buddies)) { | ||
987 | bud = list_first_entry(&falloc->buddies, | ||
988 | struct gk20a_buddy, | ||
989 | buddy_entry); | ||
990 | __balloc_buddy_list_rem(a, bud); | ||
991 | |||
992 | balloc_free_buddy(a, bud->start); | ||
993 | balloc_blist_add(a, bud); | ||
994 | a->bytes_freed += balloc_order_to_len(a, bud->order); | ||
995 | |||
996 | /* | ||
997 | * Attempt to defrag the allocation. | ||
998 | */ | ||
999 | balloc_coalesce(a, bud); | ||
1000 | } | ||
1001 | |||
1002 | kfree(falloc); | ||
1003 | } | ||
1004 | |||
1005 | /* | ||
1006 | * Free the passed allocation. | ||
1007 | */ | ||
1008 | void gk20a_bfree(struct gk20a_allocator *a, u64 addr) | ||
1009 | { | ||
1010 | struct gk20a_buddy *bud; | ||
1011 | struct gk20a_fixed_alloc *falloc; | ||
1012 | |||
1013 | balloc_trace_func(); | ||
1014 | |||
1015 | if (!addr) { | ||
1016 | balloc_trace_func_done(); | ||
1017 | return; | ||
1018 | } | ||
1019 | |||
1020 | balloc_lock(a); | ||
1021 | |||
1022 | /* | ||
1023 | * First see if this is a fixed alloc. If not fall back to a regular | ||
1024 | * buddy. | ||
1025 | */ | ||
1026 | falloc = balloc_free_fixed(a, addr); | ||
1027 | if (falloc) { | ||
1028 | __balloc_do_free_fixed(a, falloc); | ||
1029 | goto done; | ||
1030 | } | ||
1031 | |||
1032 | bud = balloc_free_buddy(a, addr); | ||
1033 | if (!bud) | ||
1034 | goto done; | ||
1035 | |||
1036 | balloc_blist_add(a, bud); | ||
1037 | a->bytes_freed += balloc_order_to_len(a, bud->order); | ||
1038 | |||
1039 | /* | ||
1040 | * Attempt to defrag the allocation. | ||
1041 | */ | ||
1042 | balloc_coalesce(a, bud); | ||
1043 | |||
1044 | done: | ||
1045 | balloc_unlock(a); | ||
1046 | balloc_dbg(a, "Free 0x%llx\n", addr); | ||
1047 | balloc_trace_func_done(); | ||
1048 | return; | ||
1049 | } | ||
1050 | |||
1051 | /* | ||
1052 | * Print the buddy allocator top level stats. If you pass @s as NULL then the | ||
1053 | * stats are printed to the kernel log. This lets this code be used for | ||
1054 | * debugging purposes internal to the allocator. | ||
1055 | */ | ||
1056 | static void balloc_print_stats(struct gk20a_allocator *a, struct seq_file *s, | ||
1057 | int lock) | ||
1058 | { | ||
1059 | #define __balloc_pstat(s, fmt, arg...) \ | ||
1060 | do { \ | ||
1061 | if (s) \ | ||
1062 | seq_printf(s, fmt, ##arg); \ | ||
1063 | else \ | ||
1064 | balloc_dbg(a, fmt, ##arg); \ | ||
1065 | } while (0) | ||
1066 | |||
1067 | int i; | ||
1068 | struct rb_node *node; | ||
1069 | struct gk20a_fixed_alloc *falloc; | ||
1070 | |||
1071 | __balloc_pstat(s, "base = %llu, limit = %llu, blk_size = %llu\n", | ||
1072 | a->base, a->length, a->blk_size); | ||
1073 | __balloc_pstat(s, "Internal params:\n"); | ||
1074 | __balloc_pstat(s, " start = %llu\n", a->start); | ||
1075 | __balloc_pstat(s, " end = %llu\n", a->end); | ||
1076 | __balloc_pstat(s, " count = %llu\n", a->count); | ||
1077 | __balloc_pstat(s, " blks = %llu\n", a->blks); | ||
1078 | __balloc_pstat(s, " max_order = %llu\n", a->max_order); | ||
1079 | |||
1080 | __balloc_pstat(s, "Buddy blocks:\n"); | ||
1081 | __balloc_pstat(s, " Order Free Alloced Split\n"); | ||
1082 | __balloc_pstat(s, " ----- ---- ------- -----\n"); | ||
1083 | |||
1084 | if (lock) | ||
1085 | balloc_lock(a); | ||
1086 | for (i = a->max_order; i >= 0; i--) { | ||
1087 | if (a->buddy_list_len[i] == 0 && | ||
1088 | a->buddy_list_alloced[i] == 0 && | ||
1089 | a->buddy_list_split[i] == 0) | ||
1090 | continue; | ||
1091 | |||
1092 | __balloc_pstat(s, " %3d %-7llu %-9llu %llu\n", i, | ||
1093 | a->buddy_list_len[i], | ||
1094 | a->buddy_list_alloced[i], | ||
1095 | a->buddy_list_split[i]); | ||
1096 | } | ||
1097 | |||
1098 | __balloc_pstat(s, "\n"); | ||
1099 | |||
1100 | for (node = rb_first(&a->fixed_allocs), i = 1; | ||
1101 | node != NULL; | ||
1102 | node = rb_next(node)) { | ||
1103 | falloc = container_of(node, | ||
1104 | struct gk20a_fixed_alloc, alloced_entry); | ||
1105 | |||
1106 | __balloc_pstat(s, "Fixed alloc (%d): [0x%llx -> 0x%llx]\n", | ||
1107 | i, falloc->start, falloc->end); | ||
1108 | } | ||
1109 | |||
1110 | __balloc_pstat(s, "\n"); | ||
1111 | __balloc_pstat(s, "Bytes allocated: %llu\n", a->bytes_alloced); | ||
1112 | __balloc_pstat(s, "Bytes allocated (real): %llu\n", | ||
1113 | a->bytes_alloced_real); | ||
1114 | __balloc_pstat(s, "Bytes freed: %llu\n", a->bytes_freed); | ||
1115 | |||
1116 | if (lock) | ||
1117 | balloc_unlock(a); | ||
1118 | |||
1119 | #undef __balloc_pstats | ||
1120 | } | ||
1121 | |||
1122 | static int __alloc_show(struct seq_file *s, void *unused) | ||
1123 | { | ||
1124 | struct gk20a_allocator *a = s->private; | ||
1125 | |||
1126 | balloc_print_stats(a, s, 1); | ||
125 | 1127 | ||
126 | return 0; | 1128 | return 0; |
127 | } | 1129 | } |
1130 | |||
1131 | static int __alloc_open(struct inode *inode, struct file *file) | ||
1132 | { | ||
1133 | return single_open(file, __alloc_show, inode->i_private); | ||
1134 | } | ||
1135 | |||
1136 | static const struct file_operations __alloc_fops = { | ||
1137 | .open = __alloc_open, | ||
1138 | .read = seq_read, | ||
1139 | .llseek = seq_lseek, | ||
1140 | .release = single_release, | ||
1141 | }; | ||
1142 | |||
1143 | static void balloc_init_alloc_debug(struct gk20a_allocator *a) | ||
1144 | { | ||
1145 | if (!balloc_debugfs_root) | ||
1146 | return; | ||
1147 | |||
1148 | a->debugfs_entry = debugfs_create_file(a->name, S_IRUGO, | ||
1149 | balloc_debugfs_root, | ||
1150 | a, &__alloc_fops); | ||
1151 | } | ||
1152 | |||
1153 | void gk20a_alloc_debugfs_init(struct platform_device *pdev) | ||
1154 | { | ||
1155 | struct gk20a_platform *platform = platform_get_drvdata(pdev); | ||
1156 | struct dentry *gpu_root = platform->debugfs; | ||
1157 | |||
1158 | balloc_debugfs_root = debugfs_create_dir("allocators", gpu_root); | ||
1159 | if (IS_ERR_OR_NULL(balloc_debugfs_root)) | ||
1160 | return; | ||
1161 | |||
1162 | debugfs_create_u32("tracing", 0664, balloc_debugfs_root, | ||
1163 | &balloc_tracing_on); | ||
1164 | } | ||
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h index 69a227bd..e86e053b 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. | 2 | * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify it | 4 | * This program is free software; you can redistribute it and/or modify it |
5 | * under the terms and conditions of the GNU General Public License, | 5 | * under the terms and conditions of the GNU General Public License, |
@@ -17,75 +17,190 @@ | |||
17 | #ifndef GK20A_ALLOCATOR_H | 17 | #ifndef GK20A_ALLOCATOR_H |
18 | #define GK20A_ALLOCATOR_H | 18 | #define GK20A_ALLOCATOR_H |
19 | 19 | ||
20 | #include <linux/list.h> | ||
20 | #include <linux/rbtree.h> | 21 | #include <linux/rbtree.h> |
21 | #include <linux/rwsem.h> | 22 | #include <linux/debugfs.h> |
22 | #include <linux/slab.h> | 23 | #include <linux/platform_device.h> |
23 | 24 | ||
24 | /* #define ALLOCATOR_DEBUG */ | 25 | /* #define ALLOCATOR_DEBUG */ |
25 | 26 | ||
26 | /* main struct */ | 27 | /* |
28 | * Each buddy is an element in a binary tree. | ||
29 | */ | ||
30 | struct gk20a_buddy { | ||
31 | struct gk20a_buddy *parent; /* Parent node. */ | ||
32 | struct gk20a_buddy *buddy; /* This node's buddy. */ | ||
33 | struct gk20a_buddy *left; /* Lower address sub-node. */ | ||
34 | struct gk20a_buddy *right; /* Higher address sub-node. */ | ||
35 | |||
36 | struct list_head buddy_entry; /* List entry for various lists. */ | ||
37 | struct rb_node alloced_entry; /* RB tree of allocations. */ | ||
38 | |||
39 | u64 start; /* Start address of this buddy. */ | ||
40 | u64 end; /* End address of this buddy. */ | ||
41 | u64 order; /* Buddy order. */ | ||
42 | |||
43 | #define BALLOC_BUDDY_ALLOCED 0x1 | ||
44 | #define BALLOC_BUDDY_SPLIT 0x2 | ||
45 | #define BALLOC_BUDDY_IN_LIST 0x4 | ||
46 | int flags; /* List of associated flags. */ | ||
47 | |||
48 | /* | ||
49 | * Size of the PDE this buddy is using. This allows for grouping like | ||
50 | * sized allocations into the same PDE. | ||
51 | */ | ||
52 | #define BALLOC_PTE_SIZE_ANY 0x0 | ||
53 | #define BALLOC_PTE_SIZE_SMALL 0x1 | ||
54 | #define BALLOC_PTE_SIZE_BIG 0x2 | ||
55 | int pte_size; | ||
56 | }; | ||
57 | |||
58 | #define __buddy_flag_ops(flag, flag_up) \ | ||
59 | static inline int buddy_is_ ## flag(struct gk20a_buddy *b) \ | ||
60 | { \ | ||
61 | return b->flags & BALLOC_BUDDY_ ## flag_up; \ | ||
62 | } \ | ||
63 | static inline void buddy_set_ ## flag(struct gk20a_buddy *b) \ | ||
64 | { \ | ||
65 | b->flags |= BALLOC_BUDDY_ ## flag_up; \ | ||
66 | } \ | ||
67 | static inline void buddy_clr_ ## flag(struct gk20a_buddy *b) \ | ||
68 | { \ | ||
69 | b->flags &= ~BALLOC_BUDDY_ ## flag_up; \ | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * int buddy_is_alloced(struct gk20a_buddy *b); | ||
74 | * void buddy_set_alloced(struct gk20a_buddy *b); | ||
75 | * void buddy_clr_alloced(struct gk20a_buddy *b); | ||
76 | * | ||
77 | * int buddy_is_split(struct gk20a_buddy *b); | ||
78 | * void buddy_set_split(struct gk20a_buddy *b); | ||
79 | * void buddy_clr_split(struct gk20a_buddy *b); | ||
80 | * | ||
81 | * int buddy_is_in_list(struct gk20a_buddy *b); | ||
82 | * void buddy_set_in_list(struct gk20a_buddy *b); | ||
83 | * void buddy_clr_in_list(struct gk20a_buddy *b); | ||
84 | */ | ||
85 | __buddy_flag_ops(alloced, ALLOCED); | ||
86 | __buddy_flag_ops(split, SPLIT); | ||
87 | __buddy_flag_ops(in_list, IN_LIST); | ||
88 | |||
89 | /* | ||
90 | * Keeps info for a fixed allocation. | ||
91 | */ | ||
92 | struct gk20a_fixed_alloc { | ||
93 | struct list_head buddies; /* List of buddies. */ | ||
94 | struct rb_node alloced_entry; /* RB tree of fixed allocations. */ | ||
95 | |||
96 | u64 start; /* Start of fixed block. */ | ||
97 | u64 end; /* End address. */ | ||
98 | }; | ||
99 | |||
100 | struct vm_gk20a; | ||
101 | |||
102 | /* | ||
103 | * GPU buddy allocator for the various GPU address spaces. Each addressable unit | ||
104 | * doesn't have to correspond to a byte. In some cases each unit is a more | ||
105 | * complex object such as a comp_tag line or the like. | ||
106 | * | ||
107 | * The max order is computed based on the size of the minimum order and the size | ||
108 | * of the address space. | ||
109 | * | ||
110 | * order_size is the size of an order 0 buddy. | ||
111 | */ | ||
27 | struct gk20a_allocator { | 112 | struct gk20a_allocator { |
28 | 113 | ||
29 | char name[32]; /* name for allocator */ | 114 | struct vm_gk20a *vm; /* Parent VM - can be NULL. */ |
30 | struct rb_root rb_root; /* rb tree root for blocks */ | ||
31 | 115 | ||
32 | u32 base; /* min value of this linear space */ | 116 | char name[32]; /* Name of allocator. */ |
33 | u32 limit; /* max value = limit - 1 */ | ||
34 | 117 | ||
35 | unsigned long *bitmap; /* bitmap */ | 118 | u64 base; /* Base address of the space. */ |
119 | u64 length; /* Length of the space. */ | ||
120 | u64 blk_size; /* Size of order 0 allocation. */ | ||
121 | u64 blk_shift; /* Shift to divide by blk_size. */ | ||
36 | 122 | ||
37 | struct gk20a_alloc_block *block_first; /* first block in list */ | 123 | int init; /* Non-zero if initialized. */ |
38 | struct gk20a_alloc_block *block_recent; /* last visited block */ | ||
39 | 124 | ||
40 | u32 first_free_addr; /* first free addr, non-contigous | 125 | /* Internal stuff. */ |
41 | allocation preferred start, | 126 | u64 start; /* Real start (aligned to blk_size). */ |
42 | in order to pick up small holes */ | 127 | u64 end; /* Real end, trimmed if needed. */ |
43 | u32 last_free_addr; /* last free addr, contiguous | 128 | u64 count; /* Count of objects in space. */ |
44 | allocation preferred start */ | 129 | u64 blks; /* Count of blks in the space. */ |
45 | u32 cached_hole_size; /* max free hole size up to | 130 | u64 max_order; /* Specific maximum order. */ |
46 | last_free_addr */ | ||
47 | u32 block_count; /* number of blocks */ | ||
48 | 131 | ||
49 | struct rw_semaphore rw_sema; /* lock */ | 132 | struct rb_root alloced_buddies; /* Outstanding allocations. */ |
50 | struct kmem_cache *block_cache; /* slab cache */ | 133 | struct rb_root fixed_allocs; /* Outstanding fixed allocations. */ |
51 | 134 | ||
52 | /* if enabled, constrain to [base, limit) */ | 135 | struct mutex lock; /* Protects buddy access. */ |
53 | struct { | ||
54 | bool enable; | ||
55 | u32 base; | ||
56 | u32 limit; | ||
57 | } constraint; | ||
58 | 136 | ||
59 | int (*alloc)(struct gk20a_allocator *allocator, | 137 | #define GPU_BALLOC_GVA_SPACE 0x1 |
60 | u32 *addr, u32 len, u32 align); | 138 | u64 flags; |
61 | int (*free)(struct gk20a_allocator *allocator, | ||
62 | u32 addr, u32 len, u32 align); | ||
63 | 139 | ||
64 | }; | 140 | /* |
141 | * Impose an upper bound on the maximum order. | ||
142 | */ | ||
143 | #define GPU_BALLOC_MAX_ORDER 31 | ||
144 | #define GPU_BALLOC_ORDER_LIST_LEN (GPU_BALLOC_MAX_ORDER + 1) | ||
65 | 145 | ||
66 | int gk20a_allocator_init(struct gk20a_allocator *allocator, | 146 | struct list_head buddy_list[GPU_BALLOC_ORDER_LIST_LEN]; |
67 | const char *name, u32 base, u32 size); | 147 | u64 buddy_list_len[GPU_BALLOC_ORDER_LIST_LEN]; |
68 | void gk20a_allocator_destroy(struct gk20a_allocator *allocator); | 148 | u64 buddy_list_split[GPU_BALLOC_ORDER_LIST_LEN]; |
149 | u64 buddy_list_alloced[GPU_BALLOC_ORDER_LIST_LEN]; | ||
69 | 150 | ||
70 | int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator, | 151 | /* |
71 | u32 *addr, u32 len, u32 align); | 152 | * This is for when the allocator is managing a GVA space (the |
153 | * GPU_BALLOC_GVA_SPACE bit is set in @flags). This requires | ||
154 | * that we group like sized allocations into PDE blocks. | ||
155 | */ | ||
156 | u64 pte_blk_order; | ||
72 | 157 | ||
73 | int gk20a_allocator_block_free(struct gk20a_allocator *allocator, | 158 | struct dentry *debugfs_entry; |
74 | u32 addr, u32 len, u32 align); | ||
75 | 159 | ||
76 | #if defined(ALLOCATOR_DEBUG) | 160 | u64 bytes_alloced; |
161 | u64 bytes_alloced_real; | ||
162 | u64 bytes_freed; | ||
163 | }; | ||
77 | 164 | ||
78 | #define allocator_dbg(alloctor, format, arg...) \ | 165 | #define balloc_lock(a) mutex_lock(&(a)->lock) |
79 | do { \ | 166 | #define balloc_unlock(a) mutex_unlock(&(a)->lock) |
80 | if (1) \ | ||
81 | pr_debug("gk20a_allocator (%s) %s: " format "\n",\ | ||
82 | alloctor->name, __func__, ##arg);\ | ||
83 | } while (0) | ||
84 | 167 | ||
85 | #else /* ALLOCATOR_DEBUG */ | 168 | #define balloc_get_order_list(a, order) (&(a)->buddy_list[(order)]) |
169 | #define balloc_order_to_len(a, order) ((1 << order) * (a)->blk_size) | ||
170 | #define balloc_base_shift(a, base) ((base) - (a)->start) | ||
171 | #define balloc_base_unshift(a, base) ((base) + (a)->start) | ||
86 | 172 | ||
87 | #define allocator_dbg(format, arg...) | 173 | int gk20a_allocator_init(struct gk20a_allocator *allocator, |
174 | const char *name, u64 base, u64 size, u64 order0); | ||
175 | int __gk20a_allocator_init(struct gk20a_allocator *allocator, | ||
176 | struct vm_gk20a *vm, const char *name, | ||
177 | u64 base, u64 size, u64 order0, | ||
178 | u64 max_order, u64 flags); | ||
179 | void gk20a_allocator_destroy(struct gk20a_allocator *allocator); | ||
88 | 180 | ||
89 | #endif /* ALLOCATOR_DEBUG */ | 181 | /* |
182 | * Normal alloc/free operations for the buddy allocator. | ||
183 | */ | ||
184 | u64 gk20a_balloc(struct gk20a_allocator *allocator, u64 len); | ||
185 | void gk20a_bfree(struct gk20a_allocator *allocator, u64 addr); | ||
186 | |||
187 | /* | ||
188 | * Special interface to allocate a memory regions with a specific starting | ||
189 | * address. Yikes. | ||
190 | */ | ||
191 | u64 gk20a_balloc_fixed(struct gk20a_allocator *allocator, u64 base, u64 len); | ||
192 | |||
193 | /* | ||
194 | * Debugfs init. | ||
195 | */ | ||
196 | void gk20a_alloc_debugfs_init(struct platform_device *pdev); | ||
197 | |||
198 | #if defined(ALLOCATOR_DEBUG) | ||
199 | #define balloc_dbg(alloctor, format, arg...) \ | ||
200 | pr_info("%-25s %25s() " format, \ | ||
201 | alloctor->name, __func__, ##arg) | ||
202 | #else | ||
203 | #define balloc_dbg(allocator, format, arg...) | ||
204 | #endif | ||
90 | 205 | ||
91 | #endif /* GK20A_ALLOCATOR_H */ | 206 | #endif /* GK20A_ALLOCATOR_H */ |
diff --git a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c index 02bea0a1..7cb386f0 100644 --- a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c | |||
@@ -89,9 +89,8 @@ static int gk20a_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr) | |||
89 | if (err) | 89 | if (err) |
90 | return err; | 90 | return err; |
91 | 91 | ||
92 | gk20a_allocator_init(&gr->comp_tags, "comptag", | 92 | __gk20a_allocator_init(&gr->comp_tags, NULL, "comptag", |
93 | 1, /* start */ | 93 | 1, max_comptag_lines - 1, 1, 10, 0); |
94 | max_comptag_lines - 1); /* length*/ | ||
95 | 94 | ||
96 | gr->comptags_per_cacheline = comptags_per_cacheline; | 95 | gr->comptags_per_cacheline = comptags_per_cacheline; |
97 | gr->slices_per_ltc = slices_per_fbp / g->ltc_count; | 96 | gr->slices_per_ltc = slices_per_fbp / g->ltc_count; |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 735c262a..a38db709 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c | |||
@@ -132,10 +132,8 @@ static void gk20a_mm_delete_priv(void *_priv) | |||
132 | 132 | ||
133 | if (priv->comptags.lines) { | 133 | if (priv->comptags.lines) { |
134 | BUG_ON(!priv->comptag_allocator); | 134 | BUG_ON(!priv->comptag_allocator); |
135 | priv->comptag_allocator->free(priv->comptag_allocator, | 135 | gk20a_bfree(priv->comptag_allocator, |
136 | priv->comptags.offset, | 136 | priv->comptags.real_offset); |
137 | priv->comptags.allocated_lines, | ||
138 | 1); | ||
139 | } | 137 | } |
140 | 138 | ||
141 | /* Free buffer states */ | 139 | /* Free buffer states */ |
@@ -226,10 +224,9 @@ static int gk20a_alloc_comptags(struct gk20a *g, | |||
226 | u32 *ctag_map_win_ctagline) | 224 | u32 *ctag_map_win_ctagline) |
227 | { | 225 | { |
228 | struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev); | 226 | struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev); |
229 | u32 offset = 0; | ||
230 | int err; | ||
231 | u32 ctaglines_to_allocate; | 227 | u32 ctaglines_to_allocate; |
232 | u32 ctagline_align; | 228 | u32 ctagline_align = 1; |
229 | u32 offset; | ||
233 | const u32 aggregate_cacheline_sz = | 230 | const u32 aggregate_cacheline_sz = |
234 | g->gr.cacheline_size * g->gr.slices_per_ltc * | 231 | g->gr.cacheline_size * g->gr.slices_per_ltc * |
235 | g->ltc_count; | 232 | g->ltc_count; |
@@ -243,7 +240,6 @@ static int gk20a_alloc_comptags(struct gk20a *g, | |||
243 | 240 | ||
244 | if (!user_mappable) { | 241 | if (!user_mappable) { |
245 | ctaglines_to_allocate = lines; | 242 | ctaglines_to_allocate = lines; |
246 | ctagline_align = 1; | ||
247 | } else { | 243 | } else { |
248 | /* Unfortunately, we cannot use allocation alignment | 244 | /* Unfortunately, we cannot use allocation alignment |
249 | * here, since compbits per cacheline is not always a | 245 | * here, since compbits per cacheline is not always a |
@@ -275,82 +271,26 @@ static int gk20a_alloc_comptags(struct gk20a *g, | |||
275 | 271 | ||
276 | if (ctaglines_to_allocate < lines) | 272 | if (ctaglines_to_allocate < lines) |
277 | return -EINVAL; /* integer overflow */ | 273 | return -EINVAL; /* integer overflow */ |
274 | pr_info("user-mapped CTAGS: %u\n", ctaglines_to_allocate); | ||
278 | } | 275 | } |
279 | 276 | ||
280 | /* store the allocator so we can use it when we free the ctags */ | 277 | /* store the allocator so we can use it when we free the ctags */ |
281 | priv->comptag_allocator = allocator; | 278 | priv->comptag_allocator = allocator; |
282 | err = allocator->alloc(allocator, &offset, | 279 | offset = gk20a_balloc(allocator, ctaglines_to_allocate); |
283 | ctaglines_to_allocate, 1); | 280 | if (!offset) |
284 | if (!err) { | 281 | return -ENOMEM; |
285 | const u32 alignment_lines = | ||
286 | DIV_ROUND_UP(offset, ctagline_align) * ctagline_align - | ||
287 | offset; | ||
288 | |||
289 | /* prune the preceding ctaglines that were allocated | ||
290 | for alignment */ | ||
291 | if (alignment_lines) { | ||
292 | /* free alignment lines */ | ||
293 | int tmp= | ||
294 | allocator->free(allocator, offset, | ||
295 | alignment_lines, | ||
296 | 1); | ||
297 | WARN_ON(tmp); | ||
298 | |||
299 | offset += alignment_lines; | ||
300 | ctaglines_to_allocate -= alignment_lines; | ||
301 | } | ||
302 | 282 | ||
303 | /* check if we can prune the trailing, too */ | 283 | priv->comptags.lines = lines; |
304 | if (user_mappable) | 284 | priv->comptags.real_offset = offset; |
305 | { | ||
306 | u32 needed_cachelines = | ||
307 | DIV_ROUND_UP(lines, g->gr.comptags_per_cacheline); | ||
308 | |||
309 | u32 first_unneeded_cacheline = | ||
310 | DIV_ROUND_UP(round_up(needed_cachelines * | ||
311 | aggregate_cacheline_sz, | ||
312 | small_pgsz), | ||
313 | aggregate_cacheline_sz); | ||
314 | u32 needed_ctaglines = | ||
315 | first_unneeded_cacheline * | ||
316 | g->gr.comptags_per_cacheline; | ||
317 | |||
318 | u64 win_size; | ||
319 | |||
320 | if (needed_ctaglines < ctaglines_to_allocate) { | ||
321 | /* free alignment lines */ | ||
322 | int tmp= | ||
323 | allocator->free( | ||
324 | allocator, | ||
325 | offset + needed_ctaglines, | ||
326 | (ctaglines_to_allocate - | ||
327 | needed_ctaglines), | ||
328 | 1); | ||
329 | WARN_ON(tmp); | ||
330 | |||
331 | ctaglines_to_allocate = needed_ctaglines; | ||
332 | } | ||
333 | 285 | ||
334 | *ctag_map_win_ctagline = offset; | 286 | if (user_mappable) |
335 | win_size = | 287 | offset = DIV_ROUND_UP(offset, ctagline_align) * ctagline_align; |
336 | DIV_ROUND_UP(lines, | ||
337 | g->gr.comptags_per_cacheline) * | ||
338 | aggregate_cacheline_sz; | ||
339 | 288 | ||
340 | *ctag_map_win_size = round_up(win_size, small_pgsz); | 289 | priv->comptags.offset = offset; |
341 | } | ||
342 | 290 | ||
343 | priv->comptags.offset = offset; | 291 | return 0; |
344 | priv->comptags.lines = lines; | ||
345 | priv->comptags.allocated_lines = ctaglines_to_allocate; | ||
346 | priv->comptags.user_mappable = user_mappable; | ||
347 | } | ||
348 | return err; | ||
349 | } | 292 | } |
350 | 293 | ||
351 | |||
352 | |||
353 | |||
354 | static int gk20a_init_mm_reset_enable_hw(struct gk20a *g) | 294 | static int gk20a_init_mm_reset_enable_hw(struct gk20a *g) |
355 | { | 295 | { |
356 | gk20a_dbg_fn(""); | 296 | gk20a_dbg_fn(""); |
@@ -901,14 +841,12 @@ static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset) | |||
901 | } | 841 | } |
902 | 842 | ||
903 | u64 gk20a_vm_alloc_va(struct vm_gk20a *vm, | 843 | u64 gk20a_vm_alloc_va(struct vm_gk20a *vm, |
904 | u64 size, | 844 | u64 size, |
905 | enum gmmu_pgsz_gk20a gmmu_pgsz_idx) | 845 | enum gmmu_pgsz_gk20a gmmu_pgsz_idx) |
906 | 846 | ||
907 | { | 847 | { |
908 | struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx]; | 848 | struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx]; |
909 | int err; | ||
910 | u64 offset; | 849 | u64 offset; |
911 | u32 start_page_nr = 0, num_pages; | ||
912 | u64 gmmu_page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx]; | 850 | u64 gmmu_page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx]; |
913 | 851 | ||
914 | if (gmmu_pgsz_idx >= gmmu_nr_page_sizes) { | 852 | if (gmmu_pgsz_idx >= gmmu_nr_page_sizes) { |
@@ -924,28 +862,19 @@ u64 gk20a_vm_alloc_va(struct vm_gk20a *vm, | |||
924 | 862 | ||
925 | } | 863 | } |
926 | 864 | ||
927 | /* be certain we round up to gmmu_page_size if needed */ | 865 | /* Be certain we round up to gmmu_page_size if needed */ |
928 | /* TBD: DIV_ROUND_UP -> undefined reference to __aeabi_uldivmod */ | ||
929 | size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1); | 866 | size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1); |
930 | |||
931 | gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size, | 867 | gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size, |
932 | vm->gmmu_page_sizes[gmmu_pgsz_idx]>>10); | 868 | vm->gmmu_page_sizes[gmmu_pgsz_idx]>>10); |
933 | 869 | ||
934 | /* The vma allocator represents page accounting. */ | 870 | offset = gk20a_balloc(vma, size); |
935 | num_pages = size >> ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]); | 871 | if (!offset) { |
936 | |||
937 | err = vma->alloc(vma, &start_page_nr, num_pages, 1); | ||
938 | |||
939 | if (err) { | ||
940 | gk20a_err(dev_from_vm(vm), | 872 | gk20a_err(dev_from_vm(vm), |
941 | "%s oom: sz=0x%llx", vma->name, size); | 873 | "%s oom: sz=0x%llx", vma->name, size); |
942 | return 0; | 874 | return 0; |
943 | } | 875 | } |
944 | 876 | ||
945 | offset = (u64)start_page_nr << | ||
946 | ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]); | ||
947 | gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset); | 877 | gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset); |
948 | |||
949 | return offset; | 878 | return offset; |
950 | } | 879 | } |
951 | 880 | ||
@@ -954,25 +883,12 @@ int gk20a_vm_free_va(struct vm_gk20a *vm, | |||
954 | enum gmmu_pgsz_gk20a pgsz_idx) | 883 | enum gmmu_pgsz_gk20a pgsz_idx) |
955 | { | 884 | { |
956 | struct gk20a_allocator *vma = &vm->vma[pgsz_idx]; | 885 | struct gk20a_allocator *vma = &vm->vma[pgsz_idx]; |
957 | u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; | ||
958 | u32 page_shift = ilog2(page_size); | ||
959 | u32 start_page_nr, num_pages; | ||
960 | int err; | ||
961 | 886 | ||
962 | gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx", | 887 | gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx", |
963 | vma->name, offset, size); | 888 | vma->name, offset, size); |
889 | gk20a_bfree(vma, offset); | ||
964 | 890 | ||
965 | start_page_nr = (u32)(offset >> page_shift); | 891 | return 0; |
966 | num_pages = (u32)((size + page_size - 1) >> page_shift); | ||
967 | |||
968 | err = vma->free(vma, start_page_nr, num_pages, 1); | ||
969 | if (err) { | ||
970 | gk20a_err(dev_from_vm(vm), | ||
971 | "not found: offset=0x%llx, sz=0x%llx", | ||
972 | offset, size); | ||
973 | } | ||
974 | |||
975 | return err; | ||
976 | } | 892 | } |
977 | 893 | ||
978 | static int insert_mapped_buffer(struct rb_root *root, | 894 | static int insert_mapped_buffer(struct rb_root *root, |
@@ -1169,7 +1085,7 @@ static int validate_fixed_buffer(struct vm_gk20a *vm, | |||
1169 | 1085 | ||
1170 | if (map_offset & (vm->gmmu_page_sizes[bfr->pgsz_idx] - 1)) { | 1086 | if (map_offset & (vm->gmmu_page_sizes[bfr->pgsz_idx] - 1)) { |
1171 | gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx", | 1087 | gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx", |
1172 | map_offset); | 1088 | map_offset); |
1173 | return -EINVAL; | 1089 | return -EINVAL; |
1174 | } | 1090 | } |
1175 | 1091 | ||
@@ -2613,7 +2529,6 @@ int gk20a_init_vm(struct mm_gk20a *mm, | |||
2613 | char *name) | 2529 | char *name) |
2614 | { | 2530 | { |
2615 | int err, i; | 2531 | int err, i; |
2616 | u32 num_small_pages, num_large_pages, low_hole_pages; | ||
2617 | char alloc_name[32]; | 2532 | char alloc_name[32]; |
2618 | u64 small_vma_size, large_vma_size; | 2533 | u64 small_vma_size, large_vma_size; |
2619 | u32 pde_lo, pde_hi; | 2534 | u32 pde_lo, pde_hi; |
@@ -2674,34 +2589,31 @@ int gk20a_init_vm(struct mm_gk20a *mm, | |||
2674 | large_vma_size = vm->va_limit - small_vma_size; | 2589 | large_vma_size = vm->va_limit - small_vma_size; |
2675 | } | 2590 | } |
2676 | 2591 | ||
2677 | num_small_pages = (u32)(small_vma_size >> | ||
2678 | ilog2(vm->gmmu_page_sizes[gmmu_page_size_small])); | ||
2679 | |||
2680 | /* num_pages above is without regard to the low-side hole. */ | ||
2681 | low_hole_pages = (vm->va_start >> | ||
2682 | ilog2(vm->gmmu_page_sizes[gmmu_page_size_small])); | ||
2683 | |||
2684 | snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name, | 2592 | snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name, |
2685 | vm->gmmu_page_sizes[gmmu_page_size_small]>>10); | 2593 | vm->gmmu_page_sizes[gmmu_page_size_small]>>10); |
2686 | err = gk20a_allocator_init(&vm->vma[gmmu_page_size_small], | 2594 | err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_small], |
2687 | alloc_name, | 2595 | vm, alloc_name, |
2688 | low_hole_pages, /*start*/ | 2596 | vm->va_start, |
2689 | num_small_pages - low_hole_pages);/* length*/ | 2597 | small_vma_size - vm->va_start, |
2598 | SZ_4K, | ||
2599 | GPU_BALLOC_MAX_ORDER, | ||
2600 | GPU_BALLOC_GVA_SPACE); | ||
2690 | if (err) | 2601 | if (err) |
2691 | goto clean_up_ptes; | 2602 | goto clean_up_ptes; |
2692 | 2603 | ||
2693 | if (big_pages) { | 2604 | if (big_pages) { |
2694 | u32 start = (u32)(small_vma_size >> | ||
2695 | ilog2(vm->gmmu_page_sizes[gmmu_page_size_big])); | ||
2696 | num_large_pages = (u32)(large_vma_size >> | ||
2697 | ilog2(vm->gmmu_page_sizes[gmmu_page_size_big])); | ||
2698 | |||
2699 | snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", | 2605 | snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", |
2700 | name, vm->gmmu_page_sizes[gmmu_page_size_big]>>10); | 2606 | name, vm->gmmu_page_sizes[gmmu_page_size_big]>>10); |
2701 | err = gk20a_allocator_init(&vm->vma[gmmu_page_size_big], | 2607 | /* |
2702 | alloc_name, | 2608 | * Big page VMA starts at the end of the small page VMA. |
2703 | start, /* start */ | 2609 | */ |
2704 | num_large_pages); /* length */ | 2610 | err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_big], |
2611 | vm, alloc_name, | ||
2612 | small_vma_size, | ||
2613 | large_vma_size, | ||
2614 | big_page_size, | ||
2615 | GPU_BALLOC_MAX_ORDER, | ||
2616 | GPU_BALLOC_GVA_SPACE); | ||
2705 | if (err) | 2617 | if (err) |
2706 | goto clean_up_small_allocator; | 2618 | goto clean_up_small_allocator; |
2707 | } | 2619 | } |
@@ -2782,9 +2694,9 @@ int gk20a_vm_release_share(struct gk20a_as_share *as_share) | |||
2782 | int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, | 2694 | int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, |
2783 | struct nvgpu_as_alloc_space_args *args) | 2695 | struct nvgpu_as_alloc_space_args *args) |
2784 | 2696 | ||
2785 | { int err = -ENOMEM; | 2697 | { |
2698 | int err = -ENOMEM; | ||
2786 | int pgsz_idx = gmmu_page_size_small; | 2699 | int pgsz_idx = gmmu_page_size_small; |
2787 | u32 start_page_nr; | ||
2788 | struct gk20a_allocator *vma; | 2700 | struct gk20a_allocator *vma; |
2789 | struct vm_gk20a *vm = as_share->vm; | 2701 | struct vm_gk20a *vm = as_share->vm; |
2790 | struct gk20a *g = vm->mm->g; | 2702 | struct gk20a *g = vm->mm->g; |
@@ -2815,21 +2727,19 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, | |||
2815 | goto clean_up; | 2727 | goto clean_up; |
2816 | } | 2728 | } |
2817 | 2729 | ||
2818 | start_page_nr = 0; | 2730 | vma = &vm->vma[pgsz_idx]; |
2819 | if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) | 2731 | if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) |
2820 | start_page_nr = (u32)(args->o_a.offset >> | 2732 | vaddr_start = gk20a_balloc_fixed(vma, args->o_a.offset, |
2821 | ilog2(vm->gmmu_page_sizes[pgsz_idx])); | 2733 | (u64)args->pages * |
2734 | (u64)args->page_size); | ||
2735 | else | ||
2736 | vaddr_start = gk20a_balloc(vma, args->pages * args->page_size); | ||
2822 | 2737 | ||
2823 | vma = &vm->vma[pgsz_idx]; | 2738 | if (!vaddr_start) { |
2824 | err = vma->alloc(vma, &start_page_nr, args->pages, 1); | ||
2825 | if (err) { | ||
2826 | kfree(va_node); | 2739 | kfree(va_node); |
2827 | goto clean_up; | 2740 | goto clean_up; |
2828 | } | 2741 | } |
2829 | 2742 | ||
2830 | vaddr_start = (u64)start_page_nr << | ||
2831 | ilog2(vm->gmmu_page_sizes[pgsz_idx]); | ||
2832 | |||
2833 | va_node->vaddr_start = vaddr_start; | 2743 | va_node->vaddr_start = vaddr_start; |
2834 | va_node->size = (u64)args->page_size * (u64)args->pages; | 2744 | va_node->size = (u64)args->page_size * (u64)args->pages; |
2835 | va_node->pgsz_idx = pgsz_idx; | 2745 | va_node->pgsz_idx = pgsz_idx; |
@@ -2853,7 +2763,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, | |||
2853 | true); | 2763 | true); |
2854 | if (!map_offset) { | 2764 | if (!map_offset) { |
2855 | mutex_unlock(&vm->update_gmmu_lock); | 2765 | mutex_unlock(&vm->update_gmmu_lock); |
2856 | vma->free(vma, start_page_nr, args->pages, 1); | 2766 | gk20a_bfree(vma, vaddr_start); |
2857 | kfree(va_node); | 2767 | kfree(va_node); |
2858 | goto clean_up; | 2768 | goto clean_up; |
2859 | } | 2769 | } |
@@ -2865,6 +2775,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, | |||
2865 | mutex_unlock(&vm->update_gmmu_lock); | 2775 | mutex_unlock(&vm->update_gmmu_lock); |
2866 | 2776 | ||
2867 | args->o_a.offset = vaddr_start; | 2777 | args->o_a.offset = vaddr_start; |
2778 | err = 0; | ||
2868 | 2779 | ||
2869 | clean_up: | 2780 | clean_up: |
2870 | return err; | 2781 | return err; |
@@ -2875,7 +2786,6 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share, | |||
2875 | { | 2786 | { |
2876 | int err = -ENOMEM; | 2787 | int err = -ENOMEM; |
2877 | int pgsz_idx; | 2788 | int pgsz_idx; |
2878 | u32 start_page_nr; | ||
2879 | struct gk20a_allocator *vma; | 2789 | struct gk20a_allocator *vma; |
2880 | struct vm_gk20a *vm = as_share->vm; | 2790 | struct vm_gk20a *vm = as_share->vm; |
2881 | struct vm_reserved_va_node *va_node; | 2791 | struct vm_reserved_va_node *va_node; |
@@ -2888,14 +2798,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share, | |||
2888 | pgsz_idx = __nv_gmmu_va_is_upper(vm, args->offset) ? | 2798 | pgsz_idx = __nv_gmmu_va_is_upper(vm, args->offset) ? |
2889 | gmmu_page_size_big : gmmu_page_size_small; | 2799 | gmmu_page_size_big : gmmu_page_size_small; |
2890 | 2800 | ||
2891 | start_page_nr = (u32)(args->offset >> | ||
2892 | ilog2(vm->gmmu_page_sizes[pgsz_idx])); | ||
2893 | |||
2894 | vma = &vm->vma[pgsz_idx]; | 2801 | vma = &vm->vma[pgsz_idx]; |
2895 | err = vma->free(vma, start_page_nr, args->pages, 1); | 2802 | gk20a_bfree(vma, args->offset); |
2896 | |||
2897 | if (err) | ||
2898 | goto clean_up; | ||
2899 | 2803 | ||
2900 | mutex_lock(&vm->update_gmmu_lock); | 2804 | mutex_lock(&vm->update_gmmu_lock); |
2901 | va_node = addr_to_reservation(vm, args->offset); | 2805 | va_node = addr_to_reservation(vm, args->offset); |
@@ -2925,8 +2829,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share, | |||
2925 | kfree(va_node); | 2829 | kfree(va_node); |
2926 | } | 2830 | } |
2927 | mutex_unlock(&vm->update_gmmu_lock); | 2831 | mutex_unlock(&vm->update_gmmu_lock); |
2832 | err = 0; | ||
2928 | 2833 | ||
2929 | clean_up: | ||
2930 | return err; | 2834 | return err; |
2931 | } | 2835 | } |
2932 | 2836 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index c1f8a4f0..82003cd0 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h | |||
@@ -131,6 +131,7 @@ enum gmmu_pgsz_gk20a { | |||
131 | }; | 131 | }; |
132 | 132 | ||
133 | struct gk20a_comptags { | 133 | struct gk20a_comptags { |
134 | u32 real_offset; | ||
134 | u32 offset; | 135 | u32 offset; |
135 | u32 lines; | 136 | u32 lines; |
136 | u32 allocated_lines; | 137 | u32 allocated_lines; |
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c index 2456c784..11322293 100644 --- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c | |||
@@ -2816,7 +2816,6 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu) | |||
2816 | struct pmu_payload payload; | 2816 | struct pmu_payload payload; |
2817 | u32 seq; | 2817 | u32 seq; |
2818 | u32 data; | 2818 | u32 data; |
2819 | int err = 0; | ||
2820 | 2819 | ||
2821 | gk20a_dbg_fn(""); | 2820 | gk20a_dbg_fn(""); |
2822 | 2821 | ||
@@ -2867,12 +2866,11 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu) | |||
2867 | gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data); | 2866 | gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data); |
2868 | 2867 | ||
2869 | if (!pmu->sample_buffer) | 2868 | if (!pmu->sample_buffer) |
2870 | err = pmu->dmem.alloc(&pmu->dmem, | 2869 | pmu->sample_buffer = gk20a_balloc(&pmu->dmem, |
2871 | &pmu->sample_buffer, 2 * sizeof(u16), | 2870 | 2 * sizeof(u16)); |
2872 | PMU_DMEM_ALLOC_ALIGNMENT); | 2871 | if (!pmu->sample_buffer) { |
2873 | if (err) { | ||
2874 | gk20a_err(dev_from_gk20a(g), | 2872 | gk20a_err(dev_from_gk20a(g), |
2875 | "failed to allocate perfmon sample buffer"); | 2873 | "failed to allocate perfmon sample buffer"); |
2876 | return -ENOMEM; | 2874 | return -ENOMEM; |
2877 | } | 2875 | } |
2878 | 2876 | ||
@@ -2970,15 +2968,17 @@ static int pmu_process_init_msg(struct pmu_gk20a *pmu, | |||
2970 | for (i = 0; i < PMU_QUEUE_COUNT; i++) | 2968 | for (i = 0; i < PMU_QUEUE_COUNT; i++) |
2971 | pmu_queue_init(pmu, i, init); | 2969 | pmu_queue_init(pmu, i, init); |
2972 | 2970 | ||
2973 | if (!pmu->dmem.alloc) { | 2971 | if (!pmu->dmem.init) { |
2974 | /*Align start and end addresses*/ | 2972 | /* Align start and end addresses */ |
2975 | u32 start = ALIGN(pv->get_pmu_init_msg_pmu_sw_mg_off(init), | 2973 | u32 start = ALIGN(pv->get_pmu_init_msg_pmu_sw_mg_off(init), |
2976 | PMU_DMEM_ALLOC_ALIGNMENT); | 2974 | PMU_DMEM_ALLOC_ALIGNMENT); |
2977 | u32 end = (pv->get_pmu_init_msg_pmu_sw_mg_off(init) + | 2975 | u32 end = (pv->get_pmu_init_msg_pmu_sw_mg_off(init) + |
2978 | pv->get_pmu_init_msg_pmu_sw_mg_size(init)) & | 2976 | pv->get_pmu_init_msg_pmu_sw_mg_size(init)) & |
2979 | ~(PMU_DMEM_ALLOC_ALIGNMENT - 1); | 2977 | ~(PMU_DMEM_ALLOC_ALIGNMENT - 1); |
2980 | u32 size = end - start; | 2978 | u32 size = end - start; |
2981 | gk20a_allocator_init(&pmu->dmem, "gk20a_pmu_dmem", start, size); | 2979 | __gk20a_allocator_init(&pmu->dmem, NULL, "gk20a_pmu_dmem", |
2980 | start, size, | ||
2981 | PMU_DMEM_ALLOC_ALIGNMENT, 4, 0); | ||
2982 | } | 2982 | } |
2983 | 2983 | ||
2984 | pmu->pmu_ready = true; | 2984 | pmu->pmu_ready = true; |
@@ -3115,20 +3115,14 @@ static int pmu_response_handle(struct pmu_gk20a *pmu, | |||
3115 | seq->callback = NULL; | 3115 | seq->callback = NULL; |
3116 | if (pv->pmu_allocation_get_dmem_size(pmu, | 3116 | if (pv->pmu_allocation_get_dmem_size(pmu, |
3117 | pv->get_pmu_seq_in_a_ptr(seq)) != 0) | 3117 | pv->get_pmu_seq_in_a_ptr(seq)) != 0) |
3118 | pmu->dmem.free(&pmu->dmem, | 3118 | gk20a_bfree(&pmu->dmem, |
3119 | pv->pmu_allocation_get_dmem_offset(pmu, | 3119 | pv->pmu_allocation_get_dmem_offset(pmu, |
3120 | pv->get_pmu_seq_in_a_ptr(seq)), | 3120 | pv->get_pmu_seq_in_a_ptr(seq))); |
3121 | pv->pmu_allocation_get_dmem_size(pmu, | ||
3122 | pv->get_pmu_seq_in_a_ptr(seq)), | ||
3123 | PMU_DMEM_ALLOC_ALIGNMENT); | ||
3124 | if (pv->pmu_allocation_get_dmem_size(pmu, | 3121 | if (pv->pmu_allocation_get_dmem_size(pmu, |
3125 | pv->get_pmu_seq_out_a_ptr(seq)) != 0) | 3122 | pv->get_pmu_seq_out_a_ptr(seq)) != 0) |
3126 | pmu->dmem.free(&pmu->dmem, | 3123 | gk20a_bfree(&pmu->dmem, |
3127 | pv->pmu_allocation_get_dmem_offset(pmu, | 3124 | pv->pmu_allocation_get_dmem_offset(pmu, |
3128 | pv->get_pmu_seq_out_a_ptr(seq)), | 3125 | pv->get_pmu_seq_out_a_ptr(seq))); |
3129 | pv->pmu_allocation_get_dmem_size(pmu, | ||
3130 | pv->get_pmu_seq_out_a_ptr(seq)), | ||
3131 | PMU_DMEM_ALLOC_ALIGNMENT); | ||
3132 | 3126 | ||
3133 | if (seq->callback) | 3127 | if (seq->callback) |
3134 | seq->callback(g, msg, seq->cb_params, seq->desc, ret); | 3128 | seq->callback(g, msg, seq->cb_params, seq->desc, ret); |
@@ -3769,11 +3763,10 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd, | |||
3769 | pv->pmu_allocation_set_dmem_size(pmu, in, | 3763 | pv->pmu_allocation_set_dmem_size(pmu, in, |
3770 | (u16)max(payload->in.size, payload->out.size)); | 3764 | (u16)max(payload->in.size, payload->out.size)); |
3771 | 3765 | ||
3772 | err = pmu->dmem.alloc(&pmu->dmem, | 3766 | *(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)) = |
3773 | pv->pmu_allocation_get_dmem_offset_addr(pmu, in), | 3767 | gk20a_balloc(&pmu->dmem, |
3774 | pv->pmu_allocation_get_dmem_size(pmu, in), | 3768 | pv->pmu_allocation_get_dmem_size(pmu, in)); |
3775 | PMU_DMEM_ALLOC_ALIGNMENT); | 3769 | if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in))) |
3776 | if (err) | ||
3777 | goto clean_up; | 3770 | goto clean_up; |
3778 | 3771 | ||
3779 | pmu_copy_to_dmem(pmu, (pv->pmu_allocation_get_dmem_offset(pmu, | 3772 | pmu_copy_to_dmem(pmu, (pv->pmu_allocation_get_dmem_offset(pmu, |
@@ -3794,11 +3787,12 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd, | |||
3794 | (u16)payload->out.size); | 3787 | (u16)payload->out.size); |
3795 | 3788 | ||
3796 | if (payload->out.buf != payload->in.buf) { | 3789 | if (payload->out.buf != payload->in.buf) { |
3797 | err = pmu->dmem.alloc(&pmu->dmem, | 3790 | |
3798 | pv->pmu_allocation_get_dmem_offset_addr(pmu, out), | 3791 | *(pv->pmu_allocation_get_dmem_offset_addr(pmu, out)) = |
3799 | pv->pmu_allocation_get_dmem_size(pmu, out), | 3792 | gk20a_balloc(&pmu->dmem, |
3800 | PMU_DMEM_ALLOC_ALIGNMENT); | 3793 | pv->pmu_allocation_get_dmem_size(pmu, out)); |
3801 | if (err) | 3794 | if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu, |
3795 | out))) | ||
3802 | goto clean_up; | 3796 | goto clean_up; |
3803 | } else { | 3797 | } else { |
3804 | BUG_ON(in == NULL); | 3798 | BUG_ON(in == NULL); |
@@ -3826,15 +3820,11 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd, | |||
3826 | clean_up: | 3820 | clean_up: |
3827 | gk20a_dbg_fn("fail"); | 3821 | gk20a_dbg_fn("fail"); |
3828 | if (in) | 3822 | if (in) |
3829 | pmu->dmem.free(&pmu->dmem, | 3823 | gk20a_bfree(&pmu->dmem, |
3830 | pv->pmu_allocation_get_dmem_offset(pmu, in), | 3824 | pv->pmu_allocation_get_dmem_offset(pmu, in)); |
3831 | pv->pmu_allocation_get_dmem_size(pmu, in), | ||
3832 | PMU_DMEM_ALLOC_ALIGNMENT); | ||
3833 | if (out) | 3825 | if (out) |
3834 | pmu->dmem.free(&pmu->dmem, | 3826 | gk20a_bfree(&pmu->dmem, |
3835 | pv->pmu_allocation_get_dmem_offset(pmu, out), | 3827 | pv->pmu_allocation_get_dmem_offset(pmu, out)); |
3836 | pv->pmu_allocation_get_dmem_size(pmu, out), | ||
3837 | PMU_DMEM_ALLOC_ALIGNMENT); | ||
3838 | 3828 | ||
3839 | pmu_seq_release(pmu, seq); | 3829 | pmu_seq_release(pmu, seq); |
3840 | return err; | 3830 | return err; |
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h index 73530b22..f29c810e 100644 --- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * GK20A PMU (aka. gPMU outside gk20a context) | 4 | * GK20A PMU (aka. gPMU outside gk20a context) |
5 | * | 5 | * |
6 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. | 6 | * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. |
7 | * | 7 | * |
8 | * This program is free software; you can redistribute it and/or modify it | 8 | * This program is free software; you can redistribute it and/or modify it |
9 | * under the terms and conditions of the GNU General Public License, | 9 | * under the terms and conditions of the GNU General Public License, |
@@ -466,7 +466,7 @@ struct pmu_ucode_desc { | |||
466 | #define PMU_UNIT_ID_IS_VALID(id) \ | 466 | #define PMU_UNIT_ID_IS_VALID(id) \ |
467 | (((id) < PMU_UNIT_END) || ((id) >= PMU_UNIT_TEST_START)) | 467 | (((id) < PMU_UNIT_END) || ((id) >= PMU_UNIT_TEST_START)) |
468 | 468 | ||
469 | #define PMU_DMEM_ALLOC_ALIGNMENT (32) | 469 | #define PMU_DMEM_ALLOC_ALIGNMENT (4) |
470 | #define PMU_DMEM_ALIGNMENT (4) | 470 | #define PMU_DMEM_ALIGNMENT (4) |
471 | 471 | ||
472 | #define PMU_CMD_FLAGS_PMU_MASK (0xF0) | 472 | #define PMU_CMD_FLAGS_PMU_MASK (0xF0) |
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c index 04f61c58..053550f6 100644 --- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * GK20A Semaphores | 4 | * GK20A Semaphores |
5 | * | 5 | * |
6 | * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. | 6 | * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. |
7 | * | 7 | * |
8 | * This program is free software; you can redistribute it and/or modify it | 8 | * This program is free software; you can redistribute it and/or modify it |
9 | * under the terms and conditions of the GNU General Public License, | 9 | * under the terms and conditions of the GNU General Public License, |
@@ -44,8 +44,10 @@ struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct device *d, | |||
44 | if (gk20a_get_sgtable(d, &p->sgt, p->cpu_va, p->iova, p->size)) | 44 | if (gk20a_get_sgtable(d, &p->sgt, p->cpu_va, p->iova, p->size)) |
45 | goto clean_up; | 45 | goto clean_up; |
46 | 46 | ||
47 | if (gk20a_allocator_init(&p->alloc, unique_name, 0, | 47 | /* Sacrifice one semaphore in the name of returning error codes. */ |
48 | p->size)) | 48 | if (gk20a_allocator_init(&p->alloc, unique_name, |
49 | SEMAPHORE_SIZE, p->size - SEMAPHORE_SIZE, | ||
50 | SEMAPHORE_SIZE)) | ||
49 | goto clean_up; | 51 | goto clean_up; |
50 | 52 | ||
51 | gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->cpu_va, | 53 | gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->cpu_va, |
@@ -163,8 +165,8 @@ struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool) | |||
163 | if (!s) | 165 | if (!s) |
164 | return NULL; | 166 | return NULL; |
165 | 167 | ||
166 | if (pool->alloc.alloc(&pool->alloc, &s->offset, SEMAPHORE_SIZE, | 168 | s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE); |
167 | SEMAPHORE_SIZE)) { | 169 | if (!s->offset) { |
168 | gk20a_err(pool->dev, "failed to allocate semaphore"); | 170 | gk20a_err(pool->dev, "failed to allocate semaphore"); |
169 | kfree(s); | 171 | kfree(s); |
170 | return NULL; | 172 | return NULL; |
@@ -186,8 +188,7 @@ static void gk20a_semaphore_free(struct kref *ref) | |||
186 | struct gk20a_semaphore *s = | 188 | struct gk20a_semaphore *s = |
187 | container_of(ref, struct gk20a_semaphore, ref); | 189 | container_of(ref, struct gk20a_semaphore, ref); |
188 | 190 | ||
189 | s->pool->alloc.free(&s->pool->alloc, s->offset, SEMAPHORE_SIZE, | 191 | gk20a_bfree(&s->pool->alloc, s->offset); |
190 | SEMAPHORE_SIZE); | ||
191 | gk20a_semaphore_pool_put(s->pool); | 192 | gk20a_semaphore_pool_put(s->pool); |
192 | kfree(s); | 193 | kfree(s); |
193 | } | 194 | } |
diff --git a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c index 9d16dba7..bc904ef3 100644 --- a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c | |||
@@ -90,9 +90,8 @@ static int gm20b_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr) | |||
90 | if (err) | 90 | if (err) |
91 | return err; | 91 | return err; |
92 | 92 | ||
93 | gk20a_allocator_init(&gr->comp_tags, "comptag", | 93 | __gk20a_allocator_init(&gr->comp_tags, NULL, "comptag", |
94 | 1, /* start */ | 94 | 1, max_comptag_lines - 1, 1, 10, 0); |
95 | max_comptag_lines - 1); /* length*/ | ||
96 | 95 | ||
97 | gr->comptags_per_cacheline = comptags_per_cacheline; | 96 | gr->comptags_per_cacheline = comptags_per_cacheline; |
98 | gr->slices_per_ltc = slices_per_ltc; | 97 | gr->slices_per_ltc = slices_per_ltc; |
diff --git a/drivers/gpu/nvgpu/vgpu/ltc_vgpu.c b/drivers/gpu/nvgpu/vgpu/ltc_vgpu.c index 1beac216..211e34b5 100644 --- a/drivers/gpu/nvgpu/vgpu/ltc_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/ltc_vgpu.c | |||
@@ -41,9 +41,8 @@ static int vgpu_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr) | |||
41 | if (max_comptag_lines < 2) | 41 | if (max_comptag_lines < 2) |
42 | return -ENXIO; | 42 | return -ENXIO; |
43 | 43 | ||
44 | gk20a_allocator_init(&gr->comp_tags, "comptag", | 44 | __gk20a_allocator_init(&gr->comp_tags, NULL, "comptag", |
45 | 1, /* start */ | 45 | 1, max_comptag_lines - 1, 1, 10, 0); /* length*/ |
46 | max_comptag_lines - 1); /* length*/ | ||
47 | return 0; | 46 | return 0; |
48 | } | 47 | } |
49 | 48 | ||
diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c index 94e4602f..855aac0d 100644 --- a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c | |||
@@ -243,11 +243,9 @@ static int vgpu_vm_alloc_share(struct gk20a_as_share *as_share, | |||
243 | struct tegra_vgpu_as_share_params *p = &msg.params.as_share; | 243 | struct tegra_vgpu_as_share_params *p = &msg.params.as_share; |
244 | struct mm_gk20a *mm = &g->mm; | 244 | struct mm_gk20a *mm = &g->mm; |
245 | struct vm_gk20a *vm; | 245 | struct vm_gk20a *vm; |
246 | u32 num_small_pages, num_large_pages, low_hole_pages; | ||
247 | u64 small_vma_size, large_vma_size; | 246 | u64 small_vma_size, large_vma_size; |
248 | char name[32]; | 247 | char name[32]; |
249 | int err, i; | 248 | int err, i; |
250 | u32 start; | ||
251 | 249 | ||
252 | /* note: keep the page sizes sorted lowest to highest here */ | 250 | /* note: keep the page sizes sorted lowest to highest here */ |
253 | u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { | 251 | u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { |
@@ -294,33 +292,27 @@ static int vgpu_vm_alloc_share(struct gk20a_as_share *as_share, | |||
294 | small_vma_size = (u64)16 << 30; | 292 | small_vma_size = (u64)16 << 30; |
295 | large_vma_size = vm->va_limit - small_vma_size; | 293 | large_vma_size = vm->va_limit - small_vma_size; |
296 | 294 | ||
297 | num_small_pages = (u32)(small_vma_size >> | ||
298 | ilog2(vm->gmmu_page_sizes[gmmu_page_size_small])); | ||
299 | |||
300 | /* num_pages above is without regard to the low-side hole. */ | ||
301 | low_hole_pages = (vm->va_start >> | ||
302 | ilog2(vm->gmmu_page_sizes[gmmu_page_size_small])); | ||
303 | |||
304 | snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id, | 295 | snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id, |
305 | gmmu_page_sizes[gmmu_page_size_small]>>10); | 296 | gmmu_page_sizes[gmmu_page_size_small]>>10); |
306 | err = gk20a_allocator_init(&vm->vma[gmmu_page_size_small], | 297 | err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_small], |
307 | name, | 298 | vm, name, |
308 | low_hole_pages, /*start*/ | 299 | vm->va_start, |
309 | num_small_pages - low_hole_pages);/* length*/ | 300 | small_vma_size - vm->va_start, |
301 | SZ_4K, | ||
302 | GPU_BALLOC_MAX_ORDER, | ||
303 | GPU_BALLOC_GVA_SPACE); | ||
310 | if (err) | 304 | if (err) |
311 | goto clean_up_share; | 305 | goto clean_up_share; |
312 | 306 | ||
313 | start = (u32)(small_vma_size >> | ||
314 | ilog2(vm->gmmu_page_sizes[gmmu_page_size_big])); | ||
315 | num_large_pages = (u32)(large_vma_size >> | ||
316 | ilog2(vm->gmmu_page_sizes[gmmu_page_size_big])); | ||
317 | |||
318 | snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id, | 307 | snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id, |
319 | gmmu_page_sizes[gmmu_page_size_big]>>10); | 308 | gmmu_page_sizes[gmmu_page_size_big]>>10); |
320 | err = gk20a_allocator_init(&vm->vma[gmmu_page_size_big], | 309 | err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_big], |
321 | name, | 310 | vm, name, |
322 | start, /* start */ | 311 | small_vma_size, |
323 | num_large_pages); /* length */ | 312 | large_vma_size, |
313 | big_page_size, | ||
314 | GPU_BALLOC_MAX_ORDER, | ||
315 | GPU_BALLOC_GVA_SPACE); | ||
324 | if (err) | 316 | if (err) |
325 | goto clean_up_small_allocator; | 317 | goto clean_up_small_allocator; |
326 | 318 | ||