From a2e852364582e9c337f52bc53ccc33877c8f3b47 Mon Sep 17 00:00:00 2001 From: Alex Waterman Date: Wed, 18 Mar 2015 13:33:09 -0700 Subject: gpu: nvgpu: New allocator for VA space Implement a new buddy allocation scheme for the GPU's VA space. The bitmap allocator was using too much memory and is not a scaleable solution as the GPU's address space keeps getting bigger. The buddy allocation scheme is much more memory efficient when the majority of the address space is not allocated. The buddy allocator is not constrained by the notion of a split address space. The bitmap allocator could only manage either small pages or large pages but not both at the same time. Thus the bottom of the address space was for small pages, the top for large pages. Although, that split is not removed quite yet, the new allocator enables that to happen. The buddy allocator is also very scalable. It manages the relatively small comptag space to the enormous GPU VA space and everything in between. This is important since the GPU has lots of different sized spaces that need managing. Currently there are certain limitations. For one the allocator does not handle the fixed allocations from CUDA very well. It can do so but with certain caveats. The PTE page size is always set to small. This means the BA may place other small page allocations in the buddies around the fixed allocation. It does this to avoid having large and small page allocations in the same PDE. Change-Id: I501cd15af03611536490137331d43761c402c7f9 Signed-off-by: Alex Waterman Reviewed-on: http://git-master/r/740694 Reviewed-by: Automatic_Commit_Validation_User GVS: Gerrit_Virtual_Submit Reviewed-by: Terje Bergstrom Tested-by: Terje Bergstrom --- drivers/gpu/nvgpu/gk20a/as_gk20a.c | 15 +- drivers/gpu/nvgpu/gk20a/gk20a.c | 2 + drivers/gpu/nvgpu/gk20a/gk20a_allocator.c | 1167 +++++++++++++++++++++++++++-- drivers/gpu/nvgpu/gk20a/gk20a_allocator.h | 213 ++++-- drivers/gpu/nvgpu/gk20a/ltc_gk20a.c | 5 +- drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 194 ++--- drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 1 + drivers/gpu/nvgpu/gk20a/pmu_gk20a.c | 68 +- drivers/gpu/nvgpu/gk20a/pmu_gk20a.h | 4 +- drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c | 15 +- 10 files changed, 1368 insertions(+), 316 deletions(-) (limited to 'drivers/gpu/nvgpu/gk20a') diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.c b/drivers/gpu/nvgpu/gk20a/as_gk20a.c index 038fa4c8..d832b792 100644 --- a/drivers/gpu/nvgpu/gk20a/as_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.c @@ -199,21 +199,14 @@ static int gk20a_as_ioctl_get_va_regions( for (i = 0; i < write_entries; ++i) { struct nvgpu_as_va_region region; - u32 base, limit; memset(®ion, 0, sizeof(struct nvgpu_as_va_region)); - if (!vm->vma[i].constraint.enable) { - base = vm->vma[i].base; - limit = vm->vma[i].limit; - } else { - base = vm->vma[i].constraint.base; - limit = vm->vma[i].constraint.limit; - } - region.page_size = vm->gmmu_page_sizes[i]; - region.offset = (u64)base * region.page_size; - region.pages = limit - base; /* NOTE: limit is exclusive */ + region.offset = vm->vma[i].base; + /* No __aeabi_uldivmod() on some platforms... 
*/ + region.pages = (vm->vma[i].end - vm->vma[i].start) >> + ilog2(region.page_size); if (copy_to_user(user_region_ptr + i, ®ion, sizeof(region))) return -EFAULT; diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index f6c9f901..eb52f28d 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -59,6 +59,7 @@ #include "hw_fb_gk20a.h" #include "gk20a_scale.h" #include "dbg_gpu_gk20a.h" +#include "gk20a_allocator.h" #include "hal.h" #include "vgpu/vgpu.h" @@ -1510,6 +1511,7 @@ static int gk20a_probe(struct platform_device *dev) &gk20a->mm.disable_bigpage); gk20a_pmu_debugfs_init(dev); gk20a_cde_debugfs_init(dev); + gk20a_alloc_debugfs_init(dev); #endif gk20a_init_gr(gk20a); diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c index 0037257c..56fb22df 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c @@ -1,7 +1,7 @@ /* * gk20a allocator * - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -16,112 +16,1149 @@ * along with this program. If not, see . */ +#include +#include +#include +#include + +#include "platform_gk20a.h" #include "gk20a_allocator.h" -#include -/* init allocator struct */ -int gk20a_allocator_init(struct gk20a_allocator *allocator, - const char *name, u32 start, u32 len) +#include "mm_gk20a.h" + +static struct dentry *balloc_debugfs_root; + +static struct kmem_cache *buddy_cache; /* slab cache for meta data. */ + +static u32 balloc_tracing_on; + +#define balloc_trace_func() \ + do { \ + if (balloc_tracing_on) \ + trace_printk("%s\n", __func__); \ + } while (0) + +#define balloc_trace_func_done() \ + do { \ + if (balloc_tracing_on) \ + trace_printk("%s_done\n", __func__); \ + } while (0) + + +static void balloc_init_alloc_debug(struct gk20a_allocator *a); +static void balloc_print_stats(struct gk20a_allocator *a, struct seq_file *s, + int lock); +static struct gk20a_buddy *balloc_free_buddy(struct gk20a_allocator *a, + u64 addr); +static void balloc_coalesce(struct gk20a_allocator *a, struct gk20a_buddy *b); +static void __balloc_do_free_fixed(struct gk20a_allocator *a, + struct gk20a_fixed_alloc *falloc); + +/* + * This function is not present in older kernel's list.h code. + */ +#ifndef list_last_entry +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) +#endif + +/* + * GPU buddy allocator for various address spaces. + * + * Current limitations: + * o A fixed allocation could potentially be made that borders PDEs with + * different PTE sizes. This would require that fixed buffer to have + * different sized PTEs for different parts of the allocation. Probably + * best to just require PDE alignment for fixed address allocs. + * + * o It is currently possible to make an allocator that has a buddy alignment + * out of sync with the PDE block size alignment. A simple example is a + * 32GB address space starting at byte 1. Every buddy is shifted off by 1 + * which means each buddy corresponf to more than one actual GPU page. The + * best way to fix this is probably just require PDE blocksize alignment + * for the start of the address space. At the moment all allocators are + * easily PDE aligned so this hasn't been a problem. 
+ */ + +/* + * Pick a suitable maximum order for this allocator. + * + * Hueristic: Just guessing that the best max order is the largest single + * block that will fit in the address space. + */ +static void balloc_compute_max_order(struct gk20a_allocator *a) +{ + u64 true_max_order = ilog2(a->blks); + + if (a->max_order > true_max_order) + a->max_order = true_max_order; + if (a->max_order > GPU_BALLOC_MAX_ORDER) + a->max_order = GPU_BALLOC_MAX_ORDER; +} + +/* + * Since we can only allocate in chucks of a->blk_size we need to trim off + * any excess data that is not aligned to a->blk_size. + */ +static void balloc_allocator_align(struct gk20a_allocator *a) +{ + a->start = ALIGN(a->base, a->blk_size); + a->end = (a->base + a->length) & ~(a->blk_size - 1); + a->count = a->end - a->start; + a->blks = a->count >> a->blk_shift; +} + +/* + * Pass NULL for parent if you want a top level buddy. + */ +static struct gk20a_buddy *balloc_new_buddy(struct gk20a_allocator *a, + struct gk20a_buddy *parent, + u64 start, u64 order) +{ + struct gk20a_buddy *new_buddy; + + new_buddy = kmem_cache_alloc(buddy_cache, GFP_KERNEL); + if (!new_buddy) + return NULL; + + memset(new_buddy, 0, sizeof(struct gk20a_buddy)); + + new_buddy->parent = parent; + new_buddy->start = start; + new_buddy->order = order; + new_buddy->end = start + (1 << order) * a->blk_size; + + return new_buddy; +} + +static void __balloc_buddy_list_add(struct gk20a_allocator *a, + struct gk20a_buddy *b, + struct list_head *list) +{ + if (buddy_is_in_list(b)) { + balloc_dbg(a, "Oops: adding added buddy (%llu:0x%llx)\n", + b->order, b->start); + BUG(); + } + + /* + * Add big PTE blocks to the tail, small to the head for GVA spaces. + * This lets the code that checks if there are available blocks check + * without cycling through the entire list. + */ + if (a->flags & GPU_BALLOC_GVA_SPACE && + b->pte_size == BALLOC_PTE_SIZE_BIG) + list_add_tail(&b->buddy_entry, list); + else + list_add(&b->buddy_entry, list); + + buddy_set_in_list(b); +} + +static void __balloc_buddy_list_rem(struct gk20a_allocator *a, + struct gk20a_buddy *b) +{ + if (!buddy_is_in_list(b)) { + balloc_dbg(a, "Oops: removing removed buddy (%llu:0x%llx)\n", + b->order, b->start); + BUG(); + } + + list_del_init(&b->buddy_entry); + buddy_clr_in_list(b); +} + +/* + * Add a buddy to one of the buddy lists and deal with the necessary + * book keeping. Adds the buddy to the list specified by the buddy's order. + */ +static void balloc_blist_add(struct gk20a_allocator *a, struct gk20a_buddy *b) +{ + __balloc_buddy_list_add(a, b, balloc_get_order_list(a, b->order)); + a->buddy_list_len[b->order]++; +} + +static void balloc_blist_rem(struct gk20a_allocator *a, struct gk20a_buddy *b) +{ + __balloc_buddy_list_rem(a, b); + a->buddy_list_len[b->order]--; +} + +static u64 balloc_get_order(struct gk20a_allocator *a, u64 len) +{ + if (len == 0) + return 0; + + len--; + len >>= a->blk_shift; + + return fls(len); +} + +static u64 __balloc_max_order_in(struct gk20a_allocator *a, u64 start, u64 end) +{ + u64 size = (end - start) >> a->blk_shift; + + if (size > 0) + return min_t(u64, ilog2(size), a->max_order); + else + return GPU_BALLOC_MAX_ORDER; +} + +/* + * Initialize the buddy lists. + */ +static int balloc_init_lists(struct gk20a_allocator *a) +{ + int i; + u64 bstart, bend, order; + struct gk20a_buddy *buddy; + + bstart = a->start; + bend = a->end; + + /* First make sure the LLs are valid. 
*/ + for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) + INIT_LIST_HEAD(balloc_get_order_list(a, i)); + + while (bstart < bend) { + order = __balloc_max_order_in(a, bstart, bend); + + buddy = balloc_new_buddy(a, NULL, bstart, order); + if (!buddy) + goto cleanup; + + balloc_blist_add(a, buddy); + bstart += balloc_order_to_len(a, order); + } + + return 0; + +cleanup: + for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) { + if (!list_empty(balloc_get_order_list(a, i))) { + buddy = list_first_entry(balloc_get_order_list(a, i), + struct gk20a_buddy, buddy_entry); + balloc_blist_rem(a, buddy); + kmem_cache_free(buddy_cache, buddy); + } + } + + return -ENOMEM; +} + +/* + * Initialize a buddy allocator. Returns 0 on success. This allocator does + * not necessarily manage bytes. It manages distinct ranges of resources. This + * allows the allocator to work for things like comp_tags, semaphores, etc. + * + * @allocator: Ptr to an allocator struct to init. + * @vm: GPU VM to associate this allocator with. Can be NULL. Will be used to + * get PTE size for GVA spaces. + * @name: Name of the allocator. Doesn't have to be static storage. + * @base: The base address of the resource pool being managed. + * @size: Number of resources in the pool. + * @blk_size: Minimum number of resources to allocate at once. For things like + * semaphores this is 1. For GVA this might be as much as 64k. This + * corresponds to order 0. Must be power of 2. + * @max_order: Pick a maximum order. If you leave this as 0, the buddy allocator + * will try and pick a reasonable max order. + * @flags: Extra flags necessary. See GPU_BALLOC_*. + */ +int __gk20a_allocator_init(struct gk20a_allocator *a, + struct vm_gk20a *vm, const char *name, + u64 base, u64 size, u64 blk_size, u64 max_order, + u64 flags) { - memset(allocator, 0, sizeof(struct gk20a_allocator)); + int err; + + memset(a, 0, sizeof(struct gk20a_allocator)); + strncpy(a->name, name, 32); + + a->base = base; + a->length = size; + a->blk_size = blk_size; + a->blk_shift = __ffs(blk_size); + + /* blk_size must be greater than 0 and a power of 2. */ + if (blk_size == 0) + return -EINVAL; + if (blk_size & (blk_size - 1)) + return -EINVAL; + + if (max_order > GPU_BALLOC_MAX_ORDER) + return -EINVAL; + + /* If this is to manage a GVA space we need a VM. */ + if (flags & GPU_BALLOC_GVA_SPACE && !vm) + return -EINVAL; + + a->vm = vm; + if (flags & GPU_BALLOC_GVA_SPACE) + a->pte_blk_order = balloc_get_order(a, vm->big_page_size << 10); - strncpy(allocator->name, name, 32); + a->flags = flags; + a->max_order = max_order; - allocator->base = start; - allocator->limit = start + len - 1; + balloc_allocator_align(a); + balloc_compute_max_order(a); - allocator->bitmap = vzalloc(BITS_TO_LONGS(len) * sizeof(long)); - if (!allocator->bitmap) + /* Shared buddy kmem_cache for all allocators. 
*/ + if (!buddy_cache) + buddy_cache = KMEM_CACHE(gk20a_buddy, 0); + if (!buddy_cache) return -ENOMEM; - allocator_dbg(allocator, "%s : base %d, limit %d", - allocator->name, allocator->base); + a->alloced_buddies = RB_ROOT; + err = balloc_init_lists(a); + if (err) + return err; - init_rwsem(&allocator->rw_sema); + mutex_init(&a->lock); - allocator->alloc = gk20a_allocator_block_alloc; - allocator->free = gk20a_allocator_block_free; + a->init = 1; + + balloc_init_alloc_debug(a); + balloc_dbg(a, "New allocator: base 0x%llx\n", a->base); + balloc_dbg(a, " size 0x%llx\n", a->length); + balloc_dbg(a, " blk_size 0x%llx\n", a->blk_size); + balloc_dbg(a, " max_order %llu\n", a->max_order); + balloc_dbg(a, " flags 0x%llx\n", a->flags); return 0; } -/* destroy allocator, free all remaining blocks if any */ -void gk20a_allocator_destroy(struct gk20a_allocator *allocator) +int gk20a_allocator_init(struct gk20a_allocator *a, const char *name, + u64 base, u64 size, u64 blk_size) +{ + return __gk20a_allocator_init(a, NULL, name, + base, size, blk_size, 0, 0); +} + +/* + * Clean up and destroy the passed allocator. + */ +void gk20a_allocator_destroy(struct gk20a_allocator *a) { - down_write(&allocator->rw_sema); + struct rb_node *node; + struct gk20a_buddy *bud; + struct gk20a_fixed_alloc *falloc; + int i; + + balloc_lock(a); + + if (!IS_ERR_OR_NULL(a->debugfs_entry)) + debugfs_remove(a->debugfs_entry); + + /* + * Free the fixed allocs first. + */ + while ((node = rb_first(&a->fixed_allocs)) != NULL) { + falloc = container_of(node, + struct gk20a_fixed_alloc, alloced_entry); + + __balloc_do_free_fixed(a, falloc); + rb_erase(node, &a->fixed_allocs); + } + + /* + * And now free all outstanding allocations. + */ + while ((node = rb_first(&a->alloced_buddies)) != NULL) { + bud = container_of(node, struct gk20a_buddy, alloced_entry); + balloc_free_buddy(a, bud->start); + balloc_blist_add(a, bud); + balloc_coalesce(a, bud); + } - vfree(allocator->bitmap); + /* + * Now clean up the unallocated buddies. + */ + for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) { + BUG_ON(a->buddy_list_alloced[i] != 0); + + while (!list_empty(balloc_get_order_list(a, i))) { + bud = list_first_entry(balloc_get_order_list(a, i), + struct gk20a_buddy, buddy_entry); + balloc_blist_rem(a, bud); + kmem_cache_free(buddy_cache, bud); + } + + if (a->buddy_list_len[i] != 0) { + pr_info("Excess buddies!!! (%d: %llu)\n", + i, a->buddy_list_len[i]); + BUG(); + } + if (a->buddy_list_split[i] != 0) { + pr_info("Excess split nodes!!! (%d: %llu)\n", + i, a->buddy_list_split[i]); + BUG(); + } + if (a->buddy_list_alloced[i] != 0) { + pr_info("Excess alloced nodes!!! (%d: %llu)\n", + i, a->buddy_list_alloced[i]); + BUG(); + } + } - memset(allocator, 0, sizeof(struct gk20a_allocator)); + a->init = 0; + + balloc_unlock(a); + + /* + * We cant unlock an allocator after memsetting it. That wipes the + * state of the mutex. Hopefully no one uses the allocator after + * destroying it... + */ + memset(a, 0, sizeof(struct gk20a_allocator)); } /* - * *addr != ~0 for fixed address allocation. if *addr == 0, base addr is - * returned to caller in *addr. + * Combine the passed buddy if possible. The pointer in @b may not be valid + * after this as the buddy may be freed. * - * contiguous allocation, which allocates one block of - * contiguous address. -*/ -int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator, - u32 *addr, u32 len, u32 align) + * @a must be locked. 
+ */ +static void balloc_coalesce(struct gk20a_allocator *a, struct gk20a_buddy *b) { - unsigned long _addr; + struct gk20a_buddy *parent; - allocator_dbg(allocator, "[in] addr %d, len %d", *addr, len); + if (buddy_is_alloced(b) || buddy_is_split(b)) + return; - if ((*addr != 0 && *addr < allocator->base) || /* check addr range */ - *addr + len > allocator->limit || /* check addr range */ - *addr & (align - 1) || /* check addr alignment */ - len == 0) /* check len */ - return -EINVAL; + /* + * If both our buddy and I are both not allocated and not split then + * we can coalesce ourselves. + */ + if (!b->buddy) + return; + if (buddy_is_alloced(b->buddy) || buddy_is_split(b->buddy)) + return; + + parent = b->parent; + + balloc_blist_rem(a, b); + balloc_blist_rem(a, b->buddy); + + buddy_clr_split(parent); + a->buddy_list_split[parent->order]--; + balloc_blist_add(a, parent); + + /* + * Recursively coalesce as far as we can go. + */ + balloc_coalesce(a, parent); + + /* Clean up the remains. */ + kmem_cache_free(buddy_cache, b->buddy); + kmem_cache_free(buddy_cache, b); +} + +/* + * Split a buddy into two new buddies who are 1/2 the size of the parent buddy. + * + * @a must be locked. + */ +static int balloc_split_buddy(struct gk20a_allocator *a, struct gk20a_buddy *b, + int pte_size) +{ + struct gk20a_buddy *left, *right; + u64 half; - len = ALIGN(len, align); - if (!len) + left = balloc_new_buddy(a, b, b->start, b->order - 1); + if (!left) return -ENOMEM; - down_write(&allocator->rw_sema); + half = (b->end - b->start) / 2; - _addr = bitmap_find_next_zero_area(allocator->bitmap, - allocator->limit - allocator->base + 1, - *addr ? (*addr - allocator->base) : 0, - len, - align - 1); - if ((_addr > allocator->limit - allocator->base + 1) || - (*addr && *addr != (_addr + allocator->base))) { - up_write(&allocator->rw_sema); + right = balloc_new_buddy(a, b, b->start + half, b->order - 1); + if (!right) { + kmem_cache_free(buddy_cache, left); return -ENOMEM; } - bitmap_set(allocator->bitmap, _addr, len); - *addr = allocator->base + _addr; + buddy_set_split(b); + a->buddy_list_split[b->order]++; - up_write(&allocator->rw_sema); + b->left = left; + b->right = right; + left->buddy = right; + right->buddy = left; + left->parent = b; + right->parent = b; - allocator_dbg(allocator, "[out] addr %d, len %d", *addr, len); + /* PTE considerations. */ + if (a->flags & GPU_BALLOC_GVA_SPACE && + left->order <= a->pte_blk_order) { + left->pte_size = pte_size; + right->pte_size = pte_size; + } + + balloc_blist_rem(a, b); + balloc_blist_add(a, left); + balloc_blist_add(a, right); return 0; } -/* free all blocks between start and end */ -int gk20a_allocator_block_free(struct gk20a_allocator *allocator, - u32 addr, u32 len, u32 align) +/* + * Place the passed buddy into the RB tree for allocated buddies. Never fails + * unless the passed entry is a duplicate which is a bug. + * + * @a must be locked. 
+ */ +void balloc_alloc_buddy(struct gk20a_allocator *a, struct gk20a_buddy *b) { - allocator_dbg(allocator, "[in] addr %d, len %d", addr, len); + struct rb_node **new = &(a->alloced_buddies.rb_node); + struct rb_node *parent = NULL; - if (addr + len > allocator->limit || /* check addr range */ - addr < allocator->base || - addr & (align - 1)) /* check addr alignment */ - return -EINVAL; + while (*new) { + struct gk20a_buddy *bud = container_of(*new, struct gk20a_buddy, + alloced_entry); - len = ALIGN(len, align); - if (!len) - return -EINVAL; + parent = *new; + if (b->start < bud->start) + new = &((*new)->rb_left); + else if (b->start > bud->start) + new = &((*new)->rb_right); + else + BUG_ON("Duplicate entries in allocated list!\n"); + } + + rb_link_node(&b->alloced_entry, parent, new); + rb_insert_color(&b->alloced_entry, &a->alloced_buddies); + + buddy_set_alloced(b); + a->buddy_list_alloced[b->order]++; +} + +/* + * Remove the passed buddy from the allocated buddy RB tree. Returns the + * deallocated buddy for further processing. + * + * @a must be locked. + */ +static struct gk20a_buddy *balloc_free_buddy(struct gk20a_allocator *a, + u64 addr) +{ + struct rb_node *node = a->alloced_buddies.rb_node; + struct gk20a_buddy *bud; + + while (node) { + bud = container_of(node, struct gk20a_buddy, alloced_entry); + + if (addr < bud->start) + node = node->rb_left; + else if (addr > bud->start) + node = node->rb_right; + else + break; + } + + if (!node) + return NULL; + + rb_erase(node, &a->alloced_buddies); + buddy_clr_alloced(bud); + a->buddy_list_alloced[bud->order]--; + + return bud; +} + +/* + * Find a suitable buddy for the given order and PTE type (big or little). + */ +static struct gk20a_buddy *__balloc_find_buddy(struct gk20a_allocator *a, + u64 order, int pte_size) +{ + struct gk20a_buddy *bud; + + if (list_empty(balloc_get_order_list(a, order))) + return NULL; + + if (a->flags & GPU_BALLOC_GVA_SPACE && + pte_size == BALLOC_PTE_SIZE_BIG) + bud = list_last_entry(balloc_get_order_list(a, order), + struct gk20a_buddy, buddy_entry); + else + bud = list_first_entry(balloc_get_order_list(a, order), + struct gk20a_buddy, buddy_entry); + + if (bud->pte_size != BALLOC_PTE_SIZE_ANY && + bud->pte_size != pte_size) + return NULL; + + return bud; +} + +/* + * Allocate a suitably sized buddy. If no suitable buddy exists split higher + * order buddies until we have a suitable buddy to allocate. + * + * For PDE grouping add an extra check to see if a buddy is suitable: that the + * buddy exists in a PDE who's PTE size is reasonable + * + * @a must be locked. + */ +static u64 __balloc_do_alloc(struct gk20a_allocator *a, u64 order, int pte_size) +{ + u64 split_order; + struct gk20a_buddy *bud; + + split_order = order; + while (!(bud = __balloc_find_buddy(a, split_order, pte_size))) + split_order++; + + while (bud->order != order) { + if (balloc_split_buddy(a, bud, pte_size)) + return 0; /* No mem... */ + bud = bud->left; + } + + balloc_blist_rem(a, bud); + balloc_alloc_buddy(a, bud); - down_write(&allocator->rw_sema); - bitmap_clear(allocator->bitmap, addr - allocator->base, len); - up_write(&allocator->rw_sema); + return bud->start; +} + +/* + * Allocate memory from the passed allocator. 
+ */ +u64 gk20a_balloc(struct gk20a_allocator *a, u64 len) +{ + u64 order, addr; + int pte_size; + + balloc_trace_func(); + + balloc_lock(a); + + order = balloc_get_order(a, len); + + if (order > a->max_order) { + balloc_unlock(a); + balloc_dbg(a, "Alloc fail\n"); + balloc_trace_func_done(); + return 0; + } + + /* + * For now pass the base address of the allocator's region to + * __get_pte_size(). This ensures we get the right page size for + * the alloc but we don't have to know what the real address is + * going to be quite yet. + * + * TODO: once userspace supports a unified address space pass 0 for + * the base. This will make only 'len' affect the PTE size. + */ + if (a->flags & GPU_BALLOC_GVA_SPACE) + pte_size = __get_pte_size(a->vm, a->base, len); + else + pte_size = BALLOC_PTE_SIZE_ANY; + + addr = __balloc_do_alloc(a, order, pte_size); + + a->bytes_alloced += len; + a->bytes_alloced_real += balloc_order_to_len(a, order); + + balloc_unlock(a); + balloc_dbg(a, "Alloc 0x%-10llx %3lld:0x%-10llx pte_size=%s\n", + addr, order, len, + pte_size == gmmu_page_size_big ? "big" : + pte_size == gmmu_page_size_small ? "small" : + "NA/any"); + + balloc_trace_func_done(); + return addr; +} + +/* + * See if the passed range is actually available for allocation. If so, then + * return 1, otherwise return 0. + * + * TODO: Right now this uses the unoptimal approach of going through all + * outstanding allocations and checking their base/ends. This could be better. + */ +static int balloc_is_range_free(struct gk20a_allocator *a, u64 base, u64 end) +{ + struct rb_node *node; + struct gk20a_buddy *bud; + + node = rb_first(&a->alloced_buddies); + if (!node) + return 1; /* No allocs yet. */ + + bud = container_of(node, struct gk20a_buddy, alloced_entry); + + while (bud->start < end) { + if ((bud->start > base && bud->start < end) || + (bud->end > base && bud->end < end)) + return 0; + + node = rb_next(node); + if (!node) + break; + bud = container_of(node, struct gk20a_buddy, alloced_entry); + } + + return 1; +} + +static void balloc_alloc_fixed(struct gk20a_allocator *a, + struct gk20a_fixed_alloc *f) +{ + struct rb_node **new = &(a->fixed_allocs.rb_node); + struct rb_node *parent = NULL; + + while (*new) { + struct gk20a_fixed_alloc *falloc = + container_of(*new, struct gk20a_fixed_alloc, + alloced_entry); + + parent = *new; + if (f->start < falloc->start) + new = &((*new)->rb_left); + else if (f->start > falloc->start) + new = &((*new)->rb_right); + else + BUG_ON("Duplicate entries in allocated list!\n"); + } + + rb_link_node(&f->alloced_entry, parent, new); + rb_insert_color(&f->alloced_entry, &a->fixed_allocs); +} + +/* + * Remove the passed buddy from the allocated buddy RB tree. Returns the + * deallocated buddy for further processing. + * + * @a must be locked. + */ +static struct gk20a_fixed_alloc *balloc_free_fixed(struct gk20a_allocator *a, + u64 addr) +{ + struct rb_node *node = a->fixed_allocs.rb_node; + struct gk20a_fixed_alloc *falloc; + + while (node) { + falloc = container_of(node, + struct gk20a_fixed_alloc, alloced_entry); + + if (addr < falloc->start) + node = node->rb_left; + else if (addr > falloc->start) + node = node->rb_right; + else + break; + } + + if (!node) + return NULL; + + rb_erase(node, &a->fixed_allocs); + + return falloc; +} + +/* + * Find the parent range - doesn't necessarily need the parent to actually exist + * as a buddy. Finding an existing parent comes later... 
+ */ +static void __balloc_get_parent_range(struct gk20a_allocator *a, + u64 base, u64 order, + u64 *pbase, u64 *porder) +{ + u64 base_mask; + u64 shifted_base = balloc_base_shift(a, base); + + order++; + base_mask = ~((a->blk_size << order) - 1); + + shifted_base &= base_mask; + + *pbase = balloc_base_unshift(a, shifted_base); + *porder = order; +} + +/* + * Makes a buddy at the passed address. This will make all parent buddies + * necessary for this buddy to exist as well. + */ +static struct gk20a_buddy *__balloc_make_fixed_buddy(struct gk20a_allocator *a, + u64 base, u64 order) +{ + struct gk20a_buddy *bud = NULL; + struct list_head *order_list; + u64 cur_order = order, cur_base = base; + + /* + * Algo: + * 1. Keep jumping up a buddy order until we find the real buddy that + * this buddy exists in. + * 2. Then work our way down through the buddy tree until we hit a dead + * end. + * 3. Start splitting buddies until we split to the one we need to + * make. + */ + while (cur_order <= a->max_order) { + int found = 0; + + order_list = balloc_get_order_list(a, cur_order); + list_for_each_entry(bud, order_list, buddy_entry) { + if (bud->start == cur_base) { + found = 1; + break; + } + } + + if (found) + break; + + __balloc_get_parent_range(a, cur_base, cur_order, + &cur_base, &cur_order); + } + + if (cur_order > a->max_order) { + balloc_dbg(a, "No buddy for range ???\n"); + return NULL; + } + + /* Split this buddy as necessary until we get the target buddy. */ + while (bud->start != base || bud->order != order) { + if (balloc_split_buddy(a, bud, BALLOC_PTE_SIZE_ANY)) { + balloc_coalesce(a, bud); + return NULL; + } + + if (base < bud->right->start) + bud = bud->left; + else + bud = bud->right; + + } + + return bud; +} + +static u64 __balloc_do_alloc_fixed(struct gk20a_allocator *a, + struct gk20a_fixed_alloc *falloc, + u64 base, u64 len) +{ + u64 shifted_base, inc_base; + u64 align_order; + + shifted_base = balloc_base_shift(a, base); + if (shifted_base == 0) + align_order = __fls(len >> a->blk_shift); + else + align_order = min_t(u64, + __ffs(shifted_base >> a->blk_shift), + __fls(len >> a->blk_shift)); + + if (align_order > a->max_order) { + balloc_dbg(a, "Align order too big: %llu > %llu\n", + align_order, a->max_order); + return 0; + } + + /* + * Generate a list of buddies that satisfy this allocation. + */ + inc_base = shifted_base; + while (inc_base < (shifted_base + len)) { + u64 order_len = balloc_order_to_len(a, align_order); + u64 remaining; + struct gk20a_buddy *bud; + + bud = __balloc_make_fixed_buddy(a, + balloc_base_unshift(a, inc_base), + align_order); + if (!bud) { + balloc_dbg(a, "Fixed buddy failed: {0x%llx, %llu}!\n", + balloc_base_unshift(a, inc_base), + align_order); + goto err_and_cleanup; + } + + balloc_blist_rem(a, bud); + balloc_alloc_buddy(a, bud); + __balloc_buddy_list_add(a, bud, &falloc->buddies); + + /* Book keeping. */ + inc_base += order_len; + remaining = (shifted_base + len) - inc_base; + align_order = __ffs(inc_base >> a->blk_shift); + + /* If we don't have much left - trim down align_order. 
*/ + if (balloc_order_to_len(a, align_order) > remaining) + align_order = __balloc_max_order_in(a, inc_base, + inc_base + remaining); + } + + return base; - allocator_dbg(allocator, "[out] addr %d, len %d", addr, len); +err_and_cleanup: + while (!list_empty(&falloc->buddies)) { + struct gk20a_buddy *bud = list_first_entry(&falloc->buddies, + struct gk20a_buddy, + buddy_entry); + + __balloc_buddy_list_rem(a, bud); + balloc_free_buddy(a, bud->start); + kmem_cache_free(buddy_cache, bud); + } + + return 0; +} + +/* + * Allocate a fixed address allocation. The address of the allocation is @base + * and the length is @len. This is not a typical buddy allocator operation and + * as such has a high posibility of failure if the address space is heavily in + * use. + * + * Please do not use this function unless _absolutely_ necessary. + */ +u64 gk20a_balloc_fixed(struct gk20a_allocator *a, u64 base, u64 len) +{ + struct gk20a_fixed_alloc *falloc = NULL; + struct gk20a_buddy *bud; + u64 ret, real_bytes = 0; + + balloc_trace_func(); + + /* If base isn't aligned to an order 0 block, fail. */ + if (base & (a->blk_size - 1)) + goto fail; + + if (len == 0) + goto fail; + + falloc = kmalloc(sizeof(*falloc), GFP_KERNEL); + if (!falloc) + goto fail; + + INIT_LIST_HEAD(&falloc->buddies); + falloc->start = base; + falloc->end = base + len; + + balloc_lock(a); + if (!balloc_is_range_free(a, base, base + len)) { + balloc_dbg(a, "Range not free: 0x%llx -> 0x%llx\n", + base, base + len); + goto fail_unlock; + } + + ret = __balloc_do_alloc_fixed(a, falloc, base, len); + if (!ret) { + balloc_dbg(a, "Alloc-fixed failed ?? 0x%llx -> 0x%llx\n", + base, base + len); + goto fail_unlock; + } + + balloc_alloc_fixed(a, falloc); + + list_for_each_entry(bud, &falloc->buddies, buddy_entry) + real_bytes += (bud->end - bud->start); + + a->bytes_alloced += len; + a->bytes_alloced_real += real_bytes; + + balloc_unlock(a); + balloc_dbg(a, "Alloc (fixed) 0x%llx\n", base); + + balloc_trace_func_done(); + return base; + +fail_unlock: + balloc_unlock(a); +fail: + kfree(falloc); + balloc_trace_func_done(); + return 0; +} + +static void __balloc_do_free_fixed(struct gk20a_allocator *a, + struct gk20a_fixed_alloc *falloc) +{ + struct gk20a_buddy *bud; + + while (!list_empty(&falloc->buddies)) { + bud = list_first_entry(&falloc->buddies, + struct gk20a_buddy, + buddy_entry); + __balloc_buddy_list_rem(a, bud); + + balloc_free_buddy(a, bud->start); + balloc_blist_add(a, bud); + a->bytes_freed += balloc_order_to_len(a, bud->order); + + /* + * Attemp to defrag the allocation. + */ + balloc_coalesce(a, bud); + } + + kfree(falloc); +} + +/* + * Free the passed allocation. + */ +void gk20a_bfree(struct gk20a_allocator *a, u64 addr) +{ + struct gk20a_buddy *bud; + struct gk20a_fixed_alloc *falloc; + + balloc_trace_func(); + + if (!addr) { + balloc_trace_func_done(); + return; + } + + balloc_lock(a); + + /* + * First see if this is a fixed alloc. If not fall back to a regular + * buddy. + */ + falloc = balloc_free_fixed(a, addr); + if (falloc) { + __balloc_do_free_fixed(a, falloc); + goto done; + } + + bud = balloc_free_buddy(a, addr); + if (!bud) + goto done; + + balloc_blist_add(a, bud); + a->bytes_freed += balloc_order_to_len(a, bud->order); + + /* + * Attemp to defrag the allocation. + */ + balloc_coalesce(a, bud); + +done: + balloc_unlock(a); + balloc_dbg(a, "Free 0x%llx\n", addr); + balloc_trace_func_done(); + return; +} + +/* + * Print the buddy allocator top level stats. 
If you pass @s as NULL then the + * stats are printed to the kernel log. This lets this code be used for + * debugging purposes internal to the allocator. + */ +static void balloc_print_stats(struct gk20a_allocator *a, struct seq_file *s, + int lock) +{ +#define __balloc_pstat(s, fmt, arg...) \ + do { \ + if (s) \ + seq_printf(s, fmt, ##arg); \ + else \ + balloc_dbg(a, fmt, ##arg); \ + } while (0) + + int i; + struct rb_node *node; + struct gk20a_fixed_alloc *falloc; + + __balloc_pstat(s, "base = %llu, limit = %llu, blk_size = %llu\n", + a->base, a->length, a->blk_size); + __balloc_pstat(s, "Internal params:\n"); + __balloc_pstat(s, " start = %llu\n", a->start); + __balloc_pstat(s, " end = %llu\n", a->end); + __balloc_pstat(s, " count = %llu\n", a->count); + __balloc_pstat(s, " blks = %llu\n", a->blks); + __balloc_pstat(s, " max_order = %llu\n", a->max_order); + + __balloc_pstat(s, "Buddy blocks:\n"); + __balloc_pstat(s, " Order Free Alloced Split\n"); + __balloc_pstat(s, " ----- ---- ------- -----\n"); + + if (lock) + balloc_lock(a); + for (i = a->max_order; i >= 0; i--) { + if (a->buddy_list_len[i] == 0 && + a->buddy_list_alloced[i] == 0 && + a->buddy_list_split[i] == 0) + continue; + + __balloc_pstat(s, " %3d %-7llu %-9llu %llu\n", i, + a->buddy_list_len[i], + a->buddy_list_alloced[i], + a->buddy_list_split[i]); + } + + __balloc_pstat(s, "\n"); + + for (node = rb_first(&a->fixed_allocs), i = 1; + node != NULL; + node = rb_next(node)) { + falloc = container_of(node, + struct gk20a_fixed_alloc, alloced_entry); + + __balloc_pstat(s, "Fixed alloc (%d): [0x%llx -> 0x%llx]\n", + i, falloc->start, falloc->end); + } + + __balloc_pstat(s, "\n"); + __balloc_pstat(s, "Bytes allocated: %llu\n", a->bytes_alloced); + __balloc_pstat(s, "Bytes allocated (real): %llu\n", + a->bytes_alloced_real); + __balloc_pstat(s, "Bytes freed: %llu\n", a->bytes_freed); + + if (lock) + balloc_unlock(a); + +#undef __balloc_pstats +} + +static int __alloc_show(struct seq_file *s, void *unused) +{ + struct gk20a_allocator *a = s->private; + + balloc_print_stats(a, s, 1); return 0; } + +static int __alloc_open(struct inode *inode, struct file *file) +{ + return single_open(file, __alloc_show, inode->i_private); +} + +static const struct file_operations __alloc_fops = { + .open = __alloc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void balloc_init_alloc_debug(struct gk20a_allocator *a) +{ + if (!balloc_debugfs_root) + return; + + a->debugfs_entry = debugfs_create_file(a->name, S_IRUGO, + balloc_debugfs_root, + a, &__alloc_fops); +} + +void gk20a_alloc_debugfs_init(struct platform_device *pdev) +{ + struct gk20a_platform *platform = platform_get_drvdata(pdev); + struct dentry *gpu_root = platform->debugfs; + + balloc_debugfs_root = debugfs_create_dir("allocators", gpu_root); + if (IS_ERR_OR_NULL(balloc_debugfs_root)) + return; + + debugfs_create_u32("tracing", 0664, balloc_debugfs_root, + &balloc_tracing_on); +} diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h index 69a227bd..e86e053b 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -17,75 +17,190 @@ #ifndef GK20A_ALLOCATOR_H #define GK20A_ALLOCATOR_H +#include #include -#include -#include +#include +#include /* #define ALLOCATOR_DEBUG */ -/* main struct */ +/* + * Each buddy is an element in a binary tree. + */ +struct gk20a_buddy { + struct gk20a_buddy *parent; /* Parent node. */ + struct gk20a_buddy *buddy; /* This node's buddy. */ + struct gk20a_buddy *left; /* Lower address sub-node. */ + struct gk20a_buddy *right; /* Higher address sub-node. */ + + struct list_head buddy_entry; /* List entry for various lists. */ + struct rb_node alloced_entry; /* RB tree of allocations. */ + + u64 start; /* Start address of this buddy. */ + u64 end; /* End address of this buddy. */ + u64 order; /* Buddy order. */ + +#define BALLOC_BUDDY_ALLOCED 0x1 +#define BALLOC_BUDDY_SPLIT 0x2 +#define BALLOC_BUDDY_IN_LIST 0x4 + int flags; /* List of associated flags. */ + + /* + * Size of the PDE this buddy is using. This allows for grouping like + * sized allocations into the same PDE. + */ +#define BALLOC_PTE_SIZE_ANY 0x0 +#define BALLOC_PTE_SIZE_SMALL 0x1 +#define BALLOC_PTE_SIZE_BIG 0x2 + int pte_size; +}; + +#define __buddy_flag_ops(flag, flag_up) \ + static inline int buddy_is_ ## flag(struct gk20a_buddy *b) \ + { \ + return b->flags & BALLOC_BUDDY_ ## flag_up; \ + } \ + static inline void buddy_set_ ## flag(struct gk20a_buddy *b) \ + { \ + b->flags |= BALLOC_BUDDY_ ## flag_up; \ + } \ + static inline void buddy_clr_ ## flag(struct gk20a_buddy *b) \ + { \ + b->flags &= ~BALLOC_BUDDY_ ## flag_up; \ + } + +/* + * int buddy_is_alloced(struct gk20a_buddy *b); + * void buddy_set_alloced(struct gk20a_buddy *b); + * void buddy_clr_alloced(struct gk20a_buddy *b); + * + * int buddy_is_split(struct gk20a_buddy *b); + * void buddy_set_split(struct gk20a_buddy *b); + * void buddy_clr_split(struct gk20a_buddy *b); + * + * int buddy_is_in_list(struct gk20a_buddy *b); + * void buddy_set_in_list(struct gk20a_buddy *b); + * void buddy_clr_in_list(struct gk20a_buddy *b); + */ +__buddy_flag_ops(alloced, ALLOCED); +__buddy_flag_ops(split, SPLIT); +__buddy_flag_ops(in_list, IN_LIST); + +/* + * Keeps info for a fixed allocation. + */ +struct gk20a_fixed_alloc { + struct list_head buddies; /* List of buddies. */ + struct rb_node alloced_entry; /* RB tree of fixed allocations. */ + + u64 start; /* Start of fixed block. */ + u64 end; /* End address. */ +}; + +struct vm_gk20a; + +/* + * GPU buddy allocator for the various GPU address spaces. Each addressable unit + * doesn't have to correspond to a byte. In some cases each unit is a more + * complex object such as a comp_tag line or the like. + * + * The max order is computed based on the size of the minimum order and the size + * of the address space. + * + * order_size is the size of an order 0 buddy. + */ struct gk20a_allocator { - char name[32]; /* name for allocator */ - struct rb_root rb_root; /* rb tree root for blocks */ + struct vm_gk20a *vm; /* Parent VM - can be NULL. */ - u32 base; /* min value of this linear space */ - u32 limit; /* max value = limit - 1 */ + char name[32]; /* Name of allocator. */ - unsigned long *bitmap; /* bitmap */ + u64 base; /* Base address of the space. */ + u64 length; /* Length of the space. */ + u64 blk_size; /* Size of order 0 allocation. */ + u64 blk_shift; /* Shift to divide by blk_size. 
*/ - struct gk20a_alloc_block *block_first; /* first block in list */ - struct gk20a_alloc_block *block_recent; /* last visited block */ + int init; /* Non-zero if initialized. */ - u32 first_free_addr; /* first free addr, non-contigous - allocation preferred start, - in order to pick up small holes */ - u32 last_free_addr; /* last free addr, contiguous - allocation preferred start */ - u32 cached_hole_size; /* max free hole size up to - last_free_addr */ - u32 block_count; /* number of blocks */ + /* Internal stuff. */ + u64 start; /* Real start (aligned to blk_size). */ + u64 end; /* Real end, trimmed if needed. */ + u64 count; /* Count of objects in space. */ + u64 blks; /* Count of blks in the space. */ + u64 max_order; /* Specific maximum order. */ - struct rw_semaphore rw_sema; /* lock */ - struct kmem_cache *block_cache; /* slab cache */ + struct rb_root alloced_buddies; /* Outstanding allocations. */ + struct rb_root fixed_allocs; /* Outstanding fixed allocations. */ - /* if enabled, constrain to [base, limit) */ - struct { - bool enable; - u32 base; - u32 limit; - } constraint; + struct mutex lock; /* Protects buddy access. */ - int (*alloc)(struct gk20a_allocator *allocator, - u32 *addr, u32 len, u32 align); - int (*free)(struct gk20a_allocator *allocator, - u32 addr, u32 len, u32 align); +#define GPU_BALLOC_GVA_SPACE 0x1 + u64 flags; -}; + /* + * Impose an upper bound on the maximum order. + */ +#define GPU_BALLOC_MAX_ORDER 31 +#define GPU_BALLOC_ORDER_LIST_LEN (GPU_BALLOC_MAX_ORDER + 1) -int gk20a_allocator_init(struct gk20a_allocator *allocator, - const char *name, u32 base, u32 size); -void gk20a_allocator_destroy(struct gk20a_allocator *allocator); + struct list_head buddy_list[GPU_BALLOC_ORDER_LIST_LEN]; + u64 buddy_list_len[GPU_BALLOC_ORDER_LIST_LEN]; + u64 buddy_list_split[GPU_BALLOC_ORDER_LIST_LEN]; + u64 buddy_list_alloced[GPU_BALLOC_ORDER_LIST_LEN]; -int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator, - u32 *addr, u32 len, u32 align); + /* + * This is for when the allocator is managing a GVA space (the + * GPU_BALLOC_GVA_SPACE bit is set in @flags). This requires + * that we group like sized allocations into PDE blocks. + */ + u64 pte_blk_order; -int gk20a_allocator_block_free(struct gk20a_allocator *allocator, - u32 addr, u32 len, u32 align); + struct dentry *debugfs_entry; -#if defined(ALLOCATOR_DEBUG) + u64 bytes_alloced; + u64 bytes_alloced_real; + u64 bytes_freed; +}; -#define allocator_dbg(alloctor, format, arg...) \ -do { \ - if (1) \ - pr_debug("gk20a_allocator (%s) %s: " format "\n",\ - alloctor->name, __func__, ##arg);\ -} while (0) +#define balloc_lock(a) mutex_lock(&(a)->lock) +#define balloc_unlock(a) mutex_unlock(&(a)->lock) -#else /* ALLOCATOR_DEBUG */ +#define balloc_get_order_list(a, order) (&(a)->buddy_list[(order)]) +#define balloc_order_to_len(a, order) ((1 << order) * (a)->blk_size) +#define balloc_base_shift(a, base) ((base) - (a)->start) +#define balloc_base_unshift(a, base) ((base) + (a)->start) -#define allocator_dbg(format, arg...) +int gk20a_allocator_init(struct gk20a_allocator *allocator, + const char *name, u64 base, u64 size, u64 order0); +int __gk20a_allocator_init(struct gk20a_allocator *allocator, + struct vm_gk20a *vm, const char *name, + u64 base, u64 size, u64 order0, + u64 max_order, u64 flags); +void gk20a_allocator_destroy(struct gk20a_allocator *allocator); -#endif /* ALLOCATOR_DEBUG */ +/* + * Normal alloc/free operations for the buddy allocator. 
+ */ +u64 gk20a_balloc(struct gk20a_allocator *allocator, u64 len); +void gk20a_bfree(struct gk20a_allocator *allocator, u64 addr); + +/* + * Special interface to allocate a memory regions with a specific starting + * address. Yikes. + */ +u64 gk20a_balloc_fixed(struct gk20a_allocator *allocator, u64 base, u64 len); + +/* + * Debugfs init. + */ +void gk20a_alloc_debugfs_init(struct platform_device *pdev); + +#if defined(ALLOCATOR_DEBUG) +#define balloc_dbg(alloctor, format, arg...) \ + pr_info("%-25s %25s() " format, \ + alloctor->name, __func__, ##arg) +#else +#define balloc_dbg(allocator, format, arg...) +#endif #endif /* GK20A_ALLOCATOR_H */ diff --git a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c index c5d0f0c4..bcadde93 100644 --- a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c @@ -89,9 +89,8 @@ static int gk20a_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr) if (err) return err; - gk20a_allocator_init(&gr->comp_tags, "comptag", - 1, /* start */ - max_comptag_lines - 1); /* length*/ + __gk20a_allocator_init(&gr->comp_tags, NULL, "comptag", + 1, max_comptag_lines - 1, 1, 10, 0); gr->comptags_per_cacheline = comptags_per_cacheline; gr->slices_per_ltc = slices_per_fbp / g->ltc_count; diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 5d1ff563..c11414b5 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -132,10 +132,8 @@ static void gk20a_mm_delete_priv(void *_priv) if (priv->comptags.lines) { BUG_ON(!priv->comptag_allocator); - priv->comptag_allocator->free(priv->comptag_allocator, - priv->comptags.offset, - priv->comptags.allocated_lines, - 1); + gk20a_bfree(priv->comptag_allocator, + priv->comptags.real_offset); } /* Free buffer states */ @@ -224,10 +222,9 @@ static int gk20a_alloc_comptags(struct gk20a *g, u32 lines, bool user_mappable) { struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev); - u32 offset = 0; - int err; u32 ctaglines_to_allocate; - u32 ctagline_align; + u32 ctagline_align = 1; + u32 offset; const u32 aggregate_cacheline_sz = g->gr.cacheline_size * g->gr.slices_per_ltc * g->ltc_count; @@ -241,7 +238,6 @@ static int gk20a_alloc_comptags(struct gk20a *g, if (!user_mappable) { ctaglines_to_allocate = lines; - ctagline_align = 1; } else { /* Unfortunately, we cannot use allocation alignment * here, since compbits per cacheline is not always a @@ -273,71 +269,25 @@ static int gk20a_alloc_comptags(struct gk20a *g, if (ctaglines_to_allocate < lines) return -EINVAL; /* integer overflow */ + pr_info("user-mapped CTAGS: %u\n", ctaglines_to_allocate); } /* store the allocator so we can use it when we free the ctags */ priv->comptag_allocator = allocator; - err = allocator->alloc(allocator, &offset, - ctaglines_to_allocate, 1); - if (!err) { - const u32 alignment_lines = - DIV_ROUND_UP(offset, ctagline_align) * ctagline_align - - offset; - - /* prune the preceding ctaglines that were allocated - for alignment */ - if (alignment_lines) { - /* free alignment lines */ - int tmp= - allocator->free(allocator, offset, - alignment_lines, - 1); - WARN_ON(tmp); - - offset += alignment_lines; - ctaglines_to_allocate -= alignment_lines; - } + offset = gk20a_balloc(allocator, ctaglines_to_allocate); + if (!offset) + return -ENOMEM; - /* check if we can prune the trailing, too */ - if (user_mappable) - { - u32 needed_cachelines = - DIV_ROUND_UP(lines, g->gr.comptags_per_cacheline); - - u32 first_unneeded_cacheline = - 
DIV_ROUND_UP(round_up(needed_cachelines * - aggregate_cacheline_sz, - small_pgsz), - aggregate_cacheline_sz); - u32 needed_ctaglines = - first_unneeded_cacheline * - g->gr.comptags_per_cacheline; - - if (needed_ctaglines < ctaglines_to_allocate) { - /* free alignment lines */ - int tmp= - allocator->free( - allocator, - offset + needed_ctaglines, - (ctaglines_to_allocate - - needed_ctaglines), - 1); - WARN_ON(tmp); - - ctaglines_to_allocate = needed_ctaglines; - } - } - - priv->comptags.offset = offset; - priv->comptags.lines = lines; - priv->comptags.allocated_lines = ctaglines_to_allocate; - priv->comptags.user_mappable = user_mappable; - } - return err; -} + priv->comptags.lines = lines; + priv->comptags.real_offset = offset; + if (user_mappable) + offset = DIV_ROUND_UP(offset, ctagline_align) * ctagline_align; + priv->comptags.offset = offset; + return 0; +} static int gk20a_init_mm_reset_enable_hw(struct gk20a *g) { @@ -889,14 +839,12 @@ static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset) } u64 gk20a_vm_alloc_va(struct vm_gk20a *vm, - u64 size, - enum gmmu_pgsz_gk20a gmmu_pgsz_idx) + u64 size, + enum gmmu_pgsz_gk20a gmmu_pgsz_idx) { struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx]; - int err; u64 offset; - u32 start_page_nr = 0, num_pages; u64 gmmu_page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx]; if (gmmu_pgsz_idx >= gmmu_nr_page_sizes) { @@ -912,28 +860,19 @@ u64 gk20a_vm_alloc_va(struct vm_gk20a *vm, } - /* be certain we round up to gmmu_page_size if needed */ - /* TBD: DIV_ROUND_UP -> undefined reference to __aeabi_uldivmod */ + /* Be certain we round up to gmmu_page_size if needed */ size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1); - gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size, vm->gmmu_page_sizes[gmmu_pgsz_idx]>>10); - /* The vma allocator represents page accounting. 
*/ - num_pages = size >> ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]); - - err = vma->alloc(vma, &start_page_nr, num_pages, 1); - - if (err) { + offset = gk20a_balloc(vma, size); + if (!offset) { gk20a_err(dev_from_vm(vm), - "%s oom: sz=0x%llx", vma->name, size); + "%s oom: sz=0x%llx", vma->name, size); return 0; } - offset = (u64)start_page_nr << - ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]); gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset); - return offset; } @@ -942,25 +881,12 @@ int gk20a_vm_free_va(struct vm_gk20a *vm, enum gmmu_pgsz_gk20a pgsz_idx) { struct gk20a_allocator *vma = &vm->vma[pgsz_idx]; - u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; - u32 page_shift = ilog2(page_size); - u32 start_page_nr, num_pages; - int err; gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx", vma->name, offset, size); + gk20a_bfree(vma, offset); - start_page_nr = (u32)(offset >> page_shift); - num_pages = (u32)((size + page_size - 1) >> page_shift); - - err = vma->free(vma, start_page_nr, num_pages, 1); - if (err) { - gk20a_err(dev_from_vm(vm), - "not found: offset=0x%llx, sz=0x%llx", - offset, size); - } - - return err; + return 0; } static int insert_mapped_buffer(struct rb_root *root, @@ -1136,7 +1062,7 @@ static int validate_fixed_buffer(struct vm_gk20a *vm, if (map_offset & (vm->gmmu_page_sizes[bfr->pgsz_idx] - 1)) { gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx", - map_offset); + map_offset); return -EINVAL; } @@ -2433,7 +2359,6 @@ int gk20a_init_vm(struct mm_gk20a *mm, char *name) { int err, i; - u32 num_small_pages, num_large_pages, low_hole_pages; char alloc_name[32]; u64 small_vma_size, large_vma_size; u32 pde_lo, pde_hi; @@ -2494,34 +2419,31 @@ int gk20a_init_vm(struct mm_gk20a *mm, large_vma_size = vm->va_limit - small_vma_size; } - num_small_pages = (u32)(small_vma_size >> - ilog2(vm->gmmu_page_sizes[gmmu_page_size_small])); - - /* num_pages above is without regard to the low-side hole. */ - low_hole_pages = (vm->va_start >> - ilog2(vm->gmmu_page_sizes[gmmu_page_size_small])); - snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name, vm->gmmu_page_sizes[gmmu_page_size_small]>>10); - err = gk20a_allocator_init(&vm->vma[gmmu_page_size_small], - alloc_name, - low_hole_pages, /*start*/ - num_small_pages - low_hole_pages);/* length*/ + err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_small], + vm, alloc_name, + vm->va_start, + small_vma_size - vm->va_start, + SZ_4K, + GPU_BALLOC_MAX_ORDER, + GPU_BALLOC_GVA_SPACE); if (err) goto clean_up_ptes; if (big_pages) { - u32 start = (u32)(small_vma_size >> - ilog2(vm->gmmu_page_sizes[gmmu_page_size_big])); - num_large_pages = (u32)(large_vma_size >> - ilog2(vm->gmmu_page_sizes[gmmu_page_size_big])); - snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name, vm->gmmu_page_sizes[gmmu_page_size_big]>>10); - err = gk20a_allocator_init(&vm->vma[gmmu_page_size_big], - alloc_name, - start, /* start */ - num_large_pages); /* length */ + /* + * Big page VMA starts at the end of the small page VMA. 
+ */ + err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_big], + vm, alloc_name, + small_vma_size, + large_vma_size, + big_page_size, + GPU_BALLOC_MAX_ORDER, + GPU_BALLOC_GVA_SPACE); if (err) goto clean_up_small_allocator; } @@ -2602,9 +2524,9 @@ int gk20a_vm_release_share(struct gk20a_as_share *as_share) int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, struct nvgpu_as_alloc_space_args *args) -{ int err = -ENOMEM; +{ + int err = -ENOMEM; int pgsz_idx = gmmu_page_size_small; - u32 start_page_nr; struct gk20a_allocator *vma; struct vm_gk20a *vm = as_share->vm; struct gk20a *g = vm->mm->g; @@ -2635,21 +2557,19 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, goto clean_up; } - start_page_nr = 0; + vma = &vm->vma[pgsz_idx]; if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) - start_page_nr = (u32)(args->o_a.offset >> - ilog2(vm->gmmu_page_sizes[pgsz_idx])); + vaddr_start = gk20a_balloc_fixed(vma, args->o_a.offset, + (u64)args->pages * + (u64)args->page_size); + else + vaddr_start = gk20a_balloc(vma, args->pages * args->page_size); - vma = &vm->vma[pgsz_idx]; - err = vma->alloc(vma, &start_page_nr, args->pages, 1); - if (err) { + if (!vaddr_start) { kfree(va_node); goto clean_up; } - vaddr_start = (u64)start_page_nr << - ilog2(vm->gmmu_page_sizes[pgsz_idx]); - va_node->vaddr_start = vaddr_start; va_node->size = (u64)args->page_size * (u64)args->pages; va_node->pgsz_idx = pgsz_idx; @@ -2673,7 +2593,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, true); if (!map_offset) { mutex_unlock(&vm->update_gmmu_lock); - vma->free(vma, start_page_nr, args->pages, 1); + gk20a_bfree(vma, vaddr_start); kfree(va_node); goto clean_up; } @@ -2685,6 +2605,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, mutex_unlock(&vm->update_gmmu_lock); args->o_a.offset = vaddr_start; + err = 0; clean_up: return err; @@ -2695,7 +2616,6 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share, { int err = -ENOMEM; int pgsz_idx; - u32 start_page_nr; struct gk20a_allocator *vma; struct vm_gk20a *vm = as_share->vm; struct vm_reserved_va_node *va_node; @@ -2708,14 +2628,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share, pgsz_idx = __nv_gmmu_va_is_upper(vm, args->offset) ? 
gmmu_page_size_big : gmmu_page_size_small; - start_page_nr = (u32)(args->offset >> - ilog2(vm->gmmu_page_sizes[pgsz_idx])); - vma = &vm->vma[pgsz_idx]; - err = vma->free(vma, start_page_nr, args->pages, 1); - - if (err) - goto clean_up; + gk20a_bfree(vma, args->offset); mutex_lock(&vm->update_gmmu_lock); va_node = addr_to_reservation(vm, args->offset); @@ -2745,8 +2659,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share, kfree(va_node); } mutex_unlock(&vm->update_gmmu_lock); + err = 0; -clean_up: return err; } diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index 57f7a373..cf246744 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -131,6 +131,7 @@ enum gmmu_pgsz_gk20a { }; struct gk20a_comptags { + u32 real_offset; u32 offset; u32 lines; u32 allocated_lines; diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c index 275fbd4e..fc8d130c 100644 --- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c @@ -2438,7 +2438,6 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu) struct pmu_payload payload; u32 seq; u32 data; - int err = 0; gk20a_dbg_fn(""); @@ -2489,12 +2488,11 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu) gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data); if (!pmu->sample_buffer) - err = pmu->dmem.alloc(&pmu->dmem, - &pmu->sample_buffer, 2 * sizeof(u16), - PMU_DMEM_ALLOC_ALIGNMENT); - if (err) { + pmu->sample_buffer = gk20a_balloc(&pmu->dmem, + 2 * sizeof(u16)); + if (!pmu->sample_buffer) { gk20a_err(dev_from_gk20a(g), - "failed to allocate perfmon sample buffer"); + "failed to allocate perfmon sample buffer"); return -ENOMEM; } @@ -2592,15 +2590,17 @@ static int pmu_process_init_msg(struct pmu_gk20a *pmu, for (i = 0; i < PMU_QUEUE_COUNT; i++) pmu_queue_init(pmu, i, init); - if (!pmu->dmem.alloc) { - /*Align start and end addresses*/ + if (!pmu->dmem.init) { + /* Align start and end addresses */ u32 start = ALIGN(pv->get_pmu_init_msg_pmu_sw_mg_off(init), - PMU_DMEM_ALLOC_ALIGNMENT); + PMU_DMEM_ALLOC_ALIGNMENT); u32 end = (pv->get_pmu_init_msg_pmu_sw_mg_off(init) + - pv->get_pmu_init_msg_pmu_sw_mg_size(init)) & + pv->get_pmu_init_msg_pmu_sw_mg_size(init)) & ~(PMU_DMEM_ALLOC_ALIGNMENT - 1); u32 size = end - start; - gk20a_allocator_init(&pmu->dmem, "gk20a_pmu_dmem", start, size); + __gk20a_allocator_init(&pmu->dmem, NULL, "gk20a_pmu_dmem", + start, size, + PMU_DMEM_ALLOC_ALIGNMENT, 4, 0); } pmu->pmu_ready = true; @@ -2737,20 +2737,14 @@ static int pmu_response_handle(struct pmu_gk20a *pmu, seq->callback = NULL; if (pv->pmu_allocation_get_dmem_size(pmu, pv->get_pmu_seq_in_a_ptr(seq)) != 0) - pmu->dmem.free(&pmu->dmem, + gk20a_bfree(&pmu->dmem, pv->pmu_allocation_get_dmem_offset(pmu, - pv->get_pmu_seq_in_a_ptr(seq)), - pv->pmu_allocation_get_dmem_size(pmu, - pv->get_pmu_seq_in_a_ptr(seq)), - PMU_DMEM_ALLOC_ALIGNMENT); + pv->get_pmu_seq_in_a_ptr(seq))); if (pv->pmu_allocation_get_dmem_size(pmu, pv->get_pmu_seq_out_a_ptr(seq)) != 0) - pmu->dmem.free(&pmu->dmem, + gk20a_bfree(&pmu->dmem, pv->pmu_allocation_get_dmem_offset(pmu, - pv->get_pmu_seq_out_a_ptr(seq)), - pv->pmu_allocation_get_dmem_size(pmu, - pv->get_pmu_seq_out_a_ptr(seq)), - PMU_DMEM_ALLOC_ALIGNMENT); + pv->get_pmu_seq_out_a_ptr(seq))); if (seq->callback) seq->callback(g, msg, seq->cb_params, seq->desc, ret); @@ -3387,11 +3381,10 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd, pv->pmu_allocation_set_dmem_size(pmu, in, (u16)max(payload->in.size, 
payload->out.size)); - err = pmu->dmem.alloc(&pmu->dmem, - pv->pmu_allocation_get_dmem_offset_addr(pmu, in), - pv->pmu_allocation_get_dmem_size(pmu, in), - PMU_DMEM_ALLOC_ALIGNMENT); - if (err) + *(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)) = + gk20a_balloc(&pmu->dmem, + pv->pmu_allocation_get_dmem_size(pmu, in)); + if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in))) goto clean_up; pmu_copy_to_dmem(pmu, (pv->pmu_allocation_get_dmem_offset(pmu, @@ -3412,11 +3405,12 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd, (u16)payload->out.size); if (payload->out.buf != payload->in.buf) { - err = pmu->dmem.alloc(&pmu->dmem, - pv->pmu_allocation_get_dmem_offset_addr(pmu, out), - pv->pmu_allocation_get_dmem_size(pmu, out), - PMU_DMEM_ALLOC_ALIGNMENT); - if (err) + + *(pv->pmu_allocation_get_dmem_offset_addr(pmu, out)) = + gk20a_balloc(&pmu->dmem, + pv->pmu_allocation_get_dmem_size(pmu, out)); + if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu, + out))) goto clean_up; } else { BUG_ON(in == NULL); @@ -3444,15 +3438,11 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd, clean_up: gk20a_dbg_fn("fail"); if (in) - pmu->dmem.free(&pmu->dmem, - pv->pmu_allocation_get_dmem_offset(pmu, in), - pv->pmu_allocation_get_dmem_size(pmu, in), - PMU_DMEM_ALLOC_ALIGNMENT); + gk20a_bfree(&pmu->dmem, + pv->pmu_allocation_get_dmem_offset(pmu, in)); if (out) - pmu->dmem.free(&pmu->dmem, - pv->pmu_allocation_get_dmem_offset(pmu, out), - pv->pmu_allocation_get_dmem_size(pmu, out), - PMU_DMEM_ALLOC_ALIGNMENT); + gk20a_bfree(&pmu->dmem, + pv->pmu_allocation_get_dmem_offset(pmu, out)); pmu_seq_release(pmu, seq); return err; diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h index 6cd173e8..e54805a6 100644 --- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h @@ -3,7 +3,7 @@ * * GK20A PMU (aka. gPMU outside gk20a context) * - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -437,7 +437,7 @@ struct pmu_ucode_desc { #define PMU_UNIT_ID_IS_VALID(id) \ (((id) < PMU_UNIT_END) || ((id) >= PMU_UNIT_TEST_START)) -#define PMU_DMEM_ALLOC_ALIGNMENT (32) +#define PMU_DMEM_ALLOC_ALIGNMENT (4) #define PMU_DMEM_ALIGNMENT (4) #define PMU_CMD_FLAGS_PMU_MASK (0xF0) diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c index 04f61c58..053550f6 100644 --- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c @@ -3,7 +3,7 @@ * * GK20A Semaphores * - * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -44,8 +44,10 @@ struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct device *d, if (gk20a_get_sgtable(d, &p->sgt, p->cpu_va, p->iova, p->size)) goto clean_up; - if (gk20a_allocator_init(&p->alloc, unique_name, 0, - p->size)) + /* Sacrifice one semaphore in the name of returning error codes. 
*/ + if (gk20a_allocator_init(&p->alloc, unique_name, + SEMAPHORE_SIZE, p->size - SEMAPHORE_SIZE, + SEMAPHORE_SIZE)) goto clean_up; gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->cpu_va, @@ -163,8 +165,8 @@ struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool) if (!s) return NULL; - if (pool->alloc.alloc(&pool->alloc, &s->offset, SEMAPHORE_SIZE, - SEMAPHORE_SIZE)) { + s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE); + if (!s->offset) { gk20a_err(pool->dev, "failed to allocate semaphore"); kfree(s); return NULL; @@ -186,8 +188,7 @@ static void gk20a_semaphore_free(struct kref *ref) struct gk20a_semaphore *s = container_of(ref, struct gk20a_semaphore, ref); - s->pool->alloc.free(&s->pool->alloc, s->offset, SEMAPHORE_SIZE, - SEMAPHORE_SIZE); + gk20a_bfree(&s->pool->alloc, s->offset); gk20a_semaphore_pool_put(s->pool); kfree(s); } -- cgit v1.2.2
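
The order math is central to the new allocator: a request of len resources is rounded up to the smallest power-of-two multiple of blk_size. The userspace sketch below mirrors balloc_get_order() and balloc_order_to_len() from this patch; it is illustrative only, fls64_sketch() is a stand-in for the kernel's fls(), and the 4 KB block size is just an example.

#include <stdio.h>
#include <stdint.h>

static unsigned int fls64_sketch(uint64_t x)
{
        unsigned int bit = 0;

        while (x) {
                x >>= 1;
                bit++;
        }
        return bit;     /* 0 for x == 0, else index of highest set bit + 1 */
}

/* Round a length up to the smallest order that covers it. */
static uint64_t get_order(uint64_t len, uint64_t blk_shift)
{
        if (len == 0)
                return 0;

        len--;
        len >>= blk_shift;
        return fls64_sketch(len);
}

static uint64_t order_to_len(uint64_t order, uint64_t blk_size)
{
        return (1ULL << order) * blk_size;
}

int main(void)
{
        uint64_t blk_size = 4096;       /* SZ_4K order-0 block, as in the small-page VMA */
        uint64_t blk_shift = 12;
        uint64_t lens[] = { 1, 4096, 4097, 65536, 1 << 20 };
        unsigned int i;

        for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
                uint64_t order = get_order(lens[i], blk_shift);

                printf("len=%-8llu -> order %llu (%llu bytes allocated)\n",
                       (unsigned long long)lens[i],
                       (unsigned long long)order,
                       (unsigned long long)order_to_len(order, blk_size));
        }
        return 0;
}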
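
Before the buddy lists are built, the managed range is trimmed to whole order-0 blocks and the maximum order is capped at the largest single block that fits. The sketch below mirrors balloc_allocator_align() and balloc_compute_max_order(); the base, length, and ilog2_sketch() helper are made-up illustration values, not driver data.

#include <stdio.h>
#include <stdint.h>

static uint64_t ilog2_sketch(uint64_t x)
{
        uint64_t r = 0;

        while (x >>= 1)
                r++;
        return r;
}

int main(void)
{
        uint64_t base = 0x1234, length = 0x100000;      /* deliberately unaligned */
        uint64_t blk_size = 0x1000, blk_shift = 12;
        uint64_t max_order = 64;                        /* "pick one for me" */

        /* Trim the space to whole order-0 blocks. */
        uint64_t start = (base + blk_size - 1) & ~(blk_size - 1);
        uint64_t end = (base + length) & ~(blk_size - 1);
        uint64_t blks = (end - start) >> blk_shift;

        /* Cap the order at the largest single block that fits. */
        uint64_t true_max = ilog2_sketch(blks);

        if (max_order > true_max)
                max_order = true_max;
        if (max_order > 31)             /* GPU_BALLOC_MAX_ORDER in the patch */
                max_order = 31;

        printf("start=0x%llx end=0x%llx blks=%llu max_order=%llu\n",
               (unsigned long long)start, (unsigned long long)end,
               (unsigned long long)blks, (unsigned long long)max_order);
        return 0;
}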
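
The commit message describes the split/coalesce scheme only at a high level, so here is a minimal userspace model of the behaviour: larger free blocks are split on demand, and a freed block merges with its buddy whenever both halves are free. This is a toy with array-backed free lists and none of the driver's PTE-size grouping, fixed allocations, or locking; block indices stand in for GPU addresses.

#include <stdio.h>
#include <assert.h>

#define MAX_ORDER       5                       /* arena of 2^5 = 32 blocks */
#define NBLKS           (1u << MAX_ORDER)

static unsigned int free_blk[MAX_ORDER + 1][NBLKS];     /* free starts, per order */
static unsigned int nfree[MAX_ORDER + 1];

static void push_free(unsigned int order, unsigned int start)
{
        free_blk[order][nfree[order]++] = start;
}

/* Remove @start from the order's list if present; returns 1 on success. */
static int take_free(unsigned int order, unsigned int start)
{
        unsigned int i;

        for (i = 0; i < nfree[order]; i++) {
                if (free_blk[order][i] == start) {
                        free_blk[order][i] = free_blk[order][--nfree[order]];
                        return 1;
                }
        }
        return 0;
}

/* Returns a block start index, or -1 when no run of 2^order blocks fits. */
static int buddy_alloc(unsigned int order)
{
        unsigned int k, start;

        for (k = order; k <= MAX_ORDER; k++)
                if (nfree[k])
                        break;
        if (k > MAX_ORDER)
                return -1;

        start = free_blk[k][--nfree[k]];

        /* Split back down, returning the upper halves to the free lists. */
        while (k > order) {
                k--;
                push_free(k, start + (1u << k));
        }
        return (int)start;
}

static void buddy_free(unsigned int start, unsigned int order)
{
        /* Coalesce with the buddy as long as it is also free. */
        while (order < MAX_ORDER) {
                unsigned int buddy = start ^ (1u << order);

                if (!take_free(order, buddy))
                        break;
                if (buddy < start)
                        start = buddy;
                order++;
        }
        push_free(order, start);
}

int main(void)
{
        int a, b;

        push_free(MAX_ORDER, 0);                /* whole arena starts free */

        a = buddy_alloc(0);                     /* one block: splits 32 -> ... -> 1 */
        b = buddy_alloc(2);                     /* four blocks */
        printf("a=%d b=%d\n", a, b);

        buddy_free((unsigned int)a, 0);
        buddy_free((unsigned int)b, 2);

        assert(nfree[MAX_ORDER] == 1);          /* everything merged back */
        printf("coalesced back to one order-%d block\n", MAX_ORDER);
        return 0;
}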
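
Fixed allocations work by walking up from the requested base until an existing buddy is found and then splitting back down to the target. The upward step is the parent-range computation; the sketch below mirrors __balloc_get_parent_range() with addresses already shifted so the space starts at zero. The block size and starting buddy are arbitrary example values.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t blk_size = 0x1000;             /* order-0 block */
        uint64_t base = 0x7000, order = 0;      /* an order-0 buddy at 0x7000 */

        while (order < 4) {
                uint64_t porder = order + 1;
                /* Parent starts at the child's base masked to the parent's block size. */
                uint64_t pbase = base & ~((blk_size << porder) - 1);

                printf("child {0x%llx, o%llu} -> parent {0x%llx, o%llu}\n",
                       (unsigned long long)base, (unsigned long long)order,
                       (unsigned long long)pbase, (unsigned long long)porder);
                base = pbase;
                order = porder;
        }
        return 0;
}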
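
Finally, a hedged sketch of how a caller is expected to use the new interface, pieced together from the call sites this patch converts (mm_gk20a.c, pmu_gk20a.c, semaphore_gk20a.c). It only compiles in-tree against gk20a_allocator.h and mm_gk20a.h; the function name, allocator name, base, size, and lengths are hypothetical, and gk20a_balloc()/gk20a_balloc_fixed() signal failure by returning 0 rather than an errno.

/* Hypothetical caller; all sizes and names below are examples only. */
static int example_use_buddy_allocator(struct vm_gk20a *vm)
{
        struct gk20a_allocator va;
        u64 addr, fixed;
        int err;

        /* 4 KB order-0 blocks, GVA flavour so PTE sizes are tracked. */
        err = __gk20a_allocator_init(&va, vm, "example-va",
                                     SZ_4K, SZ_1G, SZ_4K,
                                     GPU_BALLOC_MAX_ORDER,
                                     GPU_BALLOC_GVA_SPACE);
        if (err)
                return err;

        /* Length-based alloc: returns 0 on failure, not an errno. */
        addr = gk20a_balloc(&va, 64 * SZ_4K);
        if (!addr)
                goto cleanup;

        /*
         * Fixed-address alloc; per the commit message this should be rare
         * and the base kept PDE aligned.
         */
        fixed = gk20a_balloc_fixed(&va, SZ_128M, SZ_2M);
        if (fixed)
                gk20a_bfree(&va, fixed);

        gk20a_bfree(&va, addr);

cleanup:
        gk20a_allocator_destroy(&va);
        return addr ? 0 : -ENOMEM;
}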