From 6df3992b60959d32c7113cb77e131a2547174f3a Mon Sep 17 00:00:00 2001 From: Alex Waterman Date: Tue, 20 Dec 2016 13:55:48 -0800 Subject: gpu: nvgpu: Move allocators to common/mm/ Move the GPU allocators to common/mm/ since the allocators are common code across all GPUs. Also rename the allocator code to move away from gk20a_ prefixed structs and functions. This caused one issue with the nvgpu_alloc() and nvgpu_free() functions. There was a function for allocating either with kmalloc() or vmalloc() depending on the size of the allocation. Those have now been renamed to nvgpu_kalloc() and nvgpu_kfree(). Bug 1799159 Change-Id: Iddda92c013612bcb209847084ec85b8953002fa5 Signed-off-by: Alex Waterman Reviewed-on: http://git-master/r/1274400 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/Makefile.nvgpu | 10 +- drivers/gpu/nvgpu/common/mm/bitmap_allocator.c | 443 +++++++ .../gpu/nvgpu/common/mm/bitmap_allocator_priv.h | 70 ++ drivers/gpu/nvgpu/common/mm/buddy_allocator.c | 1329 ++++++++++++++++++++ drivers/gpu/nvgpu/common/mm/buddy_allocator_priv.h | 192 +++ drivers/gpu/nvgpu/common/mm/lockless_allocator.c | 207 +++ .../gpu/nvgpu/common/mm/lockless_allocator_priv.h | 121 ++ drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c | 212 ++++ drivers/gpu/nvgpu/common/mm/page_allocator.c | 937 ++++++++++++++ drivers/gpu/nvgpu/gk20a/as_gk20a.c | 10 +- drivers/gpu/nvgpu/gk20a/bitmap_allocator_priv.h | 70 -- drivers/gpu/nvgpu/gk20a/buddy_allocator_priv.h | 192 --- drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 12 +- drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 2 +- drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c | 4 +- drivers/gpu/nvgpu/gk20a/debug_gk20a.c | 2 +- drivers/gpu/nvgpu/gk20a/fence_gk20a.c | 16 +- drivers/gpu/nvgpu/gk20a/fence_gk20a.h | 2 +- drivers/gpu/nvgpu/gk20a/gk20a.c | 3 +- drivers/gpu/nvgpu/gk20a/gk20a_allocator.c | 211 ---- drivers/gpu/nvgpu/gk20a/gk20a_allocator.h | 302 ----- drivers/gpu/nvgpu/gk20a/gk20a_allocator_bitmap.c | 442 ------- drivers/gpu/nvgpu/gk20a/gk20a_allocator_buddy.c | 1327 ------------------- drivers/gpu/nvgpu/gk20a/gk20a_allocator_lockless.c | 206 --- drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c | 936 -------------- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 6 +- drivers/gpu/nvgpu/gk20a/lockless_allocator_priv.h | 121 -- drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 114 +- drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 23 +- drivers/gpu/nvgpu/gk20a/page_allocator_priv.h | 164 --- drivers/gpu/nvgpu/gk20a/pmu_gk20a.c | 26 +- drivers/gpu/nvgpu/gk20a/pmu_gk20a.h | 2 +- drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h | 3 +- drivers/gpu/nvgpu/include/nvgpu/allocator.h | 302 +++++ drivers/gpu/nvgpu/include/nvgpu/page_allocator.h | 164 +++ drivers/gpu/nvgpu/vgpu/mm_vgpu.c | 20 +- 36 files changed, 4106 insertions(+), 4097 deletions(-) create mode 100644 drivers/gpu/nvgpu/common/mm/bitmap_allocator.c create mode 100644 drivers/gpu/nvgpu/common/mm/bitmap_allocator_priv.h create mode 100644 drivers/gpu/nvgpu/common/mm/buddy_allocator.c create mode 100644 drivers/gpu/nvgpu/common/mm/buddy_allocator_priv.h create mode 100644 drivers/gpu/nvgpu/common/mm/lockless_allocator.c create mode 100644 drivers/gpu/nvgpu/common/mm/lockless_allocator_priv.h create mode 100644 drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c create mode 100644 drivers/gpu/nvgpu/common/mm/page_allocator.c delete mode 100644 drivers/gpu/nvgpu/gk20a/bitmap_allocator_priv.h delete mode 100644 drivers/gpu/nvgpu/gk20a/buddy_allocator_priv.h delete mode 100644 drivers/gpu/nvgpu/gk20a/gk20a_allocator.c delete mode 100644 
drivers/gpu/nvgpu/gk20a/gk20a_allocator.h delete mode 100644 drivers/gpu/nvgpu/gk20a/gk20a_allocator_bitmap.c delete mode 100644 drivers/gpu/nvgpu/gk20a/gk20a_allocator_buddy.c delete mode 100644 drivers/gpu/nvgpu/gk20a/gk20a_allocator_lockless.c delete mode 100644 drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c delete mode 100644 drivers/gpu/nvgpu/gk20a/lockless_allocator_priv.h delete mode 100644 drivers/gpu/nvgpu/gk20a/page_allocator_priv.h create mode 100644 drivers/gpu/nvgpu/include/nvgpu/allocator.h create mode 100644 drivers/gpu/nvgpu/include/nvgpu/page_allocator.h diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu index 93629eff..afce062b 100644 --- a/drivers/gpu/nvgpu/Makefile.nvgpu +++ b/drivers/gpu/nvgpu/Makefile.nvgpu @@ -23,6 +23,11 @@ obj-$(CONFIG_GK20A) := nvgpu.o nvgpu-y := \ common/linux/timers.o \ + common/mm/nvgpu_allocator.o \ + common/mm/bitmap_allocator.o \ + common/mm/buddy_allocator.o \ + common/mm/page_allocator.o \ + common/mm/lockless_allocator.o \ nvgpu_common.o \ gk20a/gk20a.o \ gk20a/sched_gk20a.o \ @@ -51,11 +56,6 @@ nvgpu-y := \ gk20a/fb_gk20a.o \ gk20a/hal.o \ gk20a/hal_gk20a.o \ - gk20a/gk20a_allocator.o \ - gk20a/gk20a_allocator_bitmap.o \ - gk20a/gk20a_allocator_buddy.o \ - gk20a/gk20a_allocator_page.o \ - gk20a/gk20a_allocator_lockless.o \ gk20a/cde_gk20a.o \ gk20a/platform_gk20a_generic.o \ gk20a/tsg_gk20a.o \ diff --git a/drivers/gpu/nvgpu/common/mm/bitmap_allocator.c b/drivers/gpu/nvgpu/common/mm/bitmap_allocator.c new file mode 100644 index 00000000..6f267c85 --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/bitmap_allocator.c @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +#include + +#include "bitmap_allocator_priv.h" + +static struct kmem_cache *meta_data_cache; /* slab cache for meta data. */ +static DEFINE_MUTEX(meta_data_cache_lock); + +static u64 nvgpu_bitmap_alloc_length(struct nvgpu_allocator *a) +{ + struct nvgpu_bitmap_allocator *ba = a->priv; + + return ba->length; +} + +static u64 nvgpu_bitmap_alloc_base(struct nvgpu_allocator *a) +{ + struct nvgpu_bitmap_allocator *ba = a->priv; + + return ba->base; +} + +static int nvgpu_bitmap_alloc_inited(struct nvgpu_allocator *a) +{ + struct nvgpu_bitmap_allocator *ba = a->priv; + int inited = ba->inited; + + rmb(); + return inited; +} + +static u64 nvgpu_bitmap_alloc_end(struct nvgpu_allocator *a) +{ + struct nvgpu_bitmap_allocator *ba = a->priv; + + return ba->base + ba->length; +} + +static u64 nvgpu_bitmap_alloc_fixed(struct nvgpu_allocator *__a, + u64 base, u64 len) +{ + struct nvgpu_bitmap_allocator *a = bitmap_allocator(__a); + u64 blks, offs, ret; + + /* Compute the bit offset and make sure it's aligned to a block. 
*/ + offs = base >> a->blk_shift; + if (offs * a->blk_size != base) + return 0; + + offs -= a->bit_offs; + + blks = len >> a->blk_shift; + if (blks * a->blk_size != len) + blks++; + + alloc_lock(__a); + + /* Check if the space requested is already occupied. */ + ret = bitmap_find_next_zero_area(a->bitmap, a->num_bits, offs, blks, 0); + if (ret != offs) + goto fail; + + bitmap_set(a->bitmap, offs, blks); + + a->bytes_alloced += blks * a->blk_size; + a->nr_fixed_allocs++; + alloc_unlock(__a); + + alloc_dbg(__a, "Alloc-fixed 0x%-10llx 0x%-5llx [bits=0x%llx (%llu)]\n", + base, len, blks, blks); + return base; + +fail: + alloc_unlock(__a); + alloc_dbg(__a, "Alloc-fixed failed! (0x%llx)\n", base); + return 0; +} + +/* + * Two possibilities for this function: either we are freeing a fixed allocation + * or we are freeing a regular alloc but with GPU_ALLOC_NO_ALLOC_PAGE defined. + * + * Note: this function won't do much error checking. Thus you could really + * confuse the allocator if you misuse this function. + */ +static void nvgpu_bitmap_free_fixed(struct nvgpu_allocator *__a, + u64 base, u64 len) +{ + struct nvgpu_bitmap_allocator *a = bitmap_allocator(__a); + u64 blks, offs; + + offs = base >> a->blk_shift; + if (WARN_ON(offs * a->blk_size != base)) + return; + + offs -= a->bit_offs; + + blks = len >> a->blk_shift; + if (blks * a->blk_size != len) + blks++; + + alloc_lock(__a); + bitmap_clear(a->bitmap, offs, blks); + a->bytes_freed += blks * a->blk_size; + alloc_unlock(__a); + + alloc_dbg(__a, "Free-fixed 0x%-10llx 0x%-5llx [bits=0x%llx (%llu)]\n", + base, len, blks, blks); +} + +/* + * Add the passed alloc to the tree of stored allocations. + */ +static void insert_alloc_metadata(struct nvgpu_bitmap_allocator *a, + struct nvgpu_bitmap_alloc *alloc) +{ + struct rb_node **new = &a->allocs.rb_node; + struct rb_node *parent = NULL; + struct nvgpu_bitmap_alloc *tmp; + + while (*new) { + tmp = container_of(*new, struct nvgpu_bitmap_alloc, + alloc_entry); + + parent = *new; + if (alloc->base < tmp->base) + new = &((*new)->rb_left); + else if (alloc->base > tmp->base) + new = &((*new)->rb_right); + else { + WARN_ON("Duplicate entries in RB alloc tree!\n"); + return; + } + } + + rb_link_node(&alloc->alloc_entry, parent, new); + rb_insert_color(&alloc->alloc_entry, &a->allocs); +} + +/* + * Find and remove meta-data from the outstanding allocations. + */ +static struct nvgpu_bitmap_alloc *find_alloc_metadata( + struct nvgpu_bitmap_allocator *a, u64 addr) +{ + struct rb_node *node = a->allocs.rb_node; + struct nvgpu_bitmap_alloc *alloc; + + while (node) { + alloc = container_of(node, struct nvgpu_bitmap_alloc, + alloc_entry); + + if (addr < alloc->base) + node = node->rb_left; + else if (addr > alloc->base) + node = node->rb_right; + else + break; + } + + if (!node) + return NULL; + + rb_erase(node, &a->allocs); + + return alloc; +} + +/* + * Tree of alloc meta data stores the address of the alloc not the bit offset. + */ +static int __nvgpu_bitmap_store_alloc(struct nvgpu_bitmap_allocator *a, + u64 addr, u64 len) +{ + struct nvgpu_bitmap_alloc *alloc = + kmem_cache_alloc(meta_data_cache, GFP_KERNEL); + + if (!alloc) + return -ENOMEM; + + alloc->base = addr; + alloc->length = len; + + insert_alloc_metadata(a, alloc); + + return 0; +} + +/* + * @len is in bytes. This routine will figure out the right number of bits to + * actually allocate. The return is the address in bytes as well. 
+ */ +static u64 nvgpu_bitmap_alloc(struct nvgpu_allocator *__a, u64 len) +{ + u64 blks, addr; + unsigned long offs, adjusted_offs, limit; + struct nvgpu_bitmap_allocator *a = bitmap_allocator(__a); + + blks = len >> a->blk_shift; + + if (blks * a->blk_size != len) + blks++; + + alloc_lock(__a); + + /* + * First look from next_blk and onwards... + */ + offs = bitmap_find_next_zero_area(a->bitmap, a->num_bits, + a->next_blk, blks, 0); + if (offs >= a->num_bits) { + /* + * If that didn't work try the remaining area. Since there can + * be available space that spans across a->next_blk we need to + * search up to the first set bit after that. + */ + limit = find_next_bit(a->bitmap, a->num_bits, a->next_blk); + offs = bitmap_find_next_zero_area(a->bitmap, limit, + 0, blks, 0); + if (offs >= a->next_blk) + goto fail; + } + + bitmap_set(a->bitmap, offs, blks); + a->next_blk = offs + blks; + + adjusted_offs = offs + a->bit_offs; + addr = ((u64)adjusted_offs) * a->blk_size; + + /* + * Only do meta-data storage if we are allowed to allocate storage for + * that meta-data. The issue with using kmalloc() and friends is that + * in latency and success critical paths an alloc_page() call can either + * sleep for potentially a long time or, assuming GFP_ATOMIC, fail. + * Since we might not want either of these possibilities assume that the + * caller will keep what data it needs around to successfully free this + * allocation. + */ + if (!(a->flags & GPU_ALLOC_NO_ALLOC_PAGE) && + __nvgpu_bitmap_store_alloc(a, addr, blks * a->blk_size)) + goto fail_reset_bitmap; + + alloc_dbg(__a, "Alloc 0x%-10llx 0x%-5llx [bits=0x%llx (%llu)]\n", + addr, len, blks, blks); + + a->nr_allocs++; + a->bytes_alloced += (blks * a->blk_size); + alloc_unlock(__a); + + return addr; + +fail_reset_bitmap: + bitmap_clear(a->bitmap, offs, blks); +fail: + a->next_blk = 0; + alloc_unlock(__a); + alloc_dbg(__a, "Alloc failed!\n"); + return 0; +} + +static void nvgpu_bitmap_free(struct nvgpu_allocator *__a, u64 addr) +{ + struct nvgpu_bitmap_allocator *a = bitmap_allocator(__a); + struct nvgpu_bitmap_alloc *alloc = NULL; + u64 offs, adjusted_offs, blks; + + alloc_lock(__a); + + if (a->flags & GPU_ALLOC_NO_ALLOC_PAGE) { + WARN(1, "Using wrong free for NO_ALLOC_PAGE bitmap allocator"); + goto done; + } + + alloc = find_alloc_metadata(a, addr); + if (!alloc) + goto done; + + /* + * Address comes from adjusted offset (i.e the bit offset with + * a->bit_offs added. So start with that and then work out the real + * offs into the bitmap. + */ + adjusted_offs = addr >> a->blk_shift; + offs = adjusted_offs - a->bit_offs; + blks = alloc->length >> a->blk_shift; + + bitmap_clear(a->bitmap, offs, blks); + alloc_dbg(__a, "Free 0x%-10llx\n", addr); + + a->bytes_freed += alloc->length; + +done: + kfree(alloc); + alloc_unlock(__a); +} + +static void nvgpu_bitmap_alloc_destroy(struct nvgpu_allocator *__a) +{ + struct nvgpu_bitmap_allocator *a = bitmap_allocator(__a); + struct nvgpu_bitmap_alloc *alloc; + struct rb_node *node; + + /* + * Kill any outstanding allocations. 
+ */ + while ((node = rb_first(&a->allocs)) != NULL) { + alloc = container_of(node, struct nvgpu_bitmap_alloc, + alloc_entry); + + rb_erase(node, &a->allocs); + kfree(alloc); + } + + kfree(a->bitmap); + kfree(a); +} + +static void nvgpu_bitmap_print_stats(struct nvgpu_allocator *__a, + struct seq_file *s, int lock) +{ + struct nvgpu_bitmap_allocator *a = bitmap_allocator(__a); + + __alloc_pstat(s, __a, "Bitmap allocator params:\n"); + __alloc_pstat(s, __a, " start = 0x%llx\n", a->base); + __alloc_pstat(s, __a, " end = 0x%llx\n", a->base + a->length); + __alloc_pstat(s, __a, " blks = 0x%llx\n", a->num_bits); + + /* Actual stats. */ + __alloc_pstat(s, __a, "Stats:\n"); + __alloc_pstat(s, __a, " Number allocs = 0x%llx\n", a->nr_allocs); + __alloc_pstat(s, __a, " Number fixed = 0x%llx\n", a->nr_fixed_allocs); + __alloc_pstat(s, __a, " Bytes alloced = 0x%llx\n", a->bytes_alloced); + __alloc_pstat(s, __a, " Bytes freed = 0x%llx\n", a->bytes_freed); + __alloc_pstat(s, __a, " Outstanding = 0x%llx\n", + a->bytes_alloced - a->bytes_freed); +} + +static const struct nvgpu_allocator_ops bitmap_ops = { + .alloc = nvgpu_bitmap_alloc, + .free = nvgpu_bitmap_free, + + .alloc_fixed = nvgpu_bitmap_alloc_fixed, + .free_fixed = nvgpu_bitmap_free_fixed, + + .base = nvgpu_bitmap_alloc_base, + .length = nvgpu_bitmap_alloc_length, + .end = nvgpu_bitmap_alloc_end, + .inited = nvgpu_bitmap_alloc_inited, + + .fini = nvgpu_bitmap_alloc_destroy, + + .print_stats = nvgpu_bitmap_print_stats, +}; + + +int nvgpu_bitmap_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a, + const char *name, u64 base, u64 length, + u64 blk_size, u64 flags) +{ + int err; + struct nvgpu_bitmap_allocator *a; + + mutex_lock(&meta_data_cache_lock); + if (!meta_data_cache) + meta_data_cache = KMEM_CACHE(nvgpu_bitmap_alloc, 0); + mutex_unlock(&meta_data_cache_lock); + + if (!meta_data_cache) + return -ENOMEM; + + if (WARN_ON(blk_size & (blk_size - 1))) + return -EINVAL; + + /* + * blk_size must be a power-of-2; base length also need to be aligned + * to blk_size. + */ + if (blk_size & (blk_size - 1) || + base & (blk_size - 1) || length & (blk_size - 1)) + return -EINVAL; + + if (base == 0) { + base = blk_size; + length -= blk_size; + } + + a = kzalloc(sizeof(struct nvgpu_bitmap_allocator), GFP_KERNEL); + if (!a) + return -ENOMEM; + + err = __nvgpu_alloc_common_init(__a, name, a, false, &bitmap_ops); + if (err) + goto fail; + + a->base = base; + a->length = length; + a->blk_size = blk_size; + a->blk_shift = __ffs(a->blk_size); + a->num_bits = length >> a->blk_shift; + a->bit_offs = a->base >> a->blk_shift; + a->flags = flags; + + a->bitmap = kcalloc(BITS_TO_LONGS(a->num_bits), sizeof(*a->bitmap), + GFP_KERNEL); + if (!a->bitmap) + goto fail; + + wmb(); + a->inited = true; + + nvgpu_init_alloc_debug(g, __a); + alloc_dbg(__a, "New allocator: type bitmap\n"); + alloc_dbg(__a, " base 0x%llx\n", a->base); + alloc_dbg(__a, " bit_offs 0x%llx\n", a->bit_offs); + alloc_dbg(__a, " size 0x%llx\n", a->length); + alloc_dbg(__a, " blk_size 0x%llx\n", a->blk_size); + alloc_dbg(__a, " flags 0x%llx\n", a->flags); + + return 0; + +fail: + kfree(a); + return err; +} diff --git a/drivers/gpu/nvgpu/common/mm/bitmap_allocator_priv.h b/drivers/gpu/nvgpu/common/mm/bitmap_allocator_priv.h new file mode 100644 index 00000000..9802b9db --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/bitmap_allocator_priv.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef BITMAP_ALLOCATOR_PRIV_H +#define BITMAP_ALLOCATOR_PRIV_H + +#include + +struct nvgpu_allocator; + +struct nvgpu_bitmap_allocator { + struct nvgpu_allocator *owner; + + u64 base; /* Base address of the space. */ + u64 length; /* Length of the space. */ + u64 blk_size; /* Size that corresponds to 1 bit. */ + u64 blk_shift; /* Bit shift to divide by blk_size. */ + u64 num_bits; /* Number of allocatable bits. */ + u64 bit_offs; /* Offset of bitmap. */ + + /* + * Optimization for making repeated allocations faster. Keep track of + * the next bit after the most recent allocation. This is where the next + * search will start from. This should make allocation faster in cases + * where lots of allocations get made one after another. It shouldn't + * have a negative impact on the case where the allocator is fragmented. + */ + u64 next_blk; + + unsigned long *bitmap; /* The actual bitmap! */ + struct rb_root allocs; /* Tree of outstanding allocations. */ + + u64 flags; + + bool inited; + + /* Statistics */ + u64 nr_allocs; + u64 nr_fixed_allocs; + u64 bytes_alloced; + u64 bytes_freed; +}; + +struct nvgpu_bitmap_alloc { + u64 base; + u64 length; + struct rb_node alloc_entry; /* RB tree of allocations. */ +}; + +static inline struct nvgpu_bitmap_allocator *bitmap_allocator( + struct nvgpu_allocator *a) +{ + return (struct nvgpu_bitmap_allocator *)(a)->priv; +} + + +#endif diff --git a/drivers/gpu/nvgpu/common/mm/buddy_allocator.c b/drivers/gpu/nvgpu/common/mm/buddy_allocator.c new file mode 100644 index 00000000..39a53801 --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/buddy_allocator.c @@ -0,0 +1,1329 @@ +/* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include + +#include "gk20a/mm_gk20a.h" +#include "gk20a/platform_gk20a.h" + +#include "buddy_allocator_priv.h" + +static struct kmem_cache *buddy_cache; /* slab cache for meta data. */ + +/* Some other buddy allocator functions. */ +static struct nvgpu_buddy *balloc_free_buddy(struct nvgpu_buddy_allocator *a, + u64 addr); +static void balloc_coalesce(struct nvgpu_buddy_allocator *a, + struct nvgpu_buddy *b); +static void __balloc_do_free_fixed(struct nvgpu_buddy_allocator *a, + struct nvgpu_fixed_alloc *falloc); + +/* + * This function is not present in older kernel's list.h code. 
+ */ +#ifndef list_last_entry +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) +#endif + +/* + * GPU buddy allocator for various address spaces. + * + * Current limitations: + * o A fixed allocation could potentially be made that borders PDEs with + * different PTE sizes. This would require that fixed buffer to have + * different sized PTEs for different parts of the allocation. Probably + * best to just require PDE alignment for fixed address allocs. + * + * o It is currently possible to make an allocator that has a buddy alignment + * out of sync with the PDE block size alignment. A simple example is a + * 32GB address space starting at byte 1. Every buddy is shifted off by 1 + * which means each buddy corresponf to more than one actual GPU page. The + * best way to fix this is probably just require PDE blocksize alignment + * for the start of the address space. At the moment all allocators are + * easily PDE aligned so this hasn't been a problem. + */ + +/* + * Pick a suitable maximum order for this allocator. + * + * Hueristic: Just guessing that the best max order is the largest single + * block that will fit in the address space. + */ +static void balloc_compute_max_order(struct nvgpu_buddy_allocator *a) +{ + u64 true_max_order = ilog2(a->blks); + + if (a->max_order == 0) { + a->max_order = true_max_order; + return; + } + + if (a->max_order > true_max_order) + a->max_order = true_max_order; + if (a->max_order > GPU_BALLOC_MAX_ORDER) + a->max_order = GPU_BALLOC_MAX_ORDER; +} + +/* + * Since we can only allocate in chucks of a->blk_size we need to trim off + * any excess data that is not aligned to a->blk_size. + */ +static void balloc_allocator_align(struct nvgpu_buddy_allocator *a) +{ + a->start = ALIGN(a->base, a->blk_size); + WARN_ON(a->start != a->base); + a->end = (a->base + a->length) & ~(a->blk_size - 1); + a->count = a->end - a->start; + a->blks = a->count >> a->blk_shift; +} + +/* + * Pass NULL for parent if you want a top level buddy. + */ +static struct nvgpu_buddy *balloc_new_buddy(struct nvgpu_buddy_allocator *a, + struct nvgpu_buddy *parent, + u64 start, u64 order) +{ + struct nvgpu_buddy *new_buddy; + + new_buddy = kmem_cache_alloc(buddy_cache, GFP_KERNEL); + if (!new_buddy) + return NULL; + + memset(new_buddy, 0, sizeof(struct nvgpu_buddy)); + + new_buddy->parent = parent; + new_buddy->start = start; + new_buddy->order = order; + new_buddy->end = start + (1 << order) * a->blk_size; + new_buddy->pte_size = BALLOC_PTE_SIZE_ANY; + + return new_buddy; +} + +static void __balloc_buddy_list_add(struct nvgpu_buddy_allocator *a, + struct nvgpu_buddy *b, + struct list_head *list) +{ + if (buddy_is_in_list(b)) { + alloc_dbg(balloc_owner(a), + "Oops: adding added buddy (%llu:0x%llx)\n", + b->order, b->start); + BUG(); + } + + /* + * Add big PTE blocks to the tail, small to the head for GVA spaces. + * This lets the code that checks if there are available blocks check + * without cycling through the entire list. 
+ */ + if (a->flags & GPU_ALLOC_GVA_SPACE && + b->pte_size == gmmu_page_size_big) + list_add_tail(&b->buddy_entry, list); + else + list_add(&b->buddy_entry, list); + + buddy_set_in_list(b); +} + +static void __balloc_buddy_list_rem(struct nvgpu_buddy_allocator *a, + struct nvgpu_buddy *b) +{ + if (!buddy_is_in_list(b)) { + alloc_dbg(balloc_owner(a), + "Oops: removing removed buddy (%llu:0x%llx)\n", + b->order, b->start); + BUG(); + } + + list_del_init(&b->buddy_entry); + buddy_clr_in_list(b); +} + +/* + * Add a buddy to one of the buddy lists and deal with the necessary + * book keeping. Adds the buddy to the list specified by the buddy's order. + */ +static void balloc_blist_add(struct nvgpu_buddy_allocator *a, + struct nvgpu_buddy *b) +{ + __balloc_buddy_list_add(a, b, balloc_get_order_list(a, b->order)); + a->buddy_list_len[b->order]++; +} + +static void balloc_blist_rem(struct nvgpu_buddy_allocator *a, + struct nvgpu_buddy *b) +{ + __balloc_buddy_list_rem(a, b); + a->buddy_list_len[b->order]--; +} + +static u64 balloc_get_order(struct nvgpu_buddy_allocator *a, u64 len) +{ + if (len == 0) + return 0; + + len--; + len >>= a->blk_shift; + + return fls(len); +} + +static u64 __balloc_max_order_in(struct nvgpu_buddy_allocator *a, + u64 start, u64 end) +{ + u64 size = (end - start) >> a->blk_shift; + + if (size > 0) + return min_t(u64, ilog2(size), a->max_order); + else + return GPU_BALLOC_MAX_ORDER; +} + +/* + * Initialize the buddy lists. + */ +static int balloc_init_lists(struct nvgpu_buddy_allocator *a) +{ + int i; + u64 bstart, bend, order; + struct nvgpu_buddy *buddy; + + bstart = a->start; + bend = a->end; + + /* First make sure the LLs are valid. */ + for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) + INIT_LIST_HEAD(balloc_get_order_list(a, i)); + + while (bstart < bend) { + order = __balloc_max_order_in(a, bstart, bend); + + buddy = balloc_new_buddy(a, NULL, bstart, order); + if (!buddy) + goto cleanup; + + balloc_blist_add(a, buddy); + bstart += balloc_order_to_len(a, order); + } + + return 0; + +cleanup: + for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) { + if (!list_empty(balloc_get_order_list(a, i))) { + buddy = list_first_entry(balloc_get_order_list(a, i), + struct nvgpu_buddy, buddy_entry); + balloc_blist_rem(a, buddy); + kmem_cache_free(buddy_cache, buddy); + } + } + + return -ENOMEM; +} + +/* + * Clean up and destroy the passed allocator. + */ +static void nvgpu_buddy_allocator_destroy(struct nvgpu_allocator *__a) +{ + int i; + struct rb_node *node; + struct nvgpu_buddy *bud; + struct nvgpu_fixed_alloc *falloc; + struct nvgpu_buddy_allocator *a = __a->priv; + + alloc_lock(__a); + + nvgpu_fini_alloc_debug(__a); + + /* + * Free the fixed allocs first. + */ + while ((node = rb_first(&a->fixed_allocs)) != NULL) { + falloc = container_of(node, + struct nvgpu_fixed_alloc, alloced_entry); + + rb_erase(node, &a->fixed_allocs); + __balloc_do_free_fixed(a, falloc); + } + + /* + * And now free all outstanding allocations. + */ + while ((node = rb_first(&a->alloced_buddies)) != NULL) { + bud = container_of(node, struct nvgpu_buddy, alloced_entry); + balloc_free_buddy(a, bud->start); + balloc_blist_add(a, bud); + balloc_coalesce(a, bud); + } + + /* + * Now clean up the unallocated buddies. 
+ */ + for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) { + BUG_ON(a->buddy_list_alloced[i] != 0); + + while (!list_empty(balloc_get_order_list(a, i))) { + bud = list_first_entry(balloc_get_order_list(a, i), + struct nvgpu_buddy, buddy_entry); + balloc_blist_rem(a, bud); + kmem_cache_free(buddy_cache, bud); + } + + if (a->buddy_list_len[i] != 0) { + pr_info("Excess buddies!!! (%d: %llu)\n", + i, a->buddy_list_len[i]); + BUG(); + } + if (a->buddy_list_split[i] != 0) { + pr_info("Excess split nodes!!! (%d: %llu)\n", + i, a->buddy_list_split[i]); + BUG(); + } + if (a->buddy_list_alloced[i] != 0) { + pr_info("Excess alloced nodes!!! (%d: %llu)\n", + i, a->buddy_list_alloced[i]); + BUG(); + } + } + + kfree(a); + + alloc_unlock(__a); +} + +/* + * Combine the passed buddy if possible. The pointer in @b may not be valid + * after this as the buddy may be freed. + * + * @a must be locked. + */ +static void balloc_coalesce(struct nvgpu_buddy_allocator *a, + struct nvgpu_buddy *b) +{ + struct nvgpu_buddy *parent; + + if (buddy_is_alloced(b) || buddy_is_split(b)) + return; + + /* + * If both our buddy and I are both not allocated and not split then + * we can coalesce ourselves. + */ + if (!b->buddy) + return; + if (buddy_is_alloced(b->buddy) || buddy_is_split(b->buddy)) + return; + + parent = b->parent; + + balloc_blist_rem(a, b); + balloc_blist_rem(a, b->buddy); + + buddy_clr_split(parent); + a->buddy_list_split[parent->order]--; + balloc_blist_add(a, parent); + + /* + * Recursively coalesce as far as we can go. + */ + balloc_coalesce(a, parent); + + /* Clean up the remains. */ + kmem_cache_free(buddy_cache, b->buddy); + kmem_cache_free(buddy_cache, b); +} + +/* + * Split a buddy into two new buddies who are 1/2 the size of the parent buddy. + * + * @a must be locked. + */ +static int balloc_split_buddy(struct nvgpu_buddy_allocator *a, + struct nvgpu_buddy *b, int pte_size) +{ + struct nvgpu_buddy *left, *right; + u64 half; + + left = balloc_new_buddy(a, b, b->start, b->order - 1); + if (!left) + return -ENOMEM; + + half = (b->end - b->start) / 2; + + right = balloc_new_buddy(a, b, b->start + half, b->order - 1); + if (!right) { + kmem_cache_free(buddy_cache, left); + return -ENOMEM; + } + + buddy_set_split(b); + a->buddy_list_split[b->order]++; + + b->left = left; + b->right = right; + left->buddy = right; + right->buddy = left; + left->parent = b; + right->parent = b; + + /* PTE considerations. */ + if (a->flags & GPU_ALLOC_GVA_SPACE && + left->order <= a->pte_blk_order) { + left->pte_size = pte_size; + right->pte_size = pte_size; + } + + balloc_blist_rem(a, b); + balloc_blist_add(a, left); + balloc_blist_add(a, right); + + return 0; +} + +/* + * Place the passed buddy into the RB tree for allocated buddies. Never fails + * unless the passed entry is a duplicate which is a bug. + * + * @a must be locked. 
+ */ +static void balloc_alloc_buddy(struct nvgpu_buddy_allocator *a, + struct nvgpu_buddy *b) +{ + struct rb_node **new = &(a->alloced_buddies.rb_node); + struct rb_node *parent = NULL; + + while (*new) { + struct nvgpu_buddy *bud = container_of(*new, struct nvgpu_buddy, + alloced_entry); + + parent = *new; + if (b->start < bud->start) + new = &((*new)->rb_left); + else if (b->start > bud->start) + new = &((*new)->rb_right); + else + BUG_ON("Duplicate entries in allocated list!\n"); + } + + rb_link_node(&b->alloced_entry, parent, new); + rb_insert_color(&b->alloced_entry, &a->alloced_buddies); + + buddy_set_alloced(b); + a->buddy_list_alloced[b->order]++; +} + +/* + * Remove the passed buddy from the allocated buddy RB tree. Returns the + * deallocated buddy for further processing. + * + * @a must be locked. + */ +static struct nvgpu_buddy *balloc_free_buddy(struct nvgpu_buddy_allocator *a, + u64 addr) +{ + struct rb_node *node = a->alloced_buddies.rb_node; + struct nvgpu_buddy *bud; + + while (node) { + bud = container_of(node, struct nvgpu_buddy, alloced_entry); + + if (addr < bud->start) + node = node->rb_left; + else if (addr > bud->start) + node = node->rb_right; + else + break; + } + + if (!node) + return NULL; + + rb_erase(node, &a->alloced_buddies); + buddy_clr_alloced(bud); + a->buddy_list_alloced[bud->order]--; + + return bud; +} + +/* + * Find a suitable buddy for the given order and PTE type (big or little). + */ +static struct nvgpu_buddy *__balloc_find_buddy(struct nvgpu_buddy_allocator *a, + u64 order, int pte_size) +{ + struct nvgpu_buddy *bud; + + if (order > a->max_order || + list_empty(balloc_get_order_list(a, order))) + return NULL; + + if (a->flags & GPU_ALLOC_GVA_SPACE && + pte_size == gmmu_page_size_big) + bud = list_last_entry(balloc_get_order_list(a, order), + struct nvgpu_buddy, buddy_entry); + else + bud = list_first_entry(balloc_get_order_list(a, order), + struct nvgpu_buddy, buddy_entry); + + if (bud->pte_size != BALLOC_PTE_SIZE_ANY && + bud->pte_size != pte_size) + return NULL; + + return bud; +} + +/* + * Allocate a suitably sized buddy. If no suitable buddy exists split higher + * order buddies until we have a suitable buddy to allocate. + * + * For PDE grouping add an extra check to see if a buddy is suitable: that the + * buddy exists in a PDE who's PTE size is reasonable + * + * @a must be locked. + */ +static u64 __balloc_do_alloc(struct nvgpu_buddy_allocator *a, + u64 order, int pte_size) +{ + u64 split_order; + struct nvgpu_buddy *bud = NULL; + + split_order = order; + while (split_order <= a->max_order && + !(bud = __balloc_find_buddy(a, split_order, pte_size))) + split_order++; + + /* Out of memory! */ + if (!bud) + return 0; + + while (bud->order != order) { + if (balloc_split_buddy(a, bud, pte_size)) + return 0; /* No mem... */ + bud = bud->left; + } + + balloc_blist_rem(a, bud); + balloc_alloc_buddy(a, bud); + + return bud->start; +} + +/* + * See if the passed range is actually available for allocation. If so, then + * return 1, otherwise return 0. + * + * TODO: Right now this uses the unoptimal approach of going through all + * outstanding allocations and checking their base/ends. This could be better. + */ +static int balloc_is_range_free(struct nvgpu_buddy_allocator *a, + u64 base, u64 end) +{ + struct rb_node *node; + struct nvgpu_buddy *bud; + + node = rb_first(&a->alloced_buddies); + if (!node) + return 1; /* No allocs yet. 
*/ + + bud = container_of(node, struct nvgpu_buddy, alloced_entry); + + while (bud->start < end) { + if ((bud->start > base && bud->start < end) || + (bud->end > base && bud->end < end)) + return 0; + + node = rb_next(node); + if (!node) + break; + bud = container_of(node, struct nvgpu_buddy, alloced_entry); + } + + return 1; +} + +static void balloc_alloc_fixed(struct nvgpu_buddy_allocator *a, + struct nvgpu_fixed_alloc *f) +{ + struct rb_node **new = &(a->fixed_allocs.rb_node); + struct rb_node *parent = NULL; + + while (*new) { + struct nvgpu_fixed_alloc *falloc = + container_of(*new, struct nvgpu_fixed_alloc, + alloced_entry); + + BUG_ON(!virt_addr_valid(falloc)); + + parent = *new; + if (f->start < falloc->start) + new = &((*new)->rb_left); + else if (f->start > falloc->start) + new = &((*new)->rb_right); + else + BUG_ON("Duplicate entries in allocated list!\n"); + } + + rb_link_node(&f->alloced_entry, parent, new); + rb_insert_color(&f->alloced_entry, &a->fixed_allocs); +} + +/* + * Remove the passed buddy from the allocated buddy RB tree. Returns the + * deallocated buddy for further processing. + * + * @a must be locked. + */ +static struct nvgpu_fixed_alloc *balloc_free_fixed( + struct nvgpu_buddy_allocator *a, u64 addr) +{ + struct rb_node *node = a->fixed_allocs.rb_node; + struct nvgpu_fixed_alloc *falloc; + + while (node) { + falloc = container_of(node, + struct nvgpu_fixed_alloc, alloced_entry); + + if (addr < falloc->start) + node = node->rb_left; + else if (addr > falloc->start) + node = node->rb_right; + else + break; + } + + if (!node) + return NULL; + + rb_erase(node, &a->fixed_allocs); + + return falloc; +} + +/* + * Find the parent range - doesn't necessarily need the parent to actually exist + * as a buddy. Finding an existing parent comes later... + */ +static void __balloc_get_parent_range(struct nvgpu_buddy_allocator *a, + u64 base, u64 order, + u64 *pbase, u64 *porder) +{ + u64 base_mask; + u64 shifted_base = balloc_base_shift(a, base); + + order++; + base_mask = ~((a->blk_size << order) - 1); + + shifted_base &= base_mask; + + *pbase = balloc_base_unshift(a, shifted_base); + *porder = order; +} + +/* + * Makes a buddy at the passed address. This will make all parent buddies + * necessary for this buddy to exist as well. + */ +static struct nvgpu_buddy *__balloc_make_fixed_buddy( + struct nvgpu_buddy_allocator *a, u64 base, u64 order) +{ + struct nvgpu_buddy *bud = NULL; + struct list_head *order_list; + u64 cur_order = order, cur_base = base; + + /* + * Algo: + * 1. Keep jumping up a buddy order until we find the real buddy that + * this buddy exists in. + * 2. Then work our way down through the buddy tree until we hit a dead + * end. + * 3. Start splitting buddies until we split to the one we need to + * make. + */ + while (cur_order <= a->max_order) { + int found = 0; + + order_list = balloc_get_order_list(a, cur_order); + list_for_each_entry(bud, order_list, buddy_entry) { + if (bud->start == cur_base) { + found = 1; + break; + } + } + + if (found) + break; + + __balloc_get_parent_range(a, cur_base, cur_order, + &cur_base, &cur_order); + } + + if (cur_order > a->max_order) { + alloc_dbg(balloc_owner(a), "No buddy for range ???\n"); + return NULL; + } + + /* Split this buddy as necessary until we get the target buddy. 
*/ + while (bud->start != base || bud->order != order) { + if (balloc_split_buddy(a, bud, BALLOC_PTE_SIZE_ANY)) { + balloc_coalesce(a, bud); + return NULL; + } + + if (base < bud->right->start) + bud = bud->left; + else + bud = bud->right; + + } + + return bud; +} + +static u64 __balloc_do_alloc_fixed(struct nvgpu_buddy_allocator *a, + struct nvgpu_fixed_alloc *falloc, + u64 base, u64 len) +{ + u64 shifted_base, inc_base; + u64 align_order; + + shifted_base = balloc_base_shift(a, base); + if (shifted_base == 0) + align_order = __fls(len >> a->blk_shift); + else + align_order = min_t(u64, + __ffs(shifted_base >> a->blk_shift), + __fls(len >> a->blk_shift)); + + if (align_order > a->max_order) { + alloc_dbg(balloc_owner(a), + "Align order too big: %llu > %llu\n", + align_order, a->max_order); + return 0; + } + + /* + * Generate a list of buddies that satisfy this allocation. + */ + inc_base = shifted_base; + while (inc_base < (shifted_base + len)) { + u64 order_len = balloc_order_to_len(a, align_order); + u64 remaining; + struct nvgpu_buddy *bud; + + bud = __balloc_make_fixed_buddy(a, + balloc_base_unshift(a, inc_base), + align_order); + if (!bud) { + alloc_dbg(balloc_owner(a), + "Fixed buddy failed: {0x%llx, %llu}!\n", + balloc_base_unshift(a, inc_base), + align_order); + goto err_and_cleanup; + } + + balloc_blist_rem(a, bud); + balloc_alloc_buddy(a, bud); + __balloc_buddy_list_add(a, bud, &falloc->buddies); + + /* Book keeping. */ + inc_base += order_len; + remaining = (shifted_base + len) - inc_base; + align_order = __ffs(inc_base >> a->blk_shift); + + /* If we don't have much left - trim down align_order. */ + if (balloc_order_to_len(a, align_order) > remaining) + align_order = __balloc_max_order_in(a, inc_base, + inc_base + remaining); + } + + return base; + +err_and_cleanup: + while (!list_empty(&falloc->buddies)) { + struct nvgpu_buddy *bud = list_first_entry(&falloc->buddies, + struct nvgpu_buddy, + buddy_entry); + + __balloc_buddy_list_rem(a, bud); + balloc_free_buddy(a, bud->start); + kmem_cache_free(buddy_cache, bud); + } + + return 0; +} + +static void __balloc_do_free_fixed(struct nvgpu_buddy_allocator *a, + struct nvgpu_fixed_alloc *falloc) +{ + struct nvgpu_buddy *bud; + + while (!list_empty(&falloc->buddies)) { + bud = list_first_entry(&falloc->buddies, + struct nvgpu_buddy, + buddy_entry); + __balloc_buddy_list_rem(a, bud); + + balloc_free_buddy(a, bud->start); + balloc_blist_add(a, bud); + a->bytes_freed += balloc_order_to_len(a, bud->order); + + /* + * Attemp to defrag the allocation. + */ + balloc_coalesce(a, bud); + } + + kfree(falloc); +} + +/* + * Allocate memory from the passed allocator. + */ +static u64 nvgpu_buddy_balloc(struct nvgpu_allocator *__a, u64 len) +{ + u64 order, addr; + int pte_size; + struct nvgpu_buddy_allocator *a = __a->priv; + + nvgpu_alloc_trace_func(); + + alloc_lock(__a); + + order = balloc_get_order(a, len); + + if (order > a->max_order) { + alloc_unlock(__a); + alloc_dbg(balloc_owner(a), "Alloc fail\n"); + nvgpu_alloc_trace_func_done(); + return 0; + } + + /* + * For now pass the base address of the allocator's region to + * __get_pte_size(). This ensures we get the right page size for + * the alloc but we don't have to know what the real address is + * going to be quite yet. + * + * TODO: once userspace supports a unified address space pass 0 for + * the base. This will make only 'len' affect the PTE size. 
+ */ + if (a->flags & GPU_ALLOC_GVA_SPACE) + pte_size = __get_pte_size(a->vm, a->base, len); + else + pte_size = BALLOC_PTE_SIZE_ANY; + + addr = __balloc_do_alloc(a, order, pte_size); + + if (addr) { + a->bytes_alloced += len; + a->bytes_alloced_real += balloc_order_to_len(a, order); + alloc_dbg(balloc_owner(a), + "Alloc 0x%-10llx %3lld:0x%-10llx pte_size=%s\n", + addr, order, len, + pte_size == gmmu_page_size_big ? "big" : + pte_size == gmmu_page_size_small ? "small" : + "NA/any"); + } else { + alloc_dbg(balloc_owner(a), "Alloc failed: no mem!\n"); + } + + a->alloc_made = 1; + + alloc_unlock(__a); + + nvgpu_alloc_trace_func_done(); + return addr; +} + +/* + * Requires @__a to be locked. + */ +static u64 __nvgpu_balloc_fixed_buddy(struct nvgpu_allocator *__a, + u64 base, u64 len) +{ + u64 ret, real_bytes = 0; + struct nvgpu_buddy *bud; + struct nvgpu_fixed_alloc *falloc = NULL; + struct nvgpu_buddy_allocator *a = __a->priv; + + nvgpu_alloc_trace_func(); + + /* If base isn't aligned to an order 0 block, fail. */ + if (base & (a->blk_size - 1)) + goto fail; + + if (len == 0) + goto fail; + + falloc = kmalloc(sizeof(*falloc), GFP_KERNEL); + if (!falloc) + goto fail; + + INIT_LIST_HEAD(&falloc->buddies); + falloc->start = base; + falloc->end = base + len; + + if (!balloc_is_range_free(a, base, base + len)) { + alloc_dbg(balloc_owner(a), + "Range not free: 0x%llx -> 0x%llx\n", + base, base + len); + goto fail_unlock; + } + + ret = __balloc_do_alloc_fixed(a, falloc, base, len); + if (!ret) { + alloc_dbg(balloc_owner(a), + "Alloc-fixed failed ?? 0x%llx -> 0x%llx\n", + base, base + len); + goto fail_unlock; + } + + balloc_alloc_fixed(a, falloc); + + list_for_each_entry(bud, &falloc->buddies, buddy_entry) + real_bytes += (bud->end - bud->start); + + a->bytes_alloced += len; + a->bytes_alloced_real += real_bytes; + + alloc_dbg(balloc_owner(a), "Alloc (fixed) 0x%llx\n", base); + + nvgpu_alloc_trace_func_done(); + return base; + +fail_unlock: + alloc_unlock(__a); +fail: + kfree(falloc); + nvgpu_alloc_trace_func_done(); + return 0; +} + +/* + * Allocate a fixed address allocation. The address of the allocation is @base + * and the length is @len. This is not a typical buddy allocator operation and + * as such has a high posibility of failure if the address space is heavily in + * use. + * + * Please do not use this function unless _absolutely_ necessary. + */ +static u64 nvgpu_balloc_fixed_buddy(struct nvgpu_allocator *__a, + u64 base, u64 len) +{ + u64 alloc; + struct nvgpu_buddy_allocator *a = __a->priv; + + alloc_lock(__a); + alloc = __nvgpu_balloc_fixed_buddy(__a, base, len); + a->alloc_made = 1; + alloc_unlock(__a); + + return alloc; +} + +/* + * Free the passed allocation. + */ +static void nvgpu_buddy_bfree(struct nvgpu_allocator *__a, u64 addr) +{ + struct nvgpu_buddy *bud; + struct nvgpu_fixed_alloc *falloc; + struct nvgpu_buddy_allocator *a = __a->priv; + + nvgpu_alloc_trace_func(); + + if (!addr) { + nvgpu_alloc_trace_func_done(); + return; + } + + alloc_lock(__a); + + /* + * First see if this is a fixed alloc. If not fall back to a regular + * buddy. + */ + falloc = balloc_free_fixed(a, addr); + if (falloc) { + __balloc_do_free_fixed(a, falloc); + goto done; + } + + bud = balloc_free_buddy(a, addr); + if (!bud) + goto done; + + balloc_blist_add(a, bud); + a->bytes_freed += balloc_order_to_len(a, bud->order); + + /* + * Attemp to defrag the allocation. 
+ */ + balloc_coalesce(a, bud); + +done: + alloc_unlock(__a); + alloc_dbg(balloc_owner(a), "Free 0x%llx\n", addr); + nvgpu_alloc_trace_func_done(); + return; +} + +static bool nvgpu_buddy_reserve_is_possible(struct nvgpu_buddy_allocator *a, + struct nvgpu_alloc_carveout *co) +{ + struct nvgpu_alloc_carveout *tmp; + u64 co_base, co_end; + + co_base = co->base; + co_end = co->base + co->length; + + /* + * Not the fastest approach but we should not have that many carveouts + * for any reasonable allocator. + */ + list_for_each_entry(tmp, &a->co_list, co_entry) { + if ((co_base >= tmp->base && + co_base < (tmp->base + tmp->length)) || + (co_end >= tmp->base && + co_end < (tmp->base + tmp->length))) + return false; + } + + return true; +} + +/* + * Carveouts can only be reserved before any regular allocations have been + * made. + */ +static int nvgpu_buddy_reserve_co(struct nvgpu_allocator *__a, + struct nvgpu_alloc_carveout *co) +{ + struct nvgpu_buddy_allocator *a = __a->priv; + u64 addr; + int err = 0; + + if (co->base < a->start || (co->base + co->length) > a->end || + a->alloc_made) + return -EINVAL; + + alloc_lock(__a); + + if (!nvgpu_buddy_reserve_is_possible(a, co)) { + err = -EBUSY; + goto done; + } + + /* Should not be possible to fail... */ + addr = __nvgpu_balloc_fixed_buddy(__a, co->base, co->length); + if (!addr) { + err = -ENOMEM; + pr_warn("%s: Failed to reserve a valid carveout!\n", __func__); + goto done; + } + + list_add(&co->co_entry, &a->co_list); + +done: + alloc_unlock(__a); + return err; +} + +/* + * Carveouts can be release at any time. + */ +static void nvgpu_buddy_release_co(struct nvgpu_allocator *__a, + struct nvgpu_alloc_carveout *co) +{ + alloc_lock(__a); + + list_del_init(&co->co_entry); + nvgpu_free(__a, co->base); + + alloc_unlock(__a); +} + +static u64 nvgpu_buddy_alloc_length(struct nvgpu_allocator *a) +{ + struct nvgpu_buddy_allocator *ba = a->priv; + + return ba->length; +} + +static u64 nvgpu_buddy_alloc_base(struct nvgpu_allocator *a) +{ + struct nvgpu_buddy_allocator *ba = a->priv; + + return ba->start; +} + +static int nvgpu_buddy_alloc_inited(struct nvgpu_allocator *a) +{ + struct nvgpu_buddy_allocator *ba = a->priv; + int inited = ba->initialized; + + rmb(); + return inited; +} + +static u64 nvgpu_buddy_alloc_end(struct nvgpu_allocator *a) +{ + struct nvgpu_buddy_allocator *ba = a->priv; + + return ba->end; +} + +static u64 nvgpu_buddy_alloc_space(struct nvgpu_allocator *a) +{ + struct nvgpu_buddy_allocator *ba = a->priv; + u64 space; + + alloc_lock(a); + space = ba->end - ba->start - + (ba->bytes_alloced_real - ba->bytes_freed); + alloc_unlock(a); + + return space; +} + +/* + * Print the buddy allocator top level stats. If you pass @s as NULL then the + * stats are printed to the kernel log. This lets this code be used for + * debugging purposes internal to the allocator. 
+ */ +static void nvgpu_buddy_print_stats(struct nvgpu_allocator *__a, + struct seq_file *s, int lock) +{ + int i = 0; + struct rb_node *node; + struct nvgpu_fixed_alloc *falloc; + struct nvgpu_alloc_carveout *tmp; + struct nvgpu_buddy_allocator *a = __a->priv; + + __alloc_pstat(s, __a, "base = %llu, limit = %llu, blk_size = %llu\n", + a->base, a->length, a->blk_size); + __alloc_pstat(s, __a, "Internal params:\n"); + __alloc_pstat(s, __a, " start = 0x%llx\n", a->start); + __alloc_pstat(s, __a, " end = 0x%llx\n", a->end); + __alloc_pstat(s, __a, " count = 0x%llx\n", a->count); + __alloc_pstat(s, __a, " blks = 0x%llx\n", a->blks); + __alloc_pstat(s, __a, " max_order = %llu\n", a->max_order); + + if (lock) + alloc_lock(__a); + + if (!list_empty(&a->co_list)) { + __alloc_pstat(s, __a, "\n"); + __alloc_pstat(s, __a, "Carveouts:\n"); + list_for_each_entry(tmp, &a->co_list, co_entry) + __alloc_pstat(s, __a, + " CO %2d: %-20s 0x%010llx + 0x%llx\n", + i++, tmp->name, tmp->base, tmp->length); + } + + __alloc_pstat(s, __a, "\n"); + __alloc_pstat(s, __a, "Buddy blocks:\n"); + __alloc_pstat(s, __a, " Order Free Alloced Split\n"); + __alloc_pstat(s, __a, " ----- ---- ------- -----\n"); + + for (i = a->max_order; i >= 0; i--) { + if (a->buddy_list_len[i] == 0 && + a->buddy_list_alloced[i] == 0 && + a->buddy_list_split[i] == 0) + continue; + + __alloc_pstat(s, __a, " %3d %-7llu %-9llu %llu\n", i, + a->buddy_list_len[i], + a->buddy_list_alloced[i], + a->buddy_list_split[i]); + } + + __alloc_pstat(s, __a, "\n"); + + for (node = rb_first(&a->fixed_allocs), i = 1; + node != NULL; + node = rb_next(node)) { + falloc = container_of(node, + struct nvgpu_fixed_alloc, alloced_entry); + + __alloc_pstat(s, __a, "Fixed alloc (%d): [0x%llx -> 0x%llx]\n", + i, falloc->start, falloc->end); + } + + __alloc_pstat(s, __a, "\n"); + __alloc_pstat(s, __a, "Bytes allocated: %llu\n", + a->bytes_alloced); + __alloc_pstat(s, __a, "Bytes allocated (real): %llu\n", + a->bytes_alloced_real); + __alloc_pstat(s, __a, "Bytes freed: %llu\n", + a->bytes_freed); + + if (lock) + alloc_unlock(__a); +} + +static const struct nvgpu_allocator_ops buddy_ops = { + .alloc = nvgpu_buddy_balloc, + .free = nvgpu_buddy_bfree, + + .alloc_fixed = nvgpu_balloc_fixed_buddy, + /* .free_fixed not needed. */ + + .reserve_carveout = nvgpu_buddy_reserve_co, + .release_carveout = nvgpu_buddy_release_co, + + .base = nvgpu_buddy_alloc_base, + .length = nvgpu_buddy_alloc_length, + .end = nvgpu_buddy_alloc_end, + .inited = nvgpu_buddy_alloc_inited, + .space = nvgpu_buddy_alloc_space, + + .fini = nvgpu_buddy_allocator_destroy, + + .print_stats = nvgpu_buddy_print_stats, +}; + +/* + * Initialize a buddy allocator. Returns 0 on success. This allocator does + * not necessarily manage bytes. It manages distinct ranges of resources. This + * allows the allocator to work for things like comp_tags, semaphores, etc. + * + * @allocator: Ptr to an allocator struct to init. + * @vm: GPU VM to associate this allocator with. Can be NULL. Will be used to + * get PTE size for GVA spaces. + * @name: Name of the allocator. Doesn't have to be static storage. + * @base: The base address of the resource pool being managed. + * @size: Number of resources in the pool. + * @blk_size: Minimum number of resources to allocate at once. For things like + * semaphores this is 1. For GVA this might be as much as 64k. This + * corresponds to order 0. Must be power of 2. + * @max_order: Pick a maximum order. If you leave this as 0, the buddy allocator + * will try and pick a reasonable max order. 
+ * @flags: Extra flags necessary. See GPU_BALLOC_*. + */ +int __nvgpu_buddy_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a, + struct vm_gk20a *vm, const char *name, + u64 base, u64 size, u64 blk_size, + u64 max_order, u64 flags) +{ + int err; + u64 pde_size; + struct nvgpu_buddy_allocator *a; + + /* blk_size must be greater than 0 and a power of 2. */ + if (blk_size == 0) + return -EINVAL; + if (blk_size & (blk_size - 1)) + return -EINVAL; + + if (max_order > GPU_BALLOC_MAX_ORDER) + return -EINVAL; + + /* If this is to manage a GVA space we need a VM. */ + if (flags & GPU_ALLOC_GVA_SPACE && !vm) + return -EINVAL; + + a = kzalloc(sizeof(struct nvgpu_buddy_allocator), GFP_KERNEL); + if (!a) + return -ENOMEM; + + err = __nvgpu_alloc_common_init(__a, name, a, false, &buddy_ops); + if (err) + goto fail; + + a->base = base; + a->length = size; + a->blk_size = blk_size; + a->blk_shift = __ffs(blk_size); + a->owner = __a; + + /* + * If base is 0 then modfy base to be the size of one block so that we + * can return errors by returning addr == 0. + */ + if (a->base == 0) { + a->base = a->blk_size; + a->length -= a->blk_size; + } + + a->vm = vm; + if (flags & GPU_ALLOC_GVA_SPACE) { + pde_size = ((u64)vm->big_page_size) << 10; + a->pte_blk_order = balloc_get_order(a, pde_size); + } + + /* + * When we have a GVA space with big_pages enabled the size and base + * must be PDE aligned. If big_pages are not enabled then this + * requirement is not necessary. + */ + if (flags & GPU_ALLOC_GVA_SPACE && vm->big_pages && + (base & ((vm->big_page_size << 10) - 1) || + size & ((vm->big_page_size << 10) - 1))) + return -EINVAL; + + a->flags = flags; + a->max_order = max_order; + + balloc_allocator_align(a); + balloc_compute_max_order(a); + + /* Shared buddy kmem_cache for all allocators. */ + if (!buddy_cache) + buddy_cache = KMEM_CACHE(nvgpu_buddy, 0); + if (!buddy_cache) { + err = -ENOMEM; + goto fail; + } + + a->alloced_buddies = RB_ROOT; + a->fixed_allocs = RB_ROOT; + INIT_LIST_HEAD(&a->co_list); + err = balloc_init_lists(a); + if (err) + goto fail; + + wmb(); + a->initialized = 1; + + nvgpu_init_alloc_debug(g, __a); + alloc_dbg(__a, "New allocator: type buddy\n"); + alloc_dbg(__a, " base 0x%llx\n", a->base); + alloc_dbg(__a, " size 0x%llx\n", a->length); + alloc_dbg(__a, " blk_size 0x%llx\n", a->blk_size); + alloc_dbg(__a, " max_order %llu\n", a->max_order); + alloc_dbg(__a, " flags 0x%llx\n", a->flags); + + return 0; + +fail: + kfree(a); + return err; +} + +int nvgpu_buddy_allocator_init(struct gk20a *g, struct nvgpu_allocator *a, + const char *name, u64 base, u64 size, + u64 blk_size, u64 flags) +{ + return __nvgpu_buddy_allocator_init(g, a, NULL, name, + base, size, blk_size, 0, 0); +} diff --git a/drivers/gpu/nvgpu/common/mm/buddy_allocator_priv.h b/drivers/gpu/nvgpu/common/mm/buddy_allocator_priv.h new file mode 100644 index 00000000..50a11f14 --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/buddy_allocator_priv.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef BUDDY_ALLOCATOR_PRIV_H +#define BUDDY_ALLOCATOR_PRIV_H + +#include +#include + +struct nvgpu_allocator; +struct vm_gk20a; + +/* + * Each buddy is an element in a binary tree. + */ +struct nvgpu_buddy { + struct nvgpu_buddy *parent; /* Parent node. */ + struct nvgpu_buddy *buddy; /* This node's buddy. */ + struct nvgpu_buddy *left; /* Lower address sub-node. */ + struct nvgpu_buddy *right; /* Higher address sub-node. */ + + struct list_head buddy_entry; /* List entry for various lists. */ + struct rb_node alloced_entry; /* RB tree of allocations. */ + + u64 start; /* Start address of this buddy. */ + u64 end; /* End address of this buddy. */ + u64 order; /* Buddy order. */ + +#define BALLOC_BUDDY_ALLOCED 0x1 +#define BALLOC_BUDDY_SPLIT 0x2 +#define BALLOC_BUDDY_IN_LIST 0x4 + int flags; /* List of associated flags. */ + + /* + * Size of the PDE this buddy is using. This allows for grouping like + * sized allocations into the same PDE. This uses the gmmu_pgsz_gk20a + * enum except for the BALLOC_PTE_SIZE_ANY specifier. + */ +#define BALLOC_PTE_SIZE_ANY -1 + int pte_size; +}; + +#define __buddy_flag_ops(flag, flag_up) \ + static inline int buddy_is_ ## flag(struct nvgpu_buddy *b) \ + { \ + return b->flags & BALLOC_BUDDY_ ## flag_up; \ + } \ + static inline void buddy_set_ ## flag(struct nvgpu_buddy *b) \ + { \ + b->flags |= BALLOC_BUDDY_ ## flag_up; \ + } \ + static inline void buddy_clr_ ## flag(struct nvgpu_buddy *b) \ + { \ + b->flags &= ~BALLOC_BUDDY_ ## flag_up; \ + } + +/* + * int buddy_is_alloced(struct nvgpu_buddy *b); + * void buddy_set_alloced(struct nvgpu_buddy *b); + * void buddy_clr_alloced(struct nvgpu_buddy *b); + * + * int buddy_is_split(struct nvgpu_buddy *b); + * void buddy_set_split(struct nvgpu_buddy *b); + * void buddy_clr_split(struct nvgpu_buddy *b); + * + * int buddy_is_in_list(struct nvgpu_buddy *b); + * void buddy_set_in_list(struct nvgpu_buddy *b); + * void buddy_clr_in_list(struct nvgpu_buddy *b); + */ +__buddy_flag_ops(alloced, ALLOCED); +__buddy_flag_ops(split, SPLIT); +__buddy_flag_ops(in_list, IN_LIST); + +/* + * Keeps info for a fixed allocation. + */ +struct nvgpu_fixed_alloc { + struct list_head buddies; /* List of buddies. */ + struct rb_node alloced_entry; /* RB tree of fixed allocations. */ + + u64 start; /* Start of fixed block. */ + u64 end; /* End address. */ +}; + +/* + * GPU buddy allocator for the various GPU address spaces. Each addressable unit + * doesn't have to correspond to a byte. In some cases each unit is a more + * complex object such as a comp_tag line or the like. + * + * The max order is computed based on the size of the minimum order and the size + * of the address space. + * + * order_size is the size of an order 0 buddy. + */ +struct nvgpu_buddy_allocator { + struct nvgpu_allocator *owner; /* Owner of this buddy allocator. */ + struct vm_gk20a *vm; /* Parent VM - can be NULL. */ + + u64 base; /* Base address of the space. */ + u64 length; /* Length of the space. */ + u64 blk_size; /* Size of order 0 allocation. */ + u64 blk_shift; /* Shift to divide by blk_size. */ + + /* Internal stuff. */ + u64 start; /* Real start (aligned to blk_size). */ + u64 end; /* Real end, trimmed if needed. */ + u64 count; /* Count of objects in space. */ + u64 blks; /* Count of blks in the space. */ + u64 max_order; /* Specific maximum order. */ + + struct rb_root alloced_buddies; /* Outstanding allocations. 
*/ + struct rb_root fixed_allocs; /* Outstanding fixed allocations. */ + + struct list_head co_list; + + /* + * Impose an upper bound on the maximum order. + */ +#define GPU_BALLOC_ORDER_LIST_LEN (GPU_BALLOC_MAX_ORDER + 1) + + struct list_head buddy_list[GPU_BALLOC_ORDER_LIST_LEN]; + u64 buddy_list_len[GPU_BALLOC_ORDER_LIST_LEN]; + u64 buddy_list_split[GPU_BALLOC_ORDER_LIST_LEN]; + u64 buddy_list_alloced[GPU_BALLOC_ORDER_LIST_LEN]; + + /* + * This is for when the allocator is managing a GVA space (the + * GPU_ALLOC_GVA_SPACE bit is set in @flags). This requires + * that we group like sized allocations into PDE blocks. + */ + u64 pte_blk_order; + + int initialized; + int alloc_made; /* True after the first alloc. */ + + u64 flags; + + u64 bytes_alloced; + u64 bytes_alloced_real; + u64 bytes_freed; +}; + +static inline struct nvgpu_buddy_allocator *buddy_allocator( + struct nvgpu_allocator *a) +{ + return (struct nvgpu_buddy_allocator *)(a)->priv; +} + +static inline struct list_head *balloc_get_order_list( + struct nvgpu_buddy_allocator *a, int order) +{ + return &a->buddy_list[order]; +} + +static inline u64 balloc_order_to_len(struct nvgpu_buddy_allocator *a, + int order) +{ + return (1 << order) * a->blk_size; +} + +static inline u64 balloc_base_shift(struct nvgpu_buddy_allocator *a, + u64 base) +{ + return base - a->start; +} + +static inline u64 balloc_base_unshift(struct nvgpu_buddy_allocator *a, + u64 base) +{ + return base + a->start; +} + +static inline struct nvgpu_allocator *balloc_owner( + struct nvgpu_buddy_allocator *a) +{ + return a->owner; +} + +#endif diff --git a/drivers/gpu/nvgpu/common/mm/lockless_allocator.c b/drivers/gpu/nvgpu/common/mm/lockless_allocator.c new file mode 100644 index 00000000..e3063a42 --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/lockless_allocator.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include +#include +#include + +#include + +#include "lockless_allocator_priv.h" + +static u64 nvgpu_lockless_alloc_length(struct nvgpu_allocator *a) +{ + struct nvgpu_lockless_allocator *pa = a->priv; + + return pa->length; +} + +static u64 nvgpu_lockless_alloc_base(struct nvgpu_allocator *a) +{ + struct nvgpu_lockless_allocator *pa = a->priv; + + return pa->base; +} + +static int nvgpu_lockless_alloc_inited(struct nvgpu_allocator *a) +{ + struct nvgpu_lockless_allocator *pa = a->priv; + int inited = pa->inited; + + rmb(); + return inited; +} + +static u64 nvgpu_lockless_alloc_end(struct nvgpu_allocator *a) +{ + struct nvgpu_lockless_allocator *pa = a->priv; + + return pa->base + pa->length; +} + +static u64 nvgpu_lockless_alloc(struct nvgpu_allocator *a, u64 len) +{ + struct nvgpu_lockless_allocator *pa = a->priv; + int head, new_head, ret; + u64 addr = 0; + + if (len != pa->blk_size) + return 0; + + head = ACCESS_ONCE(pa->head); + while (head >= 0) { + new_head = ACCESS_ONCE(pa->next[head]); + ret = cmpxchg(&pa->head, head, new_head); + if (ret == head) { + addr = pa->base + head * pa->blk_size; + atomic_inc(&pa->nr_allocs); + alloc_dbg(a, "Alloc node # %d @ addr 0x%llx\n", head, + addr); + break; + } + head = ACCESS_ONCE(pa->head); + } + return addr; +} + +static void nvgpu_lockless_free(struct nvgpu_allocator *a, u64 addr) +{ + struct nvgpu_lockless_allocator *pa = a->priv; + int head, ret; + u64 cur_idx, rem; + + cur_idx = addr - pa->base; + rem = do_div(cur_idx, pa->blk_size); + + while (1) { + head = ACCESS_ONCE(pa->head); + ACCESS_ONCE(pa->next[cur_idx]) = head; + ret = cmpxchg(&pa->head, head, cur_idx); + if (ret == head) { + atomic_dec(&pa->nr_allocs); + alloc_dbg(a, "Free node # %llu\n", cur_idx); + break; + } + } +} + +static void nvgpu_lockless_alloc_destroy(struct nvgpu_allocator *a) +{ + struct nvgpu_lockless_allocator *pa = a->priv; + + nvgpu_fini_alloc_debug(a); + + vfree(pa->next); + kfree(pa); +} + +static void nvgpu_lockless_print_stats(struct nvgpu_allocator *a, + struct seq_file *s, int lock) +{ + struct nvgpu_lockless_allocator *pa = a->priv; + + __alloc_pstat(s, a, "Lockless allocator params:\n"); + __alloc_pstat(s, a, " start = 0x%llx\n", pa->base); + __alloc_pstat(s, a, " end = 0x%llx\n", pa->base + pa->length); + + /* Actual stats. */ + __alloc_pstat(s, a, "Stats:\n"); + __alloc_pstat(s, a, " Number allocs = %d\n", + atomic_read(&pa->nr_allocs)); + __alloc_pstat(s, a, " Number free = %d\n", + pa->nr_nodes - atomic_read(&pa->nr_allocs)); +} + +static const struct nvgpu_allocator_ops pool_ops = { + .alloc = nvgpu_lockless_alloc, + .free = nvgpu_lockless_free, + + .base = nvgpu_lockless_alloc_base, + .length = nvgpu_lockless_alloc_length, + .end = nvgpu_lockless_alloc_end, + .inited = nvgpu_lockless_alloc_inited, + + .fini = nvgpu_lockless_alloc_destroy, + + .print_stats = nvgpu_lockless_print_stats, +}; + +int nvgpu_lockless_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a, + const char *name, u64 base, u64 length, + u64 blk_size, u64 flags) +{ + int i; + int err; + int nr_nodes; + u64 count, rem; + struct nvgpu_lockless_allocator *a; + + if (!blk_size) + return -EINVAL; + + /* + * Ensure we have space for atleast one node & there's no overflow. 
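+ * As an illustration (hypothetical sizes): length = 1 MB with a
+ * blk_size of 64 bytes gives count = 16384 nodes, which costs
+ * 16384 * sizeof(int) = 64 KB for the next[] array allocated below.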
+ * In order to control memory footprint, we require count < INT_MAX + */ + count = length; + rem = do_div(count, blk_size); + if (!base || !count || count > INT_MAX) + return -EINVAL; + + a = kzalloc(sizeof(struct nvgpu_lockless_allocator), GFP_KERNEL); + if (!a) + return -ENOMEM; + + err = __nvgpu_alloc_common_init(__a, name, a, false, &pool_ops); + if (err) + goto fail; + + a->next = vzalloc(sizeof(*a->next) * count); + if (!a->next) { + err = -ENOMEM; + goto fail; + } + + /* chain the elements together to form the initial free list */ + nr_nodes = (int)count; + for (i = 0; i < nr_nodes; i++) + a->next[i] = i + 1; + a->next[nr_nodes - 1] = -1; + + a->base = base; + a->length = length; + a->blk_size = blk_size; + a->nr_nodes = nr_nodes; + a->flags = flags; + atomic_set(&a->nr_allocs, 0); + + wmb(); + a->inited = true; + + nvgpu_init_alloc_debug(g, __a); + alloc_dbg(__a, "New allocator: type lockless\n"); + alloc_dbg(__a, " base 0x%llx\n", a->base); + alloc_dbg(__a, " nodes %d\n", a->nr_nodes); + alloc_dbg(__a, " blk_size 0x%llx\n", a->blk_size); + alloc_dbg(__a, " flags 0x%llx\n", a->flags); + + return 0; + +fail: + kfree(a); + return err; +} diff --git a/drivers/gpu/nvgpu/common/mm/lockless_allocator_priv.h b/drivers/gpu/nvgpu/common/mm/lockless_allocator_priv.h new file mode 100644 index 00000000..32421ac1 --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/lockless_allocator_priv.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/* + * Basics: + * + * - Lockless memory allocator for fixed-size structures, whose + * size is defined up front at init time. + * - Memory footprint scales linearly w/ the number of structures in + * the pool. It is ~= sizeof(int) * N. + * - Memory is pre-allocated by the client. The allocator itself + * only computes the addresses for allocations. + * - Limit of MAX_INT nodes that the allocator can be responsible for. + * + * Implementation details: + * + * The allocator maintains a single list of free nodes. We allocate & + * free nodes from the head of the list. We rely on the cmpxchg() operator + * to maintain atomicity on the head. + * + * So, both allocs & frees are O(1)!! + * + * -- Definitions -- + * Block Size - size of a single structure that this allocator will + * allocate. + * Node - one of the elements of size blk_size in the + * client-allocated buffer. + * Node Index - zero-based index of a node in the client-allocated + * contiguous buffer. + * + * -- Initial State -- + * We maintain the following to track the state of the free list: + * + * 1) A "head" index to track the index of the first free node in the list + * 2) A "next" array to track the index of the next free node in the list + * for every node. So next[head], will give the index to the 2nd free + * element in the list. 
+ * + * So, to begin with, the free list consists of all node indices, and each + * position in the next array contains index N + 1: + * + * head = 0 + * next = [1, 2, 3, 4, -1] : Example for a user-allocated buffer of 5 nodes + * free_list = 0->1->2->3->4->-1 + * + * -- Allocations -- + * 1) Read the current head (aka acq_head) + * 2) Read next[acq_head], to get the 2nd free element (aka new_head) + * 3) cmp_xchg(&head, acq_head, new_head) + * 4) If it succeeds, compute the address of the node, based on + * base address, blk_size, & acq_head. + * + * head = 1; + * next = [1, 2, 3, 4, -1] : Example after allocating Node #0 + * free_list = 1->2->3->4->-1 + * + * head = 2; + * next = [1, 2, 3, 4, -1] : Example after allocating Node #1 + * free_list = 2->3->4->-1 + * + * -- Frees -- + * 1) Based on the address to be freed, calculate the index of the node + * being freed (cur_idx) + * 2) Read the current head (old_head) + * 3) So the freed node is going to go at the head of the list, and we + * want to put the old_head after it. So next[cur_idx] = old_head + * 4) cmpxchg(head, old_head, cur_idx) + * + * head = 0 + * next = [2, 2, 3, 4, -1] + * free_list = 0->2->3->4->-1 : Example after freeing Node #0 + * + * head = 1 + * next = [2, 0, 3, 4, -1] + * free_list = 1->0->2->3->4->-1 : Example after freeing Node #1 + */ + +#ifndef LOCKLESS_ALLOCATOR_PRIV_H +#define LOCKLESS_ALLOCATOR_PRIV_H + +struct nvgpu_allocator; + +struct nvgpu_lockless_allocator { + struct nvgpu_allocator *owner; + + u64 base; /* Base address of the space. */ + u64 length; /* Length of the space. */ + u64 blk_size; /* Size of the structure being allocated */ + int nr_nodes; /* Number of nodes available for allocation */ + + int *next; /* An array holding the next indices per node */ + int head; /* Current node at the top of the stack */ + + u64 flags; + + bool inited; + + /* Statistics */ + atomic_t nr_allocs; +}; + +static inline struct nvgpu_lockless_allocator *lockless_allocator( + struct nvgpu_allocator *a) +{ + return (struct nvgpu_lockless_allocator *)(a)->priv; +} + +#endif diff --git a/drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c b/drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c new file mode 100644 index 00000000..ebd779c0 --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/nvgpu_allocator.c @@ -0,0 +1,212 @@ +/* + * gk20a allocator + * + * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include + +#include + +#include "gk20a/gk20a.h" +#include "gk20a/mm_gk20a.h" +#include "gk20a/platform_gk20a.h" + +u32 nvgpu_alloc_tracing_on; + +u64 nvgpu_alloc_length(struct nvgpu_allocator *a) +{ + if (a->ops->length) + return a->ops->length(a); + + return 0; +} + +u64 nvgpu_alloc_base(struct nvgpu_allocator *a) +{ + if (a->ops->base) + return a->ops->base(a); + + return 0; +} + +u64 nvgpu_alloc_initialized(struct nvgpu_allocator *a) +{ + if (!a->ops || !a->ops->inited) + return 0; + + return a->ops->inited(a); +} + +u64 nvgpu_alloc_end(struct nvgpu_allocator *a) +{ + if (a->ops->end) + return a->ops->end(a); + + return 0; +} + +u64 nvgpu_alloc_space(struct nvgpu_allocator *a) +{ + if (a->ops->space) + return a->ops->space(a); + + return 0; +} + +u64 nvgpu_alloc(struct nvgpu_allocator *a, u64 len) +{ + return a->ops->alloc(a, len); +} + +void nvgpu_free(struct nvgpu_allocator *a, u64 addr) +{ + a->ops->free(a, addr); +} + +u64 nvgpu_alloc_fixed(struct nvgpu_allocator *a, u64 base, u64 len) +{ + if (a->ops->alloc_fixed) + return a->ops->alloc_fixed(a, base, len); + + return 0; +} + +void nvgpu_free_fixed(struct nvgpu_allocator *a, u64 base, u64 len) +{ + /* + * If this operation is not defined for the allocator then just do + * nothing. The alternative would be to fall back on the regular + * free but that may be harmful in unexpected ways. + */ + if (a->ops->free_fixed) + a->ops->free_fixed(a, base, len); +} + +int nvgpu_alloc_reserve_carveout(struct nvgpu_allocator *a, + struct nvgpu_alloc_carveout *co) +{ + if (a->ops->reserve_carveout) + return a->ops->reserve_carveout(a, co); + + return -ENODEV; +} + +void nvgpu_alloc_release_carveout(struct nvgpu_allocator *a, + struct nvgpu_alloc_carveout *co) +{ + if (a->ops->release_carveout) + a->ops->release_carveout(a, co); +} + +void nvgpu_alloc_destroy(struct nvgpu_allocator *a) +{ + a->ops->fini(a); + memset(a, 0, sizeof(*a)); +} + +/* + * Handle the common init stuff for a nvgpu_allocator. + */ +int __nvgpu_alloc_common_init(struct nvgpu_allocator *a, + const char *name, void *priv, bool dbg, + const struct nvgpu_allocator_ops *ops) +{ + if (!ops) + return -EINVAL; + + /* + * This is the bare minimum operations required for a sensible + * allocator. 
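+ * Only alloc(), free() and fini() are mandatory. The query, fixed-alloc
+ * and carveout hooks are optional; the wrappers above substitute a benign
+ * default (return 0, do nothing, or -ENODEV) when they are NULL.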
+ */ + if (!ops->alloc || !ops->free || !ops->fini) + return -EINVAL; + + a->ops = ops; + a->priv = priv; + a->debug = dbg; + + mutex_init(&a->lock); + + strlcpy(a->name, name, sizeof(a->name)); + + return 0; +} + +void nvgpu_alloc_print_stats(struct nvgpu_allocator *__a, + struct seq_file *s, int lock) +{ + __a->ops->print_stats(__a, s, lock); +} + +#ifdef CONFIG_DEBUG_FS +static int __alloc_show(struct seq_file *s, void *unused) +{ + struct nvgpu_allocator *a = s->private; + + nvgpu_alloc_print_stats(a, s, 1); + + return 0; +} + +static int __alloc_open(struct inode *inode, struct file *file) +{ + return single_open(file, __alloc_show, inode->i_private); +} + +static const struct file_operations __alloc_fops = { + .open = __alloc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +void nvgpu_init_alloc_debug(struct gk20a *g, struct nvgpu_allocator *a) +{ +#ifdef CONFIG_DEBUG_FS + if (!g->debugfs_allocators) + return; + + a->debugfs_entry = debugfs_create_file(a->name, S_IRUGO, + g->debugfs_allocators, + a, &__alloc_fops); +#endif +} + +void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a) +{ +#ifdef CONFIG_DEBUG_FS + if (!IS_ERR_OR_NULL(a->debugfs_entry)) + debugfs_remove(a->debugfs_entry); +#endif +} + +void nvgpu_alloc_debugfs_init(struct device *dev) +{ +#ifdef CONFIG_DEBUG_FS + struct gk20a_platform *platform = dev_get_drvdata(dev); + struct dentry *gpu_root = platform->debugfs; + struct gk20a *g = get_gk20a(dev); + + g->debugfs_allocators = debugfs_create_dir("allocators", gpu_root); + if (IS_ERR_OR_NULL(g->debugfs_allocators)) + return; + + debugfs_create_u32("tracing", 0664, g->debugfs_allocators, + &nvgpu_alloc_tracing_on); +#endif +} diff --git a/drivers/gpu/nvgpu/common/mm/page_allocator.c b/drivers/gpu/nvgpu/common/mm/page_allocator.c new file mode 100644 index 00000000..c61b2238 --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/page_allocator.c @@ -0,0 +1,937 @@ +/* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#include +#include + +#include "buddy_allocator_priv.h" + +#define palloc_dbg(a, fmt, arg...) \ + alloc_dbg(palloc_owner(a), fmt, ##arg) + +static struct kmem_cache *page_alloc_cache; +static struct kmem_cache *page_alloc_chunk_cache; +static struct kmem_cache *page_alloc_slab_page_cache; +static DEFINE_MUTEX(meta_data_cache_lock); + +/* + * Handle the book-keeping for these operations. 
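+ * A slab page migrates between its slab's empty, partial and full lists as
+ * objects are allocated and freed; the nr_empty/nr_partial/nr_full counters
+ * simply mirror the list lengths for the stats code further down.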
+ */ +static inline void add_slab_page_to_empty(struct page_alloc_slab *slab, + struct page_alloc_slab_page *page) +{ + BUG_ON(page->state != SP_NONE); + list_add(&page->list_entry, &slab->empty); + slab->nr_empty++; + page->state = SP_EMPTY; +} +static inline void add_slab_page_to_partial(struct page_alloc_slab *slab, + struct page_alloc_slab_page *page) +{ + BUG_ON(page->state != SP_NONE); + list_add(&page->list_entry, &slab->partial); + slab->nr_partial++; + page->state = SP_PARTIAL; +} +static inline void add_slab_page_to_full(struct page_alloc_slab *slab, + struct page_alloc_slab_page *page) +{ + BUG_ON(page->state != SP_NONE); + list_add(&page->list_entry, &slab->full); + slab->nr_full++; + page->state = SP_FULL; +} + +static inline void del_slab_page_from_empty(struct page_alloc_slab *slab, + struct page_alloc_slab_page *page) +{ + list_del_init(&page->list_entry); + slab->nr_empty--; + page->state = SP_NONE; +} +static inline void del_slab_page_from_partial(struct page_alloc_slab *slab, + struct page_alloc_slab_page *page) +{ + list_del_init(&page->list_entry); + slab->nr_partial--; + page->state = SP_NONE; +} +static inline void del_slab_page_from_full(struct page_alloc_slab *slab, + struct page_alloc_slab_page *page) +{ + list_del_init(&page->list_entry); + slab->nr_full--; + page->state = SP_NONE; +} + +static u64 nvgpu_page_alloc_length(struct nvgpu_allocator *a) +{ + struct nvgpu_page_allocator *va = a->priv; + + return nvgpu_alloc_length(&va->source_allocator); +} + +static u64 nvgpu_page_alloc_base(struct nvgpu_allocator *a) +{ + struct nvgpu_page_allocator *va = a->priv; + + return nvgpu_alloc_base(&va->source_allocator); +} + +static int nvgpu_page_alloc_inited(struct nvgpu_allocator *a) +{ + struct nvgpu_page_allocator *va = a->priv; + + return nvgpu_alloc_initialized(&va->source_allocator); +} + +static u64 nvgpu_page_alloc_end(struct nvgpu_allocator *a) +{ + struct nvgpu_page_allocator *va = a->priv; + + return nvgpu_alloc_end(&va->source_allocator); +} + +static u64 nvgpu_page_alloc_space(struct nvgpu_allocator *a) +{ + struct nvgpu_page_allocator *va = a->priv; + + return nvgpu_alloc_space(&va->source_allocator); +} + +static int nvgpu_page_reserve_co(struct nvgpu_allocator *a, + struct nvgpu_alloc_carveout *co) +{ + struct nvgpu_page_allocator *va = a->priv; + + return nvgpu_alloc_reserve_carveout(&va->source_allocator, co); +} + +static void nvgpu_page_release_co(struct nvgpu_allocator *a, + struct nvgpu_alloc_carveout *co) +{ + struct nvgpu_page_allocator *va = a->priv; + + nvgpu_alloc_release_carveout(&va->source_allocator, co); +} + +static void __nvgpu_free_pages(struct nvgpu_page_allocator *a, + struct nvgpu_page_alloc *alloc, + bool free_buddy_alloc) +{ + struct page_alloc_chunk *chunk; + + while (!list_empty(&alloc->alloc_chunks)) { + chunk = list_first_entry(&alloc->alloc_chunks, + struct page_alloc_chunk, + list_entry); + list_del(&chunk->list_entry); + + if (free_buddy_alloc) + nvgpu_free(&a->source_allocator, chunk->base); + kfree(chunk); + } + + kfree(alloc); +} + +static int __insert_page_alloc(struct nvgpu_page_allocator *a, + struct nvgpu_page_alloc *alloc) +{ + struct rb_node **new = &a->allocs.rb_node; + struct rb_node *parent = NULL; + + while (*new) { + struct nvgpu_page_alloc *tmp = + container_of(*new, struct nvgpu_page_alloc, + tree_entry); + + parent = *new; + if (alloc->base < tmp->base) { + new = &((*new)->rb_left); + } else if (alloc->base > tmp->base) { + new = &((*new)->rb_right); + } else { + WARN(1, "Duplicate entries in allocated 
list!\n"); + return 0; + } + } + + rb_link_node(&alloc->tree_entry, parent, new); + rb_insert_color(&alloc->tree_entry, &a->allocs); + + return 0; +} + +static struct nvgpu_page_alloc *__find_page_alloc( + struct nvgpu_page_allocator *a, + u64 addr) +{ + struct rb_node *node = a->allocs.rb_node; + struct nvgpu_page_alloc *alloc; + + while (node) { + alloc = container_of(node, struct nvgpu_page_alloc, tree_entry); + + if (addr < alloc->base) + node = node->rb_left; + else if (addr > alloc->base) + node = node->rb_right; + else + break; + } + + if (!node) + return NULL; + + rb_erase(node, &a->allocs); + + return alloc; +} + +static struct page_alloc_slab_page *alloc_slab_page( + struct nvgpu_page_allocator *a, + struct page_alloc_slab *slab) +{ + struct page_alloc_slab_page *slab_page; + + slab_page = kmem_cache_alloc(page_alloc_slab_page_cache, GFP_KERNEL); + if (!slab_page) { + palloc_dbg(a, "OOM: unable to alloc slab_page struct!\n"); + return ERR_PTR(-ENOMEM); + } + + memset(slab_page, 0, sizeof(*slab_page)); + + slab_page->page_addr = nvgpu_alloc(&a->source_allocator, a->page_size); + if (!slab_page->page_addr) { + kfree(slab_page); + palloc_dbg(a, "OOM: vidmem is full!\n"); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&slab_page->list_entry); + slab_page->slab_size = slab->slab_size; + slab_page->nr_objects = (u32)a->page_size / slab->slab_size; + slab_page->nr_objects_alloced = 0; + slab_page->owner = slab; + slab_page->state = SP_NONE; + + a->pages_alloced++; + + palloc_dbg(a, "Allocated new slab page @ 0x%012llx size=%u\n", + slab_page->page_addr, slab_page->slab_size); + + return slab_page; +} + +static void free_slab_page(struct nvgpu_page_allocator *a, + struct page_alloc_slab_page *slab_page) +{ + palloc_dbg(a, "Freeing slab page @ 0x%012llx\n", slab_page->page_addr); + + BUG_ON((slab_page->state != SP_NONE && slab_page->state != SP_EMPTY) || + slab_page->nr_objects_alloced != 0 || + slab_page->bitmap != 0); + + nvgpu_free(&a->source_allocator, slab_page->page_addr); + a->pages_freed++; + + kmem_cache_free(page_alloc_slab_page_cache, slab_page); +} + +/* + * This expects @alloc to have 1 empty page_alloc_chunk already added to the + * alloc_chunks list. + */ +static int __do_slab_alloc(struct nvgpu_page_allocator *a, + struct page_alloc_slab *slab, + struct nvgpu_page_alloc *alloc) +{ + struct page_alloc_slab_page *slab_page = NULL; + struct page_alloc_chunk *chunk; + unsigned long offs; + + /* + * Check the partial and empty lists to see if we have some space + * readily available. Take the slab_page out of what ever list it + * was in since it may be put back into a different list later. + */ + if (!list_empty(&slab->partial)) { + slab_page = list_first_entry(&slab->partial, + struct page_alloc_slab_page, + list_entry); + del_slab_page_from_partial(slab, slab_page); + } else if (!list_empty(&slab->empty)) { + slab_page = list_first_entry(&slab->empty, + struct page_alloc_slab_page, + list_entry); + del_slab_page_from_empty(slab, slab_page); + } + + if (!slab_page) { + slab_page = alloc_slab_page(a, slab); + if (IS_ERR(slab_page)) + return PTR_ERR(slab_page); + } + + /* + * We now have a slab_page. Do the alloc. + */ + offs = bitmap_find_next_zero_area(&slab_page->bitmap, + slab_page->nr_objects, + 0, 1, 0); + if (offs >= slab_page->nr_objects) { + WARN(1, "Empty/partial slab with no free objects?"); + + /* Add the buggy page to the full list... This isn't ideal. 
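+ * At least the full list is never searched for free objects, so the
+ * corrupted page will not be handed out again.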
*/ + add_slab_page_to_full(slab, slab_page); + return -ENOMEM; + } + + bitmap_set(&slab_page->bitmap, offs, 1); + slab_page->nr_objects_alloced++; + + if (slab_page->nr_objects_alloced < slab_page->nr_objects) + add_slab_page_to_partial(slab, slab_page); + else if (slab_page->nr_objects_alloced == slab_page->nr_objects) + add_slab_page_to_full(slab, slab_page); + else + BUG(); /* Should be impossible to hit this. */ + + /* + * Handle building the nvgpu_page_alloc struct. We expect one + * page_alloc_chunk to be present. + */ + alloc->slab_page = slab_page; + alloc->nr_chunks = 1; + alloc->length = slab_page->slab_size; + alloc->base = slab_page->page_addr + (offs * slab_page->slab_size); + + chunk = list_first_entry(&alloc->alloc_chunks, + struct page_alloc_chunk, list_entry); + chunk->base = alloc->base; + chunk->length = alloc->length; + + return 0; +} + +/* + * Allocate from a slab instead of directly from the page allocator. + */ +static struct nvgpu_page_alloc *__nvgpu_alloc_slab( + struct nvgpu_page_allocator *a, u64 len) +{ + int err, slab_nr; + struct page_alloc_slab *slab; + struct nvgpu_page_alloc *alloc = NULL; + struct page_alloc_chunk *chunk = NULL; + + /* + * Align the length to a page and then divide by the page size (4k for + * this code). ilog2() of that then gets us the correct slab to use. + */ + slab_nr = (int)ilog2(PAGE_ALIGN(len) >> 12); + slab = &a->slabs[slab_nr]; + + alloc = kmem_cache_alloc(page_alloc_cache, GFP_KERNEL); + if (!alloc) { + palloc_dbg(a, "OOM: could not alloc page_alloc struct!\n"); + goto fail; + } + chunk = kmem_cache_alloc(page_alloc_chunk_cache, GFP_KERNEL); + if (!chunk) { + palloc_dbg(a, "OOM: could not alloc alloc_chunk struct!\n"); + goto fail; + } + + INIT_LIST_HEAD(&alloc->alloc_chunks); + list_add(&chunk->list_entry, &alloc->alloc_chunks); + + err = __do_slab_alloc(a, slab, alloc); + if (err) + goto fail; + + palloc_dbg(a, "Alloc 0x%04llx sr=%d id=0x%010llx [slab]\n", + len, slab_nr, alloc->base); + a->nr_slab_allocs++; + + return alloc; + +fail: + kfree(alloc); + kfree(chunk); + return NULL; +} + +static void __nvgpu_free_slab(struct nvgpu_page_allocator *a, + struct nvgpu_page_alloc *alloc) +{ + struct page_alloc_slab_page *slab_page = alloc->slab_page; + struct page_alloc_slab *slab = slab_page->owner; + enum slab_page_state new_state; + int offs; + + offs = (u32)(alloc->base - slab_page->page_addr) / slab_page->slab_size; + bitmap_clear(&slab_page->bitmap, offs, 1); + + slab_page->nr_objects_alloced--; + + if (slab_page->nr_objects_alloced == 0) + new_state = SP_EMPTY; + else + new_state = SP_PARTIAL; + + /* + * Need to migrate the page to a different list. + */ + if (new_state != slab_page->state) { + /* Delete - can't be in empty. */ + if (slab_page->state == SP_PARTIAL) + del_slab_page_from_partial(slab, slab_page); + else + del_slab_page_from_full(slab, slab_page); + + /* And add. */ + if (new_state == SP_EMPTY) { + if (list_empty(&slab->empty)) + add_slab_page_to_empty(slab, slab_page); + else + free_slab_page(a, slab_page); + } else { + add_slab_page_to_partial(slab, slab_page); + } + } + + /* + * Now handle the page_alloc. + */ + __nvgpu_free_pages(a, alloc, false); + a->nr_slab_frees++; + + return; +} + +/* + * Allocate physical pages. Since the underlying allocator is a buddy allocator + * the returned pages are always contiguous. However, since there could be + * fragmentation in the space this allocator will collate smaller non-contiguous + * allocations together if necessary. 
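+ * For example (hypothetical numbers): a 256 KB request against a 64 KB
+ * page size may come back as one 256 KB chunk, two 128 KB chunks, or a
+ * mix down to single 64 KB pages, depending on what the underlying buddy
+ * allocator can supply; each piece is one page_alloc_chunk on the
+ * alloc_chunks list.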
+ */ +static struct nvgpu_page_alloc *__do_nvgpu_alloc_pages( + struct nvgpu_page_allocator *a, u64 pages) +{ + struct nvgpu_page_alloc *alloc; + struct page_alloc_chunk *c; + u64 max_chunk_len = pages << a->page_shift; + int i = 0; + + alloc = kmem_cache_alloc(page_alloc_cache, GFP_KERNEL); + if (!alloc) + goto fail; + + memset(alloc, 0, sizeof(*alloc)); + + INIT_LIST_HEAD(&alloc->alloc_chunks); + alloc->length = pages << a->page_shift; + + while (pages) { + u64 chunk_addr = 0; + u64 chunk_pages = (u64)1 << __fls(pages); + u64 chunk_len = chunk_pages << a->page_shift; + + /* + * Take care of the possibility that the allocation must be + * contiguous. If this is not the first iteration then that + * means the first iteration failed to alloc the entire + * requested size. The buddy allocator guarantees any given + * single alloc is contiguous. + */ + if (a->flags & GPU_ALLOC_FORCE_CONTIG && i != 0) + goto fail_cleanup; + + if (chunk_len > max_chunk_len) + chunk_len = max_chunk_len; + + /* + * Keep attempting to allocate in smaller chunks until the alloc + * either succeeds or is smaller than the page_size of the + * allocator (i.e the allocator is OOM). + */ + do { + chunk_addr = nvgpu_alloc(&a->source_allocator, + chunk_len); + + /* Divide by 2 and try again */ + if (!chunk_addr) { + palloc_dbg(a, "balloc failed: 0x%llx\n", + chunk_len); + chunk_len >>= 1; + max_chunk_len = chunk_len; + } + } while (!chunk_addr && chunk_len >= a->page_size); + + chunk_pages = chunk_len >> a->page_shift; + + if (!chunk_addr) { + palloc_dbg(a, "bailing @ 0x%llx\n", chunk_len); + goto fail_cleanup; + } + + c = kmem_cache_alloc(page_alloc_chunk_cache, GFP_KERNEL); + if (!c) { + nvgpu_free(&a->source_allocator, chunk_addr); + goto fail_cleanup; + } + + pages -= chunk_pages; + + c->base = chunk_addr; + c->length = chunk_len; + list_add(&c->list_entry, &alloc->alloc_chunks); + + i++; + } + + alloc->nr_chunks = i; + c = list_first_entry(&alloc->alloc_chunks, + struct page_alloc_chunk, list_entry); + alloc->base = c->base; + + return alloc; + +fail_cleanup: + while (!list_empty(&alloc->alloc_chunks)) { + c = list_first_entry(&alloc->alloc_chunks, + struct page_alloc_chunk, list_entry); + list_del(&c->list_entry); + nvgpu_free(&a->source_allocator, c->base); + kfree(c); + } + kfree(alloc); +fail: + return ERR_PTR(-ENOMEM); +} + +static struct nvgpu_page_alloc *__nvgpu_alloc_pages( + struct nvgpu_page_allocator *a, u64 len) +{ + struct nvgpu_page_alloc *alloc = NULL; + struct page_alloc_chunk *c; + u64 pages; + int i = 0; + + pages = ALIGN(len, a->page_size) >> a->page_shift; + + alloc = __do_nvgpu_alloc_pages(a, pages); + if (IS_ERR(alloc)) { + palloc_dbg(a, "Alloc 0x%llx (%llu) (failed)\n", + pages << a->page_shift, pages); + return NULL; + } + + palloc_dbg(a, "Alloc 0x%llx (%llu) id=0x%010llx\n", + pages << a->page_shift, pages, alloc->base); + list_for_each_entry(c, &alloc->alloc_chunks, list_entry) { + palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", + i++, c->base, c->length); + } + + return alloc; +} + +/* + * Allocate enough pages to satisfy @len. Page size is determined at + * initialization of the allocator. + * + * The return is actually a pointer to a struct nvgpu_page_alloc pointer. This + * is because it doesn't make a lot of sense to return the address of the first + * page in the list of pages (since they could be discontiguous). This has + * precedent in the dma_alloc APIs, though, it's really just an annoying + * artifact of the fact that the nvgpu_alloc() API requires a u64 return type. 
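+ *
+ * A caller therefore typically does something like (illustrative only,
+ * names not taken from this patch):
+ *
+ *   u64 handle = nvgpu_alloc(na, len);
+ *   struct nvgpu_page_alloc *alloc =
+ *           (struct nvgpu_page_alloc *)(uintptr_t)handle;
+ *
+ * where na is the wrapping struct nvgpu_allocator. When
+ * GPU_ALLOC_NO_SCATTER_GATHER is set the returned u64 is instead the base
+ * address of the (necessarily contiguous) allocation.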
+ */ +static u64 nvgpu_page_alloc(struct nvgpu_allocator *__a, u64 len) +{ + struct nvgpu_page_allocator *a = page_allocator(__a); + struct nvgpu_page_alloc *alloc = NULL; + u64 real_len; + + /* + * If we want contig pages we have to round up to a power of two. It's + * easier to do that here than in the buddy allocator. + */ + real_len = a->flags & GPU_ALLOC_FORCE_CONTIG ? + roundup_pow_of_two(len) : len; + + alloc_lock(__a); + if (a->flags & GPU_ALLOC_4K_VIDMEM_PAGES && + real_len <= (a->page_size / 2)) + alloc = __nvgpu_alloc_slab(a, real_len); + else + alloc = __nvgpu_alloc_pages(a, real_len); + + if (!alloc) { + alloc_unlock(__a); + return 0; + } + + __insert_page_alloc(a, alloc); + + a->nr_allocs++; + if (real_len > a->page_size / 2) + a->pages_alloced += alloc->length >> a->page_shift; + alloc_unlock(__a); + + if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER) + return alloc->base; + else + return (u64) (uintptr_t) alloc; +} + +/* + * Note: this will remove the nvgpu_page_alloc struct from the RB tree + * if it's found. + */ +static void nvgpu_page_free(struct nvgpu_allocator *__a, u64 base) +{ + struct nvgpu_page_allocator *a = page_allocator(__a); + struct nvgpu_page_alloc *alloc; + + alloc_lock(__a); + + if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER) + alloc = __find_page_alloc(a, base); + else + alloc = __find_page_alloc(a, + ((struct nvgpu_page_alloc *)(uintptr_t)base)->base); + + if (!alloc) { + palloc_dbg(a, "Hrm, found no alloc?\n"); + goto done; + } + + a->nr_frees++; + + palloc_dbg(a, "Free 0x%llx id=0x%010llx\n", + alloc->length, alloc->base); + + /* + * Frees *alloc. + */ + if (alloc->slab_page) { + __nvgpu_free_slab(a, alloc); + } else { + a->pages_freed += (alloc->length >> a->page_shift); + __nvgpu_free_pages(a, alloc, true); + } + +done: + alloc_unlock(__a); +} + +static struct nvgpu_page_alloc *__nvgpu_alloc_pages_fixed( + struct nvgpu_page_allocator *a, u64 base, u64 length) +{ + struct nvgpu_page_alloc *alloc; + struct page_alloc_chunk *c; + + alloc = kmem_cache_alloc(page_alloc_cache, GFP_KERNEL); + c = kmem_cache_alloc(page_alloc_chunk_cache, GFP_KERNEL); + if (!alloc || !c) + goto fail; + + alloc->base = nvgpu_alloc_fixed(&a->source_allocator, base, length); + if (!alloc->base) { + WARN(1, "nvgpu: failed to fixed alloc pages @ 0x%010llx", base); + goto fail; + } + + alloc->nr_chunks = 1; + alloc->length = length; + INIT_LIST_HEAD(&alloc->alloc_chunks); + + c->base = alloc->base; + c->length = length; + list_add(&c->list_entry, &alloc->alloc_chunks); + + return alloc; + +fail: + kfree(c); + kfree(alloc); + return ERR_PTR(-ENOMEM); +} + +static u64 nvgpu_page_alloc_fixed(struct nvgpu_allocator *__a, + u64 base, u64 len) +{ + struct nvgpu_page_allocator *a = page_allocator(__a); + struct nvgpu_page_alloc *alloc = NULL; + struct page_alloc_chunk *c; + u64 aligned_len, pages; + int i = 0; + + aligned_len = ALIGN(len, a->page_size); + pages = aligned_len >> a->page_shift; + + alloc_lock(__a); + + alloc = __nvgpu_alloc_pages_fixed(a, base, aligned_len); + if (IS_ERR(alloc)) { + alloc_unlock(__a); + return 0; + } + + __insert_page_alloc(a, alloc); + alloc_unlock(__a); + + palloc_dbg(a, "Alloc [fixed] @ 0x%010llx + 0x%llx (%llu)\n", + alloc->base, aligned_len, pages); + list_for_each_entry(c, &alloc->alloc_chunks, list_entry) { + palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", + i++, c->base, c->length); + } + + a->nr_fixed_allocs++; + a->pages_alloced += pages; + + if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER) + return alloc->base; + else + return (u64) (uintptr_t) alloc; +} + 
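+/*
+ * Counterpart to nvgpu_page_alloc_fixed(). Note that @len is not used
+ * here: with GPU_ALLOC_NO_SCATTER_GATHER the allocation is looked up by
+ * @base in the outstanding-allocations tree, and otherwise @base is really
+ * the pointer to the nvgpu_page_alloc struct, which carries its own length.
+ */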
+static void nvgpu_page_free_fixed(struct nvgpu_allocator *__a, + u64 base, u64 len) +{ + struct nvgpu_page_allocator *a = page_allocator(__a); + struct nvgpu_page_alloc *alloc; + + alloc_lock(__a); + + if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER) { + alloc = __find_page_alloc(a, base); + if (!alloc) + goto done; + } else { + alloc = (struct nvgpu_page_alloc *) (uintptr_t) base; + } + + palloc_dbg(a, "Free [fixed] 0x%010llx + 0x%llx\n", + alloc->base, alloc->length); + + a->nr_fixed_frees++; + a->pages_freed += (alloc->length >> a->page_shift); + + /* + * This works for the time being since the buddy allocator + * uses the same free function for both fixed and regular + * allocs. This would have to be updated if the underlying + * allocator were to change. + */ + __nvgpu_free_pages(a, alloc, true); + +done: + alloc_unlock(__a); +} + +static void nvgpu_page_allocator_destroy(struct nvgpu_allocator *__a) +{ + struct nvgpu_page_allocator *a = page_allocator(__a); + + alloc_lock(__a); + kfree(a); + __a->priv = NULL; + alloc_unlock(__a); +} + +static void nvgpu_page_print_stats(struct nvgpu_allocator *__a, + struct seq_file *s, int lock) +{ + struct nvgpu_page_allocator *a = page_allocator(__a); + int i; + + if (lock) + alloc_lock(__a); + + __alloc_pstat(s, __a, "Page allocator:\n"); + __alloc_pstat(s, __a, " allocs %lld\n", a->nr_allocs); + __alloc_pstat(s, __a, " frees %lld\n", a->nr_frees); + __alloc_pstat(s, __a, " fixed_allocs %lld\n", a->nr_fixed_allocs); + __alloc_pstat(s, __a, " fixed_frees %lld\n", a->nr_fixed_frees); + __alloc_pstat(s, __a, " slab_allocs %lld\n", a->nr_slab_allocs); + __alloc_pstat(s, __a, " slab_frees %lld\n", a->nr_slab_frees); + __alloc_pstat(s, __a, " pages alloced %lld\n", a->pages_alloced); + __alloc_pstat(s, __a, " pages freed %lld\n", a->pages_freed); + __alloc_pstat(s, __a, "\n"); + + /* + * Slab info. + */ + if (a->flags & GPU_ALLOC_4K_VIDMEM_PAGES) { + __alloc_pstat(s, __a, "Slabs:\n"); + __alloc_pstat(s, __a, " size empty partial full\n"); + __alloc_pstat(s, __a, " ---- ----- ------- ----\n"); + + for (i = 0; i < a->nr_slabs; i++) { + struct page_alloc_slab *slab = &a->slabs[i]; + + __alloc_pstat(s, __a, " %-9u %-9d %-9u %u\n", + slab->slab_size, + slab->nr_empty, slab->nr_partial, + slab->nr_full); + } + __alloc_pstat(s, __a, "\n"); + } + + __alloc_pstat(s, __a, "Source alloc: %s\n", + a->source_allocator.name); + nvgpu_alloc_print_stats(&a->source_allocator, s, lock); + + if (lock) + alloc_unlock(__a); +} + +static const struct nvgpu_allocator_ops page_ops = { + .alloc = nvgpu_page_alloc, + .free = nvgpu_page_free, + + .alloc_fixed = nvgpu_page_alloc_fixed, + .free_fixed = nvgpu_page_free_fixed, + + .reserve_carveout = nvgpu_page_reserve_co, + .release_carveout = nvgpu_page_release_co, + + .base = nvgpu_page_alloc_base, + .length = nvgpu_page_alloc_length, + .end = nvgpu_page_alloc_end, + .inited = nvgpu_page_alloc_inited, + .space = nvgpu_page_alloc_space, + + .fini = nvgpu_page_allocator_destroy, + + .print_stats = nvgpu_page_print_stats, +}; + +/* + * nr_slabs is computed as follows: divide page_size by 4096 to get number of + * 4k pages in page_size. Then take the base 2 log of that to get number of + * slabs. For 64k page_size that works on like: + * + * 1024*64 / 1024*4 = 16 + * ilog2(16) = 4 + * + * That gives buckets of 1, 2, 4, and 8 pages (i.e 4k, 8k, 16k, 32k). 
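+ *
+ * Each slab bucket then serves objects of size SZ_4K << i (see the init
+ * loop below); requests larger than page_size / 2 bypass the slabs and go
+ * straight to __nvgpu_alloc_pages().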
+ */ +static int nvgpu_page_alloc_init_slabs(struct nvgpu_page_allocator *a) +{ + size_t nr_slabs = ilog2(a->page_size >> 12); + unsigned int i; + + a->slabs = kcalloc(nr_slabs, + sizeof(struct page_alloc_slab), + GFP_KERNEL); + if (!a->slabs) + return -ENOMEM; + a->nr_slabs = nr_slabs; + + for (i = 0; i < nr_slabs; i++) { + struct page_alloc_slab *slab = &a->slabs[i]; + + slab->slab_size = SZ_4K * (1 << i); + INIT_LIST_HEAD(&slab->empty); + INIT_LIST_HEAD(&slab->partial); + INIT_LIST_HEAD(&slab->full); + slab->nr_empty = 0; + slab->nr_partial = 0; + slab->nr_full = 0; + } + + return 0; +} + +int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *__a, + const char *name, u64 base, u64 length, + u64 blk_size, u64 flags) +{ + struct nvgpu_page_allocator *a; + char buddy_name[sizeof(__a->name)]; + int err; + + mutex_lock(&meta_data_cache_lock); + if (!page_alloc_cache) + page_alloc_cache = KMEM_CACHE(nvgpu_page_alloc, 0); + if (!page_alloc_chunk_cache) + page_alloc_chunk_cache = KMEM_CACHE(page_alloc_chunk, 0); + if (!page_alloc_slab_page_cache) + page_alloc_slab_page_cache = + KMEM_CACHE(page_alloc_slab_page, 0); + mutex_unlock(&meta_data_cache_lock); + + if (!page_alloc_cache || !page_alloc_chunk_cache) + return -ENOMEM; + + if (blk_size < SZ_4K) + return -EINVAL; + + a = kzalloc(sizeof(struct nvgpu_page_allocator), GFP_KERNEL); + if (!a) + return -ENOMEM; + + err = __nvgpu_alloc_common_init(__a, name, a, false, &page_ops); + if (err) + goto fail; + + a->base = base; + a->length = length; + a->page_size = blk_size; + a->page_shift = __ffs(blk_size); + a->allocs = RB_ROOT; + a->owner = __a; + a->flags = flags; + + if (flags & GPU_ALLOC_4K_VIDMEM_PAGES && blk_size > SZ_4K) { + err = nvgpu_page_alloc_init_slabs(a); + if (err) + goto fail; + } + + snprintf(buddy_name, sizeof(buddy_name), "%s-src", name); + + err = nvgpu_buddy_allocator_init(g, &a->source_allocator, buddy_name, + base, length, blk_size, 0); + if (err) + goto fail; + + nvgpu_init_alloc_debug(g, __a); + palloc_dbg(a, "New allocator: type page\n"); + palloc_dbg(a, " base 0x%llx\n", a->base); + palloc_dbg(a, " size 0x%llx\n", a->length); + palloc_dbg(a, " page_size 0x%llx\n", a->page_size); + palloc_dbg(a, " flags 0x%llx\n", a->flags); + palloc_dbg(a, " slabs: %d\n", a->nr_slabs); + + return 0; + +fail: + kfree(a); + return err; +} diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.c b/drivers/gpu/nvgpu/gk20a/as_gk20a.c index 0b90090a..07601d42 100644 --- a/drivers/gpu/nvgpu/gk20a/as_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.c @@ -279,17 +279,17 @@ static int gk20a_as_ioctl_get_va_regions( for (i = 0; i < write_entries; ++i) { struct nvgpu_as_va_region region; - struct gk20a_allocator *vma = - gk20a_alloc_initialized(&vm->fixed) ? + struct nvgpu_allocator *vma = + nvgpu_alloc_initialized(&vm->fixed) ? &vm->fixed : &vm->vma[i]; memset(®ion, 0, sizeof(struct nvgpu_as_va_region)); region.page_size = vm->gmmu_page_sizes[i]; - region.offset = gk20a_alloc_base(vma); + region.offset = nvgpu_alloc_base(vma); /* No __aeabi_uldivmod() on some platforms... 
*/ - region.pages = (gk20a_alloc_end(vma) - - gk20a_alloc_base(vma)) >> ilog2(region.page_size); + region.pages = (nvgpu_alloc_end(vma) - + nvgpu_alloc_base(vma)) >> ilog2(region.page_size); if (copy_to_user(user_region_ptr + i, ®ion, sizeof(region))) return -EFAULT; diff --git a/drivers/gpu/nvgpu/gk20a/bitmap_allocator_priv.h b/drivers/gpu/nvgpu/gk20a/bitmap_allocator_priv.h deleted file mode 100644 index a686b704..00000000 --- a/drivers/gpu/nvgpu/gk20a/bitmap_allocator_priv.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifndef BITMAP_ALLOCATOR_PRIV_H -#define BITMAP_ALLOCATOR_PRIV_H - -#include - -struct gk20a_allocator; - -struct gk20a_bitmap_allocator { - struct gk20a_allocator *owner; - - u64 base; /* Base address of the space. */ - u64 length; /* Length of the space. */ - u64 blk_size; /* Size that corresponds to 1 bit. */ - u64 blk_shift; /* Bit shift to divide by blk_size. */ - u64 num_bits; /* Number of allocatable bits. */ - u64 bit_offs; /* Offset of bitmap. */ - - /* - * Optimization for making repeated allocations faster. Keep track of - * the next bit after the most recent allocation. This is where the next - * search will start from. This should make allocation faster in cases - * where lots of allocations get made one after another. It shouldn't - * have a negative impact on the case where the allocator is fragmented. - */ - u64 next_blk; - - unsigned long *bitmap; /* The actual bitmap! */ - struct rb_root allocs; /* Tree of outstanding allocations. */ - - u64 flags; - - bool inited; - - /* Statistics */ - u64 nr_allocs; - u64 nr_fixed_allocs; - u64 bytes_alloced; - u64 bytes_freed; -}; - -struct gk20a_bitmap_alloc { - u64 base; - u64 length; - struct rb_node alloc_entry; /* RB tree of allocations. */ -}; - -static inline struct gk20a_bitmap_allocator *bitmap_allocator( - struct gk20a_allocator *a) -{ - return (struct gk20a_bitmap_allocator *)(a)->priv; -} - - -#endif diff --git a/drivers/gpu/nvgpu/gk20a/buddy_allocator_priv.h b/drivers/gpu/nvgpu/gk20a/buddy_allocator_priv.h deleted file mode 100644 index bb8b307b..00000000 --- a/drivers/gpu/nvgpu/gk20a/buddy_allocator_priv.h +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -#ifndef BUDDY_ALLOCATOR_PRIV_H -#define BUDDY_ALLOCATOR_PRIV_H - -#include -#include - -struct gk20a_allocator; -struct vm_gk20a; - -/* - * Each buddy is an element in a binary tree. - */ -struct gk20a_buddy { - struct gk20a_buddy *parent; /* Parent node. */ - struct gk20a_buddy *buddy; /* This node's buddy. */ - struct gk20a_buddy *left; /* Lower address sub-node. */ - struct gk20a_buddy *right; /* Higher address sub-node. */ - - struct list_head buddy_entry; /* List entry for various lists. */ - struct rb_node alloced_entry; /* RB tree of allocations. */ - - u64 start; /* Start address of this buddy. */ - u64 end; /* End address of this buddy. */ - u64 order; /* Buddy order. */ - -#define BALLOC_BUDDY_ALLOCED 0x1 -#define BALLOC_BUDDY_SPLIT 0x2 -#define BALLOC_BUDDY_IN_LIST 0x4 - int flags; /* List of associated flags. */ - - /* - * Size of the PDE this buddy is using. This allows for grouping like - * sized allocations into the same PDE. This uses the gmmu_pgsz_gk20a - * enum except for the BALLOC_PTE_SIZE_ANY specifier. - */ -#define BALLOC_PTE_SIZE_ANY -1 - int pte_size; -}; - -#define __buddy_flag_ops(flag, flag_up) \ - static inline int buddy_is_ ## flag(struct gk20a_buddy *b) \ - { \ - return b->flags & BALLOC_BUDDY_ ## flag_up; \ - } \ - static inline void buddy_set_ ## flag(struct gk20a_buddy *b) \ - { \ - b->flags |= BALLOC_BUDDY_ ## flag_up; \ - } \ - static inline void buddy_clr_ ## flag(struct gk20a_buddy *b) \ - { \ - b->flags &= ~BALLOC_BUDDY_ ## flag_up; \ - } - -/* - * int buddy_is_alloced(struct gk20a_buddy *b); - * void buddy_set_alloced(struct gk20a_buddy *b); - * void buddy_clr_alloced(struct gk20a_buddy *b); - * - * int buddy_is_split(struct gk20a_buddy *b); - * void buddy_set_split(struct gk20a_buddy *b); - * void buddy_clr_split(struct gk20a_buddy *b); - * - * int buddy_is_in_list(struct gk20a_buddy *b); - * void buddy_set_in_list(struct gk20a_buddy *b); - * void buddy_clr_in_list(struct gk20a_buddy *b); - */ -__buddy_flag_ops(alloced, ALLOCED); -__buddy_flag_ops(split, SPLIT); -__buddy_flag_ops(in_list, IN_LIST); - -/* - * Keeps info for a fixed allocation. - */ -struct gk20a_fixed_alloc { - struct list_head buddies; /* List of buddies. */ - struct rb_node alloced_entry; /* RB tree of fixed allocations. */ - - u64 start; /* Start of fixed block. */ - u64 end; /* End address. */ -}; - -/* - * GPU buddy allocator for the various GPU address spaces. Each addressable unit - * doesn't have to correspond to a byte. In some cases each unit is a more - * complex object such as a comp_tag line or the like. - * - * The max order is computed based on the size of the minimum order and the size - * of the address space. - * - * order_size is the size of an order 0 buddy. - */ -struct gk20a_buddy_allocator { - struct gk20a_allocator *owner; /* Owner of this buddy allocator. */ - struct vm_gk20a *vm; /* Parent VM - can be NULL. */ - - u64 base; /* Base address of the space. */ - u64 length; /* Length of the space. */ - u64 blk_size; /* Size of order 0 allocation. */ - u64 blk_shift; /* Shift to divide by blk_size. */ - - /* Internal stuff. */ - u64 start; /* Real start (aligned to blk_size). */ - u64 end; /* Real end, trimmed if needed. */ - u64 count; /* Count of objects in space. */ - u64 blks; /* Count of blks in the space. */ - u64 max_order; /* Specific maximum order. */ - - struct rb_root alloced_buddies; /* Outstanding allocations. */ - struct rb_root fixed_allocs; /* Outstanding fixed allocations. 
*/ - - struct list_head co_list; - - /* - * Impose an upper bound on the maximum order. - */ -#define GPU_BALLOC_ORDER_LIST_LEN (GPU_BALLOC_MAX_ORDER + 1) - - struct list_head buddy_list[GPU_BALLOC_ORDER_LIST_LEN]; - u64 buddy_list_len[GPU_BALLOC_ORDER_LIST_LEN]; - u64 buddy_list_split[GPU_BALLOC_ORDER_LIST_LEN]; - u64 buddy_list_alloced[GPU_BALLOC_ORDER_LIST_LEN]; - - /* - * This is for when the allocator is managing a GVA space (the - * GPU_ALLOC_GVA_SPACE bit is set in @flags). This requires - * that we group like sized allocations into PDE blocks. - */ - u64 pte_blk_order; - - int initialized; - int alloc_made; /* True after the first alloc. */ - - u64 flags; - - u64 bytes_alloced; - u64 bytes_alloced_real; - u64 bytes_freed; -}; - -static inline struct gk20a_buddy_allocator *buddy_allocator( - struct gk20a_allocator *a) -{ - return (struct gk20a_buddy_allocator *)(a)->priv; -} - -static inline struct list_head *balloc_get_order_list( - struct gk20a_buddy_allocator *a, int order) -{ - return &a->buddy_list[order]; -} - -static inline u64 balloc_order_to_len(struct gk20a_buddy_allocator *a, - int order) -{ - return (1 << order) * a->blk_size; -} - -static inline u64 balloc_base_shift(struct gk20a_buddy_allocator *a, - u64 base) -{ - return base - a->start; -} - -static inline u64 balloc_base_unshift(struct gk20a_buddy_allocator *a, - u64 base) -{ - return base + a->start; -} - -static inline struct gk20a_allocator *balloc_owner( - struct gk20a_buddy_allocator *a) -{ - return a->owner; -} - -#endif diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 136c28d0..be01e0e9 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -976,7 +976,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force) memset(&ch->ramfc, 0, sizeof(struct mem_desc_sub)); gk20a_gmmu_unmap_free(ch_vm, &ch->gpfifo.mem); - nvgpu_free(ch->gpfifo.pipe); + nvgpu_kfree(ch->gpfifo.pipe); memset(&ch->gpfifo, 0, sizeof(struct gpfifo_desc)); #if defined(CONFIG_GK20A_CYCLE_STATS) @@ -1778,7 +1778,7 @@ int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c, } if (c->gpfifo.mem.aperture == APERTURE_VIDMEM || g->mm.force_pramin) { - c->gpfifo.pipe = nvgpu_alloc( + c->gpfifo.pipe = nvgpu_kalloc( gpfifo_size * sizeof(struct nvgpu_gpfifo), false); if (!c->gpfifo.pipe) { @@ -1850,7 +1850,7 @@ clean_up_sync: c->sync = NULL; } clean_up_unmap: - nvgpu_free(c->gpfifo.pipe); + nvgpu_kfree(c->gpfifo.pipe); gk20a_gmmu_unmap_free(ch_vm, &c->gpfifo.mem); clean_up: memset(&c->gpfifo, 0, sizeof(struct gpfifo_desc)); @@ -1980,12 +1980,12 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c, if (!g) { size = count * sizeof(struct nvgpu_gpfifo); if (size) { - g = nvgpu_alloc(size, false); + g = nvgpu_kalloc(size, false); if (!g) return; if (copy_from_user(g, user_gpfifo, size)) { - nvgpu_free(g); + nvgpu_kfree(g); return; } } @@ -1997,7 +1997,7 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c, trace_write_pushbuffer(c, gp); if (gpfifo_allocated) - nvgpu_free(g); + nvgpu_kfree(g); } static void gk20a_channel_timeout_start(struct channel_gk20a *ch, diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 0a0d94b7..697d1603 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -143,7 +143,7 @@ struct channel_gk20a { struct list_head ch_entry; /* channel's entry in TSG */ struct channel_gk20a_joblist joblist; - struct 
gk20a_allocator fence_allocator; + struct nvgpu_allocator fence_allocator; struct vm_gk20a *vm; diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c index e5529295..ac96036f 100644 --- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c @@ -815,7 +815,7 @@ static int nvgpu_dbg_gpu_ioctl_access_fb_memory(struct dbg_session_gk20a *dbg_s, goto fail_dmabuf_put; } - buffer = nvgpu_alloc(access_limit_size, true); + buffer = nvgpu_kalloc(access_limit_size, true); if (!buffer) { err = -ENOMEM; goto fail_dmabuf_put; @@ -861,7 +861,7 @@ static int nvgpu_dbg_gpu_ioctl_access_fb_memory(struct dbg_session_gk20a *dbg_s, fail_idle: gk20a_idle(g->dev); fail_free_buffer: - nvgpu_free(buffer); + nvgpu_kfree(buffer); fail_dmabuf_put: dma_buf_put(dmabuf); diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c index b84db933..8fa108c2 100644 --- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c @@ -477,7 +477,7 @@ void gk20a_debug_init(struct device *dev, const char *debugfs_symlink) gk20a_railgating_debugfs_init(g->dev); gk20a_cde_debugfs_init(g->dev); gk20a_ce_debugfs_init(g->dev); - gk20a_alloc_debugfs_init(g->dev); + nvgpu_alloc_debugfs_init(g->dev); gk20a_mm_debugfs_init(g->dev); gk20a_fifo_debugfs_init(g->dev); gk20a_sched_debugfs_init(g->dev); diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c index 323caa8f..b8a1dcbc 100644 --- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.c @@ -49,8 +49,8 @@ static void gk20a_fence_free(struct kref *ref) gk20a_semaphore_put(f->semaphore); if (f->allocator) { - if (gk20a_alloc_initialized(f->allocator)) - gk20a_free(f->allocator, (size_t)f); + if (nvgpu_alloc_initialized(f->allocator)) + nvgpu_free(f->allocator, (size_t)f); } else kfree(f); } @@ -129,7 +129,7 @@ int gk20a_alloc_fence_pool(struct channel_gk20a *c, unsigned int count) if (!fence_pool) return -ENOMEM; - err = gk20a_lockless_allocator_init(c->g, &c->fence_allocator, + err = nvgpu_lockless_allocator_init(c->g, &c->fence_allocator, "fence_pool", (size_t)fence_pool, size, sizeof(struct gk20a_fence), 0); if (err) @@ -144,11 +144,11 @@ fail: void gk20a_free_fence_pool(struct channel_gk20a *c) { - if (gk20a_alloc_initialized(&c->fence_allocator)) { + if (nvgpu_alloc_initialized(&c->fence_allocator)) { void *base = (void *)(uintptr_t) - gk20a_alloc_base(&c->fence_allocator); + nvgpu_alloc_base(&c->fence_allocator); - gk20a_alloc_destroy(&c->fence_allocator); + nvgpu_alloc_destroy(&c->fence_allocator); vfree(base); } } @@ -158,9 +158,9 @@ struct gk20a_fence *gk20a_alloc_fence(struct channel_gk20a *c) struct gk20a_fence *fence = NULL; if (channel_gk20a_is_prealloc_enabled(c)) { - if (gk20a_alloc_initialized(&c->fence_allocator)) { + if (nvgpu_alloc_initialized(&c->fence_allocator)) { fence = (struct gk20a_fence *)(uintptr_t) - gk20a_alloc(&c->fence_allocator, + nvgpu_alloc(&c->fence_allocator, sizeof(struct gk20a_fence)); /* clear the node and reset the allocator pointer */ diff --git a/drivers/gpu/nvgpu/gk20a/fence_gk20a.h b/drivers/gpu/nvgpu/gk20a/fence_gk20a.h index beba761a..f38fcbe7 100644 --- a/drivers/gpu/nvgpu/gk20a/fence_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fence_gk20a.h @@ -47,7 +47,7 @@ struct gk20a_fence { u32 syncpt_value; /* Valid for fences part of a pre-allocated fence pool */ - struct gk20a_allocator *allocator; + struct nvgpu_allocator *allocator; }; /* Fences can be created 
from semaphores or syncpoint (id, value) pairs */ diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index b1e90bd8..753f031a 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -43,6 +43,8 @@ #include #include +#include + #include "gk20a.h" #include "nvgpu_common.h" #include "debug_gk20a.h" @@ -60,7 +62,6 @@ #include "gk20a_scale.h" #include "ctxsw_trace_gk20a.h" #include "dbg_gpu_gk20a.h" -#include "gk20a_allocator.h" #include "hal.h" #include "vgpu/vgpu.h" #include "pci.h" diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c deleted file mode 100644 index 3129b07c..00000000 --- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c +++ /dev/null @@ -1,211 +0,0 @@ -/* - * gk20a allocator - * - * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include - -#include "gk20a.h" -#include "mm_gk20a.h" -#include "platform_gk20a.h" -#include "gk20a_allocator.h" - -u32 gk20a_alloc_tracing_on; - -u64 gk20a_alloc_length(struct gk20a_allocator *a) -{ - if (a->ops->length) - return a->ops->length(a); - - return 0; -} - -u64 gk20a_alloc_base(struct gk20a_allocator *a) -{ - if (a->ops->base) - return a->ops->base(a); - - return 0; -} - -u64 gk20a_alloc_initialized(struct gk20a_allocator *a) -{ - if (!a->ops || !a->ops->inited) - return 0; - - return a->ops->inited(a); -} - -u64 gk20a_alloc_end(struct gk20a_allocator *a) -{ - if (a->ops->end) - return a->ops->end(a); - - return 0; -} - -u64 gk20a_alloc_space(struct gk20a_allocator *a) -{ - if (a->ops->space) - return a->ops->space(a); - - return 0; -} - -u64 gk20a_alloc(struct gk20a_allocator *a, u64 len) -{ - return a->ops->alloc(a, len); -} - -void gk20a_free(struct gk20a_allocator *a, u64 addr) -{ - a->ops->free(a, addr); -} - -u64 gk20a_alloc_fixed(struct gk20a_allocator *a, u64 base, u64 len) -{ - if (a->ops->alloc_fixed) - return a->ops->alloc_fixed(a, base, len); - - return 0; -} - -void gk20a_free_fixed(struct gk20a_allocator *a, u64 base, u64 len) -{ - /* - * If this operation is not defined for the allocator then just do - * nothing. The alternative would be to fall back on the regular - * free but that may be harmful in unexpected ways. - */ - if (a->ops->free_fixed) - a->ops->free_fixed(a, base, len); -} - -int gk20a_alloc_reserve_carveout(struct gk20a_allocator *a, - struct gk20a_alloc_carveout *co) -{ - if (a->ops->reserve_carveout) - return a->ops->reserve_carveout(a, co); - - return -ENODEV; -} - -void gk20a_alloc_release_carveout(struct gk20a_allocator *a, - struct gk20a_alloc_carveout *co) -{ - if (a->ops->release_carveout) - a->ops->release_carveout(a, co); -} - -void gk20a_alloc_destroy(struct gk20a_allocator *a) -{ - a->ops->fini(a); - memset(a, 0, sizeof(*a)); -} - -/* - * Handle the common init stuff for a gk20a_allocator. 
- */ -int __gk20a_alloc_common_init(struct gk20a_allocator *a, - const char *name, void *priv, bool dbg, - const struct gk20a_allocator_ops *ops) -{ - if (!ops) - return -EINVAL; - - /* - * This is the bare minimum operations required for a sensible - * allocator. - */ - if (!ops->alloc || !ops->free || !ops->fini) - return -EINVAL; - - a->ops = ops; - a->priv = priv; - a->debug = dbg; - - mutex_init(&a->lock); - - strlcpy(a->name, name, sizeof(a->name)); - - return 0; -} - -void gk20a_alloc_print_stats(struct gk20a_allocator *__a, - struct seq_file *s, int lock) -{ - __a->ops->print_stats(__a, s, lock); -} - -#ifdef CONFIG_DEBUG_FS -static int __alloc_show(struct seq_file *s, void *unused) -{ - struct gk20a_allocator *a = s->private; - - gk20a_alloc_print_stats(a, s, 1); - - return 0; -} - -static int __alloc_open(struct inode *inode, struct file *file) -{ - return single_open(file, __alloc_show, inode->i_private); -} - -static const struct file_operations __alloc_fops = { - .open = __alloc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - -void gk20a_init_alloc_debug(struct gk20a *g, struct gk20a_allocator *a) -{ -#ifdef CONFIG_DEBUG_FS - if (!g->debugfs_allocators) - return; - - a->debugfs_entry = debugfs_create_file(a->name, S_IRUGO, - g->debugfs_allocators, - a, &__alloc_fops); -#endif -} - -void gk20a_fini_alloc_debug(struct gk20a_allocator *a) -{ -#ifdef CONFIG_DEBUG_FS - if (!IS_ERR_OR_NULL(a->debugfs_entry)) - debugfs_remove(a->debugfs_entry); -#endif -} - -void gk20a_alloc_debugfs_init(struct device *dev) -{ -#ifdef CONFIG_DEBUG_FS - struct gk20a_platform *platform = dev_get_drvdata(dev); - struct dentry *gpu_root = platform->debugfs; - struct gk20a *g = get_gk20a(dev); - - g->debugfs_allocators = debugfs_create_dir("allocators", gpu_root); - if (IS_ERR_OR_NULL(g->debugfs_allocators)) - return; - - debugfs_create_u32("tracing", 0664, g->debugfs_allocators, - &gk20a_alloc_tracing_on); -#endif -} diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h deleted file mode 100644 index b12926b3..00000000 --- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifndef GK20A_ALLOCATOR_H -#define GK20A_ALLOCATOR_H - -#include -#include -#include - -/* #define ALLOCATOR_DEBUG */ - -struct gk20a_allocator; -struct gk20a_alloc_carveout; -struct vm_gk20a; -struct gk20a; - -/* - * Operations for an allocator to implement. - */ -struct gk20a_allocator_ops { - u64 (*alloc)(struct gk20a_allocator *allocator, u64 len); - void (*free)(struct gk20a_allocator *allocator, u64 addr); - - /* - * Special interface to allocate a memory region with a specific - * starting address. Yikes. Note: if free() works for freeing both - * regular and fixed allocations then free_fixed() does not need to - * be implemented. 
This behavior exists for legacy reasons and should - * not be propagated to new allocators. - */ - u64 (*alloc_fixed)(struct gk20a_allocator *allocator, - u64 base, u64 len); - void (*free_fixed)(struct gk20a_allocator *allocator, - u64 base, u64 len); - - /* - * Allow allocators to reserve space for carveouts. - */ - int (*reserve_carveout)(struct gk20a_allocator *allocator, - struct gk20a_alloc_carveout *co); - void (*release_carveout)(struct gk20a_allocator *allocator, - struct gk20a_alloc_carveout *co); - - /* - * Returns info about the allocator. - */ - u64 (*base)(struct gk20a_allocator *allocator); - u64 (*length)(struct gk20a_allocator *allocator); - u64 (*end)(struct gk20a_allocator *allocator); - int (*inited)(struct gk20a_allocator *allocator); - u64 (*space)(struct gk20a_allocator *allocator); - - /* Destructor. */ - void (*fini)(struct gk20a_allocator *allocator); - - /* Debugging. */ - void (*print_stats)(struct gk20a_allocator *allocator, - struct seq_file *s, int lock); -}; - -struct gk20a_allocator { - char name[32]; - struct mutex lock; - - void *priv; - const struct gk20a_allocator_ops *ops; - - struct dentry *debugfs_entry; - bool debug; /* Control for debug msgs. */ -}; - -struct gk20a_alloc_carveout { - const char *name; - u64 base; - u64 length; - - struct gk20a_allocator *allocator; - - /* - * For usage by the allocator implementation. - */ - struct list_head co_entry; -}; - -#define GK20A_CARVEOUT(__name, __base, __length) \ - { \ - .name = (__name), \ - .base = (__base), \ - .length = (__length) \ - } - -/* - * These are the available allocator flags. - * - * GPU_ALLOC_GVA_SPACE - * - * This flag makes sense for the buddy allocator only. It specifies that the - * allocator will be used for managing a GVA space. When managing GVA spaces - * special care has to be taken to ensure that allocations of similar PTE - * sizes are placed in the same PDE block. This allows the higher level - * code to skip defining both small and large PTE tables for every PDE. That - * can save considerable memory for address spaces that have a lot of - * allocations. - * - * GPU_ALLOC_NO_ALLOC_PAGE - * - * For any allocator that needs to manage a resource in a latency critical - * path this flag specifies that the allocator should not use any kmalloc() - * or similar functions during normal operation. Initialization routines - * may still use kmalloc(). This prevents the possibility of long waits for - * pages when using alloc_page(). Currently only the bitmap allocator - * implements this functionality. - * - * Also note that if you accept this flag then you must also define the - * free_fixed() function. Since no meta-data is allocated to help free - * allocations you need to keep track of the meta-data yourself (in this - * case the base and length of the allocation as opposed to just the base - * of the allocation). - * - * GPU_ALLOC_4K_VIDMEM_PAGES - * - * We manage vidmem pages at a large page granularity for performance - * reasons; however, this can lead to wasting memory. For page allocators - * setting this flag will tell the allocator to manage pools of 4K pages - * inside internally allocated large pages. - * - * Currently this flag is ignored since the only usage of the page allocator - * uses a 4K block size already. However, this flag has been reserved since - * it will be necessary in the future. - * - * GPU_ALLOC_FORCE_CONTIG - * - * Force allocations to be contiguous. Currently only relevant for page - * allocators since all other allocators are naturally contiguous. 
- * - * GPU_ALLOC_NO_SCATTER_GATHER - * - * The page allocator normally returns a scatter gather data structure for - * allocations (to handle discontiguous pages). However, at times that can - * be annoying so this flag forces the page allocator to return a u64 - * pointing to the allocation base (requires GPU_ALLOC_FORCE_CONTIG to be - * set as well). - */ -#define GPU_ALLOC_GVA_SPACE 0x1 -#define GPU_ALLOC_NO_ALLOC_PAGE 0x2 -#define GPU_ALLOC_4K_VIDMEM_PAGES 0x4 -#define GPU_ALLOC_FORCE_CONTIG 0x8 -#define GPU_ALLOC_NO_SCATTER_GATHER 0x10 - -static inline void alloc_lock(struct gk20a_allocator *a) -{ - mutex_lock(&a->lock); -} - -static inline void alloc_unlock(struct gk20a_allocator *a) -{ - mutex_unlock(&a->lock); -} - -/* - * Buddy allocator specific initializers. - */ -int __gk20a_buddy_allocator_init(struct gk20a *g, struct gk20a_allocator *a, - struct vm_gk20a *vm, const char *name, - u64 base, u64 size, u64 blk_size, - u64 max_order, u64 flags); -int gk20a_buddy_allocator_init(struct gk20a *g, struct gk20a_allocator *a, - const char *name, u64 base, u64 size, - u64 blk_size, u64 flags); - -/* - * Bitmap initializers. - */ -int gk20a_bitmap_allocator_init(struct gk20a *g, struct gk20a_allocator *a, - const char *name, u64 base, u64 length, - u64 blk_size, u64 flags); - -/* - * Page allocator initializers. - */ -int gk20a_page_allocator_init(struct gk20a *g, struct gk20a_allocator *a, - const char *name, u64 base, u64 length, - u64 blk_size, u64 flags); - -/* - * Lockless allocatior initializers. - * Note: This allocator can only allocate fixed-size structures of a - * pre-defined size. - */ -int gk20a_lockless_allocator_init(struct gk20a *g, struct gk20a_allocator *a, - const char *name, u64 base, u64 length, - u64 struct_size, u64 flags); - -#define GPU_BALLOC_MAX_ORDER 31 - -/* - * Allocator APIs. - */ -u64 gk20a_alloc(struct gk20a_allocator *allocator, u64 len); -void gk20a_free(struct gk20a_allocator *allocator, u64 addr); - -u64 gk20a_alloc_fixed(struct gk20a_allocator *allocator, u64 base, u64 len); -void gk20a_free_fixed(struct gk20a_allocator *allocator, u64 base, u64 len); - -int gk20a_alloc_reserve_carveout(struct gk20a_allocator *a, - struct gk20a_alloc_carveout *co); -void gk20a_alloc_release_carveout(struct gk20a_allocator *a, - struct gk20a_alloc_carveout *co); - -u64 gk20a_alloc_base(struct gk20a_allocator *a); -u64 gk20a_alloc_length(struct gk20a_allocator *a); -u64 gk20a_alloc_end(struct gk20a_allocator *a); -u64 gk20a_alloc_initialized(struct gk20a_allocator *a); -u64 gk20a_alloc_space(struct gk20a_allocator *a); - -void gk20a_alloc_destroy(struct gk20a_allocator *allocator); - -void gk20a_alloc_print_stats(struct gk20a_allocator *a, - struct seq_file *s, int lock); - -/* - * Common functionality for the internals of the allocators. - */ -void gk20a_init_alloc_debug(struct gk20a *g, struct gk20a_allocator *a); -void gk20a_fini_alloc_debug(struct gk20a_allocator *a); - -int __gk20a_alloc_common_init(struct gk20a_allocator *a, - const char *name, void *priv, bool dbg, - const struct gk20a_allocator_ops *ops); - -static inline void gk20a_alloc_enable_dbg(struct gk20a_allocator *a) -{ - a->debug = true; -} - -static inline void gk20a_alloc_disable_dbg(struct gk20a_allocator *a) -{ - a->debug = false; -} - -/* - * Debug stuff. 
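A sketch of the typical call sequence against this interface (carried forward under the nvgpu_ prefix by this patch). The name, base and size values below are placeholders and the SZ_* constants come from linux/sizes.h; treat it as an illustration rather than driver code.

static int example_buddy_usage(struct gk20a *g)
{
	struct gk20a_allocator al;
	u64 addr;
	int err;

	/* A 1MB space of 4K blocks; flags 0, so no GVA handling. */
	err = gk20a_buddy_allocator_init(g, &al, "example", SZ_4K, SZ_1M,
					 SZ_4K, 0);
	if (err)
		return err;

	addr = gk20a_alloc(&al, SZ_64K);
	if (!addr) {			/* 0 signals failure */
		gk20a_alloc_destroy(&al);
		return -ENOMEM;
	}

	gk20a_free(&al, addr);
	gk20a_alloc_destroy(&al);

	return 0;
}

Fixed allocations and carveouts follow the same pattern through gk20a_alloc_fixed() and gk20a_alloc_reserve_carveout().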
- */ -extern u32 gk20a_alloc_tracing_on; - -void gk20a_alloc_debugfs_init(struct device *dev); - -#define gk20a_alloc_trace_func() \ - do { \ - if (gk20a_alloc_tracing_on) \ - trace_printk("%s\n", __func__); \ - } while (0) - -#define gk20a_alloc_trace_func_done() \ - do { \ - if (gk20a_alloc_tracing_on) \ - trace_printk("%s_done\n", __func__); \ - } while (0) - -#define __alloc_pstat(seq, allocator, fmt, arg...) \ - do { \ - if (s) \ - seq_printf(seq, fmt, ##arg); \ - else \ - alloc_dbg(allocator, fmt, ##arg); \ - } while (0) - -#define __alloc_dbg(a, fmt, arg...) \ - pr_info("%-25s %25s() " fmt, (a)->name, __func__, ##arg) - -#if defined(ALLOCATOR_DEBUG) -/* - * Always print the debug messages... - */ -#define alloc_dbg(a, fmt, arg...) __alloc_dbg(a, fmt, ##arg) -#else -/* - * Only print debug messages if debug is enabled for a given allocator. - */ -#define alloc_dbg(a, fmt, arg...) \ - do { \ - if ((a)->debug) \ - __alloc_dbg((a), fmt, ##arg); \ - } while (0) - -#endif - -#endif /* GK20A_ALLOCATOR_H */ diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_bitmap.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator_bitmap.c deleted file mode 100644 index f98e0782..00000000 --- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_bitmap.c +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include - -#include "gk20a_allocator.h" -#include "bitmap_allocator_priv.h" - -static struct kmem_cache *meta_data_cache; /* slab cache for meta data. */ -static DEFINE_MUTEX(meta_data_cache_lock); - -static u64 gk20a_bitmap_alloc_length(struct gk20a_allocator *a) -{ - struct gk20a_bitmap_allocator *ba = a->priv; - - return ba->length; -} - -static u64 gk20a_bitmap_alloc_base(struct gk20a_allocator *a) -{ - struct gk20a_bitmap_allocator *ba = a->priv; - - return ba->base; -} - -static int gk20a_bitmap_alloc_inited(struct gk20a_allocator *a) -{ - struct gk20a_bitmap_allocator *ba = a->priv; - int inited = ba->inited; - - rmb(); - return inited; -} - -static u64 gk20a_bitmap_alloc_end(struct gk20a_allocator *a) -{ - struct gk20a_bitmap_allocator *ba = a->priv; - - return ba->base + ba->length; -} - -static u64 gk20a_bitmap_alloc_fixed(struct gk20a_allocator *__a, - u64 base, u64 len) -{ - struct gk20a_bitmap_allocator *a = bitmap_allocator(__a); - u64 blks, offs, ret; - - /* Compute the bit offset and make sure it's aligned to a block. */ - offs = base >> a->blk_shift; - if (offs * a->blk_size != base) - return 0; - - offs -= a->bit_offs; - - blks = len >> a->blk_shift; - if (blks * a->blk_size != len) - blks++; - - alloc_lock(__a); - - /* Check if the space requested is already occupied. 
*/ - ret = bitmap_find_next_zero_area(a->bitmap, a->num_bits, offs, blks, 0); - if (ret != offs) - goto fail; - - bitmap_set(a->bitmap, offs, blks); - - a->bytes_alloced += blks * a->blk_size; - a->nr_fixed_allocs++; - alloc_unlock(__a); - - alloc_dbg(__a, "Alloc-fixed 0x%-10llx 0x%-5llx [bits=0x%llx (%llu)]\n", - base, len, blks, blks); - return base; - -fail: - alloc_unlock(__a); - alloc_dbg(__a, "Alloc-fixed failed! (0x%llx)\n", base); - return 0; -} - -/* - * Two possibilities for this function: either we are freeing a fixed allocation - * or we are freeing a regular alloc but with GPU_ALLOC_NO_ALLOC_PAGE defined. - * - * Note: this function won't do much error checking. Thus you could really - * confuse the allocator if you misuse this function. - */ -static void gk20a_bitmap_free_fixed(struct gk20a_allocator *__a, - u64 base, u64 len) -{ - struct gk20a_bitmap_allocator *a = bitmap_allocator(__a); - u64 blks, offs; - - offs = base >> a->blk_shift; - if (WARN_ON(offs * a->blk_size != base)) - return; - - offs -= a->bit_offs; - - blks = len >> a->blk_shift; - if (blks * a->blk_size != len) - blks++; - - alloc_lock(__a); - bitmap_clear(a->bitmap, offs, blks); - a->bytes_freed += blks * a->blk_size; - alloc_unlock(__a); - - alloc_dbg(__a, "Free-fixed 0x%-10llx 0x%-5llx [bits=0x%llx (%llu)]\n", - base, len, blks, blks); -} - -/* - * Add the passed alloc to the tree of stored allocations. - */ -static void insert_alloc_metadata(struct gk20a_bitmap_allocator *a, - struct gk20a_bitmap_alloc *alloc) -{ - struct rb_node **new = &a->allocs.rb_node; - struct rb_node *parent = NULL; - struct gk20a_bitmap_alloc *tmp; - - while (*new) { - tmp = container_of(*new, struct gk20a_bitmap_alloc, - alloc_entry); - - parent = *new; - if (alloc->base < tmp->base) - new = &((*new)->rb_left); - else if (alloc->base > tmp->base) - new = &((*new)->rb_right); - else { - WARN_ON("Duplicate entries in RB alloc tree!\n"); - return; - } - } - - rb_link_node(&alloc->alloc_entry, parent, new); - rb_insert_color(&alloc->alloc_entry, &a->allocs); -} - -/* - * Find and remove meta-data from the outstanding allocations. - */ -static struct gk20a_bitmap_alloc *find_alloc_metadata( - struct gk20a_bitmap_allocator *a, u64 addr) -{ - struct rb_node *node = a->allocs.rb_node; - struct gk20a_bitmap_alloc *alloc; - - while (node) { - alloc = container_of(node, struct gk20a_bitmap_alloc, - alloc_entry); - - if (addr < alloc->base) - node = node->rb_left; - else if (addr > alloc->base) - node = node->rb_right; - else - break; - } - - if (!node) - return NULL; - - rb_erase(node, &a->allocs); - - return alloc; -} - -/* - * Tree of alloc meta data stores the address of the alloc not the bit offset. - */ -static int __gk20a_bitmap_store_alloc(struct gk20a_bitmap_allocator *a, - u64 addr, u64 len) -{ - struct gk20a_bitmap_alloc *alloc = - kmem_cache_alloc(meta_data_cache, GFP_KERNEL); - - if (!alloc) - return -ENOMEM; - - alloc->base = addr; - alloc->length = len; - - insert_alloc_metadata(a, alloc); - - return 0; -} - -/* - * @len is in bytes. This routine will figure out the right number of bits to - * actually allocate. The return is the address in bytes as well. - */ -static u64 gk20a_bitmap_alloc(struct gk20a_allocator *__a, u64 len) -{ - u64 blks, addr; - unsigned long offs, adjusted_offs, limit; - struct gk20a_bitmap_allocator *a = bitmap_allocator(__a); - - blks = len >> a->blk_shift; - - if (blks * a->blk_size != len) - blks++; - - alloc_lock(__a); - - /* - * First look from next_blk and onwards... 
- */ - offs = bitmap_find_next_zero_area(a->bitmap, a->num_bits, - a->next_blk, blks, 0); - if (offs >= a->num_bits) { - /* - * If that didn't work try the remaining area. Since there can - * be available space that spans across a->next_blk we need to - * search up to the first set bit after that. - */ - limit = find_next_bit(a->bitmap, a->num_bits, a->next_blk); - offs = bitmap_find_next_zero_area(a->bitmap, limit, - 0, blks, 0); - if (offs >= a->next_blk) - goto fail; - } - - bitmap_set(a->bitmap, offs, blks); - a->next_blk = offs + blks; - - adjusted_offs = offs + a->bit_offs; - addr = ((u64)adjusted_offs) * a->blk_size; - - /* - * Only do meta-data storage if we are allowed to allocate storage for - * that meta-data. The issue with using kmalloc() and friends is that - * in latency and success critical paths an alloc_page() call can either - * sleep for potentially a long time or, assuming GFP_ATOMIC, fail. - * Since we might not want either of these possibilities assume that the - * caller will keep what data it needs around to successfully free this - * allocation. - */ - if (!(a->flags & GPU_ALLOC_NO_ALLOC_PAGE) && - __gk20a_bitmap_store_alloc(a, addr, blks * a->blk_size)) - goto fail_reset_bitmap; - - alloc_dbg(__a, "Alloc 0x%-10llx 0x%-5llx [bits=0x%llx (%llu)]\n", - addr, len, blks, blks); - - a->nr_allocs++; - a->bytes_alloced += (blks * a->blk_size); - alloc_unlock(__a); - - return addr; - -fail_reset_bitmap: - bitmap_clear(a->bitmap, offs, blks); -fail: - a->next_blk = 0; - alloc_unlock(__a); - alloc_dbg(__a, "Alloc failed!\n"); - return 0; -} - -static void gk20a_bitmap_free(struct gk20a_allocator *__a, u64 addr) -{ - struct gk20a_bitmap_allocator *a = bitmap_allocator(__a); - struct gk20a_bitmap_alloc *alloc = NULL; - u64 offs, adjusted_offs, blks; - - alloc_lock(__a); - - if (a->flags & GPU_ALLOC_NO_ALLOC_PAGE) { - WARN(1, "Using wrong free for NO_ALLOC_PAGE bitmap allocator"); - goto done; - } - - alloc = find_alloc_metadata(a, addr); - if (!alloc) - goto done; - - /* - * Address comes from adjusted offset (i.e the bit offset with - * a->bit_offs added. So start with that and then work out the real - * offs into the bitmap. - */ - adjusted_offs = addr >> a->blk_shift; - offs = adjusted_offs - a->bit_offs; - blks = alloc->length >> a->blk_shift; - - bitmap_clear(a->bitmap, offs, blks); - alloc_dbg(__a, "Free 0x%-10llx\n", addr); - - a->bytes_freed += alloc->length; - -done: - kfree(alloc); - alloc_unlock(__a); -} - -static void gk20a_bitmap_alloc_destroy(struct gk20a_allocator *__a) -{ - struct gk20a_bitmap_allocator *a = bitmap_allocator(__a); - struct gk20a_bitmap_alloc *alloc; - struct rb_node *node; - - /* - * Kill any outstanding allocations. - */ - while ((node = rb_first(&a->allocs)) != NULL) { - alloc = container_of(node, struct gk20a_bitmap_alloc, - alloc_entry); - - rb_erase(node, &a->allocs); - kfree(alloc); - } - - kfree(a->bitmap); - kfree(a); -} - -static void gk20a_bitmap_print_stats(struct gk20a_allocator *__a, - struct seq_file *s, int lock) -{ - struct gk20a_bitmap_allocator *a = bitmap_allocator(__a); - - __alloc_pstat(s, __a, "Bitmap allocator params:\n"); - __alloc_pstat(s, __a, " start = 0x%llx\n", a->base); - __alloc_pstat(s, __a, " end = 0x%llx\n", a->base + a->length); - __alloc_pstat(s, __a, " blks = 0x%llx\n", a->num_bits); - - /* Actual stats. 
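The length and address conversions used throughout this allocator can be checked in isolation. Below is a standalone sketch (plain userspace C, not kernel code) of the round-up-to-blocks and bit-offset-to-address math, with placeholder numbers.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t blk_size = 4096, blk_shift = 12;
	uint64_t base = 0x100000, bit_offs = base >> blk_shift;
	uint64_t len = 10000;
	uint64_t blks, offs, addr;

	/* Round a byte length up to whole blocks, as the alloc paths do. */
	blks = len >> blk_shift;
	if (blks * blk_size != len)
		blks++;

	/* Turn a bit found by the bitmap search back into a byte address. */
	offs = 5;
	addr = (offs + bit_offs) * blk_size;

	printf("blks=%llu addr=0x%llx\n",
	       (unsigned long long)blks, (unsigned long long)addr);
	return 0;
}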
*/ - __alloc_pstat(s, __a, "Stats:\n"); - __alloc_pstat(s, __a, " Number allocs = 0x%llx\n", a->nr_allocs); - __alloc_pstat(s, __a, " Number fixed = 0x%llx\n", a->nr_fixed_allocs); - __alloc_pstat(s, __a, " Bytes alloced = 0x%llx\n", a->bytes_alloced); - __alloc_pstat(s, __a, " Bytes freed = 0x%llx\n", a->bytes_freed); - __alloc_pstat(s, __a, " Outstanding = 0x%llx\n", - a->bytes_alloced - a->bytes_freed); -} - -static const struct gk20a_allocator_ops bitmap_ops = { - .alloc = gk20a_bitmap_alloc, - .free = gk20a_bitmap_free, - - .alloc_fixed = gk20a_bitmap_alloc_fixed, - .free_fixed = gk20a_bitmap_free_fixed, - - .base = gk20a_bitmap_alloc_base, - .length = gk20a_bitmap_alloc_length, - .end = gk20a_bitmap_alloc_end, - .inited = gk20a_bitmap_alloc_inited, - - .fini = gk20a_bitmap_alloc_destroy, - - .print_stats = gk20a_bitmap_print_stats, -}; - - -int gk20a_bitmap_allocator_init(struct gk20a *g, struct gk20a_allocator *__a, - const char *name, u64 base, u64 length, - u64 blk_size, u64 flags) -{ - int err; - struct gk20a_bitmap_allocator *a; - - mutex_lock(&meta_data_cache_lock); - if (!meta_data_cache) - meta_data_cache = KMEM_CACHE(gk20a_bitmap_alloc, 0); - mutex_unlock(&meta_data_cache_lock); - - if (!meta_data_cache) - return -ENOMEM; - - if (WARN_ON(blk_size & (blk_size - 1))) - return -EINVAL; - - /* - * blk_size must be a power-of-2; base length also need to be aligned - * to blk_size. - */ - if (blk_size & (blk_size - 1) || - base & (blk_size - 1) || length & (blk_size - 1)) - return -EINVAL; - - if (base == 0) { - base = blk_size; - length -= blk_size; - } - - a = kzalloc(sizeof(struct gk20a_bitmap_allocator), GFP_KERNEL); - if (!a) - return -ENOMEM; - - err = __gk20a_alloc_common_init(__a, name, a, false, &bitmap_ops); - if (err) - goto fail; - - a->base = base; - a->length = length; - a->blk_size = blk_size; - a->blk_shift = __ffs(a->blk_size); - a->num_bits = length >> a->blk_shift; - a->bit_offs = a->base >> a->blk_shift; - a->flags = flags; - - a->bitmap = kcalloc(BITS_TO_LONGS(a->num_bits), sizeof(*a->bitmap), - GFP_KERNEL); - if (!a->bitmap) - goto fail; - - wmb(); - a->inited = true; - - gk20a_init_alloc_debug(g, __a); - alloc_dbg(__a, "New allocator: type bitmap\n"); - alloc_dbg(__a, " base 0x%llx\n", a->base); - alloc_dbg(__a, " bit_offs 0x%llx\n", a->bit_offs); - alloc_dbg(__a, " size 0x%llx\n", a->length); - alloc_dbg(__a, " blk_size 0x%llx\n", a->blk_size); - alloc_dbg(__a, " flags 0x%llx\n", a->flags); - - return 0; - -fail: - kfree(a); - return err; -} diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_buddy.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator_buddy.c deleted file mode 100644 index 3715e9f8..00000000 --- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_buddy.c +++ /dev/null @@ -1,1327 +0,0 @@ -/* - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -#include -#include - -#include "mm_gk20a.h" -#include "platform_gk20a.h" -#include "gk20a_allocator.h" -#include "buddy_allocator_priv.h" - -static struct kmem_cache *buddy_cache; /* slab cache for meta data. */ - -/* Some other buddy allocator functions. */ -static struct gk20a_buddy *balloc_free_buddy(struct gk20a_buddy_allocator *a, - u64 addr); -static void balloc_coalesce(struct gk20a_buddy_allocator *a, - struct gk20a_buddy *b); -static void __balloc_do_free_fixed(struct gk20a_buddy_allocator *a, - struct gk20a_fixed_alloc *falloc); - -/* - * This function is not present in older kernel's list.h code. - */ -#ifndef list_last_entry -#define list_last_entry(ptr, type, member) \ - list_entry((ptr)->prev, type, member) -#endif - -/* - * GPU buddy allocator for various address spaces. - * - * Current limitations: - * o A fixed allocation could potentially be made that borders PDEs with - * different PTE sizes. This would require that fixed buffer to have - * different sized PTEs for different parts of the allocation. Probably - * best to just require PDE alignment for fixed address allocs. - * - * o It is currently possible to make an allocator that has a buddy alignment - * out of sync with the PDE block size alignment. A simple example is a - * 32GB address space starting at byte 1. Every buddy is shifted off by 1 - * which means each buddy corresponf to more than one actual GPU page. The - * best way to fix this is probably just require PDE blocksize alignment - * for the start of the address space. At the moment all allocators are - * easily PDE aligned so this hasn't been a problem. - */ - -/* - * Pick a suitable maximum order for this allocator. - * - * Hueristic: Just guessing that the best max order is the largest single - * block that will fit in the address space. - */ -static void balloc_compute_max_order(struct gk20a_buddy_allocator *a) -{ - u64 true_max_order = ilog2(a->blks); - - if (a->max_order == 0) { - a->max_order = true_max_order; - return; - } - - if (a->max_order > true_max_order) - a->max_order = true_max_order; - if (a->max_order > GPU_BALLOC_MAX_ORDER) - a->max_order = GPU_BALLOC_MAX_ORDER; -} - -/* - * Since we can only allocate in chucks of a->blk_size we need to trim off - * any excess data that is not aligned to a->blk_size. - */ -static void balloc_allocator_align(struct gk20a_buddy_allocator *a) -{ - a->start = ALIGN(a->base, a->blk_size); - WARN_ON(a->start != a->base); - a->end = (a->base + a->length) & ~(a->blk_size - 1); - a->count = a->end - a->start; - a->blks = a->count >> a->blk_shift; -} - -/* - * Pass NULL for parent if you want a top level buddy. - */ -static struct gk20a_buddy *balloc_new_buddy(struct gk20a_buddy_allocator *a, - struct gk20a_buddy *parent, - u64 start, u64 order) -{ - struct gk20a_buddy *new_buddy; - - new_buddy = kmem_cache_alloc(buddy_cache, GFP_KERNEL); - if (!new_buddy) - return NULL; - - memset(new_buddy, 0, sizeof(struct gk20a_buddy)); - - new_buddy->parent = parent; - new_buddy->start = start; - new_buddy->order = order; - new_buddy->end = start + (1 << order) * a->blk_size; - new_buddy->pte_size = BALLOC_PTE_SIZE_ANY; - - return new_buddy; -} - -static void __balloc_buddy_list_add(struct gk20a_buddy_allocator *a, - struct gk20a_buddy *b, - struct list_head *list) -{ - if (buddy_is_in_list(b)) { - alloc_dbg(balloc_owner(a), - "Oops: adding added buddy (%llu:0x%llx)\n", - b->order, b->start); - BUG(); - } - - /* - * Add big PTE blocks to the tail, small to the head for GVA spaces. 
- * This lets the code that checks if there are available blocks check - * without cycling through the entire list. - */ - if (a->flags & GPU_ALLOC_GVA_SPACE && - b->pte_size == gmmu_page_size_big) - list_add_tail(&b->buddy_entry, list); - else - list_add(&b->buddy_entry, list); - - buddy_set_in_list(b); -} - -static void __balloc_buddy_list_rem(struct gk20a_buddy_allocator *a, - struct gk20a_buddy *b) -{ - if (!buddy_is_in_list(b)) { - alloc_dbg(balloc_owner(a), - "Oops: removing removed buddy (%llu:0x%llx)\n", - b->order, b->start); - BUG(); - } - - list_del_init(&b->buddy_entry); - buddy_clr_in_list(b); -} - -/* - * Add a buddy to one of the buddy lists and deal with the necessary - * book keeping. Adds the buddy to the list specified by the buddy's order. - */ -static void balloc_blist_add(struct gk20a_buddy_allocator *a, - struct gk20a_buddy *b) -{ - __balloc_buddy_list_add(a, b, balloc_get_order_list(a, b->order)); - a->buddy_list_len[b->order]++; -} - -static void balloc_blist_rem(struct gk20a_buddy_allocator *a, - struct gk20a_buddy *b) -{ - __balloc_buddy_list_rem(a, b); - a->buddy_list_len[b->order]--; -} - -static u64 balloc_get_order(struct gk20a_buddy_allocator *a, u64 len) -{ - if (len == 0) - return 0; - - len--; - len >>= a->blk_shift; - - return fls(len); -} - -static u64 __balloc_max_order_in(struct gk20a_buddy_allocator *a, - u64 start, u64 end) -{ - u64 size = (end - start) >> a->blk_shift; - - if (size > 0) - return min_t(u64, ilog2(size), a->max_order); - else - return GPU_BALLOC_MAX_ORDER; -} - -/* - * Initialize the buddy lists. - */ -static int balloc_init_lists(struct gk20a_buddy_allocator *a) -{ - int i; - u64 bstart, bend, order; - struct gk20a_buddy *buddy; - - bstart = a->start; - bend = a->end; - - /* First make sure the LLs are valid. */ - for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) - INIT_LIST_HEAD(balloc_get_order_list(a, i)); - - while (bstart < bend) { - order = __balloc_max_order_in(a, bstart, bend); - - buddy = balloc_new_buddy(a, NULL, bstart, order); - if (!buddy) - goto cleanup; - - balloc_blist_add(a, buddy); - bstart += balloc_order_to_len(a, order); - } - - return 0; - -cleanup: - for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) { - if (!list_empty(balloc_get_order_list(a, i))) { - buddy = list_first_entry(balloc_get_order_list(a, i), - struct gk20a_buddy, buddy_entry); - balloc_blist_rem(a, buddy); - kmem_cache_free(buddy_cache, buddy); - } - } - - return -ENOMEM; -} - -/* - * Clean up and destroy the passed allocator. - */ -static void gk20a_buddy_allocator_destroy(struct gk20a_allocator *__a) -{ - int i; - struct rb_node *node; - struct gk20a_buddy *bud; - struct gk20a_fixed_alloc *falloc; - struct gk20a_buddy_allocator *a = __a->priv; - - alloc_lock(__a); - - gk20a_fini_alloc_debug(__a); - - /* - * Free the fixed allocs first. - */ - while ((node = rb_first(&a->fixed_allocs)) != NULL) { - falloc = container_of(node, - struct gk20a_fixed_alloc, alloced_entry); - - rb_erase(node, &a->fixed_allocs); - __balloc_do_free_fixed(a, falloc); - } - - /* - * And now free all outstanding allocations. - */ - while ((node = rb_first(&a->alloced_buddies)) != NULL) { - bud = container_of(node, struct gk20a_buddy, alloced_entry); - balloc_free_buddy(a, bud->start); - balloc_blist_add(a, bud); - balloc_coalesce(a, bud); - } - - /* - * Now clean up the unallocated buddies. 
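The order computation in balloc_get_order() above is small enough to check standalone. A userspace sketch follows, using a GCC builtin for a 64-bit fls() and placeholder sizes.

#include <stdint.h>
#include <stdio.h>

static uint64_t get_order(uint64_t len, uint64_t blk_shift)
{
	if (len == 0)
		return 0;
	len--;
	len >>= blk_shift;
	return len ? 64 - __builtin_clzll(len) : 0;	/* fls() */
}

int main(void)
{
	uint64_t blk_shift = 12;	/* 4K order-0 blocks */

	/* 4K fits order 0, 4K + 1 byte needs order 1, 64K needs order 4. */
	printf("%llu %llu %llu\n",
	       (unsigned long long)get_order(4096, blk_shift),
	       (unsigned long long)get_order(4097, blk_shift),
	       (unsigned long long)get_order(65536, blk_shift));
	return 0;
}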
- */ - for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) { - BUG_ON(a->buddy_list_alloced[i] != 0); - - while (!list_empty(balloc_get_order_list(a, i))) { - bud = list_first_entry(balloc_get_order_list(a, i), - struct gk20a_buddy, buddy_entry); - balloc_blist_rem(a, bud); - kmem_cache_free(buddy_cache, bud); - } - - if (a->buddy_list_len[i] != 0) { - pr_info("Excess buddies!!! (%d: %llu)\n", - i, a->buddy_list_len[i]); - BUG(); - } - if (a->buddy_list_split[i] != 0) { - pr_info("Excess split nodes!!! (%d: %llu)\n", - i, a->buddy_list_split[i]); - BUG(); - } - if (a->buddy_list_alloced[i] != 0) { - pr_info("Excess alloced nodes!!! (%d: %llu)\n", - i, a->buddy_list_alloced[i]); - BUG(); - } - } - - kfree(a); - - alloc_unlock(__a); -} - -/* - * Combine the passed buddy if possible. The pointer in @b may not be valid - * after this as the buddy may be freed. - * - * @a must be locked. - */ -static void balloc_coalesce(struct gk20a_buddy_allocator *a, - struct gk20a_buddy *b) -{ - struct gk20a_buddy *parent; - - if (buddy_is_alloced(b) || buddy_is_split(b)) - return; - - /* - * If both our buddy and I are both not allocated and not split then - * we can coalesce ourselves. - */ - if (!b->buddy) - return; - if (buddy_is_alloced(b->buddy) || buddy_is_split(b->buddy)) - return; - - parent = b->parent; - - balloc_blist_rem(a, b); - balloc_blist_rem(a, b->buddy); - - buddy_clr_split(parent); - a->buddy_list_split[parent->order]--; - balloc_blist_add(a, parent); - - /* - * Recursively coalesce as far as we can go. - */ - balloc_coalesce(a, parent); - - /* Clean up the remains. */ - kmem_cache_free(buddy_cache, b->buddy); - kmem_cache_free(buddy_cache, b); -} - -/* - * Split a buddy into two new buddies who are 1/2 the size of the parent buddy. - * - * @a must be locked. - */ -static int balloc_split_buddy(struct gk20a_buddy_allocator *a, - struct gk20a_buddy *b, int pte_size) -{ - struct gk20a_buddy *left, *right; - u64 half; - - left = balloc_new_buddy(a, b, b->start, b->order - 1); - if (!left) - return -ENOMEM; - - half = (b->end - b->start) / 2; - - right = balloc_new_buddy(a, b, b->start + half, b->order - 1); - if (!right) { - kmem_cache_free(buddy_cache, left); - return -ENOMEM; - } - - buddy_set_split(b); - a->buddy_list_split[b->order]++; - - b->left = left; - b->right = right; - left->buddy = right; - right->buddy = left; - left->parent = b; - right->parent = b; - - /* PTE considerations. */ - if (a->flags & GPU_ALLOC_GVA_SPACE && - left->order <= a->pte_blk_order) { - left->pte_size = pte_size; - right->pte_size = pte_size; - } - - balloc_blist_rem(a, b); - balloc_blist_add(a, left); - balloc_blist_add(a, right); - - return 0; -} - -/* - * Place the passed buddy into the RB tree for allocated buddies. Never fails - * unless the passed entry is a duplicate which is a bug. - * - * @a must be locked. 
- */ -static void balloc_alloc_buddy(struct gk20a_buddy_allocator *a, - struct gk20a_buddy *b) -{ - struct rb_node **new = &(a->alloced_buddies.rb_node); - struct rb_node *parent = NULL; - - while (*new) { - struct gk20a_buddy *bud = container_of(*new, struct gk20a_buddy, - alloced_entry); - - parent = *new; - if (b->start < bud->start) - new = &((*new)->rb_left); - else if (b->start > bud->start) - new = &((*new)->rb_right); - else - BUG_ON("Duplicate entries in allocated list!\n"); - } - - rb_link_node(&b->alloced_entry, parent, new); - rb_insert_color(&b->alloced_entry, &a->alloced_buddies); - - buddy_set_alloced(b); - a->buddy_list_alloced[b->order]++; -} - -/* - * Remove the passed buddy from the allocated buddy RB tree. Returns the - * deallocated buddy for further processing. - * - * @a must be locked. - */ -static struct gk20a_buddy *balloc_free_buddy(struct gk20a_buddy_allocator *a, - u64 addr) -{ - struct rb_node *node = a->alloced_buddies.rb_node; - struct gk20a_buddy *bud; - - while (node) { - bud = container_of(node, struct gk20a_buddy, alloced_entry); - - if (addr < bud->start) - node = node->rb_left; - else if (addr > bud->start) - node = node->rb_right; - else - break; - } - - if (!node) - return NULL; - - rb_erase(node, &a->alloced_buddies); - buddy_clr_alloced(bud); - a->buddy_list_alloced[bud->order]--; - - return bud; -} - -/* - * Find a suitable buddy for the given order and PTE type (big or little). - */ -static struct gk20a_buddy *__balloc_find_buddy(struct gk20a_buddy_allocator *a, - u64 order, int pte_size) -{ - struct gk20a_buddy *bud; - - if (order > a->max_order || - list_empty(balloc_get_order_list(a, order))) - return NULL; - - if (a->flags & GPU_ALLOC_GVA_SPACE && - pte_size == gmmu_page_size_big) - bud = list_last_entry(balloc_get_order_list(a, order), - struct gk20a_buddy, buddy_entry); - else - bud = list_first_entry(balloc_get_order_list(a, order), - struct gk20a_buddy, buddy_entry); - - if (bud->pte_size != BALLOC_PTE_SIZE_ANY && - bud->pte_size != pte_size) - return NULL; - - return bud; -} - -/* - * Allocate a suitably sized buddy. If no suitable buddy exists split higher - * order buddies until we have a suitable buddy to allocate. - * - * For PDE grouping add an extra check to see if a buddy is suitable: that the - * buddy exists in a PDE who's PTE size is reasonable - * - * @a must be locked. - */ -static u64 __balloc_do_alloc(struct gk20a_buddy_allocator *a, - u64 order, int pte_size) -{ - u64 split_order; - struct gk20a_buddy *bud = NULL; - - split_order = order; - while (split_order <= a->max_order && - !(bud = __balloc_find_buddy(a, split_order, pte_size))) - split_order++; - - /* Out of memory! */ - if (!bud) - return 0; - - while (bud->order != order) { - if (balloc_split_buddy(a, bud, pte_size)) - return 0; /* No mem... */ - bud = bud->left; - } - - balloc_blist_rem(a, bud); - balloc_alloc_buddy(a, bud); - - return bud->start; -} - -/* - * See if the passed range is actually available for allocation. If so, then - * return 1, otherwise return 0. - * - * TODO: Right now this uses the unoptimal approach of going through all - * outstanding allocations and checking their base/ends. This could be better. - */ -static int balloc_is_range_free(struct gk20a_buddy_allocator *a, - u64 base, u64 end) -{ - struct rb_node *node; - struct gk20a_buddy *bud; - - node = rb_first(&a->alloced_buddies); - if (!node) - return 1; /* No allocs yet. 
*/ - - bud = container_of(node, struct gk20a_buddy, alloced_entry); - - while (bud->start < end) { - if ((bud->start > base && bud->start < end) || - (bud->end > base && bud->end < end)) - return 0; - - node = rb_next(node); - if (!node) - break; - bud = container_of(node, struct gk20a_buddy, alloced_entry); - } - - return 1; -} - -static void balloc_alloc_fixed(struct gk20a_buddy_allocator *a, - struct gk20a_fixed_alloc *f) -{ - struct rb_node **new = &(a->fixed_allocs.rb_node); - struct rb_node *parent = NULL; - - while (*new) { - struct gk20a_fixed_alloc *falloc = - container_of(*new, struct gk20a_fixed_alloc, - alloced_entry); - - BUG_ON(!virt_addr_valid(falloc)); - - parent = *new; - if (f->start < falloc->start) - new = &((*new)->rb_left); - else if (f->start > falloc->start) - new = &((*new)->rb_right); - else - BUG_ON("Duplicate entries in allocated list!\n"); - } - - rb_link_node(&f->alloced_entry, parent, new); - rb_insert_color(&f->alloced_entry, &a->fixed_allocs); -} - -/* - * Remove the passed buddy from the allocated buddy RB tree. Returns the - * deallocated buddy for further processing. - * - * @a must be locked. - */ -static struct gk20a_fixed_alloc *balloc_free_fixed( - struct gk20a_buddy_allocator *a, u64 addr) -{ - struct rb_node *node = a->fixed_allocs.rb_node; - struct gk20a_fixed_alloc *falloc; - - while (node) { - falloc = container_of(node, - struct gk20a_fixed_alloc, alloced_entry); - - if (addr < falloc->start) - node = node->rb_left; - else if (addr > falloc->start) - node = node->rb_right; - else - break; - } - - if (!node) - return NULL; - - rb_erase(node, &a->fixed_allocs); - - return falloc; -} - -/* - * Find the parent range - doesn't necessarily need the parent to actually exist - * as a buddy. Finding an existing parent comes later... - */ -static void __balloc_get_parent_range(struct gk20a_buddy_allocator *a, - u64 base, u64 order, - u64 *pbase, u64 *porder) -{ - u64 base_mask; - u64 shifted_base = balloc_base_shift(a, base); - - order++; - base_mask = ~((a->blk_size << order) - 1); - - shifted_base &= base_mask; - - *pbase = balloc_base_unshift(a, shifted_base); - *porder = order; -} - -/* - * Makes a buddy at the passed address. This will make all parent buddies - * necessary for this buddy to exist as well. - */ -static struct gk20a_buddy *__balloc_make_fixed_buddy( - struct gk20a_buddy_allocator *a, u64 base, u64 order) -{ - struct gk20a_buddy *bud = NULL; - struct list_head *order_list; - u64 cur_order = order, cur_base = base; - - /* - * Algo: - * 1. Keep jumping up a buddy order until we find the real buddy that - * this buddy exists in. - * 2. Then work our way down through the buddy tree until we hit a dead - * end. - * 3. Start splitting buddies until we split to the one we need to - * make. - */ - while (cur_order <= a->max_order) { - int found = 0; - - order_list = balloc_get_order_list(a, cur_order); - list_for_each_entry(bud, order_list, buddy_entry) { - if (bud->start == cur_base) { - found = 1; - break; - } - } - - if (found) - break; - - __balloc_get_parent_range(a, cur_base, cur_order, - &cur_base, &cur_order); - } - - if (cur_order > a->max_order) { - alloc_dbg(balloc_owner(a), "No buddy for range ???\n"); - return NULL; - } - - /* Split this buddy as necessary until we get the target buddy. 
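The parent-range step of that walk can be illustrated on its own. A standalone sketch follows, assuming balloc_base_shift()/balloc_base_unshift() simply subtract and re-add the allocator's start (the addresses below are placeholders).

#include <stdint.h>
#include <stdio.h>

static void get_parent_range(uint64_t start, uint64_t blk_size,
			     uint64_t base, uint64_t order,
			     uint64_t *pbase, uint64_t *porder)
{
	uint64_t shifted = base - start;	/* balloc_base_shift() */

	order++;
	shifted &= ~((blk_size << order) - 1);	/* align to parent block */

	*pbase = shifted + start;		/* balloc_base_unshift() */
	*porder = order;
}

int main(void)
{
	uint64_t pbase, porder;

	/* 4K blocks, space at 1MB: the order-0 buddy at 1MB + 12K... */
	get_parent_range(0x100000, 4096, 0x103000, 0, &pbase, &porder);

	/* ...has its order-1 parent at 1MB + 8K. */
	printf("parent base=0x%llx order=%llu\n",
	       (unsigned long long)pbase, (unsigned long long)porder);
	return 0;
}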
*/ - while (bud->start != base || bud->order != order) { - if (balloc_split_buddy(a, bud, BALLOC_PTE_SIZE_ANY)) { - balloc_coalesce(a, bud); - return NULL; - } - - if (base < bud->right->start) - bud = bud->left; - else - bud = bud->right; - - } - - return bud; -} - -static u64 __balloc_do_alloc_fixed(struct gk20a_buddy_allocator *a, - struct gk20a_fixed_alloc *falloc, - u64 base, u64 len) -{ - u64 shifted_base, inc_base; - u64 align_order; - - shifted_base = balloc_base_shift(a, base); - if (shifted_base == 0) - align_order = __fls(len >> a->blk_shift); - else - align_order = min_t(u64, - __ffs(shifted_base >> a->blk_shift), - __fls(len >> a->blk_shift)); - - if (align_order > a->max_order) { - alloc_dbg(balloc_owner(a), - "Align order too big: %llu > %llu\n", - align_order, a->max_order); - return 0; - } - - /* - * Generate a list of buddies that satisfy this allocation. - */ - inc_base = shifted_base; - while (inc_base < (shifted_base + len)) { - u64 order_len = balloc_order_to_len(a, align_order); - u64 remaining; - struct gk20a_buddy *bud; - - bud = __balloc_make_fixed_buddy(a, - balloc_base_unshift(a, inc_base), - align_order); - if (!bud) { - alloc_dbg(balloc_owner(a), - "Fixed buddy failed: {0x%llx, %llu}!\n", - balloc_base_unshift(a, inc_base), - align_order); - goto err_and_cleanup; - } - - balloc_blist_rem(a, bud); - balloc_alloc_buddy(a, bud); - __balloc_buddy_list_add(a, bud, &falloc->buddies); - - /* Book keeping. */ - inc_base += order_len; - remaining = (shifted_base + len) - inc_base; - align_order = __ffs(inc_base >> a->blk_shift); - - /* If we don't have much left - trim down align_order. */ - if (balloc_order_to_len(a, align_order) > remaining) - align_order = __balloc_max_order_in(a, inc_base, - inc_base + remaining); - } - - return base; - -err_and_cleanup: - while (!list_empty(&falloc->buddies)) { - struct gk20a_buddy *bud = list_first_entry(&falloc->buddies, - struct gk20a_buddy, - buddy_entry); - - __balloc_buddy_list_rem(a, bud); - balloc_free_buddy(a, bud->start); - kmem_cache_free(buddy_cache, bud); - } - - return 0; -} - -static void __balloc_do_free_fixed(struct gk20a_buddy_allocator *a, - struct gk20a_fixed_alloc *falloc) -{ - struct gk20a_buddy *bud; - - while (!list_empty(&falloc->buddies)) { - bud = list_first_entry(&falloc->buddies, - struct gk20a_buddy, - buddy_entry); - __balloc_buddy_list_rem(a, bud); - - balloc_free_buddy(a, bud->start); - balloc_blist_add(a, bud); - a->bytes_freed += balloc_order_to_len(a, bud->order); - - /* - * Attemp to defrag the allocation. - */ - balloc_coalesce(a, bud); - } - - kfree(falloc); -} - -/* - * Allocate memory from the passed allocator. - */ -static u64 gk20a_buddy_balloc(struct gk20a_allocator *__a, u64 len) -{ - u64 order, addr; - int pte_size; - struct gk20a_buddy_allocator *a = __a->priv; - - gk20a_alloc_trace_func(); - - alloc_lock(__a); - - order = balloc_get_order(a, len); - - if (order > a->max_order) { - alloc_unlock(__a); - alloc_dbg(balloc_owner(a), "Alloc fail\n"); - gk20a_alloc_trace_func_done(); - return 0; - } - - /* - * For now pass the base address of the allocator's region to - * __get_pte_size(). This ensures we get the right page size for - * the alloc but we don't have to know what the real address is - * going to be quite yet. - * - * TODO: once userspace supports a unified address space pass 0 for - * the base. This will make only 'len' affect the PTE size. 
- */ - if (a->flags & GPU_ALLOC_GVA_SPACE) - pte_size = __get_pte_size(a->vm, a->base, len); - else - pte_size = BALLOC_PTE_SIZE_ANY; - - addr = __balloc_do_alloc(a, order, pte_size); - - if (addr) { - a->bytes_alloced += len; - a->bytes_alloced_real += balloc_order_to_len(a, order); - alloc_dbg(balloc_owner(a), - "Alloc 0x%-10llx %3lld:0x%-10llx pte_size=%s\n", - addr, order, len, - pte_size == gmmu_page_size_big ? "big" : - pte_size == gmmu_page_size_small ? "small" : - "NA/any"); - } else { - alloc_dbg(balloc_owner(a), "Alloc failed: no mem!\n"); - } - - a->alloc_made = 1; - - alloc_unlock(__a); - - gk20a_alloc_trace_func_done(); - return addr; -} - -/* - * Requires @__a to be locked. - */ -static u64 __gk20a_balloc_fixed_buddy(struct gk20a_allocator *__a, - u64 base, u64 len) -{ - u64 ret, real_bytes = 0; - struct gk20a_buddy *bud; - struct gk20a_fixed_alloc *falloc = NULL; - struct gk20a_buddy_allocator *a = __a->priv; - - gk20a_alloc_trace_func(); - - /* If base isn't aligned to an order 0 block, fail. */ - if (base & (a->blk_size - 1)) - goto fail; - - if (len == 0) - goto fail; - - falloc = kmalloc(sizeof(*falloc), GFP_KERNEL); - if (!falloc) - goto fail; - - INIT_LIST_HEAD(&falloc->buddies); - falloc->start = base; - falloc->end = base + len; - - if (!balloc_is_range_free(a, base, base + len)) { - alloc_dbg(balloc_owner(a), - "Range not free: 0x%llx -> 0x%llx\n", - base, base + len); - goto fail_unlock; - } - - ret = __balloc_do_alloc_fixed(a, falloc, base, len); - if (!ret) { - alloc_dbg(balloc_owner(a), - "Alloc-fixed failed ?? 0x%llx -> 0x%llx\n", - base, base + len); - goto fail_unlock; - } - - balloc_alloc_fixed(a, falloc); - - list_for_each_entry(bud, &falloc->buddies, buddy_entry) - real_bytes += (bud->end - bud->start); - - a->bytes_alloced += len; - a->bytes_alloced_real += real_bytes; - - alloc_dbg(balloc_owner(a), "Alloc (fixed) 0x%llx\n", base); - - gk20a_alloc_trace_func_done(); - return base; - -fail_unlock: - alloc_unlock(__a); -fail: - kfree(falloc); - gk20a_alloc_trace_func_done(); - return 0; -} - -/* - * Allocate a fixed address allocation. The address of the allocation is @base - * and the length is @len. This is not a typical buddy allocator operation and - * as such has a high posibility of failure if the address space is heavily in - * use. - * - * Please do not use this function unless _absolutely_ necessary. - */ -static u64 gk20a_balloc_fixed_buddy(struct gk20a_allocator *__a, - u64 base, u64 len) -{ - u64 alloc; - struct gk20a_buddy_allocator *a = __a->priv; - - alloc_lock(__a); - alloc = __gk20a_balloc_fixed_buddy(__a, base, len); - a->alloc_made = 1; - alloc_unlock(__a); - - return alloc; -} - -/* - * Free the passed allocation. - */ -static void gk20a_buddy_bfree(struct gk20a_allocator *__a, u64 addr) -{ - struct gk20a_buddy *bud; - struct gk20a_fixed_alloc *falloc; - struct gk20a_buddy_allocator *a = __a->priv; - - gk20a_alloc_trace_func(); - - if (!addr) { - gk20a_alloc_trace_func_done(); - return; - } - - alloc_lock(__a); - - /* - * First see if this is a fixed alloc. If not fall back to a regular - * buddy. - */ - falloc = balloc_free_fixed(a, addr); - if (falloc) { - __balloc_do_free_fixed(a, falloc); - goto done; - } - - bud = balloc_free_buddy(a, addr); - if (!bud) - goto done; - - balloc_blist_add(a, bud); - a->bytes_freed += balloc_order_to_len(a, bud->order); - - /* - * Attemp to defrag the allocation. 
- */ - balloc_coalesce(a, bud); - -done: - alloc_unlock(__a); - alloc_dbg(balloc_owner(a), "Free 0x%llx\n", addr); - gk20a_alloc_trace_func_done(); - return; -} - -static bool gk20a_buddy_reserve_is_possible(struct gk20a_buddy_allocator *a, - struct gk20a_alloc_carveout *co) -{ - struct gk20a_alloc_carveout *tmp; - u64 co_base, co_end; - - co_base = co->base; - co_end = co->base + co->length; - - /* - * Not the fastest approach but we should not have that many carveouts - * for any reasonable allocator. - */ - list_for_each_entry(tmp, &a->co_list, co_entry) { - if ((co_base >= tmp->base && - co_base < (tmp->base + tmp->length)) || - (co_end >= tmp->base && - co_end < (tmp->base + tmp->length))) - return false; - } - - return true; -} - -/* - * Carveouts can only be reserved before any regular allocations have been - * made. - */ -static int gk20a_buddy_reserve_co(struct gk20a_allocator *__a, - struct gk20a_alloc_carveout *co) -{ - struct gk20a_buddy_allocator *a = __a->priv; - u64 addr; - int err = 0; - - if (co->base < a->start || (co->base + co->length) > a->end || - a->alloc_made) - return -EINVAL; - - alloc_lock(__a); - - if (!gk20a_buddy_reserve_is_possible(a, co)) { - err = -EBUSY; - goto done; - } - - /* Should not be possible to fail... */ - addr = __gk20a_balloc_fixed_buddy(__a, co->base, co->length); - if (!addr) { - err = -ENOMEM; - pr_warn("%s: Failed to reserve a valid carveout!\n", __func__); - goto done; - } - - list_add(&co->co_entry, &a->co_list); - -done: - alloc_unlock(__a); - return err; -} - -/* - * Carveouts can be release at any time. - */ -static void gk20a_buddy_release_co(struct gk20a_allocator *__a, - struct gk20a_alloc_carveout *co) -{ - alloc_lock(__a); - - list_del_init(&co->co_entry); - gk20a_free(__a, co->base); - - alloc_unlock(__a); -} - -static u64 gk20a_buddy_alloc_length(struct gk20a_allocator *a) -{ - struct gk20a_buddy_allocator *ba = a->priv; - - return ba->length; -} - -static u64 gk20a_buddy_alloc_base(struct gk20a_allocator *a) -{ - struct gk20a_buddy_allocator *ba = a->priv; - - return ba->start; -} - -static int gk20a_buddy_alloc_inited(struct gk20a_allocator *a) -{ - struct gk20a_buddy_allocator *ba = a->priv; - int inited = ba->initialized; - - rmb(); - return inited; -} - -static u64 gk20a_buddy_alloc_end(struct gk20a_allocator *a) -{ - struct gk20a_buddy_allocator *ba = a->priv; - - return ba->end; -} - -static u64 gk20a_buddy_alloc_space(struct gk20a_allocator *a) -{ - struct gk20a_buddy_allocator *ba = a->priv; - u64 space; - - alloc_lock(a); - space = ba->end - ba->start - - (ba->bytes_alloced_real - ba->bytes_freed); - alloc_unlock(a); - - return space; -} - -/* - * Print the buddy allocator top level stats. If you pass @s as NULL then the - * stats are printed to the kernel log. This lets this code be used for - * debugging purposes internal to the allocator. 
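A short sketch of how a caller reserves one of these carveouts through the generic API declared earlier; the allocator pointer, name and sizes are placeholders, and SZ_* comes from linux/sizes.h.

static struct gk20a_alloc_carveout example_co =
	GK20A_CARVEOUT("example-co", SZ_1M, SZ_256K);

static int example_reserve(struct gk20a_allocator *a)
{
	int err;

	/* Must happen before any regular allocs are made from @a. */
	err = gk20a_alloc_reserve_carveout(a, &example_co);
	if (err)
		return err;	/* e.g. -EBUSY on overlap, -EINVAL if too late */

	/* ... the carveout range is now off-limits to gk20a_alloc() ... */

	gk20a_alloc_release_carveout(a, &example_co);
	return 0;
}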
- */ -static void gk20a_buddy_print_stats(struct gk20a_allocator *__a, - struct seq_file *s, int lock) -{ - int i = 0; - struct rb_node *node; - struct gk20a_fixed_alloc *falloc; - struct gk20a_alloc_carveout *tmp; - struct gk20a_buddy_allocator *a = __a->priv; - - __alloc_pstat(s, __a, "base = %llu, limit = %llu, blk_size = %llu\n", - a->base, a->length, a->blk_size); - __alloc_pstat(s, __a, "Internal params:\n"); - __alloc_pstat(s, __a, " start = 0x%llx\n", a->start); - __alloc_pstat(s, __a, " end = 0x%llx\n", a->end); - __alloc_pstat(s, __a, " count = 0x%llx\n", a->count); - __alloc_pstat(s, __a, " blks = 0x%llx\n", a->blks); - __alloc_pstat(s, __a, " max_order = %llu\n", a->max_order); - - if (lock) - alloc_lock(__a); - - if (!list_empty(&a->co_list)) { - __alloc_pstat(s, __a, "\n"); - __alloc_pstat(s, __a, "Carveouts:\n"); - list_for_each_entry(tmp, &a->co_list, co_entry) - __alloc_pstat(s, __a, - " CO %2d: %-20s 0x%010llx + 0x%llx\n", - i++, tmp->name, tmp->base, tmp->length); - } - - __alloc_pstat(s, __a, "\n"); - __alloc_pstat(s, __a, "Buddy blocks:\n"); - __alloc_pstat(s, __a, " Order Free Alloced Split\n"); - __alloc_pstat(s, __a, " ----- ---- ------- -----\n"); - - for (i = a->max_order; i >= 0; i--) { - if (a->buddy_list_len[i] == 0 && - a->buddy_list_alloced[i] == 0 && - a->buddy_list_split[i] == 0) - continue; - - __alloc_pstat(s, __a, " %3d %-7llu %-9llu %llu\n", i, - a->buddy_list_len[i], - a->buddy_list_alloced[i], - a->buddy_list_split[i]); - } - - __alloc_pstat(s, __a, "\n"); - - for (node = rb_first(&a->fixed_allocs), i = 1; - node != NULL; - node = rb_next(node)) { - falloc = container_of(node, - struct gk20a_fixed_alloc, alloced_entry); - - __alloc_pstat(s, __a, "Fixed alloc (%d): [0x%llx -> 0x%llx]\n", - i, falloc->start, falloc->end); - } - - __alloc_pstat(s, __a, "\n"); - __alloc_pstat(s, __a, "Bytes allocated: %llu\n", - a->bytes_alloced); - __alloc_pstat(s, __a, "Bytes allocated (real): %llu\n", - a->bytes_alloced_real); - __alloc_pstat(s, __a, "Bytes freed: %llu\n", - a->bytes_freed); - - if (lock) - alloc_unlock(__a); -} - -static const struct gk20a_allocator_ops buddy_ops = { - .alloc = gk20a_buddy_balloc, - .free = gk20a_buddy_bfree, - - .alloc_fixed = gk20a_balloc_fixed_buddy, - /* .free_fixed not needed. */ - - .reserve_carveout = gk20a_buddy_reserve_co, - .release_carveout = gk20a_buddy_release_co, - - .base = gk20a_buddy_alloc_base, - .length = gk20a_buddy_alloc_length, - .end = gk20a_buddy_alloc_end, - .inited = gk20a_buddy_alloc_inited, - .space = gk20a_buddy_alloc_space, - - .fini = gk20a_buddy_allocator_destroy, - - .print_stats = gk20a_buddy_print_stats, -}; - -/* - * Initialize a buddy allocator. Returns 0 on success. This allocator does - * not necessarily manage bytes. It manages distinct ranges of resources. This - * allows the allocator to work for things like comp_tags, semaphores, etc. - * - * @allocator: Ptr to an allocator struct to init. - * @vm: GPU VM to associate this allocator with. Can be NULL. Will be used to - * get PTE size for GVA spaces. - * @name: Name of the allocator. Doesn't have to be static storage. - * @base: The base address of the resource pool being managed. - * @size: Number of resources in the pool. - * @blk_size: Minimum number of resources to allocate at once. For things like - * semaphores this is 1. For GVA this might be as much as 64k. This - * corresponds to order 0. Must be power of 2. - * @max_order: Pick a maximum order. If you leave this as 0, the buddy allocator - * will try and pick a reasonable max order. 
- * @flags: Extra flags necessary. See GPU_BALLOC_*. - */ -int __gk20a_buddy_allocator_init(struct gk20a *g, struct gk20a_allocator *__a, - struct vm_gk20a *vm, const char *name, - u64 base, u64 size, u64 blk_size, - u64 max_order, u64 flags) -{ - int err; - u64 pde_size; - struct gk20a_buddy_allocator *a; - - /* blk_size must be greater than 0 and a power of 2. */ - if (blk_size == 0) - return -EINVAL; - if (blk_size & (blk_size - 1)) - return -EINVAL; - - if (max_order > GPU_BALLOC_MAX_ORDER) - return -EINVAL; - - /* If this is to manage a GVA space we need a VM. */ - if (flags & GPU_ALLOC_GVA_SPACE && !vm) - return -EINVAL; - - a = kzalloc(sizeof(struct gk20a_buddy_allocator), GFP_KERNEL); - if (!a) - return -ENOMEM; - - err = __gk20a_alloc_common_init(__a, name, a, false, &buddy_ops); - if (err) - goto fail; - - a->base = base; - a->length = size; - a->blk_size = blk_size; - a->blk_shift = __ffs(blk_size); - a->owner = __a; - - /* - * If base is 0 then modfy base to be the size of one block so that we - * can return errors by returning addr == 0. - */ - if (a->base == 0) { - a->base = a->blk_size; - a->length -= a->blk_size; - } - - a->vm = vm; - if (flags & GPU_ALLOC_GVA_SPACE) { - pde_size = ((u64)vm->big_page_size) << 10; - a->pte_blk_order = balloc_get_order(a, pde_size); - } - - /* - * When we have a GVA space with big_pages enabled the size and base - * must be PDE aligned. If big_pages are not enabled then this - * requirement is not necessary. - */ - if (flags & GPU_ALLOC_GVA_SPACE && vm->big_pages && - (base & ((vm->big_page_size << 10) - 1) || - size & ((vm->big_page_size << 10) - 1))) - return -EINVAL; - - a->flags = flags; - a->max_order = max_order; - - balloc_allocator_align(a); - balloc_compute_max_order(a); - - /* Shared buddy kmem_cache for all allocators. */ - if (!buddy_cache) - buddy_cache = KMEM_CACHE(gk20a_buddy, 0); - if (!buddy_cache) { - err = -ENOMEM; - goto fail; - } - - a->alloced_buddies = RB_ROOT; - a->fixed_allocs = RB_ROOT; - INIT_LIST_HEAD(&a->co_list); - err = balloc_init_lists(a); - if (err) - goto fail; - - wmb(); - a->initialized = 1; - - gk20a_init_alloc_debug(g, __a); - alloc_dbg(__a, "New allocator: type buddy\n"); - alloc_dbg(__a, " base 0x%llx\n", a->base); - alloc_dbg(__a, " size 0x%llx\n", a->length); - alloc_dbg(__a, " blk_size 0x%llx\n", a->blk_size); - alloc_dbg(__a, " max_order %llu\n", a->max_order); - alloc_dbg(__a, " flags 0x%llx\n", a->flags); - - return 0; - -fail: - kfree(a); - return err; -} - -int gk20a_buddy_allocator_init(struct gk20a *g, struct gk20a_allocator *a, - const char *name, u64 base, u64 size, - u64 blk_size, u64 flags) -{ - return __gk20a_buddy_allocator_init(g, a, NULL, name, - base, size, blk_size, 0, 0); -} diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_lockless.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator_lockless.c deleted file mode 100644 index 5b011d8c..00000000 --- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_lockless.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include -#include - -#include "gk20a_allocator.h" -#include "lockless_allocator_priv.h" - -static u64 gk20a_lockless_alloc_length(struct gk20a_allocator *a) -{ - struct gk20a_lockless_allocator *pa = a->priv; - - return pa->length; -} - -static u64 gk20a_lockless_alloc_base(struct gk20a_allocator *a) -{ - struct gk20a_lockless_allocator *pa = a->priv; - - return pa->base; -} - -static int gk20a_lockless_alloc_inited(struct gk20a_allocator *a) -{ - struct gk20a_lockless_allocator *pa = a->priv; - int inited = pa->inited; - - rmb(); - return inited; -} - -static u64 gk20a_lockless_alloc_end(struct gk20a_allocator *a) -{ - struct gk20a_lockless_allocator *pa = a->priv; - - return pa->base + pa->length; -} - -static u64 gk20a_lockless_alloc(struct gk20a_allocator *a, u64 len) -{ - struct gk20a_lockless_allocator *pa = a->priv; - int head, new_head, ret; - u64 addr = 0; - - if (len != pa->blk_size) - return 0; - - head = ACCESS_ONCE(pa->head); - while (head >= 0) { - new_head = ACCESS_ONCE(pa->next[head]); - ret = cmpxchg(&pa->head, head, new_head); - if (ret == head) { - addr = pa->base + head * pa->blk_size; - atomic_inc(&pa->nr_allocs); - alloc_dbg(a, "Alloc node # %d @ addr 0x%llx\n", head, - addr); - break; - } - head = ACCESS_ONCE(pa->head); - } - return addr; -} - -static void gk20a_lockless_free(struct gk20a_allocator *a, u64 addr) -{ - struct gk20a_lockless_allocator *pa = a->priv; - int head, ret; - u64 cur_idx, rem; - - cur_idx = addr - pa->base; - rem = do_div(cur_idx, pa->blk_size); - - while (1) { - head = ACCESS_ONCE(pa->head); - ACCESS_ONCE(pa->next[cur_idx]) = head; - ret = cmpxchg(&pa->head, head, cur_idx); - if (ret == head) { - atomic_dec(&pa->nr_allocs); - alloc_dbg(a, "Free node # %llu\n", cur_idx); - break; - } - } -} - -static void gk20a_lockless_alloc_destroy(struct gk20a_allocator *a) -{ - struct gk20a_lockless_allocator *pa = a->priv; - - gk20a_fini_alloc_debug(a); - - vfree(pa->next); - kfree(pa); -} - -static void gk20a_lockless_print_stats(struct gk20a_allocator *a, - struct seq_file *s, int lock) -{ - struct gk20a_lockless_allocator *pa = a->priv; - - __alloc_pstat(s, a, "Lockless allocator params:\n"); - __alloc_pstat(s, a, " start = 0x%llx\n", pa->base); - __alloc_pstat(s, a, " end = 0x%llx\n", pa->base + pa->length); - - /* Actual stats. */ - __alloc_pstat(s, a, "Stats:\n"); - __alloc_pstat(s, a, " Number allocs = %d\n", - atomic_read(&pa->nr_allocs)); - __alloc_pstat(s, a, " Number free = %d\n", - pa->nr_nodes - atomic_read(&pa->nr_allocs)); -} - -static const struct gk20a_allocator_ops pool_ops = { - .alloc = gk20a_lockless_alloc, - .free = gk20a_lockless_free, - - .base = gk20a_lockless_alloc_base, - .length = gk20a_lockless_alloc_length, - .end = gk20a_lockless_alloc_end, - .inited = gk20a_lockless_alloc_inited, - - .fini = gk20a_lockless_alloc_destroy, - - .print_stats = gk20a_lockless_print_stats, -}; - -int gk20a_lockless_allocator_init(struct gk20a *g, struct gk20a_allocator *__a, - const char *name, u64 base, u64 length, - u64 blk_size, u64 flags) -{ - int i; - int err; - int nr_nodes; - u64 count, rem; - struct gk20a_lockless_allocator *a; - - if (!blk_size) - return -EINVAL; - - /* - * Ensure we have space for atleast one node & there's no overflow. 
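The alloc/free scheme above is a lock-free stack of block indices: the free list lives in the next[] array and both paths swing head with a compare-and-swap, retrying on contention. A standalone C11 sketch of the same idea (userspace atomics instead of ACCESS_ONCE()/cmpxchg(), sizes are placeholders):

#include <stdatomic.h>
#include <stdio.h>

#define NR_NODES 4

static _Atomic int head;
static int next_idx[NR_NODES];

static int pop(void)
{
	int old = atomic_load(&head);

	/* A failed CAS reloads 'old' with the current head. */
	while (old >= 0 &&
	       !atomic_compare_exchange_weak(&head, &old, next_idx[old]))
		;
	return old;			/* -1 means the pool is empty */
}

static void push(int idx)
{
	int old = atomic_load(&head);

	do {
		next_idx[idx] = old;
	} while (!atomic_compare_exchange_weak(&head, &old, idx));
}

int main(void)
{
	int i, node;

	for (i = 0; i < NR_NODES; i++)	/* chain the initial free list */
		next_idx[i] = i + 1;
	next_idx[NR_NODES - 1] = -1;
	atomic_init(&head, 0);

	node = pop();
	printf("allocated node %d\n", node);
	push(node);
	return 0;
}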
- * In order to control memory footprint, we require count < INT_MAX - */ - count = length; - rem = do_div(count, blk_size); - if (!base || !count || count > INT_MAX) - return -EINVAL; - - a = kzalloc(sizeof(struct gk20a_lockless_allocator), GFP_KERNEL); - if (!a) - return -ENOMEM; - - err = __gk20a_alloc_common_init(__a, name, a, false, &pool_ops); - if (err) - goto fail; - - a->next = vzalloc(sizeof(*a->next) * count); - if (!a->next) { - err = -ENOMEM; - goto fail; - } - - /* chain the elements together to form the initial free list */ - nr_nodes = (int)count; - for (i = 0; i < nr_nodes; i++) - a->next[i] = i + 1; - a->next[nr_nodes - 1] = -1; - - a->base = base; - a->length = length; - a->blk_size = blk_size; - a->nr_nodes = nr_nodes; - a->flags = flags; - atomic_set(&a->nr_allocs, 0); - - wmb(); - a->inited = true; - - gk20a_init_alloc_debug(g, __a); - alloc_dbg(__a, "New allocator: type lockless\n"); - alloc_dbg(__a, " base 0x%llx\n", a->base); - alloc_dbg(__a, " nodes %d\n", a->nr_nodes); - alloc_dbg(__a, " blk_size 0x%llx\n", a->blk_size); - alloc_dbg(__a, " flags 0x%llx\n", a->flags); - - return 0; - -fail: - kfree(a); - return err; -} diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c deleted file mode 100644 index 9717a726..00000000 --- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c +++ /dev/null @@ -1,936 +0,0 @@ -/* - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include -#include - -#include "gk20a_allocator.h" -#include "buddy_allocator_priv.h" -#include "page_allocator_priv.h" - -#define palloc_dbg(a, fmt, arg...) \ - alloc_dbg(palloc_owner(a), fmt, ##arg) - -static struct kmem_cache *page_alloc_cache; -static struct kmem_cache *page_alloc_chunk_cache; -static struct kmem_cache *page_alloc_slab_page_cache; -static DEFINE_MUTEX(meta_data_cache_lock); - -/* - * Handle the book-keeping for these operations. 
- */ -static inline void add_slab_page_to_empty(struct page_alloc_slab *slab, - struct page_alloc_slab_page *page) -{ - BUG_ON(page->state != SP_NONE); - list_add(&page->list_entry, &slab->empty); - slab->nr_empty++; - page->state = SP_EMPTY; -} -static inline void add_slab_page_to_partial(struct page_alloc_slab *slab, - struct page_alloc_slab_page *page) -{ - BUG_ON(page->state != SP_NONE); - list_add(&page->list_entry, &slab->partial); - slab->nr_partial++; - page->state = SP_PARTIAL; -} -static inline void add_slab_page_to_full(struct page_alloc_slab *slab, - struct page_alloc_slab_page *page) -{ - BUG_ON(page->state != SP_NONE); - list_add(&page->list_entry, &slab->full); - slab->nr_full++; - page->state = SP_FULL; -} - -static inline void del_slab_page_from_empty(struct page_alloc_slab *slab, - struct page_alloc_slab_page *page) -{ - list_del_init(&page->list_entry); - slab->nr_empty--; - page->state = SP_NONE; -} -static inline void del_slab_page_from_partial(struct page_alloc_slab *slab, - struct page_alloc_slab_page *page) -{ - list_del_init(&page->list_entry); - slab->nr_partial--; - page->state = SP_NONE; -} -static inline void del_slab_page_from_full(struct page_alloc_slab *slab, - struct page_alloc_slab_page *page) -{ - list_del_init(&page->list_entry); - slab->nr_full--; - page->state = SP_NONE; -} - -static u64 gk20a_page_alloc_length(struct gk20a_allocator *a) -{ - struct gk20a_page_allocator *va = a->priv; - - return gk20a_alloc_length(&va->source_allocator); -} - -static u64 gk20a_page_alloc_base(struct gk20a_allocator *a) -{ - struct gk20a_page_allocator *va = a->priv; - - return gk20a_alloc_base(&va->source_allocator); -} - -static int gk20a_page_alloc_inited(struct gk20a_allocator *a) -{ - struct gk20a_page_allocator *va = a->priv; - - return gk20a_alloc_initialized(&va->source_allocator); -} - -static u64 gk20a_page_alloc_end(struct gk20a_allocator *a) -{ - struct gk20a_page_allocator *va = a->priv; - - return gk20a_alloc_end(&va->source_allocator); -} - -static u64 gk20a_page_alloc_space(struct gk20a_allocator *a) -{ - struct gk20a_page_allocator *va = a->priv; - - return gk20a_alloc_space(&va->source_allocator); -} - -static int gk20a_page_reserve_co(struct gk20a_allocator *a, - struct gk20a_alloc_carveout *co) -{ - struct gk20a_page_allocator *va = a->priv; - - return gk20a_alloc_reserve_carveout(&va->source_allocator, co); -} - -static void gk20a_page_release_co(struct gk20a_allocator *a, - struct gk20a_alloc_carveout *co) -{ - struct gk20a_page_allocator *va = a->priv; - - gk20a_alloc_release_carveout(&va->source_allocator, co); -} - -static void __gk20a_free_pages(struct gk20a_page_allocator *a, - struct gk20a_page_alloc *alloc, - bool free_buddy_alloc) -{ - struct page_alloc_chunk *chunk; - - while (!list_empty(&alloc->alloc_chunks)) { - chunk = list_first_entry(&alloc->alloc_chunks, - struct page_alloc_chunk, - list_entry); - list_del(&chunk->list_entry); - - if (free_buddy_alloc) - gk20a_free(&a->source_allocator, chunk->base); - kfree(chunk); - } - - kfree(alloc); -} - -static int __insert_page_alloc(struct gk20a_page_allocator *a, - struct gk20a_page_alloc *alloc) -{ - struct rb_node **new = &a->allocs.rb_node; - struct rb_node *parent = NULL; - - while (*new) { - struct gk20a_page_alloc *tmp = - container_of(*new, struct gk20a_page_alloc, - tree_entry); - - parent = *new; - if (alloc->base < tmp->base) { - new = &((*new)->rb_left); - } else if (alloc->base > tmp->base) { - new = &((*new)->rb_right); - } else { - WARN(1, "Duplicate entries in allocated 
list!\n"); - return 0; - } - } - - rb_link_node(&alloc->tree_entry, parent, new); - rb_insert_color(&alloc->tree_entry, &a->allocs); - - return 0; -} - -static struct gk20a_page_alloc *__find_page_alloc( - struct gk20a_page_allocator *a, - u64 addr) -{ - struct rb_node *node = a->allocs.rb_node; - struct gk20a_page_alloc *alloc; - - while (node) { - alloc = container_of(node, struct gk20a_page_alloc, tree_entry); - - if (addr < alloc->base) - node = node->rb_left; - else if (addr > alloc->base) - node = node->rb_right; - else - break; - } - - if (!node) - return NULL; - - rb_erase(node, &a->allocs); - - return alloc; -} - -static struct page_alloc_slab_page *alloc_slab_page( - struct gk20a_page_allocator *a, - struct page_alloc_slab *slab) -{ - struct page_alloc_slab_page *slab_page; - - slab_page = kmem_cache_alloc(page_alloc_slab_page_cache, GFP_KERNEL); - if (!slab_page) { - palloc_dbg(a, "OOM: unable to alloc slab_page struct!\n"); - return ERR_PTR(-ENOMEM); - } - - memset(slab_page, 0, sizeof(*slab_page)); - - slab_page->page_addr = gk20a_alloc(&a->source_allocator, a->page_size); - if (!slab_page->page_addr) { - kfree(slab_page); - palloc_dbg(a, "OOM: vidmem is full!\n"); - return ERR_PTR(-ENOMEM); - } - - INIT_LIST_HEAD(&slab_page->list_entry); - slab_page->slab_size = slab->slab_size; - slab_page->nr_objects = (u32)a->page_size / slab->slab_size; - slab_page->nr_objects_alloced = 0; - slab_page->owner = slab; - slab_page->state = SP_NONE; - - a->pages_alloced++; - - palloc_dbg(a, "Allocated new slab page @ 0x%012llx size=%u\n", - slab_page->page_addr, slab_page->slab_size); - - return slab_page; -} - -static void free_slab_page(struct gk20a_page_allocator *a, - struct page_alloc_slab_page *slab_page) -{ - palloc_dbg(a, "Freeing slab page @ 0x%012llx\n", slab_page->page_addr); - - BUG_ON((slab_page->state != SP_NONE && slab_page->state != SP_EMPTY) || - slab_page->nr_objects_alloced != 0 || - slab_page->bitmap != 0); - - gk20a_free(&a->source_allocator, slab_page->page_addr); - a->pages_freed++; - - kmem_cache_free(page_alloc_slab_page_cache, slab_page); -} - -/* - * This expects @alloc to have 1 empty page_alloc_chunk already added to the - * alloc_chunks list. - */ -static int __do_slab_alloc(struct gk20a_page_allocator *a, - struct page_alloc_slab *slab, - struct gk20a_page_alloc *alloc) -{ - struct page_alloc_slab_page *slab_page = NULL; - struct page_alloc_chunk *chunk; - unsigned long offs; - - /* - * Check the partial and empty lists to see if we have some space - * readily available. Take the slab_page out of what ever list it - * was in since it may be put back into a different list later. - */ - if (!list_empty(&slab->partial)) { - slab_page = list_first_entry(&slab->partial, - struct page_alloc_slab_page, - list_entry); - del_slab_page_from_partial(slab, slab_page); - } else if (!list_empty(&slab->empty)) { - slab_page = list_first_entry(&slab->empty, - struct page_alloc_slab_page, - list_entry); - del_slab_page_from_empty(slab, slab_page); - } - - if (!slab_page) { - slab_page = alloc_slab_page(a, slab); - if (IS_ERR(slab_page)) - return PTR_ERR(slab_page); - } - - /* - * We now have a slab_page. Do the alloc. - */ - offs = bitmap_find_next_zero_area(&slab_page->bitmap, - slab_page->nr_objects, - 0, 1, 0); - if (offs >= slab_page->nr_objects) { - WARN(1, "Empty/partial slab with no free objects?"); - - /* Add the buggy page to the full list... This isn't ideal. 
*/ - add_slab_page_to_full(slab, slab_page); - return -ENOMEM; - } - - bitmap_set(&slab_page->bitmap, offs, 1); - slab_page->nr_objects_alloced++; - - if (slab_page->nr_objects_alloced < slab_page->nr_objects) - add_slab_page_to_partial(slab, slab_page); - else if (slab_page->nr_objects_alloced == slab_page->nr_objects) - add_slab_page_to_full(slab, slab_page); - else - BUG(); /* Should be impossible to hit this. */ - - /* - * Handle building the gk20a_page_alloc struct. We expect one - * page_alloc_chunk to be present. - */ - alloc->slab_page = slab_page; - alloc->nr_chunks = 1; - alloc->length = slab_page->slab_size; - alloc->base = slab_page->page_addr + (offs * slab_page->slab_size); - - chunk = list_first_entry(&alloc->alloc_chunks, - struct page_alloc_chunk, list_entry); - chunk->base = alloc->base; - chunk->length = alloc->length; - - return 0; -} - -/* - * Allocate from a slab instead of directly from the page allocator. - */ -static struct gk20a_page_alloc *__gk20a_alloc_slab( - struct gk20a_page_allocator *a, u64 len) -{ - int err, slab_nr; - struct page_alloc_slab *slab; - struct gk20a_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; - - /* - * Align the length to a page and then divide by the page size (4k for - * this code). ilog2() of that then gets us the correct slab to use. - */ - slab_nr = (int)ilog2(PAGE_ALIGN(len) >> 12); - slab = &a->slabs[slab_nr]; - - alloc = kmem_cache_alloc(page_alloc_cache, GFP_KERNEL); - if (!alloc) { - palloc_dbg(a, "OOM: could not alloc page_alloc struct!\n"); - goto fail; - } - chunk = kmem_cache_alloc(page_alloc_chunk_cache, GFP_KERNEL); - if (!chunk) { - palloc_dbg(a, "OOM: could not alloc alloc_chunk struct!\n"); - goto fail; - } - - INIT_LIST_HEAD(&alloc->alloc_chunks); - list_add(&chunk->list_entry, &alloc->alloc_chunks); - - err = __do_slab_alloc(a, slab, alloc); - if (err) - goto fail; - - palloc_dbg(a, "Alloc 0x%04llx sr=%d id=0x%010llx [slab]\n", - len, slab_nr, alloc->base); - a->nr_slab_allocs++; - - return alloc; - -fail: - kfree(alloc); - kfree(chunk); - return NULL; -} - -static void __gk20a_free_slab(struct gk20a_page_allocator *a, - struct gk20a_page_alloc *alloc) -{ - struct page_alloc_slab_page *slab_page = alloc->slab_page; - struct page_alloc_slab *slab = slab_page->owner; - enum slab_page_state new_state; - int offs; - - offs = (u32)(alloc->base - slab_page->page_addr) / slab_page->slab_size; - bitmap_clear(&slab_page->bitmap, offs, 1); - - slab_page->nr_objects_alloced--; - - if (slab_page->nr_objects_alloced == 0) - new_state = SP_EMPTY; - else - new_state = SP_PARTIAL; - - /* - * Need to migrate the page to a different list. - */ - if (new_state != slab_page->state) { - /* Delete - can't be in empty. */ - if (slab_page->state == SP_PARTIAL) - del_slab_page_from_partial(slab, slab_page); - else - del_slab_page_from_full(slab, slab_page); - - /* And add. */ - if (new_state == SP_EMPTY) { - if (list_empty(&slab->empty)) - add_slab_page_to_empty(slab, slab_page); - else - free_slab_page(a, slab_page); - } else { - add_slab_page_to_partial(slab, slab_page); - } - } - - /* - * Now handle the page_alloc. - */ - __gk20a_free_pages(a, alloc, false); - a->nr_slab_frees++; - - return; -} - -/* - * Allocate physical pages. Since the underlying allocator is a buddy allocator - * the returned pages are always contiguous. However, since there could be - * fragmentation in the space this allocator will collate smaller non-contiguous - * allocations together if necessary. 
- */ -static struct gk20a_page_alloc *__do_gk20a_alloc_pages( - struct gk20a_page_allocator *a, u64 pages) -{ - struct gk20a_page_alloc *alloc; - struct page_alloc_chunk *c; - u64 max_chunk_len = pages << a->page_shift; - int i = 0; - - alloc = kmem_cache_alloc(page_alloc_cache, GFP_KERNEL); - if (!alloc) - goto fail; - - memset(alloc, 0, sizeof(*alloc)); - - INIT_LIST_HEAD(&alloc->alloc_chunks); - alloc->length = pages << a->page_shift; - - while (pages) { - u64 chunk_addr = 0; - u64 chunk_pages = (u64)1 << __fls(pages); - u64 chunk_len = chunk_pages << a->page_shift; - - /* - * Take care of the possibility that the allocation must be - * contiguous. If this is not the first iteration then that - * means the first iteration failed to alloc the entire - * requested size. The buddy allocator guarantees any given - * single alloc is contiguous. - */ - if (a->flags & GPU_ALLOC_FORCE_CONTIG && i != 0) - goto fail_cleanup; - - if (chunk_len > max_chunk_len) - chunk_len = max_chunk_len; - - /* - * Keep attempting to allocate in smaller chunks until the alloc - * either succeeds or is smaller than the page_size of the - * allocator (i.e the allocator is OOM). - */ - do { - chunk_addr = gk20a_alloc(&a->source_allocator, - chunk_len); - - /* Divide by 2 and try again */ - if (!chunk_addr) { - palloc_dbg(a, "balloc failed: 0x%llx\n", - chunk_len); - chunk_len >>= 1; - max_chunk_len = chunk_len; - } - } while (!chunk_addr && chunk_len >= a->page_size); - - chunk_pages = chunk_len >> a->page_shift; - - if (!chunk_addr) { - palloc_dbg(a, "bailing @ 0x%llx\n", chunk_len); - goto fail_cleanup; - } - - c = kmem_cache_alloc(page_alloc_chunk_cache, GFP_KERNEL); - if (!c) { - gk20a_free(&a->source_allocator, chunk_addr); - goto fail_cleanup; - } - - pages -= chunk_pages; - - c->base = chunk_addr; - c->length = chunk_len; - list_add(&c->list_entry, &alloc->alloc_chunks); - - i++; - } - - alloc->nr_chunks = i; - c = list_first_entry(&alloc->alloc_chunks, - struct page_alloc_chunk, list_entry); - alloc->base = c->base; - - return alloc; - -fail_cleanup: - while (!list_empty(&alloc->alloc_chunks)) { - c = list_first_entry(&alloc->alloc_chunks, - struct page_alloc_chunk, list_entry); - list_del(&c->list_entry); - gk20a_free(&a->source_allocator, c->base); - kfree(c); - } - kfree(alloc); -fail: - return ERR_PTR(-ENOMEM); -} - -static struct gk20a_page_alloc *__gk20a_alloc_pages( - struct gk20a_page_allocator *a, u64 len) -{ - struct gk20a_page_alloc *alloc = NULL; - struct page_alloc_chunk *c; - u64 pages; - int i = 0; - - pages = ALIGN(len, a->page_size) >> a->page_shift; - - alloc = __do_gk20a_alloc_pages(a, pages); - if (IS_ERR(alloc)) { - palloc_dbg(a, "Alloc 0x%llx (%llu) (failed)\n", - pages << a->page_shift, pages); - return NULL; - } - - palloc_dbg(a, "Alloc 0x%llx (%llu) id=0x%010llx\n", - pages << a->page_shift, pages, alloc->base); - list_for_each_entry(c, &alloc->alloc_chunks, list_entry) { - palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", - i++, c->base, c->length); - } - - return alloc; -} - -/* - * Allocate enough pages to satisfy @len. Page size is determined at - * initialization of the allocator. - * - * The return is actually a pointer to a struct gk20a_page_alloc pointer. This - * is because it doesn't make a lot of sense to return the address of the first - * page in the list of pages (since they could be discontiguous). This has - * precedent in the dma_alloc APIs, though, it's really just an annoying - * artifact of the fact that the gk20a_alloc() API requires a u64 return type. 
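A quick aside on the return convention described just above: when the allocator is not in GPU_ALLOC_NO_SCATTER_GATHER mode, the u64 that the page allocator hands back is really a pointer to the tracking struct, cast through uintptr_t. The standalone snippet below shows only that cast pattern; it is an illustration, not part of the patch, and struct fake_alloc is invented for the example.

#include <stdint.h>
#include <stdio.h>

struct fake_alloc {
        uint64_t base;
        uint64_t length;
};

int main(void)
{
        struct fake_alloc a = { .base = 0x100000, .length = 0x10000 };

        /* The "address" returned to the caller is actually a pointer. */
        uint64_t handle = (uint64_t)(uintptr_t)&a;

        /* The free path casts it back to reach the chunk list and length. */
        struct fake_alloc *p = (struct fake_alloc *)(uintptr_t)handle;

        printf("base 0x%llx length 0x%llx\n",
               (unsigned long long)p->base, (unsigned long long)p->length);
        return 0;
}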
- */ -static u64 gk20a_page_alloc(struct gk20a_allocator *__a, u64 len) -{ - struct gk20a_page_allocator *a = page_allocator(__a); - struct gk20a_page_alloc *alloc = NULL; - u64 real_len; - - /* - * If we want contig pages we have to round up to a power of two. It's - * easier to do that here than in the buddy allocator. - */ - real_len = a->flags & GPU_ALLOC_FORCE_CONTIG ? - roundup_pow_of_two(len) : len; - - alloc_lock(__a); - if (a->flags & GPU_ALLOC_4K_VIDMEM_PAGES && - real_len <= (a->page_size / 2)) - alloc = __gk20a_alloc_slab(a, real_len); - else - alloc = __gk20a_alloc_pages(a, real_len); - - if (!alloc) { - alloc_unlock(__a); - return 0; - } - - __insert_page_alloc(a, alloc); - - a->nr_allocs++; - if (real_len > a->page_size / 2) - a->pages_alloced += alloc->length >> a->page_shift; - alloc_unlock(__a); - - if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER) - return alloc->base; - else - return (u64) (uintptr_t) alloc; -} - -/* - * Note: this will remove the gk20a_page_alloc struct from the RB tree - * if it's found. - */ -static void gk20a_page_free(struct gk20a_allocator *__a, u64 base) -{ - struct gk20a_page_allocator *a = page_allocator(__a); - struct gk20a_page_alloc *alloc; - - alloc_lock(__a); - - if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER) - alloc = __find_page_alloc(a, base); - else - alloc = __find_page_alloc(a, - ((struct gk20a_page_alloc *)(uintptr_t)base)->base); - - if (!alloc) { - palloc_dbg(a, "Hrm, found no alloc?\n"); - goto done; - } - - a->nr_frees++; - - palloc_dbg(a, "Free 0x%llx id=0x%010llx\n", - alloc->length, alloc->base); - - /* - * Frees *alloc. - */ - if (alloc->slab_page) { - __gk20a_free_slab(a, alloc); - } else { - a->pages_freed += (alloc->length >> a->page_shift); - __gk20a_free_pages(a, alloc, true); - } - -done: - alloc_unlock(__a); -} - -static struct gk20a_page_alloc *__gk20a_alloc_pages_fixed( - struct gk20a_page_allocator *a, u64 base, u64 length) -{ - struct gk20a_page_alloc *alloc; - struct page_alloc_chunk *c; - - alloc = kmem_cache_alloc(page_alloc_cache, GFP_KERNEL); - c = kmem_cache_alloc(page_alloc_chunk_cache, GFP_KERNEL); - if (!alloc || !c) - goto fail; - - alloc->base = gk20a_alloc_fixed(&a->source_allocator, base, length); - if (!alloc->base) { - WARN(1, "gk20a: failed to fixed alloc pages @ 0x%010llx", base); - goto fail; - } - - alloc->nr_chunks = 1; - alloc->length = length; - INIT_LIST_HEAD(&alloc->alloc_chunks); - - c->base = alloc->base; - c->length = length; - list_add(&c->list_entry, &alloc->alloc_chunks); - - return alloc; - -fail: - kfree(c); - kfree(alloc); - return ERR_PTR(-ENOMEM); -} - -static u64 gk20a_page_alloc_fixed(struct gk20a_allocator *__a, - u64 base, u64 len) -{ - struct gk20a_page_allocator *a = page_allocator(__a); - struct gk20a_page_alloc *alloc = NULL; - struct page_alloc_chunk *c; - u64 aligned_len, pages; - int i = 0; - - aligned_len = ALIGN(len, a->page_size); - pages = aligned_len >> a->page_shift; - - alloc_lock(__a); - - alloc = __gk20a_alloc_pages_fixed(a, base, aligned_len); - if (IS_ERR(alloc)) { - alloc_unlock(__a); - return 0; - } - - __insert_page_alloc(a, alloc); - alloc_unlock(__a); - - palloc_dbg(a, "Alloc [fixed] @ 0x%010llx + 0x%llx (%llu)\n", - alloc->base, aligned_len, pages); - list_for_each_entry(c, &alloc->alloc_chunks, list_entry) { - palloc_dbg(a, " Chunk %2d: 0x%010llx + 0x%llx\n", - i++, c->base, c->length); - } - - a->nr_fixed_allocs++; - a->pages_alloced += pages; - - if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER) - return alloc->base; - else - return (u64) (uintptr_t) alloc; -} - 
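The page-allocation path of this deleted file (__do_gk20a_alloc_pages() above) peels a request off in power-of-two chunks and halves the chunk size whenever the source buddy allocator refuses it, giving up once the chunk would drop below one page. The standalone program below replays that strategy with plain integers as an illustration; it is not part of the patch, and backing_alloc() and fls_ul() are invented stand-ins for the real source allocator and the kernel's __fls().

#include <stdio.h>

/* Pretend backing allocator: refuses chunks larger than `limit` pages. */
static int backing_alloc(unsigned long chunk_pages, unsigned long limit)
{
        return chunk_pages <= limit;
}

/* Index of the highest set bit, like the kernel's __fls(). */
static unsigned long fls_ul(unsigned long v)
{
        unsigned long bit = 0;

        while (v >>= 1)
                bit++;
        return bit;
}

int main(void)
{
        unsigned long pages = 13;   /* request: 13 pages                 */
        unsigned long limit = 4;    /* largest contiguous run available  */

        while (pages) {
                unsigned long chunk = 1UL << fls_ul(pages);

                while (chunk && !backing_alloc(chunk, limit))
                        chunk >>= 1;    /* halve and retry, like the kernel loop */
                if (!chunk) {
                        printf("out of memory\n");
                        return 1;
                }
                printf("chunk of %lu pages\n", chunk);
                pages -= chunk;
        }
        return 0;
}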
-static void gk20a_page_free_fixed(struct gk20a_allocator *__a, - u64 base, u64 len) -{ - struct gk20a_page_allocator *a = page_allocator(__a); - struct gk20a_page_alloc *alloc; - - alloc_lock(__a); - - if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER) { - alloc = __find_page_alloc(a, base); - if (!alloc) - goto done; - } else { - alloc = (struct gk20a_page_alloc *) (uintptr_t) base; - } - - palloc_dbg(a, "Free [fixed] 0x%010llx + 0x%llx\n", - alloc->base, alloc->length); - - a->nr_fixed_frees++; - a->pages_freed += (alloc->length >> a->page_shift); - - /* - * This works for the time being since the buddy allocator - * uses the same free function for both fixed and regular - * allocs. This would have to be updated if the underlying - * allocator were to change. - */ - __gk20a_free_pages(a, alloc, true); - -done: - alloc_unlock(__a); -} - -static void gk20a_page_allocator_destroy(struct gk20a_allocator *__a) -{ - struct gk20a_page_allocator *a = page_allocator(__a); - - alloc_lock(__a); - kfree(a); - __a->priv = NULL; - alloc_unlock(__a); -} - -static void gk20a_page_print_stats(struct gk20a_allocator *__a, - struct seq_file *s, int lock) -{ - struct gk20a_page_allocator *a = page_allocator(__a); - int i; - - if (lock) - alloc_lock(__a); - - __alloc_pstat(s, __a, "Page allocator:\n"); - __alloc_pstat(s, __a, " allocs %lld\n", a->nr_allocs); - __alloc_pstat(s, __a, " frees %lld\n", a->nr_frees); - __alloc_pstat(s, __a, " fixed_allocs %lld\n", a->nr_fixed_allocs); - __alloc_pstat(s, __a, " fixed_frees %lld\n", a->nr_fixed_frees); - __alloc_pstat(s, __a, " slab_allocs %lld\n", a->nr_slab_allocs); - __alloc_pstat(s, __a, " slab_frees %lld\n", a->nr_slab_frees); - __alloc_pstat(s, __a, " pages alloced %lld\n", a->pages_alloced); - __alloc_pstat(s, __a, " pages freed %lld\n", a->pages_freed); - __alloc_pstat(s, __a, "\n"); - - /* - * Slab info. - */ - if (a->flags & GPU_ALLOC_4K_VIDMEM_PAGES) { - __alloc_pstat(s, __a, "Slabs:\n"); - __alloc_pstat(s, __a, " size empty partial full\n"); - __alloc_pstat(s, __a, " ---- ----- ------- ----\n"); - - for (i = 0; i < a->nr_slabs; i++) { - struct page_alloc_slab *slab = &a->slabs[i]; - - __alloc_pstat(s, __a, " %-9u %-9d %-9u %u\n", - slab->slab_size, - slab->nr_empty, slab->nr_partial, - slab->nr_full); - } - __alloc_pstat(s, __a, "\n"); - } - - __alloc_pstat(s, __a, "Source alloc: %s\n", - a->source_allocator.name); - gk20a_alloc_print_stats(&a->source_allocator, s, lock); - - if (lock) - alloc_unlock(__a); -} - -static const struct gk20a_allocator_ops page_ops = { - .alloc = gk20a_page_alloc, - .free = gk20a_page_free, - - .alloc_fixed = gk20a_page_alloc_fixed, - .free_fixed = gk20a_page_free_fixed, - - .reserve_carveout = gk20a_page_reserve_co, - .release_carveout = gk20a_page_release_co, - - .base = gk20a_page_alloc_base, - .length = gk20a_page_alloc_length, - .end = gk20a_page_alloc_end, - .inited = gk20a_page_alloc_inited, - .space = gk20a_page_alloc_space, - - .fini = gk20a_page_allocator_destroy, - - .print_stats = gk20a_page_print_stats, -}; - -/* - * nr_slabs is computed as follows: divide page_size by 4096 to get number of - * 4k pages in page_size. Then take the base 2 log of that to get number of - * slabs. For 64k page_size that works on like: - * - * 1024*64 / 1024*4 = 16 - * ilog2(16) = 4 - * - * That gives buckets of 1, 2, 4, and 8 pages (i.e 4k, 8k, 16k, 32k). 
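The worked arithmetic in the comment above is easy to sanity-check in isolation. The short standalone program below prints the buckets for a 64K GPU page and shows which bucket a request maps to; it is an illustration only, not part of the patch, ilog2_u() is an invented stand-in for the kernel's ilog2(), and the bucket-selection line mirrors the slab_nr computation in __gk20a_alloc_slab() earlier in this file.

#include <stdio.h>

static unsigned int ilog2_u(unsigned long v)
{
        unsigned int bit = 0;

        while (v >>= 1)
                bit++;
        return bit;
}

int main(void)
{
        unsigned long page_size = 64 * 1024;
        unsigned long len = 5000;               /* example request */
        unsigned int nr_slabs = ilog2_u(page_size >> 12);
        unsigned int i, slab_nr;

        for (i = 0; i < nr_slabs; i++)
                printf("slab %u: %lu bytes\n", i, 4096UL << i);

        /* Round len up to 4k, divide by 4k, ilog2 of that picks the bucket. */
        slab_nr = ilog2_u(((len + 4095) & ~4095UL) >> 12);
        printf("len %lu -> slab %u (%lu bytes)\n",
               len, slab_nr, 4096UL << slab_nr);
        return 0;
}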
- */ -static int gk20a_page_alloc_init_slabs(struct gk20a_page_allocator *a) -{ - size_t nr_slabs = ilog2(a->page_size >> 12); - unsigned int i; - - a->slabs = kcalloc(nr_slabs, - sizeof(struct page_alloc_slab), - GFP_KERNEL); - if (!a->slabs) - return -ENOMEM; - a->nr_slabs = nr_slabs; - - for (i = 0; i < nr_slabs; i++) { - struct page_alloc_slab *slab = &a->slabs[i]; - - slab->slab_size = SZ_4K * (1 << i); - INIT_LIST_HEAD(&slab->empty); - INIT_LIST_HEAD(&slab->partial); - INIT_LIST_HEAD(&slab->full); - slab->nr_empty = 0; - slab->nr_partial = 0; - slab->nr_full = 0; - } - - return 0; -} - -int gk20a_page_allocator_init(struct gk20a *g, struct gk20a_allocator *__a, - const char *name, u64 base, u64 length, - u64 blk_size, u64 flags) -{ - struct gk20a_page_allocator *a; - char buddy_name[sizeof(__a->name)]; - int err; - - mutex_lock(&meta_data_cache_lock); - if (!page_alloc_cache) - page_alloc_cache = KMEM_CACHE(gk20a_page_alloc, 0); - if (!page_alloc_chunk_cache) - page_alloc_chunk_cache = KMEM_CACHE(page_alloc_chunk, 0); - if (!page_alloc_slab_page_cache) - page_alloc_slab_page_cache = - KMEM_CACHE(page_alloc_slab_page, 0); - mutex_unlock(&meta_data_cache_lock); - - if (!page_alloc_cache || !page_alloc_chunk_cache) - return -ENOMEM; - - if (blk_size < SZ_4K) - return -EINVAL; - - a = kzalloc(sizeof(struct gk20a_page_allocator), GFP_KERNEL); - if (!a) - return -ENOMEM; - - err = __gk20a_alloc_common_init(__a, name, a, false, &page_ops); - if (err) - goto fail; - - a->base = base; - a->length = length; - a->page_size = blk_size; - a->page_shift = __ffs(blk_size); - a->allocs = RB_ROOT; - a->owner = __a; - a->flags = flags; - - if (flags & GPU_ALLOC_4K_VIDMEM_PAGES && blk_size > SZ_4K) { - err = gk20a_page_alloc_init_slabs(a); - if (err) - goto fail; - } - - snprintf(buddy_name, sizeof(buddy_name), "%s-src", name); - - err = gk20a_buddy_allocator_init(g, &a->source_allocator, buddy_name, - base, length, blk_size, 0); - if (err) - goto fail; - - gk20a_init_alloc_debug(g, __a); - palloc_dbg(a, "New allocator: type page\n"); - palloc_dbg(a, " base 0x%llx\n", a->base); - palloc_dbg(a, " size 0x%llx\n", a->length); - palloc_dbg(a, " page_size 0x%llx\n", a->page_size); - palloc_dbg(a, " flags 0x%llx\n", a->flags); - palloc_dbg(a, " slabs: %d\n", a->nr_slabs); - - return 0; - -fail: - kfree(a); - return err; -} diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 39562ec1..2ee2dd43 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -3400,7 +3400,7 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr) gr->ctx_vars.local_golden_image = NULL; if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map) - nvgpu_free(gr->ctx_vars.hwpm_ctxsw_buffer_offset_map); + nvgpu_kfree(gr->ctx_vars.hwpm_ctxsw_buffer_offset_map); gr->ctx_vars.hwpm_ctxsw_buffer_offset_map = NULL; gk20a_comptag_allocator_destroy(&gr->comp_tags); @@ -7998,7 +7998,7 @@ static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g) hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2; map_size = hwpm_ctxsw_reg_count_max * sizeof(*map); - map = nvgpu_alloc(map_size, true); + map = nvgpu_kalloc(map_size, true); if (!map) return -ENOMEM; @@ -8088,7 +8088,7 @@ static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g) return 0; cleanup: gk20a_err(dev_from_gk20a(g), "Failed to create HWPM buffer offset map"); - nvgpu_free(map); + nvgpu_kfree(map); return -EINVAL; } diff --git a/drivers/gpu/nvgpu/gk20a/lockless_allocator_priv.h 
b/drivers/gpu/nvgpu/gk20a/lockless_allocator_priv.h deleted file mode 100644 index f9b03e0e..00000000 --- a/drivers/gpu/nvgpu/gk20a/lockless_allocator_priv.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -/* - * Basics: - * - * - Lockless memory allocator for fixed-size structures, whose - * size is defined up front at init time. - * - Memory footprint scales linearly w/ the number of structures in - * the pool. It is ~= sizeof(int) * N. - * - Memory is pre-allocated by the client. The allocator itself - * only computes the addresses for allocations. - * - Limit of MAX_INT nodes that the allocator can be responsible for. - * - * Implementation details: - * - * The allocator maintains a single list of free nodes. We allocate & - * free nodes from the head of the list. We rely on the cmpxchg() operator - * to maintain atomicity on the head. - * - * So, both allocs & frees are O(1)!! - * - * -- Definitions -- - * Block Size - size of a single structure that this allocator will - * allocate. - * Node - one of the elements of size blk_size in the - * client-allocated buffer. - * Node Index - zero-based index of a node in the client-allocated - * contiguous buffer. - * - * -- Initial State -- - * We maintain the following to track the state of the free list: - * - * 1) A "head" index to track the index of the first free node in the list - * 2) A "next" array to track the index of the next free node in the list - * for every node. So next[head], will give the index to the 2nd free - * element in the list. - * - * So, to begin with, the free list consists of all node indices, and each - * position in the next array contains index N + 1: - * - * head = 0 - * next = [1, 2, 3, 4, -1] : Example for a user-allocated buffer of 5 nodes - * free_list = 0->1->2->3->4->-1 - * - * -- Allocations -- - * 1) Read the current head (aka acq_head) - * 2) Read next[acq_head], to get the 2nd free element (aka new_head) - * 3) cmp_xchg(&head, acq_head, new_head) - * 4) If it succeeds, compute the address of the node, based on - * base address, blk_size, & acq_head. - * - * head = 1; - * next = [1, 2, 3, 4, -1] : Example after allocating Node #0 - * free_list = 1->2->3->4->-1 - * - * head = 2; - * next = [1, 2, 3, 4, -1] : Example after allocating Node #1 - * free_list = 2->3->4->-1 - * - * -- Frees -- - * 1) Based on the address to be freed, calculate the index of the node - * being freed (cur_idx) - * 2) Read the current head (old_head) - * 3) So the freed node is going to go at the head of the list, and we - * want to put the old_head after it. 
So next[cur_idx] = old_head - * 4) cmpxchg(head, old_head, cur_idx) - * - * head = 0 - * next = [2, 2, 3, 4, -1] - * free_list = 0->2->3->4->-1 : Example after freeing Node #0 - * - * head = 1 - * next = [2, 0, 3, 4, -1] - * free_list = 1->0->2->3->4->-1 : Example after freeing Node #1 - */ - -#ifndef LOCKLESS_ALLOCATOR_PRIV_H -#define LOCKLESS_ALLOCATOR_PRIV_H - -struct gk20a_allocator; - -struct gk20a_lockless_allocator { - struct gk20a_allocator *owner; - - u64 base; /* Base address of the space. */ - u64 length; /* Length of the space. */ - u64 blk_size; /* Size of the structure being allocated */ - int nr_nodes; /* Number of nodes available for allocation */ - - int *next; /* An array holding the next indices per node */ - int head; /* Current node at the top of the stack */ - - u64 flags; - - bool inited; - - /* Statistics */ - atomic_t nr_allocs; -}; - -static inline struct gk20a_lockless_allocator *lockless_allocator( - struct gk20a_allocator *a) -{ - return (struct gk20a_lockless_allocator *)(a)->priv; -} - -#endif diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 2e338fef..d594a5a4 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -31,9 +31,9 @@ #include #include -#include - #include +#include +#include #include "gk20a.h" #include "mm_gk20a.h" @@ -74,7 +74,7 @@ is_vidmem_page_alloc(u64 addr) return !!(addr & 1ULL); } -static inline struct gk20a_page_alloc * +static inline struct nvgpu_page_alloc * get_vidmem_page_alloc(struct scatterlist *sgl) { u64 addr; @@ -86,7 +86,7 @@ get_vidmem_page_alloc(struct scatterlist *sgl) else WARN_ON(1); - return (struct gk20a_page_alloc *)(uintptr_t)addr; + return (struct nvgpu_page_alloc *)(uintptr_t)addr; } int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem) @@ -176,7 +176,7 @@ typedef void (*pramin_access_batch_fn)(struct gk20a *g, u32 start, u32 words, static inline void pramin_access_batched(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 size, pramin_access_batch_fn loop, u32 **arg) { - struct gk20a_page_alloc *alloc = NULL; + struct nvgpu_page_alloc *alloc = NULL; struct page_alloc_chunk *chunk = NULL; u32 byteoff, start_reg, until_end, n; @@ -797,8 +797,8 @@ void gk20a_remove_vm(struct vm_gk20a *vm, struct mem_desc *inst_block) static void gk20a_vidmem_destroy(struct gk20a *g) { #if defined(CONFIG_GK20A_VIDMEM) - if (gk20a_alloc_initialized(&g->mm.vidmem.allocator)) - gk20a_alloc_destroy(&g->mm.vidmem.allocator); + if (nvgpu_alloc_initialized(&g->mm.vidmem.allocator)) + nvgpu_alloc_destroy(&g->mm.vidmem.allocator); #endif } @@ -928,8 +928,8 @@ static int gk20a_init_vidmem(struct mm_gk20a *mm) u64 default_page_size = SZ_64K; int err; - static struct gk20a_alloc_carveout wpr_co = - GK20A_CARVEOUT("wpr-region", 0, SZ_16M); + static struct nvgpu_alloc_carveout wpr_co = + NVGPU_CARVEOUT("wpr-region", 0, SZ_16M); if (!size) return 0; @@ -944,12 +944,12 @@ static int gk20a_init_vidmem(struct mm_gk20a *mm) * initialization requires vidmem but we want to use the CE to zero * out vidmem before allocating it... 
*/ - err = gk20a_page_allocator_init(g, &g->mm.vidmem.bootstrap_allocator, + err = nvgpu_page_allocator_init(g, &g->mm.vidmem.bootstrap_allocator, "vidmem-bootstrap", bootstrap_base, bootstrap_size, SZ_4K, 0); - err = gk20a_page_allocator_init(g, &g->mm.vidmem.allocator, + err = nvgpu_page_allocator_init(g, &g->mm.vidmem.allocator, "vidmem", base, size - base, default_page_size, @@ -961,7 +961,7 @@ static int gk20a_init_vidmem(struct mm_gk20a *mm) } /* Reserve bootstrap region in vidmem allocator */ - gk20a_alloc_reserve_carveout(&g->mm.vidmem.allocator, &wpr_co); + nvgpu_alloc_reserve_carveout(&g->mm.vidmem.allocator, &wpr_co); mm->vidmem.base = base; mm->vidmem.size = size - base; @@ -1482,7 +1482,7 @@ int gk20a_vm_get_buffers(struct vm_gk20a *vm, mutex_lock(&vm->update_gmmu_lock); - buffer_list = nvgpu_alloc(sizeof(*buffer_list) * + buffer_list = nvgpu_kalloc(sizeof(*buffer_list) * vm->num_user_mapped_buffers, true); if (!buffer_list) { mutex_unlock(&vm->update_gmmu_lock); @@ -1567,7 +1567,7 @@ void gk20a_vm_put_buffers(struct vm_gk20a *vm, gk20a_vm_mapping_batch_finish_locked(vm, &batch); mutex_unlock(&vm->update_gmmu_lock); - nvgpu_free(mapped_buffers); + nvgpu_kfree(mapped_buffers); } static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset, @@ -1623,7 +1623,7 @@ u64 gk20a_vm_alloc_va(struct vm_gk20a *vm, enum gmmu_pgsz_gk20a gmmu_pgsz_idx) { - struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx]; + struct nvgpu_allocator *vma = &vm->vma[gmmu_pgsz_idx]; u64 offset; u64 gmmu_page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx]; @@ -1645,7 +1645,7 @@ u64 gk20a_vm_alloc_va(struct vm_gk20a *vm, gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size, vm->gmmu_page_sizes[gmmu_pgsz_idx]>>10); - offset = gk20a_alloc(vma, size); + offset = nvgpu_alloc(vma, size); if (!offset) { gk20a_err(dev_from_vm(vm), "%s oom: sz=0x%llx", vma->name, size); @@ -1660,11 +1660,11 @@ int gk20a_vm_free_va(struct vm_gk20a *vm, u64 offset, u64 size, enum gmmu_pgsz_gk20a pgsz_idx) { - struct gk20a_allocator *vma = &vm->vma[pgsz_idx]; + struct nvgpu_allocator *vma = &vm->vma[pgsz_idx]; gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx", vma->name, offset, size); - gk20a_free(vma, offset); + nvgpu_free(vma, offset); return 0; } @@ -2302,15 +2302,15 @@ err_kfree: int gk20a_vidmem_get_space(struct gk20a *g, u64 *space) { #if defined(CONFIG_GK20A_VIDMEM) - struct gk20a_allocator *allocator = &g->mm.vidmem.allocator; + struct nvgpu_allocator *allocator = &g->mm.vidmem.allocator; gk20a_dbg_fn(""); - if (!gk20a_alloc_initialized(allocator)) + if (!nvgpu_alloc_initialized(allocator)) return -ENOSYS; mutex_lock(&g->mm.vidmem.clear_list_mutex); - *space = gk20a_alloc_space(allocator) + + *space = nvgpu_alloc_space(allocator) + atomic64_read(&g->mm.vidmem.bytes_pending); mutex_unlock(&g->mm.vidmem.clear_list_mutex); return 0; @@ -2359,7 +2359,7 @@ static u64 gk20a_mm_get_align(struct gk20a *g, struct scatterlist *sgl, u64 buf_addr; if (aperture == APERTURE_VIDMEM) { - struct gk20a_page_alloc *alloc = get_vidmem_page_alloc(sgl); + struct nvgpu_page_alloc *alloc = get_vidmem_page_alloc(sgl); struct page_alloc_chunk *chunk = NULL; list_for_each_entry(chunk, &alloc->alloc_chunks, list_entry) { @@ -3068,7 +3068,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct mem_desc *mem) { struct gk20a_fence *gk20a_fence_out = NULL; struct gk20a_fence *gk20a_last_fence = NULL; - struct gk20a_page_alloc *alloc = NULL; + struct nvgpu_page_alloc *alloc = NULL; struct page_alloc_chunk *chunk = NULL; int err = 0; @@ -3134,15 +3134,15 @@ 
int gk20a_gmmu_alloc_attr_vid(struct gk20a *g, enum dma_attr attr, } #if defined(CONFIG_GK20A_VIDMEM) -static u64 __gk20a_gmmu_alloc(struct gk20a_allocator *allocator, dma_addr_t at, +static u64 __gk20a_gmmu_alloc(struct nvgpu_allocator *allocator, dma_addr_t at, size_t size) { u64 addr = 0; if (at) - addr = gk20a_alloc_fixed(allocator, at, size); + addr = nvgpu_alloc_fixed(allocator, at, size); else - addr = gk20a_alloc(allocator, size); + addr = nvgpu_alloc(allocator, size); return addr; } @@ -3154,14 +3154,14 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr, #if defined(CONFIG_GK20A_VIDMEM) u64 addr; int err; - struct gk20a_allocator *vidmem_alloc = g->mm.vidmem.cleared ? + struct nvgpu_allocator *vidmem_alloc = g->mm.vidmem.cleared ? &g->mm.vidmem.allocator : &g->mm.vidmem.bootstrap_allocator; int before_pending; gk20a_dbg_fn(""); - if (!gk20a_alloc_initialized(&g->mm.vidmem.allocator)) + if (!nvgpu_alloc_initialized(&g->mm.vidmem.allocator)) return -ENOSYS; /* we don't support dma attributes here, except that kernel mappings @@ -3214,7 +3214,7 @@ int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr, fail_kfree: kfree(mem->sgt); fail_physfree: - gk20a_free(&g->mm.vidmem.allocator, addr); + nvgpu_free(&g->mm.vidmem.allocator, addr); return err; #else return -ENOSYS; @@ -3241,7 +3241,7 @@ static void gk20a_gmmu_free_attr_vid(struct gk20a *g, enum dma_attr attr, } } else { gk20a_memset(g, mem, 0, 0, mem->size); - gk20a_free(mem->allocator, + nvgpu_free(mem->allocator, (u64)get_vidmem_page_alloc(mem->sgt->sgl)); gk20a_free_sgtable(&mem->sgt); @@ -3276,7 +3276,7 @@ void gk20a_gmmu_free(struct gk20a *g, struct mem_desc *mem) u64 gk20a_mem_get_base_addr(struct gk20a *g, struct mem_desc *mem, u32 flags) { - struct gk20a_page_alloc *alloc; + struct nvgpu_page_alloc *alloc; u64 addr; if (mem->aperture == APERTURE_VIDMEM) { @@ -3317,7 +3317,7 @@ static void gk20a_vidmem_clear_mem_worker(struct work_struct *work) while ((mem = get_pending_mem_desc(mm)) != NULL) { gk20a_gmmu_clear_vidmem_mem(g, mem); - gk20a_free(mem->allocator, + nvgpu_free(mem->allocator, (u64)get_vidmem_page_alloc(mem->sgt->sgl)); gk20a_free_sgtable(&mem->sgt); @@ -3905,7 +3905,7 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm, u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; int err; struct scatterlist *sgl = NULL; - struct gk20a_page_alloc *alloc = NULL; + struct nvgpu_page_alloc *alloc = NULL; struct page_alloc_chunk *chunk = NULL; u64 length; @@ -4251,12 +4251,12 @@ static int gk20a_init_sema_pool(struct vm_gk20a *vm) * * !!! TODO: cleanup. 
*/ - sema_sea->gpu_va = gk20a_alloc_fixed(&vm->vma[gmmu_page_size_kernel], + sema_sea->gpu_va = nvgpu_alloc_fixed(&vm->vma[gmmu_page_size_kernel], vm->va_limit - mm->channel.kernel_size, 512 * PAGE_SIZE); if (!sema_sea->gpu_va) { - gk20a_free(&vm->vma[gmmu_page_size_small], sema_sea->gpu_va); + nvgpu_free(&vm->vma[gmmu_page_size_small], sema_sea->gpu_va); gk20a_vm_put(vm); return -ENOMEM; } @@ -4264,7 +4264,7 @@ static int gk20a_init_sema_pool(struct vm_gk20a *vm) err = gk20a_semaphore_pool_map(vm->sema_pool, vm); if (err) { gk20a_semaphore_pool_unmap(vm->sema_pool, vm); - gk20a_free(&vm->vma[gmmu_page_size_small], + nvgpu_free(&vm->vma[gmmu_page_size_small], vm->sema_pool->gpu_va); gk20a_vm_put(vm); } @@ -4387,7 +4387,7 @@ int gk20a_init_vm(struct mm_gk20a *mm, snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-fixed", name); - err = __gk20a_buddy_allocator_init(g, &vm->fixed, + err = __nvgpu_buddy_allocator_init(g, &vm->fixed, vm, alloc_name, small_vma_start, g->separate_fixed_allocs, @@ -4404,7 +4404,7 @@ int gk20a_init_vm(struct mm_gk20a *mm, if (small_vma_start < small_vma_limit) { snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name, vm->gmmu_page_sizes[gmmu_page_size_small] >> 10); - err = __gk20a_buddy_allocator_init( + err = __nvgpu_buddy_allocator_init( g, &vm->vma[gmmu_page_size_small], vm, alloc_name, @@ -4420,7 +4420,7 @@ int gk20a_init_vm(struct mm_gk20a *mm, if (large_vma_start < large_vma_limit) { snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name, vm->gmmu_page_sizes[gmmu_page_size_big] >> 10); - err = __gk20a_buddy_allocator_init( + err = __nvgpu_buddy_allocator_init( g, &vm->vma[gmmu_page_size_big], vm, alloc_name, @@ -4438,7 +4438,7 @@ int gk20a_init_vm(struct mm_gk20a *mm, /* * kernel reserved VMA is at the end of the aperture */ - err = __gk20a_buddy_allocator_init(g, &vm->vma[gmmu_page_size_kernel], + err = __nvgpu_buddy_allocator_init(g, &vm->vma[gmmu_page_size_kernel], vm, alloc_name, kernel_vma_start, kernel_vma_limit - kernel_vma_start, @@ -4469,10 +4469,10 @@ int gk20a_init_vm(struct mm_gk20a *mm, clean_up_big_allocator: if (large_vma_start < large_vma_limit) - gk20a_alloc_destroy(&vm->vma[gmmu_page_size_big]); + nvgpu_alloc_destroy(&vm->vma[gmmu_page_size_big]); clean_up_small_allocator: if (small_vma_start < small_vma_limit) - gk20a_alloc_destroy(&vm->vma[gmmu_page_size_small]); + nvgpu_alloc_destroy(&vm->vma[gmmu_page_size_small]); clean_up_ptes: free_gmmu_pages(vm, &vm->pdb); clean_up_pdes: @@ -4547,7 +4547,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, { int err = -ENOMEM; int pgsz_idx = gmmu_page_size_small; - struct gk20a_allocator *vma; + struct nvgpu_allocator *vma; struct vm_gk20a *vm = as_share->vm; struct gk20a *g = vm->mm->g; struct vm_reserved_va_node *va_node; @@ -4579,13 +4579,13 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, vma = &vm->vma[pgsz_idx]; if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) { - if (gk20a_alloc_initialized(&vm->fixed)) + if (nvgpu_alloc_initialized(&vm->fixed)) vma = &vm->fixed; - vaddr_start = gk20a_alloc_fixed(vma, args->o_a.offset, + vaddr_start = nvgpu_alloc_fixed(vma, args->o_a.offset, (u64)args->pages * (u64)args->page_size); } else { - vaddr_start = gk20a_alloc(vma, + vaddr_start = nvgpu_alloc(vma, (u64)args->pages * (u64)args->page_size); } @@ -4621,7 +4621,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, APERTURE_INVALID); if (!map_offset) { mutex_unlock(&vm->update_gmmu_lock); - gk20a_free(vma, vaddr_start); + nvgpu_free(vma, 
vaddr_start); kfree(va_node); goto clean_up; } @@ -4644,7 +4644,7 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share, { int err = -ENOMEM; int pgsz_idx; - struct gk20a_allocator *vma; + struct nvgpu_allocator *vma; struct vm_gk20a *vm = as_share->vm; struct vm_reserved_va_node *va_node; struct gk20a *g = gk20a_from_vm(vm); @@ -4656,11 +4656,11 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share, pgsz_idx = __nv_gmmu_va_is_big_page_region(vm, args->offset) ? gmmu_page_size_big : gmmu_page_size_small; - if (gk20a_alloc_initialized(&vm->fixed)) + if (nvgpu_alloc_initialized(&vm->fixed)) vma = &vm->fixed; else vma = &vm->vma[pgsz_idx]; - gk20a_free(vma, args->offset); + nvgpu_free(vma, args->offset); mutex_lock(&vm->update_gmmu_lock); va_node = addr_to_reservation(vm, args->offset); @@ -4844,13 +4844,13 @@ int gk20a_vm_unmap_buffer(struct vm_gk20a *vm, u64 offset, void gk20a_deinit_vm(struct vm_gk20a *vm) { - gk20a_alloc_destroy(&vm->vma[gmmu_page_size_kernel]); - if (gk20a_alloc_initialized(&vm->vma[gmmu_page_size_big])) - gk20a_alloc_destroy(&vm->vma[gmmu_page_size_big]); - if (gk20a_alloc_initialized(&vm->vma[gmmu_page_size_small])) - gk20a_alloc_destroy(&vm->vma[gmmu_page_size_small]); - if (gk20a_alloc_initialized(&vm->fixed)) - gk20a_alloc_destroy(&vm->fixed); + nvgpu_alloc_destroy(&vm->vma[gmmu_page_size_kernel]); + if (nvgpu_alloc_initialized(&vm->vma[gmmu_page_size_big])) + nvgpu_alloc_destroy(&vm->vma[gmmu_page_size_big]); + if (nvgpu_alloc_initialized(&vm->vma[gmmu_page_size_small])) + nvgpu_alloc_destroy(&vm->vma[gmmu_page_size_small]); + if (nvgpu_alloc_initialized(&vm->fixed)) + nvgpu_alloc_destroy(&vm->fixed); gk20a_vm_free_entries(vm, &vm->pdb, 0); } diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index d32e121a..f58b5df5 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -27,7 +27,8 @@ #include #include #include -#include "gk20a_allocator.h" + +#include #ifdef CONFIG_ARM64 #define outer_flush_range(a, b) @@ -70,7 +71,7 @@ struct mem_desc { u64 gpu_va; bool fixed; /* vidmem only */ bool user_mem; /* vidmem only */ - struct gk20a_allocator *allocator; /* vidmem only */ + struct nvgpu_allocator *allocator; /* vidmem only */ struct list_head clear_list_entry; /* vidmem only */ bool skip_wmb; }; @@ -295,10 +296,10 @@ struct vm_gk20a { struct gk20a_mm_entry pdb; - struct gk20a_allocator vma[gmmu_nr_page_sizes]; + struct nvgpu_allocator vma[gmmu_nr_page_sizes]; /* If necessary, split fixed from non-fixed. 
*/ - struct gk20a_allocator fixed; + struct nvgpu_allocator fixed; struct rb_root mapped_buffers; @@ -421,8 +422,8 @@ struct mm_gk20a { size_t bootstrap_size; u64 bootstrap_base; - struct gk20a_allocator allocator; - struct gk20a_allocator bootstrap_allocator; + struct nvgpu_allocator allocator; + struct nvgpu_allocator bootstrap_allocator; u32 ce_ctx_id; volatile bool cleared; @@ -470,13 +471,13 @@ static inline u64 __nv_gmmu_va_small_page_limit(void) static inline int __nv_gmmu_va_is_big_page_region(struct vm_gk20a *vm, u64 addr) { - struct gk20a_allocator *a = &vm->vma[gmmu_page_size_big]; + struct nvgpu_allocator *a = &vm->vma[gmmu_page_size_big]; if (!vm->big_pages) return 0; - return addr >= gk20a_alloc_base(a) && - addr < gk20a_alloc_base(a) + gk20a_alloc_length(a); + return addr >= nvgpu_alloc_base(a) && + addr < nvgpu_alloc_base(a) + nvgpu_alloc_length(a); } /* @@ -825,7 +826,7 @@ void gk20a_remove_vm(struct vm_gk20a *vm, struct mem_desc *inst_block); extern const struct gk20a_mmu_level gk20a_mm_levels_64k[]; extern const struct gk20a_mmu_level gk20a_mm_levels_128k[]; -static inline void *nvgpu_alloc(size_t size, bool clear) +static inline void *nvgpu_kalloc(size_t size, bool clear) { void *p; @@ -844,7 +845,7 @@ static inline void *nvgpu_alloc(size_t size, bool clear) return p; } -static inline void nvgpu_free(void *p) +static inline void nvgpu_kfree(void *p) { if (virt_addr_valid(p)) kfree(p); diff --git a/drivers/gpu/nvgpu/gk20a/page_allocator_priv.h b/drivers/gpu/nvgpu/gk20a/page_allocator_priv.h deleted file mode 100644 index 7d7f43c2..00000000 --- a/drivers/gpu/nvgpu/gk20a/page_allocator_priv.h +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifndef PAGE_ALLOCATOR_PRIV_H -#define PAGE_ALLOCATOR_PRIV_H - -#include -#include - -#include "gk20a_allocator.h" - -struct gk20a_allocator; - -/* - * This allocator implements the ability to do SLAB style allocation since the - * GPU has two page sizes available - 4k and 64k/128k. When the default - * granularity is the large page size (64k/128k) small allocations become very - * space inefficient. This is most notable in PDE and PTE blocks which are 4k - * in size. - * - * Thus we need the ability to suballocate in 64k pages. The way we do this for - * the GPU is as follows. We have several buckets for sub-64K allocations: - * - * B0 - 4k - * B1 - 8k - * B3 - 16k - * B4 - 32k - * B5 - 64k (for when large pages are 128k) - * - * When an allocation comes in for less than the large page size (from now on - * assumed to be 64k) the allocation is satisfied by one of the buckets. 
- */ -struct page_alloc_slab { - struct list_head empty; - struct list_head partial; - struct list_head full; - - int nr_empty; - int nr_partial; - int nr_full; - - u32 slab_size; -}; - -enum slab_page_state { - SP_EMPTY, - SP_PARTIAL, - SP_FULL, - SP_NONE -}; - -struct page_alloc_slab_page { - unsigned long bitmap; - u64 page_addr; - u32 slab_size; - - u32 nr_objects; - u32 nr_objects_alloced; - - enum slab_page_state state; - - struct page_alloc_slab *owner; - struct list_head list_entry; -}; - -struct page_alloc_chunk { - struct list_head list_entry; - - u64 base; - u64 length; -}; - -/* - * Struct to handle internal management of page allocation. It holds a list - * of the chunks of pages that make up the overall allocation - much like a - * scatter gather table. - */ -struct gk20a_page_alloc { - struct list_head alloc_chunks; - - int nr_chunks; - u64 length; - - /* - * Only useful for the RB tree - since the alloc may have discontiguous - * pages the base is essentially irrelevant except for the fact that it - * is guarenteed to be unique. - */ - u64 base; - - struct rb_node tree_entry; - - /* - * Set if this is a slab alloc. Points back to the slab page that owns - * this particular allocation. nr_chunks will always be 1 if this is - * set. - */ - struct page_alloc_slab_page *slab_page; -}; - -struct gk20a_page_allocator { - struct gk20a_allocator *owner; /* Owner of this allocator. */ - - /* - * Use a buddy allocator to manage the allocation of the underlying - * pages. This lets us abstract the discontiguous allocation handling - * out of the annoyingly complicated buddy allocator. - */ - struct gk20a_allocator source_allocator; - - /* - * Page params. - */ - u64 base; - u64 length; - u64 page_size; - u32 page_shift; - - struct rb_root allocs; /* Outstanding allocations. */ - - struct page_alloc_slab *slabs; - int nr_slabs; - - u64 flags; - - /* - * Stat tracking. 
- */ - u64 nr_allocs; - u64 nr_frees; - u64 nr_fixed_allocs; - u64 nr_fixed_frees; - u64 nr_slab_allocs; - u64 nr_slab_frees; - u64 pages_alloced; - u64 pages_freed; -}; - -static inline struct gk20a_page_allocator *page_allocator( - struct gk20a_allocator *a) -{ - return (struct gk20a_page_allocator *)(a)->priv; -} - -static inline struct gk20a_allocator *palloc_owner( - struct gk20a_page_allocator *a) -{ - return a->owner; -} - -#endif diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c index e221be11..56ebda1a 100644 --- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c @@ -2896,8 +2896,8 @@ void gk20a_remove_pmu_support(struct pmu_gk20a *pmu) { gk20a_dbg_fn(""); - if (gk20a_alloc_initialized(&pmu->dmem)) - gk20a_alloc_destroy(&pmu->dmem); + if (nvgpu_alloc_initialized(&pmu->dmem)) + nvgpu_alloc_destroy(&pmu->dmem); release_firmware(pmu->fw); } @@ -3607,7 +3607,7 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu) gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data); if (!pmu->sample_buffer) - pmu->sample_buffer = gk20a_alloc(&pmu->dmem, + pmu->sample_buffer = nvgpu_alloc(&pmu->dmem, 2 * sizeof(u16)); if (!pmu->sample_buffer) { gk20a_err(dev_from_gk20a(g), @@ -3708,7 +3708,7 @@ static int pmu_process_init_msg(struct pmu_gk20a *pmu, for (i = 0; i < PMU_QUEUE_COUNT; i++) pmu_queue_init(pmu, i, init); - if (!gk20a_alloc_initialized(&pmu->dmem)) { + if (!nvgpu_alloc_initialized(&pmu->dmem)) { /* Align start and end addresses */ u32 start = ALIGN(pv->get_pmu_init_msg_pmu_sw_mg_off(init), PMU_DMEM_ALLOC_ALIGNMENT); @@ -3716,9 +3716,9 @@ static int pmu_process_init_msg(struct pmu_gk20a *pmu, pv->get_pmu_init_msg_pmu_sw_mg_size(init)) & ~(PMU_DMEM_ALLOC_ALIGNMENT - 1); u32 size = end - start; - gk20a_bitmap_allocator_init(g, &pmu->dmem, "gk20a_pmu_dmem", - start, size, - PMU_DMEM_ALLOC_ALIGNMENT, 0); + nvgpu_bitmap_allocator_init(g, &pmu->dmem, "gk20a_pmu_dmem", + start, size, + PMU_DMEM_ALLOC_ALIGNMENT, 0); } pmu->pmu_ready = true; @@ -3855,12 +3855,12 @@ static int pmu_response_handle(struct pmu_gk20a *pmu, seq->callback = NULL; if (pv->pmu_allocation_get_dmem_size(pmu, pv->get_pmu_seq_in_a_ptr(seq)) != 0) - gk20a_free(&pmu->dmem, + nvgpu_free(&pmu->dmem, pv->pmu_allocation_get_dmem_offset(pmu, pv->get_pmu_seq_in_a_ptr(seq))); if (pv->pmu_allocation_get_dmem_size(pmu, pv->get_pmu_seq_out_a_ptr(seq)) != 0) - gk20a_free(&pmu->dmem, + nvgpu_free(&pmu->dmem, pv->pmu_allocation_get_dmem_offset(pmu, pv->get_pmu_seq_out_a_ptr(seq))); @@ -4601,7 +4601,7 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd, (u16)max(payload->in.size, payload->out.size)); *(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)) = - gk20a_alloc(&pmu->dmem, + nvgpu_alloc(&pmu->dmem, pv->pmu_allocation_get_dmem_size(pmu, in)); if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in))) goto clean_up; @@ -4644,7 +4644,7 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd, if (payload->in.buf != payload->out.buf) { *(pv->pmu_allocation_get_dmem_offset_addr(pmu, out)) = - gk20a_alloc(&pmu->dmem, + nvgpu_alloc(&pmu->dmem, pv->pmu_allocation_get_dmem_size(pmu, out)); if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu, out))) @@ -4694,10 +4694,10 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd, clean_up: gk20a_dbg_fn("fail"); if (in) - gk20a_free(&pmu->dmem, + nvgpu_free(&pmu->dmem, pv->pmu_allocation_get_dmem_offset(pmu, in)); if (out) - gk20a_free(&pmu->dmem, + nvgpu_free(&pmu->dmem, pv->pmu_allocation_get_dmem_offset(pmu, out)); 
pmu_seq_release(pmu, seq); diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h index cf4f3b52..32e2ef54 100644 --- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h @@ -709,7 +709,7 @@ struct pmu_gk20a { struct mutex pmu_copy_lock; struct mutex pmu_seq_lock; - struct gk20a_allocator dmem; + struct nvgpu_allocator dmem; u32 *ucode_image; bool pmu_ready; diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h index cf724fdb..8e09fcfc 100644 --- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h @@ -18,10 +18,11 @@ #include #include +#include + #include "gk20a.h" #include "mm_gk20a.h" #include "channel_gk20a.h" -#include "gk20a_allocator.h" #define gpu_sema_dbg(fmt, args...) \ gk20a_dbg(gpu_dbg_sema, fmt, ##args) diff --git a/drivers/gpu/nvgpu/include/nvgpu/allocator.h b/drivers/gpu/nvgpu/include/nvgpu/allocator.h new file mode 100644 index 00000000..dee9b562 --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/allocator.h @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef NVGPU_ALLOCATOR_H +#define NVGPU_ALLOCATOR_H + +#include +#include +#include + +/* #define ALLOCATOR_DEBUG */ + +struct nvgpu_allocator; +struct nvgpu_alloc_carveout; +struct vm_gk20a; +struct gk20a; + +/* + * Operations for an allocator to implement. + */ +struct nvgpu_allocator_ops { + u64 (*alloc)(struct nvgpu_allocator *allocator, u64 len); + void (*free)(struct nvgpu_allocator *allocator, u64 addr); + + /* + * Special interface to allocate a memory region with a specific + * starting address. Yikes. Note: if free() works for freeing both + * regular and fixed allocations then free_fixed() does not need to + * be implemented. This behavior exists for legacy reasons and should + * not be propagated to new allocators. + */ + u64 (*alloc_fixed)(struct nvgpu_allocator *allocator, + u64 base, u64 len); + void (*free_fixed)(struct nvgpu_allocator *allocator, + u64 base, u64 len); + + /* + * Allow allocators to reserve space for carveouts. + */ + int (*reserve_carveout)(struct nvgpu_allocator *allocator, + struct nvgpu_alloc_carveout *co); + void (*release_carveout)(struct nvgpu_allocator *allocator, + struct nvgpu_alloc_carveout *co); + + /* + * Returns info about the allocator. + */ + u64 (*base)(struct nvgpu_allocator *allocator); + u64 (*length)(struct nvgpu_allocator *allocator); + u64 (*end)(struct nvgpu_allocator *allocator); + int (*inited)(struct nvgpu_allocator *allocator); + u64 (*space)(struct nvgpu_allocator *allocator); + + /* Destructor. */ + void (*fini)(struct nvgpu_allocator *allocator); + + /* Debugging. 
*/ + void (*print_stats)(struct nvgpu_allocator *allocator, + struct seq_file *s, int lock); +}; + +struct nvgpu_allocator { + char name[32]; + struct mutex lock; + + void *priv; + const struct nvgpu_allocator_ops *ops; + + struct dentry *debugfs_entry; + bool debug; /* Control for debug msgs. */ +}; + +struct nvgpu_alloc_carveout { + const char *name; + u64 base; + u64 length; + + struct nvgpu_allocator *allocator; + + /* + * For usage by the allocator implementation. + */ + struct list_head co_entry; +}; + +#define NVGPU_CARVEOUT(__name, __base, __length) \ + { \ + .name = (__name), \ + .base = (__base), \ + .length = (__length) \ + } + +/* + * These are the available allocator flags. + * + * GPU_ALLOC_GVA_SPACE + * + * This flag makes sense for the buddy allocator only. It specifies that the + * allocator will be used for managing a GVA space. When managing GVA spaces + * special care has to be taken to ensure that allocations of similar PTE + * sizes are placed in the same PDE block. This allows the higher level + * code to skip defining both small and large PTE tables for every PDE. That + * can save considerable memory for address spaces that have a lot of + * allocations. + * + * GPU_ALLOC_NO_ALLOC_PAGE + * + * For any allocator that needs to manage a resource in a latency critical + * path this flag specifies that the allocator should not use any kmalloc() + * or similar functions during normal operation. Initialization routines + * may still use kmalloc(). This prevents the possibility of long waits for + * pages when using alloc_page(). Currently only the bitmap allocator + * implements this functionality. + * + * Also note that if you accept this flag then you must also define the + * free_fixed() function. Since no meta-data is allocated to help free + * allocations you need to keep track of the meta-data yourself (in this + * case the base and length of the allocation as opposed to just the base + * of the allocation). + * + * GPU_ALLOC_4K_VIDMEM_PAGES + * + * We manage vidmem pages at a large page granularity for performance + * reasons; however, this can lead to wasting memory. For page allocators + * setting this flag will tell the allocator to manage pools of 4K pages + * inside internally allocated large pages. + * + * Currently this flag is ignored since the only usage of the page allocator + * uses a 4K block size already. However, this flag has been reserved since + * it will be necessary in the future. + * + * GPU_ALLOC_FORCE_CONTIG + * + * Force allocations to be contiguous. Currently only relevant for page + * allocators since all other allocators are naturally contiguous. + * + * GPU_ALLOC_NO_SCATTER_GATHER + * + * The page allocator normally returns a scatter gather data structure for + * allocations (to handle discontiguous pages). However, at times that can + * be annoying so this flag forces the page allocator to return a u64 + * pointing to the allocation base (requires GPU_ALLOC_FORCE_CONTIG to be + * set as well). + */ +#define GPU_ALLOC_GVA_SPACE 0x1 +#define GPU_ALLOC_NO_ALLOC_PAGE 0x2 +#define GPU_ALLOC_4K_VIDMEM_PAGES 0x4 +#define GPU_ALLOC_FORCE_CONTIG 0x8 +#define GPU_ALLOC_NO_SCATTER_GATHER 0x10 + +static inline void alloc_lock(struct nvgpu_allocator *a) +{ + mutex_lock(&a->lock); +} + +static inline void alloc_unlock(struct nvgpu_allocator *a) +{ + mutex_unlock(&a->lock); +} + +/* + * Buddy allocator specific initializers. 
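The ops/priv pairing above is the whole backend contract: an implementation fills in an nvgpu_allocator_ops table, keeps its own state behind a->priv, and registers both through __nvgpu_alloc_common_init() (declared later in this header). Below is a compressed, hypothetical bump-allocator backend, purely to show the wiring — it is not one of the allocators added by this patch, and it assumes a non-zero base so that 0 can keep meaning "allocation failed".

#include <linux/errno.h>
#include <linux/slab.h>
#include <nvgpu/allocator.h>

/* Hypothetical backend state, kept behind nvgpu_allocator->priv. */
struct example_bump_allocator {
        u64 base;
        u64 end;
        u64 next;
};

static u64 example_bump_alloc(struct nvgpu_allocator *a, u64 len)
{
        struct example_bump_allocator *ba = a->priv;
        u64 addr = 0;

        alloc_lock(a);
        if (ba->next + len <= ba->end) {
                addr = ba->next;
                ba->next += len;
        }
        alloc_unlock(a);

        return addr;
}

static void example_bump_free(struct nvgpu_allocator *a, u64 addr)
{
        /* A bump allocator never reclaims; real backends obviously do. */
}

static void example_bump_fini(struct nvgpu_allocator *a)
{
        kfree(a->priv);
}

static const struct nvgpu_allocator_ops example_bump_ops = {
        .alloc = example_bump_alloc,
        .free  = example_bump_free,
        .fini  = example_bump_fini,
};

static int example_bump_allocator_init(struct nvgpu_allocator *a,
                                       const char *name, u64 base, u64 size)
{
        struct example_bump_allocator *ba = kzalloc(sizeof(*ba), GFP_KERNEL);

        if (!ba)
                return -ENOMEM;

        ba->base = base;
        ba->end  = base + size;
        ba->next = base;

        /* Hand the name, private state and ops table to the common helper. */
        return __nvgpu_alloc_common_init(a, name, ba, false, &example_bump_ops);
}

The bitmap, buddy, page and lockless allocators moved into common/mm/ by this patch all follow this same shape, differing only in what sits behind priv.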
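To make the carveout and flag text above concrete, here is a hedged sketch of bringing up a page allocator for a contiguous-only use case and carving a region out of it before general allocations start. All bases, sizes and the "example-vidmem" / "example-carveout" names are invented for illustration; whether a given backend actually implements reserve_carveout() is up to that backend — only the call shapes and the documented flag semantics are taken from this header.

#include <linux/errno.h>
#include <linux/sizes.h>
#include <nvgpu/allocator.h>

/* Invented carveout lying inside the managed range below. */
static struct nvgpu_alloc_carveout example_co =
        NVGPU_CARVEOUT("example-carveout", SZ_4M, SZ_1M);

static int example_vidmem_setup(struct gk20a *g, struct nvgpu_allocator *a)
{
        u64 addr;
        int err;

        /*
         * FORCE_CONTIG + NO_SCATTER_GATHER: nvgpu_alloc() hands back a plain
         * base address instead of a scatter-gather list of chunks.
         */
        err = nvgpu_page_allocator_init(g, a, "example-vidmem",
                                        SZ_4M, SZ_256M, SZ_4K,
                                        GPU_ALLOC_FORCE_CONTIG |
                                        GPU_ALLOC_NO_SCATTER_GATHER);
        if (err)
                return err;

        /* Reserve the carveout before anything else lands in that range. */
        err = nvgpu_alloc_reserve_carveout(a, &example_co);
        if (err)
                goto fail;

        addr = nvgpu_alloc(a, SZ_64K);
        if (!addr) {
                nvgpu_alloc_release_carveout(a, &example_co);
                err = -ENOMEM;
                goto fail;
        }

        nvgpu_free(a, addr);
        nvgpu_alloc_release_carveout(a, &example_co);
        return 0;

fail:
        nvgpu_alloc_destroy(a);
        return err;
}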
+ */ +int __nvgpu_buddy_allocator_init(struct gk20a *g, struct nvgpu_allocator *a, + struct vm_gk20a *vm, const char *name, + u64 base, u64 size, u64 blk_size, + u64 max_order, u64 flags); +int nvgpu_buddy_allocator_init(struct gk20a *g, struct nvgpu_allocator *a, + const char *name, u64 base, u64 size, + u64 blk_size, u64 flags); + +/* + * Bitmap initializers. + */ +int nvgpu_bitmap_allocator_init(struct gk20a *g, struct nvgpu_allocator *a, + const char *name, u64 base, u64 length, + u64 blk_size, u64 flags); + +/* + * Page allocator initializers. + */ +int nvgpu_page_allocator_init(struct gk20a *g, struct nvgpu_allocator *a, + const char *name, u64 base, u64 length, + u64 blk_size, u64 flags); + +/* + * Lockless allocatior initializers. + * Note: This allocator can only allocate fixed-size structures of a + * pre-defined size. + */ +int nvgpu_lockless_allocator_init(struct gk20a *g, struct nvgpu_allocator *a, + const char *name, u64 base, u64 length, + u64 struct_size, u64 flags); + +#define GPU_BALLOC_MAX_ORDER 31 + +/* + * Allocator APIs. + */ +u64 nvgpu_alloc(struct nvgpu_allocator *allocator, u64 len); +void nvgpu_free(struct nvgpu_allocator *allocator, u64 addr); + +u64 nvgpu_alloc_fixed(struct nvgpu_allocator *allocator, u64 base, u64 len); +void nvgpu_free_fixed(struct nvgpu_allocator *allocator, u64 base, u64 len); + +int nvgpu_alloc_reserve_carveout(struct nvgpu_allocator *a, + struct nvgpu_alloc_carveout *co); +void nvgpu_alloc_release_carveout(struct nvgpu_allocator *a, + struct nvgpu_alloc_carveout *co); + +u64 nvgpu_alloc_base(struct nvgpu_allocator *a); +u64 nvgpu_alloc_length(struct nvgpu_allocator *a); +u64 nvgpu_alloc_end(struct nvgpu_allocator *a); +u64 nvgpu_alloc_initialized(struct nvgpu_allocator *a); +u64 nvgpu_alloc_space(struct nvgpu_allocator *a); + +void nvgpu_alloc_destroy(struct nvgpu_allocator *allocator); + +void nvgpu_alloc_print_stats(struct nvgpu_allocator *a, + struct seq_file *s, int lock); + +/* + * Common functionality for the internals of the allocators. + */ +void nvgpu_init_alloc_debug(struct gk20a *g, struct nvgpu_allocator *a); +void nvgpu_fini_alloc_debug(struct nvgpu_allocator *a); + +int __nvgpu_alloc_common_init(struct nvgpu_allocator *a, + const char *name, void *priv, bool dbg, + const struct nvgpu_allocator_ops *ops); + +static inline void nvgpu_alloc_enable_dbg(struct nvgpu_allocator *a) +{ + a->debug = true; +} + +static inline void nvgpu_alloc_disable_dbg(struct nvgpu_allocator *a) +{ + a->debug = false; +} + +/* + * Debug stuff. + */ +extern u32 nvgpu_alloc_tracing_on; + +void nvgpu_alloc_debugfs_init(struct device *dev); + +#define nvgpu_alloc_trace_func() \ + do { \ + if (nvgpu_alloc_tracing_on) \ + trace_printk("%s\n", __func__); \ + } while (0) + +#define nvgpu_alloc_trace_func_done() \ + do { \ + if (nvgpu_alloc_tracing_on) \ + trace_printk("%s_done\n", __func__); \ + } while (0) + +#define __alloc_pstat(seq, allocator, fmt, arg...) \ + do { \ + if (s) \ + seq_printf(seq, fmt, ##arg); \ + else \ + alloc_dbg(allocator, fmt, ##arg); \ + } while (0) + +#define __alloc_dbg(a, fmt, arg...) \ + pr_info("%-25s %25s() " fmt, (a)->name, __func__, ##arg) + +#if defined(ALLOCATOR_DEBUG) +/* + * Always print the debug messages... + */ +#define alloc_dbg(a, fmt, arg...) __alloc_dbg(a, fmt, ##arg) +#else +/* + * Only print debug messages if debug is enabled for a given allocator. + */ +#define alloc_dbg(a, fmt, arg...) 
\ + do { \ + if ((a)->debug) \ + __alloc_dbg((a), fmt, ##arg); \ + } while (0) + +#endif + +#endif /* NVGPU_ALLOCATOR_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h b/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h new file mode 100644 index 00000000..7c21c117 --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/page_allocator.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef PAGE_ALLOCATOR_PRIV_H +#define PAGE_ALLOCATOR_PRIV_H + +#include +#include + +#include + +struct nvgpu_allocator; + +/* + * This allocator implements the ability to do SLAB style allocation since the + * GPU has two page sizes available - 4k and 64k/128k. When the default + * granularity is the large page size (64k/128k) small allocations become very + * space inefficient. This is most notable in PDE and PTE blocks which are 4k + * in size. + * + * Thus we need the ability to suballocate in 64k pages. The way we do this for + * the GPU is as follows. We have several buckets for sub-64K allocations: + * + * B0 - 4k + * B1 - 8k + * B3 - 16k + * B4 - 32k + * B5 - 64k (for when large pages are 128k) + * + * When an allocation comes in for less than the large page size (from now on + * assumed to be 64k) the allocation is satisfied by one of the buckets. + */ +struct page_alloc_slab { + struct list_head empty; + struct list_head partial; + struct list_head full; + + int nr_empty; + int nr_partial; + int nr_full; + + u32 slab_size; +}; + +enum slab_page_state { + SP_EMPTY, + SP_PARTIAL, + SP_FULL, + SP_NONE +}; + +struct page_alloc_slab_page { + unsigned long bitmap; + u64 page_addr; + u32 slab_size; + + u32 nr_objects; + u32 nr_objects_alloced; + + enum slab_page_state state; + + struct page_alloc_slab *owner; + struct list_head list_entry; +}; + +struct page_alloc_chunk { + struct list_head list_entry; + + u64 base; + u64 length; +}; + +/* + * Struct to handle internal management of page allocation. It holds a list + * of the chunks of pages that make up the overall allocation - much like a + * scatter gather table. + */ +struct nvgpu_page_alloc { + struct list_head alloc_chunks; + + int nr_chunks; + u64 length; + + /* + * Only useful for the RB tree - since the alloc may have discontiguous + * pages the base is essentially irrelevant except for the fact that it + * is guarenteed to be unique. + */ + u64 base; + + struct rb_node tree_entry; + + /* + * Set if this is a slab alloc. Points back to the slab page that owns + * this particular allocation. nr_chunks will always be 1 if this is + * set. + */ + struct page_alloc_slab_page *slab_page; +}; + +struct nvgpu_page_allocator { + struct nvgpu_allocator *owner; /* Owner of this allocator. */ + + /* + * Use a buddy allocator to manage the allocation of the underlying + * pages. This lets us abstract the discontiguous allocation handling + * out of the annoyingly complicated buddy allocator. 
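The bucket scheme described above boils down to "round the request up to the nearest power of two, starting at 4K, and stop one step short of the large page size". A sketch of that arithmetic follows — the idea only, not the code added in page_allocator.c, and example_slab_bucket() is an invented name.

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/sizes.h>

/*
 * Bucket selection as described in the comment above: a 64K large page
 * gives buckets 0..3 (4K, 8K, 16K, 32K); a 128K large page adds a 64K
 * bucket. Illustrative helper only.
 */
static u32 example_slab_bucket(u64 len, u64 large_page_size)
{
        u64 bucket_size = max_t(u64, SZ_4K, roundup_pow_of_two(len));

        /* Requests at or above the large page size are not slab requests. */
        WARN_ON(bucket_size >= large_page_size);

        return (u32)(ilog2(bucket_size) - ilog2(SZ_4K));
}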
+ */ + struct nvgpu_allocator source_allocator; + + /* + * Page params. + */ + u64 base; + u64 length; + u64 page_size; + u32 page_shift; + + struct rb_root allocs; /* Outstanding allocations. */ + + struct page_alloc_slab *slabs; + int nr_slabs; + + u64 flags; + + /* + * Stat tracking. + */ + u64 nr_allocs; + u64 nr_frees; + u64 nr_fixed_allocs; + u64 nr_fixed_frees; + u64 nr_slab_allocs; + u64 nr_slab_frees; + u64 pages_alloced; + u64 pages_freed; +}; + +static inline struct nvgpu_page_allocator *page_allocator( + struct nvgpu_allocator *a) +{ + return (struct nvgpu_page_allocator *)(a)->priv; +} + +static inline struct nvgpu_allocator *palloc_owner( + struct nvgpu_page_allocator *a) +{ + return a->owner; +} + +#endif diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c index 69f6fcaf..66c9344b 100644 --- a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c @@ -227,11 +227,11 @@ static void vgpu_vm_remove_support(struct vm_gk20a *vm) err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); WARN_ON(err || msg.ret); - gk20a_alloc_destroy(&vm->vma[gmmu_page_size_kernel]); - if (gk20a_alloc_initialized(&vm->vma[gmmu_page_size_small])) - gk20a_alloc_destroy(&vm->vma[gmmu_page_size_small]); - if (gk20a_alloc_initialized(&vm->vma[gmmu_page_size_big])) - gk20a_alloc_destroy(&vm->vma[gmmu_page_size_big]); + nvgpu_alloc_destroy(&vm->vma[gmmu_page_size_kernel]); + if (nvgpu_alloc_initialized(&vm->vma[gmmu_page_size_small])) + nvgpu_alloc_destroy(&vm->vma[gmmu_page_size_small]); + if (nvgpu_alloc_initialized(&vm->vma[gmmu_page_size_big])) + nvgpu_alloc_destroy(&vm->vma[gmmu_page_size_big]); mutex_unlock(&vm->update_gmmu_lock); @@ -370,7 +370,7 @@ static int vgpu_vm_alloc_share(struct gk20a_as_share *as_share, snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id, gmmu_page_sizes[gmmu_page_size_small] >> 10); - err = __gk20a_buddy_allocator_init( + err = __nvgpu_buddy_allocator_init( g, &vm->vma[gmmu_page_size_small], vm, name, @@ -386,7 +386,7 @@ static int vgpu_vm_alloc_share(struct gk20a_as_share *as_share, if (large_vma_start < large_vma_limit) { snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id, gmmu_page_sizes[gmmu_page_size_big] >> 10); - err = __gk20a_buddy_allocator_init( + err = __nvgpu_buddy_allocator_init( g, &vm->vma[gmmu_page_size_big], vm, name, @@ -404,7 +404,7 @@ static int vgpu_vm_alloc_share(struct gk20a_as_share *as_share, /* * kernel reserved VMA is at the end of the aperture */ - err = __gk20a_buddy_allocator_init( + err = __nvgpu_buddy_allocator_init( g, &vm->vma[gmmu_page_size_kernel], vm, name, @@ -428,10 +428,10 @@ static int vgpu_vm_alloc_share(struct gk20a_as_share *as_share, clean_up_big_allocator: if (large_vma_start < large_vma_limit) - gk20a_alloc_destroy(&vm->vma[gmmu_page_size_big]); + nvgpu_alloc_destroy(&vm->vma[gmmu_page_size_big]); clean_up_small_allocator: if (small_vma_start < small_vma_limit) - gk20a_alloc_destroy(&vm->vma[gmmu_page_size_small]); + nvgpu_alloc_destroy(&vm->vma[gmmu_page_size_small]); clean_up_share: msg.cmd = TEGRA_VGPU_CMD_AS_FREE_SHARE; msg.handle = vgpu_get_handle(g); -- cgit v1.2.2
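Finally, the VMA plumbing in mm_vgpu.c above reduces to one __nvgpu_buddy_allocator_init() call per page size plus the initialized/destroy guard on teardown. Below is a sketch of the small-page case; the 4K block size, the GPU_ALLOC_GVA_SPACE flag, the "example_as-4KB" name and the small_vma_start/small_vma_limit parameters are assumptions standing in for what vgpu_vm_alloc_share() derives from the address-space layout (in the real code the allocator passed in is &vm->vma[gmmu_page_size_small]).

#include <linux/sizes.h>
#include <nvgpu/allocator.h>

static int example_init_small_vma(struct gk20a *g, struct vm_gk20a *vm,
                                  struct nvgpu_allocator *vma,
                                  u64 small_vma_start, u64 small_vma_limit)
{
        /* GVA-space buddy allocator managing the small-page region. */
        return __nvgpu_buddy_allocator_init(g, vma, vm, "example_as-4KB",
                                            small_vma_start,
                                            small_vma_limit - small_vma_start,
                                            SZ_4K,
                                            GPU_BALLOC_MAX_ORDER,
                                            GPU_ALLOC_GVA_SPACE);
}

static void example_fini_vma(struct nvgpu_allocator *vma)
{
        /* Matches the vgpu_vm_remove_support() teardown pattern above. */
        if (nvgpu_alloc_initialized(vma))
                nvgpu_alloc_destroy(vma);
}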