From fc4f0ddddb50da826b527b7791028a67539f34c8 Mon Sep 17 00:00:00 2001
From: Alex Waterman
Date: Thu, 22 Sep 2016 10:13:31 -0700
Subject: gpu: nvgpu: SLAB allocation for page allocator

Add the ability to do "SLAB" allocation in the page allocator. This is
generally useful since the allocator manages 64k pages but often we only
need 4k chunks (for example, when allocating memory for page table
entries).

Bug 1799159
JIRA DNVGPU-100

Signed-off-by: Alex Waterman
Reviewed-on: http://git-master/r/1225322
(cherry picked from commit 299a5639243e44be504391d9155b4ae17d914aa2)
Change-Id: Ib3a8558d40ba16bd3a413f4fd38b146beaa3c66b
Reviewed-on: http://git-master/r/1227924
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c | 457 ++++++++++++++++++++++---
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c             |   7 +-
 drivers/gpu/nvgpu/gk20a/page_allocator_priv.h  |  69 +++-
 3 files changed, 486 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c
index 2e5d46b9..b8e38620 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator_page.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/bitops.h>
+#include <linux/mm.h>
 
 #include "gk20a_allocator.h"
 #include "buddy_allocator_priv.h"
@@ -27,8 +28,59 @@
 static struct kmem_cache *page_alloc_cache;
 static struct kmem_cache *page_alloc_chunk_cache;
+static struct kmem_cache *page_alloc_slab_page_cache;
 static DEFINE_MUTEX(meta_data_cache_lock);
 
+/*
+ * Handle the book-keeping for these operations.
+ */
+static inline void add_slab_page_to_empty(struct page_alloc_slab *slab,
+					  struct page_alloc_slab_page *page)
+{
+	BUG_ON(page->state != SP_NONE);
+	list_add(&page->list_entry, &slab->empty);
+	slab->nr_empty++;
+	page->state = SP_EMPTY;
+}
+static inline void add_slab_page_to_partial(struct page_alloc_slab *slab,
+					    struct page_alloc_slab_page *page)
+{
+	BUG_ON(page->state != SP_NONE);
+	list_add(&page->list_entry, &slab->partial);
+	slab->nr_partial++;
+	page->state = SP_PARTIAL;
+}
+static inline void add_slab_page_to_full(struct page_alloc_slab *slab,
+					 struct page_alloc_slab_page *page)
+{
+	BUG_ON(page->state != SP_NONE);
+	list_add(&page->list_entry, &slab->full);
+	slab->nr_full++;
+	page->state = SP_FULL;
+}
+
+static inline void del_slab_page_from_empty(struct page_alloc_slab *slab,
+					    struct page_alloc_slab_page *page)
+{
+	list_del_init(&page->list_entry);
+	slab->nr_empty--;
+	page->state = SP_NONE;
+}
+static inline void del_slab_page_from_partial(struct page_alloc_slab *slab,
+					      struct page_alloc_slab_page *page)
+{
+	list_del_init(&page->list_entry);
+	slab->nr_partial--;
+	page->state = SP_NONE;
+}
+static inline void del_slab_page_from_full(struct page_alloc_slab *slab,
+					   struct page_alloc_slab_page *page)
+{
+	list_del_init(&page->list_entry);
+	slab->nr_full--;
+	page->state = SP_NONE;
+}
+
 static u64 gk20a_page_alloc_length(struct gk20a_allocator *a)
 {
 	struct gk20a_page_allocator *va = a->priv;
@@ -80,6 +132,26 @@ static void gk20a_page_release_co(struct gk20a_allocator *a,
 	gk20a_alloc_release_carveout(&va->source_allocator, co);
 }
 
+static void __gk20a_free_pages(struct gk20a_page_allocator *a,
+			       struct gk20a_page_alloc *alloc,
+			       bool free_buddy_alloc)
+{
+	struct page_alloc_chunk *chunk;
+
+	while (!list_empty(&alloc->alloc_chunks)) {
+		chunk = list_first_entry(&alloc->alloc_chunks,
+					 struct page_alloc_chunk,
+					 list_entry);
+		list_del(&chunk->list_entry);
+
+		if (free_buddy_alloc)
+			gk20a_free(&a->source_allocator, chunk->base);
+		kfree(chunk);
+	}
+
+	kfree(alloc);
+}
+
 static int __insert_page_alloc(struct gk20a_page_allocator *a,
 			       struct gk20a_page_alloc *alloc)
 {
@@ -134,13 +206,236 @@ static struct gk20a_page_alloc *__find_page_alloc(
 	return alloc;
 }
 
+static struct page_alloc_slab_page *alloc_slab_page(
+		struct gk20a_page_allocator *a,
+		struct page_alloc_slab *slab)
+{
+	struct page_alloc_slab_page *slab_page;
+
+	slab_page = kmem_cache_alloc(page_alloc_slab_page_cache, GFP_KERNEL);
+	if (!slab_page) {
+		palloc_dbg(a, "OOM: unable to alloc slab_page struct!\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	memset(slab_page, 0, sizeof(*slab_page));
+
+	slab_page->page_addr = gk20a_alloc(&a->source_allocator, a->page_size);
+	if (!slab_page->page_addr) {
+		kmem_cache_free(page_alloc_slab_page_cache, slab_page);
+		palloc_dbg(a, "OOM: vidmem is full!\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	INIT_LIST_HEAD(&slab_page->list_entry);
+	slab_page->slab_size = slab->slab_size;
+	slab_page->nr_objects = a->page_size / slab->slab_size;
+	slab_page->nr_objects_alloced = 0;
+	slab_page->owner = slab;
+	slab_page->state = SP_NONE;
+
+	a->pages_alloced++;
+
+	palloc_dbg(a, "Allocated new slab page @ 0x%012llx size=%u\n",
+		   slab_page->page_addr, slab_page->slab_size);
+
+	return slab_page;
+}
+
+static void free_slab_page(struct gk20a_page_allocator *a,
+			   struct page_alloc_slab_page *slab_page)
+{
+	palloc_dbg(a, "Freeing slab page @ 0x%012llx\n", slab_page->page_addr);
+
+	BUG_ON((slab_page->state != SP_NONE && slab_page->state != SP_EMPTY) ||
+	       slab_page->nr_objects_alloced != 0 ||
+	       slab_page->bitmap != 0);
+
+	gk20a_free(&a->source_allocator, slab_page->page_addr);
+	a->pages_freed++;
+
+	kmem_cache_free(page_alloc_slab_page_cache, slab_page);
+}
+
+/*
+ * This expects @alloc to have 1 empty page_alloc_chunk already added to the
+ * alloc_chunks list.
+ */
+static int __do_slab_alloc(struct gk20a_page_allocator *a,
+			   struct page_alloc_slab *slab,
+			   struct gk20a_page_alloc *alloc)
+{
+	struct page_alloc_slab_page *slab_page = NULL;
+	struct page_alloc_chunk *chunk;
+	unsigned long offs;
+
+	/*
+	 * Check the partial and empty lists to see if we have some space
+	 * readily available. Take the slab_page off whatever list it was on,
+	 * since it may be put back on a different list later.
+	 */
+	if (!list_empty(&slab->partial)) {
+		slab_page = list_first_entry(&slab->partial,
+					     struct page_alloc_slab_page,
+					     list_entry);
+		del_slab_page_from_partial(slab, slab_page);
+	} else if (!list_empty(&slab->empty)) {
+		slab_page = list_first_entry(&slab->empty,
+					     struct page_alloc_slab_page,
+					     list_entry);
+		del_slab_page_from_empty(slab, slab_page);
+	}
+
+	if (!slab_page) {
+		slab_page = alloc_slab_page(a, slab);
+		if (IS_ERR(slab_page))
+			return PTR_ERR(slab_page);
+	}
+
+	/*
+	 * We now have a slab_page. Do the alloc.
+	 */
+	offs = bitmap_find_next_zero_area(&slab_page->bitmap,
+					  slab_page->nr_objects,
+					  0, 1, 0);
+	if (offs >= slab_page->nr_objects) {
+		WARN(1, "Empty/partial slab with no free objects?");
+
+		/* Add the buggy page to the full list... This isn't ideal. */
+		add_slab_page_to_full(slab, slab_page);
+		return -ENOMEM;
+	}
+
+	bitmap_set(&slab_page->bitmap, offs, 1);
+	slab_page->nr_objects_alloced++;
+
+	if (slab_page->nr_objects_alloced < slab_page->nr_objects)
+		add_slab_page_to_partial(slab, slab_page);
+	else if (slab_page->nr_objects_alloced == slab_page->nr_objects)
+		add_slab_page_to_full(slab, slab_page);
+	else
+		BUG(); /* Should be impossible to hit this. */
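+
+	/*
+	 * Either way the slab_page is now back on exactly one list, so the
+	 * state enum and the nr_empty/nr_partial/nr_full counts stay in sync.
+	 */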
+
+	/*
+	 * Handle building the gk20a_page_alloc struct. We expect one
+	 * page_alloc_chunk to be present.
+	 */
+	alloc->slab_page = slab_page;
+	alloc->nr_chunks = 1;
+	alloc->length = slab_page->slab_size;
+	alloc->base = slab_page->page_addr + (offs * slab_page->slab_size);
+
+	chunk = list_first_entry(&alloc->alloc_chunks,
+				 struct page_alloc_chunk, list_entry);
+	chunk->base = alloc->base;
+	chunk->length = alloc->length;
+
+	return 0;
+}
+
+/*
+ * Allocate from a slab instead of directly from the page allocator.
+ */
+static struct gk20a_page_alloc *__gk20a_alloc_slab(
+	struct gk20a_page_allocator *a, u64 len)
+{
+	int err, slab_nr;
+	struct page_alloc_slab *slab;
+	struct gk20a_page_alloc *alloc = NULL;
+	struct page_alloc_chunk *chunk = NULL;
+
+	/*
+	 * Align the length to a page and then divide by the page size (4k for
+	 * this code). order_base_2() of that then gets us the correct slab to
+	 * use - rounding up, so that odd sizes land in a bucket that is big
+	 * enough.
+	 */
+	slab_nr = (int)order_base_2(PAGE_ALIGN(len) >> 12);
+	slab = &a->slabs[slab_nr];
+
+	alloc = kmem_cache_alloc(page_alloc_cache, GFP_KERNEL);
+	if (!alloc) {
+		palloc_dbg(a, "OOM: could not alloc page_alloc struct!\n");
+		goto fail;
+	}
+	chunk = kmem_cache_alloc(page_alloc_chunk_cache, GFP_KERNEL);
+	if (!chunk) {
+		palloc_dbg(a, "OOM: could not alloc alloc_chunk struct!\n");
+		goto fail;
+	}
+
+	INIT_LIST_HEAD(&alloc->alloc_chunks);
+	list_add(&chunk->list_entry, &alloc->alloc_chunks);
+
+	err = __do_slab_alloc(a, slab, alloc);
+	if (err)
+		goto fail;
+
+	palloc_dbg(a, "Alloc 0x%04llx sr=%d id=0x%010llx [slab]\n",
+		   len, slab_nr, alloc->base);
+	a->nr_slab_allocs++;
+
+	return alloc;
+
+fail:
+	kfree(alloc);
+	kfree(chunk);
+	return NULL;
+}
+
+static void __gk20a_free_slab(struct gk20a_page_allocator *a,
+			      struct gk20a_page_alloc *alloc)
+{
+	struct page_alloc_slab_page *slab_page = alloc->slab_page;
+	struct page_alloc_slab *slab = slab_page->owner;
+	enum slab_page_state new_state;
+	int offs;
+
+	offs = (alloc->base - slab_page->page_addr) / slab_page->slab_size;
+	bitmap_clear(&slab_page->bitmap, offs, 1);
+
+	slab_page->nr_objects_alloced--;
+
+	if (slab_page->nr_objects_alloced == 0)
+		new_state = SP_EMPTY;
+	else
+		new_state = SP_PARTIAL;
+
+	/*
+	 * Need to migrate the page to a different list.
+	 */
+	if (new_state != slab_page->state) {
+		/* Delete - can't be in empty. */
+		if (slab_page->state == SP_PARTIAL)
+			del_slab_page_from_partial(slab, slab_page);
+		else
+			del_slab_page_from_full(slab, slab_page);
+
+		/* And add. */
+		if (new_state == SP_EMPTY) {
+			if (list_empty(&slab->empty))
+				add_slab_page_to_empty(slab, slab_page);
+			else
+				free_slab_page(a, slab_page);
+		} else {
+			add_slab_page_to_partial(slab, slab_page);
+		}
+	}
+
+	/*
+	 * Now handle the page_alloc.
+	 */
+	__gk20a_free_pages(a, alloc, false);
+	a->nr_slab_frees++;
+}
+
 /*
  * Allocate physical pages. Since the underlying allocator is a buddy
  * allocator, the returned pages are always contiguous. However, since there
  * could be fragmentation in the space, this allocator will collate smaller
  * non-contiguous allocations together if necessary.
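+ * The resulting gk20a_page_alloc keeps one page_alloc_chunk per contiguous
+ * run, so its chunk list behaves like a scatter-gather table.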
  */
-static struct gk20a_page_alloc *__gk20a_alloc_pages(
+static struct gk20a_page_alloc *__do_gk20a_alloc_pages(
 	struct gk20a_page_allocator *a, u64 pages)
 {
 	struct gk20a_page_alloc *alloc;
@@ -152,6 +447,8 @@ static struct gk20a_page_alloc *__gk20a_alloc_pages(
 	if (!alloc)
 		goto fail;
 
+	memset(alloc, 0, sizeof(*alloc));
+
 	INIT_LIST_HEAD(&alloc->alloc_chunks);
 	alloc->length = pages << a->page_shift;
 
@@ -233,6 +530,33 @@ fail:
 	return ERR_PTR(-ENOMEM);
 }
 
+static struct gk20a_page_alloc *__gk20a_alloc_pages(
+	struct gk20a_page_allocator *a, u64 len)
+{
+	struct gk20a_page_alloc *alloc = NULL;
+	struct page_alloc_chunk *c;
+	u64 pages;
+	int i = 0;
+
+	pages = ALIGN(len, a->page_size) >> a->page_shift;
+
+	alloc = __do_gk20a_alloc_pages(a, pages);
+	if (IS_ERR(alloc)) {
+		palloc_dbg(a, "Alloc 0x%llx (%llu) (failed)\n",
+			   pages << a->page_shift, pages);
+		return NULL;
+	}
+
+	palloc_dbg(a, "Alloc 0x%llx (%llu) id=0x%010llx\n",
+		   pages << a->page_shift, pages, alloc->base);
+	list_for_each_entry(c, &alloc->alloc_chunks, list_entry) {
+		palloc_dbg(a, "  Chunk %2d: 0x%010llx + 0x%llx\n",
+			   i++, c->base, c->length);
+	}
+
+	return alloc;
+}
+
 /*
  * Allocate enough pages to satisfy @len. Page size is determined at
  * initialization of the allocator.
@@ -247,10 +571,7 @@ static u64 gk20a_page_alloc(struct gk20a_allocator *__a, u64 len)
 {
 	struct gk20a_page_allocator *a = page_allocator(__a);
 	struct gk20a_page_alloc *alloc = NULL;
-	struct page_alloc_chunk *c;
 	u64 real_len;
-	u64 pages;
-	int i = 0;
 
 	/*
 	 * If we want contig pages we have to round up to a power of two. It's
 	 * easier to do that here than in the buddy allocator.
 	 */
 	real_len = a->flags & GPU_ALLOC_FORCE_CONTIG ?
 		roundup_pow_of_two(len) : len;
 
-	pages = ALIGN(real_len, a->page_size) >> a->page_shift;
-
 	alloc_lock(__a);
+	if (a->flags & GPU_ALLOC_4K_VIDMEM_PAGES &&
+	    real_len <= (a->page_size / 2))
+		alloc = __gk20a_alloc_slab(a, real_len);
+	else
+		alloc = __gk20a_alloc_pages(a, real_len);
 
-	alloc = __gk20a_alloc_pages(a, pages);
-	if (IS_ERR(alloc)) {
+	if (!alloc) {
 		alloc_unlock(__a);
-		palloc_dbg(a, "Alloc 0x%llx (%llu) (failed)\n",
-			   pages << a->page_shift, pages);
 		return 0;
 	}
 
 	__insert_page_alloc(a, alloc);
-	alloc_unlock(__a);
-
-	palloc_dbg(a, "Alloc 0x%llx (%llu) id=0x%010llx\n",
-		   pages << a->page_shift, pages, alloc->base);
-	list_for_each_entry(c, &alloc->alloc_chunks, list_entry) {
-		palloc_dbg(a, "  Chunk %2d: 0x%010llx + 0x%llx\n",
-			   i++, c->base, c->length);
-	}
 
 	a->nr_allocs++;
-	a->pages_alloced += pages;
+	if (!alloc->slab_page)
+		a->pages_alloced += alloc->length >> a->page_shift;
+	alloc_unlock(__a);
 
 	if (a->flags & GPU_ALLOC_NO_SCATTER_GATHER)
 		return alloc->base;
@@ -290,24 +605,6 @@ static u64 gk20a_page_alloc(struct gk20a_allocator *__a, u64 len)
 	return (u64) (uintptr_t) alloc;
 }
 
-static void __gk20a_free_pages(struct gk20a_page_allocator *a,
-			       struct gk20a_page_alloc *alloc)
-{
-	struct page_alloc_chunk *chunk;
-
-	while (!list_empty(&alloc->alloc_chunks)) {
-		chunk = list_first_entry(&alloc->alloc_chunks,
-					 struct page_alloc_chunk,
-					 list_entry);
-		list_del(&chunk->list_entry);
-
-		gk20a_free(&a->source_allocator, chunk->base);
-		kfree(chunk);
-	}
-
-	kfree(alloc);
-}
-
 /*
  * Note: this will remove the gk20a_page_alloc struct from the RB tree
  * if it's found.
@@ -331,14 +628,18 @@ static void gk20a_page_free(struct gk20a_allocator *__a, u64 base)
 	}
 
 	a->nr_frees++;
-	a->pages_freed += (alloc->length >> a->page_shift);
+
+	palloc_dbg(a, "Free 0x%llx id=0x%010llx\n",
+		   alloc->length, alloc->base);
 
 	/*
 	 * Frees *alloc.
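+	 * Nothing may touch *alloc (or its chunks) once these return, so any
+	 * logging of the alloc has to happen beforehand.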
 	 */
-	__gk20a_free_pages(a, alloc);
-
-	palloc_dbg(a, "Free 0x%010llx id=0x%010llx\n",
-		   alloc->length, alloc->base);
+	if (alloc->slab_page) {
+		__gk20a_free_slab(a, alloc);
+	} else {
+		a->pages_freed += (alloc->length >> a->page_shift);
+		__gk20a_free_pages(a, alloc, true);
+	}
 
 done:
@@ -439,7 +740,7 @@ static void gk20a_page_free_fixed(struct gk20a_allocator *__a,
 	 * allocs. This would have to be updated if the underlying
 	 * allocator were to change.
 	 */
-	__gk20a_free_pages(a, alloc);
-
-	palloc_dbg(a, "Free [fixed] 0x%010llx + 0x%llx\n",
-		   alloc->base, alloc->length);
+	palloc_dbg(a, "Free [fixed] 0x%010llx + 0x%llx\n",
+		   alloc->base, alloc->length);
+
+	__gk20a_free_pages(a, alloc, true);
@@ -464,6 +765,7 @@ static void gk20a_page_print_stats(struct gk20a_allocator *__a,
 			 struct seq_file *s, int lock)
 {
 	struct gk20a_page_allocator *a = page_allocator(__a);
+	int i;
 
 	if (lock)
 		alloc_lock(__a);
@@ -473,12 +775,33 @@ static void gk20a_page_print_stats(struct gk20a_allocator *__a,
 	__alloc_pstat(s, __a, "  frees          %lld\n", a->nr_frees);
 	__alloc_pstat(s, __a, "  fixed_allocs   %lld\n", a->nr_fixed_allocs);
 	__alloc_pstat(s, __a, "  fixed_frees    %lld\n", a->nr_fixed_frees);
+	__alloc_pstat(s, __a, "  slab_allocs    %lld\n", a->nr_slab_allocs);
+	__alloc_pstat(s, __a, "  slab_frees     %lld\n", a->nr_slab_frees);
 	__alloc_pstat(s, __a, "  pages alloced  %lld\n", a->pages_alloced);
 	__alloc_pstat(s, __a, "  pages freed    %lld\n", a->pages_freed);
 	__alloc_pstat(s, __a, "\n");
+
+	/*
+	 * Slab info.
+	 */
+	if (a->flags & GPU_ALLOC_4K_VIDMEM_PAGES) {
+		__alloc_pstat(s, __a, "Slabs:\n");
+		__alloc_pstat(s, __a, "  size      empty     partial   full\n");
+		__alloc_pstat(s, __a, "  ----      -----     -------   ----\n");
+
+		for (i = 0; i < a->nr_slabs; i++) {
+			struct page_alloc_slab *slab = &a->slabs[i];
+
+			__alloc_pstat(s, __a, "  %-9u %-9d %-9u %u\n",
+				      slab->slab_size,
+				      slab->nr_empty, slab->nr_partial,
+				      slab->nr_full);
+		}
+		__alloc_pstat(s, __a, "\n");
+	}
+
 	__alloc_pstat(s, __a, "Source alloc: %s\n",
 		      a->source_allocator.name);
-
 	gk20a_alloc_print_stats(&a->source_allocator, s, lock);
 
 	if (lock)
@@ -506,6 +829,43 @@ static const struct gk20a_allocator_ops page_ops = {
 	.print_stats	= gk20a_page_print_stats,
 };
 
+/*
+ * nr_slabs is computed as follows: divide page_size by 4096 to get the number
+ * of 4k pages in page_size. Then take the base-2 log of that to get the
+ * number of slabs. For a 64k page_size that works out like this:
+ *
+ *   1024*64 / 1024*4 = 16
+ *   ilog2(16) = 4
+ *
+ * That gives buckets of 1, 2, 4, and 8 pages (i.e. 4k, 8k, 16k, 32k).
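+ *
+ * For a 128k page_size the same math yields ilog2(32) = 5 slabs, which adds
+ * a 64k bucket on top of those four.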
+ */
+static int gk20a_page_alloc_init_slabs(struct gk20a_page_allocator *a)
+{
+	size_t nr_slabs = ilog2(a->page_size >> 12);
+	int i;
+
+	a->slabs = kcalloc(nr_slabs,
+			   sizeof(struct page_alloc_slab),
+			   GFP_KERNEL);
+	if (!a->slabs)
+		return -ENOMEM;
+	a->nr_slabs = nr_slabs;
+
+	for (i = 0; i < nr_slabs; i++) {
+		struct page_alloc_slab *slab = &a->slabs[i];
+
+		slab->slab_size = SZ_4K * (1 << i);
+		INIT_LIST_HEAD(&slab->empty);
+		INIT_LIST_HEAD(&slab->partial);
+		INIT_LIST_HEAD(&slab->full);
+		slab->nr_empty = 0;
+		slab->nr_partial = 0;
+		slab->nr_full = 0;
+	}
+
+	return 0;
+}
+
 int gk20a_page_allocator_init(struct gk20a_allocator *__a,
 			      const char *name, u64 base, u64 length,
 			      u64 blk_size, u64 flags)
@@ -519,11 +879,17 @@ int gk20a_page_allocator_init(struct gk20a_allocator *__a,
 		page_alloc_cache = KMEM_CACHE(gk20a_page_alloc, 0);
 	if (!page_alloc_chunk_cache)
 		page_alloc_chunk_cache = KMEM_CACHE(page_alloc_chunk, 0);
+	if (!page_alloc_slab_page_cache)
+		page_alloc_slab_page_cache =
+			KMEM_CACHE(page_alloc_slab_page, 0);
 	mutex_unlock(&meta_data_cache_lock);
 
-	if (!page_alloc_cache || !page_alloc_chunk_cache)
+	if (!page_alloc_cache || !page_alloc_chunk_cache ||
+	    !page_alloc_slab_page_cache)
 		return -ENOMEM;
 
+	if (blk_size < SZ_4K)
+		return -EINVAL;
+
 	a = kzalloc(sizeof(struct gk20a_page_allocator), GFP_KERNEL);
 	if (!a)
 		return -ENOMEM;
@@ -540,6 +906,12 @@ int gk20a_page_allocator_init(struct gk20a_allocator *__a,
 	a->owner = __a;
 	a->flags = flags;
 
+	if (flags & GPU_ALLOC_4K_VIDMEM_PAGES && blk_size > SZ_4K) {
+		err = gk20a_page_alloc_init_slabs(a);
+		if (err)
+			goto fail;
+	}
+
 	snprintf(buddy_name, sizeof(buddy_name), "%s-src", name);
 
 	err = gk20a_buddy_allocator_init(&a->source_allocator, buddy_name, base,
@@ -553,6 +925,7 @@ int gk20a_page_allocator_init(struct gk20a_allocator *__a,
 	palloc_dbg(a, "  size         0x%llx\n", a->length);
 	palloc_dbg(a, "  page_size    0x%llx\n", a->page_size);
 	palloc_dbg(a, "  flags        0x%llx\n", a->flags);
+	palloc_dbg(a, "  slabs:       %d\n", a->nr_slabs);
 
 	return 0;
 
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 1bacb70a..df58feb2 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -925,8 +925,11 @@ static int gk20a_init_vidmem(struct mm_gk20a *mm)
 			bootstrap_base, bootstrap_size,
 			SZ_4K, 0);
 
-	err = gk20a_page_allocator_init(&g->mm.vidmem.allocator, "vidmem",
-					base, size - base, default_page_size, 0);
+	err = gk20a_page_allocator_init(&g->mm.vidmem.allocator,
+					"vidmem",
+					base, size - base,
+					default_page_size,
+					GPU_ALLOC_4K_VIDMEM_PAGES);
 
 	if (err) {
 		gk20a_err(d, "Failed to register vidmem for size %zu: %d",
 			  size, err);
diff --git a/drivers/gpu/nvgpu/gk20a/page_allocator_priv.h b/drivers/gpu/nvgpu/gk20a/page_allocator_priv.h
index 3d4e3c43..7d7f43c2 100644
--- a/drivers/gpu/nvgpu/gk20a/page_allocator_priv.h
+++ b/drivers/gpu/nvgpu/gk20a/page_allocator_priv.h
@@ -19,12 +19,63 @@
 #include <linux/list.h>
 #include <linux/rbtree.h>
-#include
 
 #include "gk20a_allocator.h"
 
 struct gk20a_allocator;
 
+/*
+ * This allocator implements the ability to do SLAB-style allocation since the
+ * GPU has two page sizes available - 4k and 64k/128k. When the default
+ * granularity is the large page size (64k/128k) small allocations become very
+ * space inefficient. This is most notable in PDE and PTE blocks, which are
+ * 4k in size.
+ *
+ * Thus we need the ability to suballocate in 64k pages. To do this for the
+ * GPU we keep several buckets for sub-64k allocations:
+ *
+ *   B0 - 4k
+ *   B1 - 8k
+ *   B2 - 16k
+ *   B3 - 32k
+ *   B4 - 64k (for when large pages are 128k)
+ *
+ * When an allocation comes in for less than the large page size (from now on
+ * assumed to be 64k) the allocation is satisfied by one of the buckets.
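+ *
+ * For example, a 10k request is page-aligned to 12k and then rounded up to
+ * the next bucket, B2, so it comes back as a 16k object.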
+ */
+struct page_alloc_slab {
+	struct list_head empty;
+	struct list_head partial;
+	struct list_head full;
+
+	int nr_empty;
+	int nr_partial;
+	int nr_full;
+
+	u32 slab_size;
+};
+
+enum slab_page_state {
+	SP_EMPTY,
+	SP_PARTIAL,
+	SP_FULL,
+	SP_NONE
+};
+
+struct page_alloc_slab_page {
+	unsigned long bitmap;
+	u64 page_addr;
+	u32 slab_size;
+
+	u32 nr_objects;
+	u32 nr_objects_alloced;
+
+	enum slab_page_state state;
+
+	struct page_alloc_slab *owner;
+	struct list_head list_entry;
+};
+
 struct page_alloc_chunk {
 	struct list_head list_entry;
 
@@ -34,7 +85,7 @@ struct page_alloc_chunk {
 
 /*
  * Struct to handle internal management of page allocation. It holds a list
- * of the chunks of page that make up the overall allocation - much like a
+ * of the chunks of pages that make up the overall allocation - much like a
  * scatter gather table.
  */
 struct gk20a_page_alloc {
@@ -44,13 +95,20 @@ struct gk20a_page_alloc {
 	u64 length;
 
 	/*
-	 * Only useful for the RB tree - since the alloc will have discontiguous
+	 * Only useful for the RB tree - since the alloc may have discontiguous
 	 * pages the base is essentially irrelevant except for the fact that it
 	 * is guaranteed to be unique.
 	 */
 	u64 base;
 
 	struct rb_node tree_entry;
+
+	/*
+	 * Set if this is a slab alloc. Points back to the slab page that owns
+	 * this particular allocation. nr_chunks will always be 1 if this is
+	 * set.
+	 */
+	struct page_alloc_slab_page *slab_page;
 };
 
 struct gk20a_page_allocator {
@@ -73,6 +131,9 @@ struct gk20a_page_allocator {
 
 	struct rb_root allocs;		/* Outstanding allocations. */
 
+	struct page_alloc_slab *slabs;
+	int nr_slabs;
+
 	u64 flags;
 
 	/*
@@ -82,6 +143,8 @@ struct gk20a_page_allocator {
 	u64 nr_frees;
 	u64 nr_fixed_allocs;
 	u64 nr_fixed_frees;
+	u64 nr_slab_allocs;
+	u64 nr_slab_frees;
 	u64 pages_alloced;
 	u64 pages_freed;
 };
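
For readers who want to poke at the bucket math outside the kernel, below is a
minimal userspace sketch of the slab_nr selection and the per-page bitmap
book-keeping used above. It is not part of the patch: the names are
illustrative, and it assumes 64k GPU pages with 4k minimum slabs.

/* Build with: cc -o slab_sketch slab_sketch.c */
#include <assert.h>
#include <stdio.h>

#define GPU_PAGE_SIZE	(64 * 1024)
#define MIN_SLAB_SIZE	(4 * 1024)

/*
 * Round len up to whole 4k pages, then take ceil(log2) of the page count to
 * pick a bucket - the userspace analogue of
 * order_base_2(PAGE_ALIGN(len) >> 12).
 */
static int slab_nr_for(unsigned long len)
{
	unsigned long pages = (len + MIN_SLAB_SIZE - 1) / MIN_SLAB_SIZE;
	int nr = 0;

	while ((1UL << nr) < pages)
		nr++;
	return nr;
}

/* One 64k slab page: a bitmap of 4k objects, as in page_alloc_slab_page. */
struct sketch_slab_page {
	unsigned long bitmap;
	unsigned int nr_objects;
	unsigned int nr_alloced;
};

/* Find and claim the first clear bit; return its index or -1 if full. */
static int sketch_obj_alloc(struct sketch_slab_page *p)
{
	unsigned int i;

	for (i = 0; i < p->nr_objects; i++) {
		if (!(p->bitmap & (1UL << i))) {
			p->bitmap |= 1UL << i;
			p->nr_alloced++;
			return (int)i;
		}
	}
	return -1;
}

int main(void)
{
	struct sketch_slab_page page = {
		.bitmap = 0,
		.nr_objects = GPU_PAGE_SIZE / MIN_SLAB_SIZE,	/* 16 */
		.nr_alloced = 0,
	};
	int i;

	/* Bucket selection: 4k -> B0, 8k -> B1, 10k -> B2 (16k), 32k -> B3. */
	assert(slab_nr_for(4096) == 0);
	assert(slab_nr_for(8192) == 1);
	assert(slab_nr_for(10 * 1024) == 2);
	assert(slab_nr_for(32 * 1024) == 3);

	/* Sixteen 4k objects fit in one 64k slab page; the 17th must fail. */
	for (i = 0; i < 16; i++)
		assert(sketch_obj_alloc(&page) == i);
	assert(sketch_obj_alloc(&page) == -1);

	printf("slab sketch OK\n");
	return 0;
}

The real allocator layers the empty/partial/full lists on top of this bitmap
so a page with free objects is found in O(1) rather than by scanning pages.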