diff options
author | Alex Waterman <alexw@nvidia.com> | 2017-06-09 14:42:50 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-07-06 17:44:16 -0400 |
commit | 583704620db88e391f6b14acc57af859a70127de (patch) | |
tree | 8fc3becf2850b724e87011b0e0250c52d0efb7ee /drivers/gpu/nvgpu/common/mm/pd_cache.c | |
parent | c1393d5b68e63c992f4c689cb788139fdf8c2f1a (diff) |
gpu: nvgpu: Implement PD packing
In some cases page directories require less than a full page of memory.
For example, on Pascal, the final PD level for large pages is only 256 bytes;
thus 16 PDs can fit in a single page. To allocate an entire page for each of
these 256 B PDs is extremely wasteful. This patch aims to alleviate the
wasted DMA memory from having small PDs in a full page by packing multiple
small PDs into a single page.
The packing is implemented as a slab allocator - each page is a slab and
from each page multiple PD instances can be allocated. Several modifications
to the nvgpu_gmmu_pd struct also needed to be made to support this. The
nvgpu_mem is now a pointer and there's an explicit offset into the nvgpu_mem
struct so that each nvgpu_gmmu_pd knows what portion of the memory it's
using.
The nvgpu_pde_phys_addr() function and the pd_write() functions also require
some changes since the PD no longer is always situated at the start of the
nvgpu_mem.
Initialization and cleanup of the page tables for each VM was slightly
modified to work through the new pd_cache implementation. Some PDs (i.e.
the PDB), despite not being a full page, still require a full page for
alignment purposes (HW requirements). Thus a direct allocation method for
PDs is still provided. This is also used when a PD that could in principle
be cached is greater than a page in size.
Lastly a new debug flag was added for the pd_cache code.
JIRA NVGPU-30
Change-Id: I64c8037fc356783c1ef203cc143c4d71bbd5d77c
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master/r/1506610
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
Diffstat (limited to 'drivers/gpu/nvgpu/common/mm/pd_cache.c')
-rw-r--r-- | drivers/gpu/nvgpu/common/mm/pd_cache.c | 426 |
1 files changed, 426 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c new file mode 100644 index 00000000..4f312eff --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c | |||
@@ -0,0 +1,426 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify it | ||
5 | * under the terms and conditions of the GNU General Public License, | ||
6 | * version 2, as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
11 | * more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
15 | */ | ||
16 | |||
17 | #include <nvgpu/log.h> | ||
18 | #include <nvgpu/dma.h> | ||
19 | #include <nvgpu/gmmu.h> | ||
20 | #include <nvgpu/nvgpu_mem.h> | ||
21 | #include <nvgpu/list.h> | ||
22 | #include <nvgpu/log2.h> | ||
23 | |||
24 | #include "gk20a/gk20a.h" | ||
25 | #include "gk20a/mm_gk20a.h" | ||
26 | |||
27 | #define pd_dbg(g, fmt, args...) nvgpu_log(g, gpu_dbg_pd_cache, fmt, ##args) | ||
28 | |||
29 | /** | ||
30 | * DOC: PD cache | ||
31 | * | ||
32 | * In the name of saving memory with the many sub-page sized PD levels in Pascal | ||
33 | * and beyond a way of packing PD tables together is necessary. This code here | ||
34 | * does just that. If a PD table only requires 1024 bytes, then it is possible | ||
35 | * to have 4 of these PDs in one page. This is even more pronounced for 256 byte | ||
36 | * PD tables. | ||
37 | * | ||
38 | * The pd cache is basically just a slab allocator. Each instance of the nvgpu | ||
39 | * driver makes one of these structs: | ||
40 | * | ||
41 | * struct nvgpu_pd_cache { | ||
42 | * struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT]; | ||
43 | * struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT]; | ||
44 | * | ||
45 | * struct nvgpu_rbtree_node *mem_tree; | ||
46 | * }; | ||
47 | * | ||
48 | * There are two sets of lists, the full and the partial. The full lists contain | ||
49 | * pages of memory for which all the memory in that page is in use. The partial | ||
50 | * lists contain partially full pages of memory which can be used for more PD | ||
51 | * allocations. There are a couple of assumptions here: | ||
52 | * | ||
53 | * 1. PDs greater than or equal to the page size bypass the pd cache. | ||
54 | * 2. PDs are always power of 2 and greater than %NVGPU_PD_CACHE_MIN bytes. | ||
55 | * | ||
56 | * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial | ||
57 | * lists. For a 4Kb page NVGPU_PD_CACHE_COUNT is 4. This is enough space for | ||
58 | * 256, 512, 1024, and 2048 byte PDs. | ||
59 | * | ||
60 | * __nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD | ||
61 | * size is page size or larger and choose the correct allocation scheme - either | ||
62 | * from the PD cache or directly. Similarly __nvgpu_pd_free() will free a PD | ||
63 | * allocated by __nvgpu_pd_alloc(). | ||
64 | * | ||
65 | * Since the top level PD (the PDB) is a page aligned pointer but less than a | ||
66 | * page size the direct functions must be used for allocating PDBs. Otherwise | ||
67 | * there would be alignment issues for the PDBs when they get packed. | ||
68 | */ | ||
69 | |||
70 | static u32 nvgpu_pd_cache_nr(u32 bytes) | ||
71 | { | ||
72 | return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1)); | ||
73 | } | ||
74 | |||
75 | static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry) | ||
76 | { | ||
77 | u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size); | ||
78 | |||
79 | return mask_offset - 1; | ||
80 | } | ||
81 | |||
82 | int nvgpu_pd_cache_init(struct gk20a *g) | ||
83 | { | ||
84 | struct nvgpu_pd_cache *cache; | ||
85 | int i; | ||
86 | |||
87 | /* | ||
88 | * This gets called from finalize_poweron() so we need to make sure we | ||
89 | * don't reinit the pd_cache over and over. | ||
90 | */ | ||
91 | if (g->mm.pd_cache) | ||
92 | return 0; | ||
93 | |||
94 | cache = nvgpu_kzalloc(g, sizeof(*cache)); | ||
95 | if (!cache) { | ||
96 | nvgpu_err(g, "Failed to alloc pd_cache!"); | ||
97 | return -ENOMEM; | ||
98 | } | ||
99 | |||
100 | for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { | ||
101 | nvgpu_init_list_node(&cache->full[i]); | ||
102 | nvgpu_init_list_node(&cache->partial[i]); | ||
103 | } | ||
104 | |||
105 | cache->mem_tree = NULL; | ||
106 | g->mm.pd_cache = cache; | ||
107 | nvgpu_mutex_init(&cache->lock); | ||
108 | |||
109 | pd_dbg(g, "PD cache initialized!"); | ||
110 | |||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | void nvgpu_pd_cache_fini(struct gk20a *g) | ||
115 | { | ||
116 | int i; | ||
117 | struct nvgpu_pd_cache *cache = g->mm.pd_cache; | ||
118 | |||
119 | if (!cache) | ||
120 | return; | ||
121 | |||
122 | for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { | ||
123 | WARN_ON(!nvgpu_list_empty(&cache->full[i])); | ||
124 | WARN_ON(!nvgpu_list_empty(&cache->partial[i])); | ||
125 | } | ||
126 | |||
127 | nvgpu_kfree(g, g->mm.pd_cache); | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * This is the simple pass-through for greater than page or page sized PDs. | ||
132 | * | ||
133 | * Note: this does not need the cache lock since it does not modify any of the | ||
134 | * PD cache data structures. | ||
135 | */ | ||
136 | int __nvgpu_pd_cache_alloc_direct(struct gk20a *g, | ||
137 | struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
138 | { | ||
139 | int err; | ||
140 | |||
141 | pd_dbg(g, "PD-Alloc [D] %u bytes", bytes); | ||
142 | |||
143 | pd->mem = nvgpu_kzalloc(g, sizeof(*pd->mem)); | ||
144 | if (!pd->mem) { | ||
145 | pd_dbg(g, "OOM allocating nvgpu_mem struct!"); | ||
146 | return -ENOMEM; | ||
147 | } | ||
148 | |||
149 | err = nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, | ||
150 | bytes, pd->mem); | ||
151 | if (err) { | ||
152 | pd_dbg(g, "OOM allocating page directory!"); | ||
153 | nvgpu_kfree(g, pd->mem); | ||
154 | return -ENOMEM; | ||
155 | } | ||
156 | |||
157 | pd->cached = false; | ||
158 | pd->mem_offs = 0; | ||
159 | |||
160 | return 0; | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * Make a new nvgpu_pd_cache_entry and allocate a PD from it. Update the passed | ||
165 | * pd to reflect this allocation. | ||
166 | */ | ||
167 | static int nvgpu_pd_cache_alloc_new(struct gk20a *g, | ||
168 | struct nvgpu_pd_cache *cache, | ||
169 | struct nvgpu_gmmu_pd *pd, | ||
170 | u32 bytes) | ||
171 | { | ||
172 | struct nvgpu_pd_mem_entry *pentry; | ||
173 | |||
174 | pd_dbg(g, "PD-Alloc [C] New: offs=0"); | ||
175 | |||
176 | pentry = nvgpu_kzalloc(g, sizeof(*pentry)); | ||
177 | if (!pentry) { | ||
178 | pd_dbg(g, "OOM allocating pentry!"); | ||
179 | return -ENOMEM; | ||
180 | } | ||
181 | |||
182 | if (nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, | ||
183 | PAGE_SIZE, &pentry->mem)) { | ||
184 | nvgpu_kfree(g, pentry); | ||
185 | pd_dbg(g, "Unable to DMA alloc!"); | ||
186 | return -ENOMEM; | ||
187 | } | ||
188 | |||
189 | pentry->pd_size = bytes; | ||
190 | nvgpu_list_add(&pentry->list_entry, | ||
191 | &cache->partial[nvgpu_pd_cache_nr(bytes)]); | ||
192 | |||
193 | /* | ||
194 | * This allocates the very first PD table in the set of tables in this | ||
195 | * nvgpu_pd_mem_entry. | ||
196 | */ | ||
197 | pentry->alloc_map = 1; | ||
198 | |||
199 | /* | ||
200 | * Now update the nvgpu_gmmu_pd to reflect this allocation. | ||
201 | */ | ||
202 | pd->mem = &pentry->mem; | ||
203 | pd->mem_offs = 0; | ||
204 | pd->cached = true; | ||
205 | |||
206 | pentry->tree_entry.key_start = (u64)(uintptr_t)&pentry->mem; | ||
207 | nvgpu_rbtree_insert(&pentry->tree_entry, &cache->mem_tree); | ||
208 | |||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g, | ||
213 | struct nvgpu_pd_cache *cache, | ||
214 | struct nvgpu_pd_mem_entry *pentry, | ||
215 | struct nvgpu_gmmu_pd *pd) | ||
216 | { | ||
217 | unsigned long bit_offs; | ||
218 | u32 mem_offs; | ||
219 | u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry); | ||
220 | |||
221 | /* | ||
222 | * Find and allocate an open PD. | ||
223 | */ | ||
224 | bit_offs = ffz(pentry->alloc_map); | ||
225 | mem_offs = bit_offs * pentry->pd_size; | ||
226 | |||
227 | /* Bit map full. Somethings wrong. */ | ||
228 | if (WARN_ON(bit_offs >= ffz(pentry_mask))) | ||
229 | return -ENOMEM; | ||
230 | |||
231 | pentry->alloc_map |= 1 << bit_offs; | ||
232 | |||
233 | pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs); | ||
234 | |||
235 | /* | ||
236 | * First update the pd. | ||
237 | */ | ||
238 | pd->mem = &pentry->mem; | ||
239 | pd->mem_offs = mem_offs; | ||
240 | pd->cached = true; | ||
241 | |||
242 | /* | ||
243 | * Now make sure the pentry is in the correct list (full vs partial). | ||
244 | */ | ||
245 | if ((pentry->alloc_map & pentry_mask) == pentry_mask) { | ||
246 | pd_dbg(g, "Adding pentry to full list!"); | ||
247 | nvgpu_list_del(&pentry->list_entry); | ||
248 | nvgpu_list_add(&pentry->list_entry, | ||
249 | &cache->full[nvgpu_pd_cache_nr(pentry->pd_size)]); | ||
250 | } | ||
251 | |||
252 | return 0; | ||
253 | } | ||
254 | |||
255 | /* | ||
256 | * Get a partially full nvgpu_pd_mem_entry. Returns NULL if there is no partial | ||
257 | * nvgpu_pd_mem_entry's. | ||
258 | */ | ||
259 | static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_get_partial( | ||
260 | struct nvgpu_pd_cache *cache, u32 bytes) | ||
261 | { | ||
262 | struct nvgpu_list_node *list = | ||
263 | &cache->partial[nvgpu_pd_cache_nr(bytes)]; | ||
264 | |||
265 | if (nvgpu_list_empty(list)) | ||
266 | return NULL; | ||
267 | |||
268 | return nvgpu_list_first_entry(list, | ||
269 | nvgpu_pd_mem_entry, | ||
270 | list_entry); | ||
271 | } | ||
272 | |||
273 | /* | ||
274 | * Allocate memory from an nvgpu_mem for the page directory. | ||
275 | */ | ||
276 | static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache, | ||
277 | struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
278 | { | ||
279 | struct nvgpu_pd_mem_entry *pentry; | ||
280 | int err; | ||
281 | |||
282 | pd_dbg(g, "PD-Alloc [C] %u bytes", bytes); | ||
283 | |||
284 | if (bytes & (bytes - 1) || | ||
285 | (bytes >= PAGE_SIZE || | ||
286 | bytes < NVGPU_PD_CACHE_MIN)) { | ||
287 | pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes); | ||
288 | return -EINVAL; | ||
289 | } | ||
290 | |||
291 | pentry = nvgpu_pd_cache_get_partial(cache, bytes); | ||
292 | if (!pentry) | ||
293 | err = nvgpu_pd_cache_alloc_new(g, cache, pd, bytes); | ||
294 | else | ||
295 | err = nvgpu_pd_cache_alloc_from_partial(g, cache, pentry, pd); | ||
296 | |||
297 | if (err) | ||
298 | pd_dbg(g, "PD-Alloc [C] Failed!"); | ||
299 | |||
300 | return err; | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * Allocate the DMA memory for a page directory. This handles the necessary PD | ||
305 | * cache logistics. Since on Parker and later GPUs some of the page directories | ||
306 | * are smaller than a page packing these PDs together saves a lot of memory. | ||
307 | */ | ||
308 | int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
309 | { | ||
310 | struct gk20a *g = gk20a_from_vm(vm); | ||
311 | int err; | ||
312 | |||
313 | /* | ||
314 | * Simple case: PD is bigger than a page so just do a regular DMA | ||
315 | * alloc. | ||
316 | */ | ||
317 | if (bytes >= PAGE_SIZE) { | ||
318 | err = __nvgpu_pd_cache_alloc_direct(g, pd, bytes); | ||
319 | if (err) | ||
320 | return err; | ||
321 | |||
322 | return 0; | ||
323 | } | ||
324 | |||
325 | if (WARN_ON(!g->mm.pd_cache)) | ||
326 | return -ENOMEM; | ||
327 | |||
328 | nvgpu_mutex_acquire(&g->mm.pd_cache->lock); | ||
329 | err = nvgpu_pd_cache_alloc(g, g->mm.pd_cache, pd, bytes); | ||
330 | nvgpu_mutex_release(&g->mm.pd_cache->lock); | ||
331 | |||
332 | return err; | ||
333 | } | ||
334 | |||
335 | void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd) | ||
336 | { | ||
337 | pd_dbg(g, "PD-Free [D] 0x%p", pd->mem); | ||
338 | |||
339 | if (!pd->mem) | ||
340 | return; | ||
341 | |||
342 | nvgpu_dma_free(g, pd->mem); | ||
343 | nvgpu_kfree(g, pd->mem); | ||
344 | pd->mem = NULL; | ||
345 | } | ||
346 | |||
347 | static void nvgpu_pd_cache_free_mem_entry(struct gk20a *g, | ||
348 | struct nvgpu_pd_cache *cache, | ||
349 | struct nvgpu_pd_mem_entry *pentry) | ||
350 | { | ||
351 | nvgpu_dma_free(g, &pentry->mem); | ||
352 | nvgpu_list_del(&pentry->list_entry); | ||
353 | nvgpu_rbtree_unlink(&pentry->tree_entry, &cache->mem_tree); | ||
354 | nvgpu_kfree(g, pentry); | ||
355 | } | ||
356 | |||
357 | static void nvgpu_pd_cache_do_free(struct gk20a *g, | ||
358 | struct nvgpu_pd_cache *cache, | ||
359 | struct nvgpu_pd_mem_entry *pentry, | ||
360 | struct nvgpu_gmmu_pd *pd) | ||
361 | { | ||
362 | u32 index = pd->mem_offs / pentry->pd_size; | ||
363 | u32 bit = 1 << index; | ||
364 | |||
365 | /* Mark entry as free. */ | ||
366 | pentry->alloc_map &= ~bit; | ||
367 | |||
368 | if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) { | ||
369 | /* | ||
370 | * Partially full still. If it was already on the partial list | ||
371 | * this just re-adds it. | ||
372 | */ | ||
373 | nvgpu_list_del(&pentry->list_entry); | ||
374 | nvgpu_list_add(&pentry->list_entry, | ||
375 | &cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]); | ||
376 | } else { | ||
377 | /* Empty now so free it. */ | ||
378 | nvgpu_pd_cache_free_mem_entry(g, cache, pentry); | ||
379 | } | ||
380 | } | ||
381 | |||
382 | static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up( | ||
383 | struct gk20a *g, | ||
384 | struct nvgpu_pd_cache *cache, | ||
385 | struct nvgpu_gmmu_pd *pd) | ||
386 | { | ||
387 | struct nvgpu_rbtree_node *node; | ||
388 | |||
389 | nvgpu_rbtree_search((u64)(uintptr_t)pd->mem, &node, | ||
390 | cache->mem_tree); | ||
391 | if (!node) | ||
392 | return NULL; | ||
393 | |||
394 | return nvgpu_pd_mem_entry_from_tree_entry(node); | ||
395 | } | ||
396 | |||
397 | static void nvgpu_pd_cache_free(struct gk20a *g, struct nvgpu_pd_cache *cache, | ||
398 | struct nvgpu_gmmu_pd *pd) | ||
399 | { | ||
400 | struct nvgpu_pd_mem_entry *pentry; | ||
401 | |||
402 | pd_dbg(g, "PD-Free [C] 0x%p", pd->mem); | ||
403 | |||
404 | pentry = nvgpu_pd_cache_look_up(g, cache, pd); | ||
405 | if (!pentry) { | ||
406 | WARN(1, "Attempting to free non-existent pd"); | ||
407 | return; | ||
408 | } | ||
409 | |||
410 | nvgpu_pd_cache_do_free(g, cache, pentry, pd); | ||
411 | } | ||
412 | |||
413 | void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd) | ||
414 | { | ||
415 | struct gk20a *g = gk20a_from_vm(vm); | ||
416 | |||
417 | /* | ||
418 | * Simple case: just DMA free. | ||
419 | */ | ||
420 | if (!pd->cached) | ||
421 | return __nvgpu_pd_cache_free_direct(g, pd); | ||
422 | |||
423 | nvgpu_mutex_acquire(&g->mm.pd_cache->lock); | ||
424 | nvgpu_pd_cache_free(g, g->mm.pd_cache, pd); | ||
425 | nvgpu_mutex_release(&g->mm.pd_cache->lock); | ||
426 | } | ||