summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorAlex Waterman <alexw@nvidia.com>2017-06-09 14:42:50 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2017-07-06 17:44:16 -0400
commit583704620db88e391f6b14acc57af859a70127de (patch)
tree8fc3becf2850b724e87011b0e0250c52d0efb7ee /drivers
parentc1393d5b68e63c992f4c689cb788139fdf8c2f1a (diff)
gpu: nvgpu: Implement PD packing
In some cases page directories require less than a full page of memory. For example, on Pascal, the final PD level for large pages is only 256 bytes; thus 16 PDs can fit in a single page. To allocate an entire page for each of these 256 B PDs is extremely wasteful. This patch aims to alleviate the wasted DMA memory from having small PDs in a full page by packing multiple small PDs into a single page. The packing is implemented as a slab allocator - each page is a slab and from each page multiple PD instances can be allocated. Several modifications to the nvgpu_gmmu_pd struct also needed to be made to support this. The nvgpu_mem is now a pointer and there's an explicit offset into the nvgpu_mem struct so that each nvgpu_gmmu_pd knows what portion of the memory it's using. The nvgpu_pde_phys_addr() function and the pd_write() functions also require some changes since the PD no longer is always situated at the start of the nvgpu_mem. Initialization and cleanup of the page tables for each VM was slightly modified to work through the new pd_cache implementation. Some PDs (i.e the PDB), despite not being a full page, still require a full page for alignment purposes (HW requirements). Thus a direct allocation method for PDs is still provided. This is also used when a PD that could in principle be cached is greater than a page in size. Lastly a new debug flag was added for the pd_cache code. JIRA NVGPU-30 Change-Id: I64c8037fc356783c1ef203cc143c4d71bbd5d77c Signed-off-by: Alex Waterman <alexw@nvidia.com> Reviewed-on: https://git-master/r/1506610 Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com> GVS: Gerrit_Virtual_Submit
Diffstat (limited to 'drivers')
-rw-r--r--drivers/gpu/nvgpu/Makefile.nvgpu1
-rw-r--r--drivers/gpu/nvgpu/common/mm/gmmu.c81
-rw-r--r--drivers/gpu/nvgpu/common/mm/pd_cache.c426
-rw-r--r--drivers/gpu/nvgpu/common/mm/vm.c50
-rw-r--r--drivers/gpu/nvgpu/gk20a/gk20a.c9
-rw-r--r--drivers/gpu/nvgpu/gk20a/mm_gk20a.c9
-rw-r--r--drivers/gpu/nvgpu/gk20a/mm_gk20a.h4
-rw-r--r--drivers/gpu/nvgpu/gp10b/mm_gp10b.c10
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/gmmu.h91
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/log.h1
10 files changed, 609 insertions, 73 deletions
diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu
index 3a256771..4aaf7bc5 100644
--- a/drivers/gpu/nvgpu/Makefile.nvgpu
+++ b/drivers/gpu/nvgpu/Makefile.nvgpu
@@ -50,6 +50,7 @@ nvgpu-y := \
50 common/mm/page_allocator.o \ 50 common/mm/page_allocator.o \
51 common/mm/lockless_allocator.o \ 51 common/mm/lockless_allocator.o \
52 common/mm/gmmu.o \ 52 common/mm/gmmu.o \
53 common/mm/pd_cache.o \
53 common/mm/vm.o \ 54 common/mm/vm.o \
54 common/mm/vm_area.o \ 55 common/mm/vm_area.o \
55 common/bus.o \ 56 common/bus.o \
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c
index ec1bc095..602dfb3b 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu.c
@@ -45,7 +45,8 @@ static int pd_allocate(struct vm_gk20a *vm,
45 struct nvgpu_gmmu_pd *pd, 45 struct nvgpu_gmmu_pd *pd,
46 const struct gk20a_mmu_level *l, 46 const struct gk20a_mmu_level *l,
47 struct nvgpu_gmmu_attrs *attrs); 47 struct nvgpu_gmmu_attrs *attrs);
48 48static u32 pd_size(const struct gk20a_mmu_level *l,
49 struct nvgpu_gmmu_attrs *attrs);
49/* 50/*
50 * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU 51 * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU
51 * VA will be allocated for you. If addr is non-zero then the buffer will be 52 * VA will be allocated for you. If addr is non-zero then the buffer will be
@@ -138,6 +139,9 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va)
138 139
139int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) 140int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
140{ 141{
142 u32 pdb_size;
143 int err;
144
141 /* 145 /*
142 * Need this just for page size. Everything else can be ignored. Also 146 * Need this just for page size. Everything else can be ignored. Also
143 * note that we can just use pgsz 0 (i.e small pages) since the number 147 * note that we can just use pgsz 0 (i.e small pages) since the number
@@ -148,56 +152,43 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
148 .pgsz = 0, 152 .pgsz = 0,
149 }; 153 };
150 154
151 return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs); 155 /*
152} 156 * PDB size here must be one page so that its address is page size
157 * aligned. Although lower PDE tables can be aligned at 256B boundaries
158 * the main PDB must be page aligned.
159 */
160 pdb_size = ALIGN(pd_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE);
161
162 err = __nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size);
163 if (WARN_ON(err))
164 return err;
153 165
166 /*
167 * One mb() is done after all mapping operations. Don't need individual
168 * barriers for each PD write.
169 */
170 vm->pdb.mem->skip_wmb = true;
171
172 return 0;
173}
154 174
155/* 175/*
156 * Ensure that there's a CPU mapping for the page directory memory. This won't 176 * Ensure that there's a CPU mapping for the page directory memory. This won't
157 * always be the case for 32 bit systems since we may need to save kernel 177 * always be the case for 32 bit systems since we may need to save kernel
158 * virtual memory. 178 * virtual memory.
159 */ 179 */
160static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) 180static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
161{ 181{
162 return nvgpu_mem_begin(g, &entry->mem); 182 return nvgpu_mem_begin(g, pd->mem);
163} 183}
164 184
165/* 185/*
166 * Handle any necessary CPU unmap semantics for a page directories DMA memory. 186 * Handle any necessary CPU unmap semantics for a page directories DMA memory.
167 * For 64 bit platforms this is a noop. 187 * For 64 bit platforms this is a noop.
168 */ 188 */
169static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) 189static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
170{
171 nvgpu_mem_end(g, &entry->mem);
172}
173
174static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes,
175 struct nvgpu_gmmu_pd *pd)
176{
177 struct gk20a *g = gk20a_from_vm(vm);
178 unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS;
179 int err;
180
181 /*
182 * On arm32 vmalloc space is a precious commodity so we do not map pages
183 * by default.
184 */
185 if (!IS_ENABLED(CONFIG_ARM64))
186 flags |= NVGPU_DMA_NO_KERNEL_MAPPING;
187
188 err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem);
189 if (err)
190 return -ENOMEM;
191
192 return 0;
193}
194
195void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
196 struct nvgpu_gmmu_pd *pd)
197{ 190{
198 struct gk20a *g = gk20a_from_vm(vm); 191 nvgpu_mem_end(g, pd->mem);
199
200 nvgpu_dma_free(g, &pd->mem);
201} 192}
202 193
203/* 194/*
@@ -205,10 +196,14 @@ void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
205 */ 196 */
206u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd) 197u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
207{ 198{
199 u64 page_addr;
200
208 if (g->mm.has_physical_mode) 201 if (g->mm.has_physical_mode)
209 return sg_phys(pd->mem.priv.sgt->sgl); 202 page_addr = sg_phys(pd->mem->priv.sgt->sgl);
210 else 203 else
211 return nvgpu_mem_get_base_addr(g, &pd->mem, 0); 204 page_addr = nvgpu_mem_get_base_addr(g, pd->mem, 0);
205
206 return page_addr + pd->mem_offs;
212} 207}
213 208
214/* 209/*
@@ -254,10 +249,10 @@ static int pd_allocate(struct vm_gk20a *vm,
254{ 249{
255 int err; 250 int err;
256 251
257 if (pd->mem.size) 252 if (pd->mem)
258 return 0; 253 return 0;
259 254
260 err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd); 255 err = __nvgpu_pd_alloc(vm, pd, pd_size(l, attrs));
261 if (err) { 256 if (err) {
262 nvgpu_info(vm->mm->g, "error allocating page directory!"); 257 nvgpu_info(vm->mm->g, "error allocating page directory!");
263 return err; 258 return err;
@@ -267,7 +262,7 @@ static int pd_allocate(struct vm_gk20a *vm,
267 * One mb() is done after all mapping operations. Don't need individual 262 * One mb() is done after all mapping operations. Don't need individual
268 * barriers for each PD write. 263 * barriers for each PD write.
269 */ 264 */
270 pd->mem.skip_wmb = true; 265 pd->mem->skip_wmb = true;
271 266
272 return 0; 267 return 0;
273} 268}
@@ -778,7 +773,7 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
778 } 773 }
779 774
780 if (!batch) 775 if (!batch)
781 g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); 776 g->ops.fb.tlb_invalidate(g, vm->pdb.mem);
782 else 777 else
783 batch->need_tlb_invalidate = true; 778 batch->need_tlb_invalidate = true;
784 779
@@ -830,7 +825,7 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
830 825
831 if (!batch) { 826 if (!batch) {
832 gk20a_mm_l2_flush(g, true); 827 gk20a_mm_l2_flush(g, true);
833 g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); 828 g->ops.fb.tlb_invalidate(g, vm->pdb.mem);
834 } else { 829 } else {
835 if (!batch->gpu_l2_flushed) { 830 if (!batch->gpu_l2_flushed) {
836 gk20a_mm_l2_flush(g, true); 831 gk20a_mm_l2_flush(g, true);
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c
new file mode 100644
index 00000000..4f312eff
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c
@@ -0,0 +1,426 @@
1/*
2 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16
17#include <nvgpu/log.h>
18#include <nvgpu/dma.h>
19#include <nvgpu/gmmu.h>
20#include <nvgpu/nvgpu_mem.h>
21#include <nvgpu/list.h>
22#include <nvgpu/log2.h>
23
24#include "gk20a/gk20a.h"
25#include "gk20a/mm_gk20a.h"
26
27#define pd_dbg(g, fmt, args...) nvgpu_log(g, gpu_dbg_pd_cache, fmt, ##args)
28
29/**
30 * DOC: PD cache
31 *
32 * In the name of saving memory with the many sub-page sized PD levels in Pascal
33 * and beyond a way of packing PD tables together is necessary. This code here
34 * does just that. If a PD table only requires 1024 bytes, then it is possible
35 * to have 4 of these PDs in one page. This is even more pronounced for 256 byte
36 * PD tables.
37 *
 38 * The pd cache is basically just a slab allocator. Each instance of the nvgpu
39 * driver makes one of these structs:
40 *
41 * struct nvgpu_pd_cache {
42 * struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
43 * struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
44 *
45 * struct nvgpu_rbtree_node *mem_tree;
46 * };
47 *
48 * There are two sets of lists, the full and the partial. The full lists contain
49 * pages of memory for which all the memory in that page is in use. The partial
50 * lists contain partially full pages of memory which can be used for more PD
 51 * allocations. There are a couple of assumptions here:
52 *
53 * 1. PDs greater than or equal to the page size bypass the pd cache.
54 * 2. PDs are always power of 2 and greater than %NVGPU_PD_CACHE_MIN bytes.
55 *
56 * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial
57 * lists. For a 4Kb page NVGPU_PD_CACHE_COUNT is 4. This is enough space for
58 * 256, 512, 1024, and 2048 byte PDs.
59 *
60 * __nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD
61 * size is page size or larger and choose the correct allocation scheme - either
62 * from the PD cache or directly. Similarly __nvgpu_pd_free() will free a PD
63 * allocated by __nvgpu_pd_alloc().
64 *
65 * Since the top level PD (the PDB) is a page aligned pointer but less than a
66 * page size the direct functions must be used for allocating PDBs. Otherwise
67 * there would be alignment issues for the PDBs when they get packed.
68 */
69
/*
 * Convert a PD size in bytes to an index into the full/partial list arrays.
 * Assumes @bytes is a power of two in [NVGPU_PD_CACHE_MIN, PAGE_SIZE) --
 * the callers validate this. For the smallest size to map to index 0,
 * NVGPU_PD_CACHE_MIN_SHIFT must be log2(NVGPU_PD_CACHE_MIN) + 1; the define
 * lives in the gmmu header, not visible here -- TODO confirm.
 */
static u32 nvgpu_pd_cache_nr(u32 bytes)
{
	return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1));
}
74
75static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry)
76{
77 u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size);
78
79 return mask_offset - 1;
80}
81
82int nvgpu_pd_cache_init(struct gk20a *g)
83{
84 struct nvgpu_pd_cache *cache;
85 int i;
86
87 /*
88 * This gets called from finalize_poweron() so we need to make sure we
89 * don't reinit the pd_cache over and over.
90 */
91 if (g->mm.pd_cache)
92 return 0;
93
94 cache = nvgpu_kzalloc(g, sizeof(*cache));
95 if (!cache) {
96 nvgpu_err(g, "Failed to alloc pd_cache!");
97 return -ENOMEM;
98 }
99
100 for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) {
101 nvgpu_init_list_node(&cache->full[i]);
102 nvgpu_init_list_node(&cache->partial[i]);
103 }
104
105 cache->mem_tree = NULL;
106 g->mm.pd_cache = cache;
107 nvgpu_mutex_init(&cache->lock);
108
109 pd_dbg(g, "PD cache initialized!");
110
111 return 0;
112}
113
114void nvgpu_pd_cache_fini(struct gk20a *g)
115{
116 int i;
117 struct nvgpu_pd_cache *cache = g->mm.pd_cache;
118
119 if (!cache)
120 return;
121
122 for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) {
123 WARN_ON(!nvgpu_list_empty(&cache->full[i]));
124 WARN_ON(!nvgpu_list_empty(&cache->partial[i]));
125 }
126
127 nvgpu_kfree(g, g->mm.pd_cache);
128}
129
130/*
131 * This is the simple pass-through for greater than page or page sized PDs.
132 *
133 * Note: this does not need the cache lock since it does not modify any of the
134 * PD cache data structures.
135 */
136int __nvgpu_pd_cache_alloc_direct(struct gk20a *g,
137 struct nvgpu_gmmu_pd *pd, u32 bytes)
138{
139 int err;
140
141 pd_dbg(g, "PD-Alloc [D] %u bytes", bytes);
142
143 pd->mem = nvgpu_kzalloc(g, sizeof(*pd->mem));
144 if (!pd->mem) {
145 pd_dbg(g, "OOM allocating nvgpu_mem struct!");
146 return -ENOMEM;
147 }
148
149 err = nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS,
150 bytes, pd->mem);
151 if (err) {
152 pd_dbg(g, "OOM allocating page directory!");
153 nvgpu_kfree(g, pd->mem);
154 return -ENOMEM;
155 }
156
157 pd->cached = false;
158 pd->mem_offs = 0;
159
160 return 0;
161}
162
163/*
164 * Make a new nvgpu_pd_cache_entry and allocate a PD from it. Update the passed
165 * pd to reflect this allocation.
166 */
167static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
168 struct nvgpu_pd_cache *cache,
169 struct nvgpu_gmmu_pd *pd,
170 u32 bytes)
171{
172 struct nvgpu_pd_mem_entry *pentry;
173
174 pd_dbg(g, "PD-Alloc [C] New: offs=0");
175
176 pentry = nvgpu_kzalloc(g, sizeof(*pentry));
177 if (!pentry) {
178 pd_dbg(g, "OOM allocating pentry!");
179 return -ENOMEM;
180 }
181
182 if (nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS,
183 PAGE_SIZE, &pentry->mem)) {
184 nvgpu_kfree(g, pentry);
185 pd_dbg(g, "Unable to DMA alloc!");
186 return -ENOMEM;
187 }
188
189 pentry->pd_size = bytes;
190 nvgpu_list_add(&pentry->list_entry,
191 &cache->partial[nvgpu_pd_cache_nr(bytes)]);
192
193 /*
194 * This allocates the very first PD table in the set of tables in this
195 * nvgpu_pd_mem_entry.
196 */
197 pentry->alloc_map = 1;
198
199 /*
200 * Now update the nvgpu_gmmu_pd to reflect this allocation.
201 */
202 pd->mem = &pentry->mem;
203 pd->mem_offs = 0;
204 pd->cached = true;
205
206 pentry->tree_entry.key_start = (u64)(uintptr_t)&pentry->mem;
207 nvgpu_rbtree_insert(&pentry->tree_entry, &cache->mem_tree);
208
209 return 0;
210}
211
/*
 * Hand out one free PD slot from a partially full page. Updates @pd to point
 * into @pentry's page and moves the pentry to the full list if this was the
 * last free slot.
 */
static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
					     struct nvgpu_pd_cache *cache,
					     struct nvgpu_pd_mem_entry *pentry,
					     struct nvgpu_gmmu_pd *pd)
{
	unsigned long bit_offs;
	u32 mem_offs;
	u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry);

	/*
	 * Find and allocate an open PD. Each set bit in alloc_map is an
	 * in-use slot; ffz finds the first zero (free) bit.
	 */
	bit_offs = ffz(pentry->alloc_map);
	mem_offs = bit_offs * pentry->pd_size;

	/*
	 * Bit map full. Something's wrong: ffz(pentry_mask) is the number of
	 * valid slot bits, so bit_offs must land below it.
	 */
	if (WARN_ON(bit_offs >= ffz(pentry_mask)))
		return -ENOMEM;

	pentry->alloc_map |= 1 << bit_offs;

	pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs);

	/*
	 * First update the pd.
	 */
	pd->mem = &pentry->mem;
	pd->mem_offs = mem_offs;
	pd->cached = true;

	/*
	 * Now make sure the pentry is in the correct list (full vs partial).
	 */
	if ((pentry->alloc_map & pentry_mask) == pentry_mask) {
		pd_dbg(g, "Adding pentry to full list!");
		nvgpu_list_del(&pentry->list_entry);
		nvgpu_list_add(&pentry->list_entry,
			       &cache->full[nvgpu_pd_cache_nr(pentry->pd_size)]);
	}

	return 0;
}
254
255/*
256 * Get a partially full nvgpu_pd_mem_entry. Returns NULL if there is no partial
257 * nvgpu_pd_mem_entry's.
258 */
259static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_get_partial(
260 struct nvgpu_pd_cache *cache, u32 bytes)
261{
262 struct nvgpu_list_node *list =
263 &cache->partial[nvgpu_pd_cache_nr(bytes)];
264
265 if (nvgpu_list_empty(list))
266 return NULL;
267
268 return nvgpu_list_first_entry(list,
269 nvgpu_pd_mem_entry,
270 list_entry);
271}
272
273/*
274 * Allocate memory from an nvgpu_mem for the page directory.
275 */
276static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
277 struct nvgpu_gmmu_pd *pd, u32 bytes)
278{
279 struct nvgpu_pd_mem_entry *pentry;
280 int err;
281
282 pd_dbg(g, "PD-Alloc [C] %u bytes", bytes);
283
284 if (bytes & (bytes - 1) ||
285 (bytes >= PAGE_SIZE ||
286 bytes < NVGPU_PD_CACHE_MIN)) {
287 pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes);
288 return -EINVAL;
289 }
290
291 pentry = nvgpu_pd_cache_get_partial(cache, bytes);
292 if (!pentry)
293 err = nvgpu_pd_cache_alloc_new(g, cache, pd, bytes);
294 else
295 err = nvgpu_pd_cache_alloc_from_partial(g, cache, pentry, pd);
296
297 if (err)
298 pd_dbg(g, "PD-Alloc [C] Failed!");
299
300 return err;
301}
302
303/*
304 * Allocate the DMA memory for a page directory. This handles the necessary PD
305 * cache logistics. Since on Parker and later GPUs some of the page directories
306 * are smaller than a page packing these PDs together saves a lot of memory.
307 */
308int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
309{
310 struct gk20a *g = gk20a_from_vm(vm);
311 int err;
312
313 /*
314 * Simple case: PD is bigger than a page so just do a regular DMA
315 * alloc.
316 */
317 if (bytes >= PAGE_SIZE) {
318 err = __nvgpu_pd_cache_alloc_direct(g, pd, bytes);
319 if (err)
320 return err;
321
322 return 0;
323 }
324
325 if (WARN_ON(!g->mm.pd_cache))
326 return -ENOMEM;
327
328 nvgpu_mutex_acquire(&g->mm.pd_cache->lock);
329 err = nvgpu_pd_cache_alloc(g, g->mm.pd_cache, pd, bytes);
330 nvgpu_mutex_release(&g->mm.pd_cache->lock);
331
332 return err;
333}
334
/*
 * Free a directly allocated (uncached) PD: both the DMA memory and the
 * nvgpu_mem struct that __nvgpu_pd_cache_alloc_direct() kzalloc'd for it.
 */
void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
{
	pd_dbg(g, "PD-Free [D] 0x%p", pd->mem);

	/* A PD that never got memory (or was already freed) is a no-op. */
	if (!pd->mem)
		return;

	nvgpu_dma_free(g, pd->mem);
	nvgpu_kfree(g, pd->mem);
	/* Guard against double free and stale-pointer reuse. */
	pd->mem = NULL;
}
346
/*
 * Tear down a now-empty nvgpu_pd_mem_entry: release its DMA page, drop it
 * from whichever full/partial list holds it and from the lookup tree, then
 * free the bookkeeping struct itself. Caller holds the cache lock.
 */
static void nvgpu_pd_cache_free_mem_entry(struct gk20a *g,
					  struct nvgpu_pd_cache *cache,
					  struct nvgpu_pd_mem_entry *pentry)
{
	nvgpu_dma_free(g, &pentry->mem);
	nvgpu_list_del(&pentry->list_entry);
	nvgpu_rbtree_unlink(&pentry->tree_entry, &cache->mem_tree);
	nvgpu_kfree(g, pentry);
}
356
/*
 * Return one PD slot to its owning nvgpu_pd_mem_entry and retire the whole
 * page once its last slot is freed. Caller holds the cache lock.
 */
static void nvgpu_pd_cache_do_free(struct gk20a *g,
				   struct nvgpu_pd_cache *cache,
				   struct nvgpu_pd_mem_entry *pentry,
				   struct nvgpu_gmmu_pd *pd)
{
	/* mem_offs is a multiple of pd_size, so this recovers the slot index. */
	u32 index = pd->mem_offs / pentry->pd_size;
	u32 bit = 1 << index;

	/* Mark entry as free. */
	pentry->alloc_map &= ~bit;

	if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) {
		/*
		 * Partially full still. If it was already on the partial list
		 * this just re-adds it.
		 */
		nvgpu_list_del(&pentry->list_entry);
		nvgpu_list_add(&pentry->list_entry,
			       &cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);
	} else {
		/* Empty now so free it. */
		nvgpu_pd_cache_free_mem_entry(g, cache, pentry);
	}
}
381
382static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up(
383 struct gk20a *g,
384 struct nvgpu_pd_cache *cache,
385 struct nvgpu_gmmu_pd *pd)
386{
387 struct nvgpu_rbtree_node *node;
388
389 nvgpu_rbtree_search((u64)(uintptr_t)pd->mem, &node,
390 cache->mem_tree);
391 if (!node)
392 return NULL;
393
394 return nvgpu_pd_mem_entry_from_tree_entry(node);
395}
396
397static void nvgpu_pd_cache_free(struct gk20a *g, struct nvgpu_pd_cache *cache,
398 struct nvgpu_gmmu_pd *pd)
399{
400 struct nvgpu_pd_mem_entry *pentry;
401
402 pd_dbg(g, "PD-Free [C] 0x%p", pd->mem);
403
404 pentry = nvgpu_pd_cache_look_up(g, cache, pd);
405 if (!pentry) {
406 WARN(1, "Attempting to free non-existent pd");
407 return;
408 }
409
410 nvgpu_pd_cache_do_free(g, cache, pentry, pd);
411}
412
413void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd)
414{
415 struct gk20a *g = gk20a_from_vm(vm);
416
417 /*
418 * Simple case: just DMA free.
419 */
420 if (!pd->cached)
421 return __nvgpu_pd_cache_free_direct(g, pd);
422
423 nvgpu_mutex_acquire(&g->mm.pd_cache->lock);
424 nvgpu_pd_cache_free(g, g->mm.pd_cache, pd);
425 nvgpu_mutex_release(&g->mm.pd_cache->lock);
426}
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c
index 3aeba500..3ed3c7fe 100644
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -35,21 +35,42 @@ int vm_aspace_id(struct vm_gk20a *vm)
35 return vm->as_share ? vm->as_share->id : -1; 35 return vm->as_share ? vm->as_share->id : -1;
36} 36}
37 37
38static void nvgpu_vm_free_entries(struct vm_gk20a *vm, 38static void __nvgpu_vm_free_entries(struct vm_gk20a *vm,
39 struct nvgpu_gmmu_pd *parent, 39 struct nvgpu_gmmu_pd *pd,
40 int level) 40 int level)
41{ 41{
42 int i; 42 int i;
43 43
44 if (parent->entries) 44 if (pd->mem) {
45 for (i = 0; i < parent->num_entries; i++) 45 __nvgpu_pd_free(vm, pd);
46 nvgpu_vm_free_entries(vm, &parent->entries[i], 46 pd->mem = NULL;
47 }
48
49 if (pd->entries) {
50 for (i = 0; i < pd->num_entries; i++)
51 __nvgpu_vm_free_entries(vm, &pd->entries[i],
47 level + 1); 52 level + 1);
53 nvgpu_vfree(vm->mm->g, pd->entries);
54 pd->entries = NULL;
55 }
56}
57
58static void nvgpu_vm_free_entries(struct vm_gk20a *vm,
59 struct nvgpu_gmmu_pd *pdb)
60{
61 struct gk20a *g = vm->mm->g;
62 int i;
63
64 __nvgpu_pd_cache_free_direct(g, pdb);
65
66 if (!pdb->entries)
67 return;
68
69 for (i = 0; i < pdb->num_entries; i++)
70 __nvgpu_vm_free_entries(vm, &pdb->entries[i], 1);
48 71
49 if (parent->mem.size) 72 nvgpu_vfree(g, pdb->entries);
50 nvgpu_free_gmmu_pages(vm, parent); 73 pdb->entries = NULL;
51 nvgpu_vfree(vm->mm->g, parent->entries);
52 parent->entries = NULL;
53} 74}
54 75
55u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, 76u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size,
@@ -110,7 +131,7 @@ void nvgpu_vm_mapping_batch_finish_locked(
110 131
111 if (mapping_batch->need_tlb_invalidate) { 132 if (mapping_batch->need_tlb_invalidate) {
112 struct gk20a *g = gk20a_from_vm(vm); 133 struct gk20a *g = gk20a_from_vm(vm);
113 g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); 134 g->ops.fb.tlb_invalidate(g, vm->pdb.mem);
114 } 135 }
115} 136}
116 137
@@ -407,9 +428,8 @@ clean_up_allocators:
407 if (nvgpu_alloc_initialized(&vm->user_lp)) 428 if (nvgpu_alloc_initialized(&vm->user_lp))
408 nvgpu_alloc_destroy(&vm->user_lp); 429 nvgpu_alloc_destroy(&vm->user_lp);
409clean_up_page_tables: 430clean_up_page_tables:
410 /* Cleans up nvgpu_vm_init_page_tables() */ 431 /* Cleans up nvgpu_gmmu_init_page_table() */
411 nvgpu_vfree(g, vm->pdb.entries); 432 __nvgpu_pd_cache_free_direct(g, &vm->pdb);
412 nvgpu_free_gmmu_pages(vm, &vm->pdb);
413clean_up_vgpu_vm: 433clean_up_vgpu_vm:
414#ifdef CONFIG_TEGRA_GR_VIRTUALIZATION 434#ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
415 if (g->is_virtual) 435 if (g->is_virtual)
@@ -525,7 +545,7 @@ static void __nvgpu_vm_remove(struct vm_gk20a *vm)
525 if (nvgpu_alloc_initialized(&vm->user_lp)) 545 if (nvgpu_alloc_initialized(&vm->user_lp))
526 nvgpu_alloc_destroy(&vm->user_lp); 546 nvgpu_alloc_destroy(&vm->user_lp);
527 547
528 nvgpu_vm_free_entries(vm, &vm->pdb, 0); 548 nvgpu_vm_free_entries(vm, &vm->pdb);
529 549
530#ifdef CONFIG_TEGRA_GR_VIRTUALIZATION 550#ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
531 if (g->is_virtual) 551 if (g->is_virtual)
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 380c28ac..a0753770 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -25,6 +25,7 @@
25#include <nvgpu/soc.h> 25#include <nvgpu/soc.h>
26#include <nvgpu/enabled.h> 26#include <nvgpu/enabled.h>
27#include <nvgpu/pmu.h> 27#include <nvgpu/pmu.h>
28#include <nvgpu/gmmu.h>
28 29
29#include <trace/events/gk20a.h> 30#include <trace/events/gk20a.h>
30 31
@@ -174,6 +175,14 @@ int gk20a_finalize_poweron(struct gk20a *g)
174 g->gpu_reset_done = true; 175 g->gpu_reset_done = true;
175 } 176 }
176 177
178 /*
179 * Do this early so any early VMs that get made are capable of mapping
180 * buffers.
181 */
182 err = nvgpu_pd_cache_init(g);
183 if (err)
184 return err;
185
177 /* init interface layer support for PMU falcon */ 186 /* init interface layer support for PMU falcon */
178 nvgpu_flcn_sw_init(g, FALCON_ID_PMU); 187 nvgpu_flcn_sw_init(g, FALCON_ID_PMU);
179 nvgpu_flcn_sw_init(g, FALCON_ID_SEC2); 188 nvgpu_flcn_sw_init(g, FALCON_ID_SEC2);
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 558a1b06..0a84cabb 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -478,6 +478,7 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm)
478 478
479 gk20a_semaphore_sea_destroy(g); 479 gk20a_semaphore_sea_destroy(g);
480 gk20a_vidmem_destroy(g); 480 gk20a_vidmem_destroy(g);
481 nvgpu_pd_cache_fini(g);
481} 482}
482 483
483static int gk20a_alloc_sysmem_flush(struct gk20a *g) 484static int gk20a_alloc_sysmem_flush(struct gk20a *g)
@@ -1560,7 +1561,7 @@ static inline u32 big_valid_pde0_bits(struct gk20a *g,
1560 struct nvgpu_gmmu_pd *pd, u64 addr) 1561 struct nvgpu_gmmu_pd *pd, u64 addr)
1561{ 1562{
1562 u32 pde0_bits = 1563 u32 pde0_bits =
1563 nvgpu_aperture_mask(g, &pd->mem, 1564 nvgpu_aperture_mask(g, pd->mem,
1564 gmmu_pde_aperture_big_sys_mem_ncoh_f(), 1565 gmmu_pde_aperture_big_sys_mem_ncoh_f(),
1565 gmmu_pde_aperture_big_video_memory_f()) | 1566 gmmu_pde_aperture_big_video_memory_f()) |
1566 gmmu_pde_address_big_sys_f( 1567 gmmu_pde_address_big_sys_f(
@@ -1573,7 +1574,7 @@ static inline u32 small_valid_pde1_bits(struct gk20a *g,
1573 struct nvgpu_gmmu_pd *pd, u64 addr) 1574 struct nvgpu_gmmu_pd *pd, u64 addr)
1574{ 1575{
1575 u32 pde1_bits = 1576 u32 pde1_bits =
1576 nvgpu_aperture_mask(g, &pd->mem, 1577 nvgpu_aperture_mask(g, pd->mem,
1577 gmmu_pde_aperture_small_sys_mem_ncoh_f(), 1578 gmmu_pde_aperture_small_sys_mem_ncoh_f(),
1578 gmmu_pde_aperture_small_video_memory_f()) | 1579 gmmu_pde_aperture_small_video_memory_f()) |
1579 gmmu_pde_vol_small_true_f() | /* tbd: why? */ 1580 gmmu_pde_vol_small_true_f() | /* tbd: why? */
@@ -2173,14 +2174,14 @@ static int gk20a_init_ce_vm(struct mm_gk20a *mm)
2173void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, 2174void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
2174 struct vm_gk20a *vm) 2175 struct vm_gk20a *vm)
2175{ 2176{
2176 u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); 2177 u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0);
2177 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); 2178 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
2178 u32 pdb_addr_hi = u64_hi32(pdb_addr); 2179 u32 pdb_addr_hi = u64_hi32(pdb_addr);
2179 2180
2180 gk20a_dbg_info("pde pa=0x%llx", pdb_addr); 2181 gk20a_dbg_info("pde pa=0x%llx", pdb_addr);
2181 2182
2182 nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), 2183 nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(),
2183 nvgpu_aperture_mask(g, &vm->pdb.mem, 2184 nvgpu_aperture_mask(g, vm->pdb.mem,
2184 ram_in_page_dir_base_target_sys_mem_ncoh_f(), 2185 ram_in_page_dir_base_target_sys_mem_ncoh_f(),
2185 ram_in_page_dir_base_target_vid_mem_f()) | 2186 ram_in_page_dir_base_target_vid_mem_f()) |
2186 ram_in_page_dir_base_vol_true_f() | 2187 ram_in_page_dir_base_vol_true_f() |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index a245d0e0..cadcffa4 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -31,6 +31,8 @@
31#include <nvgpu/rbtree.h> 31#include <nvgpu/rbtree.h>
32#include <nvgpu/kref.h> 32#include <nvgpu/kref.h>
33 33
34struct nvgpu_pd_cache;
35
34#ifdef CONFIG_ARM64 36#ifdef CONFIG_ARM64
35#define outer_flush_range(a, b) 37#define outer_flush_range(a, b)
36#define __cpuc_flush_dcache_area __flush_dcache_area 38#define __cpuc_flush_dcache_area __flush_dcache_area
@@ -217,6 +219,8 @@ struct mm_gk20a {
217 struct vm_gk20a *vm; 219 struct vm_gk20a *vm;
218 } ce; 220 } ce;
219 221
222 struct nvgpu_pd_cache *pd_cache;
223
220 struct nvgpu_mutex l2_op_lock; 224 struct nvgpu_mutex l2_op_lock;
221 struct nvgpu_mutex tlb_lock; 225 struct nvgpu_mutex tlb_lock;
222 struct nvgpu_mutex priv_lock; 226 struct nvgpu_mutex priv_lock;
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
index c3867e9d..2ff199c6 100644
--- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
@@ -164,7 +164,7 @@ static void update_gmmu_pde3_locked(struct vm_gk20a *vm,
164 164
165 phys_addr >>= gmmu_new_pde_address_shift_v(); 165 phys_addr >>= gmmu_new_pde_address_shift_v();
166 166
167 pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, 167 pde_v[0] |= nvgpu_aperture_mask(g, pd->mem,
168 gmmu_new_pde_aperture_sys_mem_ncoh_f(), 168 gmmu_new_pde_aperture_sys_mem_ncoh_f(),
169 gmmu_new_pde_aperture_video_memory_f()); 169 gmmu_new_pde_aperture_video_memory_f());
170 pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr)); 170 pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr));
@@ -209,7 +209,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm,
209 if (small_valid) { 209 if (small_valid) {
210 pde_v[2] |= 210 pde_v[2] |=
211 gmmu_new_dual_pde_address_small_sys_f(small_addr); 211 gmmu_new_dual_pde_address_small_sys_f(small_addr);
212 pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem, 212 pde_v[2] |= nvgpu_aperture_mask(g, pd->mem,
213 gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), 213 gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(),
214 gmmu_new_dual_pde_aperture_small_video_memory_f()); 214 gmmu_new_dual_pde_aperture_small_video_memory_f());
215 pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); 215 pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f();
@@ -219,7 +219,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm,
219 if (big_valid) { 219 if (big_valid) {
220 pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr); 220 pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr);
221 pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); 221 pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f();
222 pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, 222 pde_v[0] |= nvgpu_aperture_mask(g, pd->mem,
223 gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), 223 gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(),
224 gmmu_new_dual_pde_aperture_big_video_memory_f()); 224 gmmu_new_dual_pde_aperture_big_video_memory_f());
225 pde_v[1] |= big_addr >> 28; 225 pde_v[1] |= big_addr >> 28;
@@ -365,14 +365,14 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g,
365static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, 365static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
366 struct vm_gk20a *vm) 366 struct vm_gk20a *vm)
367{ 367{
368 u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); 368 u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0);
369 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); 369 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
370 u32 pdb_addr_hi = u64_hi32(pdb_addr); 370 u32 pdb_addr_hi = u64_hi32(pdb_addr);
371 371
372 gk20a_dbg_info("pde pa=0x%llx", pdb_addr); 372 gk20a_dbg_info("pde pa=0x%llx", pdb_addr);
373 373
374 nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), 374 nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(),
375 nvgpu_aperture_mask(g, &vm->pdb.mem, 375 nvgpu_aperture_mask(g, vm->pdb.mem,
376 ram_in_page_dir_base_target_sys_mem_ncoh_f(), 376 ram_in_page_dir_base_target_sys_mem_ncoh_f(),
377 ram_in_page_dir_base_target_vid_mem_f()) | 377 ram_in_page_dir_base_target_vid_mem_f()) |
378 ram_in_page_dir_base_vol_true_f() | 378 ram_in_page_dir_base_vol_true_f() |
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index 28a2cb82..eff87c31 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -19,6 +19,9 @@
19 19
20#include <nvgpu/types.h> 20#include <nvgpu/types.h>
21#include <nvgpu/nvgpu_mem.h> 21#include <nvgpu/nvgpu_mem.h>
22#include <nvgpu/list.h>
23#include <nvgpu/rbtree.h>
24#include <nvgpu/lock.h>
22 25
23struct scatterlist; 26struct scatterlist;
24 27
@@ -45,14 +48,85 @@ enum gk20a_mem_rw_flag {
45}; 48};
46 49
47/* 50/*
51 * Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
 52 * structure of course depends on this. The MIN_SHIFT define is the right
53 * number of bits to shift to determine which list to use in the array of lists.
54 */
55#define NVGPU_PD_CACHE_MIN 256
56#define NVGPU_PD_CACHE_MIN_SHIFT 9
57#define NVGPU_PD_CACHE_COUNT 4
58
59struct nvgpu_pd_mem_entry {
60 struct nvgpu_mem mem;
61
62 /*
 63 * Size of the page directories (not the mem). alloc_map is a bitmap showing
64 * which PDs have been allocated. The size of mem will always be one
65 * page. pd_size will always be a power of 2.
66 */
67 u32 pd_size;
68 unsigned long alloc_map;
69
70 struct nvgpu_list_node list_entry;
71 struct nvgpu_rbtree_node tree_entry;
72};
73
74static inline struct nvgpu_pd_mem_entry *
75nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
76{
77 return (struct nvgpu_pd_mem_entry *)
78 ((uintptr_t)node -
79 offsetof(struct nvgpu_pd_mem_entry, list_entry));
80};
81
82static inline struct nvgpu_pd_mem_entry *
83nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
84{
85 return (struct nvgpu_pd_mem_entry *)
86 ((uintptr_t)node -
87 offsetof(struct nvgpu_pd_mem_entry, tree_entry));
88};
89
90/*
91 * A cache for allocating PD memory from. This enables smaller PDs to be packed
92 * into single pages.
93 *
94 * This is fairly complex so see the documentation in pd_cache.c for a full
95 * description of how this is organized.
96 */
97struct nvgpu_pd_cache {
98 /*
99 * Array of lists of full nvgpu_pd_mem_entries and partially full (or
100 * empty) nvgpu_pd_mem_entries.
101 */
102 struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
103 struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
104
105 /*
106 * Tree of all allocated struct nvgpu_mem's for fast look up.
107 */
108 struct nvgpu_rbtree_node *mem_tree;
109
110 /*
 111 * All access to the cache must be locked. This protects the lists and
112 * the rb tree.
113 */
114 struct nvgpu_mutex lock;
115};
116
117/*
48 * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs 118 * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs
49 * in the GMMU. 119 * in the GMMU.
50 */ 120 */
51struct nvgpu_gmmu_pd { 121struct nvgpu_gmmu_pd {
52 /* 122 /*
53 * DMA memory describing the PTEs or PTEs. 123 * DMA memory describing the PTEs or PDEs. @mem_offs describes the
124 * offset of the PDE table in @mem. @cached specifies if this PD is
125 * using pd_cache memory.
54 */ 126 */
55 struct nvgpu_mem mem; 127 struct nvgpu_mem *mem;
128 u32 mem_offs;
129 bool cached;
56 130
57 /* 131 /*
58 * List of pointers to the next level of page tables. Does not 132 * List of pointers to the next level of page tables. Does not
@@ -66,7 +140,7 @@ struct nvgpu_gmmu_pd {
66 * Reduce the number of arguments getting passed through the various levels of 140 * Reduce the number of arguments getting passed through the various levels of
67 * GMMU mapping functions. 141 * GMMU mapping functions.
68 * 142 *
69 * The following fields are set statically and do not change throughout 143 * The following fields are set statically and do not change throughout the
70 * mapping call: 144 * mapping call:
71 * 145 *
72 * pgsz: Index into the page size table. 146 * pgsz: Index into the page size table.
@@ -166,8 +240,13 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
166 struct nvgpu_mem *mem, 240 struct nvgpu_mem *mem,
167 u64 gpu_va); 241 u64 gpu_va);
168 242
169void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, 243int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes);
170 struct nvgpu_gmmu_pd *entry); 244void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd);
245int __nvgpu_pd_cache_alloc_direct(struct gk20a *g,
246 struct nvgpu_gmmu_pd *pd, u32 bytes);
247void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
248int nvgpu_pd_cache_init(struct gk20a *g);
249void nvgpu_pd_cache_fini(struct gk20a *g);
171 250
172/* 251/*
173 * Some useful routines that are shared across chips. 252 * Some useful routines that are shared across chips.
@@ -181,7 +260,7 @@ static inline u32 pd_offset_from_index(const struct gk20a_mmu_level *l,
181static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd, 260static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
182 size_t w, size_t data) 261 size_t w, size_t data)
183{ 262{
184 nvgpu_mem_wr32(g, &pd->mem, w, data); 263 nvgpu_mem_wr32(g, pd->mem, (pd->mem_offs / sizeof(u32)) + w, data);
185} 264}
186 265
187 266
diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h
index 3b8e6b19..a1110a59 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/log.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/log.h
@@ -68,6 +68,7 @@ enum nvgpu_log_categories {
68 gpu_dbg_xv = BIT(17), /* XVE debugging. */ 68 gpu_dbg_xv = BIT(17), /* XVE debugging. */
69 gpu_dbg_shutdown = BIT(18), /* GPU shutdown tracing. */ 69 gpu_dbg_shutdown = BIT(18), /* GPU shutdown tracing. */
70 gpu_dbg_kmem = BIT(19), /* Kmem tracking debugging. */ 70 gpu_dbg_kmem = BIT(19), /* Kmem tracking debugging. */
71 gpu_dbg_pd_cache = BIT(20), /* PD cache traces. */
71 gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */ 72 gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */
72}; 73};
73 74