author     Alex Waterman <alexw@nvidia.com>                      2017-05-11 16:59:22 -0400
committer  mobile promotions <svcmobile_promotions@nvidia.com>   2017-07-06 17:44:15 -0400
commit     c1393d5b68e63c992f4c689cb788139fdf8c2f1a (patch)
tree       00a588d35342d75c05fed7733e91da753ba640fb /drivers/gpu/nvgpu
parent     84f712dee8b582dd7d2a19345c621a2ae3bd6292 (diff)
gpu: nvgpu: gmmu programming rewrite
Update the high-level mapping logic: instead of iterating over the GPU VA, iterate over the scatter-gather table chunks. As a result, each GMMU page table update call is simplified dramatically.

This also modifies the chip-level code so that it no longer requires an SGL as an argument. Each call into the chip-level code is guaranteed to be contiguous, so it only has to worry about making a mapping from virt -> phys. This removes the dependency on Linux that the chip code currently has. With this patch the core GMMU code still uses the Linux SGL, but the logic is readily transferable to a different, nvgpu-specific scatter-gather list format in the near future.

The last major update is to push most of the page table attribute arguments into a struct that is passed down through the various mapping levels. This makes the function calls simpler and easier to follow.

JIRA NVGPU-30

Change-Id: Ibb6b11755f99818fe642622ca0bd4cbed054f602
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master/r/1484104
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
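For orientation, the sketch below reconstructs the attribute struct that this patch threads through every mapping level, plus the new shape of the per-level chip hook, from the fields and calls visible in the diff (gmmu.c, mm_gk20a.c, mm_gp10b.c). The real definitions live in include/nvgpu/gmmu.h, which is not shown in full here, so treat field types, ordering, and comments as approximate rather than authoritative:

    /* One attribute bundle replaces the long list of scalar arguments. */
    struct nvgpu_gmmu_attrs {
            u32                  pgsz;      /* Page size index: small/big/kernel. */
            u8                   kind_v;    /* Kind attribute of the mapping. */
            u64                  ctag;      /* Comptag byte offset; advanced per page. */
            bool                 cacheable; /* L2 cacheable; otherwise PTE marked volatile. */
            int                  rw_flag;   /* gk20a_mem_flag_none/read_only/write_only. */
            bool                 sparse;    /* Sparse mapping: invalid-but-volatile PTEs. */
            bool                 priv;      /* Privileged mapping. */
            bool                 valid;     /* Cleared for UNMAPPED_PTE requests. */
            enum nvgpu_aperture  aperture;  /* SYSMEM vs. VIDMEM target. */
            bool                 debug;     /* Print via nvgpu_info() instead of nvgpu_log(). */
    };

    /*
     * The per-level hook in struct gk20a_mmu_level is now handed a single
     * contiguous virt -> phys chunk plus the attrs struct; the core code
     * walks the scatter-gather list and calls the hook once per chunk.
     */
    void (*update_entry)(struct vm_gk20a *vm,
                         const struct gk20a_mmu_level *l,
                         struct nvgpu_gmmu_pd *pd, u32 pd_idx,
                         u64 virt_addr, u64 phys_addr,
                         struct nvgpu_gmmu_attrs *attrs);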
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r--  drivers/gpu/nvgpu/common/mm/gmmu.c           | 976
-rw-r--r--  drivers/gpu/nvgpu/common/mm/vm.c             |  36
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fb_gk20a.c           |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c           | 306
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h           |  16
-rw-r--r--  drivers/gpu/nvgpu/gp10b/mm_gp10b.c           | 309
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/gmmu.h       | 136
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h  |   6
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/vm.h         |   3
9 files changed, 979 insertions, 811 deletions
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c
index 06291600..ec1bc095 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu.c
@@ -25,115 +25,26 @@
25#include "gk20a/gk20a.h" 25#include "gk20a/gk20a.h"
26#include "gk20a/mm_gk20a.h" 26#include "gk20a/mm_gk20a.h"
27 27
28#define gmmu_dbg(g, fmt, args...) \ 28#define __gmmu_dbg(g, attrs, fmt, args...) \
29 nvgpu_log(g, gpu_dbg_map, fmt, ##args) 29 do { \
30#define gmmu_dbg_v(g, fmt, args...) \ 30 if (attrs->debug) \
31 nvgpu_log(g, gpu_dbg_map_v, fmt, ##args) 31 nvgpu_info(g, fmt, ##args); \
32 32 else \
33static int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) 33 nvgpu_log(g, gpu_dbg_map, fmt, ##args); \
34{ 34 } while (0)
35 return nvgpu_mem_begin(g, &entry->mem); 35
36} 36#define __gmmu_dbg_v(g, attrs, fmt, args...) \
37 37 do { \
38static void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) 38 if (attrs->debug) \
39{ 39 nvgpu_info(g, fmt, ##args); \
40 nvgpu_mem_end(g, &entry->mem); 40 else \
41} 41 nvgpu_log(g, gpu_dbg_map_v, fmt, ##args); \
42 42 } while (0)
43static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 order, 43
44 struct gk20a_mm_entry *entry) 44static int pd_allocate(struct vm_gk20a *vm,
45{ 45 struct nvgpu_gmmu_pd *pd,
46 struct gk20a *g = gk20a_from_vm(vm); 46 const struct gk20a_mmu_level *l,
47 u32 num_pages = 1 << order; 47 struct nvgpu_gmmu_attrs *attrs);
48 u32 len = num_pages * PAGE_SIZE;
49 int err;
50
51 err = nvgpu_dma_alloc(g, len, &entry->mem);
52
53 if (err) {
54 nvgpu_err(g, "memory allocation failed");
55 return -ENOMEM;
56 }
57
58 return 0;
59}
60
61void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
62 struct gk20a_mm_entry *entry)
63{
64 struct gk20a *g = gk20a_from_vm(vm);
65
66 if (!entry->mem.size)
67 return;
68
69 if (entry->woffset) /* fake shadow mem */
70 return;
71
72 nvgpu_dma_free(g, &entry->mem);
73}
74
75/*
76 * Allocate a phys contig region big enough for a full
77 * sized gmmu page table for the given gmmu_page_size.
78 * the whole range is zeroed so it's "invalid"/will fault.
79 *
80 * If a previous entry is supplied, its memory will be used for
81 * suballocation for this next entry too, if there is space.
82 */
83int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm,
84 enum gmmu_pgsz_gk20a pgsz_idx,
85 const struct gk20a_mmu_level *l,
86 struct gk20a_mm_entry *entry,
87 struct gk20a_mm_entry *prev_entry)
88{
89 int err = -ENOMEM;
90 int order;
91 struct gk20a *g = gk20a_from_vm(vm);
92 u32 bytes;
93
94 /* allocate enough pages for the table */
95 order = l->hi_bit[pgsz_idx] - l->lo_bit[pgsz_idx] + 1;
96 order += ilog2(l->entry_size);
97 bytes = 1 << order;
98 order -= PAGE_SHIFT;
99 if (order < 0 && prev_entry) {
100 /* try to suballocate from previous chunk */
101 u32 capacity = prev_entry->mem.size / bytes;
102 u32 prev = prev_entry->woffset * sizeof(u32) / bytes;
103 u32 free = capacity - prev - 1;
104
105 nvgpu_log(g, gpu_dbg_pte, "cap %d prev %d free %d bytes %d",
106 capacity, prev, free, bytes);
107
108 if (free) {
109 memcpy(&entry->mem, &prev_entry->mem,
110 sizeof(entry->mem));
111 entry->woffset = prev_entry->woffset
112 + bytes / sizeof(u32);
113 err = 0;
114 }
115 }
116
117 if (err) {
118 /* no suballoc space */
119 order = max(0, order);
120 err = nvgpu_alloc_gmmu_pages(vm, order, entry);
121 entry->woffset = 0;
122 }
123
124 nvgpu_log(g, gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d, woff %x",
125 entry,
126 (entry->mem.priv.sgt &&
127 entry->mem.aperture == APERTURE_SYSMEM) ?
128 g->ops.mm.get_iova_addr(g, entry->mem.priv.sgt->sgl, 0) : 0,
129 order, entry->woffset);
130 if (err)
131 return err;
132 entry->pgsz = pgsz_idx;
133 entry->mem.skip_wmb = true;
134
135 return err;
136}
137 48
138/* 49/*
139 * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU 50 * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU
@@ -225,103 +136,484 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va)
225 nvgpu_mutex_release(&vm->update_gmmu_lock); 136 nvgpu_mutex_release(&vm->update_gmmu_lock);
226} 137}
227 138
228static int update_gmmu_level_locked(struct vm_gk20a *vm, 139int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
229 struct gk20a_mm_entry *pte, 140{
230 enum gmmu_pgsz_gk20a pgsz_idx, 141 /*
231 struct scatterlist **sgl, 142 * Need this just for page size. Everything else can be ignored. Also
232 u64 *offset, 143 * note that we can just use pgsz 0 (i.e small pages) since the number
233 u64 *iova, 144 * of bits present in the top level PDE are the same for small/large
234 u64 gpu_va, u64 gpu_end, 145 * page VMs.
235 u8 kind_v, u64 *ctag, 146 */
236 bool cacheable, bool unmapped_pte, 147 struct nvgpu_gmmu_attrs attrs = {
237 int rw_flag, 148 .pgsz = 0,
238 bool sparse, 149 };
239 int lvl, 150
240 bool priv, 151 return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs);
241 enum nvgpu_aperture aperture) 152}
153
154
155/*
156 * Ensure that there's a CPU mapping for the page directory memory. This won't
157 * always be the case for 32 bit systems since we may need to save kernel
158 * virtual memory.
159 */
160static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry)
161{
162 return nvgpu_mem_begin(g, &entry->mem);
163}
164
165/*
166 * Handle any necessary CPU unmap semantics for a page directory's DMA memory.
167 * For 64 bit platforms this is a noop.
168 */
169static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry)
170{
171 nvgpu_mem_end(g, &entry->mem);
172}
173
174static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes,
175 struct nvgpu_gmmu_pd *pd)
242{ 176{
243 struct gk20a *g = gk20a_from_vm(vm); 177 struct gk20a *g = gk20a_from_vm(vm);
244 const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; 178 unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS;
245 const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1]; 179 int err;
246 int err = 0; 180
247 u32 pde_i; 181 /*
248 u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx]; 182 * On arm32 vmalloc space is a precious commodity so we do not map pages
249 struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL; 183 * by default.
184 */
185 if (!IS_ENABLED(CONFIG_ARM64))
186 flags |= NVGPU_DMA_NO_KERNEL_MAPPING;
187
188 err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem);
189 if (err)
190 return -ENOMEM;
191
192 return 0;
193}
194
195void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
196 struct nvgpu_gmmu_pd *pd)
197{
198 struct gk20a *g = gk20a_from_vm(vm);
199
200 nvgpu_dma_free(g, &pd->mem);
201}
202
203/*
204 * Return the _physical_ address of a page directory.
205 */
206u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
207{
208 if (g->mm.has_physical_mode)
209 return sg_phys(pd->mem.priv.sgt->sgl);
210 else
211 return nvgpu_mem_get_base_addr(g, &pd->mem, 0);
212}
213
214/*
215 * Return the aligned length based on the page size in attrs.
216 */
217static u64 nvgpu_align_map_length(struct vm_gk20a *vm, u64 length,
218 struct nvgpu_gmmu_attrs *attrs)
219{
220 u64 page_size = vm->gmmu_page_sizes[attrs->pgsz];
221
222 return ALIGN(length, page_size);
223}
224
225static u32 pd_entries(const struct gk20a_mmu_level *l,
226 struct nvgpu_gmmu_attrs *attrs)
227{
228 /*
229 * Number of entries in a PD is easy to compute from the number of bits
230 * used to index the page directory. That is simply 2 raised to the
231 * number of bits.
232 */
233 return 1UL << (l->hi_bit[attrs->pgsz] - l->lo_bit[attrs->pgsz] + 1UL);
234}
235
236/*
237 * Computes the size of a PD table.
238 */
239static u32 pd_size(const struct gk20a_mmu_level *l,
240 struct nvgpu_gmmu_attrs *attrs)
241{
242 return pd_entries(l, attrs) * l->entry_size;
243}
244
245/*
246 * Allocate a physically contiguous region big enough for a gmmu page table
247 * of the specified level and page size. The whole range is zeroed so that any
248 * accesses will fault until proper values are programmed.
249 */
250static int pd_allocate(struct vm_gk20a *vm,
251 struct nvgpu_gmmu_pd *pd,
252 const struct gk20a_mmu_level *l,
253 struct nvgpu_gmmu_attrs *attrs)
254{
255 int err;
250 256
251 gk20a_dbg_fn(""); 257 if (pd->mem.size)
258 return 0;
252 259
253 pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL)) 260 err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd);
254 >> (u64)l->lo_bit[pgsz_idx]; 261 if (err) {
262 nvgpu_info(vm->mm->g, "error allocating page directory!");
263 return err;
264 }
255 265
256 gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx", 266 /*
257 pgsz_idx, lvl, gpu_va, gpu_end-1, *iova); 267 * One mb() is done after all mapping operations. Don't need individual
268 * barriers for each PD write.
269 */
270 pd->mem.skip_wmb = true;
258 271
259 while (gpu_va < gpu_end) { 272 return 0;
260 u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end); 273}
261 274
262 /* Allocate next level */ 275/*
276 * Compute what page directory index at the passed level the passed virtual
277 * address corresponds to. @attrs is necessary for determining the page size
278 * which is used to pick the right bit offsets for the GMMU level.
279 */
280static u32 pd_index(const struct gk20a_mmu_level *l, u64 virt,
281 struct nvgpu_gmmu_attrs *attrs)
282{
283 u64 pd_mask = (1ULL << ((u64)l->hi_bit[attrs->pgsz] + 1)) - 1ULL;
284 u32 pd_shift = (u64)l->lo_bit[attrs->pgsz];
285
286 /*
287 * For convenience we don't bother computing the lower bound of the
288 * mask; it's easier to just shift it off.
289 */
290 return (virt & pd_mask) >> pd_shift;
291}
292
293static int pd_allocate_children(struct vm_gk20a *vm,
294 const struct gk20a_mmu_level *l,
295 struct nvgpu_gmmu_pd *pd,
296 struct nvgpu_gmmu_attrs *attrs)
297{
298 struct gk20a *g = gk20a_from_vm(vm);
299
300 if (pd->entries)
301 return 0;
302
303 pd->num_entries = pd_entries(l, attrs);
304 pd->entries = nvgpu_vzalloc(g, sizeof(struct nvgpu_gmmu_pd) *
305 pd->num_entries);
306 if (!pd->entries)
307 return -ENOMEM;
308
309 return 0;
310}
311
312/*
313 * This function programs the GMMU based on two ranges: a physical range and a
314 * GPU virtual range. The virtual is mapped to the physical. Physical in this
315 * case can mean either a real physical sysmem address or an IO virtual address
316 * (for instance when a system has an IOMMU running).
317 *
318 * The rest of the parameters are for describing the actual mapping itself.
319 *
320 * This function recursively calls itself for handling PDEs. At the final level
321 * a PTE handler is called. The phys and virt ranges are adjusted for each
322 * recursion so that each invocation of this function need only worry about the
323 * range it is passed.
324 *
325 * phys_addr will always point to a contiguous range - the discontiguous nature
326 * of DMA buffers is taken care of at the layer above this.
327 */
328static int __set_pd_level(struct vm_gk20a *vm,
329 struct nvgpu_gmmu_pd *pd,
330 int lvl,
331 u64 phys_addr,
332 u64 virt_addr, u64 length,
333 struct nvgpu_gmmu_attrs *attrs)
334{
335 int err = 0;
336 u64 pde_range;
337 struct gk20a *g = gk20a_from_vm(vm);
338 struct nvgpu_gmmu_pd *next_pd = NULL;
339 const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl];
340 const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl + 1];
341
342 /*
343 * 5 levels for Pascal+. For pre-pascal we only have 2. This puts
344 * offsets into the page table debugging code which makes it easier to
345 * see what level prints are from.
346 */
347 static const char *__lvl_debug[] = {
348 "", /* L=0 */
349 " ", /* L=1 */
350 " ", /* L=2 */
351 " ", /* L=3 */
352 " ", /* L=4 */
353 };
354
355 pde_range = 1ULL << (u64)l->lo_bit[attrs->pgsz];
356
357 __gmmu_dbg_v(g, attrs,
358 "L=%d %sGPU virt %#-12llx +%#-9llx -> phys %#-12llx",
359 lvl,
360 __lvl_debug[lvl],
361 virt_addr,
362 length,
363 phys_addr);
364
365 /*
366 * Iterate across the mapping in chunks the size of this level's PDE.
367 * For each of those chunks program our level's PDE and then, if there's
368 * a next level, program the next level's PDEs/PTEs.
369 */
370 while (length) {
371 u32 pd_idx = pd_index(l, virt_addr, attrs);
372 u64 chunk_size;
373 u64 target_addr;
374
375 /*
376 * Truncate the pde_range when the virtual address does not
377 * start at a PDE boundary.
378 */
379 chunk_size = min(length,
380 pde_range - (virt_addr & (pde_range - 1)));
381
382 /*
383 * If the next level has an update_entry function then we know
384 * that _this_ level points to PDEs (not PTEs). Thus we need to
385 * have a bunch of children PDs.
386 */
263 if (next_l->update_entry) { 387 if (next_l->update_entry) {
264 if (!pte->entries) { 388 if (pd_allocate_children(vm, l, pd, attrs))
265 int num_entries = 389 return -ENOMEM;
266 1 << 390
267 (l->hi_bit[pgsz_idx] 391 /*
268 - l->lo_bit[pgsz_idx] + 1); 392 * Get the next PD so that we know what to put in this
269 pte->entries = 393 * current PD. If the next level is actually PTEs then
270 nvgpu_vzalloc(g, 394 * we don't need this - we will just use the real
271 sizeof(struct gk20a_mm_entry) * 395 * physical target.
272 num_entries); 396 */
273 if (!pte->entries) 397 next_pd = &pd->entries[pd_idx];
274 return -ENOMEM; 398
275 pte->pgsz = pgsz_idx; 399 /*
276 pte->num_entries = num_entries; 400 * Allocate the backing memory for next_pd.
277 } 401 */
278 prev_pte = next_pte; 402 if (pd_allocate(vm, next_pd, next_l, attrs))
279 next_pte = pte->entries + pde_i; 403 return -ENOMEM;
280
281 if (!next_pte->mem.size) {
282 err = nvgpu_zalloc_gmmu_page_table(vm,
283 pgsz_idx, next_l, next_pte, prev_pte);
284 if (err)
285 return err;
286 }
287 } 404 }
288 405
289 err = l->update_entry(vm, pte, pde_i, pgsz_idx, 406 /*
290 sgl, offset, iova, 407 * This is the address we want to program into the actual PDE/
291 kind_v, ctag, cacheable, unmapped_pte, 408 * PTE. When the next level is PDEs we need the target address
292 rw_flag, sparse, priv, aperture); 409 * to be the table of PDEs. When the next level is PTEs the
293 if (err) 410 * target addr is the real physical address we are aiming for.
294 return err; 411 */
412 target_addr = next_pd ? nvgpu_pde_phys_addr(g, next_pd) :
413 phys_addr;
414
415 l->update_entry(vm, l,
416 pd, pd_idx,
417 virt_addr,
418 target_addr,
419 attrs);
295 420
296 if (next_l->update_entry) { 421 if (next_l->update_entry) {
297 /* get cpu access to the ptes */ 422 err = map_gmmu_pages(g, next_pd);
298 err = map_gmmu_pages(g, next_pte);
299 if (err) { 423 if (err) {
300 nvgpu_err(g, 424 nvgpu_err(g,
301 "couldn't map ptes for update as=%d", 425 "couldn't map ptes for update as=%d",
302 vm_aspace_id(vm)); 426 vm_aspace_id(vm));
303 return err; 427 return err;
304 } 428 }
305 err = update_gmmu_level_locked(vm, next_pte, 429
306 pgsz_idx, 430 err = __set_pd_level(vm, next_pd,
307 sgl, 431 lvl + 1,
308 offset, 432 phys_addr,
309 iova, 433 virt_addr,
310 gpu_va, 434 chunk_size,
311 next, 435 attrs);
312 kind_v, ctag, cacheable, unmapped_pte, 436 unmap_gmmu_pages(g, next_pd);
313 rw_flag, sparse, lvl+1, priv, aperture);
314 unmap_gmmu_pages(g, next_pte);
315 437
316 if (err) 438 if (err)
317 return err; 439 return err;
318 } 440 }
319 441
320 pde_i++; 442 virt_addr += chunk_size;
321 gpu_va = next; 443
444 /*
445 * Only add to phys_addr if it's non-zero. A zero value implies
446 * we are unmapping as as a result we don't want to place
447 * non-zero phys addresses in the PTEs. A non-zero phys-addr
448 * would also confuse the lower level PTE programming code.
449 */
450 if (phys_addr)
451 phys_addr += chunk_size;
452 length -= chunk_size;
453 }
454
455 __gmmu_dbg_v(g, attrs, "L=%d %s%s", lvl, __lvl_debug[lvl], "ret!");
456
457 return 0;
458}
459
460/*
461 * VIDMEM version of the update_ptes logic.
462 */
463static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm,
464 struct sg_table *sgt,
465 u64 space_to_skip,
466 u64 virt_addr,
467 u64 length,
468 struct nvgpu_gmmu_attrs *attrs)
469{
470 struct nvgpu_page_alloc *alloc = NULL;
471 struct page_alloc_chunk *chunk = NULL;
472 u64 phys_addr, chunk_length;
473 int err = 0;
474
475 if (!sgt) {
476 /*
477 * This is considered an unmap. Just pass in 0 as the physical
478 * address for the entire GPU range.
479 */
480 err = __set_pd_level(vm, &vm->pdb,
481 0,
482 0,
483 virt_addr, length,
484 attrs);
485 return err;
486 }
487
488 alloc = get_vidmem_page_alloc(sgt->sgl);
489
490 /*
491 * Otherwise iterate across all the chunks in this allocation and
492 * map them.
493 */
494 nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks,
495 page_alloc_chunk, list_entry) {
496 if (space_to_skip &&
497 space_to_skip >= chunk->length) {
498 space_to_skip -= chunk->length;
499 continue;
500 }
501
502 phys_addr = chunk->base + space_to_skip;
503 chunk_length = min(length, (chunk->length - space_to_skip));
504
505 err = __set_pd_level(vm, &vm->pdb,
506 0,
507 phys_addr,
508 virt_addr, length,
509 attrs);
510 if (err)
511 break;
512
513 /* Space has been skipped so zero this for future chunks. */
514 space_to_skip = 0;
515
516 /*
517 * Update the map pointer and the remaining length.
518 */
519 virt_addr += chunk_length;
520 length -= chunk_length;
521
522 if (length == 0)
523 break;
322 } 524 }
323 525
324 gk20a_dbg_fn("done"); 526 return err;
527}
528
529static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm,
530 struct sg_table *sgt,
531 u64 space_to_skip,
532 u64 virt_addr,
533 u64 length,
534 struct nvgpu_gmmu_attrs *attrs)
535{
536 int err;
537 struct scatterlist *sgl;
538 struct gk20a *g = gk20a_from_vm(vm);
539
540 if (!sgt) {
541 /*
542 * This is considered an unmap. Just pass in 0 as the physical
543 * address for the entire GPU range.
544 */
545 err = __set_pd_level(vm, &vm->pdb,
546 0,
547 0,
548 virt_addr, length,
549 attrs);
550 return err;
551 }
552
553 /*
554 * At this point we have a Linux scatter-gather list pointing to some
555 * number of discontiguous chunks of memory. Iterate over that list and
556 * generate a GMMU map call for each chunk. There are two possibilities:
557 * either the IOMMU is enabled or not. When the IOMMU is enabled the
558 * mapping is simple since the "physical" address is actually a virtual
559 * IO address and will be contiguous. The no-IOMMU case is more
560 * complicated. We will have to iterate over the SGT and do a separate
561 * map for each chunk of the SGT.
562 */
563 sgl = sgt->sgl;
564
565 if (!g->mm.bypass_smmu) {
566 u64 io_addr = g->ops.mm.get_iova_addr(g, sgl, 0);
567
568 io_addr += space_to_skip;
569
570 err = __set_pd_level(vm, &vm->pdb,
571 0,
572 io_addr,
573 virt_addr,
574 length,
575 attrs);
576
577 return err;
578 }
579
580 /*
581 * Finally: last possible case: do the no-IOMMU mapping. In this case we
582 * really are mapping physical pages directly.
583 */
584 while (sgl) {
585 u64 phys_addr;
586 u64 chunk_length;
587
588 /*
589 * Cut out sgl ents for space_to_skip.
590 */
591 if (space_to_skip && space_to_skip >= sgl->length) {
592 space_to_skip -= sgl->length;
593 sgl = sg_next(sgl);
594 continue;
595 }
596
597 phys_addr = sg_phys(sgl) + space_to_skip;
598 chunk_length = min(length, sgl->length - space_to_skip);
599
600 err = __set_pd_level(vm, &vm->pdb,
601 0,
602 phys_addr,
603 virt_addr,
604 chunk_length,
605 attrs);
606 if (err)
607 return err;
608
609 space_to_skip = 0;
610 virt_addr += chunk_length;
611 length -= chunk_length;
612 sgl = sg_next(sgl);
613
614 if (length == 0)
615 break;
616 }
325 617
326 return 0; 618 return 0;
327} 619}
@@ -332,8 +624,8 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm,
332 * physical* address. 624 * physical* address.
333 * 625 *
334 * The update of each level of the page tables is farmed out to chip specific 626 * The update of each level of the page tables is farmed out to chip specific
335 * implementations. But the logic around that is generic to all chips. Every chip 627 * implementations. But the logic around that is generic to all chips. Every
336 * has some number of PDE levels and then a PTE level. 628 * chip has some number of PDE levels and then a PTE level.
337 * 629 *
338 * Each chunk of the incoming SGT is sent to the chip specific implementation 630 * Each chunk of the incoming SGT is sent to the chip specific implementation
339 * of page table update. 631 * of page table update.
@@ -341,148 +633,81 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm,
341 * [*] Note: the "physical" address may actually be an IO virtual address in the 633 * [*] Note: the "physical" address may actually be an IO virtual address in the
342 * case of SMMU usage. 634 * case of SMMU usage.
343 */ 635 */
344static int update_gmmu_ptes_locked(struct vm_gk20a *vm, 636static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm,
345 enum gmmu_pgsz_gk20a pgsz_idx, 637 struct sg_table *sgt,
346 struct sg_table *sgt, 638 u64 space_to_skip,
347 u64 buffer_offset, 639 u64 virt_addr,
348 u64 gpu_va, u64 gpu_end, 640 u64 length,
349 u8 kind_v, u32 ctag_offset, 641 struct nvgpu_gmmu_attrs *attrs)
350 bool cacheable, bool unmapped_pte,
351 int rw_flag,
352 bool sparse,
353 bool priv,
354 enum nvgpu_aperture aperture)
355{ 642{
356 struct gk20a *g = gk20a_from_vm(vm); 643 struct gk20a *g = gk20a_from_vm(vm);
357 int ctag_granularity = g->ops.fb.compression_page_size(g); 644 u32 page_size;
358 u64 ctag = (u64)ctag_offset * (u64)ctag_granularity;
359 u64 iova = 0;
360 u64 space_to_skip = buffer_offset;
361 u64 map_size = gpu_end - gpu_va;
362 u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
363 int err; 645 int err;
364 struct scatterlist *sgl = NULL;
365 struct nvgpu_page_alloc *alloc = NULL;
366 struct page_alloc_chunk *chunk = NULL;
367 u64 length;
368 646
369 /* note: here we need to map kernel to small, since the 647 /* note: here we need to map kernel to small, since the
370 * low-level mmu code assumes 0 is small and 1 is big pages */ 648 * low-level mmu code assumes 0 is small and 1 is big pages */
371 if (pgsz_idx == gmmu_page_size_kernel) 649 if (attrs->pgsz == gmmu_page_size_kernel)
372 pgsz_idx = gmmu_page_size_small; 650 attrs->pgsz = gmmu_page_size_small;
651
652 page_size = vm->gmmu_page_sizes[attrs->pgsz];
373 653
374 if (space_to_skip & (page_size - 1)) 654 if (space_to_skip & (page_size - 1))
375 return -EINVAL; 655 return -EINVAL;
376 656
657 /*
658 * Update length to be aligned to the passed page size.
659 */
660 length = nvgpu_align_map_length(vm, length, attrs);
661
377 err = map_gmmu_pages(g, &vm->pdb); 662 err = map_gmmu_pages(g, &vm->pdb);
378 if (err) { 663 if (err) {
379 nvgpu_err(g, 664 nvgpu_err(g, "couldn't map ptes for update as=%d",
380 "couldn't map ptes for update as=%d", 665 vm_aspace_id(vm));
381 vm_aspace_id(vm));
382 return err; 666 return err;
383 } 667 }
384 668
385 if (aperture == APERTURE_VIDMEM) { 669 __gmmu_dbg(g, attrs,
386 gmmu_dbg_v(g, "vidmem map size_idx=%d, gpu_va=[%llx,%llx]", 670 "vm=%s "
387 pgsz_idx, gpu_va, gpu_end-1); 671 "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx "
388 672 "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | "
389 if (sgt) { 673 "kind=%#02x APT=%-6s %c%c%c",
390 alloc = get_vidmem_page_alloc(sgt->sgl); 674 vm->name,
391 675 sgt ? "MAP" : "UNMAP",
392 nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, 676 virt_addr,
393 page_alloc_chunk, list_entry) { 677 length,
394 if (space_to_skip && 678 sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL,
395 space_to_skip > chunk->length) { 679 space_to_skip,
396 space_to_skip -= chunk->length; 680 page_size >> 10,
397 } else { 681 nvgpu_gmmu_perm_str(attrs->rw_flag),
398 iova = chunk->base + space_to_skip; 682 attrs->kind_v,
399 length = chunk->length - space_to_skip; 683 nvgpu_aperture_str(attrs->aperture),
400 length = min(length, map_size); 684 attrs->cacheable ? 'C' : 'V', /* C = cached, V = volatile. */
401 space_to_skip = 0; 685 attrs->sparse ? 'S' : '-',
402 686 attrs->priv ? 'P' : '-');
403 err = update_gmmu_level_locked(vm, 687
404 &vm->pdb, pgsz_idx, 688 /*
405 &sgl, 689 * Handle VIDMEM programming. Currently uses a different scatter list
406 &space_to_skip, 690 * format.
407 &iova, 691 */
408 gpu_va, gpu_va + length, 692 if (attrs->aperture == APERTURE_VIDMEM)
409 kind_v, &ctag, 693 err = __nvgpu_gmmu_update_page_table_vidmem(vm,
410 cacheable, unmapped_pte, 694 sgt,
411 rw_flag, sparse, 0, priv, 695 space_to_skip,
412 aperture); 696 virt_addr,
413 if (err) 697 length,
414 break; 698 attrs);
415 699 else
416 /* need to set explicit zero here */ 700 err = __nvgpu_gmmu_update_page_table_sysmem(vm,
417 space_to_skip = 0; 701 sgt,
418 gpu_va += length; 702 space_to_skip,
419 map_size -= length; 703 virt_addr,
420 704 length,
421 if (!map_size) 705 attrs);
422 break;
423 }
424 }
425 } else {
426 err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
427 &sgl,
428 &space_to_skip,
429 &iova,
430 gpu_va, gpu_end,
431 kind_v, &ctag,
432 cacheable, unmapped_pte, rw_flag,
433 sparse, 0, priv,
434 aperture);
435 }
436 } else {
437 gmmu_dbg_v(g,
438 "pgsz=%-6d, gpu_va: %#-12llx +%#-6llx phys: %#-12llx "
439 "buffer offset: %-4lld, nents: %d",
440 page_size,
441 gpu_va, gpu_end - gpu_va,
442 sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL,
443 buffer_offset,
444 sgt ? sgt->nents : 0);
445
446 if (sgt) {
447 iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
448 if (!vm->mm->bypass_smmu && iova) {
449 iova += space_to_skip;
450 } else {
451 sgl = sgt->sgl;
452
453 gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
454 (u64)sg_phys(sgl),
455 sgl->length);
456
457 while (space_to_skip && sgl &&
458 space_to_skip + page_size > sgl->length) {
459 space_to_skip -= sgl->length;
460 sgl = sg_next(sgl);
461 gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
462 (u64)sg_phys(sgl),
463 sgl->length);
464 }
465
466 iova = sg_phys(sgl) + space_to_skip;
467 }
468 }
469
470 err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
471 &sgl,
472 &space_to_skip,
473 &iova,
474 gpu_va, gpu_end,
475 kind_v, &ctag,
476 cacheable, unmapped_pte, rw_flag,
477 sparse, 0, priv,
478 aperture);
479 }
480 706
481 unmap_gmmu_pages(g, &vm->pdb); 707 unmap_gmmu_pages(g, &vm->pdb);
482
483 mb(); 708 mb();
484 709
485 gk20a_dbg_fn("done"); 710 __gmmu_dbg(g, attrs, "%-5s Done!", sgt ? "MAP" : "UNMAP");
486 711
487 return err; 712 return err;
488} 713}
@@ -500,32 +725,44 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
500 * have the update_gmmu_lock acquired. 725 */
501 */ 726 */
502u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, 727u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
503 u64 map_offset, 728 u64 vaddr,
504 struct sg_table *sgt, 729 struct sg_table *sgt,
505 u64 buffer_offset, 730 u64 buffer_offset,
506 u64 size, 731 u64 size,
507 int pgsz_idx, 732 int pgsz_idx,
508 u8 kind_v, 733 u8 kind_v,
509 u32 ctag_offset, 734 u32 ctag_offset,
510 u32 flags, 735 u32 flags,
511 int rw_flag, 736 int rw_flag,
512 bool clear_ctags, 737 bool clear_ctags,
513 bool sparse, 738 bool sparse,
514 bool priv, 739 bool priv,
515 struct vm_gk20a_mapping_batch *batch, 740 struct vm_gk20a_mapping_batch *batch,
516 enum nvgpu_aperture aperture) 741 enum nvgpu_aperture aperture)
517{ 742{
743 struct gk20a *g = gk20a_from_vm(vm);
518 int err = 0; 744 int err = 0;
519 bool allocated = false; 745 bool allocated = false;
520 struct gk20a *g = gk20a_from_vm(vm);
521 int ctag_granularity = g->ops.fb.compression_page_size(g); 746 int ctag_granularity = g->ops.fb.compression_page_size(g);
522 u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity); 747 struct nvgpu_gmmu_attrs attrs = {
523 748 .pgsz = pgsz_idx,
524 /* Allocate (or validate when map_offset != 0) the virtual address. */ 749 .kind_v = kind_v,
525 if (!map_offset) { 750 .ctag = (u64)ctag_offset * (u64)ctag_granularity,
526 map_offset = __nvgpu_vm_alloc_va(vm, size, 751 .cacheable = flags & NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
527 pgsz_idx); 752 .rw_flag = rw_flag,
528 if (!map_offset) { 753 .sparse = sparse,
754 .priv = priv,
755 .valid = !(flags & NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE),
756 .aperture = aperture
757 };
758
759 /*
760 * Only allocate a new GPU VA range if we haven't already been passed a
761 * GPU VA range. This facilitates fixed mappings.
762 */
763 if (!vaddr) {
764 vaddr = __nvgpu_vm_alloc_va(vm, size, pgsz_idx);
765 if (!vaddr) {
529 nvgpu_err(g, "failed to allocate va space"); 766 nvgpu_err(g, "failed to allocate va space");
530 err = -ENOMEM; 767 err = -ENOMEM;
531 goto fail_alloc; 768 goto fail_alloc;
@@ -533,34 +770,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
533 allocated = true; 770 allocated = true;
534 } 771 }
535 772
536 gmmu_dbg(g, 773 err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset,
537 "gv: 0x%04x_%08x + 0x%-7llx " 774 vaddr, size, &attrs);
538 "[dma: 0x%02x_%08x, pa: 0x%02x_%08x] "
539 "pgsz=%-3dKb as=%-2d ctags=%d start=%d "
540 "kind=0x%x flags=0x%x apt=%s",
541 u64_hi32(map_offset), u64_lo32(map_offset), size,
542 sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0,
543 sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0,
544 sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0,
545 sgt ? u64_lo32((u64)sg_phys(sgt->sgl)) : 0,
546 vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm),
547 ctag_lines, ctag_offset,
548 kind_v, flags, nvgpu_aperture_str(aperture));
549
550 err = update_gmmu_ptes_locked(vm, pgsz_idx,
551 sgt,
552 buffer_offset,
553 map_offset, map_offset + size,
554 kind_v,
555 ctag_offset,
556 flags &
557 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
558 flags &
559 NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE,
560 rw_flag,
561 sparse,
562 priv,
563 aperture);
564 if (err) { 775 if (err) {
565 nvgpu_err(g, "failed to update ptes on map"); 776 nvgpu_err(g, "failed to update ptes on map");
566 goto fail_validate; 777 goto fail_validate;
@@ -571,26 +782,37 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
571 else 782 else
572 batch->need_tlb_invalidate = true; 783 batch->need_tlb_invalidate = true;
573 784
574 return map_offset; 785 return vaddr;
575fail_validate: 786fail_validate:
576 if (allocated) 787 if (allocated)
577 __nvgpu_vm_free_va(vm, map_offset, pgsz_idx); 788 __nvgpu_vm_free_va(vm, vaddr, pgsz_idx);
578fail_alloc: 789fail_alloc:
579 nvgpu_err(g, "%s: failed with err=%d", __func__, err); 790 nvgpu_err(g, "%s: failed with err=%d", __func__, err);
580 return 0; 791 return 0;
581} 792}
582 793
583void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, 794void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
584 u64 vaddr, 795 u64 vaddr,
585 u64 size, 796 u64 size,
586 int pgsz_idx, 797 int pgsz_idx,
587 bool va_allocated, 798 bool va_allocated,
588 int rw_flag, 799 int rw_flag,
589 bool sparse, 800 bool sparse,
590 struct vm_gk20a_mapping_batch *batch) 801 struct vm_gk20a_mapping_batch *batch)
591{ 802{
592 int err = 0; 803 int err = 0;
593 struct gk20a *g = gk20a_from_vm(vm); 804 struct gk20a *g = gk20a_from_vm(vm);
805 struct nvgpu_gmmu_attrs attrs = {
806 .pgsz = pgsz_idx,
807 .kind_v = 0,
808 .ctag = 0,
809 .cacheable = 0,
810 .rw_flag = rw_flag,
811 .sparse = sparse,
812 .priv = 0,
813 .valid = 0,
814 .aperture = APERTURE_INVALID,
815 };
594 816
595 if (va_allocated) { 817 if (va_allocated) {
596 err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); 818 err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx);
@@ -601,27 +823,11 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
601 } 823 }
602 824
603 /* unmap here needs to know the page size we assigned at mapping */ 825 /* unmap here needs to know the page size we assigned at mapping */
604 err = update_gmmu_ptes_locked(vm, 826 err = __nvgpu_gmmu_update_page_table(vm, NULL, 0,
605 pgsz_idx, 827 vaddr, size, &attrs);
606 NULL, /* n/a for unmap */
607 0,
608 vaddr,
609 vaddr + size,
610 0, 0, false /* n/a for unmap */,
611 false, rw_flag,
612 sparse, 0,
613 APERTURE_INVALID); /* don't care for unmap */
614 if (err) 828 if (err)
615 nvgpu_err(g, "failed to update gmmu ptes on unmap"); 829 nvgpu_err(g, "failed to update gmmu ptes on unmap");
616 830
617 /* flush l2 so any dirty lines are written out *now*.
618 * also as we could potentially be switching this buffer
619 * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
620 * some point in the future we need to invalidate l2. e.g. switching
621 * from a render buffer unmap (here) to later using the same memory
622 * for gmmu ptes. note the positioning of this relative to any smmu
623 * unmapping (below). */
624
625 if (!batch) { 831 if (!batch) {
626 gk20a_mm_l2_flush(g, true); 832 gk20a_mm_l2_flush(g, true);
627 g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); 833 g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c
index 88622eca..3aeba500 100644
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -36,7 +36,7 @@ int vm_aspace_id(struct vm_gk20a *vm)
36} 36}
37 37
38static void nvgpu_vm_free_entries(struct vm_gk20a *vm, 38static void nvgpu_vm_free_entries(struct vm_gk20a *vm,
39 struct gk20a_mm_entry *parent, 39 struct nvgpu_gmmu_pd *parent,
40 int level) 40 int level)
41{ 41{
42 int i; 42 int i;
@@ -75,8 +75,6 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size,
75 75
76 /* Be certain we round up to page_size if needed */ 76 /* Be certain we round up to page_size if needed */
77 size = (size + ((u64)page_size - 1)) & ~((u64)page_size - 1); 77 size = (size + ((u64)page_size - 1)) & ~((u64)page_size - 1);
78 nvgpu_log(g, gpu_dbg_map, "size=0x%llx @ pgsz=%dKB", size,
79 vm->gmmu_page_sizes[pgsz_idx] >> 10);
80 78
81 addr = nvgpu_alloc(vma, size); 79 addr = nvgpu_alloc(vma, size);
82 if (!addr) { 80 if (!addr) {
@@ -84,17 +82,14 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size,
84 return 0; 82 return 0;
85 } 83 }
86 84
87 nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr);
88 return addr; 85 return addr;
89} 86}
90 87
91int __nvgpu_vm_free_va(struct vm_gk20a *vm, u64 addr, 88int __nvgpu_vm_free_va(struct vm_gk20a *vm, u64 addr,
92 enum gmmu_pgsz_gk20a pgsz_idx) 89 enum gmmu_pgsz_gk20a pgsz_idx)
93{ 90{
94 struct gk20a *g = vm->mm->g;
95 struct nvgpu_allocator *vma = vm->vma[pgsz_idx]; 91 struct nvgpu_allocator *vma = vm->vma[pgsz_idx];
96 92
97 nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr);
98 nvgpu_free(vma, addr); 93 nvgpu_free(vma, addr);
99 94
100 return 0; 95 return 0;
@@ -127,32 +122,6 @@ void nvgpu_vm_mapping_batch_finish(struct vm_gk20a *vm,
127 nvgpu_mutex_release(&vm->update_gmmu_lock); 122 nvgpu_mutex_release(&vm->update_gmmu_lock);
128} 123}
129 124
130static int nvgpu_vm_init_page_tables(struct vm_gk20a *vm)
131{
132 u32 pde_lo, pde_hi;
133 int err;
134
135 pde_range_from_vaddr_range(vm,
136 0, vm->va_limit-1,
137 &pde_lo, &pde_hi);
138 vm->pdb.entries = nvgpu_vzalloc(vm->mm->g,
139 sizeof(struct gk20a_mm_entry) *
140 (pde_hi + 1));
141 vm->pdb.num_entries = pde_hi + 1;
142
143 if (!vm->pdb.entries)
144 return -ENOMEM;
145
146 err = nvgpu_zalloc_gmmu_page_table(vm, 0, &vm->mmu_levels[0],
147 &vm->pdb, NULL);
148 if (err) {
149 nvgpu_vfree(vm->mm->g, vm->pdb.entries);
150 return err;
151 }
152
153 return 0;
154}
155
156/* 125/*
157 * Determine if the passed address space can support big pages or not. 126 * Determine if the passed address space can support big pages or not.
158 */ 127 */
@@ -280,7 +249,8 @@ static int __nvgpu_vm_init(struct mm_gk20a *mm,
280#endif 249#endif
281 250
282 /* Initialize the page table data structures. */ 251 /* Initialize the page table data structures. */
283 err = nvgpu_vm_init_page_tables(vm); 252 strncpy(vm->name, name, min(strlen(name), sizeof(vm->name)));
253 err = nvgpu_gmmu_init_page_table(vm);
284 if (err) 254 if (err)
285 goto clean_up_vgpu_vm; 255 goto clean_up_vgpu_vm;
286 256
diff --git a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
index 3c76e817..c5f9c1fd 100644
--- a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
@@ -67,7 +67,7 @@ void gk20a_fb_tlb_invalidate(struct gk20a *g, struct nvgpu_mem *pdb)
67 if (!g->power_on) 67 if (!g->power_on)
68 return; 68 return;
69 69
70 addr_lo = u64_lo32(gk20a_mem_get_base_addr(g, pdb, 0) >> 12); 70 addr_lo = u64_lo32(nvgpu_mem_get_base_addr(g, pdb, 0) >> 12);
71 71
72 nvgpu_mutex_acquire(&g->mm.tlb_lock); 72 nvgpu_mutex_acquire(&g->mm.tlb_lock);
73 73
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index b7b68575..558a1b06 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -777,31 +777,6 @@ int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm)
777 return vm->mmu_levels[0].lo_bit[0]; 777 return vm->mmu_levels[0].lo_bit[0];
778} 778}
779 779
780/* given address range (inclusive) determine the pdes crossed */
781void pde_range_from_vaddr_range(struct vm_gk20a *vm,
782 u64 addr_lo, u64 addr_hi,
783 u32 *pde_lo, u32 *pde_hi)
784{
785 int pde_shift = gk20a_mm_pde_coverage_bit_count(vm);
786
787 *pde_lo = (u32)(addr_lo >> pde_shift);
788 *pde_hi = (u32)(addr_hi >> pde_shift);
789 gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d",
790 addr_lo, addr_hi, pde_shift);
791 gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d",
792 *pde_lo, *pde_hi);
793}
794
795static u32 pde_from_index(u32 i)
796{
797 return i * gmmu_pde__size_v() / sizeof(u32);
798}
799
800static u32 pte_from_index(u32 i)
801{
802 return i * gmmu_pte__size_v() / sizeof(u32);
803}
804
805int nvgpu_vm_get_buffers(struct vm_gk20a *vm, 780int nvgpu_vm_get_buffers(struct vm_gk20a *vm,
806 struct nvgpu_mapped_buf ***mapped_buffers, 781 struct nvgpu_mapped_buf ***mapped_buffers,
807 int *num_buffers) 782 int *num_buffers)
@@ -1478,7 +1453,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem)
1478 * If mem is in VIDMEM, return base address in vidmem 1453 * If mem is in VIDMEM, return base address in vidmem
1479 * else return IOVA address for SYSMEM 1454 * else return IOVA address for SYSMEM
1480 */ 1455 */
1481u64 gk20a_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, 1456u64 nvgpu_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem,
1482 u32 flags) 1457 u32 flags)
1483{ 1458{
1484 struct nvgpu_page_alloc *alloc; 1459 struct nvgpu_page_alloc *alloc;
@@ -1580,203 +1555,168 @@ u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl,
1580 return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); 1555 return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl));
1581} 1556}
1582 1557
1583void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry,
1584 size_t w, size_t data)
1585{
1586 nvgpu_mem_wr32(g, &entry->mem, entry->woffset + w, data);
1587}
1588
1589u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry)
1590{
1591 u64 base;
1592
1593 if (g->mm.has_physical_mode)
1594 base = sg_phys(entry->mem.priv.sgt->sgl);
1595 else
1596 base = gk20a_mem_get_base_addr(g, &entry->mem, 0);
1597
1598 return base + entry->woffset * sizeof(u32);
1599}
1600
1601/* for gk20a the "video memory" apertures here are misnomers. */ 1558/* for gk20a the "video memory" apertures here are misnomers. */
1602static inline u32 big_valid_pde0_bits(struct gk20a *g, 1559static inline u32 big_valid_pde0_bits(struct gk20a *g,
1603 struct gk20a_mm_entry *entry) 1560 struct nvgpu_gmmu_pd *pd, u64 addr)
1604{ 1561{
1605 u64 pte_addr = gk20a_pde_addr(g, entry);
1606 u32 pde0_bits = 1562 u32 pde0_bits =
1607 nvgpu_aperture_mask(g, &entry->mem, 1563 nvgpu_aperture_mask(g, &pd->mem,
1608 gmmu_pde_aperture_big_sys_mem_ncoh_f(), 1564 gmmu_pde_aperture_big_sys_mem_ncoh_f(),
1609 gmmu_pde_aperture_big_video_memory_f()) | 1565 gmmu_pde_aperture_big_video_memory_f()) |
1610 gmmu_pde_address_big_sys_f( 1566 gmmu_pde_address_big_sys_f(
1611 (u32)(pte_addr >> gmmu_pde_address_shift_v())); 1567 (u32)(addr >> gmmu_pde_address_shift_v()));
1612 1568
1613 return pde0_bits; 1569 return pde0_bits;
1614} 1570}
1615 1571
1616static inline u32 small_valid_pde1_bits(struct gk20a *g, 1572static inline u32 small_valid_pde1_bits(struct gk20a *g,
1617 struct gk20a_mm_entry *entry) 1573 struct nvgpu_gmmu_pd *pd, u64 addr)
1618{ 1574{
1619 u64 pte_addr = gk20a_pde_addr(g, entry);
1620 u32 pde1_bits = 1575 u32 pde1_bits =
1621 nvgpu_aperture_mask(g, &entry->mem, 1576 nvgpu_aperture_mask(g, &pd->mem,
1622 gmmu_pde_aperture_small_sys_mem_ncoh_f(), 1577 gmmu_pde_aperture_small_sys_mem_ncoh_f(),
1623 gmmu_pde_aperture_small_video_memory_f()) | 1578 gmmu_pde_aperture_small_video_memory_f()) |
1624 gmmu_pde_vol_small_true_f() | /* tbd: why? */ 1579 gmmu_pde_vol_small_true_f() | /* tbd: why? */
1625 gmmu_pde_address_small_sys_f( 1580 gmmu_pde_address_small_sys_f(
1626 (u32)(pte_addr >> gmmu_pde_address_shift_v())); 1581 (u32)(addr >> gmmu_pde_address_shift_v()));
1627 1582
1628 return pde1_bits; 1583 return pde1_bits;
1629} 1584}
1630 1585
1631/* Given the current state of the ptes associated with a pde, 1586static void update_gmmu_pde_locked(struct vm_gk20a *vm,
1632 determine value and write it out. There's no checking 1587 const struct gk20a_mmu_level *l,
1633 here to determine whether or not a change was actually 1588 struct nvgpu_gmmu_pd *pd,
1634 made. So, superfluous updates will cause unnecessary 1589 u32 pd_idx,
1635 pde invalidations. 1590 u64 virt_addr,
1636*/ 1591 u64 phys_addr,
1637static int update_gmmu_pde_locked(struct vm_gk20a *vm, 1592 struct nvgpu_gmmu_attrs *attrs)
1638 struct gk20a_mm_entry *pte,
1639 u32 i, u32 gmmu_pgsz_idx,
1640 struct scatterlist **sgl,
1641 u64 *offset,
1642 u64 *iova,
1643 u32 kind_v, u64 *ctag,
1644 bool cacheable, bool unammped_pte,
1645 int rw_flag, bool sparse, bool priv,
1646 enum nvgpu_aperture aperture)
1647{ 1593{
1648 struct gk20a *g = gk20a_from_vm(vm); 1594 struct gk20a *g = gk20a_from_vm(vm);
1649 bool small_valid, big_valid; 1595 bool small_valid, big_valid;
1650 struct gk20a_mm_entry *entry = vm->pdb.entries + i; 1596 u32 pd_offset = pd_offset_from_index(l, pd_idx);
1651 u32 pde_v[2] = {0, 0}; 1597 u32 pde_v[2] = {0, 0};
1652 u32 pde;
1653 1598
1654 gk20a_dbg_fn(""); 1599 small_valid = attrs->pgsz == gmmu_page_size_small;
1655 1600 big_valid = attrs->pgsz == gmmu_page_size_big;
1656 small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small;
1657 big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big;
1658 1601
1659 pde_v[0] = gmmu_pde_size_full_f(); 1602 pde_v[0] = gmmu_pde_size_full_f();
1660 pde_v[0] |= big_valid ? 1603 pde_v[0] |= big_valid ?
1661 big_valid_pde0_bits(g, entry) : 1604 big_valid_pde0_bits(g, pd, phys_addr) :
1662 gmmu_pde_aperture_big_invalid_f(); 1605 gmmu_pde_aperture_big_invalid_f();
1663 1606
1664 pde_v[1] |= (small_valid ? 1607 pde_v[1] |= (small_valid ? small_valid_pde1_bits(g, pd, phys_addr) :
1665 small_valid_pde1_bits(g, entry) :
1666 (gmmu_pde_aperture_small_invalid_f() | 1608 (gmmu_pde_aperture_small_invalid_f() |
1667 gmmu_pde_vol_small_false_f())) 1609 gmmu_pde_vol_small_false_f()))
1668 | 1610 |
1669 (big_valid ? (gmmu_pde_vol_big_true_f()) : 1611 (big_valid ? (gmmu_pde_vol_big_true_f()) :
1670 gmmu_pde_vol_big_false_f()); 1612 gmmu_pde_vol_big_false_f());
1671 1613
1672 pde = pde_from_index(i); 1614 pte_dbg(g, attrs,
1615 "PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | "
1616 "GPU %#-12llx phys %#-12llx "
1617 "[0x%08x, 0x%08x]",
1618 pd_idx, l->entry_size, pd_offset,
1619 small_valid ? 'S' : '-',
1620 big_valid ? 'B' : '-',
1621 virt_addr, phys_addr,
1622 pde_v[1], pde_v[0]);
1673 1623
1674 gk20a_pde_wr32(g, &vm->pdb, pde + 0, pde_v[0]); 1624 pd_write(g, &vm->pdb, pd_offset + 0, pde_v[0]);
1675 gk20a_pde_wr32(g, &vm->pdb, pde + 1, pde_v[1]); 1625 pd_write(g, &vm->pdb, pd_offset + 1, pde_v[1]);
1626}
1676 1627
1677 gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", 1628static void __update_pte_sparse(u32 *pte_w)
1678 i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); 1629{
1679 return 0; 1630 pte_w[0] = gmmu_pte_valid_false_f();
1631 pte_w[1] |= gmmu_pte_vol_true_f();
1680} 1632}
1681 1633
1682static int update_gmmu_pte_locked(struct vm_gk20a *vm, 1634static void __update_pte(struct vm_gk20a *vm,
1683 struct gk20a_mm_entry *pte, 1635 u32 *pte_w,
1684 u32 i, u32 gmmu_pgsz_idx, 1636 u64 phys_addr,
1685 struct scatterlist **sgl, 1637 struct nvgpu_gmmu_attrs *attrs)
1686 u64 *offset,
1687 u64 *iova,
1688 u32 kind_v, u64 *ctag,
1689 bool cacheable, bool unmapped_pte,
1690 int rw_flag, bool sparse, bool priv,
1691 enum nvgpu_aperture aperture)
1692{ 1638{
1693 struct gk20a *g = gk20a_from_vm(vm); 1639 struct gk20a *g = gk20a_from_vm(vm);
1640 u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
1641 u32 pte_valid = attrs->valid ?
1642 gmmu_pte_valid_true_f() :
1643 gmmu_pte_valid_false_f();
1644 u32 phys_shifted = phys_addr >> gmmu_pte_address_shift_v();
1645 u32 addr = attrs->aperture == APERTURE_SYSMEM ?
1646 gmmu_pte_address_sys_f(phys_shifted) :
1647 gmmu_pte_address_vid_f(phys_shifted);
1694 int ctag_shift = ilog2(g->ops.fb.compression_page_size(g)); 1648 int ctag_shift = ilog2(g->ops.fb.compression_page_size(g));
1695 u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
1696 u32 pte_w[2] = {0, 0}; /* invalid pte */
1697
1698 if (*iova) {
1699 u32 pte_valid = unmapped_pte ?
1700 gmmu_pte_valid_false_f() :
1701 gmmu_pte_valid_true_f();
1702 u32 iova_v = *iova >> gmmu_pte_address_shift_v();
1703 u32 pte_addr = aperture == APERTURE_SYSMEM ?
1704 gmmu_pte_address_sys_f(iova_v) :
1705 gmmu_pte_address_vid_f(iova_v);
1706
1707 pte_w[0] = pte_valid | pte_addr;
1708
1709 if (priv)
1710 pte_w[0] |= gmmu_pte_privilege_true_f();
1711
1712 pte_w[1] = __nvgpu_aperture_mask(g, aperture,
1713 gmmu_pte_aperture_sys_mem_ncoh_f(),
1714 gmmu_pte_aperture_video_memory_f()) |
1715 gmmu_pte_kind_f(kind_v) |
1716 gmmu_pte_comptagline_f((u32)(*ctag >> ctag_shift));
1717
1718 if (*ctag && vm->mm->use_full_comp_tag_line && *iova & 0x10000)
1719 pte_w[1] |= gmmu_pte_comptagline_f(
1720 1 << (gmmu_pte_comptagline_s() - 1));
1721
1722 if (rw_flag == gk20a_mem_flag_read_only) {
1723 pte_w[0] |= gmmu_pte_read_only_true_f();
1724 pte_w[1] |=
1725 gmmu_pte_write_disable_true_f();
1726 } else if (rw_flag ==
1727 gk20a_mem_flag_write_only) {
1728 pte_w[1] |=
1729 gmmu_pte_read_disable_true_f();
1730 }
1731 if (!unmapped_pte) {
1732 if (!cacheable)
1733 pte_w[1] |=
1734 gmmu_pte_vol_true_f();
1735 } else {
1736 /* Store cacheable value behind
1737 * gmmu_pte_write_disable_true_f */
1738 if (!cacheable)
1739 pte_w[1] |=
1740 gmmu_pte_write_disable_true_f();
1741 }
1742 1649
1743 gk20a_dbg(gpu_dbg_pte, 1650 pte_w[0] = pte_valid | addr;
1744 "pte=%d iova=0x%llx kind=%d ctag=%d vol=%d [0x%08x, 0x%08x]",
1745 i, *iova,
1746 kind_v, (u32)(*ctag >> ctag_shift), !cacheable,
1747 pte_w[1], pte_w[0]);
1748 1651
1749 if (*ctag) 1652 if (attrs->priv)
1750 *ctag += page_size; 1653 pte_w[0] |= gmmu_pte_privilege_true_f();
1751 } else if (sparse) {
1752 pte_w[0] = gmmu_pte_valid_false_f();
1753 pte_w[1] |= gmmu_pte_vol_true_f();
1754 } else {
1755 gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
1756 }
1757 1654
1758 gk20a_pde_wr32(g, pte, pte_from_index(i) + 0, pte_w[0]); 1655 pte_w[1] = __nvgpu_aperture_mask(g, attrs->aperture,
1759 gk20a_pde_wr32(g, pte, pte_from_index(i) + 1, pte_w[1]); 1656 gmmu_pte_aperture_sys_mem_ncoh_f(),
1760 1657 gmmu_pte_aperture_video_memory_f()) |
1761 if (*iova) { 1658 gmmu_pte_kind_f(attrs->kind_v) |
1762 *iova += page_size; 1659 gmmu_pte_comptagline_f((u32)(attrs->ctag >> ctag_shift));
1763 *offset += page_size; 1660
1764 if (*sgl && *offset + page_size > (*sgl)->length) { 1661 if (attrs->ctag && vm->mm->use_full_comp_tag_line &&
1765 u64 new_iova; 1662 phys_addr & 0x10000)
1766 *sgl = sg_next(*sgl); 1663 pte_w[1] |= gmmu_pte_comptagline_f(
1767 if (*sgl) { 1664 1 << (gmmu_pte_comptagline_s() - 1));
1768 new_iova = sg_phys(*sgl); 1665
1769 gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", 1666 if (attrs->rw_flag == gk20a_mem_flag_read_only) {
1770 new_iova, (*sgl)->length); 1667 pte_w[0] |= gmmu_pte_read_only_true_f();
1771 if (new_iova) { 1668 pte_w[1] |= gmmu_pte_write_disable_true_f();
1772 *offset = 0; 1669 } else if (attrs->rw_flag == gk20a_mem_flag_write_only) {
1773 *iova = new_iova; 1670 pte_w[1] |= gmmu_pte_read_disable_true_f();
1774 }
1775 }
1776 }
1777 } 1671 }
1778 1672
1779 return 0; 1673 if (!attrs->cacheable)
1674 pte_w[1] |= gmmu_pte_vol_true_f();
1675
1676 if (attrs->ctag)
1677 attrs->ctag += page_size;
1678}
1679
1680static void update_gmmu_pte_locked(struct vm_gk20a *vm,
1681 const struct gk20a_mmu_level *l,
1682 struct nvgpu_gmmu_pd *pd,
1683 u32 pd_idx,
1684 u64 virt_addr,
1685 u64 phys_addr,
1686 struct nvgpu_gmmu_attrs *attrs)
1687{
1688 struct gk20a *g = gk20a_from_vm(vm);
1689 u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
1690 u32 pd_offset = pd_offset_from_index(l, pd_idx);
1691 u32 pte_w[2] = {0, 0};
1692 int ctag_shift = ilog2(g->ops.fb.compression_page_size(g));
1693
1694 if (phys_addr)
1695 __update_pte(vm, pte_w, phys_addr, attrs);
1696 else if (attrs->sparse)
1697 __update_pte_sparse(pte_w);
1698
1699 pte_dbg(g, attrs,
1700 "PTE: i=%-4u size=%-2u offs=%-4u | "
1701 "GPU %#-12llx phys %#-12llx "
1702 "pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c "
1703 "ctag=0x%08x "
1704 "[0x%08x, 0x%08x]",
1705 pd_idx, l->entry_size, pd_offset,
1706 virt_addr, phys_addr,
1707 page_size >> 10,
1708 nvgpu_gmmu_perm_str(attrs->rw_flag),
1709 attrs->kind_v,
1710 nvgpu_aperture_str(attrs->aperture),
1711 attrs->valid ? 'V' : '-',
1712 attrs->cacheable ? 'C' : '-',
1713 attrs->sparse ? 'S' : '-',
1714 attrs->priv ? 'P' : '-',
1715 (u32)attrs->ctag >> ctag_shift,
1716 pte_w[1], pte_w[0]);
1717
1718 pd_write(g, pd, pd_offset + 0, pte_w[0]);
1719 pd_write(g, pd, pd_offset + 1, pte_w[1]);
1780} 1720}
1781 1721
1782/* NOTE! mapped_buffers lock must be held */ 1722/* NOTE! mapped_buffers lock must be held */
@@ -1809,13 +1749,6 @@ void nvgpu_vm_unmap_locked(struct nvgpu_mapped_buf *mapped_buffer,
1809 mapped_buffer->vm_area->sparse : false, 1749 mapped_buffer->vm_area->sparse : false,
1810 batch); 1750 batch);
1811 1751
1812 gk20a_dbg(gpu_dbg_map,
1813 "gv: 0x%04x_%08x pgsz=%-3dKb as=%-2d own_mem_ref=%d",
1814 u64_hi32(mapped_buffer->addr), u64_lo32(mapped_buffer->addr),
1815 vm->gmmu_page_sizes[mapped_buffer->pgsz_idx] >> 10,
1816 vm_aspace_id(vm),
1817 mapped_buffer->own_mem_ref);
1818
1819 gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf, 1752 gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf,
1820 mapped_buffer->sgt); 1753 mapped_buffer->sgt);
1821 1754
@@ -1942,6 +1875,9 @@ int __gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch)
1942 if (err) 1875 if (err)
1943 ch->vm = NULL; 1876 ch->vm = NULL;
1944 1877
1878 nvgpu_log(gk20a_from_vm(vm), gpu_dbg_map, "Binding ch=%d -> VM:%s",
1879 ch->chid, vm->name);
1880
1945 return err; 1881 return err;
1946} 1882}
1947 1883
@@ -2114,7 +2050,7 @@ u64 gk20a_mm_inst_block_addr(struct gk20a *g, struct nvgpu_mem *inst_block)
2114 if (g->mm.has_physical_mode) 2050 if (g->mm.has_physical_mode)
2115 addr = gk20a_mem_phys(inst_block); 2051 addr = gk20a_mem_phys(inst_block);
2116 else 2052 else
2117 addr = gk20a_mem_get_base_addr(g, inst_block, 0); 2053 addr = nvgpu_mem_get_base_addr(g, inst_block, 0);
2118 2054
2119 return addr; 2055 return addr;
2120} 2056}
@@ -2237,7 +2173,7 @@ static int gk20a_init_ce_vm(struct mm_gk20a *mm)
2237void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, 2173void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
2238 struct vm_gk20a *vm) 2174 struct vm_gk20a *vm)
2239{ 2175{
2240 u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0); 2176 u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0);
2241 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); 2177 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
2242 u32 pdb_addr_hi = u64_hi32(pdb_addr); 2178 u32 pdb_addr_hi = u64_hi32(pdb_addr);
2243 2179
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index cf37640d..a245d0e0 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -42,12 +42,6 @@
42 outer_flush_range(pa, pa + (size_t)(size)); \ 42 outer_flush_range(pa, pa + (size_t)(size)); \
43 } while (0) 43 } while (0)
44 44
45enum gk20a_mem_rw_flag {
46 gk20a_mem_flag_none = 0,
47 gk20a_mem_flag_read_only = 1,
48 gk20a_mem_flag_write_only = 2,
49};
50
51struct gpfifo_desc { 45struct gpfifo_desc {
52 struct nvgpu_mem mem; 46 struct nvgpu_mem mem;
53 u32 entry_num; 47 u32 entry_num;
@@ -347,7 +341,7 @@ int gk20a_mm_suspend(struct gk20a *g);
347u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, 341u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl,
348 u32 flags); 342 u32 flags);
349u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova); 343u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova);
350u64 gk20a_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, 344u64 nvgpu_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem,
351 u32 flags); 345 u32 flags);
352 346
353void gk20a_mm_ltc_isr(struct gk20a *g); 347void gk20a_mm_ltc_isr(struct gk20a *g);
@@ -371,10 +365,6 @@ static inline phys_addr_t gk20a_mem_phys(struct nvgpu_mem *mem)
371 return 0; 365 return 0;
372} 366}
373 367
374void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry,
375 size_t w, size_t data);
376u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry);
377
378u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, 368u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
379 u64 map_offset, 369 u64 map_offset,
380 struct sg_table *sgt, 370 struct sg_table *sgt,
@@ -451,8 +441,4 @@ int gk20a_mm_get_buffer_info(struct device *dev, int dmabuf_fd,
451 u64 *buffer_id, u64 *buffer_len); 441 u64 *buffer_id, u64 *buffer_len);
452void gk20a_vm_unmap_locked_kref(struct kref *ref); 442void gk20a_vm_unmap_locked_kref(struct kref *ref);
453 443
454void gk20a_vm_free_entries(struct vm_gk20a *vm,
455 struct gk20a_mm_entry *parent,
456 int level);
457
458#endif /* MM_GK20A_H */ 444#endif /* MM_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
index d7391c6d..c3867e9d 100644
--- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
@@ -14,6 +14,7 @@
14 */ 14 */
15 15
16#include <nvgpu/dma.h> 16#include <nvgpu/dma.h>
17#include <nvgpu/gmmu.h>
17 18
18#include "gk20a/gk20a.h" 19#include "gk20a/gk20a.h"
19#include "gk20a/platform_gk20a.h" 20#include "gk20a/platform_gk20a.h"
@@ -149,206 +150,186 @@ static u64 gp10b_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl,
149 return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); 150 return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl));
150} 151}
151 152
152static u32 pde3_from_index(u32 i) 153static void update_gmmu_pde3_locked(struct vm_gk20a *vm,
153{ 154 const struct gk20a_mmu_level *l,
154 return i * gmmu_new_pde__size_v() / sizeof(u32); 155 struct nvgpu_gmmu_pd *pd,
155} 156 u32 pd_idx,
156 157 u64 virt_addr,
157static u32 pte3_from_index(u32 i) 158 u64 phys_addr,
158{ 159 struct nvgpu_gmmu_attrs *attrs)
159 return i * gmmu_new_pte__size_v() / sizeof(u32);
160}
161
162static int update_gmmu_pde3_locked(struct vm_gk20a *vm,
163 struct gk20a_mm_entry *parent,
164 u32 i, u32 gmmu_pgsz_idx,
165 struct scatterlist **sgl,
166 u64 *offset,
167 u64 *iova,
168 u32 kind_v, u64 *ctag,
169 bool cacheable, bool unmapped_pte,
170 int rw_flag, bool sparse, bool priv,
171 enum nvgpu_aperture aperture)
172{ 160{
173 struct gk20a *g = gk20a_from_vm(vm); 161 struct gk20a *g = gk20a_from_vm(vm);
174 u64 pte_addr = 0; 162 u32 pd_offset = pd_offset_from_index(l, pd_idx);
175 struct gk20a_mm_entry *pte = parent->entries + i;
176 u32 pde_v[2] = {0, 0}; 163 u32 pde_v[2] = {0, 0};
177 u32 pde;
178
179 gk20a_dbg_fn("");
180 164
181 pte_addr = gk20a_pde_addr(g, pte) >> gmmu_new_pde_address_shift_v(); 165 phys_addr >>= gmmu_new_pde_address_shift_v();
182 166
183 pde_v[0] |= nvgpu_aperture_mask(g, &pte->mem, 167 pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
184 gmmu_new_pde_aperture_sys_mem_ncoh_f(), 168 gmmu_new_pde_aperture_sys_mem_ncoh_f(),
185 gmmu_new_pde_aperture_video_memory_f()); 169 gmmu_new_pde_aperture_video_memory_f());
186 pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(pte_addr)); 170 pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr));
187 pde_v[0] |= gmmu_new_pde_vol_true_f(); 171 pde_v[0] |= gmmu_new_pde_vol_true_f();
188 pde_v[1] |= pte_addr >> 24; 172 pde_v[1] |= phys_addr >> 24;
189 pde = pde3_from_index(i); 173
190 174 pd_write(g, pd, pd_offset + 0, pde_v[0]);
191 gk20a_pde_wr32(g, parent, pde + 0, pde_v[0]); 175 pd_write(g, pd, pd_offset + 1, pde_v[1]);
192 gk20a_pde_wr32(g, parent, pde + 1, pde_v[1]); 176
193 177 pte_dbg(g, attrs,
194 gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", 178 "PDE: i=%-4u size=%-2u offs=%-4u pgsz: -- | "
195 i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); 179 "GPU %#-12llx phys %#-12llx "
196 gk20a_dbg_fn("done"); 180 "[0x%08x, 0x%08x]",
197 return 0; 181 pd_idx, l->entry_size, pd_offset,
182 virt_addr, phys_addr,
183 pde_v[1], pde_v[0]);
198} 184}
199 185
200static u32 pde0_from_index(u32 i) 186static void update_gmmu_pde0_locked(struct vm_gk20a *vm,
201{ 187 const struct gk20a_mmu_level *l,
202 return i * gmmu_new_dual_pde__size_v() / sizeof(u32); 188 struct nvgpu_gmmu_pd *pd,
203} 189 u32 pd_idx,
204 190 u64 virt_addr,
205static int update_gmmu_pde0_locked(struct vm_gk20a *vm, 191 u64 phys_addr,
206 struct gk20a_mm_entry *pte, 192 struct nvgpu_gmmu_attrs *attrs)
207 u32 i, u32 gmmu_pgsz_idx,
208 struct scatterlist **sgl,
209 u64 *offset,
210 u64 *iova,
211 u32 kind_v, u64 *ctag,
212 bool cacheable, bool unmapped_pte,
213 int rw_flag, bool sparse, bool priv,
214 enum nvgpu_aperture aperture)
215{ 193{
216 struct gk20a *g = gk20a_from_vm(vm); 194 struct gk20a *g = gk20a_from_vm(vm);
217 bool small_valid, big_valid; 195 bool small_valid, big_valid;
218 u32 pte_addr_small = 0, pte_addr_big = 0; 196 u32 small_addr = 0, big_addr = 0;
219 struct gk20a_mm_entry *entry = pte->entries + i; 197 u32 pd_offset = pd_offset_from_index(l, pd_idx);
220 u32 pde_v[4] = {0, 0, 0, 0}; 198 u32 pde_v[4] = {0, 0, 0, 0};
221 u32 pde;
222
223 gk20a_dbg_fn("");
224 199
225 small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small; 200 small_valid = attrs->pgsz == gmmu_page_size_small;
226 big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big; 201 big_valid = attrs->pgsz == gmmu_page_size_big;
227 202
228 if (small_valid) { 203 if (small_valid)
229 pte_addr_small = gk20a_pde_addr(g, entry) 204 small_addr = phys_addr >> gmmu_new_dual_pde_address_shift_v();
230 >> gmmu_new_dual_pde_address_shift_v();
231 }
232 205
233 if (big_valid) 206 if (big_valid)
234 pte_addr_big = gk20a_pde_addr(g, entry) 207 big_addr = phys_addr >> gmmu_new_dual_pde_address_big_shift_v();
235 >> gmmu_new_dual_pde_address_big_shift_v();
236 208
237 if (small_valid) { 209 if (small_valid) {
238 pde_v[2] |= gmmu_new_dual_pde_address_small_sys_f(pte_addr_small); 210 pde_v[2] |=
239 pde_v[2] |= nvgpu_aperture_mask(g, &entry->mem, 211 gmmu_new_dual_pde_address_small_sys_f(small_addr);
212 pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem,
240 gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), 213 gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(),
241 gmmu_new_dual_pde_aperture_small_video_memory_f()); 214 gmmu_new_dual_pde_aperture_small_video_memory_f());
242 pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); 215 pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f();
243 pde_v[3] |= pte_addr_small >> 24; 216 pde_v[3] |= small_addr >> 24;
244 } 217 }
245 218
246 if (big_valid) { 219 if (big_valid) {
247 pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(pte_addr_big); 220 pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr);
248 pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); 221 pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f();
249 pde_v[0] |= nvgpu_aperture_mask(g, &entry->mem, 222 pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
250 gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), 223 gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(),
251 gmmu_new_dual_pde_aperture_big_video_memory_f()); 224 gmmu_new_dual_pde_aperture_big_video_memory_f());
252 pde_v[1] |= pte_addr_big >> 28; 225 pde_v[1] |= big_addr >> 28;
253 } 226 }
254 227
255 pde = pde0_from_index(i); 228 pd_write(g, pd, pd_offset + 0, pde_v[0]);
256 229 pd_write(g, pd, pd_offset + 1, pde_v[1]);
257 gk20a_pde_wr32(g, pte, pde + 0, pde_v[0]); 230 pd_write(g, pd, pd_offset + 2, pde_v[2]);
258 gk20a_pde_wr32(g, pte, pde + 1, pde_v[1]); 231 pd_write(g, pd, pd_offset + 3, pde_v[3]);
259 gk20a_pde_wr32(g, pte, pde + 2, pde_v[2]); 232
260 gk20a_pde_wr32(g, pte, pde + 3, pde_v[3]); 233 pte_dbg(g, attrs,
261 234 "PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | "
262 gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d [0x%08x, 0x%08x, 0x%x, 0x%08x]", 235 "GPU %#-12llx phys %#-12llx "
263 i, gmmu_pgsz_idx, pde_v[3], pde_v[2], pde_v[1], pde_v[0]); 236 "[0x%08x, 0x%08x, 0x%08x, 0x%08x]",
264 gk20a_dbg_fn("done"); 237 pd_idx, l->entry_size, pd_offset,
265 return 0; 238 small_valid ? 'S' : '-',
239 big_valid ? 'B' : '-',
240 virt_addr, phys_addr,
241 pde_v[3], pde_v[2], pde_v[1], pde_v[0]);
266} 242}
267 243
268static int update_gmmu_pte_locked(struct vm_gk20a *vm, 244static void __update_pte(struct vm_gk20a *vm,
269 struct gk20a_mm_entry *pte, 245 u32 *pte_w,
270 u32 i, u32 gmmu_pgsz_idx, 246 u64 phys_addr,
271 struct scatterlist **sgl, 247 struct nvgpu_gmmu_attrs *attrs)
272 u64 *offset,
273 u64 *iova,
274 u32 kind_v, u64 *ctag,
275 bool cacheable, bool unmapped_pte,
276 int rw_flag, bool sparse, bool priv,
277 enum nvgpu_aperture aperture)
278{ 248{
279 struct gk20a *g = vm->mm->g; 249 struct gk20a *g = gk20a_from_vm(vm);
280 u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
281 u64 ctag_granularity = g->ops.fb.compression_page_size(g); 250 u64 ctag_granularity = g->ops.fb.compression_page_size(g);
282 u32 pte_w[2] = {0, 0}; /* invalid pte */ 251 u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
283 u32 pte_i; 252 u32 pte_valid = attrs->valid ?
284 253 gmmu_new_pte_valid_true_f() :
285 if (*iova) { 254 gmmu_new_pte_valid_false_f();
286 u32 pte_valid = unmapped_pte ? 255 u32 phys_shifted = phys_addr >> gmmu_new_pte_address_shift_v();
287 gmmu_new_pte_valid_false_f() : 256 u32 pte_addr = attrs->aperture == APERTURE_SYSMEM ?
288 gmmu_new_pte_valid_true_f(); 257 gmmu_new_pte_address_sys_f(phys_shifted) :
289 u32 iova_v = *iova >> gmmu_new_pte_address_shift_v(); 258 gmmu_new_pte_address_vid_f(phys_shifted);
290 u32 pte_addr = aperture == APERTURE_SYSMEM ? 259 u32 pte_tgt = __nvgpu_aperture_mask(g, attrs->aperture,
291 gmmu_new_pte_address_sys_f(iova_v) : 260 gmmu_new_pte_aperture_sys_mem_ncoh_f(),
292 gmmu_new_pte_address_vid_f(iova_v); 261 gmmu_new_pte_aperture_video_memory_f());
293 u32 pte_tgt = __nvgpu_aperture_mask(g, aperture, 262
294 gmmu_new_pte_aperture_sys_mem_ncoh_f(), 263 pte_w[0] = pte_valid | pte_addr | pte_tgt;
295 gmmu_new_pte_aperture_video_memory_f()); 264
296 265 if (attrs->priv)
297 pte_w[0] = pte_valid | pte_addr | pte_tgt; 266 pte_w[0] |= gmmu_new_pte_privilege_true_f();
298 267
299 if (priv) 268 pte_w[1] = phys_addr >> (24 + gmmu_new_pte_address_shift_v()) |
300 pte_w[0] |= gmmu_new_pte_privilege_true_f(); 269 gmmu_new_pte_kind_f(attrs->kind_v) |
301 270 gmmu_new_pte_comptagline_f((u32)(attrs->ctag /
302 pte_w[1] = *iova >> (24 + gmmu_new_pte_address_shift_v()) | 271 ctag_granularity));
303 gmmu_new_pte_kind_f(kind_v) | 272
304 gmmu_new_pte_comptagline_f((u32)(*ctag / ctag_granularity)); 273 if (attrs->rw_flag == gk20a_mem_flag_read_only)
305 274 pte_w[0] |= gmmu_new_pte_read_only_true_f();
306 if (rw_flag == gk20a_mem_flag_read_only) 275
307 pte_w[0] |= gmmu_new_pte_read_only_true_f(); 276 if (!attrs->valid && !attrs->cacheable)
308 if (unmapped_pte && !cacheable) 277 pte_w[0] |= gmmu_new_pte_read_only_true_f();
309 pte_w[0] |= gmmu_new_pte_read_only_true_f(); 278 else if (!attrs->cacheable)
310 else if (!cacheable)
311 pte_w[0] |= gmmu_new_pte_vol_true_f();
312
313 gk20a_dbg(gpu_dbg_pte, "pte=%d iova=0x%llx kind=%d"
314 " ctag=%d vol=%d"
315 " [0x%08x, 0x%08x]",
316 i, *iova,
317 kind_v, (u32)(*ctag / ctag_granularity), !cacheable,
318 pte_w[1], pte_w[0]);
319
320 if (*ctag)
321 *ctag += page_size;
322 } else if (sparse) {
323 pte_w[0] = gmmu_new_pte_valid_false_f();
324 pte_w[0] |= gmmu_new_pte_vol_true_f(); 279 pte_w[0] |= gmmu_new_pte_vol_true_f();
325 } else {
326 gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
327 }
328 280
329 pte_i = pte3_from_index(i); 281 if (attrs->ctag)
330 282 attrs->ctag += page_size;
331 gk20a_pde_wr32(g, pte, pte_i + 0, pte_w[0]); 283
332 gk20a_pde_wr32(g, pte, pte_i + 1, pte_w[1]); 284}
333 285
334 if (*iova) { 286static void __update_pte_sparse(u32 *pte_w)
335 *iova += page_size; 287{
336 *offset += page_size; 288 pte_w[0] = gmmu_new_pte_valid_false_f();
337 if (*sgl && *offset + page_size > (*sgl)->length) { 289 pte_w[0] |= gmmu_new_pte_vol_true_f();
338 u64 new_iova; 290}
339 *sgl = sg_next(*sgl); 291
340 if (*sgl) { 292static void update_gmmu_pte_locked(struct vm_gk20a *vm,
341 new_iova = sg_phys(*sgl); 293 const struct gk20a_mmu_level *l,
342 gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", 294 struct nvgpu_gmmu_pd *pd,
343 new_iova, (*sgl)->length); 295 u32 pd_idx,
344 if (new_iova) { 296 u64 virt_addr,
345 *offset = 0; 297 u64 phys_addr,
346 *iova = new_iova; 298 struct nvgpu_gmmu_attrs *attrs)
347 } 299{
348 } 300 struct gk20a *g = vm->mm->g;
349 } 301 u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
350 } 302 u32 pd_offset = pd_offset_from_index(l, pd_idx);
351 return 0; 303 u32 pte_w[2] = {0, 0};
304
305 if (phys_addr)
306 __update_pte(vm, pte_w, phys_addr, attrs);
307 else if (attrs->sparse)
308 __update_pte_sparse(pte_w);
309
310 pte_dbg(g, attrs,
311 "vm=%s "
312 "PTE: i=%-4u size=%-2u offs=%-4u | "
313 "GPU %#-12llx phys %#-12llx "
314 "pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c "
315 "ctag=0x%08x "
316 "[0x%08x, 0x%08x]",
317 vm->name,
318 pd_idx, l->entry_size, pd_offset,
319 virt_addr, phys_addr,
320 page_size >> 10,
321 nvgpu_gmmu_perm_str(attrs->rw_flag),
322 attrs->kind_v,
323 nvgpu_aperture_str(attrs->aperture),
324 attrs->valid ? 'V' : '-',
325 attrs->cacheable ? 'C' : '-',
326 attrs->sparse ? 'S' : '-',
327 attrs->priv ? 'P' : '-',
328 (u32)attrs->ctag / g->ops.fb.compression_page_size(g),
329 pte_w[1], pte_w[0]);
330
331 pd_write(g, pd, pd_offset + 0, pte_w[0]);
332 pd_write(g, pd, pd_offset + 1, pte_w[1]);
352} 333}
353 334
354static const struct gk20a_mmu_level gp10b_mm_levels[] = { 335static const struct gk20a_mmu_level gp10b_mm_levels[] = {
@@ -384,7 +365,7 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g,
384static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, 365static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
385 struct vm_gk20a *vm) 366 struct vm_gk20a *vm)
386{ 367{
387 u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0); 368 u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0);
388 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); 369 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
389 u32 pdb_addr_hi = u64_hi32(pdb_addr); 370 u32 pdb_addr_hi = u64_hi32(pdb_addr);
390 371
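In the reworked gp10b callbacks above, update_gmmu_pde3_locked(), update_gmmu_pde0_locked() and update_gmmu_pte_locked() now take a single virt_addr/phys_addr pair plus a shared struct nvgpu_gmmu_attrs, where the old signatures carried a scatterlist pointer, an offset and an IOVA cursor. The standalone sketch below (simplified stand-in types, not nvgpu APIs) models the shape of such a caller: it walks physically contiguous chunks and issues one per-page call, so the per-entry code never touches a scatter-gather list.

/*
 * Minimal, self-contained model of driving a per-entry callback of the
 * new shape (index, virt, phys, attrs) one contiguous chunk at a time.
 * All types here are simplified stand-ins for illustration only.
 */
#include <stdio.h>
#include <stdint.h>

struct chunk { uint64_t phys; uint64_t length; };   /* one contiguous range */
struct attrs { int pgsz_kb; int read_only; };       /* stand-in attributes  */

/* Stand-in for a chip-level update_entry: maps exactly one page. */
static void update_pte(uint32_t pd_idx, uint64_t virt, uint64_t phys,
		       const struct attrs *a)
{
	printf("PTE %-4u GPU 0x%-10llx phys 0x%-10llx %dK %s\n",
	       (unsigned)pd_idx, (unsigned long long)virt,
	       (unsigned long long)phys, a->pgsz_kb,
	       a->read_only ? "RO" : "RW");
}

int main(void)
{
	const uint64_t page = 4096;
	struct chunk chunks[] = { { 0x10000, 2 * page }, { 0x80000, page } };
	struct attrs a = { .pgsz_kb = 4, .read_only = 0 };
	uint64_t virt = 0x100000;
	uint32_t idx = 0;
	unsigned int i;

	/* Walk chunks, not the VA range: each chunk is contiguous, so the
	 * per-page callback only ever sees a plain virt -> phys pair. */
	for (i = 0; i < sizeof(chunks) / sizeof(chunks[0]); i++) {
		uint64_t off;

		for (off = 0; off < chunks[i].length; off += page) {
			update_pte(idx++, virt, chunks[i].phys + off, &a);
			virt += page;
		}
	}
	return 0;
}

Keeping the per-call state down to an index, a virtual address and a physical address is what lets the same callback serve any chunk layout; the only field the real code mutates in the shared attributes during the walk is ctag, as seen in __update_pte() above.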
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index ed152cd8..28a2cb82 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -38,36 +38,97 @@ enum gmmu_pgsz_gk20a {
38 gmmu_nr_page_sizes = 3, 38 gmmu_nr_page_sizes = 3,
39}; 39};
40 40
41struct gk20a_mm_entry { 41enum gk20a_mem_rw_flag {
42 /* backing for */ 42 gk20a_mem_flag_none = 0, /* RW */
43 struct nvgpu_mem mem; 43 gk20a_mem_flag_read_only = 1, /* RO */
44 u32 woffset; /* if >0, mem is a shadow copy, owned by another entry */ 44 gk20a_mem_flag_write_only = 2, /* WO */
45 int pgsz; 45};
46 struct gk20a_mm_entry *entries; 46
47 int num_entries; 47/*
48 * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs
49 * in the GMMU.
50 */
51struct nvgpu_gmmu_pd {
52 /*
53 * DMA memory describing the PDEs or PTEs.
54 */
55 struct nvgpu_mem mem;
56
57 /*
58 * List of pointers to the next level of page tables. Does not
59 * need to be populated when this PD is pointing to PTEs.
60 */
61 struct nvgpu_gmmu_pd *entries;
62 int num_entries;
63};
64
65/*
66 * Reduce the number of arguments getting passed through the various levels of
67 * GMMU mapping functions.
68 *
69 * The following fields are set statically and do not change throughout
70 * the mapping call:
71 *
72 * pgsz: Index into the page size table.
73 * kind_v: Kind attributes for mapping.
74 * cacheable: Cacheability of the mapping.
75 * rw_flag: Flag from enum gk20a_mem_rw_flag.
76 * sparse: Set if the mapping should be sparse.
77 * priv: Privileged mapping.
78 * valid: Set if the PTE should be marked valid.
79 * aperture: VIDMEM or SYSMEM.
80 * debug: When set print debugging info.
81 *
82 * These fields are dynamically updated as necessary during the map:
83 *
84 * ctag: Comptag line in the comptag cache;
85 * updated every time we write a PTE.
86 */
87struct nvgpu_gmmu_attrs {
88 u32 pgsz;
89 u32 kind_v;
90 u64 ctag;
91 bool cacheable;
92 int rw_flag;
93 bool sparse;
94 bool priv;
95 bool valid;
96 enum nvgpu_aperture aperture;
97 bool debug;
48}; 98};
49 99
50struct gk20a_mmu_level { 100struct gk20a_mmu_level {
51 int hi_bit[2]; 101 int hi_bit[2];
52 int lo_bit[2]; 102 int lo_bit[2];
53 int (*update_entry)(struct vm_gk20a *vm, 103
54 struct gk20a_mm_entry *pte, 104 /*
55 u32 i, u32 gmmu_pgsz_idx, 105 * Build map from virt_addr -> phys_addr.
56 struct scatterlist **sgl, 106 */
57 u64 *offset, 107 void (*update_entry)(struct vm_gk20a *vm,
58 u64 *iova, 108 const struct gk20a_mmu_level *l,
59 u32 kind_v, u64 *ctag, 109 struct nvgpu_gmmu_pd *pd,
60 bool cacheable, bool unmapped_pte, 110 u32 pd_idx,
61 int rw_flag, bool sparse, bool priv, 111 u64 phys_addr,
62 enum nvgpu_aperture aperture); 112 u64 virt_addr,
63 size_t entry_size; 113 struct nvgpu_gmmu_attrs *attrs);
114 u32 entry_size;
64}; 115};
65 116
66int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm, 117static inline const char *nvgpu_gmmu_perm_str(enum gk20a_mem_rw_flag p)
67 enum gmmu_pgsz_gk20a pgsz_idx, 118{
68 const struct gk20a_mmu_level *l, 119 switch (p) {
69 struct gk20a_mm_entry *entry, 120 case gk20a_mem_flag_none:
70 struct gk20a_mm_entry *prev_entry); 121 return "RW";
122 case gk20a_mem_flag_write_only:
123 return "WO";
124 case gk20a_mem_flag_read_only:
125 return "RO";
126 default:
127 return "??";
128 }
129}
130
131int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm);
71 132
72/** 133/**
73 * nvgpu_gmmu_map - Map memory into the GMMU. 134 * nvgpu_gmmu_map - Map memory into the GMMU.
@@ -106,6 +167,33 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
106 u64 gpu_va); 167 u64 gpu_va);
107 168
108void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, 169void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
109 struct gk20a_mm_entry *entry); 170 struct nvgpu_gmmu_pd *entry);
171
172/*
173 * Some useful routines that are shared across chips.
174 */
175static inline u32 pd_offset_from_index(const struct gk20a_mmu_level *l,
176 u32 pd_idx)
177{
178 return (pd_idx * l->entry_size) / sizeof(u32);
179}
180
181static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
182 size_t w, size_t data)
183{
184 nvgpu_mem_wr32(g, &pd->mem, w, data);
185}
186
187
188/*
189 * Internal debugging routines. Probably not something you want to use.
190 */
191#define pte_dbg(g, attrs, fmt, args...) \
192 do { \
193 if (attrs && attrs->debug) \
194 nvgpu_info(g, fmt, ##args); \
195 else \
196 nvgpu_log(g, gpu_dbg_pte, fmt, ##args); \
197 } while (0)
110 198
111#endif 199#endif
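The inline helpers added above concentrate the byte-offset math in one place: pd_offset_from_index() converts an entry index into an offset in 32-bit words using the level's entry_size, and pd_write() stores one word of the entry into the directory's backing memory. Below is a self-contained illustration of that arithmetic; the 8-byte and 16-byte entry sizes are inferred from the two- and four-word writes in the gp10b code above rather than taken from the hardware headers, and a plain array stands in for the nvgpu_mem behind the directory.

/*
 * Self-contained illustration of the pd_offset_from_index()/pd_write()
 * word arithmetic. Signatures are simplified stand-ins: the real helper
 * takes the gk20a_mmu_level and the real write goes through nvgpu_mem_wr32().
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t pd_offset_from_index(uint32_t entry_size, uint32_t pd_idx)
{
	/* Entry index -> offset in 32-bit words, same math as the inline. */
	return (pd_idx * entry_size) / sizeof(uint32_t);
}

static void pd_write(uint32_t *pd_mem, size_t w, uint32_t data)
{
	pd_mem[w] = data;	/* stands in for nvgpu_mem_wr32() */
}

int main(void)
{
	uint32_t pd_mem[64] = { 0 };
	unsigned int off;

	/* An 8-byte entry (two words), index 3: words 6 and 7. */
	off = pd_offset_from_index(8, 3);
	pd_write(pd_mem, off + 0, 0xaaaaaaaa);
	pd_write(pd_mem, off + 1, 0xbbbbbbbb);
	printf("8-byte entry,  idx 3 -> words %u..%u\n", off, off + 1);

	/* A 16-byte dual entry (four words), index 2: words 8..11. */
	off = pd_offset_from_index(16, 2);
	printf("16-byte entry, idx 2 -> words %u..%u\n", off, off + 3);
	return 0;
}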
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
index 66d04ab8..4259d40f 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
@@ -109,9 +109,9 @@ nvgpu_mem_from_clear_list_entry(struct nvgpu_list_node *node)
109static inline const char *nvgpu_aperture_str(enum nvgpu_aperture aperture) 109static inline const char *nvgpu_aperture_str(enum nvgpu_aperture aperture)
110{ 110{
111 switch (aperture) { 111 switch (aperture) {
112 case APERTURE_INVALID: return "invalid"; 112 case APERTURE_INVALID: return "INVAL";
113 case APERTURE_SYSMEM: return "sysmem"; 113 case APERTURE_SYSMEM: return "SYSMEM";
114 case APERTURE_VIDMEM: return "vidmem"; 114 case APERTURE_VIDMEM: return "VIDMEM";
115 }; 115 };
116 return "UNKNOWN"; 116 return "UNKNOWN";
117} 117}
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vm.h b/drivers/gpu/nvgpu/include/nvgpu/vm.h
index f6d88cc3..255b4361 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vm.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vm.h
@@ -126,6 +126,7 @@ mapped_buffer_from_rbtree_node(struct nvgpu_rbtree_node *node)
126struct vm_gk20a { 126struct vm_gk20a {
127 struct mm_gk20a *mm; 127 struct mm_gk20a *mm;
128 struct gk20a_as_share *as_share; /* as_share this represents */ 128 struct gk20a_as_share *as_share; /* as_share this represents */
129 char name[20];
129 130
130 u64 va_start; 131 u64 va_start;
131 u64 va_limit; 132 u64 va_limit;
@@ -145,7 +146,7 @@ struct vm_gk20a {
145 146
146 struct nvgpu_mutex update_gmmu_lock; 147 struct nvgpu_mutex update_gmmu_lock;
147 148
148 struct gk20a_mm_entry pdb; 149 struct nvgpu_gmmu_pd pdb;
149 150
150 /* 151 /*
151 * These structs define the address spaces. In some cases it's possible 152 * These structs define the address spaces. In some cases it's possible