-rw-r--r--  drivers/gpu/nvgpu/common/mm/gmmu.c            | 976
-rw-r--r--  drivers/gpu/nvgpu/common/mm/vm.c              |  36
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fb_gk20a.c            |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c            | 306
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h            |  16
-rw-r--r--  drivers/gpu/nvgpu/gp10b/mm_gp10b.c            | 309
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/gmmu.h        | 136
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h   |   6
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/vm.h          |   3
9 files changed, 979 insertions, 811 deletions
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c
index 06291600..ec1bc095 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu.c
@@ -25,115 +25,26 @@
25#include "gk20a/gk20a.h" 25#include "gk20a/gk20a.h"
26#include "gk20a/mm_gk20a.h" 26#include "gk20a/mm_gk20a.h"
27 27
28#define gmmu_dbg(g, fmt, args...) \ 28#define __gmmu_dbg(g, attrs, fmt, args...) \
29 nvgpu_log(g, gpu_dbg_map, fmt, ##args) 29 do { \
30#define gmmu_dbg_v(g, fmt, args...) \ 30 if (attrs->debug) \
31 nvgpu_log(g, gpu_dbg_map_v, fmt, ##args) 31 nvgpu_info(g, fmt, ##args); \
32 32 else \
33static int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) 33 nvgpu_log(g, gpu_dbg_map, fmt, ##args); \
34{ 34 } while (0)
35 return nvgpu_mem_begin(g, &entry->mem); 35
36} 36#define __gmmu_dbg_v(g, attrs, fmt, args...) \
37 37 do { \
38static void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) 38 if (attrs->debug) \
39{ 39 nvgpu_info(g, fmt, ##args); \
40 nvgpu_mem_end(g, &entry->mem); 40 else \
41} 41 nvgpu_log(g, gpu_dbg_map_v, fmt, ##args); \
42 42 } while (0)
43static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 order, 43
44 struct gk20a_mm_entry *entry) 44static int pd_allocate(struct vm_gk20a *vm,
45{ 45 struct nvgpu_gmmu_pd *pd,
46 struct gk20a *g = gk20a_from_vm(vm); 46 const struct gk20a_mmu_level *l,
47 u32 num_pages = 1 << order; 47 struct nvgpu_gmmu_attrs *attrs);
48 u32 len = num_pages * PAGE_SIZE;
49 int err;
50
51 err = nvgpu_dma_alloc(g, len, &entry->mem);
52
53 if (err) {
54 nvgpu_err(g, "memory allocation failed");
55 return -ENOMEM;
56 }
57
58 return 0;
59}
60
61void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
62 struct gk20a_mm_entry *entry)
63{
64 struct gk20a *g = gk20a_from_vm(vm);
65
66 if (!entry->mem.size)
67 return;
68
69 if (entry->woffset) /* fake shadow mem */
70 return;
71
72 nvgpu_dma_free(g, &entry->mem);
73}
74
75/*
76 * Allocate a phys contig region big enough for a full
77 * sized gmmu page table for the given gmmu_page_size.
78 * the whole range is zeroed so it's "invalid"/will fault.
79 *
80 * If a previous entry is supplied, its memory will be used for
81 * suballocation for this next entry too, if there is space.
82 */
83int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm,
84 enum gmmu_pgsz_gk20a pgsz_idx,
85 const struct gk20a_mmu_level *l,
86 struct gk20a_mm_entry *entry,
87 struct gk20a_mm_entry *prev_entry)
88{
89 int err = -ENOMEM;
90 int order;
91 struct gk20a *g = gk20a_from_vm(vm);
92 u32 bytes;
93
94 /* allocate enough pages for the table */
95 order = l->hi_bit[pgsz_idx] - l->lo_bit[pgsz_idx] + 1;
96 order += ilog2(l->entry_size);
97 bytes = 1 << order;
98 order -= PAGE_SHIFT;
99 if (order < 0 && prev_entry) {
100 /* try to suballocate from previous chunk */
101 u32 capacity = prev_entry->mem.size / bytes;
102 u32 prev = prev_entry->woffset * sizeof(u32) / bytes;
103 u32 free = capacity - prev - 1;
104
105 nvgpu_log(g, gpu_dbg_pte, "cap %d prev %d free %d bytes %d",
106 capacity, prev, free, bytes);
107
108 if (free) {
109 memcpy(&entry->mem, &prev_entry->mem,
110 sizeof(entry->mem));
111 entry->woffset = prev_entry->woffset
112 + bytes / sizeof(u32);
113 err = 0;
114 }
115 }
116
117 if (err) {
118 /* no suballoc space */
119 order = max(0, order);
120 err = nvgpu_alloc_gmmu_pages(vm, order, entry);
121 entry->woffset = 0;
122 }
123
124 nvgpu_log(g, gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d, woff %x",
125 entry,
126 (entry->mem.priv.sgt &&
127 entry->mem.aperture == APERTURE_SYSMEM) ?
128 g->ops.mm.get_iova_addr(g, entry->mem.priv.sgt->sgl, 0) : 0,
129 order, entry->woffset);
130 if (err)
131 return err;
132 entry->pgsz = pgsz_idx;
133 entry->mem.skip_wmb = true;
134
135 return err;
136}
137 48
138/* 49/*
139 * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU 50 * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU
@@ -225,103 +136,484 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va)
225 nvgpu_mutex_release(&vm->update_gmmu_lock); 136 nvgpu_mutex_release(&vm->update_gmmu_lock);
226} 137}
227 138
228static int update_gmmu_level_locked(struct vm_gk20a *vm, 139int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
229 struct gk20a_mm_entry *pte, 140{
230 enum gmmu_pgsz_gk20a pgsz_idx, 141 /*
231 struct scatterlist **sgl, 142 * Need this just for page size. Everything else can be ignored. Also
 232 u64 *offset, 143 * note that we can just use pgsz 0 (i.e. small pages) since the number
233 u64 *iova, 144 * of bits present in the top level PDE are the same for small/large
234 u64 gpu_va, u64 gpu_end, 145 * page VMs.
235 u8 kind_v, u64 *ctag, 146 */
236 bool cacheable, bool unmapped_pte, 147 struct nvgpu_gmmu_attrs attrs = {
237 int rw_flag, 148 .pgsz = 0,
238 bool sparse, 149 };
239 int lvl, 150
240 bool priv, 151 return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs);
241 enum nvgpu_aperture aperture) 152}
153
154
155/*
156 * Ensure that there's a CPU mapping for the page directory memory. This won't
157 * always be the case for 32 bit systems since we may need to save kernel
158 * virtual memory.
159 */
160static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry)
161{
162 return nvgpu_mem_begin(g, &entry->mem);
163}
164
165/*
 166 * Handle any necessary CPU unmap semantics for a page directory's DMA memory.
167 * For 64 bit platforms this is a noop.
168 */
169static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry)
170{
171 nvgpu_mem_end(g, &entry->mem);
172}
173
174static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes,
175 struct nvgpu_gmmu_pd *pd)
242{ 176{
243 struct gk20a *g = gk20a_from_vm(vm); 177 struct gk20a *g = gk20a_from_vm(vm);
244 const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; 178 unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS;
245 const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1]; 179 int err;
246 int err = 0; 180
247 u32 pde_i; 181 /*
248 u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx]; 182 * On arm32 vmalloc space is a precious commodity so we do not map pages
249 struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL; 183 * by default.
184 */
185 if (!IS_ENABLED(CONFIG_ARM64))
186 flags |= NVGPU_DMA_NO_KERNEL_MAPPING;
187
188 err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem);
189 if (err)
190 return -ENOMEM;
191
192 return 0;
193}
194
195void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
196 struct nvgpu_gmmu_pd *pd)
197{
198 struct gk20a *g = gk20a_from_vm(vm);
199
200 nvgpu_dma_free(g, &pd->mem);
201}
202
203/*
204 * Return the _physical_ address of a page directory.
205 */
206u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
207{
208 if (g->mm.has_physical_mode)
209 return sg_phys(pd->mem.priv.sgt->sgl);
210 else
211 return nvgpu_mem_get_base_addr(g, &pd->mem, 0);
212}
213
214/*
215 * Return the aligned length based on the page size in attrs.
216 */
217static u64 nvgpu_align_map_length(struct vm_gk20a *vm, u64 length,
218 struct nvgpu_gmmu_attrs *attrs)
219{
220 u64 page_size = vm->gmmu_page_sizes[attrs->pgsz];
221
222 return ALIGN(length, page_size);
223}
224
225static u32 pd_entries(const struct gk20a_mmu_level *l,
226 struct nvgpu_gmmu_attrs *attrs)
227{
228 /*
229 * Number of entries in a PD is easy to compute from the number of bits
230 * used to index the page directory. That is simply 2 raised to the
231 * number of bits.
232 */
233 return 1UL << (l->hi_bit[attrs->pgsz] - l->lo_bit[attrs->pgsz] + 1UL);
234}
235
236/*
237 * Computes the size of a PD table.
238 */
239static u32 pd_size(const struct gk20a_mmu_level *l,
240 struct nvgpu_gmmu_attrs *attrs)
241{
242 return pd_entries(l, attrs) * l->entry_size;
243}
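
The two helpers above turn page-directory sizing into pure bit arithmetic: the entry count is 2 raised to the number of VA bits the level consumes, and the table size is that count times the per-entry size. A standalone sketch of the same computation, using made-up hi_bit/lo_bit/entry_size values rather than any real gk20a_mmu_level table:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical level descriptor mirroring the fields used above. */
struct mmu_level {
        int hi_bit[2];          /* indexed by page-size index (0 = small, 1 = big) */
        int lo_bit[2];
        uint32_t entry_size;    /* bytes per PDE/PTE */
};

static uint32_t pd_entries(const struct mmu_level *l, int pgsz)
{
        /* 2^(number of VA bits that index this directory) */
        return 1UL << (l->hi_bit[pgsz] - l->lo_bit[pgsz] + 1);
}

static uint32_t pd_size(const struct mmu_level *l, int pgsz)
{
        return pd_entries(l, pgsz) * l->entry_size;
}

int main(void)
{
        /* Assumed values: a top-level PD indexed by VA bits 37..26, 8-byte entries. */
        struct mmu_level l = {
                .hi_bit = { 37, 37 },
                .lo_bit = { 26, 26 },
                .entry_size = 8,
        };

        /* 12 index bits -> 4096 entries -> 32768-byte table. */
        printf("entries = %u, table size = %u bytes\n",
               pd_entries(&l, 0), pd_size(&l, 0));
        return 0;
}
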
244
245/*
246 * Allocate a physically contiguous region big enough for a gmmu page table
247 * of the specified level and page size. The whole range is zeroed so that any
248 * accesses will fault until proper values are programmed.
249 */
250static int pd_allocate(struct vm_gk20a *vm,
251 struct nvgpu_gmmu_pd *pd,
252 const struct gk20a_mmu_level *l,
253 struct nvgpu_gmmu_attrs *attrs)
254{
255 int err;
250 256
251 gk20a_dbg_fn(""); 257 if (pd->mem.size)
258 return 0;
252 259
253 pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL)) 260 err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd);
254 >> (u64)l->lo_bit[pgsz_idx]; 261 if (err) {
262 nvgpu_info(vm->mm->g, "error allocating page directory!");
263 return err;
264 }
255 265
256 gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx", 266 /*
257 pgsz_idx, lvl, gpu_va, gpu_end-1, *iova); 267 * One mb() is done after all mapping operations. Don't need individual
268 * barriers for each PD write.
269 */
270 pd->mem.skip_wmb = true;
258 271
259 while (gpu_va < gpu_end) { 272 return 0;
260 u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end); 273}
261 274
262 /* Allocate next level */ 275/*
276 * Compute what page directory index at the passed level the passed virtual
277 * address corresponds to. @attrs is necessary for determining the page size
278 * which is used to pick the right bit offsets for the GMMU level.
279 */
280static u32 pd_index(const struct gk20a_mmu_level *l, u64 virt,
281 struct nvgpu_gmmu_attrs *attrs)
282{
283 u64 pd_mask = (1ULL << ((u64)l->hi_bit[attrs->pgsz] + 1)) - 1ULL;
284 u32 pd_shift = (u64)l->lo_bit[attrs->pgsz];
285
286 /*
287 * For convenience we don't bother computing the lower bound of the
288 * mask; it's easier to just shift it off.
289 */
290 return (virt & pd_mask) >> pd_shift;
291}
292
293static int pd_allocate_children(struct vm_gk20a *vm,
294 const struct gk20a_mmu_level *l,
295 struct nvgpu_gmmu_pd *pd,
296 struct nvgpu_gmmu_attrs *attrs)
297{
298 struct gk20a *g = gk20a_from_vm(vm);
299
300 if (pd->entries)
301 return 0;
302
303 pd->num_entries = pd_entries(l, attrs);
304 pd->entries = nvgpu_vzalloc(g, sizeof(struct nvgpu_gmmu_pd) *
305 pd->num_entries);
306 if (!pd->entries)
307 return -ENOMEM;
308
309 return 0;
310}
311
312/*
313 * This function programs the GMMU based on two ranges: a physical range and a
314 * GPU virtual range. The virtual is mapped to the physical. Physical in this
 315 * case can mean either a real physical sysmem address or an IO virtual address
316 * (for instance when a system has an IOMMU running).
317 *
318 * The rest of the parameters are for describing the actual mapping itself.
319 *
320 * This function recursively calls itself for handling PDEs. At the final level
321 * a PTE handler is called. The phys and virt ranges are adjusted for each
322 * recursion so that each invocation of this function need only worry about the
323 * range it is passed.
324 *
325 * phys_addr will always point to a contiguous range - the discontiguous nature
326 * of DMA buffers is taken care of at the layer above this.
327 */
328static int __set_pd_level(struct vm_gk20a *vm,
329 struct nvgpu_gmmu_pd *pd,
330 int lvl,
331 u64 phys_addr,
332 u64 virt_addr, u64 length,
333 struct nvgpu_gmmu_attrs *attrs)
334{
335 int err = 0;
336 u64 pde_range;
337 struct gk20a *g = gk20a_from_vm(vm);
338 struct nvgpu_gmmu_pd *next_pd = NULL;
339 const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl];
340 const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl + 1];
341
342 /*
343 * 5 levels for Pascal+. For pre-pascal we only have 2. This puts
344 * offsets into the page table debugging code which makes it easier to
345 * see what level prints are from.
346 */
347 static const char *__lvl_debug[] = {
348 "", /* L=0 */
349 " ", /* L=1 */
350 " ", /* L=2 */
351 " ", /* L=3 */
352 " ", /* L=4 */
353 };
354
355 pde_range = 1ULL << (u64)l->lo_bit[attrs->pgsz];
356
357 __gmmu_dbg_v(g, attrs,
358 "L=%d %sGPU virt %#-12llx +%#-9llx -> phys %#-12llx",
359 lvl,
360 __lvl_debug[lvl],
361 virt_addr,
362 length,
363 phys_addr);
364
365 /*
366 * Iterate across the mapping in chunks the size of this level's PDE.
367 * For each of those chunks program our level's PDE and then, if there's
368 * a next level, program the next level's PDEs/PTEs.
369 */
370 while (length) {
371 u32 pd_idx = pd_index(l, virt_addr, attrs);
372 u64 chunk_size;
373 u64 target_addr;
374
375 /*
376 * Truncate the pde_range when the virtual address does not
377 * start at a PDE boundary.
378 */
379 chunk_size = min(length,
380 pde_range - (virt_addr & (pde_range - 1)));
381
382 /*
383 * If the next level has an update_entry function then we know
384 * that _this_ level points to PDEs (not PTEs). Thus we need to
385 * have a bunch of children PDs.
386 */
263 if (next_l->update_entry) { 387 if (next_l->update_entry) {
264 if (!pte->entries) { 388 if (pd_allocate_children(vm, l, pd, attrs))
265 int num_entries = 389 return -ENOMEM;
266 1 << 390
267 (l->hi_bit[pgsz_idx] 391 /*
268 - l->lo_bit[pgsz_idx] + 1); 392 * Get the next PD so that we know what to put in this
269 pte->entries = 393 * current PD. If the next level is actually PTEs then
270 nvgpu_vzalloc(g, 394 * we don't need this - we will just use the real
271 sizeof(struct gk20a_mm_entry) * 395 * physical target.
272 num_entries); 396 */
273 if (!pte->entries) 397 next_pd = &pd->entries[pd_idx];
274 return -ENOMEM; 398
275 pte->pgsz = pgsz_idx; 399 /*
276 pte->num_entries = num_entries; 400 * Allocate the backing memory for next_pd.
277 } 401 */
278 prev_pte = next_pte; 402 if (pd_allocate(vm, next_pd, next_l, attrs))
279 next_pte = pte->entries + pde_i; 403 return -ENOMEM;
280
281 if (!next_pte->mem.size) {
282 err = nvgpu_zalloc_gmmu_page_table(vm,
283 pgsz_idx, next_l, next_pte, prev_pte);
284 if (err)
285 return err;
286 }
287 } 404 }
288 405
289 err = l->update_entry(vm, pte, pde_i, pgsz_idx, 406 /*
290 sgl, offset, iova, 407 * This is the address we want to program into the actual PDE/
291 kind_v, ctag, cacheable, unmapped_pte, 408 * PTE. When the next level is PDEs we need the target address
292 rw_flag, sparse, priv, aperture); 409 * to be the table of PDEs. When the next level is PTEs the
293 if (err) 410 * target addr is the real physical address we are aiming for.
294 return err; 411 */
412 target_addr = next_pd ? nvgpu_pde_phys_addr(g, next_pd) :
413 phys_addr;
414
415 l->update_entry(vm, l,
416 pd, pd_idx,
417 virt_addr,
418 target_addr,
419 attrs);
295 420
296 if (next_l->update_entry) { 421 if (next_l->update_entry) {
297 /* get cpu access to the ptes */ 422 err = map_gmmu_pages(g, next_pd);
298 err = map_gmmu_pages(g, next_pte);
299 if (err) { 423 if (err) {
300 nvgpu_err(g, 424 nvgpu_err(g,
301 "couldn't map ptes for update as=%d", 425 "couldn't map ptes for update as=%d",
302 vm_aspace_id(vm)); 426 vm_aspace_id(vm));
303 return err; 427 return err;
304 } 428 }
305 err = update_gmmu_level_locked(vm, next_pte, 429
306 pgsz_idx, 430 err = __set_pd_level(vm, next_pd,
307 sgl, 431 lvl + 1,
308 offset, 432 phys_addr,
309 iova, 433 virt_addr,
310 gpu_va, 434 chunk_size,
311 next, 435 attrs);
312 kind_v, ctag, cacheable, unmapped_pte, 436 unmap_gmmu_pages(g, next_pd);
313 rw_flag, sparse, lvl+1, priv, aperture);
314 unmap_gmmu_pages(g, next_pte);
315 437
316 if (err) 438 if (err)
317 return err; 439 return err;
318 } 440 }
319 441
320 pde_i++; 442 virt_addr += chunk_size;
321 gpu_va = next; 443
444 /*
445 * Only add to phys_addr if it's non-zero. A zero value implies
 446 * we are unmapping and as a result we don't want to place
447 * non-zero phys addresses in the PTEs. A non-zero phys-addr
448 * would also confuse the lower level PTE programming code.
449 */
450 if (phys_addr)
451 phys_addr += chunk_size;
452 length -= chunk_size;
453 }
454
455 __gmmu_dbg_v(g, attrs, "L=%d %s%s", lvl, __lvl_debug[lvl], "ret!");
456
457 return 0;
458}
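
The skeleton below mirrors the traversal in __set_pd_level(): walk the range in PDE-sized chunks, truncate the first chunk when the virtual address is not PDE aligned, recurse for the next level, and treat a zero phys_addr as an unmap. The level layout (the lo_bit values) is assumed and the hardware programming is replaced by a printf; it illustrates the recursion shape only, not the driver code:

#include <stdio.h>
#include <stdint.h>

/* Assumed 3-level layout: low bit of each level's VA field; last level = PTEs. */
static const int lo_bit[] = { 30, 21, 12 };
#define NUM_LEVELS 3

static void set_pd_level(int lvl, uint64_t phys, uint64_t virt, uint64_t len)
{
        uint64_t pde_range = 1ULL << lo_bit[lvl];

        while (len) {
                uint64_t space = pde_range - (virt & (pde_range - 1));
                uint64_t chunk = len < space ? len : space;

                /* The real code computes pd_index() here and programs the
                 * PDE/PTE through l->update_entry(). */
                printf("L=%d virt=%#llx phys=%#llx chunk=%#llx\n",
                       lvl, (unsigned long long)virt,
                       (unsigned long long)phys, (unsigned long long)chunk);

                if (lvl + 1 < NUM_LEVELS)       /* next level exists: recurse */
                        set_pd_level(lvl + 1, phys, virt, chunk);

                virt += chunk;
                if (phys)                       /* phys == 0 means "unmap" */
                        phys += chunk;
                len -= chunk;
        }
}

int main(void)
{
        /* Map 16 KiB at GPU VA 1 GiB + 1 MiB, physically at 256 MiB. */
        set_pd_level(0, 0x10000000ULL, 0x40100000ULL, 16ULL << 10);
        return 0;
}
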
459
460/*
461 * VIDMEM version of the update_ptes logic.
462 */
463static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm,
464 struct sg_table *sgt,
465 u64 space_to_skip,
466 u64 virt_addr,
467 u64 length,
468 struct nvgpu_gmmu_attrs *attrs)
469{
470 struct nvgpu_page_alloc *alloc = NULL;
471 struct page_alloc_chunk *chunk = NULL;
472 u64 phys_addr, chunk_length;
473 int err = 0;
474
475 if (!sgt) {
476 /*
477 * This is considered an unmap. Just pass in 0 as the physical
478 * address for the entire GPU range.
479 */
480 err = __set_pd_level(vm, &vm->pdb,
481 0,
482 0,
483 virt_addr, length,
484 attrs);
485 return err;
486 }
487
488 alloc = get_vidmem_page_alloc(sgt->sgl);
489
490 /*
491 * Otherwise iterate across all the chunks in this allocation and
492 * map them.
493 */
494 nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks,
495 page_alloc_chunk, list_entry) {
496 if (space_to_skip &&
497 space_to_skip >= chunk->length) {
498 space_to_skip -= chunk->length;
499 continue;
500 }
501
502 phys_addr = chunk->base + space_to_skip;
503 chunk_length = min(length, (chunk->length - space_to_skip));
504
505 err = __set_pd_level(vm, &vm->pdb,
506 0,
507 phys_addr,
508 virt_addr, length,
509 attrs);
510 if (err)
511 break;
512
513 /* Space has been skipped so zero this for future chunks. */
514 space_to_skip = 0;
515
516 /*
517 * Update the map pointer and the remaining length.
518 */
519 virt_addr += chunk_length;
520 length -= chunk_length;
521
522 if (length == 0)
523 break;
322 } 524 }
323 525
324 gk20a_dbg_fn("done"); 526 return err;
527}
528
529static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm,
530 struct sg_table *sgt,
531 u64 space_to_skip,
532 u64 virt_addr,
533 u64 length,
534 struct nvgpu_gmmu_attrs *attrs)
535{
536 int err;
537 struct scatterlist *sgl;
538 struct gk20a *g = gk20a_from_vm(vm);
539
540 if (!sgt) {
541 /*
542 * This is considered an unmap. Just pass in 0 as the physical
543 * address for the entire GPU range.
544 */
545 err = __set_pd_level(vm, &vm->pdb,
546 0,
547 0,
548 virt_addr, length,
549 attrs);
550 return err;
551 }
552
553 /*
554 * At this point we have a Linux scatter-gather list pointing to some
555 * number of discontiguous chunks of memory. Iterate over that list and
556 * generate a GMMU map call for each chunk. There are two possibilities:
557 * either the IOMMU is enabled or not. When the IOMMU is enabled the
558 * mapping is simple since the "physical" address is actually a virtual
559 * IO address and will be contiguous. The no-IOMMU case is more
560 * complicated. We will have to iterate over the SGT and do a separate
561 * map for each chunk of the SGT.
562 */
563 sgl = sgt->sgl;
564
565 if (!g->mm.bypass_smmu) {
566 u64 io_addr = g->ops.mm.get_iova_addr(g, sgl, 0);
567
568 io_addr += space_to_skip;
569
570 err = __set_pd_level(vm, &vm->pdb,
571 0,
572 io_addr,
573 virt_addr,
574 length,
575 attrs);
576
577 return err;
578 }
579
580 /*
581 * Finally: last possible case: do the no-IOMMU mapping. In this case we
582 * really are mapping physical pages directly.
583 */
584 while (sgl) {
585 u64 phys_addr;
586 u64 chunk_length;
587
588 /*
589 * Cut out sgl ents for space_to_skip.
590 */
591 if (space_to_skip && space_to_skip >= sgl->length) {
592 space_to_skip -= sgl->length;
593 sgl = sg_next(sgl);
594 continue;
595 }
596
597 phys_addr = sg_phys(sgl) + space_to_skip;
598 chunk_length = min(length, sgl->length - space_to_skip);
599
600 err = __set_pd_level(vm, &vm->pdb,
601 0,
602 phys_addr,
603 virt_addr,
604 chunk_length,
605 attrs);
606 if (err)
607 return err;
608
609 space_to_skip = 0;
610 virt_addr += chunk_length;
611 length -= chunk_length;
612 sgl = sg_next(sgl);
613
614 if (length == 0)
615 break;
616 }
325 617
326 return 0; 618 return 0;
327} 619}
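
Both the VIDMEM chunk loop above and this no-IOMMU scatterlist loop follow the same pattern: consume whole entries that fall inside space_to_skip, then map each remaining entry (only the first one offset by whatever skip is left) until length runs out. A self-contained sketch of that pattern over a plain array standing in for the Linux scatterlist:

#include <stdio.h>
#include <stdint.h>

struct chunk {                  /* stand-in for one scatterlist entry */
        uint64_t phys;
        uint64_t length;
};

static void map_chunks(const struct chunk *sgl, int nents,
                       uint64_t space_to_skip, uint64_t virt, uint64_t length)
{
        for (int i = 0; i < nents && length; i++) {
                /* Skip whole entries that fall entirely in the skip area. */
                if (space_to_skip && space_to_skip >= sgl[i].length) {
                        space_to_skip -= sgl[i].length;
                        continue;
                }

                uint64_t phys = sgl[i].phys + space_to_skip;
                uint64_t n = sgl[i].length - space_to_skip;

                if (n > length)
                        n = length;

                printf("map virt=%#llx -> phys=%#llx len=%#llx\n",
                       (unsigned long long)virt, (unsigned long long)phys,
                       (unsigned long long)n);

                space_to_skip = 0;      /* only the first mapped entry is offset */
                virt += n;
                length -= n;
        }
}

int main(void)
{
        struct chunk sgl[] = {
                { 0x80000000, 0x4000 },         /* 16 KiB */
                { 0x90000000, 0x2000 },         /*  8 KiB */
                { 0xa0000000, 0x8000 },         /* 32 KiB */
        };

        /* Skip 20 KiB into the buffer, then map 12 KiB at GPU VA 4 GiB. */
        map_chunks(sgl, 3, 0x5000, 0x100000000ULL, 0x3000);
        return 0;
}
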
@@ -332,8 +624,8 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm,
332 * physical* address. 624 * physical* address.
333 * 625 *
334 * The update of each level of the page tables is farmed out to chip specific 626 * The update of each level of the page tables is farmed out to chip specific
335 * implementations. But the logic around that is generic to all chips. Every chip 627 * implementations. But the logic around that is generic to all chips. Every
336 * has some number of PDE levels and then a PTE level. 628 * chip has some number of PDE levels and then a PTE level.
337 * 629 *
338 * Each chunk of the incoming SGT is sent to the chip specific implementation 630 * Each chunk of the incoming SGT is sent to the chip specific implementation
339 * of page table update. 631 * of page table update.
@@ -341,148 +633,81 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm,
341 * [*] Note: the "physical" address may actually be an IO virtual address in the 633 * [*] Note: the "physical" address may actually be an IO virtual address in the
342 * case of SMMU usage. 634 * case of SMMU usage.
343 */ 635 */
344static int update_gmmu_ptes_locked(struct vm_gk20a *vm, 636static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm,
345 enum gmmu_pgsz_gk20a pgsz_idx, 637 struct sg_table *sgt,
346 struct sg_table *sgt, 638 u64 space_to_skip,
347 u64 buffer_offset, 639 u64 virt_addr,
348 u64 gpu_va, u64 gpu_end, 640 u64 length,
349 u8 kind_v, u32 ctag_offset, 641 struct nvgpu_gmmu_attrs *attrs)
350 bool cacheable, bool unmapped_pte,
351 int rw_flag,
352 bool sparse,
353 bool priv,
354 enum nvgpu_aperture aperture)
355{ 642{
356 struct gk20a *g = gk20a_from_vm(vm); 643 struct gk20a *g = gk20a_from_vm(vm);
357 int ctag_granularity = g->ops.fb.compression_page_size(g); 644 u32 page_size;
358 u64 ctag = (u64)ctag_offset * (u64)ctag_granularity;
359 u64 iova = 0;
360 u64 space_to_skip = buffer_offset;
361 u64 map_size = gpu_end - gpu_va;
362 u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
363 int err; 645 int err;
364 struct scatterlist *sgl = NULL;
365 struct nvgpu_page_alloc *alloc = NULL;
366 struct page_alloc_chunk *chunk = NULL;
367 u64 length;
368 646
369 /* note: here we need to map kernel to small, since the 647 /* note: here we need to map kernel to small, since the
370 * low-level mmu code assumes 0 is small and 1 is big pages */ 648 * low-level mmu code assumes 0 is small and 1 is big pages */
371 if (pgsz_idx == gmmu_page_size_kernel) 649 if (attrs->pgsz == gmmu_page_size_kernel)
372 pgsz_idx = gmmu_page_size_small; 650 attrs->pgsz = gmmu_page_size_small;
651
652 page_size = vm->gmmu_page_sizes[attrs->pgsz];
373 653
374 if (space_to_skip & (page_size - 1)) 654 if (space_to_skip & (page_size - 1))
375 return -EINVAL; 655 return -EINVAL;
376 656
657 /*
658 * Update length to be aligned to the passed page size.
659 */
660 length = nvgpu_align_map_length(vm, length, attrs);
661
377 err = map_gmmu_pages(g, &vm->pdb); 662 err = map_gmmu_pages(g, &vm->pdb);
378 if (err) { 663 if (err) {
379 nvgpu_err(g, 664 nvgpu_err(g, "couldn't map ptes for update as=%d",
380 "couldn't map ptes for update as=%d", 665 vm_aspace_id(vm));
381 vm_aspace_id(vm));
382 return err; 666 return err;
383 } 667 }
384 668
385 if (aperture == APERTURE_VIDMEM) { 669 __gmmu_dbg(g, attrs,
386 gmmu_dbg_v(g, "vidmem map size_idx=%d, gpu_va=[%llx,%llx]", 670 "vm=%s "
387 pgsz_idx, gpu_va, gpu_end-1); 671 "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx "
388 672 "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | "
389 if (sgt) { 673 "kind=%#02x APT=%-6s %c%c%c",
390 alloc = get_vidmem_page_alloc(sgt->sgl); 674 vm->name,
391 675 sgt ? "MAP" : "UNMAP",
392 nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, 676 virt_addr,
393 page_alloc_chunk, list_entry) { 677 length,
394 if (space_to_skip && 678 sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL,
395 space_to_skip > chunk->length) { 679 space_to_skip,
396 space_to_skip -= chunk->length; 680 page_size >> 10,
397 } else { 681 nvgpu_gmmu_perm_str(attrs->rw_flag),
398 iova = chunk->base + space_to_skip; 682 attrs->kind_v,
399 length = chunk->length - space_to_skip; 683 nvgpu_aperture_str(attrs->aperture),
400 length = min(length, map_size); 684 attrs->cacheable ? 'C' : 'V', /* C = cached, V = volatile. */
401 space_to_skip = 0; 685 attrs->sparse ? 'S' : '-',
402 686 attrs->priv ? 'P' : '-');
403 err = update_gmmu_level_locked(vm, 687
404 &vm->pdb, pgsz_idx, 688 /*
405 &sgl, 689 * Handle VIDMEM progamming. Currently uses a different scatter list
406 &space_to_skip, 690 * format.
407 &iova, 691 */
408 gpu_va, gpu_va + length, 692 if (attrs->aperture == APERTURE_VIDMEM)
409 kind_v, &ctag, 693 err = __nvgpu_gmmu_update_page_table_vidmem(vm,
410 cacheable, unmapped_pte, 694 sgt,
411 rw_flag, sparse, 0, priv, 695 space_to_skip,
412 aperture); 696 virt_addr,
413 if (err) 697 length,
414 break; 698 attrs);
415 699 else
416 /* need to set explicit zero here */ 700 err = __nvgpu_gmmu_update_page_table_sysmem(vm,
417 space_to_skip = 0; 701 sgt,
418 gpu_va += length; 702 space_to_skip,
419 map_size -= length; 703 virt_addr,
420 704 length,
421 if (!map_size) 705 attrs);
422 break;
423 }
424 }
425 } else {
426 err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
427 &sgl,
428 &space_to_skip,
429 &iova,
430 gpu_va, gpu_end,
431 kind_v, &ctag,
432 cacheable, unmapped_pte, rw_flag,
433 sparse, 0, priv,
434 aperture);
435 }
436 } else {
437 gmmu_dbg_v(g,
438 "pgsz=%-6d, gpu_va: %#-12llx +%#-6llx phys: %#-12llx "
439 "buffer offset: %-4lld, nents: %d",
440 page_size,
441 gpu_va, gpu_end - gpu_va,
442 sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL,
443 buffer_offset,
444 sgt ? sgt->nents : 0);
445
446 if (sgt) {
447 iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
448 if (!vm->mm->bypass_smmu && iova) {
449 iova += space_to_skip;
450 } else {
451 sgl = sgt->sgl;
452
453 gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
454 (u64)sg_phys(sgl),
455 sgl->length);
456
457 while (space_to_skip && sgl &&
458 space_to_skip + page_size > sgl->length) {
459 space_to_skip -= sgl->length;
460 sgl = sg_next(sgl);
461 gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
462 (u64)sg_phys(sgl),
463 sgl->length);
464 }
465
466 iova = sg_phys(sgl) + space_to_skip;
467 }
468 }
469
470 err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
471 &sgl,
472 &space_to_skip,
473 &iova,
474 gpu_va, gpu_end,
475 kind_v, &ctag,
476 cacheable, unmapped_pte, rw_flag,
477 sparse, 0, priv,
478 aperture);
479 }
480 706
481 unmap_gmmu_pages(g, &vm->pdb); 707 unmap_gmmu_pages(g, &vm->pdb);
482
483 mb(); 708 mb();
484 709
485 gk20a_dbg_fn("done"); 710 __gmmu_dbg(g, attrs, "%-5s Done!", sgt ? "MAP" : "UNMAP");
486 711
487 return err; 712 return err;
488} 713}
@@ -500,32 +725,44 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 500 * have the update_gmmu_lock acquired. 725 */
501 */ 726 */
502u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, 727u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
503 u64 map_offset, 728 u64 vaddr,
504 struct sg_table *sgt, 729 struct sg_table *sgt,
505 u64 buffer_offset, 730 u64 buffer_offset,
506 u64 size, 731 u64 size,
507 int pgsz_idx, 732 int pgsz_idx,
508 u8 kind_v, 733 u8 kind_v,
509 u32 ctag_offset, 734 u32 ctag_offset,
510 u32 flags, 735 u32 flags,
511 int rw_flag, 736 int rw_flag,
512 bool clear_ctags, 737 bool clear_ctags,
513 bool sparse, 738 bool sparse,
514 bool priv, 739 bool priv,
515 struct vm_gk20a_mapping_batch *batch, 740 struct vm_gk20a_mapping_batch *batch,
516 enum nvgpu_aperture aperture) 741 enum nvgpu_aperture aperture)
517{ 742{
743 struct gk20a *g = gk20a_from_vm(vm);
518 int err = 0; 744 int err = 0;
519 bool allocated = false; 745 bool allocated = false;
520 struct gk20a *g = gk20a_from_vm(vm);
521 int ctag_granularity = g->ops.fb.compression_page_size(g); 746 int ctag_granularity = g->ops.fb.compression_page_size(g);
522 u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity); 747 struct nvgpu_gmmu_attrs attrs = {
523 748 .pgsz = pgsz_idx,
524 /* Allocate (or validate when map_offset != 0) the virtual address. */ 749 .kind_v = kind_v,
525 if (!map_offset) { 750 .ctag = (u64)ctag_offset * (u64)ctag_granularity,
526 map_offset = __nvgpu_vm_alloc_va(vm, size, 751 .cacheable = flags & NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
527 pgsz_idx); 752 .rw_flag = rw_flag,
528 if (!map_offset) { 753 .sparse = sparse,
754 .priv = priv,
755 .valid = !(flags & NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE),
756 .aperture = aperture
757 };
758
759 /*
760 * Only allocate a new GPU VA range if we haven't already been passed a
761 * GPU VA range. This facilitates fixed mappings.
762 */
763 if (!vaddr) {
764 vaddr = __nvgpu_vm_alloc_va(vm, size, pgsz_idx);
765 if (!vaddr) {
529 nvgpu_err(g, "failed to allocate va space"); 766 nvgpu_err(g, "failed to allocate va space");
530 err = -ENOMEM; 767 err = -ENOMEM;
531 goto fail_alloc; 768 goto fail_alloc;
@@ -533,34 +770,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
533 allocated = true; 770 allocated = true;
534 } 771 }
535 772
536 gmmu_dbg(g, 773 err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset,
537 "gv: 0x%04x_%08x + 0x%-7llx " 774 vaddr, size, &attrs);
538 "[dma: 0x%02x_%08x, pa: 0x%02x_%08x] "
539 "pgsz=%-3dKb as=%-2d ctags=%d start=%d "
540 "kind=0x%x flags=0x%x apt=%s",
541 u64_hi32(map_offset), u64_lo32(map_offset), size,
542 sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0,
543 sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0,
544 sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0,
545 sgt ? u64_lo32((u64)sg_phys(sgt->sgl)) : 0,
546 vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm),
547 ctag_lines, ctag_offset,
548 kind_v, flags, nvgpu_aperture_str(aperture));
549
550 err = update_gmmu_ptes_locked(vm, pgsz_idx,
551 sgt,
552 buffer_offset,
553 map_offset, map_offset + size,
554 kind_v,
555 ctag_offset,
556 flags &
557 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
558 flags &
559 NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE,
560 rw_flag,
561 sparse,
562 priv,
563 aperture);
564 if (err) { 775 if (err) {
565 nvgpu_err(g, "failed to update ptes on map"); 776 nvgpu_err(g, "failed to update ptes on map");
566 goto fail_validate; 777 goto fail_validate;
@@ -571,26 +782,37 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
571 else 782 else
572 batch->need_tlb_invalidate = true; 783 batch->need_tlb_invalidate = true;
573 784
574 return map_offset; 785 return vaddr;
575fail_validate: 786fail_validate:
576 if (allocated) 787 if (allocated)
577 __nvgpu_vm_free_va(vm, map_offset, pgsz_idx); 788 __nvgpu_vm_free_va(vm, vaddr, pgsz_idx);
578fail_alloc: 789fail_alloc:
579 nvgpu_err(g, "%s: failed with err=%d", __func__, err); 790 nvgpu_err(g, "%s: failed with err=%d", __func__, err);
580 return 0; 791 return 0;
581} 792}
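
The main simplification in this function is that roughly ten positional arguments collapse into a single nvgpu_gmmu_attrs that travels down the whole update path. The sketch below shows the idea with a stripped-down struct whose field names mirror the diff; it is not the real nvgpu header or API:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Stripped-down mirrors of the fields initialized above (illustrative only). */
enum aperture { APT_INVALID, APT_SYSMEM, APT_VIDMEM };

struct gmmu_attrs {
        int pgsz;               /* page size index: 0 = small, 1 = big */
        uint8_t kind_v;         /* kind for compression/swizzling */
        uint64_t ctag;          /* running comptag offset in bytes */
        bool cacheable;
        int rw_flag;            /* gk20a_mem_flag_none/read_only/write_only */
        bool sparse;
        bool priv;
        bool valid;             /* false => write an "unmapped" PTE */
        enum aperture aperture;
};

static void update_page_table(uint64_t virt, uint64_t len,
                              const struct gmmu_attrs *attrs)
{
        printf("%s %#llx +%#llx pgsz=%d apt=%d\n",
               attrs->valid ? "MAP" : "UNMAP",
               (unsigned long long)virt, (unsigned long long)len,
               attrs->pgsz, attrs->aperture);
}

int main(void)
{
        /* One attrs struct replaces the old pile of positional flags. */
        struct gmmu_attrs attrs = {
                .pgsz = 0,
                .kind_v = 0xfe,         /* arbitrary demo kind */
                .cacheable = true,
                .valid = true,
                .aperture = APT_SYSMEM,
        };

        update_page_table(0x100000000ULL, 64 << 10, &attrs);
        return 0;
}
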
582 793
583void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, 794void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
584 u64 vaddr, 795 u64 vaddr,
585 u64 size, 796 u64 size,
586 int pgsz_idx, 797 int pgsz_idx,
587 bool va_allocated, 798 bool va_allocated,
588 int rw_flag, 799 int rw_flag,
589 bool sparse, 800 bool sparse,
590 struct vm_gk20a_mapping_batch *batch) 801 struct vm_gk20a_mapping_batch *batch)
591{ 802{
592 int err = 0; 803 int err = 0;
593 struct gk20a *g = gk20a_from_vm(vm); 804 struct gk20a *g = gk20a_from_vm(vm);
805 struct nvgpu_gmmu_attrs attrs = {
806 .pgsz = pgsz_idx,
807 .kind_v = 0,
808 .ctag = 0,
809 .cacheable = 0,
810 .rw_flag = rw_flag,
811 .sparse = sparse,
812 .priv = 0,
813 .valid = 0,
814 .aperture = APERTURE_INVALID,
815 };
594 816
595 if (va_allocated) { 817 if (va_allocated) {
596 err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); 818 err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx);
@@ -601,27 +823,11 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
601 } 823 }
602 824
603 /* unmap here needs to know the page size we assigned at mapping */ 825 /* unmap here needs to know the page size we assigned at mapping */
604 err = update_gmmu_ptes_locked(vm, 826 err = __nvgpu_gmmu_update_page_table(vm, NULL, 0,
605 pgsz_idx, 827 vaddr, size, &attrs);
606 NULL, /* n/a for unmap */
607 0,
608 vaddr,
609 vaddr + size,
610 0, 0, false /* n/a for unmap */,
611 false, rw_flag,
612 sparse, 0,
613 APERTURE_INVALID); /* don't care for unmap */
614 if (err) 828 if (err)
615 nvgpu_err(g, "failed to update gmmu ptes on unmap"); 829 nvgpu_err(g, "failed to update gmmu ptes on unmap");
616 830
617 /* flush l2 so any dirty lines are written out *now*.
618 * also as we could potentially be switching this buffer
619 * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
620 * some point in the future we need to invalidate l2. e.g. switching
621 * from a render buffer unmap (here) to later using the same memory
622 * for gmmu ptes. note the positioning of this relative to any smmu
623 * unmapping (below). */
624
625 if (!batch) { 831 if (!batch) {
626 gk20a_mm_l2_flush(g, true); 832 gk20a_mm_l2_flush(g, true);
627 g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); 833 g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c
index 88622eca..3aeba500 100644
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -36,7 +36,7 @@ int vm_aspace_id(struct vm_gk20a *vm)
36} 36}
37 37
38static void nvgpu_vm_free_entries(struct vm_gk20a *vm, 38static void nvgpu_vm_free_entries(struct vm_gk20a *vm,
39 struct gk20a_mm_entry *parent, 39 struct nvgpu_gmmu_pd *parent,
40 int level) 40 int level)
41{ 41{
42 int i; 42 int i;
@@ -75,8 +75,6 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size,
75 75
76 /* Be certain we round up to page_size if needed */ 76 /* Be certain we round up to page_size if needed */
77 size = (size + ((u64)page_size - 1)) & ~((u64)page_size - 1); 77 size = (size + ((u64)page_size - 1)) & ~((u64)page_size - 1);
78 nvgpu_log(g, gpu_dbg_map, "size=0x%llx @ pgsz=%dKB", size,
79 vm->gmmu_page_sizes[pgsz_idx] >> 10);
80 78
81 addr = nvgpu_alloc(vma, size); 79 addr = nvgpu_alloc(vma, size);
82 if (!addr) { 80 if (!addr) {
@@ -84,17 +82,14 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size,
84 return 0; 82 return 0;
85 } 83 }
86 84
87 nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr);
88 return addr; 85 return addr;
89} 86}
90 87
91int __nvgpu_vm_free_va(struct vm_gk20a *vm, u64 addr, 88int __nvgpu_vm_free_va(struct vm_gk20a *vm, u64 addr,
92 enum gmmu_pgsz_gk20a pgsz_idx) 89 enum gmmu_pgsz_gk20a pgsz_idx)
93{ 90{
94 struct gk20a *g = vm->mm->g;
95 struct nvgpu_allocator *vma = vm->vma[pgsz_idx]; 91 struct nvgpu_allocator *vma = vm->vma[pgsz_idx];
96 92
97 nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr);
98 nvgpu_free(vma, addr); 93 nvgpu_free(vma, addr);
99 94
100 return 0; 95 return 0;
@@ -127,32 +122,6 @@ void nvgpu_vm_mapping_batch_finish(struct vm_gk20a *vm,
127 nvgpu_mutex_release(&vm->update_gmmu_lock); 122 nvgpu_mutex_release(&vm->update_gmmu_lock);
128} 123}
129 124
130static int nvgpu_vm_init_page_tables(struct vm_gk20a *vm)
131{
132 u32 pde_lo, pde_hi;
133 int err;
134
135 pde_range_from_vaddr_range(vm,
136 0, vm->va_limit-1,
137 &pde_lo, &pde_hi);
138 vm->pdb.entries = nvgpu_vzalloc(vm->mm->g,
139 sizeof(struct gk20a_mm_entry) *
140 (pde_hi + 1));
141 vm->pdb.num_entries = pde_hi + 1;
142
143 if (!vm->pdb.entries)
144 return -ENOMEM;
145
146 err = nvgpu_zalloc_gmmu_page_table(vm, 0, &vm->mmu_levels[0],
147 &vm->pdb, NULL);
148 if (err) {
149 nvgpu_vfree(vm->mm->g, vm->pdb.entries);
150 return err;
151 }
152
153 return 0;
154}
155
156/* 125/*
157 * Determine if the passed address space can support big pages or not. 126 * Determine if the passed address space can support big pages or not.
158 */ 127 */
@@ -280,7 +249,8 @@ static int __nvgpu_vm_init(struct mm_gk20a *mm,
280#endif 249#endif
281 250
282 /* Initialize the page table data structures. */ 251 /* Initialize the page table data structures. */
283 err = nvgpu_vm_init_page_tables(vm); 252 strncpy(vm->name, name, min(strlen(name), sizeof(vm->name)));
253 err = nvgpu_gmmu_init_page_table(vm);
284 if (err) 254 if (err)
285 goto clean_up_vgpu_vm; 255 goto clean_up_vgpu_vm;
286 256
diff --git a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
index 3c76e817..c5f9c1fd 100644
--- a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
@@ -67,7 +67,7 @@ void gk20a_fb_tlb_invalidate(struct gk20a *g, struct nvgpu_mem *pdb)
67 if (!g->power_on) 67 if (!g->power_on)
68 return; 68 return;
69 69
70 addr_lo = u64_lo32(gk20a_mem_get_base_addr(g, pdb, 0) >> 12); 70 addr_lo = u64_lo32(nvgpu_mem_get_base_addr(g, pdb, 0) >> 12);
71 71
72 nvgpu_mutex_acquire(&g->mm.tlb_lock); 72 nvgpu_mutex_acquire(&g->mm.tlb_lock);
73 73
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index b7b68575..558a1b06 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -777,31 +777,6 @@ int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm)
777 return vm->mmu_levels[0].lo_bit[0]; 777 return vm->mmu_levels[0].lo_bit[0];
778} 778}
779 779
780/* given address range (inclusive) determine the pdes crossed */
781void pde_range_from_vaddr_range(struct vm_gk20a *vm,
782 u64 addr_lo, u64 addr_hi,
783 u32 *pde_lo, u32 *pde_hi)
784{
785 int pde_shift = gk20a_mm_pde_coverage_bit_count(vm);
786
787 *pde_lo = (u32)(addr_lo >> pde_shift);
788 *pde_hi = (u32)(addr_hi >> pde_shift);
789 gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d",
790 addr_lo, addr_hi, pde_shift);
791 gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d",
792 *pde_lo, *pde_hi);
793}
794
795static u32 pde_from_index(u32 i)
796{
797 return i * gmmu_pde__size_v() / sizeof(u32);
798}
799
800static u32 pte_from_index(u32 i)
801{
802 return i * gmmu_pte__size_v() / sizeof(u32);
803}
804
805int nvgpu_vm_get_buffers(struct vm_gk20a *vm, 780int nvgpu_vm_get_buffers(struct vm_gk20a *vm,
806 struct nvgpu_mapped_buf ***mapped_buffers, 781 struct nvgpu_mapped_buf ***mapped_buffers,
807 int *num_buffers) 782 int *num_buffers)
@@ -1478,7 +1453,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem)
1478 * If mem is in VIDMEM, return base address in vidmem 1453 * If mem is in VIDMEM, return base address in vidmem
1479 * else return IOVA address for SYSMEM 1454 * else return IOVA address for SYSMEM
1480 */ 1455 */
1481u64 gk20a_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, 1456u64 nvgpu_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem,
1482 u32 flags) 1457 u32 flags)
1483{ 1458{
1484 struct nvgpu_page_alloc *alloc; 1459 struct nvgpu_page_alloc *alloc;
@@ -1580,203 +1555,168 @@ u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl,
1580 return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); 1555 return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl));
1581} 1556}
1582 1557
1583void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry,
1584 size_t w, size_t data)
1585{
1586 nvgpu_mem_wr32(g, &entry->mem, entry->woffset + w, data);
1587}
1588
1589u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry)
1590{
1591 u64 base;
1592
1593 if (g->mm.has_physical_mode)
1594 base = sg_phys(entry->mem.priv.sgt->sgl);
1595 else
1596 base = gk20a_mem_get_base_addr(g, &entry->mem, 0);
1597
1598 return base + entry->woffset * sizeof(u32);
1599}
1600
1601/* for gk20a the "video memory" apertures here are misnomers. */ 1558/* for gk20a the "video memory" apertures here are misnomers. */
1602static inline u32 big_valid_pde0_bits(struct gk20a *g, 1559static inline u32 big_valid_pde0_bits(struct gk20a *g,
1603 struct gk20a_mm_entry *entry) 1560 struct nvgpu_gmmu_pd *pd, u64 addr)
1604{ 1561{
1605 u64 pte_addr = gk20a_pde_addr(g, entry);
1606 u32 pde0_bits = 1562 u32 pde0_bits =
1607 nvgpu_aperture_mask(g, &entry->mem, 1563 nvgpu_aperture_mask(g, &pd->mem,
1608 gmmu_pde_aperture_big_sys_mem_ncoh_f(), 1564 gmmu_pde_aperture_big_sys_mem_ncoh_f(),
1609 gmmu_pde_aperture_big_video_memory_f()) | 1565 gmmu_pde_aperture_big_video_memory_f()) |
1610 gmmu_pde_address_big_sys_f( 1566 gmmu_pde_address_big_sys_f(
1611 (u32)(pte_addr >> gmmu_pde_address_shift_v())); 1567 (u32)(addr >> gmmu_pde_address_shift_v()));
1612 1568
1613 return pde0_bits; 1569 return pde0_bits;
1614} 1570}
1615 1571
1616static inline u32 small_valid_pde1_bits(struct gk20a *g, 1572static inline u32 small_valid_pde1_bits(struct gk20a *g,
1617 struct gk20a_mm_entry *entry) 1573 struct nvgpu_gmmu_pd *pd, u64 addr)
1618{ 1574{
1619 u64 pte_addr = gk20a_pde_addr(g, entry);
1620 u32 pde1_bits = 1575 u32 pde1_bits =
1621 nvgpu_aperture_mask(g, &entry->mem, 1576 nvgpu_aperture_mask(g, &pd->mem,
1622 gmmu_pde_aperture_small_sys_mem_ncoh_f(), 1577 gmmu_pde_aperture_small_sys_mem_ncoh_f(),
1623 gmmu_pde_aperture_small_video_memory_f()) | 1578 gmmu_pde_aperture_small_video_memory_f()) |
1624 gmmu_pde_vol_small_true_f() | /* tbd: why? */ 1579 gmmu_pde_vol_small_true_f() | /* tbd: why? */
1625 gmmu_pde_address_small_sys_f( 1580 gmmu_pde_address_small_sys_f(
1626 (u32)(pte_addr >> gmmu_pde_address_shift_v())); 1581 (u32)(addr >> gmmu_pde_address_shift_v()));
1627 1582
1628 return pde1_bits; 1583 return pde1_bits;
1629} 1584}
1630 1585
1631/* Given the current state of the ptes associated with a pde, 1586static void update_gmmu_pde_locked(struct vm_gk20a *vm,
1632 determine value and write it out. There's no checking 1587 const struct gk20a_mmu_level *l,
1633 here to determine whether or not a change was actually 1588 struct nvgpu_gmmu_pd *pd,
1634 made. So, superfluous updates will cause unnecessary 1589 u32 pd_idx,
1635 pde invalidations. 1590 u64 virt_addr,
1636*/ 1591 u64 phys_addr,
1637static int update_gmmu_pde_locked(struct vm_gk20a *vm, 1592 struct nvgpu_gmmu_attrs *attrs)
1638 struct gk20a_mm_entry *pte,
1639 u32 i, u32 gmmu_pgsz_idx,
1640 struct scatterlist **sgl,
1641 u64 *offset,
1642 u64 *iova,
1643 u32 kind_v, u64 *ctag,
1644 bool cacheable, bool unammped_pte,
1645 int rw_flag, bool sparse, bool priv,
1646 enum nvgpu_aperture aperture)
1647{ 1593{
1648 struct gk20a *g = gk20a_from_vm(vm); 1594 struct gk20a *g = gk20a_from_vm(vm);
1649 bool small_valid, big_valid; 1595 bool small_valid, big_valid;
1650 struct gk20a_mm_entry *entry = vm->pdb.entries + i; 1596 u32 pd_offset = pd_offset_from_index(l, pd_idx);
1651 u32 pde_v[2] = {0, 0}; 1597 u32 pde_v[2] = {0, 0};
1652 u32 pde;
1653 1598
1654 gk20a_dbg_fn(""); 1599 small_valid = attrs->pgsz == gmmu_page_size_small;
1655 1600 big_valid = attrs->pgsz == gmmu_page_size_big;
1656 small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small;
1657 big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big;
1658 1601
1659 pde_v[0] = gmmu_pde_size_full_f(); 1602 pde_v[0] = gmmu_pde_size_full_f();
1660 pde_v[0] |= big_valid ? 1603 pde_v[0] |= big_valid ?
1661 big_valid_pde0_bits(g, entry) : 1604 big_valid_pde0_bits(g, pd, phys_addr) :
1662 gmmu_pde_aperture_big_invalid_f(); 1605 gmmu_pde_aperture_big_invalid_f();
1663 1606
1664 pde_v[1] |= (small_valid ? 1607 pde_v[1] |= (small_valid ? small_valid_pde1_bits(g, pd, phys_addr) :
1665 small_valid_pde1_bits(g, entry) :
1666 (gmmu_pde_aperture_small_invalid_f() | 1608 (gmmu_pde_aperture_small_invalid_f() |
1667 gmmu_pde_vol_small_false_f())) 1609 gmmu_pde_vol_small_false_f()))
1668 | 1610 |
1669 (big_valid ? (gmmu_pde_vol_big_true_f()) : 1611 (big_valid ? (gmmu_pde_vol_big_true_f()) :
1670 gmmu_pde_vol_big_false_f()); 1612 gmmu_pde_vol_big_false_f());
1671 1613
1672 pde = pde_from_index(i); 1614 pte_dbg(g, attrs,
1615 "PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | "
1616 "GPU %#-12llx phys %#-12llx "
1617 "[0x%08x, 0x%08x]",
1618 pd_idx, l->entry_size, pd_offset,
1619 small_valid ? 'S' : '-',
1620 big_valid ? 'B' : '-',
1621 virt_addr, phys_addr,
1622 pde_v[1], pde_v[0]);
1673 1623
1674 gk20a_pde_wr32(g, &vm->pdb, pde + 0, pde_v[0]); 1624 pd_write(g, &vm->pdb, pd_offset + 0, pde_v[0]);
1675 gk20a_pde_wr32(g, &vm->pdb, pde + 1, pde_v[1]); 1625 pd_write(g, &vm->pdb, pd_offset + 1, pde_v[1]);
1626}
1676 1627
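
pd_offset_from_index() and pd_write() are not defined in this diff; assuming they follow the removed pde_from_index()/gk20a_pde_wr32() helpers (word offset = index * entry size / sizeof(u32), write = a 32-bit store into the directory's backing memory), the pair behaves roughly like this sketch:

#include <stdio.h>
#include <stdint.h>

#define PD_WORDS 1024                   /* assumed directory size in u32 words */
static uint32_t pd_mem[PD_WORDS];       /* stand-in for pd->mem */

/* Word offset of entry 'idx' when each entry is 'entry_size' bytes wide
 * (8 bytes matches the two 32-bit words written per PDE above). */
static uint32_t pd_offset_from_index(uint32_t entry_size, uint32_t idx)
{
        return idx * entry_size / sizeof(uint32_t);
}

static void pd_write(uint32_t word, uint32_t data)
{
        pd_mem[word] = data;            /* real code: nvgpu_mem_wr32() */
}

int main(void)
{
        uint32_t off = pd_offset_from_index(8, 5);      /* entry 5, 8-byte PDEs */

        pd_write(off + 0, 0xdeadbeef);  /* low word */
        pd_write(off + 1, 0x00c0ffee);  /* high word */

        printf("entry 5 lives at words %u and %u\n", off, off + 1);
        return 0;
}
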
1677 gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", 1628static void __update_pte_sparse(u32 *pte_w)
1678 i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); 1629{
1679 return 0; 1630 pte_w[0] = gmmu_pte_valid_false_f();
1631 pte_w[1] |= gmmu_pte_vol_true_f();
1680} 1632}
1681 1633
1682static int update_gmmu_pte_locked(struct vm_gk20a *vm, 1634static void __update_pte(struct vm_gk20a *vm,
1683 struct gk20a_mm_entry *pte, 1635 u32 *pte_w,
1684 u32 i, u32 gmmu_pgsz_idx, 1636 u64 phys_addr,
1685 struct scatterlist **sgl, 1637 struct nvgpu_gmmu_attrs *attrs)
1686 u64 *offset,
1687 u64 *iova,
1688 u32 kind_v, u64 *ctag,
1689 bool cacheable, bool unmapped_pte,
1690 int rw_flag, bool sparse, bool priv,
1691 enum nvgpu_aperture aperture)
1692{ 1638{
1693 struct gk20a *g = gk20a_from_vm(vm); 1639 struct gk20a *g = gk20a_from_vm(vm);
1640 u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
1641 u32 pte_valid = attrs->valid ?
1642 gmmu_pte_valid_true_f() :
1643 gmmu_pte_valid_false_f();
1644 u32 phys_shifted = phys_addr >> gmmu_pte_address_shift_v();
1645 u32 addr = attrs->aperture == APERTURE_SYSMEM ?
1646 gmmu_pte_address_sys_f(phys_shifted) :
1647 gmmu_pte_address_vid_f(phys_shifted);
1694 int ctag_shift = ilog2(g->ops.fb.compression_page_size(g)); 1648 int ctag_shift = ilog2(g->ops.fb.compression_page_size(g));
1695 u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
1696 u32 pte_w[2] = {0, 0}; /* invalid pte */
1697
1698 if (*iova) {
1699 u32 pte_valid = unmapped_pte ?
1700 gmmu_pte_valid_false_f() :
1701 gmmu_pte_valid_true_f();
1702 u32 iova_v = *iova >> gmmu_pte_address_shift_v();
1703 u32 pte_addr = aperture == APERTURE_SYSMEM ?
1704 gmmu_pte_address_sys_f(iova_v) :
1705 gmmu_pte_address_vid_f(iova_v);
1706
1707 pte_w[0] = pte_valid | pte_addr;
1708
1709 if (priv)
1710 pte_w[0] |= gmmu_pte_privilege_true_f();
1711
1712 pte_w[1] = __nvgpu_aperture_mask(g, aperture,
1713 gmmu_pte_aperture_sys_mem_ncoh_f(),
1714 gmmu_pte_aperture_video_memory_f()) |
1715 gmmu_pte_kind_f(kind_v) |
1716 gmmu_pte_comptagline_f((u32)(*ctag >> ctag_shift));
1717
1718 if (*ctag && vm->mm->use_full_comp_tag_line && *iova & 0x10000)
1719 pte_w[1] |= gmmu_pte_comptagline_f(
1720 1 << (gmmu_pte_comptagline_s() - 1));
1721
1722 if (rw_flag == gk20a_mem_flag_read_only) {
1723 pte_w[0] |= gmmu_pte_read_only_true_f();
1724 pte_w[1] |=
1725 gmmu_pte_write_disable_true_f();
1726 } else if (rw_flag ==
1727 gk20a_mem_flag_write_only) {
1728 pte_w[1] |=
1729 gmmu_pte_read_disable_true_f();
1730 }
1731 if (!unmapped_pte) {
1732 if (!cacheable)
1733 pte_w[1] |=
1734 gmmu_pte_vol_true_f();
1735 } else {
1736 /* Store cacheable value behind
1737 * gmmu_pte_write_disable_true_f */
1738 if (!cacheable)
1739 pte_w[1] |=
1740 gmmu_pte_write_disable_true_f();
1741 }
1742 1649
1743 gk20a_dbg(gpu_dbg_pte, 1650 pte_w[0] = pte_valid | addr;
1744 "pte=%d iova=0x%llx kind=%d ctag=%d vol=%d [0x%08x, 0x%08x]",
1745 i, *iova,
1746 kind_v, (u32)(*ctag >> ctag_shift), !cacheable,
1747 pte_w[1], pte_w[0]);
1748 1651
1749 if (*ctag) 1652 if (attrs->priv)
1750 *ctag += page_size; 1653 pte_w[0] |= gmmu_pte_privilege_true_f();
1751 } else if (sparse) {
1752 pte_w[0] = gmmu_pte_valid_false_f();
1753 pte_w[1] |= gmmu_pte_vol_true_f();
1754 } else {
1755 gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
1756 }
1757 1654
1758 gk20a_pde_wr32(g, pte, pte_from_index(i) + 0, pte_w[0]); 1655 pte_w[1] = __nvgpu_aperture_mask(g, attrs->aperture,
1759 gk20a_pde_wr32(g, pte, pte_from_index(i) + 1, pte_w[1]); 1656 gmmu_pte_aperture_sys_mem_ncoh_f(),
1760 1657 gmmu_pte_aperture_video_memory_f()) |
1761 if (*iova) { 1658 gmmu_pte_kind_f(attrs->kind_v) |
1762 *iova += page_size; 1659 gmmu_pte_comptagline_f((u32)(attrs->ctag >> ctag_shift));
1763 *offset += page_size; 1660
1764 if (*sgl && *offset + page_size > (*sgl)->length) { 1661 if (attrs->ctag && vm->mm->use_full_comp_tag_line &&
1765 u64 new_iova; 1662 phys_addr & 0x10000)
1766 *sgl = sg_next(*sgl); 1663 pte_w[1] |= gmmu_pte_comptagline_f(
1767 if (*sgl) { 1664 1 << (gmmu_pte_comptagline_s() - 1));
1768 new_iova = sg_phys(*sgl); 1665
1769 gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", 1666 if (attrs->rw_flag == gk20a_mem_flag_read_only) {
1770 new_iova, (*sgl)->length); 1667 pte_w[0] |= gmmu_pte_read_only_true_f();
1771 if (new_iova) { 1668 pte_w[1] |= gmmu_pte_write_disable_true_f();
1772 *offset = 0; 1669 } else if (attrs->rw_flag == gk20a_mem_flag_write_only) {
1773 *iova = new_iova; 1670 pte_w[1] |= gmmu_pte_read_disable_true_f();
1774 }
1775 }
1776 }
1777 } 1671 }
1778 1672
1779 return 0; 1673 if (!attrs->cacheable)
1674 pte_w[1] |= gmmu_pte_vol_true_f();
1675
1676 if (attrs->ctag)
1677 attrs->ctag += page_size;
1678}
1679
1680static void update_gmmu_pte_locked(struct vm_gk20a *vm,
1681 const struct gk20a_mmu_level *l,
1682 struct nvgpu_gmmu_pd *pd,
1683 u32 pd_idx,
1684 u64 virt_addr,
1685 u64 phys_addr,
1686 struct nvgpu_gmmu_attrs *attrs)
1687{
1688 struct gk20a *g = gk20a_from_vm(vm);
1689 u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
1690 u32 pd_offset = pd_offset_from_index(l, pd_idx);
1691 u32 pte_w[2] = {0, 0};
1692 int ctag_shift = ilog2(g->ops.fb.compression_page_size(g));
1693
1694 if (phys_addr)
1695 __update_pte(vm, pte_w, phys_addr, attrs);
1696 else if (attrs->sparse)
1697 __update_pte_sparse(pte_w);
1698
1699 pte_dbg(g, attrs,
1700 "PTE: i=%-4u size=%-2u offs=%-4u | "
1701 "GPU %#-12llx phys %#-12llx "
1702 "pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c "
1703 "ctag=0x%08x "
1704 "[0x%08x, 0x%08x]",
1705 pd_idx, l->entry_size, pd_offset,
1706 virt_addr, phys_addr,
1707 page_size >> 10,
1708 nvgpu_gmmu_perm_str(attrs->rw_flag),
1709 attrs->kind_v,
1710 nvgpu_aperture_str(attrs->aperture),
1711 attrs->valid ? 'V' : '-',
1712 attrs->cacheable ? 'C' : '-',
1713 attrs->sparse ? 'S' : '-',
1714 attrs->priv ? 'P' : '-',
1715 (u32)attrs->ctag >> ctag_shift,
1716 pte_w[1], pte_w[0]);
1717
1718 pd_write(g, pd, pd_offset + 0, pte_w[0]);
1719 pd_write(g, pd, pd_offset + 1, pte_w[1]);
1780} 1720}
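
update_gmmu_pte_locked() now separates three PTE states: a backed PTE (phys_addr != 0, either valid or deliberately marked unmapped), a sparse hole (invalid but volatile, so accesses are squashed rather than faulting), and a plain invalid PTE (both words left zero). The sketch below encodes those three states with invented bit positions; the real field encodings come from the gmmu_pte_* hardware headers:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Invented bit layout for illustration only. */
#define PTE_VALID       (1u << 0)
#define PTE_VOL         (1u << 1)       /* volatile, i.e. not L2 cached */
#define PTE_ADDR_SHIFT  8               /* address field assumed at bit 8 */

static void build_pte(uint32_t pte_w[2], uint64_t phys, bool valid, bool sparse)
{
        pte_w[0] = 0;
        pte_w[1] = 0;

        if (phys) {
                /* Backed PTE: address plus (optionally) the valid bit. */
                pte_w[0] = (valid ? PTE_VALID : 0) |
                           ((uint32_t)(phys >> 12) << PTE_ADDR_SHIFT);
        } else if (sparse) {
                /* Sparse hole: invalid but volatile. */
                pte_w[1] |= PTE_VOL;
        }
        /* else: leave both words zero -> ordinary invalid PTE. */
}

int main(void)
{
        uint32_t w[2];

        build_pte(w, 0x80001000ULL, true, false);
        printf("mapped:  [0x%08x, 0x%08x]\n", w[1], w[0]);

        build_pte(w, 0, false, true);
        printf("sparse:  [0x%08x, 0x%08x]\n", w[1], w[0]);

        build_pte(w, 0, false, false);
        printf("invalid: [0x%08x, 0x%08x]\n", w[1], w[0]);
        return 0;
}
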
1781 1721
1782/* NOTE! mapped_buffers lock must be held */ 1722/* NOTE! mapped_buffers lock must be held */
@@ -1809,13 +1749,6 @@ void nvgpu_vm_unmap_locked(struct nvgpu_mapped_buf *mapped_buffer,
1809 mapped_buffer->vm_area->sparse : false, 1749 mapped_buffer->vm_area->sparse : false,
1810 batch); 1750 batch);
1811 1751
1812 gk20a_dbg(gpu_dbg_map,
1813 "gv: 0x%04x_%08x pgsz=%-3dKb as=%-2d own_mem_ref=%d",
1814 u64_hi32(mapped_buffer->addr), u64_lo32(mapped_buffer->addr),
1815 vm->gmmu_page_sizes[mapped_buffer->pgsz_idx] >> 10,
1816 vm_aspace_id(vm),
1817 mapped_buffer->own_mem_ref);
1818
1819 gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf, 1752 gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf,
1820 mapped_buffer->sgt); 1753 mapped_buffer->sgt);
1821 1754
@@ -1942,6 +1875,9 @@ int __gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch)
1942 if (err) 1875 if (err)
1943 ch->vm = NULL; 1876 ch->vm = NULL;
1944 1877
1878 nvgpu_log(gk20a_from_vm(vm), gpu_dbg_map, "Binding ch=%d -> VM:%s",
1879 ch->chid, vm->name);
1880
1945 return err; 1881 return err;
1946} 1882}
1947 1883
@@ -2114,7 +2050,7 @@ u64 gk20a_mm_inst_block_addr(struct gk20a *g, struct nvgpu_mem *inst_block)
2114 if (g->mm.has_physical_mode) 2050 if (g->mm.has_physical_mode)
2115 addr = gk20a_mem_phys(inst_block); 2051 addr = gk20a_mem_phys(inst_block);
2116 else 2052 else
2117 addr = gk20a_mem_get_base_addr(g, inst_block, 0); 2053 addr = nvgpu_mem_get_base_addr(g, inst_block, 0);
2118 2054
2119 return addr; 2055 return addr;
2120} 2056}
@@ -2237,7 +2173,7 @@ static int gk20a_init_ce_vm(struct mm_gk20a *mm)
2237void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, 2173void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
2238 struct vm_gk20a *vm) 2174 struct vm_gk20a *vm)
2239{ 2175{
2240 u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0); 2176 u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0);
2241 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); 2177 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
2242 u32 pdb_addr_hi = u64_hi32(pdb_addr); 2178 u32 pdb_addr_hi = u64_hi32(pdb_addr);
2243 2179
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index cf37640d..a245d0e0 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -42,12 +42,6 @@
42 outer_flush_range(pa, pa + (size_t)(size)); \ 42 outer_flush_range(pa, pa + (size_t)(size)); \
43 } while (0) 43 } while (0)
44 44
45enum gk20a_mem_rw_flag {
46 gk20a_mem_flag_none = 0,
47 gk20a_mem_flag_read_only = 1,
48 gk20a_mem_flag_write_only = 2,
49};
50
51struct gpfifo_desc { 45struct gpfifo_desc {
52 struct nvgpu_mem mem; 46 struct nvgpu_mem mem;
53 u32 entry_num; 47 u32 entry_num;
@@ -347,7 +341,7 @@ int gk20a_mm_suspend(struct gk20a *g);
347u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, 341u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl,
348 u32 flags); 342 u32 flags);
349u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova); 343u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova);
350u64 gk20a_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, 344u64 nvgpu_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem,
351 u32 flags); 345 u32 flags);
352 346
353void gk20a_mm_ltc_isr(struct gk20a *g); 347void gk20a_mm_ltc_isr(struct gk20a *g);
@@ -371,10 +365,6 @@ static inline phys_addr_t gk20a_mem_phys(struct nvgpu_mem *mem)
371 return 0; 365 return 0;
372} 366}
373 367
374void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry,
375 size_t w, size_t data);
376u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry);
377
378u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, 368u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
379 u64 map_offset, 369 u64 map_offset,
380 struct sg_table *sgt, 370 struct sg_table *sgt,
@@ -451,8 +441,4 @@ int gk20a_mm_get_buffer_info(struct device *dev, int dmabuf_fd,
451 u64 *buffer_id, u64 *buffer_len); 441 u64 *buffer_id, u64 *buffer_len);
452void gk20a_vm_unmap_locked_kref(struct kref *ref); 442void gk20a_vm_unmap_locked_kref(struct kref *ref);
453 443
454void gk20a_vm_free_entries(struct vm_gk20a *vm,
455 struct gk20a_mm_entry *parent,
456 int level);
457
458#endif /* MM_GK20A_H */ 444#endif /* MM_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
index d7391c6d..c3867e9d 100644
--- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
@@ -14,6 +14,7 @@
14 */ 14 */
15 15
16#include <nvgpu/dma.h> 16#include <nvgpu/dma.h>
17#include <nvgpu/gmmu.h>
17 18
18#include "gk20a/gk20a.h" 19#include "gk20a/gk20a.h"
19#include "gk20a/platform_gk20a.h" 20#include "gk20a/platform_gk20a.h"
@@ -149,206 +150,186 @@ static u64 gp10b_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl,
149 return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); 150 return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl));
150} 151}
151 152
152static u32 pde3_from_index(u32 i) 153static void update_gmmu_pde3_locked(struct vm_gk20a *vm,
153{ 154 const struct gk20a_mmu_level *l,
154 return i * gmmu_new_pde__size_v() / sizeof(u32); 155 struct nvgpu_gmmu_pd *pd,
155} 156 u32 pd_idx,
156 157 u64 virt_addr,
157static u32 pte3_from_index(u32 i) 158 u64 phys_addr,
158{ 159 struct nvgpu_gmmu_attrs *attrs)
159 return i * gmmu_new_pte__size_v() / sizeof(u32);
160}
161
162static int update_gmmu_pde3_locked(struct vm_gk20a *vm,
163 struct gk20a_mm_entry *parent,
164 u32 i, u32 gmmu_pgsz_idx,
165 struct scatterlist **sgl,
166 u64 *offset,
167 u64 *iova,
168 u32 kind_v, u64 *ctag,
169 bool cacheable, bool unmapped_pte,
170 int rw_flag, bool sparse, bool priv,
171 enum nvgpu_aperture aperture)
172{ 160{
173 struct gk20a *g = gk20a_from_vm(vm); 161 struct gk20a *g = gk20a_from_vm(vm);
174 u64 pte_addr = 0; 162 u32 pd_offset = pd_offset_from_index(l, pd_idx);
175 struct gk20a_mm_entry *pte = parent->entries + i;
176 u32 pde_v[2] = {0, 0}; 163 u32 pde_v[2] = {0, 0};
177 u32 pde;
178
179 gk20a_dbg_fn("");
180 164
181 pte_addr = gk20a_pde_addr(g, pte) >> gmmu_new_pde_address_shift_v(); 165 phys_addr >>= gmmu_new_pde_address_shift_v();
182 166
183 pde_v[0] |= nvgpu_aperture_mask(g, &pte->mem, 167 pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
184 gmmu_new_pde_aperture_sys_mem_ncoh_f(), 168 gmmu_new_pde_aperture_sys_mem_ncoh_f(),
185 gmmu_new_pde_aperture_video_memory_f()); 169 gmmu_new_pde_aperture_video_memory_f());
186 pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(pte_addr)); 170 pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr));
187 pde_v[0] |= gmmu_new_pde_vol_true_f(); 171 pde_v[0] |= gmmu_new_pde_vol_true_f();
188 pde_v[1] |= pte_addr >> 24; 172 pde_v[1] |= phys_addr >> 24;
189 pde = pde3_from_index(i); 173
190 174 pd_write(g, pd, pd_offset + 0, pde_v[0]);
191 gk20a_pde_wr32(g, parent, pde + 0, pde_v[0]); 175 pd_write(g, pd, pd_offset + 1, pde_v[1]);
192 gk20a_pde_wr32(g, parent, pde + 1, pde_v[1]); 176
193 177 pte_dbg(g, attrs,
194 gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", 178 "PDE: i=%-4u size=%-2u offs=%-4u pgsz: -- | "
195 i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); 179 "GPU %#-12llx phys %#-12llx "
196 gk20a_dbg_fn("done"); 180 "[0x%08x, 0x%08x]",
197 return 0; 181 pd_idx, l->entry_size, pd_offset,
182 virt_addr, phys_addr,
183 pde_v[1], pde_v[0]);
198} 184}
199 185
200static u32 pde0_from_index(u32 i) 186static void update_gmmu_pde0_locked(struct vm_gk20a *vm,
201{ 187 const struct gk20a_mmu_level *l,
202 return i * gmmu_new_dual_pde__size_v() / sizeof(u32); 188 struct nvgpu_gmmu_pd *pd,
203} 189 u32 pd_idx,
204 190 u64 virt_addr,
205static int update_gmmu_pde0_locked(struct vm_gk20a *vm, 191 u64 phys_addr,
206 struct gk20a_mm_entry *pte, 192 struct nvgpu_gmmu_attrs *attrs)
207 u32 i, u32 gmmu_pgsz_idx,
208 struct scatterlist **sgl,
209 u64 *offset,
210 u64 *iova,
211 u32 kind_v, u64 *ctag,
212 bool cacheable, bool unmapped_pte,
213 int rw_flag, bool sparse, bool priv,
214 enum nvgpu_aperture aperture)
215{ 193{
216 struct gk20a *g = gk20a_from_vm(vm); 194 struct gk20a *g = gk20a_from_vm(vm);
217 bool small_valid, big_valid; 195 bool small_valid, big_valid;
218 u32 pte_addr_small = 0, pte_addr_big = 0; 196 u32 small_addr = 0, big_addr = 0;
219 struct gk20a_mm_entry *entry = pte->entries + i; 197 u32 pd_offset = pd_offset_from_index(l, pd_idx);
220 u32 pde_v[4] = {0, 0, 0, 0}; 198 u32 pde_v[4] = {0, 0, 0, 0};
221 u32 pde;
222
223 gk20a_dbg_fn("");
224 199
225 small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small; 200 small_valid = attrs->pgsz == gmmu_page_size_small;
226 big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big; 201 big_valid = attrs->pgsz == gmmu_page_size_big;
227 202
228 if (small_valid) { 203 if (small_valid)
229 pte_addr_small = gk20a_pde_addr(g, entry) 204 small_addr = phys_addr >> gmmu_new_dual_pde_address_shift_v();
230 >> gmmu_new_dual_pde_address_shift_v();
231 }
232 205
233 if (big_valid) 206 if (big_valid)
234 pte_addr_big = gk20a_pde_addr(g, entry) 207 big_addr = phys_addr >> gmmu_new_dual_pde_address_big_shift_v();
235 >> gmmu_new_dual_pde_address_big_shift_v();
236 208
237 if (small_valid) { 209 if (small_valid) {
238 pde_v[2] |= gmmu_new_dual_pde_address_small_sys_f(pte_addr_small); 210 pde_v[2] |=
239 pde_v[2] |= nvgpu_aperture_mask(g, &entry->mem, 211 gmmu_new_dual_pde_address_small_sys_f(small_addr);
212 pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem,
240 gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), 213 gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(),
241 gmmu_new_dual_pde_aperture_small_video_memory_f()); 214 gmmu_new_dual_pde_aperture_small_video_memory_f());
242 pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); 215 pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f();
243 pde_v[3] |= pte_addr_small >> 24; 216 pde_v[3] |= small_addr >> 24;
244 } 217 }
245 218
246 if (big_valid) { 219 if (big_valid) {
247 pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(pte_addr_big); 220 pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr);
248 pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); 221 pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f();
249 pde_v[0] |= nvgpu_aperture_mask(g, &entry->mem, 222 pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
250 gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), 223 gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(),
251 gmmu_new_dual_pde_aperture_big_video_memory_f()); 224 gmmu_new_dual_pde_aperture_big_video_memory_f());
252 pde_v[1] |= pte_addr_big >> 28; 225 pde_v[1] |= big_addr >> 28;
253 } 226 }
254 227
255 pde = pde0_from_index(i); 228 pd_write(g, pd, pd_offset + 0, pde_v[0]);
256 229 pd_write(g, pd, pd_offset + 1, pde_v[1]);
257 gk20a_pde_wr32(g, pte, pde + 0, pde_v[0]); 230 pd_write(g, pd, pd_offset + 2, pde_v[2]);
258 gk20a_pde_wr32(g, pte, pde + 1, pde_v[1]); 231 pd_write(g, pd, pd_offset + 3, pde_v[3]);
259 gk20a_pde_wr32(g, pte, pde + 2, pde_v[2]); 232
260 gk20a_pde_wr32(g, pte, pde + 3, pde_v[3]); 233 pte_dbg(g, attrs,
261 234 "PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | "
262 gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d [0x%08x, 0x%08x, 0x%x, 0x%08x]", 235 "GPU %#-12llx phys %#-12llx "
263 i, gmmu_pgsz_idx, pde_v[3], pde_v[2], pde_v[1], pde_v[0]); 236 "[0x%08x, 0x%08x, 0x%08x, 0x%08x]",
264 gk20a_dbg_fn("done"); 237 pd_idx, l->entry_size, pd_offset,
265 return 0; 238 small_valid ? 'S' : '-',
239 big_valid ? 'B' : '-',
240 virt_addr, phys_addr,
241 pde_v[3], pde_v[2], pde_v[1], pde_v[0]);
266} 242}
267 243
268static int update_gmmu_pte_locked(struct vm_gk20a *vm, 244static void __update_pte(struct vm_gk20a *vm,
269 struct gk20a_mm_entry *pte, 245 u32 *pte_w,
270 u32 i, u32 gmmu_pgsz_idx, 246 u64 phys_addr,
271 struct scatterlist **sgl, 247 struct nvgpu_gmmu_attrs *attrs)
272 u64 *offset,
273 u64 *iova,
274 u32 kind_v, u64 *ctag,
275 bool cacheable, bool unmapped_pte,
276 int rw_flag, bool sparse, bool priv,
277 enum nvgpu_aperture aperture)
278{ 248{
279 struct gk20a *g = vm->mm->g; 249 struct gk20a *g = gk20a_from_vm(vm);
280 u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
281 u64 ctag_granularity = g->ops.fb.compression_page_size(g); 250 u64 ctag_granularity = g->ops.fb.compression_page_size(g);
282 u32 pte_w[2] = {0, 0}; /* invalid pte */ 251 u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
283 u32 pte_i; 252 u32 pte_valid = attrs->valid ?
284 253 gmmu_new_pte_valid_true_f() :
285 if (*iova) { 254 gmmu_new_pte_valid_false_f();
286 u32 pte_valid = unmapped_pte ? 255 u32 phys_shifted = phys_addr >> gmmu_new_pte_address_shift_v();
287 gmmu_new_pte_valid_false_f() : 256 u32 pte_addr = attrs->aperture == APERTURE_SYSMEM ?
288 gmmu_new_pte_valid_true_f(); 257 gmmu_new_pte_address_sys_f(phys_shifted) :
289 u32 iova_v = *iova >> gmmu_new_pte_address_shift_v(); 258 gmmu_new_pte_address_vid_f(phys_shifted);
290 u32 pte_addr = aperture == APERTURE_SYSMEM ? 259 u32 pte_tgt = __nvgpu_aperture_mask(g, attrs->aperture,
291 gmmu_new_pte_address_sys_f(iova_v) : 260 gmmu_new_pte_aperture_sys_mem_ncoh_f(),
292 gmmu_new_pte_address_vid_f(iova_v); 261 gmmu_new_pte_aperture_video_memory_f());
293 u32 pte_tgt = __nvgpu_aperture_mask(g, aperture, 262
294 gmmu_new_pte_aperture_sys_mem_ncoh_f(), 263 pte_w[0] = pte_valid | pte_addr | pte_tgt;
295 gmmu_new_pte_aperture_video_memory_f()); 264
296 265 if (attrs->priv)
297 pte_w[0] = pte_valid | pte_addr | pte_tgt; 266 pte_w[0] |= gmmu_new_pte_privilege_true_f();
298 267
299 if (priv) 268 pte_w[1] = phys_addr >> (24 + gmmu_new_pte_address_shift_v()) |
300 pte_w[0] |= gmmu_new_pte_privilege_true_f(); 269 gmmu_new_pte_kind_f(attrs->kind_v) |
301 270 gmmu_new_pte_comptagline_f((u32)(attrs->ctag /
302 pte_w[1] = *iova >> (24 + gmmu_new_pte_address_shift_v()) | 271 ctag_granularity));
303 gmmu_new_pte_kind_f(kind_v) | 272
304 gmmu_new_pte_comptagline_f((u32)(*ctag / ctag_granularity)); 273 if (attrs->rw_flag == gk20a_mem_flag_read_only)
305 274 pte_w[0] |= gmmu_new_pte_read_only_true_f();
306 if (rw_flag == gk20a_mem_flag_read_only) 275
307 pte_w[0] |= gmmu_new_pte_read_only_true_f(); 276 if (!attrs->valid && !attrs->cacheable)
308 if (unmapped_pte && !cacheable) 277 pte_w[0] |= gmmu_new_pte_read_only_true_f();
309 pte_w[0] |= gmmu_new_pte_read_only_true_f(); 278 else if (!attrs->cacheable)
310 else if (!cacheable)
311 pte_w[0] |= gmmu_new_pte_vol_true_f();
312
313 gk20a_dbg(gpu_dbg_pte, "pte=%d iova=0x%llx kind=%d"
314 " ctag=%d vol=%d"
315 " [0x%08x, 0x%08x]",
316 i, *iova,
317 kind_v, (u32)(*ctag / ctag_granularity), !cacheable,
318 pte_w[1], pte_w[0]);
319
320 if (*ctag)
321 *ctag += page_size;
322 } else if (sparse) {
323 pte_w[0] = gmmu_new_pte_valid_false_f();
324 pte_w[0] |= gmmu_new_pte_vol_true_f(); 279 pte_w[0] |= gmmu_new_pte_vol_true_f();
325 } else {
326 gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
327 }
328 280
329 pte_i = pte3_from_index(i); 281 if (attrs->ctag)
330 282 attrs->ctag += page_size;
331 gk20a_pde_wr32(g, pte, pte_i + 0, pte_w[0]); 283
332 gk20a_pde_wr32(g, pte, pte_i + 1, pte_w[1]); 284}
333 285
334 if (*iova) { 286static void __update_pte_sparse(u32 *pte_w)
335 *iova += page_size; 287{
336 *offset += page_size; 288 pte_w[0] = gmmu_new_pte_valid_false_f();
337 if (*sgl && *offset + page_size > (*sgl)->length) { 289 pte_w[0] |= gmmu_new_pte_vol_true_f();
338 u64 new_iova; 290}
339 *sgl = sg_next(*sgl); 291
340 if (*sgl) { 292static void update_gmmu_pte_locked(struct vm_gk20a *vm,
341 new_iova = sg_phys(*sgl); 293 const struct gk20a_mmu_level *l,
342 gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", 294 struct nvgpu_gmmu_pd *pd,
343 new_iova, (*sgl)->length); 295 u32 pd_idx,
344 if (new_iova) { 296 u64 virt_addr,
345 *offset = 0; 297 u64 phys_addr,
346 *iova = new_iova; 298 struct nvgpu_gmmu_attrs *attrs)
347 } 299{
348 } 300 struct gk20a *g = vm->mm->g;
349 } 301 u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
350 } 302 u32 pd_offset = pd_offset_from_index(l, pd_idx);
351 return 0; 303 u32 pte_w[2] = {0, 0};
304
305 if (phys_addr)
306 __update_pte(vm, pte_w, phys_addr, attrs);
307 else if (attrs->sparse)
308 __update_pte_sparse(pte_w);
309
310 pte_dbg(g, attrs,
311 "vm=%s "
312 "PTE: i=%-4u size=%-2u offs=%-4u | "
313 "GPU %#-12llx phys %#-12llx "
314 "pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c "
315 "ctag=0x%08x "
316 "[0x%08x, 0x%08x]",
317 vm->name,
318 pd_idx, l->entry_size, pd_offset,
319 virt_addr, phys_addr,
320 page_size >> 10,
321 nvgpu_gmmu_perm_str(attrs->rw_flag),
322 attrs->kind_v,
323 nvgpu_aperture_str(attrs->aperture),
324 attrs->valid ? 'V' : '-',
325 attrs->cacheable ? 'C' : '-',
326 attrs->sparse ? 'S' : '-',
327 attrs->priv ? 'P' : '-',
328 (u32)attrs->ctag / g->ops.fb.compression_page_size(g),
329 pte_w[1], pte_w[0]);
330
331 pd_write(g, pd, pd_offset + 0, pte_w[0]);
332 pd_write(g, pd, pd_offset + 1, pte_w[1]);
352} 333}
353 334
354static const struct gk20a_mmu_level gp10b_mm_levels[] = { 335static const struct gk20a_mmu_level gp10b_mm_levels[] = {
@@ -384,7 +365,7 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g,
384static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, 365static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
385 struct vm_gk20a *vm) 366 struct vm_gk20a *vm)
386{ 367{
387 u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0); 368 u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0);
388 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); 369 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
389 u32 pdb_addr_hi = u64_hi32(pdb_addr); 370 u32 pdb_addr_hi = u64_hi32(pdb_addr);
390 371
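
A minimal, standalone sketch of the two-word address split that update_gmmu_pde3_locked() performs above. The shift value (12) and the raw field placement are assumptions for illustration only; the real code goes through the generated gmmu_new_pde_address_shift_v()/gmmu_new_pde_address_sys_f() helpers and also ORs in the aperture and volatile bits omitted here.

/*
 * Illustrative only: mirrors the "phys_addr >>= shift; word0 gets the low
 * bits, word1 gets the shifted address >> 24" arithmetic, not the actual
 * PDE field encoding.
 */
#include <stdint.h>
#include <stdio.h>

#define ASSUMED_PDE_ADDRESS_SHIFT 12	/* assumption; real value from HW headers */

static void split_pde_address(uint64_t phys_addr, uint32_t pde_v[2])
{
	uint64_t shifted = phys_addr >> ASSUMED_PDE_ADDRESS_SHIFT;

	/* Low 32 bits of the shifted address land in word 0 ... */
	pde_v[0] = (uint32_t)shifted;
	/* ... and everything above bit 24 of the shifted address in word 1. */
	pde_v[1] = (uint32_t)(shifted >> 24);
}

int main(void)
{
	uint32_t pde_v[2];

	split_pde_address(0x123456789000ULL, pde_v);
	/* Prints pde_v[1]=0x00000001 pde_v[0]=0x23456789 */
	printf("pde_v[1]=0x%08x pde_v[0]=0x%08x\n", pde_v[1], pde_v[0]);
	return 0;
}
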
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index ed152cd8..28a2cb82 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -38,36 +38,97 @@ enum gmmu_pgsz_gk20a {
38 gmmu_nr_page_sizes = 3, 38 gmmu_nr_page_sizes = 3,
39}; 39};
40 40
41struct gk20a_mm_entry { 41enum gk20a_mem_rw_flag {
42 /* backing for */ 42 gk20a_mem_flag_none = 0, /* RW */
43 struct nvgpu_mem mem; 43 gk20a_mem_flag_read_only = 1, /* RO */
44 u32 woffset; /* if >0, mem is a shadow copy, owned by another entry */ 44 gk20a_mem_flag_write_only = 2, /* WO */
45 int pgsz; 45};
46 struct gk20a_mm_entry *entries; 46
47 int num_entries; 47/*
48 * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs
49 * in the GMMU.
50 */
51struct nvgpu_gmmu_pd {
52 /*
 53 * DMA memory describing the PDEs or PTEs.
54 */
55 struct nvgpu_mem mem;
56
57 /*
58 * List of pointers to the next level of page tables. Does not
59 * need to be populated when this PD is pointing to PTEs.
60 */
61 struct nvgpu_gmmu_pd *entries;
62 int num_entries;
63};
64
65/*
66 * Reduce the number of arguments getting passed through the various levels of
67 * GMMU mapping functions.
68 *
69 * The following fields are set statically and do not change throughout
70 * mapping call:
71 *
72 * pgsz: Index into the page size table.
73 * kind_v: Kind attributes for mapping.
74 * cacheable: Cacheability of the mapping.
75 * rw_flag: Flag from enum gk20a_mem_rw_flag
76 * sparse: Set if the mapping should be sparse.
 77 * priv: Privileged mapping.
78 * valid: Set if the PTE should be marked valid.
79 * aperture: VIDMEM or SYSMEM.
 80 * debug: When set, print debugging info.
81 *
82 * These fields are dynamically updated as necessary during the map:
83 *
84 * ctag: Comptag line in the comptag cache;
85 * updated every time we write a PTE.
86 */
87struct nvgpu_gmmu_attrs {
88 u32 pgsz;
89 u32 kind_v;
90 u64 ctag;
91 bool cacheable;
92 int rw_flag;
93 bool sparse;
94 bool priv;
95 bool valid;
96 enum nvgpu_aperture aperture;
97 bool debug;
48}; 98};
49 99
50struct gk20a_mmu_level { 100struct gk20a_mmu_level {
51 int hi_bit[2]; 101 int hi_bit[2];
52 int lo_bit[2]; 102 int lo_bit[2];
53 int (*update_entry)(struct vm_gk20a *vm, 103
54 struct gk20a_mm_entry *pte, 104 /*
55 u32 i, u32 gmmu_pgsz_idx, 105 * Build map from virt_addr -> phys_addr.
56 struct scatterlist **sgl, 106 */
57 u64 *offset, 107 void (*update_entry)(struct vm_gk20a *vm,
58 u64 *iova, 108 const struct gk20a_mmu_level *l,
59 u32 kind_v, u64 *ctag, 109 struct nvgpu_gmmu_pd *pd,
60 bool cacheable, bool unmapped_pte, 110 u32 pd_idx,
61 int rw_flag, bool sparse, bool priv, 111 u64 phys_addr,
62 enum nvgpu_aperture aperture); 112 u64 virt_addr,
63 size_t entry_size; 113 struct nvgpu_gmmu_attrs *attrs);
114 u32 entry_size;
64}; 115};
65 116
66int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm, 117static inline const char *nvgpu_gmmu_perm_str(enum gk20a_mem_rw_flag p)
67 enum gmmu_pgsz_gk20a pgsz_idx, 118{
68 const struct gk20a_mmu_level *l, 119 switch (p) {
69 struct gk20a_mm_entry *entry, 120 case gk20a_mem_flag_none:
70 struct gk20a_mm_entry *prev_entry); 121 return "RW";
122 case gk20a_mem_flag_write_only:
123 return "WO";
124 case gk20a_mem_flag_read_only:
125 return "RO";
126 default:
127 return "??";
128 }
129}
130
131int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm);
71 132
72/** 133/**
73 * nvgpu_gmmu_map - Map memory into the GMMU. 134 * nvgpu_gmmu_map - Map memory into the GMMU.
@@ -106,6 +167,33 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
106 u64 gpu_va); 167 u64 gpu_va);
107 168
108void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, 169void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
109 struct gk20a_mm_entry *entry); 170 struct nvgpu_gmmu_pd *entry);
171
172/*
173 * Some useful routines that are shared across chips.
174 */
175static inline u32 pd_offset_from_index(const struct gk20a_mmu_level *l,
176 u32 pd_idx)
177{
178 return (pd_idx * l->entry_size) / sizeof(u32);
179}
180
181static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
182 size_t w, size_t data)
183{
184 nvgpu_mem_wr32(g, &pd->mem, w, data);
185}
186
187
188/*
189 * Internal debugging routines. Probably not something you want to use.
190 */
191#define pte_dbg(g, attrs, fmt, args...) \
192 do { \
193 if (attrs && attrs->debug) \
194 nvgpu_info(g, fmt, ##args); \
195 else \
196 nvgpu_log(g, gpu_dbg_pte, fmt, ##args); \
197 } while (0)
110 198
111#endif 199#endif
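
A minimal sketch of the word-offset math behind pd_offset_from_index() and pd_write() above: a directory index is scaled by the entry size in bytes and converted to a 32-bit word offset, and the words of one entry are then written at consecutive offsets. The 8-byte (PDE3/PTE-sized) and 16-byte (dual PDE0-sized) entry sizes are assumptions for illustration; the driver takes entry_size from the gk20a_mmu_level table.

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as pd_offset_from_index(), with entry_size passed directly. */
static uint32_t word_offset_from_index(uint32_t entry_size, uint32_t idx)
{
	return (idx * entry_size) / (uint32_t)sizeof(uint32_t);
}

int main(void)
{
	/* 8-byte entries: index 3 starts at word 6. */
	printf("8-byte entry,  idx 3 -> word %u\n", word_offset_from_index(8, 3));
	/* 16-byte entries: index 3 starts at word 12. */
	printf("16-byte entry, idx 3 -> word %u\n", word_offset_from_index(16, 3));
	return 0;
}
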
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
index 66d04ab8..4259d40f 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h
@@ -109,9 +109,9 @@ nvgpu_mem_from_clear_list_entry(struct nvgpu_list_node *node)
109static inline const char *nvgpu_aperture_str(enum nvgpu_aperture aperture) 109static inline const char *nvgpu_aperture_str(enum nvgpu_aperture aperture)
110{ 110{
111 switch (aperture) { 111 switch (aperture) {
112 case APERTURE_INVALID: return "invalid"; 112 case APERTURE_INVALID: return "INVAL";
113 case APERTURE_SYSMEM: return "sysmem"; 113 case APERTURE_SYSMEM: return "SYSMEM";
114 case APERTURE_VIDMEM: return "vidmem"; 114 case APERTURE_VIDMEM: return "VIDMEM";
115 }; 115 };
116 return "UNKNOWN"; 116 return "UNKNOWN";
117} 117}
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vm.h b/drivers/gpu/nvgpu/include/nvgpu/vm.h
index f6d88cc3..255b4361 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vm.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vm.h
@@ -126,6 +126,7 @@ mapped_buffer_from_rbtree_node(struct nvgpu_rbtree_node *node)
126struct vm_gk20a { 126struct vm_gk20a {
127 struct mm_gk20a *mm; 127 struct mm_gk20a *mm;
128 struct gk20a_as_share *as_share; /* as_share this represents */ 128 struct gk20a_as_share *as_share; /* as_share this represents */
129 char name[20];
129 130
130 u64 va_start; 131 u64 va_start;
131 u64 va_limit; 132 u64 va_limit;
@@ -145,7 +146,7 @@ struct vm_gk20a {
145 146
146 struct nvgpu_mutex update_gmmu_lock; 147 struct nvgpu_mutex update_gmmu_lock;
147 148
148 struct gk20a_mm_entry pdb; 149 struct nvgpu_gmmu_pd pdb;
149 150
150 /* 151 /*
151 * These structs define the address spaces. In some cases it's possible 152 * These structs define the address spaces. In some cases it's possible