path: root/drivers/gpu/nvgpu/common/mm/gmmu.c
author		Alex Waterman <alexw@nvidia.com>	2017-05-11 16:59:22 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-07-06 17:44:15 -0400
commit		c1393d5b68e63c992f4c689cb788139fdf8c2f1a (patch)
tree		00a588d35342d75c05fed7733e91da753ba640fb /drivers/gpu/nvgpu/common/mm/gmmu.c
parent		84f712dee8b582dd7d2a19345c621a2ae3bd6292 (diff)
gpu: nvgpu: gmmu programming rewrite
Update the high level mapping logic. Instead of iterating over the GPU VA, iterate over the scatter-gather table chunks. As a result, each GMMU page table update call is simplified dramatically.

This also modifies the chip level code to no longer require an SGL as an argument. Each call to the chip level code is guaranteed to be contiguous, so it only has to worry about making a mapping from virt -> phys. This removes the dependency on Linux that the chip code currently has. With this patch the core GMMU code still uses the Linux SGL, but the logic is highly transferable to a different, nvgpu-specific scatter-gather list format in the near future.

The last major update is to push most of the page table attribute arguments into a struct. That struct is passed on through the various mapping levels, which makes the function calls simpler and easier to follow.

JIRA NVGPU-30

Change-Id: Ibb6b11755f99818fe642622ca0bd4cbed054f602
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master/r/1484104
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
Diffstat (limited to 'drivers/gpu/nvgpu/common/mm/gmmu.c')
-rw-r--r--	drivers/gpu/nvgpu/common/mm/gmmu.c	976
1 file changed, 591 insertions, 385 deletions
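The commit message above centers on collapsing the long per-mapping argument lists into a single attribute struct. As a quick orientation before the hunks, here is a condensed sketch of that interface, assembled from identifiers that appear in the diff below (struct nvgpu_gmmu_attrs and __nvgpu_gmmu_update_page_table() are introduced by this patch); it paraphrases gk20a_locked_gmmu_map() rather than quoting it:

	/* All per-mapping attributes now travel in one struct... */
	struct nvgpu_gmmu_attrs attrs = {
		.pgsz      = pgsz_idx,
		.kind_v    = kind_v,
		.ctag      = (u64)ctag_offset * (u64)ctag_granularity,
		.cacheable = flags & NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
		.rw_flag   = rw_flag,
		.sparse    = sparse,
		.priv      = priv,
		.valid     = !(flags & NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE),
		.aperture  = aperture,
	};

	/* ...and a single call walks every PD/PTE level for the whole range. */
	err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset,
					     vaddr, size, &attrs);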
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c
index 06291600..ec1bc095 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu.c
@@ -25,115 +25,26 @@
25#include "gk20a/gk20a.h" 25#include "gk20a/gk20a.h"
26#include "gk20a/mm_gk20a.h" 26#include "gk20a/mm_gk20a.h"
27 27
28#define gmmu_dbg(g, fmt, args...) \ 28#define __gmmu_dbg(g, attrs, fmt, args...) \
29 nvgpu_log(g, gpu_dbg_map, fmt, ##args) 29 do { \
30#define gmmu_dbg_v(g, fmt, args...) \ 30 if (attrs->debug) \
31 nvgpu_log(g, gpu_dbg_map_v, fmt, ##args) 31 nvgpu_info(g, fmt, ##args); \
32 32 else \
33static int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) 33 nvgpu_log(g, gpu_dbg_map, fmt, ##args); \
34{ 34 } while (0)
35 return nvgpu_mem_begin(g, &entry->mem); 35
36} 36#define __gmmu_dbg_v(g, attrs, fmt, args...) \
37 37 do { \
38static void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) 38 if (attrs->debug) \
39{ 39 nvgpu_info(g, fmt, ##args); \
40 nvgpu_mem_end(g, &entry->mem); 40 else \
41} 41 nvgpu_log(g, gpu_dbg_map_v, fmt, ##args); \
42 42 } while (0)
43static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 order, 43
44 struct gk20a_mm_entry *entry) 44static int pd_allocate(struct vm_gk20a *vm,
45{ 45 struct nvgpu_gmmu_pd *pd,
46 struct gk20a *g = gk20a_from_vm(vm); 46 const struct gk20a_mmu_level *l,
47 u32 num_pages = 1 << order; 47 struct nvgpu_gmmu_attrs *attrs);
48 u32 len = num_pages * PAGE_SIZE;
49 int err;
50
51 err = nvgpu_dma_alloc(g, len, &entry->mem);
52
53 if (err) {
54 nvgpu_err(g, "memory allocation failed");
55 return -ENOMEM;
56 }
57
58 return 0;
59}
60
61void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
62 struct gk20a_mm_entry *entry)
63{
64 struct gk20a *g = gk20a_from_vm(vm);
65
66 if (!entry->mem.size)
67 return;
68
69 if (entry->woffset) /* fake shadow mem */
70 return;
71
72 nvgpu_dma_free(g, &entry->mem);
73}
74
75/*
76 * Allocate a phys contig region big enough for a full
77 * sized gmmu page table for the given gmmu_page_size.
78 * the whole range is zeroed so it's "invalid"/will fault.
79 *
80 * If a previous entry is supplied, its memory will be used for
81 * suballocation for this next entry too, if there is space.
82 */
83int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm,
84 enum gmmu_pgsz_gk20a pgsz_idx,
85 const struct gk20a_mmu_level *l,
86 struct gk20a_mm_entry *entry,
87 struct gk20a_mm_entry *prev_entry)
88{
89 int err = -ENOMEM;
90 int order;
91 struct gk20a *g = gk20a_from_vm(vm);
92 u32 bytes;
93
94 /* allocate enough pages for the table */
95 order = l->hi_bit[pgsz_idx] - l->lo_bit[pgsz_idx] + 1;
96 order += ilog2(l->entry_size);
97 bytes = 1 << order;
98 order -= PAGE_SHIFT;
99 if (order < 0 && prev_entry) {
100 /* try to suballocate from previous chunk */
101 u32 capacity = prev_entry->mem.size / bytes;
102 u32 prev = prev_entry->woffset * sizeof(u32) / bytes;
103 u32 free = capacity - prev - 1;
104
105 nvgpu_log(g, gpu_dbg_pte, "cap %d prev %d free %d bytes %d",
106 capacity, prev, free, bytes);
107
108 if (free) {
109 memcpy(&entry->mem, &prev_entry->mem,
110 sizeof(entry->mem));
111 entry->woffset = prev_entry->woffset
112 + bytes / sizeof(u32);
113 err = 0;
114 }
115 }
116
117 if (err) {
118 /* no suballoc space */
119 order = max(0, order);
120 err = nvgpu_alloc_gmmu_pages(vm, order, entry);
121 entry->woffset = 0;
122 }
123
124 nvgpu_log(g, gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d, woff %x",
125 entry,
126 (entry->mem.priv.sgt &&
127 entry->mem.aperture == APERTURE_SYSMEM) ?
128 g->ops.mm.get_iova_addr(g, entry->mem.priv.sgt->sgl, 0) : 0,
129 order, entry->woffset);
130 if (err)
131 return err;
132 entry->pgsz = pgsz_idx;
133 entry->mem.skip_wmb = true;
134
135 return err;
136}
137 48
138/* 49/*
139 * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU 50 * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU
@@ -225,103 +136,484 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va)
225 nvgpu_mutex_release(&vm->update_gmmu_lock); 136 nvgpu_mutex_release(&vm->update_gmmu_lock);
226} 137}
227 138
228static int update_gmmu_level_locked(struct vm_gk20a *vm, 139int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
229 struct gk20a_mm_entry *pte, 140{
230 enum gmmu_pgsz_gk20a pgsz_idx, 141 /*
231 struct scatterlist **sgl, 142 * Need this just for page size. Everything else can be ignored. Also
232 u64 *offset, 143 * note that we can just use pgsz 0 (i.e small pages) since the number
233 u64 *iova, 144 * of bits present in the top level PDE are the same for small/large
234 u64 gpu_va, u64 gpu_end, 145 * page VMs.
235 u8 kind_v, u64 *ctag, 146 */
236 bool cacheable, bool unmapped_pte, 147 struct nvgpu_gmmu_attrs attrs = {
237 int rw_flag, 148 .pgsz = 0,
238 bool sparse, 149 };
239 int lvl, 150
240 bool priv, 151 return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs);
241 enum nvgpu_aperture aperture) 152}
153
154
155/*
156 * Ensure that there's a CPU mapping for the page directory memory. This won't
157 * always be the case for 32 bit systems since we may need to save kernel
158 * virtual memory.
159 */
160static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry)
161{
162 return nvgpu_mem_begin(g, &entry->mem);
163}
164
165/*
166 * Handle any necessary CPU unmap semantics for a page directory's DMA memory.
167 * For 64 bit platforms this is a noop.
168 */
169static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry)
170{
171 nvgpu_mem_end(g, &entry->mem);
172}
173
174static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes,
175 struct nvgpu_gmmu_pd *pd)
242{ 176{
243 struct gk20a *g = gk20a_from_vm(vm); 177 struct gk20a *g = gk20a_from_vm(vm);
244 const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; 178 unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS;
245 const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1]; 179 int err;
246 int err = 0; 180
247 u32 pde_i; 181 /*
248 u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx]; 182 * On arm32 vmalloc space is a precious commodity so we do not map pages
249 struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL; 183 * by default.
184 */
185 if (!IS_ENABLED(CONFIG_ARM64))
186 flags |= NVGPU_DMA_NO_KERNEL_MAPPING;
187
188 err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem);
189 if (err)
190 return -ENOMEM;
191
192 return 0;
193}
194
195void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
196 struct nvgpu_gmmu_pd *pd)
197{
198 struct gk20a *g = gk20a_from_vm(vm);
199
200 nvgpu_dma_free(g, &pd->mem);
201}
202
203/*
204 * Return the _physical_ address of a page directory.
205 */
206u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
207{
208 if (g->mm.has_physical_mode)
209 return sg_phys(pd->mem.priv.sgt->sgl);
210 else
211 return nvgpu_mem_get_base_addr(g, &pd->mem, 0);
212}
213
214/*
215 * Return the aligned length based on the page size in attrs.
216 */
217static u64 nvgpu_align_map_length(struct vm_gk20a *vm, u64 length,
218 struct nvgpu_gmmu_attrs *attrs)
219{
220 u64 page_size = vm->gmmu_page_sizes[attrs->pgsz];
221
222 return ALIGN(length, page_size);
223}
224
225static u32 pd_entries(const struct gk20a_mmu_level *l,
226 struct nvgpu_gmmu_attrs *attrs)
227{
228 /*
229 * Number of entries in a PD is easy to compute from the number of bits
230 * used to index the page directory. That is simply 2 raised to the
231 * number of bits.
232 */
233 return 1UL << (l->hi_bit[attrs->pgsz] - l->lo_bit[attrs->pgsz] + 1UL);
234}
235
236/*
237 * Computes the size of a PD table.
238 */
239static u32 pd_size(const struct gk20a_mmu_level *l,
240 struct nvgpu_gmmu_attrs *attrs)
241{
242 return pd_entries(l, attrs) * l->entry_size;
243}
244
245/*
246 * Allocate a physically contiguous region big enough for a gmmu page table
247 * of the specified level and page size. The whole range is zeroed so that any
248 * accesses will fault until proper values are programmed.
249 */
250static int pd_allocate(struct vm_gk20a *vm,
251 struct nvgpu_gmmu_pd *pd,
252 const struct gk20a_mmu_level *l,
253 struct nvgpu_gmmu_attrs *attrs)
254{
255 int err;
250 256
251 gk20a_dbg_fn(""); 257 if (pd->mem.size)
258 return 0;
252 259
253 pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL)) 260 err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd);
254 >> (u64)l->lo_bit[pgsz_idx]; 261 if (err) {
262 nvgpu_info(vm->mm->g, "error allocating page directory!");
263 return err;
264 }
255 265
256 gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx", 266 /*
257 pgsz_idx, lvl, gpu_va, gpu_end-1, *iova); 267 * One mb() is done after all mapping operations. Don't need individual
268 * barriers for each PD write.
269 */
270 pd->mem.skip_wmb = true;
258 271
259 while (gpu_va < gpu_end) { 272 return 0;
260 u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end); 273}
261 274
262 /* Allocate next level */ 275/*
276 * Compute what page directory index at the passed level the passed virtual
277 * address corresponds to. @attrs is necessary for determining the page size
278 * which is used to pick the right bit offsets for the GMMU level.
279 */
280static u32 pd_index(const struct gk20a_mmu_level *l, u64 virt,
281 struct nvgpu_gmmu_attrs *attrs)
282{
283 u64 pd_mask = (1ULL << ((u64)l->hi_bit[attrs->pgsz] + 1)) - 1ULL;
284 u32 pd_shift = (u64)l->lo_bit[attrs->pgsz];
285
286 /*
287 * For convenience we don't bother computing the lower bound of the
288 * mask; it's easier to just shift it off.
289 */
290 return (virt & pd_mask) >> pd_shift;
291}
292
293static int pd_allocate_children(struct vm_gk20a *vm,
294 const struct gk20a_mmu_level *l,
295 struct nvgpu_gmmu_pd *pd,
296 struct nvgpu_gmmu_attrs *attrs)
297{
298 struct gk20a *g = gk20a_from_vm(vm);
299
300 if (pd->entries)
301 return 0;
302
303 pd->num_entries = pd_entries(l, attrs);
304 pd->entries = nvgpu_vzalloc(g, sizeof(struct nvgpu_gmmu_pd) *
305 pd->num_entries);
306 if (!pd->entries)
307 return -ENOMEM;
308
309 return 0;
310}
311
312/*
313 * This function programs the GMMU based on two ranges: a physical range and a
314 * GPU virtual range. The virtual is mapped to the physical. Physical in this
315 * case can mean either a real physical sysmem address or an IO virtual address
316 * (for instance when a system has an IOMMU running).
317 *
318 * The rest of the parameters are for describing the actual mapping itself.
319 *
320 * This function recursively calls itself for handling PDEs. At the final level
321 * a PTE handler is called. The phys and virt ranges are adjusted for each
322 * recursion so that each invocation of this function need only worry about the
323 * range it is passed.
324 *
325 * phys_addr will always point to a contiguous range - the discontiguous nature
326 * of DMA buffers is taken care of at the layer above this.
327 */
328static int __set_pd_level(struct vm_gk20a *vm,
329 struct nvgpu_gmmu_pd *pd,
330 int lvl,
331 u64 phys_addr,
332 u64 virt_addr, u64 length,
333 struct nvgpu_gmmu_attrs *attrs)
334{
335 int err = 0;
336 u64 pde_range;
337 struct gk20a *g = gk20a_from_vm(vm);
338 struct nvgpu_gmmu_pd *next_pd = NULL;
339 const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl];
340 const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl + 1];
341
342 /*
343 * 5 levels for Pascal+. For pre-pascal we only have 2. This puts
344 * offsets into the page table debugging code which makes it easier to
345 * see what level prints are from.
346 */
347 static const char *__lvl_debug[] = {
348 "", /* L=0 */
349 " ", /* L=1 */
350 " ", /* L=2 */
351 " ", /* L=3 */
352 " ", /* L=4 */
353 };
354
355 pde_range = 1ULL << (u64)l->lo_bit[attrs->pgsz];
356
357 __gmmu_dbg_v(g, attrs,
358 "L=%d %sGPU virt %#-12llx +%#-9llx -> phys %#-12llx",
359 lvl,
360 __lvl_debug[lvl],
361 virt_addr,
362 length,
363 phys_addr);
364
365 /*
366 * Iterate across the mapping in chunks the size of this level's PDE.
367 * For each of those chunks program our level's PDE and then, if there's
368 * a next level, program the next level's PDEs/PTEs.
369 */
370 while (length) {
371 u32 pd_idx = pd_index(l, virt_addr, attrs);
372 u64 chunk_size;
373 u64 target_addr;
374
375 /*
376 * Truncate the pde_range when the virtual address does not
377 * start at a PDE boundary.
378 */
379 chunk_size = min(length,
380 pde_range - (virt_addr & (pde_range - 1)));
381
382 /*
383 * If the next level has an update_entry function then we know
384 * that _this_ level points to PDEs (not PTEs). Thus we need to
385 * have a bunch of children PDs.
386 */
263 if (next_l->update_entry) { 387 if (next_l->update_entry) {
264 if (!pte->entries) { 388 if (pd_allocate_children(vm, l, pd, attrs))
265 int num_entries = 389 return -ENOMEM;
266 1 << 390
267 (l->hi_bit[pgsz_idx] 391 /*
268 - l->lo_bit[pgsz_idx] + 1); 392 * Get the next PD so that we know what to put in this
269 pte->entries = 393 * current PD. If the next level is actually PTEs then
270 nvgpu_vzalloc(g, 394 * we don't need this - we will just use the real
271 sizeof(struct gk20a_mm_entry) * 395 * physical target.
272 num_entries); 396 */
273 if (!pte->entries) 397 next_pd = &pd->entries[pd_idx];
274 return -ENOMEM; 398
275 pte->pgsz = pgsz_idx; 399 /*
276 pte->num_entries = num_entries; 400 * Allocate the backing memory for next_pd.
277 } 401 */
278 prev_pte = next_pte; 402 if (pd_allocate(vm, next_pd, next_l, attrs))
279 next_pte = pte->entries + pde_i; 403 return -ENOMEM;
280
281 if (!next_pte->mem.size) {
282 err = nvgpu_zalloc_gmmu_page_table(vm,
283 pgsz_idx, next_l, next_pte, prev_pte);
284 if (err)
285 return err;
286 }
287 } 404 }
288 405
289 err = l->update_entry(vm, pte, pde_i, pgsz_idx, 406 /*
290 sgl, offset, iova, 407 * This is the address we want to program into the actual PDE/
291 kind_v, ctag, cacheable, unmapped_pte, 408 * PTE. When the next level is PDEs we need the target address
292 rw_flag, sparse, priv, aperture); 409 * to be the table of PDEs. When the next level is PTEs the
293 if (err) 410 * target addr is the real physical address we are aiming for.
294 return err; 411 */
412 target_addr = next_pd ? nvgpu_pde_phys_addr(g, next_pd) :
413 phys_addr;
414
415 l->update_entry(vm, l,
416 pd, pd_idx,
417 virt_addr,
418 target_addr,
419 attrs);
295 420
296 if (next_l->update_entry) { 421 if (next_l->update_entry) {
297 /* get cpu access to the ptes */ 422 err = map_gmmu_pages(g, next_pd);
298 err = map_gmmu_pages(g, next_pte);
299 if (err) { 423 if (err) {
300 nvgpu_err(g, 424 nvgpu_err(g,
301 "couldn't map ptes for update as=%d", 425 "couldn't map ptes for update as=%d",
302 vm_aspace_id(vm)); 426 vm_aspace_id(vm));
303 return err; 427 return err;
304 } 428 }
305 err = update_gmmu_level_locked(vm, next_pte, 429
306 pgsz_idx, 430 err = __set_pd_level(vm, next_pd,
307 sgl, 431 lvl + 1,
308 offset, 432 phys_addr,
309 iova, 433 virt_addr,
310 gpu_va, 434 chunk_size,
311 next, 435 attrs);
312 kind_v, ctag, cacheable, unmapped_pte, 436 unmap_gmmu_pages(g, next_pd);
313 rw_flag, sparse, lvl+1, priv, aperture);
314 unmap_gmmu_pages(g, next_pte);
315 437
316 if (err) 438 if (err)
317 return err; 439 return err;
318 } 440 }
319 441
320 pde_i++; 442 virt_addr += chunk_size;
321 gpu_va = next; 443
444 /*
445 * Only add to phys_addr if it's non-zero. A zero value implies
446 * we are unmapping and as a result we don't want to place
447 * non-zero phys addresses in the PTEs. A non-zero phys-addr
448 * would also confuse the lower level PTE programming code.
449 */
450 if (phys_addr)
451 phys_addr += chunk_size;
452 length -= chunk_size;
453 }
454
455 __gmmu_dbg_v(g, attrs, "L=%d %s%s", lvl, __lvl_debug[lvl], "ret!");
456
457 return 0;
458}
459
460/*
461 * VIDMEM version of the update_ptes logic.
462 */
463static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm,
464 struct sg_table *sgt,
465 u64 space_to_skip,
466 u64 virt_addr,
467 u64 length,
468 struct nvgpu_gmmu_attrs *attrs)
469{
470 struct nvgpu_page_alloc *alloc = NULL;
471 struct page_alloc_chunk *chunk = NULL;
472 u64 phys_addr, chunk_length;
473 int err = 0;
474
475 if (!sgt) {
476 /*
477 * This is considered an unmap. Just pass in 0 as the physical
478 * address for the entire GPU range.
479 */
480 err = __set_pd_level(vm, &vm->pdb,
481 0,
482 0,
483 virt_addr, length,
484 attrs);
485 return err;
486 }
487
488 alloc = get_vidmem_page_alloc(sgt->sgl);
489
490 /*
491 * Otherwise iterate across all the chunks in this allocation and
492 * map them.
493 */
494 nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks,
495 page_alloc_chunk, list_entry) {
496 if (space_to_skip &&
497 space_to_skip >= chunk->length) {
498 space_to_skip -= chunk->length;
499 continue;
500 }
501
502 phys_addr = chunk->base + space_to_skip;
503 chunk_length = min(length, (chunk->length - space_to_skip));
504
505 err = __set_pd_level(vm, &vm->pdb,
506 0,
507 phys_addr,
508 virt_addr, length,
509 attrs);
510 if (err)
511 break;
512
513 /* Space has been skipped so zero this for future chunks. */
514 space_to_skip = 0;
515
516 /*
517 * Update the map pointer and the remaining length.
518 */
519 virt_addr += chunk_length;
520 length -= chunk_length;
521
522 if (length == 0)
523 break;
322 } 524 }
323 525
324 gk20a_dbg_fn("done"); 526 return err;
527}
528
529static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm,
530 struct sg_table *sgt,
531 u64 space_to_skip,
532 u64 virt_addr,
533 u64 length,
534 struct nvgpu_gmmu_attrs *attrs)
535{
536 int err;
537 struct scatterlist *sgl;
538 struct gk20a *g = gk20a_from_vm(vm);
539
540 if (!sgt) {
541 /*
542 * This is considered an unmap. Just pass in 0 as the physical
543 * address for the entire GPU range.
544 */
545 err = __set_pd_level(vm, &vm->pdb,
546 0,
547 0,
548 virt_addr, length,
549 attrs);
550 return err;
551 }
552
553 /*
554 * At this point we have a Linux scatter-gather list pointing to some
555 * number of discontiguous chunks of memory. Iterate over that list and
556 * generate a GMMU map call for each chunk. There are two possibilities:
557 * either the IOMMU is enabled or not. When the IOMMU is enabled the
558 * mapping is simple since the "physical" address is actually a virtual
559 * IO address and will be contiguous. The no-IOMMU case is more
560 * complicated. We will have to iterate over the SGT and do a separate
561 * map for each chunk of the SGT.
562 */
563 sgl = sgt->sgl;
564
565 if (!g->mm.bypass_smmu) {
566 u64 io_addr = g->ops.mm.get_iova_addr(g, sgl, 0);
567
568 io_addr += space_to_skip;
569
570 err = __set_pd_level(vm, &vm->pdb,
571 0,
572 io_addr,
573 virt_addr,
574 length,
575 attrs);
576
577 return err;
578 }
579
580 /*
581 * Finally: last possible case: do the no-IOMMU mapping. In this case we
582 * really are mapping physical pages directly.
583 */
584 while (sgl) {
585 u64 phys_addr;
586 u64 chunk_length;
587
588 /*
589 * Cut out sgl ents for space_to_skip.
590 */
591 if (space_to_skip && space_to_skip >= sgl->length) {
592 space_to_skip -= sgl->length;
593 sgl = sg_next(sgl);
594 continue;
595 }
596
597 phys_addr = sg_phys(sgl) + space_to_skip;
598 chunk_length = min(length, sgl->length - space_to_skip);
599
600 err = __set_pd_level(vm, &vm->pdb,
601 0,
602 phys_addr,
603 virt_addr,
604 chunk_length,
605 attrs);
606 if (err)
607 return err;
608
609 space_to_skip = 0;
610 virt_addr += chunk_length;
611 length -= chunk_length;
612 sgl = sg_next(sgl);
613
614 if (length == 0)
615 break;
616 }
325 617
326 return 0; 618 return 0;
327} 619}
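The page-directory helpers introduced in the hunk above reduce to plain bit arithmetic. Below is a self-contained userspace sketch of what pd_entries(), pd_index() and the chunk_size truncation in __set_pd_level() compute. The LO_BIT/HI_BIT pair and the example address are hypothetical; the real values come from vm->mmu_levels[] for the chip and page size in use.

	#include <stdio.h>
	#include <inttypes.h>

	#define LO_BIT 21ULL	/* one PDE covers 2 MiB of GPU VA in this example */
	#define HI_BIT 33ULL

	int main(void)
	{
		uint64_t pde_range = 1ULL << LO_BIT;
		uint64_t entries   = 1ULL << (HI_BIT - LO_BIT + 1ULL);	/* pd_entries() */
		uint64_t pd_mask   = (1ULL << (HI_BIT + 1ULL)) - 1ULL;

		uint64_t virt_addr = 0x12345678000ULL;	/* deliberately not PDE aligned */
		uint64_t length    = 0x400000ULL;	/* 4 MiB mapping */

		/* pd_index(): which entry of this level the virtual address selects. */
		uint64_t pd_idx = (virt_addr & pd_mask) >> LO_BIT;

		/* First chunk is truncated so the next iteration starts PDE aligned. */
		uint64_t into_pde = virt_addr & (pde_range - 1ULL);
		uint64_t chunk    = length < pde_range - into_pde ?
					length : pde_range - into_pde;

		printf("entries=%" PRIu64 " pd_idx=%" PRIu64 " first_chunk=0x%" PRIx64 "\n",
		       entries, pd_idx, chunk);
		return 0;
	}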
@@ -332,8 +624,8 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm,
332 * physical* address. 624 * physical* address.
333 * 625 *
334 * The update of each level of the page tables is farmed out to chip specific 626 * The update of each level of the page tables is farmed out to chip specific
335 * implementations. But the logic around that is generic to all chips. Every chip 627 * implementations. But the logic around that is generic to all chips. Every
336 * has some number of PDE levels and then a PTE level. 628 * chip has some number of PDE levels and then a PTE level.
337 * 629 *
338 * Each chunk of the incoming SGT is sent to the chip specific implementation 630 * Each chunk of the incoming SGT is sent to the chip specific implementation
339 * of page table update. 631 * of page table update.
@@ -341,148 +633,81 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm,
341 * [*] Note: the "physical" address may actually be an IO virtual address in the 633 * [*] Note: the "physical" address may actually be an IO virtual address in the
342 * case of SMMU usage. 634 * case of SMMU usage.
343 */ 635 */
344static int update_gmmu_ptes_locked(struct vm_gk20a *vm, 636static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm,
345 enum gmmu_pgsz_gk20a pgsz_idx, 637 struct sg_table *sgt,
346 struct sg_table *sgt, 638 u64 space_to_skip,
347 u64 buffer_offset, 639 u64 virt_addr,
348 u64 gpu_va, u64 gpu_end, 640 u64 length,
349 u8 kind_v, u32 ctag_offset, 641 struct nvgpu_gmmu_attrs *attrs)
350 bool cacheable, bool unmapped_pte,
351 int rw_flag,
352 bool sparse,
353 bool priv,
354 enum nvgpu_aperture aperture)
355{ 642{
356 struct gk20a *g = gk20a_from_vm(vm); 643 struct gk20a *g = gk20a_from_vm(vm);
357 int ctag_granularity = g->ops.fb.compression_page_size(g); 644 u32 page_size;
358 u64 ctag = (u64)ctag_offset * (u64)ctag_granularity;
359 u64 iova = 0;
360 u64 space_to_skip = buffer_offset;
361 u64 map_size = gpu_end - gpu_va;
362 u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
363 int err; 645 int err;
364 struct scatterlist *sgl = NULL;
365 struct nvgpu_page_alloc *alloc = NULL;
366 struct page_alloc_chunk *chunk = NULL;
367 u64 length;
368 646
369 /* note: here we need to map kernel to small, since the 647 /* note: here we need to map kernel to small, since the
370 * low-level mmu code assumes 0 is small and 1 is big pages */ 648 * low-level mmu code assumes 0 is small and 1 is big pages */
371 if (pgsz_idx == gmmu_page_size_kernel) 649 if (attrs->pgsz == gmmu_page_size_kernel)
372 pgsz_idx = gmmu_page_size_small; 650 attrs->pgsz = gmmu_page_size_small;
651
652 page_size = vm->gmmu_page_sizes[attrs->pgsz];
373 653
374 if (space_to_skip & (page_size - 1)) 654 if (space_to_skip & (page_size - 1))
375 return -EINVAL; 655 return -EINVAL;
376 656
657 /*
658 * Update length to be aligned to the passed page size.
659 */
660 length = nvgpu_align_map_length(vm, length, attrs);
661
377 err = map_gmmu_pages(g, &vm->pdb); 662 err = map_gmmu_pages(g, &vm->pdb);
378 if (err) { 663 if (err) {
379 nvgpu_err(g, 664 nvgpu_err(g, "couldn't map ptes for update as=%d",
380 "couldn't map ptes for update as=%d", 665 vm_aspace_id(vm));
381 vm_aspace_id(vm));
382 return err; 666 return err;
383 } 667 }
384 668
385 if (aperture == APERTURE_VIDMEM) { 669 __gmmu_dbg(g, attrs,
386 gmmu_dbg_v(g, "vidmem map size_idx=%d, gpu_va=[%llx,%llx]", 670 "vm=%s "
387 pgsz_idx, gpu_va, gpu_end-1); 671 "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx "
388 672 "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | "
389 if (sgt) { 673 "kind=%#02x APT=%-6s %c%c%c",
390 alloc = get_vidmem_page_alloc(sgt->sgl); 674 vm->name,
391 675 sgt ? "MAP" : "UNMAP",
392 nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, 676 virt_addr,
393 page_alloc_chunk, list_entry) { 677 length,
394 if (space_to_skip && 678 sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL,
395 space_to_skip > chunk->length) { 679 space_to_skip,
396 space_to_skip -= chunk->length; 680 page_size >> 10,
397 } else { 681 nvgpu_gmmu_perm_str(attrs->rw_flag),
398 iova = chunk->base + space_to_skip; 682 attrs->kind_v,
399 length = chunk->length - space_to_skip; 683 nvgpu_aperture_str(attrs->aperture),
400 length = min(length, map_size); 684 attrs->cacheable ? 'C' : 'V', /* C = cached, V = volatile. */
401 space_to_skip = 0; 685 attrs->sparse ? 'S' : '-',
402 686 attrs->priv ? 'P' : '-');
403 err = update_gmmu_level_locked(vm, 687
404 &vm->pdb, pgsz_idx, 688 /*
405 &sgl, 689 * Handle VIDMEM programming. Currently uses a different scatter list
406 &space_to_skip, 690 * format.
407 &iova, 691 */
408 gpu_va, gpu_va + length, 692 if (attrs->aperture == APERTURE_VIDMEM)
409 kind_v, &ctag, 693 err = __nvgpu_gmmu_update_page_table_vidmem(vm,
410 cacheable, unmapped_pte, 694 sgt,
411 rw_flag, sparse, 0, priv, 695 space_to_skip,
412 aperture); 696 virt_addr,
413 if (err) 697 length,
414 break; 698 attrs);
415 699 else
416 /* need to set explicit zero here */ 700 err = __nvgpu_gmmu_update_page_table_sysmem(vm,
417 space_to_skip = 0; 701 sgt,
418 gpu_va += length; 702 space_to_skip,
419 map_size -= length; 703 virt_addr,
420 704 length,
421 if (!map_size) 705 attrs);
422 break;
423 }
424 }
425 } else {
426 err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
427 &sgl,
428 &space_to_skip,
429 &iova,
430 gpu_va, gpu_end,
431 kind_v, &ctag,
432 cacheable, unmapped_pte, rw_flag,
433 sparse, 0, priv,
434 aperture);
435 }
436 } else {
437 gmmu_dbg_v(g,
438 "pgsz=%-6d, gpu_va: %#-12llx +%#-6llx phys: %#-12llx "
439 "buffer offset: %-4lld, nents: %d",
440 page_size,
441 gpu_va, gpu_end - gpu_va,
442 sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL,
443 buffer_offset,
444 sgt ? sgt->nents : 0);
445
446 if (sgt) {
447 iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
448 if (!vm->mm->bypass_smmu && iova) {
449 iova += space_to_skip;
450 } else {
451 sgl = sgt->sgl;
452
453 gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
454 (u64)sg_phys(sgl),
455 sgl->length);
456
457 while (space_to_skip && sgl &&
458 space_to_skip + page_size > sgl->length) {
459 space_to_skip -= sgl->length;
460 sgl = sg_next(sgl);
461 gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
462 (u64)sg_phys(sgl),
463 sgl->length);
464 }
465
466 iova = sg_phys(sgl) + space_to_skip;
467 }
468 }
469
470 err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
471 &sgl,
472 &space_to_skip,
473 &iova,
474 gpu_va, gpu_end,
475 kind_v, &ctag,
476 cacheable, unmapped_pte, rw_flag,
477 sparse, 0, priv,
478 aperture);
479 }
480 706
481 unmap_gmmu_pages(g, &vm->pdb); 707 unmap_gmmu_pages(g, &vm->pdb);
482
483 mb(); 708 mb();
484 709
485 gk20a_dbg_fn("done"); 710 __gmmu_dbg(g, attrs, "%-5s Done!", sgt ? "MAP" : "UNMAP");
486 711
487 return err; 712 return err;
488} 713}
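A small numeric illustration of the argument handling at the top of __nvgpu_gmmu_update_page_table() above; this is a kernel-style fragment with hypothetical values (the real page sizes come from vm->gmmu_page_sizes):

	/*
	 * Hypothetical request: 64 KiB pages, mapping 0x21000 bytes that start
	 * 0x10000 bytes into the backing buffer.
	 */
	u64 page_size     = SZ_64K;
	u64 space_to_skip = 0x10000;
	u64 length        = 0x21000;

	/* The skip into the buffer must be page aligned or the map is rejected. */
	if (space_to_skip & (page_size - 1))
		return -EINVAL;			/* 0x10000 is aligned, so we continue */

	/* nvgpu_align_map_length(): round the length up to a whole page. */
	length = ALIGN(length, page_size);	/* 0x21000 -> 0x30000 */

	/* attrs->aperture then selects the vidmem or sysmem chunk walker. */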
@@ -500,32 +725,44 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
500 * have the update_gmmu_lock acquired. 725 */
501 */ 726 */
502u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, 727u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
503 u64 map_offset, 728 u64 vaddr,
504 struct sg_table *sgt, 729 struct sg_table *sgt,
505 u64 buffer_offset, 730 u64 buffer_offset,
506 u64 size, 731 u64 size,
507 int pgsz_idx, 732 int pgsz_idx,
508 u8 kind_v, 733 u8 kind_v,
509 u32 ctag_offset, 734 u32 ctag_offset,
510 u32 flags, 735 u32 flags,
511 int rw_flag, 736 int rw_flag,
512 bool clear_ctags, 737 bool clear_ctags,
513 bool sparse, 738 bool sparse,
514 bool priv, 739 bool priv,
515 struct vm_gk20a_mapping_batch *batch, 740 struct vm_gk20a_mapping_batch *batch,
516 enum nvgpu_aperture aperture) 741 enum nvgpu_aperture aperture)
517{ 742{
743 struct gk20a *g = gk20a_from_vm(vm);
518 int err = 0; 744 int err = 0;
519 bool allocated = false; 745 bool allocated = false;
520 struct gk20a *g = gk20a_from_vm(vm);
521 int ctag_granularity = g->ops.fb.compression_page_size(g); 746 int ctag_granularity = g->ops.fb.compression_page_size(g);
522 u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity); 747 struct nvgpu_gmmu_attrs attrs = {
523 748 .pgsz = pgsz_idx,
524 /* Allocate (or validate when map_offset != 0) the virtual address. */ 749 .kind_v = kind_v,
525 if (!map_offset) { 750 .ctag = (u64)ctag_offset * (u64)ctag_granularity,
526 map_offset = __nvgpu_vm_alloc_va(vm, size, 751 .cacheable = flags & NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
527 pgsz_idx); 752 .rw_flag = rw_flag,
528 if (!map_offset) { 753 .sparse = sparse,
754 .priv = priv,
755 .valid = !(flags & NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE),
756 .aperture = aperture
757 };
758
759 /*
760 * Only allocate a new GPU VA range if we haven't already been passed a
761 * GPU VA range. This facilitates fixed mappings.
762 */
763 if (!vaddr) {
764 vaddr = __nvgpu_vm_alloc_va(vm, size, pgsz_idx);
765 if (!vaddr) {
529 nvgpu_err(g, "failed to allocate va space"); 766 nvgpu_err(g, "failed to allocate va space");
530 err = -ENOMEM; 767 err = -ENOMEM;
531 goto fail_alloc; 768 goto fail_alloc;
@@ -533,34 +770,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
533 allocated = true; 770 allocated = true;
534 } 771 }
535 772
536 gmmu_dbg(g, 773 err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset,
537 "gv: 0x%04x_%08x + 0x%-7llx " 774 vaddr, size, &attrs);
538 "[dma: 0x%02x_%08x, pa: 0x%02x_%08x] "
539 "pgsz=%-3dKb as=%-2d ctags=%d start=%d "
540 "kind=0x%x flags=0x%x apt=%s",
541 u64_hi32(map_offset), u64_lo32(map_offset), size,
542 sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0,
543 sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0,
544 sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0,
545 sgt ? u64_lo32((u64)sg_phys(sgt->sgl)) : 0,
546 vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm),
547 ctag_lines, ctag_offset,
548 kind_v, flags, nvgpu_aperture_str(aperture));
549
550 err = update_gmmu_ptes_locked(vm, pgsz_idx,
551 sgt,
552 buffer_offset,
553 map_offset, map_offset + size,
554 kind_v,
555 ctag_offset,
556 flags &
557 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
558 flags &
559 NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE,
560 rw_flag,
561 sparse,
562 priv,
563 aperture);
564 if (err) { 775 if (err) {
565 nvgpu_err(g, "failed to update ptes on map"); 776 nvgpu_err(g, "failed to update ptes on map");
566 goto fail_validate; 777 goto fail_validate;
@@ -571,26 +782,37 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
571 else 782 else
572 batch->need_tlb_invalidate = true; 783 batch->need_tlb_invalidate = true;
573 784
574 return map_offset; 785 return vaddr;
575fail_validate: 786fail_validate:
576 if (allocated) 787 if (allocated)
577 __nvgpu_vm_free_va(vm, map_offset, pgsz_idx); 788 __nvgpu_vm_free_va(vm, vaddr, pgsz_idx);
578fail_alloc: 789fail_alloc:
579 nvgpu_err(g, "%s: failed with err=%d", __func__, err); 790 nvgpu_err(g, "%s: failed with err=%d", __func__, err);
580 return 0; 791 return 0;
581} 792}
582 793
583void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, 794void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
584 u64 vaddr, 795 u64 vaddr,
585 u64 size, 796 u64 size,
586 int pgsz_idx, 797 int pgsz_idx,
587 bool va_allocated, 798 bool va_allocated,
588 int rw_flag, 799 int rw_flag,
589 bool sparse, 800 bool sparse,
590 struct vm_gk20a_mapping_batch *batch) 801 struct vm_gk20a_mapping_batch *batch)
591{ 802{
592 int err = 0; 803 int err = 0;
593 struct gk20a *g = gk20a_from_vm(vm); 804 struct gk20a *g = gk20a_from_vm(vm);
805 struct nvgpu_gmmu_attrs attrs = {
806 .pgsz = pgsz_idx,
807 .kind_v = 0,
808 .ctag = 0,
809 .cacheable = 0,
810 .rw_flag = rw_flag,
811 .sparse = sparse,
812 .priv = 0,
813 .valid = 0,
814 .aperture = APERTURE_INVALID,
815 };
594 816
595 if (va_allocated) { 817 if (va_allocated) {
596 err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); 818 err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx);
@@ -601,27 +823,11 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
601 } 823 }
602 824
603 /* unmap here needs to know the page size we assigned at mapping */ 825 /* unmap here needs to know the page size we assigned at mapping */
604 err = update_gmmu_ptes_locked(vm, 826 err = __nvgpu_gmmu_update_page_table(vm, NULL, 0,
605 pgsz_idx, 827 vaddr, size, &attrs);
606 NULL, /* n/a for unmap */
607 0,
608 vaddr,
609 vaddr + size,
610 0, 0, false /* n/a for unmap */,
611 false, rw_flag,
612 sparse, 0,
613 APERTURE_INVALID); /* don't care for unmap */
614 if (err) 828 if (err)
615 nvgpu_err(g, "failed to update gmmu ptes on unmap"); 829 nvgpu_err(g, "failed to update gmmu ptes on unmap");
616 830
617 /* flush l2 so any dirty lines are written out *now*.
618 * also as we could potentially be switching this buffer
619 * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
620 * some point in the future we need to invalidate l2. e.g. switching
621 * from a render buffer unmap (here) to later using the same memory
622 * for gmmu ptes. note the positioning of this relative to any smmu
623 * unmapping (below). */
624
625 if (!batch) { 831 if (!batch) {
626 gk20a_mm_l2_flush(g, true); 832 gk20a_mm_l2_flush(g, true);
627 g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); 833 g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
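For contrast with the map sketch near the top of the page: after this patch the unmap path is the same page-table walker driven with no scatter list and a mostly zeroed attribute struct. The fragment below paraphrases gk20a_locked_gmmu_unmap() from the hunks above; it is a summary, not new behaviour.

	/* Unmap: a NULL SGT means every PTE target becomes 0, and valid stays 0. */
	struct nvgpu_gmmu_attrs attrs = {
		.pgsz     = pgsz_idx,
		.rw_flag  = rw_flag,
		.sparse   = sparse,
		.valid    = 0,
		.aperture = APERTURE_INVALID,	/* aperture is irrelevant on unmap */
	};

	err = __nvgpu_gmmu_update_page_table(vm, NULL, 0, vaddr, size, &attrs);

	/* Without a batch, flush L2 and invalidate the GPU TLB immediately. */
	if (!batch) {
		gk20a_mm_l2_flush(g, true);
		g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
	}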