Diffstat (limited to 'drivers/gpu/nvgpu/common/mm')
-rw-r--r-- drivers/gpu/nvgpu/common/mm/gmmu.c | 976
-rw-r--r-- drivers/gpu/nvgpu/common/mm/vm.c   |  36
2 files changed, 594 insertions, 418 deletions
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c
index 06291600..ec1bc095 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu.c
@@ -25,115 +25,26 @@ | |||
25 | #include "gk20a/gk20a.h" | 25 | #include "gk20a/gk20a.h" |
26 | #include "gk20a/mm_gk20a.h" | 26 | #include "gk20a/mm_gk20a.h" |
27 | 27 | ||
28 | #define gmmu_dbg(g, fmt, args...) \ | 28 | #define __gmmu_dbg(g, attrs, fmt, args...) \ |
29 | nvgpu_log(g, gpu_dbg_map, fmt, ##args) | 29 | do { \ |
30 | #define gmmu_dbg_v(g, fmt, args...) \ | 30 | if (attrs->debug) \ |
31 | nvgpu_log(g, gpu_dbg_map_v, fmt, ##args) | 31 | nvgpu_info(g, fmt, ##args); \ |
32 | 32 | else \ | |
33 | static int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) | 33 | nvgpu_log(g, gpu_dbg_map, fmt, ##args); \ |
34 | { | 34 | } while (0) |
35 | return nvgpu_mem_begin(g, &entry->mem); | 35 | |
36 | } | 36 | #define __gmmu_dbg_v(g, attrs, fmt, args...) \ |
37 | 37 | do { \ | |
38 | static void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) | 38 | if (attrs->debug) \ |
39 | { | 39 | nvgpu_info(g, fmt, ##args); \ |
40 | nvgpu_mem_end(g, &entry->mem); | 40 | else \ |
41 | } | 41 | nvgpu_log(g, gpu_dbg_map_v, fmt, ##args); \ |
42 | 42 | } while (0) | |
43 | static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 order, | 43 | |
44 | struct gk20a_mm_entry *entry) | 44 | static int pd_allocate(struct vm_gk20a *vm, |
45 | { | 45 | struct nvgpu_gmmu_pd *pd, |
46 | struct gk20a *g = gk20a_from_vm(vm); | 46 | const struct gk20a_mmu_level *l, |
47 | u32 num_pages = 1 << order; | 47 | struct nvgpu_gmmu_attrs *attrs); |
48 | u32 len = num_pages * PAGE_SIZE; | ||
49 | int err; | ||
50 | |||
51 | err = nvgpu_dma_alloc(g, len, &entry->mem); | ||
52 | |||
53 | if (err) { | ||
54 | nvgpu_err(g, "memory allocation failed"); | ||
55 | return -ENOMEM; | ||
56 | } | ||
57 | |||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | ||
62 | struct gk20a_mm_entry *entry) | ||
63 | { | ||
64 | struct gk20a *g = gk20a_from_vm(vm); | ||
65 | |||
66 | if (!entry->mem.size) | ||
67 | return; | ||
68 | |||
69 | if (entry->woffset) /* fake shadow mem */ | ||
70 | return; | ||
71 | |||
72 | nvgpu_dma_free(g, &entry->mem); | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Allocate a phys contig region big enough for a full | ||
77 | * sized gmmu page table for the given gmmu_page_size. | ||
78 | * the whole range is zeroed so it's "invalid"/will fault. | ||
79 | * | ||
80 | * If a previous entry is supplied, its memory will be used for | ||
81 | * suballocation for this next entry too, if there is space. | ||
82 | */ | ||
83 | int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm, | ||
84 | enum gmmu_pgsz_gk20a pgsz_idx, | ||
85 | const struct gk20a_mmu_level *l, | ||
86 | struct gk20a_mm_entry *entry, | ||
87 | struct gk20a_mm_entry *prev_entry) | ||
88 | { | ||
89 | int err = -ENOMEM; | ||
90 | int order; | ||
91 | struct gk20a *g = gk20a_from_vm(vm); | ||
92 | u32 bytes; | ||
93 | |||
94 | /* allocate enough pages for the table */ | ||
95 | order = l->hi_bit[pgsz_idx] - l->lo_bit[pgsz_idx] + 1; | ||
96 | order += ilog2(l->entry_size); | ||
97 | bytes = 1 << order; | ||
98 | order -= PAGE_SHIFT; | ||
99 | if (order < 0 && prev_entry) { | ||
100 | /* try to suballocate from previous chunk */ | ||
101 | u32 capacity = prev_entry->mem.size / bytes; | ||
102 | u32 prev = prev_entry->woffset * sizeof(u32) / bytes; | ||
103 | u32 free = capacity - prev - 1; | ||
104 | |||
105 | nvgpu_log(g, gpu_dbg_pte, "cap %d prev %d free %d bytes %d", | ||
106 | capacity, prev, free, bytes); | ||
107 | |||
108 | if (free) { | ||
109 | memcpy(&entry->mem, &prev_entry->mem, | ||
110 | sizeof(entry->mem)); | ||
111 | entry->woffset = prev_entry->woffset | ||
112 | + bytes / sizeof(u32); | ||
113 | err = 0; | ||
114 | } | ||
115 | } | ||
116 | |||
117 | if (err) { | ||
118 | /* no suballoc space */ | ||
119 | order = max(0, order); | ||
120 | err = nvgpu_alloc_gmmu_pages(vm, order, entry); | ||
121 | entry->woffset = 0; | ||
122 | } | ||
123 | |||
124 | nvgpu_log(g, gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d, woff %x", | ||
125 | entry, | ||
126 | (entry->mem.priv.sgt && | ||
127 | entry->mem.aperture == APERTURE_SYSMEM) ? | ||
128 | g->ops.mm.get_iova_addr(g, entry->mem.priv.sgt->sgl, 0) : 0, | ||
129 | order, entry->woffset); | ||
130 | if (err) | ||
131 | return err; | ||
132 | entry->pgsz = pgsz_idx; | ||
133 | entry->mem.skip_wmb = true; | ||
134 | |||
135 | return err; | ||
136 | } | ||
137 | 48 | ||
138 | /* | 49 | /* |
139 | * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU | 50 | * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU |
@@ -225,103 +136,484 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va) | |||
225 | nvgpu_mutex_release(&vm->update_gmmu_lock); | 136 | nvgpu_mutex_release(&vm->update_gmmu_lock); |
226 | } | 137 | } |
227 | 138 | ||
228 | static int update_gmmu_level_locked(struct vm_gk20a *vm, | 139 | int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) |
229 | struct gk20a_mm_entry *pte, | 140 | { |
230 | enum gmmu_pgsz_gk20a pgsz_idx, | 141 | /* |
231 | struct scatterlist **sgl, | 142 | * Need this just for page size. Everything else can be ignored. Also |
232 | u64 *offset, | 143 | * note that we can just use pgsz 0 (i.e small pages) since the number |
233 | u64 *iova, | 144 | * of bits present in the top level PDE are the same for small/large |
234 | u64 gpu_va, u64 gpu_end, | 145 | * page VMs. |
235 | u8 kind_v, u64 *ctag, | 146 | */ |
236 | bool cacheable, bool unmapped_pte, | 147 | struct nvgpu_gmmu_attrs attrs = { |
237 | int rw_flag, | 148 | .pgsz = 0, |
238 | bool sparse, | 149 | }; |
239 | int lvl, | 150 | |
240 | bool priv, | 151 | return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs); |
241 | enum nvgpu_aperture aperture) | 152 | } |
153 | |||
154 | |||
155 | /* | ||
156 | * Ensure that there's a CPU mapping for the page directory memory. This won't | ||
157 | * always be the case for 32 bit systems since we may need to save kernel | ||
158 | * virtual memory. | ||
159 | */ | ||
160 | static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) | ||
161 | { | ||
162 | return nvgpu_mem_begin(g, &entry->mem); | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * Handle any necessary CPU unmap semantics for a page directory's DMA memory. | ||
167 | * For 64 bit platforms this is a noop. | ||
168 | */ | ||
169 | static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) | ||
170 | { | ||
171 | nvgpu_mem_end(g, &entry->mem); | ||
172 | } | ||
173 | |||
174 | static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes, | ||
175 | struct nvgpu_gmmu_pd *pd) | ||
242 | { | 176 | { |
243 | struct gk20a *g = gk20a_from_vm(vm); | 177 | struct gk20a *g = gk20a_from_vm(vm); |
244 | const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; | 178 | unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS; |
245 | const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1]; | 179 | int err; |
246 | int err = 0; | 180 | |
247 | u32 pde_i; | 181 | /* |
248 | u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx]; | 182 | * On arm32 vmalloc space is a precious commodity so we do not map pages |
249 | struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL; | 183 | * by default. |
184 | */ | ||
185 | if (!IS_ENABLED(CONFIG_ARM64)) | ||
186 | flags |= NVGPU_DMA_NO_KERNEL_MAPPING; | ||
187 | |||
188 | err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem); | ||
189 | if (err) | ||
190 | return -ENOMEM; | ||
191 | |||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | ||
196 | struct nvgpu_gmmu_pd *pd) | ||
197 | { | ||
198 | struct gk20a *g = gk20a_from_vm(vm); | ||
199 | |||
200 | nvgpu_dma_free(g, &pd->mem); | ||
201 | } | ||
202 | |||
203 | /* | ||
204 | * Return the _physical_ address of a page directory. | ||
205 | */ | ||
206 | u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd) | ||
207 | { | ||
208 | if (g->mm.has_physical_mode) | ||
209 | return sg_phys(pd->mem.priv.sgt->sgl); | ||
210 | else | ||
211 | return nvgpu_mem_get_base_addr(g, &pd->mem, 0); | ||
212 | } | ||
213 | |||
214 | /* | ||
215 | * Return the aligned length based on the page size in attrs. | ||
216 | */ | ||
217 | static u64 nvgpu_align_map_length(struct vm_gk20a *vm, u64 length, | ||
218 | struct nvgpu_gmmu_attrs *attrs) | ||
219 | { | ||
220 | u64 page_size = vm->gmmu_page_sizes[attrs->pgsz]; | ||
221 | |||
222 | return ALIGN(length, page_size); | ||
223 | } | ||
224 | |||
225 | static u32 pd_entries(const struct gk20a_mmu_level *l, | ||
226 | struct nvgpu_gmmu_attrs *attrs) | ||
227 | { | ||
228 | /* | ||
229 | * Number of entries in a PD is easy to compute from the number of bits | ||
230 | * used to index the page directory. That is simply 2 raised to the | ||
231 | * number of bits. | ||
232 | */ | ||
233 | return 1UL << (l->hi_bit[attrs->pgsz] - l->lo_bit[attrs->pgsz] + 1UL); | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * Computes the size of a PD table. | ||
238 | */ | ||
239 | static u32 pd_size(const struct gk20a_mmu_level *l, | ||
240 | struct nvgpu_gmmu_attrs *attrs) | ||
241 | { | ||
242 | return pd_entries(l, attrs) * l->entry_size; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Allocate a physically contiguous region big enough for a gmmu page table | ||
247 | * of the specified level and page size. The whole range is zeroed so that any | ||
248 | * accesses will fault until proper values are programmed. | ||
249 | */ | ||
250 | static int pd_allocate(struct vm_gk20a *vm, | ||
251 | struct nvgpu_gmmu_pd *pd, | ||
252 | const struct gk20a_mmu_level *l, | ||
253 | struct nvgpu_gmmu_attrs *attrs) | ||
254 | { | ||
255 | int err; | ||
250 | 256 | ||
251 | gk20a_dbg_fn(""); | 257 | if (pd->mem.size) |
258 | return 0; | ||
252 | 259 | ||
253 | pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL)) | 260 | err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd); |
254 | >> (u64)l->lo_bit[pgsz_idx]; | 261 | if (err) { |
262 | nvgpu_info(vm->mm->g, "error allocating page directory!"); | ||
263 | return err; | ||
264 | } | ||
255 | 265 | ||
256 | gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx", | 266 | /* |
257 | pgsz_idx, lvl, gpu_va, gpu_end-1, *iova); | 267 | * One mb() is done after all mapping operations. Don't need individual |
268 | * barriers for each PD write. | ||
269 | */ | ||
270 | pd->mem.skip_wmb = true; | ||
258 | 271 | ||
259 | while (gpu_va < gpu_end) { | 272 | return 0; |
260 | u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end); | 273 | } |
261 | 274 | ||
262 | /* Allocate next level */ | 275 | /* |
276 | * Compute the page directory index, at the passed level, that the passed virtual | ||
277 | * address corresponds to. @attrs is necessary for determining the page size | ||
278 | * which is used to pick the right bit offsets for the GMMU level. | ||
279 | */ | ||
280 | static u32 pd_index(const struct gk20a_mmu_level *l, u64 virt, | ||
281 | struct nvgpu_gmmu_attrs *attrs) | ||
282 | { | ||
283 | u64 pd_mask = (1ULL << ((u64)l->hi_bit[attrs->pgsz] + 1)) - 1ULL; | ||
284 | u32 pd_shift = (u64)l->lo_bit[attrs->pgsz]; | ||
285 | |||
286 | /* | ||
287 | * For convenience we don't bother computing the lower bound of the | ||
288 | * mask; it's easier to just shift it off. | ||
289 | */ | ||
290 | return (virt & pd_mask) >> pd_shift; | ||
291 | } | ||
292 | |||
293 | static int pd_allocate_children(struct vm_gk20a *vm, | ||
294 | const struct gk20a_mmu_level *l, | ||
295 | struct nvgpu_gmmu_pd *pd, | ||
296 | struct nvgpu_gmmu_attrs *attrs) | ||
297 | { | ||
298 | struct gk20a *g = gk20a_from_vm(vm); | ||
299 | |||
300 | if (pd->entries) | ||
301 | return 0; | ||
302 | |||
303 | pd->num_entries = pd_entries(l, attrs); | ||
304 | pd->entries = nvgpu_vzalloc(g, sizeof(struct nvgpu_gmmu_pd) * | ||
305 | pd->num_entries); | ||
306 | if (!pd->entries) | ||
307 | return -ENOMEM; | ||
308 | |||
309 | return 0; | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * This function programs the GMMU based on two ranges: a physical range and a | ||
314 | * GPU virtual range. The virtual is mapped to the physical. Physical in this | ||
315 | * case can mean either a real physical sysmem address or an IO virtual address | ||
316 | * (for instance when a system has an IOMMU running). | ||
317 | * | ||
318 | * The rest of the parameters are for describing the actual mapping itself. | ||
319 | * | ||
320 | * This function recursively calls itself for handling PDEs. At the final level | ||
321 | * a PTE handler is called. The phys and virt ranges are adjusted for each | ||
322 | * recursion so that each invocation of this function need only worry about the | ||
323 | * range it is passed. | ||
324 | * | ||
325 | * phys_addr will always point to a contiguous range - the discontiguous nature | ||
326 | * of DMA buffers is taken care of at the layer above this. | ||
327 | */ | ||
328 | static int __set_pd_level(struct vm_gk20a *vm, | ||
329 | struct nvgpu_gmmu_pd *pd, | ||
330 | int lvl, | ||
331 | u64 phys_addr, | ||
332 | u64 virt_addr, u64 length, | ||
333 | struct nvgpu_gmmu_attrs *attrs) | ||
334 | { | ||
335 | int err = 0; | ||
336 | u64 pde_range; | ||
337 | struct gk20a *g = gk20a_from_vm(vm); | ||
338 | struct nvgpu_gmmu_pd *next_pd = NULL; | ||
339 | const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; | ||
340 | const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl + 1]; | ||
341 | |||
342 | /* | ||
343 | * 5 levels for Pascal+. For pre-Pascal we only have 2. These strings | ||
344 | * provide the indentation offsets used by the page table debugging code, | ||
345 | * which makes it easier to see what level prints are from. | ||
346 | */ | ||
347 | static const char *__lvl_debug[] = { | ||
348 | "", /* L=0 */ | ||
349 | " ", /* L=1 */ | ||
350 | " ", /* L=2 */ | ||
351 | " ", /* L=3 */ | ||
352 | " ", /* L=4 */ | ||
353 | }; | ||
354 | |||
355 | pde_range = 1ULL << (u64)l->lo_bit[attrs->pgsz]; | ||
356 | |||
357 | __gmmu_dbg_v(g, attrs, | ||
358 | "L=%d %sGPU virt %#-12llx +%#-9llx -> phys %#-12llx", | ||
359 | lvl, | ||
360 | __lvl_debug[lvl], | ||
361 | virt_addr, | ||
362 | length, | ||
363 | phys_addr); | ||
364 | |||
365 | /* | ||
366 | * Iterate across the mapping in chunks the size of this level's PDE. | ||
367 | * For each of those chunks program our level's PDE and then, if there's | ||
368 | * a next level, program the next level's PDEs/PTEs. | ||
369 | */ | ||
370 | while (length) { | ||
371 | u32 pd_idx = pd_index(l, virt_addr, attrs); | ||
372 | u64 chunk_size; | ||
373 | u64 target_addr; | ||
374 | |||
375 | /* | ||
376 | * Truncate the pde_range when the virtual address does not | ||
377 | * start at a PDE boundary. | ||
378 | */ | ||
379 | chunk_size = min(length, | ||
380 | pde_range - (virt_addr & (pde_range - 1))); | ||
381 | |||
382 | /* | ||
383 | * If the next level has an update_entry function then we know | ||
384 | * that _this_ level points to PDEs (not PTEs). Thus we need to | ||
385 | * have a bunch of children PDs. | ||
386 | */ | ||
263 | if (next_l->update_entry) { | 387 | if (next_l->update_entry) { |
264 | if (!pte->entries) { | 388 | if (pd_allocate_children(vm, l, pd, attrs)) |
265 | int num_entries = | 389 | return -ENOMEM; |
266 | 1 << | 390 | |
267 | (l->hi_bit[pgsz_idx] | 391 | /* |
268 | - l->lo_bit[pgsz_idx] + 1); | 392 | * Get the next PD so that we know what to put in this |
269 | pte->entries = | 393 | * current PD. If the next level is actually PTEs then |
270 | nvgpu_vzalloc(g, | 394 | * we don't need this - we will just use the real |
271 | sizeof(struct gk20a_mm_entry) * | 395 | * physical target. |
272 | num_entries); | 396 | */ |
273 | if (!pte->entries) | 397 | next_pd = &pd->entries[pd_idx]; |
274 | return -ENOMEM; | 398 | |
275 | pte->pgsz = pgsz_idx; | 399 | /* |
276 | pte->num_entries = num_entries; | 400 | * Allocate the backing memory for next_pd. |
277 | } | 401 | */ |
278 | prev_pte = next_pte; | 402 | if (pd_allocate(vm, next_pd, next_l, attrs)) |
279 | next_pte = pte->entries + pde_i; | 403 | return -ENOMEM; |
280 | |||
281 | if (!next_pte->mem.size) { | ||
282 | err = nvgpu_zalloc_gmmu_page_table(vm, | ||
283 | pgsz_idx, next_l, next_pte, prev_pte); | ||
284 | if (err) | ||
285 | return err; | ||
286 | } | ||
287 | } | 404 | } |
288 | 405 | ||
289 | err = l->update_entry(vm, pte, pde_i, pgsz_idx, | 406 | /* |
290 | sgl, offset, iova, | 407 | * This is the address we want to program into the actual PDE/ |
291 | kind_v, ctag, cacheable, unmapped_pte, | 408 | * PTE. When the next level is PDEs we need the target address |
292 | rw_flag, sparse, priv, aperture); | 409 | * to be the table of PDEs. When the next level is PTEs the |
293 | if (err) | 410 | * target addr is the real physical address we are aiming for. |
294 | return err; | 411 | */ |
412 | target_addr = next_pd ? nvgpu_pde_phys_addr(g, next_pd) : | ||
413 | phys_addr; | ||
414 | |||
415 | l->update_entry(vm, l, | ||
416 | pd, pd_idx, | ||
417 | virt_addr, | ||
418 | target_addr, | ||
419 | attrs); | ||
295 | 420 | ||
296 | if (next_l->update_entry) { | 421 | if (next_l->update_entry) { |
297 | /* get cpu access to the ptes */ | 422 | err = map_gmmu_pages(g, next_pd); |
298 | err = map_gmmu_pages(g, next_pte); | ||
299 | if (err) { | 423 | if (err) { |
300 | nvgpu_err(g, | 424 | nvgpu_err(g, |
301 | "couldn't map ptes for update as=%d", | 425 | "couldn't map ptes for update as=%d", |
302 | vm_aspace_id(vm)); | 426 | vm_aspace_id(vm)); |
303 | return err; | 427 | return err; |
304 | } | 428 | } |
305 | err = update_gmmu_level_locked(vm, next_pte, | 429 | |
306 | pgsz_idx, | 430 | err = __set_pd_level(vm, next_pd, |
307 | sgl, | 431 | lvl + 1, |
308 | offset, | 432 | phys_addr, |
309 | iova, | 433 | virt_addr, |
310 | gpu_va, | 434 | chunk_size, |
311 | next, | 435 | attrs); |
312 | kind_v, ctag, cacheable, unmapped_pte, | 436 | unmap_gmmu_pages(g, next_pd); |
313 | rw_flag, sparse, lvl+1, priv, aperture); | ||
314 | unmap_gmmu_pages(g, next_pte); | ||
315 | 437 | ||
316 | if (err) | 438 | if (err) |
317 | return err; | 439 | return err; |
318 | } | 440 | } |
319 | 441 | ||
320 | pde_i++; | 442 | virt_addr += chunk_size; |
321 | gpu_va = next; | 443 | |
444 | /* | ||
445 | * Only add to phys_addr if it's non-zero. A zero value implies | ||
446 | * we are unmapping and as a result we don't want to place | ||
447 | * non-zero phys addresses in the PTEs. A non-zero phys-addr | ||
448 | * would also confuse the lower level PTE programming code. | ||
449 | */ | ||
450 | if (phys_addr) | ||
451 | phys_addr += chunk_size; | ||
452 | length -= chunk_size; | ||
453 | } | ||
454 | |||
455 | __gmmu_dbg_v(g, attrs, "L=%d %s%s", lvl, __lvl_debug[lvl], "ret!"); | ||
456 | |||
457 | return 0; | ||
458 | } | ||
459 | |||
460 | /* | ||
461 | * VIDMEM version of the update_ptes logic. | ||
462 | */ | ||
463 | static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, | ||
464 | struct sg_table *sgt, | ||
465 | u64 space_to_skip, | ||
466 | u64 virt_addr, | ||
467 | u64 length, | ||
468 | struct nvgpu_gmmu_attrs *attrs) | ||
469 | { | ||
470 | struct nvgpu_page_alloc *alloc = NULL; | ||
471 | struct page_alloc_chunk *chunk = NULL; | ||
472 | u64 phys_addr, chunk_length; | ||
473 | int err = 0; | ||
474 | |||
475 | if (!sgt) { | ||
476 | /* | ||
477 | * This is considered an unmap. Just pass in 0 as the physical | ||
478 | * address for the entire GPU range. | ||
479 | */ | ||
480 | err = __set_pd_level(vm, &vm->pdb, | ||
481 | 0, | ||
482 | 0, | ||
483 | virt_addr, length, | ||
484 | attrs); | ||
485 | return err; | ||
486 | } | ||
487 | |||
488 | alloc = get_vidmem_page_alloc(sgt->sgl); | ||
489 | |||
490 | /* | ||
491 | * Otherwise iterate across all the chunks in this allocation and | ||
492 | * map them. | ||
493 | */ | ||
494 | nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, | ||
495 | page_alloc_chunk, list_entry) { | ||
496 | if (space_to_skip && | ||
497 | space_to_skip >= chunk->length) { | ||
498 | space_to_skip -= chunk->length; | ||
499 | continue; | ||
500 | } | ||
501 | |||
502 | phys_addr = chunk->base + space_to_skip; | ||
503 | chunk_length = min(length, (chunk->length - space_to_skip)); | ||
504 | |||
505 | err = __set_pd_level(vm, &vm->pdb, | ||
506 | 0, | ||
507 | phys_addr, | ||
508 | virt_addr, chunk_length, | ||
509 | attrs); | ||
510 | if (err) | ||
511 | break; | ||
512 | |||
513 | /* Space has been skipped so zero this for future chunks. */ | ||
514 | space_to_skip = 0; | ||
515 | |||
516 | /* | ||
517 | * Update the map pointer and the remaining length. | ||
518 | */ | ||
519 | virt_addr += chunk_length; | ||
520 | length -= chunk_length; | ||
521 | |||
522 | if (length == 0) | ||
523 | break; | ||
322 | } | 524 | } |
323 | 525 | ||
324 | gk20a_dbg_fn("done"); | 526 | return err; |
527 | } | ||
528 | |||
529 | static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, | ||
530 | struct sg_table *sgt, | ||
531 | u64 space_to_skip, | ||
532 | u64 virt_addr, | ||
533 | u64 length, | ||
534 | struct nvgpu_gmmu_attrs *attrs) | ||
535 | { | ||
536 | int err; | ||
537 | struct scatterlist *sgl; | ||
538 | struct gk20a *g = gk20a_from_vm(vm); | ||
539 | |||
540 | if (!sgt) { | ||
541 | /* | ||
542 | * This is considered an unmap. Just pass in 0 as the physical | ||
543 | * address for the entire GPU range. | ||
544 | */ | ||
545 | err = __set_pd_level(vm, &vm->pdb, | ||
546 | 0, | ||
547 | 0, | ||
548 | virt_addr, length, | ||
549 | attrs); | ||
550 | return err; | ||
551 | } | ||
552 | |||
553 | /* | ||
554 | * At this point we have a Linux scatter-gather list pointing to some | ||
555 | * number of discontiguous chunks of memory. Iterate over that list and | ||
556 | * generate a GMMU map call for each chunk. There are two possibilities: | ||
557 | * either the IOMMU is enabled or not. When the IOMMU is enabled the | ||
558 | * mapping is simple since the "physical" address is actually a virtual | ||
559 | * IO address and will be contiguous. The no-IOMMU case is more | ||
560 | * complicated. We will have to iterate over the SGT and do a separate | ||
561 | * map for each chunk of the SGT. | ||
562 | */ | ||
563 | sgl = sgt->sgl; | ||
564 | |||
565 | if (!g->mm.bypass_smmu) { | ||
566 | u64 io_addr = g->ops.mm.get_iova_addr(g, sgl, 0); | ||
567 | |||
568 | io_addr += space_to_skip; | ||
569 | |||
570 | err = __set_pd_level(vm, &vm->pdb, | ||
571 | 0, | ||
572 | io_addr, | ||
573 | virt_addr, | ||
574 | length, | ||
575 | attrs); | ||
576 | |||
577 | return err; | ||
578 | } | ||
579 | |||
580 | /* | ||
581 | * Finally, the last possible case: do the no-IOMMU mapping. In this case we | ||
582 | * really are mapping physical pages directly. | ||
583 | */ | ||
584 | while (sgl) { | ||
585 | u64 phys_addr; | ||
586 | u64 chunk_length; | ||
587 | |||
588 | /* | ||
589 | * Cut out sgl ents for space_to_skip. | ||
590 | */ | ||
591 | if (space_to_skip && space_to_skip >= sgl->length) { | ||
592 | space_to_skip -= sgl->length; | ||
593 | sgl = sg_next(sgl); | ||
594 | continue; | ||
595 | } | ||
596 | |||
597 | phys_addr = sg_phys(sgl) + space_to_skip; | ||
598 | chunk_length = min(length, sgl->length - space_to_skip); | ||
599 | |||
600 | err = __set_pd_level(vm, &vm->pdb, | ||
601 | 0, | ||
602 | phys_addr, | ||
603 | virt_addr, | ||
604 | chunk_length, | ||
605 | attrs); | ||
606 | if (err) | ||
607 | return err; | ||
608 | |||
609 | space_to_skip = 0; | ||
610 | virt_addr += chunk_length; | ||
611 | length -= chunk_length; | ||
612 | sgl = sg_next(sgl); | ||
613 | |||
614 | if (length == 0) | ||
615 | break; | ||
616 | } | ||
325 | 617 | ||
326 | return 0; | 618 | return 0; |
327 | } | 619 | } |
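
A minimal, compilable sketch of the directory-index arithmetic performed by the pd_entries() and pd_index() helpers added in the hunk above; the example_* functions, the bit positions, and the address below are invented illustration values, not taken from any chip's gk20a_mmu_level table:

/*
 * Standalone model of pd_entries()/pd_index(): the entry count is 2 raised
 * to the number of index bits, and the index is the VA masked below hi_bit
 * and shifted down by lo_bit.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t example_pd_entries(uint32_t hi_bit, uint32_t lo_bit)
{
	/* 2 raised to the number of index bits. */
	return 1UL << (hi_bit - lo_bit + 1);
}

static uint32_t example_pd_index(uint32_t hi_bit, uint32_t lo_bit, uint64_t virt)
{
	/* Mask off everything above hi_bit, then shift off everything below lo_bit. */
	uint64_t pd_mask = (1ULL << ((uint64_t)hi_bit + 1)) - 1ULL;

	return (uint32_t)((virt & pd_mask) >> lo_bit);
}

int main(void)
{
	/* Hypothetical level: VA bits [37:29] index this directory. */
	const uint32_t hi = 37, lo = 29;
	const uint64_t va = 0x12345678000ULL;

	printf("entries = %u\n", example_pd_entries(hi, lo));	/* 512 */
	printf("index   = %u\n", example_pd_index(hi, lo, va));
	return 0;
}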
@@ -332,8 +624,8 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm, | |||
332 | * physical* address. | 624 | * physical* address. |
333 | * | 625 | * |
334 | * The update of each level of the page tables is farmed out to chip specific | 626 | * The update of each level of the page tables is farmed out to chip specific |
335 | * implementations. But the logic around that is generic to all chips. Every chip | 627 | * implementations. But the logic around that is generic to all chips. Every |
336 | * has some number of PDE levels and then a PTE level. | 628 | * chip has some number of PDE levels and then a PTE level. |
337 | * | 629 | * |
338 | * Each chunk of the incoming SGT is sent to the chip specific implementation | 630 | * Each chunk of the incoming SGT is sent to the chip specific implementation |
339 | * of page table update. | 631 | * of page table update. |
@@ -341,148 +633,81 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm, | |||
341 | * [*] Note: the "physical" address may actually be an IO virtual address in the | 633 | * [*] Note: the "physical" address may actually be an IO virtual address in the |
342 | * case of SMMU usage. | 634 | * case of SMMU usage. |
343 | */ | 635 | */ |
344 | static int update_gmmu_ptes_locked(struct vm_gk20a *vm, | 636 | static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, |
345 | enum gmmu_pgsz_gk20a pgsz_idx, | 637 | struct sg_table *sgt, |
346 | struct sg_table *sgt, | 638 | u64 space_to_skip, |
347 | u64 buffer_offset, | 639 | u64 virt_addr, |
348 | u64 gpu_va, u64 gpu_end, | 640 | u64 length, |
349 | u8 kind_v, u32 ctag_offset, | 641 | struct nvgpu_gmmu_attrs *attrs) |
350 | bool cacheable, bool unmapped_pte, | ||
351 | int rw_flag, | ||
352 | bool sparse, | ||
353 | bool priv, | ||
354 | enum nvgpu_aperture aperture) | ||
355 | { | 642 | { |
356 | struct gk20a *g = gk20a_from_vm(vm); | 643 | struct gk20a *g = gk20a_from_vm(vm); |
357 | int ctag_granularity = g->ops.fb.compression_page_size(g); | 644 | u32 page_size; |
358 | u64 ctag = (u64)ctag_offset * (u64)ctag_granularity; | ||
359 | u64 iova = 0; | ||
360 | u64 space_to_skip = buffer_offset; | ||
361 | u64 map_size = gpu_end - gpu_va; | ||
362 | u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; | ||
363 | int err; | 645 | int err; |
364 | struct scatterlist *sgl = NULL; | ||
365 | struct nvgpu_page_alloc *alloc = NULL; | ||
366 | struct page_alloc_chunk *chunk = NULL; | ||
367 | u64 length; | ||
368 | 646 | ||
369 | /* note: here we need to map kernel to small, since the | 647 | /* note: here we need to map kernel to small, since the |
370 | * low-level mmu code assumes 0 is small and 1 is big pages */ | 648 | * low-level mmu code assumes 0 is small and 1 is big pages */ |
371 | if (pgsz_idx == gmmu_page_size_kernel) | 649 | if (attrs->pgsz == gmmu_page_size_kernel) |
372 | pgsz_idx = gmmu_page_size_small; | 650 | attrs->pgsz = gmmu_page_size_small; |
651 | |||
652 | page_size = vm->gmmu_page_sizes[attrs->pgsz]; | ||
373 | 653 | ||
374 | if (space_to_skip & (page_size - 1)) | 654 | if (space_to_skip & (page_size - 1)) |
375 | return -EINVAL; | 655 | return -EINVAL; |
376 | 656 | ||
657 | /* | ||
658 | * Update length to be aligned to the passed page size. | ||
659 | */ | ||
660 | length = nvgpu_align_map_length(vm, length, attrs); | ||
661 | |||
377 | err = map_gmmu_pages(g, &vm->pdb); | 662 | err = map_gmmu_pages(g, &vm->pdb); |
378 | if (err) { | 663 | if (err) { |
379 | nvgpu_err(g, | 664 | nvgpu_err(g, "couldn't map ptes for update as=%d", |
380 | "couldn't map ptes for update as=%d", | 665 | vm_aspace_id(vm)); |
381 | vm_aspace_id(vm)); | ||
382 | return err; | 666 | return err; |
383 | } | 667 | } |
384 | 668 | ||
385 | if (aperture == APERTURE_VIDMEM) { | 669 | __gmmu_dbg(g, attrs, |
386 | gmmu_dbg_v(g, "vidmem map size_idx=%d, gpu_va=[%llx,%llx]", | 670 | "vm=%s " |
387 | pgsz_idx, gpu_va, gpu_end-1); | 671 | "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx " |
388 | 672 | "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | " | |
389 | if (sgt) { | 673 | "kind=%#02x APT=%-6s %c%c%c", |
390 | alloc = get_vidmem_page_alloc(sgt->sgl); | 674 | vm->name, |
391 | 675 | sgt ? "MAP" : "UNMAP", | |
392 | nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, | 676 | virt_addr, |
393 | page_alloc_chunk, list_entry) { | 677 | length, |
394 | if (space_to_skip && | 678 | sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL, |
395 | space_to_skip > chunk->length) { | 679 | space_to_skip, |
396 | space_to_skip -= chunk->length; | 680 | page_size >> 10, |
397 | } else { | 681 | nvgpu_gmmu_perm_str(attrs->rw_flag), |
398 | iova = chunk->base + space_to_skip; | 682 | attrs->kind_v, |
399 | length = chunk->length - space_to_skip; | 683 | nvgpu_aperture_str(attrs->aperture), |
400 | length = min(length, map_size); | 684 | attrs->cacheable ? 'C' : 'V', /* C = cached, V = volatile. */ |
401 | space_to_skip = 0; | 685 | attrs->sparse ? 'S' : '-', |
402 | 686 | attrs->priv ? 'P' : '-'); | |
403 | err = update_gmmu_level_locked(vm, | 687 | |
404 | &vm->pdb, pgsz_idx, | 688 | /* |
405 | &sgl, | 689 | * Handle VIDMEM programming. Currently uses a different scatter list |
406 | &space_to_skip, | 690 | * format. |
407 | &iova, | 691 | */ |
408 | gpu_va, gpu_va + length, | 692 | if (attrs->aperture == APERTURE_VIDMEM) |
409 | kind_v, &ctag, | 693 | err = __nvgpu_gmmu_update_page_table_vidmem(vm, |
410 | cacheable, unmapped_pte, | 694 | sgt, |
411 | rw_flag, sparse, 0, priv, | 695 | space_to_skip, |
412 | aperture); | 696 | virt_addr, |
413 | if (err) | 697 | length, |
414 | break; | 698 | attrs); |
415 | 699 | else | |
416 | /* need to set explicit zero here */ | 700 | err = __nvgpu_gmmu_update_page_table_sysmem(vm, |
417 | space_to_skip = 0; | 701 | sgt, |
418 | gpu_va += length; | 702 | space_to_skip, |
419 | map_size -= length; | 703 | virt_addr, |
420 | 704 | length, | |
421 | if (!map_size) | 705 | attrs); |
422 | break; | ||
423 | } | ||
424 | } | ||
425 | } else { | ||
426 | err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, | ||
427 | &sgl, | ||
428 | &space_to_skip, | ||
429 | &iova, | ||
430 | gpu_va, gpu_end, | ||
431 | kind_v, &ctag, | ||
432 | cacheable, unmapped_pte, rw_flag, | ||
433 | sparse, 0, priv, | ||
434 | aperture); | ||
435 | } | ||
436 | } else { | ||
437 | gmmu_dbg_v(g, | ||
438 | "pgsz=%-6d, gpu_va: %#-12llx +%#-6llx phys: %#-12llx " | ||
439 | "buffer offset: %-4lld, nents: %d", | ||
440 | page_size, | ||
441 | gpu_va, gpu_end - gpu_va, | ||
442 | sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL, | ||
443 | buffer_offset, | ||
444 | sgt ? sgt->nents : 0); | ||
445 | |||
446 | if (sgt) { | ||
447 | iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0); | ||
448 | if (!vm->mm->bypass_smmu && iova) { | ||
449 | iova += space_to_skip; | ||
450 | } else { | ||
451 | sgl = sgt->sgl; | ||
452 | |||
453 | gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", | ||
454 | (u64)sg_phys(sgl), | ||
455 | sgl->length); | ||
456 | |||
457 | while (space_to_skip && sgl && | ||
458 | space_to_skip + page_size > sgl->length) { | ||
459 | space_to_skip -= sgl->length; | ||
460 | sgl = sg_next(sgl); | ||
461 | gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", | ||
462 | (u64)sg_phys(sgl), | ||
463 | sgl->length); | ||
464 | } | ||
465 | |||
466 | iova = sg_phys(sgl) + space_to_skip; | ||
467 | } | ||
468 | } | ||
469 | |||
470 | err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, | ||
471 | &sgl, | ||
472 | &space_to_skip, | ||
473 | &iova, | ||
474 | gpu_va, gpu_end, | ||
475 | kind_v, &ctag, | ||
476 | cacheable, unmapped_pte, rw_flag, | ||
477 | sparse, 0, priv, | ||
478 | aperture); | ||
479 | } | ||
480 | 706 | ||
481 | unmap_gmmu_pages(g, &vm->pdb); | 707 | unmap_gmmu_pages(g, &vm->pdb); |
482 | |||
483 | mb(); | 708 | mb(); |
484 | 709 | ||
485 | gk20a_dbg_fn("done"); | 710 | __gmmu_dbg(g, attrs, "%-5s Done!", sgt ? "MAP" : "UNMAP"); |
486 | 711 | ||
487 | return err; | 712 | return err; |
488 | } | 713 | } |
@@ -500,32 +725,44 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm, | |||
500 | * have the update_gmmu_lock acquired. | 725 | * have the update_gmmu_lock acquired. |
501 | */ | 726 | */ |
502 | u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | 727 | u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, |
503 | u64 map_offset, | 728 | u64 vaddr, |
504 | struct sg_table *sgt, | 729 | struct sg_table *sgt, |
505 | u64 buffer_offset, | 730 | u64 buffer_offset, |
506 | u64 size, | 731 | u64 size, |
507 | int pgsz_idx, | 732 | int pgsz_idx, |
508 | u8 kind_v, | 733 | u8 kind_v, |
509 | u32 ctag_offset, | 734 | u32 ctag_offset, |
510 | u32 flags, | 735 | u32 flags, |
511 | int rw_flag, | 736 | int rw_flag, |
512 | bool clear_ctags, | 737 | bool clear_ctags, |
513 | bool sparse, | 738 | bool sparse, |
514 | bool priv, | 739 | bool priv, |
515 | struct vm_gk20a_mapping_batch *batch, | 740 | struct vm_gk20a_mapping_batch *batch, |
516 | enum nvgpu_aperture aperture) | 741 | enum nvgpu_aperture aperture) |
517 | { | 742 | { |
743 | struct gk20a *g = gk20a_from_vm(vm); | ||
518 | int err = 0; | 744 | int err = 0; |
519 | bool allocated = false; | 745 | bool allocated = false; |
520 | struct gk20a *g = gk20a_from_vm(vm); | ||
521 | int ctag_granularity = g->ops.fb.compression_page_size(g); | 746 | int ctag_granularity = g->ops.fb.compression_page_size(g); |
522 | u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity); | 747 | struct nvgpu_gmmu_attrs attrs = { |
523 | 748 | .pgsz = pgsz_idx, | |
524 | /* Allocate (or validate when map_offset != 0) the virtual address. */ | 749 | .kind_v = kind_v, |
525 | if (!map_offset) { | 750 | .ctag = (u64)ctag_offset * (u64)ctag_granularity, |
526 | map_offset = __nvgpu_vm_alloc_va(vm, size, | 751 | .cacheable = flags & NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, |
527 | pgsz_idx); | 752 | .rw_flag = rw_flag, |
528 | if (!map_offset) { | 753 | .sparse = sparse, |
754 | .priv = priv, | ||
755 | .valid = !(flags & NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE), | ||
756 | .aperture = aperture | ||
757 | }; | ||
758 | |||
759 | /* | ||
760 | * Only allocate a new GPU VA range if we haven't already been passed a | ||
761 | * GPU VA range. This facilitates fixed mappings. | ||
762 | */ | ||
763 | if (!vaddr) { | ||
764 | vaddr = __nvgpu_vm_alloc_va(vm, size, pgsz_idx); | ||
765 | if (!vaddr) { | ||
529 | nvgpu_err(g, "failed to allocate va space"); | 766 | nvgpu_err(g, "failed to allocate va space"); |
530 | err = -ENOMEM; | 767 | err = -ENOMEM; |
531 | goto fail_alloc; | 768 | goto fail_alloc; |
@@ -533,34 +770,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | |||
533 | allocated = true; | 770 | allocated = true; |
534 | } | 771 | } |
535 | 772 | ||
536 | gmmu_dbg(g, | 773 | err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset, |
537 | "gv: 0x%04x_%08x + 0x%-7llx " | 774 | vaddr, size, &attrs); |
538 | "[dma: 0x%02x_%08x, pa: 0x%02x_%08x] " | ||
539 | "pgsz=%-3dKb as=%-2d ctags=%d start=%d " | ||
540 | "kind=0x%x flags=0x%x apt=%s", | ||
541 | u64_hi32(map_offset), u64_lo32(map_offset), size, | ||
542 | sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0, | ||
543 | sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0, | ||
544 | sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0, | ||
545 | sgt ? u64_lo32((u64)sg_phys(sgt->sgl)) : 0, | ||
546 | vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm), | ||
547 | ctag_lines, ctag_offset, | ||
548 | kind_v, flags, nvgpu_aperture_str(aperture)); | ||
549 | |||
550 | err = update_gmmu_ptes_locked(vm, pgsz_idx, | ||
551 | sgt, | ||
552 | buffer_offset, | ||
553 | map_offset, map_offset + size, | ||
554 | kind_v, | ||
555 | ctag_offset, | ||
556 | flags & | ||
557 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, | ||
558 | flags & | ||
559 | NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE, | ||
560 | rw_flag, | ||
561 | sparse, | ||
562 | priv, | ||
563 | aperture); | ||
564 | if (err) { | 775 | if (err) { |
565 | nvgpu_err(g, "failed to update ptes on map"); | 776 | nvgpu_err(g, "failed to update ptes on map"); |
566 | goto fail_validate; | 777 | goto fail_validate; |
@@ -571,26 +782,37 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | |||
571 | else | 782 | else |
572 | batch->need_tlb_invalidate = true; | 783 | batch->need_tlb_invalidate = true; |
573 | 784 | ||
574 | return map_offset; | 785 | return vaddr; |
575 | fail_validate: | 786 | fail_validate: |
576 | if (allocated) | 787 | if (allocated) |
577 | __nvgpu_vm_free_va(vm, map_offset, pgsz_idx); | 788 | __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); |
578 | fail_alloc: | 789 | fail_alloc: |
579 | nvgpu_err(g, "%s: failed with err=%d", __func__, err); | 790 | nvgpu_err(g, "%s: failed with err=%d", __func__, err); |
580 | return 0; | 791 | return 0; |
581 | } | 792 | } |
582 | 793 | ||
583 | void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, | 794 | void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, |
584 | u64 vaddr, | 795 | u64 vaddr, |
585 | u64 size, | 796 | u64 size, |
586 | int pgsz_idx, | 797 | int pgsz_idx, |
587 | bool va_allocated, | 798 | bool va_allocated, |
588 | int rw_flag, | 799 | int rw_flag, |
589 | bool sparse, | 800 | bool sparse, |
590 | struct vm_gk20a_mapping_batch *batch) | 801 | struct vm_gk20a_mapping_batch *batch) |
591 | { | 802 | { |
592 | int err = 0; | 803 | int err = 0; |
593 | struct gk20a *g = gk20a_from_vm(vm); | 804 | struct gk20a *g = gk20a_from_vm(vm); |
805 | struct nvgpu_gmmu_attrs attrs = { | ||
806 | .pgsz = pgsz_idx, | ||
807 | .kind_v = 0, | ||
808 | .ctag = 0, | ||
809 | .cacheable = 0, | ||
810 | .rw_flag = rw_flag, | ||
811 | .sparse = sparse, | ||
812 | .priv = 0, | ||
813 | .valid = 0, | ||
814 | .aperture = APERTURE_INVALID, | ||
815 | }; | ||
594 | 816 | ||
595 | if (va_allocated) { | 817 | if (va_allocated) { |
596 | err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); | 818 | err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); |
@@ -601,27 +823,11 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, | |||
601 | } | 823 | } |
602 | 824 | ||
603 | /* unmap here needs to know the page size we assigned at mapping */ | 825 | /* unmap here needs to know the page size we assigned at mapping */ |
604 | err = update_gmmu_ptes_locked(vm, | 826 | err = __nvgpu_gmmu_update_page_table(vm, NULL, 0, |
605 | pgsz_idx, | 827 | vaddr, size, &attrs); |
606 | NULL, /* n/a for unmap */ | ||
607 | 0, | ||
608 | vaddr, | ||
609 | vaddr + size, | ||
610 | 0, 0, false /* n/a for unmap */, | ||
611 | false, rw_flag, | ||
612 | sparse, 0, | ||
613 | APERTURE_INVALID); /* don't care for unmap */ | ||
614 | if (err) | 828 | if (err) |
615 | nvgpu_err(g, "failed to update gmmu ptes on unmap"); | 829 | nvgpu_err(g, "failed to update gmmu ptes on unmap"); |
616 | 830 | ||
617 | /* flush l2 so any dirty lines are written out *now*. | ||
618 | * also as we could potentially be switching this buffer | ||
619 | * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at | ||
620 | * some point in the future we need to invalidate l2. e.g. switching | ||
621 | * from a render buffer unmap (here) to later using the same memory | ||
622 | * for gmmu ptes. note the positioning of this relative to any smmu | ||
623 | * unmapping (below). */ | ||
624 | |||
625 | if (!batch) { | 831 | if (!batch) { |
626 | gk20a_mm_l2_flush(g, true); | 832 | gk20a_mm_l2_flush(g, true); |
627 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); | 833 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); |
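
Both the VIDMEM and SYSMEM update helpers added above walk their chunks the same way: leading chunks are consumed until the buffer offset (space_to_skip) is exhausted, then the remainder is mapped chunk by chunk until length reaches zero. A small self-contained model of that walk, using an invented chunk array in place of a real scatter-gather list:

/*
 * Toy model of the chunk walk in the new update-page-table helpers. The
 * chunk bases/lengths, offset, and virtual address are made-up example data.
 */
#include <stdio.h>
#include <stdint.h>

struct chunk { uint64_t base; uint64_t length; };

int main(void)
{
	struct chunk chunks[] = {
		{ 0x80000000ULL, 0x4000 },
		{ 0x90000000ULL, 0x8000 },
		{ 0xa0000000ULL, 0x4000 },
	};
	uint64_t space_to_skip = 0x6000;	/* buffer offset into the allocation */
	uint64_t virt_addr = 0x100000000ULL;
	uint64_t length = 0x5000;		/* bytes left to map */
	size_t i;

	for (i = 0; i < sizeof(chunks) / sizeof(chunks[0]) && length; i++) {
		struct chunk *c = &chunks[i];
		uint64_t phys_addr, chunk_length;

		/* Whole chunk falls inside the offset being skipped. */
		if (space_to_skip && space_to_skip >= c->length) {
			space_to_skip -= c->length;
			continue;
		}

		phys_addr = c->base + space_to_skip;
		chunk_length = c->length - space_to_skip;
		if (chunk_length > length)
			chunk_length = length;

		printf("map virt=%#llx -> phys=%#llx len=%#llx\n",
		       (unsigned long long)virt_addr,
		       (unsigned long long)phys_addr,
		       (unsigned long long)chunk_length);

		/* The offset only applies to the first partially skipped chunk. */
		space_to_skip = 0;
		virt_addr += chunk_length;
		length -= chunk_length;
	}
	return 0;
}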
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c
index 88622eca..3aeba500 100644
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -36,7 +36,7 @@ int vm_aspace_id(struct vm_gk20a *vm) | |||
36 | } | 36 | } |
37 | 37 | ||
38 | static void nvgpu_vm_free_entries(struct vm_gk20a *vm, | 38 | static void nvgpu_vm_free_entries(struct vm_gk20a *vm, |
39 | struct gk20a_mm_entry *parent, | 39 | struct nvgpu_gmmu_pd *parent, |
40 | int level) | 40 | int level) |
41 | { | 41 | { |
42 | int i; | 42 | int i; |
@@ -75,8 +75,6 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, | |||
75 | 75 | ||
76 | /* Be certain we round up to page_size if needed */ | 76 | /* Be certain we round up to page_size if needed */ |
77 | size = (size + ((u64)page_size - 1)) & ~((u64)page_size - 1); | 77 | size = (size + ((u64)page_size - 1)) & ~((u64)page_size - 1); |
78 | nvgpu_log(g, gpu_dbg_map, "size=0x%llx @ pgsz=%dKB", size, | ||
79 | vm->gmmu_page_sizes[pgsz_idx] >> 10); | ||
80 | 78 | ||
81 | addr = nvgpu_alloc(vma, size); | 79 | addr = nvgpu_alloc(vma, size); |
82 | if (!addr) { | 80 | if (!addr) { |
@@ -84,17 +82,14 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, | |||
84 | return 0; | 82 | return 0; |
85 | } | 83 | } |
86 | 84 | ||
87 | nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr); | ||
88 | return addr; | 85 | return addr; |
89 | } | 86 | } |
90 | 87 | ||
91 | int __nvgpu_vm_free_va(struct vm_gk20a *vm, u64 addr, | 88 | int __nvgpu_vm_free_va(struct vm_gk20a *vm, u64 addr, |
92 | enum gmmu_pgsz_gk20a pgsz_idx) | 89 | enum gmmu_pgsz_gk20a pgsz_idx) |
93 | { | 90 | { |
94 | struct gk20a *g = vm->mm->g; | ||
95 | struct nvgpu_allocator *vma = vm->vma[pgsz_idx]; | 91 | struct nvgpu_allocator *vma = vm->vma[pgsz_idx]; |
96 | 92 | ||
97 | nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr); | ||
98 | nvgpu_free(vma, addr); | 93 | nvgpu_free(vma, addr); |
99 | 94 | ||
100 | return 0; | 95 | return 0; |
@@ -127,32 +122,6 @@ void nvgpu_vm_mapping_batch_finish(struct vm_gk20a *vm, | |||
127 | nvgpu_mutex_release(&vm->update_gmmu_lock); | 122 | nvgpu_mutex_release(&vm->update_gmmu_lock); |
128 | } | 123 | } |
129 | 124 | ||
130 | static int nvgpu_vm_init_page_tables(struct vm_gk20a *vm) | ||
131 | { | ||
132 | u32 pde_lo, pde_hi; | ||
133 | int err; | ||
134 | |||
135 | pde_range_from_vaddr_range(vm, | ||
136 | 0, vm->va_limit-1, | ||
137 | &pde_lo, &pde_hi); | ||
138 | vm->pdb.entries = nvgpu_vzalloc(vm->mm->g, | ||
139 | sizeof(struct gk20a_mm_entry) * | ||
140 | (pde_hi + 1)); | ||
141 | vm->pdb.num_entries = pde_hi + 1; | ||
142 | |||
143 | if (!vm->pdb.entries) | ||
144 | return -ENOMEM; | ||
145 | |||
146 | err = nvgpu_zalloc_gmmu_page_table(vm, 0, &vm->mmu_levels[0], | ||
147 | &vm->pdb, NULL); | ||
148 | if (err) { | ||
149 | nvgpu_vfree(vm->mm->g, vm->pdb.entries); | ||
150 | return err; | ||
151 | } | ||
152 | |||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | /* | 125 | /* |
157 | * Determine if the passed address space can support big pages or not. | 126 | * Determine if the passed address space can support big pages or not. |
158 | */ | 127 | */ |
@@ -280,7 +249,8 @@ static int __nvgpu_vm_init(struct mm_gk20a *mm, | |||
280 | #endif | 249 | #endif |
281 | 250 | ||
282 | /* Initialize the page table data structures. */ | 251 | /* Initialize the page table data structures. */ |
283 | err = nvgpu_vm_init_page_tables(vm); | 252 | strncpy(vm->name, name, min(strlen(name), sizeof(vm->name))); |
253 | err = nvgpu_gmmu_init_page_table(vm); | ||
284 | if (err) | 254 | if (err) |
285 | goto clean_up_vgpu_vm; | 255 | goto clean_up_vgpu_vm; |
286 | 256 | ||