Diffstat (limited to 'drivers/gpu/nvgpu')

 drivers/gpu/nvgpu/common/mm/gmmu.c           | 976
 drivers/gpu/nvgpu/common/mm/vm.c             |  36
 drivers/gpu/nvgpu/gk20a/fb_gk20a.c           |   2
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c           | 306
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h           |  16
 drivers/gpu/nvgpu/gp10b/mm_gp10b.c           | 309
 drivers/gpu/nvgpu/include/nvgpu/gmmu.h       | 136
 drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h  |   6
 drivers/gpu/nvgpu/include/nvgpu/vm.h         |   3

 9 files changed, 979 insertions(+), 811 deletions(-)
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c
index 06291600..ec1bc095 100644
--- a/drivers/gpu/nvgpu/common/mm/gmmu.c
+++ b/drivers/gpu/nvgpu/common/mm/gmmu.c
@@ -25,115 +25,26 @@ | |||
25 | #include "gk20a/gk20a.h" | 25 | #include "gk20a/gk20a.h" |
26 | #include "gk20a/mm_gk20a.h" | 26 | #include "gk20a/mm_gk20a.h" |
27 | 27 | ||
28 | #define gmmu_dbg(g, fmt, args...) \ | 28 | #define __gmmu_dbg(g, attrs, fmt, args...) \ |
29 | nvgpu_log(g, gpu_dbg_map, fmt, ##args) | 29 | do { \ |
30 | #define gmmu_dbg_v(g, fmt, args...) \ | 30 | if (attrs->debug) \ |
31 | nvgpu_log(g, gpu_dbg_map_v, fmt, ##args) | 31 | nvgpu_info(g, fmt, ##args); \ |
32 | 32 | else \ | |
33 | static int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) | 33 | nvgpu_log(g, gpu_dbg_map, fmt, ##args); \ |
34 | { | 34 | } while (0) |
35 | return nvgpu_mem_begin(g, &entry->mem); | 35 | |
36 | } | 36 | #define __gmmu_dbg_v(g, attrs, fmt, args...) \ |
37 | 37 | do { \ | |
38 | static void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) | 38 | if (attrs->debug) \ |
39 | { | 39 | nvgpu_info(g, fmt, ##args); \ |
40 | nvgpu_mem_end(g, &entry->mem); | 40 | else \ |
41 | } | 41 | nvgpu_log(g, gpu_dbg_map_v, fmt, ##args); \ |
42 | 42 | } while (0) | |
43 | static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 order, | 43 | |
44 | struct gk20a_mm_entry *entry) | 44 | static int pd_allocate(struct vm_gk20a *vm, |
45 | { | 45 | struct nvgpu_gmmu_pd *pd, |
46 | struct gk20a *g = gk20a_from_vm(vm); | 46 | const struct gk20a_mmu_level *l, |
47 | u32 num_pages = 1 << order; | 47 | struct nvgpu_gmmu_attrs *attrs); |
48 | u32 len = num_pages * PAGE_SIZE; | ||
49 | int err; | ||
50 | |||
51 | err = nvgpu_dma_alloc(g, len, &entry->mem); | ||
52 | |||
53 | if (err) { | ||
54 | nvgpu_err(g, "memory allocation failed"); | ||
55 | return -ENOMEM; | ||
56 | } | ||
57 | |||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | ||
62 | struct gk20a_mm_entry *entry) | ||
63 | { | ||
64 | struct gk20a *g = gk20a_from_vm(vm); | ||
65 | |||
66 | if (!entry->mem.size) | ||
67 | return; | ||
68 | |||
69 | if (entry->woffset) /* fake shadow mem */ | ||
70 | return; | ||
71 | |||
72 | nvgpu_dma_free(g, &entry->mem); | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Allocate a phys contig region big enough for a full | ||
77 | * sized gmmu page table for the given gmmu_page_size. | ||
78 | * the whole range is zeroed so it's "invalid"/will fault. | ||
79 | * | ||
80 | * If a previous entry is supplied, its memory will be used for | ||
81 | * suballocation for this next entry too, if there is space. | ||
82 | */ | ||
83 | int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm, | ||
84 | enum gmmu_pgsz_gk20a pgsz_idx, | ||
85 | const struct gk20a_mmu_level *l, | ||
86 | struct gk20a_mm_entry *entry, | ||
87 | struct gk20a_mm_entry *prev_entry) | ||
88 | { | ||
89 | int err = -ENOMEM; | ||
90 | int order; | ||
91 | struct gk20a *g = gk20a_from_vm(vm); | ||
92 | u32 bytes; | ||
93 | |||
94 | /* allocate enough pages for the table */ | ||
95 | order = l->hi_bit[pgsz_idx] - l->lo_bit[pgsz_idx] + 1; | ||
96 | order += ilog2(l->entry_size); | ||
97 | bytes = 1 << order; | ||
98 | order -= PAGE_SHIFT; | ||
99 | if (order < 0 && prev_entry) { | ||
100 | /* try to suballocate from previous chunk */ | ||
101 | u32 capacity = prev_entry->mem.size / bytes; | ||
102 | u32 prev = prev_entry->woffset * sizeof(u32) / bytes; | ||
103 | u32 free = capacity - prev - 1; | ||
104 | |||
105 | nvgpu_log(g, gpu_dbg_pte, "cap %d prev %d free %d bytes %d", | ||
106 | capacity, prev, free, bytes); | ||
107 | |||
108 | if (free) { | ||
109 | memcpy(&entry->mem, &prev_entry->mem, | ||
110 | sizeof(entry->mem)); | ||
111 | entry->woffset = prev_entry->woffset | ||
112 | + bytes / sizeof(u32); | ||
113 | err = 0; | ||
114 | } | ||
115 | } | ||
116 | |||
117 | if (err) { | ||
118 | /* no suballoc space */ | ||
119 | order = max(0, order); | ||
120 | err = nvgpu_alloc_gmmu_pages(vm, order, entry); | ||
121 | entry->woffset = 0; | ||
122 | } | ||
123 | |||
124 | nvgpu_log(g, gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d, woff %x", | ||
125 | entry, | ||
126 | (entry->mem.priv.sgt && | ||
127 | entry->mem.aperture == APERTURE_SYSMEM) ? | ||
128 | g->ops.mm.get_iova_addr(g, entry->mem.priv.sgt->sgl, 0) : 0, | ||
129 | order, entry->woffset); | ||
130 | if (err) | ||
131 | return err; | ||
132 | entry->pgsz = pgsz_idx; | ||
133 | entry->mem.skip_wmb = true; | ||
134 | |||
135 | return err; | ||
136 | } | ||
137 | 48 | ||
138 | /* | 49 | /* |
139 | * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU | 50 | * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU |
@@ -225,103 +136,484 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va) | |||
225 | nvgpu_mutex_release(&vm->update_gmmu_lock); | 136 | nvgpu_mutex_release(&vm->update_gmmu_lock); |
226 | } | 137 | } |
227 | 138 | ||
228 | static int update_gmmu_level_locked(struct vm_gk20a *vm, | 139 | int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) |
229 | struct gk20a_mm_entry *pte, | 140 | { |
230 | enum gmmu_pgsz_gk20a pgsz_idx, | 141 | /* |
231 | struct scatterlist **sgl, | 142 | * Need this just for page size. Everything else can be ignored. Also |
232 | u64 *offset, | 143 | * note that we can just use pgsz 0 (i.e small pages) since the number |
233 | u64 *iova, | 144 | * of bits present in the top level PDE are the same for small/large |
234 | u64 gpu_va, u64 gpu_end, | 145 | * page VMs. |
235 | u8 kind_v, u64 *ctag, | 146 | */ |
236 | bool cacheable, bool unmapped_pte, | 147 | struct nvgpu_gmmu_attrs attrs = { |
237 | int rw_flag, | 148 | .pgsz = 0, |
238 | bool sparse, | 149 | }; |
239 | int lvl, | 150 | |
240 | bool priv, | 151 | return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs); |
241 | enum nvgpu_aperture aperture) | 152 | } |
153 | |||
154 | |||
155 | /* | ||
156 | * Ensure that there's a CPU mapping for the page directory memory. This won't | ||
157 | * always be the case for 32 bit systems since we may need to save kernel | ||
158 | * virtual memory. | ||
159 | */ | ||
160 | static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) | ||
161 | { | ||
162 | return nvgpu_mem_begin(g, &entry->mem); | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * Handle any necessary CPU unmap semantics for a page directory's DMA memory. | ||
167 | * For 64 bit platforms this is a noop. | ||
168 | */ | ||
169 | static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) | ||
170 | { | ||
171 | nvgpu_mem_end(g, &entry->mem); | ||
172 | } | ||
173 | |||
174 | static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes, | ||
175 | struct nvgpu_gmmu_pd *pd) | ||
242 | { | 176 | { |
243 | struct gk20a *g = gk20a_from_vm(vm); | 177 | struct gk20a *g = gk20a_from_vm(vm); |
244 | const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; | 178 | unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS; |
245 | const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1]; | 179 | int err; |
246 | int err = 0; | 180 | |
247 | u32 pde_i; | 181 | /* |
248 | u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx]; | 182 | * On arm32 vmalloc space is a precious commodity so we do not map pages |
249 | struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL; | 183 | * by default. |
184 | */ | ||
185 | if (!IS_ENABLED(CONFIG_ARM64)) | ||
186 | flags |= NVGPU_DMA_NO_KERNEL_MAPPING; | ||
187 | |||
188 | err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem); | ||
189 | if (err) | ||
190 | return -ENOMEM; | ||
191 | |||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | ||
196 | struct nvgpu_gmmu_pd *pd) | ||
197 | { | ||
198 | struct gk20a *g = gk20a_from_vm(vm); | ||
199 | |||
200 | nvgpu_dma_free(g, &pd->mem); | ||
201 | } | ||
202 | |||
203 | /* | ||
204 | * Return the _physical_ address of a page directory. | ||
205 | */ | ||
206 | u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd) | ||
207 | { | ||
208 | if (g->mm.has_physical_mode) | ||
209 | return sg_phys(pd->mem.priv.sgt->sgl); | ||
210 | else | ||
211 | return nvgpu_mem_get_base_addr(g, &pd->mem, 0); | ||
212 | } | ||
213 | |||
214 | /* | ||
215 | * Return the aligned length based on the page size in attrs. | ||
216 | */ | ||
217 | static u64 nvgpu_align_map_length(struct vm_gk20a *vm, u64 length, | ||
218 | struct nvgpu_gmmu_attrs *attrs) | ||
219 | { | ||
220 | u64 page_size = vm->gmmu_page_sizes[attrs->pgsz]; | ||
221 | |||
222 | return ALIGN(length, page_size); | ||
223 | } | ||
224 | |||
225 | static u32 pd_entries(const struct gk20a_mmu_level *l, | ||
226 | struct nvgpu_gmmu_attrs *attrs) | ||
227 | { | ||
228 | /* | ||
229 | * Number of entries in a PD is easy to compute from the number of bits | ||
230 | * used to index the page directory. That is simply 2 raised to the | ||
231 | * number of bits. | ||
232 | */ | ||
233 | return 1UL << (l->hi_bit[attrs->pgsz] - l->lo_bit[attrs->pgsz] + 1UL); | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * Computes the size of a PD table. | ||
238 | */ | ||
239 | static u32 pd_size(const struct gk20a_mmu_level *l, | ||
240 | struct nvgpu_gmmu_attrs *attrs) | ||
241 | { | ||
242 | return pd_entries(l, attrs) * l->entry_size; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Allocate a physically contiguous region big enough for a gmmu page table | ||
247 | * of the specified level and page size. The whole range is zeroed so that any | ||
248 | * accesses will fault until proper values are programmed. | ||
249 | */ | ||
250 | static int pd_allocate(struct vm_gk20a *vm, | ||
251 | struct nvgpu_gmmu_pd *pd, | ||
252 | const struct gk20a_mmu_level *l, | ||
253 | struct nvgpu_gmmu_attrs *attrs) | ||
254 | { | ||
255 | int err; | ||
250 | 256 | ||
251 | gk20a_dbg_fn(""); | 257 | if (pd->mem.size) |
258 | return 0; | ||
252 | 259 | ||
253 | pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL)) | 260 | err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd); |
254 | >> (u64)l->lo_bit[pgsz_idx]; | 261 | if (err) { |
262 | nvgpu_info(vm->mm->g, "error allocating page directory!"); | ||
263 | return err; | ||
264 | } | ||
255 | 265 | ||
256 | gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx", | 266 | /* |
257 | pgsz_idx, lvl, gpu_va, gpu_end-1, *iova); | 267 | * One mb() is done after all mapping operations. Don't need individual |
268 | * barriers for each PD write. | ||
269 | */ | ||
270 | pd->mem.skip_wmb = true; | ||
258 | 271 | ||
259 | while (gpu_va < gpu_end) { | 272 | return 0; |
260 | u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end); | 273 | } |
261 | 274 | ||
262 | /* Allocate next level */ | 275 | /* |
276 | * Compute what page directory index at the passed level the passed virtual | ||
277 | * address corresponds to. @attrs is necessary for determining the page size | ||
278 | * which is used to pick the right bit offsets for the GMMU level. | ||
279 | */ | ||
280 | static u32 pd_index(const struct gk20a_mmu_level *l, u64 virt, | ||
281 | struct nvgpu_gmmu_attrs *attrs) | ||
282 | { | ||
283 | u64 pd_mask = (1ULL << ((u64)l->hi_bit[attrs->pgsz] + 1)) - 1ULL; | ||
284 | u32 pd_shift = (u64)l->lo_bit[attrs->pgsz]; | ||
285 | |||
286 | /* | ||
287 | * For convenience we don't bother computing the lower bound of the | ||
288 | * mask; it's easier to just shift it off. | ||
289 | */ | ||
290 | return (virt & pd_mask) >> pd_shift; | ||
291 | } | ||
292 | |||
293 | static int pd_allocate_children(struct vm_gk20a *vm, | ||
294 | const struct gk20a_mmu_level *l, | ||
295 | struct nvgpu_gmmu_pd *pd, | ||
296 | struct nvgpu_gmmu_attrs *attrs) | ||
297 | { | ||
298 | struct gk20a *g = gk20a_from_vm(vm); | ||
299 | |||
300 | if (pd->entries) | ||
301 | return 0; | ||
302 | |||
303 | pd->num_entries = pd_entries(l, attrs); | ||
304 | pd->entries = nvgpu_vzalloc(g, sizeof(struct nvgpu_gmmu_pd) * | ||
305 | pd->num_entries); | ||
306 | if (!pd->entries) | ||
307 | return -ENOMEM; | ||
308 | |||
309 | return 0; | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * This function programs the GMMU based on two ranges: a physical range and a | ||
314 | * GPU virtual range. The virtual is mapped to the physical. Physical in this | ||
315 | * case can mean either a real physical sysmem address or an IO virtual address | ||
316 | * (for instance when a system has an IOMMU running). | ||
317 | * | ||
318 | * The rest of the parameters are for describing the actual mapping itself. | ||
319 | * | ||
320 | * This function recursively calls itself for handling PDEs. At the final level | ||
321 | * a PTE handler is called. The phys and virt ranges are adjusted for each | ||
322 | * recursion so that each invocation of this function need only worry about the | ||
323 | * range it is passed. | ||
324 | * | ||
325 | * phys_addr will always point to a contiguous range - the discontiguous nature | ||
326 | * of DMA buffers is taken care of at the layer above this. | ||
327 | */ | ||
328 | static int __set_pd_level(struct vm_gk20a *vm, | ||
329 | struct nvgpu_gmmu_pd *pd, | ||
330 | int lvl, | ||
331 | u64 phys_addr, | ||
332 | u64 virt_addr, u64 length, | ||
333 | struct nvgpu_gmmu_attrs *attrs) | ||
334 | { | ||
335 | int err = 0; | ||
336 | u64 pde_range; | ||
337 | struct gk20a *g = gk20a_from_vm(vm); | ||
338 | struct nvgpu_gmmu_pd *next_pd = NULL; | ||
339 | const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; | ||
340 | const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl + 1]; | ||
341 | |||
342 | /* | ||
343 | * 5 levels for Pascal+. For pre-pascal we only have 2. This puts | ||
344 | * offsets into the page table debugging code which makes it easier to | ||
345 | * see what level prints are from. | ||
346 | */ | ||
347 | static const char *__lvl_debug[] = { | ||
348 | "", /* L=0 */ | ||
349 | " ", /* L=1 */ | ||
350 | " ", /* L=2 */ | ||
351 | " ", /* L=3 */ | ||
352 | " ", /* L=4 */ | ||
353 | }; | ||
354 | |||
355 | pde_range = 1ULL << (u64)l->lo_bit[attrs->pgsz]; | ||
356 | |||
357 | __gmmu_dbg_v(g, attrs, | ||
358 | "L=%d %sGPU virt %#-12llx +%#-9llx -> phys %#-12llx", | ||
359 | lvl, | ||
360 | __lvl_debug[lvl], | ||
361 | virt_addr, | ||
362 | length, | ||
363 | phys_addr); | ||
364 | |||
365 | /* | ||
366 | * Iterate across the mapping in chunks the size of this level's PDE. | ||
367 | * For each of those chunks program our level's PDE and then, if there's | ||
368 | * a next level, program the next level's PDEs/PTEs. | ||
369 | */ | ||
370 | while (length) { | ||
371 | u32 pd_idx = pd_index(l, virt_addr, attrs); | ||
372 | u64 chunk_size; | ||
373 | u64 target_addr; | ||
374 | |||
375 | /* | ||
376 | * Truncate the pde_range when the virtual address does not | ||
377 | * start at a PDE boundary. | ||
378 | */ | ||
379 | chunk_size = min(length, | ||
380 | pde_range - (virt_addr & (pde_range - 1))); | ||
381 | |||
382 | /* | ||
383 | * If the next level has an update_entry function then we know | ||
384 | * that _this_ level points to PDEs (not PTEs). Thus we need to | ||
385 | * have a bunch of children PDs. | ||
386 | */ | ||
263 | if (next_l->update_entry) { | 387 | if (next_l->update_entry) { |
264 | if (!pte->entries) { | 388 | if (pd_allocate_children(vm, l, pd, attrs)) |
265 | int num_entries = | 389 | return -ENOMEM; |
266 | 1 << | 390 | |
267 | (l->hi_bit[pgsz_idx] | 391 | /* |
268 | - l->lo_bit[pgsz_idx] + 1); | 392 | * Get the next PD so that we know what to put in this |
269 | pte->entries = | 393 | * current PD. If the next level is actually PTEs then |
270 | nvgpu_vzalloc(g, | 394 | * we don't need this - we will just use the real |
271 | sizeof(struct gk20a_mm_entry) * | 395 | * physical target. |
272 | num_entries); | 396 | */ |
273 | if (!pte->entries) | 397 | next_pd = &pd->entries[pd_idx]; |
274 | return -ENOMEM; | 398 | |
275 | pte->pgsz = pgsz_idx; | 399 | /* |
276 | pte->num_entries = num_entries; | 400 | * Allocate the backing memory for next_pd. |
277 | } | 401 | */ |
278 | prev_pte = next_pte; | 402 | if (pd_allocate(vm, next_pd, next_l, attrs)) |
279 | next_pte = pte->entries + pde_i; | 403 | return -ENOMEM; |
280 | |||
281 | if (!next_pte->mem.size) { | ||
282 | err = nvgpu_zalloc_gmmu_page_table(vm, | ||
283 | pgsz_idx, next_l, next_pte, prev_pte); | ||
284 | if (err) | ||
285 | return err; | ||
286 | } | ||
287 | } | 404 | } |
288 | 405 | ||
289 | err = l->update_entry(vm, pte, pde_i, pgsz_idx, | 406 | /* |
290 | sgl, offset, iova, | 407 | * This is the address we want to program into the actual PDE/ |
291 | kind_v, ctag, cacheable, unmapped_pte, | 408 | * PTE. When the next level is PDEs we need the target address |
292 | rw_flag, sparse, priv, aperture); | 409 | * to be the table of PDEs. When the next level is PTEs the |
293 | if (err) | 410 | * target addr is the real physical address we are aiming for. |
294 | return err; | 411 | */ |
412 | target_addr = next_pd ? nvgpu_pde_phys_addr(g, next_pd) : | ||
413 | phys_addr; | ||
414 | |||
415 | l->update_entry(vm, l, | ||
416 | pd, pd_idx, | ||
417 | virt_addr, | ||
418 | target_addr, | ||
419 | attrs); | ||
295 | 420 | ||
296 | if (next_l->update_entry) { | 421 | if (next_l->update_entry) { |
297 | /* get cpu access to the ptes */ | 422 | err = map_gmmu_pages(g, next_pd); |
298 | err = map_gmmu_pages(g, next_pte); | ||
299 | if (err) { | 423 | if (err) { |
300 | nvgpu_err(g, | 424 | nvgpu_err(g, |
301 | "couldn't map ptes for update as=%d", | 425 | "couldn't map ptes for update as=%d", |
302 | vm_aspace_id(vm)); | 426 | vm_aspace_id(vm)); |
303 | return err; | 427 | return err; |
304 | } | 428 | } |
305 | err = update_gmmu_level_locked(vm, next_pte, | 429 | |
306 | pgsz_idx, | 430 | err = __set_pd_level(vm, next_pd, |
307 | sgl, | 431 | lvl + 1, |
308 | offset, | 432 | phys_addr, |
309 | iova, | 433 | virt_addr, |
310 | gpu_va, | 434 | chunk_size, |
311 | next, | 435 | attrs); |
312 | kind_v, ctag, cacheable, unmapped_pte, | 436 | unmap_gmmu_pages(g, next_pd); |
313 | rw_flag, sparse, lvl+1, priv, aperture); | ||
314 | unmap_gmmu_pages(g, next_pte); | ||
315 | 437 | ||
316 | if (err) | 438 | if (err) |
317 | return err; | 439 | return err; |
318 | } | 440 | } |
319 | 441 | ||
320 | pde_i++; | 442 | virt_addr += chunk_size; |
321 | gpu_va = next; | 443 | |
444 | /* | ||
445 | * Only add to phys_addr if it's non-zero. A zero value implies | ||
446 | * we are unmapping and as a result we don't want to place | ||
447 | * non-zero phys addresses in the PTEs. A non-zero phys-addr | ||
448 | * would also confuse the lower level PTE programming code. | ||
449 | */ | ||
450 | if (phys_addr) | ||
451 | phys_addr += chunk_size; | ||
452 | length -= chunk_size; | ||
453 | } | ||
454 | |||
455 | __gmmu_dbg_v(g, attrs, "L=%d %s%s", lvl, __lvl_debug[lvl], "ret!"); | ||
456 | |||
457 | return 0; | ||
458 | } | ||
459 | |||
460 | /* | ||
461 | * VIDMEM version of the update_ptes logic. | ||
462 | */ | ||
463 | static int __nvgpu_gmmu_update_page_table_vidmem(struct vm_gk20a *vm, | ||
464 | struct sg_table *sgt, | ||
465 | u64 space_to_skip, | ||
466 | u64 virt_addr, | ||
467 | u64 length, | ||
468 | struct nvgpu_gmmu_attrs *attrs) | ||
469 | { | ||
470 | struct nvgpu_page_alloc *alloc = NULL; | ||
471 | struct page_alloc_chunk *chunk = NULL; | ||
472 | u64 phys_addr, chunk_length; | ||
473 | int err = 0; | ||
474 | |||
475 | if (!sgt) { | ||
476 | /* | ||
477 | * This is considered an unmap. Just pass in 0 as the physical | ||
478 | * address for the entire GPU range. | ||
479 | */ | ||
480 | err = __set_pd_level(vm, &vm->pdb, | ||
481 | 0, | ||
482 | 0, | ||
483 | virt_addr, length, | ||
484 | attrs); | ||
485 | return err; | ||
486 | } | ||
487 | |||
488 | alloc = get_vidmem_page_alloc(sgt->sgl); | ||
489 | |||
490 | /* | ||
491 | * Otherwise iterate across all the chunks in this allocation and | ||
492 | * map them. | ||
493 | */ | ||
494 | nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, | ||
495 | page_alloc_chunk, list_entry) { | ||
496 | if (space_to_skip && | ||
497 | space_to_skip >= chunk->length) { | ||
498 | space_to_skip -= chunk->length; | ||
499 | continue; | ||
500 | } | ||
501 | |||
502 | phys_addr = chunk->base + space_to_skip; | ||
503 | chunk_length = min(length, (chunk->length - space_to_skip)); | ||
504 | |||
505 | err = __set_pd_level(vm, &vm->pdb, | ||
506 | 0, | ||
507 | phys_addr, | ||
508 | virt_addr, length, | ||
509 | attrs); | ||
510 | if (err) | ||
511 | break; | ||
512 | |||
513 | /* Space has been skipped so zero this for future chunks. */ | ||
514 | space_to_skip = 0; | ||
515 | |||
516 | /* | ||
517 | * Update the map pointer and the remaining length. | ||
518 | */ | ||
519 | virt_addr += chunk_length; | ||
520 | length -= chunk_length; | ||
521 | |||
522 | if (length == 0) | ||
523 | break; | ||
322 | } | 524 | } |
323 | 525 | ||
324 | gk20a_dbg_fn("done"); | 526 | return err; |
527 | } | ||
528 | |||
529 | static int __nvgpu_gmmu_update_page_table_sysmem(struct vm_gk20a *vm, | ||
530 | struct sg_table *sgt, | ||
531 | u64 space_to_skip, | ||
532 | u64 virt_addr, | ||
533 | u64 length, | ||
534 | struct nvgpu_gmmu_attrs *attrs) | ||
535 | { | ||
536 | int err; | ||
537 | struct scatterlist *sgl; | ||
538 | struct gk20a *g = gk20a_from_vm(vm); | ||
539 | |||
540 | if (!sgt) { | ||
541 | /* | ||
542 | * This is considered an unmap. Just pass in 0 as the physical | ||
543 | * address for the entire GPU range. | ||
544 | */ | ||
545 | err = __set_pd_level(vm, &vm->pdb, | ||
546 | 0, | ||
547 | 0, | ||
548 | virt_addr, length, | ||
549 | attrs); | ||
550 | return err; | ||
551 | } | ||
552 | |||
553 | /* | ||
554 | * At this point we have a Linux scatter-gather list pointing to some | ||
555 | * number of discontiguous chunks of memory. Iterate over that list and | ||
556 | * generate a GMMU map call for each chunk. There are two possibilities: | ||
557 | * either the IOMMU is enabled or not. When the IOMMU is enabled the | ||
558 | * mapping is simple since the "physical" address is actually a virtual | ||
559 | * IO address and will be contiguous. The no-IOMMU case is more | ||
560 | * complicated. We will have to iterate over the SGT and do a separate | ||
561 | * map for each chunk of the SGT. | ||
562 | */ | ||
563 | sgl = sgt->sgl; | ||
564 | |||
565 | if (!g->mm.bypass_smmu) { | ||
566 | u64 io_addr = g->ops.mm.get_iova_addr(g, sgl, 0); | ||
567 | |||
568 | io_addr += space_to_skip; | ||
569 | |||
570 | err = __set_pd_level(vm, &vm->pdb, | ||
571 | 0, | ||
572 | io_addr, | ||
573 | virt_addr, | ||
574 | length, | ||
575 | attrs); | ||
576 | |||
577 | return err; | ||
578 | } | ||
579 | |||
580 | /* | ||
581 | * Finally: last possible case: do the no-IOMMU mapping. In this case we | ||
582 | * really are mapping physical pages directly. | ||
583 | */ | ||
584 | while (sgl) { | ||
585 | u64 phys_addr; | ||
586 | u64 chunk_length; | ||
587 | |||
588 | /* | ||
589 | * Cut out sgl ents for space_to_skip. | ||
590 | */ | ||
591 | if (space_to_skip && space_to_skip >= sgl->length) { | ||
592 | space_to_skip -= sgl->length; | ||
593 | sgl = sg_next(sgl); | ||
594 | continue; | ||
595 | } | ||
596 | |||
597 | phys_addr = sg_phys(sgl) + space_to_skip; | ||
598 | chunk_length = min(length, sgl->length - space_to_skip); | ||
599 | |||
600 | err = __set_pd_level(vm, &vm->pdb, | ||
601 | 0, | ||
602 | phys_addr, | ||
603 | virt_addr, | ||
604 | chunk_length, | ||
605 | attrs); | ||
606 | if (err) | ||
607 | return err; | ||
608 | |||
609 | space_to_skip = 0; | ||
610 | virt_addr += chunk_length; | ||
611 | length -= chunk_length; | ||
612 | sgl = sg_next(sgl); | ||
613 | |||
614 | if (length == 0) | ||
615 | break; | ||
616 | } | ||
325 | 617 | ||
326 | return 0; | 618 | return 0; |
327 | } | 619 | } |
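
The pd_entries(), pd_size() and pd_index() helpers introduced above boil page-directory geometry down to the hi_bit/lo_bit pair carried by each gk20a_mmu_level. The standalone sketch below shows that bit arithmetic in isolation; it is illustrative only and not part of this change, and the level parameters used (VA bits [37:28], 8-byte entries) are invented for the example.

#include <stdio.h>
#include <stdint.h>

struct example_level {
	unsigned int hi_bit;       /* highest VA bit indexing this level */
	unsigned int lo_bit;       /* lowest VA bit indexing this level */
	unsigned int entry_size;   /* bytes per PDE/PTE entry */
};

/* 2 raised to the number of VA bits consumed by this level. */
static unsigned int example_pd_entries(const struct example_level *l)
{
	return 1u << (l->hi_bit - l->lo_bit + 1u);
}

/* Mask off bits above hi_bit, then shift off bits below lo_bit. */
static unsigned int example_pd_index(const struct example_level *l, uint64_t virt)
{
	uint64_t pd_mask = (1ULL << (l->hi_bit + 1u)) - 1ULL;

	return (unsigned int)((virt & pd_mask) >> l->lo_bit);
}

int main(void)
{
	/* Hypothetical level: VA bits [37:28] select the entry. */
	struct example_level l = { .hi_bit = 37, .lo_bit = 28, .entry_size = 8 };
	uint64_t va = 0x12345678000ULL;

	printf("entries=%u table_bytes=%u index of 0x%llx = %u\n",
	       example_pd_entries(&l),
	       example_pd_entries(&l) * l.entry_size,
	       (unsigned long long)va,
	       example_pd_index(&l, va));
	return 0;
}
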
@@ -332,8 +624,8 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm, | |||
332 | * physical* address. | 624 | * physical* address. |
333 | * | 625 | * |
334 | * The update of each level of the page tables is farmed out to chip specific | 626 | * The update of each level of the page tables is farmed out to chip specific |
335 | * implementations. But the logic around that is generic to all chips. Every chip | 627 | * implementations. But the logic around that is generic to all chips. Every |
336 | * has some number of PDE levels and then a PTE level. | 628 | * chip has some number of PDE levels and then a PTE level. |
337 | * | 629 | * |
338 | * Each chunk of the incoming SGT is sent to the chip specific implementation | 630 | * Each chunk of the incoming SGT is sent to the chip specific implementation |
339 | * of page table update. | 631 | * of page table update. |
@@ -341,148 +633,81 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm, | |||
341 | * [*] Note: the "physical" address may actually be an IO virtual address in the | 633 | * [*] Note: the "physical" address may actually be an IO virtual address in the |
342 | * case of SMMU usage. | 634 | * case of SMMU usage. |
343 | */ | 635 | */ |
344 | static int update_gmmu_ptes_locked(struct vm_gk20a *vm, | 636 | static int __nvgpu_gmmu_update_page_table(struct vm_gk20a *vm, |
345 | enum gmmu_pgsz_gk20a pgsz_idx, | 637 | struct sg_table *sgt, |
346 | struct sg_table *sgt, | 638 | u64 space_to_skip, |
347 | u64 buffer_offset, | 639 | u64 virt_addr, |
348 | u64 gpu_va, u64 gpu_end, | 640 | u64 length, |
349 | u8 kind_v, u32 ctag_offset, | 641 | struct nvgpu_gmmu_attrs *attrs) |
350 | bool cacheable, bool unmapped_pte, | ||
351 | int rw_flag, | ||
352 | bool sparse, | ||
353 | bool priv, | ||
354 | enum nvgpu_aperture aperture) | ||
355 | { | 642 | { |
356 | struct gk20a *g = gk20a_from_vm(vm); | 643 | struct gk20a *g = gk20a_from_vm(vm); |
357 | int ctag_granularity = g->ops.fb.compression_page_size(g); | 644 | u32 page_size; |
358 | u64 ctag = (u64)ctag_offset * (u64)ctag_granularity; | ||
359 | u64 iova = 0; | ||
360 | u64 space_to_skip = buffer_offset; | ||
361 | u64 map_size = gpu_end - gpu_va; | ||
362 | u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; | ||
363 | int err; | 645 | int err; |
364 | struct scatterlist *sgl = NULL; | ||
365 | struct nvgpu_page_alloc *alloc = NULL; | ||
366 | struct page_alloc_chunk *chunk = NULL; | ||
367 | u64 length; | ||
368 | 646 | ||
369 | /* note: here we need to map kernel to small, since the | 647 | /* note: here we need to map kernel to small, since the |
370 | * low-level mmu code assumes 0 is small and 1 is big pages */ | 648 | * low-level mmu code assumes 0 is small and 1 is big pages */ |
371 | if (pgsz_idx == gmmu_page_size_kernel) | 649 | if (attrs->pgsz == gmmu_page_size_kernel) |
372 | pgsz_idx = gmmu_page_size_small; | 650 | attrs->pgsz = gmmu_page_size_small; |
651 | |||
652 | page_size = vm->gmmu_page_sizes[attrs->pgsz]; | ||
373 | 653 | ||
374 | if (space_to_skip & (page_size - 1)) | 654 | if (space_to_skip & (page_size - 1)) |
375 | return -EINVAL; | 655 | return -EINVAL; |
376 | 656 | ||
657 | /* | ||
658 | * Update length to be aligned to the passed page size. | ||
659 | */ | ||
660 | length = nvgpu_align_map_length(vm, length, attrs); | ||
661 | |||
377 | err = map_gmmu_pages(g, &vm->pdb); | 662 | err = map_gmmu_pages(g, &vm->pdb); |
378 | if (err) { | 663 | if (err) { |
379 | nvgpu_err(g, | 664 | nvgpu_err(g, "couldn't map ptes for update as=%d", |
380 | "couldn't map ptes for update as=%d", | 665 | vm_aspace_id(vm)); |
381 | vm_aspace_id(vm)); | ||
382 | return err; | 666 | return err; |
383 | } | 667 | } |
384 | 668 | ||
385 | if (aperture == APERTURE_VIDMEM) { | 669 | __gmmu_dbg(g, attrs, |
386 | gmmu_dbg_v(g, "vidmem map size_idx=%d, gpu_va=[%llx,%llx]", | 670 | "vm=%s " |
387 | pgsz_idx, gpu_va, gpu_end-1); | 671 | "%-5s GPU virt %#-12llx +%#-9llx phys %#-12llx " |
388 | 672 | "phys offset: %#-4llx; pgsz: %3dkb perm=%-2s | " | |
389 | if (sgt) { | 673 | "kind=%#02x APT=%-6s %c%c%c", |
390 | alloc = get_vidmem_page_alloc(sgt->sgl); | 674 | vm->name, |
391 | 675 | sgt ? "MAP" : "UNMAP", | |
392 | nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, | 676 | virt_addr, |
393 | page_alloc_chunk, list_entry) { | 677 | length, |
394 | if (space_to_skip && | 678 | sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL, |
395 | space_to_skip > chunk->length) { | 679 | space_to_skip, |
396 | space_to_skip -= chunk->length; | 680 | page_size >> 10, |
397 | } else { | 681 | nvgpu_gmmu_perm_str(attrs->rw_flag), |
398 | iova = chunk->base + space_to_skip; | 682 | attrs->kind_v, |
399 | length = chunk->length - space_to_skip; | 683 | nvgpu_aperture_str(attrs->aperture), |
400 | length = min(length, map_size); | 684 | attrs->cacheable ? 'C' : 'V', /* C = cached, V = volatile. */ |
401 | space_to_skip = 0; | 685 | attrs->sparse ? 'S' : '-', |
402 | 686 | attrs->priv ? 'P' : '-'); | |
403 | err = update_gmmu_level_locked(vm, | 687 | |
404 | &vm->pdb, pgsz_idx, | 688 | /* |
405 | &sgl, | 689 | * Handle VIDMEM progamming. Currently uses a different scatter list |
406 | &space_to_skip, | 690 | * format. |
407 | &iova, | 691 | */ |
408 | gpu_va, gpu_va + length, | 692 | if (attrs->aperture == APERTURE_VIDMEM) |
409 | kind_v, &ctag, | 693 | err = __nvgpu_gmmu_update_page_table_vidmem(vm, |
410 | cacheable, unmapped_pte, | 694 | sgt, |
411 | rw_flag, sparse, 0, priv, | 695 | space_to_skip, |
412 | aperture); | 696 | virt_addr, |
413 | if (err) | 697 | length, |
414 | break; | 698 | attrs); |
415 | 699 | else | |
416 | /* need to set explicit zero here */ | 700 | err = __nvgpu_gmmu_update_page_table_sysmem(vm, |
417 | space_to_skip = 0; | 701 | sgt, |
418 | gpu_va += length; | 702 | space_to_skip, |
419 | map_size -= length; | 703 | virt_addr, |
420 | 704 | length, | |
421 | if (!map_size) | 705 | attrs); |
422 | break; | ||
423 | } | ||
424 | } | ||
425 | } else { | ||
426 | err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, | ||
427 | &sgl, | ||
428 | &space_to_skip, | ||
429 | &iova, | ||
430 | gpu_va, gpu_end, | ||
431 | kind_v, &ctag, | ||
432 | cacheable, unmapped_pte, rw_flag, | ||
433 | sparse, 0, priv, | ||
434 | aperture); | ||
435 | } | ||
436 | } else { | ||
437 | gmmu_dbg_v(g, | ||
438 | "pgsz=%-6d, gpu_va: %#-12llx +%#-6llx phys: %#-12llx " | ||
439 | "buffer offset: %-4lld, nents: %d", | ||
440 | page_size, | ||
441 | gpu_va, gpu_end - gpu_va, | ||
442 | sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL, | ||
443 | buffer_offset, | ||
444 | sgt ? sgt->nents : 0); | ||
445 | |||
446 | if (sgt) { | ||
447 | iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0); | ||
448 | if (!vm->mm->bypass_smmu && iova) { | ||
449 | iova += space_to_skip; | ||
450 | } else { | ||
451 | sgl = sgt->sgl; | ||
452 | |||
453 | gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", | ||
454 | (u64)sg_phys(sgl), | ||
455 | sgl->length); | ||
456 | |||
457 | while (space_to_skip && sgl && | ||
458 | space_to_skip + page_size > sgl->length) { | ||
459 | space_to_skip -= sgl->length; | ||
460 | sgl = sg_next(sgl); | ||
461 | gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", | ||
462 | (u64)sg_phys(sgl), | ||
463 | sgl->length); | ||
464 | } | ||
465 | |||
466 | iova = sg_phys(sgl) + space_to_skip; | ||
467 | } | ||
468 | } | ||
469 | |||
470 | err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, | ||
471 | &sgl, | ||
472 | &space_to_skip, | ||
473 | &iova, | ||
474 | gpu_va, gpu_end, | ||
475 | kind_v, &ctag, | ||
476 | cacheable, unmapped_pte, rw_flag, | ||
477 | sparse, 0, priv, | ||
478 | aperture); | ||
479 | } | ||
480 | 706 | ||
481 | unmap_gmmu_pages(g, &vm->pdb); | 707 | unmap_gmmu_pages(g, &vm->pdb); |
482 | |||
483 | mb(); | 708 | mb(); |
484 | 709 | ||
485 | gk20a_dbg_fn("done"); | 710 | __gmmu_dbg(g, attrs, "%-5s Done!", sgt ? "MAP" : "UNMAP"); |
486 | 711 | ||
487 | return err; | 712 | return err; |
488 | } | 713 | } |
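
Both __nvgpu_gmmu_update_page_table_sysmem() and the VIDMEM variant above walk a list of physical chunks, consuming space_to_skip from the front of the list, clamping the final chunk to the remaining map length, and handing each resulting (phys, virt, length) triple to __set_pd_level(). A reduced sketch of that walk over a plain array follows; all names and numbers are invented for illustration and are not part of this commit.

#include <stdio.h>
#include <stdint.h>

struct phys_chunk {
	uint64_t base;
	uint64_t length;
};

static void walk_chunks(const struct phys_chunk *chunks, int nr,
			uint64_t space_to_skip, uint64_t virt_addr,
			uint64_t length)
{
	int i;

	for (i = 0; i < nr && length; i++) {
		uint64_t phys, chunk_length;

		/* Skip whole chunks that fall before the mapping offset. */
		if (space_to_skip >= chunks[i].length) {
			space_to_skip -= chunks[i].length;
			continue;
		}

		phys = chunks[i].base + space_to_skip;
		chunk_length = chunks[i].length - space_to_skip;
		if (chunk_length > length)
			chunk_length = length;

		/* The real code calls __set_pd_level() here for each chunk. */
		printf("map virt 0x%llx -> phys 0x%llx (+0x%llx)\n",
		       (unsigned long long)virt_addr,
		       (unsigned long long)phys,
		       (unsigned long long)chunk_length);

		space_to_skip = 0;	/* Only the first chunk is offset. */
		virt_addr += chunk_length;
		length -= chunk_length;
	}
}

int main(void)
{
	struct phys_chunk chunks[] = {
		{ 0x80000000ULL, 0x10000ULL },
		{ 0x90000000ULL, 0x20000ULL },
	};

	walk_chunks(chunks, 2, 0x4000, 0x100000, 0x18000);
	return 0;
}
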
@@ -500,32 +725,44 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm, | |||
500 | * have the update_gmmu_lock acquired. | 725 | * have the update_gmmu_lock acquired. |
501 | */ | 726 | */ |
502 | u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | 727 | u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, |
503 | u64 map_offset, | 728 | u64 vaddr, |
504 | struct sg_table *sgt, | 729 | struct sg_table *sgt, |
505 | u64 buffer_offset, | 730 | u64 buffer_offset, |
506 | u64 size, | 731 | u64 size, |
507 | int pgsz_idx, | 732 | int pgsz_idx, |
508 | u8 kind_v, | 733 | u8 kind_v, |
509 | u32 ctag_offset, | 734 | u32 ctag_offset, |
510 | u32 flags, | 735 | u32 flags, |
511 | int rw_flag, | 736 | int rw_flag, |
512 | bool clear_ctags, | 737 | bool clear_ctags, |
513 | bool sparse, | 738 | bool sparse, |
514 | bool priv, | 739 | bool priv, |
515 | struct vm_gk20a_mapping_batch *batch, | 740 | struct vm_gk20a_mapping_batch *batch, |
516 | enum nvgpu_aperture aperture) | 741 | enum nvgpu_aperture aperture) |
517 | { | 742 | { |
743 | struct gk20a *g = gk20a_from_vm(vm); | ||
518 | int err = 0; | 744 | int err = 0; |
519 | bool allocated = false; | 745 | bool allocated = false; |
520 | struct gk20a *g = gk20a_from_vm(vm); | ||
521 | int ctag_granularity = g->ops.fb.compression_page_size(g); | 746 | int ctag_granularity = g->ops.fb.compression_page_size(g); |
522 | u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity); | 747 | struct nvgpu_gmmu_attrs attrs = { |
523 | 748 | .pgsz = pgsz_idx, | |
524 | /* Allocate (or validate when map_offset != 0) the virtual address. */ | 749 | .kind_v = kind_v, |
525 | if (!map_offset) { | 750 | .ctag = (u64)ctag_offset * (u64)ctag_granularity, |
526 | map_offset = __nvgpu_vm_alloc_va(vm, size, | 751 | .cacheable = flags & NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, |
527 | pgsz_idx); | 752 | .rw_flag = rw_flag, |
528 | if (!map_offset) { | 753 | .sparse = sparse, |
754 | .priv = priv, | ||
755 | .valid = !(flags & NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE), | ||
756 | .aperture = aperture | ||
757 | }; | ||
758 | |||
759 | /* | ||
760 | * Only allocate a new GPU VA range if we haven't already been passed a | ||
761 | * GPU VA range. This facilitates fixed mappings. | ||
762 | */ | ||
763 | if (!vaddr) { | ||
764 | vaddr = __nvgpu_vm_alloc_va(vm, size, pgsz_idx); | ||
765 | if (!vaddr) { | ||
529 | nvgpu_err(g, "failed to allocate va space"); | 766 | nvgpu_err(g, "failed to allocate va space"); |
530 | err = -ENOMEM; | 767 | err = -ENOMEM; |
531 | goto fail_alloc; | 768 | goto fail_alloc; |
@@ -533,34 +770,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | |||
533 | allocated = true; | 770 | allocated = true; |
534 | } | 771 | } |
535 | 772 | ||
536 | gmmu_dbg(g, | 773 | err = __nvgpu_gmmu_update_page_table(vm, sgt, buffer_offset, |
537 | "gv: 0x%04x_%08x + 0x%-7llx " | 774 | vaddr, size, &attrs); |
538 | "[dma: 0x%02x_%08x, pa: 0x%02x_%08x] " | ||
539 | "pgsz=%-3dKb as=%-2d ctags=%d start=%d " | ||
540 | "kind=0x%x flags=0x%x apt=%s", | ||
541 | u64_hi32(map_offset), u64_lo32(map_offset), size, | ||
542 | sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0, | ||
543 | sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0, | ||
544 | sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0, | ||
545 | sgt ? u64_lo32((u64)sg_phys(sgt->sgl)) : 0, | ||
546 | vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm), | ||
547 | ctag_lines, ctag_offset, | ||
548 | kind_v, flags, nvgpu_aperture_str(aperture)); | ||
549 | |||
550 | err = update_gmmu_ptes_locked(vm, pgsz_idx, | ||
551 | sgt, | ||
552 | buffer_offset, | ||
553 | map_offset, map_offset + size, | ||
554 | kind_v, | ||
555 | ctag_offset, | ||
556 | flags & | ||
557 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, | ||
558 | flags & | ||
559 | NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE, | ||
560 | rw_flag, | ||
561 | sparse, | ||
562 | priv, | ||
563 | aperture); | ||
564 | if (err) { | 775 | if (err) { |
565 | nvgpu_err(g, "failed to update ptes on map"); | 776 | nvgpu_err(g, "failed to update ptes on map"); |
566 | goto fail_validate; | 777 | goto fail_validate; |
@@ -571,26 +782,37 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | |||
571 | else | 782 | else |
572 | batch->need_tlb_invalidate = true; | 783 | batch->need_tlb_invalidate = true; |
573 | 784 | ||
574 | return map_offset; | 785 | return vaddr; |
575 | fail_validate: | 786 | fail_validate: |
576 | if (allocated) | 787 | if (allocated) |
577 | __nvgpu_vm_free_va(vm, map_offset, pgsz_idx); | 788 | __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); |
578 | fail_alloc: | 789 | fail_alloc: |
579 | nvgpu_err(g, "%s: failed with err=%d", __func__, err); | 790 | nvgpu_err(g, "%s: failed with err=%d", __func__, err); |
580 | return 0; | 791 | return 0; |
581 | } | 792 | } |
582 | 793 | ||
583 | void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, | 794 | void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, |
584 | u64 vaddr, | 795 | u64 vaddr, |
585 | u64 size, | 796 | u64 size, |
586 | int pgsz_idx, | 797 | int pgsz_idx, |
587 | bool va_allocated, | 798 | bool va_allocated, |
588 | int rw_flag, | 799 | int rw_flag, |
589 | bool sparse, | 800 | bool sparse, |
590 | struct vm_gk20a_mapping_batch *batch) | 801 | struct vm_gk20a_mapping_batch *batch) |
591 | { | 802 | { |
592 | int err = 0; | 803 | int err = 0; |
593 | struct gk20a *g = gk20a_from_vm(vm); | 804 | struct gk20a *g = gk20a_from_vm(vm); |
805 | struct nvgpu_gmmu_attrs attrs = { | ||
806 | .pgsz = pgsz_idx, | ||
807 | .kind_v = 0, | ||
808 | .ctag = 0, | ||
809 | .cacheable = 0, | ||
810 | .rw_flag = rw_flag, | ||
811 | .sparse = sparse, | ||
812 | .priv = 0, | ||
813 | .valid = 0, | ||
814 | .aperture = APERTURE_INVALID, | ||
815 | }; | ||
594 | 816 | ||
595 | if (va_allocated) { | 817 | if (va_allocated) { |
596 | err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); | 818 | err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); |
@@ -601,27 +823,11 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, | |||
601 | } | 823 | } |
602 | 824 | ||
603 | /* unmap here needs to know the page size we assigned at mapping */ | 825 | /* unmap here needs to know the page size we assigned at mapping */ |
604 | err = update_gmmu_ptes_locked(vm, | 826 | err = __nvgpu_gmmu_update_page_table(vm, NULL, 0, |
605 | pgsz_idx, | 827 | vaddr, size, &attrs); |
606 | NULL, /* n/a for unmap */ | ||
607 | 0, | ||
608 | vaddr, | ||
609 | vaddr + size, | ||
610 | 0, 0, false /* n/a for unmap */, | ||
611 | false, rw_flag, | ||
612 | sparse, 0, | ||
613 | APERTURE_INVALID); /* don't care for unmap */ | ||
614 | if (err) | 828 | if (err) |
615 | nvgpu_err(g, "failed to update gmmu ptes on unmap"); | 829 | nvgpu_err(g, "failed to update gmmu ptes on unmap"); |
616 | 830 | ||
617 | /* flush l2 so any dirty lines are written out *now*. | ||
618 | * also as we could potentially be switching this buffer | ||
619 | * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at | ||
620 | * some point in the future we need to invalidate l2. e.g. switching | ||
621 | * from a render buffer unmap (here) to later using the same memory | ||
622 | * for gmmu ptes. note the positioning of this relative to any smmu | ||
623 | * unmapping (below). */ | ||
624 | |||
625 | if (!batch) { | 831 | if (!batch) { |
626 | gk20a_mm_l2_flush(g, true); | 832 | gk20a_mm_l2_flush(g, true); |
627 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); | 833 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); |
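
The __set_pd_level() recursion in gmmu.c above can be pictured as a walk that, at every level, carves the virtual range into pieces no larger than that level's PDE coverage and recurses until the PTE level is reached. The toy below sketches just that chunking; the three coverage sizes are made up and do not reflect any real GMMU layout, and an unmap is modelled by keeping phys at zero.

#include <stdio.h>
#include <stdint.h>

static const uint64_t pde_range[] = {
	1ULL << 30,	/* level 0: hypothetical 1 GB per PDE */
	1ULL << 21,	/* level 1: hypothetical 2 MB per PDE */
	1ULL << 12,	/* level 2: PTEs, hypothetical 4 KB pages */
};

static void set_level(int lvl, uint64_t phys, uint64_t virt, uint64_t length)
{
	while (length) {
		/* Truncate to the next PDE boundary at this level. */
		uint64_t chunk = pde_range[lvl] -
				 (virt & (pde_range[lvl] - 1));

		if (chunk > length)
			chunk = length;

		printf("L%d virt 0x%llx +0x%llx\n", lvl,
		       (unsigned long long)virt, (unsigned long long)chunk);

		if (lvl < 2)	/* more PDE levels below this one */
			set_level(lvl + 1, phys, virt, chunk);

		virt += chunk;
		if (phys)	/* phys == 0 means "unmap": keep it zero */
			phys += chunk;
		length -= chunk;
	}
}

int main(void)
{
	set_level(0, 0x80000000ULL, 0x40001000ULL, 0x300000ULL);
	return 0;
}
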
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c
index 88622eca..3aeba500 100644
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -36,7 +36,7 @@ int vm_aspace_id(struct vm_gk20a *vm) | |||
36 | } | 36 | } |
37 | 37 | ||
38 | static void nvgpu_vm_free_entries(struct vm_gk20a *vm, | 38 | static void nvgpu_vm_free_entries(struct vm_gk20a *vm, |
39 | struct gk20a_mm_entry *parent, | 39 | struct nvgpu_gmmu_pd *parent, |
40 | int level) | 40 | int level) |
41 | { | 41 | { |
42 | int i; | 42 | int i; |
@@ -75,8 +75,6 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, | |||
75 | 75 | ||
76 | /* Be certain we round up to page_size if needed */ | 76 | /* Be certain we round up to page_size if needed */ |
77 | size = (size + ((u64)page_size - 1)) & ~((u64)page_size - 1); | 77 | size = (size + ((u64)page_size - 1)) & ~((u64)page_size - 1); |
78 | nvgpu_log(g, gpu_dbg_map, "size=0x%llx @ pgsz=%dKB", size, | ||
79 | vm->gmmu_page_sizes[pgsz_idx] >> 10); | ||
80 | 78 | ||
81 | addr = nvgpu_alloc(vma, size); | 79 | addr = nvgpu_alloc(vma, size); |
82 | if (!addr) { | 80 | if (!addr) { |
@@ -84,17 +82,14 @@ u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, | |||
84 | return 0; | 82 | return 0; |
85 | } | 83 | } |
86 | 84 | ||
87 | nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr); | ||
88 | return addr; | 85 | return addr; |
89 | } | 86 | } |
90 | 87 | ||
91 | int __nvgpu_vm_free_va(struct vm_gk20a *vm, u64 addr, | 88 | int __nvgpu_vm_free_va(struct vm_gk20a *vm, u64 addr, |
92 | enum gmmu_pgsz_gk20a pgsz_idx) | 89 | enum gmmu_pgsz_gk20a pgsz_idx) |
93 | { | 90 | { |
94 | struct gk20a *g = vm->mm->g; | ||
95 | struct nvgpu_allocator *vma = vm->vma[pgsz_idx]; | 91 | struct nvgpu_allocator *vma = vm->vma[pgsz_idx]; |
96 | 92 | ||
97 | nvgpu_log(g, gpu_dbg_map, "(%s) addr: 0x%llx", vma->name, addr); | ||
98 | nvgpu_free(vma, addr); | 93 | nvgpu_free(vma, addr); |
99 | 94 | ||
100 | return 0; | 95 | return 0; |
@@ -127,32 +122,6 @@ void nvgpu_vm_mapping_batch_finish(struct vm_gk20a *vm, | |||
127 | nvgpu_mutex_release(&vm->update_gmmu_lock); | 122 | nvgpu_mutex_release(&vm->update_gmmu_lock); |
128 | } | 123 | } |
129 | 124 | ||
130 | static int nvgpu_vm_init_page_tables(struct vm_gk20a *vm) | ||
131 | { | ||
132 | u32 pde_lo, pde_hi; | ||
133 | int err; | ||
134 | |||
135 | pde_range_from_vaddr_range(vm, | ||
136 | 0, vm->va_limit-1, | ||
137 | &pde_lo, &pde_hi); | ||
138 | vm->pdb.entries = nvgpu_vzalloc(vm->mm->g, | ||
139 | sizeof(struct gk20a_mm_entry) * | ||
140 | (pde_hi + 1)); | ||
141 | vm->pdb.num_entries = pde_hi + 1; | ||
142 | |||
143 | if (!vm->pdb.entries) | ||
144 | return -ENOMEM; | ||
145 | |||
146 | err = nvgpu_zalloc_gmmu_page_table(vm, 0, &vm->mmu_levels[0], | ||
147 | &vm->pdb, NULL); | ||
148 | if (err) { | ||
149 | nvgpu_vfree(vm->mm->g, vm->pdb.entries); | ||
150 | return err; | ||
151 | } | ||
152 | |||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | /* | 125 | /* |
157 | * Determine if the passed address space can support big pages or not. | 126 | * Determine if the passed address space can support big pages or not. |
158 | */ | 127 | */ |
@@ -280,7 +249,8 @@ static int __nvgpu_vm_init(struct mm_gk20a *mm, | |||
280 | #endif | 249 | #endif |
281 | 250 | ||
282 | /* Initialize the page table data structures. */ | 251 | /* Initialize the page table data structures. */ |
283 | err = nvgpu_vm_init_page_tables(vm); | 252 | strncpy(vm->name, name, min(strlen(name), sizeof(vm->name))); |
253 | err = nvgpu_gmmu_init_page_table(vm); | ||
284 | if (err) | 254 | if (err) |
285 | goto clean_up_vgpu_vm; | 255 | goto clean_up_vgpu_vm; |
286 | 256 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
index 3c76e817..c5f9c1fd 100644
--- a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
@@ -67,7 +67,7 @@ void gk20a_fb_tlb_invalidate(struct gk20a *g, struct nvgpu_mem *pdb) | |||
67 | if (!g->power_on) | 67 | if (!g->power_on) |
68 | return; | 68 | return; |
69 | 69 | ||
70 | addr_lo = u64_lo32(gk20a_mem_get_base_addr(g, pdb, 0) >> 12); | 70 | addr_lo = u64_lo32(nvgpu_mem_get_base_addr(g, pdb, 0) >> 12); |
71 | 71 | ||
72 | nvgpu_mutex_acquire(&g->mm.tlb_lock); | 72 | nvgpu_mutex_acquire(&g->mm.tlb_lock); |
73 | 73 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index b7b68575..558a1b06 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -777,31 +777,6 @@ int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm) | |||
777 | return vm->mmu_levels[0].lo_bit[0]; | 777 | return vm->mmu_levels[0].lo_bit[0]; |
778 | } | 778 | } |
779 | 779 | ||
780 | /* given address range (inclusive) determine the pdes crossed */ | ||
781 | void pde_range_from_vaddr_range(struct vm_gk20a *vm, | ||
782 | u64 addr_lo, u64 addr_hi, | ||
783 | u32 *pde_lo, u32 *pde_hi) | ||
784 | { | ||
785 | int pde_shift = gk20a_mm_pde_coverage_bit_count(vm); | ||
786 | |||
787 | *pde_lo = (u32)(addr_lo >> pde_shift); | ||
788 | *pde_hi = (u32)(addr_hi >> pde_shift); | ||
789 | gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d", | ||
790 | addr_lo, addr_hi, pde_shift); | ||
791 | gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d", | ||
792 | *pde_lo, *pde_hi); | ||
793 | } | ||
794 | |||
795 | static u32 pde_from_index(u32 i) | ||
796 | { | ||
797 | return i * gmmu_pde__size_v() / sizeof(u32); | ||
798 | } | ||
799 | |||
800 | static u32 pte_from_index(u32 i) | ||
801 | { | ||
802 | return i * gmmu_pte__size_v() / sizeof(u32); | ||
803 | } | ||
804 | |||
805 | int nvgpu_vm_get_buffers(struct vm_gk20a *vm, | 780 | int nvgpu_vm_get_buffers(struct vm_gk20a *vm, |
806 | struct nvgpu_mapped_buf ***mapped_buffers, | 781 | struct nvgpu_mapped_buf ***mapped_buffers, |
807 | int *num_buffers) | 782 | int *num_buffers) |
@@ -1478,7 +1453,7 @@ static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct nvgpu_mem *mem) | |||
1478 | * If mem is in VIDMEM, return base address in vidmem | 1453 | * If mem is in VIDMEM, return base address in vidmem |
1479 | * else return IOVA address for SYSMEM | 1454 | * else return IOVA address for SYSMEM |
1480 | */ | 1455 | */ |
1481 | u64 gk20a_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, | 1456 | u64 nvgpu_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, |
1482 | u32 flags) | 1457 | u32 flags) |
1483 | { | 1458 | { |
1484 | struct nvgpu_page_alloc *alloc; | 1459 | struct nvgpu_page_alloc *alloc; |
@@ -1580,203 +1555,168 @@ u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, | |||
1580 | return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); | 1555 | return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); |
1581 | } | 1556 | } |
1582 | 1557 | ||
1583 | void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry, | ||
1584 | size_t w, size_t data) | ||
1585 | { | ||
1586 | nvgpu_mem_wr32(g, &entry->mem, entry->woffset + w, data); | ||
1587 | } | ||
1588 | |||
1589 | u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry) | ||
1590 | { | ||
1591 | u64 base; | ||
1592 | |||
1593 | if (g->mm.has_physical_mode) | ||
1594 | base = sg_phys(entry->mem.priv.sgt->sgl); | ||
1595 | else | ||
1596 | base = gk20a_mem_get_base_addr(g, &entry->mem, 0); | ||
1597 | |||
1598 | return base + entry->woffset * sizeof(u32); | ||
1599 | } | ||
1600 | |||
1601 | /* for gk20a the "video memory" apertures here are misnomers. */ | 1558 | /* for gk20a the "video memory" apertures here are misnomers. */ |
1602 | static inline u32 big_valid_pde0_bits(struct gk20a *g, | 1559 | static inline u32 big_valid_pde0_bits(struct gk20a *g, |
1603 | struct gk20a_mm_entry *entry) | 1560 | struct nvgpu_gmmu_pd *pd, u64 addr) |
1604 | { | 1561 | { |
1605 | u64 pte_addr = gk20a_pde_addr(g, entry); | ||
1606 | u32 pde0_bits = | 1562 | u32 pde0_bits = |
1607 | nvgpu_aperture_mask(g, &entry->mem, | 1563 | nvgpu_aperture_mask(g, &pd->mem, |
1608 | gmmu_pde_aperture_big_sys_mem_ncoh_f(), | 1564 | gmmu_pde_aperture_big_sys_mem_ncoh_f(), |
1609 | gmmu_pde_aperture_big_video_memory_f()) | | 1565 | gmmu_pde_aperture_big_video_memory_f()) | |
1610 | gmmu_pde_address_big_sys_f( | 1566 | gmmu_pde_address_big_sys_f( |
1611 | (u32)(pte_addr >> gmmu_pde_address_shift_v())); | 1567 | (u32)(addr >> gmmu_pde_address_shift_v())); |
1612 | 1568 | ||
1613 | return pde0_bits; | 1569 | return pde0_bits; |
1614 | } | 1570 | } |
1615 | 1571 | ||
1616 | static inline u32 small_valid_pde1_bits(struct gk20a *g, | 1572 | static inline u32 small_valid_pde1_bits(struct gk20a *g, |
1617 | struct gk20a_mm_entry *entry) | 1573 | struct nvgpu_gmmu_pd *pd, u64 addr) |
1618 | { | 1574 | { |
1619 | u64 pte_addr = gk20a_pde_addr(g, entry); | ||
1620 | u32 pde1_bits = | 1575 | u32 pde1_bits = |
1621 | nvgpu_aperture_mask(g, &entry->mem, | 1576 | nvgpu_aperture_mask(g, &pd->mem, |
1622 | gmmu_pde_aperture_small_sys_mem_ncoh_f(), | 1577 | gmmu_pde_aperture_small_sys_mem_ncoh_f(), |
1623 | gmmu_pde_aperture_small_video_memory_f()) | | 1578 | gmmu_pde_aperture_small_video_memory_f()) | |
1624 | gmmu_pde_vol_small_true_f() | /* tbd: why? */ | 1579 | gmmu_pde_vol_small_true_f() | /* tbd: why? */ |
1625 | gmmu_pde_address_small_sys_f( | 1580 | gmmu_pde_address_small_sys_f( |
1626 | (u32)(pte_addr >> gmmu_pde_address_shift_v())); | 1581 | (u32)(addr >> gmmu_pde_address_shift_v())); |
1627 | 1582 | ||
1628 | return pde1_bits; | 1583 | return pde1_bits; |
1629 | } | 1584 | } |
1630 | 1585 | ||
1631 | /* Given the current state of the ptes associated with a pde, | 1586 | static void update_gmmu_pde_locked(struct vm_gk20a *vm, |
1632 | determine value and write it out. There's no checking | 1587 | const struct gk20a_mmu_level *l, |
1633 | here to determine whether or not a change was actually | 1588 | struct nvgpu_gmmu_pd *pd, |
1634 | made. So, superfluous updates will cause unnecessary | 1589 | u32 pd_idx, |
1635 | pde invalidations. | 1590 | u64 virt_addr, |
1636 | */ | 1591 | u64 phys_addr, |
1637 | static int update_gmmu_pde_locked(struct vm_gk20a *vm, | 1592 | struct nvgpu_gmmu_attrs *attrs) |
1638 | struct gk20a_mm_entry *pte, | ||
1639 | u32 i, u32 gmmu_pgsz_idx, | ||
1640 | struct scatterlist **sgl, | ||
1641 | u64 *offset, | ||
1642 | u64 *iova, | ||
1643 | u32 kind_v, u64 *ctag, | ||
1644 | bool cacheable, bool unammped_pte, | ||
1645 | int rw_flag, bool sparse, bool priv, | ||
1646 | enum nvgpu_aperture aperture) | ||
1647 | { | 1593 | { |
1648 | struct gk20a *g = gk20a_from_vm(vm); | 1594 | struct gk20a *g = gk20a_from_vm(vm); |
1649 | bool small_valid, big_valid; | 1595 | bool small_valid, big_valid; |
1650 | struct gk20a_mm_entry *entry = vm->pdb.entries + i; | 1596 | u32 pd_offset = pd_offset_from_index(l, pd_idx); |
1651 | u32 pde_v[2] = {0, 0}; | 1597 | u32 pde_v[2] = {0, 0}; |
1652 | u32 pde; | ||
1653 | 1598 | ||
1654 | gk20a_dbg_fn(""); | 1599 | small_valid = attrs->pgsz == gmmu_page_size_small; |
1655 | 1600 | big_valid = attrs->pgsz == gmmu_page_size_big; | |
1656 | small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small; | ||
1657 | big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big; | ||
1658 | 1601 | ||
1659 | pde_v[0] = gmmu_pde_size_full_f(); | 1602 | pde_v[0] = gmmu_pde_size_full_f(); |
1660 | pde_v[0] |= big_valid ? | 1603 | pde_v[0] |= big_valid ? |
1661 | big_valid_pde0_bits(g, entry) : | 1604 | big_valid_pde0_bits(g, pd, phys_addr) : |
1662 | gmmu_pde_aperture_big_invalid_f(); | 1605 | gmmu_pde_aperture_big_invalid_f(); |
1663 | 1606 | ||
1664 | pde_v[1] |= (small_valid ? | 1607 | pde_v[1] |= (small_valid ? small_valid_pde1_bits(g, pd, phys_addr) : |
1665 | small_valid_pde1_bits(g, entry) : | ||
1666 | (gmmu_pde_aperture_small_invalid_f() | | 1608 | (gmmu_pde_aperture_small_invalid_f() | |
1667 | gmmu_pde_vol_small_false_f())) | 1609 | gmmu_pde_vol_small_false_f())) |
1668 | | | 1610 | | |
1669 | (big_valid ? (gmmu_pde_vol_big_true_f()) : | 1611 | (big_valid ? (gmmu_pde_vol_big_true_f()) : |
1670 | gmmu_pde_vol_big_false_f()); | 1612 | gmmu_pde_vol_big_false_f()); |
1671 | 1613 | ||
1672 | pde = pde_from_index(i); | 1614 | pte_dbg(g, attrs, |
1615 | "PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | " | ||
1616 | "GPU %#-12llx phys %#-12llx " | ||
1617 | "[0x%08x, 0x%08x]", | ||
1618 | pd_idx, l->entry_size, pd_offset, | ||
1619 | small_valid ? 'S' : '-', | ||
1620 | big_valid ? 'B' : '-', | ||
1621 | virt_addr, phys_addr, | ||
1622 | pde_v[1], pde_v[0]); | ||
1673 | 1623 | ||
1674 | gk20a_pde_wr32(g, &vm->pdb, pde + 0, pde_v[0]); | 1624 | pd_write(g, &vm->pdb, pd_offset + 0, pde_v[0]); |
1675 | gk20a_pde_wr32(g, &vm->pdb, pde + 1, pde_v[1]); | 1625 | pd_write(g, &vm->pdb, pd_offset + 1, pde_v[1]); |
1626 | } | ||
1676 | 1627 | ||
1677 | gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", | 1628 | static void __update_pte_sparse(u32 *pte_w) |
1678 | i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); | 1629 | { |
1679 | return 0; | 1630 | pte_w[0] = gmmu_pte_valid_false_f(); |
1631 | pte_w[1] |= gmmu_pte_vol_true_f(); | ||
1680 | } | 1632 | } |
1681 | 1633 | ||
1682 | static int update_gmmu_pte_locked(struct vm_gk20a *vm, | 1634 | static void __update_pte(struct vm_gk20a *vm, |
1683 | struct gk20a_mm_entry *pte, | 1635 | u32 *pte_w, |
1684 | u32 i, u32 gmmu_pgsz_idx, | 1636 | u64 phys_addr, |
1685 | struct scatterlist **sgl, | 1637 | struct nvgpu_gmmu_attrs *attrs) |
1686 | u64 *offset, | ||
1687 | u64 *iova, | ||
1688 | u32 kind_v, u64 *ctag, | ||
1689 | bool cacheable, bool unmapped_pte, | ||
1690 | int rw_flag, bool sparse, bool priv, | ||
1691 | enum nvgpu_aperture aperture) | ||
1692 | { | 1638 | { |
1693 | struct gk20a *g = gk20a_from_vm(vm); | 1639 | struct gk20a *g = gk20a_from_vm(vm); |
1640 | u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; | ||
1641 | u32 pte_valid = attrs->valid ? | ||
1642 | gmmu_pte_valid_true_f() : | ||
1643 | gmmu_pte_valid_false_f(); | ||
1644 | u32 phys_shifted = phys_addr >> gmmu_pte_address_shift_v(); | ||
1645 | u32 addr = attrs->aperture == APERTURE_SYSMEM ? | ||
1646 | gmmu_pte_address_sys_f(phys_shifted) : | ||
1647 | gmmu_pte_address_vid_f(phys_shifted); | ||
1694 | int ctag_shift = ilog2(g->ops.fb.compression_page_size(g)); | 1648 | int ctag_shift = ilog2(g->ops.fb.compression_page_size(g)); |
1695 | u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx]; | ||
1696 | u32 pte_w[2] = {0, 0}; /* invalid pte */ | ||
1697 | |||
1698 | if (*iova) { | ||
1699 | u32 pte_valid = unmapped_pte ? | ||
1700 | gmmu_pte_valid_false_f() : | ||
1701 | gmmu_pte_valid_true_f(); | ||
1702 | u32 iova_v = *iova >> gmmu_pte_address_shift_v(); | ||
1703 | u32 pte_addr = aperture == APERTURE_SYSMEM ? | ||
1704 | gmmu_pte_address_sys_f(iova_v) : | ||
1705 | gmmu_pte_address_vid_f(iova_v); | ||
1706 | |||
1707 | pte_w[0] = pte_valid | pte_addr; | ||
1708 | |||
1709 | if (priv) | ||
1710 | pte_w[0] |= gmmu_pte_privilege_true_f(); | ||
1711 | |||
1712 | pte_w[1] = __nvgpu_aperture_mask(g, aperture, | ||
1713 | gmmu_pte_aperture_sys_mem_ncoh_f(), | ||
1714 | gmmu_pte_aperture_video_memory_f()) | | ||
1715 | gmmu_pte_kind_f(kind_v) | | ||
1716 | gmmu_pte_comptagline_f((u32)(*ctag >> ctag_shift)); | ||
1717 | |||
1718 | if (*ctag && vm->mm->use_full_comp_tag_line && *iova & 0x10000) | ||
1719 | pte_w[1] |= gmmu_pte_comptagline_f( | ||
1720 | 1 << (gmmu_pte_comptagline_s() - 1)); | ||
1721 | |||
1722 | if (rw_flag == gk20a_mem_flag_read_only) { | ||
1723 | pte_w[0] |= gmmu_pte_read_only_true_f(); | ||
1724 | pte_w[1] |= | ||
1725 | gmmu_pte_write_disable_true_f(); | ||
1726 | } else if (rw_flag == | ||
1727 | gk20a_mem_flag_write_only) { | ||
1728 | pte_w[1] |= | ||
1729 | gmmu_pte_read_disable_true_f(); | ||
1730 | } | ||
1731 | if (!unmapped_pte) { | ||
1732 | if (!cacheable) | ||
1733 | pte_w[1] |= | ||
1734 | gmmu_pte_vol_true_f(); | ||
1735 | } else { | ||
1736 | /* Store cacheable value behind | ||
1737 | * gmmu_pte_write_disable_true_f */ | ||
1738 | if (!cacheable) | ||
1739 | pte_w[1] |= | ||
1740 | gmmu_pte_write_disable_true_f(); | ||
1741 | } | ||
1742 | 1649 | ||
1743 | gk20a_dbg(gpu_dbg_pte, | 1650 | pte_w[0] = pte_valid | addr; |
1744 | "pte=%d iova=0x%llx kind=%d ctag=%d vol=%d [0x%08x, 0x%08x]", | ||
1745 | i, *iova, | ||
1746 | kind_v, (u32)(*ctag >> ctag_shift), !cacheable, | ||
1747 | pte_w[1], pte_w[0]); | ||
1748 | 1651 | ||
1749 | if (*ctag) | 1652 | if (attrs->priv) |
1750 | *ctag += page_size; | 1653 | pte_w[0] |= gmmu_pte_privilege_true_f(); |
1751 | } else if (sparse) { | ||
1752 | pte_w[0] = gmmu_pte_valid_false_f(); | ||
1753 | pte_w[1] |= gmmu_pte_vol_true_f(); | ||
1754 | } else { | ||
1755 | gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i); | ||
1756 | } | ||
1757 | 1654 | ||
1758 | gk20a_pde_wr32(g, pte, pte_from_index(i) + 0, pte_w[0]); | 1655 | pte_w[1] = __nvgpu_aperture_mask(g, attrs->aperture, |
1759 | gk20a_pde_wr32(g, pte, pte_from_index(i) + 1, pte_w[1]); | 1656 | gmmu_pte_aperture_sys_mem_ncoh_f(), |
1760 | 1657 | gmmu_pte_aperture_video_memory_f()) | | |
1761 | if (*iova) { | 1658 | gmmu_pte_kind_f(attrs->kind_v) | |
1762 | *iova += page_size; | 1659 | gmmu_pte_comptagline_f((u32)(attrs->ctag >> ctag_shift)); |
1763 | *offset += page_size; | 1660 | |
1764 | if (*sgl && *offset + page_size > (*sgl)->length) { | 1661 | if (attrs->ctag && vm->mm->use_full_comp_tag_line && |
1765 | u64 new_iova; | 1662 | phys_addr & 0x10000) |
1766 | *sgl = sg_next(*sgl); | 1663 | pte_w[1] |= gmmu_pte_comptagline_f( |
1767 | if (*sgl) { | 1664 | 1 << (gmmu_pte_comptagline_s() - 1)); |
1768 | new_iova = sg_phys(*sgl); | 1665 | |
1769 | gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", | 1666 | if (attrs->rw_flag == gk20a_mem_flag_read_only) { |
1770 | new_iova, (*sgl)->length); | 1667 | pte_w[0] |= gmmu_pte_read_only_true_f(); |
1771 | if (new_iova) { | 1668 | pte_w[1] |= gmmu_pte_write_disable_true_f(); |
1772 | *offset = 0; | 1669 | } else if (attrs->rw_flag == gk20a_mem_flag_write_only) { |
1773 | *iova = new_iova; | 1670 | pte_w[1] |= gmmu_pte_read_disable_true_f(); |
1774 | } | ||
1775 | } | ||
1776 | } | ||
1777 | } | 1671 | } |
1778 | 1672 | ||
1779 | return 0; | 1673 | if (!attrs->cacheable) |
1674 | pte_w[1] |= gmmu_pte_vol_true_f(); | ||
1675 | |||
1676 | if (attrs->ctag) | ||
1677 | attrs->ctag += page_size; | ||
1678 | } | ||
1679 | |||
1680 | static void update_gmmu_pte_locked(struct vm_gk20a *vm, | ||
1681 | const struct gk20a_mmu_level *l, | ||
1682 | struct nvgpu_gmmu_pd *pd, | ||
1683 | u32 pd_idx, | ||
1684 | u64 virt_addr, | ||
1685 | u64 phys_addr, | ||
1686 | struct nvgpu_gmmu_attrs *attrs) | ||
1687 | { | ||
1688 | struct gk20a *g = gk20a_from_vm(vm); | ||
1689 | u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; | ||
1690 | u32 pd_offset = pd_offset_from_index(l, pd_idx); | ||
1691 | u32 pte_w[2] = {0, 0}; | ||
1692 | int ctag_shift = ilog2(g->ops.fb.compression_page_size(g)); | ||
1693 | |||
1694 | if (phys_addr) | ||
1695 | __update_pte(vm, pte_w, phys_addr, attrs); | ||
1696 | else if (attrs->sparse) | ||
1697 | __update_pte_sparse(pte_w); | ||
1698 | |||
1699 | pte_dbg(g, attrs, | ||
1700 | "PTE: i=%-4u size=%-2u offs=%-4u | " | ||
1701 | "GPU %#-12llx phys %#-12llx " | ||
1702 | "pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c " | ||
1703 | "ctag=0x%08x " | ||
1704 | "[0x%08x, 0x%08x]", | ||
1705 | pd_idx, l->entry_size, pd_offset, | ||
1706 | virt_addr, phys_addr, | ||
1707 | page_size >> 10, | ||
1708 | nvgpu_gmmu_perm_str(attrs->rw_flag), | ||
1709 | attrs->kind_v, | ||
1710 | nvgpu_aperture_str(attrs->aperture), | ||
1711 | attrs->valid ? 'V' : '-', | ||
1712 | attrs->cacheable ? 'C' : '-', | ||
1713 | attrs->sparse ? 'S' : '-', | ||
1714 | attrs->priv ? 'P' : '-', | ||
1715 | (u32)attrs->ctag >> ctag_shift, | ||
1716 | pte_w[1], pte_w[0]); | ||
1717 | |||
1718 | pd_write(g, pd, pd_offset + 0, pte_w[0]); | ||
1719 | pd_write(g, pd, pd_offset + 1, pte_w[1]); | ||
1780 | } | 1720 | } |
1781 | 1721 | ||
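A note on the comptag arithmetic used by __update_pte() above: attrs->ctag is a byte offset that advances by the mapping page size after every PTE, and the comptag line written into the PTE is that offset divided by the compression page size. A minimal sketch, assuming an example 128kB compression page size (the real value comes from g->ops.fb.compression_page_size(g)):

	u64 comp_page_size = 128 << 10;            /* assumed example value */
	int ctag_shift = ilog2(comp_page_size);    /* 17 for this example */
	u32 comptagline = (u32)(attrs->ctag >> ctag_shift);
	/* after the PTE words are written, attrs->ctag += page_size, so all
	 * small pages within one compression page share a comptag line */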
1782 | /* NOTE! mapped_buffers lock must be held */ | 1722 | /* NOTE! mapped_buffers lock must be held */ |
@@ -1809,13 +1749,6 @@ void nvgpu_vm_unmap_locked(struct nvgpu_mapped_buf *mapped_buffer, | |||
1809 | mapped_buffer->vm_area->sparse : false, | 1749 | mapped_buffer->vm_area->sparse : false, |
1810 | batch); | 1750 | batch); |
1811 | 1751 | ||
1812 | gk20a_dbg(gpu_dbg_map, | ||
1813 | "gv: 0x%04x_%08x pgsz=%-3dKb as=%-2d own_mem_ref=%d", | ||
1814 | u64_hi32(mapped_buffer->addr), u64_lo32(mapped_buffer->addr), | ||
1815 | vm->gmmu_page_sizes[mapped_buffer->pgsz_idx] >> 10, | ||
1816 | vm_aspace_id(vm), | ||
1817 | mapped_buffer->own_mem_ref); | ||
1818 | |||
1819 | gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf, | 1752 | gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf, |
1820 | mapped_buffer->sgt); | 1753 | mapped_buffer->sgt); |
1821 | 1754 | ||
@@ -1942,6 +1875,9 @@ int __gk20a_vm_bind_channel(struct vm_gk20a *vm, struct channel_gk20a *ch) | |||
1942 | if (err) | 1875 | if (err) |
1943 | ch->vm = NULL; | 1876 | ch->vm = NULL; |
1944 | 1877 | ||
1878 | nvgpu_log(gk20a_from_vm(vm), gpu_dbg_map, "Binding ch=%d -> VM:%s", | ||
1879 | ch->chid, vm->name); | ||
1880 | |||
1945 | return err; | 1881 | return err; |
1946 | } | 1882 | } |
1947 | 1883 | ||
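The new bind-time log depends on the vm->name field added to struct vm_gk20a in the vm.h hunk further down. How the name gets populated is outside this excerpt; a hedged sketch of one plausible scheme, reusing the vm_aspace_id() helper visible elsewhere in this file:

	/* hypothetical: tag the VM with its address space id at init time */
	snprintf(vm->name, sizeof(vm->name), "as-%d", vm_aspace_id(vm));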
@@ -2114,7 +2050,7 @@ u64 gk20a_mm_inst_block_addr(struct gk20a *g, struct nvgpu_mem *inst_block) | |||
2114 | if (g->mm.has_physical_mode) | 2050 | if (g->mm.has_physical_mode) |
2115 | addr = gk20a_mem_phys(inst_block); | 2051 | addr = gk20a_mem_phys(inst_block); |
2116 | else | 2052 | else |
2117 | addr = gk20a_mem_get_base_addr(g, inst_block, 0); | 2053 | addr = nvgpu_mem_get_base_addr(g, inst_block, 0); |
2118 | 2054 | ||
2119 | return addr; | 2055 | return addr; |
2120 | } | 2056 | } |
@@ -2237,7 +2173,7 @@ static int gk20a_init_ce_vm(struct mm_gk20a *mm) | |||
2237 | void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, | 2173 | void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, |
2238 | struct vm_gk20a *vm) | 2174 | struct vm_gk20a *vm) |
2239 | { | 2175 | { |
2240 | u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0); | 2176 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); |
2241 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); | 2177 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); |
2242 | u32 pdb_addr_hi = u64_hi32(pdb_addr); | 2178 | u32 pdb_addr_hi = u64_hi32(pdb_addr); |
2243 | 2179 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index cf37640d..a245d0e0 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h | |||
@@ -42,12 +42,6 @@ | |||
42 | outer_flush_range(pa, pa + (size_t)(size)); \ | 42 | outer_flush_range(pa, pa + (size_t)(size)); \ |
43 | } while (0) | 43 | } while (0) |
44 | 44 | ||
45 | enum gk20a_mem_rw_flag { | ||
46 | gk20a_mem_flag_none = 0, | ||
47 | gk20a_mem_flag_read_only = 1, | ||
48 | gk20a_mem_flag_write_only = 2, | ||
49 | }; | ||
50 | |||
51 | struct gpfifo_desc { | 45 | struct gpfifo_desc { |
52 | struct nvgpu_mem mem; | 46 | struct nvgpu_mem mem; |
53 | u32 entry_num; | 47 | u32 entry_num; |
@@ -347,7 +341,7 @@ int gk20a_mm_suspend(struct gk20a *g); | |||
347 | u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, | 341 | u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, |
348 | u32 flags); | 342 | u32 flags); |
349 | u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova); | 343 | u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova); |
350 | u64 gk20a_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, | 344 | u64 nvgpu_mem_get_base_addr(struct gk20a *g, struct nvgpu_mem *mem, |
351 | u32 flags); | 345 | u32 flags); |
352 | 346 | ||
353 | void gk20a_mm_ltc_isr(struct gk20a *g); | 347 | void gk20a_mm_ltc_isr(struct gk20a *g); |
@@ -371,10 +365,6 @@ static inline phys_addr_t gk20a_mem_phys(struct nvgpu_mem *mem) | |||
371 | return 0; | 365 | return 0; |
372 | } | 366 | } |
373 | 367 | ||
374 | void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry, | ||
375 | size_t w, size_t data); | ||
376 | u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry); | ||
377 | |||
378 | u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | 368 | u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, |
379 | u64 map_offset, | 369 | u64 map_offset, |
380 | struct sg_table *sgt, | 370 | struct sg_table *sgt, |
@@ -451,8 +441,4 @@ int gk20a_mm_get_buffer_info(struct device *dev, int dmabuf_fd, | |||
451 | u64 *buffer_id, u64 *buffer_len); | 441 | u64 *buffer_id, u64 *buffer_len); |
452 | void gk20a_vm_unmap_locked_kref(struct kref *ref); | 442 | void gk20a_vm_unmap_locked_kref(struct kref *ref); |
453 | 443 | ||
454 | void gk20a_vm_free_entries(struct vm_gk20a *vm, | ||
455 | struct gk20a_mm_entry *parent, | ||
456 | int level); | ||
457 | |||
458 | #endif /* MM_GK20A_H */ | 444 | #endif /* MM_GK20A_H */ |
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c index d7391c6d..c3867e9d 100644 --- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c | |||
@@ -14,6 +14,7 @@ | |||
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <nvgpu/dma.h> | 16 | #include <nvgpu/dma.h> |
17 | #include <nvgpu/gmmu.h> | ||
17 | 18 | ||
18 | #include "gk20a/gk20a.h" | 19 | #include "gk20a/gk20a.h" |
19 | #include "gk20a/platform_gk20a.h" | 20 | #include "gk20a/platform_gk20a.h" |
@@ -149,206 +150,186 @@ static u64 gp10b_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl, | |||
149 | return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); | 150 | return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); |
150 | } | 151 | } |
151 | 152 | ||
152 | static u32 pde3_from_index(u32 i) | 153 | static void update_gmmu_pde3_locked(struct vm_gk20a *vm, |
153 | { | 154 | const struct gk20a_mmu_level *l, |
154 | return i * gmmu_new_pde__size_v() / sizeof(u32); | 155 | struct nvgpu_gmmu_pd *pd, |
155 | } | 156 | u32 pd_idx, |
156 | 157 | u64 virt_addr, | |
157 | static u32 pte3_from_index(u32 i) | 158 | u64 phys_addr, |
158 | { | 159 | struct nvgpu_gmmu_attrs *attrs) |
159 | return i * gmmu_new_pte__size_v() / sizeof(u32); | ||
160 | } | ||
161 | |||
162 | static int update_gmmu_pde3_locked(struct vm_gk20a *vm, | ||
163 | struct gk20a_mm_entry *parent, | ||
164 | u32 i, u32 gmmu_pgsz_idx, | ||
165 | struct scatterlist **sgl, | ||
166 | u64 *offset, | ||
167 | u64 *iova, | ||
168 | u32 kind_v, u64 *ctag, | ||
169 | bool cacheable, bool unmapped_pte, | ||
170 | int rw_flag, bool sparse, bool priv, | ||
171 | enum nvgpu_aperture aperture) | ||
172 | { | 160 | { |
173 | struct gk20a *g = gk20a_from_vm(vm); | 161 | struct gk20a *g = gk20a_from_vm(vm); |
174 | u64 pte_addr = 0; | 162 | u32 pd_offset = pd_offset_from_index(l, pd_idx); |
175 | struct gk20a_mm_entry *pte = parent->entries + i; | ||
176 | u32 pde_v[2] = {0, 0}; | 163 | u32 pde_v[2] = {0, 0}; |
177 | u32 pde; | ||
178 | |||
179 | gk20a_dbg_fn(""); | ||
180 | 164 | ||
181 | pte_addr = gk20a_pde_addr(g, pte) >> gmmu_new_pde_address_shift_v(); | 165 | phys_addr >>= gmmu_new_pde_address_shift_v(); |
182 | 166 | ||
183 | pde_v[0] |= nvgpu_aperture_mask(g, &pte->mem, | 167 | pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, |
184 | gmmu_new_pde_aperture_sys_mem_ncoh_f(), | 168 | gmmu_new_pde_aperture_sys_mem_ncoh_f(), |
185 | gmmu_new_pde_aperture_video_memory_f()); | 169 | gmmu_new_pde_aperture_video_memory_f()); |
186 | pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(pte_addr)); | 170 | pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr)); |
187 | pde_v[0] |= gmmu_new_pde_vol_true_f(); | 171 | pde_v[0] |= gmmu_new_pde_vol_true_f(); |
188 | pde_v[1] |= pte_addr >> 24; | 172 | pde_v[1] |= phys_addr >> 24; |
189 | pde = pde3_from_index(i); | 173 | |
190 | 174 | pd_write(g, pd, pd_offset + 0, pde_v[0]); | |
191 | gk20a_pde_wr32(g, parent, pde + 0, pde_v[0]); | 175 | pd_write(g, pd, pd_offset + 1, pde_v[1]); |
192 | gk20a_pde_wr32(g, parent, pde + 1, pde_v[1]); | 176 | |
193 | 177 | pte_dbg(g, attrs, | |
194 | gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", | 178 | "PDE: i=%-4u size=%-2u offs=%-4u pgsz: -- | " |
195 | i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); | 179 | "GPU %#-12llx phys %#-12llx " |
196 | gk20a_dbg_fn("done"); | 180 | "[0x%08x, 0x%08x]", |
197 | return 0; | 181 | pd_idx, l->entry_size, pd_offset, |
182 | virt_addr, phys_addr, | ||
183 | pde_v[1], pde_v[0]); | ||
198 | } | 184 | } |
199 | 185 | ||
200 | static u32 pde0_from_index(u32 i) | 186 | static void update_gmmu_pde0_locked(struct vm_gk20a *vm, |
201 | { | 187 | const struct gk20a_mmu_level *l, |
202 | return i * gmmu_new_dual_pde__size_v() / sizeof(u32); | 188 | struct nvgpu_gmmu_pd *pd, |
203 | } | 189 | u32 pd_idx, |
204 | 190 | u64 virt_addr, | |
205 | static int update_gmmu_pde0_locked(struct vm_gk20a *vm, | 191 | u64 phys_addr, |
206 | struct gk20a_mm_entry *pte, | 192 | struct nvgpu_gmmu_attrs *attrs) |
207 | u32 i, u32 gmmu_pgsz_idx, | ||
208 | struct scatterlist **sgl, | ||
209 | u64 *offset, | ||
210 | u64 *iova, | ||
211 | u32 kind_v, u64 *ctag, | ||
212 | bool cacheable, bool unmapped_pte, | ||
213 | int rw_flag, bool sparse, bool priv, | ||
214 | enum nvgpu_aperture aperture) | ||
215 | { | 193 | { |
216 | struct gk20a *g = gk20a_from_vm(vm); | 194 | struct gk20a *g = gk20a_from_vm(vm); |
217 | bool small_valid, big_valid; | 195 | bool small_valid, big_valid; |
218 | u32 pte_addr_small = 0, pte_addr_big = 0; | 196 | u32 small_addr = 0, big_addr = 0; |
219 | struct gk20a_mm_entry *entry = pte->entries + i; | 197 | u32 pd_offset = pd_offset_from_index(l, pd_idx); |
220 | u32 pde_v[4] = {0, 0, 0, 0}; | 198 | u32 pde_v[4] = {0, 0, 0, 0}; |
221 | u32 pde; | ||
222 | |||
223 | gk20a_dbg_fn(""); | ||
224 | 199 | ||
225 | small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small; | 200 | small_valid = attrs->pgsz == gmmu_page_size_small; |
226 | big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big; | 201 | big_valid = attrs->pgsz == gmmu_page_size_big; |
227 | 202 | ||
228 | if (small_valid) { | 203 | if (small_valid) |
229 | pte_addr_small = gk20a_pde_addr(g, entry) | 204 | small_addr = phys_addr >> gmmu_new_dual_pde_address_shift_v(); |
230 | >> gmmu_new_dual_pde_address_shift_v(); | ||
231 | } | ||
232 | 205 | ||
233 | if (big_valid) | 206 | if (big_valid) |
234 | pte_addr_big = gk20a_pde_addr(g, entry) | 207 | big_addr = phys_addr >> gmmu_new_dual_pde_address_big_shift_v(); |
235 | >> gmmu_new_dual_pde_address_big_shift_v(); | ||
236 | 208 | ||
237 | if (small_valid) { | 209 | if (small_valid) { |
238 | pde_v[2] |= gmmu_new_dual_pde_address_small_sys_f(pte_addr_small); | 210 | pde_v[2] |= |
239 | pde_v[2] |= nvgpu_aperture_mask(g, &entry->mem, | 211 | gmmu_new_dual_pde_address_small_sys_f(small_addr); |
212 | pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem, | ||
240 | gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), | 213 | gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), |
241 | gmmu_new_dual_pde_aperture_small_video_memory_f()); | 214 | gmmu_new_dual_pde_aperture_small_video_memory_f()); |
242 | pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); | 215 | pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); |
243 | pde_v[3] |= pte_addr_small >> 24; | 216 | pde_v[3] |= small_addr >> 24; |
244 | } | 217 | } |
245 | 218 | ||
246 | if (big_valid) { | 219 | if (big_valid) { |
247 | pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(pte_addr_big); | 220 | pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr); |
248 | pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); | 221 | pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); |
249 | pde_v[0] |= nvgpu_aperture_mask(g, &entry->mem, | 222 | pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, |
250 | gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), | 223 | gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), |
251 | gmmu_new_dual_pde_aperture_big_video_memory_f()); | 224 | gmmu_new_dual_pde_aperture_big_video_memory_f()); |
252 | pde_v[1] |= pte_addr_big >> 28; | 225 | pde_v[1] |= big_addr >> 28; |
253 | } | 226 | } |
254 | 227 | ||
255 | pde = pde0_from_index(i); | 228 | pd_write(g, pd, pd_offset + 0, pde_v[0]); |
256 | 229 | pd_write(g, pd, pd_offset + 1, pde_v[1]); | |
257 | gk20a_pde_wr32(g, pte, pde + 0, pde_v[0]); | 230 | pd_write(g, pd, pd_offset + 2, pde_v[2]); |
258 | gk20a_pde_wr32(g, pte, pde + 1, pde_v[1]); | 231 | pd_write(g, pd, pd_offset + 3, pde_v[3]); |
259 | gk20a_pde_wr32(g, pte, pde + 2, pde_v[2]); | 232 | |
260 | gk20a_pde_wr32(g, pte, pde + 3, pde_v[3]); | 233 | pte_dbg(g, attrs, |
261 | 234 | "PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | " | |
262 | gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d [0x%08x, 0x%08x, 0x%x, 0x%08x]", | 235 | "GPU %#-12llx phys %#-12llx " |
263 | i, gmmu_pgsz_idx, pde_v[3], pde_v[2], pde_v[1], pde_v[0]); | 236 | "[0x%08x, 0x%08x, 0x%08x, 0x%08x]", |
264 | gk20a_dbg_fn("done"); | 237 | pd_idx, l->entry_size, pd_offset, |
265 | return 0; | 238 | small_valid ? 'S' : '-', |
239 | big_valid ? 'B' : '-', | ||
240 | virt_addr, phys_addr, | ||
241 | pde_v[3], pde_v[2], pde_v[1], pde_v[0]); | ||
266 | } | 242 | } |
267 | 243 | ||
268 | static int update_gmmu_pte_locked(struct vm_gk20a *vm, | 244 | static void __update_pte(struct vm_gk20a *vm, |
269 | struct gk20a_mm_entry *pte, | 245 | u32 *pte_w, |
270 | u32 i, u32 gmmu_pgsz_idx, | 246 | u64 phys_addr, |
271 | struct scatterlist **sgl, | 247 | struct nvgpu_gmmu_attrs *attrs) |
272 | u64 *offset, | ||
273 | u64 *iova, | ||
274 | u32 kind_v, u64 *ctag, | ||
275 | bool cacheable, bool unmapped_pte, | ||
276 | int rw_flag, bool sparse, bool priv, | ||
277 | enum nvgpu_aperture aperture) | ||
278 | { | 248 | { |
279 | struct gk20a *g = vm->mm->g; | 249 | struct gk20a *g = gk20a_from_vm(vm); |
280 | u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx]; | ||
281 | u64 ctag_granularity = g->ops.fb.compression_page_size(g); | 250 | u64 ctag_granularity = g->ops.fb.compression_page_size(g); |
282 | u32 pte_w[2] = {0, 0}; /* invalid pte */ | 251 | u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; |
283 | u32 pte_i; | 252 | u32 pte_valid = attrs->valid ? |
284 | 253 | gmmu_new_pte_valid_true_f() : | |
285 | if (*iova) { | 254 | gmmu_new_pte_valid_false_f(); |
286 | u32 pte_valid = unmapped_pte ? | 255 | u32 phys_shifted = phys_addr >> gmmu_new_pte_address_shift_v(); |
287 | gmmu_new_pte_valid_false_f() : | 256 | u32 pte_addr = attrs->aperture == APERTURE_SYSMEM ? |
288 | gmmu_new_pte_valid_true_f(); | 257 | gmmu_new_pte_address_sys_f(phys_shifted) : |
289 | u32 iova_v = *iova >> gmmu_new_pte_address_shift_v(); | 258 | gmmu_new_pte_address_vid_f(phys_shifted); |
290 | u32 pte_addr = aperture == APERTURE_SYSMEM ? | 259 | u32 pte_tgt = __nvgpu_aperture_mask(g, attrs->aperture, |
291 | gmmu_new_pte_address_sys_f(iova_v) : | 260 | gmmu_new_pte_aperture_sys_mem_ncoh_f(), |
292 | gmmu_new_pte_address_vid_f(iova_v); | 261 | gmmu_new_pte_aperture_video_memory_f()); |
293 | u32 pte_tgt = __nvgpu_aperture_mask(g, aperture, | 262 | |
294 | gmmu_new_pte_aperture_sys_mem_ncoh_f(), | 263 | pte_w[0] = pte_valid | pte_addr | pte_tgt; |
295 | gmmu_new_pte_aperture_video_memory_f()); | 264 | |
296 | 265 | if (attrs->priv) | |
297 | pte_w[0] = pte_valid | pte_addr | pte_tgt; | 266 | pte_w[0] |= gmmu_new_pte_privilege_true_f(); |
298 | 267 | ||
299 | if (priv) | 268 | pte_w[1] = phys_addr >> (24 + gmmu_new_pte_address_shift_v()) | |
300 | pte_w[0] |= gmmu_new_pte_privilege_true_f(); | 269 | gmmu_new_pte_kind_f(attrs->kind_v) | |
301 | 270 | gmmu_new_pte_comptagline_f((u32)(attrs->ctag / | |
302 | pte_w[1] = *iova >> (24 + gmmu_new_pte_address_shift_v()) | | 271 | ctag_granularity)); |
303 | gmmu_new_pte_kind_f(kind_v) | | 272 | |
304 | gmmu_new_pte_comptagline_f((u32)(*ctag / ctag_granularity)); | 273 | if (attrs->rw_flag == gk20a_mem_flag_read_only) |
305 | 274 | pte_w[0] |= gmmu_new_pte_read_only_true_f(); | |
306 | if (rw_flag == gk20a_mem_flag_read_only) | 275 | |
307 | pte_w[0] |= gmmu_new_pte_read_only_true_f(); | 276 | if (!attrs->valid && !attrs->cacheable) |
308 | if (unmapped_pte && !cacheable) | 277 | pte_w[0] |= gmmu_new_pte_read_only_true_f(); |
309 | pte_w[0] |= gmmu_new_pte_read_only_true_f(); | 278 | else if (!attrs->cacheable) |
310 | else if (!cacheable) | ||
311 | pte_w[0] |= gmmu_new_pte_vol_true_f(); | ||
312 | |||
313 | gk20a_dbg(gpu_dbg_pte, "pte=%d iova=0x%llx kind=%d" | ||
314 | " ctag=%d vol=%d" | ||
315 | " [0x%08x, 0x%08x]", | ||
316 | i, *iova, | ||
317 | kind_v, (u32)(*ctag / ctag_granularity), !cacheable, | ||
318 | pte_w[1], pte_w[0]); | ||
319 | |||
320 | if (*ctag) | ||
321 | *ctag += page_size; | ||
322 | } else if (sparse) { | ||
323 | pte_w[0] = gmmu_new_pte_valid_false_f(); | ||
324 | pte_w[0] |= gmmu_new_pte_vol_true_f(); | 279 | pte_w[0] |= gmmu_new_pte_vol_true_f(); |
325 | } else { | ||
326 | gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i); | ||
327 | } | ||
328 | 280 | ||
329 | pte_i = pte3_from_index(i); | 281 | if (attrs->ctag) |
330 | 282 | attrs->ctag += page_size; | |
331 | gk20a_pde_wr32(g, pte, pte_i + 0, pte_w[0]); | 283 | |
332 | gk20a_pde_wr32(g, pte, pte_i + 1, pte_w[1]); | 284 | } |
333 | 285 | ||
334 | if (*iova) { | 286 | static void __update_pte_sparse(u32 *pte_w) |
335 | *iova += page_size; | 287 | { |
336 | *offset += page_size; | 288 | pte_w[0] = gmmu_new_pte_valid_false_f(); |
337 | if (*sgl && *offset + page_size > (*sgl)->length) { | 289 | pte_w[0] |= gmmu_new_pte_vol_true_f(); |
338 | u64 new_iova; | 290 | } |
339 | *sgl = sg_next(*sgl); | 291 | |
340 | if (*sgl) { | 292 | static void update_gmmu_pte_locked(struct vm_gk20a *vm, |
341 | new_iova = sg_phys(*sgl); | 293 | const struct gk20a_mmu_level *l, |
342 | gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", | 294 | struct nvgpu_gmmu_pd *pd, |
343 | new_iova, (*sgl)->length); | 295 | u32 pd_idx, |
344 | if (new_iova) { | 296 | u64 virt_addr, |
345 | *offset = 0; | 297 | u64 phys_addr, |
346 | *iova = new_iova; | 298 | struct nvgpu_gmmu_attrs *attrs) |
347 | } | 299 | { |
348 | } | 300 | struct gk20a *g = vm->mm->g; |
349 | } | 301 | u32 page_size = vm->gmmu_page_sizes[attrs->pgsz]; |
350 | } | 302 | u32 pd_offset = pd_offset_from_index(l, pd_idx); |
351 | return 0; | 303 | u32 pte_w[2] = {0, 0}; |
304 | |||
305 | if (phys_addr) | ||
306 | __update_pte(vm, pte_w, phys_addr, attrs); | ||
307 | else if (attrs->sparse) | ||
308 | __update_pte_sparse(pte_w); | ||
309 | |||
310 | pte_dbg(g, attrs, | ||
311 | "vm=%s " | ||
312 | "PTE: i=%-4u size=%-2u offs=%-4u | " | ||
313 | "GPU %#-12llx phys %#-12llx " | ||
314 | "pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c " | ||
315 | "ctag=0x%08x " | ||
316 | "[0x%08x, 0x%08x]", | ||
317 | vm->name, | ||
318 | pd_idx, l->entry_size, pd_offset, | ||
319 | virt_addr, phys_addr, | ||
320 | page_size >> 10, | ||
321 | nvgpu_gmmu_perm_str(attrs->rw_flag), | ||
322 | attrs->kind_v, | ||
323 | nvgpu_aperture_str(attrs->aperture), | ||
324 | attrs->valid ? 'V' : '-', | ||
325 | attrs->cacheable ? 'C' : '-', | ||
326 | attrs->sparse ? 'S' : '-', | ||
327 | attrs->priv ? 'P' : '-', | ||
328 | (u32)attrs->ctag / g->ops.fb.compression_page_size(g), | ||
329 | pte_w[1], pte_w[0]); | ||
330 | |||
331 | pd_write(g, pd, pd_offset + 0, pte_w[0]); | ||
332 | pd_write(g, pd, pd_offset + 1, pte_w[1]); | ||
352 | } | 333 | } |
353 | 334 | ||
354 | static const struct gk20a_mmu_level gp10b_mm_levels[] = { | 335 | static const struct gk20a_mmu_level gp10b_mm_levels[] = { |
@@ -384,7 +365,7 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g, | |||
384 | static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, | 365 | static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, |
385 | struct vm_gk20a *vm) | 366 | struct vm_gk20a *vm) |
386 | { | 367 | { |
387 | u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0); | 368 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); |
388 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); | 369 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); |
389 | u32 pdb_addr_hi = u64_hi32(pdb_addr); | 370 | u32 pdb_addr_hi = u64_hi32(pdb_addr); |
390 | 371 | ||
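The gp10b_mm_levels[] table (its body falls outside this excerpt) is what plugs the callbacks above into the generic page-table walker. The sketch below shows the shape such a table takes with the new update_entry signature; the bit ranges and entry sizes are illustrative assumptions, not values copied from the driver:

	/* Illustrative layout only; hi_bit/lo_bit and entry_size are assumed. */
	static const struct gk20a_mmu_level gp10b_mm_levels[] = {
		{ .hi_bit = {48, 48}, .lo_bit = {47, 47},
		  .update_entry = update_gmmu_pde3_locked, .entry_size = 8 },
		/* ... further PDE levels, including the dual small/big PDE
		 * handled by update_gmmu_pde0_locked, elided ... */
		{ .hi_bit = {20, 20}, .lo_bit = {12, 16},
		  .update_entry = update_gmmu_pte_locked, .entry_size = 8 },
		{ .update_entry = NULL },
	};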
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h index ed152cd8..28a2cb82 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h | |||
@@ -38,36 +38,97 @@ enum gmmu_pgsz_gk20a { | |||
38 | gmmu_nr_page_sizes = 3, | 38 | gmmu_nr_page_sizes = 3, |
39 | }; | 39 | }; |
40 | 40 | ||
41 | struct gk20a_mm_entry { | 41 | enum gk20a_mem_rw_flag { |
42 | /* backing for */ | 42 | gk20a_mem_flag_none = 0, /* RW */ |
43 | struct nvgpu_mem mem; | 43 | gk20a_mem_flag_read_only = 1, /* RO */ |
44 | u32 woffset; /* if >0, mem is a shadow copy, owned by another entry */ | 44 | gk20a_mem_flag_write_only = 2, /* WO */ |
45 | int pgsz; | 45 | }; |
46 | struct gk20a_mm_entry *entries; | 46 | |
47 | int num_entries; | 47 | /* |
48 | * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs | ||
49 | * in the GMMU. | ||
50 | */ | ||
51 | struct nvgpu_gmmu_pd { | ||
52 | /* | ||
53 | * DMA memory describing the PDEs or PTEs. | ||
54 | */ | ||
55 | struct nvgpu_mem mem; | ||
56 | |||
57 | /* | ||
58 | * List of pointers to the next level of page tables. Does not | ||
59 | * need to be populated when this PD is pointing to PTEs. | ||
60 | */ | ||
61 | struct nvgpu_gmmu_pd *entries; | ||
62 | int num_entries; | ||
63 | }; | ||
64 | |||
65 | /* | ||
66 | * Reduce the number of arguments getting passed through the various levels of | ||
67 | * GMMU mapping functions. | ||
68 | * | ||
69 | * The following fields are set statically and do not change throughout | ||
70 | * the mapping call: | ||
71 | * | ||
72 | * pgsz: Index into the page size table. | ||
73 | * kind_v: Kind attributes for mapping. | ||
74 | * cacheable: Cacheability of the mapping. | ||
75 | * rw_flag: Flag from enum gk20a_mem_rw_flag | ||
76 | * sparse: Set if the mapping should be sparse. | ||
77 | * priv: Privileged mapping. | ||
78 | * valid: Set if the PTE should be marked valid. | ||
79 | * aperture: VIDMEM or SYSMEM. | ||
80 | * debug: When set, print debugging info. | ||
81 | * | ||
82 | * These fields are dynamically updated as necessary during the map: | ||
83 | * | ||
84 | * ctag: Comptag line in the comptag cache; | ||
85 | * updated every time we write a PTE. | ||
86 | */ | ||
87 | struct nvgpu_gmmu_attrs { | ||
88 | u32 pgsz; | ||
89 | u32 kind_v; | ||
90 | u64 ctag; | ||
91 | bool cacheable; | ||
92 | int rw_flag; | ||
93 | bool sparse; | ||
94 | bool priv; | ||
95 | bool valid; | ||
96 | enum nvgpu_aperture aperture; | ||
97 | bool debug; | ||
48 | }; | 98 | }; |
49 | 99 | ||
50 | struct gk20a_mmu_level { | 100 | struct gk20a_mmu_level { |
51 | int hi_bit[2]; | 101 | int hi_bit[2]; |
52 | int lo_bit[2]; | 102 | int lo_bit[2]; |
53 | int (*update_entry)(struct vm_gk20a *vm, | 103 | |
54 | struct gk20a_mm_entry *pte, | 104 | /* |
55 | u32 i, u32 gmmu_pgsz_idx, | 105 | * Build map from virt_addr -> phys_addr. |
56 | struct scatterlist **sgl, | 106 | */ |
57 | u64 *offset, | 107 | void (*update_entry)(struct vm_gk20a *vm, |
58 | u64 *iova, | 108 | const struct gk20a_mmu_level *l, |
59 | u32 kind_v, u64 *ctag, | 109 | struct nvgpu_gmmu_pd *pd, |
60 | bool cacheable, bool unmapped_pte, | 110 | u32 pd_idx, |
61 | int rw_flag, bool sparse, bool priv, | 111 | u64 phys_addr, |
62 | enum nvgpu_aperture aperture); | 112 | u64 virt_addr, |
63 | size_t entry_size; | 113 | struct nvgpu_gmmu_attrs *attrs); |
114 | u32 entry_size; | ||
64 | }; | 115 | }; |
65 | 116 | ||
66 | int nvgpu_zalloc_gmmu_page_table(struct vm_gk20a *vm, | 117 | static inline const char *nvgpu_gmmu_perm_str(enum gk20a_mem_rw_flag p) |
67 | enum gmmu_pgsz_gk20a pgsz_idx, | 118 | { |
68 | const struct gk20a_mmu_level *l, | 119 | switch (p) { |
69 | struct gk20a_mm_entry *entry, | 120 | case gk20a_mem_flag_none: |
70 | struct gk20a_mm_entry *prev_entry); | 121 | return "RW"; |
122 | case gk20a_mem_flag_write_only: | ||
123 | return "WO"; | ||
124 | case gk20a_mem_flag_read_only: | ||
125 | return "RO"; | ||
126 | default: | ||
127 | return "??"; | ||
128 | } | ||
129 | } | ||
130 | |||
131 | int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm); | ||
71 | 132 | ||
72 | /** | 133 | /** |
73 | * nvgpu_gmmu_map - Map memory into the GMMU. | 134 | * nvgpu_gmmu_map - Map memory into the GMMU. |
@@ -106,6 +167,33 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, | |||
106 | u64 gpu_va); | 167 | u64 gpu_va); |
107 | 168 | ||
108 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | 169 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, |
109 | struct gk20a_mm_entry *entry); | 170 | struct nvgpu_gmmu_pd *entry); |
171 | |||
172 | /* | ||
173 | * Some useful routines that are shared across chips. | ||
174 | */ | ||
175 | static inline u32 pd_offset_from_index(const struct gk20a_mmu_level *l, | ||
176 | u32 pd_idx) | ||
177 | { | ||
178 | return (pd_idx * l->entry_size) / sizeof(u32); | ||
179 | } | ||
180 | |||
181 | static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd, | ||
182 | size_t w, size_t data) | ||
183 | { | ||
184 | nvgpu_mem_wr32(g, &pd->mem, w, data); | ||
185 | } | ||
186 | |||
187 | |||
188 | /* | ||
189 | * Internal debugging routines. Probably not something you want to use. | ||
190 | */ | ||
191 | #define pte_dbg(g, attrs, fmt, args...) \ | ||
192 | do { \ | ||
193 | if (attrs && attrs->debug) \ | ||
194 | nvgpu_info(g, fmt, ##args); \ | ||
195 | else \ | ||
196 | nvgpu_log(g, gpu_dbg_pte, fmt, ##args); \ | ||
197 | } while (0) | ||
110 | 198 | ||
111 | #endif | 199 | #endif |
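To make the new interface concrete, a minimal usage sketch follows: the common map path fills one nvgpu_gmmu_attrs for the whole operation, and each level's update_entry callback turns it into directory or PTE words through the helpers above. The field values and the word0/word1 variables are illustrative only:

	/* sketch: describe the whole mapping once, up front */
	struct nvgpu_gmmu_attrs attrs = {
		.pgsz      = gmmu_page_size_small,
		.kind_v    = 0,
		.cacheable = true,
		.rw_flag   = gk20a_mem_flag_none,    /* prints as "RW" */
		.aperture  = APERTURE_SYSMEM,
		.valid     = true,
	};

	/* sketch: inside an update_entry callback, one two-word entry */
	u32 off = pd_offset_from_index(l, pd_idx);
	pd_write(g, pd, off + 0, word0);
	pd_write(g, pd, off + 1, word1);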
diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h index 66d04ab8..4259d40f 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_mem.h | |||
@@ -109,9 +109,9 @@ nvgpu_mem_from_clear_list_entry(struct nvgpu_list_node *node) | |||
109 | static inline const char *nvgpu_aperture_str(enum nvgpu_aperture aperture) | 109 | static inline const char *nvgpu_aperture_str(enum nvgpu_aperture aperture) |
110 | { | 110 | { |
111 | switch (aperture) { | 111 | switch (aperture) { |
112 | case APERTURE_INVALID: return "invalid"; | 112 | case APERTURE_INVALID: return "INVAL"; |
113 | case APERTURE_SYSMEM: return "sysmem"; | 113 | case APERTURE_SYSMEM: return "SYSMEM"; |
114 | case APERTURE_VIDMEM: return "vidmem"; | 114 | case APERTURE_VIDMEM: return "VIDMEM"; |
115 | }; | 115 | }; |
116 | return "UNKNOWN"; | 116 | return "UNKNOWN"; |
117 | } | 117 | } |
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vm.h b/drivers/gpu/nvgpu/include/nvgpu/vm.h index f6d88cc3..255b4361 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/vm.h +++ b/drivers/gpu/nvgpu/include/nvgpu/vm.h | |||
@@ -126,6 +126,7 @@ mapped_buffer_from_rbtree_node(struct nvgpu_rbtree_node *node) | |||
126 | struct vm_gk20a { | 126 | struct vm_gk20a { |
127 | struct mm_gk20a *mm; | 127 | struct mm_gk20a *mm; |
128 | struct gk20a_as_share *as_share; /* as_share this represents */ | 128 | struct gk20a_as_share *as_share; /* as_share this represents */ |
129 | char name[20]; | ||
129 | 130 | ||
130 | u64 va_start; | 131 | u64 va_start; |
131 | u64 va_limit; | 132 | u64 va_limit; |
@@ -145,7 +146,7 @@ struct vm_gk20a { | |||
145 | 146 | ||
146 | struct nvgpu_mutex update_gmmu_lock; | 147 | struct nvgpu_mutex update_gmmu_lock; |
147 | 148 | ||
148 | struct gk20a_mm_entry pdb; | 149 | struct nvgpu_gmmu_pd pdb; |
149 | 150 | ||
150 | /* | 151 | /* |
151 | * These structs define the address spaces. In some cases it's possible | 152 | * These structs define the address spaces. In some cases it's possible |