author      Alex Waterman <alexw@nvidia.com>                    2017-05-11 16:59:22 -0400
committer   mobile promotions <svcmobile_promotions@nvidia.com> 2017-07-06 17:44:15 -0400
commit      c1393d5b68e63c992f4c689cb788139fdf8c2f1a (patch)
tree        00a588d35342d75c05fed7733e91da753ba640fb /drivers/gpu/nvgpu/gp10b/mm_gp10b.c
parent      84f712dee8b582dd7d2a19345c621a2ae3bd6292 (diff)
gpu: nvgpu: gmmu programming rewrite
Update the high level mapping logic. Instead of iterating over the GPU VA, iterate over the scatter-gather table chunks. As a result, each GMMU page table update call is simplified dramatically.

This also modifies the chip level code to no longer require an SGL as an argument. Each call to the chip level code will be guaranteed to be contiguous, so it only has to worry about making a mapping from virt -> phys. This removes the dependency on Linux that the chip code currently has. With this patch the core GMMU code still uses the Linux SGL, but the logic is highly transferable to a different, nvgpu specific, scatter-gather list format in the near future.

The last major update is to push most of the page table attribute arguments into a struct. That struct is passed on through the various mapping levels. This makes the function calls simpler and easier to follow.

JIRA NVGPU-30

Change-Id: Ibb6b11755f99818fe642622ca0bd4cbed054f602
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master/r/1484104
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
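The commit message describes the new flow only in prose. The following self-contained C sketch illustrates the two ideas it names: mapping attributes travel in a single struct, and the core loop walks contiguous scatter-gather chunks so the chip level only ever sees a simple virt -> phys pair per PTE. This is an illustrative toy, not nvgpu code: toy_attrs, toy_chunk, program_pte and map_chunks are hypothetical names; the real struct in the patch is struct nvgpu_gmmu_attrs and the real per-level hooks are the update_gmmu_*_locked functions shown in the diff below.

/*
 * Minimal sketch (not nvgpu code) of the structure this patch moves
 * toward: attributes in one struct, iteration over contiguous chunks,
 * and a chip-level hook that only maps virt -> phys.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_attrs {            /* stand-in for struct nvgpu_gmmu_attrs */
	unsigned int pgsz;    /* page size in bytes */
	int cacheable;
	int read_only;
	int sparse;
};

struct toy_chunk {            /* one contiguous scatter-gather entry */
	uint64_t phys;
	uint64_t length;
};

/* Chip-level hook: programs a single PTE for one virt -> phys pair. */
static void program_pte(uint64_t virt, uint64_t phys,
			const struct toy_attrs *attrs)
{
	printf("PTE: GPU 0x%llx -> phys 0x%llx pgsz=%u %s\n",
	       (unsigned long long)virt, (unsigned long long)phys,
	       attrs->pgsz, attrs->read_only ? "RO" : "RW");
}

/* Core loop: iterate over chunks, not over the whole GPU VA range. */
static void map_chunks(uint64_t virt, const struct toy_chunk *chunks,
		       unsigned int nr, const struct toy_attrs *attrs)
{
	unsigned int i;

	for (i = 0; i < nr; i++) {
		uint64_t phys = chunks[i].phys;
		uint64_t done;

		/* Each chunk is contiguous, so the chip code only ever
		 * sees a plain virt -> phys mapping. */
		for (done = 0; done < chunks[i].length; done += attrs->pgsz) {
			program_pte(virt, phys, attrs);
			virt += attrs->pgsz;
			phys += attrs->pgsz;
		}
	}
}

int main(void)
{
	const struct toy_chunk chunks[] = {
		{ 0x100000, 0x8000 },
		{ 0x300000, 0x4000 },
	};
	const struct toy_attrs attrs = { .pgsz = 0x1000, .cacheable = 1 };

	map_chunks(0x40000000, chunks, 2, &attrs);
	return 0;
}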
Diffstat (limited to 'drivers/gpu/nvgpu/gp10b/mm_gp10b.c')
-rw-r--r--  drivers/gpu/nvgpu/gp10b/mm_gp10b.c  309
1 file changed, 145 insertions, 164 deletions
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
index d7391c6d..c3867e9d 100644
--- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
@@ -14,6 +14,7 @@
  */
 
 #include <nvgpu/dma.h>
+#include <nvgpu/gmmu.h>
 
 #include "gk20a/gk20a.h"
 #include "gk20a/platform_gk20a.h"
@@ -149,206 +150,186 @@ static u64 gp10b_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl,
 	return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl));
 }
 
-static u32 pde3_from_index(u32 i)
-{
-	return i * gmmu_new_pde__size_v() / sizeof(u32);
-}
-
-static u32 pte3_from_index(u32 i)
-{
-	return i * gmmu_new_pte__size_v() / sizeof(u32);
-}
-
-static int update_gmmu_pde3_locked(struct vm_gk20a *vm,
-			   struct gk20a_mm_entry *parent,
-			   u32 i, u32 gmmu_pgsz_idx,
-			   struct scatterlist **sgl,
-			   u64 *offset,
-			   u64 *iova,
-			   u32 kind_v, u64 *ctag,
-			   bool cacheable, bool unmapped_pte,
-			   int rw_flag, bool sparse, bool priv,
-			   enum nvgpu_aperture aperture)
-{
-	struct gk20a *g = gk20a_from_vm(vm);
-	u64 pte_addr = 0;
-	struct gk20a_mm_entry *pte = parent->entries + i;
-	u32 pde_v[2] = {0, 0};
-	u32 pde;
-
-	gk20a_dbg_fn("");
-
-	pte_addr = gk20a_pde_addr(g, pte) >> gmmu_new_pde_address_shift_v();
-
-	pde_v[0] |= nvgpu_aperture_mask(g, &pte->mem,
-			gmmu_new_pde_aperture_sys_mem_ncoh_f(),
-			gmmu_new_pde_aperture_video_memory_f());
-	pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(pte_addr));
-	pde_v[0] |= gmmu_new_pde_vol_true_f();
-	pde_v[1] |= pte_addr >> 24;
-	pde = pde3_from_index(i);
-
-	gk20a_pde_wr32(g, parent, pde + 0, pde_v[0]);
-	gk20a_pde_wr32(g, parent, pde + 1, pde_v[1]);
-
-	gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x",
-		  i, gmmu_pgsz_idx, pde_v[1], pde_v[0]);
-	gk20a_dbg_fn("done");
-	return 0;
+static void update_gmmu_pde3_locked(struct vm_gk20a *vm,
+				    const struct gk20a_mmu_level *l,
+				    struct nvgpu_gmmu_pd *pd,
+				    u32 pd_idx,
+				    u64 virt_addr,
+				    u64 phys_addr,
+				    struct nvgpu_gmmu_attrs *attrs)
+{
+	struct gk20a *g = gk20a_from_vm(vm);
+	u32 pd_offset = pd_offset_from_index(l, pd_idx);
+	u32 pde_v[2] = {0, 0};
+
+	phys_addr >>= gmmu_new_pde_address_shift_v();
+
+	pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
+			gmmu_new_pde_aperture_sys_mem_ncoh_f(),
+			gmmu_new_pde_aperture_video_memory_f());
+	pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr));
+	pde_v[0] |= gmmu_new_pde_vol_true_f();
+	pde_v[1] |= phys_addr >> 24;
+
+	pd_write(g, pd, pd_offset + 0, pde_v[0]);
+	pd_write(g, pd, pd_offset + 1, pde_v[1]);
+
+	pte_dbg(g, attrs,
+		"PDE: i=%-4u size=%-2u offs=%-4u pgsz: -- | "
+		"GPU %#-12llx phys %#-12llx "
+		"[0x%08x, 0x%08x]",
+		pd_idx, l->entry_size, pd_offset,
+		virt_addr, phys_addr,
+		pde_v[1], pde_v[0]);
 }
 
-static u32 pde0_from_index(u32 i)
-{
-	return i * gmmu_new_dual_pde__size_v() / sizeof(u32);
-}
-
-static int update_gmmu_pde0_locked(struct vm_gk20a *vm,
-			   struct gk20a_mm_entry *pte,
-			   u32 i, u32 gmmu_pgsz_idx,
-			   struct scatterlist **sgl,
-			   u64 *offset,
-			   u64 *iova,
-			   u32 kind_v, u64 *ctag,
-			   bool cacheable, bool unmapped_pte,
-			   int rw_flag, bool sparse, bool priv,
-			   enum nvgpu_aperture aperture)
-{
-	struct gk20a *g = gk20a_from_vm(vm);
-	bool small_valid, big_valid;
-	u32 pte_addr_small = 0, pte_addr_big = 0;
-	struct gk20a_mm_entry *entry = pte->entries + i;
-	u32 pde_v[4] = {0, 0, 0, 0};
-	u32 pde;
-
-	gk20a_dbg_fn("");
-
-	small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small;
-	big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big;
-
-	if (small_valid) {
-		pte_addr_small = gk20a_pde_addr(g, entry)
-				 >> gmmu_new_dual_pde_address_shift_v();
-	}
-
-	if (big_valid)
-		pte_addr_big = gk20a_pde_addr(g, entry)
-			       >> gmmu_new_dual_pde_address_big_shift_v();
-
-	if (small_valid) {
-		pde_v[2] |= gmmu_new_dual_pde_address_small_sys_f(pte_addr_small);
-		pde_v[2] |= nvgpu_aperture_mask(g, &entry->mem,
-			gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(),
-			gmmu_new_dual_pde_aperture_small_video_memory_f());
-		pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f();
-		pde_v[3] |= pte_addr_small >> 24;
-	}
-
-	if (big_valid) {
-		pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(pte_addr_big);
-		pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f();
-		pde_v[0] |= nvgpu_aperture_mask(g, &entry->mem,
-			gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(),
-			gmmu_new_dual_pde_aperture_big_video_memory_f());
-		pde_v[1] |= pte_addr_big >> 28;
-	}
-
-	pde = pde0_from_index(i);
-
-	gk20a_pde_wr32(g, pte, pde + 0, pde_v[0]);
-	gk20a_pde_wr32(g, pte, pde + 1, pde_v[1]);
-	gk20a_pde_wr32(g, pte, pde + 2, pde_v[2]);
-	gk20a_pde_wr32(g, pte, pde + 3, pde_v[3]);
-
-	gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d [0x%08x, 0x%08x, 0x%x, 0x%08x]",
-		  i, gmmu_pgsz_idx, pde_v[3], pde_v[2], pde_v[1], pde_v[0]);
-	gk20a_dbg_fn("done");
-	return 0;
+static void update_gmmu_pde0_locked(struct vm_gk20a *vm,
+				    const struct gk20a_mmu_level *l,
+				    struct nvgpu_gmmu_pd *pd,
+				    u32 pd_idx,
+				    u64 virt_addr,
+				    u64 phys_addr,
+				    struct nvgpu_gmmu_attrs *attrs)
+{
+	struct gk20a *g = gk20a_from_vm(vm);
+	bool small_valid, big_valid;
+	u32 small_addr = 0, big_addr = 0;
+	u32 pd_offset = pd_offset_from_index(l, pd_idx);
+	u32 pde_v[4] = {0, 0, 0, 0};
+
+	small_valid = attrs->pgsz == gmmu_page_size_small;
+	big_valid   = attrs->pgsz == gmmu_page_size_big;
+
+	if (small_valid)
+		small_addr = phys_addr >> gmmu_new_dual_pde_address_shift_v();
+
+	if (big_valid)
+		big_addr = phys_addr >> gmmu_new_dual_pde_address_big_shift_v();
+
+	if (small_valid) {
+		pde_v[2] |=
+			gmmu_new_dual_pde_address_small_sys_f(small_addr);
+		pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem,
+			gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(),
+			gmmu_new_dual_pde_aperture_small_video_memory_f());
+		pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f();
+		pde_v[3] |= small_addr >> 24;
+	}
+
+	if (big_valid) {
+		pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr);
+		pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f();
+		pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem,
+			gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(),
+			gmmu_new_dual_pde_aperture_big_video_memory_f());
+		pde_v[1] |= big_addr >> 28;
+	}
+
+	pd_write(g, pd, pd_offset + 0, pde_v[0]);
+	pd_write(g, pd, pd_offset + 1, pde_v[1]);
+	pd_write(g, pd, pd_offset + 2, pde_v[2]);
+	pd_write(g, pd, pd_offset + 3, pde_v[3]);
+
+	pte_dbg(g, attrs,
+		"PDE: i=%-4u size=%-2u offs=%-4u pgsz: %c%c | "
+		"GPU %#-12llx phys %#-12llx "
+		"[0x%08x, 0x%08x, 0x%08x, 0x%08x]",
+		pd_idx, l->entry_size, pd_offset,
+		small_valid ? 'S' : '-',
+		big_valid ? 'B' : '-',
+		virt_addr, phys_addr,
+		pde_v[3], pde_v[2], pde_v[1], pde_v[0]);
 }
 
-static int update_gmmu_pte_locked(struct vm_gk20a *vm,
-			   struct gk20a_mm_entry *pte,
-			   u32 i, u32 gmmu_pgsz_idx,
-			   struct scatterlist **sgl,
-			   u64 *offset,
-			   u64 *iova,
-			   u32 kind_v, u64 *ctag,
-			   bool cacheable, bool unmapped_pte,
-			   int rw_flag, bool sparse, bool priv,
-			   enum nvgpu_aperture aperture)
-{
-	struct gk20a *g = vm->mm->g;
-	u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
-	u64 ctag_granularity = g->ops.fb.compression_page_size(g);
-	u32 pte_w[2] = {0, 0}; /* invalid pte */
-	u32 pte_i;
-
-	if (*iova) {
-		u32 pte_valid = unmapped_pte ?
-			gmmu_new_pte_valid_false_f() :
-			gmmu_new_pte_valid_true_f();
-		u32 iova_v = *iova >> gmmu_new_pte_address_shift_v();
-		u32 pte_addr = aperture == APERTURE_SYSMEM ?
-				gmmu_new_pte_address_sys_f(iova_v) :
-				gmmu_new_pte_address_vid_f(iova_v);
-		u32 pte_tgt = __nvgpu_aperture_mask(g, aperture,
-				gmmu_new_pte_aperture_sys_mem_ncoh_f(),
-				gmmu_new_pte_aperture_video_memory_f());
-
-		pte_w[0] = pte_valid | pte_addr | pte_tgt;
-
-		if (priv)
-			pte_w[0] |= gmmu_new_pte_privilege_true_f();
-
-		pte_w[1] = *iova >> (24 + gmmu_new_pte_address_shift_v()) |
-			gmmu_new_pte_kind_f(kind_v) |
-			gmmu_new_pte_comptagline_f((u32)(*ctag / ctag_granularity));
-
-		if (rw_flag == gk20a_mem_flag_read_only)
-			pte_w[0] |= gmmu_new_pte_read_only_true_f();
-		if (unmapped_pte && !cacheable)
-			pte_w[0] |= gmmu_new_pte_read_only_true_f();
-		else if (!cacheable)
-			pte_w[0] |= gmmu_new_pte_vol_true_f();
-
-		gk20a_dbg(gpu_dbg_pte, "pte=%d iova=0x%llx kind=%d"
-			   " ctag=%d vol=%d"
-			   " [0x%08x, 0x%08x]",
-			   i, *iova,
-			   kind_v, (u32)(*ctag / ctag_granularity), !cacheable,
-			   pte_w[1], pte_w[0]);
-
-		if (*ctag)
-			*ctag += page_size;
-	} else if (sparse) {
-		pte_w[0] = gmmu_new_pte_valid_false_f();
-		pte_w[0] |= gmmu_new_pte_vol_true_f();
-	} else {
-		gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
-	}
-
-	pte_i = pte3_from_index(i);
-
-	gk20a_pde_wr32(g, pte, pte_i + 0, pte_w[0]);
-	gk20a_pde_wr32(g, pte, pte_i + 1, pte_w[1]);
-
-	if (*iova) {
-		*iova += page_size;
-		*offset += page_size;
-		if (*sgl && *offset + page_size > (*sgl)->length) {
-			u64 new_iova;
-			*sgl = sg_next(*sgl);
-			if (*sgl) {
-				new_iova = sg_phys(*sgl);
-				gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
-					  new_iova, (*sgl)->length);
-				if (new_iova) {
-					*offset = 0;
-					*iova = new_iova;
-				}
-			}
-		}
-	}
-	return 0;
+static void __update_pte(struct vm_gk20a *vm,
+			 u32 *pte_w,
+			 u64 phys_addr,
+			 struct nvgpu_gmmu_attrs *attrs)
+{
+	struct gk20a *g = gk20a_from_vm(vm);
+	u64 ctag_granularity = g->ops.fb.compression_page_size(g);
+	u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
+	u32 pte_valid = attrs->valid ?
+		gmmu_new_pte_valid_true_f() :
+		gmmu_new_pte_valid_false_f();
+	u32 phys_shifted = phys_addr >> gmmu_new_pte_address_shift_v();
+	u32 pte_addr = attrs->aperture == APERTURE_SYSMEM ?
+		gmmu_new_pte_address_sys_f(phys_shifted) :
+		gmmu_new_pte_address_vid_f(phys_shifted);
+	u32 pte_tgt = __nvgpu_aperture_mask(g, attrs->aperture,
+		gmmu_new_pte_aperture_sys_mem_ncoh_f(),
+		gmmu_new_pte_aperture_video_memory_f());
+
+	pte_w[0] = pte_valid | pte_addr | pte_tgt;
+
+	if (attrs->priv)
+		pte_w[0] |= gmmu_new_pte_privilege_true_f();
+
+	pte_w[1] = phys_addr >> (24 + gmmu_new_pte_address_shift_v()) |
+		gmmu_new_pte_kind_f(attrs->kind_v) |
+		gmmu_new_pte_comptagline_f((u32)(attrs->ctag /
+						 ctag_granularity));
+
+	if (attrs->rw_flag == gk20a_mem_flag_read_only)
+		pte_w[0] |= gmmu_new_pte_read_only_true_f();
+
+	if (!attrs->valid && !attrs->cacheable)
+		pte_w[0] |= gmmu_new_pte_read_only_true_f();
+	else if (!attrs->cacheable)
+		pte_w[0] |= gmmu_new_pte_vol_true_f();
+
+	if (attrs->ctag)
+		attrs->ctag += page_size;
+
+}
+
+static void __update_pte_sparse(u32 *pte_w)
+{
+	pte_w[0] = gmmu_new_pte_valid_false_f();
+	pte_w[0] |= gmmu_new_pte_vol_true_f();
+}
+
+static void update_gmmu_pte_locked(struct vm_gk20a *vm,
+				   const struct gk20a_mmu_level *l,
+				   struct nvgpu_gmmu_pd *pd,
+				   u32 pd_idx,
+				   u64 virt_addr,
+				   u64 phys_addr,
+				   struct nvgpu_gmmu_attrs *attrs)
+{
+	struct gk20a *g = vm->mm->g;
+	u32 page_size = vm->gmmu_page_sizes[attrs->pgsz];
+	u32 pd_offset = pd_offset_from_index(l, pd_idx);
+	u32 pte_w[2] = {0, 0};
+
+	if (phys_addr)
+		__update_pte(vm, pte_w, phys_addr, attrs);
+	else if (attrs->sparse)
+		__update_pte_sparse(pte_w);
+
+	pte_dbg(g, attrs,
+		"vm=%s "
+		"PTE: i=%-4u size=%-2u offs=%-4u | "
+		"GPU %#-12llx phys %#-12llx "
+		"pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %c%c%c%c "
+		"ctag=0x%08x "
+		"[0x%08x, 0x%08x]",
+		vm->name,
+		pd_idx, l->entry_size, pd_offset,
+		virt_addr, phys_addr,
+		page_size >> 10,
+		nvgpu_gmmu_perm_str(attrs->rw_flag),
+		attrs->kind_v,
+		nvgpu_aperture_str(attrs->aperture),
+		attrs->valid ? 'V' : '-',
+		attrs->cacheable ? 'C' : '-',
+		attrs->sparse ? 'S' : '-',
+		attrs->priv ? 'P' : '-',
+		(u32)attrs->ctag / g->ops.fb.compression_page_size(g),
+		pte_w[1], pte_w[0]);
+
+	pd_write(g, pd, pd_offset + 0, pte_w[0]);
+	pd_write(g, pd, pd_offset + 1, pte_w[1]);
 }
 
 static const struct gk20a_mmu_level gp10b_mm_levels[] = {
@@ -384,7 +365,7 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g,
 static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block,
 		struct vm_gk20a *vm)
 {
-	u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0);
+	u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0);
 	u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
 	u32 pdb_addr_hi = u64_hi32(pdb_addr);
 