author     Konsta Holtta <kholtta@nvidia.com>  2016-09-16 04:28:18 -0400
committer  mobile promotions <svcmobile_promotions@nvidia.com>  2016-10-13 11:09:16 -0400
commit     8728da1c6e76566ebc4717399d1f247200125595 (patch)
tree       86a3b5c581998e0a9575de7a1c292e648adae73d  /drivers/gpu/nvgpu/gk20a/mm_gk20a.c
parent     de17750cf975005d5f3db8a0195f9a04961cc74e (diff)
gpu: nvgpu: compact pte buffers
The lowest page table level may hold very few entries for mappings of
large pages, but a new page is allocated for each list of entries at the
lowest level, wasting memory and performance. Compact these so that the
new "allocation" of ptes is appended at the end of the previous
allocation, if there is space. A 4 KB page is still the smallest size
requested from the allocator; any possible overhead in the allocator
(e.g., internally allocating big pages only) is not taken into account.

Bug 1736604

Change-Id: I03fb795cbc06c869fcf5f1b92def89a04583ee83
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/1221841
(cherry picked from commit fa92017ed48e1d5f48c1a12c512641c6ce9924af)
Reviewed-on: http://git-master/r/1234996
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
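For readers skimming the diff below, here is a minimal standalone sketch of the suballocation arithmetic this patch introduces (tables per chunk, the previous slot, the remaining free slots, and the word offset of the next table). The struct, field, and function names are simplified stand-ins chosen for illustration, not the driver's API; in the real code this math lives in gk20a_zalloc_gmmu_page_table() and operates on struct gk20a_mm_entry.

/*
 * Illustrative sketch only: shows how a new lowest-level table can be
 * placed inside the 4 KB chunk that already backs the previous one.
 * All names here are hypothetical simplifications.
 */
#include <stddef.h>
#include <stdio.h>

struct fake_entry {
	size_t mem_size;	/* bytes backing the chunk (>= 4 KB) */
	unsigned int woffset;	/* offset into the chunk, in 32-bit words */
};

/* Try to append a 'bytes'-sized table after 'prev' in the same chunk. */
static int suballoc_table(const struct fake_entry *prev, size_t bytes,
			  struct fake_entry *out)
{
	unsigned int capacity = prev->mem_size / bytes;	/* tables per chunk */
	unsigned int used = prev->woffset * sizeof(unsigned int) / bytes;
	unsigned int free = capacity - used - 1;	/* slots after prev */

	if (!free)
		return -1;	/* chunk full: caller allocates a fresh page */

	out->mem_size = prev->mem_size;			/* share the chunk */
	out->woffset = prev->woffset + bytes / sizeof(unsigned int);
	return 0;
}

int main(void)
{
	/* e.g. 1 KB tables packed into one 4 KB chunk */
	struct fake_entry prev = { .mem_size = 4096, .woffset = 0 };
	struct fake_entry next;
	int n = 1;

	while (suballoc_table(&prev, 1024, &next) == 0) {
		printf("table %d at word offset %u\n", ++n, next.woffset);
		prev = next;
	}
	return 0;
}

With these example sizes the sketch packs four 1 KB tables into one 4 KB chunk (word offsets 0, 256, 512, 768) before falling back to a fresh allocation.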
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/mm_gk20a.c')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/mm_gk20a.c	98
1 file changed, 75 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 4c55f8ce..f327294a 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -1252,6 +1252,9 @@ void free_gmmu_pages(struct vm_gk20a *vm,
 	if (!entry->mem.size)
 		return;
 
+	if (entry->woffset) /* fake shadow mem */
+		return;
+
 	if (platform->is_fmodel) {
 		free_gmmu_phys_pages(vm, entry);
 		return;
@@ -1317,35 +1320,64 @@ void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry)
 	}
 }
 
-/* allocate a phys contig region big enough for a full
+/*
+ * Allocate a phys contig region big enough for a full
  * sized gmmu page table for the given gmmu_page_size.
- * the whole range is zeroed so it's "invalid"/will fault
+ * the whole range is zeroed so it's "invalid"/will fault.
+ *
+ * If a previous entry is supplied, its memory will be used for
+ * suballocation for this next entry too, if there is space.
  */
 
 static int gk20a_zalloc_gmmu_page_table(struct vm_gk20a *vm,
 		enum gmmu_pgsz_gk20a pgsz_idx,
 		const struct gk20a_mmu_level *l,
-		struct gk20a_mm_entry *entry)
+		struct gk20a_mm_entry *entry,
+		struct gk20a_mm_entry *prev_entry)
 {
-	int err;
+	int err = -ENOMEM;
 	int order;
 	struct gk20a *g = gk20a_from_vm(vm);
+	u32 bytes;
 
 	gk20a_dbg_fn("");
 
 	/* allocate enough pages for the table */
 	order = l->hi_bit[pgsz_idx] - l->lo_bit[pgsz_idx] + 1;
 	order += ilog2(l->entry_size);
+	bytes = 1 << order;
 	order -= PAGE_SHIFT;
-	order = max(0, order);
+	if (order < 0 && prev_entry) {
+		/* try to suballocate from previous chunk */
+		u32 capacity = prev_entry->mem.size / bytes;
+		u32 prev = prev_entry->woffset * sizeof(u32) / bytes;
+		u32 free = capacity - prev - 1;
+
+		gk20a_dbg(gpu_dbg_pte, "cap %d prev %d free %d bytes %d",
+				capacity, prev, free, bytes);
+
+		if (free) {
+			memcpy(&entry->mem, &prev_entry->mem,
+					sizeof(entry->mem));
+			entry->woffset = prev_entry->woffset
+				+ bytes / sizeof(u32);
+			err = 0;
+		}
+	}
+
+	if (err) {
+		/* no suballoc space */
+		order = max(0, order);
+		err = alloc_gmmu_pages(vm, order, entry);
+		entry->woffset = 0;
+	}
 
-	err = alloc_gmmu_pages(vm, order, entry);
-	gk20a_dbg(gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d",
+	gk20a_dbg(gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d, woff %x",
 		entry,
 		(entry->mem.sgt && entry->mem.aperture == APERTURE_SYSMEM) ?
 		g->ops.mm.get_iova_addr(g, entry->mem.sgt->sgl, 0)
 		: 0,
-		order);
+		order, entry->woffset);
 	if (err)
 		return err;
 	entry->pgsz = pgsz_idx;
@@ -3476,13 +3508,31 @@ u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl,
 	return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl));
 }
 
+void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry,
+		size_t w, size_t data)
+{
+	gk20a_mem_wr32(g, &entry->mem, entry->woffset + w, data);
+}
+
+u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry)
+{
+	u64 base;
+
+	if (g->mm.has_physical_mode)
+		base = sg_phys(entry->mem.sgt->sgl);
+	else
+		base = gk20a_mem_get_base_addr(g, &entry->mem, 0);
+
+	return base + entry->woffset * sizeof(u32);
+}
+
 /* for gk20a the "video memory" apertures here are misnomers. */
 static inline u32 big_valid_pde0_bits(struct gk20a *g,
-		struct mem_desc *entry_mem)
+		struct gk20a_mm_entry *entry)
 {
-	u64 pte_addr = gk20a_mem_get_base_addr(g, entry_mem, 0);
+	u64 pte_addr = gk20a_pde_addr(g, entry);
 	u32 pde0_bits =
-		gk20a_aperture_mask(g, entry_mem,
+		gk20a_aperture_mask(g, &entry->mem,
 			gmmu_pde_aperture_big_sys_mem_ncoh_f(),
 			gmmu_pde_aperture_big_video_memory_f()) |
 		gmmu_pde_address_big_sys_f(
@@ -3492,11 +3542,11 @@ static inline u32 big_valid_pde0_bits(struct gk20a *g,
 }
 
 static inline u32 small_valid_pde1_bits(struct gk20a *g,
-		struct mem_desc *entry_mem)
+		struct gk20a_mm_entry *entry)
 {
-	u64 pte_addr = gk20a_mem_get_base_addr(g, entry_mem, 0);
+	u64 pte_addr = gk20a_pde_addr(g, entry);
 	u32 pde1_bits =
-		gk20a_aperture_mask(g, entry_mem,
+		gk20a_aperture_mask(g, &entry->mem,
 			gmmu_pde_aperture_small_sys_mem_ncoh_f(),
 			gmmu_pde_aperture_small_video_memory_f()) |
 		gmmu_pde_vol_small_true_f() | /* tbd: why? */
@@ -3536,11 +3586,11 @@ static int update_gmmu_pde_locked(struct vm_gk20a *vm,
 
 	pde_v[0] = gmmu_pde_size_full_f();
 	pde_v[0] |= big_valid ?
-		big_valid_pde0_bits(g, &entry->mem) :
+		big_valid_pde0_bits(g, entry) :
 		gmmu_pde_aperture_big_invalid_f();
 
 	pde_v[1] |= (small_valid ?
-		small_valid_pde1_bits(g, &entry->mem) :
+		small_valid_pde1_bits(g, entry) :
 		(gmmu_pde_aperture_small_invalid_f() |
 		 gmmu_pde_vol_small_false_f()))
 		|
@@ -3549,8 +3599,8 @@ static int update_gmmu_pde_locked(struct vm_gk20a *vm,
 
 	pde = pde_from_index(i);
 
-	gk20a_mem_wr32(g, &vm->pdb.mem, pde + 0, pde_v[0]);
-	gk20a_mem_wr32(g, &vm->pdb.mem, pde + 1, pde_v[1]);
+	gk20a_pde_wr32(g, &vm->pdb, pde + 0, pde_v[0]);
+	gk20a_pde_wr32(g, &vm->pdb, pde + 1, pde_v[1]);
 
 	gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x",
 		  i, gmmu_pgsz_idx, pde_v[1], pde_v[0]);
@@ -3633,8 +3683,8 @@ static int update_gmmu_pte_locked(struct vm_gk20a *vm,
 		gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
 	}
 
-	gk20a_mem_wr32(g, &pte->mem, pte_from_index(i) + 0, pte_w[0]);
-	gk20a_mem_wr32(g, &pte->mem, pte_from_index(i) + 1, pte_w[1]);
+	gk20a_pde_wr32(g, pte, pte_from_index(i) + 0, pte_w[0]);
+	gk20a_pde_wr32(g, pte, pte_from_index(i) + 1, pte_w[1]);
 
 	if (*iova) {
 		*iova += page_size;
@@ -3678,6 +3728,7 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm,
 	int err = 0;
 	u32 pde_i;
 	u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx];
+	struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL;
 
 	gk20a_dbg_fn("");
 
@@ -3688,7 +3739,6 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm,
 		pgsz_idx, lvl, gpu_va, gpu_end-1, *iova);
 
 	while (gpu_va < gpu_end) {
-		struct gk20a_mm_entry *next_pte = NULL;
 		u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end);
 
 		/* Allocate next level */
@@ -3706,11 +3756,12 @@ static int update_gmmu_level_locked(struct vm_gk20a *vm,
 			pte->pgsz = pgsz_idx;
 			pte->num_entries = num_entries;
 		}
+		prev_pte = next_pte;
 		next_pte = pte->entries + pde_i;
 
 		if (!next_pte->mem.size) {
 			err = gk20a_zalloc_gmmu_page_table(vm,
-				pgsz_idx, next_l, next_pte);
+				pgsz_idx, next_l, next_pte, prev_pte);
 			if (err)
 				return err;
 		}
@@ -4203,7 +4254,8 @@ int gk20a_init_vm(struct mm_gk20a *mm,
 		name, vm->va_limit, pde_hi + 1);
 
 	/* allocate the page table directory */
-	err = gk20a_zalloc_gmmu_page_table(vm, 0, &vm->mmu_levels[0], &vm->pdb);
+	err = gk20a_zalloc_gmmu_page_table(vm, 0, &vm->mmu_levels[0],
+			&vm->pdb, NULL);
 	if (err)
 		goto clean_up_pdes;
 
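As a rough usage illustration of the helpers added above (a sketch with made-up numbers, not values taken from real hardware): a suballocated table's address as programmed into its parent directory entry is the chunk base plus woffset * sizeof(u32), and every write into that table is shifted by the same word offset, which is what gk20a_pde_addr() and gk20a_pde_wr32() encapsulate in the patch.

/*
 * Sketch only, with hypothetical numbers: how a table's word offset
 * shifts both the address seen by the parent PDE and the index of
 * each PTE write, mirroring gk20a_pde_addr()/gk20a_pde_wr32().
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t base = 0x100000;	/* start of the 4 KB backing chunk */
	uint32_t woffset = 256;		/* second 1 KB table in that chunk */
	uint32_t w = 2;			/* word index within this table */

	/* address of this table as programmed into the parent entry */
	uint64_t pde_addr = base + woffset * sizeof(uint32_t);

	/* a write to word w of this table lands at chunk word woffset + w */
	printf("pde addr 0x%llx, chunk word index %u\n",
	       (unsigned long long)pde_addr, woffset + w);
	return 0;
}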