From 048c6b062ae381a329dccbc7ca0599113dbd7417 Mon Sep 17 00:00:00 2001 From: Alex Waterman Date: Thu, 11 May 2017 18:25:47 +0100 Subject: gpu: nvgpu: Separate GMMU mapping impl from mm_gk20a.c Separate the non-chip specific GMMU mapping implementation code out of mm_gk20a.c. This puts all of the chip-agnostic code into common/mm/gmmu.c in preparation for rewriting it. JIRA NVGPU-12 JIRA NVGPU-30 Change-Id: I6f7fdac3422703f5e80bb22ad304dc27bba4814d Signed-off-by: Alex Waterman Reviewed-on: http://git-master/r/1480228 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/mm/gmmu.c | 517 ++++++++++++++++++++++++++++++++ drivers/gpu/nvgpu/common/mm/vm.c | 21 +- drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 521 --------------------------------- drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 7 - drivers/gpu/nvgpu/include/nvgpu/gmmu.h | 3 + 5 files changed, 539 insertions(+), 530 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c index dc91cc2f..e63155f2 100644 --- a/drivers/gpu/nvgpu/common/mm/gmmu.c +++ b/drivers/gpu/nvgpu/common/mm/gmmu.c @@ -15,14 +15,81 @@ */ #include +#include #include #include #include #include +#include #include "gk20a/gk20a.h" #include "gk20a/mm_gk20a.h" +#define gmmu_dbg(g, fmt, args...) \ + nvgpu_log(g, gpu_dbg_map, fmt, ##args) +#define gmmu_dbg_v(g, fmt, args...) \ + nvgpu_log(g, gpu_dbg_map_v, fmt, ##args) + +static int map_gmmu_phys_pages(struct gk20a_mm_entry *entry) +{ + FLUSH_CPU_DCACHE(entry->mem.cpu_va, + sg_phys(entry->mem.priv.sgt->sgl), + entry->mem.priv.sgt->sgl->length); + return 0; +} + +static void unmap_gmmu_phys_pages(struct gk20a_mm_entry *entry) +{ + FLUSH_CPU_DCACHE(entry->mem.cpu_va, + sg_phys(entry->mem.priv.sgt->sgl), + entry->mem.priv.sgt->sgl->length); +} + +static int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) +{ + gk20a_dbg_fn(""); + + if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) + return map_gmmu_phys_pages(entry); + + if (IS_ENABLED(CONFIG_ARM64)) { + if (entry->mem.aperture == APERTURE_VIDMEM) + return 0; + + FLUSH_CPU_DCACHE(entry->mem.cpu_va, + sg_phys(entry->mem.priv.sgt->sgl), + entry->mem.size); + } else { + int err = nvgpu_mem_begin(g, &entry->mem); + + if (err) + return err; + } + + return 0; +} + +static void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) +{ + gk20a_dbg_fn(""); + + if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) { + unmap_gmmu_phys_pages(entry); + return; + } + + if (IS_ENABLED(CONFIG_ARM64)) { + if (entry->mem.aperture == APERTURE_VIDMEM) + return; + + FLUSH_CPU_DCACHE(entry->mem.cpu_va, + sg_phys(entry->mem.priv.sgt->sgl), + entry->mem.size); + } else { + nvgpu_mem_end(g, &entry->mem); + } +} + static int alloc_gmmu_phys_pages(struct vm_gk20a *vm, u32 order, struct gk20a_mm_entry *entry) { @@ -97,6 +164,44 @@ static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 order, return 0; } +static void free_gmmu_phys_pages(struct vm_gk20a *vm, + struct gk20a_mm_entry *entry) +{ + gk20a_dbg_fn(""); + + /* note: mem_desc slightly abused (wrt. 
nvgpu_free_gmmu_pages) */ + + free_pages((unsigned long)entry->mem.cpu_va, get_order(entry->mem.size)); + entry->mem.cpu_va = NULL; + + sg_free_table(entry->mem.priv.sgt); + nvgpu_kfree(vm->mm->g, entry->mem.priv.sgt); + entry->mem.priv.sgt = NULL; + entry->mem.size = 0; + entry->mem.aperture = APERTURE_INVALID; +} + +void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, + struct gk20a_mm_entry *entry) +{ + struct gk20a *g = gk20a_from_vm(vm); + + gk20a_dbg_fn(""); + + if (!entry->mem.size) + return; + + if (entry->woffset) /* fake shadow mem */ + return; + + if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) { + free_gmmu_phys_pages(vm, entry); + return; + } + + nvgpu_dma_free(g, &entry->mem); +} + /* * Allocate a phys contig region big enough for a full * sized gmmu page table for the given gmmu_page_size. @@ -202,6 +307,9 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm, return vaddr; } +/* + * Convenience wrapper over __nvgpu_gmmu_map() for non-fixed mappings. + */ u64 nvgpu_gmmu_map(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 size, @@ -246,3 +354,412 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va) nvgpu_mutex_release(&vm->update_gmmu_lock); } + +static int update_gmmu_level_locked(struct vm_gk20a *vm, + struct gk20a_mm_entry *pte, + enum gmmu_pgsz_gk20a pgsz_idx, + struct scatterlist **sgl, + u64 *offset, + u64 *iova, + u64 gpu_va, u64 gpu_end, + u8 kind_v, u64 *ctag, + bool cacheable, bool unmapped_pte, + int rw_flag, + bool sparse, + int lvl, + bool priv, + enum nvgpu_aperture aperture) +{ + struct gk20a *g = gk20a_from_vm(vm); + const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; + const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1]; + int err = 0; + u32 pde_i; + u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx]; + struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL; + + gk20a_dbg_fn(""); + + pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL)) + >> (u64)l->lo_bit[pgsz_idx]; + + gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx", + pgsz_idx, lvl, gpu_va, gpu_end-1, *iova); + + while (gpu_va < gpu_end) { + u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end); + + /* Allocate next level */ + if (next_l->update_entry) { + if (!pte->entries) { + int num_entries = + 1 << + (l->hi_bit[pgsz_idx] + - l->lo_bit[pgsz_idx] + 1); + pte->entries = + nvgpu_vzalloc(g, + sizeof(struct gk20a_mm_entry) * + num_entries); + if (!pte->entries) + return -ENOMEM; + pte->pgsz = pgsz_idx; + pte->num_entries = num_entries; + } + prev_pte = next_pte; + next_pte = pte->entries + pde_i; + + if (!next_pte->mem.size) { + err = nvgpu_zalloc_gmmu_page_table(vm, + pgsz_idx, next_l, next_pte, prev_pte); + if (err) + return err; + } + } + + err = l->update_entry(vm, pte, pde_i, pgsz_idx, + sgl, offset, iova, + kind_v, ctag, cacheable, unmapped_pte, + rw_flag, sparse, priv, aperture); + if (err) + return err; + + if (next_l->update_entry) { + /* get cpu access to the ptes */ + err = map_gmmu_pages(g, next_pte); + if (err) { + nvgpu_err(g, + "couldn't map ptes for update as=%d", + vm_aspace_id(vm)); + return err; + } + err = update_gmmu_level_locked(vm, next_pte, + pgsz_idx, + sgl, + offset, + iova, + gpu_va, + next, + kind_v, ctag, cacheable, unmapped_pte, + rw_flag, sparse, lvl+1, priv, aperture); + unmap_gmmu_pages(g, next_pte); + + if (err) + return err; + } + + pde_i++; + gpu_va = next; + } + + gk20a_dbg_fn("done"); + + return 0; +} + +/* + * This is the true top level GMMU mapping logic. 
This breaks down the incoming + * scatter gather table and does actual programming of GPU virtual address to + * physical* address. + * + * The update of each level of the page tables is farmed out to chip specific + * implementations. But the logic around that is generic to all chips. Every chip + * has some number of PDE levels and then a PTE level. + * + * Each chunk of the incoming SGT is sent to the chip specific implementation + * of page table update. + * + * [*] Note: the "physical" address may actually be an IO virtual address in the + * case of SMMU usage. + */ +static int update_gmmu_ptes_locked(struct vm_gk20a *vm, + enum gmmu_pgsz_gk20a pgsz_idx, + struct sg_table *sgt, + u64 buffer_offset, + u64 gpu_va, u64 gpu_end, + u8 kind_v, u32 ctag_offset, + bool cacheable, bool unmapped_pte, + int rw_flag, + bool sparse, + bool priv, + enum nvgpu_aperture aperture) +{ + struct gk20a *g = gk20a_from_vm(vm); + int ctag_granularity = g->ops.fb.compression_page_size(g); + u64 ctag = (u64)ctag_offset * (u64)ctag_granularity; + u64 iova = 0; + u64 space_to_skip = buffer_offset; + u64 map_size = gpu_end - gpu_va; + u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; + int err; + struct scatterlist *sgl = NULL; + struct nvgpu_page_alloc *alloc = NULL; + struct page_alloc_chunk *chunk = NULL; + u64 length; + + /* note: here we need to map kernel to small, since the + * low-level mmu code assumes 0 is small and 1 is big pages */ + if (pgsz_idx == gmmu_page_size_kernel) + pgsz_idx = gmmu_page_size_small; + + if (space_to_skip & (page_size - 1)) + return -EINVAL; + + err = map_gmmu_pages(g, &vm->pdb); + if (err) { + nvgpu_err(g, + "couldn't map ptes for update as=%d", + vm_aspace_id(vm)); + return err; + } + + if (aperture == APERTURE_VIDMEM) { + gmmu_dbg_v(g, "vidmem map size_idx=%d, gpu_va=[%llx,%llx]", + pgsz_idx, gpu_va, gpu_end-1); + + if (sgt) { + alloc = get_vidmem_page_alloc(sgt->sgl); + + nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, + page_alloc_chunk, list_entry) { + if (space_to_skip && + space_to_skip > chunk->length) { + space_to_skip -= chunk->length; + } else { + iova = chunk->base + space_to_skip; + length = chunk->length - space_to_skip; + length = min(length, map_size); + space_to_skip = 0; + + err = update_gmmu_level_locked(vm, + &vm->pdb, pgsz_idx, + &sgl, + &space_to_skip, + &iova, + gpu_va, gpu_va + length, + kind_v, &ctag, + cacheable, unmapped_pte, + rw_flag, sparse, 0, priv, + aperture); + if (err) + break; + + /* need to set explicit zero here */ + space_to_skip = 0; + gpu_va += length; + map_size -= length; + + if (!map_size) + break; + } + } + } else { + err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, + &sgl, + &space_to_skip, + &iova, + gpu_va, gpu_end, + kind_v, &ctag, + cacheable, unmapped_pte, rw_flag, + sparse, 0, priv, + aperture); + } + } else { + gmmu_dbg_v(g, + "pgsz=%-6d, gpu_va: %#-12llx +%#-6llx phys: %#-12llx " + "buffer offset: %-4lld, nents: %d", + page_size, + gpu_va, gpu_end - gpu_va, + sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL, + buffer_offset, + sgt ? 
sgt->nents : 0);
+
+		if (sgt) {
+			iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
+			if (!vm->mm->bypass_smmu && iova) {
+				iova += space_to_skip;
+			} else {
+				sgl = sgt->sgl;
+
+				gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
+					  (u64)sg_phys(sgl),
+					  sgl->length);
+
+				while (space_to_skip && sgl &&
+				       space_to_skip + page_size > sgl->length) {
+					space_to_skip -= sgl->length;
+					sgl = sg_next(sgl);
+					gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
+						  (u64)sg_phys(sgl),
+						  sgl->length);
+				}
+
+				iova = sg_phys(sgl) + space_to_skip;
+			}
+		}
+
+		err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
+				&sgl,
+				&space_to_skip,
+				&iova,
+				gpu_va, gpu_end,
+				kind_v, &ctag,
+				cacheable, unmapped_pte, rw_flag,
+				sparse, 0, priv,
+				aperture);
+	}
+
+	unmap_gmmu_pages(g, &vm->pdb);
+
+	mb();
+
+	gk20a_dbg_fn("done");
+
+	return err;
+}
+
+/**
+ * gk20a_locked_gmmu_map - Map a buffer into the GMMU
+ *
+ * This is for non-vGPU chips. It's part of the HAL at the moment but really
+ * should not be. Chip specific stuff is handled at the PTE/PDE programming
+ * layer. The rest of the logic is essentially generic for all chips.
+ *
+ * To call this function you must have locked the VM lock: vm->update_gmmu_lock.
+ * However, note: this function is not called directly. It's used through the
+ * mm.gmmu_lock() HAL. So before calling the mm.gmmu_lock() HAL make sure you
+ * have the update_gmmu_lock acquired.
+ */
+u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
+			  u64 map_offset,
+			  struct sg_table *sgt,
+			  u64 buffer_offset,
+			  u64 size,
+			  int pgsz_idx,
+			  u8 kind_v,
+			  u32 ctag_offset,
+			  u32 flags,
+			  int rw_flag,
+			  bool clear_ctags,
+			  bool sparse,
+			  bool priv,
+			  struct vm_gk20a_mapping_batch *batch,
+			  enum nvgpu_aperture aperture)
+{
+	int err = 0;
+	bool allocated = false;
+	struct gk20a *g = gk20a_from_vm(vm);
+	int ctag_granularity = g->ops.fb.compression_page_size(g);
+	u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity);
+
+	/* Allocate (or validate when map_offset != 0) the virtual address. */
+	if (!map_offset) {
+		map_offset = __nvgpu_vm_alloc_va(vm, size,
+						 pgsz_idx);
+		if (!map_offset) {
+			nvgpu_err(g, "failed to allocate va space");
+			err = -ENOMEM;
+			goto fail_alloc;
+		}
+		allocated = true;
+	}
+
+	gmmu_dbg(g,
+		 "gv: 0x%04x_%08x + 0x%-7llx "
+		 "[dma: 0x%02x_%08x, pa: 0x%02x_%08x] "
+		 "pgsz=%-3dKb as=%-2d ctags=%d start=%d "
+		 "kind=0x%x flags=0x%x apt=%s",
+		 u64_hi32(map_offset), u64_lo32(map_offset), size,
+		 sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0,
+		 sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0,
+		 sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0,
+		 sgt ?
u64_lo32((u64)sg_phys(sgt->sgl)) : 0, + vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm), + ctag_lines, ctag_offset, + kind_v, flags, nvgpu_aperture_str(aperture)); + + err = update_gmmu_ptes_locked(vm, pgsz_idx, + sgt, + buffer_offset, + map_offset, map_offset + size, + kind_v, + ctag_offset, + flags & + NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, + flags & + NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE, + rw_flag, + sparse, + priv, + aperture); + if (err) { + nvgpu_err(g, "failed to update ptes on map"); + goto fail_validate; + } + + if (!batch) + g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); + else + batch->need_tlb_invalidate = true; + + return map_offset; +fail_validate: + if (allocated) + __nvgpu_vm_free_va(vm, map_offset, pgsz_idx); +fail_alloc: + nvgpu_err(g, "%s: failed with err=%d", __func__, err); + return 0; +} + +void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, + u64 vaddr, + u64 size, + int pgsz_idx, + bool va_allocated, + int rw_flag, + bool sparse, + struct vm_gk20a_mapping_batch *batch) +{ + int err = 0; + struct gk20a *g = gk20a_from_vm(vm); + + if (va_allocated) { + err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); + if (err) { + nvgpu_err(g, "failed to free va"); + return; + } + } + + /* unmap here needs to know the page size we assigned at mapping */ + err = update_gmmu_ptes_locked(vm, + pgsz_idx, + NULL, /* n/a for unmap */ + 0, + vaddr, + vaddr + size, + 0, 0, false /* n/a for unmap */, + false, rw_flag, + sparse, 0, + APERTURE_INVALID); /* don't care for unmap */ + if (err) + nvgpu_err(g, "failed to update gmmu ptes on unmap"); + + /* flush l2 so any dirty lines are written out *now*. + * also as we could potentially be switching this buffer + * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at + * some point in the future we need to invalidate l2. e.g. switching + * from a render buffer unmap (here) to later using the same memory + * for gmmu ptes. note the positioning of this relative to any smmu + * unmapping (below). */ + + if (!batch) { + gk20a_mm_l2_flush(g, true); + g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); + } else { + if (!batch->gpu_l2_flushed) { + gk20a_mm_l2_flush(g, true); + batch->gpu_l2_flushed = true; + } + batch->need_tlb_invalidate = true; + } +} diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c index e24d40bf..5ba386c9 100644 --- a/drivers/gpu/nvgpu/common/mm/vm.c +++ b/drivers/gpu/nvgpu/common/mm/vm.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,22 @@ int vm_aspace_id(struct vm_gk20a *vm) return vm->as_share ? 
vm->as_share->id : -1;
 }
 
+static void nvgpu_vm_free_entries(struct vm_gk20a *vm,
+				  struct gk20a_mm_entry *parent,
+				  int level)
+{
+	int i;
+
+	if (parent->entries)
+		for (i = 0; i < parent->num_entries; i++)
+			nvgpu_vm_free_entries(vm, &parent->entries[i], level+1);
+
+	if (parent->mem.size)
+		nvgpu_free_gmmu_pages(vm, parent);
+	nvgpu_vfree(vm->mm->g, parent->entries);
+	parent->entries = NULL;
+}
+
 u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm,
 			u64 size,
 			enum gmmu_pgsz_gk20a pgsz_idx)
@@ -421,7 +438,7 @@ clean_up_allocators:
 clean_up_page_tables:
 	/* Cleans up nvgpu_vm_init_page_tables() */
 	nvgpu_vfree(g, vm->pdb.entries);
-	free_gmmu_pages(vm, &vm->pdb);
+	nvgpu_free_gmmu_pages(vm, &vm->pdb);
 clean_up_vgpu_vm:
 #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
 	if (g->is_virtual)
@@ -537,7 +554,7 @@ static void __nvgpu_vm_remove(struct vm_gk20a *vm)
 	if (nvgpu_alloc_initialized(&vm->user_lp))
 		nvgpu_alloc_destroy(&vm->user_lp);
 
-	gk20a_vm_free_entries(vm, &vm->pdb, 0);
+	nvgpu_vm_free_entries(vm, &vm->pdb, 0);
 
 #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
 	if (g->is_virtual)
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index a1873a30..e7bcf6f0 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -124,15 +124,6 @@ struct nvgpu_page_alloc *get_vidmem_page_alloc(struct scatterlist *sgl)
  *
  */
 
-static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
-				   enum gmmu_pgsz_gk20a pgsz_idx,
-				   struct sg_table *sgt, u64 buffer_offset,
-				   u64 first_vaddr, u64 last_vaddr,
-				   u8 kind_v, u32 ctag_offset, bool cacheable,
-				   bool umapped_pte, int rw_flag,
-				   bool sparse,
-				   bool priv,
-				   enum nvgpu_aperture aperture);
 static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm);
 static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm);
 static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm);
@@ -781,104 +772,6 @@ void gk20a_init_mm_ce_context(struct gk20a *g)
 #endif
 }
 
-static void free_gmmu_phys_pages(struct vm_gk20a *vm,
-				 struct gk20a_mm_entry *entry)
-{
-	gk20a_dbg_fn("");
-
-	/* note: mem_desc slightly abused (wrt.
free_gmmu_pages) */ - - free_pages((unsigned long)entry->mem.cpu_va, get_order(entry->mem.size)); - entry->mem.cpu_va = NULL; - - sg_free_table(entry->mem.priv.sgt); - nvgpu_kfree(vm->mm->g, entry->mem.priv.sgt); - entry->mem.priv.sgt = NULL; - entry->mem.size = 0; - entry->mem.aperture = APERTURE_INVALID; -} - -static int map_gmmu_phys_pages(struct gk20a_mm_entry *entry) -{ - FLUSH_CPU_DCACHE(entry->mem.cpu_va, - sg_phys(entry->mem.priv.sgt->sgl), - entry->mem.priv.sgt->sgl->length); - return 0; -} - -static void unmap_gmmu_phys_pages(struct gk20a_mm_entry *entry) -{ - FLUSH_CPU_DCACHE(entry->mem.cpu_va, - sg_phys(entry->mem.priv.sgt->sgl), - entry->mem.priv.sgt->sgl->length); -} - -void free_gmmu_pages(struct vm_gk20a *vm, - struct gk20a_mm_entry *entry) -{ - struct gk20a *g = gk20a_from_vm(vm); - - gk20a_dbg_fn(""); - - if (!entry->mem.size) - return; - - if (entry->woffset) /* fake shadow mem */ - return; - - if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) { - free_gmmu_phys_pages(vm, entry); - return; - } - - nvgpu_dma_free(g, &entry->mem); -} - -int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) -{ - gk20a_dbg_fn(""); - - if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) - return map_gmmu_phys_pages(entry); - - if (IS_ENABLED(CONFIG_ARM64)) { - if (entry->mem.aperture == APERTURE_VIDMEM) - return 0; - - FLUSH_CPU_DCACHE(entry->mem.cpu_va, - sg_phys(entry->mem.priv.sgt->sgl), - entry->mem.size); - } else { - int err = nvgpu_mem_begin(g, &entry->mem); - - if (err) - return err; - } - - return 0; -} - -void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry) -{ - gk20a_dbg_fn(""); - - if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) { - unmap_gmmu_phys_pages(entry); - return; - } - - if (IS_ENABLED(CONFIG_ARM64)) { - if (entry->mem.aperture == APERTURE_VIDMEM) - return; - - FLUSH_CPU_DCACHE(entry->mem.cpu_va, - sg_phys(entry->mem.priv.sgt->sgl), - entry->mem.size); - } else { - nvgpu_mem_end(g, &entry->mem); - } -} - int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm) { return vm->mmu_levels[0].lo_bit[0]; @@ -909,21 +802,6 @@ static u32 pte_from_index(u32 i) return i * gmmu_pte__size_v() / sizeof(u32); } -u32 pte_index_from_vaddr(struct vm_gk20a *vm, - u64 addr, enum gmmu_pgsz_gk20a pgsz_idx) -{ - u32 ret; - /* mask off pde part */ - addr = addr & ((1ULL << gk20a_mm_pde_coverage_bit_count(vm)) - 1ULL); - - /* shift over to get pte index. note assumption that pte index - * doesn't leak over into the high 32b */ - ret = (u32)(addr >> ilog2(vm->gmmu_page_sizes[pgsz_idx])); - - gk20a_dbg(gpu_dbg_pte, "addr=0x%llx pte_i=0x%x", addr, ret); - return ret; -} - int nvgpu_vm_get_buffers(struct vm_gk20a *vm, struct nvgpu_mapped_buf ***mapped_buffers, int *num_buffers) @@ -1096,141 +974,6 @@ int setup_buffer_kind_and_compression(struct vm_gk20a *vm, return 0; } -u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, - u64 map_offset, - struct sg_table *sgt, - u64 buffer_offset, - u64 size, - int pgsz_idx, - u8 kind_v, - u32 ctag_offset, - u32 flags, - int rw_flag, - bool clear_ctags, - bool sparse, - bool priv, - struct vm_gk20a_mapping_batch *batch, - enum nvgpu_aperture aperture) -{ - int err = 0; - bool allocated = false; - struct gk20a *g = gk20a_from_vm(vm); - int ctag_granularity = g->ops.fb.compression_page_size(g); - u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity); - - /* Allocate (or validate when map_offset != 0) the virtual address. 
*/ - if (!map_offset) { - map_offset = __nvgpu_vm_alloc_va(vm, size, - pgsz_idx); - if (!map_offset) { - nvgpu_err(g, "failed to allocate va space"); - err = -ENOMEM; - goto fail_alloc; - } - allocated = true; - } - - gk20a_dbg(gpu_dbg_map, - "gv: 0x%04x_%08x + 0x%-7llx " - "[dma: 0x%02x_%08x, pa: 0x%02x_%08x] " - "pgsz=%-3dKb as=%-2d ctags=%d start=%d " - "kind=0x%x flags=0x%x apt=%s", - u64_hi32(map_offset), u64_lo32(map_offset), size, - sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0, - sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0, - sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0, - sgt ? u64_lo32((u64)sg_phys(sgt->sgl)) : 0, - vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm), - ctag_lines, ctag_offset, - kind_v, flags, nvgpu_aperture_str(aperture)); - - err = update_gmmu_ptes_locked(vm, pgsz_idx, - sgt, - buffer_offset, - map_offset, map_offset + size, - kind_v, - ctag_offset, - flags & - NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, - flags & - NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE, - rw_flag, - sparse, - priv, - aperture); - if (err) { - nvgpu_err(g, "failed to update ptes on map"); - goto fail_validate; - } - - if (!batch) - g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); - else - batch->need_tlb_invalidate = true; - - return map_offset; -fail_validate: - if (allocated) - __nvgpu_vm_free_va(vm, map_offset, pgsz_idx); -fail_alloc: - nvgpu_err(g, "%s: failed with err=%d", __func__, err); - return 0; -} - -void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, - u64 vaddr, - u64 size, - int pgsz_idx, - bool va_allocated, - int rw_flag, - bool sparse, - struct vm_gk20a_mapping_batch *batch) -{ - int err = 0; - struct gk20a *g = gk20a_from_vm(vm); - - if (va_allocated) { - err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx); - if (err) { - nvgpu_err(g, "failed to free va"); - return; - } - } - - /* unmap here needs to know the page size we assigned at mapping */ - err = update_gmmu_ptes_locked(vm, - pgsz_idx, - NULL, /* n/a for unmap */ - 0, - vaddr, - vaddr + size, - 0, 0, false /* n/a for unmap */, - false, rw_flag, - sparse, 0, - APERTURE_INVALID); /* don't care for unmap */ - if (err) - nvgpu_err(g, "failed to update gmmu ptes on unmap"); - - /* flush l2 so any dirty lines are written out *now*. - * also as we could potentially be switching this buffer - * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at - * some point in the future we need to invalidate l2. e.g. switching - * from a render buffer unmap (here) to later using the same memory - * for gmmu ptes. note the positioning of this relative to any smmu - * unmapping (below). 
*/ - - if (!batch) { - gk20a_mm_l2_flush(g, true); - g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); - } else { - if (!batch->gpu_l2_flushed) { - gk20a_mm_l2_flush(g, true); - batch->gpu_l2_flushed = true; - } - batch->need_tlb_invalidate = true; - } -} - enum nvgpu_aperture gk20a_dmabuf_aperture(struct gk20a *g, struct dma_buf *dmabuf) { @@ -2036,254 +1779,6 @@ static int update_gmmu_pte_locked(struct vm_gk20a *vm, return 0; } -static int update_gmmu_level_locked(struct vm_gk20a *vm, - struct gk20a_mm_entry *pte, - enum gmmu_pgsz_gk20a pgsz_idx, - struct scatterlist **sgl, - u64 *offset, - u64 *iova, - u64 gpu_va, u64 gpu_end, - u8 kind_v, u64 *ctag, - bool cacheable, bool unmapped_pte, - int rw_flag, - bool sparse, - int lvl, - bool priv, - enum nvgpu_aperture aperture) -{ - struct gk20a *g = gk20a_from_vm(vm); - const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl]; - const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1]; - int err = 0; - u32 pde_i; - u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx]; - struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL; - - gk20a_dbg_fn(""); - - pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL)) - >> (u64)l->lo_bit[pgsz_idx]; - - gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx", - pgsz_idx, lvl, gpu_va, gpu_end-1, *iova); - - while (gpu_va < gpu_end) { - u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end); - - /* Allocate next level */ - if (next_l->update_entry) { - if (!pte->entries) { - int num_entries = - 1 << - (l->hi_bit[pgsz_idx] - - l->lo_bit[pgsz_idx] + 1); - pte->entries = - nvgpu_vzalloc(g, - sizeof(struct gk20a_mm_entry) * - num_entries); - if (!pte->entries) - return -ENOMEM; - pte->pgsz = pgsz_idx; - pte->num_entries = num_entries; - } - prev_pte = next_pte; - next_pte = pte->entries + pde_i; - - if (!next_pte->mem.size) { - err = nvgpu_zalloc_gmmu_page_table(vm, - pgsz_idx, next_l, next_pte, prev_pte); - if (err) - return err; - } - } - - err = l->update_entry(vm, pte, pde_i, pgsz_idx, - sgl, offset, iova, - kind_v, ctag, cacheable, unmapped_pte, - rw_flag, sparse, priv, aperture); - if (err) - return err; - - if (next_l->update_entry) { - /* get cpu access to the ptes */ - err = map_gmmu_pages(g, next_pte); - if (err) { - nvgpu_err(g, - "couldn't map ptes for update as=%d", - vm_aspace_id(vm)); - return err; - } - err = update_gmmu_level_locked(vm, next_pte, - pgsz_idx, - sgl, - offset, - iova, - gpu_va, - next, - kind_v, ctag, cacheable, unmapped_pte, - rw_flag, sparse, lvl+1, priv, aperture); - unmap_gmmu_pages(g, next_pte); - - if (err) - return err; - } - - pde_i++; - gpu_va = next; - } - - gk20a_dbg_fn("done"); - - return 0; -} - -static int update_gmmu_ptes_locked(struct vm_gk20a *vm, - enum gmmu_pgsz_gk20a pgsz_idx, - struct sg_table *sgt, - u64 buffer_offset, - u64 gpu_va, u64 gpu_end, - u8 kind_v, u32 ctag_offset, - bool cacheable, bool unmapped_pte, - int rw_flag, - bool sparse, - bool priv, - enum nvgpu_aperture aperture) -{ - struct gk20a *g = gk20a_from_vm(vm); - int ctag_granularity = g->ops.fb.compression_page_size(g); - u64 ctag = (u64)ctag_offset * (u64)ctag_granularity; - u64 iova = 0; - u64 space_to_skip = buffer_offset; - u64 map_size = gpu_end - gpu_va; - u32 page_size = vm->gmmu_page_sizes[pgsz_idx]; - int err; - struct scatterlist *sgl = NULL; - struct nvgpu_page_alloc *alloc = NULL; - struct page_alloc_chunk *chunk = NULL; - u64 length; - - /* note: here we need to map kernel to small, since the - * low-level mmu code assumes 0 is small and 1 is big pages */ - 
if (pgsz_idx == gmmu_page_size_kernel) - pgsz_idx = gmmu_page_size_small; - - if (space_to_skip & (page_size - 1)) - return -EINVAL; - - err = map_gmmu_pages(g, &vm->pdb); - if (err) { - nvgpu_err(g, - "couldn't map ptes for update as=%d", - vm_aspace_id(vm)); - return err; - } - - if (aperture == APERTURE_VIDMEM) { - gk20a_dbg(gpu_dbg_map_v, "vidmem map size_idx=%d, gpu_va=[%llx,%llx], alloc=%llx", - pgsz_idx, gpu_va, gpu_end-1, iova); - - if (sgt) { - alloc = get_vidmem_page_alloc(sgt->sgl); - - nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks, - page_alloc_chunk, list_entry) { - if (space_to_skip && - space_to_skip > chunk->length) { - space_to_skip -= chunk->length; - } else { - iova = chunk->base + space_to_skip; - length = chunk->length - space_to_skip; - length = min(length, map_size); - space_to_skip = 0; - - err = update_gmmu_level_locked(vm, - &vm->pdb, pgsz_idx, - &sgl, - &space_to_skip, - &iova, - gpu_va, gpu_va + length, - kind_v, &ctag, - cacheable, unmapped_pte, - rw_flag, sparse, 0, priv, - aperture); - if (err) - break; - - /* need to set explicit zero here */ - space_to_skip = 0; - gpu_va += length; - map_size -= length; - - if (!map_size) - break; - } - } - } else { - err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, - &sgl, - &space_to_skip, - &iova, - gpu_va, gpu_end, - kind_v, &ctag, - cacheable, unmapped_pte, rw_flag, - sparse, 0, priv, - aperture); - } - } else { - gk20a_dbg(gpu_dbg_pte, "size_idx=%d, iova=%llx, buffer offset %lld, nents %d", - pgsz_idx, - sgt ? g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0) - : 0ULL, - buffer_offset, - sgt ? sgt->nents : 0); - - gk20a_dbg(gpu_dbg_map_v, "size_idx=%d, gpu_va=[%llx,%llx], iova=%llx", - pgsz_idx, gpu_va, gpu_end-1, iova); - - if (sgt) { - iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0); - if (!vm->mm->bypass_smmu && iova) { - iova += space_to_skip; - } else { - sgl = sgt->sgl; - - gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", - (u64)sg_phys(sgl), - sgl->length); - - while (space_to_skip && sgl && - space_to_skip + page_size > sgl->length) { - space_to_skip -= sgl->length; - sgl = sg_next(sgl); - gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d", - (u64)sg_phys(sgl), - sgl->length); - } - - iova = sg_phys(sgl) + space_to_skip; - } - } - - err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx, - &sgl, - &space_to_skip, - &iova, - gpu_va, gpu_end, - kind_v, &ctag, - cacheable, unmapped_pte, rw_flag, - sparse, 0, priv, - aperture); - } - - unmap_gmmu_pages(g, &vm->pdb); - - smp_mb(); - - gk20a_dbg_fn("done"); - - return err; -} - /* NOTE! 
mapped_buffers lock must be held */
 void nvgpu_vm_unmap_locked(struct nvgpu_mapped_buf *mapped_buffer,
 			   struct vm_gk20a_mapping_batch *batch)
@@ -2341,22 +1836,6 @@ void nvgpu_vm_unmap_locked(struct nvgpu_mapped_buf *mapped_buffer,
 	return;
 }
 
-void gk20a_vm_free_entries(struct vm_gk20a *vm,
-			   struct gk20a_mm_entry *parent,
-			   int level)
-{
-	int i;
-
-	if (parent->entries)
-		for (i = 0; i < parent->num_entries; i++)
-			gk20a_vm_free_entries(vm, &parent->entries[i], level+1);
-
-	if (parent->mem.size)
-		free_gmmu_pages(vm, parent);
-	nvgpu_vfree(vm->mm->g, parent->entries);
-	parent->entries = NULL;
-}
-
 const struct gk20a_mmu_level gk20a_mm_levels_64k[] = {
 	{.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
 	 .lo_bit = {26, 26},
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 7e2ba051..2581bc0d 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -433,17 +433,10 @@ int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev);
 int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct gk20a *g,
 			   u64 offset, struct gk20a_buffer_state **state);
 
-int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry);
-void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry);
 void pde_range_from_vaddr_range(struct vm_gk20a *vm,
 				u64 addr_lo, u64 addr_hi,
 				u32 *pde_lo, u32 *pde_hi);
 int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm);
-u32 pte_index_from_vaddr(struct vm_gk20a *vm,
-			 u64 addr, enum gmmu_pgsz_gk20a pgsz_idx);
-void free_gmmu_pages(struct vm_gk20a *vm,
-		     struct gk20a_mm_entry *entry);
-
 u32 gk20a_mm_get_physical_addr_bits(struct gk20a *g);
 
 struct gpu_ops;
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index 45c5def4..ed152cd8 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -105,4 +105,7 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
 		      struct nvgpu_mem *mem,
 		      u64 gpu_va);
 
+void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
+			   struct gk20a_mm_entry *entry);
+
 #endif
-- 
cgit v1.2.2
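
For reference, the per-PDE chunking that update_gmmu_level_locked() performs (clamp each step of the GPU VA range to the end of the current PDE's coverage, program that chunk, then advance) can be illustrated with a minimal standalone sketch. This is not part of the patch or of the nvgpu API; walk_level(), min_u64(), and the bit positions below are hypothetical example values chosen only to mirror the pde_size/pde_i arithmetic shown above.

/*
 * Illustrative sketch only -- mimics the per-PDE walk in
 * update_gmmu_level_locked(). All names and bit positions here are
 * hypothetical; they are not taken from any gk20a_mmu_level table.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

static void walk_level(uint64_t gpu_va, uint64_t gpu_end,
		       unsigned int lo_bit, unsigned int hi_bit)
{
	/* Each PDE covers 2^lo_bit bytes of GPU VA space. */
	uint64_t pde_size = 1ULL << lo_bit;
	/* Index of the first PDE touched by this range. */
	uint64_t pde_i = (gpu_va & ((1ULL << (hi_bit + 1)) - 1ULL)) >> lo_bit;

	while (gpu_va < gpu_end) {
		/* Clamp the chunk to the current PDE boundary or range end. */
		uint64_t next = min_u64((gpu_va + pde_size) & ~(pde_size - 1),
					gpu_end);

		printf("PDE %llu covers [0x%llx, 0x%llx)\n",
		       (unsigned long long)pde_i,
		       (unsigned long long)gpu_va,
		       (unsigned long long)next);

		pde_i++;
		gpu_va = next;
	}
}

int main(void)
{
	/* Hypothetical level: each PDE covers 128 MB (bit 27), 37-bit VA. */
	walk_level(0x104000000ULL, 0x11c200000ULL, 27, 36);
	return 0;
}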