From 583704620db88e391f6b14acc57af859a70127de Mon Sep 17 00:00:00 2001 From: Alex Waterman Date: Fri, 9 Jun 2017 11:42:50 -0700 Subject: gpu: nvgpu: Implement PD packing In some cases page directories require less than a full page of memory. For example, on Pascal, the final PD level for large pages is only 256 bytes; thus 16 PDs can fit in a single page. To allocate an entire page for each of these 256 B PDs is extremely wasteful. This patch aims to alleviate the wasted DMA memory from having small PDs in a full page by packing multiple small PDs into a single page. The packing is implemented as a slab allocator - each page is a slab and from each page multiple PD instances can be allocated. Several modifications to the nvgpu_gmmu_pd struct also needed to be made to support this. The nvgpu_mem is now a pointer and there's an explicit offset into the nvgpu_mem struct so that each nvgpu_gmmu_pd knows what portion of the memory it's using. The nvgpu_pde_phys_addr() function and the pd_write() functions also require some changes since the PD no longer is always situated at the start of the nvgpu_mem. Initialization and cleanup of the page tables for each VM was slightly modified to work through the new pd_cache implementation. Some PDs (i.e the PDB), despite not being a full page, still require a full page for alignment purposes (HW requirements). Thus a direct allocation method for PDs is still provided. This is also used when a PD that could in principle be cached is greater than a page in size. Lastly a new debug flag was added for the pd_cache code. JIRA NVGPU-30 Change-Id: I64c8037fc356783c1ef203cc143c4d71bbd5d77c Signed-off-by: Alex Waterman Reviewed-on: https://git-master/r/1506610 Reviewed-by: Terje Bergstrom GVS: Gerrit_Virtual_Submit --- drivers/gpu/nvgpu/Makefile.nvgpu | 1 + drivers/gpu/nvgpu/common/mm/gmmu.c | 81 +++---- drivers/gpu/nvgpu/common/mm/pd_cache.c | 426 +++++++++++++++++++++++++++++++++ drivers/gpu/nvgpu/common/mm/vm.c | 50 ++-- drivers/gpu/nvgpu/gk20a/gk20a.c | 9 + drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 9 +- drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 4 + drivers/gpu/nvgpu/gp10b/mm_gp10b.c | 10 +- drivers/gpu/nvgpu/include/nvgpu/gmmu.h | 91 ++++++- drivers/gpu/nvgpu/include/nvgpu/log.h | 1 + 10 files changed, 609 insertions(+), 73 deletions(-) create mode 100644 drivers/gpu/nvgpu/common/mm/pd_cache.c (limited to 'drivers') diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu index 3a256771..4aaf7bc5 100644 --- a/drivers/gpu/nvgpu/Makefile.nvgpu +++ b/drivers/gpu/nvgpu/Makefile.nvgpu @@ -50,6 +50,7 @@ nvgpu-y := \ common/mm/page_allocator.o \ common/mm/lockless_allocator.o \ common/mm/gmmu.o \ + common/mm/pd_cache.o \ common/mm/vm.o \ common/mm/vm_area.o \ common/bus.o \ diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c index ec1bc095..602dfb3b 100644 --- a/drivers/gpu/nvgpu/common/mm/gmmu.c +++ b/drivers/gpu/nvgpu/common/mm/gmmu.c @@ -45,7 +45,8 @@ static int pd_allocate(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, const struct gk20a_mmu_level *l, struct nvgpu_gmmu_attrs *attrs); - +static u32 pd_size(const struct gk20a_mmu_level *l, + struct nvgpu_gmmu_attrs *attrs); /* * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU * VA will be allocated for you. 
If addr is non-zero then the buffer will be @@ -138,6 +139,9 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va) int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) { + u32 pdb_size; + int err; + /* * Need this just for page size. Everything else can be ignored. Also * note that we can just use pgsz 0 (i.e small pages) since the number @@ -148,56 +152,43 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) .pgsz = 0, }; - return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs); -} + /* + * PDB size here must be one page so that its address is page size + * aligned. Although lower PDE tables can be aligned at 256B boundaries + * the main PDB must be page aligned. + */ + pdb_size = ALIGN(pd_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE); + + err = __nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size); + if (WARN_ON(err)) + return err; + /* + * One mb() is done after all mapping operations. Don't need individual + * barriers for each PD write. + */ + vm->pdb.mem->skip_wmb = true; + + return 0; +} /* * Ensure that there's a CPU mapping for the page directory memory. This won't * always be the case for 32 bit systems since we may need to save kernel * virtual memory. */ -static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) +static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd) { - return nvgpu_mem_begin(g, &entry->mem); + return nvgpu_mem_begin(g, pd->mem); } /* * Handle any necessary CPU unmap semantics for a page directories DMA memory. * For 64 bit platforms this is a noop. */ -static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) -{ - nvgpu_mem_end(g, &entry->mem); -} - -static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes, - struct nvgpu_gmmu_pd *pd) -{ - struct gk20a *g = gk20a_from_vm(vm); - unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS; - int err; - - /* - * On arm32 vmalloc space is a precious commodity so we do not map pages - * by default. - */ - if (!IS_ENABLED(CONFIG_ARM64)) - flags |= NVGPU_DMA_NO_KERNEL_MAPPING; - - err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem); - if (err) - return -ENOMEM; - - return 0; -} - -void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, - struct nvgpu_gmmu_pd *pd) +static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd) { - struct gk20a *g = gk20a_from_vm(vm); - - nvgpu_dma_free(g, &pd->mem); + nvgpu_mem_end(g, pd->mem); } /* @@ -205,10 +196,14 @@ void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, */ u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd) { + u64 page_addr; + if (g->mm.has_physical_mode) - return sg_phys(pd->mem.priv.sgt->sgl); + page_addr = sg_phys(pd->mem->priv.sgt->sgl); else - return nvgpu_mem_get_base_addr(g, &pd->mem, 0); + page_addr = nvgpu_mem_get_base_addr(g, pd->mem, 0); + + return page_addr + pd->mem_offs; } /* @@ -254,10 +249,10 @@ static int pd_allocate(struct vm_gk20a *vm, { int err; - if (pd->mem.size) + if (pd->mem) return 0; - err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd); + err = __nvgpu_pd_alloc(vm, pd, pd_size(l, attrs)); if (err) { nvgpu_info(vm->mm->g, "error allocating page directory!"); return err; @@ -267,7 +262,7 @@ static int pd_allocate(struct vm_gk20a *vm, * One mb() is done after all mapping operations. Don't need individual * barriers for each PD write. 
*/
- pd->mem.skip_wmb = true;
+ pd->mem->skip_wmb = true;
return 0;
}
@@ -778,7 +773,7 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
}
if (!batch)
- g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
+ g->ops.fb.tlb_invalidate(g, vm->pdb.mem);
else
batch->need_tlb_invalidate = true;
@@ -830,7 +825,7 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
if (!batch) {
gk20a_mm_l2_flush(g, true);
- g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
+ g->ops.fb.tlb_invalidate(g, vm->pdb.mem);
} else {
if (!batch->gpu_l2_flushed) {
gk20a_mm_l2_flush(g, true);
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c
new file mode 100644
index 00000000..4f312eff
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "gk20a/gk20a.h"
+#include "gk20a/mm_gk20a.h"
+
+#define pd_dbg(g, fmt, args...) nvgpu_log(g, gpu_dbg_pd_cache, fmt, ##args)
+
+/**
+ * DOC: PD cache
+ *
+ * In the name of saving memory with the many sub-page-sized PD levels in
+ * Pascal and beyond, a way of packing PD tables together is necessary. This
+ * code does just that. If a PD table only requires 1024 bytes, then it is
+ * possible to have 4 of these PDs in one page. This is even more pronounced
+ * for 256 byte PD tables.
+ *
+ * The pd cache is basically just a slab allocator. Each instance of the nvgpu
+ * driver makes one of these structs:
+ *
+ * struct nvgpu_pd_cache {
+ * struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
+ * struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
+ *
+ * struct nvgpu_rbtree_node *mem_tree;
+ * };
+ *
+ * There are two sets of lists, the full and the partial. The full lists contain
+ * pages of memory for which all the memory in that page is in use. The partial
+ * lists contain partially full pages of memory which can be used for more PD
+ * allocations. There are a couple of assumptions here:
+ *
+ * 1. PDs greater than or equal to the page size bypass the pd cache.
+ * 2. PDs are always a power of 2 in size and at least %NVGPU_PD_CACHE_MIN bytes.
+ *
+ * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial
+ * lists. For a 4KB page NVGPU_PD_CACHE_COUNT is 4. This is enough space for
+ * 256, 512, 1024, and 2048 byte PDs.
+ *
+ * __nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD
+ * size is page size or larger and choose the correct allocation scheme - either
+ * from the PD cache or directly. Similarly __nvgpu_pd_free() will free a PD
+ * allocated by __nvgpu_pd_alloc().
+ *
+ * Since the top level PD (the PDB) is a page-aligned pointer but less than a
+ * page in size, the direct functions must be used for allocating PDBs.
+ * Otherwise there would be alignment issues for the PDBs when they get packed.
+ */ + +static u32 nvgpu_pd_cache_nr(u32 bytes) +{ + return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1)); +} + +static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry) +{ + u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size); + + return mask_offset - 1; +} + +int nvgpu_pd_cache_init(struct gk20a *g) +{ + struct nvgpu_pd_cache *cache; + int i; + + /* + * This gets called from finalize_poweron() so we need to make sure we + * don't reinit the pd_cache over and over. + */ + if (g->mm.pd_cache) + return 0; + + cache = nvgpu_kzalloc(g, sizeof(*cache)); + if (!cache) { + nvgpu_err(g, "Failed to alloc pd_cache!"); + return -ENOMEM; + } + + for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { + nvgpu_init_list_node(&cache->full[i]); + nvgpu_init_list_node(&cache->partial[i]); + } + + cache->mem_tree = NULL; + g->mm.pd_cache = cache; + nvgpu_mutex_init(&cache->lock); + + pd_dbg(g, "PD cache initialized!"); + + return 0; +} + +void nvgpu_pd_cache_fini(struct gk20a *g) +{ + int i; + struct nvgpu_pd_cache *cache = g->mm.pd_cache; + + if (!cache) + return; + + for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { + WARN_ON(!nvgpu_list_empty(&cache->full[i])); + WARN_ON(!nvgpu_list_empty(&cache->partial[i])); + } + + nvgpu_kfree(g, g->mm.pd_cache); +} + +/* + * This is the simple pass-through for greater than page or page sized PDs. + * + * Note: this does not need the cache lock since it does not modify any of the + * PD cache data structures. + */ +int __nvgpu_pd_cache_alloc_direct(struct gk20a *g, + struct nvgpu_gmmu_pd *pd, u32 bytes) +{ + int err; + + pd_dbg(g, "PD-Alloc [D] %u bytes", bytes); + + pd->mem = nvgpu_kzalloc(g, sizeof(*pd->mem)); + if (!pd->mem) { + pd_dbg(g, "OOM allocating nvgpu_mem struct!"); + return -ENOMEM; + } + + err = nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, + bytes, pd->mem); + if (err) { + pd_dbg(g, "OOM allocating page directory!"); + nvgpu_kfree(g, pd->mem); + return -ENOMEM; + } + + pd->cached = false; + pd->mem_offs = 0; + + return 0; +} + +/* + * Make a new nvgpu_pd_cache_entry and allocate a PD from it. Update the passed + * pd to reflect this allocation. + */ +static int nvgpu_pd_cache_alloc_new(struct gk20a *g, + struct nvgpu_pd_cache *cache, + struct nvgpu_gmmu_pd *pd, + u32 bytes) +{ + struct nvgpu_pd_mem_entry *pentry; + + pd_dbg(g, "PD-Alloc [C] New: offs=0"); + + pentry = nvgpu_kzalloc(g, sizeof(*pentry)); + if (!pentry) { + pd_dbg(g, "OOM allocating pentry!"); + return -ENOMEM; + } + + if (nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, + PAGE_SIZE, &pentry->mem)) { + nvgpu_kfree(g, pentry); + pd_dbg(g, "Unable to DMA alloc!"); + return -ENOMEM; + } + + pentry->pd_size = bytes; + nvgpu_list_add(&pentry->list_entry, + &cache->partial[nvgpu_pd_cache_nr(bytes)]); + + /* + * This allocates the very first PD table in the set of tables in this + * nvgpu_pd_mem_entry. + */ + pentry->alloc_map = 1; + + /* + * Now update the nvgpu_gmmu_pd to reflect this allocation. + */ + pd->mem = &pentry->mem; + pd->mem_offs = 0; + pd->cached = true; + + pentry->tree_entry.key_start = (u64)(uintptr_t)&pentry->mem; + nvgpu_rbtree_insert(&pentry->tree_entry, &cache->mem_tree); + + return 0; +} + +static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g, + struct nvgpu_pd_cache *cache, + struct nvgpu_pd_mem_entry *pentry, + struct nvgpu_gmmu_pd *pd) +{ + unsigned long bit_offs; + u32 mem_offs; + u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry); + + /* + * Find and allocate an open PD. 
+ */ + bit_offs = ffz(pentry->alloc_map); + mem_offs = bit_offs * pentry->pd_size; + + /* Bit map full. Somethings wrong. */ + if (WARN_ON(bit_offs >= ffz(pentry_mask))) + return -ENOMEM; + + pentry->alloc_map |= 1 << bit_offs; + + pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs); + + /* + * First update the pd. + */ + pd->mem = &pentry->mem; + pd->mem_offs = mem_offs; + pd->cached = true; + + /* + * Now make sure the pentry is in the correct list (full vs partial). + */ + if ((pentry->alloc_map & pentry_mask) == pentry_mask) { + pd_dbg(g, "Adding pentry to full list!"); + nvgpu_list_del(&pentry->list_entry); + nvgpu_list_add(&pentry->list_entry, + &cache->full[nvgpu_pd_cache_nr(pentry->pd_size)]); + } + + return 0; +} + +/* + * Get a partially full nvgpu_pd_mem_entry. Returns NULL if there is no partial + * nvgpu_pd_mem_entry's. + */ +static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_get_partial( + struct nvgpu_pd_cache *cache, u32 bytes) +{ + struct nvgpu_list_node *list = + &cache->partial[nvgpu_pd_cache_nr(bytes)]; + + if (nvgpu_list_empty(list)) + return NULL; + + return nvgpu_list_first_entry(list, + nvgpu_pd_mem_entry, + list_entry); +} + +/* + * Allocate memory from an nvgpu_mem for the page directory. + */ +static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache, + struct nvgpu_gmmu_pd *pd, u32 bytes) +{ + struct nvgpu_pd_mem_entry *pentry; + int err; + + pd_dbg(g, "PD-Alloc [C] %u bytes", bytes); + + if (bytes & (bytes - 1) || + (bytes >= PAGE_SIZE || + bytes < NVGPU_PD_CACHE_MIN)) { + pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes); + return -EINVAL; + } + + pentry = nvgpu_pd_cache_get_partial(cache, bytes); + if (!pentry) + err = nvgpu_pd_cache_alloc_new(g, cache, pd, bytes); + else + err = nvgpu_pd_cache_alloc_from_partial(g, cache, pentry, pd); + + if (err) + pd_dbg(g, "PD-Alloc [C] Failed!"); + + return err; +} + +/* + * Allocate the DMA memory for a page directory. This handles the necessary PD + * cache logistics. Since on Parker and later GPUs some of the page directories + * are smaller than a page packing these PDs together saves a lot of memory. + */ +int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes) +{ + struct gk20a *g = gk20a_from_vm(vm); + int err; + + /* + * Simple case: PD is bigger than a page so just do a regular DMA + * alloc. + */ + if (bytes >= PAGE_SIZE) { + err = __nvgpu_pd_cache_alloc_direct(g, pd, bytes); + if (err) + return err; + + return 0; + } + + if (WARN_ON(!g->mm.pd_cache)) + return -ENOMEM; + + nvgpu_mutex_acquire(&g->mm.pd_cache->lock); + err = nvgpu_pd_cache_alloc(g, g->mm.pd_cache, pd, bytes); + nvgpu_mutex_release(&g->mm.pd_cache->lock); + + return err; +} + +void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd) +{ + pd_dbg(g, "PD-Free [D] 0x%p", pd->mem); + + if (!pd->mem) + return; + + nvgpu_dma_free(g, pd->mem); + nvgpu_kfree(g, pd->mem); + pd->mem = NULL; +} + +static void nvgpu_pd_cache_free_mem_entry(struct gk20a *g, + struct nvgpu_pd_cache *cache, + struct nvgpu_pd_mem_entry *pentry) +{ + nvgpu_dma_free(g, &pentry->mem); + nvgpu_list_del(&pentry->list_entry); + nvgpu_rbtree_unlink(&pentry->tree_entry, &cache->mem_tree); + nvgpu_kfree(g, pentry); +} + +static void nvgpu_pd_cache_do_free(struct gk20a *g, + struct nvgpu_pd_cache *cache, + struct nvgpu_pd_mem_entry *pentry, + struct nvgpu_gmmu_pd *pd) +{ + u32 index = pd->mem_offs / pentry->pd_size; + u32 bit = 1 << index; + + /* Mark entry as free. 
*/ + pentry->alloc_map &= ~bit; + + if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) { + /* + * Partially full still. If it was already on the partial list + * this just re-adds it. + */ + nvgpu_list_del(&pentry->list_entry); + nvgpu_list_add(&pentry->list_entry, + &cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]); + } else { + /* Empty now so free it. */ + nvgpu_pd_cache_free_mem_entry(g, cache, pentry); + } +} + +static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up( + struct gk20a *g, + struct nvgpu_pd_cache *cache, + struct nvgpu_gmmu_pd *pd) +{ + struct nvgpu_rbtree_node *node; + + nvgpu_rbtree_search((u64)(uintptr_t)pd->mem, &node, + cache->mem_tree); + if (!node) + return NULL; + + return nvgpu_pd_mem_entry_from_tree_entry(node); +} + +static void nvgpu_pd_cache_free(struct gk20a *g, struct nvgpu_pd_cache *cache, + struct nvgpu_gmmu_pd *pd) +{ + struct nvgpu_pd_mem_entry *pentry; + + pd_dbg(g, "PD-Free [C] 0x%p", pd->mem); + + pentry = nvgpu_pd_cache_look_up(g, cache, pd); + if (!pentry) { + WARN(1, "Attempting to free non-existent pd"); + return; + } + + nvgpu_pd_cache_do_free(g, cache, pentry, pd); +} + +void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd) +{ + struct gk20a *g = gk20a_from_vm(vm); + + /* + * Simple case: just DMA free. + */ + if (!pd->cached) + return __nvgpu_pd_cache_free_direct(g, pd); + + nvgpu_mutex_acquire(&g->mm.pd_cache->lock); + nvgpu_pd_cache_free(g, g->mm.pd_cache, pd); + nvgpu_mutex_release(&g->mm.pd_cache->lock); +} diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c index 3aeba500..3ed3c7fe 100644 --- a/drivers/gpu/nvgpu/common/mm/vm.c +++ b/drivers/gpu/nvgpu/common/mm/vm.c @@ -35,21 +35,42 @@ int vm_aspace_id(struct vm_gk20a *vm) return vm->as_share ? vm->as_share->id : -1; } -static void nvgpu_vm_free_entries(struct vm_gk20a *vm, - struct nvgpu_gmmu_pd *parent, - int level) +static void __nvgpu_vm_free_entries(struct vm_gk20a *vm, + struct nvgpu_gmmu_pd *pd, + int level) { int i; - if (parent->entries) - for (i = 0; i < parent->num_entries; i++) - nvgpu_vm_free_entries(vm, &parent->entries[i], + if (pd->mem) { + __nvgpu_pd_free(vm, pd); + pd->mem = NULL; + } + + if (pd->entries) { + for (i = 0; i < pd->num_entries; i++) + __nvgpu_vm_free_entries(vm, &pd->entries[i], level + 1); + nvgpu_vfree(vm->mm->g, pd->entries); + pd->entries = NULL; + } +} + +static void nvgpu_vm_free_entries(struct vm_gk20a *vm, + struct nvgpu_gmmu_pd *pdb) +{ + struct gk20a *g = vm->mm->g; + int i; + + __nvgpu_pd_cache_free_direct(g, pdb); + + if (!pdb->entries) + return; + + for (i = 0; i < pdb->num_entries; i++) + __nvgpu_vm_free_entries(vm, &pdb->entries[i], 1); - if (parent->mem.size) - nvgpu_free_gmmu_pages(vm, parent); - nvgpu_vfree(vm->mm->g, parent->entries); - parent->entries = NULL; + nvgpu_vfree(g, pdb->entries); + pdb->entries = NULL; } u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, @@ -110,7 +131,7 @@ void nvgpu_vm_mapping_batch_finish_locked( if (mapping_batch->need_tlb_invalidate) { struct gk20a *g = gk20a_from_vm(vm); - g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); + g->ops.fb.tlb_invalidate(g, vm->pdb.mem); } } @@ -407,9 +428,8 @@ clean_up_allocators: if (nvgpu_alloc_initialized(&vm->user_lp)) nvgpu_alloc_destroy(&vm->user_lp); clean_up_page_tables: - /* Cleans up nvgpu_vm_init_page_tables() */ - nvgpu_vfree(g, vm->pdb.entries); - nvgpu_free_gmmu_pages(vm, &vm->pdb); + /* Cleans up nvgpu_gmmu_init_page_table() */ + __nvgpu_pd_cache_free_direct(g, &vm->pdb); clean_up_vgpu_vm: #ifdef 
CONFIG_TEGRA_GR_VIRTUALIZATION if (g->is_virtual) @@ -525,7 +545,7 @@ static void __nvgpu_vm_remove(struct vm_gk20a *vm) if (nvgpu_alloc_initialized(&vm->user_lp)) nvgpu_alloc_destroy(&vm->user_lp); - nvgpu_vm_free_entries(vm, &vm->pdb, 0); + nvgpu_vm_free_entries(vm, &vm->pdb); #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION if (g->is_virtual) diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 380c28ac..a0753770 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -174,6 +175,14 @@ int gk20a_finalize_poweron(struct gk20a *g) g->gpu_reset_done = true; } + /* + * Do this early so any early VMs that get made are capable of mapping + * buffers. + */ + err = nvgpu_pd_cache_init(g); + if (err) + return err; + /* init interface layer support for PMU falcon */ nvgpu_flcn_sw_init(g, FALCON_ID_PMU); nvgpu_flcn_sw_init(g, FALCON_ID_SEC2); diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 558a1b06..0a84cabb 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -478,6 +478,7 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm) gk20a_semaphore_sea_destroy(g); gk20a_vidmem_destroy(g); + nvgpu_pd_cache_fini(g); } static int gk20a_alloc_sysmem_flush(struct gk20a *g) @@ -1560,7 +1561,7 @@ static inline u32 big_valid_pde0_bits(struct gk20a *g, struct nvgpu_gmmu_pd *pd, u64 addr) { u32 pde0_bits = - nvgpu_aperture_mask(g, &pd->mem, + nvgpu_aperture_mask(g, pd->mem, gmmu_pde_aperture_big_sys_mem_ncoh_f(), gmmu_pde_aperture_big_video_memory_f()) | gmmu_pde_address_big_sys_f( @@ -1573,7 +1574,7 @@ static inline u32 small_valid_pde1_bits(struct gk20a *g, struct nvgpu_gmmu_pd *pd, u64 addr) { u32 pde1_bits = - nvgpu_aperture_mask(g, &pd->mem, + nvgpu_aperture_mask(g, pd->mem, gmmu_pde_aperture_small_sys_mem_ncoh_f(), gmmu_pde_aperture_small_video_memory_f()) | gmmu_pde_vol_small_true_f() | /* tbd: why? 
*/ @@ -2173,14 +2174,14 @@ static int gk20a_init_ce_vm(struct mm_gk20a *mm) void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, struct vm_gk20a *vm) { - u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); + u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0); u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); u32 pdb_addr_hi = u64_hi32(pdb_addr); gk20a_dbg_info("pde pa=0x%llx", pdb_addr); nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), - nvgpu_aperture_mask(g, &vm->pdb.mem, + nvgpu_aperture_mask(g, vm->pdb.mem, ram_in_page_dir_base_target_sys_mem_ncoh_f(), ram_in_page_dir_base_target_vid_mem_f()) | ram_in_page_dir_base_vol_true_f() | diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index a245d0e0..cadcffa4 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -31,6 +31,8 @@ #include #include +struct nvgpu_pd_cache; + #ifdef CONFIG_ARM64 #define outer_flush_range(a, b) #define __cpuc_flush_dcache_area __flush_dcache_area @@ -217,6 +219,8 @@ struct mm_gk20a { struct vm_gk20a *vm; } ce; + struct nvgpu_pd_cache *pd_cache; + struct nvgpu_mutex l2_op_lock; struct nvgpu_mutex tlb_lock; struct nvgpu_mutex priv_lock; diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c index c3867e9d..2ff199c6 100644 --- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c @@ -164,7 +164,7 @@ static void update_gmmu_pde3_locked(struct vm_gk20a *vm, phys_addr >>= gmmu_new_pde_address_shift_v(); - pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, + pde_v[0] |= nvgpu_aperture_mask(g, pd->mem, gmmu_new_pde_aperture_sys_mem_ncoh_f(), gmmu_new_pde_aperture_video_memory_f()); pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr)); @@ -209,7 +209,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm, if (small_valid) { pde_v[2] |= gmmu_new_dual_pde_address_small_sys_f(small_addr); - pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem, + pde_v[2] |= nvgpu_aperture_mask(g, pd->mem, gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), gmmu_new_dual_pde_aperture_small_video_memory_f()); pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); @@ -219,7 +219,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm, if (big_valid) { pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr); pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); - pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, + pde_v[0] |= nvgpu_aperture_mask(g, pd->mem, gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), gmmu_new_dual_pde_aperture_big_video_memory_f()); pde_v[1] |= big_addr >> 28; @@ -365,14 +365,14 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g, static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, struct vm_gk20a *vm) { - u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); + u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0); u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); u32 pdb_addr_hi = u64_hi32(pdb_addr); gk20a_dbg_info("pde pa=0x%llx", pdb_addr); nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), - nvgpu_aperture_mask(g, &vm->pdb.mem, + nvgpu_aperture_mask(g, vm->pdb.mem, ram_in_page_dir_base_target_sys_mem_ncoh_f(), ram_in_page_dir_base_target_vid_mem_f()) | ram_in_page_dir_base_vol_true_f() | diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h index 28a2cb82..eff87c31 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h +++ 
b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -19,6 +19,9 @@
#include
#include
+#include
+#include
+#include
struct scatterlist;
@@ -44,15 +47,86 @@ enum gk20a_mem_rw_flag {
gk20a_mem_flag_write_only = 2, /* WO */
};
+/*
+ * Minimum size of a cached PD. The number of different caches in the
+ * nvgpu_pd_cache structure of course depends on this. The MIN_SHIFT define is
+ * the right number of bits to shift to determine which list to use in the
+ * array of lists.
+ */
+#define NVGPU_PD_CACHE_MIN 256
+#define NVGPU_PD_CACHE_MIN_SHIFT 9
+#define NVGPU_PD_CACHE_COUNT 4
+
+struct nvgpu_pd_mem_entry {
+ struct nvgpu_mem mem;
+
+ /*
+ * Size of the page directories (not the mem). alloc_map is a bitmap
+ * showing which PDs have been allocated. The size of mem will always
+ * be one page. pd_size will always be a power of 2.
+ */
+ u32 pd_size;
+ unsigned long alloc_map;
+
+ struct nvgpu_list_node list_entry;
+ struct nvgpu_rbtree_node tree_entry;
+};
+
+static inline struct nvgpu_pd_mem_entry *
+nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
+{
+ return (struct nvgpu_pd_mem_entry *)
+ ((uintptr_t)node -
+ offsetof(struct nvgpu_pd_mem_entry, list_entry));
+};
+
+static inline struct nvgpu_pd_mem_entry *
+nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
+{
+ return (struct nvgpu_pd_mem_entry *)
+ ((uintptr_t)node -
+ offsetof(struct nvgpu_pd_mem_entry, tree_entry));
+};
+
+/*
+ * A cache for allocating PD memory from. This enables smaller PDs to be packed
+ * into single pages.
+ *
+ * This is fairly complex so see the documentation in pd_cache.c for a full
+ * description of how this is organized.
+ */
+struct nvgpu_pd_cache {
+ /*
+ * Array of lists of full nvgpu_pd_mem_entries and partially full (or
+ * empty) nvgpu_pd_mem_entries.
+ */
+ struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
+ struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
+
+ /*
+ * Tree of all allocated struct nvgpu_mem's for fast look up.
+ */
+ struct nvgpu_rbtree_node *mem_tree;
+
+ /*
+ * All access to the cache must be locked. This protects the lists and
+ * the rb tree.
+ */
+ struct nvgpu_mutex lock;
+};
+
/*
* GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs
* in the GMMU.
*/
struct nvgpu_gmmu_pd {
/*
- * DMA memory describing the PTEs or PTEs.
+ * DMA memory describing the PTEs or PDEs. @mem_offs describes the
+ * offset of the PDE table in @mem. @cached specifies if this PD is
+ * using pd_cache memory.
*/
- struct nvgpu_mem mem;
+ struct nvgpu_mem *mem;
+ u32 mem_offs;
+ bool cached;

/*
* List of pointers to the next level of page tables. Does not
@@ -66,7 +140,7 @@ struct nvgpu_gmmu_pd {
* Reduce the number of arguments getting passed through the various levels of
* GMMU mapping functions.
*
- * The following fields are set statically and do not change throughout
+ * The following fields are set statically and do not change throughout the
* mapping call:
*
* pgsz: Index into the page size table.
@@ -166,8 +240,13 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
struct nvgpu_mem *mem,
u64 gpu_va);
-void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
- struct nvgpu_gmmu_pd *entry);
+int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes);
+void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd);
+int __nvgpu_pd_cache_alloc_direct(struct gk20a *g,
+ struct nvgpu_gmmu_pd *pd, u32 bytes);
+void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
+int nvgpu_pd_cache_init(struct gk20a *g);
+void nvgpu_pd_cache_fini(struct gk20a *g);
/*
* Some useful routines that are shared across chips.
@@ -181,7 +260,7 @@ static inline u32 pd_offset_from_index(const struct gk20a_mmu_level *l,
static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
size_t w, size_t data)
{
- nvgpu_mem_wr32(g, &pd->mem, w, data);
+ nvgpu_mem_wr32(g, pd->mem, (pd->mem_offs / sizeof(u32)) + w, data);
}
diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h
index 3b8e6b19..a1110a59 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/log.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/log.h
@@ -68,6 +68,7 @@ enum nvgpu_log_categories {
gpu_dbg_xv = BIT(17), /* XVE debugging. */
gpu_dbg_shutdown = BIT(18), /* GPU shutdown tracing. */
gpu_dbg_kmem = BIT(19), /* Kmem tracking debugging. */
+ gpu_dbg_pd_cache = BIT(20), /* PD cache traces. */
gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */
};
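
To make the slab bookkeeping in the pd_cache DOC comment concrete, the following standalone user-space sketch mirrors the arithmetic behind nvgpu_pd_cache_nr(), nvgpu_pd_cache_get_mask() and the ffz()-based slot search. It assumes a 4KB page and substitutes compiler builtins for the kernel's ilog2()/ffz(); the helper names (pd_cache_nr, first_zero_bit) and the sample bitmap value are illustrative only and are not part of the patch.

#include <stdio.h>
#include <stdint.h>

#define PD_CACHE_MIN        256U   /* smallest PD size the cache handles  */
#define PD_CACHE_MIN_SHIFT  9U     /* as in NVGPU_PD_CACHE_MIN_SHIFT      */
#define PD_CACHE_COUNT      4U     /* 256, 512, 1024 and 2048 byte PDs    */
#define PAGE_SZ             4096U  /* one slab is one (assumed 4KB) page  */

/* List index for a PD size: 256 -> 0, 512 -> 1, 1024 -> 2, 2048 -> 3. */
static uint32_t pd_cache_nr(uint32_t bytes)
{
	/* User-space stand-in for the kernel's ilog2(). */
	return 31U - (uint32_t)__builtin_clz(bytes >> (PD_CACHE_MIN_SHIFT - 1U));
}

/* Bitmap mask covering every PD slot in one page-sized slab. */
static uint32_t pd_cache_mask(uint32_t pd_size)
{
	return (1U << (PAGE_SZ / pd_size)) - 1U;
}

/* First clear bit of the allocation map: a stand-in for the kernel's ffz(). */
static uint32_t first_zero_bit(uint32_t map)
{
	return (uint32_t)__builtin_ctz(~map);
}

int main(void)
{
	uint32_t sizes[PD_CACHE_COUNT] = { 256U, 512U, 1024U, 2048U };
	uint32_t map = 0x0000ffabU; /* example occupancy of a slab of 256B PDs */
	uint32_t i, slot;

	for (i = 0U; i < PD_CACHE_COUNT; i++)
		printf("pd_size=%4u -> list %u, %2u slots/page, mask=0x%04x\n",
		       (unsigned)sizes[i], (unsigned)pd_cache_nr(sizes[i]),
		       (unsigned)(PAGE_SZ / sizes[i]),
		       (unsigned)pd_cache_mask(sizes[i]));

	/* Allocate from a partial slab: take the first free slot. */
	slot = first_zero_bit(map);
	printf("map=0x%04x: next 256B PD goes in slot %u, mem_offs=%u\n",
	       (unsigned)map, (unsigned)slot, (unsigned)(slot * PD_CACHE_MIN));

	map |= 1U << slot;
	if ((map & pd_cache_mask(PD_CACHE_MIN)) == pd_cache_mask(PD_CACHE_MIN))
		printf("slab is now full -> move it to the full list\n");

	return 0;
}

Running this shows why NVGPU_PD_CACHE_COUNT is 4 for a 4KB page: 256, 512, 1024 and 2048 byte PDs each get their own full/partial list, and a slab of 256B PDs holds 16 entries tracked by a 16-bit occupancy mask.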
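
The other consequence of packing is that a PD no longer sits at offset zero of its backing nvgpu_mem: nvgpu_pde_phys_addr() adds pd->mem_offs to the page's base address, and pd_write() converts that byte offset into a 32-bit word index. The short sketch below walks through both computations for a hypothetical 256-byte PD occupying slot 2 of its page; the addresses and values are made up purely for illustration.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical values: a 256B PD in slot 2 of its 4KB slab. */
	uint64_t page_addr = 0x0000000081234000ULL; /* base of the slab's nvgpu_mem */
	uint32_t pd_size   = 256U;
	uint32_t slot      = 2U;
	uint32_t mem_offs  = slot * pd_size;        /* byte offset into the page    */

	/* What nvgpu_pde_phys_addr() would hand to the parent PDE. */
	uint64_t pd_addr = page_addr + mem_offs;

	/* pd_write(g, pd, w, data) becomes a 32-bit word index into the page. */
	uint32_t w = 5U; /* e.g. the 6th word of this PD */
	uint32_t word_index = mem_offs / (uint32_t)sizeof(uint32_t) + w;

	printf("PD lives at 0x%llx (page 0x%llx + offset %u)\n",
	       (unsigned long long)pd_addr,
	       (unsigned long long)page_addr, (unsigned)mem_offs);
	printf("pd_write(w=%u) touches word %u of the backing page (byte %u)\n",
	       (unsigned)w, (unsigned)word_index, (unsigned)(word_index * 4U));

	return 0;
}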