From 036e000a17425e0569990f2aacae91b273392153 Mon Sep 17 00:00:00 2001
From: Peter Daifuku
Date: Thu, 20 Aug 2020 18:45:26 -0700
Subject: nvgpu: add PD cache support for page-sized PTEs

Large buffers being mapped to GMMU end up needing many pages for the
PTE tables. Allocating these pages one by one can end up being a
performance bottleneck, particularly in the virtualized case.

Add support for page-sized PTEs to the existing PD cache:
- define NVGPU_PD_CACHE_SIZE, the allocation size for a new slab for
  the PD cache, effectively set to 64K bytes
- Use the PD cache for any allocation < NVGPU_PD_CACHE_SIZE
- When freeing up cached entries, avoid prefetch errors by
  invalidating the entry (memset to 0)

Bug 3093183
Bug 3100907

Change-Id: I2302a1dfeb056b9461159121bbae1be70524a357
Signed-off-by: Peter Daifuku
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2401783
Reviewed-by: Alex Waterman
Reviewed-by: Satish Arora
Reviewed-by: mobile promotions
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/mm/pd_cache.c | 94 ++++++++++++++++++++++------------
 drivers/gpu/nvgpu/include/nvgpu/gmmu.h | 23 ++++++---
 2 files changed, 78 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c
index d9dc3db0..a5b3d134 100644
--- a/drivers/gpu/nvgpu/common/mm/pd_cache.c
+++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -42,7 +42,7 @@
  * to have 4 of these PDs in one page. This is even more pronounced for 256 byte
  * PD tables.
  *
- * The pd cache is basially just a slab allocator. Each instance of the nvgpu
+ * The pd cache is basically a slab allocator. Each instance of the nvgpu
  * driver makes one of these structs:
  *
  * struct nvgpu_pd_cache {
@@ -52,22 +52,18 @@
  * struct nvgpu_rbtree_node *mem_tree;
  * };
  *
- * There are two sets of lists, the full and the partial. The full lists contain
- * pages of memory for which all the memory in that page is in use. The partial
- * lists contain partially full pages of memory which can be used for more PD
- * allocations. There a couple of assumptions here:
+ * There are two sets of lists used for cached allocations, the full and the
+ * partial. The full lists contain pages of memory for which all the memory in
+ * that entry is in use. The partial lists contain partially full blocks of
+ * memory which can be used for more PD allocations. The cache works as follows:
  *
- * 1. PDs greater than or equal to the page size bypass the pd cache.
+ * 1. PDs greater than NVGPU_PD_CACHE_SIZE bypass the pd cache.
  * 2. PDs are always power of 2 and greater than %NVGPU_PD_CACHE_MIN bytes.
  *
- * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial
- * lists. For a 4Kb page NVGPU_PD_CACHE_COUNT is 4. This is enough space for
- * 256, 512, 1024, and 2048 byte PDs.
- *
  * nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD
- * size is page size or larger and choose the correct allocation scheme - either
- * from the PD cache or directly. Similarly nvgpu_pd_free() will free a PD
- * allocated by nvgpu_pd_alloc().
+ * size is NVGPU_PD_CACHE_SIZE or larger and choose the correct allocation
+ * scheme - either from the PD cache or directly. Similarly nvgpu_pd_free()
+ * will free a PD allocated by nvgpu_pd_alloc().
  *
  * Since the top level PD (the PDB) is a page aligned pointer but less than a
  * page size the direct functions must be used for allocating PDBs. Otherwise
@@ -79,11 +75,11 @@ static u32 nvgpu_pd_cache_nr(u32 bytes)
 	return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1U));
 }
 
-static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry)
+static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
 {
-	u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size);
+	BUG_ON(pentry->pd_size == 0);
 
-	return mask_offset - 1U;
+	return NVGPU_PD_CACHE_SIZE / pentry->pd_size;
 }
 
 int nvgpu_pd_cache_init(struct gk20a *g)
@@ -201,6 +197,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
 				    u32 bytes)
 {
 	struct nvgpu_pd_mem_entry *pentry;
+	unsigned long flags = 0;
+	int err;
 
 	pd_dbg(g, "PD-Alloc [C] New: offs=0");
 
@@ -210,8 +208,21 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
 		return -ENOMEM;
 	}
 
-	if (nvgpu_dma_alloc(g, PAGE_SIZE, &pentry->mem)) {
+	if (!nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > PAGE_SIZE)) {
+		flags = NVGPU_DMA_FORCE_CONTIGUOUS;
+	}
+
+	err = nvgpu_dma_alloc_flags(g, flags,
+			NVGPU_PD_CACHE_SIZE, &pentry->mem);
+	if (err != 0) {
 		nvgpu_kfree(g, pentry);
+
+		/* Not enough contiguous space, but a direct
+		 * allocation may work
+		 */
+		if (err == -ENOMEM) {
+			return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
+		}
 		nvgpu_err(g, "Unable to DMA alloc!");
 		return -ENOMEM;
 	}
@@ -224,7 +235,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
 	 * This allocates the very first PD table in the set of tables in this
 	 * nvgpu_pd_mem_entry.
 	 */
-	pentry->alloc_map = 1;
+	set_bit(0U, pentry->alloc_map);
+	pentry->allocs = 1;
 
 	/*
 	 * Now update the nvgpu_gmmu_pd to reflect this allocation.
@@ -246,20 +258,21 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
 {
 	unsigned long bit_offs;
 	u32 mem_offs;
-	u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry);
+	u32 nr_bits = nvgpu_pd_cache_get_nr_entries(pentry);
 
 	/*
 	 * Find and allocate an open PD.
 	 */
-	bit_offs = ffz(pentry->alloc_map);
+	bit_offs = find_first_zero_bit(pentry->alloc_map, nr_bits);
 	mem_offs = bit_offs * pentry->pd_size;
 
 	/* Bit map full. Somethings wrong. */
-	if (WARN_ON(bit_offs >= ffz(pentry_mask))) {
+	if (WARN_ON(bit_offs >= nr_bits)) {
 		return -ENOMEM;
 	}
 
-	pentry->alloc_map |= 1 << bit_offs;
+	set_bit(bit_offs, pentry->alloc_map);
+	pentry->allocs++;
 
 	pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs);
 
@@ -273,7 +286,7 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
 	/*
 	 * Now make sure the pentry is in the correct list (full vs partial).
 	 */
-	if ((pentry->alloc_map & pentry_mask) == pentry_mask) {
+	if (pentry->allocs >= nr_bits) {
 		pd_dbg(g, "Adding pentry to full list!");
 		nvgpu_list_del(&pentry->list_entry);
 		nvgpu_list_add(&pentry->list_entry,
@@ -314,7 +327,7 @@ static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
 	pd_dbg(g, "PD-Alloc [C] %u bytes", bytes);
 
 	if ((bytes & (bytes - 1U)) != 0U ||
-	    (bytes >= PAGE_SIZE ||
+	    (bytes >= NVGPU_PD_CACHE_SIZE ||
 	     bytes < NVGPU_PD_CACHE_MIN)) {
 		pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes);
 		return -EINVAL;
@@ -339,16 +352,18 @@ static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
  * cache logistics. Since on Parker and later GPUs some of the page directories
  * are smaller than a page packing these PDs together saves a lot of memory.
  */
-int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
+int nvgpu_pd_alloc(struct vm_gk20a *vm,
+		   struct nvgpu_gmmu_pd *pd,
+		   u32 bytes)
 {
 	struct gk20a *g = gk20a_from_vm(vm);
 	int err;
 
 	/*
-	 * Simple case: PD is bigger than a page so just do a regular DMA
-	 * alloc.
+	 * Simple case: PD is bigger than or equal to NVGPU_PD_CACHE_SIZE so
+	 * just do a regular DMA alloc.
 	 */
-	if (bytes >= PAGE_SIZE) {
+	if (bytes >= NVGPU_PD_CACHE_SIZE) {
 		err = nvgpu_pd_cache_alloc_direct(g, pd, bytes);
 		if (err) {
 			return err;
@@ -396,17 +411,28 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
 				   struct nvgpu_pd_mem_entry *pentry,
 				   struct nvgpu_gmmu_pd *pd)
 {
-	u32 index = pd->mem_offs / pentry->pd_size;
-	u32 bit = 1 << index;
+	u32 bit = pd->mem_offs / pentry->pd_size;
 
 	/* Mark entry as free. */
-	pentry->alloc_map &= ~bit;
+	clear_bit(bit, pentry->alloc_map);
+	pentry->allocs--;
 
-	if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) {
+	if (pentry->allocs > 0U) {
 		/*
 		 * Partially full still. If it was already on the partial list
 		 * this just re-adds it.
+		 *
+		 * Since the memory used for the entries is still mapped, if
+		 * igpu make sure the entries are invalidated so that the hw
+		 * doesn't accidentally try to prefetch non-existent fb memory.
+		 *
+		 * TBD: what about dgpu? (Not supported in Drive 5.0)
 		 */
+		if (pd->mem->cpu_va != NULL) {
+			memset((void *)((u64)pd->mem->cpu_va + pd->mem_offs), 0,
+				pentry->pd_size);
+		}
+
 		nvgpu_list_del(&pentry->list_entry);
 		nvgpu_list_add(&pentry->list_entry,
 			&cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);
@@ -414,6 +440,8 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
 		/* Empty now so free it. */
 		nvgpu_pd_cache_free_mem_entry(g, cache, pentry);
 	}
+
+	pd->mem = NULL;
 }
 
 static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up(
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index 8e1eeedc..ee2207b8 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 
 /*
  * This is the GMMU API visible to blocks outside of the GMMU. Basically this
@@ -56,18 +57,25 @@ enum gk20a_mem_rw_flag {
  */
 #define NVGPU_PD_CACHE_MIN		256U
 #define NVGPU_PD_CACHE_MIN_SHIFT	9U
-#define NVGPU_PD_CACHE_COUNT		4U
+#define NVGPU_PD_CACHE_COUNT		8U
+#define NVGPU_PD_CACHE_SIZE (NVGPU_PD_CACHE_MIN * (1U << NVGPU_PD_CACHE_COUNT))
 
 struct nvgpu_pd_mem_entry {
 	struct nvgpu_mem		mem;
 
 	/*
-	 * Size of the page directories (not the mem). bmap is a bitmap showing
-	 * which PDs have been allocated. The size of mem will always be one
-	 * page. pd_size will always be a power of 2.
+	 * Size of the page directories (not the mem). alloc_map is a bitmap
+	 * showing which PDs have been allocated.
+	 *
+	 * The size of mem will be NVGPU_PD_CACHE_SIZE
+	 * and pd_size will always be a power of 2.
+	 *
 	 */
 	u32				pd_size;
-	unsigned long			alloc_map;
+	DECLARE_BITMAP(alloc_map, NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN);
+
+	/* Total number of allocations in this PD. */
+	u32				allocs;
 
 	struct nvgpu_list_node		list_entry;
 	struct nvgpu_rbtree_node	tree_entry;
@@ -251,7 +259,10 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
 		      struct nvgpu_mem *mem,
 		      u64 gpu_va);
 
-int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes);
+int nvgpu_pd_alloc(struct vm_gk20a *vm,
+		   struct nvgpu_gmmu_pd *pd,
+		   u32 bytes);
+
 void nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd);
 int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
 				struct nvgpu_gmmu_pd *pd, u32 bytes);
--
cgit v1.2.2
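
The size-class arithmetic introduced by this patch can be checked outside the
driver. The following is a standalone C sketch, not nvgpu code: it only copies
the NVGPU_PD_CACHE_* macro values from the diff and re-implements the list
index computation of nvgpu_pd_cache_nr() in plain C (pd_cache_nr() below is a
hypothetical stand-in using a loop where the kernel uses ilog2()). It shows
that the new 64K slab spans eight size classes, from 256 PDs of 256 bytes down
to 2 PDs of 32K.

/*
 * Standalone sketch of the PD cache size-class arithmetic; illustrative
 * userspace code, not part of nvgpu.
 */
#include <stdio.h>
#include <stdint.h>

#define NVGPU_PD_CACHE_MIN       256U
#define NVGPU_PD_CACHE_MIN_SHIFT 9U
#define NVGPU_PD_CACHE_COUNT     8U
#define NVGPU_PD_CACHE_SIZE      (NVGPU_PD_CACHE_MIN * (1U << NVGPU_PD_CACHE_COUNT))

/* Stand-in for nvgpu_pd_cache_nr(): full/partial list index for a PD size. */
static uint32_t pd_cache_nr(uint32_t bytes)
{
	uint32_t v = bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1U);
	uint32_t nr = 0U;

	/* Loop-based ilog2(); the kernel uses the ilog2() helper instead. */
	while (v > 1U) {
		v >>= 1U;
		nr++;
	}
	return nr;
}

int main(void)
{
	uint32_t bytes;

	printf("NVGPU_PD_CACHE_SIZE = %u bytes\n", NVGPU_PD_CACHE_SIZE);

	/* Cached PD sizes are powers of two in [NVGPU_PD_CACHE_MIN, NVGPU_PD_CACHE_SIZE). */
	for (bytes = NVGPU_PD_CACHE_MIN; bytes < NVGPU_PD_CACHE_SIZE; bytes <<= 1U) {
		printf("pd_size %5u: list index %u, %3u PDs per 64K slab\n",
		       bytes, pd_cache_nr(bytes), NVGPU_PD_CACHE_SIZE / bytes);
	}
	return 0;
}

With NVGPU_PD_CACHE_COUNT raised from 4 to 8, the eight full/partial list pairs
now cover 256-byte through 32K PDs, and the alloc_map bitmap in
nvgpu_pd_mem_entry (NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN = 256 bits) is
exactly large enough for the smallest size class.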
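
For readers unfamiliar with the bitmap bookkeeping the patch switches to, here
is a minimal userspace sketch of the per-slab state, again not nvgpu code.
struct pd_mem_entry, pd_alloc(), pd_free(), nr_entries(), CACHE_SIZE and
MAX_ENTRIES below are hypothetical stand-ins that only mirror
nvgpu_pd_cache_alloc_from_partial() and nvgpu_pd_cache_do_free(), with a plain
bool array in place of the kernel's DECLARE_BITMAP()/set_bit()/clear_bit()/
find_first_zero_bit() helpers. It shows the allocs counter deciding between
the partial and full lists, and the memset-to-zero invalidation of a freed PD
that the commit message calls out.

/*
 * Minimal userspace sketch of the per-slab bookkeeping used by the PD cache;
 * hypothetical stand-ins, not nvgpu code.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>

#define CACHE_SIZE  65536U  /* stands in for NVGPU_PD_CACHE_SIZE */
#define MAX_ENTRIES 256U    /* NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN */

/* Simplified stand-in for struct nvgpu_pd_mem_entry. */
struct pd_mem_entry {
	uint8_t  backing[CACHE_SIZE];    /* stands in for the DMA-allocated slab */
	bool     alloc_map[MAX_ENTRIES]; /* the kernel uses DECLARE_BITMAP() */
	uint32_t allocs;                 /* live PDs in this slab */
	uint32_t pd_size;                /* power of two, >= 256 */
};

static uint32_t nr_entries(const struct pd_mem_entry *e)
{
	return CACHE_SIZE / e->pd_size;
}

/* Mirrors nvgpu_pd_cache_alloc_from_partial(): find a clear bit and set it. */
static int pd_alloc(struct pd_mem_entry *e, uint32_t *mem_offs)
{
	uint32_t i, n = nr_entries(e);

	for (i = 0U; i < n; i++) {
		if (!e->alloc_map[i]) {
			e->alloc_map[i] = true;
			e->allocs++;
			*mem_offs = i * e->pd_size;
			/* The caller moves the slab to the full list once allocs == n. */
			return 0;
		}
	}
	return -1; /* bitmap full */
}

/* Mirrors nvgpu_pd_cache_do_free(): clear the bit and invalidate the PD. */
static void pd_free(struct pd_mem_entry *e, uint32_t mem_offs)
{
	uint32_t bit = mem_offs / e->pd_size;

	e->alloc_map[bit] = false;
	e->allocs--;

	if (e->allocs > 0U) {
		/*
		 * Slab still partially in use: zero the freed PD so stale
		 * entries cannot be prefetched, as the patch does on iGPU.
		 */
		memset(&e->backing[mem_offs], 0, e->pd_size);
	}
	/* else: the caller frees the whole slab. */
}

int main(void)
{
	static struct pd_mem_entry e = { .pd_size = 4096U };
	uint32_t offs = 0U;

	if (pd_alloc(&e, &offs) == 0) {
		printf("allocated PD at offset %u (1 of %u per slab)\n",
		       offs, nr_entries(&e));
	}
	pd_free(&e, offs);
	return 0;
}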