author     Peter Daifuku <pdaifuku@nvidia.com>                    2020-08-20 21:45:26 -0400
committer  mobile promotions <svcmobile_promotions@nvidia.com>    2020-09-15 05:38:45 -0400
commit     036e000a17425e0569990f2aacae91b273392153 (patch)
tree       5fb6845c56652a6bf97f70411846c891cdfbfab3
parent     1c34f50227e9f308491758482d88c3c2f6605ffb (diff)
nvgpu: add PD cache support for page-sized PTEs
Large buffers being mapped to GMMU end up needing many pages for the
PTE tables. Allocating these pages one by one can end up being a
performance bottleneck, particularly in the virtualized case.

Add support for page-sized PTEs to the existing PD cache:

- define NVGPU_PD_CACHE_SIZE, the allocation size for a new slab for
  the PD cache, effectively set to 64K bytes
- Use the PD cache for any allocation < NVGPU_PD_CACHE_SIZE
- When freeing up cached entries, avoid prefetch errors by
  invalidating the entry (memset to 0)

Bug 3093183
Bug 3100907

Change-Id: I2302a1dfeb056b9461159121bbae1be70524a357
Signed-off-by: Peter Daifuku <pdaifuku@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2401783
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Satish Arora <satisha@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r--  drivers/gpu/nvgpu/common/mm/pd_cache.c    94
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/gmmu.h    23
2 files changed, 78 insertions, 39 deletions
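
To make the routing described in the commit message concrete, here is a minimal sketch (plain C, not the driver code) of the size check that decides whether a PD allocation is served from the slab cache or falls back to a direct DMA allocation. The helper name pd_uses_cache() and the standalone constants are illustrative assumptions; the real checks live in nvgpu_pd_alloc() and nvgpu_pd_cache_alloc() in the diff below.

/*
 * Sketch of the size-based routing added by this change; illustrative
 * only. A request is served from the PD cache when it is a power of
 * two in [PD_CACHE_MIN, PD_CACHE_SIZE); anything else (including the
 * page-aligned PDB) goes through the direct allocator.
 */
#include <stdbool.h>
#include <stdint.h>

#define PD_CACHE_MIN   256U                        /* smallest cached PD */
#define PD_CACHE_SIZE  (PD_CACHE_MIN * (1U << 8U)) /* 64K slab, as in this patch */

static bool pd_uses_cache(uint32_t bytes)
{
        /* Reject non-power-of-two sizes, then range-check. */
        if ((bytes & (bytes - 1U)) != 0U) {
                return false;
        }
        return (bytes >= PD_CACHE_MIN) && (bytes < PD_CACHE_SIZE);
}

With the old code the cutoff was PAGE_SIZE, so a 4K PTE table always took the direct path; raising the cutoff to the 64K slab size is what lets page-sized PTE tables share slab allocations.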
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c
index d9dc3db0..a5b3d134 100644
--- a/drivers/gpu/nvgpu/common/mm/pd_cache.c
+++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -42,7 +42,7 @@
  * to have 4 of these PDs in one page. This is even more pronounced for 256 byte
  * PD tables.
  *
- * The pd cache is basially just a slab allocator. Each instance of the nvgpu
+ * The pd cache is basically a slab allocator. Each instance of the nvgpu
  * driver makes one of these structs:
  *
  * struct nvgpu_pd_cache {
@@ -52,22 +52,18 @@
  *     struct nvgpu_rbtree_node *mem_tree;
  * };
  *
- * There are two sets of lists, the full and the partial. The full lists contain
- * pages of memory for which all the memory in that page is in use. The partial
- * lists contain partially full pages of memory which can be used for more PD
- * allocations. There a couple of assumptions here:
+ * There are two sets of lists used for cached allocations, the full and the
+ * partial. The full lists contain pages of memory for which all the memory in
+ * that entry is in use. The partial lists contain partially full blocks of
+ * memory which can be used for more PD allocations. The cache works as follows:
  *
- *   1. PDs greater than or equal to the page size bypass the pd cache.
+ *   1. PDs greater than NVGPU_PD_CACHE_SIZE bypass the pd cache.
  *   2. PDs are always power of 2 and greater than %NVGPU_PD_CACHE_MIN bytes.
  *
- * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial
- * lists. For a 4Kb page NVGPU_PD_CACHE_COUNT is 4. This is enough space for
- * 256, 512, 1024, and 2048 byte PDs.
- *
  * nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD
- * size is page size or larger and choose the correct allocation scheme - either
- * from the PD cache or directly. Similarly nvgpu_pd_free() will free a PD
- * allocated by nvgpu_pd_alloc().
+ * size is NVGPU_PD_CACHE_SIZE or larger and choose the correct allocation
+ * scheme - either from the PD cache or directly. Similarly nvgpu_pd_free()
+ * will free a PD allocated by nvgpu_pd_alloc().
  *
  * Since the top level PD (the PDB) is a page aligned pointer but less than a
  * page size the direct functions must be used for allocating PDBs. Otherwise
@@ -79,11 +75,11 @@ static u32 nvgpu_pd_cache_nr(u32 bytes)
         return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1U));
 }
 
-static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry)
+static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
 {
-        u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size);
+        BUG_ON(pentry->pd_size == 0);
 
-        return mask_offset - 1U;
+        return NVGPU_PD_CACHE_SIZE / pentry->pd_size;
 }
 
 int nvgpu_pd_cache_init(struct gk20a *g)
@@ -201,6 +197,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
                                     u32 bytes)
 {
         struct nvgpu_pd_mem_entry *pentry;
+        unsigned long flags = 0;
+        int err;
 
         pd_dbg(g, "PD-Alloc [C] New: offs=0");
 
@@ -210,8 +208,21 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
                 return -ENOMEM;
         }
 
-        if (nvgpu_dma_alloc(g, PAGE_SIZE, &pentry->mem)) {
+        if (!nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > PAGE_SIZE)) {
+                flags = NVGPU_DMA_FORCE_CONTIGUOUS;
+        }
+
+        err = nvgpu_dma_alloc_flags(g, flags,
+                                    NVGPU_PD_CACHE_SIZE, &pentry->mem);
+        if (err != 0) {
                 nvgpu_kfree(g, pentry);
+
+                /* Not enough contiguous space, but a direct
+                 * allocation may work
+                 */
+                if (err == -ENOMEM) {
+                        return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
+                }
                 nvgpu_err(g, "Unable to DMA alloc!");
                 return -ENOMEM;
         }
@@ -224,7 +235,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
          * This allocates the very first PD table in the set of tables in this
          * nvgpu_pd_mem_entry.
          */
-        pentry->alloc_map = 1;
+        set_bit(0U, pentry->alloc_map);
+        pentry->allocs = 1;
 
         /*
          * Now update the nvgpu_gmmu_pd to reflect this allocation.
@@ -246,20 +258,21 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
 {
         unsigned long bit_offs;
         u32 mem_offs;
-        u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry);
+        u32 nr_bits = nvgpu_pd_cache_get_nr_entries(pentry);
 
         /*
          * Find and allocate an open PD.
          */
-        bit_offs = ffz(pentry->alloc_map);
+        bit_offs = find_first_zero_bit(pentry->alloc_map, nr_bits);
         mem_offs = bit_offs * pentry->pd_size;
 
         /* Bit map full. Somethings wrong. */
-        if (WARN_ON(bit_offs >= ffz(pentry_mask))) {
+        if (WARN_ON(bit_offs >= nr_bits)) {
                 return -ENOMEM;
         }
 
-        pentry->alloc_map |= 1 << bit_offs;
+        set_bit(bit_offs, pentry->alloc_map);
+        pentry->allocs++;
 
         pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs);
 
@@ -273,7 +286,7 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
         /*
          * Now make sure the pentry is in the correct list (full vs partial).
         */
-        if ((pentry->alloc_map & pentry_mask) == pentry_mask) {
+        if (pentry->allocs >= nr_bits) {
                 pd_dbg(g, "Adding pentry to full list!");
                 nvgpu_list_del(&pentry->list_entry);
                 nvgpu_list_add(&pentry->list_entry,
@@ -314,7 +327,7 @@ static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
         pd_dbg(g, "PD-Alloc [C] %u bytes", bytes);
 
         if ((bytes & (bytes - 1U)) != 0U ||
-            (bytes >= PAGE_SIZE ||
+            (bytes >= NVGPU_PD_CACHE_SIZE ||
              bytes < NVGPU_PD_CACHE_MIN)) {
                 pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes);
                 return -EINVAL;
@@ -339,16 +352,18 @@ static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
  * cache logistics. Since on Parker and later GPUs some of the page directories
  * are smaller than a page packing these PDs together saves a lot of memory.
  */
-int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
+int nvgpu_pd_alloc(struct vm_gk20a *vm,
+                   struct nvgpu_gmmu_pd *pd,
+                   u32 bytes)
 {
         struct gk20a *g = gk20a_from_vm(vm);
         int err;
 
         /*
-         * Simple case: PD is bigger than a page so just do a regular DMA
-         * alloc.
+         * Simple case: PD is bigger than or equal to NVGPU_PD_CACHE_SIZE so
+         * just do a regular DMA alloc.
          */
-        if (bytes >= PAGE_SIZE) {
+        if (bytes >= NVGPU_PD_CACHE_SIZE) {
                 err = nvgpu_pd_cache_alloc_direct(g, pd, bytes);
                 if (err) {
                         return err;
@@ -396,17 +411,28 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
                                    struct nvgpu_pd_mem_entry *pentry,
                                    struct nvgpu_gmmu_pd *pd)
 {
-        u32 index = pd->mem_offs / pentry->pd_size;
-        u32 bit = 1 << index;
+        u32 bit = pd->mem_offs / pentry->pd_size;
 
         /* Mark entry as free. */
-        pentry->alloc_map &= ~bit;
+        clear_bit(bit, pentry->alloc_map);
+        pentry->allocs--;
 
-        if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) {
+        if (pentry->allocs > 0U) {
                 /*
                  * Partially full still. If it was already on the partial list
                  * this just re-adds it.
+                 *
+                 * Since the memory used for the entries is still mapped, if
+                 * igpu make sure the entries are invalidated so that the hw
+                 * doesn't accidentally try to prefetch non-existent fb memory.
+                 *
+                 * TBD: what about dgpu? (Not supported in Drive 5.0)
                  */
+                if (pd->mem->cpu_va != NULL) {
+                        memset((void *)((u64)pd->mem->cpu_va + pd->mem_offs), 0,
+                               pentry->pd_size);
+                }
+
                 nvgpu_list_del(&pentry->list_entry);
                 nvgpu_list_add(&pentry->list_entry,
                         &cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);
@@ -414,6 +440,8 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
                 /* Empty now so free it. */
                 nvgpu_pd_cache_free_mem_entry(g, cache, pentry);
         }
+
+        pd->mem = NULL;
 }
 
 static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up(
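
The bookkeeping change in pd_cache.c replaces the old single-word alloc_map mask with a kernel bitmap (set_bit/clear_bit/find_first_zero_bit) plus an allocs counter, and zeroes each slot on free so the hardware cannot prefetch stale entries. The following self-contained sketch mirrors that logic in plain user-space C; the slab and slot sizes, names, and types are assumptions for illustration, not the driver's own.

/*
 * Self-contained sketch of the slot bookkeeping this patch switches to:
 * a per-slab bitmap plus an allocation counter, with each freed slot
 * zeroed so stale PTE data cannot be prefetched. The real driver uses
 * the kernel bitmap helpers; this mock-up only mirrors the logic.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define SLAB_SIZE  (64U * 1024U)   /* stands in for NVGPU_PD_CACHE_SIZE */
#define PD_SIZE    4096U           /* example: page-sized PTE tables */
#define NR_SLOTS   (SLAB_SIZE / PD_SIZE)

struct pd_slab {
        uint8_t  mem[SLAB_SIZE];    /* stands in for the DMA-able slab */
        uint64_t alloc_map;         /* 16 slots fit in one word here */
        uint32_t allocs;            /* mirrors pentry->allocs */
};

/* Return a slot offset into the slab, or -1 if the slab is full. */
static int slab_alloc(struct pd_slab *s)
{
        uint32_t bit;

        for (bit = 0; bit < NR_SLOTS; bit++) {
                if ((s->alloc_map & (1ULL << bit)) == 0) {
                        s->alloc_map |= 1ULL << bit;
                        s->allocs++;
                        return (int)(bit * PD_SIZE);
                }
        }
        return -1;
}

/* Free a slot: clear its bit and invalidate (zero) its contents. */
static void slab_free(struct pd_slab *s, uint32_t mem_offs)
{
        uint32_t bit = mem_offs / PD_SIZE;

        s->alloc_map &= ~(1ULL << bit);
        s->allocs--;
        memset(&s->mem[mem_offs], 0, PD_SIZE);
}

int main(void)
{
        struct pd_slab slab = { 0 };
        int offs = slab_alloc(&slab);

        printf("allocated slot at offset %d, allocs=%u\n", offs, slab.allocs);
        slab_free(&slab, (uint32_t)offs);
        printf("after free, allocs=%u\n", slab.allocs);
        return 0;
}

In the driver the same alloc/free transitions also move the nvgpu_pd_mem_entry between the partial and full lists, which this sketch leaves out.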
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index 8e1eeedc..ee2207b8 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -28,6 +28,7 @@
 #include <nvgpu/list.h>
 #include <nvgpu/rbtree.h>
 #include <nvgpu/lock.h>
+#include <nvgpu/bitops.h>
 
 /*
  * This is the GMMU API visible to blocks outside of the GMMU. Basically this
@@ -56,18 +57,25 @@ enum gk20a_mem_rw_flag {
  */
 #define NVGPU_PD_CACHE_MIN          256U
 #define NVGPU_PD_CACHE_MIN_SHIFT    9U
-#define NVGPU_PD_CACHE_COUNT        4U
+#define NVGPU_PD_CACHE_COUNT        8U
+#define NVGPU_PD_CACHE_SIZE         (NVGPU_PD_CACHE_MIN * (1U << NVGPU_PD_CACHE_COUNT))
 
 struct nvgpu_pd_mem_entry {
         struct nvgpu_mem mem;
 
         /*
-         * Size of the page directories (not the mem). bmap is a bitmap showing
-         * which PDs have been allocated. The size of mem will always be one
-         * page. pd_size will always be a power of 2.
+         * Size of the page directories (not the mem). alloc_map is a bitmap
+         * showing which PDs have been allocated.
+         *
+         * The size of mem will be NVGPU_PD_CACHE_SIZE
+         * and pd_size will always be a power of 2.
+         *
          */
         u32 pd_size;
-        unsigned long alloc_map;
+        DECLARE_BITMAP(alloc_map, NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN);
+
+        /* Total number of allocations in this PD. */
+        u32 allocs;
 
         struct nvgpu_list_node list_entry;
         struct nvgpu_rbtree_node tree_entry;
@@ -251,7 +259,10 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
                       struct nvgpu_mem *mem,
                       u64 gpu_va);
 
-int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes);
+int nvgpu_pd_alloc(struct vm_gk20a *vm,
+                   struct nvgpu_gmmu_pd *pd,
+                   u32 bytes);
+
 void nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd);
 int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
                 struct nvgpu_gmmu_pd *pd, u32 bytes);
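
For a quick sanity check of the new constants: NVGPU_PD_CACHE_MIN is 256 bytes and NVGPU_PD_CACHE_COUNT is now 8, so NVGPU_PD_CACHE_SIZE works out to 256 * 2^8 = 65536 bytes (64K). The alloc_map bitmap therefore covers at most 65536 / 256 = 256 slots, and a single slab holds 16 page-sized (4K) PTE tables. The compile-time checks below restate that arithmetic with local stand-in macros (assumed values mirroring this patch, not the header itself).

/* Stand-in macros mirroring the header; values assumed from this patch. */
#define PD_CACHE_MIN    256U
#define PD_CACHE_COUNT  8U
#define PD_CACHE_SIZE   (PD_CACHE_MIN * (1U << PD_CACHE_COUNT))

_Static_assert(PD_CACHE_SIZE == 65536U, "slab size is 64K");
_Static_assert(PD_CACHE_SIZE / PD_CACHE_MIN == 256U, "alloc_map covers 256 slots");
_Static_assert(PD_CACHE_SIZE / 4096U == 16U, "16 page-sized PTE tables per slab");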