author     Peter Daifuku <pdaifuku@nvidia.com>                    2020-08-20 21:45:26 -0400
committer  mobile promotions <svcmobile_promotions@nvidia.com>    2020-09-15 05:38:45 -0400
commit     036e000a17425e0569990f2aacae91b273392153 (patch)
tree       5fb6845c56652a6bf97f70411846c891cdfbfab3
parent     1c34f50227e9f308491758482d88c3c2f6605ffb (diff)
nvgpu: add PD cache support for page-sized PTEs
Large buffers being mapped to GMMU end up needing many pages for the
PTE tables. Allocating these pages one by one can end up being a
performance bottleneck, particularly in the virtualized case.

Add support for page-sized PTEs to the existing PD cache:

- define NVGPU_PD_CACHE_SIZE, the allocation size for a new slab for
  the PD cache, effectively set to 64K bytes
- Use the PD cache for any allocation < NVGPU_PD_CACHE_SIZE
- When freeing up cached entries, avoid prefetch errors by
  invalidating the entry (memset to 0)

Bug 3093183
Bug 3100907

Change-Id: I2302a1dfeb056b9461159121bbae1be70524a357
Signed-off-by: Peter Daifuku <pdaifuku@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2401783
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: Satish Arora <satisha@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r--  drivers/gpu/nvgpu/common/mm/pd_cache.c    94
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/gmmu.h    23
2 files changed, 78 insertions, 39 deletions
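
To make the routing described in the commit message concrete, here is a minimal sketch (plain C, not the driver code) of the size check that decides whether a PD allocation is served from the slab cache or falls back to a direct DMA allocation. The helper name pd_uses_cache() and the standalone constants are illustrative assumptions; the real checks live in nvgpu_pd_alloc() and nvgpu_pd_cache_alloc() in the diff below.

/*
 * Sketch of the size-based routing added by this change; illustrative
 * only. A request is served from the PD cache when it is a power of
 * two in [PD_CACHE_MIN, PD_CACHE_SIZE); anything else (including the
 * page-aligned PDB) goes through the direct allocator.
 */
#include <stdbool.h>
#include <stdint.h>

#define PD_CACHE_MIN   256U                        /* smallest cached PD */
#define PD_CACHE_SIZE  (PD_CACHE_MIN * (1U << 8U)) /* 64K slab, as in this patch */

static bool pd_uses_cache(uint32_t bytes)
{
        /* Reject non-power-of-two sizes, then range-check. */
        if ((bytes & (bytes - 1U)) != 0U) {
                return false;
        }
        return (bytes >= PD_CACHE_MIN) && (bytes < PD_CACHE_SIZE);
}

With the old code the cutoff was PAGE_SIZE, so a 4K PTE table always took the direct path; raising the cutoff to the 64K slab size is what lets page-sized PTE tables share slab allocations.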
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c
index d9dc3db0..a5b3d134 100644
--- a/drivers/gpu/nvgpu/common/mm/pd_cache.c
+++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -42,7 +42,7 @@
  * to have 4 of these PDs in one page. This is even more pronounced for 256 byte
  * PD tables.
  *
- * The pd cache is basially just a slab allocator. Each instance of the nvgpu
+ * The pd cache is basically a slab allocator. Each instance of the nvgpu
  * driver makes one of these structs:
  *
  * struct nvgpu_pd_cache {
@@ -52,22 +52,18 @@
  *     struct nvgpu_rbtree_node *mem_tree;
  * };
  *
- * There are two sets of lists, the full and the partial. The full lists contain
- * pages of memory for which all the memory in that page is in use. The partial
- * lists contain partially full pages of memory which can be used for more PD
- * allocations. There a couple of assumptions here:
+ * There are two sets of lists used for cached allocations, the full and the
+ * partial. The full lists contain pages of memory for which all the memory in
+ * that entry is in use. The partial lists contain partially full blocks of
+ * memory which can be used for more PD allocations. The cache works as follows:
  *
- *   1. PDs greater than or equal to the page size bypass the pd cache.
+ *   1. PDs greater than NVGPU_PD_CACHE_SIZE bypass the pd cache.
  *   2. PDs are always power of 2 and greater than %NVGPU_PD_CACHE_MIN bytes.
  *
- * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial
- * lists. For a 4Kb page NVGPU_PD_CACHE_COUNT is 4. This is enough space for
- * 256, 512, 1024, and 2048 byte PDs.
- *
  * nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD
- * size is page size or larger and choose the correct allocation scheme - either
- * from the PD cache or directly. Similarly nvgpu_pd_free() will free a PD
- * allocated by nvgpu_pd_alloc().
+ * size is NVGPU_PD_CACHE_SIZE or larger and choose the correct allocation
+ * scheme - either from the PD cache or directly. Similarly nvgpu_pd_free()
+ * will free a PD allocated by nvgpu_pd_alloc().
  *
  * Since the top level PD (the PDB) is a page aligned pointer but less than a
  * page size the direct functions must be used for allocating PDBs. Otherwise
@@ -79,11 +75,11 @@ static u32 nvgpu_pd_cache_nr(u32 bytes)
         return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1U));
 }
 
-static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry)
+static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
 {
-        u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size);
+        BUG_ON(pentry->pd_size == 0);
 
-        return mask_offset - 1U;
+        return NVGPU_PD_CACHE_SIZE / pentry->pd_size;
 }
 
 int nvgpu_pd_cache_init(struct gk20a *g)
@@ -201,6 +197,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
                                     u32 bytes)
 {
         struct nvgpu_pd_mem_entry *pentry;
+        unsigned long flags = 0;
+        int err;
 
         pd_dbg(g, "PD-Alloc [C] New: offs=0");
 
@@ -210,8 +208,21 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
                 return -ENOMEM;
         }
 
-        if (nvgpu_dma_alloc(g, PAGE_SIZE, &pentry->mem)) {
+        if (!nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > PAGE_SIZE)) {
+                flags = NVGPU_DMA_FORCE_CONTIGUOUS;
+        }
+
+        err = nvgpu_dma_alloc_flags(g, flags,
+                                    NVGPU_PD_CACHE_SIZE, &pentry->mem);
+        if (err != 0) {
                 nvgpu_kfree(g, pentry);
+
+                /* Not enough contiguous space, but a direct
+                 * allocation may work
+                 */
+                if (err == -ENOMEM) {
+                        return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
+                }
                 nvgpu_err(g, "Unable to DMA alloc!");
                 return -ENOMEM;
         }
@@ -224,7 +235,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
          * This allocates the very first PD table in the set of tables in this
          * nvgpu_pd_mem_entry.
          */
-        pentry->alloc_map = 1;
+        set_bit(0U, pentry->alloc_map);
+        pentry->allocs = 1;
 
         /*
          * Now update the nvgpu_gmmu_pd to reflect this allocation.
@@ -246,20 +258,21 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
 {
         unsigned long bit_offs;
         u32 mem_offs;
-        u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry);
+        u32 nr_bits = nvgpu_pd_cache_get_nr_entries(pentry);
 
         /*
          * Find and allocate an open PD.
          */
-        bit_offs = ffz(pentry->alloc_map);
+        bit_offs = find_first_zero_bit(pentry->alloc_map, nr_bits);
         mem_offs = bit_offs * pentry->pd_size;
 
         /* Bit map full. Somethings wrong. */
-        if (WARN_ON(bit_offs >= ffz(pentry_mask))) {
+        if (WARN_ON(bit_offs >= nr_bits)) {
                 return -ENOMEM;
         }
 
-        pentry->alloc_map |= 1 << bit_offs;
+        set_bit(bit_offs, pentry->alloc_map);
+        pentry->allocs++;
 
         pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs);
 
@@ -273,7 +286,7 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
         /*
          * Now make sure the pentry is in the correct list (full vs partial).
         */
-        if ((pentry->alloc_map & pentry_mask) == pentry_mask) {
+        if (pentry->allocs >= nr_bits) {
                 pd_dbg(g, "Adding pentry to full list!");
                 nvgpu_list_del(&pentry->list_entry);
                 nvgpu_list_add(&pentry->list_entry,
@@ -314,7 +327,7 @@ static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
         pd_dbg(g, "PD-Alloc [C] %u bytes", bytes);
 
         if ((bytes & (bytes - 1U)) != 0U ||
-            (bytes >= PAGE_SIZE ||
+            (bytes >= NVGPU_PD_CACHE_SIZE ||
              bytes < NVGPU_PD_CACHE_MIN)) {
                 pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes);
                 return -EINVAL;
@@ -339,16 +352,18 @@ static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
  * cache logistics. Since on Parker and later GPUs some of the page directories
  * are smaller than a page packing these PDs together saves a lot of memory.
  */
-int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
+int nvgpu_pd_alloc(struct vm_gk20a *vm,
+                   struct nvgpu_gmmu_pd *pd,
+                   u32 bytes)
 {
         struct gk20a *g = gk20a_from_vm(vm);
         int err;
 
         /*
-         * Simple case: PD is bigger than a page so just do a regular DMA
-         * alloc.
+         * Simple case: PD is bigger than or equal to NVGPU_PD_CACHE_SIZE so
+         * just do a regular DMA alloc.
          */
-        if (bytes >= PAGE_SIZE) {
+        if (bytes >= NVGPU_PD_CACHE_SIZE) {
                 err = nvgpu_pd_cache_alloc_direct(g, pd, bytes);
                 if (err) {
                         return err;
@@ -396,17 +411,28 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
                                    struct nvgpu_pd_mem_entry *pentry,
                                    struct nvgpu_gmmu_pd *pd)
 {
-        u32 index = pd->mem_offs / pentry->pd_size;
-        u32 bit = 1 << index;
+        u32 bit = pd->mem_offs / pentry->pd_size;
 
         /* Mark entry as free. */
-        pentry->alloc_map &= ~bit;
+        clear_bit(bit, pentry->alloc_map);
+        pentry->allocs--;
 
-        if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) {
+        if (pentry->allocs > 0U) {
                 /*
                  * Partially full still. If it was already on the partial list
                  * this just re-adds it.
+                 *
+                 * Since the memory used for the entries is still mapped, if
+                 * igpu make sure the entries are invalidated so that the hw
+                 * doesn't accidentally try to prefetch non-existent fb memory.
+                 *
+                 * TBD: what about dgpu? (Not supported in Drive 5.0)
                  */
+                if (pd->mem->cpu_va != NULL) {
+                        memset((void *)((u64)pd->mem->cpu_va + pd->mem_offs), 0,
+                               pentry->pd_size);
+                }
+
                 nvgpu_list_del(&pentry->list_entry);
                 nvgpu_list_add(&pentry->list_entry,
                         &cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);
@@ -414,6 +440,8 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
                 /* Empty now so free it. */
                 nvgpu_pd_cache_free_mem_entry(g, cache, pentry);
         }
+
+        pd->mem = NULL;
 }
 
 static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up(
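
The bookkeeping change in pd_cache.c replaces the old single-word alloc_map mask with a kernel bitmap (set_bit/clear_bit/find_first_zero_bit) plus an allocs counter, and zeroes each slot on free so the hardware cannot prefetch stale entries. The following self-contained sketch mirrors that logic in plain user-space C; the slab and slot sizes, names, and types are assumptions for illustration, not the driver's own.

/*
 * Self-contained sketch of the slot bookkeeping this patch switches to:
 * a per-slab bitmap plus an allocation counter, with each freed slot
 * zeroed so stale PTE data cannot be prefetched. The real driver uses
 * the kernel bitmap helpers; this mock-up only mirrors the logic.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define SLAB_SIZE  (64U * 1024U)   /* stands in for NVGPU_PD_CACHE_SIZE */
#define PD_SIZE    4096U           /* example: page-sized PTE tables */
#define NR_SLOTS   (SLAB_SIZE / PD_SIZE)

struct pd_slab {
        uint8_t  mem[SLAB_SIZE];    /* stands in for the DMA-able slab */
        uint64_t alloc_map;         /* 16 slots fit in one word here */
        uint32_t allocs;            /* mirrors pentry->allocs */
};

/* Return a slot offset into the slab, or -1 if the slab is full. */
static int slab_alloc(struct pd_slab *s)
{
        uint32_t bit;

        for (bit = 0; bit < NR_SLOTS; bit++) {
                if ((s->alloc_map & (1ULL << bit)) == 0) {
                        s->alloc_map |= 1ULL << bit;
                        s->allocs++;
                        return (int)(bit * PD_SIZE);
                }
        }
        return -1;
}

/* Free a slot: clear its bit and invalidate (zero) its contents. */
static void slab_free(struct pd_slab *s, uint32_t mem_offs)
{
        uint32_t bit = mem_offs / PD_SIZE;

        s->alloc_map &= ~(1ULL << bit);
        s->allocs--;
        memset(&s->mem[mem_offs], 0, PD_SIZE);
}

int main(void)
{
        struct pd_slab slab = { 0 };
        int offs = slab_alloc(&slab);

        printf("allocated slot at offset %d, allocs=%u\n", offs, slab.allocs);
        slab_free(&slab, (uint32_t)offs);
        printf("after free, allocs=%u\n", slab.allocs);
        return 0;
}

In the driver the same alloc/free transitions also move the nvgpu_pd_mem_entry between the partial and full lists, which this sketch leaves out.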
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index 8e1eeedc..ee2207b8 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -28,6 +28,7 @@
 #include <nvgpu/list.h>
 #include <nvgpu/rbtree.h>
 #include <nvgpu/lock.h>
+#include <nvgpu/bitops.h>
 
 /*
  * This is the GMMU API visible to blocks outside of the GMMU. Basically this
@@ -56,18 +57,25 @@ enum gk20a_mem_rw_flag {
  */
 #define NVGPU_PD_CACHE_MIN          256U
 #define NVGPU_PD_CACHE_MIN_SHIFT    9U
-#define NVGPU_PD_CACHE_COUNT        4U
+#define NVGPU_PD_CACHE_COUNT        8U
+#define NVGPU_PD_CACHE_SIZE         (NVGPU_PD_CACHE_MIN * (1U << NVGPU_PD_CACHE_COUNT))
 
 struct nvgpu_pd_mem_entry {
         struct nvgpu_mem mem;
 
         /*
-         * Size of the page directories (not the mem). bmap is a bitmap showing
-         * which PDs have been allocated. The size of mem will always be one
-         * page. pd_size will always be a power of 2.
+         * Size of the page directories (not the mem). alloc_map is a bitmap
+         * showing which PDs have been allocated.
+         *
+         * The size of mem will be NVGPU_PD_CACHE_SIZE
+         * and pd_size will always be a power of 2.
+         *
          */
         u32 pd_size;
-        unsigned long alloc_map;
+        DECLARE_BITMAP(alloc_map, NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN);
+
+        /* Total number of allocations in this PD. */
+        u32 allocs;
 
         struct nvgpu_list_node list_entry;
         struct nvgpu_rbtree_node tree_entry;
@@ -251,7 +259,10 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
                       struct nvgpu_mem *mem,
                       u64 gpu_va);
 
-int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes);
+int nvgpu_pd_alloc(struct vm_gk20a *vm,
+                   struct nvgpu_gmmu_pd *pd,
+                   u32 bytes);
+
 void nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd);
 int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
                 struct nvgpu_gmmu_pd *pd, u32 bytes);
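
For a quick sanity check of the new constants: NVGPU_PD_CACHE_MIN is 256 bytes and NVGPU_PD_CACHE_COUNT is now 8, so NVGPU_PD_CACHE_SIZE works out to 256 * 2^8 = 65536 bytes (64K). The alloc_map bitmap therefore covers at most 65536 / 256 = 256 slots, and a single slab holds 16 page-sized (4K) PTE tables. The compile-time checks below restate that arithmetic with local stand-in macros (assumed values mirroring this patch, not the header itself).

/* Stand-in macros mirroring the header; values assumed from this patch. */
#define PD_CACHE_MIN    256U
#define PD_CACHE_COUNT  8U
#define PD_CACHE_SIZE   (PD_CACHE_MIN * (1U << PD_CACHE_COUNT))

_Static_assert(PD_CACHE_SIZE == 65536U, "slab size is 64K");
_Static_assert(PD_CACHE_SIZE / PD_CACHE_MIN == 256U, "alloc_map covers 256 slots");
_Static_assert(PD_CACHE_SIZE / 4096U == 16U, "16 page-sized PTE tables per slab");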