From 036e000a17425e0569990f2aacae91b273392153 Mon Sep 17 00:00:00 2001
From: Peter Daifuku
Date: Thu, 20 Aug 2020 18:45:26 -0700
Subject: nvgpu: add PD cache support for page-sized PTEs

Large buffers being mapped to GMMU end up needing many pages for the
PTE tables. Allocating these pages one by one can end up being a
performance bottleneck, particularly in the virtualized case.

Add support for page-sized PTEs to the existing PD cache:
- define NVGPU_PD_CACHE_SIZE, the allocation size for a new slab for
  the PD cache, effectively set to 64K bytes
- Use the PD cache for any allocation < NVGPU_PD_CACHE_SIZE
- When freeing up cached entries, avoid prefetch errors by
  invalidating the entry (memset to 0)

Bug 3093183
Bug 3100907

Change-Id: I2302a1dfeb056b9461159121bbae1be70524a357
Signed-off-by: Peter Daifuku
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2401783
Reviewed-by: Alex Waterman
Reviewed-by: Satish Arora
Reviewed-by: mobile promotions
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/mm/pd_cache.c | 94 ++++++++++++++++++++++------------
 drivers/gpu/nvgpu/include/nvgpu/gmmu.h | 23 ++++++---
 2 files changed, 78 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c
index d9dc3db0..a5b3d134 100644
--- a/drivers/gpu/nvgpu/common/mm/pd_cache.c
+++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -42,7 +42,7 @@
  * to have 4 of these PDs in one page. This is even more pronounced for 256 byte
  * PD tables.
  *
- * The pd cache is basially just a slab allocator. Each instance of the nvgpu
+ * The pd cache is basically a slab allocator. Each instance of the nvgpu
  * driver makes one of these structs:
  *
  * struct nvgpu_pd_cache {
@@ -52,22 +52,18 @@
  * struct nvgpu_rbtree_node *mem_tree;
  * };
  *
- * There are two sets of lists, the full and the partial. The full lists contain
- * pages of memory for which all the memory in that page is in use. The partial
- * lists contain partially full pages of memory which can be used for more PD
- * allocations. There a couple of assumptions here:
+ * There are two sets of lists used for cached allocations, the full and the
+ * partial. The full lists contain pages of memory for which all the memory in
+ * that entry is in use. The partial lists contain partially full blocks of
+ * memory which can be used for more PD allocations. The cache works as follows:
  *
- * 1. PDs greater than or equal to the page size bypass the pd cache.
+ * 1. PDs greater than NVGPU_PD_CACHE_SIZE bypass the pd cache.
  * 2. PDs are always power of 2 and greater than %NVGPU_PD_CACHE_MIN bytes.
  *
- * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial
- * lists. For a 4Kb page NVGPU_PD_CACHE_COUNT is 4. This is enough space for
- * 256, 512, 1024, and 2048 byte PDs.
- *
  * nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD
- * size is page size or larger and choose the correct allocation scheme - either
- * from the PD cache or directly. Similarly nvgpu_pd_free() will free a PD
- * allocated by nvgpu_pd_alloc().
+ * size is NVGPU_PD_CACHE_SIZE or larger and choose the correct allocation
+ * scheme - either from the PD cache or directly. Similarly nvgpu_pd_free()
+ * will free a PD allocated by nvgpu_pd_alloc().
  *
  * Since the top level PD (the PDB) is a page aligned pointer but less than a
  * page size the direct functions must be used for allocating PDBs. Otherwise
@@ -79,11 +75,11 @@ static u32 nvgpu_pd_cache_nr(u32 bytes)
 	return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1U));
 }
 
-static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry)
+static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
 {
-	u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size);
+	BUG_ON(pentry->pd_size == 0);
 
-	return mask_offset - 1U;
+	return NVGPU_PD_CACHE_SIZE / pentry->pd_size;
 }
 
 int nvgpu_pd_cache_init(struct gk20a *g)
@@ -201,6 +197,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
 				    u32 bytes)
 {
 	struct nvgpu_pd_mem_entry *pentry;
+	unsigned long flags = 0;
+	int err;
 
 	pd_dbg(g, "PD-Alloc [C] New: offs=0");
 
@@ -210,8 +208,21 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
 		return -ENOMEM;
 	}
 
-	if (nvgpu_dma_alloc(g, PAGE_SIZE, &pentry->mem)) {
+	if (!nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > PAGE_SIZE)) {
+		flags = NVGPU_DMA_FORCE_CONTIGUOUS;
+	}
+
+	err = nvgpu_dma_alloc_flags(g, flags,
+			NVGPU_PD_CACHE_SIZE, &pentry->mem);
+	if (err != 0) {
 		nvgpu_kfree(g, pentry);
+
+		/* Not enough contiguous space, but a direct
+		 * allocation may work
+		 */
+		if (err == -ENOMEM) {
+			return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
+		}
 		nvgpu_err(g, "Unable to DMA alloc!");
 		return -ENOMEM;
 	}
@@ -224,7 +235,8 @@ static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
 	 * This allocates the very first PD table in the set of tables in this
 	 * nvgpu_pd_mem_entry.
 	 */
-	pentry->alloc_map = 1;
+	set_bit(0U, pentry->alloc_map);
+	pentry->allocs = 1;
 
 	/*
 	 * Now update the nvgpu_gmmu_pd to reflect this allocation.
@@ -246,20 +258,21 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
 {
 	unsigned long bit_offs;
 	u32 mem_offs;
-	u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry);
+	u32 nr_bits = nvgpu_pd_cache_get_nr_entries(pentry);
 
 	/*
 	 * Find and allocate an open PD.
 	 */
-	bit_offs = ffz(pentry->alloc_map);
+	bit_offs = find_first_zero_bit(pentry->alloc_map, nr_bits);
 	mem_offs = bit_offs * pentry->pd_size;
 
 	/* Bit map full. Somethings wrong. */
-	if (WARN_ON(bit_offs >= ffz(pentry_mask))) {
+	if (WARN_ON(bit_offs >= nr_bits)) {
 		return -ENOMEM;
 	}
 
-	pentry->alloc_map |= 1 << bit_offs;
+	set_bit(bit_offs, pentry->alloc_map);
+	pentry->allocs++;
 
 	pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs);
 
@@ -273,7 +286,7 @@ static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
 	/*
 	 * Now make sure the pentry is in the correct list (full vs partial).
 	 */
-	if ((pentry->alloc_map & pentry_mask) == pentry_mask) {
+	if (pentry->allocs >= nr_bits) {
 		pd_dbg(g, "Adding pentry to full list!");
 		nvgpu_list_del(&pentry->list_entry);
 		nvgpu_list_add(&pentry->list_entry,
@@ -314,7 +327,7 @@ static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
 	pd_dbg(g, "PD-Alloc [C] %u bytes", bytes);
 
 	if ((bytes & (bytes - 1U)) != 0U ||
-	    (bytes >= PAGE_SIZE ||
+	    (bytes >= NVGPU_PD_CACHE_SIZE ||
 	     bytes < NVGPU_PD_CACHE_MIN)) {
 		pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes);
 		return -EINVAL;
@@ -339,16 +352,18 @@ static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
  * cache logistics. Since on Parker and later GPUs some of the page directories
  * are smaller than a page packing these PDs together saves a lot of memory.
  */
-int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
+int nvgpu_pd_alloc(struct vm_gk20a *vm,
+		   struct nvgpu_gmmu_pd *pd,
+		   u32 bytes)
 {
 	struct gk20a *g = gk20a_from_vm(vm);
 	int err;
 
 	/*
-	 * Simple case: PD is bigger than a page so just do a regular DMA
-	 * alloc.
+	 * Simple case: PD is bigger than or equal to NVGPU_PD_CACHE_SIZE so
+	 * just do a regular DMA alloc.
 	 */
-	if (bytes >= PAGE_SIZE) {
+	if (bytes >= NVGPU_PD_CACHE_SIZE) {
 		err = nvgpu_pd_cache_alloc_direct(g, pd, bytes);
 		if (err) {
 			return err;
@@ -396,17 +411,28 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
 				   struct nvgpu_pd_mem_entry *pentry,
 				   struct nvgpu_gmmu_pd *pd)
 {
-	u32 index = pd->mem_offs / pentry->pd_size;
-	u32 bit = 1 << index;
+	u32 bit = pd->mem_offs / pentry->pd_size;
 
 	/* Mark entry as free. */
-	pentry->alloc_map &= ~bit;
+	clear_bit(bit, pentry->alloc_map);
+	pentry->allocs--;
 
-	if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) {
+	if (pentry->allocs > 0U) {
 		/*
 		 * Partially full still. If it was already on the partial list
 		 * this just re-adds it.
+		 *
+		 * Since the memory used for the entries is still mapped, if
+		 * igpu make sure the entries are invalidated so that the hw
+		 * doesn't accidentally try to prefetch non-existent fb memory.
+		 *
+		 * TBD: what about dgpu? (Not supported in Drive 5.0)
 		 */
+		if (pd->mem->cpu_va != NULL) {
+			memset((void *)((u64)pd->mem->cpu_va + pd->mem_offs), 0,
+				pentry->pd_size);
+		}
+
 		nvgpu_list_del(&pentry->list_entry);
 		nvgpu_list_add(&pentry->list_entry,
 			&cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);
@@ -414,6 +440,8 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
 		/* Empty now so free it. */
 		nvgpu_pd_cache_free_mem_entry(g, cache, pentry);
 	}
+
+	pd->mem = NULL;
 }
 
 static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up(
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index 8e1eeedc..ee2207b8 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 
 /*
  * This is the GMMU API visible to blocks outside of the GMMU. Basically this
@@ -56,18 +57,25 @@ enum gk20a_mem_rw_flag {
  */
 #define NVGPU_PD_CACHE_MIN		256U
 #define NVGPU_PD_CACHE_MIN_SHIFT	9U
-#define NVGPU_PD_CACHE_COUNT		4U
+#define NVGPU_PD_CACHE_COUNT		8U
+#define NVGPU_PD_CACHE_SIZE (NVGPU_PD_CACHE_MIN * (1U << NVGPU_PD_CACHE_COUNT))
 
 struct nvgpu_pd_mem_entry {
 	struct nvgpu_mem		mem;
 
 	/*
-	 * Size of the page directories (not the mem). bmap is a bitmap showing
-	 * which PDs have been allocated. The size of mem will always be one
-	 * page. pd_size will always be a power of 2.
+	 * Size of the page directories (not the mem). alloc_map is a bitmap
+	 * showing which PDs have been allocated.
+	 *
+	 * The size of mem will be NVGPU_PD_CACHE_SIZE
+	 * and pd_size will always be a power of 2.
+	 *
 	 */
 	u32				pd_size;
-	unsigned long			alloc_map;
+	DECLARE_BITMAP(alloc_map, NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN);
+
+	/* Total number of allocations in this PD. */
+	u32				allocs;
 
 	struct nvgpu_list_node		list_entry;
 	struct nvgpu_rbtree_node	tree_entry;
@@ -251,7 +259,10 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
 		      struct nvgpu_mem *mem,
 		      u64 gpu_va);
 
-int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes);
+int nvgpu_pd_alloc(struct vm_gk20a *vm,
+		   struct nvgpu_gmmu_pd *pd,
+		   u32 bytes);
+
 void nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd);
 int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
 				struct nvgpu_gmmu_pd *pd, u32 bytes);
--
cgit v1.2.2
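
The size-class arithmetic introduced by this patch can be checked outside the
driver. The following is a standalone C sketch, not nvgpu code: it only copies
the NVGPU_PD_CACHE_* macro values from the diff and re-implements the list
index computation of nvgpu_pd_cache_nr() in plain C (pd_cache_nr() below is a
hypothetical stand-in using a loop where the kernel uses ilog2()). It shows
that the new 64K slab spans eight size classes, from 256 PDs of 256 bytes down
to 2 PDs of 32K.

/*
 * Standalone sketch of the PD cache size-class arithmetic; illustrative
 * userspace code, not part of nvgpu.
 */
#include <stdio.h>
#include <stdint.h>

#define NVGPU_PD_CACHE_MIN       256U
#define NVGPU_PD_CACHE_MIN_SHIFT 9U
#define NVGPU_PD_CACHE_COUNT     8U
#define NVGPU_PD_CACHE_SIZE      (NVGPU_PD_CACHE_MIN * (1U << NVGPU_PD_CACHE_COUNT))

/* Stand-in for nvgpu_pd_cache_nr(): full/partial list index for a PD size. */
static uint32_t pd_cache_nr(uint32_t bytes)
{
	uint32_t v = bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1U);
	uint32_t nr = 0U;

	/* Loop-based ilog2(); the kernel uses the ilog2() helper instead. */
	while (v > 1U) {
		v >>= 1U;
		nr++;
	}
	return nr;
}

int main(void)
{
	uint32_t bytes;

	printf("NVGPU_PD_CACHE_SIZE = %u bytes\n", NVGPU_PD_CACHE_SIZE);

	/* Cached PD sizes are powers of two in [NVGPU_PD_CACHE_MIN, NVGPU_PD_CACHE_SIZE). */
	for (bytes = NVGPU_PD_CACHE_MIN; bytes < NVGPU_PD_CACHE_SIZE; bytes <<= 1U) {
		printf("pd_size %5u: list index %u, %3u PDs per 64K slab\n",
		       bytes, pd_cache_nr(bytes), NVGPU_PD_CACHE_SIZE / bytes);
	}
	return 0;
}

With NVGPU_PD_CACHE_COUNT raised from 4 to 8, the eight full/partial list pairs
now cover 256-byte through 32K PDs, and the alloc_map bitmap in
nvgpu_pd_mem_entry (NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN = 256 bits) is
exactly large enough for the smallest size class.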
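
For readers unfamiliar with the bitmap bookkeeping the patch switches to, here
is a minimal userspace sketch of the per-slab state, again not nvgpu code.
struct pd_mem_entry, pd_alloc(), pd_free(), nr_entries(), CACHE_SIZE and
MAX_ENTRIES below are hypothetical stand-ins that only mirror
nvgpu_pd_cache_alloc_from_partial() and nvgpu_pd_cache_do_free(), with a plain
bool array in place of the kernel's DECLARE_BITMAP()/set_bit()/clear_bit()/
find_first_zero_bit() helpers. It shows the allocs counter deciding between
the partial and full lists, and the memset-to-zero invalidation of a freed PD
that the commit message calls out.

/*
 * Minimal userspace sketch of the per-slab bookkeeping used by the PD cache;
 * hypothetical stand-ins, not nvgpu code.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>

#define CACHE_SIZE  65536U  /* stands in for NVGPU_PD_CACHE_SIZE */
#define MAX_ENTRIES 256U    /* NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN */

/* Simplified stand-in for struct nvgpu_pd_mem_entry. */
struct pd_mem_entry {
	uint8_t  backing[CACHE_SIZE];    /* stands in for the DMA-allocated slab */
	bool     alloc_map[MAX_ENTRIES]; /* the kernel uses DECLARE_BITMAP() */
	uint32_t allocs;                 /* live PDs in this slab */
	uint32_t pd_size;                /* power of two, >= 256 */
};

static uint32_t nr_entries(const struct pd_mem_entry *e)
{
	return CACHE_SIZE / e->pd_size;
}

/* Mirrors nvgpu_pd_cache_alloc_from_partial(): find a clear bit and set it. */
static int pd_alloc(struct pd_mem_entry *e, uint32_t *mem_offs)
{
	uint32_t i, n = nr_entries(e);

	for (i = 0U; i < n; i++) {
		if (!e->alloc_map[i]) {
			e->alloc_map[i] = true;
			e->allocs++;
			*mem_offs = i * e->pd_size;
			/* The caller moves the slab to the full list once allocs == n. */
			return 0;
		}
	}
	return -1; /* bitmap full */
}

/* Mirrors nvgpu_pd_cache_do_free(): clear the bit and invalidate the PD. */
static void pd_free(struct pd_mem_entry *e, uint32_t mem_offs)
{
	uint32_t bit = mem_offs / e->pd_size;

	e->alloc_map[bit] = false;
	e->allocs--;

	if (e->allocs > 0U) {
		/*
		 * Slab still partially in use: zero the freed PD so stale
		 * entries cannot be prefetched, as the patch does on iGPU.
		 */
		memset(&e->backing[mem_offs], 0, e->pd_size);
	}
	/* else: the caller frees the whole slab. */
}

int main(void)
{
	static struct pd_mem_entry e = { .pd_size = 4096U };
	uint32_t offs = 0U;

	if (pd_alloc(&e, &offs) == 0) {
		printf("allocated PD at offset %u (1 of %u per slab)\n",
		       offs, nr_entries(&e));
	}
	pd_free(&e, offs);
	return 0;
}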