path: root/drivers/gpu/nvgpu/include
author    Alex Waterman <alexw@nvidia.com>	2017-06-09 14:42:50 -0400
committer mobile promotions <svcmobile_promotions@nvidia.com>	2017-07-06 17:44:16 -0400
commit    583704620db88e391f6b14acc57af859a70127de (patch)
tree      8fc3becf2850b724e87011b0e0250c52d0efb7ee /drivers/gpu/nvgpu/include
parent    c1393d5b68e63c992f4c689cb788139fdf8c2f1a (diff)
gpu: nvgpu: Implement PD packing
In some cases page directories require less than a full page of memory. For
example, on Pascal, the final PD level for large pages is only 256 bytes, so
16 PDs can fit in a single page. Allocating an entire page for each of these
256 B PDs is extremely wasteful.

This patch alleviates the DMA memory wasted on small PDs by packing multiple
small PDs into a single page. The packing is implemented as a slab allocator:
each page is a slab, and multiple PD instances can be allocated from each
page.

Several modifications to the nvgpu_gmmu_pd struct were also needed to support
this. The nvgpu_mem is now a pointer, and there is an explicit offset into the
nvgpu_mem struct so that each nvgpu_gmmu_pd knows what portion of the memory
it is using. The nvgpu_pde_phys_addr() and pd_write() functions also required
changes, since the PD is no longer always situated at the start of the
nvgpu_mem.

Initialization and cleanup of the page tables for each VM was slightly
modified to work through the new pd_cache implementation.

Some PDs (e.g. the PDB), despite not being a full page, still require a full
page for alignment purposes (HW requirements). Thus a direct allocation method
for PDs is still provided. This is also used when a PD that could in principle
be cached is larger than a page.

Lastly, a new debug flag was added for the pd_cache code.

JIRA NVGPU-30

Change-Id: I64c8037fc356783c1ef203cc143c4d71bbd5d77c
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master/r/1506610
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
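To picture the packing scheme described above, here is a minimal, stand-alone C sketch (all names are hypothetical; only the 256 B PD size and the one-page slab come from the commit message). Each slab page holds PAGE_SIZE / pd_size slots, and a bitmap records which slots have been handed out:

/*
 * Illustrative sketch only -- not the nvgpu implementation. It shows the
 * slab-style packing idea: a 4 KiB page carved into power-of-two PD slots,
 * with a bitmap tracking which slots are in use.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE    4096u
#define PD_CACHE_MIN  256u	/* smallest PD size the cache would serve */

/* One slab page: which of its fixed-size slots are allocated. */
struct pd_slab {
	uint32_t pd_size;	/* power of two, >= PD_CACHE_MIN */
	uint32_t alloc_map;	/* bit n set => slot n in use */
};

/* Allocate one PD slot; returns byte offset into the page, or -1 if full. */
static int pd_slab_alloc(struct pd_slab *s)
{
	uint32_t slots = PAGE_SIZE / s->pd_size;

	for (uint32_t i = 0; i < slots; i++) {
		if (!(s->alloc_map & (1u << i))) {
			s->alloc_map |= 1u << i;
			return (int)(i * s->pd_size);
		}
	}
	return -1;
}

int main(void)
{
	struct pd_slab slab = { .pd_size = PD_CACHE_MIN, .alloc_map = 0 };

	/* 4096 / 256 = 16 PDs fit in one page, as the commit message notes. */
	for (int n = 0; n < 3; n++)
		printf("PD %d -> offset %d\n", n, pd_slab_alloc(&slab));
	return 0;
}

Running the sketch prints offsets 0, 256 and 512, i.e. three small PDs packed into a single page instead of three separate pages.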
Diffstat (limited to 'drivers/gpu/nvgpu/include')
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/gmmu.h  91
-rw-r--r--  drivers/gpu/nvgpu/include/nvgpu/log.h    1
2 files changed, 86 insertions, 6 deletions
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index 28a2cb82..eff87c31 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -19,6 +19,9 @@
 
 #include <nvgpu/types.h>
 #include <nvgpu/nvgpu_mem.h>
+#include <nvgpu/list.h>
+#include <nvgpu/rbtree.h>
+#include <nvgpu/lock.h>
 
 struct scatterlist;
 
@@ -45,14 +48,85 @@ enum gk20a_mem_rw_flag {
 };
 
 /*
+ * Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
+ * structure is of course depending on this. The MIN_SHIFT define is the right
+ * number of bits to shift to determine which list to use in the array of lists.
+ */
+#define NVGPU_PD_CACHE_MIN		256
+#define NVGPU_PD_CACHE_MIN_SHIFT	9
+#define NVGPU_PD_CACHE_COUNT		4
+
+struct nvgpu_pd_mem_entry {
+	struct nvgpu_mem mem;
+
+	/*
+	 * Size of the page directories (not the mem). bmap is a bitmap showing
+	 * which PDs have been allocated. The size of mem will always be one
+	 * page. pd_size will always be a power of 2.
+	 */
+	u32 pd_size;
+	unsigned long alloc_map;
+
+	struct nvgpu_list_node list_entry;
+	struct nvgpu_rbtree_node tree_entry;
+};
+
+static inline struct nvgpu_pd_mem_entry *
+nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
+{
+	return (struct nvgpu_pd_mem_entry *)
+		((uintptr_t)node -
+		 offsetof(struct nvgpu_pd_mem_entry, list_entry));
+};
+
+static inline struct nvgpu_pd_mem_entry *
+nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
+{
+	return (struct nvgpu_pd_mem_entry *)
+		((uintptr_t)node -
+		 offsetof(struct nvgpu_pd_mem_entry, tree_entry));
+};
+
+/*
+ * A cache for allocating PD memory from. This enables smaller PDs to be packed
+ * into single pages.
+ *
+ * This is fairly complex so see the documentation in pd_cache.c for a full
+ * description of how this is organized.
+ */
+struct nvgpu_pd_cache {
+	/*
+	 * Array of lists of full nvgpu_pd_mem_entries and partially full (or
+	 * empty) nvgpu_pd_mem_entries.
+	 */
+	struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
+	struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
+
+	/*
+	 * Tree of all allocated struct nvgpu_mem's for fast look up.
+	 */
+	struct nvgpu_rbtree_node *mem_tree;
+
+	/*
+	 * All access to the cache much be locked. This protects the lists and
+	 * the rb tree.
+	 */
+	struct nvgpu_mutex lock;
+};
+
+/*
  * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs
  * in the GMMU.
  */
 struct nvgpu_gmmu_pd {
 	/*
-	 * DMA memory describing the PTEs or PTEs.
+	 * DMA memory describing the PTEs or PDEs. @mem_offs describes the
+	 * offset of the PDE table in @mem. @cached specifies if this PD is
+	 * using pd_cache memory.
 	 */
-	struct nvgpu_mem mem;
+	struct nvgpu_mem *mem;
+	u32 mem_offs;
+	bool cached;
 
 	/*
 	 * List of pointers to the next level of page tables. Does not
@@ -66,7 +140,7 @@ struct nvgpu_gmmu_pd {
  * Reduce the number of arguments getting passed through the various levels of
  * GMMU mapping functions.
  *
- * The following fields are set statically and do not change throughout
+ * The following fields are set statically and do not change throughout the
  * mapping call:
  *
  * pgsz: Index into the page size table.
@@ -166,8 +240,13 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
 			struct nvgpu_mem *mem,
 			u64 gpu_va);
 
-void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
-			   struct nvgpu_gmmu_pd *entry);
+int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes);
+void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd);
+int __nvgpu_pd_cache_alloc_direct(struct gk20a *g,
+				  struct nvgpu_gmmu_pd *pd, u32 bytes);
+void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd);
+int nvgpu_pd_cache_init(struct gk20a *g);
+void nvgpu_pd_cache_fini(struct gk20a *g);
 
 /*
  * Some useful routines that are shared across chips.
@@ -181,7 +260,7 @@ static inline u32 pd_offset_from_index(const struct gk20a_mmu_level *l,
 static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
 			    size_t w, size_t data)
 {
-	nvgpu_mem_wr32(g, &pd->mem, w, data);
+	nvgpu_mem_wr32(g, pd->mem, (pd->mem_offs / sizeof(u32)) + w, data);
 }
 
 
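The pd_write() change above is pure offset arithmetic: a packed PD no longer starts at word 0 of its backing nvgpu_mem, so word w of the PD lives at word (mem_offs / 4) + w of the shared page. A small, stand-alone sketch of that arithmetic (stand-in types, not the nvgpu structs):

/*
 * Sketch of the offset arithmetic performed by the updated pd_write() --
 * illustrative only; fake_pd and fake_pd_write() are hypothetical stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_pd {
	uint32_t *mem;		/* backing page, shared by several PDs */
	uint32_t  mem_offs;	/* byte offset of this PD within the page */
};

static void fake_pd_write(struct fake_pd *pd, size_t w, uint32_t data)
{
	/* Word w of the PD = word (mem_offs / 4) + w of the page. */
	pd->mem[(pd->mem_offs / sizeof(uint32_t)) + w] = data;
}

int main(void)
{
	static uint32_t page[4096 / sizeof(uint32_t)];
	/* Third 256 B PD in the page: byte offset 2 * 256 = 512. */
	struct fake_pd pd = { .mem = page, .mem_offs = 512 };

	fake_pd_write(&pd, 0, 0xdeadbeef);
	printf("word index written: %zu\n", (size_t)(512 / 4));	/* 128 */
	return 0;
}

The direct-allocation path (__nvgpu_pd_cache_alloc_direct) presumably leaves mem_offs at zero, so the previous full-page behaviour falls out as the offset-zero case.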
diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h
index 3b8e6b19..a1110a59 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/log.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/log.h
@@ -68,6 +68,7 @@ enum nvgpu_log_categories {
 	gpu_dbg_xv       = BIT(17), /* XVE debugging. */
 	gpu_dbg_shutdown = BIT(18), /* GPU shutdown tracing. */
 	gpu_dbg_kmem     = BIT(19), /* Kmem tracking debugging. */
+	gpu_dbg_pd_cache = BIT(20), /* PD cache traces. */
 	gpu_dbg_mem      = BIT(31), /* memory accesses; very verbose. */
 };
 
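The new gpu_dbg_pd_cache bit gates pd_cache traces the same way the other categories do: a message is emitted only when its bit is set in the enabled mask. A stand-alone sketch of that gating follows (BIT(), the mask and log_dbg() below are stand-ins for the driver's own nvgpu_log() machinery, not the nvgpu API):

/*
 * Illustrative sketch of bitmask-gated debug categories. Everything here is
 * hypothetical except the bit positions, which mirror the enum above.
 */
#include <stdint.h>
#include <stdio.h>

#define BIT(n)            (1u << (n))
#define GPU_DBG_PD_CACHE  BIT(20)	/* mirrors gpu_dbg_pd_cache */
#define GPU_DBG_MEM       BIT(31)	/* mirrors gpu_dbg_mem */

static uint32_t log_mask = GPU_DBG_PD_CACHE;	/* only PD cache traces on */

#define log_dbg(category, fmt, ...) \
	do { \
		if (log_mask & (category)) \
			printf(fmt "\n", __VA_ARGS__); \
	} while (0)

int main(void)
{
	log_dbg(GPU_DBG_PD_CACHE, "pd_cache: new slab, pd_size=%u", 256u);
	log_dbg(GPU_DBG_MEM, "suppressed: BIT(%u) not in log_mask", 31u);
	return 0;
}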