path: root/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
author		Alex Waterman <alexw@nvidia.com>	2015-03-18 16:33:09 -0400
committer	Terje Bergstrom <tbergstrom@nvidia.com>	2015-05-11 11:53:25 -0400
commit		a2e852364582e9c337f52bc53ccc33877c8f3b47 (patch)
tree		fb13c5ad80db8eb2424a753a92389c7a3a322a12 /drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
parent		0566aee853eb32f4f796499b6b00ddf0f1d7de34 (diff)
gpu: nvgpu: New allocator for VA space
Implement a new buddy allocation scheme for the GPU's VA space. The bitmap allocator was using too much memory and is not a scalable solution as the GPU's address space keeps getting bigger. The buddy allocation scheme is much more memory efficient when the majority of the address space is not allocated.

The buddy allocator is not constrained by the notion of a split address space. The bitmap allocator could only manage either small pages or large pages, but not both at the same time; thus the bottom of the address space was for small pages and the top for large pages. Although that split is not removed quite yet, the new allocator enables that to happen.

The buddy allocator is also very scalable. It can manage spaces ranging from the relatively small comptag space to the enormous GPU VA space, and everything in between. This is important since the GPU has many differently sized spaces that need managing.

Currently there are certain limitations. For one, the allocator does not handle fixed allocations from CUDA very well. It can do so, but with certain caveats: the PTE page size is always set to small, which means the buddy allocator may place other small page allocations in the buddies around the fixed allocation. It does this to avoid having large and small page allocations in the same PDE.

Change-Id: I501cd15af03611536490137331d43761c402c7f9
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/740694
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
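For background on the scheme itself (not part of this change): a buddy allocator rounds every request up to a power-of-two multiple of the order-0 block size and recursively splits larger buddies until a free buddy of that order exists. The sketch below only illustrates that order computation under those assumptions; the function names are invented and the real logic lives in the gk20a_allocator implementation, not in this header.

/*
 * Illustration only: map a requested length to a buddy order, given the
 * size of an order-0 block. Hypothetical names; not taken from the driver.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t order_for_len(uint64_t len, uint64_t blk_size)
{
	uint64_t blks = (len + blk_size - 1) / blk_size; /* Round up to whole blocks. */
	uint64_t order = 0;

	while ((1ULL << order) < blks)	/* Smallest power of two covering the request. */
		order++;

	return order;
}

int main(void)
{
	/* With 4 KiB order-0 blocks, a 20 KiB request covers 5 blocks -> order 3. */
	printf("order = %llu\n",
	       (unsigned long long)order_for_len(20 * 1024, 4 * 1024));
	return 0;
}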
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/gk20a_allocator.h')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/gk20a_allocator.h	| 213
1 file changed, 164 insertions(+), 49 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
index 69a227bd..e86e053b 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -17,75 +17,190 @@
 #ifndef GK20A_ALLOCATOR_H
 #define GK20A_ALLOCATOR_H
 
+#include <linux/list.h>
 #include <linux/rbtree.h>
-#include <linux/rwsem.h>
-#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/platform_device.h>
 
 /* #define ALLOCATOR_DEBUG */
 
-/* main struct */
+/*
+ * Each buddy is an element in a binary tree.
+ */
+struct gk20a_buddy {
+	struct gk20a_buddy *parent;	/* Parent node. */
+	struct gk20a_buddy *buddy;	/* This node's buddy. */
+	struct gk20a_buddy *left;	/* Lower address sub-node. */
+	struct gk20a_buddy *right;	/* Higher address sub-node. */
+
+	struct list_head buddy_entry;	/* List entry for various lists. */
+	struct rb_node alloced_entry;	/* RB tree of allocations. */
+
+	u64 start;			/* Start address of this buddy. */
+	u64 end;			/* End address of this buddy. */
+	u64 order;			/* Buddy order. */
+
+#define BALLOC_BUDDY_ALLOCED	0x1
+#define BALLOC_BUDDY_SPLIT	0x2
+#define BALLOC_BUDDY_IN_LIST	0x4
+	int flags;			/* List of associated flags. */
+
+	/*
+	 * Size of the PDE this buddy is using. This allows for grouping like
+	 * sized allocations into the same PDE.
+	 */
+#define BALLOC_PTE_SIZE_ANY	0x0
+#define BALLOC_PTE_SIZE_SMALL	0x1
+#define BALLOC_PTE_SIZE_BIG	0x2
+	int pte_size;
+};
+
+#define __buddy_flag_ops(flag, flag_up)					\
+	static inline int buddy_is_ ## flag(struct gk20a_buddy *b)	\
+	{								\
+		return b->flags & BALLOC_BUDDY_ ## flag_up;		\
+	}								\
+	static inline void buddy_set_ ## flag(struct gk20a_buddy *b)	\
+	{								\
+		b->flags |= BALLOC_BUDDY_ ## flag_up;			\
+	}								\
+	static inline void buddy_clr_ ## flag(struct gk20a_buddy *b)	\
+	{								\
+		b->flags &= ~BALLOC_BUDDY_ ## flag_up;			\
+	}
+
+/*
+ * int  buddy_is_alloced(struct gk20a_buddy *b);
+ * void buddy_set_alloced(struct gk20a_buddy *b);
+ * void buddy_clr_alloced(struct gk20a_buddy *b);
+ *
+ * int  buddy_is_split(struct gk20a_buddy *b);
+ * void buddy_set_split(struct gk20a_buddy *b);
+ * void buddy_clr_split(struct gk20a_buddy *b);
+ *
+ * int  buddy_is_in_list(struct gk20a_buddy *b);
+ * void buddy_set_in_list(struct gk20a_buddy *b);
+ * void buddy_clr_in_list(struct gk20a_buddy *b);
+ */
+__buddy_flag_ops(alloced, ALLOCED);
+__buddy_flag_ops(split, SPLIT);
+__buddy_flag_ops(in_list, IN_LIST);
+
+/*
+ * Keeps info for a fixed allocation.
+ */
+struct gk20a_fixed_alloc {
+	struct list_head buddies;	/* List of buddies. */
+	struct rb_node alloced_entry;	/* RB tree of fixed allocations. */
+
+	u64 start;			/* Start of fixed block. */
+	u64 end;			/* End address. */
+};
+
+struct vm_gk20a;
+
+/*
+ * GPU buddy allocator for the various GPU address spaces. Each addressable unit
+ * doesn't have to correspond to a byte. In some cases each unit is a more
+ * complex object such as a comp_tag line or the like.
+ *
+ * The max order is computed based on the size of the minimum order and the size
+ * of the address space.
+ *
+ * order_size is the size of an order 0 buddy.
+ */
 struct gk20a_allocator {
 
-	char name[32];			/* name for allocator */
-	struct rb_root rb_root;		/* rb tree root for blocks */
+	struct vm_gk20a *vm;		/* Parent VM - can be NULL. */
 
-	u32 base;			/* min value of this linear space */
-	u32 limit;			/* max value = limit - 1 */
+	char name[32];			/* Name of allocator. */
 
-	unsigned long *bitmap;		/* bitmap */
+	u64 base;			/* Base address of the space. */
+	u64 length;			/* Length of the space. */
+	u64 blk_size;			/* Size of order 0 allocation. */
+	u64 blk_shift;			/* Shift to divide by blk_size. */
 
-	struct gk20a_alloc_block *block_first;	/* first block in list */
-	struct gk20a_alloc_block *block_recent;	/* last visited block */
+	int init;			/* Non-zero if initialized. */
 
-	u32 first_free_addr;		/* first free addr, non-contigous
-					   allocation preferred start,
-					   in order to pick up small holes */
-	u32 last_free_addr;		/* last free addr, contiguous
-					   allocation preferred start */
-	u32 cached_hole_size;		/* max free hole size up to
-					   last_free_addr */
-	u32 block_count;		/* number of blocks */
+	/* Internal stuff. */
+	u64 start;			/* Real start (aligned to blk_size). */
+	u64 end;			/* Real end, trimmed if needed. */
+	u64 count;			/* Count of objects in space. */
+	u64 blks;			/* Count of blks in the space. */
+	u64 max_order;			/* Specific maximum order. */
 
-	struct rw_semaphore rw_sema;	/* lock */
-	struct kmem_cache *block_cache;	/* slab cache */
+	struct rb_root alloced_buddies;	/* Outstanding allocations. */
+	struct rb_root fixed_allocs;	/* Outstanding fixed allocations. */
 
-	/* if enabled, constrain to [base, limit) */
-	struct {
-		bool enable;
-		u32 base;
-		u32 limit;
-	} constraint;
+	struct mutex lock;		/* Protects buddy access. */
 
-	int (*alloc)(struct gk20a_allocator *allocator,
-			u32 *addr, u32 len, u32 align);
-	int (*free)(struct gk20a_allocator *allocator,
-			u32 addr, u32 len, u32 align);
+#define GPU_BALLOC_GVA_SPACE	0x1
+	u64 flags;
 
-};
+	/*
+	 * Impose an upper bound on the maximum order.
+	 */
+#define GPU_BALLOC_MAX_ORDER		31
+#define GPU_BALLOC_ORDER_LIST_LEN	(GPU_BALLOC_MAX_ORDER + 1)
 
-int  gk20a_allocator_init(struct gk20a_allocator *allocator,
-			const char *name, u32 base, u32 size);
-void gk20a_allocator_destroy(struct gk20a_allocator *allocator);
+	struct list_head buddy_list[GPU_BALLOC_ORDER_LIST_LEN];
+	u64 buddy_list_len[GPU_BALLOC_ORDER_LIST_LEN];
+	u64 buddy_list_split[GPU_BALLOC_ORDER_LIST_LEN];
+	u64 buddy_list_alloced[GPU_BALLOC_ORDER_LIST_LEN];
 
-int  gk20a_allocator_block_alloc(struct gk20a_allocator *allocator,
-			u32 *addr, u32 len, u32 align);
+	/*
+	 * This is for when the allocator is managing a GVA space (the
+	 * GPU_BALLOC_GVA_SPACE bit is set in @flags). This requires
+	 * that we group like sized allocations into PDE blocks.
+	 */
+	u64 pte_blk_order;
 
-int  gk20a_allocator_block_free(struct gk20a_allocator *allocator,
-			u32 addr, u32 len, u32 align);
+	struct dentry *debugfs_entry;
 
-#if defined(ALLOCATOR_DEBUG)
+	u64 bytes_alloced;
+	u64 bytes_alloced_real;
+	u64 bytes_freed;
+};
 
-#define allocator_dbg(alloctor, format, arg...) \
-do {				\
-	if (1)			\
-		pr_debug("gk20a_allocator (%s) %s: " format "\n",\
-			alloctor->name, __func__, ##arg);\
-} while (0)
+#define balloc_lock(a)   mutex_lock(&(a)->lock)
+#define balloc_unlock(a) mutex_unlock(&(a)->lock)
 
-#else /* ALLOCATOR_DEBUG */
+#define balloc_get_order_list(a, order)	(&(a)->buddy_list[(order)])
+#define balloc_order_to_len(a, order)	((1 << order) * (a)->blk_size)
+#define balloc_base_shift(a, base)	((base) - (a)->start)
+#define balloc_base_unshift(a, base)	((base) + (a)->start)
 
-#define allocator_dbg(format, arg...)
+int  gk20a_allocator_init(struct gk20a_allocator *allocator,
+			  const char *name, u64 base, u64 size, u64 order0);
+int  __gk20a_allocator_init(struct gk20a_allocator *allocator,
+			    struct vm_gk20a *vm, const char *name,
+			    u64 base, u64 size, u64 order0,
+			    u64 max_order, u64 flags);
+void gk20a_allocator_destroy(struct gk20a_allocator *allocator);
 
-#endif /* ALLOCATOR_DEBUG */
+/*
+ * Normal alloc/free operations for the buddy allocator.
+ */
+u64  gk20a_balloc(struct gk20a_allocator *allocator, u64 len);
+void gk20a_bfree(struct gk20a_allocator *allocator, u64 addr);
+
+/*
+ * Special interface to allocate a memory regions with a specific starting
+ * address. Yikes.
+ */
+u64  gk20a_balloc_fixed(struct gk20a_allocator *allocator, u64 base, u64 len);
+
+/*
+ * Debugfs init.
+ */
+void gk20a_alloc_debugfs_init(struct platform_device *pdev);
+
+#if defined(ALLOCATOR_DEBUG)
+#define balloc_dbg(alloctor, format, arg...)		\
+	pr_info("%-25s %25s() " format,			\
+		alloctor->name, __func__, ##arg)
+#else
+#define balloc_dbg(allocator, format, arg...)
+#endif
 
 #endif /* GK20A_ALLOCATOR_H */
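Taken together, the new interface reduces to an init/alloc/free/destroy lifecycle. The fragment below is a hypothetical caller written only to illustrate the prototypes declared in this header: the base, size, and order-0 values are invented, and it assumes gk20a_allocator_init() returns 0 on success and that a zero return from gk20a_balloc() means the request could not be satisfied.

/*
 * Hypothetical usage of the interface declared above; not code from the
 * driver. Assumes a 1 GiB space managed in 4 KiB order-0 blocks and that
 * a zero return from gk20a_balloc() signals failure.
 */
#include <linux/types.h>
#include <linux/sizes.h>
#include "gk20a_allocator.h"

static int example_balloc_usage(void)
{
	struct gk20a_allocator va;
	u64 addr;
	int err;

	err = gk20a_allocator_init(&va, "example-va",
				   SZ_1G /* base */, SZ_1G /* size */,
				   SZ_4K /* order-0 block size */);
	if (err)
		return err;

	addr = gk20a_balloc(&va, SZ_64K);	/* Rounded up to a suitable buddy. */
	if (!addr) {
		gk20a_allocator_destroy(&va);
		return -ENOMEM;
	}

	gk20a_bfree(&va, addr);
	gk20a_allocator_destroy(&va);
	return 0;
}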