slob: initial NUMA support

This adds preliminary NUMA support to SLOB, primarily aimed at systems with small nodes (tested all the way down to a 128kB SRAM block), whether asymmetric or otherwise.

We follow the same conventions as SLAB/SLUB, preferring current node placement for new pages, or with explicit placement, if a node has been specified. Presently on UP NUMA this has the side-effect of preferring node#0 allocations (since numa_node_id() == 0, though this could be reworked if we could hand off a pfn to determine node placement), so single-CPU NUMA systems will want to place smaller nodes further out in terms of node id. Once a page has been bound to a node (via explicit node id typing), we only do block allocations from partial free pages that have a matching node id in the page flags.

The current implementation does have some scalability problems, in that all partial free pages are tracked in the global freelist (with contention due to the single spinlock). However, these are things that are being reworked for SMP scalability first, while things like per-node freelists can easily be built on top of this sort of functionality once it's been added.

More background can be found in:

    http://marc.info/?l=linux-mm&m=118117916022379&w=2
    http://marc.info/?l=linux-mm&m=118170446306199&w=2
    http://marc.info/?l=linux-mm&m=118187859420048&w=2

and subsequent threads.

Acked-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Paul Mundt <lethal@linux-sh.org> 2007-07-16 02:38:22 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-07-16 12:05:36 -0400
commit: 6193a2ff180920f84ee06977165ebf32431fc2d2 (patch)
tree: d3c6423c50463ea741080a58a2e654cf103431f3 /mm/slob.c
parent: f7977793240d836e60ff413e94e6914f08e10941 (diff)
1 files changed, 55 insertions, 17 deletions
diff --git a/mm/slob.c b/mm/slob.c
index 06e5e725fab..b99b0ef2347 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -3,6 +3,8 @@
 *
 * Matt Mackall <mpm@selenic.com> 12/30/03
 *
+ * NUMA support by Paul Mundt, 2007.
+ *
 * How SLOB works:
 *
 * The core of SLOB is a traditional K&R style heap allocator, with
@@ -10,7 +12,7 @@
 * allocator is as little as 2 bytes, however typically most architectures
 * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
 *
- * The slob heap is a linked list of pages from __get_free_page, and
+ * The slob heap is a linked list of pages from alloc_pages(), and
 * within each page, there is a singly-linked list of free blocks (slob_t).
 * The heap is grown on demand and allocation from the heap is currently
 * first-fit.
@@ -18,7 +20,7 @@
 * Above this is an implementation of kmalloc/kfree. Blocks returned
 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
- * __get_free_pages directly, allocating compound pages so the page order
+ * alloc_pages() directly, allocating compound pages so the page order
 * does not have to be separately tracked, and also stores the exact
 * allocation size in page->private so that it can be used to accurately
 * provide ksize(). These objects are detected in kfree() because slob_page()
@@ -29,10 +31,23 @@
 * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
 * case the low-level allocator will fragment blocks to create the proper
 * alignment. Again, objects of page-size or greater are allocated by
- * calling __get_free_pages. As SLAB objects know their size, no separate
+ * calling alloc_pages(). As SLAB objects know their size, no separate
 * size bookkeeping is necessary and there is essentially no allocation
 * space overhead, and compound pages aren't needed for multi-page
 * allocations.
+ *
+ * NUMA support in SLOB is fairly simplistic, pushing most of the real
+ * logic down to the page allocator, and simply doing the node accounting
+ * on the upper levels. In the event that a node id is explicitly
+ * provided, alloc_pages_node() with the specified node id is used
+ * instead. The common case (or when the node id isn't explicitly provided)
+ * will default to the current node, as per numa_node_id().
+ *
+ * Node aware pages are still inserted into the global freelist, and
+ * these are scanned for by matching against the node id encoded in the
+ * page flags. As a result, block allocations that can be satisfied from
+ * the freelist will only be done so on pages residing on the same node,
+ * in order to prevent random node placement.
 */
 #include <linux/kernel.h>
@@ -204,6 +219,23 @@ static int slob_last(slob_t *s)
        return !((unsigned long)slob_next(s) & ~PAGE_MASK);
 }
+static void *slob_new_page(gfp_t gfp, int order, int node)
+{
+        void *page;
+#ifdef CONFIG_NUMA
+        if (node != -1)
+                page = alloc_pages_node(node, gfp, order);
+        else
+#endif
+                page = alloc_pages(gfp, order);
+        if (!page)
+                return NULL;
+        return page_address(page);
+}
 /*
 * Allocate a slob block within a given slob_page sp.
 */
@@ -258,7 +290,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
 /*
 * slob_alloc: entry point into the slob allocator.
 */
-static void *slob_alloc(size_t size, gfp_t gfp, int align)
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 {
        struct slob_page *sp;
        slob_t *b = NULL;
@@ -267,6 +299,15 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
        spin_lock_irqsave(&slob_lock, flags);
        /* Iterate through each partially free page, try to find room */
        list_for_each_entry(sp, &free_slob_pages, list) {
+#ifdef CONFIG_NUMA
+                /*
+                 * If there's a node specification, search for a partial
+                 * page with a matching node id in the freelist.
+                 */
+                if (node != -1 && page_to_nid(&sp->page) != node)
+                        continue;
+#endif
                if (sp->units >= SLOB_UNITS(size)) {
                        b = slob_page_alloc(sp, size, align);
                        if (b)
@@ -277,7 +318,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
        /* Not enough space: must allocate a new page */
        if (!b) {
-                b = (slob_t *)__get_free_page(gfp);
+                b = slob_new_page(gfp, 0, node);
                if (!b)
                        return 0;
                sp = (struct slob_page *)virt_to_page(b);
@@ -381,22 +422,20 @@ out:
 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
 #endif
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
-void *__kmalloc(size_t size, gfp_t gfp)
 {
        int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
        if (size < PAGE_SIZE - align) {
                unsigned int *m;
-                m = slob_alloc(size + align, gfp, align);
+                m = slob_alloc(size + align, gfp, align, node);
                if (m)
                        *m = size;
                return (void *)m + align;
        } else {
                void *ret;
-                ret = (void *) __get_free_pages(gfp | __GFP_COMP,
+                ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
-                                                get_order(size));
                if (ret) {
                        struct page *page;
                        page = virt_to_page(ret);
@@ -405,7 +444,7 @@ void *__kmalloc(size_t size, gfp_t gfp)
                return ret;
        }
 }
-EXPORT_SYMBOL(__kmalloc);
+EXPORT_SYMBOL(__kmalloc_node);
 /**
 * krealloc - reallocate memory. The contents will remain unchanged.
@@ -455,7 +494,6 @@ void kfree(const void *block)
        } else
                put_page(&sp->page);
 }
 EXPORT_SYMBOL(kfree);
 /* can't use ksize for kmem_cache_alloc memory, only kmalloc */
@@ -487,7 +525,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 {
        struct kmem_cache *c;
-        c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
+        c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
        if (c) {
                c->name = name;
@@ -517,21 +555,21 @@ void kmem_cache_destroy(struct kmem_cache *c)
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
-void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 {
        void *b;
        if (c->size < PAGE_SIZE)
-                b = slob_alloc(c->size, flags, c->align);
+                b = slob_alloc(c->size, flags, c->align, node);
        else
-                b = (void *)__get_free_pages(flags, get_order(c->size));
+                b = slob_new_page(flags, get_order(c->size), node);
        if (c->ctor)
                c->ctor(b, c, 0);
        return b;
 }
-EXPORT_SYMBOL(kmem_cache_alloc);
+EXPORT_SYMBOL(kmem_cache_alloc_node);
 void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
 {
author	Paul Mundt <lethal@linux-sh.org>	2007-07-16 02:38:22 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-07-16 12:05:36 -0400
commit	6193a2ff180920f84ee06977165ebf32431fc2d2 (patch)
tree	d3c6423c50463ea741080a58a2e654cf103431f3 /mm/slob.c
parent	f7977793240d836e60ff413e94e6914f08e10941 (diff)

diff --git a/mm/slob.c b/mm/slob.c
index 06e5e725fab..b99b0ef2347 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -3,6 +3,8 @@
3	*	3	*
4	* Matt Mackall <mpm@selenic.com> 12/30/03	4	* Matt Mackall <mpm@selenic.com> 12/30/03
5	*	5	*
		6	* NUMA support by Paul Mundt, 2007.
		7	*
6	* How SLOB works:	8	* How SLOB works:
7	*	9	*
8	* The core of SLOB is a traditional K&R style heap allocator, with	10	* The core of SLOB is a traditional K&R style heap allocator, with
@@ -10,7 +12,7 @@
10	* allocator is as little as 2 bytes, however typically most architectures	12	* allocator is as little as 2 bytes, however typically most architectures
11	* will require 4 bytes on 32-bit and 8 bytes on 64-bit.	13	* will require 4 bytes on 32-bit and 8 bytes on 64-bit.
12	*	14	*
13	* The slob heap is a linked list of pages from __get_free_page, and	15	* The slob heap is a linked list of pages from alloc_pages(), and
14	* within each page, there is a singly-linked list of free blocks (slob_t).	16	* within each page, there is a singly-linked list of free blocks (slob_t).
15	* The heap is grown on demand and allocation from the heap is currently	17	* The heap is grown on demand and allocation from the heap is currently
16	* first-fit.	18	* first-fit.
@@ -18,7 +20,7 @@
18	* Above this is an implementation of kmalloc/kfree. Blocks returned	20	* Above this is an implementation of kmalloc/kfree. Blocks returned
19	* from kmalloc are prepended with a 4-byte header with the kmalloc size.	21	* from kmalloc are prepended with a 4-byte header with the kmalloc size.
20	* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls	22	* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
21	* __get_free_pages directly, allocating compound pages so the page order	23	* alloc_pages() directly, allocating compound pages so the page order
22	* does not have to be separately tracked, and also stores the exact	24	* does not have to be separately tracked, and also stores the exact
23	* allocation size in page->private so that it can be used to accurately	25	* allocation size in page->private so that it can be used to accurately
24	* provide ksize(). These objects are detected in kfree() because slob_page()	26	* provide ksize(). These objects are detected in kfree() because slob_page()
@@ -29,10 +31,23 @@
29	* 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which	31	* 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
30	* case the low-level allocator will fragment blocks to create the proper	32	* case the low-level allocator will fragment blocks to create the proper
31	* alignment. Again, objects of page-size or greater are allocated by	33	* alignment. Again, objects of page-size or greater are allocated by
32	* calling __get_free_pages. As SLAB objects know their size, no separate	34	* calling alloc_pages(). As SLAB objects know their size, no separate
33	* size bookkeeping is necessary and there is essentially no allocation	35	* size bookkeeping is necessary and there is essentially no allocation
34	* space overhead, and compound pages aren't needed for multi-page	36	* space overhead, and compound pages aren't needed for multi-page
35	* allocations.	37	* allocations.
		38	*
		39	* NUMA support in SLOB is fairly simplistic, pushing most of the real
		40	* logic down to the page allocator, and simply doing the node accounting
		41	* on the upper levels. In the event that a node id is explicitly
		42	* provided, alloc_pages_node() with the specified node id is used
		43	* instead. The common case (or when the node id isn't explicitly provided)
		44	* will default to the current node, as per numa_node_id().
		45	*
		46	* Node aware pages are still inserted into the global freelist, and
		47	* these are scanned for by matching against the node id encoded in the
		48	* page flags. As a result, block allocations that can be satisfied from
		49	* the freelist will only be done so on pages residing on the same node,
		50	* in order to prevent random node placement.
36	*/	51	*/
37		52
38	#include <linux/kernel.h>	53	#include <linux/kernel.h>
@@ -204,6 +219,23 @@ static int slob_last(slob_t *s)
204	return !((unsigned long)slob_next(s) & ~PAGE_MASK);	219	return !((unsigned long)slob_next(s) & ~PAGE_MASK);
205	}	220	}
206		221
		222	static void *slob_new_page(gfp_t gfp, int order, int node)
		223	{
		224	void *page;
		225
		226	#ifdef CONFIG_NUMA
		227	if (node != -1)
		228	page = alloc_pages_node(node, gfp, order);
		229	else
		230	#endif
		231	page = alloc_pages(gfp, order);
		232
		233	if (!page)
		234	return NULL;
		235
		236	return page_address(page);
		237	}
		238
207	/*	239	/*
208	* Allocate a slob block within a given slob_page sp.	240	* Allocate a slob block within a given slob_page sp.
209	*/	241	*/
@@ -258,7 +290,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
258	/*	290	/*
259	* slob_alloc: entry point into the slob allocator.	291	* slob_alloc: entry point into the slob allocator.
260	*/	292	*/
261	static void *slob_alloc(size_t size, gfp_t gfp, int align)	293	static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
262	{	294	{
263	struct slob_page *sp;	295	struct slob_page *sp;
264	slob_t *b = NULL;	296	slob_t *b = NULL;
@@ -267,6 +299,15 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
267	spin_lock_irqsave(&slob_lock, flags);	299	spin_lock_irqsave(&slob_lock, flags);
268	/* Iterate through each partially free page, try to find room */	300	/* Iterate through each partially free page, try to find room */
269	list_for_each_entry(sp, &free_slob_pages, list) {	301	list_for_each_entry(sp, &free_slob_pages, list) {
		302	#ifdef CONFIG_NUMA
		303	/*
		304	* If there's a node specification, search for a partial
		305	* page with a matching node id in the freelist.
		306	*/
		307	if (node != -1 && page_to_nid(&sp->page) != node)
		308	continue;
		309	#endif
		310
270	if (sp->units >= SLOB_UNITS(size)) {	311	if (sp->units >= SLOB_UNITS(size)) {
271	b = slob_page_alloc(sp, size, align);	312	b = slob_page_alloc(sp, size, align);
272	if (b)	313	if (b)
@@ -277,7 +318,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
277		318
278	/* Not enough space: must allocate a new page */	319	/* Not enough space: must allocate a new page */
279	if (!b) {	320	if (!b) {
280	b = (slob_t *)__get_free_page(gfp);	321	b = slob_new_page(gfp, 0, node);
281	if (!b)	322	if (!b)
282	return 0;	323	return 0;
283	sp = (struct slob_page *)virt_to_page(b);	324	sp = (struct slob_page *)virt_to_page(b);
@@ -381,22 +422,20 @@ out:
381	#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)	422	#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
382	#endif	423	#endif
383		424
384		425	void *__kmalloc_node(size_t size, gfp_t gfp, int node)
385	void *__kmalloc(size_t size, gfp_t gfp)
386	{	426	{
387	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);	427	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
388		428
389	if (size < PAGE_SIZE - align) {	429	if (size < PAGE_SIZE - align) {
390	unsigned int *m;	430	unsigned int *m;
391	m = slob_alloc(size + align, gfp, align);	431	m = slob_alloc(size + align, gfp, align, node);
392	if (m)	432	if (m)
393	*m = size;	433	*m = size;
394	return (void *)m + align;	434	return (void *)m + align;
395	} else {	435	} else {
396	void *ret;	436	void *ret;
397		437
398	ret = (void *) __get_free_pages(gfp | __GFP_COMP,	438	ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
399	get_order(size));
400	if (ret) {	439	if (ret) {
401	struct page *page;	440	struct page *page;
402	page = virt_to_page(ret);	441	page = virt_to_page(ret);
@@ -405,7 +444,7 @@ void *__kmalloc(size_t size, gfp_t gfp)
405	return ret;	444	return ret;
406	}	445	}
407	}	446	}
408	EXPORT_SYMBOL(__kmalloc);	447	EXPORT_SYMBOL(__kmalloc_node);
409		448
410	/**	449	/**
411	* krealloc - reallocate memory. The contents will remain unchanged.	450	* krealloc - reallocate memory. The contents will remain unchanged.
@@ -455,7 +494,6 @@ void kfree(const void *block)
455	} else	494	} else
456	put_page(&sp->page);	495	put_page(&sp->page);
457	}	496	}
458
459	EXPORT_SYMBOL(kfree);	497	EXPORT_SYMBOL(kfree);
460		498
461	/* can't use ksize for kmem_cache_alloc memory, only kmalloc */	499	/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
@@ -487,7 +525,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
487	{	525	{
488	struct kmem_cache *c;	526	struct kmem_cache *c;
489		527
490	c = slob_alloc(sizeof(struct kmem_cache), flags, 0);	528	c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
491		529
492	if (c) {	530	if (c) {
493	c->name = name;	531	c->name = name;
@@ -517,21 +555,21 @@ void kmem_cache_destroy(struct kmem_cache *c)
517	}	555	}
518	EXPORT_SYMBOL(kmem_cache_destroy);	556	EXPORT_SYMBOL(kmem_cache_destroy);
519		557
520	void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)	558	void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
521	{	559	{
522	void *b;	560	void *b;
523		561
524	if (c->size < PAGE_SIZE)	562	if (c->size < PAGE_SIZE)
525	b = slob_alloc(c->size, flags, c->align);	563	b = slob_alloc(c->size, flags, c->align, node);
526	else	564	else
527	b = (void *)__get_free_pages(flags, get_order(c->size));	565	b = slob_new_page(flags, get_order(c->size), node);
528		566
529	if (c->ctor)	567	if (c->ctor)
530	c->ctor(b, c, 0);	568	c->ctor(b, c, 0);
531		569
532	return b;	570	return b;
533	}	571	}
534	EXPORT_SYMBOL(kmem_cache_alloc);	572	EXPORT_SYMBOL(kmem_cache_alloc_node);
535		573
536	void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)	574	void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
537	{	575	{