 Documentation/kernel-parameters.txt |  6
 Documentation/vm/slub.txt           |  2
 mm/slab.c                           | 39
 mm/slub.c                           | 77
 4 files changed, 82 insertions(+), 42 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c92b1532f05a..a8d389d72405 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2395,6 +2395,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
 	slram=		[HW,MTD]
 
+	slab_max_order=	[MM, SLAB]
+			Determines the maximum allowed order for slabs.
+			A high setting may cause OOMs due to memory
+			fragmentation. Defaults to 1 for systems with
+			more than 32MB of RAM, 0 otherwise.
+
 	slub_debug[=options[,slabs]]	[MM, SLUB]
 			Enabling slub_debug allows one to determine the
 			culprit if slab objects become corrupted. Enabling
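
As a usage sketch (not part of the patch), the new parameter is passed on the kernel command line like any other boot option; the kernel image path and the other options below are only placeholders:

    linux /boot/vmlinuz root=/dev/sda1 ro slab_max_order=0
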
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
index f464f47bc60d..2acdda9601b0 100644
--- a/Documentation/vm/slub.txt
+++ b/Documentation/vm/slub.txt
@@ -117,7 +117,7 @@ can be influenced by kernel parameters:
 
 slub_min_objects=x		(default 4)
 slub_min_order=x		(default 0)
-slub_max_order=x		(default 1)
+slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
 
 slub_min_objects allows to specify how many objects must at least fit
 into one slab in order for the allocation order to be acceptable.
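
Higher orders matter because a larger slab packs more objects with less per-slab waste. A minimal user-space sketch of that relationship, assuming 4 KiB pages and ignoring the allocator's per-object metadata and alignment, is:

/*
 * Rough sketch of how the allocation order bounds objects per slab.
 * It only mirrors the general (PAGE_SIZE << order) / size relationship,
 * not SLUB's exact internal calculation.
 */
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL	/* assumption: 4 KiB pages */

static unsigned long objs_per_slab(unsigned long size, unsigned int order)
{
	return (SKETCH_PAGE_SIZE << order) / size;
}

int main(void)
{
	/* 700-byte objects: order 0 fits 5, order 3 fits 46, so higher
	 * orders waste less space for awkward object sizes. */
	for (unsigned int order = 0; order <= 3; order++)
		printf("order %u: %lu objects\n",
		       order, objs_per_slab(700, order));
	return 0;
}
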
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -481,11 +481,13 @@ EXPORT_SYMBOL(slab_buffer_size);
 #endif
 
 /*
- * Do not go above this order unless 0 objects fit into the slab.
+ * Do not go above this order unless 0 objects fit into the slab or
+ * overridden on the command line.
  */
-#define BREAK_GFP_ORDER_HI	1
-#define BREAK_GFP_ORDER_LO	0
-static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+#define SLAB_MAX_ORDER_HI	1
+#define SLAB_MAX_ORDER_LO	0
+static int slab_max_order = SLAB_MAX_ORDER_LO;
+static bool slab_max_order_set __initdata;
 
 /*
  * Functions for storing/retrieving the cachep and or slab from the page
@@ -854,6 +856,17 @@ static int __init noaliencache_setup(char *s)
 }
 __setup("noaliencache", noaliencache_setup);
 
+static int __init slab_max_order_setup(char *str)
+{
+	get_option(&str, &slab_max_order);
+	slab_max_order = slab_max_order < 0 ? 0 :
+				min(slab_max_order, MAX_ORDER - 1);
+	slab_max_order_set = true;
+
+	return 1;
+}
+__setup("slab_max_order=", slab_max_order_setup);
+
 #ifdef CONFIG_NUMA
 /*
  * Special reaping functions for NUMA systems called from cache_reap().
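
The handler clamps whatever value the user passes into the range [0, MAX_ORDER - 1]. A stand-alone sketch of that clamping, with MAX_ORDER hard-coded to 11 as an assumed typical configuration value (not taken from this patch), behaves as follows:

/* Sketch of the clamping done by slab_max_order_setup(), in user space. */
#include <stdio.h>

#define SKETCH_MAX_ORDER 11	/* assumption: configuration dependent in the kernel */

static int clamp_order(int requested)
{
	if (requested < 0)
		return 0;
	return requested < SKETCH_MAX_ORDER - 1 ? requested : SKETCH_MAX_ORDER - 1;
}

int main(void)
{
	int inputs[] = { -1, 0, 2, 64 };

	/* prints 0, 0, 2 and 10 with the assumed MAX_ORDER of 11 */
	for (int i = 0; i < 4; i++)
		printf("slab_max_order=%d -> order %d\n",
		       inputs[i], clamp_order(inputs[i]));
	return 0;
}
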
@@ -1502,10 +1515,11 @@ void __init kmem_cache_init(void)
 
 	/*
 	 * Fragmentation resistance on low memory - only use bigger
-	 * page orders on machines with more than 32MB of memory.
+	 * page orders on machines with more than 32MB of memory if
+	 * not overridden on the command line.
 	 */
-	if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
-		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
+		slab_max_order = SLAB_MAX_ORDER_HI;
 
 	/* Bootstrap is tricky, because several objects are allocated
 	 * from caches that do not exist yet:
@@ -1932,8 +1946,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 			/* Print header */
 			if (lines == 0) {
 				printk(KERN_ERR
-					"Slab corruption: %s start=%p, len=%d\n",
-					cachep->name, realobj, size);
+					"Slab corruption (%s): %s start=%p, len=%d\n",
+					print_tainted(), cachep->name, realobj, size);
 				print_objinfo(cachep, objp, 0);
 			}
 			/* Hexdump the affected line */
@@ -2117,7 +2131,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
 		 */
-		if (gfporder >= slab_break_gfp_order)
+		if (gfporder >= slab_max_order)
 			break;
 
 		/*
@@ -3042,8 +3056,9 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
 	if (entries != cachep->num - slabp->inuse) {
 bad:
 		printk(KERN_ERR "slab: Internal list corruption detected in "
-				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
-			cachep->name, cachep->num, slabp, slabp->inuse);
+				"cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
+			cachep->name, cachep->num, slabp, slabp->inuse,
+			print_tainted());
 		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
 			sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
 			1);
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -570,7 +570,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 	va_end(args);
 	printk(KERN_ERR "========================================"
 			"=====================================\n");
-	printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
+	printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
 	printk(KERN_ERR "----------------------------------------"
 			"-------------------------------------\n\n");
 }
@@ -1901,11 +1901,14 @@ static void unfreeze_partials(struct kmem_cache *s)
 			}
 
 			if (l != m) {
-				if (l == M_PARTIAL)
+				if (l == M_PARTIAL) {
 					remove_partial(n, page);
-				else
+					stat(s, FREE_REMOVE_PARTIAL);
+				} else {
 					add_partial(n, page,
 						DEACTIVATE_TO_TAIL);
+					stat(s, FREE_ADD_PARTIAL);
+				}
 
 				l = m;
 			}
@@ -2124,6 +2127,37 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
 }
 
 /*
+ * Check the page->freelist of a page and either transfer the freelist to the
+ * per cpu freelist or deactivate the page.
+ *
+ * The page is still frozen if the return value is not NULL.
+ *
+ * If this function returns NULL then the page has been unfrozen.
+ */
+static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+{
+	struct page new;
+	unsigned long counters;
+	void *freelist;
+
+	do {
+		freelist = page->freelist;
+		counters = page->counters;
+		new.counters = counters;
+		VM_BUG_ON(!new.frozen);
+
+		new.inuse = page->objects;
+		new.frozen = freelist != NULL;
+
+	} while (!cmpxchg_double_slab(s, page,
+		freelist, counters,
+		NULL, new.counters,
+		"get_freelist"));
+
+	return freelist;
+}
+
+/*
  * Slow path. The lockless freelist is empty or we need to perform
  * debugging duties.
  *
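
get_freelist() follows the usual lock-free pattern: snapshot the current state, build the desired new state, and publish it with a single compare-and-swap, retrying if the snapshot went stale. A simplified user-space illustration of that pattern using C11 atomics (a plain pointer exchange rather than the kernel's cmpxchg_double_slab() on freelist and counters together) could look like:

/* User-space sketch of the snapshot/compare-and-swap retry loop. */
#include <stdatomic.h>
#include <stdio.h>

struct node {
	struct node *next;
};

static _Atomic(struct node *) list_head;	/* shared lock-free list */

/* Detach the whole list in one atomic step, retrying if it changed. */
static struct node *take_all(void)
{
	struct node *old = atomic_load(&list_head);

	/* On failure the CAS refreshes 'old' with the current head; retry. */
	while (!atomic_compare_exchange_weak(&list_head, &old, NULL))
		;
	return old;
}

int main(void)
{
	static struct node a, b;

	b.next = NULL;
	a.next = &b;
	atomic_store(&list_head, &a);

	printf("detached a %s list\n", take_all() ? "non-empty" : "empty");
	return 0;
}
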
@@ -2144,8 +2178,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 {
 	void **object;
 	unsigned long flags;
-	struct page new;
-	unsigned long counters;
 
 	local_irq_save(flags);
 #ifdef CONFIG_PREEMPT
@@ -2166,31 +2198,14 @@ redo:
 		goto new_slab;
 	}
 
-	stat(s, ALLOC_SLOWPATH);
-
-	do {
-		object = c->page->freelist;
-		counters = c->page->counters;
-		new.counters = counters;
-		VM_BUG_ON(!new.frozen);
-
-		/*
-		 * If there is no object left then we use this loop to
-		 * deactivate the slab which is simple since no objects
-		 * are left in the slab and therefore we do not need to
-		 * put the page back onto the partial list.
-		 *
-		 * If there are objects left then we retrieve them
-		 * and use them to refill the per cpu queue.
-		 */
-
-		new.inuse = c->page->objects;
-		new.frozen = object != NULL;
-
-	} while (!__cmpxchg_double_slab(s, c->page,
-			object, counters,
-			NULL, new.counters,
-			"__slab_alloc"));
+	/* must check again c->freelist in case of cpu migration or IRQ */
+	object = c->freelist;
+	if (object)
+		goto load_freelist;
+
+	stat(s, ALLOC_SLOWPATH);
+
+	object = get_freelist(s, c->page);
 
 	if (!object) {
 		c->page = NULL;
@@ -3028,7 +3043,9 @@ static int kmem_cache_open(struct kmem_cache *s,
 	 * per node list when we run out of per cpu objects. We only fetch 50%
 	 * to keep some capacity around for frees.
 	 */
-	if (s->size >= PAGE_SIZE)
+	if (kmem_cache_debug(s))
+		s->cpu_partial = 0;
+	else if (s->size >= PAGE_SIZE)
 		s->cpu_partial = 2;
 	else if (s->size >= 1024)
 		s->cpu_partial = 6;
@@ -4637,6 +4654,8 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
 	err = strict_strtoul(buf, 10, &objects);
 	if (err)
 		return err;
+	if (objects && kmem_cache_debug(s))
+		return -EINVAL;
 
 	s->cpu_partial = objects;
 	flush_all(s);
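
From user space this check surfaces through the existing cpu_partial sysfs attribute: writing a non-zero value to a cache that has debugging enabled now fails with EINVAL. A rough sketch of exercising it; the cache name "kmalloc-64" is only an example, and the EINVAL outcome assumes that cache was booted with slub_debug:

/* Sketch: write the cpu_partial attribute and report the outcome. */
#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *path = "/sys/kernel/slab/kmalloc-64/cpu_partial";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	/* The value reaches the kernel no later than fclose(); check both steps. */
	if (fprintf(f, "8\n") < 0 || fclose(f) != 0) {
		fprintf(stderr, "writing %s failed: %s\n", path, strerror(errno));
		return 1;
	}
	printf("cpu_partial set to 8\n");
	return 0;
}
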