path: root/mm/slab.c
Diffstat (limited to 'mm/slab.c')
-rw-r--r--  mm/slab.c  198
1 file changed, 134 insertions(+), 64 deletions(-)
diff --git a/mm/slab.c b/mm/slab.c
index bac0f4fcc216..50a73fca19c4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
 #include <linux/reciprocal_div.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemcheck.h>
+#include <linux/memory.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -144,30 +145,6 @@
144#define BYTES_PER_WORD sizeof(void *) 145#define BYTES_PER_WORD sizeof(void *)
145#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) 146#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
146 147
147#ifndef ARCH_KMALLOC_MINALIGN
148/*
149 * Enforce a minimum alignment for the kmalloc caches.
150 * Usually, the kmalloc caches are cache_line_size() aligned, except when
151 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
152 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
153 * alignment larger than the alignment of a 64-bit integer.
154 * ARCH_KMALLOC_MINALIGN allows that.
155 * Note that increasing this value may disable some debug features.
156 */
157#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
158#endif
159
160#ifndef ARCH_SLAB_MINALIGN
161/*
162 * Enforce a minimum alignment for all caches.
163 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
164 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
165 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
166 * some debug features.
167 */
168#define ARCH_SLAB_MINALIGN 0
169#endif
170
171#ifndef ARCH_KMALLOC_FLAGS 148#ifndef ARCH_KMALLOC_FLAGS
172#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 149#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
173#endif 150#endif
@@ -1102,6 +1079,52 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 }
 #endif
 
+/*
+ * Allocates and initializes nodelists for a node on each slab cache, used for
+ * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing nodelists are not replaced if
+ * already in use.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int init_cache_nodelists_node(int node)
+{
+        struct kmem_cache *cachep;
+        struct kmem_list3 *l3;
+        const int memsize = sizeof(struct kmem_list3);
+
+        list_for_each_entry(cachep, &cache_chain, next) {
+                /*
+                 * Set up the size64 kmemlist for cpu before we can
+                 * begin anything. Make sure some other cpu on this
+                 * node has not already allocated this
+                 */
+                if (!cachep->nodelists[node]) {
+                        l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+                        if (!l3)
+                                return -ENOMEM;
+                        kmem_list3_init(l3);
+                        l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+                            ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+                        /*
+                         * The l3s don't come and go as CPUs come and
+                         * go.  cache_chain_mutex is sufficient
+                         * protection here.
+                         */
+                        cachep->nodelists[node] = l3;
+                }
+
+                spin_lock_irq(&cachep->nodelists[node]->list_lock);
+                cachep->nodelists[node]->free_limit =
+                        (1 + nr_cpus_node(node)) *
+                        cachep->batchcount + cachep->num;
+                spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+        }
+        return 0;
+}
+
 static void __cpuinit cpuup_canceled(long cpu)
 {
         struct kmem_cache *cachep;
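A quick illustration of the free_limit formula used in the helper above, with made-up per-node numbers (4 CPUs on the node, a batchcount of 16, 30 objects per slab; none of these values come from the patch):

#include <stdio.h>

int main(void)
{
        /* Hypothetical per-node numbers, chosen only for illustration. */
        int nr_cpus_on_node = 4;        /* stand-in for nr_cpus_node(node) */
        int batchcount = 16;            /* stand-in for cachep->batchcount */
        int objs_per_slab = 30;         /* stand-in for cachep->num */

        /* free_limit = (1 + nr_cpus_node(node)) * batchcount + num */
        int free_limit = (1 + nr_cpus_on_node) * batchcount + objs_per_slab;

        printf("free_limit = %d\n", free_limit);        /* prints 110 */
        return 0;
}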
@@ -1172,7 +1195,7 @@ static int __cpuinit cpuup_prepare(long cpu)
         struct kmem_cache *cachep;
         struct kmem_list3 *l3 = NULL;
         int node = cpu_to_node(cpu);
-        const int memsize = sizeof(struct kmem_list3);
+        int err;
 
         /*
          * We need to do this right in the beginning since
@@ -1180,35 +1203,9 @@
          * kmalloc_node allows us to add the slab to the right
          * kmem_list3 and not this cpu's kmem_list3
          */
-
-        list_for_each_entry(cachep, &cache_chain, next) {
-                /*
-                 * Set up the size64 kmemlist for cpu before we can
-                 * begin anything. Make sure some other cpu on this
-                 * node has not already allocated this
-                 */
-                if (!cachep->nodelists[node]) {
-                        l3 = kmalloc_node(memsize, GFP_KERNEL, node);
-                        if (!l3)
-                                goto bad;
-                        kmem_list3_init(l3);
-                        l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-                            ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
-                        /*
-                         * The l3s don't come and go as CPUs come and
-                         * go.  cache_chain_mutex is sufficient
-                         * protection here.
-                         */
-                        cachep->nodelists[node] = l3;
-                }
-
-                spin_lock_irq(&cachep->nodelists[node]->list_lock);
-                cachep->nodelists[node]->free_limit =
-                        (1 + nr_cpus_node(node)) *
-                        cachep->batchcount + cachep->num;
-                spin_unlock_irq(&cachep->nodelists[node]->list_lock);
-        }
+        err = init_cache_nodelists_node(node);
+        if (err < 0)
+                goto bad;
 
         /*
          * Now we can go ahead with allocating the shared arrays and
@@ -1331,11 +1328,75 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
         &cpuup_callback, NULL, 0
 };
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int __meminit drain_cache_nodelists_node(int node)
+{
+        struct kmem_cache *cachep;
+        int ret = 0;
+
+        list_for_each_entry(cachep, &cache_chain, next) {
+                struct kmem_list3 *l3;
+
+                l3 = cachep->nodelists[node];
+                if (!l3)
+                        continue;
+
+                drain_freelist(cachep, l3, l3->free_objects);
+
+                if (!list_empty(&l3->slabs_full) ||
+                    !list_empty(&l3->slabs_partial)) {
+                        ret = -EBUSY;
+                        break;
+                }
+        }
+        return ret;
+}
+
+static int __meminit slab_memory_callback(struct notifier_block *self,
+                                        unsigned long action, void *arg)
+{
+        struct memory_notify *mnb = arg;
+        int ret = 0;
+        int nid;
+
+        nid = mnb->status_change_nid;
+        if (nid < 0)
+                goto out;
+
+        switch (action) {
+        case MEM_GOING_ONLINE:
+                mutex_lock(&cache_chain_mutex);
+                ret = init_cache_nodelists_node(nid);
+                mutex_unlock(&cache_chain_mutex);
+                break;
+        case MEM_GOING_OFFLINE:
+                mutex_lock(&cache_chain_mutex);
+                ret = drain_cache_nodelists_node(nid);
+                mutex_unlock(&cache_chain_mutex);
+                break;
+        case MEM_ONLINE:
+        case MEM_OFFLINE:
+        case MEM_CANCEL_ONLINE:
+        case MEM_CANCEL_OFFLINE:
+                break;
+        }
+out:
+        return ret ? notifier_from_errno(ret) : NOTIFY_OK;
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+
 /*
  * swap the static kmem_list3 with kmalloced memory
  */
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
                         int nodeid)
 {
         struct kmem_list3 *ptr;
 
@@ -1580,6 +1641,14 @@ void __init kmem_cache_init_late(void)
          */
         register_cpu_notifier(&cpucache_notifier);
 
+#ifdef CONFIG_NUMA
+        /*
+         * Register a memory hotplug callback that initializes and frees
+         * nodelists.
+         */
+        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
         /*
          * The reap timers are started later, with a module init call: That part
          * of the kernel is not yet operational.
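The hunk above wires slab_memory_callback into kmem_cache_init_late(). The same registration pattern, reduced to a skeleton, is sketched below; example_mem_callback and example_init are hypothetical names, and only struct memory_notify, the MEM_* actions, NOTIFY_OK and hotplug_memory_notifier() are taken from the kernel API the patch itself uses.

#include <linux/init.h>
#include <linux/memory.h>
#include <linux/notifier.h>

/* Hypothetical callback: act only when a whole node changes state. */
static int example_mem_callback(struct notifier_block *self,
                                unsigned long action, void *arg)
{
        struct memory_notify *mnb = arg;

        if (mnb->status_change_nid < 0)
                return NOTIFY_OK;       /* no node is appearing or disappearing */

        switch (action) {
        case MEM_GOING_ONLINE:
                /* set up per-node state here; returning notifier_from_errno(err)
                 * would cancel the online transition */
                break;
        case MEM_GOING_OFFLINE:
                /* drain per-node state here; an errno vetoes the offline */
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static int __init example_init(void)
{
        /* same registration call the patch makes with SLAB_CALLBACK_PRI */
        hotplug_memory_notifier(example_mem_callback, 0);
        return 0;
}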
@@ -2220,8 +2289,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
         if (ralign < align) {
                 ralign = align;
         }
-        /* disable debug if necessary */
-        if (ralign > __alignof__(unsigned long long))
+        /* disable debug if not aligning with REDZONE_ALIGN */
+        if (ralign & (__alignof__(unsigned long long) - 1))
                 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
         /*
          * 4) Store it.
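The old test above dropped red-zoning and user-store debugging for any alignment larger than that of a 64-bit word, even when the alignment was a clean multiple of it; the new mask test only drops them when the alignment is genuinely incompatible. A small stand-alone comparison with made-up alignment values (not part of the patch):

#include <stdio.h>

int main(void)
{
        /* Hypothetical alignments; 12 is not a realistic slab alignment and
         * is included only to show the mask test catching a misfit value. */
        unsigned long aligns[] = { 8, 16, 32, 64, 12 };
        unsigned long qword = __alignof__(unsigned long long);  /* typically 8 */
        unsigned int i;

        for (i = 0; i < sizeof(aligns) / sizeof(aligns[0]); i++) {
                unsigned long ralign = aligns[i];
                int old_strip = ralign > qword;                 /* old test */
                int new_strip = (ralign & (qword - 1)) != 0;    /* new test */

                printf("ralign=%2lu  old strips debug: %d  new strips debug: %d\n",
                       ralign, old_strip, new_strip);
        }
        return 0;
}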
@@ -2247,8 +2316,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
          */
         if (flags & SLAB_RED_ZONE) {
                 /* add space for red zone words */
-                cachep->obj_offset += sizeof(unsigned long long);
-                size += 2 * sizeof(unsigned long long);
+                cachep->obj_offset += align;
+                size += align + sizeof(unsigned long long);
         }
         if (flags & SLAB_STORE_USER) {
                 /* user store requires one word storage behind the end of
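The effect of the new sizing on alignment can be sanity-checked with made-up numbers, say a 64-byte-aligned cache of 128-byte objects (values chosen for illustration only): the old code put the object 8 bytes past an aligned boundary, while reserving a whole alignment unit for the front red zone keeps the returned object aligned.

#include <stdio.h>

int main(void)
{
        /* Hypothetical cache: 64-byte alignment, 128-byte objects. */
        size_t align = 64, size = 128, qword = sizeof(unsigned long long);

        /* Old layout: front red zone was a single 8-byte word. */
        size_t old_offset = qword;              /* obj_offset += sizeof(ull)  */
        size_t old_size   = size + 2 * qword;   /* size += 2 * sizeof(ull)    */

        /* New layout: the front red zone occupies a whole alignment unit. */
        size_t new_offset = align;                  /* obj_offset += align    */
        size_t new_size   = size + align + qword;   /* size += align + qword  */

        printf("old: offset=%zu (offset %% align = %zu), size=%zu\n",
               old_offset, old_offset % align, old_size);
        printf("new: offset=%zu (offset %% align = %zu), size=%zu\n",
               new_offset, new_offset % align, new_size);
        return 0;
}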
@@ -4216,10 +4285,11 @@ static int s_show(struct seq_file *m, void *p)
                 unsigned long node_frees = cachep->node_frees;
                 unsigned long overflows = cachep->node_overflow;
 
-                seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
-                                %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
-                                reaped, errors, max_freeable, node_allocs,
-                                node_frees, overflows);
+                seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
+                           "%4lu %4lu %4lu %4lu %4lu",
+                           allocs, high, grown,
+                           reaped, errors, max_freeable, node_allocs,
+                           node_frees, overflows);
         }
         /* cpu stats */
         {
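The last hunk replaces a backslash line continuation inside the seq_printf format string, which drags the continuation line's leading whitespace into the output, with ordinary adjacent-literal concatenation. A tiny user-space demonstration of the difference (the format specifiers are only printed here, not expanded):

#include <stdio.h>

int main(void)
{
        /* Backslash continuation inside a literal keeps the next line's
         * leading whitespace in the string. */
        const char *old_style = "globalstat %7lu \
                        %4lu";

        /* Adjacent string literals are concatenated with no stray blanks. */
        const char *new_style = "globalstat %7lu "
                                "%4lu";

        printf("old: [%s]\n", old_style);       /* shows the embedded indent */
        printf("new: [%s]\n", new_style);
        return 0;
}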