slab: add memory hotplug support

Slab lacks any memory hotplug support for nodes that are hotplugged without cpus being hotplugged. This is possible at least on x86 CONFIG_MEMORY_HOTPLUG_SPARSE kernels where SRAT entries are marked ACPI_SRAT_MEM_HOT_PLUGGABLE and the regions of RAM represent a separate node. It can also be done manually by writing the start address to /sys/devices/system/memory/probe for kernels that have CONFIG_ARCH_MEMORY_PROBE set, which is how this patch was tested, and then onlining the new memory region. When a node is hotadded, a nodelist for that node is allocated and initialized for each slab cache. If this isn't completed due to a lack of memory, the hotadd is aborted: we have a reasonable expectation that kmalloc_node(nid) will work for all caches if nid is online and memory is available. Since nodelists must be allocated and initialized prior to the new node's memory actually being online, the struct kmem_list3 is allocated off-node due to kmalloc_node()'s fallback. When an entire node would be offlined, its nodelists are subsequently drained. If slab objects still exist and cannot be freed, the offline is aborted. It is possible that objects will be allocated between this drain and page isolation, however, so the offline may still fail. Acked-by: Christoph Lameter <cl@linux-foundation.org> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
author: David Rientjes <rientjes@google.com> 2010-03-27 22:40:47 -0400
committer: Pekka Enberg <penberg@cs.helsinki.fi> 2010-04-07 12:28:31 -0400
commit: 8f9f8d9e8080a2ff46caa7decef47810d093d252 (patch)
tree: c9adbf892104431816b4a6aaf96083c649f3b36a /mm
parent: 220bf991b0366cc50a94feede3d7341fa5710ee4 (diff)
1 files changed, 125 insertions, 32 deletions
diff --git a/mm/slab.c b/mm/slab.c
index a9f325b28bed..3230cd2c6b3b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
 #include        <linux/reciprocal_div.h>
 #include        <linux/debugobjects.h>
 #include        <linux/kmemcheck.h>
+#include        <linux/memory.h>
 #include        <asm/cacheflush.h>
 #include        <asm/tlbflush.h>
@@ -1102,6 +1103,52 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 }
 #endif
+/*
+ * Allocates and initializes nodelists for a node on each slab cache, used for
+ * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing nodelists are not replaced if
+ * already in use.
+ *
+ * For every cache the node's free_limit is (re)computed under the nodelist's
+ * list_lock, whether the nodelist was just allocated or already existed.
+ *
+ * Returns 0 on success or -ENOMEM as soon as one kmem_list3 allocation fails.
+ * Nothing is rolled back on failure: nodelists already installed for caches
+ * earlier in cache_chain stay in place.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int init_cache_nodelists_node(int node)
+{
+        struct kmem_cache *cachep;
+        struct kmem_list3 *l3;
+        const int memsize = sizeof(struct kmem_list3);
+        list_for_each_entry(cachep, &cache_chain, next) {
+                /*
+                 * Set up the kmem_list3 for this node before anything
+                 * else.  The allocation is skipped when this cache
+                 * already has a nodelist installed for the node, e.g.
+                 * from an earlier cpu or memory hotplug event.
+                 */
+                if (!cachep->nodelists[node]) {
+                        l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+                        if (!l3)
+                                return -ENOMEM;
+                        kmem_list3_init(l3);
+                        l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+                            ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+                        /*
+                         * The l3s don't come and go as CPUs come and
+                         * go.  cache_chain_mutex is sufficient
+                         * protection here.
+                         */
+                        cachep->nodelists[node] = l3;
+                }
+                spin_lock_irq(&cachep->nodelists[node]->list_lock);
+                cachep->nodelists[node]->free_limit =
+                        (1 + nr_cpus_node(node)) *
+                        cachep->batchcount + cachep->num;
+                spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+        }
+        return 0;
+}
 static void __cpuinit cpuup_canceled(long cpu)
 {
        struct kmem_cache *cachep;
@@ -1172,7 +1219,7 @@ static int __cpuinit cpuup_prepare(long cpu)
        struct kmem_cache *cachep;
        struct kmem_list3 *l3 = NULL;
        int node = cpu_to_node(cpu);
-        const int memsize = sizeof(struct kmem_list3);
+        int err;
        /*
         * We need to do this right in the beginning since
@@ -1180,35 +1227,9 @@ static int __cpuinit cpuup_prepare(long cpu)
         * kmalloc_node allows us to add the slab to the right
         * kmem_list3 and not this cpu's kmem_list3
         */
+        err = init_cache_nodelists_node(node);
-        list_for_each_entry(cachep, &cache_chain, next) {
+        if (err < 0)
-                /*
+                goto bad;
-                 * Set up the size64 kmemlist for cpu before we can
-                 * begin anything. Make sure some other cpu on this
-                 * node has not already allocated this
-                 */
-                if (!cachep->nodelists[node]) {
-                        l3 = kmalloc_node(memsize, GFP_KERNEL, node);
-                        if (!l3)
-                                goto bad;
-                        kmem_list3_init(l3);
-                        l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-                            ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-                        /*
-                         * The l3s don't come and go as CPUs come and
-                         * go.  cache_chain_mutex is sufficient
-                         * protection here.
-                         */
-                        cachep->nodelists[node] = l3;
-                }
-                spin_lock_irq(&cachep->nodelists[node]->list_lock);
-                cachep->nodelists[node]->free_limit =
-                        (1 + nr_cpus_node(node)) *
-                        cachep->batchcount + cachep->num;
-                spin_unlock_irq(&cachep->nodelists[node]->list_lock);
-        }
        /*
         * Now we can go ahead with allocating the shared arrays and
@@ -1331,11 +1352,75 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
        &cpuup_callback, NULL, 0
 };
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Caches that have no nodelist for the node are skipped.  For the others,
+ * drain_freelist() is called with l3->free_objects; if the slabs_full or
+ * slabs_partial list is still non-empty afterwards, the walk stops at that
+ * cache and -EBUSY is returned.  Returns 0 when no cache is left with
+ * full or partial slabs on the node.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int __meminit drain_cache_nodelists_node(int node)
+{
+        struct kmem_cache *cachep;
+        int ret = 0;
+        list_for_each_entry(cachep, &cache_chain, next) {
+                struct kmem_list3 *l3;
+                l3 = cachep->nodelists[node];
+                if (!l3)
+                        continue;
+                drain_freelist(cachep, l3, l3->free_objects);
+                if (!list_empty(&l3->slabs_full) ||
+                    !list_empty(&l3->slabs_partial)) {
+                        ret = -EBUSY;
+                        break;
+                }
+        }
+        return ret;
+}
+/*
+ * Memory hotplug notifier callback for the slab allocator.
+ *
+ * Does nothing when mnb->status_change_nid is negative.  Otherwise, with
+ * cache_chain_mutex held, MEM_GOING_ONLINE sets up the node's nodelists via
+ * init_cache_nodelists_node() and MEM_GOING_OFFLINE drains them via
+ * drain_cache_nodelists_node().  All other actions are ignored.
+ *
+ * Returns NOTIFY_OK when the helper returned 0 (or nothing was done), and
+ * the helper's error code converted with notifier_from_errno() otherwise.
+ */
+static int __meminit slab_memory_callback(struct notifier_block *self,
+                                        unsigned long action, void *arg)
+{
+        struct memory_notify *mnb = arg;
+        int ret = 0;
+        int nid;
+        nid = mnb->status_change_nid;
+        if (nid < 0)
+                goto out;
+        switch (action) {
+        case MEM_GOING_ONLINE:
+                mutex_lock(&cache_chain_mutex);
+                ret = init_cache_nodelists_node(nid);
+                mutex_unlock(&cache_chain_mutex);
+                break;
+        case MEM_GOING_OFFLINE:
+                mutex_lock(&cache_chain_mutex);
+                ret = drain_cache_nodelists_node(nid);
+                mutex_unlock(&cache_chain_mutex);
+                break;
+        case MEM_ONLINE:
+        case MEM_OFFLINE:
+        case MEM_CANCEL_ONLINE:
+        case MEM_CANCEL_OFFLINE:
+                break;
+        }
+out:
+        return ret ? notifier_from_errno(ret) : NOTIFY_OK;
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
 /*
 * swap the static kmem_list3 with kmalloced memory
 */
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
-                        int nodeid)
+                                int nodeid)
 {
        struct kmem_list3 *ptr;
@@ -1580,6 +1665,14 @@ void __init kmem_cache_init_late(void)
         */
        register_cpu_notifier(&cpucache_notifier);
+#ifdef CONFIG_NUMA
+        /*
+         * Register a memory hotplug callback that initializes and frees
+         * nodelists.
+         */
+        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
        /*
         * The reap timers are started later, with a module init call: That part
         * of the kernel is not yet operational.
author	David Rientjes <rientjes@google.com>	2010-03-27 22:40:47 -0400
committer	Pekka Enberg <penberg@cs.helsinki.fi>	2010-04-07 12:28:31 -0400
commit	8f9f8d9e8080a2ff46caa7decef47810d093d252 (patch)
tree	c9adbf892104431816b4a6aaf96083c649f3b36a /mm
parent	220bf991b0366cc50a94feede3d7341fa5710ee4 (diff)

diff --git a/mm/slab.c b/mm/slab.c index a9f325b28bed..3230cd2c6b3b 100644 --- a/mm/slab.c +++ b/mm/slab.c
@@ -115,6 +115,7 @@
115	#include <linux/reciprocal_div.h>	115	#include <linux/reciprocal_div.h>
116	#include <linux/debugobjects.h>	116	#include <linux/debugobjects.h>
117	#include <linux/kmemcheck.h>	117	#include <linux/kmemcheck.h>
		118	#include <linux/memory.h>
118		119
119	#include <asm/cacheflush.h>	120	#include <asm/cacheflush.h>
120	#include <asm/tlbflush.h>	121	#include <asm/tlbflush.h>
@@ -1102,6 +1103,52 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1102	}	1103	}
1103	#endif	1104	#endif
1104		1105
		1106	/*
		1107	* Allocates and initializes nodelists for a node on each slab cache, used for
		1108	* either memory or cpu hotplug. If memory is being hot-added, the kmem_list3
		1109	* will be allocated off-node since memory is not yet online for the new node.
		1110	* When hotplugging memory or a cpu, existing nodelists are not replaced if
		1111	* already in use.
		1112	*
		1113	* Must hold cache_chain_mutex.
		1114	*/
		1115	static int init_cache_nodelists_node(int node)
		1116	{
		1117	struct kmem_cache *cachep;
		1118	struct kmem_list3 *l3;
		1119	const int memsize = sizeof(struct kmem_list3);
		1120
		1121	list_for_each_entry(cachep, &cache_chain, next) {
		1122	/*
		1123	* Set up the size64 kmemlist for cpu before we can
		1124	* begin anything. Make sure some other cpu on this
		1125	* node has not already allocated this
		1126	*/
		1127	if (!cachep->nodelists[node]) {
		1128	l3 = kmalloc_node(memsize, GFP_KERNEL, node);
		1129	if (!l3)
		1130	return -ENOMEM;
		1131	kmem_list3_init(l3);
		1132	l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
		1133	((unsigned long)cachep) % REAPTIMEOUT_LIST3;
		1134
		1135	/*
		1136	* The l3s don't come and go as CPUs come and
		1137	* go. cache_chain_mutex is sufficient
		1138	* protection here.
		1139	*/
		1140	cachep->nodelists[node] = l3;
		1141	}
		1142
		1143	spin_lock_irq(&cachep->nodelists[node]->list_lock);
		1144	cachep->nodelists[node]->free_limit =
		1145	(1 + nr_cpus_node(node)) *
		1146	cachep->batchcount + cachep->num;
		1147	spin_unlock_irq(&cachep->nodelists[node]->list_lock);
		1148	}
		1149	return 0;
		1150	}
		1151
1105	static void __cpuinit cpuup_canceled(long cpu)	1152	static void __cpuinit cpuup_canceled(long cpu)
1106	{	1153	{
1107	struct kmem_cache *cachep;	1154	struct kmem_cache *cachep;
@@ -1172,7 +1219,7 @@ static int __cpuinit cpuup_prepare(long cpu)
1172	struct kmem_cache *cachep;	1219	struct kmem_cache *cachep;
1173	struct kmem_list3 *l3 = NULL;	1220	struct kmem_list3 *l3 = NULL;
1174	int node = cpu_to_node(cpu);	1221	int node = cpu_to_node(cpu);
1175	const int memsize = sizeof(struct kmem_list3);	1222	int err;
1176		1223
1177	/*	1224	/*
1178	* We need to do this right in the beginning since	1225	* We need to do this right in the beginning since
@@ -1180,35 +1227,9 @@ static int __cpuinit cpuup_prepare(long cpu)
1180	* kmalloc_node allows us to add the slab to the right	1227	* kmalloc_node allows us to add the slab to the right
1181	* kmem_list3 and not this cpu's kmem_list3	1228	* kmem_list3 and not this cpu's kmem_list3
1182	*/	1229	*/
1183		1230	err = init_cache_nodelists_node(node);
1184	list_for_each_entry(cachep, &cache_chain, next) {	1231	if (err < 0)
1185	/*	1232	goto bad;
1186	* Set up the size64 kmemlist for cpu before we can
1187	* begin anything. Make sure some other cpu on this
1188	* node has not already allocated this
1189	*/
1190	if (!cachep->nodelists[node]) {
1191	l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1192	if (!l3)
1193	goto bad;
1194	kmem_list3_init(l3);
1195	l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1196	((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1197
1198	/*
1199	* The l3s don't come and go as CPUs come and
1200	* go. cache_chain_mutex is sufficient
1201	* protection here.
1202	*/
1203	cachep->nodelists[node] = l3;
1204	}
1205
1206	spin_lock_irq(&cachep->nodelists[node]->list_lock);
1207	cachep->nodelists[node]->free_limit =
1208	(1 + nr_cpus_node(node)) *
1209	cachep->batchcount + cachep->num;
1210	spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1211	}
1212		1233
1213	/*	1234	/*
1214	* Now we can go ahead with allocating the shared arrays and	1235	* Now we can go ahead with allocating the shared arrays and
@@ -1331,11 +1352,75 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
1331	&cpuup_callback, NULL, 0	1352	&cpuup_callback, NULL, 0
1332	};	1353	};
1333		1354
		1355	#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
		1356	/*
		1357	* Drains freelist for a node on each slab cache, used for memory hot-remove.
		1358	* Returns -EBUSY if all objects cannot be drained so that the node is not
		1359	* removed.
		1360	*
		1361	* Must hold cache_chain_mutex.
		1362	*/
		1363	static int __meminit drain_cache_nodelists_node(int node)
		1364	{
		1365	struct kmem_cache *cachep;
		1366	int ret = 0;
		1367
		1368	list_for_each_entry(cachep, &cache_chain, next) {
		1369	struct kmem_list3 *l3;
		1370
		1371	l3 = cachep->nodelists[node];
		1372	if (!l3)
		1373	continue;
		1374
		1375	drain_freelist(cachep, l3, l3->free_objects);
		1376
		1377	if (!list_empty(&l3->slabs_full) ||
		1378	!list_empty(&l3->slabs_partial)) {
		1379	ret = -EBUSY;
		1380	break;
		1381	}
		1382	}
		1383	return ret;
		1384	}
		1385
		1386	static int __meminit slab_memory_callback(struct notifier_block *self,
		1387	unsigned long action, void *arg)
		1388	{
		1389	struct memory_notify *mnb = arg;
		1390	int ret = 0;
		1391	int nid;
		1392
		1393	nid = mnb->status_change_nid;
		1394	if (nid < 0)
		1395	goto out;
		1396
		1397	switch (action) {
		1398	case MEM_GOING_ONLINE:
		1399	mutex_lock(&cache_chain_mutex);
		1400	ret = init_cache_nodelists_node(nid);
		1401	mutex_unlock(&cache_chain_mutex);
		1402	break;
		1403	case MEM_GOING_OFFLINE:
		1404	mutex_lock(&cache_chain_mutex);
		1405	ret = drain_cache_nodelists_node(nid);
		1406	mutex_unlock(&cache_chain_mutex);
		1407	break;
		1408	case MEM_ONLINE:
		1409	case MEM_OFFLINE:
		1410	case MEM_CANCEL_ONLINE:
		1411	case MEM_CANCEL_OFFLINE:
		1412	break;
		1413	}
		1414	out:
		1415	return ret ? notifier_from_errno(ret) : NOTIFY_OK;
		1416	}
		1417	#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
		1418
1334	/*	1419	/*
1335	* swap the static kmem_list3 with kmalloced memory	1420	* swap the static kmem_list3 with kmalloced memory
1336	*/	1421	*/
1337	static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,	1422	static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1338	int nodeid)	1423	int nodeid)
1339	{	1424	{
1340	struct kmem_list3 *ptr;	1425	struct kmem_list3 *ptr;
1341		1426
@@ -1580,6 +1665,14 @@ void __init kmem_cache_init_late(void)
1580	*/	1665	*/
1581	register_cpu_notifier(&cpucache_notifier);	1666	register_cpu_notifier(&cpucache_notifier);
1582		1667
		1668	#ifdef CONFIG_NUMA
		1669	/*
		1670	* Register a memory hotplug callback that initializes and frees
		1671	* nodelists.
		1672	*/
		1673	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
		1674	#endif
		1675
1583	/*	1676	/*
1584	* The reap timers are started later, with a module init call: That part	1677	* The reap timers are started later, with a module init call: That part
1585	* of the kernel is not yet operational.	1678	* of the kernel is not yet operational.