aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Rientjes <rientjes@google.com>2010-03-27 22:40:47 -0400
committerPekka Enberg <penberg@cs.helsinki.fi>2010-04-07 12:28:31 -0400
commit8f9f8d9e8080a2ff46caa7decef47810d093d252 (patch)
treec9adbf892104431816b4a6aaf96083c649f3b36a
parent220bf991b0366cc50a94feede3d7341fa5710ee4 (diff)
slab: add memory hotplug support
Slab lacks any memory hotplug support for nodes that are hotplugged without cpus being hotplugged. This is possible at least on x86 CONFIG_MEMORY_HOTPLUG_SPARSE kernels where SRAT entries are marked ACPI_SRAT_MEM_HOT_PLUGGABLE and the regions of RAM represent a separate node. It can also be done manually by writing the start address to /sys/devices/system/memory/probe for kernels that have CONFIG_ARCH_MEMORY_PROBE set, which is how this patch was tested, and then onlining the new memory region. When a node is hot-added, a nodelist for that node is allocated and initialized for each slab cache. If this isn't completed due to a lack of memory, the hot-add is aborted: we have a reasonable expectation that kmalloc_node(nid) will work for all caches if nid is online and memory is available. Since nodelists must be allocated and initialized prior to the new node's memory actually being online, the struct kmem_list3 is allocated off-node due to kmalloc_node()'s fallback. When an entire node would be offlined, its nodelists are subsequently drained. If slab objects still exist and cannot be freed, the offline is aborted. Objects may still be allocated between this drain and page isolation, however, so the offline can still fail afterwards. Acked-by: Christoph Lameter <cl@linux-foundation.org> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
-rw-r--r--mm/slab.c157
1 files changed, 125 insertions, 32 deletions
diff --git a/mm/slab.c b/mm/slab.c
index a9f325b28bed..3230cd2c6b3b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
115#include <linux/reciprocal_div.h> 115#include <linux/reciprocal_div.h>
116#include <linux/debugobjects.h> 116#include <linux/debugobjects.h>
117#include <linux/kmemcheck.h> 117#include <linux/kmemcheck.h>
118#include <linux/memory.h>
118 119
119#include <asm/cacheflush.h> 120#include <asm/cacheflush.h>
120#include <asm/tlbflush.h> 121#include <asm/tlbflush.h>
@@ -1102,6 +1103,52 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1102} 1103}
1103#endif 1104#endif
1104 1105
1106/*
1107 * Allocates and initializes nodelists for a node on each slab cache, used for
1108 * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3
1109 * will be allocated off-node since memory is not yet online for the new node.
1110 * When hotplugging memory or a cpu, existing nodelists are not replaced if
1111 * already in use.
1112 *
1113 * Must hold cache_chain_mutex.
1114 */
1115static int init_cache_nodelists_node(int node)
1116{
1117 struct kmem_cache *cachep;
1118 struct kmem_list3 *l3;
1119 const int memsize = sizeof(struct kmem_list3);
1120
1121 list_for_each_entry(cachep, &cache_chain, next) {
1122 /*
1123 * Set up the size64 kmemlist for cpu before we can
1124 * begin anything. Make sure some other cpu on this
1125 * node has not already allocated this
1126 */
1127 if (!cachep->nodelists[node]) {
1128 l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1129 if (!l3)
1130 return -ENOMEM;
1131 kmem_list3_init(l3);
1132 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1133 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1134
1135 /*
1136 * The l3s don't come and go as CPUs come and
1137 * go. cache_chain_mutex is sufficient
1138 * protection here.
1139 */
1140 cachep->nodelists[node] = l3;
1141 }
1142
1143 spin_lock_irq(&cachep->nodelists[node]->list_lock);
1144 cachep->nodelists[node]->free_limit =
1145 (1 + nr_cpus_node(node)) *
1146 cachep->batchcount + cachep->num;
1147 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1148 }
1149 return 0;
1150}
1151
1105static void __cpuinit cpuup_canceled(long cpu) 1152static void __cpuinit cpuup_canceled(long cpu)
1106{ 1153{
1107 struct kmem_cache *cachep; 1154 struct kmem_cache *cachep;
@@ -1172,7 +1219,7 @@ static int __cpuinit cpuup_prepare(long cpu)
1172 struct kmem_cache *cachep; 1219 struct kmem_cache *cachep;
1173 struct kmem_list3 *l3 = NULL; 1220 struct kmem_list3 *l3 = NULL;
1174 int node = cpu_to_node(cpu); 1221 int node = cpu_to_node(cpu);
1175 const int memsize = sizeof(struct kmem_list3); 1222 int err;
1176 1223
1177 /* 1224 /*
1178 * We need to do this right in the beginning since 1225 * We need to do this right in the beginning since
@@ -1180,35 +1227,9 @@ static int __cpuinit cpuup_prepare(long cpu)
1180 * kmalloc_node allows us to add the slab to the right 1227 * kmalloc_node allows us to add the slab to the right
1181 * kmem_list3 and not this cpu's kmem_list3 1228 * kmem_list3 and not this cpu's kmem_list3
1182 */ 1229 */
1183 1230 err = init_cache_nodelists_node(node);
1184 list_for_each_entry(cachep, &cache_chain, next) { 1231 if (err < 0)
1185 /* 1232 goto bad;
1186 * Set up the size64 kmemlist for cpu before we can
1187 * begin anything. Make sure some other cpu on this
1188 * node has not already allocated this
1189 */
1190 if (!cachep->nodelists[node]) {
1191 l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1192 if (!l3)
1193 goto bad;
1194 kmem_list3_init(l3);
1195 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1196 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1197
1198 /*
1199 * The l3s don't come and go as CPUs come and
1200 * go. cache_chain_mutex is sufficient
1201 * protection here.
1202 */
1203 cachep->nodelists[node] = l3;
1204 }
1205
1206 spin_lock_irq(&cachep->nodelists[node]->list_lock);
1207 cachep->nodelists[node]->free_limit =
1208 (1 + nr_cpus_node(node)) *
1209 cachep->batchcount + cachep->num;
1210 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1211 }
1212 1233
1213 /* 1234 /*
1214 * Now we can go ahead with allocating the shared arrays and 1235 * Now we can go ahead with allocating the shared arrays and
@@ -1331,11 +1352,75 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
1331 &cpuup_callback, NULL, 0 1352 &cpuup_callback, NULL, 0
1332}; 1353};
1333 1354
1355#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
1356/*
1357 * Drains freelist for a node on each slab cache, used for memory hot-remove.
1358 * Returns -EBUSY if all objects cannot be drained so that the node is not
1359 * removed.
1360 *
1361 * Must hold cache_chain_mutex.
1362 */
1363static int __meminit drain_cache_nodelists_node(int node)
1364{
1365 struct kmem_cache *cachep;
1366 int ret = 0;
1367
1368 list_for_each_entry(cachep, &cache_chain, next) {
1369 struct kmem_list3 *l3;
1370
1371 l3 = cachep->nodelists[node];
1372 if (!l3)
1373 continue;
1374
1375 drain_freelist(cachep, l3, l3->free_objects);
1376
1377 if (!list_empty(&l3->slabs_full) ||
1378 !list_empty(&l3->slabs_partial)) {
1379 ret = -EBUSY;
1380 break;
1381 }
1382 }
1383 return ret;
1384}
1385
1386static int __meminit slab_memory_callback(struct notifier_block *self,
1387 unsigned long action, void *arg)
1388{
1389 struct memory_notify *mnb = arg;
1390 int ret = 0;
1391 int nid;
1392
1393 nid = mnb->status_change_nid;
1394 if (nid < 0)
1395 goto out;
1396
1397 switch (action) {
1398 case MEM_GOING_ONLINE:
1399 mutex_lock(&cache_chain_mutex);
1400 ret = init_cache_nodelists_node(nid);
1401 mutex_unlock(&cache_chain_mutex);
1402 break;
1403 case MEM_GOING_OFFLINE:
1404 mutex_lock(&cache_chain_mutex);
1405 ret = drain_cache_nodelists_node(nid);
1406 mutex_unlock(&cache_chain_mutex);
1407 break;
1408 case MEM_ONLINE:
1409 case MEM_OFFLINE:
1410 case MEM_CANCEL_ONLINE:
1411 case MEM_CANCEL_OFFLINE:
1412 break;
1413 }
1414out:
1415 return ret ? notifier_from_errno(ret) : NOTIFY_OK;
1416}
1417#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1418
1334/* 1419/*
1335 * swap the static kmem_list3 with kmalloced memory 1420 * swap the static kmem_list3 with kmalloced memory
1336 */ 1421 */
1337static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, 1422static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1338 int nodeid) 1423 int nodeid)
1339{ 1424{
1340 struct kmem_list3 *ptr; 1425 struct kmem_list3 *ptr;
1341 1426
@@ -1580,6 +1665,14 @@ void __init kmem_cache_init_late(void)
1580 */ 1665 */
1581 register_cpu_notifier(&cpucache_notifier); 1666 register_cpu_notifier(&cpucache_notifier);
1582 1667
1668#ifdef CONFIG_NUMA
1669 /*
1670 * Register a memory hotplug callback that initializes and frees
1671 * nodelists.
1672 */
1673 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1674#endif
1675
1583 /* 1676 /*
1584 * The reap timers are started later, with a module init call: That part 1677 * The reap timers are started later, with a module init call: That part
1585 * of the kernel is not yet operational. 1678 * of the kernel is not yet operational.