Diffstat (limited to 'arch/powerpc/mm/numa.c')
-rw-r--r--  arch/powerpc/mm/numa.c | 361
1 file changed, 326 insertions(+), 35 deletions(-)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 002878ccf90b..2164006fe170 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -20,10 +20,15 @@
 #include <linux/memblock.h>
 #include <linux/of.h>
 #include <linux/pfn.h>
+#include <linux/cpuset.h>
+#include <linux/node.h>
 #include <asm/sparsemem.h>
 #include <asm/prom.h>
 #include <asm/system.h>
 #include <asm/smp.h>
+#include <asm/firmware.h>
+#include <asm/paca.h>
+#include <asm/hvcall.h>
 
 static int numa_enabled = 1;
 
@@ -163,7 +168,7 @@ static void __init get_node_active_region(unsigned long start_pfn,
 	work_with_active_regions(nid, get_active_region_work_fn, node_ar);
 }
 
-static void __cpuinit map_cpu_to_node(int cpu, int node)
+static void map_cpu_to_node(int cpu, int node)
 {
 	numa_cpu_lookup_table[cpu] = node;
 
@@ -173,7 +178,7 @@ static void __cpuinit map_cpu_to_node(int cpu, int node)
 	cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
 static void unmap_cpu_from_node(unsigned long cpu)
 {
 	int node = numa_cpu_lookup_table[cpu];
@@ -181,13 +186,13 @@ static void unmap_cpu_from_node(unsigned long cpu)
 	dbg("removing cpu %lu from node %d\n", cpu, node);
 
 	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
-		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
 	} else {
 		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
 			cpu, node);
 	}
 }
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
 /* must hold reference to node during call */
 static const int *of_get_associativity(struct device_node *dev)
@@ -246,32 +251,41 @@ static void initialize_distance_lookup_table(int nid,
 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
  * info is found.
  */
-static int of_node_to_nid_single(struct device_node *device)
+static int associativity_to_nid(const unsigned int *associativity)
 {
 	int nid = -1;
-	const unsigned int *tmp;
 
 	if (min_common_depth == -1)
 		goto out;
 
-	tmp = of_get_associativity(device);
-	if (!tmp)
-		goto out;
-
-	if (tmp[0] >= min_common_depth)
-		nid = tmp[min_common_depth];
+	if (associativity[0] >= min_common_depth)
+		nid = associativity[min_common_depth];
 
 	/* POWER4 LPAR uses 0xffff as invalid node */
 	if (nid == 0xffff || nid >= MAX_NUMNODES)
 		nid = -1;
 
-	if (nid > 0 && tmp[0] >= distance_ref_points_depth)
-		initialize_distance_lookup_table(nid, tmp);
+	if (nid > 0 && associativity[0] >= distance_ref_points_depth)
+		initialize_distance_lookup_table(nid, associativity);
 
 out:
 	return nid;
 }
 
+/* Returns the nid associated with the given device tree node,
+ * or -1 if not found.
+ */
+static int of_node_to_nid_single(struct device_node *device)
+{
+	int nid = -1;
+	const unsigned int *tmp;
+
+	tmp = of_get_associativity(device);
+	if (tmp)
+		nid = associativity_to_nid(tmp);
+	return nid;
+}
+
 /* Walk the device tree upwards, looking for an associativity id */
 int of_node_to_nid(struct device_node *device)
 {
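
For illustration only: a minimal standalone sketch (plain userspace C, not kernel code) of the lookup that associativity_to_nid() performs, using an invented associativity array and an assumed min_common_depth of 4; the example_* names are hypothetical.

	#include <stdio.h>

	static int example_min_common_depth = 4;	/* assumed value */

	static int example_associativity_to_nid(const unsigned int *assoc)
	{
		int nid = -1;

		/* assoc[0] holds the number of entries that follow */
		if (assoc[0] >= example_min_common_depth)
			nid = assoc[example_min_common_depth];
		if (nid == 0xffff)	/* POWER4 LPAR marker for "no node" */
			nid = -1;
		return nid;
	}

	int main(void)
	{
		/* cell 0 is the entry count, cells 1..5 are the domain ids */
		unsigned int assoc[] = { 5, 0, 0, 0, 2, 9 };

		printf("nid = %d\n", example_associativity_to_nid(assoc));	/* prints nid = 2 */
		return 0;
	}
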
@@ -297,14 +311,13 @@ EXPORT_SYMBOL_GPL(of_node_to_nid);
 static int __init find_min_common_depth(void)
 {
 	int depth;
-	struct device_node *rtas_root;
 	struct device_node *chosen;
+	struct device_node *root;
 	const char *vec5;
 
-	rtas_root = of_find_node_by_path("/rtas");
-
-	if (!rtas_root)
-		return -1;
+	root = of_find_node_by_path("/rtas");
+	if (!root)
+		root = of_find_node_by_path("/");
 
 	/*
 	 * This property is a set of 32-bit integers, each representing
@@ -318,7 +331,7 @@ static int __init find_min_common_depth(void)
 	 * NUMA boundary and the following are progressively less significant
 	 * boundaries. There can be more than one level of NUMA.
 	 */
-	distance_ref_points = of_get_property(rtas_root,
+	distance_ref_points = of_get_property(root,
 			"ibm,associativity-reference-points",
 			&distance_ref_points_depth);
 
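
An invented example of the property read above: under form 1 affinity, ibm,associativity-reference-points = <4 2> would make entry 4 of each ibm,associativity array the node-defining boundary (min_common_depth = 4), with entry 2 acting as a less significant boundary used when filling the node distance table.
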
@@ -362,11 +375,11 @@ static int __init find_min_common_depth(void)
 		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
 	}
 
-	of_node_put(rtas_root);
+	of_node_put(root);
 	return depth;
 
 err:
-	of_node_put(rtas_root);
+	of_node_put(root);
 	return -1;
 }
 
@@ -426,11 +439,11 @@ static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
 }
 
 /*
- * Retreive and validate the ibm,dynamic-memory property of the device tree.
+ * Retrieve and validate the ibm,dynamic-memory property of the device tree.
  *
  * The layout of the ibm,dynamic-memory property is a number N of memblock
  * list entries followed by N memblock list entries. Each memblock list entry
- * contains information as layed out in the of_drconf_cell struct above.
+ * contains information as laid out in the of_drconf_cell struct above.
  */
 static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
 {
@@ -454,7 +467,7 @@ static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
 }
 
 /*
- * Retreive and validate the ibm,lmb-size property for drconf memory
+ * Retrieve and validate the ibm,lmb-size property for drconf memory
  * from the device tree.
  */
 static u64 of_get_lmb_size(struct device_node *memory)
@@ -476,7 +489,7 @@ struct assoc_arrays {
 };
 
 /*
- * Retreive and validate the list of associativity arrays for drconf
+ * Retrieve and validate the list of associativity arrays for drconf
  * memory from the ibm,associativity-lookup-arrays property of the
  * device tree..
  *
@@ -590,7 +603,7 @@ static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
  * Returns the size the region should have to enforce the memory limit.
  * This will either be the original value of size, a truncated value,
  * or zero. If the returned value of size is 0 the region should be
- * discarded as it lies wholy above the memory limit.
+ * discarded as it lies wholly above the memory limit.
  */
 static unsigned long __init numa_enforce_memory_limit(unsigned long start,
 						       unsigned long size)
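
A made-up example of the rule described above: with memory_limit = 0x40000000 (1 GiB), a region starting at 0x38000000 with size 0x10000000 is truncated to 0x08000000, while a region starting at or above 0x40000000 yields 0 and is discarded.
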
@@ -802,16 +815,17 @@ static void __init setup_nonnuma(void)
 	unsigned long top_of_ram = memblock_end_of_DRAM();
 	unsigned long total_ram = memblock_phys_mem_size();
 	unsigned long start_pfn, end_pfn;
-	unsigned int i, nid = 0;
+	unsigned int nid = 0;
+	struct memblock_region *reg;
 
 	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 	       top_of_ram, total_ram);
 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 	       (top_of_ram - total_ram) >> 20);
 
-	for (i = 0; i < memblock.memory.cnt; ++i) {
-		start_pfn = memblock.memory.region[i].base >> PAGE_SHIFT;
-		end_pfn = start_pfn + memblock_size_pages(&memblock.memory, i);
+	for_each_memblock(memory, reg) {
+		start_pfn = memblock_region_memory_base_pfn(reg);
+		end_pfn = memblock_region_memory_end_pfn(reg);
 
 		fake_numa_create_new_node(end_pfn, &nid);
 		add_active_range(nid, start_pfn, end_pfn);
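
The loop above switches from indexing memblock.memory directly to the for_each_memblock() iterator. As a rough, standalone illustration of that pattern (this is not the kernel's definition, just the shape of iterating a counted array of regions through a macro):

	#include <stdio.h>

	struct region { unsigned long base, size; };
	struct region_array { unsigned long cnt; struct region regions[4]; };

	#define for_each_region(arr, reg) \
		for ((reg) = (arr)->regions; (reg) < (arr)->regions + (arr)->cnt; (reg)++)

	int main(void)
	{
		struct region_array memory = { 2, { { 0x00000000, 0x10000000 },
						    { 0x20000000, 0x10000000 } } };
		struct region *reg;

		for_each_region(&memory, reg)
			printf("base=0x%lx size=0x%lx\n", reg->base, reg->size);
		return 0;
	}
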
@@ -947,11 +961,11 @@ static struct notifier_block __cpuinitdata ppc64_numa_nb = {
 static void mark_reserved_regions_for_nid(int nid)
 {
 	struct pglist_data *node = NODE_DATA(nid);
-	int i;
+	struct memblock_region *reg;
 
-	for (i = 0; i < memblock.reserved.cnt; i++) {
-		unsigned long physbase = memblock.reserved.region[i].base;
-		unsigned long size = memblock.reserved.region[i].size;
+	for_each_memblock(reserved, reg) {
+		unsigned long physbase = reg->base;
+		unsigned long size = reg->size;
 		unsigned long start_pfn = physbase >> PAGE_SHIFT;
 		unsigned long end_pfn = PFN_UP(physbase + size);
 		struct node_active_region node_ar;
@@ -1246,4 +1260,281 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
 	return nid;
 }
 
+static u64 hot_add_drconf_memory_max(void)
+{
+	struct device_node *memory = NULL;
+	unsigned int drconf_cell_cnt = 0;
+	u64 lmb_size = 0;
+	const u32 *dm = 0;
+
+	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+	if (memory) {
+		drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
+		lmb_size = of_get_lmb_size(memory);
+		of_node_put(memory);
+	}
+	return lmb_size * drconf_cell_cnt;
+}
+
+/*
+ * memory_hotplug_max - return max address of memory that may be added
+ *
+ * This is currently only used on systems that support drconfig memory
+ * hotplug.
+ */
+u64 memory_hotplug_max(void)
+{
+	return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
+
+/* Virtual Processor Home Node (VPHN) support */
+#ifdef CONFIG_PPC_SPLPAR
+static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
+static cpumask_t cpu_associativity_changes_mask;
+static int vphn_enabled;
+static void set_topology_timer(void);
+
+/*
+ * Store the current values of the associativity change counters in the
+ * hypervisor.
+ */
+static void setup_cpu_associativity_change_counters(void)
+{
+	int cpu;
+
+	/* The VPHN feature supports a maximum of 8 reference points */
+	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
+
+	for_each_possible_cpu(cpu) {
+		int i;
+		u8 *counts = vphn_cpu_change_counts[cpu];
+		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
+
+		for (i = 0; i < distance_ref_points_depth; i++)
+			counts[i] = hypervisor_counts[i];
+	}
+}
+
+/*
+ * The hypervisor maintains a set of 8 associativity change counters in
+ * the VPA of each cpu that correspond to the associativity levels in the
+ * ibm,associativity-reference-points property. When an associativity
+ * level changes, the corresponding counter is incremented.
+ *
+ * Set a bit in cpu_associativity_changes_mask for each cpu whose home
+ * node associativity levels have changed.
+ *
+ * Returns the number of cpus with unhandled associativity changes.
+ */
+static int update_cpu_associativity_changes_mask(void)
+{
+	int cpu, nr_cpus = 0;
+	cpumask_t *changes = &cpu_associativity_changes_mask;
+
+	cpumask_clear(changes);
+
+	for_each_possible_cpu(cpu) {
+		int i, changed = 0;
+		u8 *counts = vphn_cpu_change_counts[cpu];
+		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
+
+		for (i = 0; i < distance_ref_points_depth; i++) {
+			if (hypervisor_counts[i] != counts[i]) {
+				counts[i] = hypervisor_counts[i];
+				changed = 1;
+			}
+		}
+		if (changed) {
+			cpumask_set_cpu(cpu, changes);
+			nr_cpus++;
+		}
+	}
+
+	return nr_cpus;
+}
+
+/*
+ * 6 64-bit registers unpacked into 12 32-bit associativity values. To form
+ * the complete property we have to add the length in the first cell.
+ */
+#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1)
+
+/*
+ * Convert the associativity domain numbers returned from the hypervisor
+ * to the sequence they would appear in the ibm,associativity property.
+ */
+static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
+{
+	int i, nr_assoc_doms = 0;
+	const u16 *field = (const u16*) packed;
+
+#define VPHN_FIELD_UNUSED	(0xffff)
+#define VPHN_FIELD_MSB		(0x8000)
+#define VPHN_FIELD_MASK		(~VPHN_FIELD_MSB)
+
+	for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
+		if (*field == VPHN_FIELD_UNUSED) {
+			/* All significant fields processed, and remaining
+			 * fields contain the reserved value of all 1's.
+			 * Just store them.
+			 */
+			unpacked[i] = *((u32*)field);
+			field += 2;
+		} else if (*field & VPHN_FIELD_MSB) {
+			/* Data is in the lower 15 bits of this field */
+			unpacked[i] = *field & VPHN_FIELD_MASK;
+			field++;
+			nr_assoc_doms++;
+		} else {
+			/* Data is in the lower 15 bits of this field
+			 * concatenated with the next 16 bit field
+			 */
+			unpacked[i] = *((u32*)field);
+			field += 2;
+			nr_assoc_doms++;
+		}
+	}
+
+	/* The first cell contains the length of the property */
+	unpacked[0] = nr_assoc_doms;
+
+	return nr_assoc_doms;
+}
+
+/*
+ * Retrieve the new associativity information for a virtual processor's
+ * home node.
+ */
+static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
+{
+	long rc;
+	long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
+	u64 flags = 1;
+	int hwcpu = get_hard_smp_processor_id(cpu);
+
+	rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
+	vphn_unpack_associativity(retbuf, associativity);
+
+	return rc;
+}
+
+static long vphn_get_associativity(unsigned long cpu,
+					unsigned int *associativity)
+{
+	long rc;
+
+	rc = hcall_vphn(cpu, associativity);
+
+	switch (rc) {
+	case H_FUNCTION:
+		printk(KERN_INFO
+			"VPHN is not supported. Disabling polling...\n");
+		stop_topology_update();
+		break;
+	case H_HARDWARE:
+		printk(KERN_ERR
+			"hcall_vphn() experienced a hardware fault "
+			"preventing VPHN. Disabling polling...\n");
+		stop_topology_update();
+	}
+
+	return rc;
+}
+
+/*
+ * Update the node maps and sysfs entries for each cpu whose home node
+ * has changed.
+ */
+int arch_update_cpu_topology(void)
+{
+	int cpu, nid, old_nid;
+	unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
+	struct sys_device *sysdev;
+
+	for_each_cpu(cpu,&cpu_associativity_changes_mask) {
+		vphn_get_associativity(cpu, associativity);
+		nid = associativity_to_nid(associativity);
+
+		if (nid < 0 || !node_online(nid))
+			nid = first_online_node;
+
+		old_nid = numa_cpu_lookup_table[cpu];
+
+		/* Disable hotplug while we update the cpu
+		 * masks and sysfs.
+		 */
+		get_online_cpus();
+		unregister_cpu_under_node(cpu, old_nid);
+		unmap_cpu_from_node(cpu);
+		map_cpu_to_node(cpu, nid);
+		register_cpu_under_node(cpu, nid);
+		put_online_cpus();
+
+		sysdev = get_cpu_sysdev(cpu);
+		if (sysdev)
+			kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
+	}
+
+	return 1;
+}
+
+static void topology_work_fn(struct work_struct *work)
+{
+	rebuild_sched_domains();
+}
+static DECLARE_WORK(topology_work, topology_work_fn);
+
+void topology_schedule_update(void)
+{
+	schedule_work(&topology_work);
+}
+
+static void topology_timer_fn(unsigned long ignored)
+{
+	if (!vphn_enabled)
+		return;
+	if (update_cpu_associativity_changes_mask() > 0)
+		topology_schedule_update();
+	set_topology_timer();
+}
+static struct timer_list topology_timer =
+	TIMER_INITIALIZER(topology_timer_fn, 0, 0);
+
+static void set_topology_timer(void)
+{
+	topology_timer.data = 0;
+	topology_timer.expires = jiffies + 60 * HZ;
+	add_timer(&topology_timer);
+}
+
+/*
+ * Start polling for VPHN associativity changes.
+ */
+int start_topology_update(void)
+{
+	int rc = 0;
+
+	/* Disabled until races with load balancing are fixed */
+	if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
+	    get_lppaca()->shared_proc) {
+		vphn_enabled = 1;
+		setup_cpu_associativity_change_counters();
+		init_timer_deferrable(&topology_timer);
+		set_topology_timer();
+		rc = 1;
+	}
+
+	return rc;
+}
+__initcall(start_topology_update);
+
+/*
+ * Disable polling for VPHN associativity changes.
+ */
+int stop_topology_update(void)
+{
+	vphn_enabled = 0;
+	return del_timer_sync(&topology_timer);
+}
+#endif /* CONFIG_PPC_SPLPAR */
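
To get a concrete feel for the unpacking rules in vphn_unpack_associativity(), here is a small standalone sketch (plain userspace C, invented sample data). It operates on an already-split array of 16-bit fields, so no endianness handling is needed, and unlike the kernel routine it skips the reserved 0xffff fields instead of copying them.

	#include <stdio.h>
	#include <stdint.h>

	#define FIELD_UNUSED	0xffff
	#define FIELD_MSB	0x8000
	#define FIELD_MASK	(~FIELD_MSB & 0xffff)

	int main(void)
	{
		/* two single-field domains (MSB set), one two-field domain, then unused */
		uint16_t packed[] = { 0x8002, 0x8001, 0x0001, 0x0004,
				      FIELD_UNUSED, FIELD_UNUSED };
		uint32_t unpacked[8] = { 0 };
		const uint16_t *field = packed;
		const uint16_t *end = packed + sizeof(packed) / sizeof(packed[0]);
		int i = 1, nr_doms = 0;

		while (field < end) {
			if (*field == FIELD_UNUSED) {
				field += 2;	/* reserved all-ones pair, nothing to record */
			} else if (*field & FIELD_MSB) {
				unpacked[i++] = *field & FIELD_MASK;	/* 15-bit domain number */
				field++;
				nr_doms++;
			} else {
				/* 15 bits concatenated with the following 16-bit field */
				unpacked[i++] = ((uint32_t)*field << 16) | field[1];
				field += 2;
				nr_doms++;
			}
		}
		unpacked[0] = nr_doms;	/* first cell holds the number of domains */

		for (i = 0; i <= nr_doms; i++)
			printf("unpacked[%d] = 0x%x\n", i, unpacked[i]);
		/* prints 0x3, 0x2, 0x1, 0x10004 */
		return 0;
	}
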