diff options
author | Christoph Lameter <clameter@sgi.com> | 2007-10-16 04:25:27 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-16 12:42:58 -0400 |
commit | 13808910713a98cc1159291e62cdfec92cc94d05 (patch) | |
tree | 0fd7189dc2a76e1ae165ca5d6e8c6b4e6f1761af | |
parent | 55144768e100b68447f44c5e5c9deb155ad661bd (diff) |
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes?
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> For Fujitsu, the problem is called the "empty" node.
>
> When ACPI's SRAT table includes "possible nodes", the ia64 bootstrap (acpi_numa_init)
> creates nodes that contain no memory and no CPU.
>
> I tried to remove empty nodes in the past, but that was rejected.
> It was because we can hot-add a CPU to an empty node.
> (Node hotplug triggered by a CPU is not implemented now, and it will be ugly.)
>
>
> For HP (Lee can comment on this later), they have memoryless nodes.
> As far as I have heard, HP's machines can have the following configuration.
>
> (example)
> Node0: CPU0 memory AAA MB
> Node1: CPU1 memory AAA MB
> Node2: CPU2 memory AAA MB
> Node3: CPU3 memory AAA MB
> Node4: Memory XXX GB
>
> AAA is a very small value (below 16MB) and will be omitted by the ia64 bootstrap.
> After boot, only Node 4 has valid memory (but it has no CPU).
>
> Maybe this is memory interleaving set up by the firmware configuration.
Christoph Lameter <clameter@sgi.com> wrote:
> Future SGI platforms (current ones can too, but to my knowledge nothing like
> that is deployed) have nodes with only CPUs. Current SGI
> platforms have nodes with just I/O that we so far cannot manage in the
> core, so the arch code maps them to the nearest memory node.
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> For the HP platforms, we can configure each cell with from 0% to 100%
> "cell local memory". When we configure with <100% CLM, the "missing
> percentages" are interleaved by hardware on a cache-line granularity to
> improve bandwidth at the expense of latency for numa-challenged
> applications [and OSes, but not our problem ;-)]. When we boot Linux on
> such a config, all of the real nodes have no memory--it all resides in a
> single interleaved pseudo-node.
>
> When we boot Linux on a 100% CLM configuration [== NUMA], we still have
> the interleaved pseudo-node. It contains a few hundred MB stolen from
> the real nodes to contain the DMA zone. [Interleaved memory resides at
> phys addr 0]. The memoryless-nodes patches, along with the zoneorder
> patches, support this config as well.
>
> Also, when we boot a NUMA config with the "mem=" command line,
> specifying less memory than actually exists, Linux takes the excluded
> memory "off the top" rather than distributing it across the nodes. This
> can result in memoryless nodes, as well.
>
This patch:
Preparation for memoryless node patches.
Provide a generic way to keep nodemasks describing various characteristics of
NUMA nodes.
Remove the node_online_map and the node_possible_map and realize the same
functionality using two node states: N_POSSIBLE and N_ONLINE.
[Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Bob Picco <bob.picco@hp.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/nodemask.h | 87 | ||||
-rw-r--r-- | mm/page_alloc.c | 20 |
2 files changed, 85 insertions, 22 deletions
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 52c54a5720f3..583e6b843d2a 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h | |||
@@ -338,31 +338,81 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, | |||
338 | #endif /* MAX_NUMNODES */ | 338 | #endif /* MAX_NUMNODES */ |
339 | 339 | ||
340 | /* | 340 | /* |
341 | * Bitmasks that are kept for all the nodes. | ||
342 | */ | ||
343 | enum node_states { | ||
344 | N_POSSIBLE, /* The node could become online at some point */ | ||
345 | N_ONLINE, /* The node is online */ | ||
346 | NR_NODE_STATES | ||
347 | }; | ||
348 | |||
349 | /* | ||
341 | * The following particular system nodemasks and operations | 350 | * The following particular system nodemasks and operations |
342 | * on them manage all possible and online nodes. | 351 | * on them manage all possible and online nodes. |
343 | */ | 352 | */ |
344 | 353 | ||
345 | extern nodemask_t node_online_map; | 354 | extern nodemask_t node_states[NR_NODE_STATES]; |
346 | extern nodemask_t node_possible_map; | ||
347 | 355 | ||
348 | #if MAX_NUMNODES > 1 | 356 | #if MAX_NUMNODES > 1 |
349 | #define num_online_nodes() nodes_weight(node_online_map) | 357 | static inline int node_state(int node, enum node_states state) |
350 | #define num_possible_nodes() nodes_weight(node_possible_map) | 358 | { |
351 | #define node_online(node) node_isset((node), node_online_map) | 359 | return node_isset(node, node_states[state]); |
352 | #define node_possible(node) node_isset((node), node_possible_map) | 360 | } |
353 | #define first_online_node first_node(node_online_map) | 361 | |
354 | #define next_online_node(nid) next_node((nid), node_online_map) | 362 | static inline void node_set_state(int node, enum node_states state) |
363 | { | ||
364 | __node_set(node, &node_states[state]); | ||
365 | } | ||
366 | |||
367 | static inline void node_clear_state(int node, enum node_states state) | ||
368 | { | ||
369 | __node_clear(node, &node_states[state]); | ||
370 | } | ||
371 | |||
372 | static inline int num_node_state(enum node_states state) | ||
373 | { | ||
374 | return nodes_weight(node_states[state]); | ||
375 | } | ||
376 | |||
377 | #define for_each_node_state(__node, __state) \ | ||
378 | for_each_node_mask((__node), node_states[__state]) | ||
379 | |||
380 | #define first_online_node first_node(node_states[N_ONLINE]) | ||
381 | #define next_online_node(nid) next_node((nid), node_states[N_ONLINE]) | ||
382 | |||
355 | extern int nr_node_ids; | 383 | extern int nr_node_ids; |
356 | #else | 384 | #else |
357 | #define num_online_nodes() 1 | 385 | |
358 | #define num_possible_nodes() 1 | 386 | static inline int node_state(int node, enum node_states state) |
359 | #define node_online(node) ((node) == 0) | 387 | { |
360 | #define node_possible(node) ((node) == 0) | 388 | return node == 0; |
389 | } | ||
390 | |||
391 | static inline void node_set_state(int node, enum node_states state) | ||
392 | { | ||
393 | } | ||
394 | |||
395 | static inline void node_clear_state(int node, enum node_states state) | ||
396 | { | ||
397 | } | ||
398 | |||
399 | static inline int num_node_state(enum node_states state) | ||
400 | { | ||
401 | return 1; | ||
402 | } | ||
403 | |||
404 | #define for_each_node_state(node, __state) \ | ||
405 | for ( (node) = 0; (node) == 0; (node) = 1) | ||
406 | |||
361 | #define first_online_node 0 | 407 | #define first_online_node 0 |
362 | #define next_online_node(nid) (MAX_NUMNODES) | 408 | #define next_online_node(nid) (MAX_NUMNODES) |
363 | #define nr_node_ids 1 | 409 | #define nr_node_ids 1 |
410 | |||
364 | #endif | 411 | #endif |
365 | 412 | ||
413 | #define node_online_map node_states[N_ONLINE] | ||
414 | #define node_possible_map node_states[N_POSSIBLE] | ||
415 | |||
366 | #define any_online_node(mask) \ | 416 | #define any_online_node(mask) \ |
367 | ({ \ | 417 | ({ \ |
368 | int node; \ | 418 | int node; \ |
@@ -372,10 +422,15 @@ extern int nr_node_ids; | |||
372 | node; \ | 422 | node; \ |
373 | }) | 423 | }) |
374 | 424 | ||
375 | #define node_set_online(node) set_bit((node), node_online_map.bits) | 425 | #define num_online_nodes() num_node_state(N_ONLINE) |
376 | #define node_set_offline(node) clear_bit((node), node_online_map.bits) | 426 | #define num_possible_nodes() num_node_state(N_POSSIBLE) |
427 | #define node_online(node) node_state((node), N_ONLINE) | ||
428 | #define node_possible(node) node_state((node), N_POSSIBLE) | ||
429 | |||
430 | #define node_set_online(node) node_set_state((node), N_ONLINE) | ||
431 | #define node_set_offline(node) node_clear_state((node), N_ONLINE) | ||
377 | 432 | ||
378 | #define for_each_node(node) for_each_node_mask((node), node_possible_map) | 433 | #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) |
379 | #define for_each_online_node(node) for_each_node_mask((node), node_online_map) | 434 | #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) |
380 | 435 | ||
381 | #endif /* __LINUX_NODEMASK_H */ | 436 | #endif /* __LINUX_NODEMASK_H */ |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 71013e6bef25..0cc5b3e198e5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -47,13 +47,21 @@ | |||
47 | #include "internal.h" | 47 | #include "internal.h" |
48 | 48 | ||
49 | /* | 49 | /* |
50 | * MCD - HACK: Find somewhere to initialize this EARLY, or make this | 50 | * Array of node states. |
51 | * initializer cleaner | ||
52 | */ | 51 | */ |
53 | nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; | 52 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { |
54 | EXPORT_SYMBOL(node_online_map); | 53 | [N_POSSIBLE] = NODE_MASK_ALL, |
55 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 54 | [N_ONLINE] = { { [0] = 1UL } }, |
56 | EXPORT_SYMBOL(node_possible_map); | 55 | #ifndef CONFIG_NUMA |
56 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, | ||
57 | #ifdef CONFIG_HIGHMEM | ||
58 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | ||
59 | #endif | ||
60 | [N_CPU] = { { [0] = 1UL } }, | ||
61 | #endif /* NUMA */ | ||
62 | }; | ||
63 | EXPORT_SYMBOL(node_states); | ||
64 | |||
57 | unsigned long totalram_pages __read_mostly; | 65 | unsigned long totalram_pages __read_mostly; |
58 | unsigned long totalreserve_pages __read_mostly; | 66 | unsigned long totalreserve_pages __read_mostly; |
59 | long nr_swap_pages; | 67 | long nr_swap_pages; |