aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Christoph Lameter <clameter@sgi.com> 2007-10-16 04:25:27 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-16 12:42:58 -0400
commit: 13808910713a98cc1159291e62cdfec92cc94d05 (patch)
tree: 0fd7189dc2a76e1ae165ca5d6e8c6b4e6f1761af
parent: 55144768e100b68447f44c5e5c9deb155ad661bd (diff)
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes? KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote: > For fujitsu, problem is called "empty" node. > > When ACPI's SRAT table includes "possible nodes", ia64 bootstrap(acpi_numa_init) > creates nodes, which includes no memory, no cpu. > > I tried to remove empty-node in past, but that was denied. > It was because we can hot-add cpu to the empty node. > (node-hotplug triggered by cpu is not implemented now. and it will be ugly.) > > > For HP, (Lee can comment on this later), they have memory-less-node. > As far as I hear, HP's machine can have following configuration. > > (example) > Node0: CPU0 memory AAA MB > Node1: CPU1 memory AAA MB > Node2: CPU2 memory AAA MB > Node3: CPU3 memory AAA MB > Node4: Memory XXX GB > > AAA is very small value (below 16MB) and will be omitted by ia64 bootstrap. > After boot, only Node 4 has valid memory (but have no cpu.) > > Maybe this is memory-interleave by firmware config. Christoph Lameter <clameter@sgi.com> wrote: > Future SGI platforms (actually also current one can have but nothing like > that is deployed to my knowledge) have nodes with only cpus. Current SGI > platforms have nodes with just I/O that we so far cannot manage in the > core. So the arch code maps them to the nearest memory node. Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote: > For the HP platforms, we can configure each cell with from 0% to 100% > "cell local memory". When we configure with <100% CLM, the "missing > percentages" are interleaved by hardware on a cache-line granularity to > improve bandwidth at the expense of latency for numa-challenged > applications [and OSes, but not our problem ;-)]. When we boot Linux on > such a config, all of the real nodes have no memory--it all resides in a > single interleaved pseudo-node. > > When we boot Linux on a 100% CLM configuration [== NUMA], we still have > the interleaved pseudo-node. 
It contains a few hundred MB stolen from > the real nodes to contain the DMA zone. [Interleaved memory resides at > phys addr 0]. The memoryless-nodes patches, along with the zoneorder > patches, support this config as well. > > Also, when we boot a NUMA config with the "mem=" command line, > specifying less memory than actually exists, Linux takes the excluded > memory "off the top" rather than distributing it across the nodes. This > can result in memoryless nodes, as well. > This patch: Preparation for memoryless node patches. Provide a generic way to keep nodemasks describing various characteristics of NUMA nodes. Remove the node_online_map and the node_possible_map and realize the same functionality using two node states: N_POSSIBLE and N_ONLINE. [Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config] Signed-off-by: Christoph Lameter <clameter@sgi.com> Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Bob Picco <bob.picco@hp.com> Cc: Nishanth Aravamudan <nacc@us.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Mel Gorman <mel@skynet.ie> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: "Serge E. Hallyn" <serge@hallyn.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/nodemask.h 87
-rw-r--r-- mm/page_alloc.c 20
2 files changed, 85 insertions, 22 deletions
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 52c54a5720f3..583e6b843d2a 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -338,31 +338,81 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
338#endif /* MAX_NUMNODES */ 338#endif /* MAX_NUMNODES */
339 339
340/* 340/*
341 * Bitmasks that are kept for all the nodes.
342 */
343enum node_states {
344 N_POSSIBLE, /* The node could become online at some point */
345 N_ONLINE, /* The node is online */
346 NR_NODE_STATES
347};
348
349/*
341 * The following particular system nodemasks and operations 350 * The following particular system nodemasks and operations
342 * on them manage all possible and online nodes. 351 * on them manage all possible and online nodes.
343 */ 352 */
344 353
345extern nodemask_t node_online_map; 354extern nodemask_t node_states[NR_NODE_STATES];
346extern nodemask_t node_possible_map;
347 355
348#if MAX_NUMNODES > 1 356#if MAX_NUMNODES > 1
349#define num_online_nodes() nodes_weight(node_online_map) 357static inline int node_state(int node, enum node_states state)
350#define num_possible_nodes() nodes_weight(node_possible_map) 358{
351#define node_online(node) node_isset((node), node_online_map) 359 return node_isset(node, node_states[state]);
352#define node_possible(node) node_isset((node), node_possible_map) 360}
353#define first_online_node first_node(node_online_map) 361
354#define next_online_node(nid) next_node((nid), node_online_map) 362static inline void node_set_state(int node, enum node_states state)
363{
364 __node_set(node, &node_states[state]);
365}
366
367static inline void node_clear_state(int node, enum node_states state)
368{
369 __node_clear(node, &node_states[state]);
370}
371
372static inline int num_node_state(enum node_states state)
373{
374 return nodes_weight(node_states[state]);
375}
376
377#define for_each_node_state(__node, __state) \
378 for_each_node_mask((__node), node_states[__state])
379
380#define first_online_node first_node(node_states[N_ONLINE])
381#define next_online_node(nid) next_node((nid), node_states[N_ONLINE])
382
355extern int nr_node_ids; 383extern int nr_node_ids;
356#else 384#else
357#define num_online_nodes() 1 385
358#define num_possible_nodes() 1 386static inline int node_state(int node, enum node_states state)
359#define node_online(node) ((node) == 0) 387{
360#define node_possible(node) ((node) == 0) 388 return node == 0;
389}
390
391static inline void node_set_state(int node, enum node_states state)
392{
393}
394
395static inline void node_clear_state(int node, enum node_states state)
396{
397}
398
399static inline int num_node_state(enum node_states state)
400{
401 return 1;
402}
403
404#define for_each_node_state(node, __state) \
405 for ( (node) = 0; (node) == 0; (node) = 1)
406
361#define first_online_node 0 407#define first_online_node 0
362#define next_online_node(nid) (MAX_NUMNODES) 408#define next_online_node(nid) (MAX_NUMNODES)
363#define nr_node_ids 1 409#define nr_node_ids 1
410
364#endif 411#endif
365 412
413#define node_online_map node_states[N_ONLINE]
414#define node_possible_map node_states[N_POSSIBLE]
415
366#define any_online_node(mask) \ 416#define any_online_node(mask) \
367({ \ 417({ \
368 int node; \ 418 int node; \
@@ -372,10 +422,15 @@ extern int nr_node_ids;
372 node; \ 422 node; \
373}) 423})
374 424
375#define node_set_online(node) set_bit((node), node_online_map.bits) 425#define num_online_nodes() num_node_state(N_ONLINE)
376#define node_set_offline(node) clear_bit((node), node_online_map.bits) 426#define num_possible_nodes() num_node_state(N_POSSIBLE)
427#define node_online(node) node_state((node), N_ONLINE)
428#define node_possible(node) node_state((node), N_POSSIBLE)
429
430#define node_set_online(node) node_set_state((node), N_ONLINE)
431#define node_set_offline(node) node_clear_state((node), N_ONLINE)
377 432
378#define for_each_node(node) for_each_node_mask((node), node_possible_map) 433#define for_each_node(node) for_each_node_state(node, N_POSSIBLE)
379#define for_each_online_node(node) for_each_node_mask((node), node_online_map) 434#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
380 435
381#endif /* __LINUX_NODEMASK_H */ 436#endif /* __LINUX_NODEMASK_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 71013e6bef25..0cc5b3e198e5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -47,13 +47,21 @@
47#include "internal.h" 47#include "internal.h"
48 48
49/* 49/*
50 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 50 * Array of node states.
51 * initializer cleaner
52 */ 51 */
53nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 52nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
54EXPORT_SYMBOL(node_online_map); 53 [N_POSSIBLE] = NODE_MASK_ALL,
55nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 54 [N_ONLINE] = { { [0] = 1UL } },
56EXPORT_SYMBOL(node_possible_map); 55#ifndef CONFIG_NUMA
56 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
57#ifdef CONFIG_HIGHMEM
58 [N_HIGH_MEMORY] = { { [0] = 1UL } },
59#endif
60 [N_CPU] = { { [0] = 1UL } },
61#endif /* NUMA */
62};
63EXPORT_SYMBOL(node_states);
64
57unsigned long totalram_pages __read_mostly; 65unsigned long totalram_pages __read_mostly;
58unsigned long totalreserve_pages __read_mostly; 66unsigned long totalreserve_pages __read_mostly;
59long nr_swap_pages; 67long nr_swap_pages;