path: root/arch/ia64/mm/discontig.c
author     bob.picco <bob.picco@hp.com>  2005-06-30 12:52:00 -0400
committer  Tony Luck <tony.luck@intel.com>  2005-07-06 18:45:30 -0400
commit     564601a5d12f93fdde04c6bc5b097b95e7752a46 (patch)
tree       7ecd89b4eea6c626eb0726a5f7cf16e6e0f93b6e /arch/ia64/mm/discontig.c
parent     af25e94d4dcfb9608846242fabdd4e6014e5c9f0 (diff)
[IA64] memory-less-nodes repost

I reworked how nodes with only CPUs are treated.  The patch below seems
simpler to me and eliminates the complicated routine
reassign_cpu_only_nodes.  There is no longer any requirement to modify
ACPI NUMA information, which accounted for much of the complexity
introduced by reassign_cpu_only_nodes.

This patch will produce a different number of nodes.  For example,
reassign_cpu_only_nodes would reduce a configuration of two CPU-only
nodes and one memory node to a single memory+CPUs node.  This patch
doesn't change the number of nodes, which means the user will see
three: two nodes without memory and one node with all the memory.

While doing this patch, I noticed that early_nr_phys_cpus_node isn't
serving any useful purpose.  It is called once in find_pernode_space,
but the value isn't used to compute pernode space.

Signed-off-by: bob.picco <bob.picco@hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
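As a rough illustration of the "closest node with memory" selection that the new memory_less_node_alloc() below performs, here is a minimal, standalone user-space sketch.  The node count, the SLIT-style distance table and the has_memory[] flags are invented example data; in the kernel the real values come from the ACPI SLIT and the EFI memory map, and the allocation itself goes through __alloc_bootmem_node().

/*
 * Illustration only -- not part of the patch.  The node count, distance
 * table and has_memory[] flags below are made-up example data.
 */
#include <stdio.h>

#define NR_NODES 3

/* SLIT-style distances: slit[i][j] is the distance from node i to node j. */
static const unsigned char slit[NR_NODES][NR_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

/* In this example only node 2 has memory; nodes 0 and 1 are CPU-only. */
static const int has_memory[NR_NODES] = { 0, 0, 1 };

/* Return the node with memory closest to nid, or -1 if there is none. */
static int nearest_memory_node(int nid)
{
	unsigned char best = 0xff;
	int node, bestnode = -1;

	for (node = 0; node < NR_NODES; node++) {
		if (!has_memory[node])
			continue;
		if (slit[nid][node] < best) {
			best = slit[nid][node];
			bestnode = node;
		}
	}
	return bestnode;
}

int main(void)
{
	int nid;

	for (nid = 0; nid < NR_NODES; nid++)
		if (!has_memory[nid])
			printf("node %d is memory-less; pernode data would go to node %d\n",
			       nid, nearest_memory_node(nid));
	return 0;
}

With the example table above, both CPU-only nodes 0 and 1 end up borrowing space from node 2, which is exactly the three-node outcome described earlier.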
Diffstat (limited to 'arch/ia64/mm/discontig.c')
-rw-r--r--  arch/ia64/mm/discontig.c  |  394
1 file changed, 167 insertions(+), 227 deletions(-)
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index f3fd528ead3b..54136fd00202 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -44,150 +44,7 @@ struct early_node_data {
 };
 
 static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
-
-/**
- * reassign_cpu_only_nodes - called from find_memory to move CPU-only nodes to a memory node
- *
- * This function will move nodes with only CPUs (no memory)
- * to a node with memory which is at the minimum numa_slit distance.
- * Any reassigments will result in the compression of the nodes
- * and renumbering the nid values where appropriate.
- * The static declarations below are to avoid large stack size which
- * makes the code not re-entrant.
- */
-static void __init reassign_cpu_only_nodes(void)
-{
-	struct node_memblk_s *p;
-	int i, j, k, nnode, nid, cpu, cpunid, pxm;
-	u8 cslit, slit;
-	static DECLARE_BITMAP(nodes_with_mem, MAX_NUMNODES) __initdata;
-	static u8 numa_slit_fix[MAX_NUMNODES * MAX_NUMNODES] __initdata;
-	static int node_flip[MAX_NUMNODES] __initdata;
-	static int old_nid_map[NR_CPUS] __initdata;
-
-	for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-		if (!test_bit(p->nid, (void *) nodes_with_mem)) {
-			set_bit(p->nid, (void *) nodes_with_mem);
-			nnode++;
-		}
-
-	/*
-	 * All nids with memory.
-	 */
-	if (nnode == num_online_nodes())
-		return;
-
-	/*
-	 * Change nids and attempt to migrate CPU-only nodes
-	 * to the best numa_slit (closest neighbor) possible.
-	 * For reassigned CPU nodes a nid can't be arrived at
-	 * until after this loop because the target nid's new
-	 * identity might not have been established yet. So
-	 * new nid values are fabricated above num_online_nodes() and
-	 * mapped back later to their true value.
-	 */
-	/* MCD - This code is a bit complicated, but may be unnecessary now.
-	 * We can now handle much more interesting node-numbering.
-	 * The old requirement that 0 <= nid <= numnodes <= MAX_NUMNODES
-	 * and that there be no holes in the numbering 0..numnodes
-	 * has become simply 0 <= nid <= MAX_NUMNODES.
-	 */
-	nid = 0;
-	for_each_online_node(i) {
-		if (test_bit(i, (void *) nodes_with_mem)) {
-			/*
-			 * Save original nid value for numa_slit
-			 * fixup and node_cpuid reassignments.
-			 */
-			node_flip[nid] = i;
-
-			if (i == nid) {
-				nid++;
-				continue;
-			}
-
-			for (p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-				if (p->nid == i)
-					p->nid = nid;
-
-			cpunid = nid;
-			nid++;
-		} else
-			cpunid = MAX_NUMNODES;
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			if (node_cpuid[cpu].nid == i) {
-				/*
-				 * For nodes not being reassigned just
-				 * fix the cpu's nid and reverse pxm map
-				 */
-				if (cpunid < MAX_NUMNODES) {
-					pxm = nid_to_pxm_map[i];
-					pxm_to_nid_map[pxm] =
-						node_cpuid[cpu].nid = cpunid;
-					continue;
-				}
-
-				/*
-				 * For nodes being reassigned, find best node by
-				 * numa_slit information and then make a temporary
-				 * nid value based on current nid and num_online_nodes().
-				 */
-				slit = 0xff;
-				k = 2*num_online_nodes();
-				for_each_online_node(j) {
-					if (i == j)
-						continue;
-					else if (test_bit(j, (void *) nodes_with_mem)) {
-						cslit = numa_slit[i * num_online_nodes() + j];
-						if (cslit < slit) {
-							k = num_online_nodes() + j;
-							slit = cslit;
-						}
-					}
-				}
-
-				/* save old nid map so we can update the pxm */
-				old_nid_map[cpu] = node_cpuid[cpu].nid;
-				node_cpuid[cpu].nid = k;
-			}
-	}
-
-	/*
-	 * Fixup temporary nid values for CPU-only nodes.
-	 */
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		if (node_cpuid[cpu].nid == (2*num_online_nodes())) {
-			pxm = nid_to_pxm_map[old_nid_map[cpu]];
-			pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = nnode - 1;
-		} else {
-			for (i = 0; i < nnode; i++) {
-				if (node_flip[i] != (node_cpuid[cpu].nid - num_online_nodes()))
-					continue;
-
-				pxm = nid_to_pxm_map[old_nid_map[cpu]];
-				pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = i;
-				break;
-			}
-		}
-
-	/*
-	 * Fix numa_slit by compressing from larger
-	 * nid array to reduced nid array.
-	 */
-	for (i = 0; i < nnode; i++)
-		for (j = 0; j < nnode; j++)
-			numa_slit_fix[i * nnode + j] =
-				numa_slit[node_flip[i] * num_online_nodes() + node_flip[j]];
-
-	memcpy(numa_slit, numa_slit_fix, sizeof (numa_slit));
-
-	nodes_clear(node_online_map);
-	for (i = 0; i < nnode; i++)
-		node_set_online(i);
-
-	return;
-}
+static nodemask_t memory_less_mask __initdata;
 
 /*
  * To prevent cache aliasing effects, align per-node structures so that they
@@ -233,46 +90,88 @@ static int __init build_node_maps(unsigned long start, unsigned long len,
 }
 
 /**
- * early_nr_phys_cpus_node - return number of physical cpus on a given node
+ * early_nr_cpus_node - return number of cpus on a given node
  * @node: node to check
  *
- * Count the number of physical cpus on @node. These are cpus that actually
- * exist. We can't use nr_cpus_node() yet because
+ * Count the number of cpus on @node. We can't use nr_cpus_node() yet because
  * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet.
+ * called yet. Note that node 0 will also count all non-existent cpus.
  */
-static int early_nr_phys_cpus_node(int node)
+static int __init early_nr_cpus_node(int node)
 {
 	int cpu, n = 0;
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
 		if (node == node_cpuid[cpu].nid)
-			if ((cpu == 0) || node_cpuid[cpu].phys_id)
-				n++;
+			n++;
 
 	return n;
 }
 
+/**
+ * compute_pernodesize - compute size of pernode data
+ * @node: the node id.
+ */
+static unsigned long __init compute_pernodesize(int node)
+{
+	unsigned long pernodesize = 0, cpus;
+
+	cpus = early_nr_cpus_node(node);
+	pernodesize += PERCPU_PAGE_SIZE * cpus;
+	pernodesize += node * L1_CACHE_BYTES;
+	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
+	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+	pernodesize = PAGE_ALIGN(pernodesize);
+	return pernodesize;
+}
 
 /**
- * early_nr_cpus_node - return number of cpus on a given node
- * @node: node to check
- *
- * Count the number of cpus on @node. We can't use nr_cpus_node() yet because
- * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet. Note that node 0 will also count all non-existent cpus.
+ * fill_pernode - initialize pernode data.
+ * @node: the node id.
+ * @pernode: physical address of pernode data
+ * @pernodesize: size of the pernode data
  */
-static int early_nr_cpus_node(int node)
+static void __init fill_pernode(int node, unsigned long pernode,
+	unsigned long pernodesize)
 {
-	int cpu, n = 0;
+	void *cpu_data;
+	int cpus = early_nr_cpus_node(node), cpu;
+	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		if (node == node_cpuid[cpu].nid)
-			n++;
+	mem_data[node].pernode_addr = pernode;
+	mem_data[node].pernode_size = pernodesize;
+	memset(__va(pernode), 0, pernodesize);
 
-	return n;
-}
+	cpu_data = (void *)pernode;
+	pernode += PERCPU_PAGE_SIZE * cpus;
+	pernode += node * L1_CACHE_BYTES;
+
+	mem_data[node].pgdat = __va(pernode);
+	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+	mem_data[node].node_data = __va(pernode);
+	pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+
+	mem_data[node].pgdat->bdata = bdp;
+	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+	/*
+	 * Copy the static per-cpu data into the region we
+	 * just set aside and then setup __per_cpu_offset
+	 * for each CPU on this node.
+	 */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (node == node_cpuid[cpu].nid) {
+			memcpy(__va(cpu_data), __phys_per_cpu_start,
+			       __per_cpu_end - __per_cpu_start);
+			__per_cpu_offset[cpu] = (char*)__va(cpu_data) -
+				__per_cpu_start;
+			cpu_data += PERCPU_PAGE_SIZE;
+		}
+	}
 
+	return;
+}
 /**
  * find_pernode_space - allocate memory for memory map and per-node structures
  * @start: physical start of range
@@ -304,9 +203,8 @@ static int early_nr_cpus_node(int node)
 static int __init find_pernode_space(unsigned long start, unsigned long len,
 				     int node)
 {
-	unsigned long epfn, cpu, cpus, phys_cpus;
+	unsigned long epfn;
 	unsigned long pernodesize = 0, pernode, pages, mapsize;
-	void *cpu_data;
 	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
 
 	epfn = (start + len) >> PAGE_SHIFT;
@@ -329,49 +227,12 @@ static int __init find_pernode_space(unsigned long start, unsigned long len,
 	 * Calculate total size needed, incl. what's necessary
 	 * for good alignment and alias prevention.
 	 */
-	cpus = early_nr_cpus_node(node);
-	phys_cpus = early_nr_phys_cpus_node(node);
-	pernodesize += PERCPU_PAGE_SIZE * cpus;
-	pernodesize += node * L1_CACHE_BYTES;
-	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
-	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-	pernodesize = PAGE_ALIGN(pernodesize);
+	pernodesize = compute_pernodesize(node);
 	pernode = NODEDATA_ALIGN(start, node);
 
 	/* Is this range big enough for what we want to store here? */
-	if (start + len > (pernode + pernodesize + mapsize)) {
-		mem_data[node].pernode_addr = pernode;
-		mem_data[node].pernode_size = pernodesize;
-		memset(__va(pernode), 0, pernodesize);
-
-		cpu_data = (void *)pernode;
-		pernode += PERCPU_PAGE_SIZE * cpus;
-		pernode += node * L1_CACHE_BYTES;
-
-		mem_data[node].pgdat = __va(pernode);
-		pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-		mem_data[node].node_data = __va(pernode);
-		pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-
-		mem_data[node].pgdat->bdata = bdp;
-		pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-		/*
-		 * Copy the static per-cpu data into the region we
-		 * just set aside and then setup __per_cpu_offset
-		 * for each CPU on this node.
-		 */
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			if (node == node_cpuid[cpu].nid) {
-				memcpy(__va(cpu_data), __phys_per_cpu_start,
-				       __per_cpu_end - __per_cpu_start);
-				__per_cpu_offset[cpu] = (char*)__va(cpu_data) -
-					__per_cpu_start;
-				cpu_data += PERCPU_PAGE_SIZE;
-			}
-		}
-	}
+	if (start + len > (pernode + pernodesize + mapsize))
+		fill_pernode(node, pernode, pernodesize);
 
 	return 0;
 }
@@ -411,6 +272,9 @@ static void __init reserve_pernode_space(void)
 	for_each_online_node(node) {
 		pg_data_t *pdp = mem_data[node].pgdat;
 
+		if (node_isset(node, memory_less_mask))
+			continue;
+
 		bdp = pdp->bdata;
 
 		/* First the bootmem_map itself */
@@ -456,6 +320,83 @@ static void __init initialize_pernode_data(void)
 }
 
 /**
+ * memory_less_node_alloc - * attempt to allocate memory on the best NUMA slit
+ * node but fall back to any other node when __alloc_bootmem_node fails
+ * for best.
+ * @nid: node id
+ * @pernodesize: size of this node's pernode data
+ * @align: alignment to use for this node's pernode data
+ */
+static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize,
+	unsigned long align)
+{
+	void *ptr = NULL;
+	u8 best = 0xff;
+	int bestnode = -1, node;
+
+	for_each_online_node(node) {
+		if (node_isset(node, memory_less_mask))
+			continue;
+		else if (node_distance(nid, node) < best) {
+			best = node_distance(nid, node);
+			bestnode = node;
+		}
+	}
+
+	ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat,
+		pernodesize, align, __pa(MAX_DMA_ADDRESS));
+
+	if (!ptr)
+		panic("NO memory for memory less node\n");
+	return ptr;
+}
+
+/**
+ * pgdat_insert - insert the pgdat into global pgdat_list
+ * @pgdat: the pgdat for a node.
+ */
+static void __init pgdat_insert(pg_data_t *pgdat)
+{
+	pg_data_t *prev = NULL, *next;
+
+	for_each_pgdat(next)
+		if (pgdat->node_id < next->node_id)
+			break;
+		else
+			prev = next;
+
+	if (prev) {
+		prev->pgdat_next = pgdat;
+		pgdat->pgdat_next = next;
+	} else {
+		pgdat->pgdat_next = pgdat_list;
+		pgdat_list = pgdat;
+	}
+
+	return;
+}
+
+/**
+ * memory_less_nodes - allocate and initialize CPU only nodes pernode
+ * information.
+ */
+static void __init memory_less_nodes(void)
+{
+	unsigned long pernodesize;
+	void *pernode;
+	int node;
+
+	for_each_node_mask(node, memory_less_mask) {
+		pernodesize = compute_pernodesize(node);
+		pernode = memory_less_node_alloc(node, pernodesize,
+			(node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024));
+		fill_pernode(node, __pa(pernode), pernodesize);
+	}
+
+	return;
+}
+
+/**
  * find_memory - walk the EFI memory map and setup the bootmem allocator
  *
  * Called early in boot to setup the bootmem allocator, and to
@@ -472,16 +413,19 @@ void __init find_memory(void)
 		node_set_online(0);
 	}
 
+	nodes_or(memory_less_mask, memory_less_mask, node_online_map);
 	min_low_pfn = -1;
 	max_low_pfn = 0;
 
-	if (num_online_nodes() > 1)
-		reassign_cpu_only_nodes();
-
 	/* These actually end up getting called by call_pernode_memory() */
 	efi_memmap_walk(filter_rsvd_memory, build_node_maps);
 	efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
 
+	for_each_online_node(node)
+		if (mem_data[node].bootmem_data.node_low_pfn) {
+			node_clear(node, memory_less_mask);
+			mem_data[node].min_pfn = ~0UL;
+		}
 	/*
 	 * Initialize the boot memory maps in reverse order since that's
 	 * what the bootmem allocator expects
@@ -492,17 +436,14 @@ void __init find_memory(void)
 
 		if (!node_online(node))
 			continue;
+		else if (node_isset(node, memory_less_mask))
+			continue;
 
 		bdp = &mem_data[node].bootmem_data;
 		pernode = mem_data[node].pernode_addr;
 		pernodesize = mem_data[node].pernode_size;
 		map = pernode + pernodesize;
 
-		/* Sanity check... */
-		if (!pernode)
-			panic("pernode space for node %d "
-			      "could not be allocated!", node);
-
 		init_bootmem_node(mem_data[node].pgdat,
 				  map>>PAGE_SHIFT,
 				  bdp->node_boot_start>>PAGE_SHIFT,
@@ -512,6 +453,7 @@ void __init find_memory(void)
 	efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
 
 	reserve_pernode_space();
+	memory_less_nodes();
 	initialize_pernode_data();
 
 	max_pfn = max_low_pfn;
@@ -680,12 +622,13 @@ void __init paging_init(void)
 
 	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-	/* so min() will work in count_node_pages */
-	for_each_online_node(node)
-		mem_data[node].min_pfn = ~0UL;
-
 	efi_memmap_walk(filter_rsvd_memory, count_node_pages);
 
+	vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+	vmem_map = (struct page *) vmalloc_end;
+	efi_memmap_walk(create_mem_map_page_table, NULL);
+	printk("Virtual mem_map starts at 0x%p\n", vmem_map);
+
 	for_each_online_node(node) {
 		memset(zones_size, 0, sizeof(zones_size));
 		memset(zholes_size, 0, sizeof(zholes_size));
@@ -719,15 +662,6 @@ void __init paging_init(void)
 						    mem_data[node].num_dma_physpages);
 		}
 
-		if (node == 0) {
-			vmalloc_end -=
-				PAGE_ALIGN(max_low_pfn * sizeof(struct page));
-			vmem_map = (struct page *) vmalloc_end;
-
-			efi_memmap_walk(create_mem_map_page_table, NULL);
-			printk("Virtual mem_map starts at 0x%p\n", vmem_map);
-		}
-
 		pfn_offset = mem_data[node].min_pfn;
 
 		NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
@@ -735,5 +669,11 @@ void __init paging_init(void)
 				    pfn_offset, zholes_size);
 	}
 
+	/*
+	 * Make memory less nodes become a member of the known nodes.
+	 */
+	for_each_node_mask(node, memory_less_mask)
+		pgdat_insert(mem_data[node].pgdat);
+
 	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }