-rw-r--r--	arch/ia64/mm/discontig.c	394
-rw-r--r--	arch/ia64/mm/init.c	3
2 files changed, 169 insertions, 228 deletions
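
The patch drops reassign_cpu_only_nodes(), which renumbered CPU-only nodes onto a neighbour with memory, and instead lets memory-less nodes keep their node ids: their per-node data is carved out of the closest node that does have memory, picked by SLIT distance in memory_less_node_alloc(). The standalone C sketch below mirrors only that nearest-node selection; the four-node topology and distance table are made-up stand-ins, not values taken from the patch.

/*
 * Sketch of the fallback used by memory_less_node_alloc(): pick the
 * online node with memory that is closest (smallest SLIT distance) to a
 * CPU-only node.  The topology below is an assumed example, not data
 * from the patch.
 */
#include <stdio.h>

#define NNODES 4

/* 1 = node has local memory, 0 = CPU-only node (assumed topology) */
static const int has_memory[NNODES] = { 1, 0, 1, 1 };

/* stand-in for node_distance() / the SLIT table */
static const unsigned char slit[NNODES][NNODES] = {
	{ 10, 20, 30, 40 },
	{ 20, 10, 20, 30 },	/* node 1 is CPU-only */
	{ 30, 20, 10, 20 },
	{ 40, 30, 20, 10 },
};

/* Mirrors the best-node scan in memory_less_node_alloc() */
static int best_memory_node(int nid)
{
	unsigned char best = 0xff;
	int bestnode = -1, node;

	for (node = 0; node < NNODES; node++) {
		if (!has_memory[node])
			continue;	/* skip other memory-less nodes */
		if (slit[nid][node] < best) {
			best = slit[nid][node];
			bestnode = node;
		}
	}
	return bestnode;
}

int main(void)
{
	int nid;

	for (nid = 0; nid < NNODES; nid++)
		if (!has_memory[nid])
			printf("CPU-only node %d borrows pernode space from node %d\n",
			       nid, best_memory_node(nid));
	return 0;
}

Ties keep the first candidate scanned, matching the strict less-than comparison in the patch.
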
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index f3fd528ead3b..54136fd00202 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -44,150 +44,7 @@ struct early_node_data {
 };
 
 static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
-
-/**
- * reassign_cpu_only_nodes - called from find_memory to move CPU-only nodes to a memory node
- *
- * This function will move nodes with only CPUs (no memory)
- * to a node with memory which is at the minimum numa_slit distance.
- * Any reassigments will result in the compression of the nodes
- * and renumbering the nid values where appropriate.
- * The static declarations below are to avoid large stack size which
- * makes the code not re-entrant.
- */
-static void __init reassign_cpu_only_nodes(void)
-{
-	struct node_memblk_s *p;
-	int i, j, k, nnode, nid, cpu, cpunid, pxm;
-	u8 cslit, slit;
-	static DECLARE_BITMAP(nodes_with_mem, MAX_NUMNODES) __initdata;
-	static u8 numa_slit_fix[MAX_NUMNODES * MAX_NUMNODES] __initdata;
-	static int node_flip[MAX_NUMNODES] __initdata;
-	static int old_nid_map[NR_CPUS] __initdata;
-
-	for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-		if (!test_bit(p->nid, (void *) nodes_with_mem)) {
-			set_bit(p->nid, (void *) nodes_with_mem);
-			nnode++;
-		}
-
-	/*
-	 * All nids with memory.
-	 */
-	if (nnode == num_online_nodes())
-		return;
-
-	/*
-	 * Change nids and attempt to migrate CPU-only nodes
-	 * to the best numa_slit (closest neighbor) possible.
-	 * For reassigned CPU nodes a nid can't be arrived at
-	 * until after this loop because the target nid's new
-	 * identity might not have been established yet. So
-	 * new nid values are fabricated above num_online_nodes() and
-	 * mapped back later to their true value.
-	 */
-	/* MCD - This code is a bit complicated, but may be unnecessary now.
-	 * We can now handle much more interesting node-numbering.
-	 * The old requirement that 0 <= nid <= numnodes <= MAX_NUMNODES
-	 * and that there be no holes in the numbering 0..numnodes
-	 * has become simply 0 <= nid <= MAX_NUMNODES.
-	 */
-	nid = 0;
-	for_each_online_node(i) {
-		if (test_bit(i, (void *) nodes_with_mem)) {
-			/*
-			 * Save original nid value for numa_slit
-			 * fixup and node_cpuid reassignments.
-			 */
-			node_flip[nid] = i;
-
-			if (i == nid) {
-				nid++;
-				continue;
-			}
-
-			for (p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-				if (p->nid == i)
-					p->nid = nid;
-
-			cpunid = nid;
-			nid++;
-		} else
-			cpunid = MAX_NUMNODES;
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			if (node_cpuid[cpu].nid == i) {
-				/*
-				 * For nodes not being reassigned just
-				 * fix the cpu's nid and reverse pxm map
-				 */
-				if (cpunid < MAX_NUMNODES) {
-					pxm = nid_to_pxm_map[i];
-					pxm_to_nid_map[pxm] =
-						node_cpuid[cpu].nid = cpunid;
-					continue;
-				}
-
-				/*
-				 * For nodes being reassigned, find best node by
-				 * numa_slit information and then make a temporary
-				 * nid value based on current nid and num_online_nodes().
-				 */
-				slit = 0xff;
-				k = 2*num_online_nodes();
-				for_each_online_node(j) {
-					if (i == j)
-						continue;
-					else if (test_bit(j, (void *) nodes_with_mem)) {
-						cslit = numa_slit[i * num_online_nodes() + j];
-						if (cslit < slit) {
-							k = num_online_nodes() + j;
-							slit = cslit;
-						}
-					}
-				}
-
-				/* save old nid map so we can update the pxm */
-				old_nid_map[cpu] = node_cpuid[cpu].nid;
-				node_cpuid[cpu].nid = k;
-			}
-	}
-
-	/*
-	 * Fixup temporary nid values for CPU-only nodes.
-	 */
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		if (node_cpuid[cpu].nid == (2*num_online_nodes())) {
-			pxm = nid_to_pxm_map[old_nid_map[cpu]];
-			pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = nnode - 1;
-		} else {
-			for (i = 0; i < nnode; i++) {
-				if (node_flip[i] != (node_cpuid[cpu].nid - num_online_nodes()))
-					continue;
-
-				pxm = nid_to_pxm_map[old_nid_map[cpu]];
-				pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = i;
-				break;
-			}
-		}
-
-	/*
-	 * Fix numa_slit by compressing from larger
-	 * nid array to reduced nid array.
-	 */
-	for (i = 0; i < nnode; i++)
-		for (j = 0; j < nnode; j++)
-			numa_slit_fix[i * nnode + j] =
-				numa_slit[node_flip[i] * num_online_nodes() + node_flip[j]];
-
-	memcpy(numa_slit, numa_slit_fix, sizeof (numa_slit));
-
-	nodes_clear(node_online_map);
-	for (i = 0; i < nnode; i++)
-		node_set_online(i);
-
-	return;
-}
+static nodemask_t memory_less_mask __initdata;
 
 /*
  * To prevent cache aliasing effects, align per-node structures so that they
@@ -233,46 +90,88 @@ static int __init build_node_maps(unsigned long start, unsigned long len,
 }
 
 /**
- * early_nr_phys_cpus_node - return number of physical cpus on a given node
+ * early_nr_cpus_node - return number of cpus on a given node
  * @node: node to check
  *
- * Count the number of physical cpus on @node. These are cpus that actually
- * exist. We can't use nr_cpus_node() yet because
+ * Count the number of cpus on @node. We can't use nr_cpus_node() yet because
  * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet.
+ * called yet. Note that node 0 will also count all non-existent cpus.
  */
-static int early_nr_phys_cpus_node(int node)
+static int __init early_nr_cpus_node(int node)
 {
 	int cpu, n = 0;
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
 		if (node == node_cpuid[cpu].nid)
-			if ((cpu == 0) || node_cpuid[cpu].phys_id)
-				n++;
+			n++;
 
 	return n;
 }
 
+/**
+ * compute_pernodesize - compute size of pernode data
+ * @node: the node id.
+ */
+static unsigned long __init compute_pernodesize(int node)
+{
+	unsigned long pernodesize = 0, cpus;
+
+	cpus = early_nr_cpus_node(node);
+	pernodesize += PERCPU_PAGE_SIZE * cpus;
+	pernodesize += node * L1_CACHE_BYTES;
+	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
+	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+	pernodesize = PAGE_ALIGN(pernodesize);
+	return pernodesize;
+}
 
 /**
- * early_nr_cpus_node - return number of cpus on a given node
- * @node: node to check
- *
- * Count the number of cpus on @node. We can't use nr_cpus_node() yet because
- * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet. Note that node 0 will also count all non-existent cpus.
+ * fill_pernode - initialize pernode data.
+ * @node: the node id.
+ * @pernode: physical address of pernode data
+ * @pernodesize: size of the pernode data
  */
-static int early_nr_cpus_node(int node)
+static void __init fill_pernode(int node, unsigned long pernode,
+	unsigned long pernodesize)
 {
-	int cpu, n = 0;
+	void *cpu_data;
+	int cpus = early_nr_cpus_node(node), cpu;
+	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		if (node == node_cpuid[cpu].nid)
-			n++;
+	mem_data[node].pernode_addr = pernode;
+	mem_data[node].pernode_size = pernodesize;
+	memset(__va(pernode), 0, pernodesize);
 
-	return n;
-}
+	cpu_data = (void *)pernode;
+	pernode += PERCPU_PAGE_SIZE * cpus;
+	pernode += node * L1_CACHE_BYTES;
+
+	mem_data[node].pgdat = __va(pernode);
+	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+	mem_data[node].node_data = __va(pernode);
+	pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+
+	mem_data[node].pgdat->bdata = bdp;
+	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+	/*
+	 * Copy the static per-cpu data into the region we
+	 * just set aside and then setup __per_cpu_offset
+	 * for each CPU on this node.
+	 */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (node == node_cpuid[cpu].nid) {
+			memcpy(__va(cpu_data), __phys_per_cpu_start,
+			       __per_cpu_end - __per_cpu_start);
+			__per_cpu_offset[cpu] = (char*)__va(cpu_data) -
+				__per_cpu_start;
+			cpu_data += PERCPU_PAGE_SIZE;
+		}
+	}
 
+	return;
+}
 /**
  * find_pernode_space - allocate memory for memory map and per-node structures
  * @start: physical start of range
@@ -304,9 +203,8 @@ static int early_nr_cpus_node(int node)
 static int __init find_pernode_space(unsigned long start, unsigned long len,
 				     int node)
 {
-	unsigned long epfn, cpu, cpus, phys_cpus;
+	unsigned long epfn;
 	unsigned long pernodesize = 0, pernode, pages, mapsize;
-	void *cpu_data;
 	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
 
 	epfn = (start + len) >> PAGE_SHIFT;
@@ -329,49 +227,12 @@ static int __init find_pernode_space(unsigned long start, unsigned long len,
 	 * Calculate total size needed, incl. what's necessary
 	 * for good alignment and alias prevention.
 	 */
-	cpus = early_nr_cpus_node(node);
-	phys_cpus = early_nr_phys_cpus_node(node);
-	pernodesize += PERCPU_PAGE_SIZE * cpus;
-	pernodesize += node * L1_CACHE_BYTES;
-	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
-	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-	pernodesize = PAGE_ALIGN(pernodesize);
+	pernodesize = compute_pernodesize(node);
 	pernode = NODEDATA_ALIGN(start, node);
 
 	/* Is this range big enough for what we want to store here? */
-	if (start + len > (pernode + pernodesize + mapsize)) {
-		mem_data[node].pernode_addr = pernode;
-		mem_data[node].pernode_size = pernodesize;
-		memset(__va(pernode), 0, pernodesize);
-
-		cpu_data = (void *)pernode;
-		pernode += PERCPU_PAGE_SIZE * cpus;
-		pernode += node * L1_CACHE_BYTES;
-
-		mem_data[node].pgdat = __va(pernode);
-		pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-		mem_data[node].node_data = __va(pernode);
-		pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-
-		mem_data[node].pgdat->bdata = bdp;
-		pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-		/*
-		 * Copy the static per-cpu data into the region we
-		 * just set aside and then setup __per_cpu_offset
-		 * for each CPU on this node.
-		 */
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			if (node == node_cpuid[cpu].nid) {
-				memcpy(__va(cpu_data), __phys_per_cpu_start,
-				       __per_cpu_end - __per_cpu_start);
-				__per_cpu_offset[cpu] = (char*)__va(cpu_data) -
-					__per_cpu_start;
-				cpu_data += PERCPU_PAGE_SIZE;
-			}
-		}
-	}
+	if (start + len > (pernode + pernodesize + mapsize))
+		fill_pernode(node, pernode, pernodesize);
 
 	return 0;
 }
@@ -411,6 +272,9 @@ static void __init reserve_pernode_space(void)
 	for_each_online_node(node) {
 		pg_data_t *pdp = mem_data[node].pgdat;
 
+		if (node_isset(node, memory_less_mask))
+			continue;
+
 		bdp = pdp->bdata;
 
 		/* First the bootmem_map itself */
@@ -456,6 +320,83 @@ static void __init initialize_pernode_data(void)
 }
 
 /**
+ * memory_less_node_alloc - * attempt to allocate memory on the best NUMA slit
+ * node but fall back to any other node when __alloc_bootmem_node fails
+ * for best.
+ * @nid: node id
+ * @pernodesize: size of this node's pernode data
+ * @align: alignment to use for this node's pernode data
+ */
+static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize,
+	unsigned long align)
+{
+	void *ptr = NULL;
+	u8 best = 0xff;
+	int bestnode = -1, node;
+
+	for_each_online_node(node) {
+		if (node_isset(node, memory_less_mask))
+			continue;
+		else if (node_distance(nid, node) < best) {
+			best = node_distance(nid, node);
+			bestnode = node;
+		}
+	}
+
+	ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat,
+		pernodesize, align, __pa(MAX_DMA_ADDRESS));
+
+	if (!ptr)
+		panic("NO memory for memory less node\n");
+	return ptr;
+}
+
+/**
+ * pgdat_insert - insert the pgdat into global pgdat_list
+ * @pgdat: the pgdat for a node.
+ */
+static void __init pgdat_insert(pg_data_t *pgdat)
+{
+	pg_data_t *prev = NULL, *next;
+
+	for_each_pgdat(next)
+		if (pgdat->node_id < next->node_id)
+			break;
+		else
+			prev = next;
+
+	if (prev) {
+		prev->pgdat_next = pgdat;
+		pgdat->pgdat_next = next;
+	} else {
+		pgdat->pgdat_next = pgdat_list;
+		pgdat_list = pgdat;
+	}
+
+	return;
+}
+
+/**
+ * memory_less_nodes - allocate and initialize CPU only nodes pernode
+ * information.
+ */
+static void __init memory_less_nodes(void)
+{
+	unsigned long pernodesize;
+	void *pernode;
+	int node;
+
+	for_each_node_mask(node, memory_less_mask) {
+		pernodesize = compute_pernodesize(node);
+		pernode = memory_less_node_alloc(node, pernodesize,
+			(node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024));
+		fill_pernode(node, __pa(pernode), pernodesize);
+	}
+
+	return;
+}
+
+/**
  * find_memory - walk the EFI memory map and setup the bootmem allocator
  *
  * Called early in boot to setup the bootmem allocator, and to
@@ -472,16 +413,19 @@ void __init find_memory(void)
 		node_set_online(0);
 	}
 
+	nodes_or(memory_less_mask, memory_less_mask, node_online_map);
 	min_low_pfn = -1;
 	max_low_pfn = 0;
 
-	if (num_online_nodes() > 1)
-		reassign_cpu_only_nodes();
-
 	/* These actually end up getting called by call_pernode_memory() */
 	efi_memmap_walk(filter_rsvd_memory, build_node_maps);
 	efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
 
+	for_each_online_node(node)
+		if (mem_data[node].bootmem_data.node_low_pfn) {
+			node_clear(node, memory_less_mask);
+			mem_data[node].min_pfn = ~0UL;
+		}
 	/*
 	 * Initialize the boot memory maps in reverse order since that's
 	 * what the bootmem allocator expects
@@ -492,17 +436,14 @@ void __init find_memory(void)
 
 		if (!node_online(node))
 			continue;
+		else if (node_isset(node, memory_less_mask))
+			continue;
 
 		bdp = &mem_data[node].bootmem_data;
 		pernode = mem_data[node].pernode_addr;
 		pernodesize = mem_data[node].pernode_size;
 		map = pernode + pernodesize;
 
-		/* Sanity check... */
-		if (!pernode)
-			panic("pernode space for node %d "
-			      "could not be allocated!", node);
-
 		init_bootmem_node(mem_data[node].pgdat,
 				  map>>PAGE_SHIFT,
 				  bdp->node_boot_start>>PAGE_SHIFT,
@@ -512,6 +453,7 @@ void __init find_memory(void)
 	efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
 
 	reserve_pernode_space();
+	memory_less_nodes();
 	initialize_pernode_data();
 
 	max_pfn = max_low_pfn;
@@ -680,12 +622,13 @@ void __init paging_init(void)
 
 	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-	/* so min() will work in count_node_pages */
-	for_each_online_node(node)
-		mem_data[node].min_pfn = ~0UL;
-
 	efi_memmap_walk(filter_rsvd_memory, count_node_pages);
 
+	vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+	vmem_map = (struct page *) vmalloc_end;
+	efi_memmap_walk(create_mem_map_page_table, NULL);
+	printk("Virtual mem_map starts at 0x%p\n", vmem_map);
+
 	for_each_online_node(node) {
 		memset(zones_size, 0, sizeof(zones_size));
 		memset(zholes_size, 0, sizeof(zholes_size));
@@ -719,15 +662,6 @@ void __init paging_init(void)
 					    mem_data[node].num_dma_physpages);
 		}
 
-		if (node == 0) {
-			vmalloc_end -=
-				PAGE_ALIGN(max_low_pfn * sizeof(struct page));
-			vmem_map = (struct page *) vmalloc_end;
-
-			efi_memmap_walk(create_mem_map_page_table, NULL);
-			printk("Virtual mem_map starts at 0x%p\n", vmem_map);
-		}
-
 		pfn_offset = mem_data[node].min_pfn;
 
 		NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
@@ -735,5 +669,11 @@ void __init paging_init(void)
 				    pfn_offset, zholes_size);
 	}
 
+	/*
+	 * Make memory less nodes become a member of the known nodes.
+	 */
+	for_each_node_mask(node, memory_less_mask)
+		pgdat_insert(mem_data[node].pgdat);
+
 	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 4eb2f52b87a1..65f9958db9f0 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -597,7 +597,8 @@ mem_init (void)
 	kclist_add(&kcore_kernel, _stext, _end - _stext);
 
 	for_each_pgdat(pgdat)
-		totalram_pages += free_all_bootmem_node(pgdat);
+		if (pgdat->bdata->node_bootmem_map)
+			totalram_pages += free_all_bootmem_node(pgdat);
 
 	reserved_pages = 0;
 	efi_memmap_walk(count_reserved_pages, &reserved_pages);
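
For reference, compute_pernodesize() and fill_pernode() in the discontig.c hunks above agree on a single layout for a node's per-node area: the per-cpu regions for that node's CPUs, a skew of node * L1_CACHE_BYTES to avoid cache aliasing between nodes, the node's pg_data_t, and its struct ia64_node_data, rounded up to a whole page. The sketch below redoes only the size accounting; PERCPU_PAGE_SIZE, PAGE_SIZE and the two structure sizes are illustrative assumptions, not the real ia64 values.

/*
 * Standalone sketch of the size accounting in compute_pernodesize().
 * All constants are assumed stand-ins for the ia64 definitions.
 */
#include <stdio.h>

#define PERCPU_PAGE_SIZE	(64 * 1024)	/* assumed */
#define L1_CACHE_BYTES		128		/* assumed */
#define PAGE_SIZE		(16 * 1024)	/* assumed */
#define L1_CACHE_ALIGN(x)	(((x) + L1_CACHE_BYTES - 1) & ~((unsigned long)L1_CACHE_BYTES - 1))
#define PAGE_ALIGN(x)		(((x) + PAGE_SIZE - 1) & ~((unsigned long)PAGE_SIZE - 1))

#define SIZEOF_PG_DATA_T	4096	/* stand-in for sizeof(pg_data_t) */
#define SIZEOF_NODE_DATA	256	/* stand-in for sizeof(struct ia64_node_data) */

/* Same accounting as compute_pernodesize(): per-cpu areas, anti-alias
 * skew, pg_data_t, ia64_node_data, rounded up to a whole page. */
static unsigned long compute_pernodesize(int node, int cpus_on_node)
{
	unsigned long pernodesize = 0;

	pernodesize += (unsigned long)PERCPU_PAGE_SIZE * cpus_on_node;
	pernodesize += (unsigned long)node * L1_CACHE_BYTES;
	pernodesize += L1_CACHE_ALIGN(SIZEOF_PG_DATA_T);
	pernodesize += L1_CACHE_ALIGN(SIZEOF_NODE_DATA);
	return PAGE_ALIGN(pernodesize);
}

int main(void)
{
	printf("node 2 with 4 cpus needs %lu bytes of pernode space\n",
	       compute_pernodesize(2, 4));
	return 0;
}

fill_pernode() then advances through the same offsets in the same order, which is why the two helpers have to stay in sync.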