aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYinghai Lu <Yinghai.Lu@Sun.COM>2008-02-01 11:49:41 -0500
committerIngo Molnar <mingo@elte.hu>2008-02-01 11:49:41 -0500
commit24a5da73f49c17ca88f369b257fef620a494e79d (patch)
treeeca488d3da9e3ac160e9c4ff30ba89ad2946deae
parent25eff8d4cd7400372d490c392519c5b0064c03f7 (diff)
x86_64: make bootmap_start page align v6
boot oopses when a system has 64 or 128 GB of RAM installed: Calling initcall 0xffffffff80bc33b6: sctp_init+0x0/0x711() BUG: unable to handle kernel NULL pointer dereference at 000000000000005f IP: [<ffffffff802bfe55>] proc_register+0xe7/0x10f PGD 0 Oops: 0000 [1] SMP CPU 0 Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.24-smp-g5a514e21-dirty #6 RIP: 0010:[<ffffffff802bfe55>] [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP: 0000:ffff810824c57e60 EFLAGS: 00010246 RAX: 000000000000d7d7 RBX: ffff811024c5fa80 RCX: ffff810824c57e08 RDX: 0000000000000000 RSI: 0000000000000195 RDI: ffffffff80cc2460 RBP: ffffffffffffffff R08: 0000000000000000 R09: ffff811024c5fa80 R10: 0000000000000000 R11: 0000000000000002 R12: ffff810824c57e6c R13: 0000000000000000 R14: ffff810824c57ee0 R15: 00000006abd25bee FS: 0000000000000000(0000) GS:ffffffff80b4d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 000000000000005f CR3: 0000000000201000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 1, threadinfo ffff810824c56000, task ffff812024c52000) Stack: ffffffff80a57348 0000019500000000 ffff811024c5fa80 0000000000000000 00000000ffffff97 ffffffff802bfef0 0000000000000000 ffffffffffffffff 0000000000000000 ffffffff80bc3b4b ffff810824c57ee0 ffffffff80bc34a5 Call Trace: [<ffffffff802bfef0>] ? create_proc_entry+0x73/0x8a [<ffffffff80bc3b4b>] ? sctp_snmp_proc_init+0x1c/0x34 [<ffffffff80bc34a5>] ? sctp_init+0xef/0x711 [<ffffffff80b976e3>] ? kernel_init+0x175/0x2e1 [<ffffffff8020ccf8>] ? child_rip+0xa/0x12 [<ffffffff80b9756e>] ? kernel_init+0x0/0x2e1 [<ffffffff8020ccee>] ? 
child_rip+0x0/0x12 Code: 1e 48 83 7b 38 00 75 08 48 c7 43 38 f0 e8 82 80 48 83 7b 30 00 75 08 48 c7 43 30 d0 e9 82 80 48 c7 c7 60 24 cc 80 e8 bd 5a 54 00 <48> 8b 45 60 48 89 6b 58 48 89 5d 60 48 89 43 50 fe 05 f5 25 a0 RIP [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP <ffff810824c57e60> CR2: 000000000000005f ---[ end trace 02c2d78def82877a ]--- Kernel panic - not syncing: Attempted to kill init! It turns out that some variables near the end of bss have already been corrupted. In System.map we have: ffffffff80d40420 b rsi_table ffffffff80d40620 B krb5_seq_lock ffffffff80d40628 b i.20437 ffffffff80d40630 b xprt_rdma_inline_write_padding ffffffff80d40638 b sunrpc_table_header ffffffff80d40640 b zero ffffffff80d40644 b min_memreg ffffffff80d40648 b rpcrdma_tk_lock_g ffffffff80d40650 B sctp_assocs_id_lock ffffffff80d40658 B proc_net_sctp ffffffff80d40660 B sctp_assocs_id ffffffff80d40680 B sysctl_sctp_mem ffffffff80d40690 B sysctl_sctp_rmem ffffffff80d406a0 B sysctl_sctp_wmem ffffffff80d406b0 b sctp_ctl_socket ffffffff80d406b8 b sctp_pf_inet6_specific ffffffff80d406c0 b sctp_pf_inet_specific ffffffff80d406c8 b sctp_af_v4_specific ffffffff80d406d0 b sctp_af_v6_specific ffffffff80d406d8 b sctp_rand.33270 ffffffff80d406dc b sctp_memory_pressure ffffffff80d406e0 b sctp_sockets_allocated ffffffff80d406e4 b sctp_memory_allocated ffffffff80d406e8 b sctp_sysctl_header ffffffff80d406f0 b zero ffffffff80d406f4 A __bss_stop ffffffff80d406f4 A _end The boot log below shows that setup_node_bootmem() places the node 0 bootmap at 0xd406f4, i.e. directly at _end. That address is not page-aligned, so the bootmap ends up using page 0xd40000, which also holds the end of bss: Bootmem setup node 0 0000000000000000-0000000828000000 NODE_DATA [000000000008a485 - 0000000000091484] bootmap [0000000000d406f4 - 0000000000e456f3] pages 105 Bootmem setup node 1 0000000828000000-0000001028000000 NODE_DATA [0000000828000000 - 0000000828006fff] bootmap [0000000828007000 - 0000000828106fff] pages 100 Bootmem setup node 2 0000001028000000-0000001828000000 NODE_DATA [0000001028000000 - 0000001028006fff] bootmap [0000001028007000 - 0000001028106fff] pages 100 Bootmem setup 
node 3 0000001828000000-0000002028000000 NODE_DATA [0000001828000000 - 0000001828006fff] bootmap [0000001828007000 - 0000001828106fff] pages 100 setup_node_bootmem() needs NODE_DATA to be cacheline-aligned and the bootmap to be page-aligned. This patch adds an alignment argument to find_e820_area() so that the address it returns already meets the caller's alignment constraint. Signed-off-by: Yinghai Lu <yinghai.lu@sun.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--arch/x86/kernel/e820_64.c8
-rw-r--r--arch/x86/kernel/setup_64.c3
-rw-r--r--arch/x86/mm/init_64.c13
-rw-r--r--arch/x86/mm/numa_64.c35
-rw-r--r--include/asm-x86/e820_64.h2
5 files changed, 32 insertions, 29 deletions
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index b74e83b214cc..9f65b4cc323c 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -171,12 +171,13 @@ int __init e820_all_mapped(unsigned long start, unsigned long end,
171} 171}
172 172
173/* 173/*
174 * Find a free area in a specific range. 174 * Find a free area with specified alignment in a specific range.
175 */ 175 */
176unsigned long __init find_e820_area(unsigned long start, unsigned long end, 176unsigned long __init find_e820_area(unsigned long start, unsigned long end,
177 unsigned size) 177 unsigned size, unsigned long align)
178{ 178{
179 int i; 179 int i;
180 unsigned long mask = ~(align - 1);
180 181
181 for (i = 0; i < e820.nr_map; i++) { 182 for (i = 0; i < e820.nr_map; i++) {
182 struct e820entry *ei = &e820.map[i]; 183 struct e820entry *ei = &e820.map[i];
@@ -190,7 +191,8 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end,
190 continue; 191 continue;
191 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) 192 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
192 ; 193 ;
193 last = PAGE_ALIGN(addr) + size; 194 addr = (addr + align - 1) & mask;
195 last = addr + size;
194 if (last > ei->addr + ei->size) 196 if (last > ei->addr + ei->size)
195 continue; 197 continue;
196 if (last > end) 198 if (last > end)
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 77fb87bf6e5a..18df70c534b9 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -182,7 +182,8 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
182 unsigned long bootmap_size, bootmap; 182 unsigned long bootmap_size, bootmap;
183 183
184 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; 184 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
185 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); 185 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
186 PAGE_SIZE);
186 if (bootmap == -1L) 187 if (bootmap == -1L)
187 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 188 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
188 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); 189 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9a471be4f5f1..eabcaed76c28 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -354,17 +354,10 @@ static void __init find_early_table_space(unsigned long end)
354 * need roughly 0.5KB per GB. 354 * need roughly 0.5KB per GB.
355 */ 355 */
356 start = 0x8000; 356 start = 0x8000;
357 table_start = find_e820_area(start, end, tables); 357 table_start = find_e820_area(start, end, tables, PAGE_SIZE);
358 if (table_start == -1UL) 358 if (table_start == -1UL)
359 panic("Cannot find space for the kernel page tables"); 359 panic("Cannot find space for the kernel page tables");
360 360
361 /*
362 * When you have a lot of RAM like 256GB, early_table will not fit
363 * into 0x8000 range, find_e820_area() will find area after kernel
364 * bss but the table_start is not page aligned, so need to round it
365 * up to avoid overlap with bss:
366 */
367 table_start = round_up(table_start, PAGE_SIZE);
368 table_start >>= PAGE_SHIFT; 361 table_start >>= PAGE_SHIFT;
369 table_end = table_start; 362 table_end = table_start;
370 363
@@ -420,7 +413,9 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
420 mmu_cr4_features = read_cr4(); 413 mmu_cr4_features = read_cr4();
421 __flush_tlb_all(); 414 __flush_tlb_all();
422 415
423 reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); 416 if (!after_bootmem)
417 reserve_early(table_start << PAGE_SHIFT,
418 table_end << PAGE_SHIFT, "PGTABLE");
424} 419}
425 420
426#ifndef CONFIG_NUMA 421#ifndef CONFIG_NUMA
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index d33954866085..9f533deb9dad 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -84,25 +84,23 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
84 84
85static int __init allocate_cachealigned_memnodemap(void) 85static int __init allocate_cachealigned_memnodemap(void)
86{ 86{
87 unsigned long pad, pad_addr; 87 unsigned long addr;
88 88
89 memnodemap = memnode.embedded_map; 89 memnodemap = memnode.embedded_map;
90 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) 90 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
91 return 0; 91 return 0;
92 92
93 pad = L1_CACHE_BYTES - 1; 93 addr = 0x8000;
94 pad_addr = 0x8000; 94 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
95 nodemap_size = pad + sizeof(s16) * memnodemapsize; 95 nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT,
96 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, 96 nodemap_size, L1_CACHE_BYTES);
97 nodemap_size);
98 if (nodemap_addr == -1UL) { 97 if (nodemap_addr == -1UL) {
99 printk(KERN_ERR 98 printk(KERN_ERR
100 "NUMA: Unable to allocate Memory to Node hash map\n"); 99 "NUMA: Unable to allocate Memory to Node hash map\n");
101 nodemap_addr = nodemap_size = 0; 100 nodemap_addr = nodemap_size = 0;
102 return -1; 101 return -1;
103 } 102 }
104 pad_addr = (nodemap_addr + pad) & ~pad; 103 memnodemap = phys_to_virt(nodemap_addr);
105 memnodemap = phys_to_virt(pad_addr);
106 reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); 104 reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
107 105
108 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", 106 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
@@ -164,15 +162,17 @@ int early_pfn_to_nid(unsigned long pfn)
164} 162}
165 163
166static void * __init early_node_mem(int nodeid, unsigned long start, 164static void * __init early_node_mem(int nodeid, unsigned long start,
167 unsigned long end, unsigned long size) 165 unsigned long end, unsigned long size,
166 unsigned long align)
168{ 167{
169 unsigned long mem = find_e820_area(start, end, size); 168 unsigned long mem = find_e820_area(start, end, size, align);
170 void *ptr; 169 void *ptr;
171 170
172 if (mem != -1L) 171 if (mem != -1L) {
172 mem = round_up(mem, align);
173 return __va(mem); 173 return __va(mem);
174 ptr = __alloc_bootmem_nopanic(size, 174 }
175 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); 175 ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
176 if (ptr == NULL) { 176 if (ptr == NULL) {
177 printk(KERN_ERR "Cannot find %lu bytes in node %d\n", 177 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
178 size, nodeid); 178 size, nodeid);
@@ -198,7 +198,8 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
198 start_pfn = start >> PAGE_SHIFT; 198 start_pfn = start >> PAGE_SHIFT;
199 end_pfn = end >> PAGE_SHIFT; 199 end_pfn = end >> PAGE_SHIFT;
200 200
201 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); 201 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
202 SMP_CACHE_BYTES);
202 if (node_data[nodeid] == NULL) 203 if (node_data[nodeid] == NULL)
203 return; 204 return;
204 nodedata_phys = __pa(node_data[nodeid]); 205 nodedata_phys = __pa(node_data[nodeid]);
@@ -211,8 +212,12 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
211 /* Find a place for the bootmem map */ 212 /* Find a place for the bootmem map */
212 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 213 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
213 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 214 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
215 /*
216 * SMP_CACHE_BYTES could be enough, but init_bootmem_node likes
217 * to use that to align to PAGE_SIZE
218 */
214 bootmap = early_node_mem(nodeid, bootmap_start, end, 219 bootmap = early_node_mem(nodeid, bootmap_start, end,
215 bootmap_pages<<PAGE_SHIFT); 220 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
216 if (bootmap == NULL) { 221 if (bootmap == NULL) {
217 if (nodedata_phys < start || nodedata_phys >= end) 222 if (nodedata_phys < start || nodedata_phys >= end)
218 free_bootmem((unsigned long)node_data[nodeid], 223 free_bootmem((unsigned long)node_data[nodeid],
diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h
index cc0946996055..a560c4f5d500 100644
--- a/include/asm-x86/e820_64.h
+++ b/include/asm-x86/e820_64.h
@@ -15,7 +15,7 @@
15 15
16#ifndef __ASSEMBLY__ 16#ifndef __ASSEMBLY__
17extern unsigned long find_e820_area(unsigned long start, unsigned long end, 17extern unsigned long find_e820_area(unsigned long start, unsigned long end,
18 unsigned size); 18 unsigned size, unsigned long align);
19extern void add_memory_region(unsigned long start, unsigned long size, 19extern void add_memory_region(unsigned long start, unsigned long size,
20 int type); 20 int type);
21extern void setup_memory_region(void); 21extern void setup_memory_region(void);