diff options
author | Yinghai Lu <Yinghai.Lu@Sun.COM> | 2008-02-01 11:49:41 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-02-01 11:49:41 -0500 |
commit | 24a5da73f49c17ca88f369b257fef620a494e79d (patch) | |
tree | eca488d3da9e3ac160e9c4ff30ba89ad2946deae /arch | |
parent | 25eff8d4cd7400372d490c392519c5b0064c03f7 (diff) |
x86_64: make bootmap_start page align v6
The kernel oopses during boot when a system has 64 or 128 GB of RAM installed:
Calling initcall 0xffffffff80bc33b6: sctp_init+0x0/0x711()
BUG: unable to handle kernel NULL pointer dereference at 000000000000005f
IP: [<ffffffff802bfe55>] proc_register+0xe7/0x10f
PGD 0
Oops: 0000 [1] SMP
CPU 0
Modules linked in:
Pid: 1, comm: swapper Not tainted 2.6.24-smp-g5a514e21-dirty #6
RIP: 0010:[<ffffffff802bfe55>] [<ffffffff802bfe55>] proc_register+0xe7/0x10f
RSP: 0000:ffff810824c57e60 EFLAGS: 00010246
RAX: 000000000000d7d7 RBX: ffff811024c5fa80 RCX: ffff810824c57e08
RDX: 0000000000000000 RSI: 0000000000000195 RDI: ffffffff80cc2460
RBP: ffffffffffffffff R08: 0000000000000000 R09: ffff811024c5fa80
R10: 0000000000000000 R11: 0000000000000002 R12: ffff810824c57e6c
R13: 0000000000000000 R14: ffff810824c57ee0 R15: 00000006abd25bee
FS: 0000000000000000(0000) GS:ffffffff80b4d000(0000) knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 000000000000005f CR3: 0000000000201000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 1, threadinfo ffff810824c56000, task ffff812024c52000)
Stack: ffffffff80a57348 0000019500000000 ffff811024c5fa80 0000000000000000
00000000ffffff97 ffffffff802bfef0 0000000000000000 ffffffffffffffff
0000000000000000 ffffffff80bc3b4b ffff810824c57ee0 ffffffff80bc34a5
Call Trace:
[<ffffffff802bfef0>] ? create_proc_entry+0x73/0x8a
[<ffffffff80bc3b4b>] ? sctp_snmp_proc_init+0x1c/0x34
[<ffffffff80bc34a5>] ? sctp_init+0xef/0x711
[<ffffffff80b976e3>] ? kernel_init+0x175/0x2e1
[<ffffffff8020ccf8>] ? child_rip+0xa/0x12
[<ffffffff80b9756e>] ? kernel_init+0x0/0x2e1
[<ffffffff8020ccee>] ? child_rip+0x0/0x12
Code: 1e 48 83 7b 38 00 75 08 48 c7 43 38 f0 e8 82 80 48 83 7b 30 00 75 08 48 c7 43 30 d0 e9 82 80 48 c7 c7 60 24 cc 80 e8 bd 5a 54 00 <48> 8b 45 60 48 89 6b 58 48 89 5d 60 48 89 43 50 fe 05 f5 25 a0
RIP [<ffffffff802bfe55>] proc_register+0xe7/0x10f
RSP <ffff810824c57e60>
CR2: 000000000000005f
---[ end trace 02c2d78def82877a ]---
Kernel panic - not syncing: Attempted to kill init!
It turns out that some variables near the end of bss have already been corrupted.
In System.map we have:
ffffffff80d40420 b rsi_table
ffffffff80d40620 B krb5_seq_lock
ffffffff80d40628 b i.20437
ffffffff80d40630 b xprt_rdma_inline_write_padding
ffffffff80d40638 b sunrpc_table_header
ffffffff80d40640 b zero
ffffffff80d40644 b min_memreg
ffffffff80d40648 b rpcrdma_tk_lock_g
ffffffff80d40650 B sctp_assocs_id_lock
ffffffff80d40658 B proc_net_sctp
ffffffff80d40660 B sctp_assocs_id
ffffffff80d40680 B sysctl_sctp_mem
ffffffff80d40690 B sysctl_sctp_rmem
ffffffff80d406a0 B sysctl_sctp_wmem
ffffffff80d406b0 b sctp_ctl_socket
ffffffff80d406b8 b sctp_pf_inet6_specific
ffffffff80d406c0 b sctp_pf_inet_specific
ffffffff80d406c8 b sctp_af_v4_specific
ffffffff80d406d0 b sctp_af_v6_specific
ffffffff80d406d8 b sctp_rand.33270
ffffffff80d406dc b sctp_memory_pressure
ffffffff80d406e0 b sctp_sockets_allocated
ffffffff80d406e4 b sctp_memory_allocated
ffffffff80d406e8 b sctp_sysctl_header
ffffffff80d406f0 b zero
ffffffff80d406f4 A __bss_stop
ffffffff80d406f4 A _end
and setup_node_bootmem() will use that same page (0xd40000) for the node 0 bootmap, which starts right at _end and is not page-aligned:
Bootmem setup node 0 0000000000000000-0000000828000000
NODE_DATA [000000000008a485 - 0000000000091484]
bootmap [0000000000d406f4 - 0000000000e456f3] pages 105
Bootmem setup node 1 0000000828000000-0000001028000000
NODE_DATA [0000000828000000 - 0000000828006fff]
bootmap [0000000828007000 - 0000000828106fff] pages 100
Bootmem setup node 2 0000001028000000-0000001828000000
NODE_DATA [0000001028000000 - 0000001028006fff]
bootmap [0000001028007000 - 0000001028106fff] pages 100
Bootmem setup node 3 0000001828000000-0000002028000000
NODE_DATA [0000001828000000 - 0000001828006fff]
bootmap [0000001828007000 - 0000001828106fff] pages 100
setup_node_bootmem() wants NODE_DATA to be cacheline-aligned
and the bootmap to be page-aligned.
This patch updates find_e820_area() to take an alignment argument, so that
callers can be sure the returned area meets their alignment constraints.
Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/kernel/e820_64.c | 8 | ||||
-rw-r--r-- | arch/x86/kernel/setup_64.c | 3 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 13 | ||||
-rw-r--r-- | arch/x86/mm/numa_64.c | 35 |
4 files changed, 31 insertions, 28 deletions
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index b74e83b214cc..9f65b4cc323c 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c | |||
@@ -171,12 +171,13 @@ int __init e820_all_mapped(unsigned long start, unsigned long end, | |||
171 | } | 171 | } |
172 | 172 | ||
173 | /* | 173 | /* |
174 | * Find a free area in a specific range. | 174 | * Find a free area with specified alignment in a specific range. |
175 | */ | 175 | */ |
176 | unsigned long __init find_e820_area(unsigned long start, unsigned long end, | 176 | unsigned long __init find_e820_area(unsigned long start, unsigned long end, |
177 | unsigned size) | 177 | unsigned size, unsigned long align) |
178 | { | 178 | { |
179 | int i; | 179 | int i; |
180 | unsigned long mask = ~(align - 1); | ||
180 | 181 | ||
181 | for (i = 0; i < e820.nr_map; i++) { | 182 | for (i = 0; i < e820.nr_map; i++) { |
182 | struct e820entry *ei = &e820.map[i]; | 183 | struct e820entry *ei = &e820.map[i]; |
@@ -190,7 +191,8 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, | |||
190 | continue; | 191 | continue; |
191 | while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) | 192 | while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) |
192 | ; | 193 | ; |
193 | last = PAGE_ALIGN(addr) + size; | 194 | addr = (addr + align - 1) & mask; |
195 | last = addr + size; | ||
194 | if (last > ei->addr + ei->size) | 196 | if (last > ei->addr + ei->size) |
195 | continue; | 197 | continue; |
196 | if (last > end) | 198 | if (last > end) |
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 77fb87bf6e5a..18df70c534b9 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c | |||
@@ -182,7 +182,8 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | |||
182 | unsigned long bootmap_size, bootmap; | 182 | unsigned long bootmap_size, bootmap; |
183 | 183 | ||
184 | bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; | 184 | bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; |
185 | bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); | 185 | bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size, |
186 | PAGE_SIZE); | ||
186 | if (bootmap == -1L) | 187 | if (bootmap == -1L) |
187 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); | 188 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); |
188 | bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); | 189 | bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a471be4f5f1..eabcaed76c28 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -354,17 +354,10 @@ static void __init find_early_table_space(unsigned long end) | |||
354 | * need roughly 0.5KB per GB. | 354 | * need roughly 0.5KB per GB. |
355 | */ | 355 | */ |
356 | start = 0x8000; | 356 | start = 0x8000; |
357 | table_start = find_e820_area(start, end, tables); | 357 | table_start = find_e820_area(start, end, tables, PAGE_SIZE); |
358 | if (table_start == -1UL) | 358 | if (table_start == -1UL) |
359 | panic("Cannot find space for the kernel page tables"); | 359 | panic("Cannot find space for the kernel page tables"); |
360 | 360 | ||
361 | /* | ||
362 | * When you have a lot of RAM like 256GB, early_table will not fit | ||
363 | * into 0x8000 range, find_e820_area() will find area after kernel | ||
364 | * bss but the table_start is not page aligned, so need to round it | ||
365 | * up to avoid overlap with bss: | ||
366 | */ | ||
367 | table_start = round_up(table_start, PAGE_SIZE); | ||
368 | table_start >>= PAGE_SHIFT; | 361 | table_start >>= PAGE_SHIFT; |
369 | table_end = table_start; | 362 | table_end = table_start; |
370 | 363 | ||
@@ -420,7 +413,9 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) | |||
420 | mmu_cr4_features = read_cr4(); | 413 | mmu_cr4_features = read_cr4(); |
421 | __flush_tlb_all(); | 414 | __flush_tlb_all(); |
422 | 415 | ||
423 | reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); | 416 | if (!after_bootmem) |
417 | reserve_early(table_start << PAGE_SHIFT, | ||
418 | table_end << PAGE_SHIFT, "PGTABLE"); | ||
424 | } | 419 | } |
425 | 420 | ||
426 | #ifndef CONFIG_NUMA | 421 | #ifndef CONFIG_NUMA |
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index d33954866085..9f533deb9dad 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -84,25 +84,23 @@ static int __init populate_memnodemap(const struct bootnode *nodes, | |||
84 | 84 | ||
85 | static int __init allocate_cachealigned_memnodemap(void) | 85 | static int __init allocate_cachealigned_memnodemap(void) |
86 | { | 86 | { |
87 | unsigned long pad, pad_addr; | 87 | unsigned long addr; |
88 | 88 | ||
89 | memnodemap = memnode.embedded_map; | 89 | memnodemap = memnode.embedded_map; |
90 | if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) | 90 | if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) |
91 | return 0; | 91 | return 0; |
92 | 92 | ||
93 | pad = L1_CACHE_BYTES - 1; | 93 | addr = 0x8000; |
94 | pad_addr = 0x8000; | 94 | nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); |
95 | nodemap_size = pad + sizeof(s16) * memnodemapsize; | 95 | nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, |
96 | nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, | 96 | nodemap_size, L1_CACHE_BYTES); |
97 | nodemap_size); | ||
98 | if (nodemap_addr == -1UL) { | 97 | if (nodemap_addr == -1UL) { |
99 | printk(KERN_ERR | 98 | printk(KERN_ERR |
100 | "NUMA: Unable to allocate Memory to Node hash map\n"); | 99 | "NUMA: Unable to allocate Memory to Node hash map\n"); |
101 | nodemap_addr = nodemap_size = 0; | 100 | nodemap_addr = nodemap_size = 0; |
102 | return -1; | 101 | return -1; |
103 | } | 102 | } |
104 | pad_addr = (nodemap_addr + pad) & ~pad; | 103 | memnodemap = phys_to_virt(nodemap_addr); |
105 | memnodemap = phys_to_virt(pad_addr); | ||
106 | reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); | 104 | reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); |
107 | 105 | ||
108 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", | 106 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", |
@@ -164,15 +162,17 @@ int early_pfn_to_nid(unsigned long pfn) | |||
164 | } | 162 | } |
165 | 163 | ||
166 | static void * __init early_node_mem(int nodeid, unsigned long start, | 164 | static void * __init early_node_mem(int nodeid, unsigned long start, |
167 | unsigned long end, unsigned long size) | 165 | unsigned long end, unsigned long size, |
166 | unsigned long align) | ||
168 | { | 167 | { |
169 | unsigned long mem = find_e820_area(start, end, size); | 168 | unsigned long mem = find_e820_area(start, end, size, align); |
170 | void *ptr; | 169 | void *ptr; |
171 | 170 | ||
172 | if (mem != -1L) | 171 | if (mem != -1L) { |
172 | mem = round_up(mem, align); | ||
173 | return __va(mem); | 173 | return __va(mem); |
174 | ptr = __alloc_bootmem_nopanic(size, | 174 | } |
175 | SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); | 175 | ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); |
176 | if (ptr == NULL) { | 176 | if (ptr == NULL) { |
177 | printk(KERN_ERR "Cannot find %lu bytes in node %d\n", | 177 | printk(KERN_ERR "Cannot find %lu bytes in node %d\n", |
178 | size, nodeid); | 178 | size, nodeid); |
@@ -198,7 +198,8 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, | |||
198 | start_pfn = start >> PAGE_SHIFT; | 198 | start_pfn = start >> PAGE_SHIFT; |
199 | end_pfn = end >> PAGE_SHIFT; | 199 | end_pfn = end >> PAGE_SHIFT; |
200 | 200 | ||
201 | node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); | 201 | node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, |
202 | SMP_CACHE_BYTES); | ||
202 | if (node_data[nodeid] == NULL) | 203 | if (node_data[nodeid] == NULL) |
203 | return; | 204 | return; |
204 | nodedata_phys = __pa(node_data[nodeid]); | 205 | nodedata_phys = __pa(node_data[nodeid]); |
@@ -211,8 +212,12 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, | |||
211 | /* Find a place for the bootmem map */ | 212 | /* Find a place for the bootmem map */ |
212 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); | 213 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); |
213 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); | 214 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); |
215 | /* | ||
216 | * SMP_CACHE_BYTES could be enough, but init_bootmem_node likes | |
217 | * to use that to align to PAGE_SIZE | |
218 | */ | ||
214 | bootmap = early_node_mem(nodeid, bootmap_start, end, | 219 | bootmap = early_node_mem(nodeid, bootmap_start, end, |
215 | bootmap_pages<<PAGE_SHIFT); | 220 | bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); |
216 | if (bootmap == NULL) { | 221 | if (bootmap == NULL) { |
217 | if (nodedata_phys < start || nodedata_phys >= end) | 222 | if (nodedata_phys < start || nodedata_phys >= end) |
218 | free_bootmem((unsigned long)node_data[nodeid], | 223 | free_bootmem((unsigned long)node_data[nodeid], |