Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r--	arch/x86_64/mm/fault.c		 19
-rw-r--r--	arch/x86_64/mm/init.c		129
-rw-r--r--	arch/x86_64/mm/k8topology.c	  1
-rw-r--r--	arch/x86_64/mm/numa.c		122
-rw-r--r--	arch/x86_64/mm/srat.c		  6
5 files changed, 161 insertions, 116 deletions
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index b75b872ec154..3a63707a698b 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -308,18 +308,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	unsigned long flags;
 	siginfo_t info;
 
-#ifdef CONFIG_CHECKING
-	{
-		unsigned long gs;
-		struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
-		rdmsrl(MSR_GS_BASE, gs);
-		if (gs != (unsigned long)pda) {
-			wrmsrl(MSR_GS_BASE, pda);
-			printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
-		}
-	}
-#endif
-
 	/* get the address */
 	__asm__("movq %%cr2,%0":"=r" (address));
 	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
@@ -571,3 +559,10 @@ do_sigbus:
 	force_sig_info(SIGBUS, &info, tsk);
 	return;
 }
+
+static int __init enable_pagefaulttrace(char *str)
+{
+	page_fault_trace = 1;
+	return 0;
+}
+__setup("pagefaulttrace", enable_pagefaulttrace);
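
The new __setup() handler replaces the CONFIG_CHECKING sysctl removed in the init.c hunk below, making page fault tracing a boot-time option. A userspace analogue of what __setup() arranges — during early parameter parsing the kernel matches each command-line token against registered prefixes and calls the handler with the remainder of the token. Everything below is an illustration, not kernel API:

	#include <stdio.h>
	#include <string.h>

	static int page_fault_trace;

	static int enable_pagefaulttrace(char *str)
	{
		(void)str;		/* remainder after the prefix; unused here */
		page_fault_trace = 1;
		return 0;
	}

	int main(void)
	{
		char cmdline[] = "root=/dev/sda1 pagefaulttrace quiet";
		char *tok = strtok(cmdline, " ");

		/* crude stand-in for the kernel's boot-option matching */
		for (; tok; tok = strtok(NULL, " "))
			if (strncmp(tok, "pagefaulttrace", 14) == 0)
				enable_pagefaulttrace(tok + 14);
		printf("page_fault_trace=%d\n", page_fault_trace);
		return 0;
	}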
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e60a1a848de8..286f6a624c3a 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -22,6 +22,7 @@
 #include <linux/pagemap.h>
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
+#include <linux/pci.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -36,16 +37,13 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/smp.h>
+#include <asm/sections.h>
 
 #ifndef Dprintk
 #define Dprintk(x...)
 #endif
 
-#ifdef CONFIG_GART_IOMMU
-extern int swiotlb;
-#endif
-
-extern char _stext[];
+static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
@@ -86,9 +84,6 @@ void show_mem(void)
 
 /* References to section boundaries */
 
-extern char _text, _etext, _edata, __bss_start, _end[];
-extern char __init_begin, __init_end;
-
 int after_bootmem;
 
 static void *spp_getpage(void)
@@ -308,42 +303,81 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
 			 table_end<<PAGE_SHIFT);
 }
 
-extern struct x8664_pda cpu_pda[NR_CPUS];
+void __cpuinit zap_low_mappings(int cpu)
+{
+	if (cpu == 0) {
+		pgd_t *pgd = pgd_offset_k(0UL);
+		pgd_clear(pgd);
+	} else {
+		/*
+		 * For AP's, zap the low identity mappings by changing the cr3
+		 * to init_level4_pgt and doing local flush tlb all
+		 */
+		asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
+	}
+	__flush_tlb_all();
+}
 
-/* Assumes all CPUs still execute in init_mm */
-void zap_low_mappings(void)
+/* Compute zone sizes for the DMA and DMA32 zones in a node. */
+__init void
+size_zones(unsigned long *z, unsigned long *h,
+	   unsigned long start_pfn, unsigned long end_pfn)
 {
-	pgd_t *pgd = pgd_offset_k(0UL);
-	pgd_clear(pgd);
-	flush_tlb_all();
+	int i;
+	unsigned long w;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		z[i] = 0;
+
+	if (start_pfn < MAX_DMA_PFN)
+		z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
+	if (start_pfn < MAX_DMA32_PFN) {
+		unsigned long dma32_pfn = MAX_DMA32_PFN;
+		if (dma32_pfn > end_pfn)
+			dma32_pfn = end_pfn;
+		z[ZONE_DMA32] = dma32_pfn - start_pfn;
+	}
+	z[ZONE_NORMAL] = end_pfn - start_pfn;
+
+	/* Remove lower zones from higher ones. */
+	w = 0;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (z[i])
+			z[i] -= w;
+		w += z[i];
+	}
+
+	/* Compute holes */
+	w = 0;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		unsigned long s = w;
+		w += z[i];
+		h[i] = e820_hole_size(s, w);
+	}
+
+	/* Add the space needed for mem_map to the holes too. */
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
+
+	/* The 16MB DMA zone has the kernel and other misc mappings.
+	   Account them too. */
+	if (h[ZONE_DMA]) {
+		h[ZONE_DMA] += dma_reserve;
+		if (h[ZONE_DMA] >= z[ZONE_DMA]) {
+			printk(KERN_WARNING
+				"Kernel too large and filling up ZONE_DMA?\n");
+			h[ZONE_DMA] = z[ZONE_DMA];
+		}
+	}
 }
 
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
-	{
-		unsigned long zones_size[MAX_NR_ZONES];
-		unsigned long holes[MAX_NR_ZONES];
-		unsigned int max_dma;
-
-		memset(zones_size, 0, sizeof(zones_size));
-		memset(holes, 0, sizeof(holes));
-
-		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
-		if (end_pfn < max_dma) {
-			zones_size[ZONE_DMA] = end_pfn;
-			holes[ZONE_DMA] = e820_hole_size(0, end_pfn);
-		} else {
-			zones_size[ZONE_DMA] = max_dma;
-			holes[ZONE_DMA] = e820_hole_size(0, max_dma);
-			zones_size[ZONE_NORMAL] = end_pfn - max_dma;
-			holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn);
-		}
-		free_area_init_node(0, NODE_DATA(0), zones_size,
-			__pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
-	}
-	return;
+	unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+	size_zones(zones, holes, 0, end_pfn);
+	free_area_init_node(0, NODE_DATA(0), zones,
+			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
 }
 #endif
 
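The cumulative-subtraction trick in size_zones() is easy to misread: each z[i] first holds the span from start_pfn up to that zone's ceiling, and only afterwards are the lower zones peeled off. A standalone sketch of just that arithmetic — the zone indices, the 16MB/4GB PFN limits, and the 8GB example machine are illustration values; this is plain userspace C, not kernel code:

	#include <stdio.h>

	#define MAX_NR_ZONES   3
	#define ZONE_DMA       0
	#define ZONE_DMA32     1
	#define ZONE_NORMAL    2
	#define MAX_DMA_PFN    (0x1000000UL >> 12)   /* 16MB ->    4096 pages */
	#define MAX_DMA32_PFN  (0x100000000UL >> 12) /*  4GB -> 1048576 pages */

	int main(void)
	{
		unsigned long z[MAX_NR_ZONES] = {0}, w = 0;
		unsigned long start_pfn = 0, end_pfn = 0x200000000UL >> 12; /* 8GB */
		int i;

		/* Step 1: each slot first holds the *cumulative* span up to
		 * that zone's ceiling, exactly as in the patch. */
		if (start_pfn < MAX_DMA_PFN)
			z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
		if (start_pfn < MAX_DMA32_PFN)
			z[ZONE_DMA32] = (MAX_DMA32_PFN < end_pfn ? MAX_DMA32_PFN
								 : end_pfn) - start_pfn;
		z[ZONE_NORMAL] = end_pfn - start_pfn;

		/* Step 2: subtract everything below, turning cumulative
		 * spans into per-zone sizes. */
		for (i = 0; i < MAX_NR_ZONES; i++) {
			if (z[i])
				z[i] -= w;
			w += z[i];
		}
		for (i = 0; i < MAX_NR_ZONES; i++)
			printf("zone %d: %lu pages\n", i, z[i]);
		return 0;
	}

For an 8GB machine starting at PFN 0 this prints 4096, 1044480 and 1048576 pages — exactly the 0-16MB, 16MB-4GB and 4GB-8GB slices.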
@@ -438,19 +472,16 @@ void __init mem_init(void)
 		datasize >> 10,
 		initsize >> 10);
 
+#ifdef CONFIG_SMP
 	/*
-	 * Subtle. SMP is doing its boot stuff late (because it has to
-	 * fork idle threads) - but it also needs low mappings for the
-	 * protected-mode entry to work. We zap these entries only after
-	 * the WP-bit has been tested.
+	 * Sync boot_level4_pgt mappings with the init_level4_pgt
+	 * except for the low identity mappings which are already zapped
+	 * in init_level4_pgt. This sync-up is essential for AP bringup.
 	 */
-#ifndef CONFIG_SMP
-	zap_low_mappings();
+	memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
 #endif
 }
 
-extern char __initdata_begin[], __initdata_end[];
-
 void free_initmem(void)
 {
 	unsigned long addr;
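
The memcpy added to mem_init() above copies PGD entries 1..511 only, leaving slot 0 — the already-zapped low identity mapping — alone. A userspace sketch of that skip-the-first-entry pointer arithmetic, assuming PTRS_PER_PGD = 512 as on x86-64 and a dummy one-field pgd_t for illustration:

	#include <string.h>
	#include <assert.h>

	#define PTRS_PER_PGD 512
	typedef struct { unsigned long pgd; } pgd_t;

	int main(void)
	{
		static pgd_t src[PTRS_PER_PGD], dst[PTRS_PER_PGD];
		int i;

		for (i = 0; i < PTRS_PER_PGD; i++)
			src[i].pgd = i + 1;	/* dummy entries */
		dst[0].pgd = 0xdead;		/* must survive the copy */

		memcpy(dst + 1, src + 1, (PTRS_PER_PGD - 1) * sizeof(pgd_t));
		assert(dst[0].pgd == 0xdead && dst[1].pgd == src[1].pgd);
		return 0;
	}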
@@ -464,7 +495,7 @@ void free_initmem(void)
 		totalram_pages++;
 	}
 	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
-	printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
+	printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -491,6 +522,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 #else
 	reserve_bootmem(phys, len);
 #endif
+	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
+		dma_reserve += len / PAGE_SIZE;
 }
 
 int kern_addr_valid(unsigned long addr)
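
The two added lines feed size_zones() above: every page of bootmem reserved below the 16MB boundary is later charged to ZONE_DMA as a hole. A quick userspace check of the arithmetic — the address and size here are made up:

	#include <assert.h>

	#define PAGE_SIZE   4096UL
	#define MAX_DMA_PFN (0x1000000UL / PAGE_SIZE)	/* 16MB boundary */

	int main(void)
	{
		unsigned long dma_reserve = 0;
		unsigned long phys = 0x100000, len = 0x200000; /* 2MB at 1MB */

		if (phys + len <= MAX_DMA_PFN * PAGE_SIZE)
			dma_reserve += len / PAGE_SIZE;
		assert(dma_reserve == 512);	/* 2MB of 4KB pages */
		return 0;
	}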
@@ -532,10 +565,6 @@ extern int exception_trace, page_fault_trace;
 static ctl_table debug_table2[] = {
 	{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
 	  proc_dointvec },
-#ifdef CONFIG_CHECKING
-	{ 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
-	  proc_dointvec },
-#endif
 	{ 0, }
 };
 
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index 65417b040c1b..a5663e0bb01c 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -108,6 +108,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	limit >>= 16;
 	limit <<= 24;
 	limit |= (1<<24)-1;
+	limit++;
 
 	if (limit > end_pfn << PAGE_SHIFT)
 		limit = end_pfn << PAGE_SHIFT;
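
The single added limit++ is the point of this hunk: the K8 limit field, once its low 24 bits are filled with ones, names the last byte of the node, and incrementing converts that inclusive top into the exclusive [start, end) convention used elsewhere in this series. A userspace sketch of the decode, starting from an example field value as it looks after the register's >>16 shift:

	#include <assert.h>

	int main(void)
	{
		unsigned long limit = 0x3f;	/* example: node tops out at 1GB */

		limit <<= 24;			/* 16MB units -> byte address   */
		limit |= (1UL << 24) - 1;	/* inclusive top: 0x3fffffff    */
		limit++;			/* exclusive end: 0x40000000    */
		assert(limit == 0x40000000UL);
		return 0;
	}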
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 214803821001..a828a01739cc 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -38,38 +38,57 @@ cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
 
 int numa_off __initdata;
 
-int __init compute_hash_shift(struct node *nodes, int numnodes)
+
+/*
+ * Given a shift value, try to populate memnodemap[]
+ * Returns:
+ * 1 if OK
+ * 0 if memnodemap[] too small (or shift too small)
+ * -1 if node overlap or lost ram (shift too big)
+ */
+static int __init populate_memnodemap(
+	const struct node *nodes, int numnodes, int shift)
 {
 	int i;
-	int shift = 20;
-	unsigned long addr,maxend=0;
-
-	for (i = 0; i < numnodes; i++)
-		if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
-			maxend = nodes[i].end;
+	int res = -1;
+	unsigned long addr, end;
 
-	while ((1UL << shift) < (maxend / NODEMAPSIZE))
-		shift++;
-
-	printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
-		shift,maxend);
-	memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
+	memset(memnodemap, 0xff, sizeof(memnodemap));
 	for (i = 0; i < numnodes; i++) {
-		if (nodes[i].start == nodes[i].end)
+		addr = nodes[i].start;
+		end = nodes[i].end;
+		if (addr >= end)
 			continue;
-		for (addr = nodes[i].start;
-		     addr < nodes[i].end;
-		     addr += (1UL << shift)) {
-			if (memnodemap[addr >> shift] != 0xff) {
-				printk(KERN_INFO
-	"Your memory is not aligned you need to rebuild your kernel "
-	"with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
-					shift,addr);
+		if ((end >> shift) >= NODEMAPSIZE)
+			return 0;
+		do {
+			if (memnodemap[addr >> shift] != 0xff)
 				return -1;
-			}
 			memnodemap[addr >> shift] = i;
-		}
+			addr += (1 << shift);
+		} while (addr < end);
+		res = 1;
 	}
+	return res;
+}
+
+int __init compute_hash_shift(struct node *nodes, int numnodes)
+{
+	int shift = 20;
+
+	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
+		shift++;
+
+	printk(KERN_DEBUG "Using %d for the hash shift.\n",
+		shift);
+
+	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+		printk(KERN_INFO
+	"Your memory is not aligned you need to rebuild your kernel "
+	"with a bigger NODEMAPSIZE shift=%d\n",
+			shift);
+		return -1;
+	}
 	return shift;
 }
 
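memnodemap[] is a flat physical-address-to-node lookup table indexed by (addr >> shift); the rewritten compute_hash_shift() probes increasing shifts until the next coarser table would put two nodes in one slot, then re-populates at the chosen shift. A self-contained userspace sketch of the same search — NODEMAPSIZE, the node layout, and the byte-granularity addresses are made-up illustration values:

	#include <stdio.h>
	#include <string.h>

	#define NODEMAPSIZE 256
	struct node { unsigned long start, end; };	/* [start, end) */
	static unsigned char memnodemap[NODEMAPSIZE];

	/* 1 = OK, 0 = table too small (shift too small), -1 = collision */
	static int populate(const struct node *nodes, int numnodes, int shift)
	{
		int i, res = -1;

		memset(memnodemap, 0xff, sizeof(memnodemap));
		for (i = 0; i < numnodes; i++) {
			unsigned long addr = nodes[i].start, end = nodes[i].end;
			if (addr >= end)
				continue;
			if ((end >> shift) >= NODEMAPSIZE)
				return 0;
			do {
				if (memnodemap[addr >> shift] != 0xff)
					return -1;
				memnodemap[addr >> shift] = i;
				addr += 1UL << shift;
			} while (addr < end);
			res = 1;
		}
		return res;
	}

	int main(void)
	{
		/* two adjacent 1GB nodes */
		struct node nodes[] = { { 0, 1UL << 30 }, { 1UL << 30, 2UL << 30 } };
		int shift = 20;

		while (populate(nodes, 2, shift + 1) >= 0)
			shift++;
		if (populate(nodes, 2, shift) != 1)
			return 1;	/* would mean NODEMAPSIZE is too small */
		printf("shift=%d, node of 1.5GB = %d\n",
		       shift, memnodemap[(3UL << 29) >> shift]);
		return 0;
	}

With two 1GB nodes this settles on shift 30 — one slot per node — and looks up address 1.5GB as node 1.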
@@ -94,7 +113,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
 	start_pfn = start >> PAGE_SHIFT;
 	end_pfn = end >> PAGE_SHIFT;
 
-	memory_present(nodeid, start_pfn, end_pfn);
 	nodedata_phys = find_e820_area(start, end, pgdat_size);
 	if (nodedata_phys == -1L)
 		panic("Cannot find memory pgdat in node %d\n", nodeid);
@@ -132,29 +150,14 @@ void __init setup_node_zones(int nodeid)
 	unsigned long start_pfn, end_pfn;
 	unsigned long zones[MAX_NR_ZONES];
 	unsigned long holes[MAX_NR_ZONES];
-	unsigned long dma_end_pfn;
 
-	memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
-	memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
+	start_pfn = node_start_pfn(nodeid);
+	end_pfn = node_end_pfn(nodeid);
 
-	start_pfn = node_start_pfn(nodeid);
-	end_pfn = node_end_pfn(nodeid);
+	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
+		nodeid, start_pfn, end_pfn);
 
-	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
-
-	/* All nodes > 0 have a zero length zone DMA */
-	dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	if (start_pfn < dma_end_pfn) {
-		zones[ZONE_DMA] = dma_end_pfn - start_pfn;
-		holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
-		zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
-		holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
-
-	} else {
-		zones[ZONE_NORMAL] = end_pfn - start_pfn;
-		holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
-	}
-
+	size_zones(zones, holes, start_pfn, end_pfn);
 	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
 			    start_pfn, holes);
 }
@@ -171,7 +174,7 @@ void __init numa_init_array(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] != NUMA_NO_NODE)
 			continue;
-		cpu_to_node[i] = rr;
+		numa_set_node(i, rr);
 		rr = next_node(rr, node_online_map);
 		if (rr == MAX_NUMNODES)
 			rr = first_node(node_online_map);
@@ -205,8 +208,6 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 		if (i == numa_fake-1)
 			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
 		nodes[i].end = nodes[i].start + sz;
-		if (i != numa_fake-1)
-			nodes[i].end--;
 		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
 		       i,
 		       nodes[i].start, nodes[i].end,
@@ -257,7 +258,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	nodes_clear(node_online_map);
 	node_set_online(0);
 	for (i = 0; i < NR_CPUS; i++)
-		cpu_to_node[i] = 0;
+		numa_set_node(i, 0);
 	node_to_cpumask[0] = cpumask_of_cpu(0);
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
@@ -267,6 +268,12 @@ __cpuinit void numa_add_cpu(int cpu)
 	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
 }
 
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	cpu_pda[cpu].nodenumber = node;
+	cpu_to_node[cpu] = node;
+}
+
 unsigned long __init numa_free_all_bootmem(void)
 {
 	int i;
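
numa_set_node() exists so the node number is stored twice in sync: once in the CPU's PDA for fast per-CPU access, once in the generic cpu_to_node[] array. The numa.c and srat.c hunks in this series convert every direct cpu_to_node[i] assignment to go through it. A reduced userspace sketch of the invariant — the one-field x8664_pda here is a stand-in for the real structure:

	#include <assert.h>

	#define NR_CPUS 4
	struct x8664_pda { int nodenumber; };	/* reduced to the one field */
	static struct x8664_pda cpu_pda[NR_CPUS];
	static int cpu_to_node[NR_CPUS];

	static void numa_set_node(int cpu, int node)
	{
		cpu_pda[cpu].nodenumber = node;	/* fast per-CPU view  */
		cpu_to_node[cpu] = node;	/* generic array view */
	}

	int main(void)
	{
		numa_set_node(2, 1);
		assert(cpu_pda[2].nodenumber == cpu_to_node[2]);
		return 0;
	}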
@@ -277,9 +284,26 @@ unsigned long __init numa_free_all_bootmem(void)
 	return pages;
 }
 
+#ifdef CONFIG_SPARSEMEM
+static void __init arch_sparse_init(void)
+{
+	int i;
+
+	for_each_online_node(i)
+		memory_present(i, node_start_pfn(i), node_end_pfn(i));
+
+	sparse_init();
+}
+#else
+#define arch_sparse_init() do {} while (0)
+#endif
+
 void __init paging_init(void)
 {
 	int i;
+
+	arch_sparse_init();
+
 	for_each_online_node(i) {
 		setup_node_zones(i);
 	}
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 4b2e844c15a7..33340bd1e328 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -71,8 +71,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 			nd->start = nd->end;
 	}
 	if (nd->end > end) {
-		if (!(end & 0xfff))
-			end--;
 		nd->end = end;
 		if (nd->start > nd->end)
 			nd->start = nd->end;
@@ -166,8 +164,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		if (nd->end < end)
 			nd->end = end;
 	}
-	if (!(nd->end & 0xfff))
-		nd->end--;
 	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
 	       nd->start, nd->end);
 }
@@ -203,7 +199,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		if (cpu_to_node[i] == NUMA_NO_NODE)
 			continue;
 		if (!node_isset(cpu_to_node[i], nodes_parsed))
-			numa_set_node(i, NUMA_NO_NODE);
+			numa_set_node(i, NUMA_NO_NODE);
 	}
 	numa_init_array();
 	return 0;
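
Taken together, the removed end-- and & 0xfff hacks here, in numa_emulation(), and the added limit++ in k8topology.c switch node ranges to a uniform half-open [start, end) convention. A small sketch of why half-open ranges compose without off-by-one corrections — the example sizes are arbitrary:

	#include <assert.h>

	struct node { unsigned long start, end; };	/* [start, end) */

	int main(void)
	{
		struct node a = { 0,         1UL << 30 };	/* first GB  */
		struct node b = { 1UL << 30, 2UL << 30 };	/* second GB */

		assert(a.end == b.start);		/* no gap, no overlap */
		assert(a.end - a.start == 1UL << 30);	/* size is end-start  */
		return 0;
	}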