aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r--arch/x86_64/mm/fault.c18
-rw-r--r--arch/x86_64/mm/init.c24
-rw-r--r--arch/x86_64/mm/numa.c202
-rw-r--r--arch/x86_64/mm/pageattr.c4
4 files changed, 192 insertions, 56 deletions
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 49e8cf2e06f8..6ada7231f3ab 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -56,17 +56,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb)
56} 56}
57EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); 57EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
58 58
59static inline int notify_page_fault(enum die_val val, const char *str, 59static inline int notify_page_fault(struct pt_regs *regs, long err)
60 struct pt_regs *regs, long err, int trap, int sig)
61{ 60{
62 struct die_args args = { 61 struct die_args args = {
63 .regs = regs, 62 .regs = regs,
64 .str = str, 63 .str = "page fault",
65 .err = err, 64 .err = err,
66 .trapnr = trap, 65 .trapnr = 14,
67 .signr = sig 66 .signr = SIGSEGV
68 }; 67 };
69 return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args); 68 return atomic_notifier_call_chain(&notify_page_fault_chain,
69 DIE_PAGE_FAULT, &args);
70} 70}
71 71
72/* Sometimes the CPU reports invalid exceptions on prefetch. 72/* Sometimes the CPU reports invalid exceptions on prefetch.
@@ -355,8 +355,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
355 if (vmalloc_fault(address) >= 0) 355 if (vmalloc_fault(address) >= 0)
356 return; 356 return;
357 } 357 }
358 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, 358 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
359 SIGSEGV) == NOTIFY_STOP)
360 return; 359 return;
361 /* 360 /*
362 * Don't take the mm semaphore here. If we fixup a prefetch 361 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -365,8 +364,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
365 goto bad_area_nosemaphore; 364 goto bad_area_nosemaphore;
366 } 365 }
367 366
368 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, 367 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
369 SIGSEGV) == NOTIFY_STOP)
370 return; 368 return;
371 369
372 if (likely(regs->eflags & X86_EFLAGS_IF)) 370 if (likely(regs->eflags & X86_EFLAGS_IF))
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 2968b90ef8ad..ec31534eb104 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -711,20 +711,30 @@ int kern_addr_valid(unsigned long addr)
711extern int exception_trace, page_fault_trace; 711extern int exception_trace, page_fault_trace;
712 712
713static ctl_table debug_table2[] = { 713static ctl_table debug_table2[] = {
714 { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, 714 {
715 proc_dointvec }, 715 .ctl_name = 99,
716 { 0, } 716 .procname = "exception-trace",
717 .data = &exception_trace,
718 .maxlen = sizeof(int),
719 .mode = 0644,
720 .proc_handler = proc_dointvec
721 },
722 {}
717}; 723};
718 724
719static ctl_table debug_root_table2[] = { 725static ctl_table debug_root_table2[] = {
720 { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, 726 {
721 .child = debug_table2 }, 727 .ctl_name = CTL_DEBUG,
722 { 0 }, 728 .procname = "debug",
729 .mode = 0555,
730 .child = debug_table2
731 },
732 {}
723}; 733};
724 734
725static __init int x8664_sysctl_init(void) 735static __init int x8664_sysctl_init(void)
726{ 736{
727 register_sysctl_table(debug_root_table2, 1); 737 register_sysctl_table(debug_root_table2);
728 return 0; 738 return 0;
729} 739}
730__initcall(x8664_sysctl_init); 740__initcall(x8664_sysctl_init);
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 2ee2e003606c..41b8fb069924 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; 36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
37 37
38int numa_off __initdata; 38int numa_off __initdata;
39unsigned long __initdata nodemap_addr;
40unsigned long __initdata nodemap_size;
39 41
40 42
41/* 43/*
@@ -52,34 +54,88 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
52 int res = -1; 54 int res = -1;
53 unsigned long addr, end; 55 unsigned long addr, end;
54 56
55 if (shift >= 64) 57 memset(memnodemap, 0xff, memnodemapsize);
56 return -1;
57 memset(memnodemap, 0xff, sizeof(memnodemap));
58 for (i = 0; i < numnodes; i++) { 58 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start; 59 addr = nodes[i].start;
60 end = nodes[i].end; 60 end = nodes[i].end;
61 if (addr >= end) 61 if (addr >= end)
62 continue; 62 continue;
63 if ((end >> shift) >= NODEMAPSIZE) 63 if ((end >> shift) >= memnodemapsize)
64 return 0; 64 return 0;
65 do { 65 do {
66 if (memnodemap[addr >> shift] != 0xff) 66 if (memnodemap[addr >> shift] != 0xff)
67 return -1; 67 return -1;
68 memnodemap[addr >> shift] = i; 68 memnodemap[addr >> shift] = i;
69 addr += (1UL << shift); 69 addr += (1UL << shift);
70 } while (addr < end); 70 } while (addr < end);
71 res = 1; 71 res = 1;
72 } 72 }
73 return res; 73 return res;
74} 74}
75 75
76int __init compute_hash_shift(struct bootnode *nodes, int numnodes) 76static int __init allocate_cachealigned_memnodemap(void)
77{ 77{
78 int shift = 20; 78 unsigned long pad, pad_addr;
79
80 memnodemap = memnode.embedded_map;
81 if (memnodemapsize <= 48)
82 return 0;
83
84 pad = L1_CACHE_BYTES - 1;
85 pad_addr = 0x8000;
86 nodemap_size = pad + memnodemapsize;
87 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
88 nodemap_size);
89 if (nodemap_addr == -1UL) {
90 printk(KERN_ERR
91 "NUMA: Unable to allocate Memory to Node hash map\n");
92 nodemap_addr = nodemap_size = 0;
93 return -1;
94 }
95 pad_addr = (nodemap_addr + pad) & ~pad;
96 memnodemap = phys_to_virt(pad_addr);
97
98 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
99 nodemap_addr, nodemap_addr + nodemap_size);
100 return 0;
101}
79 102
80 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) 103/*
81 shift++; 104 * The LSB of all start and end addresses in the node map is the value of the
105 * maximum possible shift.
106 */
107static int __init
108extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
109{
110 int i, nodes_used = 0;
111 unsigned long start, end;
112 unsigned long bitfield = 0, memtop = 0;
113
114 for (i = 0; i < numnodes; i++) {
115 start = nodes[i].start;
116 end = nodes[i].end;
117 if (start >= end)
118 continue;
119 bitfield |= start;
120 nodes_used++;
121 if (end > memtop)
122 memtop = end;
123 }
124 if (nodes_used <= 1)
125 i = 63;
126 else
127 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
128 memnodemapsize = (memtop >> i)+1;
129 return i;
130}
131
132int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
133{
134 int shift;
82 135
136 shift = extract_lsb_from_nodes(nodes, numnodes);
137 if (allocate_cachealigned_memnodemap())
138 return -1;
83 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", 139 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
84 shift); 140 shift);
85 141
@@ -216,31 +272,113 @@ void __init numa_init_array(void)
216} 272}
217 273
218#ifdef CONFIG_NUMA_EMU 274#ifdef CONFIG_NUMA_EMU
275/* Numa emulation */
219int numa_fake __initdata = 0; 276int numa_fake __initdata = 0;
220 277
221/* Numa emulation */ 278/*
279 * This function is used to find out if the start and end correspond to
280 * different zones.
281 */
282int zone_cross_over(unsigned long start, unsigned long end)
283{
284 if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
285 (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
286 return 1;
287 return 0;
288}
289
222static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 290static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
223{ 291{
224 int i; 292 int i, big;
225 struct bootnode nodes[MAX_NUMNODES]; 293 struct bootnode nodes[MAX_NUMNODES];
226 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; 294 unsigned long sz, old_sz;
295 unsigned long hole_size;
296 unsigned long start, end;
297 unsigned long max_addr = (end_pfn << PAGE_SHIFT);
298
299 start = (start_pfn << PAGE_SHIFT);
300 hole_size = e820_hole_size(start, max_addr);
301 sz = (max_addr - start - hole_size) / numa_fake;
227 302
228 /* Kludge needed for the hash function */ 303 /* Kludge needed for the hash function */
229 if (hweight64(sz) > 1) {
230 unsigned long x = 1;
231 while ((x << 1) < sz)
232 x <<= 1;
233 if (x < sz/2)
234 printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
235 sz = x;
236 }
237 304
305 old_sz = sz;
306 /*
307 * Round down to the nearest FAKE_NODE_MIN_SIZE.
308 */
309 sz &= FAKE_NODE_MIN_HASH_MASK;
310
311 /*
312 * We ensure that each node is at least 64MB big. Smaller than this
313 * size can cause VM hiccups.
314 */
315 if (sz == 0) {
316 printk(KERN_INFO "Not enough memory for %d nodes. Reducing "
317 "the number of nodes\n", numa_fake);
318 numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
319 printk(KERN_INFO "Number of fake nodes will be = %d\n",
320 numa_fake);
321 sz = FAKE_NODE_MIN_SIZE;
322 }
323 /*
324 * Find out how many nodes can get an extra NODE_MIN_SIZE granule.
325 * This logic ensures the extra memory gets distributed among as many
326 * nodes as possible (as compared to one single node getting all that
327 * extra memory.
328 */
329 big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
330 printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
331 "%d\n",
332 (sz >> 20), (hole_size >> 20), big);
238 memset(&nodes,0,sizeof(nodes)); 333 memset(&nodes,0,sizeof(nodes));
334 end = start;
239 for (i = 0; i < numa_fake; i++) { 335 for (i = 0; i < numa_fake; i++) {
240 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; 336 /*
337 * In case we are not able to allocate enough memory for all
338 * the nodes, we reduce the number of fake nodes.
339 */
340 if (end >= max_addr) {
341 numa_fake = i - 1;
342 break;
343 }
344 start = nodes[i].start = end;
345 /*
346 * Final node can have all the remaining memory.
347 */
241 if (i == numa_fake-1) 348 if (i == numa_fake-1)
242 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; 349 sz = max_addr - start;
243 nodes[i].end = nodes[i].start + sz; 350 end = nodes[i].start + sz;
351 /*
352 * Fir "big" number of nodes get extra granule.
353 */
354 if (i < big)
355 end += FAKE_NODE_MIN_SIZE;
356 /*
357 * Iterate over the range to ensure that this node gets at
358 * least sz amount of RAM (excluding holes)
359 */
360 while ((end - start - e820_hole_size(start, end)) < sz) {
361 end += FAKE_NODE_MIN_SIZE;
362 if (end >= max_addr)
363 break;
364 }
365 /*
366 * Look at the next node to make sure there is some real memory
367 * to map. Bad things happen when the only memory present
368 * in a zone on a fake node is IO hole.
369 */
370 while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
371 if (zone_cross_over(start, end + sz)) {
372 end = (MAX_DMA32_PFN << PAGE_SHIFT);
373 break;
374 }
375 if (end >= max_addr)
376 break;
377 end += FAKE_NODE_MIN_SIZE;
378 }
379 if (end > max_addr)
380 end = max_addr;
381 nodes[i].end = end;
244 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", 382 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
245 i, 383 i,
246 nodes[i].start, nodes[i].end, 384 nodes[i].start, nodes[i].end,
@@ -290,6 +428,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
290 end_pfn << PAGE_SHIFT); 428 end_pfn << PAGE_SHIFT);
291 /* setup dummy node covering all memory */ 429 /* setup dummy node covering all memory */
292 memnode_shift = 63; 430 memnode_shift = 63;
431 memnodemap = memnode.embedded_map;
293 memnodemap[0] = 0; 432 memnodemap[0] = 0;
294 nodes_clear(node_online_map); 433 nodes_clear(node_online_map);
295 node_set_online(0); 434 node_set_online(0);
@@ -321,20 +460,6 @@ unsigned long __init numa_free_all_bootmem(void)
321 return pages; 460 return pages;
322} 461}
323 462
324#ifdef CONFIG_SPARSEMEM
325static void __init arch_sparse_init(void)
326{
327 int i;
328
329 for_each_online_node(i)
330 memory_present(i, node_start_pfn(i), node_end_pfn(i));
331
332 sparse_init();
333}
334#else
335#define arch_sparse_init() do {} while (0)
336#endif
337
338void __init paging_init(void) 463void __init paging_init(void)
339{ 464{
340 int i; 465 int i;
@@ -344,7 +469,8 @@ void __init paging_init(void)
344 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 469 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
345 max_zone_pfns[ZONE_NORMAL] = end_pfn; 470 max_zone_pfns[ZONE_NORMAL] = end_pfn;
346 471
347 arch_sparse_init(); 472 sparse_memory_present_with_active_regions(MAX_NUMNODES);
473 sparse_init();
348 474
349 for_each_online_node(i) { 475 for_each_online_node(i) {
350 setup_node_zones(i); 476 setup_node_zones(i);
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index ccb91dd996a9..65c5eaa59905 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -107,6 +107,7 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
107 pud_t *pud; 107 pud_t *pud;
108 pmd_t *pmd; 108 pmd_t *pmd;
109 pte_t large_pte; 109 pte_t large_pte;
110 unsigned long pfn;
110 111
111 pgd = pgd_offset_k(address); 112 pgd = pgd_offset_k(address);
112 BUG_ON(pgd_none(*pgd)); 113 BUG_ON(pgd_none(*pgd));
@@ -114,7 +115,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
114 BUG_ON(pud_none(*pud)); 115 BUG_ON(pud_none(*pud));
115 pmd = pmd_offset(pud, address); 116 pmd = pmd_offset(pud, address);
116 BUG_ON(pmd_val(*pmd) & _PAGE_PSE); 117 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
117 large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); 118 pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
119 large_pte = pfn_pte(pfn, ref_prot);
118 large_pte = pte_mkhuge(large_pte); 120 large_pte = pte_mkhuge(large_pte);
119 set_pte((pte_t *)pmd, large_pte); 121 set_pte((pte_t *)pmd, large_pte);
120} 122}