diff options
Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r-- | arch/x86_64/mm/fault.c | 18 | ||||
-rw-r--r-- | arch/x86_64/mm/init.c | 24 | ||||
-rw-r--r-- | arch/x86_64/mm/numa.c | 202 | ||||
-rw-r--r-- | arch/x86_64/mm/pageattr.c | 4 |
4 files changed, 192 insertions, 56 deletions
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 49e8cf2e06f8..6ada7231f3ab 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c | |||
@@ -56,17 +56,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb) | |||
56 | } | 56 | } |
57 | EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); | 57 | EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); |
58 | 58 | ||
59 | static inline int notify_page_fault(enum die_val val, const char *str, | 59 | static inline int notify_page_fault(struct pt_regs *regs, long err) |
60 | struct pt_regs *regs, long err, int trap, int sig) | ||
61 | { | 60 | { |
62 | struct die_args args = { | 61 | struct die_args args = { |
63 | .regs = regs, | 62 | .regs = regs, |
64 | .str = str, | 63 | .str = "page fault", |
65 | .err = err, | 64 | .err = err, |
66 | .trapnr = trap, | 65 | .trapnr = 14, |
67 | .signr = sig | 66 | .signr = SIGSEGV |
68 | }; | 67 | }; |
69 | return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args); | 68 | return atomic_notifier_call_chain(&notify_page_fault_chain, |
69 | DIE_PAGE_FAULT, &args); | ||
70 | } | 70 | } |
71 | 71 | ||
72 | /* Sometimes the CPU reports invalid exceptions on prefetch. | 72 | /* Sometimes the CPU reports invalid exceptions on prefetch. |
@@ -355,8 +355,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, | |||
355 | if (vmalloc_fault(address) >= 0) | 355 | if (vmalloc_fault(address) >= 0) |
356 | return; | 356 | return; |
357 | } | 357 | } |
358 | if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | 358 | if (notify_page_fault(regs, error_code) == NOTIFY_STOP) |
359 | SIGSEGV) == NOTIFY_STOP) | ||
360 | return; | 359 | return; |
361 | /* | 360 | /* |
362 | * Don't take the mm semaphore here. If we fixup a prefetch | 361 | * Don't take the mm semaphore here. If we fixup a prefetch |
@@ -365,8 +364,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, | |||
365 | goto bad_area_nosemaphore; | 364 | goto bad_area_nosemaphore; |
366 | } | 365 | } |
367 | 366 | ||
368 | if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | 367 | if (notify_page_fault(regs, error_code) == NOTIFY_STOP) |
369 | SIGSEGV) == NOTIFY_STOP) | ||
370 | return; | 368 | return; |
371 | 369 | ||
372 | if (likely(regs->eflags & X86_EFLAGS_IF)) | 370 | if (likely(regs->eflags & X86_EFLAGS_IF)) |
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 2968b90ef8ad..ec31534eb104 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c | |||
@@ -711,20 +711,30 @@ int kern_addr_valid(unsigned long addr) | |||
711 | extern int exception_trace, page_fault_trace; | 711 | extern int exception_trace, page_fault_trace; |
712 | 712 | ||
713 | static ctl_table debug_table2[] = { | 713 | static ctl_table debug_table2[] = { |
714 | { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, | 714 | { |
715 | proc_dointvec }, | 715 | .ctl_name = 99, |
716 | { 0, } | 716 | .procname = "exception-trace", |
717 | .data = &exception_trace, | ||
718 | .maxlen = sizeof(int), | ||
719 | .mode = 0644, | ||
720 | .proc_handler = proc_dointvec | ||
721 | }, | ||
722 | {} | ||
717 | }; | 723 | }; |
718 | 724 | ||
719 | static ctl_table debug_root_table2[] = { | 725 | static ctl_table debug_root_table2[] = { |
720 | { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, | 726 | { |
721 | .child = debug_table2 }, | 727 | .ctl_name = CTL_DEBUG, |
722 | { 0 }, | 728 | .procname = "debug", |
729 | .mode = 0555, | ||
730 | .child = debug_table2 | ||
731 | }, | ||
732 | {} | ||
723 | }; | 733 | }; |
724 | 734 | ||
725 | static __init int x8664_sysctl_init(void) | 735 | static __init int x8664_sysctl_init(void) |
726 | { | 736 | { |
727 | register_sysctl_table(debug_root_table2, 1); | 737 | register_sysctl_table(debug_root_table2); |
728 | return 0; | 738 | return 0; |
729 | } | 739 | } |
730 | __initcall(x8664_sysctl_init); | 740 | __initcall(x8664_sysctl_init); |
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 2ee2e003606c..41b8fb069924 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c | |||
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | |||
36 | cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; | 36 | cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; |
37 | 37 | ||
38 | int numa_off __initdata; | 38 | int numa_off __initdata; |
39 | unsigned long __initdata nodemap_addr; | ||
40 | unsigned long __initdata nodemap_size; | ||
39 | 41 | ||
40 | 42 | ||
41 | /* | 43 | /* |
@@ -52,34 +54,88 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) | |||
52 | int res = -1; | 54 | int res = -1; |
53 | unsigned long addr, end; | 55 | unsigned long addr, end; |
54 | 56 | ||
55 | if (shift >= 64) | 57 | memset(memnodemap, 0xff, memnodemapsize); |
56 | return -1; | ||
57 | memset(memnodemap, 0xff, sizeof(memnodemap)); | ||
58 | for (i = 0; i < numnodes; i++) { | 58 | for (i = 0; i < numnodes; i++) { |
59 | addr = nodes[i].start; | 59 | addr = nodes[i].start; |
60 | end = nodes[i].end; | 60 | end = nodes[i].end; |
61 | if (addr >= end) | 61 | if (addr >= end) |
62 | continue; | 62 | continue; |
63 | if ((end >> shift) >= NODEMAPSIZE) | 63 | if ((end >> shift) >= memnodemapsize) |
64 | return 0; | 64 | return 0; |
65 | do { | 65 | do { |
66 | if (memnodemap[addr >> shift] != 0xff) | 66 | if (memnodemap[addr >> shift] != 0xff) |
67 | return -1; | 67 | return -1; |
68 | memnodemap[addr >> shift] = i; | 68 | memnodemap[addr >> shift] = i; |
69 | addr += (1UL << shift); | 69 | addr += (1UL << shift); |
70 | } while (addr < end); | 70 | } while (addr < end); |
71 | res = 1; | 71 | res = 1; |
72 | } | 72 | } |
73 | return res; | 73 | return res; |
74 | } | 74 | } |
75 | 75 | ||
76 | int __init compute_hash_shift(struct bootnode *nodes, int numnodes) | 76 | static int __init allocate_cachealigned_memnodemap(void) |
77 | { | 77 | { |
78 | int shift = 20; | 78 | unsigned long pad, pad_addr; |
79 | |||
80 | memnodemap = memnode.embedded_map; | ||
81 | if (memnodemapsize <= 48) | ||
82 | return 0; | ||
83 | |||
84 | pad = L1_CACHE_BYTES - 1; | ||
85 | pad_addr = 0x8000; | ||
86 | nodemap_size = pad + memnodemapsize; | ||
87 | nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, | ||
88 | nodemap_size); | ||
89 | if (nodemap_addr == -1UL) { | ||
90 | printk(KERN_ERR | ||
91 | "NUMA: Unable to allocate Memory to Node hash map\n"); | ||
92 | nodemap_addr = nodemap_size = 0; | ||
93 | return -1; | ||
94 | } | ||
95 | pad_addr = (nodemap_addr + pad) & ~pad; | ||
96 | memnodemap = phys_to_virt(pad_addr); | ||
97 | |||
98 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", | ||
99 | nodemap_addr, nodemap_addr + nodemap_size); | ||
100 | return 0; | ||
101 | } | ||
79 | 102 | ||
80 | while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) | 103 | /* |
81 | shift++; | 104 | * The LSB of all start and end addresses in the node map is the value of the |
105 | * maximum possible shift. | ||
106 | */ | ||
107 | static int __init | ||
108 | extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) | ||
109 | { | ||
110 | int i, nodes_used = 0; | ||
111 | unsigned long start, end; | ||
112 | unsigned long bitfield = 0, memtop = 0; | ||
113 | |||
114 | for (i = 0; i < numnodes; i++) { | ||
115 | start = nodes[i].start; | ||
116 | end = nodes[i].end; | ||
117 | if (start >= end) | ||
118 | continue; | ||
119 | bitfield |= start; | ||
120 | nodes_used++; | ||
121 | if (end > memtop) | ||
122 | memtop = end; | ||
123 | } | ||
124 | if (nodes_used <= 1) | ||
125 | i = 63; | ||
126 | else | ||
127 | i = find_first_bit(&bitfield, sizeof(unsigned long)*8); | ||
128 | memnodemapsize = (memtop >> i)+1; | ||
129 | return i; | ||
130 | } | ||
131 | |||
132 | int __init compute_hash_shift(struct bootnode *nodes, int numnodes) | ||
133 | { | ||
134 | int shift; | ||
82 | 135 | ||
136 | shift = extract_lsb_from_nodes(nodes, numnodes); | ||
137 | if (allocate_cachealigned_memnodemap()) | ||
138 | return -1; | ||
83 | printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", | 139 | printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", |
84 | shift); | 140 | shift); |
85 | 141 | ||
@@ -216,31 +272,113 @@ void __init numa_init_array(void) | |||
216 | } | 272 | } |
217 | 273 | ||
218 | #ifdef CONFIG_NUMA_EMU | 274 | #ifdef CONFIG_NUMA_EMU |
275 | /* Numa emulation */ | ||
219 | int numa_fake __initdata = 0; | 276 | int numa_fake __initdata = 0; |
220 | 277 | ||
221 | /* Numa emulation */ | 278 | /* |
279 | * This function is used to find out if the start and end correspond to | ||
280 | * different zones. | ||
281 | */ | ||
282 | int zone_cross_over(unsigned long start, unsigned long end) | ||
283 | { | ||
284 | if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && | ||
285 | (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) | ||
286 | return 1; | ||
287 | return 0; | ||
288 | } | ||
289 | |||
222 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | 290 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) |
223 | { | 291 | { |
224 | int i; | 292 | int i, big; |
225 | struct bootnode nodes[MAX_NUMNODES]; | 293 | struct bootnode nodes[MAX_NUMNODES]; |
226 | unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; | 294 | unsigned long sz, old_sz; |
295 | unsigned long hole_size; | ||
296 | unsigned long start, end; | ||
297 | unsigned long max_addr = (end_pfn << PAGE_SHIFT); | ||
298 | |||
299 | start = (start_pfn << PAGE_SHIFT); | ||
300 | hole_size = e820_hole_size(start, max_addr); | ||
301 | sz = (max_addr - start - hole_size) / numa_fake; | ||
227 | 302 | ||
228 | /* Kludge needed for the hash function */ | 303 | /* Kludge needed for the hash function */ |
229 | if (hweight64(sz) > 1) { | ||
230 | unsigned long x = 1; | ||
231 | while ((x << 1) < sz) | ||
232 | x <<= 1; | ||
233 | if (x < sz/2) | ||
234 | printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); | ||
235 | sz = x; | ||
236 | } | ||
237 | 304 | ||
305 | old_sz = sz; | ||
306 | /* | ||
307 | * Round down to the nearest FAKE_NODE_MIN_SIZE. | ||
308 | */ | ||
309 | sz &= FAKE_NODE_MIN_HASH_MASK; | ||
310 | |||
311 | /* | ||
312 | * We ensure that each node is at least 64MB big. Smaller than this | ||
313 | * size can cause VM hiccups. | ||
314 | */ | ||
315 | if (sz == 0) { | ||
316 | printk(KERN_INFO "Not enough memory for %d nodes. Reducing " | ||
317 | "the number of nodes\n", numa_fake); | ||
318 | numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; | ||
319 | printk(KERN_INFO "Number of fake nodes will be = %d\n", | ||
320 | numa_fake); | ||
321 | sz = FAKE_NODE_MIN_SIZE; | ||
322 | } | ||
323 | /* | ||
324 | * Find out how many nodes can get an extra NODE_MIN_SIZE granule. | ||
325 | * This logic ensures the extra memory gets distributed among as many | ||
326 | * nodes as possible (as compared to one single node getting all that | ||
327 | * extra memory. | ||
328 | */ | ||
329 | big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; | ||
330 | printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " | ||
331 | "%d\n", | ||
332 | (sz >> 20), (hole_size >> 20), big); | ||
238 | memset(&nodes,0,sizeof(nodes)); | 333 | memset(&nodes,0,sizeof(nodes)); |
334 | end = start; | ||
239 | for (i = 0; i < numa_fake; i++) { | 335 | for (i = 0; i < numa_fake; i++) { |
240 | nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; | 336 | /* |
337 | * In case we are not able to allocate enough memory for all | ||
338 | * the nodes, we reduce the number of fake nodes. | ||
339 | */ | ||
340 | if (end >= max_addr) { | ||
341 | numa_fake = i - 1; | ||
342 | break; | ||
343 | } | ||
344 | start = nodes[i].start = end; | ||
345 | /* | ||
346 | * Final node can have all the remaining memory. | ||
347 | */ | ||
241 | if (i == numa_fake-1) | 348 | if (i == numa_fake-1) |
242 | sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; | 349 | sz = max_addr - start; |
243 | nodes[i].end = nodes[i].start + sz; | 350 | end = nodes[i].start + sz; |
351 | /* | ||
352 | * First "big" number of nodes get extra granule. | ||
353 | */ | ||
354 | if (i < big) | ||
355 | end += FAKE_NODE_MIN_SIZE; | ||
356 | /* | ||
357 | * Iterate over the range to ensure that this node gets at | ||
358 | * least sz amount of RAM (excluding holes) | ||
359 | */ | ||
360 | while ((end - start - e820_hole_size(start, end)) < sz) { | ||
361 | end += FAKE_NODE_MIN_SIZE; | ||
362 | if (end >= max_addr) | ||
363 | break; | ||
364 | } | ||
365 | /* | ||
366 | * Look at the next node to make sure there is some real memory | ||
367 | * to map. Bad things happen when the only memory present | ||
368 | * in a zone on a fake node is IO hole. | ||
369 | */ | ||
370 | while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { | ||
371 | if (zone_cross_over(start, end + sz)) { | ||
372 | end = (MAX_DMA32_PFN << PAGE_SHIFT); | ||
373 | break; | ||
374 | } | ||
375 | if (end >= max_addr) | ||
376 | break; | ||
377 | end += FAKE_NODE_MIN_SIZE; | ||
378 | } | ||
379 | if (end > max_addr) | ||
380 | end = max_addr; | ||
381 | nodes[i].end = end; | ||
244 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", | 382 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", |
245 | i, | 383 | i, |
246 | nodes[i].start, nodes[i].end, | 384 | nodes[i].start, nodes[i].end, |
@@ -290,6 +428,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | |||
290 | end_pfn << PAGE_SHIFT); | 428 | end_pfn << PAGE_SHIFT); |
291 | /* setup dummy node covering all memory */ | 429 | /* setup dummy node covering all memory */ |
292 | memnode_shift = 63; | 430 | memnode_shift = 63; |
431 | memnodemap = memnode.embedded_map; | ||
293 | memnodemap[0] = 0; | 432 | memnodemap[0] = 0; |
294 | nodes_clear(node_online_map); | 433 | nodes_clear(node_online_map); |
295 | node_set_online(0); | 434 | node_set_online(0); |
@@ -321,20 +460,6 @@ unsigned long __init numa_free_all_bootmem(void) | |||
321 | return pages; | 460 | return pages; |
322 | } | 461 | } |
323 | 462 | ||
324 | #ifdef CONFIG_SPARSEMEM | ||
325 | static void __init arch_sparse_init(void) | ||
326 | { | ||
327 | int i; | ||
328 | |||
329 | for_each_online_node(i) | ||
330 | memory_present(i, node_start_pfn(i), node_end_pfn(i)); | ||
331 | |||
332 | sparse_init(); | ||
333 | } | ||
334 | #else | ||
335 | #define arch_sparse_init() do {} while (0) | ||
336 | #endif | ||
337 | |||
338 | void __init paging_init(void) | 463 | void __init paging_init(void) |
339 | { | 464 | { |
340 | int i; | 465 | int i; |
@@ -344,7 +469,8 @@ void __init paging_init(void) | |||
344 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; | 469 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; |
345 | max_zone_pfns[ZONE_NORMAL] = end_pfn; | 470 | max_zone_pfns[ZONE_NORMAL] = end_pfn; |
346 | 471 | ||
347 | arch_sparse_init(); | 472 | sparse_memory_present_with_active_regions(MAX_NUMNODES); |
473 | sparse_init(); | ||
348 | 474 | ||
349 | for_each_online_node(i) { | 475 | for_each_online_node(i) { |
350 | setup_node_zones(i); | 476 | setup_node_zones(i); |
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index ccb91dd996a9..65c5eaa59905 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c | |||
@@ -107,6 +107,7 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) | |||
107 | pud_t *pud; | 107 | pud_t *pud; |
108 | pmd_t *pmd; | 108 | pmd_t *pmd; |
109 | pte_t large_pte; | 109 | pte_t large_pte; |
110 | unsigned long pfn; | ||
110 | 111 | ||
111 | pgd = pgd_offset_k(address); | 112 | pgd = pgd_offset_k(address); |
112 | BUG_ON(pgd_none(*pgd)); | 113 | BUG_ON(pgd_none(*pgd)); |
@@ -114,7 +115,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) | |||
114 | BUG_ON(pud_none(*pud)); | 115 | BUG_ON(pud_none(*pud)); |
115 | pmd = pmd_offset(pud, address); | 116 | pmd = pmd_offset(pud, address); |
116 | BUG_ON(pmd_val(*pmd) & _PAGE_PSE); | 117 | BUG_ON(pmd_val(*pmd) & _PAGE_PSE); |
117 | large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); | 118 | pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; |
119 | large_pte = pfn_pte(pfn, ref_prot); | ||
118 | large_pte = pte_mkhuge(large_pte); | 120 | large_pte = pte_mkhuge(large_pte); |
119 | set_pte((pte_t *)pmd, large_pte); | 121 | set_pte((pte_t *)pmd, large_pte); |
120 | } | 122 | } |