aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/Makefile_6410
-rw-r--r--arch/x86/mm/extable_64.c34
-rw-r--r--arch/x86/mm/fault_64.c636
-rw-r--r--arch/x86/mm/init_64.c750
-rw-r--r--arch/x86/mm/ioremap_64.c210
-rw-r--r--arch/x86/mm/k8topology_64.c182
-rw-r--r--arch/x86/mm/mmap_64.c29
-rw-r--r--arch/x86/mm/numa_64.c648
-rw-r--r--arch/x86/mm/pageattr_64.c249
-rw-r--r--arch/x86/mm/srat_64.c566
11 files changed, 3315 insertions, 1 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 7317648e6587..983291096848 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
1ifeq ($(CONFIG_X86_32),y) 1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/mm/Makefile_32 2include ${srctree}/arch/x86/mm/Makefile_32
3else 3else
4include ${srctree}/arch/x86_64/mm/Makefile_64 4include ${srctree}/arch/x86/mm/Makefile_64
5endif 5endif
diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64
new file mode 100644
index 000000000000..6bcb47945b87
--- /dev/null
+++ b/arch/x86/mm/Makefile_64
@@ -0,0 +1,10 @@
1#
2# Makefile for the linux x86_64-specific parts of the memory manager.
3#
4
5obj-y := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
7obj-$(CONFIG_NUMA) += numa_64.o
8obj-$(CONFIG_K8_NUMA) += k8topology_64.o
9obj-$(CONFIG_ACPI_NUMA) += srat_64.o
10
diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c
new file mode 100644
index 000000000000..79ac6e7100af
--- /dev/null
+++ b/arch/x86/mm/extable_64.c
@@ -0,0 +1,34 @@
1/*
2 * linux/arch/x86_64/mm/extable.c
3 */
4
5#include <linux/module.h>
6#include <linux/spinlock.h>
7#include <linux/init.h>
8#include <asm/uaccess.h>
9
10/* Simple binary search */
11const struct exception_table_entry *
12search_extable(const struct exception_table_entry *first,
13 const struct exception_table_entry *last,
14 unsigned long value)
15{
16 /* Work around a B stepping K8 bug */
17 if ((value >> 32) == 0)
18 value |= 0xffffffffUL << 32;
19
20 while (first <= last) {
21 const struct exception_table_entry *mid;
22 long diff;
23
24 mid = (last - first) / 2 + first;
25 diff = mid->insn - value;
26 if (diff == 0)
27 return mid;
28 else if (diff < 0)
29 first = mid+1;
30 else
31 last = mid-1;
32 }
33 return NULL;
34}
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
new file mode 100644
index 000000000000..54816adb8e93
--- /dev/null
+++ b/arch/x86/mm/fault_64.c
@@ -0,0 +1,636 @@
1/*
2 * linux/arch/x86-64/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
6 */
7
8#include <linux/signal.h>
9#include <linux/sched.h>
10#include <linux/kernel.h>
11#include <linux/errno.h>
12#include <linux/string.h>
13#include <linux/types.h>
14#include <linux/ptrace.h>
15#include <linux/mman.h>
16#include <linux/mm.h>
17#include <linux/smp.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/tty.h>
21#include <linux/vt_kern.h> /* For unblank_screen() */
22#include <linux/compiler.h>
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28
29#include <asm/system.h>
30#include <asm/pgalloc.h>
31#include <asm/smp.h>
32#include <asm/tlbflush.h>
33#include <asm/proto.h>
34#include <asm-generic/sections.h>
35
36/* Page fault error code bits */
37#define PF_PROT (1<<0) /* or no page found */
38#define PF_WRITE (1<<1)
39#define PF_USER (1<<2)
40#define PF_RSVD (1<<3)
41#define PF_INSTR (1<<4)
42
43static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
44
45/* Hook to register for page fault notifications */
46int register_page_fault_notifier(struct notifier_block *nb)
47{
48 vmalloc_sync_all();
49 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
50}
51EXPORT_SYMBOL_GPL(register_page_fault_notifier);
52
53int unregister_page_fault_notifier(struct notifier_block *nb)
54{
55 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
56}
57EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
58
59static inline int notify_page_fault(struct pt_regs *regs, long err)
60{
61 struct die_args args = {
62 .regs = regs,
63 .str = "page fault",
64 .err = err,
65 .trapnr = 14,
66 .signr = SIGSEGV
67 };
68 return atomic_notifier_call_chain(&notify_page_fault_chain,
69 DIE_PAGE_FAULT, &args);
70}
71
72/* Sometimes the CPU reports invalid exceptions on prefetch.
73 Check that here and ignore.
74 Opcode checker based on code by Richard Brunner */
75static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
76 unsigned long error_code)
77{
78 unsigned char *instr;
79 int scan_more = 1;
80 int prefetch = 0;
81 unsigned char *max_instr;
82
83 /* If it was a exec fault ignore */
84 if (error_code & PF_INSTR)
85 return 0;
86
87 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
88 max_instr = instr + 15;
89
90 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
91 return 0;
92
93 while (scan_more && instr < max_instr) {
94 unsigned char opcode;
95 unsigned char instr_hi;
96 unsigned char instr_lo;
97
98 if (probe_kernel_address(instr, opcode))
99 break;
100
101 instr_hi = opcode & 0xf0;
102 instr_lo = opcode & 0x0f;
103 instr++;
104
105 switch (instr_hi) {
106 case 0x20:
107 case 0x30:
108 /* Values 0x26,0x2E,0x36,0x3E are valid x86
109 prefixes. In long mode, the CPU will signal
110 invalid opcode if some of these prefixes are
111 present so we will never get here anyway */
112 scan_more = ((instr_lo & 7) == 0x6);
113 break;
114
115 case 0x40:
116 /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
117 Need to figure out under what instruction mode the
118 instruction was issued ... */
119 /* Could check the LDT for lm, but for now it's good
120 enough to assume that long mode only uses well known
121 segments or kernel. */
122 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
123 break;
124
125 case 0x60:
126 /* 0x64 thru 0x67 are valid prefixes in all modes. */
127 scan_more = (instr_lo & 0xC) == 0x4;
128 break;
129 case 0xF0:
130 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
131 scan_more = !instr_lo || (instr_lo>>1) == 1;
132 break;
133 case 0x00:
134 /* Prefetch instruction is 0x0F0D or 0x0F18 */
135 scan_more = 0;
136 if (probe_kernel_address(instr, opcode))
137 break;
138 prefetch = (instr_lo == 0xF) &&
139 (opcode == 0x0D || opcode == 0x18);
140 break;
141 default:
142 scan_more = 0;
143 break;
144 }
145 }
146 return prefetch;
147}
148
/*
 * Probe one unsigned long at @p with probe_kernel_address() and hand back
 * its result (callers treat non-zero as "cannot be read").
 */
static int bad_address(void *p)
{
	unsigned long scratch;
	unsigned long *word = p;

	return probe_kernel_address(word, scratch);
}
154
155void dump_pagetable(unsigned long address)
156{
157 pgd_t *pgd;
158 pud_t *pud;
159 pmd_t *pmd;
160 pte_t *pte;
161
162 pgd = (pgd_t *)read_cr3();
163
164 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
165 pgd += pgd_index(address);
166 if (bad_address(pgd)) goto bad;
167 printk("PGD %lx ", pgd_val(*pgd));
168 if (!pgd_present(*pgd)) goto ret;
169
170 pud = pud_offset(pgd, address);
171 if (bad_address(pud)) goto bad;
172 printk("PUD %lx ", pud_val(*pud));
173 if (!pud_present(*pud)) goto ret;
174
175 pmd = pmd_offset(pud, address);
176 if (bad_address(pmd)) goto bad;
177 printk("PMD %lx ", pmd_val(*pmd));
178 if (!pmd_present(*pmd)) goto ret;
179
180 pte = pte_offset_kernel(pmd, address);
181 if (bad_address(pte)) goto bad;
182 printk("PTE %lx", pte_val(*pte));
183ret:
184 printk("\n");
185 return;
186bad:
187 printk("BAD\n");
188}
189
190static const char errata93_warning[] =
191KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
192KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
193KERN_ERR "******* Please consider a BIOS update.\n"
194KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
195
196/* Workaround for K8 erratum #93 & buggy BIOS.
197 BIOS SMM functions are required to use a specific workaround
198 to avoid corruption of the 64bit RIP register on C stepping K8.
199 A lot of BIOS that didn't get tested properly miss this.
200 The OS sees this as a page fault with the upper 32bits of RIP cleared.
201 Try to work around it here.
202 Note we only handle faults in kernel here. */
203
204static int is_errata93(struct pt_regs *regs, unsigned long address)
205{
206 static int warned;
207 if (address != regs->rip)
208 return 0;
209 if ((address >> 32) != 0)
210 return 0;
211 address |= 0xffffffffUL << 32;
212 if ((address >= (u64)_stext && address <= (u64)_etext) ||
213 (address >= MODULES_VADDR && address <= MODULES_END)) {
214 if (!warned) {
215 printk(errata93_warning);
216 warned = 1;
217 }
218 regs->rip = address;
219 return 1;
220 }
221 return 0;
222}
223
224static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
225 unsigned long error_code)
226{
227 unsigned long flags = oops_begin();
228 struct task_struct *tsk;
229
230 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
231 current->comm, address);
232 dump_pagetable(address);
233 tsk = current;
234 tsk->thread.cr2 = address;
235 tsk->thread.trap_no = 14;
236 tsk->thread.error_code = error_code;
237 __die("Bad pagetable", regs, error_code);
238 oops_end(flags);
239 do_exit(SIGKILL);
240}
241
242/*
243 * Handle a fault on the vmalloc area
244 *
245 * This assumes no large pages in there.
246 */
247static int vmalloc_fault(unsigned long address)
248{
249 pgd_t *pgd, *pgd_ref;
250 pud_t *pud, *pud_ref;
251 pmd_t *pmd, *pmd_ref;
252 pte_t *pte, *pte_ref;
253
254 /* Copy kernel mappings over when needed. This can also
255 happen within a race in page table update. In the later
256 case just flush. */
257
258 pgd = pgd_offset(current->mm ?: &init_mm, address);
259 pgd_ref = pgd_offset_k(address);
260 if (pgd_none(*pgd_ref))
261 return -1;
262 if (pgd_none(*pgd))
263 set_pgd(pgd, *pgd_ref);
264 else
265 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
266
267 /* Below here mismatches are bugs because these lower tables
268 are shared */
269
270 pud = pud_offset(pgd, address);
271 pud_ref = pud_offset(pgd_ref, address);
272 if (pud_none(*pud_ref))
273 return -1;
274 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
275 BUG();
276 pmd = pmd_offset(pud, address);
277 pmd_ref = pmd_offset(pud_ref, address);
278 if (pmd_none(*pmd_ref))
279 return -1;
280 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
281 BUG();
282 pte_ref = pte_offset_kernel(pmd_ref, address);
283 if (!pte_present(*pte_ref))
284 return -1;
285 pte = pte_offset_kernel(pmd, address);
286 /* Don't use pte_page here, because the mappings can point
287 outside mem_map, and the NUMA hash lookup cannot handle
288 that. */
289 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
290 BUG();
291 return 0;
292}
293
294static int page_fault_trace;
295int show_unhandled_signals = 1;
296
297/*
298 * This routine handles page faults. It determines the address,
299 * and the problem, and then passes it off to one of the appropriate
300 * routines.
301 */
302asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
303 unsigned long error_code)
304{
305 struct task_struct *tsk;
306 struct mm_struct *mm;
307 struct vm_area_struct * vma;
308 unsigned long address;
309 const struct exception_table_entry *fixup;
310 int write, fault;
311 unsigned long flags;
312 siginfo_t info;
313
314 tsk = current;
315 mm = tsk->mm;
316 prefetchw(&mm->mmap_sem);
317
318 /* get the address */
319 address = read_cr2();
320
321 info.si_code = SEGV_MAPERR;
322
323
324 /*
325 * We fault-in kernel-space virtual memory on-demand. The
326 * 'reference' page table is init_mm.pgd.
327 *
328 * NOTE! We MUST NOT take any locks for this case. We may
329 * be in an interrupt or a critical region, and should
330 * only copy the information from the master page table,
331 * nothing more.
332 *
333 * This verifies that the fault happens in kernel space
334 * (error_code & 4) == 0, and that the fault was not a
335 * protection error (error_code & 9) == 0.
336 */
337 if (unlikely(address >= TASK_SIZE64)) {
338 /*
339 * Don't check for the module range here: its PML4
340 * is always initialized because it's shared with the main
341 * kernel text. Only vmalloc may need PML4 syncups.
342 */
343 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
344 ((address >= VMALLOC_START && address < VMALLOC_END))) {
345 if (vmalloc_fault(address) >= 0)
346 return;
347 }
348 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
349 return;
350 /*
351 * Don't take the mm semaphore here. If we fixup a prefetch
352 * fault we could otherwise deadlock.
353 */
354 goto bad_area_nosemaphore;
355 }
356
357 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
358 return;
359
360 if (likely(regs->eflags & X86_EFLAGS_IF))
361 local_irq_enable();
362
363 if (unlikely(page_fault_trace))
364 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
365 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
366
367 if (unlikely(error_code & PF_RSVD))
368 pgtable_bad(address, regs, error_code);
369
370 /*
371 * If we're in an interrupt or have no user
372 * context, we must not take the fault..
373 */
374 if (unlikely(in_atomic() || !mm))
375 goto bad_area_nosemaphore;
376
377 /*
378 * User-mode registers count as a user access even for any
379 * potential system fault or CPU buglet.
380 */
381 if (user_mode_vm(regs))
382 error_code |= PF_USER;
383
384 again:
385 /* When running in the kernel we expect faults to occur only to
386 * addresses in user space. All other faults represent errors in the
387 * kernel and should generate an OOPS. Unfortunatly, in the case of an
388 * erroneous fault occurring in a code path which already holds mmap_sem
389 * we will deadlock attempting to validate the fault against the
390 * address space. Luckily the kernel only validly references user
391 * space from well defined areas of code, which are listed in the
392 * exceptions table.
393 *
394 * As the vast majority of faults will be valid we will only perform
395 * the source reference check when there is a possibilty of a deadlock.
396 * Attempt to lock the address space, if we cannot we then validate the
397 * source. If this is invalid we can skip the address space check,
398 * thus avoiding the deadlock.
399 */
400 if (!down_read_trylock(&mm->mmap_sem)) {
401 if ((error_code & PF_USER) == 0 &&
402 !search_exception_tables(regs->rip))
403 goto bad_area_nosemaphore;
404 down_read(&mm->mmap_sem);
405 }
406
407 vma = find_vma(mm, address);
408 if (!vma)
409 goto bad_area;
410 if (likely(vma->vm_start <= address))
411 goto good_area;
412 if (!(vma->vm_flags & VM_GROWSDOWN))
413 goto bad_area;
414 if (error_code & 4) {
415 /* Allow userspace just enough access below the stack pointer
416 * to let the 'enter' instruction work.
417 */
418 if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
419 goto bad_area;
420 }
421 if (expand_stack(vma, address))
422 goto bad_area;
423/*
424 * Ok, we have a good vm_area for this memory access, so
425 * we can handle it..
426 */
427good_area:
428 info.si_code = SEGV_ACCERR;
429 write = 0;
430 switch (error_code & (PF_PROT|PF_WRITE)) {
431 default: /* 3: write, present */
432 /* fall through */
433 case PF_WRITE: /* write, not present */
434 if (!(vma->vm_flags & VM_WRITE))
435 goto bad_area;
436 write++;
437 break;
438 case PF_PROT: /* read, present */
439 goto bad_area;
440 case 0: /* read, not present */
441 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
442 goto bad_area;
443 }
444
445 /*
446 * If for any reason at all we couldn't handle the fault,
447 * make sure we exit gracefully rather than endlessly redo
448 * the fault.
449 */
450 fault = handle_mm_fault(mm, vma, address, write);
451 if (unlikely(fault & VM_FAULT_ERROR)) {
452 if (fault & VM_FAULT_OOM)
453 goto out_of_memory;
454 else if (fault & VM_FAULT_SIGBUS)
455 goto do_sigbus;
456 BUG();
457 }
458 if (fault & VM_FAULT_MAJOR)
459 tsk->maj_flt++;
460 else
461 tsk->min_flt++;
462 up_read(&mm->mmap_sem);
463 return;
464
465/*
466 * Something tried to access memory that isn't in our memory map..
467 * Fix it, but check if it's kernel or user first..
468 */
469bad_area:
470 up_read(&mm->mmap_sem);
471
472bad_area_nosemaphore:
473 /* User mode accesses just cause a SIGSEGV */
474 if (error_code & PF_USER) {
475
476 /*
477 * It's possible to have interrupts off here.
478 */
479 local_irq_enable();
480
481 if (is_prefetch(regs, address, error_code))
482 return;
483
484 /* Work around K8 erratum #100 K8 in compat mode
485 occasionally jumps to illegal addresses >4GB. We
486 catch this here in the page fault handler because
487 these addresses are not reachable. Just detect this
488 case and return. Any code segment in LDT is
489 compatibility mode. */
490 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
491 (address >> 32))
492 return;
493
494 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
495 printk_ratelimit()) {
496 printk(
497 "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
498 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
499 tsk->comm, tsk->pid, address, regs->rip,
500 regs->rsp, error_code);
501 }
502
503 tsk->thread.cr2 = address;
504 /* Kernel addresses are always protection faults */
505 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
506 tsk->thread.trap_no = 14;
507 info.si_signo = SIGSEGV;
508 info.si_errno = 0;
509 /* info.si_code has been set above */
510 info.si_addr = (void __user *)address;
511 force_sig_info(SIGSEGV, &info, tsk);
512 return;
513 }
514
515no_context:
516
517 /* Are we prepared to handle this kernel fault? */
518 fixup = search_exception_tables(regs->rip);
519 if (fixup) {
520 regs->rip = fixup->fixup;
521 return;
522 }
523
524 /*
525 * Hall of shame of CPU/BIOS bugs.
526 */
527
528 if (is_prefetch(regs, address, error_code))
529 return;
530
531 if (is_errata93(regs, address))
532 return;
533
534/*
535 * Oops. The kernel tried to access some bad page. We'll have to
536 * terminate things with extreme prejudice.
537 */
538
539 flags = oops_begin();
540
541 if (address < PAGE_SIZE)
542 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
543 else
544 printk(KERN_ALERT "Unable to handle kernel paging request");
545 printk(" at %016lx RIP: \n" KERN_ALERT,address);
546 printk_address(regs->rip);
547 dump_pagetable(address);
548 tsk->thread.cr2 = address;
549 tsk->thread.trap_no = 14;
550 tsk->thread.error_code = error_code;
551 __die("Oops", regs, error_code);
552 /* Executive summary in case the body of the oops scrolled away */
553 printk(KERN_EMERG "CR2: %016lx\n", address);
554 oops_end(flags);
555 do_exit(SIGKILL);
556
557/*
558 * We ran out of memory, or some other thing happened to us that made
559 * us unable to handle the page fault gracefully.
560 */
561out_of_memory:
562 up_read(&mm->mmap_sem);
563 if (is_init(current)) {
564 yield();
565 goto again;
566 }
567 printk("VM: killing process %s\n", tsk->comm);
568 if (error_code & 4)
569 do_group_exit(SIGKILL);
570 goto no_context;
571
572do_sigbus:
573 up_read(&mm->mmap_sem);
574
575 /* Kernel mode? Handle exceptions or die */
576 if (!(error_code & PF_USER))
577 goto no_context;
578
579 tsk->thread.cr2 = address;
580 tsk->thread.error_code = error_code;
581 tsk->thread.trap_no = 14;
582 info.si_signo = SIGBUS;
583 info.si_errno = 0;
584 info.si_code = BUS_ADRERR;
585 info.si_addr = (void __user *)address;
586 force_sig_info(SIGBUS, &info, tsk);
587 return;
588}
589
590DEFINE_SPINLOCK(pgd_lock);
591LIST_HEAD(pgd_list);
592
593void vmalloc_sync_all(void)
594{
595 /* Note that races in the updates of insync and start aren't
596 problematic:
597 insync can only get set bits added, and updates to start are only
598 improving performance (without affecting correctness if undone). */
599 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
600 static unsigned long start = VMALLOC_START & PGDIR_MASK;
601 unsigned long address;
602
603 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
604 if (!test_bit(pgd_index(address), insync)) {
605 const pgd_t *pgd_ref = pgd_offset_k(address);
606 struct page *page;
607
608 if (pgd_none(*pgd_ref))
609 continue;
610 spin_lock(&pgd_lock);
611 list_for_each_entry(page, &pgd_list, lru) {
612 pgd_t *pgd;
613 pgd = (pgd_t *)page_address(page) + pgd_index(address);
614 if (pgd_none(*pgd))
615 set_pgd(pgd, *pgd_ref);
616 else
617 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
618 }
619 spin_unlock(&pgd_lock);
620 set_bit(pgd_index(address), insync);
621 }
622 if (address == start)
623 start = address + PGDIR_SIZE;
624 }
625 /* Check that there is no need to do the same for the modules area. */
626 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
627 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
628 (__START_KERNEL & PGDIR_MASK)));
629}
630
631static int __init enable_pagefaulttrace(char *str)
632{
633 page_fault_trace = 1;
634 return 1;
635}
636__setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
new file mode 100644
index 000000000000..458893b376f8
--- /dev/null
+++ b/arch/x86/mm/init_64.c
@@ -0,0 +1,750 @@
1/*
2 * linux/arch/x86_64/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 */
8
9#include <linux/signal.h>
10#include <linux/sched.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/string.h>
14#include <linux/types.h>
15#include <linux/ptrace.h>
16#include <linux/mman.h>
17#include <linux/mm.h>
18#include <linux/swap.h>
19#include <linux/smp.h>
20#include <linux/init.h>
21#include <linux/pagemap.h>
22#include <linux/bootmem.h>
23#include <linux/proc_fs.h>
24#include <linux/pci.h>
25#include <linux/pfn.h>
26#include <linux/poison.h>
27#include <linux/dma-mapping.h>
28#include <linux/module.h>
29#include <linux/memory_hotplug.h>
30#include <linux/nmi.h>
31
32#include <asm/processor.h>
33#include <asm/system.h>
34#include <asm/uaccess.h>
35#include <asm/pgtable.h>
36#include <asm/pgalloc.h>
37#include <asm/dma.h>
38#include <asm/fixmap.h>
39#include <asm/e820.h>
40#include <asm/apic.h>
41#include <asm/tlb.h>
42#include <asm/mmu_context.h>
43#include <asm/proto.h>
44#include <asm/smp.h>
45#include <asm/sections.h>
46
47#ifndef Dprintk
48#define Dprintk(x...)
49#endif
50
51const struct dma_mapping_ops* dma_ops;
52EXPORT_SYMBOL(dma_ops);
53
54static unsigned long dma_reserve __initdata;
55
56DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
57
58/*
59 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
60 * physical space so we can cache the place of the first one and move
61 * around without checking the pgd every time.
62 */
63
64void show_mem(void)
65{
66 long i, total = 0, reserved = 0;
67 long shared = 0, cached = 0;
68 pg_data_t *pgdat;
69 struct page *page;
70
71 printk(KERN_INFO "Mem-info:\n");
72 show_free_areas();
73 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
74
75 for_each_online_pgdat(pgdat) {
76 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
77 /* this loop can take a while with 256 GB and 4k pages
78 so update the NMI watchdog */
79 if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
80 touch_nmi_watchdog();
81 }
82 if (!pfn_valid(pgdat->node_start_pfn + i))
83 continue;
84 page = pfn_to_page(pgdat->node_start_pfn + i);
85 total++;
86 if (PageReserved(page))
87 reserved++;
88 else if (PageSwapCache(page))
89 cached++;
90 else if (page_count(page))
91 shared += page_count(page) - 1;
92 }
93 }
94 printk(KERN_INFO "%lu pages of RAM\n", total);
95 printk(KERN_INFO "%lu reserved pages\n",reserved);
96 printk(KERN_INFO "%lu pages shared\n",shared);
97 printk(KERN_INFO "%lu pages swap cached\n",cached);
98}
99
100int after_bootmem;
101
102static __init void *spp_getpage(void)
103{
104 void *ptr;
105 if (after_bootmem)
106 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
107 else
108 ptr = alloc_bootmem_pages(PAGE_SIZE);
109 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
110 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
111
112 Dprintk("spp_getpage %p\n", ptr);
113 return ptr;
114}
115
116static __init void set_pte_phys(unsigned long vaddr,
117 unsigned long phys, pgprot_t prot)
118{
119 pgd_t *pgd;
120 pud_t *pud;
121 pmd_t *pmd;
122 pte_t *pte, new_pte;
123
124 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
125
126 pgd = pgd_offset_k(vaddr);
127 if (pgd_none(*pgd)) {
128 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
129 return;
130 }
131 pud = pud_offset(pgd, vaddr);
132 if (pud_none(*pud)) {
133 pmd = (pmd_t *) spp_getpage();
134 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
135 if (pmd != pmd_offset(pud, 0)) {
136 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
137 return;
138 }
139 }
140 pmd = pmd_offset(pud, vaddr);
141 if (pmd_none(*pmd)) {
142 pte = (pte_t *) spp_getpage();
143 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
144 if (pte != pte_offset_kernel(pmd, 0)) {
145 printk("PAGETABLE BUG #02!\n");
146 return;
147 }
148 }
149 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
150
151 pte = pte_offset_kernel(pmd, vaddr);
152 if (!pte_none(*pte) &&
153 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
154 pte_ERROR(*pte);
155 set_pte(pte, new_pte);
156
157 /*
158 * It's enough to flush this one mapping.
159 * (PGE mappings get flushed as well)
160 */
161 __flush_tlb_one(vaddr);
162}
163
164/* NOTE: this is meant to be run only at boot */
165void __init
166__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
167{
168 unsigned long address = __fix_to_virt(idx);
169
170 if (idx >= __end_of_fixed_addresses) {
171 printk("Invalid __set_fixmap\n");
172 return;
173 }
174 set_pte_phys(address, phys, prot);
175}
176
177unsigned long __meminitdata table_start, table_end;
178
179static __meminit void *alloc_low_page(unsigned long *phys)
180{
181 unsigned long pfn = table_end++;
182 void *adr;
183
184 if (after_bootmem) {
185 adr = (void *)get_zeroed_page(GFP_ATOMIC);
186 *phys = __pa(adr);
187 return adr;
188 }
189
190 if (pfn >= end_pfn)
191 panic("alloc_low_page: ran out of memory");
192
193 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
194 memset(adr, 0, PAGE_SIZE);
195 *phys = pfn * PAGE_SIZE;
196 return adr;
197}
198
199static __meminit void unmap_low_page(void *adr)
200{
201
202 if (after_bootmem)
203 return;
204
205 early_iounmap(adr, PAGE_SIZE);
206}
207
208/* Must run before zap_low_mappings */
209__meminit void *early_ioremap(unsigned long addr, unsigned long size)
210{
211 unsigned long vaddr;
212 pmd_t *pmd, *last_pmd;
213 int i, pmds;
214
215 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
216 vaddr = __START_KERNEL_map;
217 pmd = level2_kernel_pgt;
218 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
219 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
220 for (i = 0; i < pmds; i++) {
221 if (pmd_present(pmd[i]))
222 goto next;
223 }
224 vaddr += addr & ~PMD_MASK;
225 addr &= PMD_MASK;
226 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
227 set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
228 __flush_tlb();
229 return (void *)vaddr;
230 next:
231 ;
232 }
233 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
234 return NULL;
235}
236
237/* To avoid virtual aliases later */
238__meminit void early_iounmap(void *addr, unsigned long size)
239{
240 unsigned long vaddr;
241 pmd_t *pmd;
242 int i, pmds;
243
244 vaddr = (unsigned long)addr;
245 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
246 pmd = level2_kernel_pgt + pmd_index(vaddr);
247 for (i = 0; i < pmds; i++)
248 pmd_clear(pmd + i);
249 __flush_tlb();
250}
251
252static void __meminit
253phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
254{
255 int i = pmd_index(address);
256
257 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
258 unsigned long entry;
259 pmd_t *pmd = pmd_page + pmd_index(address);
260
261 if (address >= end) {
262 if (!after_bootmem)
263 for (; i < PTRS_PER_PMD; i++, pmd++)
264 set_pmd(pmd, __pmd(0));
265 break;
266 }
267
268 if (pmd_val(*pmd))
269 continue;
270
271 entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
272 entry &= __supported_pte_mask;
273 set_pmd(pmd, __pmd(entry));
274 }
275}
276
277static void __meminit
278phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
279{
280 pmd_t *pmd = pmd_offset(pud,0);
281 spin_lock(&init_mm.page_table_lock);
282 phys_pmd_init(pmd, address, end);
283 spin_unlock(&init_mm.page_table_lock);
284 __flush_tlb_all();
285}
286
287static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
288{
289 int i = pud_index(addr);
290
291
292 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
293 unsigned long pmd_phys;
294 pud_t *pud = pud_page + pud_index(addr);
295 pmd_t *pmd;
296
297 if (addr >= end)
298 break;
299
300 if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
301 set_pud(pud, __pud(0));
302 continue;
303 }
304
305 if (pud_val(*pud)) {
306 phys_pmd_update(pud, addr, end);
307 continue;
308 }
309
310 pmd = alloc_low_page(&pmd_phys);
311 spin_lock(&init_mm.page_table_lock);
312 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
313 phys_pmd_init(pmd, addr, end);
314 spin_unlock(&init_mm.page_table_lock);
315 unmap_low_page(pmd);
316 }
317 __flush_tlb();
318}
319
320static void __init find_early_table_space(unsigned long end)
321{
322 unsigned long puds, pmds, tables, start;
323
324 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
325 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
326 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
327 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
328
329 /* RED-PEN putting page tables only on node 0 could
330 cause a hotspot and fill up ZONE_DMA. The page tables
331 need roughly 0.5KB per GB. */
332 start = 0x8000;
333 table_start = find_e820_area(start, end, tables);
334 if (table_start == -1UL)
335 panic("Cannot find space for the kernel page tables");
336
337 table_start >>= PAGE_SHIFT;
338 table_end = table_start;
339
340 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
341 end, table_start << PAGE_SHIFT,
342 (table_start << PAGE_SHIFT) + tables);
343}
344
345/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
346 This runs before bootmem is initialized and gets pages directly from the
347 physical memory. To access them they are temporarily mapped. */
348void __meminit init_memory_mapping(unsigned long start, unsigned long end)
349{
350 unsigned long next;
351
352 Dprintk("init_memory_mapping\n");
353
354 /*
355 * Find space for the kernel direct mapping tables.
356 * Later we should allocate these tables in the local node of the memory
357 * mapped. Unfortunately this is done currently before the nodes are
358 * discovered.
359 */
360 if (!after_bootmem)
361 find_early_table_space(end);
362
363 start = (unsigned long)__va(start);
364 end = (unsigned long)__va(end);
365
366 for (; start < end; start = next) {
367 unsigned long pud_phys;
368 pgd_t *pgd = pgd_offset_k(start);
369 pud_t *pud;
370
371 if (after_bootmem)
372 pud = pud_offset(pgd, start & PGDIR_MASK);
373 else
374 pud = alloc_low_page(&pud_phys);
375
376 next = start + PGDIR_SIZE;
377 if (next > end)
378 next = end;
379 phys_pud_init(pud, __pa(start), __pa(next));
380 if (!after_bootmem)
381 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
382 unmap_low_page(pud);
383 }
384
385 if (!after_bootmem)
386 mmu_cr4_features = read_cr4();
387 __flush_tlb_all();
388}
389
390#ifndef CONFIG_NUMA
391void __init paging_init(void)
392{
393 unsigned long max_zone_pfns[MAX_NR_ZONES];
394 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
395 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
396 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
397 max_zone_pfns[ZONE_NORMAL] = end_pfn;
398
399 memory_present(0, 0, end_pfn);
400 sparse_init();
401 free_area_init_nodes(max_zone_pfns);
402}
403#endif
404
405/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
406 from the CPU leading to inconsistent cache lines. address and size
407 must be aligned to 2MB boundaries.
408 Does nothing when the mapping doesn't exist. */
409void __init clear_kernel_mapping(unsigned long address, unsigned long size)
410{
411 unsigned long end = address + size;
412
413 BUG_ON(address & ~LARGE_PAGE_MASK);
414 BUG_ON(size & ~LARGE_PAGE_MASK);
415
416 for (; address < end; address += LARGE_PAGE_SIZE) {
417 pgd_t *pgd = pgd_offset_k(address);
418 pud_t *pud;
419 pmd_t *pmd;
420 if (pgd_none(*pgd))
421 continue;
422 pud = pud_offset(pgd, address);
423 if (pud_none(*pud))
424 continue;
425 pmd = pmd_offset(pud, address);
426 if (!pmd || pmd_none(*pmd))
427 continue;
428 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
429 /* Could handle this, but it should not happen currently. */
430 printk(KERN_ERR
431 "clear_kernel_mapping: mapping has been split. will leak memory\n");
432 pmd_ERROR(*pmd);
433 }
434 set_pmd(pmd, __pmd(0));
435 }
436 __flush_tlb_all();
437}
438
439/*
440 * Memory hotplug specific functions
441 */
442void online_page(struct page *page)
443{
444 ClearPageReserved(page);
445 init_page_count(page);
446 __free_page(page);
447 totalram_pages++;
448 num_physpages++;
449}
450
451#ifdef CONFIG_MEMORY_HOTPLUG
452/*
453 * Memory is added always to NORMAL zone. This means you will never get
454 * additional DMA/DMA32 memory.
455 */
456int arch_add_memory(int nid, u64 start, u64 size)
457{
458 struct pglist_data *pgdat = NODE_DATA(nid);
459 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
460 unsigned long start_pfn = start >> PAGE_SHIFT;
461 unsigned long nr_pages = size >> PAGE_SHIFT;
462 int ret;
463
464 init_memory_mapping(start, (start + size -1));
465
466 ret = __add_pages(zone, start_pfn, nr_pages);
467 if (ret)
468 goto error;
469
470 return ret;
471error:
472 printk("%s: Problem encountered in __add_pages!\n", __func__);
473 return ret;
474}
475EXPORT_SYMBOL_GPL(arch_add_memory);
476
477int remove_memory(u64 start, u64 size)
478{
479 return -EINVAL;
480}
481EXPORT_SYMBOL_GPL(remove_memory);
482
483#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
484int memory_add_physaddr_to_nid(u64 start)
485{
486 return 0;
487}
488EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
489#endif
490
491#endif /* CONFIG_MEMORY_HOTPLUG */
492
493#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
494/*
495 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
496 * just online the pages.
497 */
498int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
499{
500 int err = -EIO;
501 unsigned long pfn;
502 unsigned long total = 0, mem = 0;
503 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
504 if (pfn_valid(pfn)) {
505 online_page(pfn_to_page(pfn));
506 err = 0;
507 mem++;
508 }
509 total++;
510 }
511 if (!err) {
512 z->spanned_pages += total;
513 z->present_pages += mem;
514 z->zone_pgdat->node_spanned_pages += total;
515 z->zone_pgdat->node_present_pages += mem;
516 }
517 return err;
518}
519#endif
520
521static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
522 kcore_vsyscall;
523
524void __init mem_init(void)
525{
526 long codesize, reservedpages, datasize, initsize;
527
528 pci_iommu_alloc();
529
530 /* clear the zero-page */
531 memset(empty_zero_page, 0, PAGE_SIZE);
532
533 reservedpages = 0;
534
535 /* this will put all low memory onto the freelists */
536#ifdef CONFIG_NUMA
537 totalram_pages = numa_free_all_bootmem();
538#else
539 totalram_pages = free_all_bootmem();
540#endif
541 reservedpages = end_pfn - totalram_pages -
542 absent_pages_in_range(0, end_pfn);
543
544 after_bootmem = 1;
545
546 codesize = (unsigned long) &_etext - (unsigned long) &_text;
547 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
548 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
549
550 /* Register memory areas for /proc/kcore */
551 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
552 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
553 VMALLOC_END-VMALLOC_START);
554 kclist_add(&kcore_kernel, &_stext, _end - _stext);
555 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
556 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
557 VSYSCALL_END - VSYSCALL_START);
558
559 printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
560 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
561 end_pfn << (PAGE_SHIFT-10),
562 codesize >> 10,
563 reservedpages << (PAGE_SHIFT-10),
564 datasize >> 10,
565 initsize >> 10);
566}
567
568void free_init_pages(char *what, unsigned long begin, unsigned long end)
569{
570 unsigned long addr;
571
572 if (begin >= end)
573 return;
574
575 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
576 for (addr = begin; addr < end; addr += PAGE_SIZE) {
577 ClearPageReserved(virt_to_page(addr));
578 init_page_count(virt_to_page(addr));
579 memset((void *)(addr & ~(PAGE_SIZE-1)),
580 POISON_FREE_INITMEM, PAGE_SIZE);
581 if (addr >= __START_KERNEL_map)
582 change_page_attr_addr(addr, 1, __pgprot(0));
583 free_page(addr);
584 totalram_pages++;
585 }
586 if (addr > __START_KERNEL_map)
587 global_flush_tlb();
588}
589
590void free_initmem(void)
591{
592 free_init_pages("unused kernel memory",
593 (unsigned long)(&__init_begin),
594 (unsigned long)(&__init_end));
595}
596
597#ifdef CONFIG_DEBUG_RODATA
598
599void mark_rodata_ro(void)
600{
601 unsigned long start = (unsigned long)_stext, end;
602
603#ifdef CONFIG_HOTPLUG_CPU
604 /* It must still be possible to apply SMP alternatives. */
605 if (num_possible_cpus() > 1)
606 start = (unsigned long)_etext;
607#endif
608
609#ifdef CONFIG_KPROBES
610 start = (unsigned long)__start_rodata;
611#endif
612
613 end = (unsigned long)__end_rodata;
614 start = (start + PAGE_SIZE - 1) & PAGE_MASK;
615 end &= PAGE_MASK;
616 if (end <= start)
617 return;
618
619 change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
620
621 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
622 (end - start) >> 10);
623
624 /*
625 * change_page_attr_addr() requires a global_flush_tlb() call after it.
626 * We do this after the printk so that if something went wrong in the
627 * change, the printk gets out at least to give a better debug hint
628 * of who is the culprit.
629 */
630 global_flush_tlb();
631}
632#endif
633
634#ifdef CONFIG_BLK_DEV_INITRD
/* Free the virtual range [start, end) that held the initrd. */
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
639#endif
640
641void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
642{
643#ifdef CONFIG_NUMA
644 int nid = phys_to_nid(phys);
645#endif
646 unsigned long pfn = phys >> PAGE_SHIFT;
647 if (pfn >= end_pfn) {
648 /* This can happen with kdump kernels when accessing firmware
649 tables. */
650 if (pfn < end_pfn_map)
651 return;
652 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
653 phys, len);
654 return;
655 }
656
657 /* Should check here against the e820 map to avoid double free */
658#ifdef CONFIG_NUMA
659 reserve_bootmem_node(NODE_DATA(nid), phys, len);
660#else
661 reserve_bootmem(phys, len);
662#endif
663 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
664 dma_reserve += len / PAGE_SIZE;
665 set_dma_reserve(dma_reserve);
666 }
667}
668
669int kern_addr_valid(unsigned long addr)
670{
671 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
672 pgd_t *pgd;
673 pud_t *pud;
674 pmd_t *pmd;
675 pte_t *pte;
676
677 if (above != 0 && above != -1UL)
678 return 0;
679
680 pgd = pgd_offset_k(addr);
681 if (pgd_none(*pgd))
682 return 0;
683
684 pud = pud_offset(pgd, addr);
685 if (pud_none(*pud))
686 return 0;
687
688 pmd = pmd_offset(pud, addr);
689 if (pmd_none(*pmd))
690 return 0;
691 if (pmd_large(*pmd))
692 return pfn_valid(pmd_pfn(*pmd));
693
694 pte = pte_offset_kernel(pmd, addr);
695 if (pte_none(*pte))
696 return 0;
697 return pfn_valid(pte_pfn(*pte));
698}
699
700/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
701 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
702 not need special handling anymore. */
703
704static struct vm_area_struct gate_vma = {
705 .vm_start = VSYSCALL_START,
706 .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
707 .vm_page_prot = PAGE_READONLY_EXEC,
708 .vm_flags = VM_READ | VM_EXEC
709};
710
711struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
712{
713#ifdef CONFIG_IA32_EMULATION
714 if (test_tsk_thread_flag(tsk, TIF_IA32))
715 return NULL;
716#endif
717 return &gate_vma;
718}
719
720int in_gate_area(struct task_struct *task, unsigned long addr)
721{
722 struct vm_area_struct *vma = get_gate_vma(task);
723 if (!vma)
724 return 0;
725 return (addr >= vma->vm_start) && (addr < vma->vm_end);
726}
727
728/* Use this when you have no reliable task/vma, typically from interrupt
729 * context. It is less reliable than using the task's vma and may give
730 * false positives.
731 */
732int in_gate_area_no_task(unsigned long addr)
733{
734 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
735}
736
737void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
738{
739 return __alloc_bootmem_core(pgdat->bdata, size,
740 SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
741}
742
743const char *arch_vma_name(struct vm_area_struct *vma)
744{
745 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
746 return "[vdso]";
747 if (vma == &gate_vma)
748 return "[vsyscall]";
749 return NULL;
750}
diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c
new file mode 100644
index 000000000000..6cac90aa5032
--- /dev/null
+++ b/arch/x86/mm/ioremap_64.c
@@ -0,0 +1,210 @@
1/*
2 * arch/x86_64/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PC's
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/module.h>
15#include <linux/io.h>
16
17#include <asm/pgalloc.h>
18#include <asm/fixmap.h>
19#include <asm/tlbflush.h>
20#include <asm/cacheflush.h>
21#include <asm/proto.h>
22
23unsigned long __phys_addr(unsigned long x)
24{
25 if (x >= __START_KERNEL_map)
26 return x - __START_KERNEL_map + phys_base;
27 return x - PAGE_OFFSET;
28}
29EXPORT_SYMBOL(__phys_addr);
30
31#define ISA_START_ADDRESS 0xa0000
32#define ISA_END_ADDRESS 0x100000
33
34/*
35 * Fix up the linear direct mapping of the kernel to avoid cache attribute
36 * conflicts.
37 */
38static int
39ioremap_change_attr(unsigned long phys_addr, unsigned long size,
40 unsigned long flags)
41{
42 int err = 0;
43 if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
44 unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
45 unsigned long vaddr = (unsigned long) __va(phys_addr);
46
47 /*
48 * Must use a address here and not struct page because the phys addr
49 * can be a in hole between nodes and not have an memmap entry.
50 */
51 err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
52 if (!err)
53 global_flush_tlb();
54 }
55 return err;
56}
57
58/*
59 * Generic mapping function
60 */
61
62/*
63 * Remap an arbitrary physical address space into the kernel virtual
64 * address space. Needed when the kernel wants to access high addresses
65 * directly.
66 *
67 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
68 * have to convert them into an offset in a page-aligned mapping, but the
69 * caller shouldn't need to know that small detail.
70 */
71void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
72{
73 void * addr;
74 struct vm_struct * area;
75 unsigned long offset, last_addr;
76 pgprot_t pgprot;
77
78 /* Don't allow wraparound or zero size */
79 last_addr = phys_addr + size - 1;
80 if (!size || last_addr < phys_addr)
81 return NULL;
82
83 /*
84 * Don't remap the low PCI/ISA area, it's always mapped..
85 */
86 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
87 return (__force void __iomem *)phys_to_virt(phys_addr);
88
89#ifdef CONFIG_FLATMEM
90 /*
91 * Don't allow anybody to remap normal RAM that we're using..
92 */
93 if (last_addr < virt_to_phys(high_memory)) {
94 char *t_addr, *t_end;
95 struct page *page;
96
97 t_addr = __va(phys_addr);
98 t_end = t_addr + (size - 1);
99
100 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
101 if(!PageReserved(page))
102 return NULL;
103 }
104#endif
105
106 pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL
107 | _PAGE_DIRTY | _PAGE_ACCESSED | flags);
108 /*
109 * Mappings have to be page-aligned
110 */
111 offset = phys_addr & ~PAGE_MASK;
112 phys_addr &= PAGE_MASK;
113 size = PAGE_ALIGN(last_addr+1) - phys_addr;
114
115 /*
116 * Ok, go for it..
117 */
118 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
119 if (!area)
120 return NULL;
121 area->phys_addr = phys_addr;
122 addr = area->addr;
123 if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
124 phys_addr, pgprot)) {
125 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
126 return NULL;
127 }
128 if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) {
129 area->flags &= 0xffffff;
130 vunmap(addr);
131 return NULL;
132 }
133 return (__force void __iomem *) (offset + (char *)addr);
134}
135EXPORT_SYMBOL(__ioremap);
136
137/**
138 * ioremap_nocache - map bus memory into CPU space
139 * @offset: bus address of the memory
140 * @size: size of the resource to map
141 *
142 * ioremap_nocache performs a platform specific sequence of operations to
143 * make bus memory CPU accessible via the readb/readw/readl/writeb/
144 * writew/writel functions and the other mmio helpers. The returned
145 * address is not guaranteed to be usable directly as a virtual
146 * address.
147 *
148 * This version of ioremap ensures that the memory is marked uncachable
149 * on the CPU as well as honouring existing caching rules from things like
150 * the PCI bus. Note that there are other caches and buffers on many
151 * busses. In particular driver authors should read up on PCI writes
152 *
153 * It's useful if some control registers are in such an area and
154 * write combining or read caching is not desirable:
155 *
156 * Must be freed with iounmap.
157 */
158
159void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
160{
161 return __ioremap(phys_addr, size, _PAGE_PCD);
162}
163EXPORT_SYMBOL(ioremap_nocache);
164
165/**
166 * iounmap - Free a IO remapping
167 * @addr: virtual address from ioremap_*
168 *
169 * Caller must ensure there is only one unmapping for the same pointer.
170 */
171void iounmap(volatile void __iomem *addr)
172{
173 struct vm_struct *p, *o;
174
175 if (addr <= high_memory)
176 return;
177 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
178 addr < phys_to_virt(ISA_END_ADDRESS))
179 return;
180
181 addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
182 /* Use the vm area unlocked, assuming the caller
183 ensures there isn't another iounmap for the same address
184 in parallel. Reuse of the virtual address is prevented by
185 leaving it in the global lists until we're done with it.
186 cpa takes care of the direct mappings. */
187 read_lock(&vmlist_lock);
188 for (p = vmlist; p; p = p->next) {
189 if (p->addr == addr)
190 break;
191 }
192 read_unlock(&vmlist_lock);
193
194 if (!p) {
195 printk("iounmap: bad address %p\n", addr);
196 dump_stack();
197 return;
198 }
199
200 /* Reset the direct mapping. Can block */
201 if (p->flags >> 20)
202 ioremap_change_attr(p->phys_addr, p->size, 0);
203
204 /* Finally remove it */
205 o = remove_vm_area((void *)addr);
206 BUG_ON(p != o || o == NULL);
207 kfree(p);
208}
209EXPORT_SYMBOL(iounmap);
210
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
new file mode 100644
index 000000000000..a96006f7ae0c
--- /dev/null
+++ b/arch/x86/mm/k8topology_64.c
@@ -0,0 +1,182 @@
1/*
2 * AMD K8 NUMA support.
3 * Discover the memory map and associated nodes.
4 *
5 * This version reads it directly from the K8 northbridge.
6 *
7 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
8 */
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/string.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14#include <asm/io.h>
15#include <linux/pci_ids.h>
16#include <asm/types.h>
17#include <asm/mmzone.h>
18#include <asm/proto.h>
19#include <asm/e820.h>
20#include <asm/pci-direct.h>
21#include <asm/numa.h>
22
23static __init int find_northbridge(void)
24{
25 int num;
26
27 for (num = 0; num < 32; num++) {
28 u32 header;
29
30 header = read_pci_config(0, num, 0, 0x00);
31 if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
32 continue;
33
34 header = read_pci_config(0, num, 1, 0x00);
35 if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
36 continue;
37 return num;
38 }
39
40 return -1;
41}
42
43int __init k8_scan_nodes(unsigned long start, unsigned long end)
44{
45 unsigned long prevbase;
46 struct bootnode nodes[8];
47 int nodeid, i, j, nb;
48 unsigned char nodeids[8];
49 int found = 0;
50 u32 reg;
51 unsigned numnodes;
52 unsigned num_cores;
53
54 if (!early_pci_allowed())
55 return -1;
56
57 nb = find_northbridge();
58 if (nb < 0)
59 return nb;
60
61 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
62
63 num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
64 printk(KERN_INFO "CPU has %d num_cores\n", num_cores);
65
66 reg = read_pci_config(0, nb, 0, 0x60);
67 numnodes = ((reg >> 4) & 0xF) + 1;
68 if (numnodes <= 1)
69 return -1;
70
71 printk(KERN_INFO "Number of nodes %d\n", numnodes);
72
73 memset(&nodes,0,sizeof(nodes));
74 prevbase = 0;
75 for (i = 0; i < 8; i++) {
76 unsigned long base,limit;
77 u32 nodeid;
78
79 base = read_pci_config(0, nb, 1, 0x40 + i*8);
80 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
81
82 nodeid = limit & 7;
83 nodeids[i] = nodeid;
84 if ((base & 3) == 0) {
85 if (i < numnodes)
86 printk("Skipping disabled node %d\n", i);
87 continue;
88 }
89 if (nodeid >= numnodes) {
90 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
91 base, limit);
92 continue;
93 }
94
95 if (!limit) {
96 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
97 base);
98 continue;
99 }
100 if ((base >> 8) & 3 || (limit >> 8) & 3) {
101 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
102 nodeid, (base>>8)&3, (limit>>8) & 3);
103 return -1;
104 }
105 if (node_isset(nodeid, node_possible_map)) {
106 printk(KERN_INFO "Node %d already present. Skipping\n",
107 nodeid);
108 continue;
109 }
110
111 limit >>= 16;
112 limit <<= 24;
113 limit |= (1<<24)-1;
114 limit++;
115
116 if (limit > end_pfn << PAGE_SHIFT)
117 limit = end_pfn << PAGE_SHIFT;
118 if (limit <= base)
119 continue;
120
121 base >>= 16;
122 base <<= 24;
123
124 if (base < start)
125 base = start;
126 if (limit > end)
127 limit = end;
128 if (limit == base) {
129 printk(KERN_ERR "Empty node %d\n", nodeid);
130 continue;
131 }
132 if (limit < base) {
133 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
134 nodeid, base, limit);
135 continue;
136 }
137
138 /* Could sort here, but pun for now. Should not happen anyroads. */
139 if (prevbase > base) {
140 printk(KERN_ERR "Node map not sorted %lx,%lx\n",
141 prevbase,base);
142 return -1;
143 }
144
145 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
146 nodeid, base, limit);
147
148 found++;
149
150 nodes[nodeid].start = base;
151 nodes[nodeid].end = limit;
152 e820_register_active_regions(nodeid,
153 nodes[nodeid].start >> PAGE_SHIFT,
154 nodes[nodeid].end >> PAGE_SHIFT);
155
156 prevbase = base;
157
158 node_set(nodeid, node_possible_map);
159 }
160
161 if (!found)
162 return -1;
163
164 memnode_shift = compute_hash_shift(nodes, 8);
165 if (memnode_shift < 0) {
166 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
167 return -1;
168 }
169 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
170
171 for (i = 0; i < 8; i++) {
172 if (nodes[i].start != nodes[i].end) {
173 nodeid = nodeids[i];
174 for (j = 0; j < num_cores; j++)
175 apicid_to_node[(nodeid * num_cores) + j] = i;
176 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
177 }
178 }
179
180 numa_init_array();
181 return 0;
182}
diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c
new file mode 100644
index 000000000000..80bba0dc000e
--- /dev/null
+++ b/arch/x86/mm/mmap_64.c
@@ -0,0 +1,29 @@
1/* Copyright 2005 Andi Kleen, SuSE Labs.
2 * Licensed under GPL, v.2
3 */
4#include <linux/mm.h>
5#include <linux/sched.h>
6#include <linux/random.h>
7#include <asm/ia32.h>
8
9/* Notebook: move the mmap code from sys_x86_64.c over here. */
10
11void arch_pick_mmap_layout(struct mm_struct *mm)
12{
13#ifdef CONFIG_IA32_EMULATION
14 if (current_thread_info()->flags & _TIF_IA32)
15 return ia32_pick_mmap_layout(mm);
16#endif
17 mm->mmap_base = TASK_UNMAPPED_BASE;
18 if (current->flags & PF_RANDOMIZE) {
19 /* Add 28bit randomness which is about 40bits of address space
20 because mmap base has to be page aligned.
21 or ~1/128 of the total user VM
22 (total user address space is 47bits) */
23 unsigned rnd = get_random_int() & 0xfffffff;
24 mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
25 }
26 mm->get_unmapped_area = arch_get_unmapped_area;
27 mm->unmap_area = arch_unmap_area;
28}
29
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
new file mode 100644
index 000000000000..6da235522269
--- /dev/null
+++ b/arch/x86/mm/numa_64.c
@@ -0,0 +1,648 @@
1/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
25struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
26bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28struct memnode memnode;
29
30unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
31 [0 ... NR_CPUS-1] = NUMA_NO_NODE
32};
33unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
35};
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
37
38int numa_off __initdata;
39unsigned long __initdata nodemap_addr;
40unsigned long __initdata nodemap_size;
41
42
43/*
44 * Given a shift value, try to populate memnodemap[]
45 * Returns :
46 * 1 if OK
47 * 0 if memnodmap[] too small (of shift too small)
48 * -1 if node overlap or lost ram (shift too big)
49 */
50static int __init
51populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
52{
53 int i;
54 int res = -1;
55 unsigned long addr, end;
56
57 memset(memnodemap, 0xff, memnodemapsize);
58 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start;
60 end = nodes[i].end;
61 if (addr >= end)
62 continue;
63 if ((end >> shift) >= memnodemapsize)
64 return 0;
65 do {
66 if (memnodemap[addr >> shift] != 0xff)
67 return -1;
68 memnodemap[addr >> shift] = i;
69 addr += (1UL << shift);
70 } while (addr < end);
71 res = 1;
72 }
73 return res;
74}
75
76static int __init allocate_cachealigned_memnodemap(void)
77{
78 unsigned long pad, pad_addr;
79
80 memnodemap = memnode.embedded_map;
81 if (memnodemapsize <= 48)
82 return 0;
83
84 pad = L1_CACHE_BYTES - 1;
85 pad_addr = 0x8000;
86 nodemap_size = pad + memnodemapsize;
87 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
88 nodemap_size);
89 if (nodemap_addr == -1UL) {
90 printk(KERN_ERR
91 "NUMA: Unable to allocate Memory to Node hash map\n");
92 nodemap_addr = nodemap_size = 0;
93 return -1;
94 }
95 pad_addr = (nodemap_addr + pad) & ~pad;
96 memnodemap = phys_to_virt(pad_addr);
97
98 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
99 nodemap_addr, nodemap_addr + nodemap_size);
100 return 0;
101}
102
103/*
104 * The LSB of all start and end addresses in the node map is the value of the
105 * maximum possible shift.
106 */
107static int __init
108extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
109{
110 int i, nodes_used = 0;
111 unsigned long start, end;
112 unsigned long bitfield = 0, memtop = 0;
113
114 for (i = 0; i < numnodes; i++) {
115 start = nodes[i].start;
116 end = nodes[i].end;
117 if (start >= end)
118 continue;
119 bitfield |= start;
120 nodes_used++;
121 if (end > memtop)
122 memtop = end;
123 }
124 if (nodes_used <= 1)
125 i = 63;
126 else
127 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
128 memnodemapsize = (memtop >> i)+1;
129 return i;
130}
131
132int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
133{
134 int shift;
135
136 shift = extract_lsb_from_nodes(nodes, numnodes);
137 if (allocate_cachealigned_memnodemap())
138 return -1;
139 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
140 shift);
141
142 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
143 printk(KERN_INFO
144 "Your memory is not aligned you need to rebuild your kernel "
145 "with a bigger NODEMAPSIZE shift=%d\n",
146 shift);
147 return -1;
148 }
149 return shift;
150}
151
152#ifdef CONFIG_SPARSEMEM
153int early_pfn_to_nid(unsigned long pfn)
154{
155 return phys_to_nid(pfn << PAGE_SHIFT);
156}
157#endif
158
159static void * __init
160early_node_mem(int nodeid, unsigned long start, unsigned long end,
161 unsigned long size)
162{
163 unsigned long mem = find_e820_area(start, end, size);
164 void *ptr;
165 if (mem != -1L)
166 return __va(mem);
167 ptr = __alloc_bootmem_nopanic(size,
168 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
169 if (ptr == 0) {
170 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
171 size, nodeid);
172 return NULL;
173 }
174 return ptr;
175}
176
177/* Initialize bootmem allocator for a node */
178void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
179{
180 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
181 unsigned long nodedata_phys;
182 void *bootmap;
183 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
184
185 start = round_up(start, ZONE_ALIGN);
186
187 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
188
189 start_pfn = start >> PAGE_SHIFT;
190 end_pfn = end >> PAGE_SHIFT;
191
192 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
193 if (node_data[nodeid] == NULL)
194 return;
195 nodedata_phys = __pa(node_data[nodeid]);
196
197 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
198 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
199 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
200 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
201
202 /* Find a place for the bootmem map */
203 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
204 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
205 bootmap = early_node_mem(nodeid, bootmap_start, end,
206 bootmap_pages<<PAGE_SHIFT);
207 if (bootmap == NULL) {
208 if (nodedata_phys < start || nodedata_phys >= end)
209 free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
210 node_data[nodeid] = NULL;
211 return;
212 }
213 bootmap_start = __pa(bootmap);
214 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
215
216 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
217 bootmap_start >> PAGE_SHIFT,
218 start_pfn, end_pfn);
219
220 free_bootmem_with_active_regions(nodeid, end);
221
222 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
223 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
224#ifdef CONFIG_ACPI_NUMA
225 srat_reserve_add_area(nodeid);
226#endif
227 node_set_online(nodeid);
228}
229
230/* Initialize final allocator for a zone */
231void __init setup_node_zones(int nodeid)
232{
233 unsigned long start_pfn, end_pfn, memmapsize, limit;
234
235 start_pfn = node_start_pfn(nodeid);
236 end_pfn = node_end_pfn(nodeid);
237
238 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
239 nodeid, start_pfn, end_pfn);
240
241 /* Try to allocate mem_map at end to not fill up precious <4GB
242 memory. */
243 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
244 limit = end_pfn << PAGE_SHIFT;
245#ifdef CONFIG_FLAT_NODE_MEM_MAP
246 NODE_DATA(nodeid)->node_mem_map =
247 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
248 memmapsize, SMP_CACHE_BYTES,
249 round_down(limit - memmapsize, PAGE_SIZE),
250 limit);
251#endif
252}
253
254void __init numa_init_array(void)
255{
256 int rr, i;
257 /* There are unfortunately some poorly designed mainboards around
258 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
259 mapping. To avoid this fill in the mapping for all possible
260 CPUs, as the number of CPUs is not known yet.
261 We round robin the existing nodes. */
262 rr = first_node(node_online_map);
263 for (i = 0; i < NR_CPUS; i++) {
264 if (cpu_to_node[i] != NUMA_NO_NODE)
265 continue;
266 numa_set_node(i, rr);
267 rr = next_node(rr, node_online_map);
268 if (rr == MAX_NUMNODES)
269 rr = first_node(node_online_map);
270 }
271
272}
273
274#ifdef CONFIG_NUMA_EMU
275/* Numa emulation */
276char *cmdline __initdata;
277
278/*
279 * Setups up nid to range from addr to addr + size. If the end boundary is
280 * greater than max_addr, then max_addr is used instead. The return value is 0
281 * if there is additional memory left for allocation past addr and -1 otherwise.
282 * addr is adjusted to be at the end of the node.
283 */
284static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
285 u64 size, u64 max_addr)
286{
287 int ret = 0;
288 nodes[nid].start = *addr;
289 *addr += size;
290 if (*addr >= max_addr) {
291 *addr = max_addr;
292 ret = -1;
293 }
294 nodes[nid].end = *addr;
295 node_set(nid, node_possible_map);
296 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
297 nodes[nid].start, nodes[nid].end,
298 (nodes[nid].end - nodes[nid].start) >> 20);
299 return ret;
300}
301
302/*
303 * Splits num_nodes nodes up equally starting at node_start. The return value
304 * is the number of nodes split up and addr is adjusted to be at the end of the
305 * last node allocated.
306 */
307static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
308 u64 max_addr, int node_start,
309 int num_nodes)
310{
311 unsigned int big;
312 u64 size;
313 int i;
314
315 if (num_nodes <= 0)
316 return -1;
317 if (num_nodes > MAX_NUMNODES)
318 num_nodes = MAX_NUMNODES;
319 size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
320 num_nodes;
321 /*
322 * Calculate the number of big nodes that can be allocated as a result
323 * of consolidating the leftovers.
324 */
325 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
326 FAKE_NODE_MIN_SIZE;
327
328 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
329 size &= FAKE_NODE_MIN_HASH_MASK;
330 if (!size) {
331 printk(KERN_ERR "Not enough memory for each node. "
332 "NUMA emulation disabled.\n");
333 return -1;
334 }
335
336 for (i = node_start; i < num_nodes + node_start; i++) {
337 u64 end = *addr + size;
338 if (i < big)
339 end += FAKE_NODE_MIN_SIZE;
340 /*
341 * The final node can have the remaining system RAM. Other
342 * nodes receive roughly the same amount of available pages.
343 */
344 if (i == num_nodes + node_start - 1)
345 end = max_addr;
346 else
347 while (end - *addr - e820_hole_size(*addr, end) <
348 size) {
349 end += FAKE_NODE_MIN_SIZE;
350 if (end > max_addr) {
351 end = max_addr;
352 break;
353 }
354 }
355 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
356 break;
357 }
358 return i - node_start + 1;
359}
360
361/*
362 * Splits the remaining system RAM into chunks of size. The remaining memory is
363 * always assigned to a final node and can be asymmetric. Returns the number of
364 * nodes split.
365 */
366static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
367 u64 max_addr, int node_start, u64 size)
368{
369 int i = node_start;
370 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
371 while (!setup_node_range(i++, nodes, addr, size, max_addr))
372 ;
373 return i - node_start;
374}
375
376/*
377 * Sets up the system RAM area from start_pfn to end_pfn according to the
378 * numa=fake command-line option.
379 */
380static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
381{
/*
 * Builds the fake node table in nodes[] from the numa=fake string held in
 * cmdline, then registers every node set in node_possible_map.
 * Returns 0 on success and a negative value when emulation was not set up.
 */
382 struct bootnode nodes[MAX_NUMNODES];
383 u64 addr = start_pfn << PAGE_SHIFT;
384 u64 max_addr = end_pfn << PAGE_SHIFT;
385 int num_nodes = 0;
386 int coeff_flag;
387 int coeff = -1;
388 int num = 0;
389 u64 size;
390 int i;
391
392 memset(&nodes, 0, sizeof(nodes));
393 /*
394 * If the numa=fake command-line is just a single number N, split the
395 * system RAM into N fake nodes.
396 */
397 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
398 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
399 simple_strtol(cmdline, NULL, 0));
400 if (num_nodes < 0)
401 return num_nodes;
402 goto out;
403 }
404
405 /* Parse the command line. */
/*
 * num accumulates the decimal number currently being read.  A '*' turns a
 * positive num into the repeat count coeff; a ',' or the end of the string
 * emits coeff nodes of num megabytes each (one node when no '*' was seen).
 * NOTE(review): num_nodes is used as an index into nodes[] without a visible
 * check against MAX_NUMNODES - confirm the bound is enforced elsewhere.
 */
406 for (coeff_flag = 0; ; cmdline++) {
407 if (*cmdline && isdigit(*cmdline)) {
408 num = num * 10 + *cmdline - '0';
409 continue;
410 }
411 if (*cmdline == '*') {
412 if (num > 0)
413 coeff = num;
414 coeff_flag = 1;
415 }
416 if (!*cmdline || *cmdline == ',') {
417 if (!coeff_flag)
418 coeff = 1;
419 /*
420 * Round down to the nearest FAKE_NODE_MIN_SIZE.
421 * Command-line coefficients are in megabytes.
422 */
423 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
/*
 * NOTE(review): setup_node_range() returns -1 after recording the node that
 * reached max_addr, and the goto below skips the num_nodes++ of the for
 * statement, so that node does not appear to be counted - confirm intended.
 */
424 if (size)
425 for (i = 0; i < coeff; i++, num_nodes++)
426 if (setup_node_range(num_nodes, nodes,
427 &addr, size, max_addr) < 0)
428 goto done;
429 if (!*cmdline)
430 break;
431 coeff_flag = 0;
432 coeff = -1;
433 }
434 num = 0;
435 }
436done:
437 if (!num_nodes)
438 return -1;
439 /* Fill remainder of system RAM, if appropriate. */
440 if (addr < max_addr) {
/* A '*' with no positive count in front of it: the last segment was "*<size>". */
441 if (coeff_flag && coeff < 0) {
442 /* Split remaining nodes into num-sized chunks */
443 num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
444 num_nodes, num);
445 goto out;
446 }
/* Decide based on the character just before the current parse position. */
447 switch (*(cmdline - 1)) {
448 case '*':
449 /* Split remaining nodes into coeff chunks */
450 if (coeff <= 0)
451 break;
452 num_nodes += split_nodes_equally(nodes, &addr, max_addr,
453 num_nodes, coeff);
454 break;
455 case ',':
456 /* Do not allocate remaining system RAM */
457 break;
458 default:
459 /* Give one final node */
460 setup_node_range(num_nodes, nodes, &addr,
461 max_addr - addr, max_addr);
462 num_nodes++;
463 }
464 }
465out:
466 memnode_shift = compute_hash_shift(nodes, num_nodes);
467 if (memnode_shift < 0) {
468 memnode_shift = 0;
469 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
470 "disabled.\n");
471 return -1;
472 }
473
474 /*
475 * We need to vacate all active ranges that may have been registered by
476 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
477 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
478 */
479 remove_all_active_ranges();
480#ifdef CONFIG_ACPI_NUMA
481 acpi_numa = -1;
482#endif
/* Register the active regions and bootmem for every fake node recorded above. */
483 for_each_node_mask(i, node_possible_map) {
484 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
485 nodes[i].end >> PAGE_SHIFT);
486 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
487 }
488 acpi_fake_nodes(nodes, num_nodes);
489 numa_init_array();
490 return 0;
491}
492#endif /* CONFIG_NUMA_EMU */
493
/*
 * Sets up the NUMA node layout for the pfn range start_pfn..end_pfn.
 * The configured sources are tried in order (emulation, ACPI, K8); the first
 * one that returns 0 wins.  If none succeeds a single node 0 covering the
 * whole range is set up.
 */
494void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
495{
496 int i;
497
498 nodes_clear(node_possible_map);
499
500#ifdef CONFIG_NUMA_EMU
/* cmdline is only non-NULL when numa=fake= was given. */
501 if (cmdline && !numa_emulation(start_pfn, end_pfn))
502 return;
/* Drop any node bits a failed attempt may have left behind. */
503 nodes_clear(node_possible_map);
504#endif
505
506#ifdef CONFIG_ACPI_NUMA
507 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
508 end_pfn << PAGE_SHIFT))
509 return;
510 nodes_clear(node_possible_map);
511#endif
512
513#ifdef CONFIG_K8_NUMA
514 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
515 return;
516 nodes_clear(node_possible_map);
517#endif
518 printk(KERN_INFO "%s\n",
519 numa_off ? "NUMA turned off" : "No NUMA configuration found");
520
521 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
522 start_pfn << PAGE_SHIFT,
523 end_pfn << PAGE_SHIFT);
524 /* setup dummy node covering all memory */
525 memnode_shift = 63;
526 memnodemap = memnode.embedded_map;
527 memnodemap[0] = 0;
528 nodes_clear(node_online_map);
529 node_set_online(0);
530 node_set(0, node_possible_map);
/* Every CPU slot is assigned to the single node 0. */
531 for (i = 0; i < NR_CPUS; i++)
532 numa_set_node(i, 0);
533 node_to_cpumask[0] = cpumask_of_cpu(0);
534 e820_register_active_regions(0, start_pfn, end_pfn);
535 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
536}
537
538__cpuinit void numa_add_cpu(int cpu)
539{
540 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
541}
542
543void __cpuinit numa_set_node(int cpu, int node)
544{
545 cpu_pda(cpu)->nodenumber = node;
546 cpu_to_node[cpu] = node;
547}
548
549unsigned long __init numa_free_all_bootmem(void)
550{
551 int i;
552 unsigned long pages = 0;
553 for_each_online_node(i) {
554 pages += free_all_bootmem_node(NODE_DATA(i));
555 }
556 return pages;
557}
558
559void __init paging_init(void)
560{
561 int i;
562 unsigned long max_zone_pfns[MAX_NR_ZONES];
563 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
564 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
565 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
566 max_zone_pfns[ZONE_NORMAL] = end_pfn;
567
568 sparse_memory_present_with_active_regions(MAX_NUMNODES);
569 sparse_init();
570
571 for_each_online_node(i) {
572 setup_node_zones(i);
573 }
574
575 free_area_init_nodes(max_zone_pfns);
576}
577
578static __init int numa_setup(char *opt)
579{
580 if (!opt)
581 return -EINVAL;
582 if (!strncmp(opt,"off",3))
583 numa_off = 1;
584#ifdef CONFIG_NUMA_EMU
585 if (!strncmp(opt, "fake=", 5))
586 cmdline = opt + 5;
587#endif
588#ifdef CONFIG_ACPI_NUMA
589 if (!strncmp(opt,"noacpi",6))
590 acpi_numa = -1;
591 if (!strncmp(opt,"hotadd=", 7))
592 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
593#endif
594 return 0;
595}
596
597early_param("numa", numa_setup);
598
599/*
600 * Setup early cpu_to_node.
601 *
602 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
603 * and apicid_to_node[] tables have valid entries for a CPU.
604 * This means we skip cpu_to_node[] initialisation for NUMA
605 * emulation and faking node case (when running a kernel compiled
606 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
607 * is already initialized in a round robin manner at numa_init_array,
608 * prior to this call, and this initialization is good enough
609 * for the fake NUMA cases.
610 */
611void __init init_cpu_to_node(void)
612{
613 int i;
614 for (i = 0; i < NR_CPUS; i++) {
615 u8 apicid = x86_cpu_to_apicid[i];
616 if (apicid == BAD_APICID)
617 continue;
618 if (apicid_to_node[apicid] == NUMA_NO_NODE)
619 continue;
620 numa_set_node(i,apicid_to_node[apicid]);
621 }
622}
623
624EXPORT_SYMBOL(cpu_to_node);
625EXPORT_SYMBOL(node_to_cpumask);
626EXPORT_SYMBOL(memnode);
627EXPORT_SYMBOL(node_data);
628
629#ifdef CONFIG_DISCONTIGMEM
630/*
631 * Functions to convert PFNs from/to per node page addresses.
632 * These are out of line because they are quite big.
633 * They could be all tuned by pre caching more state.
634 * Should do that.
635 */
636
637int pfn_valid(unsigned long pfn)
638{
639 unsigned nid;
640 if (pfn >= num_physpages)
641 return 0;
642 nid = pfn_to_nid(pfn);
643 if (nid == 0xff)
644 return 0;
645 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
646}
647EXPORT_SYMBOL(pfn_valid);
648#endif
diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c
new file mode 100644
index 000000000000..10b9809ce821
--- /dev/null
+++ b/arch/x86/mm/pageattr_64.c
@@ -0,0 +1,249 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/mm.h>
7#include <linux/sched.h>
8#include <linux/highmem.h>
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <asm/uaccess.h>
12#include <asm/processor.h>
13#include <asm/tlbflush.h>
14#include <asm/io.h>
15
16pte_t *lookup_address(unsigned long address)
17{
18 pgd_t *pgd = pgd_offset_k(address);
19 pud_t *pud;
20 pmd_t *pmd;
21 pte_t *pte;
22 if (pgd_none(*pgd))
23 return NULL;
24 pud = pud_offset(pgd, address);
25 if (!pud_present(*pud))
26 return NULL;
27 pmd = pmd_offset(pud, address);
28 if (!pmd_present(*pmd))
29 return NULL;
30 if (pmd_large(*pmd))
31 return (pte_t *)pmd;
32 pte = pte_offset_kernel(pmd, address);
33 if (pte && !pte_present(*pte))
34 pte = NULL;
35 return pte;
36}
37
38static struct page *split_large_page(unsigned long address, pgprot_t prot,
39 pgprot_t ref_prot)
40{
41 int i;
42 unsigned long addr;
43 struct page *base = alloc_pages(GFP_KERNEL, 0);
44 pte_t *pbase;
45 if (!base)
46 return NULL;
47 /*
48 * page_private is used to track the number of entries in
49 * the page table page have non standard attributes.
50 */
51 SetPagePrivate(base);
52 page_private(base) = 0;
53
54 address = __pa(address);
55 addr = address & LARGE_PAGE_MASK;
56 pbase = (pte_t *)page_address(base);
57 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
58 pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
59 addr == address ? prot : ref_prot);
60 }
61 return base;
62}
63
64static void cache_flush_page(void *adr)
65{
66 int i;
67 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
68 asm volatile("clflush (%0)" :: "r" (adr + i));
69}
70
/*
 * Per-CPU flush callback: writes back/invalidates the caches and then
 * flushes the whole TLB.  arg is a list of pages linked through page->lru.
 */
71static void flush_kernel_map(void *arg)
72{
73 struct list_head *l = (struct list_head *)arg;
74 struct page *pg;
75
76 /* When clflush is available always use it because it is
77 much cheaper than WBINVD. */
78 /* clflush is still broken. Disable for now. */
/* The constant 1 forces the wbinvd branch; the clflush loop is currently dead code. */
79 if (1 || !cpu_has_clflush)
80 asm volatile("wbinvd" ::: "memory");
81 else list_for_each_entry(pg, l, lru) {
82 void *adr = page_address(pg);
83 cache_flush_page(adr);
84 }
85 __flush_tlb_all();
86}
87
88static inline void flush_map(struct list_head *l)
89{
90 on_each_cpu(flush_kernel_map, l, 1, 1);
91}
92
93static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
94
95static inline void save_page(struct page *fpage)
96{
97 if (!test_and_set_bit(PG_arch_1, &fpage->flags))
98 list_add(&fpage->lru, &deferred_pages);
99}
100
101/*
102 * No more special protections in this 2/4MB area - revert to a
103 * large page again.
104 */
105static void revert_page(unsigned long address, pgprot_t ref_prot)
106{
107 pgd_t *pgd;
108 pud_t *pud;
109 pmd_t *pmd;
110 pte_t large_pte;
111 unsigned long pfn;
112
113 pgd = pgd_offset_k(address);
114 BUG_ON(pgd_none(*pgd));
115 pud = pud_offset(pgd,address);
116 BUG_ON(pud_none(*pud));
117 pmd = pmd_offset(pud, address);
118 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
119 pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
120 large_pte = pfn_pte(pfn, ref_prot);
121 large_pte = pte_mkhuge(large_pte);
122 set_pte((pte_t *)pmd, large_pte);
123}
124
/*
 * Changes the attributes of the single mapping of pfn at address to prot.
 * ref_prot is the default protection: page_private() of the page-table page
 * counts how many of its entries currently differ from it.
 * Returns 0 on success (also when no mapping exists) or -ENOMEM when a large
 * page has to be split and the allocation fails.
 */
125static int
126__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
127 pgprot_t ref_prot)
128{
129 pte_t *kpte;
130 struct page *kpte_page;
131 pgprot_t ref_prot2;
132
133 kpte = lookup_address(address);
134 if (!kpte) return 0;
/* kpte_page is the page that holds the entry kpte points at. */
135 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
136 BUG_ON(PageLRU(kpte_page));
137 BUG_ON(PageCompound(kpte_page));
/* Non-default protection requested: install it and count the entry. */
138 if (pgprot_val(prot) != pgprot_val(ref_prot)) {
139 if (!pte_huge(*kpte)) {
140 set_pte(kpte, pfn_pte(pfn, prot));
141 } else {
142 /*
143 * split_large_page will take the reference for this
144 * change_page_attr on the split page.
145 */
146 struct page *split;
147 ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
148 split = split_large_page(address, prot, ref_prot2);
149 if (!split)
150 return -ENOMEM;
151 set_pte(kpte, mk_pte(split, ref_prot2));
152 kpte_page = split;
153 }
154 page_private(kpte_page)++;
/* Default protection requested on a small page: restore it and drop the count. */
155 } else if (!pte_huge(*kpte)) {
156 set_pte(kpte, pfn_pte(pfn, ref_prot));
157 BUG_ON(page_private(kpte_page) == 0);
158 page_private(kpte_page)--;
159 } else
160 BUG();
161
162 /* on x86-64 the direct mapping set at boot is not using 4k pages */
163 BUG_ON(PageReserved(kpte_page));
164
/* Queue the page-table page on the deferred list; no entry left with special attributes means the large page can come back. */
165 save_page(kpte_page);
166 if (page_private(kpte_page) == 0)
167 revert_page(address, ref_prot);
168 return 0;
169}
170
171/*
172 * Change the page attributes of an page in the linear mapping.
173 *
174 * This should be used when a page is mapped with a different caching policy
175 * than write-back somewhere - some CPUs do not like it when mappings with
176 * different caching policies exist. This changes the page attributes of the
177 * in kernel linear mapping too.
178 *
179 * The caller needs to ensure that there are no conflicting mappings elsewhere.
180 * This function only deals with the kernel linear map.
181 *
182 * Caller must call global_flush_tlb() after this.
183 */
184int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
185{
186 int err = 0, kernel_map = 0;
187 int i;
188
189 if (address >= __START_KERNEL_map
190 && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
191 address = (unsigned long)__va(__pa(address));
192 kernel_map = 1;
193 }
194
195 down_write(&init_mm.mmap_sem);
196 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
197 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
198
199 if (!kernel_map || pte_present(pfn_pte(0, prot))) {
200 err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
201 if (err)
202 break;
203 }
204 /* Handle kernel mapping too which aliases part of the
205 * lowmem */
206 if (__pa(address) < KERNEL_TEXT_SIZE) {
207 unsigned long addr2;
208 pgprot_t prot2;
209 addr2 = __START_KERNEL_map + __pa(address);
210 /* Make sure the kernel mappings stay executable */
211 prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
212 err = __change_page_attr(addr2, pfn, prot2,
213 PAGE_KERNEL_EXEC);
214 }
215 }
216 up_write(&init_mm.mmap_sem);
217 return err;
218}
219
220/* Don't call this for MMIO areas that may not have a mem_map entry */
221int change_page_attr(struct page *page, int numpages, pgprot_t prot)
222{
223 unsigned long addr = (unsigned long)page_address(page);
224 return change_page_attr_addr(addr, numpages, prot);
225}
226
227void global_flush_tlb(void)
228{
229 struct page *pg, *next;
230 struct list_head l;
231
232 down_read(&init_mm.mmap_sem);
233 list_replace_init(&deferred_pages, &l);
234 up_read(&init_mm.mmap_sem);
235
236 flush_map(&l);
237
238 list_for_each_entry_safe(pg, next, &l, lru) {
239 list_del(&pg->lru);
240 clear_bit(PG_arch_1, &pg->flags);
241 if (page_private(pg) != 0)
242 continue;
243 ClearPagePrivate(pg);
244 __free_page(pg);
245 }
246}
247
248EXPORT_SYMBOL(change_page_attr);
249EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
new file mode 100644
index 000000000000..acdf03e19146
--- /dev/null
+++ b/arch/x86/mm/srat_64.c
@@ -0,0 +1,566 @@
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/mm.h>
20#include <asm/proto.h>
21#include <asm/numa.h>
22#include <asm/e820.h>
23
24int acpi_numa __initdata;
25
26static struct acpi_table_slit *acpi_slit;
27
28static nodemask_t nodes_parsed __initdata;
29static struct bootnode nodes[MAX_NUMNODES] __initdata;
30static struct bootnode nodes_add[MAX_NUMNODES];
31static int found_add_area __initdata;
32int hotadd_percent __initdata = 0;
33
34/* Too small nodes confuse the VM badly. Usually they result
35 from BIOS bugs. */
36#define NODE_MIN_SIZE (4*1024*1024)
37
38static __init int setup_node(int pxm)
39{
40 return acpi_map_pxm_to_node(pxm);
41}
42
43static __init int conflicting_nodes(unsigned long start, unsigned long end)
44{
45 int i;
46 for_each_node_mask(i, nodes_parsed) {
47 struct bootnode *nd = &nodes[i];
48 if (nd->start == nd->end)
49 continue;
50 if (nd->end > start && nd->start < end)
51 return i;
52 if (nd->end == end && nd->start == start)
53 return i;
54 }
55 return -1;
56}
57
58static __init void cutoff_node(int i, unsigned long start, unsigned long end)
59{
60 struct bootnode *nd = &nodes[i];
61
62 if (found_add_area)
63 return;
64
65 if (nd->start < start) {
66 nd->start = start;
67 if (nd->end < nd->start)
68 nd->start = nd->end;
69 }
70 if (nd->end > end) {
71 nd->end = end;
72 if (nd->start > nd->end)
73 nd->start = nd->end;
74 }
75}
76
77static __init void bad_srat(void)
78{
79 int i;
80 printk(KERN_ERR "SRAT: SRAT not used.\n");
81 acpi_numa = -1;
82 found_add_area = 0;
83 for (i = 0; i < MAX_LOCAL_APIC; i++)
84 apicid_to_node[i] = NUMA_NO_NODE;
85 for (i = 0; i < MAX_NUMNODES; i++)
86 nodes_add[i].start = nodes[i].end = 0;
87 remove_all_active_ranges();
88}
89
90static __init inline int srat_disabled(void)
91{
92 return numa_off || acpi_numa < 0;
93}
94
95/*
96 * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
97 * up the NUMA heuristics which wants the local node to have a smaller
98 * distance than the others.
99 * Do some quick checks here and only use the SLIT if it passes.
100 */
101static __init int slit_valid(struct acpi_table_slit *slit)
102{
103 int i, j;
104 int d = slit->locality_count;
105 for (i = 0; i < d; i++) {
106 for (j = 0; j < d; j++) {
107 u8 val = slit->entry[d*i + j];
108 if (i == j) {
109 if (val != LOCAL_DISTANCE)
110 return 0;
111 } else if (val <= LOCAL_DISTANCE)
112 return 0;
113 }
114 }
115 return 1;
116}
117
118/* Callback for SLIT parsing */
119void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
120{
121 if (!slit_valid(slit)) {
122 printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
123 return;
124 }
125 acpi_slit = slit;
126}
127
128/* Callback for Proximity Domain -> LAPIC mapping */
129void __init
130acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
131{
132 int pxm, node;
133 if (srat_disabled())
134 return;
135 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
136 bad_srat();
137 return;
138 }
139 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
140 return;
141 pxm = pa->proximity_domain_lo;
142 node = setup_node(pxm);
143 if (node < 0) {
144 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
145 bad_srat();
146 return;
147 }
148 apicid_to_node[pa->apic_id] = node;
149 acpi_numa = 1;
150 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
151 pxm, pa->apic_id, node);
152}
153
154#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
155/*
156 * Protect against too large hotadd areas that would fill up memory.
157 */
/*
 * Returns 1 when the struct page array needed for the area described by nd
 * fits within the hotadd_percent budget, 0 otherwise.  May shrink nd->end
 * so that the area fits into what is left of the budget.
 */
158static int hotadd_enough_memory(struct bootnode *nd)
159{
/* Both statics persist across calls: bytes budgeted so far, and where the last e820 search ended. */
160 static unsigned long allocated;
161 static unsigned long last_area_end;
162 unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
/* mem is the size in bytes of the struct page entries for the area. */
163 long mem = pages * sizeof(struct page);
164 unsigned long addr;
165 unsigned long allowed;
166 unsigned long oldpages = pages;
167
168 if (mem < 0)
169 return 0;
/* Budget: hotadd_percent percent of the RAM present below end_pfn. */
170 allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
171 allowed = (allowed / 100) * hotadd_percent;
172 if (allocated + mem > allowed) {
173 unsigned long range;
174 /* Give them at least part of their hotadd memory upto hotadd_percent
175 It would be better to spread the limit out
176 over multiple hotplug areas, but that is too complicated
177 right now */
178 if (allocated >= allowed)
179 return 0;
180 range = allowed - allocated;
181 pages = (range / PAGE_SIZE);
182 mem = pages * sizeof(struct page);
183 nd->end = nd->start + range;
184 }
185 /* Not completely fool proof, but a good sanity check */
186 addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
187 if (addr == -1UL)
188 return 0;
189 if (pages != oldpages)
190 printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
191 pages << PAGE_SHIFT);
192 last_area_end = addr + mem;
193 allocated += mem;
194 return 1;
195}
196
197static int update_end_of_memory(unsigned long end)
198{
199 found_add_area = 1;
200 if ((end >> PAGE_SHIFT) > end_pfn)
201 end_pfn = end >> PAGE_SHIFT;
202 return 1;
203}
204
205static inline int save_add_info(void)
206{
207 return hotadd_percent > 0;
208}
209#else
/* Stubs used when CONFIG_MEMORY_HOTPLUG_RESERVE is not set. */
int update_end_of_memory(unsigned long end)
{
	return -1;
}

static int hotadd_enough_memory(struct bootnode *nd)
{
	return 1;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void)
{
	return 1;
}
#else
static inline int save_add_info(void)
{
	return 0;
}
#endif
217#endif
218/*
219 * Update nodes_add and decide whether to include the add area in the zone.
220 * Both SPARSE and RESERVE need nodes_add information.
221 * This code supports one contiguous hot add area per node.
 *
 * Returns -1 when the area is rejected, otherwise the value of
 * update_end_of_memory().
222 */
223static int reserve_hotadd(int node, unsigned long start, unsigned long end)
224{
225 unsigned long s_pfn = start >> PAGE_SHIFT;
226 unsigned long e_pfn = end >> PAGE_SHIFT;
227 int ret = 0, changed = 0;
228 struct bootnode *nd = &nodes_add[node];
229
230 /* I had some trouble with strange memory hotadd regions breaking
231 the boot. Be very strict here and reject anything unexpected.
232 If you want working memory hotadd write correct SRATs.
233
234 The node size check is a basic sanity check to guard against
235 mistakes */
236 if ((signed long)(end - start) < NODE_MIN_SIZE) {
237 printk(KERN_ERR "SRAT: Hotplug area too small\n");
238 return -1;
239 }
240
241 /* This check might be a bit too strict, but I'm keeping it for now. */
242 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
243 printk(KERN_ERR
244 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
245 s_pfn, e_pfn);
246 return -1;
247 }
248
/*
 * NOTE(review): the check below is given the area already recorded in
 * nodes_add[node], not the start/end range being added here - confirm that
 * this is intended.
 */
249 if (!hotadd_enough_memory(&nodes_add[node])) {
250 printk(KERN_ERR "SRAT: Hotplug area too large\n");
251 return -1;
252 }
253
254 /* Looks good */
255
/* First area for this node: take it as is.  Otherwise only extend an area that is directly adjacent. */
256 if (nd->start == nd->end) {
257 nd->start = start;
258 nd->end = end;
259 changed = 1;
260 } else {
261 if (nd->start == end) {
262 nd->start = start;
263 changed = 1;
264 }
265 if (nd->end == start) {
266 nd->end = end;
267 changed = 1;
268 }
269 if (!changed)
270 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
271 }
272
273 ret = update_end_of_memory(nd->end);
274
275 if (changed)
276 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
277 return ret;
278}
279
280/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
/*
 * Records the memory range described by SRAT entry ma in nodes[] for the
 * node of its proximity domain and registers it as an active region.
 * A malformed entry, an unmappable proximity domain or an overlap with a
 * different node discards the whole SRAT via bad_srat().
 */
281void __init
282acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
283{
284 struct bootnode *nd, oldnode;
285 unsigned long start, end;
286 int node, pxm;
287 int i;
288
289 if (srat_disabled())
290 return;
291 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
292 bad_srat();
293 return;
294 }
295 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
296 return;
297
/* Hot-pluggable ranges are skipped entirely unless save_add_info() asks for them. */
298 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
299 return;
300 start = ma->base_address;
301 end = start + ma->length;
302 pxm = ma->proximity_domain;
303 node = setup_node(pxm);
304 if (node < 0) {
305 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
306 bad_srat();
307 return;
308 }
/* An overlap with the same node only warns; an overlap with another node is fatal for the SRAT. */
309 i = conflicting_nodes(start, end);
310 if (i == node) {
311 printk(KERN_WARNING
312 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
313 pxm, start, end, nodes[i].start, nodes[i].end);
314 } else if (i >= 0) {
315 printk(KERN_ERR
316 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
317 pxm, start, end, node_to_pxm(i),
318 nodes[i].start, nodes[i].end);
319 bad_srat();
320 return;
321 }
322 nd = &nodes[node];
/* Keep a copy so a rejected hot-add region can be undone below. */
323 oldnode = *nd;
/* First range seen for this node: take it.  Later ranges only widen the node. */
324 if (!node_test_and_set(node, nodes_parsed)) {
325 nd->start = start;
326 nd->end = end;
327 } else {
328 if (start < nd->start)
329 nd->start = start;
330 if (nd->end < end)
331 nd->end = end;
332 }
333
334 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
335 nd->start, nd->end);
336 e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
337 nd->end >> PAGE_SHIFT);
338 push_node_boundaries(node, nd->start >> PAGE_SHIFT,
339 nd->end >> PAGE_SHIFT);
340
/*
 * NOTE(review): on failure only *nd and nodes_parsed are restored here; the
 * two registrations made just above are not visibly undone - confirm.
 */
341 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
342 (reserve_hotadd(node, start, end) < 0)) {
343 /* Ignore hotadd region. Undo damage */
344 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
345 *nd = oldnode;
346 if ((nd->start | nd->end) == 0)
347 node_clear(node, nodes_parsed);
348 }
349}
350
351/* Sanity check to catch more bad SRATs (they are amazingly common).
352 Make sure the PXMs cover all memory. */
353static int __init nodes_cover_memory(const struct bootnode *nodes)
354{
355 int i;
356 unsigned long pxmram, e820ram;
357
358 pxmram = 0;
359 for_each_node_mask(i, nodes_parsed) {
360 unsigned long s = nodes[i].start >> PAGE_SHIFT;
361 unsigned long e = nodes[i].end >> PAGE_SHIFT;
362 pxmram += e - s;
363 pxmram -= absent_pages_in_range(s, e);
364 if ((long)pxmram < 0)
365 pxmram = 0;
366 }
367
368 e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
369 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
370 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
371 printk(KERN_ERR
372 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
373 (pxmram << PAGE_SHIFT) >> 20,
374 (e820ram << PAGE_SHIFT) >> 20);
375 return 0;
376 }
377 return 1;
378}
379
380static void unparse_node(int node)
381{
382 int i;
383 node_clear(node, nodes_parsed);
384 for (i = 0; i < MAX_LOCAL_APIC; i++) {
385 if (apicid_to_node[i] == node)
386 apicid_to_node[i] = NUMA_NO_NODE;
387 }
388}
389
390void __init acpi_numa_arch_fixup(void) {}
391
392/* Use the information discovered above to actually set up the nodes.
   start/end bound the memory the nodes are clipped to.
   Returns 0 on success and -1 when the SRAT data is absent or rejected. */
393int __init acpi_scan_nodes(unsigned long start, unsigned long end)
394{
395 int i;
396
/* acpi_numa is only set to 1 once a processor affinity entry was accepted. */
397 if (acpi_numa <= 0)
398 return -1;
399
400 /* First clean up the node list */
401 for (i = 0; i < MAX_NUMNODES; i++) {
402 cutoff_node(i, start, end);
/* Nodes smaller than NODE_MIN_SIZE (including empty ones) are dropped. */
403 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
404 unparse_node(i);
405 node_set_offline(i);
406 }
407 }
408
409 if (!nodes_cover_memory(nodes)) {
410 bad_srat();
411 return -1;
412 }
413
414 memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
415 if (memnode_shift < 0) {
416 printk(KERN_ERR
417 "SRAT: No NUMA node hash function found. Contact maintainer\n");
418 bad_srat();
419 return -1;
420 }
421
422 node_possible_map = nodes_parsed;
423
424 /* Finally register nodes */
425 for_each_node_mask(i, node_possible_map)
426 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
427 /* Try again in case setup_node_bootmem missed one due
428 to missing bootmem */
429 for_each_node_mask(i, node_possible_map)
430 if (!node_online(i))
431 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
432
/* Unassign CPUs whose node is not in node_possible_map. */
433 for (i = 0; i < NR_CPUS; i++) {
434 if (cpu_to_node[i] == NUMA_NO_NODE)
435 continue;
436 if (!node_isset(cpu_to_node[i], node_possible_map))
437 numa_set_node(i, NUMA_NO_NODE);
438 }
439 numa_init_array();
440 return 0;
441}
442
443#ifdef CONFIG_NUMA_EMU
444static int __init find_node_by_addr(unsigned long addr)
445{
446 int ret = NUMA_NO_NODE;
447 int i;
448
449 for_each_node_mask(i, nodes_parsed) {
450 /*
451 * Find the real node that this emulated node appears on. For
452 * the sake of simplicity, we only use a real node's starting
453 * address to determine which emulated node it appears on.
454 */
455 if (addr >= nodes[i].start && addr < nodes[i].end) {
456 ret = i;
457 break;
458 }
459 }
460 return i;
461}
462
463/*
464 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
465 * mappings that respect the real ACPI topology but reflect our emulated
466 * environment. For each emulated node, we find which real node it appears on
467 * and create PXM to NID mappings for those fake nodes which mirror that
468 * locality. SLIT will now represent the correct distances between emulated
469 * nodes as a result of the real topology.
470 */
471void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
472{
473 int i, j;
/* Both tables are built locally first and only installed once complete. */
474 int fake_node_to_pxm_map[MAX_NUMNODES] = {
475 [0 ... MAX_NUMNODES-1] = PXM_INVAL
476 };
477 unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
478 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
479 };
480
481 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
482 "topology.\n");
483 for (i = 0; i < num_nodes; i++) {
484 int nid, pxm;
485
/* nid is the real node the fake node's start address lies on. */
486 nid = find_node_by_addr(fake_nodes[i].start);
487 if (nid == NUMA_NO_NODE)
488 continue;
489 pxm = node_to_pxm(nid);
490 if (pxm == PXM_INVAL)
491 continue;
492 fake_node_to_pxm_map[i] = pxm;
493 /*
494 * For each apicid_to_node mapping that exists for this real
495 * node, it must now point to the fake node ID.
496 */
/* When several fake nodes share a real node, the last one processed wins for each APIC ID. */
497 for (j = 0; j < MAX_LOCAL_APIC; j++)
498 if (apicid_to_node[j] == nid)
499 fake_apicid_to_node[j] = i;
500 }
501 for (i = 0; i < num_nodes; i++)
502 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
503 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
504
/* From here on nodes_parsed describes the fake nodes: every non-empty one is marked. */
505 nodes_clear(nodes_parsed);
506 for (i = 0; i < num_nodes; i++)
507 if (fake_nodes[i].start != fake_nodes[i].end)
508 node_set(i, nodes_parsed);
509 WARN_ON(!nodes_cover_memory(fake_nodes));
510}
511
/* With NUMA emulation two nodes count as local when they share a proximity domain. */
static int null_slit_node_compare(int a, int b)
{
	int pxm_a = node_to_pxm(a);
	int pxm_b = node_to_pxm(b);

	return pxm_a == pxm_b;
}
516#else
/* Without NUMA emulation a node is only local to itself. */
static int null_slit_node_compare(int a, int b)
{
	if (a == b)
		return 1;
	return 0;
}
521#endif /* CONFIG_NUMA_EMU */
522
523void __init srat_reserve_add_area(int nodeid)
524{
525 if (found_add_area && nodes_add[nodeid].end) {
526 u64 total_mb;
527
528 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
529 "for node %d at %Lx-%Lx\n",
530 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
531 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
532 >> PAGE_SHIFT;
533 total_mb *= sizeof(struct page);
534 total_mb >>= 20;
535 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
536 "pre-allocated memory.\n", (unsigned long long)total_mb);
537 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
538 nodes_add[nodeid].end - nodes_add[nodeid].start);
539 }
540}
541
542int __node_distance(int a, int b)
543{
544 int index;
545
546 if (!acpi_slit)
547 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
548 REMOTE_DISTANCE;
549 index = acpi_slit->locality_count * node_to_pxm(a);
550 return acpi_slit->entry[index + node_to_pxm(b)];
551}
552
553EXPORT_SYMBOL(__node_distance);
554
555int memory_add_physaddr_to_nid(u64 start)
556{
557 int i, ret = 0;
558
559 for_each_node(i)
560 if (nodes_add[i].start <= start && nodes_add[i].end > start)
561 ret = i;
562
563 return ret;
564}
565EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
566