aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r--arch/x86_64/mm/Makefile11
-rw-r--r--arch/x86_64/mm/extable.c35
-rw-r--r--arch/x86_64/mm/fault.c579
-rw-r--r--arch/x86_64/mm/init.c630
-rw-r--r--arch/x86_64/mm/ioremap.c283
-rw-r--r--arch/x86_64/mm/k8topology.c168
-rw-r--r--arch/x86_64/mm/numa.c294
-rw-r--r--arch/x86_64/mm/pageattr.c235
-rw-r--r--arch/x86_64/mm/srat.c217
9 files changed, 2452 insertions, 0 deletions
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile
new file mode 100644
index 000000000000..66c354ad80ca
--- /dev/null
+++ b/arch/x86_64/mm/Makefile
@@ -0,0 +1,11 @@
1#
2# Makefile for the linux x86_64-specific parts of the memory manager.
3#
4
# Objects listed in obj-y are always part of this directory's build.
5obj-y := init.o fault.o ioremap.o extable.o pageattr.o
# The objects below are added only when the named config option is set.
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
7obj-$(CONFIG_DISCONTIGMEM) += numa.o
8obj-$(CONFIG_K8_NUMA) += k8topology.o
9obj-$(CONFIG_ACPI_NUMA) += srat.o
10
# hugetlbpage.o is composed from the object in the i386 mm directory.
11hugetlbpage-y = ../../i386/mm/hugetlbpage.o
diff --git a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c
new file mode 100644
index 000000000000..2d78f9fb4035
--- /dev/null
+++ b/arch/x86_64/mm/extable.c
@@ -0,0 +1,35 @@
1/*
2 * linux/arch/x86_64/mm/extable.c
3 */
4
5#include <linux/config.h>
6#include <linux/module.h>
7#include <linux/spinlock.h>
8#include <linux/init.h>
9#include <asm/uaccess.h>
10
11/* Simple binary search */
12const struct exception_table_entry *
13search_extable(const struct exception_table_entry *first,
14 const struct exception_table_entry *last,
15 unsigned long value)
16{
17 /* Work around a B stepping K8 bug */
18 if ((value >> 32) == 0)
19 value |= 0xffffffffUL << 32;
20
21 while (first <= last) {
22 const struct exception_table_entry *mid;
23 long diff;
24
25 mid = (last - first) / 2 + first;
26 diff = mid->insn - value;
27 if (diff == 0)
28 return mid;
29 else if (diff < 0)
30 first = mid+1;
31 else
32 last = mid-1;
33 }
34 return NULL;
35}
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
new file mode 100644
index 000000000000..5724370475cc
--- /dev/null
+++ b/arch/x86_64/mm/fault.c
@@ -0,0 +1,579 @@
1/*
2 * linux/arch/x86-64/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
6 */
7
8#include <linux/config.h>
9#include <linux/signal.h>
10#include <linux/sched.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/string.h>
14#include <linux/types.h>
15#include <linux/ptrace.h>
16#include <linux/mman.h>
17#include <linux/mm.h>
18#include <linux/smp.h>
19#include <linux/smp_lock.h>
20#include <linux/interrupt.h>
21#include <linux/init.h>
22#include <linux/tty.h>
23#include <linux/vt_kern.h> /* For unblank_screen() */
24#include <linux/compiler.h>
25#include <linux/module.h>
26#include <linux/kprobes.h>
27
28#include <asm/system.h>
29#include <asm/uaccess.h>
30#include <asm/pgalloc.h>
31#include <asm/smp.h>
32#include <asm/tlbflush.h>
33#include <asm/proto.h>
34#include <asm/kdebug.h>
35#include <asm-generic/sections.h>
36#include <asm/kdebug.h>
37
38void bust_spinlocks(int yes)
39{
40 int loglevel_save = console_loglevel;
41 if (yes) {
42 oops_in_progress = 1;
43 } else {
44#ifdef CONFIG_VT
45 unblank_screen();
46#endif
47 oops_in_progress = 0;
48 /*
49 * OK, the message is on the console. Now we call printk()
50 * without oops_in_progress set so that printk will give klogd
51 * a poke. Hold onto your hats...
52 */
53 console_loglevel = 15; /* NMI oopser may have shut the console up */
54 printk(" ");
55 console_loglevel = loglevel_save;
56 }
57}
58
59/* Sometimes the CPU reports invalid exceptions on prefetch.
60 Check that here and ignore.
61 Opcode checker based on code by Richard Brunner */
62static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
63 unsigned long error_code)
64{
65 unsigned char *instr = (unsigned char *)(regs->rip);
66 int scan_more = 1;
67 int prefetch = 0;
68 unsigned char *max_instr = instr + 15;
69
70 /* If it was a exec fault ignore */
71 if (error_code & (1<<4))
72 return 0;
73
74 /* Code segments in LDT could have a non zero base. Don't check
75 when that's possible */
76 if (regs->cs & (1<<2))
77 return 0;
78
79 if ((regs->cs & 3) != 0 && regs->rip >= TASK_SIZE)
80 return 0;
81
82 while (scan_more && instr < max_instr) {
83 unsigned char opcode;
84 unsigned char instr_hi;
85 unsigned char instr_lo;
86
87 if (__get_user(opcode, instr))
88 break;
89
90 instr_hi = opcode & 0xf0;
91 instr_lo = opcode & 0x0f;
92 instr++;
93
94 switch (instr_hi) {
95 case 0x20:
96 case 0x30:
97 /* Values 0x26,0x2E,0x36,0x3E are valid x86
98 prefixes. In long mode, the CPU will signal
99 invalid opcode if some of these prefixes are
100 present so we will never get here anyway */
101 scan_more = ((instr_lo & 7) == 0x6);
102 break;
103
104 case 0x40:
105 /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
106 Need to figure out under what instruction mode the
107 instruction was issued ... */
108 /* Could check the LDT for lm, but for now it's good
109 enough to assume that long mode only uses well known
110 segments or kernel. */
111 scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS);
112 break;
113
114 case 0x60:
115 /* 0x64 thru 0x67 are valid prefixes in all modes. */
116 scan_more = (instr_lo & 0xC) == 0x4;
117 break;
118 case 0xF0:
119 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
120 scan_more = !instr_lo || (instr_lo>>1) == 1;
121 break;
122 case 0x00:
123 /* Prefetch instruction is 0x0F0D or 0x0F18 */
124 scan_more = 0;
125 if (__get_user(opcode, instr))
126 break;
127 prefetch = (instr_lo == 0xF) &&
128 (opcode == 0x0D || opcode == 0x18);
129 break;
130 default:
131 scan_more = 0;
132 break;
133 }
134 }
135 return prefetch;
136}
137
/*
 * Probe one unsigned long at @p with __get_user().
 * Returns whatever __get_user() returns (non-zero means the read failed).
 */
static int bad_address(void *p)
{
	unsigned long *word = (unsigned long *)p;
	unsigned long scratch;

	return __get_user(scratch, word);
}
143
144void dump_pagetable(unsigned long address)
145{
146 pgd_t *pgd;
147 pud_t *pud;
148 pmd_t *pmd;
149 pte_t *pte;
150
151 asm("movq %%cr3,%0" : "=r" (pgd));
152
153 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
154 pgd += pgd_index(address);
155 printk("PGD %lx ", pgd_val(*pgd));
156 if (bad_address(pgd)) goto bad;
157 if (!pgd_present(*pgd)) goto ret;
158
159 pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
160 if (bad_address(pud)) goto bad;
161 printk("PUD %lx ", pud_val(*pud));
162 if (!pud_present(*pud)) goto ret;
163
164 pmd = pmd_offset(pud, address);
165 if (bad_address(pmd)) goto bad;
166 printk("PMD %lx ", pmd_val(*pmd));
167 if (!pmd_present(*pmd)) goto ret;
168
169 pte = pte_offset_kernel(pmd, address);
170 if (bad_address(pte)) goto bad;
171 printk("PTE %lx", pte_val(*pte));
172ret:
173 printk("\n");
174 return;
175bad:
176 printk("BAD\n");
177}
178
179static const char errata93_warning[] =
180KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
181KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
182KERN_ERR "******* Please consider a BIOS update.\n"
183KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
184
185/* Workaround for K8 erratum #93 & buggy BIOS.
186 BIOS SMM functions are required to use a specific workaround
187 to avoid corruption of the 64bit RIP register on C stepping K8.
188 A lot of BIOS that didn't get tested properly miss this.
189 The OS sees this as a page fault with the upper 32bits of RIP cleared.
190 Try to work around it here.
191 Note we only handle faults in kernel here. */
192
193static int is_errata93(struct pt_regs *regs, unsigned long address)
194{
195 static int warned;
196 if (address != regs->rip)
197 return 0;
198 if ((address >> 32) != 0)
199 return 0;
200 address |= 0xffffffffUL << 32;
201 if ((address >= (u64)_stext && address <= (u64)_etext) ||
202 (address >= MODULES_VADDR && address <= MODULES_END)) {
203 if (!warned) {
204 printk(errata93_warning);
205 warned = 1;
206 }
207 regs->rip = address;
208 return 1;
209 }
210 return 0;
211}
212
213int unhandled_signal(struct task_struct *tsk, int sig)
214{
215 if (tsk->pid == 1)
216 return 1;
217 /* Warn for strace, but not for gdb */
218 if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) &&
219 (tsk->ptrace & PT_PTRACED))
220 return 0;
221 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
222 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
223}
224
225static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
226 unsigned long error_code)
227{
228 oops_begin();
229 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
230 current->comm, address);
231 dump_pagetable(address);
232 __die("Bad pagetable", regs, error_code);
233 oops_end();
234 do_exit(SIGKILL);
235}
236
237/*
238 * Handle a fault on the vmalloc or module mapping area
239 */
240static int vmalloc_fault(unsigned long address)
241{
242 pgd_t *pgd, *pgd_ref;
243 pud_t *pud, *pud_ref;
244 pmd_t *pmd, *pmd_ref;
245 pte_t *pte, *pte_ref;
246
247 /* Copy kernel mappings over when needed. This can also
248 happen within a race in page table update. In the later
249 case just flush. */
250
251 pgd = pgd_offset(current->mm ?: &init_mm, address);
252 pgd_ref = pgd_offset_k(address);
253 if (pgd_none(*pgd_ref))
254 return -1;
255 if (pgd_none(*pgd))
256 set_pgd(pgd, *pgd_ref);
257
258 /* Below here mismatches are bugs because these lower tables
259 are shared */
260
261 pud = pud_offset(pgd, address);
262 pud_ref = pud_offset(pgd_ref, address);
263 if (pud_none(*pud_ref))
264 return -1;
265 if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
266 BUG();
267 pmd = pmd_offset(pud, address);
268 pmd_ref = pmd_offset(pud_ref, address);
269 if (pmd_none(*pmd_ref))
270 return -1;
271 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
272 BUG();
273 pte_ref = pte_offset_kernel(pmd_ref, address);
274 if (!pte_present(*pte_ref))
275 return -1;
276 pte = pte_offset_kernel(pmd, address);
277 if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref))
278 BUG();
279 __flush_tlb_all();
280 return 0;
281}
282
283int page_fault_trace = 0;
284int exception_trace = 1;
285
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
297asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
298{
299 struct task_struct *tsk;
300 struct mm_struct *mm;
301 struct vm_area_struct * vma;
302 unsigned long address;
303 const struct exception_table_entry *fixup;
304 int write;
305 siginfo_t info;
306
307#ifdef CONFIG_CHECKING
308 {
309 unsigned long gs;
310 struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
311 rdmsrl(MSR_GS_BASE, gs);
312 if (gs != (unsigned long)pda) {
313 wrmsrl(MSR_GS_BASE, pda);
314 printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
315 }
316 }
317#endif
318
319 /* get the address */
320 __asm__("movq %%cr2,%0":"=r" (address));
321 if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
322 SIGSEGV) == NOTIFY_STOP)
323 return;
324
325 if (likely(regs->eflags & X86_EFLAGS_IF))
326 local_irq_enable();
327
328 if (unlikely(page_fault_trace))
329 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
330 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
331
332 tsk = current;
333 mm = tsk->mm;
334 info.si_code = SEGV_MAPERR;
335
336
337 /*
338 * We fault-in kernel-space virtual memory on-demand. The
339 * 'reference' page table is init_mm.pgd.
340 *
341 * NOTE! We MUST NOT take any locks for this case. We may
342 * be in an interrupt or a critical region, and should
343 * only copy the information from the master page table,
344 * nothing more.
345 *
346 * This verifies that the fault happens in kernel space
347 * (error_code & 4) == 0, and that the fault was not a
348 * protection error (error_code & 1) == 0.
349 */
350 if (unlikely(address >= TASK_SIZE)) {
351 if (!(error_code & 5)) {
352 if (vmalloc_fault(address) < 0)
353 goto bad_area_nosemaphore;
354 return;
355 }
356 /*
357 * Don't take the mm semaphore here. If we fixup a prefetch
358 * fault we could otherwise deadlock.
359 */
360 goto bad_area_nosemaphore;
361 }
362
363 if (unlikely(error_code & (1 << 3)))
364 pgtable_bad(address, regs, error_code);
365
366 /*
367 * If we're in an interrupt or have no user
368 * context, we must not take the fault..
369 */
370 if (unlikely(in_atomic() || !mm))
371 goto bad_area_nosemaphore;
372
373 again:
374 /* When running in the kernel we expect faults to occur only to
375 * addresses in user space. All other faults represent errors in the
376 * kernel and should generate an OOPS. Unfortunatly, in the case of an
377 * erroneous fault occuring in a code path which already holds mmap_sem
378 * we will deadlock attempting to validate the fault against the
379 * address space. Luckily the kernel only validly references user
380 * space from well defined areas of code, which are listed in the
381 * exceptions table.
382 *
383 * As the vast majority of faults will be valid we will only perform
384 * the source reference check when there is a possibilty of a deadlock.
385 * Attempt to lock the address space, if we cannot we then validate the
386 * source. If this is invalid we can skip the address space check,
387 * thus avoiding the deadlock.
388 */
389 if (!down_read_trylock(&mm->mmap_sem)) {
390 if ((error_code & 4) == 0 &&
391 !search_exception_tables(regs->rip))
392 goto bad_area_nosemaphore;
393 down_read(&mm->mmap_sem);
394 }
395
396 vma = find_vma(mm, address);
397 if (!vma)
398 goto bad_area;
399 if (likely(vma->vm_start <= address))
400 goto good_area;
401 if (!(vma->vm_flags & VM_GROWSDOWN))
402 goto bad_area;
403 if (error_code & 4) {
404 // XXX: align red zone size with ABI
405 if (address + 128 < regs->rsp)
406 goto bad_area;
407 }
408 if (expand_stack(vma, address))
409 goto bad_area;
410/*
411 * Ok, we have a good vm_area for this memory access, so
412 * we can handle it..
413 */
414good_area:
415 info.si_code = SEGV_ACCERR;
416 write = 0;
417 switch (error_code & 3) {
418 default: /* 3: write, present */
419 /* fall through */
420 case 2: /* write, not present */
421 if (!(vma->vm_flags & VM_WRITE))
422 goto bad_area;
423 write++;
424 break;
425 case 1: /* read, present */
426 goto bad_area;
427 case 0: /* read, not present */
428 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
429 goto bad_area;
430 }
431
432 /*
433 * If for any reason at all we couldn't handle the fault,
434 * make sure we exit gracefully rather than endlessly redo
435 * the fault.
436 */
437 switch (handle_mm_fault(mm, vma, address, write)) {
438 case 1:
439 tsk->min_flt++;
440 break;
441 case 2:
442 tsk->maj_flt++;
443 break;
444 case 0:
445 goto do_sigbus;
446 default:
447 goto out_of_memory;
448 }
449
450 up_read(&mm->mmap_sem);
451 return;
452
453/*
454 * Something tried to access memory that isn't in our memory map..
455 * Fix it, but check if it's kernel or user first..
456 */
457bad_area:
458 up_read(&mm->mmap_sem);
459
460bad_area_nosemaphore:
461
462#ifdef CONFIG_IA32_EMULATION
463 /* 32bit vsyscall. map on demand. */
464 if (test_thread_flag(TIF_IA32) &&
465 address >= VSYSCALL32_BASE && address < VSYSCALL32_END) {
466 if (map_syscall32(mm, address) < 0)
467 goto out_of_memory2;
468 return;
469 }
470#endif
471
472 /* User mode accesses just cause a SIGSEGV */
473 if (error_code & 4) {
474 if (is_prefetch(regs, address, error_code))
475 return;
476
477 /* Work around K8 erratum #100 K8 in compat mode
478 occasionally jumps to illegal addresses >4GB. We
479 catch this here in the page fault handler because
480 these addresses are not reachable. Just detect this
481 case and return. Any code segment in LDT is
482 compatibility mode. */
483 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
484 (address >> 32))
485 return;
486
487 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
488 printk(
489 "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
490 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
491 tsk->comm, tsk->pid, address, regs->rip,
492 regs->rsp, error_code);
493 }
494
495 tsk->thread.cr2 = address;
496 /* Kernel addresses are always protection faults */
497 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
498 tsk->thread.trap_no = 14;
499 info.si_signo = SIGSEGV;
500 info.si_errno = 0;
501 /* info.si_code has been set above */
502 info.si_addr = (void __user *)address;
503 force_sig_info(SIGSEGV, &info, tsk);
504 return;
505 }
506
507no_context:
508
509 /* Are we prepared to handle this kernel fault? */
510 fixup = search_exception_tables(regs->rip);
511 if (fixup) {
512 regs->rip = fixup->fixup;
513 return;
514 }
515
516 /*
517 * Hall of shame of CPU/BIOS bugs.
518 */
519
520 if (is_prefetch(regs, address, error_code))
521 return;
522
523 if (is_errata93(regs, address))
524 return;
525
526/*
527 * Oops. The kernel tried to access some bad page. We'll have to
528 * terminate things with extreme prejudice.
529 */
530
531 oops_begin();
532
533 if (address < PAGE_SIZE)
534 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
535 else
536 printk(KERN_ALERT "Unable to handle kernel paging request");
537 printk(" at %016lx RIP: \n" KERN_ALERT,address);
538 printk_address(regs->rip);
539 printk("\n");
540 dump_pagetable(address);
541 __die("Oops", regs, error_code);
542 /* Executive summary in case the body of the oops scrolled away */
543 printk(KERN_EMERG "CR2: %016lx\n", address);
544 oops_end();
545 do_exit(SIGKILL);
546
547/*
548 * We ran out of memory, or some other thing happened to us that made
549 * us unable to handle the page fault gracefully.
550 */
551out_of_memory:
552 up_read(&mm->mmap_sem);
553out_of_memory2:
554 if (current->pid == 1) {
555 yield();
556 goto again;
557 }
558 printk("VM: killing process %s\n", tsk->comm);
559 if (error_code & 4)
560 do_exit(SIGKILL);
561 goto no_context;
562
563do_sigbus:
564 up_read(&mm->mmap_sem);
565
566 /* Kernel mode? Handle exceptions or die */
567 if (!(error_code & 4))
568 goto no_context;
569
570 tsk->thread.cr2 = address;
571 tsk->thread.error_code = error_code;
572 tsk->thread.trap_no = 14;
573 info.si_signo = SIGBUS;
574 info.si_errno = 0;
575 info.si_code = BUS_ADRERR;
576 info.si_addr = (void __user *)address;
577 force_sig_info(SIGBUS, &info, tsk);
578 return;
579}
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
new file mode 100644
index 000000000000..b0d604551d86
--- /dev/null
+++ b/arch/x86_64/mm/init.c
@@ -0,0 +1,630 @@
1/*
2 * linux/arch/x86_64/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 */
8
9#include <linux/config.h>
10#include <linux/signal.h>
11#include <linux/sched.h>
12#include <linux/kernel.h>
13#include <linux/errno.h>
14#include <linux/string.h>
15#include <linux/types.h>
16#include <linux/ptrace.h>
17#include <linux/mman.h>
18#include <linux/mm.h>
19#include <linux/swap.h>
20#include <linux/smp.h>
21#include <linux/init.h>
22#include <linux/pagemap.h>
23#include <linux/bootmem.h>
24#include <linux/proc_fs.h>
25
26#include <asm/processor.h>
27#include <asm/system.h>
28#include <asm/uaccess.h>
29#include <asm/pgtable.h>
30#include <asm/pgalloc.h>
31#include <asm/dma.h>
32#include <asm/fixmap.h>
33#include <asm/e820.h>
34#include <asm/apic.h>
35#include <asm/tlb.h>
36#include <asm/mmu_context.h>
37#include <asm/proto.h>
38#include <asm/smp.h>
39
40#ifndef Dprintk
41#define Dprintk(x...)
42#endif
43
44#ifdef CONFIG_GART_IOMMU
45extern int swiotlb;
46#endif
47
48extern char _stext[];
49
50DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
51
52/*
53 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
54 * physical space so we can cache the place of the first one and move
55 * around without checking the pgd every time.
56 */
57
58void show_mem(void)
59{
60 int i, total = 0, reserved = 0;
61 int shared = 0, cached = 0;
62 pg_data_t *pgdat;
63 struct page *page;
64
65 printk("Mem-info:\n");
66 show_free_areas();
67 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
68
69 for_each_pgdat(pgdat) {
70 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
71 page = pfn_to_page(pgdat->node_start_pfn + i);
72 total++;
73 if (PageReserved(page))
74 reserved++;
75 else if (PageSwapCache(page))
76 cached++;
77 else if (page_count(page))
78 shared += page_count(page) - 1;
79 }
80 }
81 printk("%d pages of RAM\n", total);
82 printk("%d reserved pages\n",reserved);
83 printk("%d pages shared\n",shared);
84 printk("%d pages swap cached\n",cached);
85}
86
87/* References to section boundaries */
88
89extern char _text, _etext, _edata, __bss_start, _end[];
90extern char __init_begin, __init_end;
91
92int after_bootmem;
93
94static void *spp_getpage(void)
95{
96 void *ptr;
97 if (after_bootmem)
98 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
99 else
100 ptr = alloc_bootmem_pages(PAGE_SIZE);
101 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
102 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
103
104 Dprintk("spp_getpage %p\n", ptr);
105 return ptr;
106}
107
108static void set_pte_phys(unsigned long vaddr,
109 unsigned long phys, pgprot_t prot)
110{
111 pgd_t *pgd;
112 pud_t *pud;
113 pmd_t *pmd;
114 pte_t *pte, new_pte;
115
116 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
117
118 pgd = pgd_offset_k(vaddr);
119 if (pgd_none(*pgd)) {
120 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
121 return;
122 }
123 pud = pud_offset(pgd, vaddr);
124 if (pud_none(*pud)) {
125 pmd = (pmd_t *) spp_getpage();
126 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
127 if (pmd != pmd_offset(pud, 0)) {
128 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
129 return;
130 }
131 }
132 pmd = pmd_offset(pud, vaddr);
133 if (pmd_none(*pmd)) {
134 pte = (pte_t *) spp_getpage();
135 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
136 if (pte != pte_offset_kernel(pmd, 0)) {
137 printk("PAGETABLE BUG #02!\n");
138 return;
139 }
140 }
141 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
142
143 pte = pte_offset_kernel(pmd, vaddr);
144 if (!pte_none(*pte) &&
145 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
146 pte_ERROR(*pte);
147 set_pte(pte, new_pte);
148
149 /*
150 * It's enough to flush this one mapping.
151 * (PGE mappings get flushed as well)
152 */
153 __flush_tlb_one(vaddr);
154}
155
156/* NOTE: this is meant to be run only at boot */
157void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
158{
159 unsigned long address = __fix_to_virt(idx);
160
161 if (idx >= __end_of_fixed_addresses) {
162 printk("Invalid __set_fixmap\n");
163 return;
164 }
165 set_pte_phys(address, phys, prot);
166}
167
168unsigned long __initdata table_start, table_end;
169
170extern pmd_t temp_boot_pmds[];
171
172static struct temp_map {
173 pmd_t *pmd;
174 void *address;
175 int allocated;
176} temp_mappings[] __initdata = {
177 { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
178 { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
179 {}
180};
181
182static __init void *alloc_low_page(int *index, unsigned long *phys)
183{
184 struct temp_map *ti;
185 int i;
186 unsigned long pfn = table_end++, paddr;
187 void *adr;
188
189 if (pfn >= end_pfn)
190 panic("alloc_low_page: ran out of memory");
191 for (i = 0; temp_mappings[i].allocated; i++) {
192 if (!temp_mappings[i].pmd)
193 panic("alloc_low_page: ran out of temp mappings");
194 }
195 ti = &temp_mappings[i];
196 paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
197 set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
198 ti->allocated = 1;
199 __flush_tlb();
200 adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
201 *index = i;
202 *phys = pfn * PAGE_SIZE;
203 return adr;
204}
205
206static __init void unmap_low_page(int i)
207{
208 struct temp_map *ti = &temp_mappings[i];
209 set_pmd(ti->pmd, __pmd(0));
210 ti->allocated = 0;
211}
212
213static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
214{
215 long i, j;
216
217 i = pud_index(address);
218 pud = pud + i;
219 for (; i < PTRS_PER_PUD; pud++, i++) {
220 int map;
221 unsigned long paddr, pmd_phys;
222 pmd_t *pmd;
223
224 paddr = address + i*PUD_SIZE;
225 if (paddr >= end) {
226 for (; i < PTRS_PER_PUD; i++, pud++)
227 set_pud(pud, __pud(0));
228 break;
229 }
230
231 if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
232 set_pud(pud, __pud(0));
233 continue;
234 }
235
236 pmd = alloc_low_page(&map, &pmd_phys);
237 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
238 for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
239 unsigned long pe;
240
241 if (paddr >= end) {
242 for (; j < PTRS_PER_PMD; j++, pmd++)
243 set_pmd(pmd, __pmd(0));
244 break;
245 }
246 pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
247 pe &= __supported_pte_mask;
248 set_pmd(pmd, __pmd(pe));
249 }
250 unmap_low_page(map);
251 }
252 __flush_tlb();
253}
254
255static void __init find_early_table_space(unsigned long end)
256{
257 unsigned long puds, pmds, tables;
258
259 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
260 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
261 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
262 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
263
264 table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
265 if (table_start == -1UL)
266 panic("Cannot find space for the kernel page tables");
267
268 table_start >>= PAGE_SHIFT;
269 table_end = table_start;
270}
271
272/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
273 This runs before bootmem is initialized and gets pages directly from the
274 physical memory. To access them they are temporarily mapped. */
275void __init init_memory_mapping(unsigned long start, unsigned long end)
276{
277 unsigned long next;
278
279 Dprintk("init_memory_mapping\n");
280
281 /*
282 * Find space for the kernel direct mapping tables.
283 * Later we should allocate these tables in the local node of the memory
284 * mapped. Unfortunately this is done currently before the nodes are
285 * discovered.
286 */
287 find_early_table_space(end);
288
289 start = (unsigned long)__va(start);
290 end = (unsigned long)__va(end);
291
292 for (; start < end; start = next) {
293 int map;
294 unsigned long pud_phys;
295 pud_t *pud = alloc_low_page(&map, &pud_phys);
296 next = start + PGDIR_SIZE;
297 if (next > end)
298 next = end;
299 phys_pud_init(pud, __pa(start), __pa(next));
300 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
301 unmap_low_page(map);
302 }
303
304 asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
305 __flush_tlb_all();
306 early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
307 table_start<<PAGE_SHIFT,
308 table_end<<PAGE_SHIFT);
309}
310
311extern struct x8664_pda cpu_pda[NR_CPUS];
312
/*
 * Clear the kernel PGD entry that covers virtual address 0 and flush
 * the TLBs.
 * Assumes all CPUs still execute in init_mm
 */
void zap_low_mappings(void)
{
	pgd_clear(pgd_offset_k(0UL));
	flush_tlb_all();
}
320
321#ifndef CONFIG_DISCONTIGMEM
322void __init paging_init(void)
323{
324 {
325 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
326 unsigned int max_dma;
327
328 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
329
330 if (end_pfn < max_dma)
331 zones_size[ZONE_DMA] = end_pfn;
332 else {
333 zones_size[ZONE_DMA] = max_dma;
334 zones_size[ZONE_NORMAL] = end_pfn - max_dma;
335 }
336 free_area_init(zones_size);
337 }
338 return;
339}
340#endif
341
342/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
343 from the CPU leading to inconsistent cache lines. address and size
344 must be aligned to 2MB boundaries.
345 Does nothing when the mapping doesn't exist. */
346void __init clear_kernel_mapping(unsigned long address, unsigned long size)
347{
348 unsigned long end = address + size;
349
350 BUG_ON(address & ~LARGE_PAGE_MASK);
351 BUG_ON(size & ~LARGE_PAGE_MASK);
352
353 for (; address < end; address += LARGE_PAGE_SIZE) {
354 pgd_t *pgd = pgd_offset_k(address);
355 pud_t *pud;
356 pmd_t *pmd;
357 if (pgd_none(*pgd))
358 continue;
359 pud = pud_offset(pgd, address);
360 if (pud_none(*pud))
361 continue;
362 pmd = pmd_offset(pud, address);
363 if (!pmd || pmd_none(*pmd))
364 continue;
365 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
366 /* Could handle this, but it should not happen currently. */
367 printk(KERN_ERR
368 "clear_kernel_mapping: mapping has been split. will leak memory\n");
369 pmd_ERROR(*pmd);
370 }
371 set_pmd(pmd, __pmd(0));
372 }
373 __flush_tlb_all();
374}
375
376static inline int page_is_ram (unsigned long pagenr)
377{
378 int i;
379
380 for (i = 0; i < e820.nr_map; i++) {
381 unsigned long addr, end;
382
383 if (e820.map[i].type != E820_RAM) /* not usable memory */
384 continue;
385 /*
386 * !!!FIXME!!! Some BIOSen report areas as RAM that
387 * are not. Notably the 640->1Mb area. We need a sanity
388 * check here.
389 */
390 addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
391 end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
392 if ((pagenr >= addr) && (pagenr < end))
393 return 1;
394 }
395 return 0;
396}
397
398extern int swiotlb_force;
399
400static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
401 kcore_vsyscall;
402
403void __init mem_init(void)
404{
405 int codesize, reservedpages, datasize, initsize;
406 int tmp;
407
408#ifdef CONFIG_SWIOTLB
409 if (swiotlb_force)
410 swiotlb = 1;
411 if (!iommu_aperture &&
412 (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
413 swiotlb = 1;
414 if (swiotlb)
415 swiotlb_init();
416#endif
417
418 /* How many end-of-memory variables you have, grandma! */
419 max_low_pfn = end_pfn;
420 max_pfn = end_pfn;
421 num_physpages = end_pfn;
422 high_memory = (void *) __va(end_pfn * PAGE_SIZE);
423
424 /* clear the zero-page */
425 memset(empty_zero_page, 0, PAGE_SIZE);
426
427 reservedpages = 0;
428
429 /* this will put all low memory onto the freelists */
430#ifdef CONFIG_DISCONTIGMEM
431 totalram_pages += numa_free_all_bootmem();
432 tmp = 0;
433 /* should count reserved pages here for all nodes */
434#else
435 max_mapnr = end_pfn;
436 if (!mem_map) BUG();
437
438 totalram_pages += free_all_bootmem();
439
440 for (tmp = 0; tmp < end_pfn; tmp++)
441 /*
442 * Only count reserved RAM pages
443 */
444 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
445 reservedpages++;
446#endif
447
448 after_bootmem = 1;
449
450 codesize = (unsigned long) &_etext - (unsigned long) &_text;
451 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
452 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
453
454 /* Register memory areas for /proc/kcore */
455 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
456 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
457 VMALLOC_END-VMALLOC_START);
458 kclist_add(&kcore_kernel, &_stext, _end - _stext);
459 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
460 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
461 VSYSCALL_END - VSYSCALL_START);
462
463 printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n",
464 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
465 end_pfn << (PAGE_SHIFT-10),
466 codesize >> 10,
467 reservedpages << (PAGE_SHIFT-10),
468 datasize >> 10,
469 initsize >> 10);
470
471 /*
472 * Subtle. SMP is doing its boot stuff late (because it has to
473 * fork idle threads) - but it also needs low mappings for the
474 * protected-mode entry to work. We zap these entries only after
475 * the WP-bit has been tested.
476 */
477#ifndef CONFIG_SMP
478 zap_low_mappings();
479#endif
480}
481
482extern char __initdata_begin[], __initdata_end[];
483
484void free_initmem(void)
485{
486 unsigned long addr;
487
488 addr = (unsigned long)(&__init_begin);
489 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
490 ClearPageReserved(virt_to_page(addr));
491 set_page_count(virt_to_page(addr), 1);
492 memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
493 free_page(addr);
494 totalram_pages++;
495 }
496 memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
497 printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
498}
499
500#ifdef CONFIG_BLK_DEV_INITRD
501void free_initrd_mem(unsigned long start, unsigned long end)
502{
503 if (start < (unsigned long)&_end)
504 return;
505 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
506 for (; start < end; start += PAGE_SIZE) {
507 ClearPageReserved(virt_to_page(start));
508 set_page_count(virt_to_page(start), 1);
509 free_page(start);
510 totalram_pages++;
511 }
512}
513#endif
514
515void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
516{
517 /* Should check here against the e820 map to avoid double free */
518#ifdef CONFIG_DISCONTIGMEM
519 int nid = phys_to_nid(phys);
520 reserve_bootmem_node(NODE_DATA(nid), phys, len);
521#else
522 reserve_bootmem(phys, len);
523#endif
524}
525
526int kern_addr_valid(unsigned long addr)
527{
528 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
529 pgd_t *pgd;
530 pud_t *pud;
531 pmd_t *pmd;
532 pte_t *pte;
533
534 if (above != 0 && above != -1UL)
535 return 0;
536
537 pgd = pgd_offset_k(addr);
538 if (pgd_none(*pgd))
539 return 0;
540
541 pud = pud_offset(pgd, addr);
542 if (pud_none(*pud))
543 return 0;
544
545 pmd = pmd_offset(pud, addr);
546 if (pmd_none(*pmd))
547 return 0;
548 if (pmd_large(*pmd))
549 return pfn_valid(pmd_pfn(*pmd));
550
551 pte = pte_offset_kernel(pmd, addr);
552 if (pte_none(*pte))
553 return 0;
554 return pfn_valid(pte_pfn(*pte));
555}
556
557#ifdef CONFIG_SYSCTL
558#include <linux/sysctl.h>
559
560extern int exception_trace, page_fault_trace;
561
562static ctl_table debug_table2[] = {
563 { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
564 proc_dointvec },
565#ifdef CONFIG_CHECKING
566 { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
567 proc_dointvec },
568#endif
569 { 0, }
570};
571
572static ctl_table debug_root_table2[] = {
573 { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
574 .child = debug_table2 },
575 { 0 },
576};
577
578static __init int x8664_sysctl_init(void)
579{
580 register_sysctl_table(debug_root_table2, 1);
581 return 0;
582}
583__initcall(x8664_sysctl_init);
584#endif
585
/* Pseudo VMAs to allow ptrace access for the vsyscall pages. x86-64 has two
   different ones: one for 32bit and one for 64bit. Use the appropriate one
   for the target task. */
589
590static struct vm_area_struct gate_vma = {
591 .vm_start = VSYSCALL_START,
592 .vm_end = VSYSCALL_END,
593 .vm_page_prot = PAGE_READONLY
594};
595
596static struct vm_area_struct gate32_vma = {
597 .vm_start = VSYSCALL32_BASE,
598 .vm_end = VSYSCALL32_END,
599 .vm_page_prot = PAGE_READONLY
600};
601
602struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
603{
604#ifdef CONFIG_IA32_EMULATION
605 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
606 /* lookup code assumes the pages are present. set them up
607 now */
608 if (__map_syscall32(tsk->mm, VSYSCALL32_BASE) < 0)
609 return NULL;
610 return &gate32_vma;
611 }
612#endif
613 return &gate_vma;
614}
615
616int in_gate_area(struct task_struct *task, unsigned long addr)
617{
618 struct vm_area_struct *vma = get_gate_vma(task);
619 return (addr >= vma->vm_start) && (addr < vma->vm_end);
620}
621
622/* Use this when you have no reliable task/vma, typically from interrupt
623 * context. It is less reliable than using the task's vma and may give
624 * false positives.
625 */
626int in_gate_area_no_task(unsigned long addr)
627{
628 return (((addr >= VSYSCALL_START) && (addr < VSYSCALL_END)) ||
629 ((addr >= VSYSCALL32_BASE) && (addr < VSYSCALL32_END)));
630}
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
new file mode 100644
index 000000000000..74ec8554b195
--- /dev/null
+++ b/arch/x86_64/mm/ioremap.c
@@ -0,0 +1,283 @@
1/*
2 * arch/x86_64/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PC's
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <asm/io.h>
15#include <asm/pgalloc.h>
16#include <asm/fixmap.h>
17#include <asm/cacheflush.h>
18#include <asm/tlbflush.h>
19#include <asm/proto.h>
20
21#define ISA_START_ADDRESS 0xa0000
22#define ISA_END_ADDRESS 0x100000
23
24static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
25 unsigned long phys_addr, unsigned long flags)
26{
27 unsigned long end;
28 unsigned long pfn;
29
30 address &= ~PMD_MASK;
31 end = address + size;
32 if (end > PMD_SIZE)
33 end = PMD_SIZE;
34 if (address >= end)
35 BUG();
36 pfn = phys_addr >> PAGE_SHIFT;
37 do {
38 if (!pte_none(*pte)) {
39 printk("remap_area_pte: page already exists\n");
40 BUG();
41 }
42 set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW |
43 _PAGE_GLOBAL | _PAGE_DIRTY | _PAGE_ACCESSED | flags)));
44 address += PAGE_SIZE;
45 pfn++;
46 pte++;
47 } while (address && (address < end));
48}
49
50static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
51 unsigned long phys_addr, unsigned long flags)
52{
53 unsigned long end;
54
55 address &= ~PUD_MASK;
56 end = address + size;
57 if (end > PUD_SIZE)
58 end = PUD_SIZE;
59 phys_addr -= address;
60 if (address >= end)
61 BUG();
62 do {
63 pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
64 if (!pte)
65 return -ENOMEM;
66 remap_area_pte(pte, address, end - address, address + phys_addr, flags);
67 address = (address + PMD_SIZE) & PMD_MASK;
68 pmd++;
69 } while (address && (address < end));
70 return 0;
71}
72
73static inline int remap_area_pud(pud_t * pud, unsigned long address, unsigned long size,
74 unsigned long phys_addr, unsigned long flags)
75{
76 unsigned long end;
77
78 address &= ~PGDIR_MASK;
79 end = address + size;
80 if (end > PGDIR_SIZE)
81 end = PGDIR_SIZE;
82 phys_addr -= address;
83 if (address >= end)
84 BUG();
85 do {
86 pmd_t * pmd = pmd_alloc(&init_mm, pud, address);
87 if (!pmd)
88 return -ENOMEM;
89 remap_area_pmd(pmd, address, end - address, address + phys_addr, flags);
90 address = (address + PUD_SIZE) & PUD_MASK;
91 pud++;
92 } while (address && (address < end));
93 return 0;
94}
95
96static int remap_area_pages(unsigned long address, unsigned long phys_addr,
97 unsigned long size, unsigned long flags)
98{
99 int error;
100 pgd_t *pgd;
101 unsigned long end = address + size;
102
103 phys_addr -= address;
104 pgd = pgd_offset_k(address);
105 flush_cache_all();
106 if (address >= end)
107 BUG();
108 spin_lock(&init_mm.page_table_lock);
109 do {
110 pud_t *pud;
111 pud = pud_alloc(&init_mm, pgd, address);
112 error = -ENOMEM;
113 if (!pud)
114 break;
115 if (remap_area_pud(pud, address, end - address,
116 phys_addr + address, flags))
117 break;
118 error = 0;
119 address = (address + PGDIR_SIZE) & PGDIR_MASK;
120 pgd++;
121 } while (address && (address < end));
122 spin_unlock(&init_mm.page_table_lock);
123 flush_tlb_all();
124 return error;
125}
126
127/*
128 * Fix up the linear direct mapping of the kernel to avoid cache attribute
129 * conflicts.
130 */
131static int
132ioremap_change_attr(unsigned long phys_addr, unsigned long size,
133 unsigned long flags)
134{
135 int err = 0;
136 if (flags && phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
137 unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
138 unsigned long vaddr = (unsigned long) __va(phys_addr);
139
140 /*
141 * Must use a address here and not struct page because the phys addr
142 * can be a in hole between nodes and not have an memmap entry.
143 */
144 err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
145 if (!err)
146 global_flush_tlb();
147 }
148 return err;
149}
150
151/*
152 * Generic mapping function
153 */
154
155/*
156 * Remap an arbitrary physical address space into the kernel virtual
157 * address space. Needed when the kernel wants to access high addresses
158 * directly.
159 *
160 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
161 * have to convert them into an offset in a page-aligned mapping, but the
162 * caller shouldn't need to know that small detail.
163 */
164void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
165{
166 void * addr;
167 struct vm_struct * area;
168 unsigned long offset, last_addr;
169
170 /* Don't allow wraparound or zero size */
171 last_addr = phys_addr + size - 1;
172 if (!size || last_addr < phys_addr)
173 return NULL;
174
175 /*
176 * Don't remap the low PCI/ISA area, it's always mapped..
177 */
178 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
179 return (__force void __iomem *)phys_to_virt(phys_addr);
180
181#ifndef CONFIG_DISCONTIGMEM
182 /*
183 * Don't allow anybody to remap normal RAM that we're using..
184 */
185 if (last_addr < virt_to_phys(high_memory)) {
186 char *t_addr, *t_end;
187 struct page *page;
188
189 t_addr = __va(phys_addr);
190 t_end = t_addr + (size - 1);
191
192 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
193 if(!PageReserved(page))
194 return NULL;
195 }
196#endif
197
198 /*
199 * Mappings have to be page-aligned
200 */
201 offset = phys_addr & ~PAGE_MASK;
202 phys_addr &= PAGE_MASK;
203 size = PAGE_ALIGN(last_addr+1) - phys_addr;
204
205 /*
206 * Ok, go for it..
207 */
208 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
209 if (!area)
210 return NULL;
211 area->phys_addr = phys_addr;
212 addr = area->addr;
213 if (remap_area_pages((unsigned long) addr, phys_addr, size, flags)) {
214 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
215 return NULL;
216 }
217 if (ioremap_change_attr(phys_addr, size, flags) < 0) {
218 area->flags &= 0xffffff;
219 vunmap(addr);
220 return NULL;
221 }
222 return (__force void __iomem *) (offset + (char *)addr);
223}
224
225/**
226 * ioremap_nocache - map bus memory into CPU space
227 * @offset: bus address of the memory
228 * @size: size of the resource to map
229 *
230 * ioremap_nocache performs a platform specific sequence of operations to
231 * make bus memory CPU accessible via the readb/readw/readl/writeb/
232 * writew/writel functions and the other mmio helpers. The returned
233 * address is not guaranteed to be usable directly as a virtual
234 * address.
235 *
236 * This version of ioremap ensures that the memory is marked uncachable
237 * on the CPU as well as honouring existing caching rules from things like
238 * the PCI bus. Note that there are other caches and buffers on many
239 * busses. In particular driver authors should read up on PCI writes
240 *
241 * It's useful if some control registers are in such an area and
242 * write combining or read caching is not desirable:
243 *
244 * Must be freed with iounmap.
245 */
246
247void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
248{
249 return __ioremap(phys_addr, size, _PAGE_PCD);
250}
251
252void iounmap(volatile void __iomem *addr)
253{
254 struct vm_struct *p, **pprev;
255
256 if (addr <= high_memory)
257 return;
258 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
259 addr < phys_to_virt(ISA_END_ADDRESS))
260 return;
261
262 write_lock(&vmlist_lock);
263 for (p = vmlist, pprev = &vmlist; p != NULL; pprev = &p->next, p = *pprev)
264 if (p->addr == (void *)(PAGE_MASK & (unsigned long)addr))
265 break;
266 if (!p) {
267 printk("__iounmap: bad address %p\n", addr);
268 goto out_unlock;
269 }
270 *pprev = p->next;
271 unmap_vm_area(p);
272 if ((p->flags >> 20) &&
273 p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) {
274 /* p->size includes the guard page, but cpa doesn't like that */
275 change_page_attr(virt_to_page(__va(p->phys_addr)),
276 p->size >> PAGE_SHIFT,
277 PAGE_KERNEL);
278 global_flush_tlb();
279 }
280out_unlock:
281 write_unlock(&vmlist_lock);
282 kfree(p);
283}
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
new file mode 100644
index 000000000000..ec35747aacd7
--- /dev/null
+++ b/arch/x86_64/mm/k8topology.c
@@ -0,0 +1,168 @@
1/*
2 * AMD K8 NUMA support.
3 * Discover the memory map and associated nodes.
4 *
5 * This version reads it directly from the K8 northbridge.
6 *
7 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
8 */
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/string.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14#include <asm/io.h>
15#include <linux/pci_ids.h>
16#include <asm/types.h>
17#include <asm/mmzone.h>
18#include <asm/proto.h>
19#include <asm/e820.h>
20#include <asm/pci-direct.h>
21#include <asm/numa.h>
22
23static __init int find_northbridge(void)
24{
25 int num;
26
27 for (num = 0; num < 32; num++) {
28 u32 header;
29
30 header = read_pci_config(0, num, 0, 0x00);
31 if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
32 continue;
33
34 header = read_pci_config(0, num, 1, 0x00);
35 if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
36 continue;
37 return num;
38 }
39
40 return -1;
41}
42
43int __init k8_scan_nodes(unsigned long start, unsigned long end)
44{
45 unsigned long prevbase;
46 struct node nodes[8];
47 int nodeid, i, nb;
48 int found = 0;
49 u32 reg;
50 unsigned numnodes;
51 nodemask_t nodes_parsed;
52
53 nodes_clear(nodes_parsed);
54
55 nb = find_northbridge();
56 if (nb < 0)
57 return nb;
58
59 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
60
61 reg = read_pci_config(0, nb, 0, 0x60);
62 numnodes = ((reg >> 4) & 0xF) + 1;
63
64 printk(KERN_INFO "Number of nodes %d\n", numnodes);
65
66 memset(&nodes,0,sizeof(nodes));
67 prevbase = 0;
68 for (i = 0; i < 8; i++) {
69 unsigned long base,limit;
70
71 base = read_pci_config(0, nb, 1, 0x40 + i*8);
72 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
73
74 nodeid = limit & 7;
75 if ((base & 3) == 0) {
76 if (i < numnodes)
77 printk("Skipping disabled node %d\n", i);
78 continue;
79 }
80 if (nodeid >= numnodes) {
81 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
82 base, limit);
83 continue;
84 }
85
86 if (!limit) {
87 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
88 base);
89 continue;
90 }
91 if ((base >> 8) & 3 || (limit >> 8) & 3) {
92 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
93 nodeid, (base>>8)&3, (limit>>8) & 3);
94 return -1;
95 }
96 if (node_isset(nodeid, nodes_parsed)) {
97 printk(KERN_INFO "Node %d already present. Skipping\n",
98 nodeid);
99 continue;
100 }
101
102 limit >>= 16;
103 limit <<= 24;
104 limit |= (1<<24)-1;
105
106 if (limit > end_pfn << PAGE_SHIFT)
107 limit = end_pfn << PAGE_SHIFT;
108 if (limit <= base)
109 continue;
110
111 base >>= 16;
112 base <<= 24;
113
114 if (base < start)
115 base = start;
116 if (limit > end)
117 limit = end;
118 if (limit == base) {
119 printk(KERN_ERR "Empty node %d\n", nodeid);
120 continue;
121 }
122 if (limit < base) {
123 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
124 nodeid, base, limit);
125 continue;
126 }
127
128 /* Could sort here, but pun for now. Should not happen anyroads. */
129 if (prevbase > base) {
130 printk(KERN_ERR "Node map not sorted %lx,%lx\n",
131 prevbase,base);
132 return -1;
133 }
134
135 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
136 nodeid, base, limit);
137
138 found++;
139
140 nodes[nodeid].start = base;
141 nodes[nodeid].end = limit;
142
143 prevbase = base;
144
145 node_set(nodeid, nodes_parsed);
146 }
147
148 if (!found)
149 return -1;
150
151 memnode_shift = compute_hash_shift(nodes, numnodes);
152 if (memnode_shift < 0) {
153 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
154 return -1;
155 }
156 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
157
158 for (i = 0; i < 8; i++) {
159 if (nodes[i].start != nodes[i].end) {
160 /* assume 1:1 NODE:CPU */
161 cpu_to_node[i] = i;
162 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
163 }
164 }
165
166 numa_init_array();
167 return 0;
168}
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
new file mode 100644
index 000000000000..fd9f25d7a6c4
--- /dev/null
+++ b/arch/x86_64/mm/numa.c
@@ -0,0 +1,294 @@
1/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
25struct pglist_data *node_data[MAX_NUMNODES];
26bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28int memnode_shift;
29u8 memnodemap[NODEMAPSIZE];
30
31unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
32cpumask_t node_to_cpumask[MAX_NUMNODES];
33
34int numa_off __initdata;
35
36int __init compute_hash_shift(struct node *nodes, int numnodes)
37{
38 int i;
39 int shift = 24;
40 u64 addr;
41
42 /* When in doubt use brute force. */
43 while (shift < 48) {
44 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
45 for (i = 0; i < numnodes; i++) {
46 if (nodes[i].start == nodes[i].end)
47 continue;
48 for (addr = nodes[i].start;
49 addr < nodes[i].end;
50 addr += (1UL << shift)) {
51 if (memnodemap[addr >> shift] != 0xff &&
52 memnodemap[addr >> shift] != i) {
53 printk(KERN_INFO
54 "node %d shift %d addr %Lx conflict %d\n",
55 i, shift, addr, memnodemap[addr>>shift]);
56 goto next;
57 }
58 memnodemap[addr >> shift] = i;
59 }
60 }
61 return shift;
62 next:
63 shift++;
64 }
65 memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE);
66 return -1;
67}
68
69/* Initialize bootmem allocator for a node */
70void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
71{
72 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
73 unsigned long nodedata_phys;
74 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
75
76 start = round_up(start, ZONE_ALIGN);
77
78 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
79
80 start_pfn = start >> PAGE_SHIFT;
81 end_pfn = end >> PAGE_SHIFT;
82
83 nodedata_phys = find_e820_area(start, end, pgdat_size);
84 if (nodedata_phys == -1L)
85 panic("Cannot find memory pgdat in node %d\n", nodeid);
86
87 Dprintk("nodedata_phys %lx\n", nodedata_phys);
88
89 node_data[nodeid] = phys_to_virt(nodedata_phys);
90 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
91 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
92 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
93 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
94
95 /* Find a place for the bootmem map */
96 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
97 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
98 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
99 if (bootmap_start == -1L)
100 panic("Not enough continuous space for bootmap on node %d", nodeid);
101 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
102
103 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
104 bootmap_start >> PAGE_SHIFT,
105 start_pfn, end_pfn);
106
107 e820_bootmem_free(NODE_DATA(nodeid), start, end);
108
109 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
110 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
111 node_set_online(nodeid);
112}
113
114/* Initialize final allocator for a zone */
115void __init setup_node_zones(int nodeid)
116{
117 unsigned long start_pfn, end_pfn;
118 unsigned long zones[MAX_NR_ZONES];
119 unsigned long dma_end_pfn;
120
121 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
122
123 start_pfn = node_start_pfn(nodeid);
124 end_pfn = node_end_pfn(nodeid);
125
126 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
127
128 /* All nodes > 0 have a zero length zone DMA */
129 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
130 if (start_pfn < dma_end_pfn) {
131 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
132 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
133 } else {
134 zones[ZONE_NORMAL] = end_pfn - start_pfn;
135 }
136
137 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
138 start_pfn, NULL);
139}
140
141void __init numa_init_array(void)
142{
143 int rr, i;
144 /* There are unfortunately some poorly designed mainboards around
145 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
146 mapping. To avoid this fill in the mapping for all possible
147 CPUs, as the number of CPUs is not known yet.
148 We round robin the existing nodes. */
149 rr = 0;
150 for (i = 0; i < NR_CPUS; i++) {
151 if (cpu_to_node[i] != NUMA_NO_NODE)
152 continue;
153 rr = next_node(rr, node_online_map);
154 if (rr == MAX_NUMNODES)
155 rr = first_node(node_online_map);
156 cpu_to_node[i] = rr;
157 rr++;
158 }
159
160 set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
161}
162
163#ifdef CONFIG_NUMA_EMU
164int numa_fake __initdata = 0;
165
166/* Numa emulation */
167static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
168{
169 int i;
170 struct node nodes[MAX_NUMNODES];
171 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
172
173 /* Kludge needed for the hash function */
174 if (hweight64(sz) > 1) {
175 unsigned long x = 1;
176 while ((x << 1) < sz)
177 x <<= 1;
178 if (x < sz/2)
179 printk("Numa emulation unbalanced. Complain to maintainer\n");
180 sz = x;
181 }
182
183 memset(&nodes,0,sizeof(nodes));
184 for (i = 0; i < numa_fake; i++) {
185 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
186 if (i == numa_fake-1)
187 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
188 nodes[i].end = nodes[i].start + sz;
189 if (i != numa_fake-1)
190 nodes[i].end--;
191 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
192 i,
193 nodes[i].start, nodes[i].end,
194 (nodes[i].end - nodes[i].start) >> 20);
195 node_set_online(i);
196 }
197 memnode_shift = compute_hash_shift(nodes, numa_fake);
198 if (memnode_shift < 0) {
199 memnode_shift = 0;
200 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
201 return -1;
202 }
203 for_each_online_node(i)
204 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
205 numa_init_array();
206 return 0;
207}
208#endif
209
210void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
211{
212 int i;
213
214#ifdef CONFIG_NUMA_EMU
215 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
216 return;
217#endif
218
219#ifdef CONFIG_ACPI_NUMA
220 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
221 end_pfn << PAGE_SHIFT))
222 return;
223#endif
224
225#ifdef CONFIG_K8_NUMA
226 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
227 return;
228#endif
229 printk(KERN_INFO "%s\n",
230 numa_off ? "NUMA turned off" : "No NUMA configuration found");
231
232 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
233 start_pfn << PAGE_SHIFT,
234 end_pfn << PAGE_SHIFT);
235 /* setup dummy node covering all memory */
236 memnode_shift = 63;
237 memnodemap[0] = 0;
238 nodes_clear(node_online_map);
239 node_set_online(0);
240 for (i = 0; i < NR_CPUS; i++)
241 cpu_to_node[i] = 0;
242 node_to_cpumask[0] = cpumask_of_cpu(0);
243 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
244}
245
246__init void numa_add_cpu(int cpu)
247{
248 /* BP is initialized elsewhere */
249 if (cpu)
250 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
251}
252
253unsigned long __init numa_free_all_bootmem(void)
254{
255 int i;
256 unsigned long pages = 0;
257 for_each_online_node(i) {
258 pages += free_all_bootmem_node(NODE_DATA(i));
259 }
260 return pages;
261}
262
263void __init paging_init(void)
264{
265 int i;
266 for_each_online_node(i) {
267 setup_node_zones(i);
268 }
269}
270
271/* [numa=off] */
272__init int numa_setup(char *opt)
273{
274 if (!strncmp(opt,"off",3))
275 numa_off = 1;
276#ifdef CONFIG_NUMA_EMU
277 if(!strncmp(opt, "fake=", 5)) {
278 numa_fake = simple_strtoul(opt+5,NULL,0); ;
279 if (numa_fake >= MAX_NUMNODES)
280 numa_fake = MAX_NUMNODES;
281 }
282#endif
283#ifdef CONFIG_ACPI_NUMA
284 if (!strncmp(opt,"noacpi",6))
285 acpi_numa = -1;
286#endif
287 return 1;
288}
289
290EXPORT_SYMBOL(cpu_to_node);
291EXPORT_SYMBOL(node_to_cpumask);
292EXPORT_SYMBOL(memnode_shift);
293EXPORT_SYMBOL(memnodemap);
294EXPORT_SYMBOL(node_data);
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
new file mode 100644
index 000000000000..94862e1ec032
--- /dev/null
+++ b/arch/x86_64/mm/pageattr.c
@@ -0,0 +1,235 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/config.h>
7#include <linux/mm.h>
8#include <linux/sched.h>
9#include <linux/highmem.h>
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <asm/uaccess.h>
13#include <asm/processor.h>
14#include <asm/tlbflush.h>
15#include <asm/io.h>
16
17static inline pte_t *lookup_address(unsigned long address)
18{
19 pgd_t *pgd = pgd_offset_k(address);
20 pud_t *pud;
21 pmd_t *pmd;
22 pte_t *pte;
23 if (pgd_none(*pgd))
24 return NULL;
25 pud = pud_offset(pgd, address);
26 if (!pud_present(*pud))
27 return NULL;
28 pmd = pmd_offset(pud, address);
29 if (!pmd_present(*pmd))
30 return NULL;
31 if (pmd_large(*pmd))
32 return (pte_t *)pmd;
33 pte = pte_offset_kernel(pmd, address);
34 if (pte && !pte_present(*pte))
35 pte = NULL;
36 return pte;
37}
38
39static struct page *split_large_page(unsigned long address, pgprot_t prot,
40 pgprot_t ref_prot)
41{
42 int i;
43 unsigned long addr;
44 struct page *base = alloc_pages(GFP_KERNEL, 0);
45 pte_t *pbase;
46 if (!base)
47 return NULL;
48 address = __pa(address);
49 addr = address & LARGE_PAGE_MASK;
50 pbase = (pte_t *)page_address(base);
51 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
52 pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
53 addr == address ? prot : ref_prot);
54 }
55 return base;
56}
57
58
59static void flush_kernel_map(void *address)
60{
61 if (0 && address && cpu_has_clflush) {
62 /* is this worth it? */
63 int i;
64 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
65 asm volatile("clflush (%0)" :: "r" (address + i));
66 } else
67 asm volatile("wbinvd":::"memory");
68 if (address)
69 __flush_tlb_one(address);
70 else
71 __flush_tlb_all();
72}
73
74
75static inline void flush_map(unsigned long address)
76{
77 on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
78}
79
80struct deferred_page {
81 struct deferred_page *next;
82 struct page *fpage;
83 unsigned long address;
84};
85static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
86
87static inline void save_page(unsigned long address, struct page *fpage)
88{
89 struct deferred_page *df;
90 df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL);
91 if (!df) {
92 flush_map(address);
93 __free_page(fpage);
94 } else {
95 df->next = df_list;
96 df->fpage = fpage;
97 df->address = address;
98 df_list = df;
99 }
100}
101
102/*
103 * No more special protections in this 2/4MB area - revert to a
104 * large page again.
105 */
106static void revert_page(unsigned long address, pgprot_t ref_prot)
107{
108 pgd_t *pgd;
109 pud_t *pud;
110 pmd_t *pmd;
111 pte_t large_pte;
112
113 pgd = pgd_offset_k(address);
114 BUG_ON(pgd_none(*pgd));
115 pud = pud_offset(pgd,address);
116 BUG_ON(pud_none(*pud));
117 pmd = pmd_offset(pud, address);
118 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
119 pgprot_val(ref_prot) |= _PAGE_PSE;
120 large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
121 set_pte((pte_t *)pmd, large_pte);
122}
123
124static int
125__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
126 pgprot_t ref_prot)
127{
128 pte_t *kpte;
129 struct page *kpte_page;
130 unsigned kpte_flags;
131 kpte = lookup_address(address);
132 if (!kpte) return 0;
133 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
134 kpte_flags = pte_val(*kpte);
135 if (pgprot_val(prot) != pgprot_val(ref_prot)) {
136 if ((kpte_flags & _PAGE_PSE) == 0) {
137 set_pte(kpte, pfn_pte(pfn, prot));
138 } else {
139 /*
140 * split_large_page will take the reference for this change_page_attr
141 * on the split page.
142 */
143 struct page *split = split_large_page(address, prot, ref_prot);
144 if (!split)
145 return -ENOMEM;
146 set_pte(kpte,mk_pte(split, ref_prot));
147 kpte_page = split;
148 }
149 get_page(kpte_page);
150 } else if ((kpte_flags & _PAGE_PSE) == 0) {
151 set_pte(kpte, pfn_pte(pfn, ref_prot));
152 __put_page(kpte_page);
153 } else
154 BUG();
155
156 /* on x86-64 the direct mapping set at boot is not using 4k pages */
157 BUG_ON(PageReserved(kpte_page));
158
159 switch (page_count(kpte_page)) {
160 case 1:
161 save_page(address, kpte_page);
162 revert_page(address, ref_prot);
163 break;
164 case 0:
165 BUG(); /* memleak and failed 2M page regeneration */
166 }
167 return 0;
168}
169
170/*
171 * Change the page attributes of an page in the linear mapping.
172 *
173 * This should be used when a page is mapped with a different caching policy
174 * than write-back somewhere - some CPUs do not like it when mappings with
175 * different caching policies exist. This changes the page attributes of the
176 * in kernel linear mapping too.
177 *
178 * The caller needs to ensure that there are no conflicting mappings elsewhere.
179 * This function only deals with the kernel linear map.
180 *
181 * Caller must call global_flush_tlb() after this.
182 */
183int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
184{
185 int err = 0;
186 int i;
187
188 down_write(&init_mm.mmap_sem);
189 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
190 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
191
192 err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
193 if (err)
194 break;
195 /* Handle kernel mapping too which aliases part of the
196 * lowmem */
197 if (__pa(address) < KERNEL_TEXT_SIZE) {
198 unsigned long addr2;
199 pgprot_t prot2 = prot;
200 addr2 = __START_KERNEL_map + __pa(address);
201 pgprot_val(prot2) &= ~_PAGE_NX;
202 err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
203 }
204 }
205 up_write(&init_mm.mmap_sem);
206 return err;
207}
208
209/* Don't call this for MMIO areas that may not have a mem_map entry */
210int change_page_attr(struct page *page, int numpages, pgprot_t prot)
211{
212 unsigned long addr = (unsigned long)page_address(page);
213 return change_page_attr_addr(addr, numpages, prot);
214}
215
216void global_flush_tlb(void)
217{
218 struct deferred_page *df, *next_df;
219
220 down_read(&init_mm.mmap_sem);
221 df = xchg(&df_list, NULL);
222 up_read(&init_mm.mmap_sem);
223 if (!df)
224 return;
225 flush_map((df && !df->next) ? df->address : 0);
226 for (; df; df = next_df) {
227 next_df = df->next;
228 if (df->fpage)
229 __free_page(df->fpage);
230 kfree(df);
231 }
232}
233
234EXPORT_SYMBOL(change_page_attr);
235EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
new file mode 100644
index 000000000000..5d01b31472e1
--- /dev/null
+++ b/arch/x86_64/mm/srat.c
@@ -0,0 +1,217 @@
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
18#include <asm/proto.h>
19#include <asm/numa.h>
20
/* SLIT table recorded by acpi_numa_slit_init(); NULL until that is called. */
static struct acpi_table_slit *acpi_slit;

/* Nodes for which acpi_numa_memory_affinity_init() has recorded a range. */
static nodemask_t nodes_parsed __initdata;
/* Nodes that setup_node() has already handed out. */
static nodemask_t nodes_found __initdata;
/* Per-node start/end range filled in from the memory affinity entries. */
static struct node nodes[MAX_NUMNODES] __initdata;
/* Proximity domain -> node number; 0xff marks a domain with no node yet. */
static __u8 pxm2node[256] = { [0 ... 255] = 0xff };
27
28static __init int setup_node(int pxm)
29{
30 unsigned node = pxm2node[pxm];
31 if (node == 0xff) {
32 if (nodes_weight(nodes_found) >= MAX_NUMNODES)
33 return -1;
34 node = first_unset_node(nodes_found);
35 node_set(node, nodes_found);
36 pxm2node[pxm] = node;
37 }
38 return pxm2node[pxm];
39}
40
41static __init int conflicting_nodes(unsigned long start, unsigned long end)
42{
43 int i;
44 for_each_online_node(i) {
45 struct node *nd = &nodes[i];
46 if (nd->start == nd->end)
47 continue;
48 if (nd->end > start && nd->start < end)
49 return 1;
50 if (nd->end == end && nd->start == start)
51 return 1;
52 }
53 return -1;
54}
55
56static __init void cutoff_node(int i, unsigned long start, unsigned long end)
57{
58 struct node *nd = &nodes[i];
59 if (nd->start < start) {
60 nd->start = start;
61 if (nd->end < nd->start)
62 nd->start = nd->end;
63 }
64 if (nd->end > end) {
65 if (!(end & 0xfff))
66 end--;
67 nd->end = end;
68 if (nd->start > nd->end)
69 nd->start = nd->end;
70 }
71}
72
/*
 * Report that the SRAT will not be used and set acpi_numa to -1, which
 * makes srat_disabled() true and acpi_scan_nodes() return -1.
 */
static __init void bad_srat(void)
{
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
}
78
79static __init inline int srat_disabled(void)
80{
81 return numa_off || acpi_numa < 0;
82}
83
/*
 * Callback for SLIT parsing: remember the table pointer in acpi_slit so
 * that __node_distance() can read it.
 */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	acpi_slit = slit;
}
89
90/* Callback for Proximity Domain -> LAPIC mapping */
91void __init
92acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
93{
94 int pxm, node;
95 if (srat_disabled() || pa->flags.enabled == 0)
96 return;
97 pxm = pa->proximity_domain;
98 node = setup_node(pxm);
99 if (node < 0) {
100 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
101 bad_srat();
102 return;
103 }
104 if (pa->apic_id >= NR_CPUS) {
105 printk(KERN_ERR "SRAT: lapic %u too large.\n",
106 pa->apic_id);
107 bad_srat();
108 return;
109 }
110 cpu_to_node[pa->apic_id] = node;
111 acpi_numa = 1;
112 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
113 pxm, pa->apic_id, node);
114}
115
116/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
117void __init
118acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
119{
120 struct node *nd;
121 unsigned long start, end;
122 int node, pxm;
123 int i;
124
125 if (srat_disabled() || ma->flags.enabled == 0)
126 return;
127 /* hotplug bit is ignored for now */
128 pxm = ma->proximity_domain;
129 node = setup_node(pxm);
130 if (node < 0) {
131 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
132 bad_srat();
133 return;
134 }
135 start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
136 end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
137 i = conflicting_nodes(start, end);
138 if (i >= 0) {
139 printk(KERN_ERR
140 "SRAT: pxm %d overlap %lx-%lx with node %d(%Lx-%Lx)\n",
141 pxm, start, end, i, nodes[i].start, nodes[i].end);
142 bad_srat();
143 return;
144 }
145 nd = &nodes[node];
146 if (!node_test_and_set(node, nodes_parsed)) {
147 nd->start = start;
148 nd->end = end;
149 } else {
150 if (start < nd->start)
151 nd->start = start;
152 if (nd->end < end)
153 nd->end = end;
154 }
155 if (!(nd->end & 0xfff))
156 nd->end--;
157 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
158 nd->start, nd->end);
159}
160
/* Intentionally empty: this file has no fixup work to do in this hook. */
void __init acpi_numa_arch_fixup(void) {}
162
163/* Use the information discovered above to actually set up the nodes. */
164int __init acpi_scan_nodes(unsigned long start, unsigned long end)
165{
166 int i;
167 if (acpi_numa <= 0)
168 return -1;
169 memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed));
170 if (memnode_shift < 0) {
171 printk(KERN_ERR
172 "SRAT: No NUMA node hash function found. Contact maintainer\n");
173 bad_srat();
174 return -1;
175 }
176 for (i = 0; i < MAX_NUMNODES; i++) {
177 if (!node_isset(i, nodes_parsed))
178 continue;
179 cutoff_node(i, start, end);
180 if (nodes[i].start == nodes[i].end) {
181 node_clear(i, nodes_parsed);
182 continue;
183 }
184 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
185 }
186 for (i = 0; i < NR_CPUS; i++) {
187 if (cpu_to_node[i] == NUMA_NO_NODE)
188 continue;
189 if (!node_isset(cpu_to_node[i], nodes_parsed))
190 cpu_to_node[i] = NUMA_NO_NODE;
191 }
192 numa_init_array();
193 return 0;
194}
195
196int node_to_pxm(int n)
197{
198 int i;
199 if (pxm2node[n] == n)
200 return n;
201 for (i = 0; i < 256; i++)
202 if (pxm2node[i] == n)
203 return i;
204 return 0;
205}
206
207int __node_distance(int a, int b)
208{
209 int index;
210
211 if (!acpi_slit)
212 return a == b ? 10 : 20;
213 index = acpi_slit->localities * node_to_pxm(a);
214 return acpi_slit->entry[index + node_to_pxm(b)];
215}
216
217EXPORT_SYMBOL(__node_distance);