diff options
Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r-- | arch/x86_64/mm/Makefile | 11 | ||||
-rw-r--r-- | arch/x86_64/mm/extable.c | 35 | ||||
-rw-r--r-- | arch/x86_64/mm/fault.c | 579 | ||||
-rw-r--r-- | arch/x86_64/mm/init.c | 630 | ||||
-rw-r--r-- | arch/x86_64/mm/ioremap.c | 283 | ||||
-rw-r--r-- | arch/x86_64/mm/k8topology.c | 168 | ||||
-rw-r--r-- | arch/x86_64/mm/numa.c | 294 | ||||
-rw-r--r-- | arch/x86_64/mm/pageattr.c | 235 | ||||
-rw-r--r-- | arch/x86_64/mm/srat.c | 217 |
9 files changed, 2452 insertions, 0 deletions
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile new file mode 100644 index 000000000000..66c354ad80ca --- /dev/null +++ b/arch/x86_64/mm/Makefile | |||
@@ -0,0 +1,11 @@ | |||
1 | # | |
2 | # Makefile for the linux x86_64-specific parts of the memory manager. | |
3 | # | |
4 | |||
# Objects listed in obj-y are built unconditionally.
5 | obj-y := init.o fault.o ioremap.o extable.o pageattr.o | |
# Each of the following objects is added only under the named CONFIG_ option.
6 | obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o | |
7 | obj-$(CONFIG_DISCONTIGMEM) += numa.o | |
8 | obj-$(CONFIG_K8_NUMA) += k8topology.o | |
9 | obj-$(CONFIG_ACPI_NUMA) += srat.o | |
10 | |||
# hugetlbpage.o is taken from the i386 mm directory rather than from a
# local source file.
11 | hugetlbpage-y = ../../i386/mm/hugetlbpage.o | |
diff --git a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c new file mode 100644 index 000000000000..2d78f9fb4035 --- /dev/null +++ b/arch/x86_64/mm/extable.c | |||
@@ -0,0 +1,35 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/mm/extable.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <linux/module.h> | ||
7 | #include <linux/spinlock.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <asm/uaccess.h> | ||
10 | |||
11 | /* Simple binary search */ | ||
12 | const struct exception_table_entry * | ||
13 | search_extable(const struct exception_table_entry *first, | ||
14 | const struct exception_table_entry *last, | ||
15 | unsigned long value) | ||
16 | { | ||
17 | /* Work around a B stepping K8 bug */ | ||
18 | if ((value >> 32) == 0) | ||
19 | value |= 0xffffffffUL << 32; | ||
20 | |||
21 | while (first <= last) { | ||
22 | const struct exception_table_entry *mid; | ||
23 | long diff; | ||
24 | |||
25 | mid = (last - first) / 2 + first; | ||
26 | diff = mid->insn - value; | ||
27 | if (diff == 0) | ||
28 | return mid; | ||
29 | else if (diff < 0) | ||
30 | first = mid+1; | ||
31 | else | ||
32 | last = mid-1; | ||
33 | } | ||
34 | return NULL; | ||
35 | } | ||
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c new file mode 100644 index 000000000000..5724370475cc --- /dev/null +++ b/arch/x86_64/mm/fault.c | |||
@@ -0,0 +1,579 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/mm/fault.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. | ||
6 | */ | ||
7 | |||
8 | #include <linux/config.h> | ||
9 | #include <linux/signal.h> | ||
10 | #include <linux/sched.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/errno.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/ptrace.h> | ||
16 | #include <linux/mman.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/smp.h> | ||
19 | #include <linux/smp_lock.h> | ||
20 | #include <linux/interrupt.h> | ||
21 | #include <linux/init.h> | ||
22 | #include <linux/tty.h> | ||
23 | #include <linux/vt_kern.h> /* For unblank_screen() */ | ||
24 | #include <linux/compiler.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/kprobes.h> | ||
27 | |||
28 | #include <asm/system.h> | ||
29 | #include <asm/uaccess.h> | ||
30 | #include <asm/pgalloc.h> | ||
31 | #include <asm/smp.h> | ||
32 | #include <asm/tlbflush.h> | ||
33 | #include <asm/proto.h> | ||
34 | #include <asm/kdebug.h> | ||
35 | #include <asm-generic/sections.h> | ||
36 | #include <asm/kdebug.h> | ||
37 | |||
38 | void bust_spinlocks(int yes) | ||
39 | { | ||
40 | int loglevel_save = console_loglevel; | ||
41 | if (yes) { | ||
42 | oops_in_progress = 1; | ||
43 | } else { | ||
44 | #ifdef CONFIG_VT | ||
45 | unblank_screen(); | ||
46 | #endif | ||
47 | oops_in_progress = 0; | ||
48 | /* | ||
49 | * OK, the message is on the console. Now we call printk() | ||
50 | * without oops_in_progress set so that printk will give klogd | ||
51 | * a poke. Hold onto your hats... | ||
52 | */ | ||
53 | console_loglevel = 15; /* NMI oopser may have shut the console up */ | ||
54 | printk(" "); | ||
55 | console_loglevel = loglevel_save; | ||
56 | } | ||
57 | } | ||
58 | |||
59 | /* Sometimes the CPU reports invalid exceptions on prefetch. | ||
60 | Check that here and ignore. | ||
61 | Opcode checker based on code by Richard Brunner */ | ||
62 | static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, | ||
63 | unsigned long error_code) | ||
64 | { | ||
65 | unsigned char *instr = (unsigned char *)(regs->rip); | ||
66 | int scan_more = 1; | ||
67 | int prefetch = 0; | ||
68 | unsigned char *max_instr = instr + 15; | ||
69 | |||
70 | /* If it was a exec fault ignore */ | ||
71 | if (error_code & (1<<4)) | ||
72 | return 0; | ||
73 | |||
74 | /* Code segments in LDT could have a non zero base. Don't check | ||
75 | when that's possible */ | ||
76 | if (regs->cs & (1<<2)) | ||
77 | return 0; | ||
78 | |||
79 | if ((regs->cs & 3) != 0 && regs->rip >= TASK_SIZE) | ||
80 | return 0; | ||
81 | |||
82 | while (scan_more && instr < max_instr) { | ||
83 | unsigned char opcode; | ||
84 | unsigned char instr_hi; | ||
85 | unsigned char instr_lo; | ||
86 | |||
87 | if (__get_user(opcode, instr)) | ||
88 | break; | ||
89 | |||
90 | instr_hi = opcode & 0xf0; | ||
91 | instr_lo = opcode & 0x0f; | ||
92 | instr++; | ||
93 | |||
94 | switch (instr_hi) { | ||
95 | case 0x20: | ||
96 | case 0x30: | ||
97 | /* Values 0x26,0x2E,0x36,0x3E are valid x86 | ||
98 | prefixes. In long mode, the CPU will signal | ||
99 | invalid opcode if some of these prefixes are | ||
100 | present so we will never get here anyway */ | ||
101 | scan_more = ((instr_lo & 7) == 0x6); | ||
102 | break; | ||
103 | |||
104 | case 0x40: | ||
105 | /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes | ||
106 | Need to figure out under what instruction mode the | ||
107 | instruction was issued ... */ | ||
108 | /* Could check the LDT for lm, but for now it's good | ||
109 | enough to assume that long mode only uses well known | ||
110 | segments or kernel. */ | ||
111 | scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS); | ||
112 | break; | ||
113 | |||
114 | case 0x60: | ||
115 | /* 0x64 thru 0x67 are valid prefixes in all modes. */ | ||
116 | scan_more = (instr_lo & 0xC) == 0x4; | ||
117 | break; | ||
118 | case 0xF0: | ||
119 | /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ | ||
120 | scan_more = !instr_lo || (instr_lo>>1) == 1; | ||
121 | break; | ||
122 | case 0x00: | ||
123 | /* Prefetch instruction is 0x0F0D or 0x0F18 */ | ||
124 | scan_more = 0; | ||
125 | if (__get_user(opcode, instr)) | ||
126 | break; | ||
127 | prefetch = (instr_lo == 0xF) && | ||
128 | (opcode == 0x0D || opcode == 0x18); | ||
129 | break; | ||
130 | default: | ||
131 | scan_more = 0; | ||
132 | break; | ||
133 | } | ||
134 | } | ||
135 | return prefetch; | ||
136 | } | ||
137 | |||
/*
 * Probe whether an unsigned long can be read from p.
 * Returns the result of __get_user(): non-zero when the read faulted.
 */
static int bad_address(void *p)
{
	unsigned long *where = (unsigned long *)p;
	unsigned long scratch;

	return __get_user(scratch, where);
}
143 | |||
144 | void dump_pagetable(unsigned long address) | ||
145 | { | ||
146 | pgd_t *pgd; | ||
147 | pud_t *pud; | ||
148 | pmd_t *pmd; | ||
149 | pte_t *pte; | ||
150 | |||
151 | asm("movq %%cr3,%0" : "=r" (pgd)); | ||
152 | |||
153 | pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); | ||
154 | pgd += pgd_index(address); | ||
155 | printk("PGD %lx ", pgd_val(*pgd)); | ||
156 | if (bad_address(pgd)) goto bad; | ||
157 | if (!pgd_present(*pgd)) goto ret; | ||
158 | |||
159 | pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address); | ||
160 | if (bad_address(pud)) goto bad; | ||
161 | printk("PUD %lx ", pud_val(*pud)); | ||
162 | if (!pud_present(*pud)) goto ret; | ||
163 | |||
164 | pmd = pmd_offset(pud, address); | ||
165 | if (bad_address(pmd)) goto bad; | ||
166 | printk("PMD %lx ", pmd_val(*pmd)); | ||
167 | if (!pmd_present(*pmd)) goto ret; | ||
168 | |||
169 | pte = pte_offset_kernel(pmd, address); | ||
170 | if (bad_address(pte)) goto bad; | ||
171 | printk("PTE %lx", pte_val(*pte)); | ||
172 | ret: | ||
173 | printk("\n"); | ||
174 | return; | ||
175 | bad: | ||
176 | printk("BAD\n"); | ||
177 | } | ||
178 | |||
179 | static const char errata93_warning[] = | ||
180 | KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" | ||
181 | KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" | ||
182 | KERN_ERR "******* Please consider a BIOS update.\n" | ||
183 | KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; | ||
184 | |||
185 | /* Workaround for K8 erratum #93 & buggy BIOS. | ||
186 | BIOS SMM functions are required to use a specific workaround | ||
187 | to avoid corruption of the 64bit RIP register on C stepping K8. | ||
188 | A lot of BIOS that didn't get tested properly miss this. | ||
189 | The OS sees this as a page fault with the upper 32bits of RIP cleared. | ||
190 | Try to work around it here. | ||
191 | Note we only handle faults in kernel here. */ | ||
192 | |||
193 | static int is_errata93(struct pt_regs *regs, unsigned long address) | ||
194 | { | ||
195 | static int warned; | ||
196 | if (address != regs->rip) | ||
197 | return 0; | ||
198 | if ((address >> 32) != 0) | ||
199 | return 0; | ||
200 | address |= 0xffffffffUL << 32; | ||
201 | if ((address >= (u64)_stext && address <= (u64)_etext) || | ||
202 | (address >= MODULES_VADDR && address <= MODULES_END)) { | ||
203 | if (!warned) { | ||
204 | printk(errata93_warning); | ||
205 | warned = 1; | ||
206 | } | ||
207 | regs->rip = address; | ||
208 | return 1; | ||
209 | } | ||
210 | return 0; | ||
211 | } | ||
212 | |||
213 | int unhandled_signal(struct task_struct *tsk, int sig) | ||
214 | { | ||
215 | if (tsk->pid == 1) | ||
216 | return 1; | ||
217 | /* Warn for strace, but not for gdb */ | ||
218 | if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) && | ||
219 | (tsk->ptrace & PT_PTRACED)) | ||
220 | return 0; | ||
221 | return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || | ||
222 | (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); | ||
223 | } | ||
224 | |||
225 | static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, | ||
226 | unsigned long error_code) | ||
227 | { | ||
228 | oops_begin(); | ||
229 | printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", | ||
230 | current->comm, address); | ||
231 | dump_pagetable(address); | ||
232 | __die("Bad pagetable", regs, error_code); | ||
233 | oops_end(); | ||
234 | do_exit(SIGKILL); | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * Handle a fault on the vmalloc or module mapping area | ||
239 | */ | ||
240 | static int vmalloc_fault(unsigned long address) | ||
241 | { | ||
242 | pgd_t *pgd, *pgd_ref; | ||
243 | pud_t *pud, *pud_ref; | ||
244 | pmd_t *pmd, *pmd_ref; | ||
245 | pte_t *pte, *pte_ref; | ||
246 | |||
247 | /* Copy kernel mappings over when needed. This can also | ||
248 | happen within a race in page table update. In the later | ||
249 | case just flush. */ | ||
250 | |||
251 | pgd = pgd_offset(current->mm ?: &init_mm, address); | ||
252 | pgd_ref = pgd_offset_k(address); | ||
253 | if (pgd_none(*pgd_ref)) | ||
254 | return -1; | ||
255 | if (pgd_none(*pgd)) | ||
256 | set_pgd(pgd, *pgd_ref); | ||
257 | |||
258 | /* Below here mismatches are bugs because these lower tables | ||
259 | are shared */ | ||
260 | |||
261 | pud = pud_offset(pgd, address); | ||
262 | pud_ref = pud_offset(pgd_ref, address); | ||
263 | if (pud_none(*pud_ref)) | ||
264 | return -1; | ||
265 | if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref)) | ||
266 | BUG(); | ||
267 | pmd = pmd_offset(pud, address); | ||
268 | pmd_ref = pmd_offset(pud_ref, address); | ||
269 | if (pmd_none(*pmd_ref)) | ||
270 | return -1; | ||
271 | if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) | ||
272 | BUG(); | ||
273 | pte_ref = pte_offset_kernel(pmd_ref, address); | ||
274 | if (!pte_present(*pte_ref)) | ||
275 | return -1; | ||
276 | pte = pte_offset_kernel(pmd, address); | ||
277 | if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref)) | ||
278 | BUG(); | ||
279 | __flush_tlb_all(); | ||
280 | return 0; | ||
281 | } | ||
282 | |||
/* When non-zero, do_page_fault() printk()s the registers, address and
   error code of every fault it is called for. */
283 | int page_fault_trace = 0; | |
/* When non-zero, do_page_fault() logs user-mode segfaults for which
   unhandled_signal() returns true. */
284 | int exception_trace = 1; | |
285 | |||
286 | /* | |
287 | * This routine handles page faults. It determines the address, | |
288 | * and the problem, and then passes it off to one of the appropriate | |
289 | * routines. | |
290 | * | |
291 | * error_code: | |
292 | * bit 0 == 0 means no page found, 1 means protection fault | |
293 | * bit 1 == 0 means read, 1 means write | |
294 | * bit 2 == 0 means kernel, 1 means user-mode | |
295 | * bit 3 == 1 means use of a reserved page-table bit was detected | |
 *	(handled below as a corrupted page table via pgtable_bad())
 * bit 4 == 1 means fault was an instruction fetch
 *	(this is the bit is_prefetch() tests for an exec fault)
296 | */ | |
297 | asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) | |
298 | { | |
299 | struct task_struct *tsk; | |
300 | struct mm_struct *mm; | |
301 | struct vm_area_struct * vma; | |
302 | unsigned long address; | |
303 | const struct exception_table_entry *fixup; | |
304 | int write; | |
305 | siginfo_t info; | |
306 | |||
307 | #ifdef CONFIG_CHECKING | |
308 | { | |
309 | unsigned long gs; | |
310 | struct x8664_pda *pda = cpu_pda + stack_smp_processor_id(); | |
311 | rdmsrl(MSR_GS_BASE, gs); | |
312 | if (gs != (unsigned long)pda) { | |
313 | wrmsrl(MSR_GS_BASE, pda); | |
314 | printk("page_fault: wrong gs %lx expected %p\n", gs, pda); | |
315 | } | |
316 | } | |
317 | #endif | |
318 | |||
319 | /* get the address */ | |
320 | __asm__("movq %%cr2,%0":"=r" (address)); | |
/* Give registered die notifiers the chance to consume the fault. */
321 | if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | |
322 | SIGSEGV) == NOTIFY_STOP) | |
323 | return; | |
324 | |||
/* Re-enable interrupts only if the faulting context had them enabled. */
325 | if (likely(regs->eflags & X86_EFLAGS_IF)) | |
326 | local_irq_enable(); | |
327 | |||
328 | if (unlikely(page_fault_trace)) | |
329 | printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", | |
330 | regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); | |
331 | |||
332 | tsk = current; | |
333 | mm = tsk->mm; | |
/* Default si_code; switched to SEGV_ACCERR once a vma is found. */
334 | info.si_code = SEGV_MAPERR; | |
335 | |||
336 | |||
337 | /* | |
338 | * We fault-in kernel-space virtual memory on-demand. The | |
339 | * 'reference' page table is init_mm.pgd. | |
340 | * | |
341 | * NOTE! We MUST NOT take any locks for this case. We may | |
342 | * be in an interrupt or a critical region, and should | |
343 | * only copy the information from the master page table, | |
344 | * nothing more. | |
345 | * | |
346 | * This verifies that the fault happens in kernel space | |
347 | * (error_code & 4) == 0, and that the fault was not a | |
348 | * protection error (error_code & 1) == 0. | |
349 | */ | |
350 | if (unlikely(address >= TASK_SIZE)) { | |
351 | if (!(error_code & 5)) { | |
352 | if (vmalloc_fault(address) < 0) | |
353 | goto bad_area_nosemaphore; | |
354 | return; | |
355 | } | |
356 | /* | |
357 | * Don't take the mm semaphore here. If we fixup a prefetch | |
358 | * fault we could otherwise deadlock. | |
359 | */ | |
360 | goto bad_area_nosemaphore; | |
361 | } | |
362 | |||
/* Error-code bit 3 set: treated as a corrupted page table. */
363 | if (unlikely(error_code & (1 << 3))) | |
364 | pgtable_bad(address, regs, error_code); | |
365 | |||
366 | /* | |
367 | * If we're in an interrupt or have no user | |
368 | * context, we must not take the fault.. | |
369 | */ | |
370 | if (unlikely(in_atomic() || !mm)) | |
371 | goto bad_area_nosemaphore; | |
372 | |||
373 | again: | |
374 | /* When running in the kernel we expect faults to occur only to | |
375 | * addresses in user space. All other faults represent errors in the | |
376 | * kernel and should generate an OOPS. Unfortunately, in the case of an | |
377 | * erroneous fault occurring in a code path which already holds mmap_sem | |
378 | * we will deadlock attempting to validate the fault against the | |
379 | * address space. Luckily the kernel only validly references user | |
380 | * space from well defined areas of code, which are listed in the | |
381 | * exceptions table. | |
382 | * | |
383 | * As the vast majority of faults will be valid we will only perform | |
384 | * the source reference check when there is a possibility of a deadlock. | |
385 | * Attempt to lock the address space, if we cannot we then validate the | |
386 | * source. If this is invalid we can skip the address space check, | |
387 | * thus avoiding the deadlock. | |
388 | */ | |
389 | if (!down_read_trylock(&mm->mmap_sem)) { | |
390 | if ((error_code & 4) == 0 && | |
391 | !search_exception_tables(regs->rip)) | |
392 | goto bad_area_nosemaphore; | |
393 | down_read(&mm->mmap_sem); | |
394 | } | |
395 | |||
396 | vma = find_vma(mm, address); | |
397 | if (!vma) | |
398 | goto bad_area; | |
399 | if (likely(vma->vm_start <= address)) | |
400 | goto good_area; | |
/* The address lies below the vma found: only acceptable if that vma
   can grow downwards. */
401 | if (!(vma->vm_flags & VM_GROWSDOWN)) | |
402 | goto bad_area; | |
403 | if (error_code & 4) { | |
404 | // XXX: align red zone size with ABI | |
405 | if (address + 128 < regs->rsp) | |
406 | goto bad_area; | |
407 | } | |
408 | if (expand_stack(vma, address)) | |
409 | goto bad_area; | |
410 | /* | |
411 | * Ok, we have a good vm_area for this memory access, so | |
412 | * we can handle it.. | |
413 | */ | |
414 | good_area: | |
415 | info.si_code = SEGV_ACCERR; | |
416 | write = 0; | |
417 | switch (error_code & 3) { | |
418 | default: /* 3: write, present */ | |
419 | /* fall through */ | |
420 | case 2: /* write, not present */ | |
421 | if (!(vma->vm_flags & VM_WRITE)) | |
422 | goto bad_area; | |
423 | write++; | |
424 | break; | |
425 | case 1: /* read, present */ | |
426 | goto bad_area; | |
427 | case 0: /* read, not present */ | |
428 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) | |
429 | goto bad_area; | |
430 | } | |
431 | |||
432 | /* | |
433 | * If for any reason at all we couldn't handle the fault, | |
434 | * make sure we exit gracefully rather than endlessly redo | |
435 | * the fault. | |
436 | */ | |
/* Return value 1 is accounted as a minor fault, 2 as a major fault,
   0 raises SIGBUS and anything else is treated as out of memory. */
437 | switch (handle_mm_fault(mm, vma, address, write)) { | |
438 | case 1: | |
439 | tsk->min_flt++; | |
440 | break; | |
441 | case 2: | |
442 | tsk->maj_flt++; | |
443 | break; | |
444 | case 0: | |
445 | goto do_sigbus; | |
446 | default: | |
447 | goto out_of_memory; | |
448 | } | |
449 | |||
450 | up_read(&mm->mmap_sem); | |
451 | return; | |
452 | |||
453 | /* | |
454 | * Something tried to access memory that isn't in our memory map.. | |
455 | * Fix it, but check if it's kernel or user first.. | |
456 | */ | |
457 | bad_area: | |
458 | up_read(&mm->mmap_sem); | |
459 | |||
460 | bad_area_nosemaphore: | |
461 | |||
462 | #ifdef CONFIG_IA32_EMULATION | |
463 | /* 32bit vsyscall. map on demand. */ | |
464 | if (test_thread_flag(TIF_IA32) && | |
465 | address >= VSYSCALL32_BASE && address < VSYSCALL32_END) { | |
466 | if (map_syscall32(mm, address) < 0) | |
467 | goto out_of_memory2; | |
468 | return; | |
469 | } | |
470 | #endif | |
471 | |||
472 | /* User mode accesses just cause a SIGSEGV */ | |
473 | if (error_code & 4) { | |
474 | if (is_prefetch(regs, address, error_code)) | |
475 | return; | |
476 | |||
477 | /* Work around K8 erratum #100 K8 in compat mode | |
478 | occasionally jumps to illegal addresses >4GB. We | |
479 | catch this here in the page fault handler because | |
480 | these addresses are not reachable. Just detect this | |
481 | case and return. Any code segment in LDT is | |
482 | compatibility mode. */ | |
483 | if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && | |
484 | (address >> 32)) | |
485 | return; | |
486 | |||
487 | if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { | |
488 | printk( | |
489 | "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", | |
490 | tsk->pid > 1 ? KERN_INFO : KERN_EMERG, | |
491 | tsk->comm, tsk->pid, address, regs->rip, | |
492 | regs->rsp, error_code); | |
493 | } | |
494 | |||
495 | tsk->thread.cr2 = address; | |
496 | /* Kernel addresses are always protection faults */ | |
497 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); | |
498 | tsk->thread.trap_no = 14; | |
499 | info.si_signo = SIGSEGV; | |
500 | info.si_errno = 0; | |
501 | /* info.si_code has been set above */ | |
502 | info.si_addr = (void __user *)address; | |
503 | force_sig_info(SIGSEGV, &info, tsk); | |
504 | return; | |
505 | } | |
506 | |||
507 | no_context: | |
508 | |||
509 | /* Are we prepared to handle this kernel fault? */ | |
510 | fixup = search_exception_tables(regs->rip); | |
511 | if (fixup) { | |
512 | regs->rip = fixup->fixup; | |
513 | return; | |
514 | } | |
515 | |||
516 | /* | |
517 | * Hall of shame of CPU/BIOS bugs. | |
518 | */ | |
519 | |||
520 | if (is_prefetch(regs, address, error_code)) | |
521 | return; | |
522 | |||
523 | if (is_errata93(regs, address)) | |
524 | return; | |
525 | |||
526 | /* | |
527 | * Oops. The kernel tried to access some bad page. We'll have to | |
528 | * terminate things with extreme prejudice. | |
529 | */ | |
530 | |||
531 | oops_begin(); | |
532 | |||
533 | if (address < PAGE_SIZE) | |
534 | printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); | |
535 | else | |
536 | printk(KERN_ALERT "Unable to handle kernel paging request"); | |
537 | printk(" at %016lx RIP: \n" KERN_ALERT,address); | |
538 | printk_address(regs->rip); | |
539 | printk("\n"); | |
540 | dump_pagetable(address); | |
541 | __die("Oops", regs, error_code); | |
542 | /* Executive summary in case the body of the oops scrolled away */ | |
543 | printk(KERN_EMERG "CR2: %016lx\n", address); | |
544 | oops_end(); | |
545 | do_exit(SIGKILL); | |
546 | |||
547 | /* | |
548 | * We ran out of memory, or some other thing happened to us that made | |
549 | * us unable to handle the page fault gracefully. | |
550 | */ | |
551 | out_of_memory: | |
552 | up_read(&mm->mmap_sem); | |
553 | out_of_memory2: | |
/* pid 1 is never killed here: it yields the CPU and retries the fault. */
554 | if (current->pid == 1) { | |
555 | yield(); | |
556 | goto again; | |
557 | } | |
558 | printk("VM: killing process %s\n", tsk->comm); | |
559 | if (error_code & 4) | |
560 | do_exit(SIGKILL); | |
561 | goto no_context; | |
562 | |||
563 | do_sigbus: | |
564 | up_read(&mm->mmap_sem); | |
565 | |||
566 | /* Kernel mode? Handle exceptions or die */ | |
567 | if (!(error_code & 4)) | |
568 | goto no_context; | |
569 | |||
570 | tsk->thread.cr2 = address; | |
571 | tsk->thread.error_code = error_code; | |
572 | tsk->thread.trap_no = 14; | |
573 | info.si_signo = SIGBUS; | |
574 | info.si_errno = 0; | |
575 | info.si_code = BUS_ADRERR; | |
576 | info.si_addr = (void __user *)address; | |
577 | force_sig_info(SIGBUS, &info, tsk); | |
578 | return; | |
579 | } | |
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c new file mode 100644 index 000000000000..b0d604551d86 --- /dev/null +++ b/arch/x86_64/mm/init.c | |||
@@ -0,0 +1,630 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/mm/init.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
6 | * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> | ||
7 | */ | ||
8 | |||
9 | #include <linux/config.h> | ||
10 | #include <linux/signal.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/errno.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/types.h> | ||
16 | #include <linux/ptrace.h> | ||
17 | #include <linux/mman.h> | ||
18 | #include <linux/mm.h> | ||
19 | #include <linux/swap.h> | ||
20 | #include <linux/smp.h> | ||
21 | #include <linux/init.h> | ||
22 | #include <linux/pagemap.h> | ||
23 | #include <linux/bootmem.h> | ||
24 | #include <linux/proc_fs.h> | ||
25 | |||
26 | #include <asm/processor.h> | ||
27 | #include <asm/system.h> | ||
28 | #include <asm/uaccess.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | #include <asm/pgalloc.h> | ||
31 | #include <asm/dma.h> | ||
32 | #include <asm/fixmap.h> | ||
33 | #include <asm/e820.h> | ||
34 | #include <asm/apic.h> | ||
35 | #include <asm/tlb.h> | ||
36 | #include <asm/mmu_context.h> | ||
37 | #include <asm/proto.h> | ||
38 | #include <asm/smp.h> | ||
39 | |||
40 | #ifndef Dprintk | ||
41 | #define Dprintk(x...) | ||
42 | #endif | ||
43 | |||
44 | #ifdef CONFIG_GART_IOMMU | ||
45 | extern int swiotlb; | ||
46 | #endif | ||
47 | |||
48 | extern char _stext[]; | ||
49 | |||
50 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | ||
51 | |||
52 | /* | ||
53 | * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in | |
54 | * physical space so we can cache the place of the first one and move | |
55 | * around without checking the pgd every time. | |
56 | */ | ||
57 | |||
58 | void show_mem(void) | ||
59 | { | ||
60 | int i, total = 0, reserved = 0; | ||
61 | int shared = 0, cached = 0; | ||
62 | pg_data_t *pgdat; | ||
63 | struct page *page; | ||
64 | |||
65 | printk("Mem-info:\n"); | ||
66 | show_free_areas(); | ||
67 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | ||
68 | |||
69 | for_each_pgdat(pgdat) { | ||
70 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { | ||
71 | page = pfn_to_page(pgdat->node_start_pfn + i); | ||
72 | total++; | ||
73 | if (PageReserved(page)) | ||
74 | reserved++; | ||
75 | else if (PageSwapCache(page)) | ||
76 | cached++; | ||
77 | else if (page_count(page)) | ||
78 | shared += page_count(page) - 1; | ||
79 | } | ||
80 | } | ||
81 | printk("%d pages of RAM\n", total); | ||
82 | printk("%d reserved pages\n",reserved); | ||
83 | printk("%d pages shared\n",shared); | ||
84 | printk("%d pages swap cached\n",cached); | ||
85 | } | ||
86 | |||
87 | /* References to section boundaries */ | ||
88 | |||
89 | extern char _text, _etext, _edata, __bss_start, _end[]; | ||
90 | extern char __init_begin, __init_end; | ||
91 | |||
92 | int after_bootmem; | ||
93 | |||
94 | static void *spp_getpage(void) | ||
95 | { | ||
96 | void *ptr; | ||
97 | if (after_bootmem) | ||
98 | ptr = (void *) get_zeroed_page(GFP_ATOMIC); | ||
99 | else | ||
100 | ptr = alloc_bootmem_pages(PAGE_SIZE); | ||
101 | if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) | ||
102 | panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); | ||
103 | |||
104 | Dprintk("spp_getpage %p\n", ptr); | ||
105 | return ptr; | ||
106 | } | ||
107 | |||
108 | static void set_pte_phys(unsigned long vaddr, | ||
109 | unsigned long phys, pgprot_t prot) | ||
110 | { | ||
111 | pgd_t *pgd; | ||
112 | pud_t *pud; | ||
113 | pmd_t *pmd; | ||
114 | pte_t *pte, new_pte; | ||
115 | |||
116 | Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); | ||
117 | |||
118 | pgd = pgd_offset_k(vaddr); | ||
119 | if (pgd_none(*pgd)) { | ||
120 | printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); | ||
121 | return; | ||
122 | } | ||
123 | pud = pud_offset(pgd, vaddr); | ||
124 | if (pud_none(*pud)) { | ||
125 | pmd = (pmd_t *) spp_getpage(); | ||
126 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); | ||
127 | if (pmd != pmd_offset(pud, 0)) { | ||
128 | printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); | ||
129 | return; | ||
130 | } | ||
131 | } | ||
132 | pmd = pmd_offset(pud, vaddr); | ||
133 | if (pmd_none(*pmd)) { | ||
134 | pte = (pte_t *) spp_getpage(); | ||
135 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); | ||
136 | if (pte != pte_offset_kernel(pmd, 0)) { | ||
137 | printk("PAGETABLE BUG #02!\n"); | ||
138 | return; | ||
139 | } | ||
140 | } | ||
141 | new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); | ||
142 | |||
143 | pte = pte_offset_kernel(pmd, vaddr); | ||
144 | if (!pte_none(*pte) && | ||
145 | pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) | ||
146 | pte_ERROR(*pte); | ||
147 | set_pte(pte, new_pte); | ||
148 | |||
149 | /* | ||
150 | * It's enough to flush this one mapping. | ||
151 | * (PGE mappings get flushed as well) | ||
152 | */ | ||
153 | __flush_tlb_one(vaddr); | ||
154 | } | ||
155 | |||
156 | /* NOTE: this is meant to be run only at boot */ | ||
157 | void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) | ||
158 | { | ||
159 | unsigned long address = __fix_to_virt(idx); | ||
160 | |||
161 | if (idx >= __end_of_fixed_addresses) { | ||
162 | printk("Invalid __set_fixmap\n"); | ||
163 | return; | ||
164 | } | ||
165 | set_pte_phys(address, phys, prot); | ||
166 | } | ||
167 | |||
/* Page frame numbers delimiting the early page-table area:
   find_early_table_space() sets both to the start of the area, and
   alloc_low_page() advances table_end by one for each page it hands out. */
168 | unsigned long __initdata table_start, table_end; | |
169 | |||
170 | extern pmd_t temp_boot_pmds[]; | |
171 | |||
/* Temporary mapping slots used by alloc_low_page()/unmap_low_page().
   pmd:       pmd entry rewritten to point at the physical page being set up
   address:   base virtual address added to the page offset to reach that
              page (presumably the range the pmd translates -- TODO confirm)
   allocated: non-zero while the slot is in use
   The empty last entry (NULL pmd) terminates the table. */
172 | static struct temp_map { | |
173 | pmd_t *pmd; | |
174 | void *address; | |
175 | int allocated; | |
176 | } temp_mappings[] __initdata = { | |
177 | { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) }, | |
178 | { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, | |
179 | {} | |
180 | }; | |
181 | |||
182 | static __init void *alloc_low_page(int *index, unsigned long *phys) | ||
183 | { | ||
184 | struct temp_map *ti; | ||
185 | int i; | ||
186 | unsigned long pfn = table_end++, paddr; | ||
187 | void *adr; | ||
188 | |||
189 | if (pfn >= end_pfn) | ||
190 | panic("alloc_low_page: ran out of memory"); | ||
191 | for (i = 0; temp_mappings[i].allocated; i++) { | ||
192 | if (!temp_mappings[i].pmd) | ||
193 | panic("alloc_low_page: ran out of temp mappings"); | ||
194 | } | ||
195 | ti = &temp_mappings[i]; | ||
196 | paddr = (pfn << PAGE_SHIFT) & PMD_MASK; | ||
197 | set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); | ||
198 | ti->allocated = 1; | ||
199 | __flush_tlb(); | ||
200 | adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); | ||
201 | *index = i; | ||
202 | *phys = pfn * PAGE_SIZE; | ||
203 | return adr; | ||
204 | } | ||
205 | |||
206 | static __init void unmap_low_page(int i) | ||
207 | { | ||
208 | struct temp_map *ti = &temp_mappings[i]; | ||
209 | set_pmd(ti->pmd, __pmd(0)); | ||
210 | ti->allocated = 0; | ||
211 | } | ||
212 | |||
213 | static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) | ||
214 | { | ||
215 | long i, j; | ||
216 | |||
217 | i = pud_index(address); | ||
218 | pud = pud + i; | ||
219 | for (; i < PTRS_PER_PUD; pud++, i++) { | ||
220 | int map; | ||
221 | unsigned long paddr, pmd_phys; | ||
222 | pmd_t *pmd; | ||
223 | |||
224 | paddr = address + i*PUD_SIZE; | ||
225 | if (paddr >= end) { | ||
226 | for (; i < PTRS_PER_PUD; i++, pud++) | ||
227 | set_pud(pud, __pud(0)); | ||
228 | break; | ||
229 | } | ||
230 | |||
231 | if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { | ||
232 | set_pud(pud, __pud(0)); | ||
233 | continue; | ||
234 | } | ||
235 | |||
236 | pmd = alloc_low_page(&map, &pmd_phys); | ||
237 | set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); | ||
238 | for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { | ||
239 | unsigned long pe; | ||
240 | |||
241 | if (paddr >= end) { | ||
242 | for (; j < PTRS_PER_PMD; j++, pmd++) | ||
243 | set_pmd(pmd, __pmd(0)); | ||
244 | break; | ||
245 | } | ||
246 | pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr; | ||
247 | pe &= __supported_pte_mask; | ||
248 | set_pmd(pmd, __pmd(pe)); | ||
249 | } | ||
250 | unmap_low_page(map); | ||
251 | } | ||
252 | __flush_tlb(); | ||
253 | } | ||
254 | |||
255 | static void __init find_early_table_space(unsigned long end) | ||
256 | { | ||
257 | unsigned long puds, pmds, tables; | ||
258 | |||
259 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | ||
260 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | ||
261 | tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + | ||
262 | round_up(pmds * sizeof(pmd_t), PAGE_SIZE); | ||
263 | |||
264 | table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables); | ||
265 | if (table_start == -1UL) | ||
266 | panic("Cannot find space for the kernel page tables"); | ||
267 | |||
268 | table_start >>= PAGE_SHIFT; | ||
269 | table_end = table_start; | ||
270 | } | ||
271 | |||
272 | /* Setup the direct mapping of the physical memory at PAGE_OFFSET. | ||
273 | This runs before bootmem is initialized and gets pages directly from the | ||
274 | physical memory. To access them they are temporarily mapped. */ | ||
275 | void __init init_memory_mapping(unsigned long start, unsigned long end) | ||
276 | { | ||
277 | unsigned long next; | ||
278 | |||
279 | Dprintk("init_memory_mapping\n"); | ||
280 | |||
281 | /* | ||
282 | * Find space for the kernel direct mapping tables. | ||
283 | * Later we should allocate these tables in the local node of the memory | ||
284 | * mapped. Unfortunately this is done currently before the nodes are | ||
285 | * discovered. | ||
286 | */ | ||
287 | find_early_table_space(end); | ||
288 | |||
289 | start = (unsigned long)__va(start); | ||
290 | end = (unsigned long)__va(end); | ||
291 | |||
292 | for (; start < end; start = next) { | ||
293 | int map; | ||
294 | unsigned long pud_phys; | ||
295 | pud_t *pud = alloc_low_page(&map, &pud_phys); | ||
296 | next = start + PGDIR_SIZE; | ||
297 | if (next > end) | ||
298 | next = end; | ||
299 | phys_pud_init(pud, __pa(start), __pa(next)); | ||
300 | set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); | ||
301 | unmap_low_page(map); | ||
302 | } | ||
303 | |||
304 | asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); | ||
305 | __flush_tlb_all(); | ||
306 | early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, | ||
307 | table_start<<PAGE_SHIFT, | ||
308 | table_end<<PAGE_SHIFT); | ||
309 | } | ||
310 | |||
311 | extern struct x8664_pda cpu_pda[NR_CPUS]; | ||
312 | |||
/*
 * Clear the kernel pgd entry that covers virtual address 0 and flush the
 * TLBs.  Assumes all CPUs still execute in init_mm.
 */
void zap_low_mappings(void)
{
	pgd_clear(pgd_offset_k(0UL));
	flush_tlb_all();
}
320 | |||
321 | #ifndef CONFIG_DISCONTIGMEM | ||
322 | void __init paging_init(void) | ||
323 | { | ||
324 | { | ||
325 | unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; | ||
326 | unsigned int max_dma; | ||
327 | |||
328 | max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; | ||
329 | |||
330 | if (end_pfn < max_dma) | ||
331 | zones_size[ZONE_DMA] = end_pfn; | ||
332 | else { | ||
333 | zones_size[ZONE_DMA] = max_dma; | ||
334 | zones_size[ZONE_NORMAL] = end_pfn - max_dma; | ||
335 | } | ||
336 | free_area_init(zones_size); | ||
337 | } | ||
338 | return; | ||
339 | } | ||
340 | #endif | ||
341 | |||
342 | /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches | ||
343 | from the CPU leading to inconsistent cache lines. address and size | ||
344 | must be aligned to 2MB boundaries. | ||
345 | Does nothing when the mapping doesn't exist. */ | ||
346 | void __init clear_kernel_mapping(unsigned long address, unsigned long size) | ||
347 | { | ||
348 | unsigned long end = address + size; | ||
349 | |||
350 | BUG_ON(address & ~LARGE_PAGE_MASK); | ||
351 | BUG_ON(size & ~LARGE_PAGE_MASK); | ||
352 | |||
353 | for (; address < end; address += LARGE_PAGE_SIZE) { | ||
354 | pgd_t *pgd = pgd_offset_k(address); | ||
355 | pud_t *pud; | ||
356 | pmd_t *pmd; | ||
357 | if (pgd_none(*pgd)) | ||
358 | continue; | ||
359 | pud = pud_offset(pgd, address); | ||
360 | if (pud_none(*pud)) | ||
361 | continue; | ||
362 | pmd = pmd_offset(pud, address); | ||
363 | if (!pmd || pmd_none(*pmd)) | ||
364 | continue; | ||
365 | if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { | ||
366 | /* Could handle this, but it should not happen currently. */ | ||
367 | printk(KERN_ERR | ||
368 | "clear_kernel_mapping: mapping has been split. will leak memory\n"); | ||
369 | pmd_ERROR(*pmd); | ||
370 | } | ||
371 | set_pmd(pmd, __pmd(0)); | ||
372 | } | ||
373 | __flush_tlb_all(); | ||
374 | } | ||
375 | |||
376 | static inline int page_is_ram (unsigned long pagenr) | ||
377 | { | ||
378 | int i; | ||
379 | |||
380 | for (i = 0; i < e820.nr_map; i++) { | ||
381 | unsigned long addr, end; | ||
382 | |||
383 | if (e820.map[i].type != E820_RAM) /* not usable memory */ | ||
384 | continue; | ||
385 | /* | ||
386 | * !!!FIXME!!! Some BIOSen report areas as RAM that | ||
387 | * are not. Notably the 640->1Mb area. We need a sanity | ||
388 | * check here. | ||
389 | */ | ||
390 | addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; | ||
391 | end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; | ||
392 | if ((pagenr >= addr) && (pagenr < end)) | ||
393 | return 1; | ||
394 | } | ||
395 | return 0; | ||
396 | } | ||
397 | |||
398 | extern int swiotlb_force; | ||
399 | |||
400 | static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, | ||
401 | kcore_vsyscall; | ||
402 | |||
403 | void __init mem_init(void) | ||
404 | { | ||
405 | int codesize, reservedpages, datasize, initsize; | ||
406 | int tmp; | ||
407 | |||
408 | #ifdef CONFIG_SWIOTLB | ||
409 | if (swiotlb_force) | ||
410 | swiotlb = 1; | ||
411 | if (!iommu_aperture && | ||
412 | (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu)) | ||
413 | swiotlb = 1; | ||
414 | if (swiotlb) | ||
415 | swiotlb_init(); | ||
416 | #endif | ||
417 | |||
418 | /* How many end-of-memory variables you have, grandma! */ | ||
419 | max_low_pfn = end_pfn; | ||
420 | max_pfn = end_pfn; | ||
421 | num_physpages = end_pfn; | ||
422 | high_memory = (void *) __va(end_pfn * PAGE_SIZE); | ||
423 | |||
424 | /* clear the zero-page */ | ||
425 | memset(empty_zero_page, 0, PAGE_SIZE); | ||
426 | |||
427 | reservedpages = 0; | ||
428 | |||
429 | /* this will put all low memory onto the freelists */ | ||
430 | #ifdef CONFIG_DISCONTIGMEM | ||
431 | totalram_pages += numa_free_all_bootmem(); | ||
432 | tmp = 0; | ||
433 | /* should count reserved pages here for all nodes */ | ||
434 | #else | ||
435 | max_mapnr = end_pfn; | ||
436 | if (!mem_map) BUG(); | ||
437 | |||
438 | totalram_pages += free_all_bootmem(); | ||
439 | |||
440 | for (tmp = 0; tmp < end_pfn; tmp++) | ||
441 | /* | ||
442 | * Only count reserved RAM pages | ||
443 | */ | ||
444 | if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) | ||
445 | reservedpages++; | ||
446 | #endif | ||
447 | |||
448 | after_bootmem = 1; | ||
449 | |||
450 | codesize = (unsigned long) &_etext - (unsigned long) &_text; | ||
451 | datasize = (unsigned long) &_edata - (unsigned long) &_etext; | ||
452 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | ||
453 | |||
454 | /* Register memory areas for /proc/kcore */ | ||
455 | kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | ||
456 | kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | ||
457 | VMALLOC_END-VMALLOC_START); | ||
458 | kclist_add(&kcore_kernel, &_stext, _end - _stext); | ||
459 | kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); | ||
460 | kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, | ||
461 | VSYSCALL_END - VSYSCALL_START); | ||
462 | |||
463 | printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n", | ||
464 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | ||
465 | end_pfn << (PAGE_SHIFT-10), | ||
466 | codesize >> 10, | ||
467 | reservedpages << (PAGE_SHIFT-10), | ||
468 | datasize >> 10, | ||
469 | initsize >> 10); | ||
470 | |||
471 | /* | ||
472 | * Subtle. SMP is doing its boot stuff late (because it has to | ||
473 | * fork idle threads) - but it also needs low mappings for the | ||
474 | * protected-mode entry to work. We zap these entries only after | ||
475 | * the WP-bit has been tested. | ||
476 | */ | ||
477 | #ifndef CONFIG_SMP | ||
478 | zap_low_mappings(); | ||
479 | #endif | ||
480 | } | ||
481 | |||
482 | extern char __initdata_begin[], __initdata_end[]; | ||
483 | |||
484 | void free_initmem(void) | ||
485 | { | ||
486 | unsigned long addr; | ||
487 | |||
488 | addr = (unsigned long)(&__init_begin); | ||
489 | for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { | ||
490 | ClearPageReserved(virt_to_page(addr)); | ||
491 | set_page_count(virt_to_page(addr), 1); | ||
492 | memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); | ||
493 | free_page(addr); | ||
494 | totalram_pages++; | ||
495 | } | ||
496 | memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin); | ||
497 | printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10); | ||
498 | } | ||
499 | |||
500 | #ifdef CONFIG_BLK_DEV_INITRD | ||
501 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
502 | { | ||
503 | if (start < (unsigned long)&_end) | ||
504 | return; | ||
505 | printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); | ||
506 | for (; start < end; start += PAGE_SIZE) { | ||
507 | ClearPageReserved(virt_to_page(start)); | ||
508 | set_page_count(virt_to_page(start), 1); | ||
509 | free_page(start); | ||
510 | totalram_pages++; | ||
511 | } | ||
512 | } | ||
513 | #endif | ||
514 | |||
515 | void __init reserve_bootmem_generic(unsigned long phys, unsigned len) | ||
516 | { | ||
517 | /* Should check here against the e820 map to avoid double free */ | ||
518 | #ifdef CONFIG_DISCONTIGMEM | ||
519 | int nid = phys_to_nid(phys); | ||
520 | reserve_bootmem_node(NODE_DATA(nid), phys, len); | ||
521 | #else | ||
522 | reserve_bootmem(phys, len); | ||
523 | #endif | ||
524 | } | ||
525 | |||
526 | int kern_addr_valid(unsigned long addr) | ||
527 | { | ||
528 | unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; | ||
529 | pgd_t *pgd; | ||
530 | pud_t *pud; | ||
531 | pmd_t *pmd; | ||
532 | pte_t *pte; | ||
533 | |||
534 | if (above != 0 && above != -1UL) | ||
535 | return 0; | ||
536 | |||
537 | pgd = pgd_offset_k(addr); | ||
538 | if (pgd_none(*pgd)) | ||
539 | return 0; | ||
540 | |||
541 | pud = pud_offset(pgd, addr); | ||
542 | if (pud_none(*pud)) | ||
543 | return 0; | ||
544 | |||
545 | pmd = pmd_offset(pud, addr); | ||
546 | if (pmd_none(*pmd)) | ||
547 | return 0; | ||
548 | if (pmd_large(*pmd)) | ||
549 | return pfn_valid(pmd_pfn(*pmd)); | ||
550 | |||
551 | pte = pte_offset_kernel(pmd, addr); | ||
552 | if (pte_none(*pte)) | ||
553 | return 0; | ||
554 | return pfn_valid(pte_pfn(*pte)); | ||
555 | } | ||
556 | |||
557 | #ifdef CONFIG_SYSCTL | ||
558 | #include <linux/sysctl.h> | ||
559 | |||
560 | extern int exception_trace, page_fault_trace; | ||
561 | |||
562 | static ctl_table debug_table2[] = { | ||
563 | { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, | ||
564 | proc_dointvec }, | ||
565 | #ifdef CONFIG_CHECKING | ||
566 | { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL, | ||
567 | proc_dointvec }, | ||
568 | #endif | ||
569 | { 0, } | ||
570 | }; | ||
571 | |||
572 | static ctl_table debug_root_table2[] = { | ||
573 | { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, | ||
574 | .child = debug_table2 }, | ||
575 | { 0 }, | ||
576 | }; | ||
577 | |||
578 | static __init int x8664_sysctl_init(void) | ||
579 | { | ||
580 | register_sysctl_table(debug_root_table2, 1); | ||
581 | return 0; | ||
582 | } | ||
583 | __initcall(x8664_sysctl_init); | ||
584 | #endif | ||
585 | |||
586 | /* Pseudo VMAs to allow ptrace access for the vsyscall pages. x86-64 has two | ||
587 | different ones: one for 32bit and one for 64bit. Use the appropriate | ||
588 | one for the target task. */ | ||
589 | |||
/* Pseudo VMA covering [VSYSCALL_START, VSYSCALL_END), read-only. */
static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_END,
	.vm_page_prot = PAGE_READONLY
};

/* Pseudo VMA covering [VSYSCALL32_BASE, VSYSCALL32_END), read-only. */
static struct vm_area_struct gate32_vma = {
	.vm_start = VSYSCALL32_BASE,
	.vm_end = VSYSCALL32_END,
	.vm_page_prot = PAGE_READONLY
};
601 | |||
602 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | ||
603 | { | ||
604 | #ifdef CONFIG_IA32_EMULATION | ||
605 | if (test_tsk_thread_flag(tsk, TIF_IA32)) { | ||
606 | /* lookup code assumes the pages are present. set them up | ||
607 | now */ | ||
608 | if (__map_syscall32(tsk->mm, VSYSCALL32_BASE) < 0) | ||
609 | return NULL; | ||
610 | return &gate32_vma; | ||
611 | } | ||
612 | #endif | ||
613 | return &gate_vma; | ||
614 | } | ||
615 | |||
616 | int in_gate_area(struct task_struct *task, unsigned long addr) | ||
617 | { | ||
618 | struct vm_area_struct *vma = get_gate_vma(task); | ||
619 | return (addr >= vma->vm_start) && (addr < vma->vm_end); | ||
620 | } | ||
621 | |||
622 | /* Use this when you have no reliable task/vma, typically from interrupt | ||
623 | * context. It is less reliable than using the task's vma and may give | ||
624 | * false positives. | ||
625 | */ | ||
626 | int in_gate_area_no_task(unsigned long addr) | ||
627 | { | ||
628 | return (((addr >= VSYSCALL_START) && (addr < VSYSCALL_END)) || | ||
629 | ((addr >= VSYSCALL32_BASE) && (addr < VSYSCALL32_END))); | ||
630 | } | ||
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c new file mode 100644 index 000000000000..74ec8554b195 --- /dev/null +++ b/arch/x86_64/mm/ioremap.c | |||
@@ -0,0 +1,283 @@ | |||
1 | /* | ||
2 | * arch/x86_64/mm/ioremap.c | ||
3 | * | ||
4 | * Re-map IO memory to kernel address space so that we can access it. | ||
5 | * This is needed for high PCI addresses that aren't mapped in the | ||
6 | * 640k-1MB IO memory area on PC's | ||
7 | * | ||
8 | * (C) Copyright 1995 1996 Linus Torvalds | ||
9 | */ | ||
10 | |||
11 | #include <linux/vmalloc.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <asm/io.h> | ||
15 | #include <asm/pgalloc.h> | ||
16 | #include <asm/fixmap.h> | ||
17 | #include <asm/cacheflush.h> | ||
18 | #include <asm/tlbflush.h> | ||
19 | #include <asm/proto.h> | ||
20 | |||
21 | #define ISA_START_ADDRESS 0xa0000 | ||
22 | #define ISA_END_ADDRESS 0x100000 | ||
23 | |||
24 | static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, | ||
25 | unsigned long phys_addr, unsigned long flags) | ||
26 | { | ||
27 | unsigned long end; | ||
28 | unsigned long pfn; | ||
29 | |||
30 | address &= ~PMD_MASK; | ||
31 | end = address + size; | ||
32 | if (end > PMD_SIZE) | ||
33 | end = PMD_SIZE; | ||
34 | if (address >= end) | ||
35 | BUG(); | ||
36 | pfn = phys_addr >> PAGE_SHIFT; | ||
37 | do { | ||
38 | if (!pte_none(*pte)) { | ||
39 | printk("remap_area_pte: page already exists\n"); | ||
40 | BUG(); | ||
41 | } | ||
42 | set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | | ||
43 | _PAGE_GLOBAL | _PAGE_DIRTY | _PAGE_ACCESSED | flags))); | ||
44 | address += PAGE_SIZE; | ||
45 | pfn++; | ||
46 | pte++; | ||
47 | } while (address && (address < end)); | ||
48 | } | ||
49 | |||
50 | static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, | ||
51 | unsigned long phys_addr, unsigned long flags) | ||
52 | { | ||
53 | unsigned long end; | ||
54 | |||
55 | address &= ~PUD_MASK; | ||
56 | end = address + size; | ||
57 | if (end > PUD_SIZE) | ||
58 | end = PUD_SIZE; | ||
59 | phys_addr -= address; | ||
60 | if (address >= end) | ||
61 | BUG(); | ||
62 | do { | ||
63 | pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); | ||
64 | if (!pte) | ||
65 | return -ENOMEM; | ||
66 | remap_area_pte(pte, address, end - address, address + phys_addr, flags); | ||
67 | address = (address + PMD_SIZE) & PMD_MASK; | ||
68 | pmd++; | ||
69 | } while (address && (address < end)); | ||
70 | return 0; | ||
71 | } | ||
72 | |||
73 | static inline int remap_area_pud(pud_t * pud, unsigned long address, unsigned long size, | ||
74 | unsigned long phys_addr, unsigned long flags) | ||
75 | { | ||
76 | unsigned long end; | ||
77 | |||
78 | address &= ~PGDIR_MASK; | ||
79 | end = address + size; | ||
80 | if (end > PGDIR_SIZE) | ||
81 | end = PGDIR_SIZE; | ||
82 | phys_addr -= address; | ||
83 | if (address >= end) | ||
84 | BUG(); | ||
85 | do { | ||
86 | pmd_t * pmd = pmd_alloc(&init_mm, pud, address); | ||
87 | if (!pmd) | ||
88 | return -ENOMEM; | ||
89 | remap_area_pmd(pmd, address, end - address, address + phys_addr, flags); | ||
90 | address = (address + PUD_SIZE) & PUD_MASK; | ||
91 | pud++; | ||
92 | } while (address && (address < end)); | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | static int remap_area_pages(unsigned long address, unsigned long phys_addr, | ||
97 | unsigned long size, unsigned long flags) | ||
98 | { | ||
99 | int error; | ||
100 | pgd_t *pgd; | ||
101 | unsigned long end = address + size; | ||
102 | |||
103 | phys_addr -= address; | ||
104 | pgd = pgd_offset_k(address); | ||
105 | flush_cache_all(); | ||
106 | if (address >= end) | ||
107 | BUG(); | ||
108 | spin_lock(&init_mm.page_table_lock); | ||
109 | do { | ||
110 | pud_t *pud; | ||
111 | pud = pud_alloc(&init_mm, pgd, address); | ||
112 | error = -ENOMEM; | ||
113 | if (!pud) | ||
114 | break; | ||
115 | if (remap_area_pud(pud, address, end - address, | ||
116 | phys_addr + address, flags)) | ||
117 | break; | ||
118 | error = 0; | ||
119 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | ||
120 | pgd++; | ||
121 | } while (address && (address < end)); | ||
122 | spin_unlock(&init_mm.page_table_lock); | ||
123 | flush_tlb_all(); | ||
124 | return error; | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * Fix up the linear direct mapping of the kernel to avoid cache attribute | ||
129 | * conflicts. | ||
130 | */ | ||
131 | static int | ||
132 | ioremap_change_attr(unsigned long phys_addr, unsigned long size, | ||
133 | unsigned long flags) | ||
134 | { | ||
135 | int err = 0; | ||
136 | if (flags && phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { | ||
137 | unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
138 | unsigned long vaddr = (unsigned long) __va(phys_addr); | ||
139 | |||
140 | /* | ||
141 | * Must use a address here and not struct page because the phys addr | ||
142 | * can be a in hole between nodes and not have an memmap entry. | ||
143 | */ | ||
144 | err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags)); | ||
145 | if (!err) | ||
146 | global_flush_tlb(); | ||
147 | } | ||
148 | return err; | ||
149 | } | ||
150 | |||
151 | /* | ||
152 | * Generic mapping function | ||
153 | */ | ||
154 | |||
155 | /* | ||
156 | * Remap an arbitrary physical address space into the kernel virtual | ||
157 | * address space. Needed when the kernel wants to access high addresses | ||
158 | * directly. | ||
159 | * | ||
160 | * NOTE! We need to allow non-page-aligned mappings too: we will obviously | ||
161 | * have to convert them into an offset in a page-aligned mapping, but the | ||
162 | * caller shouldn't need to know that small detail. | ||
163 | */ | ||
164 | void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) | ||
165 | { | ||
166 | void * addr; | ||
167 | struct vm_struct * area; | ||
168 | unsigned long offset, last_addr; | ||
169 | |||
170 | /* Don't allow wraparound or zero size */ | ||
171 | last_addr = phys_addr + size - 1; | ||
172 | if (!size || last_addr < phys_addr) | ||
173 | return NULL; | ||
174 | |||
175 | /* | ||
176 | * Don't remap the low PCI/ISA area, it's always mapped.. | ||
177 | */ | ||
178 | if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | ||
179 | return (__force void __iomem *)phys_to_virt(phys_addr); | ||
180 | |||
181 | #ifndef CONFIG_DISCONTIGMEM | ||
182 | /* | ||
183 | * Don't allow anybody to remap normal RAM that we're using.. | ||
184 | */ | ||
185 | if (last_addr < virt_to_phys(high_memory)) { | ||
186 | char *t_addr, *t_end; | ||
187 | struct page *page; | ||
188 | |||
189 | t_addr = __va(phys_addr); | ||
190 | t_end = t_addr + (size - 1); | ||
191 | |||
192 | for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) | ||
193 | if(!PageReserved(page)) | ||
194 | return NULL; | ||
195 | } | ||
196 | #endif | ||
197 | |||
198 | /* | ||
199 | * Mappings have to be page-aligned | ||
200 | */ | ||
201 | offset = phys_addr & ~PAGE_MASK; | ||
202 | phys_addr &= PAGE_MASK; | ||
203 | size = PAGE_ALIGN(last_addr+1) - phys_addr; | ||
204 | |||
205 | /* | ||
206 | * Ok, go for it.. | ||
207 | */ | ||
208 | area = get_vm_area(size, VM_IOREMAP | (flags << 20)); | ||
209 | if (!area) | ||
210 | return NULL; | ||
211 | area->phys_addr = phys_addr; | ||
212 | addr = area->addr; | ||
213 | if (remap_area_pages((unsigned long) addr, phys_addr, size, flags)) { | ||
214 | remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); | ||
215 | return NULL; | ||
216 | } | ||
217 | if (ioremap_change_attr(phys_addr, size, flags) < 0) { | ||
218 | area->flags &= 0xffffff; | ||
219 | vunmap(addr); | ||
220 | return NULL; | ||
221 | } | ||
222 | return (__force void __iomem *) (offset + (char *)addr); | ||
223 | } | ||
224 | |||
/**
 * ioremap_nocache - map bus memory into CPU space
 * @phys_addr: bus address of the memory
 * @size: size of the resource to map
 *
 * ioremap_nocache performs a platform specific sequence of operations to
 * make bus memory CPU accessible via the readb/readw/readl/writeb/
 * writew/writel functions and the other mmio helpers. The returned
 * address is not guaranteed to be usable directly as a virtual
 * address.
 *
 * This version of ioremap ensures that the memory is marked uncachable
 * on the CPU as well as honouring existing caching rules from things like
 * the PCI bus. Note that there are other caches and buffers on many
 * busses. In particular driver authors should read up on PCI writes.
 *
 * It's useful if some control registers are in such an area and
 * write combining or read caching is not desirable.
 *
 * Must be freed with iounmap.
 */

void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
{
	/* _PAGE_PCD is passed as the extra protection flag for the mapping. */
	return __ioremap(phys_addr, size, _PAGE_PCD);
}
251 | |||
252 | void iounmap(volatile void __iomem *addr) | ||
253 | { | ||
254 | struct vm_struct *p, **pprev; | ||
255 | |||
256 | if (addr <= high_memory) | ||
257 | return; | ||
258 | if (addr >= phys_to_virt(ISA_START_ADDRESS) && | ||
259 | addr < phys_to_virt(ISA_END_ADDRESS)) | ||
260 | return; | ||
261 | |||
262 | write_lock(&vmlist_lock); | ||
263 | for (p = vmlist, pprev = &vmlist; p != NULL; pprev = &p->next, p = *pprev) | ||
264 | if (p->addr == (void *)(PAGE_MASK & (unsigned long)addr)) | ||
265 | break; | ||
266 | if (!p) { | ||
267 | printk("__iounmap: bad address %p\n", addr); | ||
268 | goto out_unlock; | ||
269 | } | ||
270 | *pprev = p->next; | ||
271 | unmap_vm_area(p); | ||
272 | if ((p->flags >> 20) && | ||
273 | p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) { | ||
274 | /* p->size includes the guard page, but cpa doesn't like that */ | ||
275 | change_page_attr(virt_to_page(__va(p->phys_addr)), | ||
276 | p->size >> PAGE_SHIFT, | ||
277 | PAGE_KERNEL); | ||
278 | global_flush_tlb(); | ||
279 | } | ||
280 | out_unlock: | ||
281 | write_unlock(&vmlist_lock); | ||
282 | kfree(p); | ||
283 | } | ||
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c new file mode 100644 index 000000000000..ec35747aacd7 --- /dev/null +++ b/arch/x86_64/mm/k8topology.c | |||
@@ -0,0 +1,168 @@ | |||
1 | /* | ||
2 | * AMD K8 NUMA support. | ||
3 | * Discover the memory map and associated nodes. | ||
4 | * | ||
5 | * This version reads it directly from the K8 northbridge. | ||
6 | * | ||
7 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
8 | */ | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/string.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/nodemask.h> | ||
14 | #include <asm/io.h> | ||
15 | #include <linux/pci_ids.h> | ||
16 | #include <asm/types.h> | ||
17 | #include <asm/mmzone.h> | ||
18 | #include <asm/proto.h> | ||
19 | #include <asm/e820.h> | ||
20 | #include <asm/pci-direct.h> | ||
21 | #include <asm/numa.h> | ||
22 | |||
23 | static __init int find_northbridge(void) | ||
24 | { | ||
25 | int num; | ||
26 | |||
27 | for (num = 0; num < 32; num++) { | ||
28 | u32 header; | ||
29 | |||
30 | header = read_pci_config(0, num, 0, 0x00); | ||
31 | if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) | ||
32 | continue; | ||
33 | |||
34 | header = read_pci_config(0, num, 1, 0x00); | ||
35 | if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) | ||
36 | continue; | ||
37 | return num; | ||
38 | } | ||
39 | |||
40 | return -1; | ||
41 | } | ||
42 | |||
43 | int __init k8_scan_nodes(unsigned long start, unsigned long end) | ||
44 | { | ||
45 | unsigned long prevbase; | ||
46 | struct node nodes[8]; | ||
47 | int nodeid, i, nb; | ||
48 | int found = 0; | ||
49 | u32 reg; | ||
50 | unsigned numnodes; | ||
51 | nodemask_t nodes_parsed; | ||
52 | |||
53 | nodes_clear(nodes_parsed); | ||
54 | |||
55 | nb = find_northbridge(); | ||
56 | if (nb < 0) | ||
57 | return nb; | ||
58 | |||
59 | printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); | ||
60 | |||
61 | reg = read_pci_config(0, nb, 0, 0x60); | ||
62 | numnodes = ((reg >> 4) & 0xF) + 1; | ||
63 | |||
64 | printk(KERN_INFO "Number of nodes %d\n", numnodes); | ||
65 | |||
66 | memset(&nodes,0,sizeof(nodes)); | ||
67 | prevbase = 0; | ||
68 | for (i = 0; i < 8; i++) { | ||
69 | unsigned long base,limit; | ||
70 | |||
71 | base = read_pci_config(0, nb, 1, 0x40 + i*8); | ||
72 | limit = read_pci_config(0, nb, 1, 0x44 + i*8); | ||
73 | |||
74 | nodeid = limit & 7; | ||
75 | if ((base & 3) == 0) { | ||
76 | if (i < numnodes) | ||
77 | printk("Skipping disabled node %d\n", i); | ||
78 | continue; | ||
79 | } | ||
80 | if (nodeid >= numnodes) { | ||
81 | printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, | ||
82 | base, limit); | ||
83 | continue; | ||
84 | } | ||
85 | |||
86 | if (!limit) { | ||
87 | printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, | ||
88 | base); | ||
89 | continue; | ||
90 | } | ||
91 | if ((base >> 8) & 3 || (limit >> 8) & 3) { | ||
92 | printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", | ||
93 | nodeid, (base>>8)&3, (limit>>8) & 3); | ||
94 | return -1; | ||
95 | } | ||
96 | if (node_isset(nodeid, nodes_parsed)) { | ||
97 | printk(KERN_INFO "Node %d already present. Skipping\n", | ||
98 | nodeid); | ||
99 | continue; | ||
100 | } | ||
101 | |||
102 | limit >>= 16; | ||
103 | limit <<= 24; | ||
104 | limit |= (1<<24)-1; | ||
105 | |||
106 | if (limit > end_pfn << PAGE_SHIFT) | ||
107 | limit = end_pfn << PAGE_SHIFT; | ||
108 | if (limit <= base) | ||
109 | continue; | ||
110 | |||
111 | base >>= 16; | ||
112 | base <<= 24; | ||
113 | |||
114 | if (base < start) | ||
115 | base = start; | ||
116 | if (limit > end) | ||
117 | limit = end; | ||
118 | if (limit == base) { | ||
119 | printk(KERN_ERR "Empty node %d\n", nodeid); | ||
120 | continue; | ||
121 | } | ||
122 | if (limit < base) { | ||
123 | printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", | ||
124 | nodeid, base, limit); | ||
125 | continue; | ||
126 | } | ||
127 | |||
128 | /* Could sort here, but pun for now. Should not happen anyroads. */ | ||
129 | if (prevbase > base) { | ||
130 | printk(KERN_ERR "Node map not sorted %lx,%lx\n", | ||
131 | prevbase,base); | ||
132 | return -1; | ||
133 | } | ||
134 | |||
135 | printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", | ||
136 | nodeid, base, limit); | ||
137 | |||
138 | found++; | ||
139 | |||
140 | nodes[nodeid].start = base; | ||
141 | nodes[nodeid].end = limit; | ||
142 | |||
143 | prevbase = base; | ||
144 | |||
145 | node_set(nodeid, nodes_parsed); | ||
146 | } | ||
147 | |||
148 | if (!found) | ||
149 | return -1; | ||
150 | |||
151 | memnode_shift = compute_hash_shift(nodes, numnodes); | ||
152 | if (memnode_shift < 0) { | ||
153 | printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); | ||
154 | return -1; | ||
155 | } | ||
156 | printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); | ||
157 | |||
158 | for (i = 0; i < 8; i++) { | ||
159 | if (nodes[i].start != nodes[i].end) { | ||
160 | /* assume 1:1 NODE:CPU */ | ||
161 | cpu_to_node[i] = i; | ||
162 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
163 | } | ||
164 | } | ||
165 | |||
166 | numa_init_array(); | ||
167 | return 0; | ||
168 | } | ||
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c new file mode 100644 index 000000000000..fd9f25d7a6c4 --- /dev/null +++ b/arch/x86_64/mm/numa.c | |||
@@ -0,0 +1,294 @@ | |||
1 | /* | ||
2 | * Generic VM initialization for x86-64 NUMA setups. | ||
3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | */ | ||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/bootmem.h> | ||
10 | #include <linux/mmzone.h> | ||
11 | #include <linux/ctype.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/nodemask.h> | ||
14 | |||
15 | #include <asm/e820.h> | ||
16 | #include <asm/proto.h> | ||
17 | #include <asm/dma.h> | ||
18 | #include <asm/numa.h> | ||
19 | #include <asm/acpi.h> | ||
20 | |||
21 | #ifndef Dprintk | ||
22 | #define Dprintk(x...) | ||
23 | #endif | ||
24 | |||
25 | struct pglist_data *node_data[MAX_NUMNODES]; | ||
26 | bootmem_data_t plat_node_bdata[MAX_NUMNODES]; | ||
27 | |||
28 | int memnode_shift; | ||
29 | u8 memnodemap[NODEMAPSIZE]; | ||
30 | |||
31 | unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; | ||
32 | cpumask_t node_to_cpumask[MAX_NUMNODES]; | ||
33 | |||
34 | int numa_off __initdata; | ||
35 | |||
36 | int __init compute_hash_shift(struct node *nodes, int numnodes) | ||
37 | { | ||
38 | int i; | ||
39 | int shift = 24; | ||
40 | u64 addr; | ||
41 | |||
42 | /* When in doubt use brute force. */ | ||
43 | while (shift < 48) { | ||
44 | memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); | ||
45 | for (i = 0; i < numnodes; i++) { | ||
46 | if (nodes[i].start == nodes[i].end) | ||
47 | continue; | ||
48 | for (addr = nodes[i].start; | ||
49 | addr < nodes[i].end; | ||
50 | addr += (1UL << shift)) { | ||
51 | if (memnodemap[addr >> shift] != 0xff && | ||
52 | memnodemap[addr >> shift] != i) { | ||
53 | printk(KERN_INFO | ||
54 | "node %d shift %d addr %Lx conflict %d\n", | ||
55 | i, shift, addr, memnodemap[addr>>shift]); | ||
56 | goto next; | ||
57 | } | ||
58 | memnodemap[addr >> shift] = i; | ||
59 | } | ||
60 | } | ||
61 | return shift; | ||
62 | next: | ||
63 | shift++; | ||
64 | } | ||
65 | memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE); | ||
66 | return -1; | ||
67 | } | ||
68 | |||
69 | /* Initialize bootmem allocator for a node */ | ||
70 | void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | ||
71 | { | ||
72 | unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; | ||
73 | unsigned long nodedata_phys; | ||
74 | const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); | ||
75 | |||
76 | start = round_up(start, ZONE_ALIGN); | ||
77 | |||
78 | printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); | ||
79 | |||
80 | start_pfn = start >> PAGE_SHIFT; | ||
81 | end_pfn = end >> PAGE_SHIFT; | ||
82 | |||
83 | nodedata_phys = find_e820_area(start, end, pgdat_size); | ||
84 | if (nodedata_phys == -1L) | ||
85 | panic("Cannot find memory pgdat in node %d\n", nodeid); | ||
86 | |||
87 | Dprintk("nodedata_phys %lx\n", nodedata_phys); | ||
88 | |||
89 | node_data[nodeid] = phys_to_virt(nodedata_phys); | ||
90 | memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); | ||
91 | NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; | ||
92 | NODE_DATA(nodeid)->node_start_pfn = start_pfn; | ||
93 | NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; | ||
94 | |||
95 | /* Find a place for the bootmem map */ | ||
96 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); | ||
97 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); | ||
98 | bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT); | ||
99 | if (bootmap_start == -1L) | ||
100 | panic("Not enough continuous space for bootmap on node %d", nodeid); | ||
101 | Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); | ||
102 | |||
103 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | ||
104 | bootmap_start >> PAGE_SHIFT, | ||
105 | start_pfn, end_pfn); | ||
106 | |||
107 | e820_bootmem_free(NODE_DATA(nodeid), start, end); | ||
108 | |||
109 | reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); | ||
110 | reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); | ||
111 | node_set_online(nodeid); | ||
112 | } | ||
113 | |||
114 | /* Initialize final allocator for a zone */ | ||
115 | void __init setup_node_zones(int nodeid) | ||
116 | { | ||
117 | unsigned long start_pfn, end_pfn; | ||
118 | unsigned long zones[MAX_NR_ZONES]; | ||
119 | unsigned long dma_end_pfn; | ||
120 | |||
121 | memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); | ||
122 | |||
123 | start_pfn = node_start_pfn(nodeid); | ||
124 | end_pfn = node_end_pfn(nodeid); | ||
125 | |||
126 | Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn); | ||
127 | |||
128 | /* All nodes > 0 have a zero length zone DMA */ | ||
129 | dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; | ||
130 | if (start_pfn < dma_end_pfn) { | ||
131 | zones[ZONE_DMA] = dma_end_pfn - start_pfn; | ||
132 | zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; | ||
133 | } else { | ||
134 | zones[ZONE_NORMAL] = end_pfn - start_pfn; | ||
135 | } | ||
136 | |||
137 | free_area_init_node(nodeid, NODE_DATA(nodeid), zones, | ||
138 | start_pfn, NULL); | ||
139 | } | ||
140 | |||
141 | void __init numa_init_array(void) | ||
142 | { | ||
143 | int rr, i; | ||
144 | /* There are unfortunately some poorly designed mainboards around | ||
145 | that only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
146 | mapping. To avoid this fill in the mapping for all possible | ||
147 | CPUs, as the number of CPUs is not known yet. | ||
148 | We round robin the existing nodes. */ | ||
149 | rr = 0; | ||
150 | for (i = 0; i < NR_CPUS; i++) { | ||
151 | if (cpu_to_node[i] != NUMA_NO_NODE) | ||
152 | continue; | ||
153 | rr = next_node(rr, node_online_map); | ||
154 | if (rr == MAX_NUMNODES) | ||
155 | rr = first_node(node_online_map); | ||
156 | cpu_to_node[i] = rr; | ||
157 | rr++; | ||
158 | } | ||
159 | |||
160 | set_bit(0, &node_to_cpumask[cpu_to_node(0)]); | ||
161 | } | ||
162 | |||
163 | #ifdef CONFIG_NUMA_EMU | ||
164 | int numa_fake __initdata = 0; | ||
165 | |||
166 | /* Numa emulation */ | ||
167 | static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | ||
168 | { | ||
169 | int i; | ||
170 | struct node nodes[MAX_NUMNODES]; | ||
171 | unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; | ||
172 | |||
173 | /* Kludge needed for the hash function */ | ||
174 | if (hweight64(sz) > 1) { | ||
175 | unsigned long x = 1; | ||
176 | while ((x << 1) < sz) | ||
177 | x <<= 1; | ||
178 | if (x < sz/2) | ||
179 | printk("Numa emulation unbalanced. Complain to maintainer\n"); | ||
180 | sz = x; | ||
181 | } | ||
182 | |||
183 | memset(&nodes,0,sizeof(nodes)); | ||
184 | for (i = 0; i < numa_fake; i++) { | ||
185 | nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; | ||
186 | if (i == numa_fake-1) | ||
187 | sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; | ||
188 | nodes[i].end = nodes[i].start + sz; | ||
189 | if (i != numa_fake-1) | ||
190 | nodes[i].end--; | ||
191 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", | ||
192 | i, | ||
193 | nodes[i].start, nodes[i].end, | ||
194 | (nodes[i].end - nodes[i].start) >> 20); | ||
195 | node_set_online(i); | ||
196 | } | ||
197 | memnode_shift = compute_hash_shift(nodes, numa_fake); | ||
198 | if (memnode_shift < 0) { | ||
199 | memnode_shift = 0; | ||
200 | printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); | ||
201 | return -1; | ||
202 | } | ||
203 | for_each_online_node(i) | ||
204 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
205 | numa_init_array(); | ||
206 | return 0; | ||
207 | } | ||
208 | #endif | ||
209 | |||
210 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | ||
211 | { | ||
212 | int i; | ||
213 | |||
214 | #ifdef CONFIG_NUMA_EMU | ||
215 | if (numa_fake && !numa_emulation(start_pfn, end_pfn)) | ||
216 | return; | ||
217 | #endif | ||
218 | |||
219 | #ifdef CONFIG_ACPI_NUMA | ||
220 | if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, | ||
221 | end_pfn << PAGE_SHIFT)) | ||
222 | return; | ||
223 | #endif | ||
224 | |||
225 | #ifdef CONFIG_K8_NUMA | ||
226 | if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) | ||
227 | return; | ||
228 | #endif | ||
229 | printk(KERN_INFO "%s\n", | ||
230 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | ||
231 | |||
232 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | ||
233 | start_pfn << PAGE_SHIFT, | ||
234 | end_pfn << PAGE_SHIFT); | ||
235 | /* setup dummy node covering all memory */ | ||
236 | memnode_shift = 63; | ||
237 | memnodemap[0] = 0; | ||
238 | nodes_clear(node_online_map); | ||
239 | node_set_online(0); | ||
240 | for (i = 0; i < NR_CPUS; i++) | ||
241 | cpu_to_node[i] = 0; | ||
242 | node_to_cpumask[0] = cpumask_of_cpu(0); | ||
243 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); | ||
244 | } | ||
245 | |||
246 | __init void numa_add_cpu(int cpu) | ||
247 | { | ||
248 | /* BP is initialized elsewhere */ | ||
249 | if (cpu) | ||
250 | set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); | ||
251 | } | ||
252 | |||
253 | unsigned long __init numa_free_all_bootmem(void) | ||
254 | { | ||
255 | int i; | ||
256 | unsigned long pages = 0; | ||
257 | for_each_online_node(i) { | ||
258 | pages += free_all_bootmem_node(NODE_DATA(i)); | ||
259 | } | ||
260 | return pages; | ||
261 | } | ||
262 | |||
263 | void __init paging_init(void) | ||
264 | { | ||
265 | int i; | ||
266 | for_each_online_node(i) { | ||
267 | setup_node_zones(i); | ||
268 | } | ||
269 | } | ||
270 | |||
271 | /* [numa=off] */ | ||
272 | __init int numa_setup(char *opt) | ||
273 | { | ||
274 | if (!strncmp(opt,"off",3)) | ||
275 | numa_off = 1; | ||
276 | #ifdef CONFIG_NUMA_EMU | ||
277 | if(!strncmp(opt, "fake=", 5)) { | ||
278 | numa_fake = simple_strtoul(opt+5,NULL,0); ; | ||
279 | if (numa_fake >= MAX_NUMNODES) | ||
280 | numa_fake = MAX_NUMNODES; | ||
281 | } | ||
282 | #endif | ||
283 | #ifdef CONFIG_ACPI_NUMA | ||
284 | if (!strncmp(opt,"noacpi",6)) | ||
285 | acpi_numa = -1; | ||
286 | #endif | ||
287 | return 1; | ||
288 | } | ||
289 | |||
290 | EXPORT_SYMBOL(cpu_to_node); | ||
291 | EXPORT_SYMBOL(node_to_cpumask); | ||
292 | EXPORT_SYMBOL(memnode_shift); | ||
293 | EXPORT_SYMBOL(memnodemap); | ||
294 | EXPORT_SYMBOL(node_data); | ||
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c new file mode 100644 index 000000000000..94862e1ec032 --- /dev/null +++ b/arch/x86_64/mm/pageattr.c | |||
@@ -0,0 +1,235 @@ | |||
1 | /* | ||
2 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
3 | * Thanks to Ben LaHaise for precious feedback. | ||
4 | */ | ||
5 | |||
6 | #include <linux/config.h> | ||
7 | #include <linux/mm.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/highmem.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <asm/uaccess.h> | ||
13 | #include <asm/processor.h> | ||
14 | #include <asm/tlbflush.h> | ||
15 | #include <asm/io.h> | ||
16 | |||
17 | static inline pte_t *lookup_address(unsigned long address) | ||
18 | { | ||
19 | pgd_t *pgd = pgd_offset_k(address); | ||
20 | pud_t *pud; | ||
21 | pmd_t *pmd; | ||
22 | pte_t *pte; | ||
23 | if (pgd_none(*pgd)) | ||
24 | return NULL; | ||
25 | pud = pud_offset(pgd, address); | ||
26 | if (!pud_present(*pud)) | ||
27 | return NULL; | ||
28 | pmd = pmd_offset(pud, address); | ||
29 | if (!pmd_present(*pmd)) | ||
30 | return NULL; | ||
31 | if (pmd_large(*pmd)) | ||
32 | return (pte_t *)pmd; | ||
33 | pte = pte_offset_kernel(pmd, address); | ||
34 | if (pte && !pte_present(*pte)) | ||
35 | pte = NULL; | ||
36 | return pte; | ||
37 | } | ||
38 | |||
39 | static struct page *split_large_page(unsigned long address, pgprot_t prot, | ||
40 | pgprot_t ref_prot) | ||
41 | { | ||
42 | int i; | ||
43 | unsigned long addr; | ||
44 | struct page *base = alloc_pages(GFP_KERNEL, 0); | ||
45 | pte_t *pbase; | ||
46 | if (!base) | ||
47 | return NULL; | ||
48 | address = __pa(address); | ||
49 | addr = address & LARGE_PAGE_MASK; | ||
50 | pbase = (pte_t *)page_address(base); | ||
51 | for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { | ||
52 | pbase[i] = pfn_pte(addr >> PAGE_SHIFT, | ||
53 | addr == address ? prot : ref_prot); | ||
54 | } | ||
55 | return base; | ||
56 | } | ||
57 | |||
58 | |||
59 | static void flush_kernel_map(void *address) | ||
60 | { | ||
61 | if (0 && address && cpu_has_clflush) { | ||
62 | /* is this worth it? */ | ||
63 | int i; | ||
64 | for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) | ||
65 | asm volatile("clflush (%0)" :: "r" (address + i)); | ||
66 | } else | ||
67 | asm volatile("wbinvd":::"memory"); | ||
68 | if (address) | ||
69 | __flush_tlb_one(address); | ||
70 | else | ||
71 | __flush_tlb_all(); | ||
72 | } | ||
73 | |||
74 | |||
75 | static inline void flush_map(unsigned long address) | ||
76 | { | ||
77 | on_each_cpu(flush_kernel_map, (void *)address, 1, 1); | ||
78 | } | ||
79 | |||
/*
 * A page queued by save_page() and freed by global_flush_tlb() once the
 * flush has been done.
 */
struct deferred_page {
	struct deferred_page *next;	/* singly linked list */
	struct page *fpage;		/* page to free */
	unsigned long address;		/* address passed to save_page() */
};

/* Head of the deferred list. */
static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
86 | |||
87 | static inline void save_page(unsigned long address, struct page *fpage) | ||
88 | { | ||
89 | struct deferred_page *df; | ||
90 | df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); | ||
91 | if (!df) { | ||
92 | flush_map(address); | ||
93 | __free_page(fpage); | ||
94 | } else { | ||
95 | df->next = df_list; | ||
96 | df->fpage = fpage; | ||
97 | df->address = address; | ||
98 | df_list = df; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | /* | ||
103 | * No more special protections in this 2/4MB area - revert to a | ||
104 | * large page again. | ||
105 | */ | ||
106 | static void revert_page(unsigned long address, pgprot_t ref_prot) | ||
107 | { | ||
108 | pgd_t *pgd; | ||
109 | pud_t *pud; | ||
110 | pmd_t *pmd; | ||
111 | pte_t large_pte; | ||
112 | |||
113 | pgd = pgd_offset_k(address); | ||
114 | BUG_ON(pgd_none(*pgd)); | ||
115 | pud = pud_offset(pgd,address); | ||
116 | BUG_ON(pud_none(*pud)); | ||
117 | pmd = pmd_offset(pud, address); | ||
118 | BUG_ON(pmd_val(*pmd) & _PAGE_PSE); | ||
119 | pgprot_val(ref_prot) |= _PAGE_PSE; | ||
120 | large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); | ||
121 | set_pte((pte_t *)pmd, large_pte); | ||
122 | } | ||
123 | |||
124 | static int | ||
125 | __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, | ||
126 | pgprot_t ref_prot) | ||
127 | { | ||
128 | pte_t *kpte; | ||
129 | struct page *kpte_page; | ||
130 | unsigned kpte_flags; | ||
131 | kpte = lookup_address(address); | ||
132 | if (!kpte) return 0; | ||
133 | kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); | ||
134 | kpte_flags = pte_val(*kpte); | ||
135 | if (pgprot_val(prot) != pgprot_val(ref_prot)) { | ||
136 | if ((kpte_flags & _PAGE_PSE) == 0) { | ||
137 | set_pte(kpte, pfn_pte(pfn, prot)); | ||
138 | } else { | ||
139 | /* | ||
140 | * split_large_page will take the reference for this change_page_attr | ||
141 | * on the split page. | ||
142 | */ | ||
143 | struct page *split = split_large_page(address, prot, ref_prot); | ||
144 | if (!split) | ||
145 | return -ENOMEM; | ||
146 | set_pte(kpte,mk_pte(split, ref_prot)); | ||
147 | kpte_page = split; | ||
148 | } | ||
149 | get_page(kpte_page); | ||
150 | } else if ((kpte_flags & _PAGE_PSE) == 0) { | ||
151 | set_pte(kpte, pfn_pte(pfn, ref_prot)); | ||
152 | __put_page(kpte_page); | ||
153 | } else | ||
154 | BUG(); | ||
155 | |||
156 | /* on x86-64 the direct mapping set at boot is not using 4k pages */ | ||
157 | BUG_ON(PageReserved(kpte_page)); | ||
158 | |||
159 | switch (page_count(kpte_page)) { | ||
160 | case 1: | ||
161 | save_page(address, kpte_page); | ||
162 | revert_page(address, ref_prot); | ||
163 | break; | ||
164 | case 0: | ||
165 | BUG(); /* memleak and failed 2M page regeneration */ | ||
166 | } | ||
167 | return 0; | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * Change the page attributes of an page in the linear mapping. | ||
172 | * | ||
173 | * This should be used when a page is mapped with a different caching policy | ||
174 | * than write-back somewhere - some CPUs do not like it when mappings with | ||
175 | * different caching policies exist. This changes the page attributes of the | ||
176 | * in kernel linear mapping too. | ||
177 | * | ||
178 | * The caller needs to ensure that there are no conflicting mappings elsewhere. | ||
179 | * This function only deals with the kernel linear map. | ||
180 | * | ||
181 | * Caller must call global_flush_tlb() after this. | ||
182 | */ | ||
183 | int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) | ||
184 | { | ||
185 | int err = 0; | ||
186 | int i; | ||
187 | |||
188 | down_write(&init_mm.mmap_sem); | ||
189 | for (i = 0; i < numpages; i++, address += PAGE_SIZE) { | ||
190 | unsigned long pfn = __pa(address) >> PAGE_SHIFT; | ||
191 | |||
192 | err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); | ||
193 | if (err) | ||
194 | break; | ||
195 | /* Handle kernel mapping too which aliases part of the | ||
196 | * lowmem */ | ||
197 | if (__pa(address) < KERNEL_TEXT_SIZE) { | ||
198 | unsigned long addr2; | ||
199 | pgprot_t prot2 = prot; | ||
200 | addr2 = __START_KERNEL_map + __pa(address); | ||
201 | pgprot_val(prot2) &= ~_PAGE_NX; | ||
202 | err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); | ||
203 | } | ||
204 | } | ||
205 | up_write(&init_mm.mmap_sem); | ||
206 | return err; | ||
207 | } | ||
208 | |||
209 | /* Don't call this for MMIO areas that may not have a mem_map entry */ | ||
210 | int change_page_attr(struct page *page, int numpages, pgprot_t prot) | ||
211 | { | ||
212 | unsigned long addr = (unsigned long)page_address(page); | ||
213 | return change_page_attr_addr(addr, numpages, prot); | ||
214 | } | ||
215 | |||
216 | void global_flush_tlb(void) | ||
217 | { | ||
218 | struct deferred_page *df, *next_df; | ||
219 | |||
220 | down_read(&init_mm.mmap_sem); | ||
221 | df = xchg(&df_list, NULL); | ||
222 | up_read(&init_mm.mmap_sem); | ||
223 | if (!df) | ||
224 | return; | ||
225 | flush_map((df && !df->next) ? df->address : 0); | ||
226 | for (; df; df = next_df) { | ||
227 | next_df = df->next; | ||
228 | if (df->fpage) | ||
229 | __free_page(df->fpage); | ||
230 | kfree(df); | ||
231 | } | ||
232 | } | ||
233 | |||
234 | EXPORT_SYMBOL(change_page_attr); | ||
235 | EXPORT_SYMBOL(global_flush_tlb); | ||
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c new file mode 100644 index 000000000000..5d01b31472e1 --- /dev/null +++ b/arch/x86_64/mm/srat.c | |||
@@ -0,0 +1,217 @@ | |||
1 | /* | ||
2 | * ACPI 3.0 based NUMA setup | ||
3 | * Copyright 2004 Andi Kleen, SuSE Labs. | ||
4 | * | ||
5 | * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. | ||
6 | * | ||
7 | * Called from acpi_numa_init while reading the SRAT and SLIT tables. | ||
8 | * Assumes all memory regions belonging to a single proximity domain | ||
9 | * are in one chunk. Holes between them will be included in the node. | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/acpi.h> | ||
14 | #include <linux/mmzone.h> | ||
15 | #include <linux/bitmap.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/topology.h> | ||
18 | #include <asm/proto.h> | ||
19 | #include <asm/numa.h> | ||
20 | |||
21 | static struct acpi_table_slit *acpi_slit; | ||
22 | |||
23 | static nodemask_t nodes_parsed __initdata; | ||
24 | static nodemask_t nodes_found __initdata; | ||
25 | static struct node nodes[MAX_NUMNODES] __initdata; | ||
26 | static __u8 pxm2node[256] = { [0 ... 255] = 0xff }; | ||
27 | |||
28 | static __init int setup_node(int pxm) | ||
29 | { | ||
30 | unsigned node = pxm2node[pxm]; | ||
31 | if (node == 0xff) { | ||
32 | if (nodes_weight(nodes_found) >= MAX_NUMNODES) | ||
33 | return -1; | ||
34 | node = first_unset_node(nodes_found); | ||
35 | node_set(node, nodes_found); | ||
36 | pxm2node[pxm] = node; | ||
37 | } | ||
38 | return pxm2node[pxm]; | ||
39 | } | ||
40 | |||
41 | static __init int conflicting_nodes(unsigned long start, unsigned long end) | ||
42 | { | ||
43 | int i; | ||
44 | for_each_online_node(i) { | ||
45 | struct node *nd = &nodes[i]; | ||
46 | if (nd->start == nd->end) | ||
47 | continue; | ||
48 | if (nd->end > start && nd->start < end) | ||
49 | return 1; | ||
50 | if (nd->end == end && nd->start == start) | ||
51 | return 1; | ||
52 | } | ||
53 | return -1; | ||
54 | } | ||
55 | |||
56 | static __init void cutoff_node(int i, unsigned long start, unsigned long end) | ||
57 | { | ||
58 | struct node *nd = &nodes[i]; | ||
59 | if (nd->start < start) { | ||
60 | nd->start = start; | ||
61 | if (nd->end < nd->start) | ||
62 | nd->start = nd->end; | ||
63 | } | ||
64 | if (nd->end > end) { | ||
65 | if (!(end & 0xfff)) | ||
66 | end--; | ||
67 | nd->end = end; | ||
68 | if (nd->start > nd->end) | ||
69 | nd->start = nd->end; | ||
70 | } | ||
71 | } | ||
72 | |||
73 | static __init void bad_srat(void) | ||
74 | { | ||
75 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | ||
76 | acpi_numa = -1; | ||
77 | } | ||
78 | |||
79 | static __init inline int srat_disabled(void) | ||
80 | { | ||
81 | return numa_off || acpi_numa < 0; | ||
82 | } | ||
83 | |||
84 | /* Callback for SLIT parsing */ | ||
85 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | ||
86 | { | ||
87 | acpi_slit = slit; | ||
88 | } | ||
89 | |||
90 | /* Callback for Proximity Domain -> LAPIC mapping */ | ||
91 | void __init | ||
92 | acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) | ||
93 | { | ||
94 | int pxm, node; | ||
95 | if (srat_disabled() || pa->flags.enabled == 0) | ||
96 | return; | ||
97 | pxm = pa->proximity_domain; | ||
98 | node = setup_node(pxm); | ||
99 | if (node < 0) { | ||
100 | printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); | ||
101 | bad_srat(); | ||
102 | return; | ||
103 | } | ||
104 | if (pa->apic_id >= NR_CPUS) { | ||
105 | printk(KERN_ERR "SRAT: lapic %u too large.\n", | ||
106 | pa->apic_id); | ||
107 | bad_srat(); | ||
108 | return; | ||
109 | } | ||
110 | cpu_to_node[pa->apic_id] = node; | ||
111 | acpi_numa = 1; | ||
112 | printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", | ||
113 | pxm, pa->apic_id, node); | ||
114 | } | ||
115 | |||
116 | /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ | ||
117 | void __init | ||
118 | acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) | ||
119 | { | ||
120 | struct node *nd; | ||
121 | unsigned long start, end; | ||
122 | int node, pxm; | ||
123 | int i; | ||
124 | |||
125 | if (srat_disabled() || ma->flags.enabled == 0) | ||
126 | return; | ||
127 | /* hotplug bit is ignored for now */ | ||
128 | pxm = ma->proximity_domain; | ||
129 | node = setup_node(pxm); | ||
130 | if (node < 0) { | ||
131 | printk(KERN_ERR "SRAT: Too many proximity domains.\n"); | ||
132 | bad_srat(); | ||
133 | return; | ||
134 | } | ||
135 | start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); | ||
136 | end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); | ||
137 | i = conflicting_nodes(start, end); | ||
138 | if (i >= 0) { | ||
139 | printk(KERN_ERR | ||
140 | "SRAT: pxm %d overlap %lx-%lx with node %d(%Lx-%Lx)\n", | ||
141 | pxm, start, end, i, nodes[i].start, nodes[i].end); | ||
142 | bad_srat(); | ||
143 | return; | ||
144 | } | ||
145 | nd = &nodes[node]; | ||
146 | if (!node_test_and_set(node, nodes_parsed)) { | ||
147 | nd->start = start; | ||
148 | nd->end = end; | ||
149 | } else { | ||
150 | if (start < nd->start) | ||
151 | nd->start = start; | ||
152 | if (nd->end < end) | ||
153 | nd->end = end; | ||
154 | } | ||
155 | if (!(nd->end & 0xfff)) | ||
156 | nd->end--; | ||
157 | printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, | ||
158 | nd->start, nd->end); | ||
159 | } | ||
160 | |||
161 | void __init acpi_numa_arch_fixup(void) {} | ||
162 | |||
163 | /* Use the information discovered above to actually set up the nodes. */ | ||
164 | int __init acpi_scan_nodes(unsigned long start, unsigned long end) | ||
165 | { | ||
166 | int i; | ||
167 | if (acpi_numa <= 0) | ||
168 | return -1; | ||
169 | memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed)); | ||
170 | if (memnode_shift < 0) { | ||
171 | printk(KERN_ERR | ||
172 | "SRAT: No NUMA node hash function found. Contact maintainer\n"); | ||
173 | bad_srat(); | ||
174 | return -1; | ||
175 | } | ||
176 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
177 | if (!node_isset(i, nodes_parsed)) | ||
178 | continue; | ||
179 | cutoff_node(i, start, end); | ||
180 | if (nodes[i].start == nodes[i].end) { | ||
181 | node_clear(i, nodes_parsed); | ||
182 | continue; | ||
183 | } | ||
184 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
185 | } | ||
186 | for (i = 0; i < NR_CPUS; i++) { | ||
187 | if (cpu_to_node[i] == NUMA_NO_NODE) | ||
188 | continue; | ||
189 | if (!node_isset(cpu_to_node[i], nodes_parsed)) | ||
190 | cpu_to_node[i] = NUMA_NO_NODE; | ||
191 | } | ||
192 | numa_init_array(); | ||
193 | return 0; | ||
194 | } | ||
195 | |||
196 | int node_to_pxm(int n) | ||
197 | { | ||
198 | int i; | ||
199 | if (pxm2node[n] == n) | ||
200 | return n; | ||
201 | for (i = 0; i < 256; i++) | ||
202 | if (pxm2node[i] == n) | ||
203 | return i; | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | int __node_distance(int a, int b) | ||
208 | { | ||
209 | int index; | ||
210 | |||
211 | if (!acpi_slit) | ||
212 | return a == b ? 10 : 20; | ||
213 | index = acpi_slit->localities * node_to_pxm(a); | ||
214 | return acpi_slit->entry[index + node_to_pxm(b)]; | ||
215 | } | ||
216 | |||
217 | EXPORT_SYMBOL(__node_distance); | ||