diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/mm/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/mm/Makefile_64 | 10 | ||||
-rw-r--r-- | arch/x86/mm/extable_64.c | 34 | ||||
-rw-r--r-- | arch/x86/mm/fault_64.c | 636 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 750 | ||||
-rw-r--r-- | arch/x86/mm/ioremap_64.c | 210 | ||||
-rw-r--r-- | arch/x86/mm/k8topology_64.c | 182 | ||||
-rw-r--r-- | arch/x86/mm/mmap_64.c | 29 | ||||
-rw-r--r-- | arch/x86/mm/numa_64.c | 648 | ||||
-rw-r--r-- | arch/x86/mm/pageattr_64.c | 249 | ||||
-rw-r--r-- | arch/x86/mm/srat_64.c | 566 |
11 files changed, 3315 insertions, 1 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 7317648e6587..983291096848 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | ifeq ($(CONFIG_X86_32),y) | 1 | ifeq ($(CONFIG_X86_32),y) |
2 | include ${srctree}/arch/x86/mm/Makefile_32 | 2 | include ${srctree}/arch/x86/mm/Makefile_32 |
3 | else | 3 | else |
4 | include ${srctree}/arch/x86_64/mm/Makefile_64 | 4 | include ${srctree}/arch/x86/mm/Makefile_64 |
5 | endif | 5 | endif |
diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64 new file mode 100644 index 000000000000..6bcb47945b87 --- /dev/null +++ b/arch/x86/mm/Makefile_64 | |||
@@ -0,0 +1,10 @@ | |||
#
# Build rules for the x86_64-specific parts of the Linux memory manager.
#

# Objects that are always built.
obj-y := init_64.o fault_64.o ioremap_64.o \
	 extable_64.o pageattr_64.o mmap_64.o

# Objects selected by configuration options.
obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
obj-$(CONFIG_NUMA)		+= numa_64.o
obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
obj-$(CONFIG_ACPI_NUMA)		+= srat_64.o

diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c new file mode 100644 index 000000000000..79ac6e7100af --- /dev/null +++ b/arch/x86/mm/extable_64.c | |||
@@ -0,0 +1,34 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/mm/extable.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/module.h> | ||
6 | #include <linux/spinlock.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <asm/uaccess.h> | ||
9 | |||
10 | /* Simple binary search */ | ||
11 | const struct exception_table_entry * | ||
12 | search_extable(const struct exception_table_entry *first, | ||
13 | const struct exception_table_entry *last, | ||
14 | unsigned long value) | ||
15 | { | ||
16 | /* Work around a B stepping K8 bug */ | ||
17 | if ((value >> 32) == 0) | ||
18 | value |= 0xffffffffUL << 32; | ||
19 | |||
20 | while (first <= last) { | ||
21 | const struct exception_table_entry *mid; | ||
22 | long diff; | ||
23 | |||
24 | mid = (last - first) / 2 + first; | ||
25 | diff = mid->insn - value; | ||
26 | if (diff == 0) | ||
27 | return mid; | ||
28 | else if (diff < 0) | ||
29 | first = mid+1; | ||
30 | else | ||
31 | last = mid-1; | ||
32 | } | ||
33 | return NULL; | ||
34 | } | ||
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c new file mode 100644 index 000000000000..54816adb8e93 --- /dev/null +++ b/arch/x86/mm/fault_64.c | |||
@@ -0,0 +1,636 @@ | |||
1 | /* | ||
2 | * linux/arch/x86-64/mm/fault.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. | ||
6 | */ | ||
7 | |||
8 | #include <linux/signal.h> | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/errno.h> | ||
12 | #include <linux/string.h> | ||
13 | #include <linux/types.h> | ||
14 | #include <linux/ptrace.h> | ||
15 | #include <linux/mman.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/smp.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/tty.h> | ||
21 | #include <linux/vt_kern.h> /* For unblank_screen() */ | ||
22 | #include <linux/compiler.h> | ||
23 | #include <linux/vmalloc.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/kprobes.h> | ||
26 | #include <linux/uaccess.h> | ||
27 | #include <linux/kdebug.h> | ||
28 | |||
29 | #include <asm/system.h> | ||
30 | #include <asm/pgalloc.h> | ||
31 | #include <asm/smp.h> | ||
32 | #include <asm/tlbflush.h> | ||
33 | #include <asm/proto.h> | ||
34 | #include <asm-generic/sections.h> | ||
35 | |||
/*
 * Bits of the page fault error code as tested in this file.
 */
#define PF_PROT		(1 << 0)	/* set: protection fault; clear: no page found */
#define PF_WRITE	(1 << 1)	/* the access was a write */
#define PF_USER		(1 << 2)	/* the access counts as a user access */
#define PF_RSVD		(1 << 3)	/* reported as a corrupted page table */
#define PF_INSTR	(1 << 4)	/* the fault was an exec fault */

/* Notifier chain called on page faults (see notify_page_fault()) */
static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
44 | |||
45 | /* Hook to register for page fault notifications */ | ||
46 | int register_page_fault_notifier(struct notifier_block *nb) | ||
47 | { | ||
48 | vmalloc_sync_all(); | ||
49 | return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); | ||
50 | } | ||
51 | EXPORT_SYMBOL_GPL(register_page_fault_notifier); | ||
52 | |||
53 | int unregister_page_fault_notifier(struct notifier_block *nb) | ||
54 | { | ||
55 | return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); | ||
56 | } | ||
57 | EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); | ||
58 | |||
59 | static inline int notify_page_fault(struct pt_regs *regs, long err) | ||
60 | { | ||
61 | struct die_args args = { | ||
62 | .regs = regs, | ||
63 | .str = "page fault", | ||
64 | .err = err, | ||
65 | .trapnr = 14, | ||
66 | .signr = SIGSEGV | ||
67 | }; | ||
68 | return atomic_notifier_call_chain(¬ify_page_fault_chain, | ||
69 | DIE_PAGE_FAULT, &args); | ||
70 | } | ||
71 | |||
72 | /* Sometimes the CPU reports invalid exceptions on prefetch. | ||
73 | Check that here and ignore. | ||
74 | Opcode checker based on code by Richard Brunner */ | ||
75 | static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, | ||
76 | unsigned long error_code) | ||
77 | { | ||
78 | unsigned char *instr; | ||
79 | int scan_more = 1; | ||
80 | int prefetch = 0; | ||
81 | unsigned char *max_instr; | ||
82 | |||
83 | /* If it was a exec fault ignore */ | ||
84 | if (error_code & PF_INSTR) | ||
85 | return 0; | ||
86 | |||
87 | instr = (unsigned char __user *)convert_rip_to_linear(current, regs); | ||
88 | max_instr = instr + 15; | ||
89 | |||
90 | if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) | ||
91 | return 0; | ||
92 | |||
93 | while (scan_more && instr < max_instr) { | ||
94 | unsigned char opcode; | ||
95 | unsigned char instr_hi; | ||
96 | unsigned char instr_lo; | ||
97 | |||
98 | if (probe_kernel_address(instr, opcode)) | ||
99 | break; | ||
100 | |||
101 | instr_hi = opcode & 0xf0; | ||
102 | instr_lo = opcode & 0x0f; | ||
103 | instr++; | ||
104 | |||
105 | switch (instr_hi) { | ||
106 | case 0x20: | ||
107 | case 0x30: | ||
108 | /* Values 0x26,0x2E,0x36,0x3E are valid x86 | ||
109 | prefixes. In long mode, the CPU will signal | ||
110 | invalid opcode if some of these prefixes are | ||
111 | present so we will never get here anyway */ | ||
112 | scan_more = ((instr_lo & 7) == 0x6); | ||
113 | break; | ||
114 | |||
115 | case 0x40: | ||
116 | /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes | ||
117 | Need to figure out under what instruction mode the | ||
118 | instruction was issued ... */ | ||
119 | /* Could check the LDT for lm, but for now it's good | ||
120 | enough to assume that long mode only uses well known | ||
121 | segments or kernel. */ | ||
122 | scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); | ||
123 | break; | ||
124 | |||
125 | case 0x60: | ||
126 | /* 0x64 thru 0x67 are valid prefixes in all modes. */ | ||
127 | scan_more = (instr_lo & 0xC) == 0x4; | ||
128 | break; | ||
129 | case 0xF0: | ||
130 | /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ | ||
131 | scan_more = !instr_lo || (instr_lo>>1) == 1; | ||
132 | break; | ||
133 | case 0x00: | ||
134 | /* Prefetch instruction is 0x0F0D or 0x0F18 */ | ||
135 | scan_more = 0; | ||
136 | if (probe_kernel_address(instr, opcode)) | ||
137 | break; | ||
138 | prefetch = (instr_lo == 0xF) && | ||
139 | (opcode == 0x0D || opcode == 0x18); | ||
140 | break; | ||
141 | default: | ||
142 | scan_more = 0; | ||
143 | break; | ||
144 | } | ||
145 | } | ||
146 | return prefetch; | ||
147 | } | ||
148 | |||
/*
 * Probe one unsigned long at @p with probe_kernel_address() and return
 * its result; callers treat non-zero as "cannot be read".
 */
static int bad_address(void *p)
{
	unsigned long scratch;
	int failed;

	failed = probe_kernel_address((unsigned long *)p, scratch);
	return failed;
}
154 | |||
155 | void dump_pagetable(unsigned long address) | ||
156 | { | ||
157 | pgd_t *pgd; | ||
158 | pud_t *pud; | ||
159 | pmd_t *pmd; | ||
160 | pte_t *pte; | ||
161 | |||
162 | pgd = (pgd_t *)read_cr3(); | ||
163 | |||
164 | pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); | ||
165 | pgd += pgd_index(address); | ||
166 | if (bad_address(pgd)) goto bad; | ||
167 | printk("PGD %lx ", pgd_val(*pgd)); | ||
168 | if (!pgd_present(*pgd)) goto ret; | ||
169 | |||
170 | pud = pud_offset(pgd, address); | ||
171 | if (bad_address(pud)) goto bad; | ||
172 | printk("PUD %lx ", pud_val(*pud)); | ||
173 | if (!pud_present(*pud)) goto ret; | ||
174 | |||
175 | pmd = pmd_offset(pud, address); | ||
176 | if (bad_address(pmd)) goto bad; | ||
177 | printk("PMD %lx ", pmd_val(*pmd)); | ||
178 | if (!pmd_present(*pmd)) goto ret; | ||
179 | |||
180 | pte = pte_offset_kernel(pmd, address); | ||
181 | if (bad_address(pte)) goto bad; | ||
182 | printk("PTE %lx", pte_val(*pte)); | ||
183 | ret: | ||
184 | printk("\n"); | ||
185 | return; | ||
186 | bad: | ||
187 | printk("BAD\n"); | ||
188 | } | ||
189 | |||
190 | static const char errata93_warning[] = | ||
191 | KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" | ||
192 | KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" | ||
193 | KERN_ERR "******* Please consider a BIOS update.\n" | ||
194 | KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; | ||
195 | |||
196 | /* Workaround for K8 erratum #93 & buggy BIOS. | ||
197 | BIOS SMM functions are required to use a specific workaround | ||
198 | to avoid corruption of the 64bit RIP register on C stepping K8. | ||
199 | A lot of BIOS that didn't get tested properly miss this. | ||
200 | The OS sees this as a page fault with the upper 32bits of RIP cleared. | ||
201 | Try to work around it here. | ||
202 | Note we only handle faults in kernel here. */ | ||
203 | |||
204 | static int is_errata93(struct pt_regs *regs, unsigned long address) | ||
205 | { | ||
206 | static int warned; | ||
207 | if (address != regs->rip) | ||
208 | return 0; | ||
209 | if ((address >> 32) != 0) | ||
210 | return 0; | ||
211 | address |= 0xffffffffUL << 32; | ||
212 | if ((address >= (u64)_stext && address <= (u64)_etext) || | ||
213 | (address >= MODULES_VADDR && address <= MODULES_END)) { | ||
214 | if (!warned) { | ||
215 | printk(errata93_warning); | ||
216 | warned = 1; | ||
217 | } | ||
218 | regs->rip = address; | ||
219 | return 1; | ||
220 | } | ||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, | ||
225 | unsigned long error_code) | ||
226 | { | ||
227 | unsigned long flags = oops_begin(); | ||
228 | struct task_struct *tsk; | ||
229 | |||
230 | printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", | ||
231 | current->comm, address); | ||
232 | dump_pagetable(address); | ||
233 | tsk = current; | ||
234 | tsk->thread.cr2 = address; | ||
235 | tsk->thread.trap_no = 14; | ||
236 | tsk->thread.error_code = error_code; | ||
237 | __die("Bad pagetable", regs, error_code); | ||
238 | oops_end(flags); | ||
239 | do_exit(SIGKILL); | ||
240 | } | ||
241 | |||
242 | /* | ||
243 | * Handle a fault on the vmalloc area | ||
244 | * | ||
245 | * This assumes no large pages in there. | ||
246 | */ | ||
247 | static int vmalloc_fault(unsigned long address) | ||
248 | { | ||
249 | pgd_t *pgd, *pgd_ref; | ||
250 | pud_t *pud, *pud_ref; | ||
251 | pmd_t *pmd, *pmd_ref; | ||
252 | pte_t *pte, *pte_ref; | ||
253 | |||
254 | /* Copy kernel mappings over when needed. This can also | ||
255 | happen within a race in page table update. In the later | ||
256 | case just flush. */ | ||
257 | |||
258 | pgd = pgd_offset(current->mm ?: &init_mm, address); | ||
259 | pgd_ref = pgd_offset_k(address); | ||
260 | if (pgd_none(*pgd_ref)) | ||
261 | return -1; | ||
262 | if (pgd_none(*pgd)) | ||
263 | set_pgd(pgd, *pgd_ref); | ||
264 | else | ||
265 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | ||
266 | |||
267 | /* Below here mismatches are bugs because these lower tables | ||
268 | are shared */ | ||
269 | |||
270 | pud = pud_offset(pgd, address); | ||
271 | pud_ref = pud_offset(pgd_ref, address); | ||
272 | if (pud_none(*pud_ref)) | ||
273 | return -1; | ||
274 | if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) | ||
275 | BUG(); | ||
276 | pmd = pmd_offset(pud, address); | ||
277 | pmd_ref = pmd_offset(pud_ref, address); | ||
278 | if (pmd_none(*pmd_ref)) | ||
279 | return -1; | ||
280 | if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) | ||
281 | BUG(); | ||
282 | pte_ref = pte_offset_kernel(pmd_ref, address); | ||
283 | if (!pte_present(*pte_ref)) | ||
284 | return -1; | ||
285 | pte = pte_offset_kernel(pmd, address); | ||
286 | /* Don't use pte_page here, because the mappings can point | ||
287 | outside mem_map, and the NUMA hash lookup cannot handle | ||
288 | that. */ | ||
289 | if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) | ||
290 | BUG(); | ||
291 | return 0; | ||
292 | } | ||
293 | |||
294 | static int page_fault_trace; | ||
295 | int show_unhandled_signals = 1; | ||
296 | |||
297 | /* | ||
298 | * This routine handles page faults. It determines the address, | ||
299 | * and the problem, and then passes it off to one of the appropriate | ||
300 | * routines. | ||
301 | */ | ||
302 | asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, | ||
303 | unsigned long error_code) | ||
304 | { | ||
305 | struct task_struct *tsk; | ||
306 | struct mm_struct *mm; | ||
307 | struct vm_area_struct * vma; | ||
308 | unsigned long address; | ||
309 | const struct exception_table_entry *fixup; | ||
310 | int write, fault; | ||
311 | unsigned long flags; | ||
312 | siginfo_t info; | ||
313 | |||
314 | tsk = current; | ||
315 | mm = tsk->mm; | ||
316 | prefetchw(&mm->mmap_sem); | ||
317 | |||
318 | /* get the address */ | ||
319 | address = read_cr2(); | ||
320 | |||
321 | info.si_code = SEGV_MAPERR; | ||
322 | |||
323 | |||
324 | /* | ||
325 | * We fault-in kernel-space virtual memory on-demand. The | ||
326 | * 'reference' page table is init_mm.pgd. | ||
327 | * | ||
328 | * NOTE! We MUST NOT take any locks for this case. We may | ||
329 | * be in an interrupt or a critical region, and should | ||
330 | * only copy the information from the master page table, | ||
331 | * nothing more. | ||
332 | * | ||
333 | * This verifies that the fault happens in kernel space | ||
334 | * (error_code & 4) == 0, and that the fault was not a | ||
335 | * protection error (error_code & 9) == 0. | ||
336 | */ | ||
337 | if (unlikely(address >= TASK_SIZE64)) { | ||
338 | /* | ||
339 | * Don't check for the module range here: its PML4 | ||
340 | * is always initialized because it's shared with the main | ||
341 | * kernel text. Only vmalloc may need PML4 syncups. | ||
342 | */ | ||
343 | if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && | ||
344 | ((address >= VMALLOC_START && address < VMALLOC_END))) { | ||
345 | if (vmalloc_fault(address) >= 0) | ||
346 | return; | ||
347 | } | ||
348 | if (notify_page_fault(regs, error_code) == NOTIFY_STOP) | ||
349 | return; | ||
350 | /* | ||
351 | * Don't take the mm semaphore here. If we fixup a prefetch | ||
352 | * fault we could otherwise deadlock. | ||
353 | */ | ||
354 | goto bad_area_nosemaphore; | ||
355 | } | ||
356 | |||
357 | if (notify_page_fault(regs, error_code) == NOTIFY_STOP) | ||
358 | return; | ||
359 | |||
360 | if (likely(regs->eflags & X86_EFLAGS_IF)) | ||
361 | local_irq_enable(); | ||
362 | |||
363 | if (unlikely(page_fault_trace)) | ||
364 | printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", | ||
365 | regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); | ||
366 | |||
367 | if (unlikely(error_code & PF_RSVD)) | ||
368 | pgtable_bad(address, regs, error_code); | ||
369 | |||
370 | /* | ||
371 | * If we're in an interrupt or have no user | ||
372 | * context, we must not take the fault.. | ||
373 | */ | ||
374 | if (unlikely(in_atomic() || !mm)) | ||
375 | goto bad_area_nosemaphore; | ||
376 | |||
377 | /* | ||
378 | * User-mode registers count as a user access even for any | ||
379 | * potential system fault or CPU buglet. | ||
380 | */ | ||
381 | if (user_mode_vm(regs)) | ||
382 | error_code |= PF_USER; | ||
383 | |||
384 | again: | ||
385 | /* When running in the kernel we expect faults to occur only to | ||
386 | * addresses in user space. All other faults represent errors in the | ||
387 | * kernel and should generate an OOPS. Unfortunatly, in the case of an | ||
388 | * erroneous fault occurring in a code path which already holds mmap_sem | ||
389 | * we will deadlock attempting to validate the fault against the | ||
390 | * address space. Luckily the kernel only validly references user | ||
391 | * space from well defined areas of code, which are listed in the | ||
392 | * exceptions table. | ||
393 | * | ||
394 | * As the vast majority of faults will be valid we will only perform | ||
395 | * the source reference check when there is a possibilty of a deadlock. | ||
396 | * Attempt to lock the address space, if we cannot we then validate the | ||
397 | * source. If this is invalid we can skip the address space check, | ||
398 | * thus avoiding the deadlock. | ||
399 | */ | ||
400 | if (!down_read_trylock(&mm->mmap_sem)) { | ||
401 | if ((error_code & PF_USER) == 0 && | ||
402 | !search_exception_tables(regs->rip)) | ||
403 | goto bad_area_nosemaphore; | ||
404 | down_read(&mm->mmap_sem); | ||
405 | } | ||
406 | |||
407 | vma = find_vma(mm, address); | ||
408 | if (!vma) | ||
409 | goto bad_area; | ||
410 | if (likely(vma->vm_start <= address)) | ||
411 | goto good_area; | ||
412 | if (!(vma->vm_flags & VM_GROWSDOWN)) | ||
413 | goto bad_area; | ||
414 | if (error_code & 4) { | ||
415 | /* Allow userspace just enough access below the stack pointer | ||
416 | * to let the 'enter' instruction work. | ||
417 | */ | ||
418 | if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) | ||
419 | goto bad_area; | ||
420 | } | ||
421 | if (expand_stack(vma, address)) | ||
422 | goto bad_area; | ||
423 | /* | ||
424 | * Ok, we have a good vm_area for this memory access, so | ||
425 | * we can handle it.. | ||
426 | */ | ||
427 | good_area: | ||
428 | info.si_code = SEGV_ACCERR; | ||
429 | write = 0; | ||
430 | switch (error_code & (PF_PROT|PF_WRITE)) { | ||
431 | default: /* 3: write, present */ | ||
432 | /* fall through */ | ||
433 | case PF_WRITE: /* write, not present */ | ||
434 | if (!(vma->vm_flags & VM_WRITE)) | ||
435 | goto bad_area; | ||
436 | write++; | ||
437 | break; | ||
438 | case PF_PROT: /* read, present */ | ||
439 | goto bad_area; | ||
440 | case 0: /* read, not present */ | ||
441 | if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) | ||
442 | goto bad_area; | ||
443 | } | ||
444 | |||
445 | /* | ||
446 | * If for any reason at all we couldn't handle the fault, | ||
447 | * make sure we exit gracefully rather than endlessly redo | ||
448 | * the fault. | ||
449 | */ | ||
450 | fault = handle_mm_fault(mm, vma, address, write); | ||
451 | if (unlikely(fault & VM_FAULT_ERROR)) { | ||
452 | if (fault & VM_FAULT_OOM) | ||
453 | goto out_of_memory; | ||
454 | else if (fault & VM_FAULT_SIGBUS) | ||
455 | goto do_sigbus; | ||
456 | BUG(); | ||
457 | } | ||
458 | if (fault & VM_FAULT_MAJOR) | ||
459 | tsk->maj_flt++; | ||
460 | else | ||
461 | tsk->min_flt++; | ||
462 | up_read(&mm->mmap_sem); | ||
463 | return; | ||
464 | |||
465 | /* | ||
466 | * Something tried to access memory that isn't in our memory map.. | ||
467 | * Fix it, but check if it's kernel or user first.. | ||
468 | */ | ||
469 | bad_area: | ||
470 | up_read(&mm->mmap_sem); | ||
471 | |||
472 | bad_area_nosemaphore: | ||
473 | /* User mode accesses just cause a SIGSEGV */ | ||
474 | if (error_code & PF_USER) { | ||
475 | |||
476 | /* | ||
477 | * It's possible to have interrupts off here. | ||
478 | */ | ||
479 | local_irq_enable(); | ||
480 | |||
481 | if (is_prefetch(regs, address, error_code)) | ||
482 | return; | ||
483 | |||
484 | /* Work around K8 erratum #100 K8 in compat mode | ||
485 | occasionally jumps to illegal addresses >4GB. We | ||
486 | catch this here in the page fault handler because | ||
487 | these addresses are not reachable. Just detect this | ||
488 | case and return. Any code segment in LDT is | ||
489 | compatibility mode. */ | ||
490 | if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && | ||
491 | (address >> 32)) | ||
492 | return; | ||
493 | |||
494 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | ||
495 | printk_ratelimit()) { | ||
496 | printk( | ||
497 | "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", | ||
498 | tsk->pid > 1 ? KERN_INFO : KERN_EMERG, | ||
499 | tsk->comm, tsk->pid, address, regs->rip, | ||
500 | regs->rsp, error_code); | ||
501 | } | ||
502 | |||
503 | tsk->thread.cr2 = address; | ||
504 | /* Kernel addresses are always protection faults */ | ||
505 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); | ||
506 | tsk->thread.trap_no = 14; | ||
507 | info.si_signo = SIGSEGV; | ||
508 | info.si_errno = 0; | ||
509 | /* info.si_code has been set above */ | ||
510 | info.si_addr = (void __user *)address; | ||
511 | force_sig_info(SIGSEGV, &info, tsk); | ||
512 | return; | ||
513 | } | ||
514 | |||
515 | no_context: | ||
516 | |||
517 | /* Are we prepared to handle this kernel fault? */ | ||
518 | fixup = search_exception_tables(regs->rip); | ||
519 | if (fixup) { | ||
520 | regs->rip = fixup->fixup; | ||
521 | return; | ||
522 | } | ||
523 | |||
524 | /* | ||
525 | * Hall of shame of CPU/BIOS bugs. | ||
526 | */ | ||
527 | |||
528 | if (is_prefetch(regs, address, error_code)) | ||
529 | return; | ||
530 | |||
531 | if (is_errata93(regs, address)) | ||
532 | return; | ||
533 | |||
534 | /* | ||
535 | * Oops. The kernel tried to access some bad page. We'll have to | ||
536 | * terminate things with extreme prejudice. | ||
537 | */ | ||
538 | |||
539 | flags = oops_begin(); | ||
540 | |||
541 | if (address < PAGE_SIZE) | ||
542 | printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); | ||
543 | else | ||
544 | printk(KERN_ALERT "Unable to handle kernel paging request"); | ||
545 | printk(" at %016lx RIP: \n" KERN_ALERT,address); | ||
546 | printk_address(regs->rip); | ||
547 | dump_pagetable(address); | ||
548 | tsk->thread.cr2 = address; | ||
549 | tsk->thread.trap_no = 14; | ||
550 | tsk->thread.error_code = error_code; | ||
551 | __die("Oops", regs, error_code); | ||
552 | /* Executive summary in case the body of the oops scrolled away */ | ||
553 | printk(KERN_EMERG "CR2: %016lx\n", address); | ||
554 | oops_end(flags); | ||
555 | do_exit(SIGKILL); | ||
556 | |||
557 | /* | ||
558 | * We ran out of memory, or some other thing happened to us that made | ||
559 | * us unable to handle the page fault gracefully. | ||
560 | */ | ||
561 | out_of_memory: | ||
562 | up_read(&mm->mmap_sem); | ||
563 | if (is_init(current)) { | ||
564 | yield(); | ||
565 | goto again; | ||
566 | } | ||
567 | printk("VM: killing process %s\n", tsk->comm); | ||
568 | if (error_code & 4) | ||
569 | do_group_exit(SIGKILL); | ||
570 | goto no_context; | ||
571 | |||
572 | do_sigbus: | ||
573 | up_read(&mm->mmap_sem); | ||
574 | |||
575 | /* Kernel mode? Handle exceptions or die */ | ||
576 | if (!(error_code & PF_USER)) | ||
577 | goto no_context; | ||
578 | |||
579 | tsk->thread.cr2 = address; | ||
580 | tsk->thread.error_code = error_code; | ||
581 | tsk->thread.trap_no = 14; | ||
582 | info.si_signo = SIGBUS; | ||
583 | info.si_errno = 0; | ||
584 | info.si_code = BUS_ADRERR; | ||
585 | info.si_addr = (void __user *)address; | ||
586 | force_sig_info(SIGBUS, &info, tsk); | ||
587 | return; | ||
588 | } | ||
589 | |||
590 | DEFINE_SPINLOCK(pgd_lock); | ||
591 | LIST_HEAD(pgd_list); | ||
592 | |||
593 | void vmalloc_sync_all(void) | ||
594 | { | ||
595 | /* Note that races in the updates of insync and start aren't | ||
596 | problematic: | ||
597 | insync can only get set bits added, and updates to start are only | ||
598 | improving performance (without affecting correctness if undone). */ | ||
599 | static DECLARE_BITMAP(insync, PTRS_PER_PGD); | ||
600 | static unsigned long start = VMALLOC_START & PGDIR_MASK; | ||
601 | unsigned long address; | ||
602 | |||
603 | for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { | ||
604 | if (!test_bit(pgd_index(address), insync)) { | ||
605 | const pgd_t *pgd_ref = pgd_offset_k(address); | ||
606 | struct page *page; | ||
607 | |||
608 | if (pgd_none(*pgd_ref)) | ||
609 | continue; | ||
610 | spin_lock(&pgd_lock); | ||
611 | list_for_each_entry(page, &pgd_list, lru) { | ||
612 | pgd_t *pgd; | ||
613 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | ||
614 | if (pgd_none(*pgd)) | ||
615 | set_pgd(pgd, *pgd_ref); | ||
616 | else | ||
617 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | ||
618 | } | ||
619 | spin_unlock(&pgd_lock); | ||
620 | set_bit(pgd_index(address), insync); | ||
621 | } | ||
622 | if (address == start) | ||
623 | start = address + PGDIR_SIZE; | ||
624 | } | ||
625 | /* Check that there is no need to do the same for the modules area. */ | ||
626 | BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); | ||
627 | BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == | ||
628 | (__START_KERNEL & PGDIR_MASK))); | ||
629 | } | ||
630 | |||
631 | static int __init enable_pagefaulttrace(char *str) | ||
632 | { | ||
633 | page_fault_trace = 1; | ||
634 | return 1; | ||
635 | } | ||
636 | __setup("pagefaulttrace", enable_pagefaulttrace); | ||
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c new file mode 100644 index 000000000000..458893b376f8 --- /dev/null +++ b/arch/x86/mm/init_64.c | |||
@@ -0,0 +1,750 @@ | |||
1 | /* | ||
2 | * linux/arch/x86_64/mm/init.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | ||
6 | * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> | ||
7 | */ | ||
8 | |||
9 | #include <linux/signal.h> | ||
10 | #include <linux/sched.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/errno.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/ptrace.h> | ||
16 | #include <linux/mman.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/swap.h> | ||
19 | #include <linux/smp.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/pagemap.h> | ||
22 | #include <linux/bootmem.h> | ||
23 | #include <linux/proc_fs.h> | ||
24 | #include <linux/pci.h> | ||
25 | #include <linux/pfn.h> | ||
26 | #include <linux/poison.h> | ||
27 | #include <linux/dma-mapping.h> | ||
28 | #include <linux/module.h> | ||
29 | #include <linux/memory_hotplug.h> | ||
30 | #include <linux/nmi.h> | ||
31 | |||
32 | #include <asm/processor.h> | ||
33 | #include <asm/system.h> | ||
34 | #include <asm/uaccess.h> | ||
35 | #include <asm/pgtable.h> | ||
36 | #include <asm/pgalloc.h> | ||
37 | #include <asm/dma.h> | ||
38 | #include <asm/fixmap.h> | ||
39 | #include <asm/e820.h> | ||
40 | #include <asm/apic.h> | ||
41 | #include <asm/tlb.h> | ||
42 | #include <asm/mmu_context.h> | ||
43 | #include <asm/proto.h> | ||
44 | #include <asm/smp.h> | ||
45 | #include <asm/sections.h> | ||
46 | |||
47 | #ifndef Dprintk | ||
48 | #define Dprintk(x...) | ||
49 | #endif | ||
50 | |||
51 | const struct dma_mapping_ops* dma_ops; | ||
52 | EXPORT_SYMBOL(dma_ops); | ||
53 | |||
54 | static unsigned long dma_reserve __initdata; | ||
55 | |||
56 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | ||
57 | |||
58 | /* | ||
59 | * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the | ||
60 | * physical space so we can cache the place of the first one and move | ||
61 | * around without checking the pgd every time. | ||
62 | */ | ||
63 | |||
64 | void show_mem(void) | ||
65 | { | ||
66 | long i, total = 0, reserved = 0; | ||
67 | long shared = 0, cached = 0; | ||
68 | pg_data_t *pgdat; | ||
69 | struct page *page; | ||
70 | |||
71 | printk(KERN_INFO "Mem-info:\n"); | ||
72 | show_free_areas(); | ||
73 | printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | ||
74 | |||
75 | for_each_online_pgdat(pgdat) { | ||
76 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { | ||
77 | /* this loop can take a while with 256 GB and 4k pages | ||
78 | so update the NMI watchdog */ | ||
79 | if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { | ||
80 | touch_nmi_watchdog(); | ||
81 | } | ||
82 | if (!pfn_valid(pgdat->node_start_pfn + i)) | ||
83 | continue; | ||
84 | page = pfn_to_page(pgdat->node_start_pfn + i); | ||
85 | total++; | ||
86 | if (PageReserved(page)) | ||
87 | reserved++; | ||
88 | else if (PageSwapCache(page)) | ||
89 | cached++; | ||
90 | else if (page_count(page)) | ||
91 | shared += page_count(page) - 1; | ||
92 | } | ||
93 | } | ||
94 | printk(KERN_INFO "%lu pages of RAM\n", total); | ||
95 | printk(KERN_INFO "%lu reserved pages\n",reserved); | ||
96 | printk(KERN_INFO "%lu pages shared\n",shared); | ||
97 | printk(KERN_INFO "%lu pages swap cached\n",cached); | ||
98 | } | ||
99 | |||
100 | int after_bootmem; | ||
101 | |||
102 | static __init void *spp_getpage(void) | ||
103 | { | ||
104 | void *ptr; | ||
105 | if (after_bootmem) | ||
106 | ptr = (void *) get_zeroed_page(GFP_ATOMIC); | ||
107 | else | ||
108 | ptr = alloc_bootmem_pages(PAGE_SIZE); | ||
109 | if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) | ||
110 | panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); | ||
111 | |||
112 | Dprintk("spp_getpage %p\n", ptr); | ||
113 | return ptr; | ||
114 | } | ||
115 | |||
116 | static __init void set_pte_phys(unsigned long vaddr, | ||
117 | unsigned long phys, pgprot_t prot) | ||
118 | { | ||
119 | pgd_t *pgd; | ||
120 | pud_t *pud; | ||
121 | pmd_t *pmd; | ||
122 | pte_t *pte, new_pte; | ||
123 | |||
124 | Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); | ||
125 | |||
126 | pgd = pgd_offset_k(vaddr); | ||
127 | if (pgd_none(*pgd)) { | ||
128 | printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); | ||
129 | return; | ||
130 | } | ||
131 | pud = pud_offset(pgd, vaddr); | ||
132 | if (pud_none(*pud)) { | ||
133 | pmd = (pmd_t *) spp_getpage(); | ||
134 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); | ||
135 | if (pmd != pmd_offset(pud, 0)) { | ||
136 | printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); | ||
137 | return; | ||
138 | } | ||
139 | } | ||
140 | pmd = pmd_offset(pud, vaddr); | ||
141 | if (pmd_none(*pmd)) { | ||
142 | pte = (pte_t *) spp_getpage(); | ||
143 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); | ||
144 | if (pte != pte_offset_kernel(pmd, 0)) { | ||
145 | printk("PAGETABLE BUG #02!\n"); | ||
146 | return; | ||
147 | } | ||
148 | } | ||
149 | new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); | ||
150 | |||
151 | pte = pte_offset_kernel(pmd, vaddr); | ||
152 | if (!pte_none(*pte) && | ||
153 | pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) | ||
154 | pte_ERROR(*pte); | ||
155 | set_pte(pte, new_pte); | ||
156 | |||
157 | /* | ||
158 | * It's enough to flush this one mapping. | ||
159 | * (PGE mappings get flushed as well) | ||
160 | */ | ||
161 | __flush_tlb_one(vaddr); | ||
162 | } | ||
163 | |||
164 | /* NOTE: this is meant to be run only at boot */ | ||
165 | void __init | ||
166 | __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) | ||
167 | { | ||
168 | unsigned long address = __fix_to_virt(idx); | ||
169 | |||
170 | if (idx >= __end_of_fixed_addresses) { | ||
171 | printk("Invalid __set_fixmap\n"); | ||
172 | return; | ||
173 | } | ||
174 | set_pte_phys(address, phys, prot); | ||
175 | } | ||
176 | |||
177 | unsigned long __meminitdata table_start, table_end; | ||
178 | |||
179 | static __meminit void *alloc_low_page(unsigned long *phys) | ||
180 | { | ||
181 | unsigned long pfn = table_end++; | ||
182 | void *adr; | ||
183 | |||
184 | if (after_bootmem) { | ||
185 | adr = (void *)get_zeroed_page(GFP_ATOMIC); | ||
186 | *phys = __pa(adr); | ||
187 | return adr; | ||
188 | } | ||
189 | |||
190 | if (pfn >= end_pfn) | ||
191 | panic("alloc_low_page: ran out of memory"); | ||
192 | |||
193 | adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); | ||
194 | memset(adr, 0, PAGE_SIZE); | ||
195 | *phys = pfn * PAGE_SIZE; | ||
196 | return adr; | ||
197 | } | ||
198 | |||
199 | static __meminit void unmap_low_page(void *adr) | ||
200 | { | ||
201 | |||
202 | if (after_bootmem) | ||
203 | return; | ||
204 | |||
205 | early_iounmap(adr, PAGE_SIZE); | ||
206 | } | ||
207 | |||
208 | /* Must run before zap_low_mappings */ | ||
209 | __meminit void *early_ioremap(unsigned long addr, unsigned long size) | ||
210 | { | ||
211 | unsigned long vaddr; | ||
212 | pmd_t *pmd, *last_pmd; | ||
213 | int i, pmds; | ||
214 | |||
215 | pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; | ||
216 | vaddr = __START_KERNEL_map; | ||
217 | pmd = level2_kernel_pgt; | ||
218 | last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; | ||
219 | for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { | ||
220 | for (i = 0; i < pmds; i++) { | ||
221 | if (pmd_present(pmd[i])) | ||
222 | goto next; | ||
223 | } | ||
224 | vaddr += addr & ~PMD_MASK; | ||
225 | addr &= PMD_MASK; | ||
226 | for (i = 0; i < pmds; i++, addr += PMD_SIZE) | ||
227 | set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); | ||
228 | __flush_tlb(); | ||
229 | return (void *)vaddr; | ||
230 | next: | ||
231 | ; | ||
232 | } | ||
233 | printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); | ||
234 | return NULL; | ||
235 | } | ||
236 | |||
237 | /* To avoid virtual aliases later */ | ||
238 | __meminit void early_iounmap(void *addr, unsigned long size) | ||
239 | { | ||
240 | unsigned long vaddr; | ||
241 | pmd_t *pmd; | ||
242 | int i, pmds; | ||
243 | |||
244 | vaddr = (unsigned long)addr; | ||
245 | pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; | ||
246 | pmd = level2_kernel_pgt + pmd_index(vaddr); | ||
247 | for (i = 0; i < pmds; i++) | ||
248 | pmd_clear(pmd + i); | ||
249 | __flush_tlb(); | ||
250 | } | ||
251 | |||
252 | static void __meminit | ||
253 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) | ||
254 | { | ||
255 | int i = pmd_index(address); | ||
256 | |||
257 | for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { | ||
258 | unsigned long entry; | ||
259 | pmd_t *pmd = pmd_page + pmd_index(address); | ||
260 | |||
261 | if (address >= end) { | ||
262 | if (!after_bootmem) | ||
263 | for (; i < PTRS_PER_PMD; i++, pmd++) | ||
264 | set_pmd(pmd, __pmd(0)); | ||
265 | break; | ||
266 | } | ||
267 | |||
268 | if (pmd_val(*pmd)) | ||
269 | continue; | ||
270 | |||
271 | entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; | ||
272 | entry &= __supported_pte_mask; | ||
273 | set_pmd(pmd, __pmd(entry)); | ||
274 | } | ||
275 | } | ||
276 | |||
277 | static void __meminit | ||
278 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) | ||
279 | { | ||
280 | pmd_t *pmd = pmd_offset(pud,0); | ||
281 | spin_lock(&init_mm.page_table_lock); | ||
282 | phys_pmd_init(pmd, address, end); | ||
283 | spin_unlock(&init_mm.page_table_lock); | ||
284 | __flush_tlb_all(); | ||
285 | } | ||
286 | |||
287 | static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) | ||
288 | { | ||
289 | int i = pud_index(addr); | ||
290 | |||
291 | |||
292 | for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { | ||
293 | unsigned long pmd_phys; | ||
294 | pud_t *pud = pud_page + pud_index(addr); | ||
295 | pmd_t *pmd; | ||
296 | |||
297 | if (addr >= end) | ||
298 | break; | ||
299 | |||
300 | if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { | ||
301 | set_pud(pud, __pud(0)); | ||
302 | continue; | ||
303 | } | ||
304 | |||
305 | if (pud_val(*pud)) { | ||
306 | phys_pmd_update(pud, addr, end); | ||
307 | continue; | ||
308 | } | ||
309 | |||
310 | pmd = alloc_low_page(&pmd_phys); | ||
311 | spin_lock(&init_mm.page_table_lock); | ||
312 | set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); | ||
313 | phys_pmd_init(pmd, addr, end); | ||
314 | spin_unlock(&init_mm.page_table_lock); | ||
315 | unmap_low_page(pmd); | ||
316 | } | ||
317 | __flush_tlb(); | ||
318 | } | ||
319 | |||
320 | static void __init find_early_table_space(unsigned long end) | ||
321 | { | ||
322 | unsigned long puds, pmds, tables, start; | ||
323 | |||
324 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | ||
325 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | ||
326 | tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + | ||
327 | round_up(pmds * sizeof(pmd_t), PAGE_SIZE); | ||
328 | |||
329 | /* RED-PEN putting page tables only on node 0 could | ||
330 | cause a hotspot and fill up ZONE_DMA. The page tables | ||
331 | need roughly 0.5KB per GB. */ | ||
332 | start = 0x8000; | ||
333 | table_start = find_e820_area(start, end, tables); | ||
334 | if (table_start == -1UL) | ||
335 | panic("Cannot find space for the kernel page tables"); | ||
336 | |||
337 | table_start >>= PAGE_SHIFT; | ||
338 | table_end = table_start; | ||
339 | |||
340 | early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", | ||
341 | end, table_start << PAGE_SHIFT, | ||
342 | (table_start << PAGE_SHIFT) + tables); | ||
343 | } | ||
344 | |||
345 | /* Setup the direct mapping of the physical memory at PAGE_OFFSET. | ||
346 | This runs before bootmem is initialized and gets pages directly from the | ||
347 | physical memory. To access them they are temporarily mapped. */ | ||
348 | void __meminit init_memory_mapping(unsigned long start, unsigned long end) | ||
349 | { | ||
350 | unsigned long next; | ||
351 | |||
352 | Dprintk("init_memory_mapping\n"); | ||
353 | |||
354 | /* | ||
355 | * Find space for the kernel direct mapping tables. | ||
356 | * Later we should allocate these tables in the local node of the memory | ||
357 | * mapped. Unfortunately this is done currently before the nodes are | ||
358 | * discovered. | ||
359 | */ | ||
360 | if (!after_bootmem) | ||
361 | find_early_table_space(end); | ||
362 | |||
363 | start = (unsigned long)__va(start); | ||
364 | end = (unsigned long)__va(end); | ||
365 | |||
366 | for (; start < end; start = next) { | ||
367 | unsigned long pud_phys; | ||
368 | pgd_t *pgd = pgd_offset_k(start); | ||
369 | pud_t *pud; | ||
370 | |||
371 | if (after_bootmem) | ||
372 | pud = pud_offset(pgd, start & PGDIR_MASK); | ||
373 | else | ||
374 | pud = alloc_low_page(&pud_phys); | ||
375 | |||
376 | next = start + PGDIR_SIZE; | ||
377 | if (next > end) | ||
378 | next = end; | ||
379 | phys_pud_init(pud, __pa(start), __pa(next)); | ||
380 | if (!after_bootmem) | ||
381 | set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); | ||
382 | unmap_low_page(pud); | ||
383 | } | ||
384 | |||
385 | if (!after_bootmem) | ||
386 | mmu_cr4_features = read_cr4(); | ||
387 | __flush_tlb_all(); | ||
388 | } | ||
389 | |||
390 | #ifndef CONFIG_NUMA | ||
391 | void __init paging_init(void) | ||
392 | { | ||
393 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | ||
394 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | ||
395 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; | ||
396 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; | ||
397 | max_zone_pfns[ZONE_NORMAL] = end_pfn; | ||
398 | |||
399 | memory_present(0, 0, end_pfn); | ||
400 | sparse_init(); | ||
401 | free_area_init_nodes(max_zone_pfns); | ||
402 | } | ||
403 | #endif | ||
404 | |||
405 | /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches | ||
406 | from the CPU leading to inconsistent cache lines. address and size | ||
407 | must be aligned to 2MB boundaries. | ||
408 | Does nothing when the mapping doesn't exist. */ | ||
409 | void __init clear_kernel_mapping(unsigned long address, unsigned long size) | ||
410 | { | ||
411 | unsigned long end = address + size; | ||
412 | |||
413 | BUG_ON(address & ~LARGE_PAGE_MASK); | ||
414 | BUG_ON(size & ~LARGE_PAGE_MASK); | ||
415 | |||
416 | for (; address < end; address += LARGE_PAGE_SIZE) { | ||
417 | pgd_t *pgd = pgd_offset_k(address); | ||
418 | pud_t *pud; | ||
419 | pmd_t *pmd; | ||
420 | if (pgd_none(*pgd)) | ||
421 | continue; | ||
422 | pud = pud_offset(pgd, address); | ||
423 | if (pud_none(*pud)) | ||
424 | continue; | ||
425 | pmd = pmd_offset(pud, address); | ||
426 | if (!pmd || pmd_none(*pmd)) | ||
427 | continue; | ||
428 | if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { | ||
429 | /* Could handle this, but it should not happen currently. */ | ||
430 | printk(KERN_ERR | ||
431 | "clear_kernel_mapping: mapping has been split. will leak memory\n"); | ||
432 | pmd_ERROR(*pmd); | ||
433 | } | ||
434 | set_pmd(pmd, __pmd(0)); | ||
435 | } | ||
436 | __flush_tlb_all(); | ||
437 | } | ||
438 | |||
439 | /* | ||
440 | * Memory hotplug specific functions | ||
441 | */ | ||
442 | void online_page(struct page *page) | ||
443 | { | ||
444 | ClearPageReserved(page); | ||
445 | init_page_count(page); | ||
446 | __free_page(page); | ||
447 | totalram_pages++; | ||
448 | num_physpages++; | ||
449 | } | ||
450 | |||
451 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
452 | /* | ||
453 | * Memory is added always to NORMAL zone. This means you will never get | ||
454 | * additional DMA/DMA32 memory. | ||
455 | */ | ||
456 | int arch_add_memory(int nid, u64 start, u64 size) | ||
457 | { | ||
458 | struct pglist_data *pgdat = NODE_DATA(nid); | ||
459 | struct zone *zone = pgdat->node_zones + ZONE_NORMAL; | ||
460 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
461 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
462 | int ret; | ||
463 | |||
464 | init_memory_mapping(start, (start + size -1)); | ||
465 | |||
466 | ret = __add_pages(zone, start_pfn, nr_pages); | ||
467 | if (ret) | ||
468 | goto error; | ||
469 | |||
470 | return ret; | ||
471 | error: | ||
472 | printk("%s: Problem encountered in __add_pages!\n", __func__); | ||
473 | return ret; | ||
474 | } | ||
475 | EXPORT_SYMBOL_GPL(arch_add_memory); | ||
476 | |||
477 | int remove_memory(u64 start, u64 size) | ||
478 | { | ||
479 | return -EINVAL; | ||
480 | } | ||
481 | EXPORT_SYMBOL_GPL(remove_memory); | ||
482 | |||
483 | #if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) | ||
484 | int memory_add_physaddr_to_nid(u64 start) | ||
485 | { | ||
486 | return 0; | ||
487 | } | ||
488 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | ||
489 | #endif | ||
490 | |||
491 | #endif /* CONFIG_MEMORY_HOTPLUG */ | ||
492 | |||
493 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
494 | /* | ||
495 | * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, | ||
496 | * just online the pages. | ||
497 | */ | ||
498 | int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) | ||
499 | { | ||
500 | int err = -EIO; | ||
501 | unsigned long pfn; | ||
502 | unsigned long total = 0, mem = 0; | ||
503 | for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { | ||
504 | if (pfn_valid(pfn)) { | ||
505 | online_page(pfn_to_page(pfn)); | ||
506 | err = 0; | ||
507 | mem++; | ||
508 | } | ||
509 | total++; | ||
510 | } | ||
511 | if (!err) { | ||
512 | z->spanned_pages += total; | ||
513 | z->present_pages += mem; | ||
514 | z->zone_pgdat->node_spanned_pages += total; | ||
515 | z->zone_pgdat->node_present_pages += mem; | ||
516 | } | ||
517 | return err; | ||
518 | } | ||
519 | #endif | ||
520 | |||
521 | static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, | ||
522 | kcore_vsyscall; | ||
523 | |||
524 | void __init mem_init(void) | ||
525 | { | ||
526 | long codesize, reservedpages, datasize, initsize; | ||
527 | |||
528 | pci_iommu_alloc(); | ||
529 | |||
530 | /* clear the zero-page */ | ||
531 | memset(empty_zero_page, 0, PAGE_SIZE); | ||
532 | |||
533 | reservedpages = 0; | ||
534 | |||
535 | /* this will put all low memory onto the freelists */ | ||
536 | #ifdef CONFIG_NUMA | ||
537 | totalram_pages = numa_free_all_bootmem(); | ||
538 | #else | ||
539 | totalram_pages = free_all_bootmem(); | ||
540 | #endif | ||
541 | reservedpages = end_pfn - totalram_pages - | ||
542 | absent_pages_in_range(0, end_pfn); | ||
543 | |||
544 | after_bootmem = 1; | ||
545 | |||
546 | codesize = (unsigned long) &_etext - (unsigned long) &_text; | ||
547 | datasize = (unsigned long) &_edata - (unsigned long) &_etext; | ||
548 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | ||
549 | |||
550 | /* Register memory areas for /proc/kcore */ | ||
551 | kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | ||
552 | kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | ||
553 | VMALLOC_END-VMALLOC_START); | ||
554 | kclist_add(&kcore_kernel, &_stext, _end - _stext); | ||
555 | kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); | ||
556 | kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, | ||
557 | VSYSCALL_END - VSYSCALL_START); | ||
558 | |||
559 | printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", | ||
560 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | ||
561 | end_pfn << (PAGE_SHIFT-10), | ||
562 | codesize >> 10, | ||
563 | reservedpages << (PAGE_SHIFT-10), | ||
564 | datasize >> 10, | ||
565 | initsize >> 10); | ||
566 | } | ||
567 | |||
568 | void free_init_pages(char *what, unsigned long begin, unsigned long end) | ||
569 | { | ||
570 | unsigned long addr; | ||
571 | |||
572 | if (begin >= end) | ||
573 | return; | ||
574 | |||
575 | printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); | ||
576 | for (addr = begin; addr < end; addr += PAGE_SIZE) { | ||
577 | ClearPageReserved(virt_to_page(addr)); | ||
578 | init_page_count(virt_to_page(addr)); | ||
579 | memset((void *)(addr & ~(PAGE_SIZE-1)), | ||
580 | POISON_FREE_INITMEM, PAGE_SIZE); | ||
581 | if (addr >= __START_KERNEL_map) | ||
582 | change_page_attr_addr(addr, 1, __pgprot(0)); | ||
583 | free_page(addr); | ||
584 | totalram_pages++; | ||
585 | } | ||
586 | if (addr > __START_KERNEL_map) | ||
587 | global_flush_tlb(); | ||
588 | } | ||
589 | |||
590 | void free_initmem(void) | ||
591 | { | ||
592 | free_init_pages("unused kernel memory", | ||
593 | (unsigned long)(&__init_begin), | ||
594 | (unsigned long)(&__init_end)); | ||
595 | } | ||
596 | |||
597 | #ifdef CONFIG_DEBUG_RODATA | ||
598 | |||
599 | void mark_rodata_ro(void) | ||
600 | { | ||
601 | unsigned long start = (unsigned long)_stext, end; | ||
602 | |||
603 | #ifdef CONFIG_HOTPLUG_CPU | ||
604 | /* It must still be possible to apply SMP alternatives. */ | ||
605 | if (num_possible_cpus() > 1) | ||
606 | start = (unsigned long)_etext; | ||
607 | #endif | ||
608 | |||
609 | #ifdef CONFIG_KPROBES | ||
610 | start = (unsigned long)__start_rodata; | ||
611 | #endif | ||
612 | |||
613 | end = (unsigned long)__end_rodata; | ||
614 | start = (start + PAGE_SIZE - 1) & PAGE_MASK; | ||
615 | end &= PAGE_MASK; | ||
616 | if (end <= start) | ||
617 | return; | ||
618 | |||
619 | change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); | ||
620 | |||
621 | printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", | ||
622 | (end - start) >> 10); | ||
623 | |||
624 | /* | ||
625 | * change_page_attr_addr() requires a global_flush_tlb() call after it. | ||
626 | * We do this after the printk so that if something went wrong in the | ||
627 | * change, the printk gets out at least to give a better debug hint | ||
628 | * of who is the culprit. | ||
629 | */ | ||
630 | global_flush_tlb(); | ||
631 | } | ||
632 | #endif | ||
633 | |||
634 | #ifdef CONFIG_BLK_DEV_INITRD | ||
/* Release the initrd's pages, [@start, @end), to the page allocator. */
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
639 | #endif | ||
640 | |||
641 | void __init reserve_bootmem_generic(unsigned long phys, unsigned len) | ||
642 | { | ||
643 | #ifdef CONFIG_NUMA | ||
644 | int nid = phys_to_nid(phys); | ||
645 | #endif | ||
646 | unsigned long pfn = phys >> PAGE_SHIFT; | ||
647 | if (pfn >= end_pfn) { | ||
648 | /* This can happen with kdump kernels when accessing firmware | ||
649 | tables. */ | ||
650 | if (pfn < end_pfn_map) | ||
651 | return; | ||
652 | printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", | ||
653 | phys, len); | ||
654 | return; | ||
655 | } | ||
656 | |||
657 | /* Should check here against the e820 map to avoid double free */ | ||
658 | #ifdef CONFIG_NUMA | ||
659 | reserve_bootmem_node(NODE_DATA(nid), phys, len); | ||
660 | #else | ||
661 | reserve_bootmem(phys, len); | ||
662 | #endif | ||
663 | if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { | ||
664 | dma_reserve += len / PAGE_SIZE; | ||
665 | set_dma_reserve(dma_reserve); | ||
666 | } | ||
667 | } | ||
668 | |||
669 | int kern_addr_valid(unsigned long addr) | ||
670 | { | ||
671 | unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; | ||
672 | pgd_t *pgd; | ||
673 | pud_t *pud; | ||
674 | pmd_t *pmd; | ||
675 | pte_t *pte; | ||
676 | |||
677 | if (above != 0 && above != -1UL) | ||
678 | return 0; | ||
679 | |||
680 | pgd = pgd_offset_k(addr); | ||
681 | if (pgd_none(*pgd)) | ||
682 | return 0; | ||
683 | |||
684 | pud = pud_offset(pgd, addr); | ||
685 | if (pud_none(*pud)) | ||
686 | return 0; | ||
687 | |||
688 | pmd = pmd_offset(pud, addr); | ||
689 | if (pmd_none(*pmd)) | ||
690 | return 0; | ||
691 | if (pmd_large(*pmd)) | ||
692 | return pfn_valid(pmd_pfn(*pmd)); | ||
693 | |||
694 | pte = pte_offset_kernel(pmd, addr); | ||
695 | if (pte_none(*pte)) | ||
696 | return 0; | ||
697 | return pfn_valid(pte_pfn(*pte)); | ||
698 | } | ||
699 | |||
700 | /* A pseudo VMA to allow ptrace access for the vsyscall page. This only | ||
701 | covers the 64bit vsyscall page now. 32bit has a real VMA now and does | ||
702 | not need special handling anymore. */ | ||
703 | |||
704 | static struct vm_area_struct gate_vma = { | ||
705 | .vm_start = VSYSCALL_START, | ||
706 | .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), | ||
707 | .vm_page_prot = PAGE_READONLY_EXEC, | ||
708 | .vm_flags = VM_READ | VM_EXEC | ||
709 | }; | ||
710 | |||
711 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | ||
712 | { | ||
713 | #ifdef CONFIG_IA32_EMULATION | ||
714 | if (test_tsk_thread_flag(tsk, TIF_IA32)) | ||
715 | return NULL; | ||
716 | #endif | ||
717 | return &gate_vma; | ||
718 | } | ||
719 | |||
720 | int in_gate_area(struct task_struct *task, unsigned long addr) | ||
721 | { | ||
722 | struct vm_area_struct *vma = get_gate_vma(task); | ||
723 | if (!vma) | ||
724 | return 0; | ||
725 | return (addr >= vma->vm_start) && (addr < vma->vm_end); | ||
726 | } | ||
727 | |||
728 | /* Use this when you have no reliable task/vma, typically from interrupt | ||
729 | * context. It is less reliable than using the task's vma and may give | ||
730 | * false positives. | ||
731 | */ | ||
732 | int in_gate_area_no_task(unsigned long addr) | ||
733 | { | ||
734 | return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); | ||
735 | } | ||
736 | |||
737 | void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) | ||
738 | { | ||
739 | return __alloc_bootmem_core(pgdat->bdata, size, | ||
740 | SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); | ||
741 | } | ||
742 | |||
743 | const char *arch_vma_name(struct vm_area_struct *vma) | ||
744 | { | ||
745 | if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) | ||
746 | return "[vdso]"; | ||
747 | if (vma == &gate_vma) | ||
748 | return "[vsyscall]"; | ||
749 | return NULL; | ||
750 | } | ||
diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c new file mode 100644 index 000000000000..6cac90aa5032 --- /dev/null +++ b/arch/x86/mm/ioremap_64.c | |||
@@ -0,0 +1,210 @@ | |||
1 | /* | ||
2 | * arch/x86_64/mm/ioremap.c | ||
3 | * | ||
4 | * Re-map IO memory to kernel address space so that we can access it. | ||
5 | * This is needed for high PCI addresses that aren't mapped in the | ||
6 | * 640k-1MB IO memory area on PC's | ||
7 | * | ||
8 | * (C) Copyright 1995 1996 Linus Torvalds | ||
9 | */ | ||
10 | |||
11 | #include <linux/vmalloc.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/io.h> | ||
16 | |||
17 | #include <asm/pgalloc.h> | ||
18 | #include <asm/fixmap.h> | ||
19 | #include <asm/tlbflush.h> | ||
20 | #include <asm/cacheflush.h> | ||
21 | #include <asm/proto.h> | ||
22 | |||
23 | unsigned long __phys_addr(unsigned long x) | ||
24 | { | ||
25 | if (x >= __START_KERNEL_map) | ||
26 | return x - __START_KERNEL_map + phys_base; | ||
27 | return x - PAGE_OFFSET; | ||
28 | } | ||
29 | EXPORT_SYMBOL(__phys_addr); | ||
30 | |||
31 | #define ISA_START_ADDRESS 0xa0000 | ||
32 | #define ISA_END_ADDRESS 0x100000 | ||
33 | |||
34 | /* | ||
35 | * Fix up the linear direct mapping of the kernel to avoid cache attribute | ||
36 | * conflicts. | ||
37 | */ | ||
38 | static int | ||
39 | ioremap_change_attr(unsigned long phys_addr, unsigned long size, | ||
40 | unsigned long flags) | ||
41 | { | ||
42 | int err = 0; | ||
43 | if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { | ||
44 | unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
45 | unsigned long vaddr = (unsigned long) __va(phys_addr); | ||
46 | |||
47 | /* | ||
48 | * Must use a address here and not struct page because the phys addr | ||
49 | * can be a in hole between nodes and not have an memmap entry. | ||
50 | */ | ||
51 | err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags)); | ||
52 | if (!err) | ||
53 | global_flush_tlb(); | ||
54 | } | ||
55 | return err; | ||
56 | } | ||
57 | |||
58 | /* | ||
59 | * Generic mapping function | ||
60 | */ | ||
61 | |||
62 | /* | ||
63 | * Remap an arbitrary physical address space into the kernel virtual | ||
64 | * address space. Needed when the kernel wants to access high addresses | ||
65 | * directly. | ||
66 | * | ||
67 | * NOTE! We need to allow non-page-aligned mappings too: we will obviously | ||
68 | * have to convert them into an offset in a page-aligned mapping, but the | ||
69 | * caller shouldn't need to know that small detail. | ||
70 | */ | ||
71 | void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) | ||
72 | { | ||
73 | void * addr; | ||
74 | struct vm_struct * area; | ||
75 | unsigned long offset, last_addr; | ||
76 | pgprot_t pgprot; | ||
77 | |||
78 | /* Don't allow wraparound or zero size */ | ||
79 | last_addr = phys_addr + size - 1; | ||
80 | if (!size || last_addr < phys_addr) | ||
81 | return NULL; | ||
82 | |||
83 | /* | ||
84 | * Don't remap the low PCI/ISA area, it's always mapped.. | ||
85 | */ | ||
86 | if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | ||
87 | return (__force void __iomem *)phys_to_virt(phys_addr); | ||
88 | |||
89 | #ifdef CONFIG_FLATMEM | ||
90 | /* | ||
91 | * Don't allow anybody to remap normal RAM that we're using.. | ||
92 | */ | ||
93 | if (last_addr < virt_to_phys(high_memory)) { | ||
94 | char *t_addr, *t_end; | ||
95 | struct page *page; | ||
96 | |||
97 | t_addr = __va(phys_addr); | ||
98 | t_end = t_addr + (size - 1); | ||
99 | |||
100 | for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) | ||
101 | if(!PageReserved(page)) | ||
102 | return NULL; | ||
103 | } | ||
104 | #endif | ||
105 | |||
106 | pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL | ||
107 | | _PAGE_DIRTY | _PAGE_ACCESSED | flags); | ||
108 | /* | ||
109 | * Mappings have to be page-aligned | ||
110 | */ | ||
111 | offset = phys_addr & ~PAGE_MASK; | ||
112 | phys_addr &= PAGE_MASK; | ||
113 | size = PAGE_ALIGN(last_addr+1) - phys_addr; | ||
114 | |||
115 | /* | ||
116 | * Ok, go for it.. | ||
117 | */ | ||
118 | area = get_vm_area(size, VM_IOREMAP | (flags << 20)); | ||
119 | if (!area) | ||
120 | return NULL; | ||
121 | area->phys_addr = phys_addr; | ||
122 | addr = area->addr; | ||
123 | if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, | ||
124 | phys_addr, pgprot)) { | ||
125 | remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); | ||
126 | return NULL; | ||
127 | } | ||
128 | if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) { | ||
129 | area->flags &= 0xffffff; | ||
130 | vunmap(addr); | ||
131 | return NULL; | ||
132 | } | ||
133 | return (__force void __iomem *) (offset + (char *)addr); | ||
134 | } | ||
135 | EXPORT_SYMBOL(__ioremap); | ||
136 | |||
137 | /** | ||
138 | * ioremap_nocache - map bus memory into CPU space | ||
139 | * @offset: bus address of the memory | ||
140 | * @size: size of the resource to map | ||
141 | * | ||
142 | * ioremap_nocache performs a platform specific sequence of operations to | ||
143 | * make bus memory CPU accessible via the readb/readw/readl/writeb/ | ||
144 | * writew/writel functions and the other mmio helpers. The returned | ||
145 | * address is not guaranteed to be usable directly as a virtual | ||
146 | * address. | ||
147 | * | ||
148 | * This version of ioremap ensures that the memory is marked uncachable | ||
149 | * on the CPU as well as honouring existing caching rules from things like | ||
150 | * the PCI bus. Note that there are other caches and buffers on many | ||
151 | * busses. In particular driver authors should read up on PCI writes | ||
152 | * | ||
153 | * It's useful if some control registers are in such an area and | ||
154 | * write combining or read caching is not desirable: | ||
155 | * | ||
156 | * Must be freed with iounmap. | ||
157 | */ | ||
158 | |||
159 | void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) | ||
160 | { | ||
161 | return __ioremap(phys_addr, size, _PAGE_PCD); | ||
162 | } | ||
163 | EXPORT_SYMBOL(ioremap_nocache); | ||
164 | |||
165 | /** | ||
166 | * iounmap - Free a IO remapping | ||
167 | * @addr: virtual address from ioremap_* | ||
168 | * | ||
169 | * Caller must ensure there is only one unmapping for the same pointer. | ||
170 | */ | ||
171 | void iounmap(volatile void __iomem *addr) | ||
172 | { | ||
173 | struct vm_struct *p, *o; | ||
174 | |||
175 | if (addr <= high_memory) | ||
176 | return; | ||
177 | if (addr >= phys_to_virt(ISA_START_ADDRESS) && | ||
178 | addr < phys_to_virt(ISA_END_ADDRESS)) | ||
179 | return; | ||
180 | |||
181 | addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); | ||
182 | /* Use the vm area unlocked, assuming the caller | ||
183 | ensures there isn't another iounmap for the same address | ||
184 | in parallel. Reuse of the virtual address is prevented by | ||
185 | leaving it in the global lists until we're done with it. | ||
186 | cpa takes care of the direct mappings. */ | ||
187 | read_lock(&vmlist_lock); | ||
188 | for (p = vmlist; p; p = p->next) { | ||
189 | if (p->addr == addr) | ||
190 | break; | ||
191 | } | ||
192 | read_unlock(&vmlist_lock); | ||
193 | |||
194 | if (!p) { | ||
195 | printk("iounmap: bad address %p\n", addr); | ||
196 | dump_stack(); | ||
197 | return; | ||
198 | } | ||
199 | |||
200 | /* Reset the direct mapping. Can block */ | ||
201 | if (p->flags >> 20) | ||
202 | ioremap_change_attr(p->phys_addr, p->size, 0); | ||
203 | |||
204 | /* Finally remove it */ | ||
205 | o = remove_vm_area((void *)addr); | ||
206 | BUG_ON(p != o || o == NULL); | ||
207 | kfree(p); | ||
208 | } | ||
209 | EXPORT_SYMBOL(iounmap); | ||
210 | |||
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c new file mode 100644 index 000000000000..a96006f7ae0c --- /dev/null +++ b/arch/x86/mm/k8topology_64.c | |||
@@ -0,0 +1,182 @@ | |||
1 | /* | ||
2 | * AMD K8 NUMA support. | ||
3 | * Discover the memory map and associated nodes. | ||
4 | * | ||
5 | * This version reads it directly from the K8 northbridge. | ||
6 | * | ||
7 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
8 | */ | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/string.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/nodemask.h> | ||
14 | #include <asm/io.h> | ||
15 | #include <linux/pci_ids.h> | ||
16 | #include <asm/types.h> | ||
17 | #include <asm/mmzone.h> | ||
18 | #include <asm/proto.h> | ||
19 | #include <asm/e820.h> | ||
20 | #include <asm/pci-direct.h> | ||
21 | #include <asm/numa.h> | ||
22 | |||
23 | static __init int find_northbridge(void) | ||
24 | { | ||
25 | int num; | ||
26 | |||
27 | for (num = 0; num < 32; num++) { | ||
28 | u32 header; | ||
29 | |||
30 | header = read_pci_config(0, num, 0, 0x00); | ||
31 | if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) | ||
32 | continue; | ||
33 | |||
34 | header = read_pci_config(0, num, 1, 0x00); | ||
35 | if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) | ||
36 | continue; | ||
37 | return num; | ||
38 | } | ||
39 | |||
40 | return -1; | ||
41 | } | ||
42 | |||
43 | int __init k8_scan_nodes(unsigned long start, unsigned long end) | ||
44 | { | ||
45 | unsigned long prevbase; | ||
46 | struct bootnode nodes[8]; | ||
47 | int nodeid, i, j, nb; | ||
48 | unsigned char nodeids[8]; | ||
49 | int found = 0; | ||
50 | u32 reg; | ||
51 | unsigned numnodes; | ||
52 | unsigned num_cores; | ||
53 | |||
54 | if (!early_pci_allowed()) | ||
55 | return -1; | ||
56 | |||
57 | nb = find_northbridge(); | ||
58 | if (nb < 0) | ||
59 | return nb; | ||
60 | |||
61 | printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); | ||
62 | |||
63 | num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; | ||
64 | printk(KERN_INFO "CPU has %d num_cores\n", num_cores); | ||
65 | |||
66 | reg = read_pci_config(0, nb, 0, 0x60); | ||
67 | numnodes = ((reg >> 4) & 0xF) + 1; | ||
68 | if (numnodes <= 1) | ||
69 | return -1; | ||
70 | |||
71 | printk(KERN_INFO "Number of nodes %d\n", numnodes); | ||
72 | |||
73 | memset(&nodes,0,sizeof(nodes)); | ||
74 | prevbase = 0; | ||
75 | for (i = 0; i < 8; i++) { | ||
76 | unsigned long base,limit; | ||
77 | u32 nodeid; | ||
78 | |||
79 | base = read_pci_config(0, nb, 1, 0x40 + i*8); | ||
80 | limit = read_pci_config(0, nb, 1, 0x44 + i*8); | ||
81 | |||
82 | nodeid = limit & 7; | ||
83 | nodeids[i] = nodeid; | ||
84 | if ((base & 3) == 0) { | ||
85 | if (i < numnodes) | ||
86 | printk("Skipping disabled node %d\n", i); | ||
87 | continue; | ||
88 | } | ||
89 | if (nodeid >= numnodes) { | ||
90 | printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, | ||
91 | base, limit); | ||
92 | continue; | ||
93 | } | ||
94 | |||
95 | if (!limit) { | ||
96 | printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, | ||
97 | base); | ||
98 | continue; | ||
99 | } | ||
100 | if ((base >> 8) & 3 || (limit >> 8) & 3) { | ||
101 | printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", | ||
102 | nodeid, (base>>8)&3, (limit>>8) & 3); | ||
103 | return -1; | ||
104 | } | ||
105 | if (node_isset(nodeid, node_possible_map)) { | ||
106 | printk(KERN_INFO "Node %d already present. Skipping\n", | ||
107 | nodeid); | ||
108 | continue; | ||
109 | } | ||
110 | |||
111 | limit >>= 16; | ||
112 | limit <<= 24; | ||
113 | limit |= (1<<24)-1; | ||
114 | limit++; | ||
115 | |||
116 | if (limit > end_pfn << PAGE_SHIFT) | ||
117 | limit = end_pfn << PAGE_SHIFT; | ||
118 | if (limit <= base) | ||
119 | continue; | ||
120 | |||
121 | base >>= 16; | ||
122 | base <<= 24; | ||
123 | |||
124 | if (base < start) | ||
125 | base = start; | ||
126 | if (limit > end) | ||
127 | limit = end; | ||
128 | if (limit == base) { | ||
129 | printk(KERN_ERR "Empty node %d\n", nodeid); | ||
130 | continue; | ||
131 | } | ||
132 | if (limit < base) { | ||
133 | printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", | ||
134 | nodeid, base, limit); | ||
135 | continue; | ||
136 | } | ||
137 | |||
138 | /* Could sort here, but pun for now. Should not happen anyroads. */ | ||
139 | if (prevbase > base) { | ||
140 | printk(KERN_ERR "Node map not sorted %lx,%lx\n", | ||
141 | prevbase,base); | ||
142 | return -1; | ||
143 | } | ||
144 | |||
145 | printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", | ||
146 | nodeid, base, limit); | ||
147 | |||
148 | found++; | ||
149 | |||
150 | nodes[nodeid].start = base; | ||
151 | nodes[nodeid].end = limit; | ||
152 | e820_register_active_regions(nodeid, | ||
153 | nodes[nodeid].start >> PAGE_SHIFT, | ||
154 | nodes[nodeid].end >> PAGE_SHIFT); | ||
155 | |||
156 | prevbase = base; | ||
157 | |||
158 | node_set(nodeid, node_possible_map); | ||
159 | } | ||
160 | |||
161 | if (!found) | ||
162 | return -1; | ||
163 | |||
164 | memnode_shift = compute_hash_shift(nodes, 8); | ||
165 | if (memnode_shift < 0) { | ||
166 | printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); | ||
167 | return -1; | ||
168 | } | ||
169 | printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); | ||
170 | |||
171 | for (i = 0; i < 8; i++) { | ||
172 | if (nodes[i].start != nodes[i].end) { | ||
173 | nodeid = nodeids[i]; | ||
174 | for (j = 0; j < num_cores; j++) | ||
175 | apicid_to_node[(nodeid * num_cores) + j] = i; | ||
176 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
177 | } | ||
178 | } | ||
179 | |||
180 | numa_init_array(); | ||
181 | return 0; | ||
182 | } | ||
diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c new file mode 100644 index 000000000000..80bba0dc000e --- /dev/null +++ b/arch/x86/mm/mmap_64.c | |||
@@ -0,0 +1,29 @@ | |||
1 | /* Copyright 2005 Andi Kleen, SuSE Labs. | ||
2 | * Licensed under GPL, v.2 | ||
3 | */ | ||
4 | #include <linux/mm.h> | ||
5 | #include <linux/sched.h> | ||
6 | #include <linux/random.h> | ||
7 | #include <asm/ia32.h> | ||
8 | |||
9 | /* Notebook: move the mmap code from sys_x86_64.c over here. */ | ||
10 | |||
11 | void arch_pick_mmap_layout(struct mm_struct *mm) | ||
12 | { | ||
13 | #ifdef CONFIG_IA32_EMULATION | ||
14 | if (current_thread_info()->flags & _TIF_IA32) | ||
15 | return ia32_pick_mmap_layout(mm); | ||
16 | #endif | ||
17 | mm->mmap_base = TASK_UNMAPPED_BASE; | ||
18 | if (current->flags & PF_RANDOMIZE) { | ||
19 | /* Add 28bit randomness which is about 40bits of address space | ||
20 | because mmap base has to be page aligned. | ||
21 | or ~1/128 of the total user VM | ||
22 | (total user address space is 47bits) */ | ||
23 | unsigned rnd = get_random_int() & 0xfffffff; | ||
24 | mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT; | ||
25 | } | ||
26 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
27 | mm->unmap_area = arch_unmap_area; | ||
28 | } | ||
29 | |||
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c new file mode 100644 index 000000000000..6da235522269 --- /dev/null +++ b/arch/x86/mm/numa_64.c | |||
@@ -0,0 +1,648 @@ | |||
1 | /* | ||
2 | * Generic VM initialization for x86-64 NUMA setups. | ||
3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | */ | ||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/bootmem.h> | ||
10 | #include <linux/mmzone.h> | ||
11 | #include <linux/ctype.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/nodemask.h> | ||
14 | |||
15 | #include <asm/e820.h> | ||
16 | #include <asm/proto.h> | ||
17 | #include <asm/dma.h> | ||
18 | #include <asm/numa.h> | ||
19 | #include <asm/acpi.h> | ||
20 | |||
21 | #ifndef Dprintk | ||
22 | #define Dprintk(x...) | ||
23 | #endif | ||
24 | |||
25 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | ||
26 | bootmem_data_t plat_node_bdata[MAX_NUMNODES]; | ||
27 | |||
28 | struct memnode memnode; | ||
29 | |||
30 | unsigned char cpu_to_node[NR_CPUS] __read_mostly = { | ||
31 | [0 ... NR_CPUS-1] = NUMA_NO_NODE | ||
32 | }; | ||
33 | unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | ||
34 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
35 | }; | ||
36 | cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; | ||
37 | |||
38 | int numa_off __initdata; | ||
39 | unsigned long __initdata nodemap_addr; | ||
40 | unsigned long __initdata nodemap_size; | ||
41 | |||
42 | |||
43 | /* | ||
44 | * Given a shift value, try to populate memnodemap[] | ||
45 | * Returns : | ||
46 | * 1 if OK | ||
47 | * 0 if memnodmap[] too small (of shift too small) | ||
48 | * -1 if node overlap or lost ram (shift too big) | ||
49 | */ | ||
50 | static int __init | ||
51 | populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) | ||
52 | { | ||
53 | int i; | ||
54 | int res = -1; | ||
55 | unsigned long addr, end; | ||
56 | |||
57 | memset(memnodemap, 0xff, memnodemapsize); | ||
58 | for (i = 0; i < numnodes; i++) { | ||
59 | addr = nodes[i].start; | ||
60 | end = nodes[i].end; | ||
61 | if (addr >= end) | ||
62 | continue; | ||
63 | if ((end >> shift) >= memnodemapsize) | ||
64 | return 0; | ||
65 | do { | ||
66 | if (memnodemap[addr >> shift] != 0xff) | ||
67 | return -1; | ||
68 | memnodemap[addr >> shift] = i; | ||
69 | addr += (1UL << shift); | ||
70 | } while (addr < end); | ||
71 | res = 1; | ||
72 | } | ||
73 | return res; | ||
74 | } | ||
75 | |||
76 | static int __init allocate_cachealigned_memnodemap(void) | ||
77 | { | ||
78 | unsigned long pad, pad_addr; | ||
79 | |||
80 | memnodemap = memnode.embedded_map; | ||
81 | if (memnodemapsize <= 48) | ||
82 | return 0; | ||
83 | |||
84 | pad = L1_CACHE_BYTES - 1; | ||
85 | pad_addr = 0x8000; | ||
86 | nodemap_size = pad + memnodemapsize; | ||
87 | nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, | ||
88 | nodemap_size); | ||
89 | if (nodemap_addr == -1UL) { | ||
90 | printk(KERN_ERR | ||
91 | "NUMA: Unable to allocate Memory to Node hash map\n"); | ||
92 | nodemap_addr = nodemap_size = 0; | ||
93 | return -1; | ||
94 | } | ||
95 | pad_addr = (nodemap_addr + pad) & ~pad; | ||
96 | memnodemap = phys_to_virt(pad_addr); | ||
97 | |||
98 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", | ||
99 | nodemap_addr, nodemap_addr + nodemap_size); | ||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * The LSB of all start and end addresses in the node map is the value of the | ||
105 | * maximum possible shift. | ||
106 | */ | ||
107 | static int __init | ||
108 | extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) | ||
109 | { | ||
110 | int i, nodes_used = 0; | ||
111 | unsigned long start, end; | ||
112 | unsigned long bitfield = 0, memtop = 0; | ||
113 | |||
114 | for (i = 0; i < numnodes; i++) { | ||
115 | start = nodes[i].start; | ||
116 | end = nodes[i].end; | ||
117 | if (start >= end) | ||
118 | continue; | ||
119 | bitfield |= start; | ||
120 | nodes_used++; | ||
121 | if (end > memtop) | ||
122 | memtop = end; | ||
123 | } | ||
124 | if (nodes_used <= 1) | ||
125 | i = 63; | ||
126 | else | ||
127 | i = find_first_bit(&bitfield, sizeof(unsigned long)*8); | ||
128 | memnodemapsize = (memtop >> i)+1; | ||
129 | return i; | ||
130 | } | ||
131 | |||
132 | int __init compute_hash_shift(struct bootnode *nodes, int numnodes) | ||
133 | { | ||
134 | int shift; | ||
135 | |||
136 | shift = extract_lsb_from_nodes(nodes, numnodes); | ||
137 | if (allocate_cachealigned_memnodemap()) | ||
138 | return -1; | ||
139 | printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", | ||
140 | shift); | ||
141 | |||
142 | if (populate_memnodemap(nodes, numnodes, shift) != 1) { | ||
143 | printk(KERN_INFO | ||
144 | "Your memory is not aligned you need to rebuild your kernel " | ||
145 | "with a bigger NODEMAPSIZE shift=%d\n", | ||
146 | shift); | ||
147 | return -1; | ||
148 | } | ||
149 | return shift; | ||
150 | } | ||
151 | |||
#ifdef CONFIG_SPARSEMEM
/* Return phys_to_nid() of the physical address of page frame @pfn. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
158 | |||
159 | static void * __init | ||
160 | early_node_mem(int nodeid, unsigned long start, unsigned long end, | ||
161 | unsigned long size) | ||
162 | { | ||
163 | unsigned long mem = find_e820_area(start, end, size); | ||
164 | void *ptr; | ||
165 | if (mem != -1L) | ||
166 | return __va(mem); | ||
167 | ptr = __alloc_bootmem_nopanic(size, | ||
168 | SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); | ||
169 | if (ptr == 0) { | ||
170 | printk(KERN_ERR "Cannot find %lu bytes in node %d\n", | ||
171 | size, nodeid); | ||
172 | return NULL; | ||
173 | } | ||
174 | return ptr; | ||
175 | } | ||
176 | |||
177 | /* Initialize bootmem allocator for a node */ | ||
178 | void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | ||
179 | { | ||
180 | unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; | ||
181 | unsigned long nodedata_phys; | ||
182 | void *bootmap; | ||
183 | const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); | ||
184 | |||
185 | start = round_up(start, ZONE_ALIGN); | ||
186 | |||
187 | printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); | ||
188 | |||
189 | start_pfn = start >> PAGE_SHIFT; | ||
190 | end_pfn = end >> PAGE_SHIFT; | ||
191 | |||
192 | node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); | ||
193 | if (node_data[nodeid] == NULL) | ||
194 | return; | ||
195 | nodedata_phys = __pa(node_data[nodeid]); | ||
196 | |||
197 | memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); | ||
198 | NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; | ||
199 | NODE_DATA(nodeid)->node_start_pfn = start_pfn; | ||
200 | NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; | ||
201 | |||
202 | /* Find a place for the bootmem map */ | ||
203 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); | ||
204 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); | ||
205 | bootmap = early_node_mem(nodeid, bootmap_start, end, | ||
206 | bootmap_pages<<PAGE_SHIFT); | ||
207 | if (bootmap == NULL) { | ||
208 | if (nodedata_phys < start || nodedata_phys >= end) | ||
209 | free_bootmem((unsigned long)node_data[nodeid],pgdat_size); | ||
210 | node_data[nodeid] = NULL; | ||
211 | return; | ||
212 | } | ||
213 | bootmap_start = __pa(bootmap); | ||
214 | Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); | ||
215 | |||
216 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | ||
217 | bootmap_start >> PAGE_SHIFT, | ||
218 | start_pfn, end_pfn); | ||
219 | |||
220 | free_bootmem_with_active_regions(nodeid, end); | ||
221 | |||
222 | reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); | ||
223 | reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); | ||
224 | #ifdef CONFIG_ACPI_NUMA | ||
225 | srat_reserve_add_area(nodeid); | ||
226 | #endif | ||
227 | node_set_online(nodeid); | ||
228 | } | ||
229 | |||
230 | /* Initialize final allocator for a zone */ | ||
231 | void __init setup_node_zones(int nodeid) | ||
232 | { | ||
233 | unsigned long start_pfn, end_pfn, memmapsize, limit; | ||
234 | |||
235 | start_pfn = node_start_pfn(nodeid); | ||
236 | end_pfn = node_end_pfn(nodeid); | ||
237 | |||
238 | Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n", | ||
239 | nodeid, start_pfn, end_pfn); | ||
240 | |||
241 | /* Try to allocate mem_map at end to not fill up precious <4GB | ||
242 | memory. */ | ||
243 | memmapsize = sizeof(struct page) * (end_pfn-start_pfn); | ||
244 | limit = end_pfn << PAGE_SHIFT; | ||
245 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
246 | NODE_DATA(nodeid)->node_mem_map = | ||
247 | __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, | ||
248 | memmapsize, SMP_CACHE_BYTES, | ||
249 | round_down(limit - memmapsize, PAGE_SIZE), | ||
250 | limit); | ||
251 | #endif | ||
252 | } | ||
253 | |||
254 | void __init numa_init_array(void) | ||
255 | { | ||
256 | int rr, i; | ||
257 | /* There are unfortunately some poorly designed mainboards around | ||
258 | that only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
259 | mapping. To avoid this fill in the mapping for all possible | ||
260 | CPUs, as the number of CPUs is not known yet. | ||
261 | We round robin the existing nodes. */ | ||
262 | rr = first_node(node_online_map); | ||
263 | for (i = 0; i < NR_CPUS; i++) { | ||
264 | if (cpu_to_node[i] != NUMA_NO_NODE) | ||
265 | continue; | ||
266 | numa_set_node(i, rr); | ||
267 | rr = next_node(rr, node_online_map); | ||
268 | if (rr == MAX_NUMNODES) | ||
269 | rr = first_node(node_online_map); | ||
270 | } | ||
271 | |||
272 | } | ||
273 | |||
274 | #ifdef CONFIG_NUMA_EMU | ||
275 | /* Numa emulation */ | ||
276 | char *cmdline __initdata; | ||
277 | |||
278 | /* | ||
279 | * Setups up nid to range from addr to addr + size. If the end boundary is | ||
280 | * greater than max_addr, then max_addr is used instead. The return value is 0 | ||
281 | * if there is additional memory left for allocation past addr and -1 otherwise. | ||
282 | * addr is adjusted to be at the end of the node. | ||
283 | */ | ||
284 | static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, | ||
285 | u64 size, u64 max_addr) | ||
286 | { | ||
287 | int ret = 0; | ||
288 | nodes[nid].start = *addr; | ||
289 | *addr += size; | ||
290 | if (*addr >= max_addr) { | ||
291 | *addr = max_addr; | ||
292 | ret = -1; | ||
293 | } | ||
294 | nodes[nid].end = *addr; | ||
295 | node_set(nid, node_possible_map); | ||
296 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, | ||
297 | nodes[nid].start, nodes[nid].end, | ||
298 | (nodes[nid].end - nodes[nid].start) >> 20); | ||
299 | return ret; | ||
300 | } | ||
301 | |||
302 | /* | ||
303 | * Splits num_nodes nodes up equally starting at node_start. The return value | ||
304 | * is the number of nodes split up and addr is adjusted to be at the end of the | ||
305 | * last node allocated. | ||
306 | */ | ||
307 | static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, | ||
308 | u64 max_addr, int node_start, | ||
309 | int num_nodes) | ||
310 | { | ||
311 | unsigned int big; | ||
312 | u64 size; | ||
313 | int i; | ||
314 | |||
315 | if (num_nodes <= 0) | ||
316 | return -1; | ||
317 | if (num_nodes > MAX_NUMNODES) | ||
318 | num_nodes = MAX_NUMNODES; | ||
319 | size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / | ||
320 | num_nodes; | ||
321 | /* | ||
322 | * Calculate the number of big nodes that can be allocated as a result | ||
323 | * of consolidating the leftovers. | ||
324 | */ | ||
325 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / | ||
326 | FAKE_NODE_MIN_SIZE; | ||
327 | |||
328 | /* Round down to nearest FAKE_NODE_MIN_SIZE. */ | ||
329 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
330 | if (!size) { | ||
331 | printk(KERN_ERR "Not enough memory for each node. " | ||
332 | "NUMA emulation disabled.\n"); | ||
333 | return -1; | ||
334 | } | ||
335 | |||
336 | for (i = node_start; i < num_nodes + node_start; i++) { | ||
337 | u64 end = *addr + size; | ||
338 | if (i < big) | ||
339 | end += FAKE_NODE_MIN_SIZE; | ||
340 | /* | ||
341 | * The final node can have the remaining system RAM. Other | ||
342 | * nodes receive roughly the same amount of available pages. | ||
343 | */ | ||
344 | if (i == num_nodes + node_start - 1) | ||
345 | end = max_addr; | ||
346 | else | ||
347 | while (end - *addr - e820_hole_size(*addr, end) < | ||
348 | size) { | ||
349 | end += FAKE_NODE_MIN_SIZE; | ||
350 | if (end > max_addr) { | ||
351 | end = max_addr; | ||
352 | break; | ||
353 | } | ||
354 | } | ||
355 | if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) | ||
356 | break; | ||
357 | } | ||
358 | return i - node_start + 1; | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * Splits the remaining system RAM into chunks of size. The remaining memory is | ||
363 | * always assigned to a final node and can be asymmetric. Returns the number of | ||
364 | * nodes split. | ||
365 | */ | ||
366 | static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, | ||
367 | u64 max_addr, int node_start, u64 size) | ||
368 | { | ||
369 | int i = node_start; | ||
370 | size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; | ||
371 | while (!setup_node_range(i++, nodes, addr, size, max_addr)) | ||
372 | ; | ||
373 | return i - node_start; | ||
374 | } | ||
375 | |||
376 | /* | ||
377 | * Sets up the system RAM area from start_pfn to end_pfn according to the | ||
378 | * numa=fake command-line option. | ||
379 | */ | ||
380 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | ||
381 | { | ||
382 | struct bootnode nodes[MAX_NUMNODES]; | ||
383 | u64 addr = start_pfn << PAGE_SHIFT; | ||
384 | u64 max_addr = end_pfn << PAGE_SHIFT; | ||
385 | int num_nodes = 0; | ||
386 | int coeff_flag; | ||
387 | int coeff = -1; | ||
388 | int num = 0; | ||
389 | u64 size; | ||
390 | int i; | ||
391 | |||
392 | memset(&nodes, 0, sizeof(nodes)); | ||
393 | /* | ||
394 | * If the numa=fake command-line is just a single number N, split the | ||
395 | * system RAM into N fake nodes. | ||
396 | */ | ||
397 | if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { | ||
398 | num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, | ||
399 | simple_strtol(cmdline, NULL, 0)); | ||
400 | if (num_nodes < 0) | ||
401 | return num_nodes; | ||
402 | goto out; | ||
403 | } | ||
404 | |||
405 | /* Parse the command line. */ | ||
406 | for (coeff_flag = 0; ; cmdline++) { | ||
407 | if (*cmdline && isdigit(*cmdline)) { | ||
408 | num = num * 10 + *cmdline - '0'; | ||
409 | continue; | ||
410 | } | ||
411 | if (*cmdline == '*') { | ||
412 | if (num > 0) | ||
413 | coeff = num; | ||
414 | coeff_flag = 1; | ||
415 | } | ||
416 | if (!*cmdline || *cmdline == ',') { | ||
417 | if (!coeff_flag) | ||
418 | coeff = 1; | ||
419 | /* | ||
420 | * Round down to the nearest FAKE_NODE_MIN_SIZE. | ||
421 | * Command-line coefficients are in megabytes. | ||
422 | */ | ||
423 | size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; | ||
424 | if (size) | ||
425 | for (i = 0; i < coeff; i++, num_nodes++) | ||
426 | if (setup_node_range(num_nodes, nodes, | ||
427 | &addr, size, max_addr) < 0) | ||
428 | goto done; | ||
429 | if (!*cmdline) | ||
430 | break; | ||
431 | coeff_flag = 0; | ||
432 | coeff = -1; | ||
433 | } | ||
434 | num = 0; | ||
435 | } | ||
436 | done: | ||
437 | if (!num_nodes) | ||
438 | return -1; | ||
439 | /* Fill remainder of system RAM, if appropriate. */ | ||
440 | if (addr < max_addr) { | ||
441 | if (coeff_flag && coeff < 0) { | ||
442 | /* Split remaining nodes into num-sized chunks */ | ||
443 | num_nodes += split_nodes_by_size(nodes, &addr, max_addr, | ||
444 | num_nodes, num); | ||
445 | goto out; | ||
446 | } | ||
447 | switch (*(cmdline - 1)) { | ||
448 | case '*': | ||
449 | /* Split remaining nodes into coeff chunks */ | ||
450 | if (coeff <= 0) | ||
451 | break; | ||
452 | num_nodes += split_nodes_equally(nodes, &addr, max_addr, | ||
453 | num_nodes, coeff); | ||
454 | break; | ||
455 | case ',': | ||
456 | /* Do not allocate remaining system RAM */ | ||
457 | break; | ||
458 | default: | ||
459 | /* Give one final node */ | ||
460 | setup_node_range(num_nodes, nodes, &addr, | ||
461 | max_addr - addr, max_addr); | ||
462 | num_nodes++; | ||
463 | } | ||
464 | } | ||
465 | out: | ||
466 | memnode_shift = compute_hash_shift(nodes, num_nodes); | ||
467 | if (memnode_shift < 0) { | ||
468 | memnode_shift = 0; | ||
469 | printk(KERN_ERR "No NUMA hash function found. NUMA emulation " | ||
470 | "disabled.\n"); | ||
471 | return -1; | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * We need to vacate all active ranges that may have been registered by | ||
476 | * SRAT and set acpi_numa to -1 so that srat_disabled() always returns | ||
477 | * true. NUMA emulation has succeeded so we will not scan ACPI nodes. | ||
478 | */ | ||
479 | remove_all_active_ranges(); | ||
480 | #ifdef CONFIG_ACPI_NUMA | ||
481 | acpi_numa = -1; | ||
482 | #endif | ||
483 | for_each_node_mask(i, node_possible_map) { | ||
484 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, | ||
485 | nodes[i].end >> PAGE_SHIFT); | ||
486 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
487 | } | ||
488 | acpi_fake_nodes(nodes, num_nodes); | ||
489 | numa_init_array(); | ||
490 | return 0; | ||
491 | } | ||
492 | #endif /* CONFIG_NUMA_EMU */ | ||
493 | |||
494 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | ||
495 | { | ||
496 | int i; | ||
497 | |||
498 | nodes_clear(node_possible_map); | ||
499 | |||
500 | #ifdef CONFIG_NUMA_EMU | ||
501 | if (cmdline && !numa_emulation(start_pfn, end_pfn)) | ||
502 | return; | ||
503 | nodes_clear(node_possible_map); | ||
504 | #endif | ||
505 | |||
506 | #ifdef CONFIG_ACPI_NUMA | ||
507 | if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, | ||
508 | end_pfn << PAGE_SHIFT)) | ||
509 | return; | ||
510 | nodes_clear(node_possible_map); | ||
511 | #endif | ||
512 | |||
513 | #ifdef CONFIG_K8_NUMA | ||
514 | if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) | ||
515 | return; | ||
516 | nodes_clear(node_possible_map); | ||
517 | #endif | ||
518 | printk(KERN_INFO "%s\n", | ||
519 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | ||
520 | |||
521 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | ||
522 | start_pfn << PAGE_SHIFT, | ||
523 | end_pfn << PAGE_SHIFT); | ||
524 | /* setup dummy node covering all memory */ | ||
525 | memnode_shift = 63; | ||
526 | memnodemap = memnode.embedded_map; | ||
527 | memnodemap[0] = 0; | ||
528 | nodes_clear(node_online_map); | ||
529 | node_set_online(0); | ||
530 | node_set(0, node_possible_map); | ||
531 | for (i = 0; i < NR_CPUS; i++) | ||
532 | numa_set_node(i, 0); | ||
533 | node_to_cpumask[0] = cpumask_of_cpu(0); | ||
534 | e820_register_active_regions(0, start_pfn, end_pfn); | ||
535 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); | ||
536 | } | ||
537 | |||
538 | __cpuinit void numa_add_cpu(int cpu) | ||
539 | { | ||
540 | set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); | ||
541 | } | ||
542 | |||
543 | void __cpuinit numa_set_node(int cpu, int node) | ||
544 | { | ||
545 | cpu_pda(cpu)->nodenumber = node; | ||
546 | cpu_to_node[cpu] = node; | ||
547 | } | ||
548 | |||
549 | unsigned long __init numa_free_all_bootmem(void) | ||
550 | { | ||
551 | int i; | ||
552 | unsigned long pages = 0; | ||
553 | for_each_online_node(i) { | ||
554 | pages += free_all_bootmem_node(NODE_DATA(i)); | ||
555 | } | ||
556 | return pages; | ||
557 | } | ||
558 | |||
559 | void __init paging_init(void) | ||
560 | { | ||
561 | int i; | ||
562 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | ||
563 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | ||
564 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; | ||
565 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; | ||
566 | max_zone_pfns[ZONE_NORMAL] = end_pfn; | ||
567 | |||
568 | sparse_memory_present_with_active_regions(MAX_NUMNODES); | ||
569 | sparse_init(); | ||
570 | |||
571 | for_each_online_node(i) { | ||
572 | setup_node_zones(i); | ||
573 | } | ||
574 | |||
575 | free_area_init_nodes(max_zone_pfns); | ||
576 | } | ||
577 | |||
578 | static __init int numa_setup(char *opt) | ||
579 | { | ||
580 | if (!opt) | ||
581 | return -EINVAL; | ||
582 | if (!strncmp(opt,"off",3)) | ||
583 | numa_off = 1; | ||
584 | #ifdef CONFIG_NUMA_EMU | ||
585 | if (!strncmp(opt, "fake=", 5)) | ||
586 | cmdline = opt + 5; | ||
587 | #endif | ||
588 | #ifdef CONFIG_ACPI_NUMA | ||
589 | if (!strncmp(opt,"noacpi",6)) | ||
590 | acpi_numa = -1; | ||
591 | if (!strncmp(opt,"hotadd=", 7)) | ||
592 | hotadd_percent = simple_strtoul(opt+7, NULL, 10); | ||
593 | #endif | ||
594 | return 0; | ||
595 | } | ||
596 | |||
597 | early_param("numa", numa_setup); | ||
598 | |||
599 | /* | ||
600 | * Setup early cpu_to_node. | ||
601 | * | ||
602 | * Populate cpu_to_node[] only if x86_cpu_to_apicid[], | ||
603 | * and apicid_to_node[] tables have valid entries for a CPU. | ||
604 | * This means we skip cpu_to_node[] initialisation for NUMA | ||
605 | * emulation and faking node case (when running a kernel compiled | ||
606 | * for NUMA on a non NUMA box), which is OK as cpu_to_node[] | ||
607 | * is already initialized in a round robin manner at numa_init_array, | ||
608 | * prior to this call, and this initialization is good enough | ||
609 | * for the fake NUMA cases. | ||
610 | */ | ||
611 | void __init init_cpu_to_node(void) | ||
612 | { | ||
613 | int i; | ||
614 | for (i = 0; i < NR_CPUS; i++) { | ||
615 | u8 apicid = x86_cpu_to_apicid[i]; | ||
616 | if (apicid == BAD_APICID) | ||
617 | continue; | ||
618 | if (apicid_to_node[apicid] == NUMA_NO_NODE) | ||
619 | continue; | ||
620 | numa_set_node(i,apicid_to_node[apicid]); | ||
621 | } | ||
622 | } | ||
623 | |||
624 | EXPORT_SYMBOL(cpu_to_node); | ||
625 | EXPORT_SYMBOL(node_to_cpumask); | ||
626 | EXPORT_SYMBOL(memnode); | ||
627 | EXPORT_SYMBOL(node_data); | ||
628 | |||
#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could be all tuned by pre caching more state.
 * Should do that.
 */

/*
 * Returns non-zero when @pfn is below num_physpages, maps to a node other
 * than 0xff and lies inside that node's [start, end) pfn range.
 */
int pfn_valid(unsigned long pfn)
{
	unsigned nid;

	if (pfn < num_physpages) {
		nid = pfn_to_nid(pfn);
		if (nid != 0xff)
			return pfn >= node_start_pfn(nid) &&
			       (pfn) < node_end_pfn(nid);
	}
	return 0;
}
EXPORT_SYMBOL(pfn_valid);
#endif
diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c new file mode 100644 index 000000000000..10b9809ce821 --- /dev/null +++ b/arch/x86/mm/pageattr_64.c | |||
@@ -0,0 +1,249 @@ | |||
1 | /* | ||
2 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
3 | * Thanks to Ben LaHaise for precious feedback. | ||
4 | */ | ||
5 | |||
6 | #include <linux/mm.h> | ||
7 | #include <linux/sched.h> | ||
8 | #include <linux/highmem.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/slab.h> | ||
11 | #include <asm/uaccess.h> | ||
12 | #include <asm/processor.h> | ||
13 | #include <asm/tlbflush.h> | ||
14 | #include <asm/io.h> | ||
15 | |||
16 | pte_t *lookup_address(unsigned long address) | ||
17 | { | ||
18 | pgd_t *pgd = pgd_offset_k(address); | ||
19 | pud_t *pud; | ||
20 | pmd_t *pmd; | ||
21 | pte_t *pte; | ||
22 | if (pgd_none(*pgd)) | ||
23 | return NULL; | ||
24 | pud = pud_offset(pgd, address); | ||
25 | if (!pud_present(*pud)) | ||
26 | return NULL; | ||
27 | pmd = pmd_offset(pud, address); | ||
28 | if (!pmd_present(*pmd)) | ||
29 | return NULL; | ||
30 | if (pmd_large(*pmd)) | ||
31 | return (pte_t *)pmd; | ||
32 | pte = pte_offset_kernel(pmd, address); | ||
33 | if (pte && !pte_present(*pte)) | ||
34 | pte = NULL; | ||
35 | return pte; | ||
36 | } | ||
37 | |||
38 | static struct page *split_large_page(unsigned long address, pgprot_t prot, | ||
39 | pgprot_t ref_prot) | ||
40 | { | ||
41 | int i; | ||
42 | unsigned long addr; | ||
43 | struct page *base = alloc_pages(GFP_KERNEL, 0); | ||
44 | pte_t *pbase; | ||
45 | if (!base) | ||
46 | return NULL; | ||
47 | /* | ||
48 | * page_private is used to track the number of entries in | ||
49 | * the page table page have non standard attributes. | ||
50 | */ | ||
51 | SetPagePrivate(base); | ||
52 | page_private(base) = 0; | ||
53 | |||
54 | address = __pa(address); | ||
55 | addr = address & LARGE_PAGE_MASK; | ||
56 | pbase = (pte_t *)page_address(base); | ||
57 | for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { | ||
58 | pbase[i] = pfn_pte(addr >> PAGE_SHIFT, | ||
59 | addr == address ? prot : ref_prot); | ||
60 | } | ||
61 | return base; | ||
62 | } | ||
63 | |||
64 | static void cache_flush_page(void *adr) | ||
65 | { | ||
66 | int i; | ||
67 | for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) | ||
68 | asm volatile("clflush (%0)" :: "r" (adr + i)); | ||
69 | } | ||
70 | |||
/*
 * Per-CPU flush callback: write back/invalidate caches and then flush
 * the whole TLB.  @arg is the list of pages (linked through ->lru)
 * whose attributes changed.
 */
static void flush_kernel_map(void *arg)
{
	struct list_head *l = (struct list_head *)arg;
	struct page *pg;

	/* When clflush is available always use it because it is
	   much cheaper than WBINVD. */
	/* clflush is still broken. Disable for now. */
	/* The constant 1 forces the wbinvd path; the else branch is dead. */
	if (1 || !cpu_has_clflush)
		asm volatile("wbinvd" ::: "memory");
	else list_for_each_entry(pg, l, lru) {
		void *adr = page_address(pg);
		cache_flush_page(adr);
	}
	__flush_tlb_all();
}
87 | |||
/* Run flush_kernel_map() on every CPU, passing the page list @l. */
static inline void flush_map(struct list_head *l)
{
	on_each_cpu(flush_kernel_map, l, 1, 1);
}
92 | |||
93 | static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ | ||
94 | |||
95 | static inline void save_page(struct page *fpage) | ||
96 | { | ||
97 | if (!test_and_set_bit(PG_arch_1, &fpage->flags)) | ||
98 | list_add(&fpage->lru, &deferred_pages); | ||
99 | } | ||
100 | |||
/*
 * No more special protections in this 2/4MB area - revert to a
 * large page again.
 *
 * Rewrites the pmd entry covering @address as a huge mapping with
 * protection @ref_prot.  BUGs if the upper levels are missing or if
 * the pmd already has _PAGE_PSE set.
 */
static void revert_page(unsigned long address, pgprot_t ref_prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t large_pte;
	unsigned long pfn;

	pgd = pgd_offset_k(address);
	BUG_ON(pgd_none(*pgd));
	pud = pud_offset(pgd,address);
	BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, address);
	BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
	/* Frame number of the start of the large page containing address. */
	pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
	large_pte = pfn_pte(pfn, ref_prot);
	large_pte = pte_mkhuge(large_pte);
	set_pte((pte_t *)pmd, large_pte);
}
124 | |||
/*
 * Change the protection of the single page at @address (frame @pfn).
 *
 * @prot is the requested protection, @ref_prot the default one for
 * this mapping.  page_private() of the page-table page counts entries
 * that differ from the default; when it drops to zero the area is
 * turned back into a large page.
 *
 * Returns 0 on success (also when no mapping exists for @address) or
 * -ENOMEM if a large page had to be split and the allocation failed.
 */
static int
__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
		   pgprot_t ref_prot)
{
	pte_t *kpte;
	struct page *kpte_page;
	pgprot_t ref_prot2;

	kpte = lookup_address(address);
	if (!kpte) return 0;
	/* The page holding the entry we are about to modify. */
	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
	BUG_ON(PageLRU(kpte_page));
	BUG_ON(PageCompound(kpte_page));
	if (pgprot_val(prot) != pgprot_val(ref_prot)) {
		/* Moving away from the default protection. */
		if (!pte_huge(*kpte)) {
			set_pte(kpte, pfn_pte(pfn, prot));
		} else {
			/*
			 * split_large_page will take the reference for this
			 * change_page_attr on the split page.
			 */
			struct page *split;
			ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
			split = split_large_page(address, prot, ref_prot2);
			if (!split)
				return -ENOMEM;
			/* Point the pmd slot at the new pte page. */
			set_pte(kpte, mk_pte(split, ref_prot2));
			kpte_page = split;
		}
		page_private(kpte_page)++;
	} else if (!pte_huge(*kpte)) {
		/* Restoring the default protection on a 4k entry. */
		set_pte(kpte, pfn_pte(pfn, ref_prot));
		BUG_ON(page_private(kpte_page) == 0);
		page_private(kpte_page)--;
	} else
		BUG();

	/* on x86-64 the direct mapping set at boot is not using 4k pages */
	BUG_ON(PageReserved(kpte_page));

	/* Queue the page-table page for the deferred flush. */
	save_page(kpte_page);
	if (page_private(kpte_page) == 0)
		revert_page(address, ref_prot);
	return 0;
}
170 | |||
171 | /* | ||
172 | * Change the page attributes of an page in the linear mapping. | ||
173 | * | ||
174 | * This should be used when a page is mapped with a different caching policy | ||
175 | * than write-back somewhere - some CPUs do not like it when mappings with | ||
176 | * different caching policies exist. This changes the page attributes of the | ||
177 | * in kernel linear mapping too. | ||
178 | * | ||
179 | * The caller needs to ensure that there are no conflicting mappings elsewhere. | ||
180 | * This function only deals with the kernel linear map. | ||
181 | * | ||
182 | * Caller must call global_flush_tlb() after this. | ||
183 | */ | ||
184 | int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) | ||
185 | { | ||
186 | int err = 0, kernel_map = 0; | ||
187 | int i; | ||
188 | |||
189 | if (address >= __START_KERNEL_map | ||
190 | && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { | ||
191 | address = (unsigned long)__va(__pa(address)); | ||
192 | kernel_map = 1; | ||
193 | } | ||
194 | |||
195 | down_write(&init_mm.mmap_sem); | ||
196 | for (i = 0; i < numpages; i++, address += PAGE_SIZE) { | ||
197 | unsigned long pfn = __pa(address) >> PAGE_SHIFT; | ||
198 | |||
199 | if (!kernel_map || pte_present(pfn_pte(0, prot))) { | ||
200 | err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); | ||
201 | if (err) | ||
202 | break; | ||
203 | } | ||
204 | /* Handle kernel mapping too which aliases part of the | ||
205 | * lowmem */ | ||
206 | if (__pa(address) < KERNEL_TEXT_SIZE) { | ||
207 | unsigned long addr2; | ||
208 | pgprot_t prot2; | ||
209 | addr2 = __START_KERNEL_map + __pa(address); | ||
210 | /* Make sure the kernel mappings stay executable */ | ||
211 | prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); | ||
212 | err = __change_page_attr(addr2, pfn, prot2, | ||
213 | PAGE_KERNEL_EXEC); | ||
214 | } | ||
215 | } | ||
216 | up_write(&init_mm.mmap_sem); | ||
217 | return err; | ||
218 | } | ||
219 | |||
220 | /* Don't call this for MMIO areas that may not have a mem_map entry */ | ||
221 | int change_page_attr(struct page *page, int numpages, pgprot_t prot) | ||
222 | { | ||
223 | unsigned long addr = (unsigned long)page_address(page); | ||
224 | return change_page_attr_addr(addr, numpages, prot); | ||
225 | } | ||
226 | |||
227 | void global_flush_tlb(void) | ||
228 | { | ||
229 | struct page *pg, *next; | ||
230 | struct list_head l; | ||
231 | |||
232 | down_read(&init_mm.mmap_sem); | ||
233 | list_replace_init(&deferred_pages, &l); | ||
234 | up_read(&init_mm.mmap_sem); | ||
235 | |||
236 | flush_map(&l); | ||
237 | |||
238 | list_for_each_entry_safe(pg, next, &l, lru) { | ||
239 | list_del(&pg->lru); | ||
240 | clear_bit(PG_arch_1, &pg->flags); | ||
241 | if (page_private(pg) != 0) | ||
242 | continue; | ||
243 | ClearPagePrivate(pg); | ||
244 | __free_page(pg); | ||
245 | } | ||
246 | } | ||
247 | |||
248 | EXPORT_SYMBOL(change_page_attr); | ||
249 | EXPORT_SYMBOL(global_flush_tlb); | ||
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c new file mode 100644 index 000000000000..acdf03e19146 --- /dev/null +++ b/arch/x86/mm/srat_64.c | |||
@@ -0,0 +1,566 @@ | |||
1 | /* | ||
2 | * ACPI 3.0 based NUMA setup | ||
3 | * Copyright 2004 Andi Kleen, SuSE Labs. | ||
4 | * | ||
5 | * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. | ||
6 | * | ||
7 | * Called from acpi_numa_init while reading the SRAT and SLIT tables. | ||
8 | * Assumes all memory regions belonging to a single proximity domain | ||
9 | * are in one chunk. Holes between them will be included in the node. | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/acpi.h> | ||
14 | #include <linux/mmzone.h> | ||
15 | #include <linux/bitmap.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/topology.h> | ||
18 | #include <linux/bootmem.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <asm/proto.h> | ||
21 | #include <asm/numa.h> | ||
22 | #include <asm/e820.h> | ||
23 | |||
/* Set to 1 once a CPU affinity entry was accepted, -1 by bad_srat(). */
int acpi_numa __initdata;

/* SLIT table accepted by acpi_numa_slit_init(); NULL if none. */
static struct acpi_table_slit *acpi_slit;

/* Nodes for which a memory affinity entry has been recorded. */
static nodemask_t nodes_parsed __initdata;
/* Memory range of each node as collected from the SRAT. */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
/* Hot-add memory range of each node. */
static struct bootnode nodes_add[MAX_NUMNODES];
/* Set by update_end_of_memory() when a hot-add area was accepted. */
static int found_add_area __initdata;
/* Percentage of RAM that hot-add areas may account for. */
int hotadd_percent __initdata = 0;

/* Too small nodes confuse the VM badly. Usually they result
   from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)
37 | |||
/*
 * Map proximity domain @pxm to a node id via acpi_map_pxm_to_node().
 * Callers in this file treat a negative result as failure.
 */
static __init int setup_node(int pxm)
{
	return acpi_map_pxm_to_node(pxm);
}
42 | |||
43 | static __init int conflicting_nodes(unsigned long start, unsigned long end) | ||
44 | { | ||
45 | int i; | ||
46 | for_each_node_mask(i, nodes_parsed) { | ||
47 | struct bootnode *nd = &nodes[i]; | ||
48 | if (nd->start == nd->end) | ||
49 | continue; | ||
50 | if (nd->end > start && nd->start < end) | ||
51 | return i; | ||
52 | if (nd->end == end && nd->start == start) | ||
53 | return i; | ||
54 | } | ||
55 | return -1; | ||
56 | } | ||
57 | |||
58 | static __init void cutoff_node(int i, unsigned long start, unsigned long end) | ||
59 | { | ||
60 | struct bootnode *nd = &nodes[i]; | ||
61 | |||
62 | if (found_add_area) | ||
63 | return; | ||
64 | |||
65 | if (nd->start < start) { | ||
66 | nd->start = start; | ||
67 | if (nd->end < nd->start) | ||
68 | nd->start = nd->end; | ||
69 | } | ||
70 | if (nd->end > end) { | ||
71 | nd->end = end; | ||
72 | if (nd->start > nd->end) | ||
73 | nd->start = nd->end; | ||
74 | } | ||
75 | } | ||
76 | |||
77 | static __init void bad_srat(void) | ||
78 | { | ||
79 | int i; | ||
80 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | ||
81 | acpi_numa = -1; | ||
82 | found_add_area = 0; | ||
83 | for (i = 0; i < MAX_LOCAL_APIC; i++) | ||
84 | apicid_to_node[i] = NUMA_NO_NODE; | ||
85 | for (i = 0; i < MAX_NUMNODES; i++) | ||
86 | nodes_add[i].start = nodes[i].end = 0; | ||
87 | remove_all_active_ranges(); | ||
88 | } | ||
89 | |||
/* Non-zero when NUMA is off or the SRAT has been rejected (acpi_numa < 0). */
static __init inline int srat_disabled(void)
{
	return numa_off || acpi_numa < 0;
}
94 | |||
95 | /* | ||
96 | * A lot of BIOS fill in 10 (= no distance) everywhere. This messes | ||
97 | * up the NUMA heuristics which wants the local node to have a smaller | ||
98 | * distance than the others. | ||
99 | * Do some quick checks here and only use the SLIT if it passes. | ||
100 | */ | ||
101 | static __init int slit_valid(struct acpi_table_slit *slit) | ||
102 | { | ||
103 | int i, j; | ||
104 | int d = slit->locality_count; | ||
105 | for (i = 0; i < d; i++) { | ||
106 | for (j = 0; j < d; j++) { | ||
107 | u8 val = slit->entry[d*i + j]; | ||
108 | if (i == j) { | ||
109 | if (val != LOCAL_DISTANCE) | ||
110 | return 0; | ||
111 | } else if (val <= LOCAL_DISTANCE) | ||
112 | return 0; | ||
113 | } | ||
114 | } | ||
115 | return 1; | ||
116 | } | ||
117 | |||
118 | /* Callback for SLIT parsing */ | ||
119 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | ||
120 | { | ||
121 | if (!slit_valid(slit)) { | ||
122 | printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); | ||
123 | return; | ||
124 | } | ||
125 | acpi_slit = slit; | ||
126 | } | ||
127 | |||
128 | /* Callback for Proximity Domain -> LAPIC mapping */ | ||
129 | void __init | ||
130 | acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) | ||
131 | { | ||
132 | int pxm, node; | ||
133 | if (srat_disabled()) | ||
134 | return; | ||
135 | if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { | ||
136 | bad_srat(); | ||
137 | return; | ||
138 | } | ||
139 | if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) | ||
140 | return; | ||
141 | pxm = pa->proximity_domain_lo; | ||
142 | node = setup_node(pxm); | ||
143 | if (node < 0) { | ||
144 | printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); | ||
145 | bad_srat(); | ||
146 | return; | ||
147 | } | ||
148 | apicid_to_node[pa->apic_id] = node; | ||
149 | acpi_numa = 1; | ||
150 | printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", | ||
151 | pxm, pa->apic_id, node); | ||
152 | } | ||
153 | |||
154 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
/*
 * Protect against too large hotadd areas that would fill up memory.
 *
 * Estimates the struct page space needed for the area described by
 * @nd and checks it against hotadd_percent of the present RAM.  If the
 * area is too big it is shrunk (nd->end is lowered) to what still
 * fits.  Returns 1 if the (possibly reduced) area is acceptable,
 * 0 otherwise.
 */
static int hotadd_enough_memory(struct bootnode *nd)
{
	/* Running totals kept across calls. */
	static unsigned long allocated;
	static unsigned long last_area_end;
	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
	long mem = pages * sizeof(struct page);
	unsigned long addr;
	unsigned long allowed;
	unsigned long oldpages = pages;

	/* A negative size means the multiplication overflowed a long. */
	if (mem < 0)
		return 0;
	allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
	allowed = (allowed / 100) * hotadd_percent;
	if (allocated + mem > allowed) {
		unsigned long range;
		/* Give them at least part of their hotadd memory up to hotadd_percent
		   It would be better to spread the limit out
		   over multiple hotplug areas, but that is too complicated
		   right now */
		if (allocated >= allowed)
			return 0;
		range = allowed - allocated;
		pages = (range / PAGE_SIZE);
		mem = pages * sizeof(struct page);
		nd->end = nd->start + range;
	}
	/* Not completely fool proof, but a good sanity check */
	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
	if (addr == -1UL)
		return 0;
	if (pages != oldpages)
		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
			pages << PAGE_SHIFT);
	last_area_end = addr + mem;
	allocated += mem;
	return 1;
}
196 | |||
197 | static int update_end_of_memory(unsigned long end) | ||
198 | { | ||
199 | found_add_area = 1; | ||
200 | if ((end >> PAGE_SHIFT) > end_pfn) | ||
201 | end_pfn = end >> PAGE_SHIFT; | ||
202 | return 1; | ||
203 | } | ||
204 | |||
/* Hot-pluggable ranges are only recorded when hotadd_percent is set. */
static inline int save_add_info(void)
{
	return hotadd_percent > 0;
}
209 | #else | ||
/* Variants used when CONFIG_MEMORY_HOTPLUG_RESERVE is not set. */
/* end_pfn is left alone; reports -1. */
int update_end_of_memory(unsigned long end) {return -1;}
/* No size limit is enforced. */
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
/* Hot-pluggable ranges are always recorded. */
static inline int save_add_info(void) {return 1;}
#else
/* Hot-pluggable ranges are never recorded. */
static inline int save_add_info(void) {return 0;}
#endif
217 | #endif | ||
/*
 * Update nodes_add and decide whether to include the add area in the zone.
 * Both SPARSE and RESERVE need nodes_add information.
 * This code supports one contiguous hot add area per node.
 *
 * Returns -1 if the area [start, end) is rejected, otherwise the
 * result of update_end_of_memory().
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
	int ret = 0, changed = 0;
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot. Be very strict here and reject anything unexpected.
	   If you want working memory hotadd write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
		printk(KERN_ERR
			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
			s_pfn, e_pfn);
		return -1;
	}

	/*
	 * NOTE(review): this passes the node's existing nodes_add entry,
	 * which has not yet been updated with start/end - confirm that
	 * sizing the old range rather than the new one is intended.
	 */
	if (!hotadd_enough_memory(&nodes_add[node])) {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

	if (nd->start == nd->end) {
		/* First area for this node. */
		nd->start = start;
		nd->end = end;
		changed = 1;
	} else {
		/* Only extend an existing area that is directly adjacent. */
		if (nd->start == end) {
			nd->start = start;
			changed = 1;
		}
		if (nd->end == start) {
			nd->end = end;
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
	}

	ret = update_end_of_memory(nd->end);

	if (changed)
		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
	return ret;
}
279 | |||
/*
 * Callback for parsing of the Proximity Domain <-> Memory Area mappings.
 *
 * Validates the entry @ma, maps its proximity domain to a node and
 * merges the memory range into nodes[].  An overlap with a different
 * node, a malformed entry or an unmappable domain rejects the whole
 * SRAT via bad_srat().
 */
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
	struct bootnode *nd, oldnode;
	unsigned long start, end;
	int node, pxm;
	int i;

	if (srat_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
		bad_srat();
		return;
	}
	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
		return;

	/* Hot-pluggable ranges are skipped unless they are to be recorded. */
	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
		return;
	start = ma->base_address;
	end = start + ma->length;
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	i = conflicting_nodes(start, end);
	if (i == node) {
		/* Overlap with the same node is only worth a warning. */
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
		printk(KERN_ERR
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
			nodes[i].start, nodes[i].end);
		bad_srat();
		return;
	}
	nd = &nodes[node];
	/* Keep a copy so a rejected hotadd region can be undone below. */
	oldnode = *nd;
	if (!node_test_and_set(node, nodes_parsed)) {
		/* First range seen for this node. */
		nd->start = start;
		nd->end = end;
	} else {
		/* Grow the node to cover the new range as well. */
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}

	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
	       nd->start, nd->end);
	e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);
	push_node_boundaries(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);

	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
	    (reserve_hotadd(node, start, end) < 0)) {
		/* Ignore hotadd region. Undo damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		/* The node had no range before: it was not parsed after all. */
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}
}
350 | |||
351 | /* Sanity check to catch more bad SRATs (they are amazingly common). | ||
352 | Make sure the PXMs cover all memory. */ | ||
353 | static int __init nodes_cover_memory(const struct bootnode *nodes) | ||
354 | { | ||
355 | int i; | ||
356 | unsigned long pxmram, e820ram; | ||
357 | |||
358 | pxmram = 0; | ||
359 | for_each_node_mask(i, nodes_parsed) { | ||
360 | unsigned long s = nodes[i].start >> PAGE_SHIFT; | ||
361 | unsigned long e = nodes[i].end >> PAGE_SHIFT; | ||
362 | pxmram += e - s; | ||
363 | pxmram -= absent_pages_in_range(s, e); | ||
364 | if ((long)pxmram < 0) | ||
365 | pxmram = 0; | ||
366 | } | ||
367 | |||
368 | e820ram = end_pfn - absent_pages_in_range(0, end_pfn); | ||
369 | /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ | ||
370 | if ((long)(e820ram - pxmram) >= 1*1024*1024) { | ||
371 | printk(KERN_ERR | ||
372 | "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", | ||
373 | (pxmram << PAGE_SHIFT) >> 20, | ||
374 | (e820ram << PAGE_SHIFT) >> 20); | ||
375 | return 0; | ||
376 | } | ||
377 | return 1; | ||
378 | } | ||
379 | |||
380 | static void unparse_node(int node) | ||
381 | { | ||
382 | int i; | ||
383 | node_clear(node, nodes_parsed); | ||
384 | for (i = 0; i < MAX_LOCAL_APIC; i++) { | ||
385 | if (apicid_to_node[i] == node) | ||
386 | apicid_to_node[i] = NUMA_NO_NODE; | ||
387 | } | ||
388 | } | ||
389 | |||
/* Intentionally empty in this file. */
void __init acpi_numa_arch_fixup(void) {}
391 | |||
/*
 * Use the information discovered above to actually set up the nodes.
 *
 * @start, @end: range the nodes are clamped to.
 *
 * Returns 0 on success, -1 if the SRAT was not usable (no valid data,
 * memory not covered, or no hash shift found).
 */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int i;

	if (acpi_numa <= 0)
		return -1;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++) {
		cutoff_node(i, start, end);
		/* Drop nodes that ended up smaller than NODE_MIN_SIZE. */
		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
			unparse_node(i);
			node_set_offline(i);
		}
	}

	if (!nodes_cover_memory(nodes)) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	node_possible_map = nodes_parsed;

	/* Finally register nodes */
	for_each_node_mask(i, node_possible_map)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(i, node_possible_map)
		if (!node_online(i))
			setup_node_bootmem(i, nodes[i].start, nodes[i].end);

	/* Detach CPUs whose node did not make it into node_possible_map. */
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!node_isset(cpu_to_node[i], node_possible_map))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}
442 | |||
443 | #ifdef CONFIG_NUMA_EMU | ||
444 | static int __init find_node_by_addr(unsigned long addr) | ||
445 | { | ||
446 | int ret = NUMA_NO_NODE; | ||
447 | int i; | ||
448 | |||
449 | for_each_node_mask(i, nodes_parsed) { | ||
450 | /* | ||
451 | * Find the real node that this emulated node appears on. For | ||
452 | * the sake of simplicity, we only use a real node's starting | ||
453 | * address to determine which emulated node it appears on. | ||
454 | */ | ||
455 | if (addr >= nodes[i].start && addr < nodes[i].end) { | ||
456 | ret = i; | ||
457 | break; | ||
458 | } | ||
459 | } | ||
460 | return i; | ||
461 | } | ||
462 | |||
/*
 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
 * mappings that respect the real ACPI topology but reflect our emulated
 * environment.  For each emulated node, we find which real node it appears on
 * and create PXM to NID mappings for those fake nodes which mirror that
 * locality.  SLIT will now represent the correct distances between emulated
 * nodes as a result of the real topology.
 *
 * @fake_nodes: ranges of the emulated nodes
 * @num_nodes:  number of entries in @fake_nodes
 */
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
	int i, j;
	/* New mappings are built in temporaries and installed afterwards. */
	int fake_node_to_pxm_map[MAX_NUMNODES] = {
		[0 ... MAX_NUMNODES-1] = PXM_INVAL
	};
	unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
		[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
	};

	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
			 "topology.\n");
	for (i = 0; i < num_nodes; i++) {
		int nid, pxm;

		/* Real node that contains the start of this fake node. */
		nid = find_node_by_addr(fake_nodes[i].start);
		if (nid == NUMA_NO_NODE)
			continue;
		pxm = node_to_pxm(nid);
		if (pxm == PXM_INVAL)
			continue;
		fake_node_to_pxm_map[i] = pxm;
		/*
		 * For each apicid_to_node mapping that exists for this real
		 * node, it must now point to the fake node ID.
		 */
		for (j = 0; j < MAX_LOCAL_APIC; j++)
			if (apicid_to_node[j] == nid)
				fake_apicid_to_node[j] = i;
	}
	/*
	 * NOTE(review): entries still set to PXM_INVAL are passed on as
	 * well - confirm __acpi_map_pxm_to_node() copes with that.
	 */
	for (i = 0; i < num_nodes; i++)
		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));

	/* From here on nodes_parsed describes the non-empty fake nodes. */
	nodes_clear(nodes_parsed);
	for (i = 0; i < num_nodes; i++)
		if (fake_nodes[i].start != fake_nodes[i].end)
			node_set(i, nodes_parsed);
	WARN_ON(!nodes_cover_memory(fake_nodes));
}
511 | |||
/* With NUMA emulation, nodes count as equal when they share a PXM. */
static int null_slit_node_compare(int a, int b)
{
	return node_to_pxm(a) == node_to_pxm(b);
}
516 | #else | ||
/* Without NUMA emulation, nodes are equal only if their ids match. */
static int null_slit_node_compare(int a, int b)
{
	return a == b;
}
521 | #endif /* CONFIG_NUMA_EMU */ | ||
522 | |||
523 | void __init srat_reserve_add_area(int nodeid) | ||
524 | { | ||
525 | if (found_add_area && nodes_add[nodeid].end) { | ||
526 | u64 total_mb; | ||
527 | |||
528 | printk(KERN_INFO "SRAT: Reserving hot-add memory space " | ||
529 | "for node %d at %Lx-%Lx\n", | ||
530 | nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); | ||
531 | total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) | ||
532 | >> PAGE_SHIFT; | ||
533 | total_mb *= sizeof(struct page); | ||
534 | total_mb >>= 20; | ||
535 | printk(KERN_INFO "SRAT: This will cost you %Lu MB of " | ||
536 | "pre-allocated memory.\n", (unsigned long long)total_mb); | ||
537 | reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, | ||
538 | nodes_add[nodeid].end - nodes_add[nodeid].start); | ||
539 | } | ||
540 | } | ||
541 | |||
542 | int __node_distance(int a, int b) | ||
543 | { | ||
544 | int index; | ||
545 | |||
546 | if (!acpi_slit) | ||
547 | return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : | ||
548 | REMOTE_DISTANCE; | ||
549 | index = acpi_slit->locality_count * node_to_pxm(a); | ||
550 | return acpi_slit->entry[index + node_to_pxm(b)]; | ||
551 | } | ||
552 | |||
553 | EXPORT_SYMBOL(__node_distance); | ||
554 | |||
555 | int memory_add_physaddr_to_nid(u64 start) | ||
556 | { | ||
557 | int i, ret = 0; | ||
558 | |||
559 | for_each_node(i) | ||
560 | if (nodes_add[i].start <= start && nodes_add[i].end > start) | ||
561 | ret = i; | ||
562 | |||
563 | return ret; | ||
564 | } | ||
565 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | ||
566 | |||