aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-03-26 16:39:17 -0400
committerIngo Molnar <mingo@elte.hu>2009-03-27 12:28:43 -0400
commit6e15cf04860074ad032e88c306bea656bbdd0f22 (patch)
treec346383bb7563e8d66b2f4a502f875b259c34870 /arch/x86/mm
parentbe0ea69674ed95e1e98cb3687a241badc756d228 (diff)
parent60db56422043aaa455ac7f858ce23c273220f9d9 (diff)
Merge branch 'core/percpu' into percpu-cpumask-x86-for-linus-2
Conflicts: arch/parisc/kernel/irq.c arch/x86/include/asm/fixmap_64.h arch/x86/include/asm/setup.h kernel/irq/handle.c Semantic merge: arch/x86/include/asm/fixmap.h Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile4
-rw-r--r--arch/x86/mm/extable.c6
-rw-r--r--arch/x86/mm/fault.c1333
-rw-r--r--arch/x86/mm/highmem_32.c34
-rw-r--r--arch/x86/mm/init.c49
-rw-r--r--arch/x86/mm/init_32.c213
-rw-r--r--arch/x86/mm/init_64.c109
-rw-r--r--arch/x86/mm/ioremap.c2
-rw-r--r--arch/x86/mm/memtest.c156
-rw-r--r--arch/x86/mm/mmap.c2
-rw-r--r--arch/x86/mm/numa_32.c28
-rw-r--r--arch/x86/mm/numa_64.c217
-rw-r--r--arch/x86/mm/pageattr.c7
-rw-r--r--arch/x86/mm/pat.c77
-rw-r--r--arch/x86/mm/pgtable.c18
-rw-r--r--arch/x86/mm/pgtable_32.c18
-rw-r--r--arch/x86/mm/srat_64.c3
-rw-r--r--arch/x86/mm/tlb.c295
18 files changed, 1690 insertions, 881 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index d8cc96a2738f..08537747cb58 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o gup.o 2 pat.o pgtable.o gup.o
3 3
4obj-$(CONFIG_SMP) += tlb.o
5
4obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o 6obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
5 7
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 7e8db53528a7..61b41ca3b5a2 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs)
23 23
24 fixup = search_exception_tables(regs->ip); 24 fixup = search_exception_tables(regs->ip);
25 if (fixup) { 25 if (fixup) {
26 /* If fixup is less than 16, it means uaccess error */
27 if (fixup->fixup < 16) {
28 current_thread_info()->uaccess_err = -EFAULT;
29 regs->ip += fixup->fixup;
30 return 1;
31 }
26 regs->ip = fixup->fixup; 32 regs->ip = fixup->fixup;
27 return 1; 33 return 1;
28 } 34 }
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c76ef1d701c9..a03b7279efa0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1,73 +1,79 @@
1/* 1/*
2 * Copyright (C) 1995 Linus Torvalds 2 * Copyright (C) 1995 Linus Torvalds
3 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. 3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
4 */ 5 */
5
6#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
13#include <linux/mmiotrace.h>
14#include <linux/mman.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/interrupt.h> 6#include <linux/interrupt.h>
18#include <linux/init.h> 7#include <linux/mmiotrace.h>
19#include <linux/tty.h> 8#include <linux/bootmem.h>
20#include <linux/vt_kern.h> /* For unblank_screen() */
21#include <linux/compiler.h> 9#include <linux/compiler.h>
22#include <linux/highmem.h> 10#include <linux/highmem.h>
23#include <linux/bootmem.h> /* for max_low_pfn */
24#include <linux/vmalloc.h>
25#include <linux/module.h>
26#include <linux/kprobes.h> 11#include <linux/kprobes.h>
27#include <linux/uaccess.h> 12#include <linux/uaccess.h>
13#include <linux/vmalloc.h>
14#include <linux/vt_kern.h>
15#include <linux/signal.h>
16#include <linux/kernel.h>
17#include <linux/ptrace.h>
18#include <linux/string.h>
19#include <linux/module.h>
28#include <linux/kdebug.h> 20#include <linux/kdebug.h>
21#include <linux/errno.h>
22#include <linux/magic.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/init.h>
26#include <linux/mman.h>
27#include <linux/tty.h>
28#include <linux/smp.h>
29#include <linux/mm.h>
30
31#include <asm-generic/sections.h>
29 32
30#include <asm/system.h>
31#include <asm/desc.h>
32#include <asm/segment.h>
33#include <asm/pgalloc.h>
34#include <asm/smp.h>
35#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <asm/pgalloc.h>
35#include <asm/segment.h>
36#include <asm/system.h>
36#include <asm/proto.h> 37#include <asm/proto.h>
37#include <asm-generic/sections.h>
38#include <asm/traps.h> 38#include <asm/traps.h>
39#include <asm/desc.h>
39 40
40/* 41/*
41 * Page fault error code bits 42 * Page fault error code bits:
42 * bit 0 == 0 means no page found, 1 means protection fault 43 *
43 * bit 1 == 0 means read, 1 means write 44 * bit 0 == 0: no page found 1: protection fault
44 * bit 2 == 0 means kernel, 1 means user-mode 45 * bit 1 == 0: read access 1: write access
45 * bit 3 == 1 means use of reserved bit detected 46 * bit 2 == 0: kernel-mode access 1: user-mode access
46 * bit 4 == 1 means fault was an instruction fetch 47 * bit 3 == 1: use of reserved bit detected
48 * bit 4 == 1: fault was an instruction fetch
47 */ 49 */
48#define PF_PROT (1<<0) 50enum x86_pf_error_code {
49#define PF_WRITE (1<<1) 51
50#define PF_USER (1<<2) 52 PF_PROT = 1 << 0,
51#define PF_RSVD (1<<3) 53 PF_WRITE = 1 << 1,
52#define PF_INSTR (1<<4) 54 PF_USER = 1 << 2,
55 PF_RSVD = 1 << 3,
56 PF_INSTR = 1 << 4,
57};
53 58
59/*
60 * Returns 0 if mmiotrace is disabled, or if the fault is not
61 * handled by mmiotrace:
62 */
54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 63static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
55{ 64{
56#ifdef CONFIG_MMIOTRACE
57 if (unlikely(is_kmmio_active())) 65 if (unlikely(is_kmmio_active()))
58 if (kmmio_handler(regs, addr) == 1) 66 if (kmmio_handler(regs, addr) == 1)
59 return -1; 67 return -1;
60#endif
61 return 0; 68 return 0;
62} 69}
63 70
64static inline int notify_page_fault(struct pt_regs *regs) 71static inline int notify_page_fault(struct pt_regs *regs)
65{ 72{
66#ifdef CONFIG_KPROBES
67 int ret = 0; 73 int ret = 0;
68 74
69 /* kprobe_running() needs smp_processor_id() */ 75 /* kprobe_running() needs smp_processor_id() */
70 if (!user_mode_vm(regs)) { 76 if (kprobes_built_in() && !user_mode_vm(regs)) {
71 preempt_disable(); 77 preempt_disable();
72 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 78 if (kprobe_running() && kprobe_fault_handler(regs, 14))
73 ret = 1; 79 ret = 1;
@@ -75,29 +81,76 @@ static inline int notify_page_fault(struct pt_regs *regs)
75 } 81 }
76 82
77 return ret; 83 return ret;
78#else
79 return 0;
80#endif
81} 84}
82 85
83/* 86/*
84 * X86_32 87 * Prefetch quirks:
85 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. 88 *
86 * Check that here and ignore it. 89 * 32-bit mode:
90 *
91 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
92 * Check that here and ignore it.
93 *
94 * 64-bit mode:
87 * 95 *
88 * X86_64 96 * Sometimes the CPU reports invalid exceptions on prefetch.
89 * Sometimes the CPU reports invalid exceptions on prefetch. 97 * Check that here and ignore it.
90 * Check that here and ignore it.
91 * 98 *
92 * Opcode checker based on code by Richard Brunner 99 * Opcode checker based on code by Richard Brunner.
93 */ 100 */
94static int is_prefetch(struct pt_regs *regs, unsigned long addr, 101static inline int
95 unsigned long error_code) 102check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
103 unsigned char opcode, int *prefetch)
96{ 104{
105 unsigned char instr_hi = opcode & 0xf0;
106 unsigned char instr_lo = opcode & 0x0f;
107
108 switch (instr_hi) {
109 case 0x20:
110 case 0x30:
111 /*
112 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
113 * In X86_64 long mode, the CPU will signal invalid
114 * opcode if some of these prefixes are present so
115 * X86_64 will never get here anyway
116 */
117 return ((instr_lo & 7) == 0x6);
118#ifdef CONFIG_X86_64
119 case 0x40:
120 /*
121 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
122 * Need to figure out under what instruction mode the
123 * instruction was issued. Could check the LDT for lm,
124 * but for now it's good enough to assume that long
125 * mode only uses well known segments or kernel.
126 */
127 return (!user_mode(regs)) || (regs->cs == __USER_CS);
128#endif
129 case 0x60:
130 /* 0x64 thru 0x67 are valid prefixes in all modes. */
131 return (instr_lo & 0xC) == 0x4;
132 case 0xF0:
133 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
134 return !instr_lo || (instr_lo>>1) == 1;
135 case 0x00:
136 /* Prefetch instruction is 0x0F0D or 0x0F18 */
137 if (probe_kernel_address(instr, opcode))
138 return 0;
139
140 *prefetch = (instr_lo == 0xF) &&
141 (opcode == 0x0D || opcode == 0x18);
142 return 0;
143 default:
144 return 0;
145 }
146}
147
148static int
149is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
150{
151 unsigned char *max_instr;
97 unsigned char *instr; 152 unsigned char *instr;
98 int scan_more = 1;
99 int prefetch = 0; 153 int prefetch = 0;
100 unsigned char *max_instr;
101 154
102 /* 155 /*
103 * If it was a exec (instruction fetch) fault on NX page, then 156 * If it was a exec (instruction fetch) fault on NX page, then
@@ -106,106 +159,170 @@ static int is_prefetch(struct pt_regs *regs, unsigned long addr,
106 if (error_code & PF_INSTR) 159 if (error_code & PF_INSTR)
107 return 0; 160 return 0;
108 161
109 instr = (unsigned char *)convert_ip_to_linear(current, regs); 162 instr = (void *)convert_ip_to_linear(current, regs);
110 max_instr = instr + 15; 163 max_instr = instr + 15;
111 164
112 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) 165 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
113 return 0; 166 return 0;
114 167
115 while (scan_more && instr < max_instr) { 168 while (instr < max_instr) {
116 unsigned char opcode; 169 unsigned char opcode;
117 unsigned char instr_hi;
118 unsigned char instr_lo;
119 170
120 if (probe_kernel_address(instr, opcode)) 171 if (probe_kernel_address(instr, opcode))
121 break; 172 break;
122 173
123 instr_hi = opcode & 0xf0;
124 instr_lo = opcode & 0x0f;
125 instr++; 174 instr++;
126 175
127 switch (instr_hi) { 176 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
128 case 0x20:
129 case 0x30:
130 /*
131 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
132 * In X86_64 long mode, the CPU will signal invalid
133 * opcode if some of these prefixes are present so
134 * X86_64 will never get here anyway
135 */
136 scan_more = ((instr_lo & 7) == 0x6);
137 break; 177 break;
138#ifdef CONFIG_X86_64
139 case 0x40:
140 /*
141 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
142 * Need to figure out under what instruction mode the
143 * instruction was issued. Could check the LDT for lm,
144 * but for now it's good enough to assume that long
145 * mode only uses well known segments or kernel.
146 */
147 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
148 break;
149#endif
150 case 0x60:
151 /* 0x64 thru 0x67 are valid prefixes in all modes. */
152 scan_more = (instr_lo & 0xC) == 0x4;
153 break;
154 case 0xF0:
155 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
156 scan_more = !instr_lo || (instr_lo>>1) == 1;
157 break;
158 case 0x00:
159 /* Prefetch instruction is 0x0F0D or 0x0F18 */
160 scan_more = 0;
161
162 if (probe_kernel_address(instr, opcode))
163 break;
164 prefetch = (instr_lo == 0xF) &&
165 (opcode == 0x0D || opcode == 0x18);
166 break;
167 default:
168 scan_more = 0;
169 break;
170 }
171 } 178 }
172 return prefetch; 179 return prefetch;
173} 180}
174 181
175static void force_sig_info_fault(int si_signo, int si_code, 182static void
176 unsigned long address, struct task_struct *tsk) 183force_sig_info_fault(int si_signo, int si_code, unsigned long address,
184 struct task_struct *tsk)
177{ 185{
178 siginfo_t info; 186 siginfo_t info;
179 187
180 info.si_signo = si_signo; 188 info.si_signo = si_signo;
181 info.si_errno = 0; 189 info.si_errno = 0;
182 info.si_code = si_code; 190 info.si_code = si_code;
183 info.si_addr = (void __user *)address; 191 info.si_addr = (void __user *)address;
192
184 force_sig_info(si_signo, &info, tsk); 193 force_sig_info(si_signo, &info, tsk);
185} 194}
186 195
187#ifdef CONFIG_X86_64 196DEFINE_SPINLOCK(pgd_lock);
188static int bad_address(void *p) 197LIST_HEAD(pgd_list);
198
199#ifdef CONFIG_X86_32
200static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
189{ 201{
190 unsigned long dummy; 202 unsigned index = pgd_index(address);
191 return probe_kernel_address((unsigned long *)p, dummy); 203 pgd_t *pgd_k;
204 pud_t *pud, *pud_k;
205 pmd_t *pmd, *pmd_k;
206
207 pgd += index;
208 pgd_k = init_mm.pgd + index;
209
210 if (!pgd_present(*pgd_k))
211 return NULL;
212
213 /*
214 * set_pgd(pgd, *pgd_k); here would be useless on PAE
215 * and redundant with the set_pmd() on non-PAE. As would
216 * set_pud.
217 */
218 pud = pud_offset(pgd, address);
219 pud_k = pud_offset(pgd_k, address);
220 if (!pud_present(*pud_k))
221 return NULL;
222
223 pmd = pmd_offset(pud, address);
224 pmd_k = pmd_offset(pud_k, address);
225 if (!pmd_present(*pmd_k))
226 return NULL;
227
228 if (!pmd_present(*pmd)) {
229 set_pmd(pmd, *pmd_k);
230 arch_flush_lazy_mmu_mode();
231 } else {
232 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
233 }
234
235 return pmd_k;
236}
237
238void vmalloc_sync_all(void)
239{
240 unsigned long address;
241
242 if (SHARED_KERNEL_PMD)
243 return;
244
245 for (address = VMALLOC_START & PMD_MASK;
246 address >= TASK_SIZE && address < FIXADDR_TOP;
247 address += PMD_SIZE) {
248
249 unsigned long flags;
250 struct page *page;
251
252 spin_lock_irqsave(&pgd_lock, flags);
253 list_for_each_entry(page, &pgd_list, lru) {
254 if (!vmalloc_sync_one(page_address(page), address))
255 break;
256 }
257 spin_unlock_irqrestore(&pgd_lock, flags);
258 }
259}
260
261/*
262 * 32-bit:
263 *
264 * Handle a fault on the vmalloc or module mapping area
265 */
266static noinline int vmalloc_fault(unsigned long address)
267{
268 unsigned long pgd_paddr;
269 pmd_t *pmd_k;
270 pte_t *pte_k;
271
272 /* Make sure we are in vmalloc area: */
273 if (!(address >= VMALLOC_START && address < VMALLOC_END))
274 return -1;
275
276 /*
277 * Synchronize this task's top level page-table
278 * with the 'reference' page table.
279 *
280 * Do _not_ use "current" here. We might be inside
281 * an interrupt in the middle of a task switch..
282 */
283 pgd_paddr = read_cr3();
284 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
285 if (!pmd_k)
286 return -1;
287
288 pte_k = pte_offset_kernel(pmd_k, address);
289 if (!pte_present(*pte_k))
290 return -1;
291
292 return 0;
293}
294
295/*
296 * Did it hit the DOS screen memory VA from vm86 mode?
297 */
298static inline void
299check_v8086_mode(struct pt_regs *regs, unsigned long address,
300 struct task_struct *tsk)
301{
302 unsigned long bit;
303
304 if (!v8086_mode(regs))
305 return;
306
307 bit = (address - 0xA0000) >> PAGE_SHIFT;
308 if (bit < 32)
309 tsk->thread.screen_bitmap |= 1 << bit;
192} 310}
193#endif
194 311
195static void dump_pagetable(unsigned long address) 312static void dump_pagetable(unsigned long address)
196{ 313{
197#ifdef CONFIG_X86_32
198 __typeof__(pte_val(__pte(0))) page; 314 __typeof__(pte_val(__pte(0))) page;
199 315
200 page = read_cr3(); 316 page = read_cr3();
201 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; 317 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
318
202#ifdef CONFIG_X86_PAE 319#ifdef CONFIG_X86_PAE
203 printk("*pdpt = %016Lx ", page); 320 printk("*pdpt = %016Lx ", page);
204 if ((page >> PAGE_SHIFT) < max_low_pfn 321 if ((page >> PAGE_SHIFT) < max_low_pfn
205 && page & _PAGE_PRESENT) { 322 && page & _PAGE_PRESENT) {
206 page &= PAGE_MASK; 323 page &= PAGE_MASK;
207 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) 324 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
208 & (PTRS_PER_PMD - 1)]; 325 & (PTRS_PER_PMD - 1)];
209 printk(KERN_CONT "*pde = %016Lx ", page); 326 printk(KERN_CONT "*pde = %016Lx ", page);
210 page &= ~_PAGE_NX; 327 page &= ~_PAGE_NX;
211 } 328 }
@@ -217,19 +334,145 @@ static void dump_pagetable(unsigned long address)
217 * We must not directly access the pte in the highpte 334 * We must not directly access the pte in the highpte
218 * case if the page table is located in highmem. 335 * case if the page table is located in highmem.
219 * And let's rather not kmap-atomic the pte, just in case 336 * And let's rather not kmap-atomic the pte, just in case
220 * it's allocated already. 337 * it's allocated already:
221 */ 338 */
222 if ((page >> PAGE_SHIFT) < max_low_pfn 339 if ((page >> PAGE_SHIFT) < max_low_pfn
223 && (page & _PAGE_PRESENT) 340 && (page & _PAGE_PRESENT)
224 && !(page & _PAGE_PSE)) { 341 && !(page & _PAGE_PSE)) {
342
225 page &= PAGE_MASK; 343 page &= PAGE_MASK;
226 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) 344 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
227 & (PTRS_PER_PTE - 1)]; 345 & (PTRS_PER_PTE - 1)];
228 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); 346 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
229 } 347 }
230 348
231 printk("\n"); 349 printk("\n");
232#else /* CONFIG_X86_64 */ 350}
351
352#else /* CONFIG_X86_64: */
353
354void vmalloc_sync_all(void)
355{
356 unsigned long address;
357
358 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
359 address += PGDIR_SIZE) {
360
361 const pgd_t *pgd_ref = pgd_offset_k(address);
362 unsigned long flags;
363 struct page *page;
364
365 if (pgd_none(*pgd_ref))
366 continue;
367
368 spin_lock_irqsave(&pgd_lock, flags);
369 list_for_each_entry(page, &pgd_list, lru) {
370 pgd_t *pgd;
371 pgd = (pgd_t *)page_address(page) + pgd_index(address);
372 if (pgd_none(*pgd))
373 set_pgd(pgd, *pgd_ref);
374 else
375 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
376 }
377 spin_unlock_irqrestore(&pgd_lock, flags);
378 }
379}
380
381/*
382 * 64-bit:
383 *
384 * Handle a fault on the vmalloc area
385 *
386 * This assumes no large pages in there.
387 */
388static noinline int vmalloc_fault(unsigned long address)
389{
390 pgd_t *pgd, *pgd_ref;
391 pud_t *pud, *pud_ref;
392 pmd_t *pmd, *pmd_ref;
393 pte_t *pte, *pte_ref;
394
395 /* Make sure we are in vmalloc area: */
396 if (!(address >= VMALLOC_START && address < VMALLOC_END))
397 return -1;
398
399 /*
400 * Copy kernel mappings over when needed. This can also
401 * happen within a race in page table update. In the later
402 * case just flush:
403 */
404 pgd = pgd_offset(current->active_mm, address);
405 pgd_ref = pgd_offset_k(address);
406 if (pgd_none(*pgd_ref))
407 return -1;
408
409 if (pgd_none(*pgd))
410 set_pgd(pgd, *pgd_ref);
411 else
412 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
413
414 /*
415 * Below here mismatches are bugs because these lower tables
416 * are shared:
417 */
418
419 pud = pud_offset(pgd, address);
420 pud_ref = pud_offset(pgd_ref, address);
421 if (pud_none(*pud_ref))
422 return -1;
423
424 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
425 BUG();
426
427 pmd = pmd_offset(pud, address);
428 pmd_ref = pmd_offset(pud_ref, address);
429 if (pmd_none(*pmd_ref))
430 return -1;
431
432 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
433 BUG();
434
435 pte_ref = pte_offset_kernel(pmd_ref, address);
436 if (!pte_present(*pte_ref))
437 return -1;
438
439 pte = pte_offset_kernel(pmd, address);
440
441 /*
442 * Don't use pte_page here, because the mappings can point
443 * outside mem_map, and the NUMA hash lookup cannot handle
444 * that:
445 */
446 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
447 BUG();
448
449 return 0;
450}
451
452static const char errata93_warning[] =
453KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
454KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
455KERN_ERR "******* Please consider a BIOS update.\n"
456KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
457
458/*
459 * No vm86 mode in 64-bit mode:
460 */
461static inline void
462check_v8086_mode(struct pt_regs *regs, unsigned long address,
463 struct task_struct *tsk)
464{
465}
466
467static int bad_address(void *p)
468{
469 unsigned long dummy;
470
471 return probe_kernel_address((unsigned long *)p, dummy);
472}
473
474static void dump_pagetable(unsigned long address)
475{
233 pgd_t *pgd; 476 pgd_t *pgd;
234 pud_t *pud; 477 pud_t *pud;
235 pmd_t *pmd; 478 pmd_t *pmd;
@@ -238,102 +481,77 @@ static void dump_pagetable(unsigned long address)
238 pgd = (pgd_t *)read_cr3(); 481 pgd = (pgd_t *)read_cr3();
239 482
240 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 483 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
484
241 pgd += pgd_index(address); 485 pgd += pgd_index(address);
242 if (bad_address(pgd)) goto bad; 486 if (bad_address(pgd))
487 goto bad;
488
243 printk("PGD %lx ", pgd_val(*pgd)); 489 printk("PGD %lx ", pgd_val(*pgd));
244 if (!pgd_present(*pgd)) goto ret; 490
491 if (!pgd_present(*pgd))
492 goto out;
245 493
246 pud = pud_offset(pgd, address); 494 pud = pud_offset(pgd, address);
247 if (bad_address(pud)) goto bad; 495 if (bad_address(pud))
496 goto bad;
497
248 printk("PUD %lx ", pud_val(*pud)); 498 printk("PUD %lx ", pud_val(*pud));
249 if (!pud_present(*pud) || pud_large(*pud)) 499 if (!pud_present(*pud) || pud_large(*pud))
250 goto ret; 500 goto out;
251 501
252 pmd = pmd_offset(pud, address); 502 pmd = pmd_offset(pud, address);
253 if (bad_address(pmd)) goto bad; 503 if (bad_address(pmd))
504 goto bad;
505
254 printk("PMD %lx ", pmd_val(*pmd)); 506 printk("PMD %lx ", pmd_val(*pmd));
255 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; 507 if (!pmd_present(*pmd) || pmd_large(*pmd))
508 goto out;
256 509
257 pte = pte_offset_kernel(pmd, address); 510 pte = pte_offset_kernel(pmd, address);
258 if (bad_address(pte)) goto bad; 511 if (bad_address(pte))
512 goto bad;
513
259 printk("PTE %lx", pte_val(*pte)); 514 printk("PTE %lx", pte_val(*pte));
260ret: 515out:
261 printk("\n"); 516 printk("\n");
262 return; 517 return;
263bad: 518bad:
264 printk("BAD\n"); 519 printk("BAD\n");
265#endif
266}
267
268#ifdef CONFIG_X86_32
269static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
270{
271 unsigned index = pgd_index(address);
272 pgd_t *pgd_k;
273 pud_t *pud, *pud_k;
274 pmd_t *pmd, *pmd_k;
275
276 pgd += index;
277 pgd_k = init_mm.pgd + index;
278
279 if (!pgd_present(*pgd_k))
280 return NULL;
281
282 /*
283 * set_pgd(pgd, *pgd_k); here would be useless on PAE
284 * and redundant with the set_pmd() on non-PAE. As would
285 * set_pud.
286 */
287
288 pud = pud_offset(pgd, address);
289 pud_k = pud_offset(pgd_k, address);
290 if (!pud_present(*pud_k))
291 return NULL;
292
293 pmd = pmd_offset(pud, address);
294 pmd_k = pmd_offset(pud_k, address);
295 if (!pmd_present(*pmd_k))
296 return NULL;
297 if (!pmd_present(*pmd)) {
298 set_pmd(pmd, *pmd_k);
299 arch_flush_lazy_mmu_mode();
300 } else
301 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
302 return pmd_k;
303} 520}
304#endif
305 521
306#ifdef CONFIG_X86_64 522#endif /* CONFIG_X86_64 */
307static const char errata93_warning[] =
308KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
309KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
310KERN_ERR "******* Please consider a BIOS update.\n"
311KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
312#endif
313 523
314/* Workaround for K8 erratum #93 & buggy BIOS. 524/*
315 BIOS SMM functions are required to use a specific workaround 525 * Workaround for K8 erratum #93 & buggy BIOS.
316 to avoid corruption of the 64bit RIP register on C stepping K8. 526 *
317 A lot of BIOS that didn't get tested properly miss this. 527 * BIOS SMM functions are required to use a specific workaround
318 The OS sees this as a page fault with the upper 32bits of RIP cleared. 528 * to avoid corruption of the 64bit RIP register on C stepping K8.
319 Try to work around it here. 529 *
320 Note we only handle faults in kernel here. 530 * A lot of BIOS that didn't get tested properly miss this.
321 Does nothing for X86_32 531 *
532 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
533 * Try to work around it here.
534 *
535 * Note we only handle faults in kernel here.
536 * Does nothing on 32-bit.
322 */ 537 */
323static int is_errata93(struct pt_regs *regs, unsigned long address) 538static int is_errata93(struct pt_regs *regs, unsigned long address)
324{ 539{
325#ifdef CONFIG_X86_64 540#ifdef CONFIG_X86_64
326 static int warned; 541 static int once;
542
327 if (address != regs->ip) 543 if (address != regs->ip)
328 return 0; 544 return 0;
545
329 if ((address >> 32) != 0) 546 if ((address >> 32) != 0)
330 return 0; 547 return 0;
548
331 address |= 0xffffffffUL << 32; 549 address |= 0xffffffffUL << 32;
332 if ((address >= (u64)_stext && address <= (u64)_etext) || 550 if ((address >= (u64)_stext && address <= (u64)_etext) ||
333 (address >= MODULES_VADDR && address <= MODULES_END)) { 551 (address >= MODULES_VADDR && address <= MODULES_END)) {
334 if (!warned) { 552 if (!once) {
335 printk(errata93_warning); 553 printk(errata93_warning);
336 warned = 1; 554 once = 1;
337 } 555 }
338 regs->ip = address; 556 regs->ip = address;
339 return 1; 557 return 1;
@@ -343,16 +561,17 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
343} 561}
344 562
345/* 563/*
346 * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal 564 * Work around K8 erratum #100 K8 in compat mode occasionally jumps
347 * addresses >4GB. We catch this in the page fault handler because these 565 * to illegal addresses >4GB.
348 * addresses are not reachable. Just detect this case and return. Any code 566 *
567 * We catch this in the page fault handler because these addresses
568 * are not reachable. Just detect this case and return. Any code
349 * segment in LDT is compatibility mode. 569 * segment in LDT is compatibility mode.
350 */ 570 */
351static int is_errata100(struct pt_regs *regs, unsigned long address) 571static int is_errata100(struct pt_regs *regs, unsigned long address)
352{ 572{
353#ifdef CONFIG_X86_64 573#ifdef CONFIG_X86_64
354 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && 574 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
355 (address >> 32))
356 return 1; 575 return 1;
357#endif 576#endif
358 return 0; 577 return 0;
@@ -362,8 +581,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
362{ 581{
363#ifdef CONFIG_X86_F00F_BUG 582#ifdef CONFIG_X86_F00F_BUG
364 unsigned long nr; 583 unsigned long nr;
584
365 /* 585 /*
366 * Pentium F0 0F C7 C8 bug workaround. 586 * Pentium F0 0F C7 C8 bug workaround:
367 */ 587 */
368 if (boot_cpu_data.f00f_bug) { 588 if (boot_cpu_data.f00f_bug) {
369 nr = (address - idt_descr.address) >> 3; 589 nr = (address - idt_descr.address) >> 3;
@@ -377,62 +597,277 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
377 return 0; 597 return 0;
378} 598}
379 599
380static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, 600static const char nx_warning[] = KERN_CRIT
381 unsigned long address) 601"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
602
603static void
604show_fault_oops(struct pt_regs *regs, unsigned long error_code,
605 unsigned long address)
382{ 606{
383#ifdef CONFIG_X86_32
384 if (!oops_may_print()) 607 if (!oops_may_print())
385 return; 608 return;
386#endif
387 609
388#ifdef CONFIG_X86_PAE
389 if (error_code & PF_INSTR) { 610 if (error_code & PF_INSTR) {
390 unsigned int level; 611 unsigned int level;
612
391 pte_t *pte = lookup_address(address, &level); 613 pte_t *pte = lookup_address(address, &level);
392 614
393 if (pte && pte_present(*pte) && !pte_exec(*pte)) 615 if (pte && pte_present(*pte) && !pte_exec(*pte))
394 printk(KERN_CRIT "kernel tried to execute " 616 printk(nx_warning, current_uid());
395 "NX-protected page - exploit attempt? "
396 "(uid: %d)\n", current_uid());
397 } 617 }
398#endif
399 618
400 printk(KERN_ALERT "BUG: unable to handle kernel "); 619 printk(KERN_ALERT "BUG: unable to handle kernel ");
401 if (address < PAGE_SIZE) 620 if (address < PAGE_SIZE)
402 printk(KERN_CONT "NULL pointer dereference"); 621 printk(KERN_CONT "NULL pointer dereference");
403 else 622 else
404 printk(KERN_CONT "paging request"); 623 printk(KERN_CONT "paging request");
624
405 printk(KERN_CONT " at %p\n", (void *) address); 625 printk(KERN_CONT " at %p\n", (void *) address);
406 printk(KERN_ALERT "IP:"); 626 printk(KERN_ALERT "IP:");
407 printk_address(regs->ip, 1); 627 printk_address(regs->ip, 1);
628
408 dump_pagetable(address); 629 dump_pagetable(address);
409} 630}
410 631
411#ifdef CONFIG_X86_64 632static noinline void
412static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, 633pgtable_bad(struct pt_regs *regs, unsigned long error_code,
413 unsigned long error_code) 634 unsigned long address)
414{ 635{
415 unsigned long flags = oops_begin();
416 int sig = SIGKILL;
417 struct task_struct *tsk; 636 struct task_struct *tsk;
637 unsigned long flags;
638 int sig;
639
640 flags = oops_begin();
641 tsk = current;
642 sig = SIGKILL;
418 643
419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 644 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
420 current->comm, address); 645 tsk->comm, address);
421 dump_pagetable(address); 646 dump_pagetable(address);
422 tsk = current; 647
423 tsk->thread.cr2 = address; 648 tsk->thread.cr2 = address;
424 tsk->thread.trap_no = 14; 649 tsk->thread.trap_no = 14;
425 tsk->thread.error_code = error_code; 650 tsk->thread.error_code = error_code;
651
426 if (__die("Bad pagetable", regs, error_code)) 652 if (__die("Bad pagetable", regs, error_code))
427 sig = 0; 653 sig = 0;
654
428 oops_end(flags, regs, sig); 655 oops_end(flags, regs, sig);
429} 656}
430#endif 657
658static noinline void
659no_context(struct pt_regs *regs, unsigned long error_code,
660 unsigned long address)
661{
662 struct task_struct *tsk = current;
663 unsigned long *stackend;
664 unsigned long flags;
665 int sig;
666
667 /* Are we prepared to handle this kernel fault? */
668 if (fixup_exception(regs))
669 return;
670
671 /*
672 * 32-bit:
673 *
674 * Valid to do another page fault here, because if this fault
675 * had been triggered by is_prefetch fixup_exception would have
676 * handled it.
677 *
678 * 64-bit:
679 *
680 * Hall of shame of CPU/BIOS bugs.
681 */
682 if (is_prefetch(regs, error_code, address))
683 return;
684
685 if (is_errata93(regs, address))
686 return;
687
688 /*
689 * Oops. The kernel tried to access some bad page. We'll have to
690 * terminate things with extreme prejudice:
691 */
692 flags = oops_begin();
693
694 show_fault_oops(regs, error_code, address);
695
696 stackend = end_of_stack(tsk);
697 if (*stackend != STACK_END_MAGIC)
698 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
699
700 tsk->thread.cr2 = address;
701 tsk->thread.trap_no = 14;
702 tsk->thread.error_code = error_code;
703
704 sig = SIGKILL;
705 if (__die("Oops", regs, error_code))
706 sig = 0;
707
708 /* Executive summary in case the body of the oops scrolled away */
709 printk(KERN_EMERG "CR2: %016lx\n", address);
710
711 oops_end(flags, regs, sig);
712}
713
714/*
715 * Print out info about fatal segfaults, if the show_unhandled_signals
716 * sysctl is set:
717 */
718static inline void
719show_signal_msg(struct pt_regs *regs, unsigned long error_code,
720 unsigned long address, struct task_struct *tsk)
721{
722 if (!unhandled_signal(tsk, SIGSEGV))
723 return;
724
725 if (!printk_ratelimit())
726 return;
727
728 printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
729 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
730 tsk->comm, task_pid_nr(tsk), address,
731 (void *)regs->ip, (void *)regs->sp, error_code);
732
733 print_vma_addr(KERN_CONT " in ", regs->ip);
734
735 printk(KERN_CONT "\n");
736}
737
738static void
739__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
740 unsigned long address, int si_code)
741{
742 struct task_struct *tsk = current;
743
744 /* User mode accesses just cause a SIGSEGV */
745 if (error_code & PF_USER) {
746 /*
747 * It's possible to have interrupts off here:
748 */
749 local_irq_enable();
750
751 /*
752 * Valid to do another page fault here because this one came
753 * from user space:
754 */
755 if (is_prefetch(regs, error_code, address))
756 return;
757
758 if (is_errata100(regs, address))
759 return;
760
761 if (unlikely(show_unhandled_signals))
762 show_signal_msg(regs, error_code, address, tsk);
763
764 /* Kernel addresses are always protection faults: */
765 tsk->thread.cr2 = address;
766 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
767 tsk->thread.trap_no = 14;
768
769 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
770
771 return;
772 }
773
774 if (is_f00f_bug(regs, address))
775 return;
776
777 no_context(regs, error_code, address);
778}
779
780static noinline void
781bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
782 unsigned long address)
783{
784 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
785}
786
787static void
788__bad_area(struct pt_regs *regs, unsigned long error_code,
789 unsigned long address, int si_code)
790{
791 struct mm_struct *mm = current->mm;
792
793 /*
794 * Something tried to access memory that isn't in our memory map..
795 * Fix it, but check if it's kernel or user first..
796 */
797 up_read(&mm->mmap_sem);
798
799 __bad_area_nosemaphore(regs, error_code, address, si_code);
800}
801
802static noinline void
803bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
804{
805 __bad_area(regs, error_code, address, SEGV_MAPERR);
806}
807
808static noinline void
809bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
810 unsigned long address)
811{
812 __bad_area(regs, error_code, address, SEGV_ACCERR);
813}
814
815/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
816static void
817out_of_memory(struct pt_regs *regs, unsigned long error_code,
818 unsigned long address)
819{
820 /*
821 * We ran out of memory, call the OOM killer, and return the userspace
822 * (which will retry the fault, or kill us if we got oom-killed):
823 */
824 up_read(&current->mm->mmap_sem);
825
826 pagefault_out_of_memory();
827}
828
829static void
830do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
831{
832 struct task_struct *tsk = current;
833 struct mm_struct *mm = tsk->mm;
834
835 up_read(&mm->mmap_sem);
836
837 /* Kernel mode? Handle exceptions or die: */
838 if (!(error_code & PF_USER))
839 no_context(regs, error_code, address);
840
841 /* User-space => ok to do another page fault: */
842 if (is_prefetch(regs, error_code, address))
843 return;
844
845 tsk->thread.cr2 = address;
846 tsk->thread.error_code = error_code;
847 tsk->thread.trap_no = 14;
848
849 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
850}
851
852static noinline void
853mm_fault_error(struct pt_regs *regs, unsigned long error_code,
854 unsigned long address, unsigned int fault)
855{
856 if (fault & VM_FAULT_OOM) {
857 out_of_memory(regs, error_code, address);
858 } else {
859 if (fault & VM_FAULT_SIGBUS)
860 do_sigbus(regs, error_code, address);
861 else
862 BUG();
863 }
864}
431 865
432static int spurious_fault_check(unsigned long error_code, pte_t *pte) 866static int spurious_fault_check(unsigned long error_code, pte_t *pte)
433{ 867{
434 if ((error_code & PF_WRITE) && !pte_write(*pte)) 868 if ((error_code & PF_WRITE) && !pte_write(*pte))
435 return 0; 869 return 0;
870
436 if ((error_code & PF_INSTR) && !pte_exec(*pte)) 871 if ((error_code & PF_INSTR) && !pte_exec(*pte))
437 return 0; 872 return 0;
438 873
@@ -440,21 +875,25 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
440} 875}
441 876
442/* 877/*
443 * Handle a spurious fault caused by a stale TLB entry. This allows 878 * Handle a spurious fault caused by a stale TLB entry.
444 * us to lazily refresh the TLB when increasing the permissions of a 879 *
445 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very 880 * This allows us to lazily refresh the TLB when increasing the
446 * expensive since that implies doing a full cross-processor TLB 881 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
447 * flush, even if no stale TLB entries exist on other processors. 882 * eagerly is very expensive since that implies doing a full
883 * cross-processor TLB flush, even if no stale TLB entries exist
884 * on other processors.
885 *
448 * There are no security implications to leaving a stale TLB when 886 * There are no security implications to leaving a stale TLB when
449 * increasing the permissions on a page. 887 * increasing the permissions on a page.
450 */ 888 */
451static int spurious_fault(unsigned long address, 889static noinline int
452 unsigned long error_code) 890spurious_fault(unsigned long error_code, unsigned long address)
453{ 891{
454 pgd_t *pgd; 892 pgd_t *pgd;
455 pud_t *pud; 893 pud_t *pud;
456 pmd_t *pmd; 894 pmd_t *pmd;
457 pte_t *pte; 895 pte_t *pte;
896 int ret;
458 897
459 /* Reserved-bit violation or user access to kernel space? */ 898 /* Reserved-bit violation or user access to kernel space? */
460 if (error_code & (PF_USER | PF_RSVD)) 899 if (error_code & (PF_USER | PF_RSVD))
@@ -482,127 +921,71 @@ static int spurious_fault(unsigned long address,
482 if (!pte_present(*pte)) 921 if (!pte_present(*pte))
483 return 0; 922 return 0;
484 923
485 return spurious_fault_check(error_code, pte); 924 ret = spurious_fault_check(error_code, pte);
486} 925 if (!ret)
487 926 return 0;
488/*
489 * X86_32
490 * Handle a fault on the vmalloc or module mapping area
491 *
492 * X86_64
493 * Handle a fault on the vmalloc area
494 *
495 * This assumes no large pages in there.
496 */
497static int vmalloc_fault(unsigned long address)
498{
499#ifdef CONFIG_X86_32
500 unsigned long pgd_paddr;
501 pmd_t *pmd_k;
502 pte_t *pte_k;
503
504 /* Make sure we are in vmalloc area */
505 if (!(address >= VMALLOC_START && address < VMALLOC_END))
506 return -1;
507 927
508 /* 928 /*
509 * Synchronize this task's top level page-table 929 * Make sure we have permissions in PMD.
510 * with the 'reference' page table. 930 * If not, then there's a bug in the page tables:
511 *
512 * Do _not_ use "current" here. We might be inside
513 * an interrupt in the middle of a task switch..
514 */ 931 */
515 pgd_paddr = read_cr3(); 932 ret = spurious_fault_check(error_code, (pte_t *) pmd);
516 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); 933 WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
517 if (!pmd_k)
518 return -1;
519 pte_k = pte_offset_kernel(pmd_k, address);
520 if (!pte_present(*pte_k))
521 return -1;
522 return 0;
523#else
524 pgd_t *pgd, *pgd_ref;
525 pud_t *pud, *pud_ref;
526 pmd_t *pmd, *pmd_ref;
527 pte_t *pte, *pte_ref;
528 934
529 /* Make sure we are in vmalloc area */ 935 return ret;
530 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 936}
531 return -1;
532 937
533 /* Copy kernel mappings over when needed. This can also 938int show_unhandled_signals = 1;
534 happen within a race in page table update. In the later
535 case just flush. */
536 939
537 pgd = pgd_offset(current->active_mm, address); 940static inline int
538 pgd_ref = pgd_offset_k(address); 941access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
539 if (pgd_none(*pgd_ref)) 942{
540 return -1; 943 if (write) {
541 if (pgd_none(*pgd)) 944 /* write, present and write, not present: */
542 set_pgd(pgd, *pgd_ref); 945 if (unlikely(!(vma->vm_flags & VM_WRITE)))
543 else 946 return 1;
544 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); 947 return 0;
948 }
545 949
546 /* Below here mismatches are bugs because these lower tables 950 /* read, present: */
547 are shared */ 951 if (unlikely(error_code & PF_PROT))
952 return 1;
953
954 /* read, not present: */
955 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
956 return 1;
548 957
549 pud = pud_offset(pgd, address);
550 pud_ref = pud_offset(pgd_ref, address);
551 if (pud_none(*pud_ref))
552 return -1;
553 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
554 BUG();
555 pmd = pmd_offset(pud, address);
556 pmd_ref = pmd_offset(pud_ref, address);
557 if (pmd_none(*pmd_ref))
558 return -1;
559 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
560 BUG();
561 pte_ref = pte_offset_kernel(pmd_ref, address);
562 if (!pte_present(*pte_ref))
563 return -1;
564 pte = pte_offset_kernel(pmd, address);
565 /* Don't use pte_page here, because the mappings can point
566 outside mem_map, and the NUMA hash lookup cannot handle
567 that. */
568 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
569 BUG();
570 return 0; 958 return 0;
571#endif
572} 959}
573 960
574int show_unhandled_signals = 1; 961static int fault_in_kernel_space(unsigned long address)
962{
963 return address >= TASK_SIZE_MAX;
964}
575 965
576/* 966/*
577 * This routine handles page faults. It determines the address, 967 * This routine handles page faults. It determines the address,
578 * and the problem, and then passes it off to one of the appropriate 968 * and the problem, and then passes it off to one of the appropriate
579 * routines. 969 * routines.
580 */ 970 */
581#ifdef CONFIG_X86_64 971dotraplinkage void __kprobes
582asmlinkage 972do_page_fault(struct pt_regs *regs, unsigned long error_code)
583#endif
584void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
585{ 973{
586 struct task_struct *tsk;
587 struct mm_struct *mm;
588 struct vm_area_struct *vma; 974 struct vm_area_struct *vma;
975 struct task_struct *tsk;
589 unsigned long address; 976 unsigned long address;
590 int write, si_code; 977 struct mm_struct *mm;
978 int write;
591 int fault; 979 int fault;
592#ifdef CONFIG_X86_64
593 unsigned long flags;
594 int sig;
595#endif
596 980
597 tsk = current; 981 tsk = current;
598 mm = tsk->mm; 982 mm = tsk->mm;
983
599 prefetchw(&mm->mmap_sem); 984 prefetchw(&mm->mmap_sem);
600 985
601 /* get the address */ 986 /* Get the faulting address: */
602 address = read_cr2(); 987 address = read_cr2();
603 988
604 si_code = SEGV_MAPERR;
605
606 if (unlikely(kmmio_fault(regs, address))) 989 if (unlikely(kmmio_fault(regs, address)))
607 return; 990 return;
608 991
@@ -619,319 +1002,147 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
619 * (error_code & 4) == 0, and that the fault was not a 1002 * (error_code & 4) == 0, and that the fault was not a
620 * protection error (error_code & 9) == 0. 1003 * protection error (error_code & 9) == 0.
621 */ 1004 */
622#ifdef CONFIG_X86_32 1005 if (unlikely(fault_in_kernel_space(address))) {
623 if (unlikely(address >= TASK_SIZE)) {
624#else
625 if (unlikely(address >= TASK_SIZE64)) {
626#endif
627 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 1006 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
628 vmalloc_fault(address) >= 0) 1007 vmalloc_fault(address) >= 0)
629 return; 1008 return;
630 1009
631 /* Can handle a stale RO->RW TLB */ 1010 /* Can handle a stale RO->RW TLB: */
632 if (spurious_fault(address, error_code)) 1011 if (spurious_fault(error_code, address))
633 return; 1012 return;
634 1013
635 /* kprobes don't want to hook the spurious faults. */ 1014 /* kprobes don't want to hook the spurious faults: */
636 if (notify_page_fault(regs)) 1015 if (notify_page_fault(regs))
637 return; 1016 return;
638 /* 1017 /*
639 * Don't take the mm semaphore here. If we fixup a prefetch 1018 * Don't take the mm semaphore here. If we fixup a prefetch
640 * fault we could otherwise deadlock. 1019 * fault we could otherwise deadlock:
641 */ 1020 */
642 goto bad_area_nosemaphore; 1021 bad_area_nosemaphore(regs, error_code, address);
643 }
644 1022
645 /* kprobes don't want to hook the spurious faults. */
646 if (notify_page_fault(regs))
647 return; 1023 return;
1024 }
648 1025
1026 /* kprobes don't want to hook the spurious faults: */
1027 if (unlikely(notify_page_fault(regs)))
1028 return;
649 /* 1029 /*
650 * It's safe to allow irq's after cr2 has been saved and the 1030 * It's safe to allow irq's after cr2 has been saved and the
651 * vmalloc fault has been handled. 1031 * vmalloc fault has been handled.
652 * 1032 *
653 * User-mode registers count as a user access even for any 1033 * User-mode registers count as a user access even for any
654 * potential system fault or CPU buglet. 1034 * potential system fault or CPU buglet:
655 */ 1035 */
656 if (user_mode_vm(regs)) { 1036 if (user_mode_vm(regs)) {
657 local_irq_enable(); 1037 local_irq_enable();
658 error_code |= PF_USER; 1038 error_code |= PF_USER;
659 } else if (regs->flags & X86_EFLAGS_IF) 1039 } else {
660 local_irq_enable(); 1040 if (regs->flags & X86_EFLAGS_IF)
1041 local_irq_enable();
1042 }
661 1043
662#ifdef CONFIG_X86_64
663 if (unlikely(error_code & PF_RSVD)) 1044 if (unlikely(error_code & PF_RSVD))
664 pgtable_bad(address, regs, error_code); 1045 pgtable_bad(regs, error_code, address);
665#endif
666 1046
667 /* 1047 /*
668 * If we're in an interrupt, have no user context or are running in an 1048 * If we're in an interrupt, have no user context or are running
669 * atomic region then we must not take the fault. 1049 * in an atomic region then we must not take the fault:
670 */ 1050 */
671 if (unlikely(in_atomic() || !mm)) 1051 if (unlikely(in_atomic() || !mm)) {
672 goto bad_area_nosemaphore; 1052 bad_area_nosemaphore(regs, error_code, address);
1053 return;
1054 }
673 1055
674 /* 1056 /*
675 * When running in the kernel we expect faults to occur only to 1057 * When running in the kernel we expect faults to occur only to
676 * addresses in user space. All other faults represent errors in the 1058 * addresses in user space. All other faults represent errors in
677 * kernel and should generate an OOPS. Unfortunately, in the case of an 1059 * the kernel and should generate an OOPS. Unfortunately, in the
678 * erroneous fault occurring in a code path which already holds mmap_sem 1060 * case of an erroneous fault occurring in a code path which already
679 * we will deadlock attempting to validate the fault against the 1061 * holds mmap_sem we will deadlock attempting to validate the fault
680 * address space. Luckily the kernel only validly references user 1062 * against the address space. Luckily the kernel only validly
681 * space from well defined areas of code, which are listed in the 1063 * references user space from well defined areas of code, which are
682 * exceptions table. 1064 * listed in the exceptions table.
683 * 1065 *
684 * As the vast majority of faults will be valid we will only perform 1066 * As the vast majority of faults will be valid we will only perform
685 * the source reference check when there is a possibility of a deadlock. 1067 * the source reference check when there is a possibility of a
686 * Attempt to lock the address space, if we cannot we then validate the 1068 * deadlock. Attempt to lock the address space, if we cannot we then
687 * source. If this is invalid we can skip the address space check, 1069 * validate the source. If this is invalid we can skip the address
688 * thus avoiding the deadlock. 1070 * space check, thus avoiding the deadlock:
689 */ 1071 */
690 if (!down_read_trylock(&mm->mmap_sem)) { 1072 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
691 if ((error_code & PF_USER) == 0 && 1073 if ((error_code & PF_USER) == 0 &&
692 !search_exception_tables(regs->ip)) 1074 !search_exception_tables(regs->ip)) {
693 goto bad_area_nosemaphore; 1075 bad_area_nosemaphore(regs, error_code, address);
1076 return;
1077 }
694 down_read(&mm->mmap_sem); 1078 down_read(&mm->mmap_sem);
1079 } else {
1080 /*
1081 * The above down_read_trylock() might have succeeded in
1082 * which case we'll have missed the might_sleep() from
1083 * down_read():
1084 */
1085 might_sleep();
695 } 1086 }
696 1087
697 vma = find_vma(mm, address); 1088 vma = find_vma(mm, address);
698 if (!vma) 1089 if (unlikely(!vma)) {
699 goto bad_area; 1090 bad_area(regs, error_code, address);
700 if (vma->vm_start <= address) 1091 return;
1092 }
1093 if (likely(vma->vm_start <= address))
701 goto good_area; 1094 goto good_area;
702 if (!(vma->vm_flags & VM_GROWSDOWN)) 1095 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
703 goto bad_area; 1096 bad_area(regs, error_code, address);
1097 return;
1098 }
704 if (error_code & PF_USER) { 1099 if (error_code & PF_USER) {
705 /* 1100 /*
706 * Accessing the stack below %sp is always a bug. 1101 * Accessing the stack below %sp is always a bug.
707 * The large cushion allows instructions like enter 1102 * The large cushion allows instructions like enter
708 * and pusha to work. ("enter $65535,$31" pushes 1103 * and pusha to work. ("enter $65535, $31" pushes
709 * 32 pointers and then decrements %sp by 65535.) 1104 * 32 pointers and then decrements %sp by 65535.)
710 */ 1105 */
711 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) 1106 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
712 goto bad_area; 1107 bad_area(regs, error_code, address);
1108 return;
1109 }
713 } 1110 }
714 if (expand_stack(vma, address)) 1111 if (unlikely(expand_stack(vma, address))) {
715 goto bad_area; 1112 bad_area(regs, error_code, address);
716/* 1113 return;
717 * Ok, we have a good vm_area for this memory access, so 1114 }
718 * we can handle it.. 1115
719 */ 1116 /*
1117 * Ok, we have a good vm_area for this memory access, so
1118 * we can handle it..
1119 */
720good_area: 1120good_area:
721 si_code = SEGV_ACCERR; 1121 write = error_code & PF_WRITE;
722 write = 0; 1122
723 switch (error_code & (PF_PROT|PF_WRITE)) { 1123 if (unlikely(access_error(error_code, write, vma))) {
724 default: /* 3: write, present */ 1124 bad_area_access_error(regs, error_code, address);
725 /* fall through */ 1125 return;
726 case PF_WRITE: /* write, not present */
727 if (!(vma->vm_flags & VM_WRITE))
728 goto bad_area;
729 write++;
730 break;
731 case PF_PROT: /* read, present */
732 goto bad_area;
733 case 0: /* read, not present */
734 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
735 goto bad_area;
736 } 1126 }
737 1127
738 /* 1128 /*
739 * If for any reason at all we couldn't handle the fault, 1129 * If for any reason at all we couldn't handle the fault,
740 * make sure we exit gracefully rather than endlessly redo 1130 * make sure we exit gracefully rather than endlessly redo
741 * the fault. 1131 * the fault:
742 */ 1132 */
743 fault = handle_mm_fault(mm, vma, address, write); 1133 fault = handle_mm_fault(mm, vma, address, write);
1134
744 if (unlikely(fault & VM_FAULT_ERROR)) { 1135 if (unlikely(fault & VM_FAULT_ERROR)) {
745 if (fault & VM_FAULT_OOM) 1136 mm_fault_error(regs, error_code, address, fault);
746 goto out_of_memory; 1137 return;
747 else if (fault & VM_FAULT_SIGBUS)
748 goto do_sigbus;
749 BUG();
750 } 1138 }
1139
751 if (fault & VM_FAULT_MAJOR) 1140 if (fault & VM_FAULT_MAJOR)
752 tsk->maj_flt++; 1141 tsk->maj_flt++;
753 else 1142 else
754 tsk->min_flt++; 1143 tsk->min_flt++;
755 1144
756#ifdef CONFIG_X86_32 1145 check_v8086_mode(regs, address, tsk);
757 /*
758 * Did it hit the DOS screen memory VA from vm86 mode?
759 */
760 if (v8086_mode(regs)) {
761 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
762 if (bit < 32)
763 tsk->thread.screen_bitmap |= 1 << bit;
764 }
765#endif
766 up_read(&mm->mmap_sem);
767 return;
768 1146
769/*
770 * Something tried to access memory that isn't in our memory map..
771 * Fix it, but check if it's kernel or user first..
772 */
773bad_area:
774 up_read(&mm->mmap_sem); 1147 up_read(&mm->mmap_sem);
775
776bad_area_nosemaphore:
777 /* User mode accesses just cause a SIGSEGV */
778 if (error_code & PF_USER) {
779 /*
780 * It's possible to have interrupts off here.
781 */
782 local_irq_enable();
783
784 /*
785 * Valid to do another page fault here because this one came
786 * from user space.
787 */
788 if (is_prefetch(regs, address, error_code))
789 return;
790
791 if (is_errata100(regs, address))
792 return;
793
794 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
795 printk_ratelimit()) {
796 printk(
797 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
798 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
799 tsk->comm, task_pid_nr(tsk), address,
800 (void *) regs->ip, (void *) regs->sp, error_code);
801 print_vma_addr(" in ", regs->ip);
802 printk("\n");
803 }
804
805 tsk->thread.cr2 = address;
806 /* Kernel addresses are always protection faults */
807 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
808 tsk->thread.trap_no = 14;
809 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
810 return;
811 }
812
813 if (is_f00f_bug(regs, address))
814 return;
815
816no_context:
817 /* Are we prepared to handle this kernel fault? */
818 if (fixup_exception(regs))
819 return;
820
821 /*
822 * X86_32
823 * Valid to do another page fault here, because if this fault
824 * had been triggered by is_prefetch fixup_exception would have
825 * handled it.
826 *
827 * X86_64
828 * Hall of shame of CPU/BIOS bugs.
829 */
830 if (is_prefetch(regs, address, error_code))
831 return;
832
833 if (is_errata93(regs, address))
834 return;
835
836/*
837 * Oops. The kernel tried to access some bad page. We'll have to
838 * terminate things with extreme prejudice.
839 */
840#ifdef CONFIG_X86_32
841 bust_spinlocks(1);
842#else
843 flags = oops_begin();
844#endif
845
846 show_fault_oops(regs, error_code, address);
847
848 tsk->thread.cr2 = address;
849 tsk->thread.trap_no = 14;
850 tsk->thread.error_code = error_code;
851
852#ifdef CONFIG_X86_32
853 die("Oops", regs, error_code);
854 bust_spinlocks(0);
855 do_exit(SIGKILL);
856#else
857 sig = SIGKILL;
858 if (__die("Oops", regs, error_code))
859 sig = 0;
860 /* Executive summary in case the body of the oops scrolled away */
861 printk(KERN_EMERG "CR2: %016lx\n", address);
862 oops_end(flags, regs, sig);
863#endif
864
865out_of_memory:
866 /*
867 * We ran out of memory, call the OOM killer, and return the userspace
868 * (which will retry the fault, or kill us if we got oom-killed).
869 */
870 up_read(&mm->mmap_sem);
871 pagefault_out_of_memory();
872 return;
873
874do_sigbus:
875 up_read(&mm->mmap_sem);
876
877 /* Kernel mode? Handle exceptions or die */
878 if (!(error_code & PF_USER))
879 goto no_context;
880#ifdef CONFIG_X86_32
881 /* User space => ok to do another page fault */
882 if (is_prefetch(regs, address, error_code))
883 return;
884#endif
885 tsk->thread.cr2 = address;
886 tsk->thread.error_code = error_code;
887 tsk->thread.trap_no = 14;
888 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
889}
890
891DEFINE_SPINLOCK(pgd_lock);
892LIST_HEAD(pgd_list);
893
894void vmalloc_sync_all(void)
895{
896 unsigned long address;
897
898#ifdef CONFIG_X86_32
899 if (SHARED_KERNEL_PMD)
900 return;
901
902 for (address = VMALLOC_START & PMD_MASK;
903 address >= TASK_SIZE && address < FIXADDR_TOP;
904 address += PMD_SIZE) {
905 unsigned long flags;
906 struct page *page;
907
908 spin_lock_irqsave(&pgd_lock, flags);
909 list_for_each_entry(page, &pgd_list, lru) {
910 if (!vmalloc_sync_one(page_address(page),
911 address))
912 break;
913 }
914 spin_unlock_irqrestore(&pgd_lock, flags);
915 }
916#else /* CONFIG_X86_64 */
917 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
918 address += PGDIR_SIZE) {
919 const pgd_t *pgd_ref = pgd_offset_k(address);
920 unsigned long flags;
921 struct page *page;
922
923 if (pgd_none(*pgd_ref))
924 continue;
925 spin_lock_irqsave(&pgd_lock, flags);
926 list_for_each_entry(page, &pgd_list, lru) {
927 pgd_t *pgd;
928 pgd = (pgd_t *)page_address(page) + pgd_index(address);
929 if (pgd_none(*pgd))
930 set_pgd(pgd, *pgd_ref);
931 else
932 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
933 }
934 spin_unlock_irqrestore(&pgd_lock, flags);
935 }
936#endif
937} 1148}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index bcc079c282dd..00f127c80b0e 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -1,5 +1,6 @@
1#include <linux/highmem.h> 1#include <linux/highmem.h>
2#include <linux/module.h> 2#include <linux/module.h>
3#include <linux/swap.h> /* for totalram_pages */
3 4
4void *kmap(struct page *page) 5void *kmap(struct page *page)
5{ 6{
@@ -156,3 +157,36 @@ EXPORT_SYMBOL(kmap);
156EXPORT_SYMBOL(kunmap); 157EXPORT_SYMBOL(kunmap);
157EXPORT_SYMBOL(kmap_atomic); 158EXPORT_SYMBOL(kmap_atomic);
158EXPORT_SYMBOL(kunmap_atomic); 159EXPORT_SYMBOL(kunmap_atomic);
160
161#ifdef CONFIG_NUMA
162void __init set_highmem_pages_init(void)
163{
164 struct zone *zone;
165 int nid;
166
167 for_each_zone(zone) {
168 unsigned long zone_start_pfn, zone_end_pfn;
169
170 if (!is_highmem(zone))
171 continue;
172
173 zone_start_pfn = zone->zone_start_pfn;
174 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
175
176 nid = zone_to_nid(zone);
177 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
178 zone->name, nid, zone_start_pfn, zone_end_pfn);
179
180 add_highpages_with_active_regions(nid, zone_start_pfn,
181 zone_end_pfn);
182 }
183 totalram_pages += totalhigh_pages;
184}
185#else
186void __init set_highmem_pages_init(void)
187{
188 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
189
190 totalram_pages += totalhigh_pages;
191}
192#endif /* CONFIG_NUMA */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
new file mode 100644
index 000000000000..ce6a722587d8
--- /dev/null
+++ b/arch/x86/mm/init.c
@@ -0,0 +1,49 @@
1#include <linux/swap.h>
2#include <asm/cacheflush.h>
3#include <asm/page.h>
4#include <asm/sections.h>
5#include <asm/system.h>
6
7void free_init_pages(char *what, unsigned long begin, unsigned long end)
8{
9 unsigned long addr = begin;
10
11 if (addr >= end)
12 return;
13
14 /*
15 * If debugging page accesses then do not free this memory but
16 * mark them not present - any buggy init-section access will
17 * create a kernel page fault:
18 */
19#ifdef CONFIG_DEBUG_PAGEALLOC
20 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
21 begin, PAGE_ALIGN(end));
22 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
23#else
24 /*
25 * We just marked the kernel text read only above, now that
26 * we are going to free part of that, we need to make that
27 * writeable first.
28 */
29 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
30
31 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
32
33 for (; addr < end; addr += PAGE_SIZE) {
34 ClearPageReserved(virt_to_page(addr));
35 init_page_count(virt_to_page(addr));
36 memset((void *)(addr & ~(PAGE_SIZE-1)),
37 POISON_FREE_INITMEM, PAGE_SIZE);
38 free_page(addr);
39 totalram_pages++;
40 }
41#endif
42}
43
44void free_initmem(void)
45{
46 free_init_pages("unused kernel memory",
47 (unsigned long)(&__init_begin),
48 (unsigned long)(&__init_end));
49}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2cef05074413..47df0e1bbeb9 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,9 +49,6 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/smp.h>
53
54unsigned int __VMALLOC_RESERVE = 128 << 20;
55 52
56unsigned long max_low_pfn_mapped; 53unsigned long max_low_pfn_mapped;
57unsigned long max_pfn_mapped; 54unsigned long max_pfn_mapped;
@@ -138,6 +135,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
138 return pte_offset_kernel(pmd, 0); 135 return pte_offset_kernel(pmd, 0);
139} 136}
140 137
138pmd_t * __init populate_extra_pmd(unsigned long vaddr)
139{
140 int pgd_idx = pgd_index(vaddr);
141 int pmd_idx = pmd_index(vaddr);
142
143 return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
144}
145
146pte_t * __init populate_extra_pte(unsigned long vaddr)
147{
148 int pte_idx = pte_index(vaddr);
149 pmd_t *pmd;
150
151 pmd = populate_extra_pmd(vaddr);
152 return one_page_table_init(pmd) + pte_idx;
153}
154
141static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, 155static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
142 unsigned long vaddr, pte_t *lastpte) 156 unsigned long vaddr, pte_t *lastpte)
143{ 157{
@@ -470,22 +484,10 @@ void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
470 work_with_active_regions(nid, add_highpages_work_fn, &data); 484 work_with_active_regions(nid, add_highpages_work_fn, &data);
471} 485}
472 486
473#ifndef CONFIG_NUMA
474static void __init set_highmem_pages_init(void)
475{
476 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
477
478 totalram_pages += totalhigh_pages;
479}
480#endif /* !CONFIG_NUMA */
481
482#else 487#else
483static inline void permanent_kmaps_init(pgd_t *pgd_base) 488static inline void permanent_kmaps_init(pgd_t *pgd_base)
484{ 489{
485} 490}
486static inline void set_highmem_pages_init(void)
487{
488}
489#endif /* CONFIG_HIGHMEM */ 491#endif /* CONFIG_HIGHMEM */
490 492
491void __init native_pagetable_setup_start(pgd_t *base) 493void __init native_pagetable_setup_start(pgd_t *base)
@@ -675,75 +677,97 @@ static int __init parse_highmem(char *arg)
675} 677}
676early_param("highmem", parse_highmem); 678early_param("highmem", parse_highmem);
677 679
680#define MSG_HIGHMEM_TOO_BIG \
681 "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
682
683#define MSG_LOWMEM_TOO_SMALL \
684 "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
678/* 685/*
679 * Determine low and high memory ranges: 686 * All of RAM fits into lowmem - but if user wants highmem
687 * artificially via the highmem=x boot parameter then create
688 * it:
680 */ 689 */
681void __init find_low_pfn_range(void) 690void __init lowmem_pfn_init(void)
682{ 691{
683 /* it could update max_pfn */
684
685 /* max_low_pfn is 0, we already have early_res support */ 692 /* max_low_pfn is 0, we already have early_res support */
686
687 max_low_pfn = max_pfn; 693 max_low_pfn = max_pfn;
688 if (max_low_pfn > MAXMEM_PFN) { 694
689 if (highmem_pages == -1) 695 if (highmem_pages == -1)
690 highmem_pages = max_pfn - MAXMEM_PFN; 696 highmem_pages = 0;
691 if (highmem_pages + MAXMEM_PFN < max_pfn) 697#ifdef CONFIG_HIGHMEM
692 max_pfn = MAXMEM_PFN + highmem_pages; 698 if (highmem_pages >= max_pfn) {
693 if (highmem_pages + MAXMEM_PFN > max_pfn) { 699 printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
694 printk(KERN_WARNING "only %luMB highmem pages " 700 pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
695 "available, ignoring highmem size of %uMB.\n", 701 highmem_pages = 0;
696 pages_to_mb(max_pfn - MAXMEM_PFN), 702 }
703 if (highmem_pages) {
704 if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
705 printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
697 pages_to_mb(highmem_pages)); 706 pages_to_mb(highmem_pages));
698 highmem_pages = 0; 707 highmem_pages = 0;
699 } 708 }
700 max_low_pfn = MAXMEM_PFN; 709 max_low_pfn -= highmem_pages;
710 }
711#else
712 if (highmem_pages)
713 printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
714#endif
715}
716
717#define MSG_HIGHMEM_TOO_SMALL \
718 "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
719
720#define MSG_HIGHMEM_TRIMMED \
721 "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
722/*
723 * We have more RAM than fits into lowmem - we try to put it into
724 * highmem, also taking the highmem=x boot parameter into account:
725 */
726void __init highmem_pfn_init(void)
727{
728 max_low_pfn = MAXMEM_PFN;
729
730 if (highmem_pages == -1)
731 highmem_pages = max_pfn - MAXMEM_PFN;
732
733 if (highmem_pages + MAXMEM_PFN < max_pfn)
734 max_pfn = MAXMEM_PFN + highmem_pages;
735
736 if (highmem_pages + MAXMEM_PFN > max_pfn) {
737 printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
738 pages_to_mb(max_pfn - MAXMEM_PFN),
739 pages_to_mb(highmem_pages));
740 highmem_pages = 0;
741 }
701#ifndef CONFIG_HIGHMEM 742#ifndef CONFIG_HIGHMEM
702 /* Maximum memory usable is what is directly addressable */ 743 /* Maximum memory usable is what is directly addressable */
703 printk(KERN_WARNING "Warning only %ldMB will be used.\n", 744 printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
704 MAXMEM>>20); 745 if (max_pfn > MAX_NONPAE_PFN)
705 if (max_pfn > MAX_NONPAE_PFN) 746 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
706 printk(KERN_WARNING 747 else
707 "Use a HIGHMEM64G enabled kernel.\n"); 748 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
708 else 749 max_pfn = MAXMEM_PFN;
709 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
710 max_pfn = MAXMEM_PFN;
711#else /* !CONFIG_HIGHMEM */ 750#else /* !CONFIG_HIGHMEM */
712#ifndef CONFIG_HIGHMEM64G 751#ifndef CONFIG_HIGHMEM64G
713 if (max_pfn > MAX_NONPAE_PFN) { 752 if (max_pfn > MAX_NONPAE_PFN) {
714 max_pfn = MAX_NONPAE_PFN; 753 max_pfn = MAX_NONPAE_PFN;
715 printk(KERN_WARNING "Warning only 4GB will be used." 754 printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
716 "Use a HIGHMEM64G enabled kernel.\n"); 755 }
717 }
718#endif /* !CONFIG_HIGHMEM64G */ 756#endif /* !CONFIG_HIGHMEM64G */
719#endif /* !CONFIG_HIGHMEM */ 757#endif /* !CONFIG_HIGHMEM */
720 } else { 758}
721 if (highmem_pages == -1) 759
722 highmem_pages = 0; 760/*
723#ifdef CONFIG_HIGHMEM 761 * Determine low and high memory ranges:
724 if (highmem_pages >= max_pfn) { 762 */
725 printk(KERN_ERR "highmem size specified (%uMB) is " 763void __init find_low_pfn_range(void)
726 "bigger than pages available (%luMB)!.\n", 764{
727 pages_to_mb(highmem_pages), 765 /* it could update max_pfn */
728 pages_to_mb(max_pfn)); 766
729 highmem_pages = 0; 767 if (max_pfn <= MAXMEM_PFN)
730 } 768 lowmem_pfn_init();
731 if (highmem_pages) { 769 else
732 if (max_low_pfn - highmem_pages < 770 highmem_pfn_init();
733 64*1024*1024/PAGE_SIZE){
734 printk(KERN_ERR "highmem size %uMB results in "
735 "smaller than 64MB lowmem, ignoring it.\n"
736 , pages_to_mb(highmem_pages));
737 highmem_pages = 0;
738 }
739 max_low_pfn -= highmem_pages;
740 }
741#else
742 if (highmem_pages)
743 printk(KERN_ERR "ignoring highmem size on non-highmem"
744 " kernel!\n");
745#endif
746 }
747} 771}
748 772
749#ifndef CONFIG_NEED_MULTIPLE_NODES 773#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -826,10 +850,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse)
826 unsigned long puds, pmds, ptes, tables, start; 850 unsigned long puds, pmds, ptes, tables, start;
827 851
828 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 852 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
829 tables = PAGE_ALIGN(puds * sizeof(pud_t)); 853 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
830 854
831 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 855 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
832 tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); 856 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
833 857
834 if (use_pse) { 858 if (use_pse) {
835 unsigned long extra; 859 unsigned long extra;
@@ -840,10 +864,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse)
840 } else 864 } else
841 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; 865 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
842 866
843 tables += PAGE_ALIGN(ptes * sizeof(pte_t)); 867 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
844 868
845 /* for fixmap */ 869 /* for fixmap */
846 tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t)); 870 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
847 871
848 /* 872 /*
849 * RED-PEN putting page tables only on node 0 could 873 * RED-PEN putting page tables only on node 0 could
@@ -1193,45 +1217,6 @@ void mark_rodata_ro(void)
1193} 1217}
1194#endif 1218#endif
1195 1219
1196void free_init_pages(char *what, unsigned long begin, unsigned long end)
1197{
1198#ifdef CONFIG_DEBUG_PAGEALLOC
1199 /*
1200 * If debugging page accesses then do not free this memory but
1201 * mark them not present - any buggy init-section access will
1202 * create a kernel page fault:
1203 */
1204 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
1205 begin, PAGE_ALIGN(end));
1206 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
1207#else
1208 unsigned long addr;
1209
1210 /*
1211 * We just marked the kernel text read only above, now that
1212 * we are going to free part of that, we need to make that
1213 * writeable first.
1214 */
1215 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
1216
1217 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1218 ClearPageReserved(virt_to_page(addr));
1219 init_page_count(virt_to_page(addr));
1220 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
1221 free_page(addr);
1222 totalram_pages++;
1223 }
1224 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
1225#endif
1226}
1227
1228void free_initmem(void)
1229{
1230 free_init_pages("unused kernel memory",
1231 (unsigned long)(&__init_begin),
1232 (unsigned long)(&__init_end));
1233}
1234
1235#ifdef CONFIG_BLK_DEV_INITRD 1220#ifdef CONFIG_BLK_DEV_INITRD
1236void free_initrd_mem(unsigned long start, unsigned long end) 1221void free_initrd_mem(unsigned long start, unsigned long end)
1237{ 1222{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b1352250096e..07f44d491df1 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -168,34 +168,51 @@ static __ref void *spp_getpage(void)
168 return ptr; 168 return ptr;
169} 169}
170 170
171void 171static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
172set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
173{ 172{
174 pud_t *pud; 173 if (pgd_none(*pgd)) {
175 pmd_t *pmd; 174 pud_t *pud = (pud_t *)spp_getpage();
176 pte_t *pte; 175 pgd_populate(&init_mm, pgd, pud);
176 if (pud != pud_offset(pgd, 0))
177 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
178 pud, pud_offset(pgd, 0));
179 }
180 return pud_offset(pgd, vaddr);
181}
177 182
178 pud = pud_page + pud_index(vaddr); 183static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
184{
179 if (pud_none(*pud)) { 185 if (pud_none(*pud)) {
180 pmd = (pmd_t *) spp_getpage(); 186 pmd_t *pmd = (pmd_t *) spp_getpage();
181 pud_populate(&init_mm, pud, pmd); 187 pud_populate(&init_mm, pud, pmd);
182 if (pmd != pmd_offset(pud, 0)) { 188 if (pmd != pmd_offset(pud, 0))
183 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 189 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
184 pmd, pmd_offset(pud, 0)); 190 pmd, pmd_offset(pud, 0));
185 return;
186 }
187 } 191 }
188 pmd = pmd_offset(pud, vaddr); 192 return pmd_offset(pud, vaddr);
193}
194
195static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
196{
189 if (pmd_none(*pmd)) { 197 if (pmd_none(*pmd)) {
190 pte = (pte_t *) spp_getpage(); 198 pte_t *pte = (pte_t *) spp_getpage();
191 pmd_populate_kernel(&init_mm, pmd, pte); 199 pmd_populate_kernel(&init_mm, pmd, pte);
192 if (pte != pte_offset_kernel(pmd, 0)) { 200 if (pte != pte_offset_kernel(pmd, 0))
193 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 201 printk(KERN_ERR "PAGETABLE BUG #02!\n");
194 return;
195 }
196 } 202 }
203 return pte_offset_kernel(pmd, vaddr);
204}
205
206void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
207{
208 pud_t *pud;
209 pmd_t *pmd;
210 pte_t *pte;
211
212 pud = pud_page + pud_index(vaddr);
213 pmd = fill_pmd(pud, vaddr);
214 pte = fill_pte(pmd, vaddr);
197 215
198 pte = pte_offset_kernel(pmd, vaddr);
199 set_pte(pte, new_pte); 216 set_pte(pte, new_pte);
200 217
201 /* 218 /*
@@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
205 __flush_tlb_one(vaddr); 222 __flush_tlb_one(vaddr);
206} 223}
207 224
208void 225void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
209set_pte_vaddr(unsigned long vaddr, pte_t pteval)
210{ 226{
211 pgd_t *pgd; 227 pgd_t *pgd;
212 pud_t *pud_page; 228 pud_t *pud_page;
@@ -223,6 +239,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
223 set_pte_vaddr_pud(pud_page, vaddr, pteval); 239 set_pte_vaddr_pud(pud_page, vaddr, pteval);
224} 240}
225 241
242pmd_t * __init populate_extra_pmd(unsigned long vaddr)
243{
244 pgd_t *pgd;
245 pud_t *pud;
246
247 pgd = pgd_offset_k(vaddr);
248 pud = fill_pud(pgd, vaddr);
249 return fill_pmd(pud, vaddr);
250}
251
252pte_t * __init populate_extra_pte(unsigned long vaddr)
253{
254 pmd_t *pmd;
255
256 pmd = populate_extra_pmd(vaddr);
257 return fill_pte(pmd, vaddr);
258}
259
226/* 260/*
227 * Create large page table mappings for a range of physical addresses. 261 * Create large page table mappings for a range of physical addresses.
228 */ 262 */
@@ -947,43 +981,6 @@ void __init mem_init(void)
947 initsize >> 10); 981 initsize >> 10);
948} 982}
949 983
950void free_init_pages(char *what, unsigned long begin, unsigned long end)
951{
952 unsigned long addr = begin;
953
954 if (addr >= end)
955 return;
956
957 /*
958 * If debugging page accesses then do not free this memory but
959 * mark them not present - any buggy init-section access will
960 * create a kernel page fault:
961 */
962#ifdef CONFIG_DEBUG_PAGEALLOC
963 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
964 begin, PAGE_ALIGN(end));
965 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
966#else
967 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
968
969 for (; addr < end; addr += PAGE_SIZE) {
970 ClearPageReserved(virt_to_page(addr));
971 init_page_count(virt_to_page(addr));
972 memset((void *)(addr & ~(PAGE_SIZE-1)),
973 POISON_FREE_INITMEM, PAGE_SIZE);
974 free_page(addr);
975 totalram_pages++;
976 }
977#endif
978}
979
980void free_initmem(void)
981{
982 free_init_pages("unused kernel memory",
983 (unsigned long)(&__init_begin),
984 (unsigned long)(&__init_end));
985}
986
987#ifdef CONFIG_DEBUG_RODATA 984#ifdef CONFIG_DEBUG_RODATA
988const int rodata_test_data = 0xC3; 985const int rodata_test_data = 0xC3;
989EXPORT_SYMBOL_GPL(rodata_test_data); 986EXPORT_SYMBOL_GPL(rodata_test_data);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index f45d5e29a72e..433f7bd4648a 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -348,7 +348,7 @@ EXPORT_SYMBOL(ioremap_nocache);
348 * 348 *
349 * Must be freed with iounmap. 349 * Must be freed with iounmap.
350 */ 350 */
351void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 351void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
352{ 352{
353 if (pat_enabled) 353 if (pat_enabled)
354 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 354 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 9cab18b0b857..0bcd7883d036 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -9,44 +9,44 @@
9 9
10#include <asm/e820.h> 10#include <asm/e820.h>
11 11
12static void __init memtest(unsigned long start_phys, unsigned long size, 12static u64 patterns[] __initdata = {
13 unsigned pattern) 13 0,
14 0xffffffffffffffffULL,
15 0x5555555555555555ULL,
16 0xaaaaaaaaaaaaaaaaULL,
17 0x1111111111111111ULL,
18 0x2222222222222222ULL,
19 0x4444444444444444ULL,
20 0x8888888888888888ULL,
21 0x3333333333333333ULL,
22 0x6666666666666666ULL,
23 0x9999999999999999ULL,
24 0xccccccccccccccccULL,
25 0x7777777777777777ULL,
26 0xbbbbbbbbbbbbbbbbULL,
27 0xddddddddddddddddULL,
28 0xeeeeeeeeeeeeeeeeULL,
29 0x7a6c7258554e494cULL, /* yeah ;-) */
30};
31
32static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
14{ 33{
15 unsigned long i; 34 printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
16 unsigned long *start; 35 (unsigned long long) pattern,
17 unsigned long start_bad; 36 (unsigned long long) start_bad,
18 unsigned long last_bad; 37 (unsigned long long) end_bad);
19 unsigned long val; 38 reserve_early(start_bad, end_bad, "BAD RAM");
20 unsigned long start_phys_aligned; 39}
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48 40
49 incr = sizeof(unsigned long); 41static void __init memtest(u64 pattern, u64 start_phys, u64 size)
42{
43 u64 i, count;
44 u64 *start;
45 u64 start_bad, last_bad;
46 u64 start_phys_aligned;
47 size_t incr;
48
49 incr = sizeof(pattern);
50 start_phys_aligned = ALIGN(start_phys, incr); 50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr; 51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned); 52 start = __va(start_phys_aligned);
@@ -54,25 +54,42 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
54 last_bad = 0; 54 last_bad = 0;
55 55
56 for (i = 0; i < count; i++) 56 for (i = 0; i < count; i++)
57 start[i] = val; 57 start[i] = pattern;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { 58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) { 59 if (*start == pattern)
60 if (start_phys_aligned == last_bad + incr) { 60 continue;
61 last_bad += incr; 61 if (start_phys_aligned == last_bad + incr) {
62 } else { 62 last_bad += incr;
63 if (start_bad) { 63 continue;
64 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad + incr, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 } 64 }
65 if (start_bad)
66 reserve_bad_mem(pattern, start_bad, last_bad + incr);
67 start_bad = last_bad = start_phys_aligned;
71 } 68 }
72 if (start_bad) { 69 if (start_bad)
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", 70 reserve_bad_mem(pattern, start_bad, last_bad + incr);
74 val, start_bad, last_bad + incr); 71}
75 reserve_early(start_bad, last_bad + incr, "BAD RAM"); 72
73static void __init do_one_pass(u64 pattern, u64 start, u64 end)
74{
75 u64 size = 0;
76
77 while (start < end) {
78 start = find_e820_area_size(start, &size, 1);
79
80 /* done ? */
81 if (start >= end)
82 break;
83 if (start + size > end)
84 size = end - start;
85
86 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
87 (unsigned long long) start,
88 (unsigned long long) start + size,
89 (unsigned long long) cpu_to_be64(pattern));
90 memtest(pattern, start, size);
91
92 start += size;
76 } 93 }
77} 94}
78 95
@@ -90,33 +107,22 @@ early_param("memtest", parse_memtest);
90 107
91void __init early_memtest(unsigned long start, unsigned long end) 108void __init early_memtest(unsigned long start, unsigned long end)
92{ 109{
93 u64 t_start, t_size; 110 unsigned int i;
94 unsigned pattern; 111 unsigned int idx = 0;
95 112
96 if (!memtest_pattern) 113 if (!memtest_pattern)
97 return; 114 return;
98 115
99 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); 116 printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
100 for (pattern = 0; pattern < memtest_pattern; pattern++) { 117 for (i = 0; i < memtest_pattern; i++) {
101 t_start = start; 118 idx = i % ARRAY_SIZE(patterns);
102 t_size = 0; 119 do_one_pass(patterns[idx], start, end);
103 while (t_start < end) { 120 }
104 t_start = find_e820_area_size(t_start, &t_size, 1);
105
106 /* done ? */
107 if (t_start >= end)
108 break;
109 if (t_start + t_size > end)
110 t_size = end - t_start;
111
112 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
113 (unsigned long long)t_start,
114 (unsigned long long)t_start + t_size, pattern);
115
116 memtest(t_start, t_size, pattern);
117 121
118 t_start += t_size; 122 if (idx > 0) {
119 } 123 printk(KERN_INFO "early_memtest: wipe out "
124 "test pattern from memory\n");
125 /* additional test with pattern 0 will do this */
126 do_one_pass(0, start, end);
120 } 127 }
121 printk(KERN_CONT "\n");
122} 128}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 56fe7124fbec..165829600566 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -4,7 +4,7 @@
4 * Based on code by Ingo Molnar and Andi Kleen, copyrighted 4 * Based on code by Ingo Molnar and Andi Kleen, copyrighted
5 * as follows: 5 * as follows:
6 * 6 *
7 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 7 * Copyright 2003-2009 Red Hat Inc.
8 * All Rights Reserved. 8 * All Rights Reserved.
9 * Copyright 2005 Andi Kleen, SUSE Labs. 9 * Copyright 2005 Andi Kleen, SUSE Labs.
10 * Copyright 2007 Jiri Kosina, SUSE Labs. 10 * Copyright 2007 Jiri Kosina, SUSE Labs.
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d1f7439d173c..451fe95a0352 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -194,7 +194,7 @@ void *alloc_remap(int nid, unsigned long size)
194 size = ALIGN(size, L1_CACHE_BYTES); 194 size = ALIGN(size, L1_CACHE_BYTES);
195 195
196 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) 196 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
197 return 0; 197 return NULL;
198 198
199 node_remap_alloc_vaddr[nid] += size; 199 node_remap_alloc_vaddr[nid] += size;
200 memset(allocation, 0, size); 200 memset(allocation, 0, size);
@@ -423,32 +423,6 @@ void __init initmem_init(unsigned long start_pfn,
423 setup_bootmem_allocator(); 423 setup_bootmem_allocator();
424} 424}
425 425
426void __init set_highmem_pages_init(void)
427{
428#ifdef CONFIG_HIGHMEM
429 struct zone *zone;
430 int nid;
431
432 for_each_zone(zone) {
433 unsigned long zone_start_pfn, zone_end_pfn;
434
435 if (!is_highmem(zone))
436 continue;
437
438 zone_start_pfn = zone->zone_start_pfn;
439 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
440
441 nid = zone_to_nid(zone);
442 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
443 zone->name, nid, zone_start_pfn, zone_end_pfn);
444
445 add_highpages_with_active_regions(nid, zone_start_pfn,
446 zone_end_pfn);
447 }
448 totalram_pages += totalhigh_pages;
449#endif
450}
451
452#ifdef CONFIG_MEMORY_HOTPLUG 426#ifdef CONFIG_MEMORY_HOTPLUG
453static int paddr_to_nid(u64 addr) 427static int paddr_to_nid(u64 addr)
454{ 428{
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index f3516da035d1..64c9cf043cdd 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,6 +20,12 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifdef CONFIG_DEBUG_PER_CPU_MAPS
24# define DBG(x...) printk(KERN_DEBUG x)
25#else
26# define DBG(x...)
27#endif
28
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 29struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
24EXPORT_SYMBOL(node_data); 30EXPORT_SYMBOL(node_data);
25 31
@@ -33,6 +39,21 @@ int numa_off __initdata;
33static unsigned long __initdata nodemap_addr; 39static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size; 40static unsigned long __initdata nodemap_size;
35 41
42DEFINE_PER_CPU(int, node_number) = 0;
43EXPORT_PER_CPU_SYMBOL(node_number);
44
45/*
46 * Map cpu index to node index
47 */
48DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
49EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
50
51/*
52 * Which logical CPUs are on which nodes
53 */
54cpumask_t *node_to_cpumask_map;
55EXPORT_SYMBOL(node_to_cpumask_map);
56
36/* 57/*
37 * Given a shift value, try to populate memnodemap[] 58 * Given a shift value, try to populate memnodemap[]
38 * Returns : 59 * Returns :
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void)
640#endif 661#endif
641 662
642 663
664/*
665 * Allocate node_to_cpumask_map based on number of available nodes
666 * Requires node_possible_map to be valid.
667 *
668 * Note: node_to_cpumask() is not valid until after this is done.
669 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
670 */
671void __init setup_node_to_cpumask_map(void)
672{
673 unsigned int node, num = 0;
674 cpumask_t *map;
675
676 /* setup nr_node_ids if not done yet */
677 if (nr_node_ids == MAX_NUMNODES) {
678 for_each_node_mask(node, node_possible_map)
679 num = node;
680 nr_node_ids = num + 1;
681 }
682
683 /* allocate the map */
684 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
685 DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
686
687 pr_debug("Node to cpumask map at %p for %d nodes\n",
688 map, nr_node_ids);
689
690 /* node_to_cpumask() will now work */
691 node_to_cpumask_map = map;
692}
693
694void __cpuinit numa_set_node(int cpu, int node)
695{
696 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
697
698 /* early setting, no percpu area yet */
699 if (cpu_to_node_map) {
700 cpu_to_node_map[cpu] = node;
701 return;
702 }
703
704#ifdef CONFIG_DEBUG_PER_CPU_MAPS
705 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
706 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
707 dump_stack();
708 return;
709 }
710#endif
711 per_cpu(x86_cpu_to_node_map, cpu) = node;
712
713 if (node != NUMA_NO_NODE)
714 per_cpu(node_number, cpu) = node;
715}
716
717void __cpuinit numa_clear_node(int cpu)
718{
719 numa_set_node(cpu, NUMA_NO_NODE);
720}
721
722#ifndef CONFIG_DEBUG_PER_CPU_MAPS
723
724void __cpuinit numa_add_cpu(int cpu)
725{
726 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
727}
728
729void __cpuinit numa_remove_cpu(int cpu)
730{
731 cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
732}
733
734#else /* CONFIG_DEBUG_PER_CPU_MAPS */
735
736/*
737 * --------- debug versions of the numa functions ---------
738 */
739static void __cpuinit numa_set_cpumask(int cpu, int enable)
740{
741 int node = early_cpu_to_node(cpu);
742 cpumask_t *mask;
743 char buf[64];
744
745 if (node_to_cpumask_map == NULL) {
746 printk(KERN_ERR "node_to_cpumask_map NULL\n");
747 dump_stack();
748 return;
749 }
750
751 mask = &node_to_cpumask_map[node];
752 if (enable)
753 cpu_set(cpu, *mask);
754 else
755 cpu_clear(cpu, *mask);
756
757 cpulist_scnprintf(buf, sizeof(buf), mask);
758 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
759 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
760}
761
762void __cpuinit numa_add_cpu(int cpu)
763{
764 numa_set_cpumask(cpu, 1);
765}
766
767void __cpuinit numa_remove_cpu(int cpu)
768{
769 numa_set_cpumask(cpu, 0);
770}
771
772int cpu_to_node(int cpu)
773{
774 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
775 printk(KERN_WARNING
776 "cpu_to_node(%d): usage too early!\n", cpu);
777 dump_stack();
778 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
779 }
780 return per_cpu(x86_cpu_to_node_map, cpu);
781}
782EXPORT_SYMBOL(cpu_to_node);
783
784/*
785 * Same function as cpu_to_node() but used if called before the
786 * per_cpu areas are setup.
787 */
788int early_cpu_to_node(int cpu)
789{
790 if (early_per_cpu_ptr(x86_cpu_to_node_map))
791 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
792
793 if (!cpu_possible(cpu)) {
794 printk(KERN_WARNING
795 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
796 dump_stack();
797 return NUMA_NO_NODE;
798 }
799 return per_cpu(x86_cpu_to_node_map, cpu);
800}
801
802
803/* empty cpumask */
804static const cpumask_t cpu_mask_none;
805
806/*
807 * Returns a pointer to the bitmask of CPUs on Node 'node'.
808 */
809const cpumask_t *cpumask_of_node(int node)
810{
811 if (node_to_cpumask_map == NULL) {
812 printk(KERN_WARNING
813 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
814 node);
815 dump_stack();
816 return (const cpumask_t *)&cpu_online_map;
817 }
818 if (node >= nr_node_ids) {
819 printk(KERN_WARNING
820 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
821 node, nr_node_ids);
822 dump_stack();
823 return &cpu_mask_none;
824 }
825 return &node_to_cpumask_map[node];
826}
827EXPORT_SYMBOL(cpumask_of_node);
828
829/*
830 * Returns a bitmask of CPUs on Node 'node'.
831 *
832 * Side note: this function creates the returned cpumask on the stack
833 * so with a high NR_CPUS count, excessive stack space is used. The
834 * node_to_cpumask_ptr function should be used whenever possible.
835 */
836cpumask_t node_to_cpumask(int node)
837{
838 if (node_to_cpumask_map == NULL) {
839 printk(KERN_WARNING
840 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
841 dump_stack();
842 return cpu_online_map;
843 }
844 if (node >= nr_node_ids) {
845 printk(KERN_WARNING
846 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
847 node, nr_node_ids);
848 dump_stack();
849 return cpu_mask_none;
850 }
851 return node_to_cpumask_map[node];
852}
853EXPORT_SYMBOL(node_to_cpumask);
854
855/*
856 * --------- end of debug versions of the numa functions ---------
857 */
858
859#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7233bd7e357b..9c4294986af7 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -482,6 +482,13 @@ static int split_large_page(pte_t *kpte, unsigned long address)
482 pbase = (pte_t *)page_address(base); 482 pbase = (pte_t *)page_address(base);
483 paravirt_alloc_pte(&init_mm, page_to_pfn(base)); 483 paravirt_alloc_pte(&init_mm, page_to_pfn(base));
484 ref_prot = pte_pgprot(pte_clrhuge(*kpte)); 484 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
485 /*
486 * If we ever want to utilize the PAT bit, we need to
487 * update this function to make sure it's converted from
488 * bit 12 to bit 7 when we cross from the 2MB level to
489 * the 4K level:
490 */
491 WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
485 492
486#ifdef CONFIG_X86_64 493#ifdef CONFIG_X86_64
487 if (level == PG_LEVEL_1G) { 494 if (level == PG_LEVEL_1G) {
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e0ab173b6974..2ed37158012d 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -31,7 +31,7 @@
31#ifdef CONFIG_X86_PAT 31#ifdef CONFIG_X86_PAT
32int __read_mostly pat_enabled = 1; 32int __read_mostly pat_enabled = 1;
33 33
34void __cpuinit pat_disable(char *reason) 34void __cpuinit pat_disable(const char *reason)
35{ 35{
36 pat_enabled = 0; 36 pat_enabled = 0;
37 printk(KERN_INFO "%s\n", reason); 37 printk(KERN_INFO "%s\n", reason);
@@ -43,6 +43,11 @@ static int __init nopat(char *str)
43 return 0; 43 return 0;
44} 44}
45early_param("nopat", nopat); 45early_param("nopat", nopat);
46#else
47static inline void pat_disable(const char *reason)
48{
49 (void)reason;
50}
46#endif 51#endif
47 52
48 53
@@ -79,16 +84,20 @@ void pat_init(void)
79 if (!pat_enabled) 84 if (!pat_enabled)
80 return; 85 return;
81 86
82 /* Paranoia check. */ 87 if (!cpu_has_pat) {
83 if (!cpu_has_pat && boot_pat_state) { 88 if (!boot_pat_state) {
84 /* 89 pat_disable("PAT not supported by CPU.");
85 * If this happens we are on a secondary CPU, but 90 return;
86 * switched to PAT on the boot CPU. We have no way to 91 } else {
87 * undo PAT. 92 /*
88 */ 93 * If this happens we are on a secondary CPU, but
89 printk(KERN_ERR "PAT enabled, " 94 * switched to PAT on the boot CPU. We have no way to
90 "but not supported by secondary CPU\n"); 95 * undo PAT.
91 BUG(); 96 */
97 printk(KERN_ERR "PAT enabled, "
98 "but not supported by secondary CPU\n");
99 BUG();
100 }
92 } 101 }
93 102
94 /* Set PWT to Write-Combining. All other bits stay the same */ 103 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -626,6 +635,33 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
626} 635}
627 636
628/* 637/*
638 * Change the memory type for the physical address range in kernel identity
639 * mapping space if that range is a part of identity map.
640 */
641int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
642{
643 unsigned long id_sz;
644
645 if (!pat_enabled || base >= __pa(high_memory))
646 return 0;
647
648 id_sz = (__pa(high_memory) < base + size) ?
649 __pa(high_memory) - base :
650 size;
651
652 if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
653 printk(KERN_INFO
654 "%s:%d ioremap_change_attr failed %s "
655 "for %Lx-%Lx\n",
656 current->comm, current->pid,
657 cattr_name(flags),
658 base, (unsigned long long)(base + size));
659 return -EINVAL;
660 }
661 return 0;
662}
663
664/*
629 * Internal interface to reserve a range of physical memory with prot. 665 * Internal interface to reserve a range of physical memory with prot.
630 * Reserved non RAM regions only and after successful reserve_memtype, 666 * Reserved non RAM regions only and after successful reserve_memtype,
631 * this func also keeps identity mapping (if any) in sync with this new prot. 667 * this func also keeps identity mapping (if any) in sync with this new prot.
@@ -634,7 +670,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
634 int strict_prot) 670 int strict_prot)
635{ 671{
636 int is_ram = 0; 672 int is_ram = 0;
637 int id_sz, ret; 673 int ret;
638 unsigned long flags; 674 unsigned long flags;
639 unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); 675 unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
640 676
@@ -671,23 +707,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
671 flags); 707 flags);
672 } 708 }
673 709
674 /* Need to keep identity mapping in sync */ 710 if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
675 if (paddr >= __pa(high_memory))
676 return 0;
677
678 id_sz = (__pa(high_memory) < paddr + size) ?
679 __pa(high_memory) - paddr :
680 size;
681
682 if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
683 free_memtype(paddr, paddr + size); 711 free_memtype(paddr, paddr + size);
684 printk(KERN_ERR
685 "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
686 "for %Lx-%Lx\n",
687 current->comm, current->pid,
688 cattr_name(flags),
689 (unsigned long long)paddr,
690 (unsigned long long)(paddr + size));
691 return -EINVAL; 712 return -EINVAL;
692 } 713 }
693 return 0; 714 return 0;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 86f2ffc43c3d..5b7c7c8464fe 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -313,6 +313,24 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
313 return young; 313 return young;
314} 314}
315 315
316/**
317 * reserve_top_address - reserves a hole in the top of kernel address space
318 * @reserve - size of hole to reserve
319 *
320 * Can be used to relocate the fixmap area and poke a hole in the top
321 * of kernel address space to make room for a hypervisor.
322 */
323void __init reserve_top_address(unsigned long reserve)
324{
325#ifdef CONFIG_X86_32
326 BUG_ON(fixmaps_set > 0);
327 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
328 (int)-reserve);
329 __FIXADDR_TOP = -reserve - PAGE_SIZE;
330 __VMALLOC_RESERVE += reserve;
331#endif
332}
333
316int fixmaps_set; 334int fixmaps_set;
317 335
318void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) 336void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 0951db9ee519..f2e477c91c1b 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,6 +20,8 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23unsigned int __VMALLOC_RESERVE = 128 << 20;
24
23/* 25/*
24 * Associate a virtual page frame with a given physical page frame 26 * Associate a virtual page frame with a given physical page frame
25 * and protection flags for that frame. 27 * and protection flags for that frame.
@@ -97,22 +99,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
97unsigned long __FIXADDR_TOP = 0xfffff000; 99unsigned long __FIXADDR_TOP = 0xfffff000;
98EXPORT_SYMBOL(__FIXADDR_TOP); 100EXPORT_SYMBOL(__FIXADDR_TOP);
99 101
100/**
101 * reserve_top_address - reserves a hole in the top of kernel address space
102 * @reserve - size of hole to reserve
103 *
104 * Can be used to relocate the fixmap area and poke a hole in the top
105 * of kernel address space to make room for a hypervisor.
106 */
107void __init reserve_top_address(unsigned long reserve)
108{
109 BUG_ON(fixmaps_set > 0);
110 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
111 (int)-reserve);
112 __FIXADDR_TOP = -reserve - PAGE_SIZE;
113 __VMALLOC_RESERVE += reserve;
114}
115
116/* 102/*
117 * vmalloc=size forces the vmalloc area to be exactly 'size' 103 * vmalloc=size forces the vmalloc area to be exactly 'size'
118 * bytes. This can be used to increase (or decrease) the 104 * bytes. This can be used to increase (or decrease) the
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 09737c8af074..574c8bc95ef0 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -20,7 +20,8 @@
20#include <asm/proto.h> 20#include <asm/proto.h>
21#include <asm/numa.h> 21#include <asm/numa.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/genapic.h> 23#include <asm/apic.h>
24#include <asm/uv/uv.h>
24 25
25int acpi_numa __initdata; 26int acpi_numa __initdata;
26 27
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
new file mode 100644
index 000000000000..a654d59e4483
--- /dev/null
+++ b/arch/x86/mm/tlb.c
@@ -0,0 +1,295 @@
1#include <linux/init.h>
2
3#include <linux/mm.h>
4#include <linux/spinlock.h>
5#include <linux/smp.h>
6#include <linux/interrupt.h>
7#include <linux/module.h>
8
9#include <asm/tlbflush.h>
10#include <asm/mmu_context.h>
11#include <asm/apic.h>
12#include <asm/uv/uv.h>
13
14DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
15 = { &init_mm, 0, };
16
17/*
18 * Smarter SMP flushing macros.
19 * c/o Linus Torvalds.
20 *
21 * These mean you can really definitely utterly forget about
22 * writing to user space from interrupts. (It's not allowed anyway).
23 *
24 * Optimizations Manfred Spraul <manfred@colorfullife.com>
25 *
26 * More scalable flush, from Andi Kleen
27 *
28 * To avoid global state use 8 different call vectors.
29 * Each CPU uses a specific vector to trigger flushes on other
30 * CPUs. Depending on the received vector the target CPUs look into
31 * the right array slot for the flush data.
32 *
33 * With more than 8 CPUs they are hashed to the 8 available
34 * vectors. The limited global vector space forces us to this right now.
35 * In future when interrupts are split into per CPU domains this could be
36 * fixed, at the cost of triggering multiple IPIs in some cases.
37 */
38
39union smp_flush_state {
40 struct {
41 struct mm_struct *flush_mm;
42 unsigned long flush_va;
43 spinlock_t tlbstate_lock;
44 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
45 };
46 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
47} ____cacheline_internodealigned_in_smp;
48
49/* State is put into the per CPU data section, but padded
50 to a full cache line because other CPUs can access it and we don't
51 want false sharing in the per cpu data segment. */
52static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
53
54/*
55 * We cannot call mmdrop() because we are in interrupt context,
56 * instead update mm->cpu_vm_mask.
57 */
58void leave_mm(int cpu)
59{
60 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
61 BUG();
62 cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
63 load_cr3(swapper_pg_dir);
64}
65EXPORT_SYMBOL_GPL(leave_mm);
66
67/*
68 *
69 * The flush IPI assumes that a thread switch happens in this order:
70 * [cpu0: the cpu that switches]
71 * 1) switch_mm() either 1a) or 1b)
72 * 1a) thread switch to a different mm
73 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
74 * Stop ipi delivery for the old mm. This is not synchronized with
75 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
76 * for the wrong mm, and in the worst case we perform a superfluous
77 * tlb flush.
78 * 1a2) set cpu mmu_state to TLBSTATE_OK
79 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
80 * was in lazy tlb mode.
81 * 1a3) update cpu active_mm
82 * Now cpu0 accepts tlb flushes for the new mm.
83 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
84 * Now the other cpus will send tlb flush ipis.
85 * 1a5) change cr3.
86 * 1b) thread switch without mm change
87 * cpu active_mm is correct, cpu0 already handles
88 * flush ipis.
89 * 1b1) set cpu mmu_state to TLBSTATE_OK
90 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
91 * Atomically set the bit [other cpus will start sending flush ipis],
92 * and test the bit.
93 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
94 * 2) switch %esp, ie current
95 *
96 * The interrupt must handle 2 special cases:
97 * - cr3 is changed before %esp, ie. it cannot use current->{active_,}mm.
98 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
99 * runs in kernel space, the cpu could load tlb entries for user space
100 * pages.
101 *
102 * The good news is that cpu mmu_state is local to each cpu, no
103 * write/read ordering problems.
104 */
105
106/*
107 * TLB flush IPI:
108 *
109 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
110 * 2) Leave the mm if we are in the lazy tlb mode.
111 *
112 * Interrupts are disabled.
113 */
114
115/*
116 * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
117 * but still used for documentation purpose but the usage is slightly
118 * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
119 * entry calls in with the first parameter in %eax. Maybe define
120 * intrlinkage?
121 */
122#ifdef CONFIG_X86_64
123asmlinkage
124#endif
125void smp_invalidate_interrupt(struct pt_regs *regs)
126{
127 unsigned int cpu;
128 unsigned int sender;
129 union smp_flush_state *f;
130
131 cpu = smp_processor_id();
132 /*
133 * orig_ax contains the negated interrupt vector.
134 * Use that to determine where the sender put the data.
135 */
136 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
137 f = &flush_state[sender];
138
139 if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
140 goto out;
141 /*
142 * This was a BUG() but until someone can quote me the
143 * line from the intel manual that guarantees an IPI to
144 * multiple CPUs is retried _only_ on the erroring CPUs
145 * it's staying as a return
146 *
147 * BUG();
148 */
149
150 if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
151 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
152 if (f->flush_va == TLB_FLUSH_ALL)
153 local_flush_tlb();
154 else
155 __flush_tlb_one(f->flush_va);
156 } else
157 leave_mm(cpu);
158 }
159out:
160 ack_APIC_irq();
161 smp_mb__before_clear_bit();
162 cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
163 smp_mb__after_clear_bit();
164 inc_irq_stat(irq_tlb_count);
165}
166
167static void flush_tlb_others_ipi(const struct cpumask *cpumask,
168 struct mm_struct *mm, unsigned long va)
169{
170 unsigned int sender;
171 union smp_flush_state *f;
172
173 /* Caller has disabled preemption */
174 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
175 f = &flush_state[sender];
176
177 /*
178 * Could avoid this lock when
179 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
180 * probably not worth checking this for a cache-hot lock.
181 */
182 spin_lock(&f->tlbstate_lock);
183
184 f->flush_mm = mm;
185 f->flush_va = va;
186 cpumask_andnot(to_cpumask(f->flush_cpumask),
187 cpumask, cpumask_of(smp_processor_id()));
188
189 /*
190 * Make the above memory operations globally visible before
191 * sending the IPI.
192 */
193 smp_mb();
194 /*
195 * We have to send the IPI only to
196 * CPUs affected.
197 */
198 apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
199 INVALIDATE_TLB_VECTOR_START + sender);
200
201 while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
202 cpu_relax();
203
204 f->flush_mm = NULL;
205 f->flush_va = 0;
206 spin_unlock(&f->tlbstate_lock);
207}
208
209void native_flush_tlb_others(const struct cpumask *cpumask,
210 struct mm_struct *mm, unsigned long va)
211{
212 if (is_uv_system()) {
213 unsigned int cpu;
214
215 cpu = get_cpu();
216 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
217 if (cpumask)
218 flush_tlb_others_ipi(cpumask, mm, va);
219 put_cpu();
220 return;
221 }
222 flush_tlb_others_ipi(cpumask, mm, va);
223}
224
225static int __cpuinit init_smp_flush(void)
226{
227 int i;
228
229 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
230 spin_lock_init(&flush_state[i].tlbstate_lock);
231
232 return 0;
233}
234core_initcall(init_smp_flush);
235
236void flush_tlb_current_task(void)
237{
238 struct mm_struct *mm = current->mm;
239
240 preempt_disable();
241
242 local_flush_tlb();
243 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
244 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
245 preempt_enable();
246}
247
248void flush_tlb_mm(struct mm_struct *mm)
249{
250 preempt_disable();
251
252 if (current->active_mm == mm) {
253 if (current->mm)
254 local_flush_tlb();
255 else
256 leave_mm(smp_processor_id());
257 }
258 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
259 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
260
261 preempt_enable();
262}
263
264void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
265{
266 struct mm_struct *mm = vma->vm_mm;
267
268 preempt_disable();
269
270 if (current->active_mm == mm) {
271 if (current->mm)
272 __flush_tlb_one(va);
273 else
274 leave_mm(smp_processor_id());
275 }
276
277 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
278 flush_tlb_others(&mm->cpu_vm_mask, mm, va);
279
280 preempt_enable();
281}
282
283static void do_flush_tlb_all(void *info)
284{
285 unsigned long cpu = smp_processor_id();
286
287 __flush_tlb_all();
288 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
289 leave_mm(cpu);
290}
291
292void flush_tlb_all(void)
293{
294 on_each_cpu(do_flush_tlb_all, NULL, 1);
295}