aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm/fault.c
diff options
context:
space:
mode:
author Ingo Molnar <mingo@elte.hu> 2009-02-20 13:56:40 -0500
committer Ingo Molnar <mingo@elte.hu> 2009-02-20 18:09:39 -0500
commit 2d4a71676f4d89418a0d53e60b89e8b804b390b2 (patch)
tree f25c247fbb660281e32fe94bfe986610b855d4c1 /arch/x86/mm/fault.c
parent c9e1585b1b7e36a72181f2c59c2abfd476512e93 (diff)
x86, mm: fault.c cleanup
Impact: cleanup, no code changed

Clean up various small details, which can be checked for correctness
automatically:

 - tidy up the include file section
 - eliminate unnecessary includes
 - introduce show_signal_msg() to clean up code flow
 - standardize the code flow
 - standardize comments and other style details
 - more cleanups, pointed out by checkpatch

No code changed on either 32-bit or 64-bit:

arch/x86/mm/fault.o:

   text    data     bss     dec     hex filename
   4632      32      24    4688    1250 fault.o.before
   4632      32      24    4688    1250 fault.o.after

The md5 changed due to a change in a single instruction:

   2e8a8241e7f0d69706776a5a26c90bc0  fault.o.before.asm
   c5c3d36e725586eb74f0e10692f0193e  fault.o.after.asm

because a __LINE__ reference in a WARN_ONCE() has changed.

On 32-bit a few stack offsets changed - no code size difference
nor any functionality difference.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/mm/fault.c')
-rw-r--r-- arch/x86/mm/fault.c 546
1 files changed, 331 insertions, 215 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e4b9fc5001c6..351d679bf977 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1,56 +1,59 @@
1/* 1/*
2 * Copyright (C) 1995 Linus Torvalds 2 * Copyright (C) 1995 Linus Torvalds
3 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. 3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4 */ 4 */
5
6#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
13#include <linux/mmiotrace.h>
14#include <linux/mman.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/interrupt.h> 5#include <linux/interrupt.h>
18#include <linux/init.h> 6#include <linux/mmiotrace.h>
19#include <linux/tty.h> 7#include <linux/bootmem.h>
20#include <linux/vt_kern.h> /* For unblank_screen() */
21#include <linux/compiler.h> 8#include <linux/compiler.h>
22#include <linux/highmem.h> 9#include <linux/highmem.h>
23#include <linux/bootmem.h> /* for max_low_pfn */
24#include <linux/vmalloc.h>
25#include <linux/module.h>
26#include <linux/kprobes.h> 10#include <linux/kprobes.h>
27#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/vmalloc.h>
13#include <linux/vt_kern.h>
14#include <linux/signal.h>
15#include <linux/kernel.h>
16#include <linux/ptrace.h>
17#include <linux/string.h>
18#include <linux/module.h>
28#include <linux/kdebug.h> 19#include <linux/kdebug.h>
20#include <linux/errno.h>
29#include <linux/magic.h> 21#include <linux/magic.h>
22#include <linux/sched.h>
23#include <linux/types.h>
24#include <linux/init.h>
25#include <linux/mman.h>
26#include <linux/tty.h>
27#include <linux/smp.h>
28#include <linux/mm.h>
29
30#include <asm-generic/sections.h>
30 31
31#include <asm/system.h>
32#include <asm/desc.h>
33#include <asm/segment.h>
34#include <asm/pgalloc.h>
35#include <asm/smp.h>
36#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
33#include <asm/pgalloc.h>
34#include <asm/segment.h>
35#include <asm/system.h>
37#include <asm/proto.h> 36#include <asm/proto.h>
38#include <asm-generic/sections.h>
39#include <asm/traps.h> 37#include <asm/traps.h>
38#include <asm/desc.h>
40 39
41/* 40/*
42 * Page fault error code bits 41 * Page fault error code bits:
43 * bit 0 == 0 means no page found, 1 means protection fault 42 *
44 * bit 1 == 0 means read, 1 means write 43 * bit 0 == 0: no page found 1: protection fault
45 * bit 2 == 0 means kernel, 1 means user-mode 44 * bit 1 == 0: read access 1: write access
46 * bit 3 == 1 means use of reserved bit detected 45 * bit 2 == 0: kernel-mode access 1: user-mode access
47 * bit 4 == 1 means fault was an instruction fetch 46 * bit 3 == 1: use of reserved bit detected
47 * bit 4 == 1: fault was an instruction fetch
48 */ 48 */
49#define PF_PROT (1<<0) 49enum x86_pf_error_code {
50#define PF_WRITE (1<<1) 50
51#define PF_USER (1<<2) 51 PF_PROT = 1 << 0,
52#define PF_RSVD (1<<3) 52 PF_WRITE = 1 << 1,
53#define PF_INSTR (1<<4) 53 PF_USER = 1 << 2,
54 PF_RSVD = 1 << 3,
55 PF_INSTR = 1 << 4,
56};
54 57
55static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 58static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
56{ 59{
@@ -82,23 +85,27 @@ static inline int notify_page_fault(struct pt_regs *regs)
82} 85}
83 86
84/* 87/*
85 * X86_32 88 * Prefetch quirks:
86 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. 89 *
87 * Check that here and ignore it. 90 * 32-bit mode:
91 *
92 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
93 * Check that here and ignore it.
88 * 94 *
89 * X86_64 95 * 64-bit mode:
90 * Sometimes the CPU reports invalid exceptions on prefetch.
91 * Check that here and ignore it.
92 * 96 *
93 * Opcode checker based on code by Richard Brunner 97 * Sometimes the CPU reports invalid exceptions on prefetch.
98 * Check that here and ignore it.
99 *
100 * Opcode checker based on code by Richard Brunner.
94 */ 101 */
95static int is_prefetch(struct pt_regs *regs, unsigned long error_code, 102static int
96 unsigned long addr) 103is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
97{ 104{
105 unsigned char *max_instr;
98 unsigned char *instr; 106 unsigned char *instr;
99 int scan_more = 1; 107 int scan_more = 1;
100 int prefetch = 0; 108 int prefetch = 0;
101 unsigned char *max_instr;
102 109
103 /* 110 /*
104 * If it was a exec (instruction fetch) fault on NX page, then 111 * If it was a exec (instruction fetch) fault on NX page, then
@@ -114,9 +121,9 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
114 return 0; 121 return 0;
115 122
116 while (scan_more && instr < max_instr) { 123 while (scan_more && instr < max_instr) {
117 unsigned char opcode;
118 unsigned char instr_hi; 124 unsigned char instr_hi;
119 unsigned char instr_lo; 125 unsigned char instr_lo;
126 unsigned char opcode;
120 127
121 if (probe_kernel_address(instr, opcode)) 128 if (probe_kernel_address(instr, opcode))
122 break; 129 break;
@@ -173,15 +180,17 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
173 return prefetch; 180 return prefetch;
174} 181}
175 182
176static void force_sig_info_fault(int si_signo, int si_code, 183static void
177 unsigned long address, struct task_struct *tsk) 184force_sig_info_fault(int si_signo, int si_code, unsigned long address,
185 struct task_struct *tsk)
178{ 186{
179 siginfo_t info; 187 siginfo_t info;
180 188
181 info.si_signo = si_signo; 189 info.si_signo = si_signo;
182 info.si_errno = 0; 190 info.si_errno = 0;
183 info.si_code = si_code; 191 info.si_code = si_code;
184 info.si_addr = (void __user *)address; 192 info.si_addr = (void __user *)address;
193
185 force_sig_info(si_signo, &info, tsk); 194 force_sig_info(si_signo, &info, tsk);
186} 195}
187 196
@@ -189,6 +198,7 @@ static void force_sig_info_fault(int si_signo, int si_code,
189static int bad_address(void *p) 198static int bad_address(void *p)
190{ 199{
191 unsigned long dummy; 200 unsigned long dummy;
201
192 return probe_kernel_address((unsigned long *)p, dummy); 202 return probe_kernel_address((unsigned long *)p, dummy);
193} 203}
194#endif 204#endif
@@ -200,13 +210,14 @@ static void dump_pagetable(unsigned long address)
200 210
201 page = read_cr3(); 211 page = read_cr3();
202 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; 212 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
213
203#ifdef CONFIG_X86_PAE 214#ifdef CONFIG_X86_PAE
204 printk("*pdpt = %016Lx ", page); 215 printk("*pdpt = %016Lx ", page);
205 if ((page >> PAGE_SHIFT) < max_low_pfn 216 if ((page >> PAGE_SHIFT) < max_low_pfn
206 && page & _PAGE_PRESENT) { 217 && page & _PAGE_PRESENT) {
207 page &= PAGE_MASK; 218 page &= PAGE_MASK;
208 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) 219 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
209 & (PTRS_PER_PMD - 1)]; 220 & (PTRS_PER_PMD - 1)];
210 printk(KERN_CONT "*pde = %016Lx ", page); 221 printk(KERN_CONT "*pde = %016Lx ", page);
211 page &= ~_PAGE_NX; 222 page &= ~_PAGE_NX;
212 } 223 }
@@ -218,14 +229,15 @@ static void dump_pagetable(unsigned long address)
218 * We must not directly access the pte in the highpte 229 * We must not directly access the pte in the highpte
219 * case if the page table is located in highmem. 230 * case if the page table is located in highmem.
220 * And let's rather not kmap-atomic the pte, just in case 231 * And let's rather not kmap-atomic the pte, just in case
221 * it's allocated already. 232 * it's allocated already:
222 */ 233 */
223 if ((page >> PAGE_SHIFT) < max_low_pfn 234 if ((page >> PAGE_SHIFT) < max_low_pfn
224 && (page & _PAGE_PRESENT) 235 && (page & _PAGE_PRESENT)
225 && !(page & _PAGE_PSE)) { 236 && !(page & _PAGE_PSE)) {
237
226 page &= PAGE_MASK; 238 page &= PAGE_MASK;
227 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) 239 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
228 & (PTRS_PER_PTE - 1)]; 240 & (PTRS_PER_PTE - 1)];
229 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); 241 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
230 } 242 }
231 243
@@ -239,26 +251,38 @@ static void dump_pagetable(unsigned long address)
239 pgd = (pgd_t *)read_cr3(); 251 pgd = (pgd_t *)read_cr3();
240 252
241 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 253 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
254
242 pgd += pgd_index(address); 255 pgd += pgd_index(address);
243 if (bad_address(pgd)) goto bad; 256 if (bad_address(pgd))
257 goto bad;
258
244 printk("PGD %lx ", pgd_val(*pgd)); 259 printk("PGD %lx ", pgd_val(*pgd));
245 if (!pgd_present(*pgd)) goto ret; 260
261 if (!pgd_present(*pgd))
262 goto out;
246 263
247 pud = pud_offset(pgd, address); 264 pud = pud_offset(pgd, address);
248 if (bad_address(pud)) goto bad; 265 if (bad_address(pud))
266 goto bad;
267
249 printk("PUD %lx ", pud_val(*pud)); 268 printk("PUD %lx ", pud_val(*pud));
250 if (!pud_present(*pud) || pud_large(*pud)) 269 if (!pud_present(*pud) || pud_large(*pud))
251 goto ret; 270 goto out;
252 271
253 pmd = pmd_offset(pud, address); 272 pmd = pmd_offset(pud, address);
254 if (bad_address(pmd)) goto bad; 273 if (bad_address(pmd))
274 goto bad;
275
255 printk("PMD %lx ", pmd_val(*pmd)); 276 printk("PMD %lx ", pmd_val(*pmd));
256 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; 277 if (!pmd_present(*pmd) || pmd_large(*pmd))
278 goto out;
257 279
258 pte = pte_offset_kernel(pmd, address); 280 pte = pte_offset_kernel(pmd, address);
259 if (bad_address(pte)) goto bad; 281 if (bad_address(pte))
282 goto bad;
283
260 printk("PTE %lx", pte_val(*pte)); 284 printk("PTE %lx", pte_val(*pte));
261ret: 285out:
262 printk("\n"); 286 printk("\n");
263 return; 287 return;
264bad: 288bad:
@@ -285,7 +309,6 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
285 * and redundant with the set_pmd() on non-PAE. As would 309 * and redundant with the set_pmd() on non-PAE. As would
286 * set_pud. 310 * set_pud.
287 */ 311 */
288
289 pud = pud_offset(pgd, address); 312 pud = pud_offset(pgd, address);
290 pud_k = pud_offset(pgd_k, address); 313 pud_k = pud_offset(pgd_k, address);
291 if (!pud_present(*pud_k)) 314 if (!pud_present(*pud_k))
@@ -295,11 +318,14 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
295 pmd_k = pmd_offset(pud_k, address); 318 pmd_k = pmd_offset(pud_k, address);
296 if (!pmd_present(*pmd_k)) 319 if (!pmd_present(*pmd_k))
297 return NULL; 320 return NULL;
321
298 if (!pmd_present(*pmd)) { 322 if (!pmd_present(*pmd)) {
299 set_pmd(pmd, *pmd_k); 323 set_pmd(pmd, *pmd_k);
300 arch_flush_lazy_mmu_mode(); 324 arch_flush_lazy_mmu_mode();
301 } else 325 } else {
302 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); 326 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
327 }
328
303 return pmd_k; 329 return pmd_k;
304} 330}
305#endif 331#endif
@@ -312,29 +338,37 @@ KERN_ERR "******* Please consider a BIOS update.\n"
312KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; 338KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
313#endif 339#endif
314 340
315/* Workaround for K8 erratum #93 & buggy BIOS. 341/*
316 BIOS SMM functions are required to use a specific workaround 342 * Workaround for K8 erratum #93 & buggy BIOS.
317 to avoid corruption of the 64bit RIP register on C stepping K8. 343 *
318 A lot of BIOS that didn't get tested properly miss this. 344 * BIOS SMM functions are required to use a specific workaround
319 The OS sees this as a page fault with the upper 32bits of RIP cleared. 345 * to avoid corruption of the 64bit RIP register on C stepping K8.
320 Try to work around it here. 346 *
321 Note we only handle faults in kernel here. 347 * A lot of BIOS that didn't get tested properly miss this.
322 Does nothing for X86_32 348 *
349 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
350 * Try to work around it here.
351 *
352 * Note we only handle faults in kernel here.
353 * Does nothing on 32-bit.
323 */ 354 */
324static int is_errata93(struct pt_regs *regs, unsigned long address) 355static int is_errata93(struct pt_regs *regs, unsigned long address)
325{ 356{
326#ifdef CONFIG_X86_64 357#ifdef CONFIG_X86_64
327 static int warned; 358 static int once;
359
328 if (address != regs->ip) 360 if (address != regs->ip)
329 return 0; 361 return 0;
362
330 if ((address >> 32) != 0) 363 if ((address >> 32) != 0)
331 return 0; 364 return 0;
365
332 address |= 0xffffffffUL << 32; 366 address |= 0xffffffffUL << 32;
333 if ((address >= (u64)_stext && address <= (u64)_etext) || 367 if ((address >= (u64)_stext && address <= (u64)_etext) ||
334 (address >= MODULES_VADDR && address <= MODULES_END)) { 368 (address >= MODULES_VADDR && address <= MODULES_END)) {
335 if (!warned) { 369 if (!once) {
336 printk(errata93_warning); 370 printk(errata93_warning);
337 warned = 1; 371 once = 1;
338 } 372 }
339 regs->ip = address; 373 regs->ip = address;
340 return 1; 374 return 1;
@@ -344,16 +378,17 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
344} 378}
345 379
346/* 380/*
347 * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal 381 * Work around K8 erratum #100 K8 in compat mode occasionally jumps
348 * addresses >4GB. We catch this in the page fault handler because these 382 * to illegal addresses >4GB.
349 * addresses are not reachable. Just detect this case and return. Any code 383 *
384 * We catch this in the page fault handler because these addresses
385 * are not reachable. Just detect this case and return. Any code
350 * segment in LDT is compatibility mode. 386 * segment in LDT is compatibility mode.
351 */ 387 */
352static int is_errata100(struct pt_regs *regs, unsigned long address) 388static int is_errata100(struct pt_regs *regs, unsigned long address)
353{ 389{
354#ifdef CONFIG_X86_64 390#ifdef CONFIG_X86_64
355 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && 391 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
356 (address >> 32))
357 return 1; 392 return 1;
358#endif 393#endif
359 return 0; 394 return 0;
@@ -363,8 +398,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
363{ 398{
364#ifdef CONFIG_X86_F00F_BUG 399#ifdef CONFIG_X86_F00F_BUG
365 unsigned long nr; 400 unsigned long nr;
401
366 /* 402 /*
367 * Pentium F0 0F C7 C8 bug workaround. 403 * Pentium F0 0F C7 C8 bug workaround:
368 */ 404 */
369 if (boot_cpu_data.f00f_bug) { 405 if (boot_cpu_data.f00f_bug) {
370 nr = (address - idt_descr.address) >> 3; 406 nr = (address - idt_descr.address) >> 3;
@@ -378,8 +414,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
378 return 0; 414 return 0;
379} 415}
380 416
381static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, 417static void
382 unsigned long address) 418show_fault_oops(struct pt_regs *regs, unsigned long error_code,
419 unsigned long address)
383{ 420{
384#ifdef CONFIG_X86_32 421#ifdef CONFIG_X86_32
385 if (!oops_may_print()) 422 if (!oops_may_print())
@@ -389,12 +426,14 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
389#ifdef CONFIG_X86_PAE 426#ifdef CONFIG_X86_PAE
390 if (error_code & PF_INSTR) { 427 if (error_code & PF_INSTR) {
391 unsigned int level; 428 unsigned int level;
429
392 pte_t *pte = lookup_address(address, &level); 430 pte_t *pte = lookup_address(address, &level);
393 431
394 if (pte && pte_present(*pte) && !pte_exec(*pte)) 432 if (pte && pte_present(*pte) && !pte_exec(*pte)) {
395 printk(KERN_CRIT "kernel tried to execute " 433 printk(KERN_CRIT "kernel tried to execute "
396 "NX-protected page - exploit attempt? " 434 "NX-protected page - exploit attempt? "
397 "(uid: %d)\n", current_uid()); 435 "(uid: %d)\n", current_uid());
436 }
398 } 437 }
399#endif 438#endif
400 439
@@ -403,34 +442,45 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
403 printk(KERN_CONT "NULL pointer dereference"); 442 printk(KERN_CONT "NULL pointer dereference");
404 else 443 else
405 printk(KERN_CONT "paging request"); 444 printk(KERN_CONT "paging request");
445
406 printk(KERN_CONT " at %p\n", (void *) address); 446 printk(KERN_CONT " at %p\n", (void *) address);
407 printk(KERN_ALERT "IP:"); 447 printk(KERN_ALERT "IP:");
408 printk_address(regs->ip, 1); 448 printk_address(regs->ip, 1);
449
409 dump_pagetable(address); 450 dump_pagetable(address);
410} 451}
411 452
412#ifdef CONFIG_X86_64 453#ifdef CONFIG_X86_64
413static noinline void pgtable_bad(struct pt_regs *regs, 454static noinline void
414 unsigned long error_code, unsigned long address) 455pgtable_bad(struct pt_regs *regs, unsigned long error_code,
456 unsigned long address)
415{ 457{
416 unsigned long flags = oops_begin(); 458 struct task_struct *tsk;
417 int sig = SIGKILL; 459 unsigned long flags;
418 struct task_struct *tsk = current; 460 int sig;
461
462 flags = oops_begin();
463 tsk = current;
464 sig = SIGKILL;
419 465
420 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 466 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
421 tsk->comm, address); 467 tsk->comm, address);
422 dump_pagetable(address); 468 dump_pagetable(address);
423 tsk->thread.cr2 = address; 469
424 tsk->thread.trap_no = 14; 470 tsk->thread.cr2 = address;
425 tsk->thread.error_code = error_code; 471 tsk->thread.trap_no = 14;
472 tsk->thread.error_code = error_code;
473
426 if (__die("Bad pagetable", regs, error_code)) 474 if (__die("Bad pagetable", regs, error_code))
427 sig = 0; 475 sig = 0;
476
428 oops_end(flags, regs, sig); 477 oops_end(flags, regs, sig);
429} 478}
430#endif 479#endif
431 480
432static noinline void no_context(struct pt_regs *regs, 481static noinline void
433 unsigned long error_code, unsigned long address) 482no_context(struct pt_regs *regs, unsigned long error_code,
483 unsigned long address)
434{ 484{
435 struct task_struct *tsk = current; 485 struct task_struct *tsk = current;
436 unsigned long *stackend; 486 unsigned long *stackend;
@@ -440,18 +490,20 @@ static noinline void no_context(struct pt_regs *regs,
440 int sig; 490 int sig;
441#endif 491#endif
442 492
443 /* Are we prepared to handle this kernel fault? */ 493 /* Are we prepared to handle this kernel fault? */
444 if (fixup_exception(regs)) 494 if (fixup_exception(regs))
445 return; 495 return;
446 496
447 /* 497 /*
448 * X86_32 498 * 32-bit:
449 * Valid to do another page fault here, because if this fault 499 *
450 * had been triggered by is_prefetch fixup_exception would have 500 * Valid to do another page fault here, because if this fault
451 * handled it. 501 * had been triggered by is_prefetch fixup_exception would have
502 * handled it.
503 *
504 * 64-bit:
452 * 505 *
453 * X86_64 506 * Hall of shame of CPU/BIOS bugs.
454 * Hall of shame of CPU/BIOS bugs.
455 */ 507 */
456 if (is_prefetch(regs, error_code, address)) 508 if (is_prefetch(regs, error_code, address))
457 return; 509 return;
@@ -461,7 +513,7 @@ static noinline void no_context(struct pt_regs *regs,
461 513
462 /* 514 /*
463 * Oops. The kernel tried to access some bad page. We'll have to 515 * Oops. The kernel tried to access some bad page. We'll have to
464 * terminate things with extreme prejudice. 516 * terminate things with extreme prejudice:
465 */ 517 */
466#ifdef CONFIG_X86_32 518#ifdef CONFIG_X86_32
467 bust_spinlocks(1); 519 bust_spinlocks(1);
@@ -471,7 +523,7 @@ static noinline void no_context(struct pt_regs *regs,
471 523
472 show_fault_oops(regs, error_code, address); 524 show_fault_oops(regs, error_code, address);
473 525
474 stackend = end_of_stack(tsk); 526 stackend = end_of_stack(tsk);
475 if (*stackend != STACK_END_MAGIC) 527 if (*stackend != STACK_END_MAGIC)
476 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); 528 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
477 529
@@ -487,28 +539,54 @@ static noinline void no_context(struct pt_regs *regs,
487 sig = SIGKILL; 539 sig = SIGKILL;
488 if (__die("Oops", regs, error_code)) 540 if (__die("Oops", regs, error_code))
489 sig = 0; 541 sig = 0;
542
490 /* Executive summary in case the body of the oops scrolled away */ 543 /* Executive summary in case the body of the oops scrolled away */
491 printk(KERN_EMERG "CR2: %016lx\n", address); 544 printk(KERN_EMERG "CR2: %016lx\n", address);
545
492 oops_end(flags, regs, sig); 546 oops_end(flags, regs, sig);
493#endif 547#endif
494} 548}
495 549
496static void __bad_area_nosemaphore(struct pt_regs *regs, 550/*
497 unsigned long error_code, unsigned long address, 551 * Print out info about fatal segfaults, if the show_unhandled_signals
498 int si_code) 552 * sysctl is set:
553 */
554static inline void
555show_signal_msg(struct pt_regs *regs, unsigned long error_code,
556 unsigned long address, struct task_struct *tsk)
557{
558 if (!unhandled_signal(tsk, SIGSEGV))
559 return;
560
561 if (!printk_ratelimit())
562 return;
563
564 printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
565 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
566 tsk->comm, task_pid_nr(tsk), address,
567 (void *)regs->ip, (void *)regs->sp, error_code);
568
569 print_vma_addr(KERN_CONT " in ", regs->ip);
570
571 printk(KERN_CONT "\n");
572}
573
574static void
575__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
576 unsigned long address, int si_code)
499{ 577{
500 struct task_struct *tsk = current; 578 struct task_struct *tsk = current;
501 579
502 /* User mode accesses just cause a SIGSEGV */ 580 /* User mode accesses just cause a SIGSEGV */
503 if (error_code & PF_USER) { 581 if (error_code & PF_USER) {
504 /* 582 /*
505 * It's possible to have interrupts off here. 583 * It's possible to have interrupts off here:
506 */ 584 */
507 local_irq_enable(); 585 local_irq_enable();
508 586
509 /* 587 /*
510 * Valid to do another page fault here because this one came 588 * Valid to do another page fault here because this one came
511 * from user space. 589 * from user space:
512 */ 590 */
513 if (is_prefetch(regs, error_code, address)) 591 if (is_prefetch(regs, error_code, address))
514 return; 592 return;
@@ -516,22 +594,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs,
516 if (is_errata100(regs, address)) 594 if (is_errata100(regs, address))
517 return; 595 return;
518 596
519 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 597 if (unlikely(show_unhandled_signals))
520 printk_ratelimit()) { 598 show_signal_msg(regs, error_code, address, tsk);
521 printk( 599
522 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", 600 /* Kernel addresses are always protection faults: */
523 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 601 tsk->thread.cr2 = address;
524 tsk->comm, task_pid_nr(tsk), address, 602 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
525 (void *) regs->ip, (void *) regs->sp, error_code); 603 tsk->thread.trap_no = 14;
526 print_vma_addr(" in ", regs->ip);
527 printk("\n");
528 }
529 604
530 tsk->thread.cr2 = address;
531 /* Kernel addresses are always protection faults */
532 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
533 tsk->thread.trap_no = 14;
534 force_sig_info_fault(SIGSEGV, si_code, address, tsk); 605 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
606
535 return; 607 return;
536 } 608 }
537 609
@@ -541,15 +613,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs,
541 no_context(regs, error_code, address); 613 no_context(regs, error_code, address);
542} 614}
543 615
544static noinline void bad_area_nosemaphore(struct pt_regs *regs, 616static noinline void
545 unsigned long error_code, unsigned long address) 617bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
618 unsigned long address)
546{ 619{
547 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); 620 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
548} 621}
549 622
550static void __bad_area(struct pt_regs *regs, 623static void
551 unsigned long error_code, unsigned long address, 624__bad_area(struct pt_regs *regs, unsigned long error_code,
552 int si_code) 625 unsigned long address, int si_code)
553{ 626{
554 struct mm_struct *mm = current->mm; 627 struct mm_struct *mm = current->mm;
555 628
@@ -562,67 +635,77 @@ static void __bad_area(struct pt_regs *regs,
562 __bad_area_nosemaphore(regs, error_code, address, si_code); 635 __bad_area_nosemaphore(regs, error_code, address, si_code);
563} 636}
564 637
565static noinline void bad_area(struct pt_regs *regs, 638static noinline void
566 unsigned long error_code, unsigned long address) 639bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
567{ 640{
568 __bad_area(regs, error_code, address, SEGV_MAPERR); 641 __bad_area(regs, error_code, address, SEGV_MAPERR);
569} 642}
570 643
571static noinline void bad_area_access_error(struct pt_regs *regs, 644static noinline void
572 unsigned long error_code, unsigned long address) 645bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
646 unsigned long address)
573{ 647{
574 __bad_area(regs, error_code, address, SEGV_ACCERR); 648 __bad_area(regs, error_code, address, SEGV_ACCERR);
575} 649}
576 650
577/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ 651/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
578static void out_of_memory(struct pt_regs *regs, 652static void
579 unsigned long error_code, unsigned long address) 653out_of_memory(struct pt_regs *regs, unsigned long error_code,
654 unsigned long address)
580{ 655{
581 /* 656 /*
582 * We ran out of memory, call the OOM killer, and return the userspace 657 * We ran out of memory, call the OOM killer, and return the userspace
583 * (which will retry the fault, or kill us if we got oom-killed). 658 * (which will retry the fault, or kill us if we got oom-killed):
584 */ 659 */
585 up_read(&current->mm->mmap_sem); 660 up_read(&current->mm->mmap_sem);
661
586 pagefault_out_of_memory(); 662 pagefault_out_of_memory();
587} 663}
588 664
589static void do_sigbus(struct pt_regs *regs, 665static void
590 unsigned long error_code, unsigned long address) 666do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
591{ 667{
592 struct task_struct *tsk = current; 668 struct task_struct *tsk = current;
593 struct mm_struct *mm = tsk->mm; 669 struct mm_struct *mm = tsk->mm;
594 670
595 up_read(&mm->mmap_sem); 671 up_read(&mm->mmap_sem);
596 672
597 /* Kernel mode? Handle exceptions or die */ 673 /* Kernel mode? Handle exceptions or die: */
598 if (!(error_code & PF_USER)) 674 if (!(error_code & PF_USER))
599 no_context(regs, error_code, address); 675 no_context(regs, error_code, address);
676
600#ifdef CONFIG_X86_32 677#ifdef CONFIG_X86_32
601 /* User space => ok to do another page fault */ 678 /* User space => ok to do another page fault: */
602 if (is_prefetch(regs, error_code, address)) 679 if (is_prefetch(regs, error_code, address))
603 return; 680 return;
604#endif 681#endif
605 tsk->thread.cr2 = address; 682
606 tsk->thread.error_code = error_code; 683 tsk->thread.cr2 = address;
607 tsk->thread.trap_no = 14; 684 tsk->thread.error_code = error_code;
685 tsk->thread.trap_no = 14;
686
608 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); 687 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
609} 688}
610 689
611static noinline void mm_fault_error(struct pt_regs *regs, 690static noinline void
612 unsigned long error_code, unsigned long address, unsigned int fault) 691mm_fault_error(struct pt_regs *regs, unsigned long error_code,
692 unsigned long address, unsigned int fault)
613{ 693{
614 if (fault & VM_FAULT_OOM) 694 if (fault & VM_FAULT_OOM) {
615 out_of_memory(regs, error_code, address); 695 out_of_memory(regs, error_code, address);
616 else if (fault & VM_FAULT_SIGBUS) 696 } else {
617 do_sigbus(regs, error_code, address); 697 if (fault & VM_FAULT_SIGBUS)
618 else 698 do_sigbus(regs, error_code, address);
619 BUG(); 699 else
700 BUG();
701 }
620} 702}
621 703
622static int spurious_fault_check(unsigned long error_code, pte_t *pte) 704static int spurious_fault_check(unsigned long error_code, pte_t *pte)
623{ 705{
624 if ((error_code & PF_WRITE) && !pte_write(*pte)) 706 if ((error_code & PF_WRITE) && !pte_write(*pte))
625 return 0; 707 return 0;
708
626 if ((error_code & PF_INSTR) && !pte_exec(*pte)) 709 if ((error_code & PF_INSTR) && !pte_exec(*pte))
627 return 0; 710 return 0;
628 711
@@ -630,16 +713,19 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
630} 713}
631 714
632/* 715/*
633 * Handle a spurious fault caused by a stale TLB entry. This allows 716 * Handle a spurious fault caused by a stale TLB entry.
634 * us to lazily refresh the TLB when increasing the permissions of a 717 *
635 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very 718 * This allows us to lazily refresh the TLB when increasing the
636 * expensive since that implies doing a full cross-processor TLB 719 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
637 * flush, even if no stale TLB entries exist on other processors. 720 * eagerly is very expensive since that implies doing a full
721 * cross-processor TLB flush, even if no stale TLB entries exist
722 * on other processors.
723 *
638 * There are no security implications to leaving a stale TLB when 724 * There are no security implications to leaving a stale TLB when
639 * increasing the permissions on a page. 725 * increasing the permissions on a page.
640 */ 726 */
641static noinline int spurious_fault(unsigned long error_code, 727static noinline int
642 unsigned long address) 728spurious_fault(unsigned long error_code, unsigned long address)
643{ 729{
644 pgd_t *pgd; 730 pgd_t *pgd;
645 pud_t *pud; 731 pud_t *pud;
@@ -678,20 +764,23 @@ static noinline int spurious_fault(unsigned long error_code,
678 return 0; 764 return 0;
679 765
680 /* 766 /*
681 * Make sure we have permissions in PMD 767 * Make sure we have permissions in PMD.
682 * If not, then there's a bug in the page tables. 768 * If not, then there's a bug in the page tables:
683 */ 769 */
684 ret = spurious_fault_check(error_code, (pte_t *) pmd); 770 ret = spurious_fault_check(error_code, (pte_t *) pmd);
685 WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); 771 WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
772
686 return ret; 773 return ret;
687} 774}
688 775
689/* 776/*
690 * X86_32 777 * 32-bit:
691 * Handle a fault on the vmalloc or module mapping area 778 *
779 * Handle a fault on the vmalloc or module mapping area
692 * 780 *
693 * X86_64 781 * 64-bit:
694 * Handle a fault on the vmalloc area 782 *
783 * Handle a fault on the vmalloc area
695 * 784 *
696 * This assumes no large pages in there. 785 * This assumes no large pages in there.
697 */ 786 */
@@ -702,7 +791,7 @@ static noinline int vmalloc_fault(unsigned long address)
702 pmd_t *pmd_k; 791 pmd_t *pmd_k;
703 pte_t *pte_k; 792 pte_t *pte_k;
704 793
705 /* Make sure we are in vmalloc area */ 794 /* Make sure we are in vmalloc area: */
706 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 795 if (!(address >= VMALLOC_START && address < VMALLOC_END))
707 return -1; 796 return -1;
708 797
@@ -717,9 +806,11 @@ static noinline int vmalloc_fault(unsigned long address)
717 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); 806 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
718 if (!pmd_k) 807 if (!pmd_k)
719 return -1; 808 return -1;
809
720 pte_k = pte_offset_kernel(pmd_k, address); 810 pte_k = pte_offset_kernel(pmd_k, address);
721 if (!pte_present(*pte_k)) 811 if (!pte_present(*pte_k))
722 return -1; 812 return -1;
813
723 return 0; 814 return 0;
724#else 815#else
725 pgd_t *pgd, *pgd_ref; 816 pgd_t *pgd, *pgd_ref;
@@ -727,69 +818,84 @@ static noinline int vmalloc_fault(unsigned long address)
727 pmd_t *pmd, *pmd_ref; 818 pmd_t *pmd, *pmd_ref;
728 pte_t *pte, *pte_ref; 819 pte_t *pte, *pte_ref;
729 820
730 /* Make sure we are in vmalloc area */ 821 /* Make sure we are in vmalloc area: */
731 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 822 if (!(address >= VMALLOC_START && address < VMALLOC_END))
732 return -1; 823 return -1;
733 824
734 /* Copy kernel mappings over when needed. This can also 825 /*
735 happen within a race in page table update. In the later 826 * Copy kernel mappings over when needed. This can also
736 case just flush. */ 827 * happen within a race in page table update. In the later
737 828 * case just flush:
829 */
738 pgd = pgd_offset(current->active_mm, address); 830 pgd = pgd_offset(current->active_mm, address);
739 pgd_ref = pgd_offset_k(address); 831 pgd_ref = pgd_offset_k(address);
740 if (pgd_none(*pgd_ref)) 832 if (pgd_none(*pgd_ref))
741 return -1; 833 return -1;
834
742 if (pgd_none(*pgd)) 835 if (pgd_none(*pgd))
743 set_pgd(pgd, *pgd_ref); 836 set_pgd(pgd, *pgd_ref);
744 else 837 else
745 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); 838 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
746 839
747 /* Below here mismatches are bugs because these lower tables 840 /*
748 are shared */ 841 * Below here mismatches are bugs because these lower tables
842 * are shared:
843 */
749 844
750 pud = pud_offset(pgd, address); 845 pud = pud_offset(pgd, address);
751 pud_ref = pud_offset(pgd_ref, address); 846 pud_ref = pud_offset(pgd_ref, address);
752 if (pud_none(*pud_ref)) 847 if (pud_none(*pud_ref))
753 return -1; 848 return -1;
849
754 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) 850 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
755 BUG(); 851 BUG();
852
756 pmd = pmd_offset(pud, address); 853 pmd = pmd_offset(pud, address);
757 pmd_ref = pmd_offset(pud_ref, address); 854 pmd_ref = pmd_offset(pud_ref, address);
758 if (pmd_none(*pmd_ref)) 855 if (pmd_none(*pmd_ref))
759 return -1; 856 return -1;
857
760 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) 858 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
761 BUG(); 859 BUG();
860
762 pte_ref = pte_offset_kernel(pmd_ref, address); 861 pte_ref = pte_offset_kernel(pmd_ref, address);
763 if (!pte_present(*pte_ref)) 862 if (!pte_present(*pte_ref))
764 return -1; 863 return -1;
864
765 pte = pte_offset_kernel(pmd, address); 865 pte = pte_offset_kernel(pmd, address);
766 /* Don't use pte_page here, because the mappings can point 866
767 outside mem_map, and the NUMA hash lookup cannot handle 867 /*
768 that. */ 868 * Don't use pte_page here, because the mappings can point
869 * outside mem_map, and the NUMA hash lookup cannot handle
870 * that:
871 */
769 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) 872 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
770 BUG(); 873 BUG();
874
771 return 0; 875 return 0;
772#endif 876#endif
773} 877}
774 878
775int show_unhandled_signals = 1; 879int show_unhandled_signals = 1;
776 880
777static inline int access_error(unsigned long error_code, int write, 881static inline int
778 struct vm_area_struct *vma) 882access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
779{ 883{
780 if (write) { 884 if (write) {
781 /* write, present and write, not present */ 885 /* write, present and write, not present: */
782 if (unlikely(!(vma->vm_flags & VM_WRITE))) 886 if (unlikely(!(vma->vm_flags & VM_WRITE)))
783 return 1; 887 return 1;
784 } else if (unlikely(error_code & PF_PROT)) { 888 return 0;
785 /* read, present */
786 return 1;
787 } else {
788 /* read, not present */
789 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
790 return 1;
791 } 889 }
792 890
891 /* read, present: */
892 if (unlikely(error_code & PF_PROT))
893 return 1;
894
895 /* read, not present: */
896 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
897 return 1;
898
793 return 0; 899 return 0;
794} 900}
795 901
@@ -797,9 +903,9 @@ static int fault_in_kernel_space(unsigned long address)
797{ 903{
798#ifdef CONFIG_X86_32 904#ifdef CONFIG_X86_32
799 return address >= TASK_SIZE; 905 return address >= TASK_SIZE;
800#else /* !CONFIG_X86_32 */ 906#else
801 return address >= TASK_SIZE64; 907 return address >= TASK_SIZE64;
802#endif /* CONFIG_X86_32 */ 908#endif
803} 909}
804 910
805/* 911/*
@@ -812,18 +918,19 @@ asmlinkage
812#endif 918#endif
813void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) 919void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
814{ 920{
815 unsigned long address; 921 struct vm_area_struct *vma;
816 struct task_struct *tsk; 922 struct task_struct *tsk;
923 unsigned long address;
817 struct mm_struct *mm; 924 struct mm_struct *mm;
818 struct vm_area_struct *vma;
819 int write; 925 int write;
820 int fault; 926 int fault;
821 927
822 tsk = current; 928 tsk = current;
823 mm = tsk->mm; 929 mm = tsk->mm;
930
824 prefetchw(&mm->mmap_sem); 931 prefetchw(&mm->mmap_sem);
825 932
826 /* get the address */ 933 /* Get the faulting address: */
827 address = read_cr2(); 934 address = read_cr2();
828 935
829 if (unlikely(kmmio_fault(regs, address))) 936 if (unlikely(kmmio_fault(regs, address)))
@@ -847,22 +954,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
847 vmalloc_fault(address) >= 0) 954 vmalloc_fault(address) >= 0)
848 return; 955 return;
849 956
850 /* Can handle a stale RO->RW TLB */ 957 /* Can handle a stale RO->RW TLB: */
851 if (spurious_fault(error_code, address)) 958 if (spurious_fault(error_code, address))
852 return; 959 return;
853 960
854 /* kprobes don't want to hook the spurious faults. */ 961 /* kprobes don't want to hook the spurious faults: */
855 if (notify_page_fault(regs)) 962 if (notify_page_fault(regs))
856 return; 963 return;
857 /* 964 /*
858 * Don't take the mm semaphore here. If we fixup a prefetch 965 * Don't take the mm semaphore here. If we fixup a prefetch
859 * fault we could otherwise deadlock. 966 * fault we could otherwise deadlock:
860 */ 967 */
861 bad_area_nosemaphore(regs, error_code, address); 968 bad_area_nosemaphore(regs, error_code, address);
969
862 return; 970 return;
863 } 971 }
864 972
865 /* kprobes don't want to hook the spurious faults. */ 973 /* kprobes don't want to hook the spurious faults: */
866 if (unlikely(notify_page_fault(regs))) 974 if (unlikely(notify_page_fault(regs)))
867 return; 975 return;
868 /* 976 /*
@@ -870,13 +978,15 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
870 * vmalloc fault has been handled. 978 * vmalloc fault has been handled.
871 * 979 *
872 * User-mode registers count as a user access even for any 980 * User-mode registers count as a user access even for any
873 * potential system fault or CPU buglet. 981 * potential system fault or CPU buglet:
874 */ 982 */
875 if (user_mode_vm(regs)) { 983 if (user_mode_vm(regs)) {
876 local_irq_enable(); 984 local_irq_enable();
877 error_code |= PF_USER; 985 error_code |= PF_USER;
878 } else if (regs->flags & X86_EFLAGS_IF) 986 } else {
879 local_irq_enable(); 987 if (regs->flags & X86_EFLAGS_IF)
988 local_irq_enable();
989 }
880 990
881#ifdef CONFIG_X86_64 991#ifdef CONFIG_X86_64
882 if (unlikely(error_code & PF_RSVD)) 992 if (unlikely(error_code & PF_RSVD))
@@ -884,8 +994,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
884#endif 994#endif
885 995
886 /* 996 /*
887 * If we're in an interrupt, have no user context or are running in an 997 * If we're in an interrupt, have no user context or are running
888 * atomic region then we must not take the fault. 998 * in an atomic region then we must not take the fault:
889 */ 999 */
890 if (unlikely(in_atomic() || !mm)) { 1000 if (unlikely(in_atomic() || !mm)) {
891 bad_area_nosemaphore(regs, error_code, address); 1001 bad_area_nosemaphore(regs, error_code, address);
@@ -894,19 +1004,19 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
894 1004
895 /* 1005 /*
896 * When running in the kernel we expect faults to occur only to 1006 * When running in the kernel we expect faults to occur only to
897 * addresses in user space. All other faults represent errors in the 1007 * addresses in user space. All other faults represent errors in
898 * kernel and should generate an OOPS. Unfortunately, in the case of an 1008 * the kernel and should generate an OOPS. Unfortunately, in the
899 * erroneous fault occurring in a code path which already holds mmap_sem 1009 * case of an erroneous fault occurring in a code path which already
900 * we will deadlock attempting to validate the fault against the 1010 * holds mmap_sem we will deadlock attempting to validate the fault
901 * address space. Luckily the kernel only validly references user 1011 * against the address space. Luckily the kernel only validly
902 * space from well defined areas of code, which are listed in the 1012 * references user space from well defined areas of code, which are
903 * exceptions table. 1013 * listed in the exceptions table.
904 * 1014 *
905 * As the vast majority of faults will be valid we will only perform 1015 * As the vast majority of faults will be valid we will only perform
906 * the source reference check when there is a possibility of a deadlock. 1016 * the source reference check when there is a possibility of a
907 * Attempt to lock the address space, if we cannot we then validate the 1017 * deadlock. Attempt to lock the address space, if we cannot we then
908 * source. If this is invalid we can skip the address space check, 1018 * validate the source. If this is invalid we can skip the address
909 * thus avoiding the deadlock. 1019 * space check, thus avoiding the deadlock:
910 */ 1020 */
911 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 1021 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
912 if ((error_code & PF_USER) == 0 && 1022 if ((error_code & PF_USER) == 0 &&
@@ -917,8 +1027,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
917 down_read(&mm->mmap_sem); 1027 down_read(&mm->mmap_sem);
918 } else { 1028 } else {
919 /* 1029 /*
920 * The above down_read_trylock() might have succeeded in which 1030 * The above down_read_trylock() might have succeeded in
921 * case we'll have missed the might_sleep() from down_read(). 1031 * which case we'll have missed the might_sleep() from
1032 * down_read():
922 */ 1033 */
923 might_sleep(); 1034 might_sleep();
924 } 1035 }
@@ -938,7 +1049,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
938 /* 1049 /*
939 * Accessing the stack below %sp is always a bug. 1050 * Accessing the stack below %sp is always a bug.
940 * The large cushion allows instructions like enter 1051 * The large cushion allows instructions like enter
941 * and pusha to work. ("enter $65535,$31" pushes 1052 * and pusha to work. ("enter $65535, $31" pushes
942 * 32 pointers and then decrements %sp by 65535.) 1053 * 32 pointers and then decrements %sp by 65535.)
943 */ 1054 */
944 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { 1055 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
@@ -957,6 +1068,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
957 */ 1068 */
958good_area: 1069good_area:
959 write = error_code & PF_WRITE; 1070 write = error_code & PF_WRITE;
1071
960 if (unlikely(access_error(error_code, write, vma))) { 1072 if (unlikely(access_error(error_code, write, vma))) {
961 bad_area_access_error(regs, error_code, address); 1073 bad_area_access_error(regs, error_code, address);
962 return; 1074 return;
@@ -965,13 +1077,15 @@ good_area:
965 /* 1077 /*
966 * If for any reason at all we couldn't handle the fault, 1078 * If for any reason at all we couldn't handle the fault,
967 * make sure we exit gracefully rather than endlessly redo 1079 * make sure we exit gracefully rather than endlessly redo
968 * the fault. 1080 * the fault:
969 */ 1081 */
970 fault = handle_mm_fault(mm, vma, address, write); 1082 fault = handle_mm_fault(mm, vma, address, write);
1083
971 if (unlikely(fault & VM_FAULT_ERROR)) { 1084 if (unlikely(fault & VM_FAULT_ERROR)) {
972 mm_fault_error(regs, error_code, address, fault); 1085 mm_fault_error(regs, error_code, address, fault);
973 return; 1086 return;
974 } 1087 }
1088
975 if (fault & VM_FAULT_MAJOR) 1089 if (fault & VM_FAULT_MAJOR)
976 tsk->maj_flt++; 1090 tsk->maj_flt++;
977 else 1091 else
@@ -1004,13 +1118,13 @@ void vmalloc_sync_all(void)
1004 for (address = VMALLOC_START & PMD_MASK; 1118 for (address = VMALLOC_START & PMD_MASK;
1005 address >= TASK_SIZE && address < FIXADDR_TOP; 1119 address >= TASK_SIZE && address < FIXADDR_TOP;
1006 address += PMD_SIZE) { 1120 address += PMD_SIZE) {
1121
1007 unsigned long flags; 1122 unsigned long flags;
1008 struct page *page; 1123 struct page *page;
1009 1124
1010 spin_lock_irqsave(&pgd_lock, flags); 1125 spin_lock_irqsave(&pgd_lock, flags);
1011 list_for_each_entry(page, &pgd_list, lru) { 1126 list_for_each_entry(page, &pgd_list, lru) {
1012 if (!vmalloc_sync_one(page_address(page), 1127 if (!vmalloc_sync_one(page_address(page), address))
1013 address))
1014 break; 1128 break;
1015 } 1129 }
1016 spin_unlock_irqrestore(&pgd_lock, flags); 1130 spin_unlock_irqrestore(&pgd_lock, flags);
@@ -1018,12 +1132,14 @@ void vmalloc_sync_all(void)
1018#else /* CONFIG_X86_64 */ 1132#else /* CONFIG_X86_64 */
1019 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; 1133 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
1020 address += PGDIR_SIZE) { 1134 address += PGDIR_SIZE) {
1135
1021 const pgd_t *pgd_ref = pgd_offset_k(address); 1136 const pgd_t *pgd_ref = pgd_offset_k(address);
1022 unsigned long flags; 1137 unsigned long flags;
1023 struct page *page; 1138 struct page *page;
1024 1139
1025 if (pgd_none(*pgd_ref)) 1140 if (pgd_none(*pgd_ref))
1026 continue; 1141 continue;
1142
1027 spin_lock_irqsave(&pgd_lock, flags); 1143 spin_lock_irqsave(&pgd_lock, flags);
1028 list_for_each_entry(page, &pgd_list, lru) { 1144 list_for_each_entry(page, &pgd_list, lru) {
1029 pgd_t *pgd; 1145 pgd_t *pgd;