-rw-r--r--	arch/x86/mm/fault_32.c	148
-rw-r--r--	arch/x86/mm/fault_64.c	151
2 files changed, 160 insertions, 139 deletions
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
index db8d748814e4..bfb0917d699d 100644
--- a/arch/x86/mm/fault_32.c
+++ b/arch/x86/mm/fault_32.c
@@ -1,6 +1,4 @@
 /*
- * linux/arch/i386/mm/fault.c
- *
  * Copyright (C) 1995 Linus Torvalds
  */
 
@@ -30,11 +28,25 @@
 #include <asm/desc.h>
 #include <asm/segment.h>
 
-extern void die(const char *,struct pt_regs *,long);
+/*
+ * Page fault error code bits
+ * bit 0 == 0 means no page found, 1 means protection fault
+ * bit 1 == 0 means read, 1 means write
+ * bit 2 == 0 means kernel, 1 means user-mode
+ * bit 3 == 1 means use of reserved bit detected
+ * bit 4 == 1 means fault was an instruction fetch
+ */
+#define PF_PROT	(1<<0)
+#define PF_WRITE	(1<<1)
+#define PF_USER	(1<<2)
+#define PF_RSVD	(1<<3)
+#define PF_INSTR	(1<<4)
+
+extern void die(const char *, struct pt_regs *, long);
 
-#ifdef CONFIG_KPROBES
 static inline int notify_page_fault(struct pt_regs *regs)
 {
+#ifdef CONFIG_KPROBES
 	int ret = 0;
 
 	/* kprobe_running() needs smp_processor_id() */
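
The PF_* values above are single bits, so a fault's error code is a small
bitfield that can be decoded mechanically. The following is a minimal
userspace sketch of that decoding (illustration only, not kernel code; the
bit assignments are copied from the defines above):

    #include <stdio.h>

    #define PF_PROT  (1 << 0)  /* 0: no page found, 1: protection fault */
    #define PF_WRITE (1 << 1)  /* 0: read access, 1: write access */
    #define PF_USER  (1 << 2)  /* 0: kernel mode, 1: user mode */
    #define PF_RSVD  (1 << 3)  /* 1: reserved bit set in a paging entry */
    #define PF_INSTR (1 << 4)  /* 1: fault was an instruction fetch */

    static void decode_pf_error(unsigned long ec)
    {
        printf("%#lx: %s %s access from %s mode%s%s\n", ec,
               ec & PF_PROT ? "protection fault on" : "not-present page,",
               ec & PF_WRITE ? "write" : "read",
               ec & PF_USER ? "user" : "kernel",
               ec & PF_RSVD ? ", reserved bit set" : "",
               ec & PF_INSTR ? ", instruction fetch" : "");
    }

    int main(void)
    {
        decode_pf_error(PF_USER | PF_WRITE);           /* user write, not present */
        decode_pf_error(PF_PROT | PF_USER | PF_INSTR); /* user exec, protected */
        return 0;
    }
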
@@ -46,13 +58,10 @@ static inline int notify_page_fault(struct pt_regs *regs)
 	}
 
 	return ret;
-}
 #else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
 	return 0;
-}
 #endif
+}
 
 /*
  * Return EIP plus the CS segment base. The segment limit is also
@@ -65,7 +74,7 @@ static inline int notify_page_fault(struct pt_regs *regs)
  * If CS is no longer a valid code segment, or if EIP is beyond the
  * limit, or if it is a kernel address when CS is not a kernel segment,
  * then the returned value will be greater than *eip_limit.
- * 
+ *
  * This is slow, but is very rarely executed.
  */
 static inline unsigned long get_segment_eip(struct pt_regs *regs,
@@ -84,7 +93,7 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs,
 
 	/* The standard kernel/user address space limit. */
 	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
- 
+
 	/* By far the most common cases. */
 	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
 		return ip;
@@ -99,7 +108,7 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs,
 		return 1; /* So that returned ip > *eip_limit. */
 	}
 
-	/* Get the GDT/LDT descriptor base. 
+	/* Get the GDT/LDT descriptor base.
 	   When you look for races in this code remember that
 	   LDT and other horrors are only used in user space. */
 	if (seg & (1<<2)) {
@@ -109,16 +118,16 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs,
 		desc = (void *)desc + (seg & ~7);
 	} else {
 		/* Must disable preemption while reading the GDT. */
 		desc = (u32 *)get_cpu_gdt_table(get_cpu());
 		desc = (void *)desc + (seg & ~7);
 	}
 
 	/* Decode the code segment base from the descriptor */
 	base = get_desc_base((struct desc_struct *)desc);
 
-	if (seg & (1<<2)) {
+	if (seg & (1<<2))
 		mutex_unlock(&current->mm->context.lock);
-	} else
+	else
 		put_cpu();
 
 	/* Adjust EIP and segment limit, and clamp at the kernel limit.
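
For reference, seg & (1<<2) tests the selector's TI (table indicator) bit,
which selects the LDT over the GDT, and seg & ~7 masks off TI and the two
RPL bits so the selector can be used directly as a byte offset into the
descriptor table. A standalone sketch of that selector layout (0x7b, the
i386 user data selector, is used only as an example value):

    #include <stdio.h>

    /* x86 selector layout: | index (13 bits) | TI (1 bit) | RPL (2 bits) | */
    struct selector {
        unsigned index; /* descriptor slot in the GDT or LDT */
        int ti;         /* 1 = LDT, 0 = GDT; the (1<<2) bit tested above */
        unsigned rpl;   /* requestor privilege level */
    };

    static struct selector decode_selector(unsigned short seg)
    {
        struct selector s;

        s.index = seg >> 3; /* seg & ~7 is the same slot as a byte offset */
        s.ti = (seg >> 2) & 1;
        s.rpl = seg & 3;
        return s;
    }

    int main(void)
    {
        struct selector s = decode_selector(0x7b);

        printf("index %u, %s, RPL %u\n", s.index, s.ti ? "LDT" : "GDT", s.rpl);
        return 0;
    }
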
@@ -129,19 +138,19 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs,
 	return ip + base;
 }
 
 /*
  * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
  * Check that here and ignore it.
  */
 static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
 {
 	unsigned long limit;
-	unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
+	unsigned char *instr = (unsigned char *)get_segment_eip(regs, &limit);
 	int scan_more = 1;
 	int prefetch = 0;
 	int i;
 
 	for (i = 0; scan_more && i < 15; i++) {
 		unsigned char opcode;
 		unsigned char instr_hi;
 		unsigned char instr_lo;
@@ -149,27 +158,43 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
 		if (instr > (unsigned char *)limit)
 			break;
 		if (probe_kernel_address(instr, opcode))
 			break;
 
 		instr_hi = opcode & 0xf0;
 		instr_lo = opcode & 0x0f;
 		instr++;
 
 		switch (instr_hi) {
 		case 0x20:
 		case 0x30:
-			/* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
+			/*
+			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+			 * In X86_64 long mode, the CPU will signal invalid
+			 * opcode if some of these prefixes are present so
+			 * X86_64 will never get here anyway
+			 */
 			scan_more = ((instr_lo & 7) == 0x6);
 			break;
-
+#ifdef CONFIG_X86_64
+		case 0x40:
+			/*
+			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+			 * Need to figure out under what instruction mode the
+			 * instruction was issued. Could check the LDT for lm,
+			 * but for now it's good enough to assume that long
+			 * mode only uses well known segments or kernel.
+			 */
+			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
+			break;
+#endif
 		case 0x60:
 			/* 0x64 thru 0x67 are valid prefixes in all modes. */
 			scan_more = (instr_lo & 0xC) == 0x4;
 			break;
 		case 0xF0:
-			/* 0xF0, 0xF2, and 0xF3 are valid prefixes */
+			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
 			scan_more = !instr_lo || (instr_lo>>1) == 1;
 			break;
 		case 0x00:
 			/* Prefetch instruction is 0x0F0D or 0x0F18 */
 			scan_more = 0;
@@ -179,11 +204,11 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
 				break;
 			prefetch = (instr_lo == 0xF) &&
 				(opcode == 0x0D || opcode == 0x18);
-			break; 
+			break;
 		default:
 			scan_more = 0;
 			break;
 		}
 	}
 	return prefetch;
 }
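
The scanner only has to step over prefix bytes well enough to reach a
two-byte opcode: 0x0F 0x0D (AMD PREFETCH) or 0x0F 0x18 (PREFETCHh). The
same loop can be exercised in userspace over a plain byte buffer; in this
hedged sketch the segment-limit and probe_kernel_address() checks collapse
into a simple length check:

    #include <stdio.h>

    /* Scan at most 15 bytes (the x86 instruction-length limit) and report
     * whether they decode as prefixes followed by 0x0F 0x0D or 0x0F 0x18. */
    static int buffer_is_prefetch(const unsigned char *instr, int len)
    {
        int scan_more = 1, prefetch = 0, i;

        for (i = 0; scan_more && i < 15 && i < len; i++) {
            unsigned char opcode = instr[i];
            unsigned char instr_hi = opcode & 0xf0;
            unsigned char instr_lo = opcode & 0x0f;

            switch (instr_hi) {
            case 0x20:
            case 0x30:
                /* segment override prefixes 0x26, 0x2E, 0x36, 0x3E */
                scan_more = ((instr_lo & 7) == 0x6);
                break;
            case 0x60:
                /* 0x64..0x67: FS/GS override, operand/address size */
                scan_more = (instr_lo & 0xC) == 0x4;
                break;
            case 0xF0:
                /* LOCK, REPNE, REP */
                scan_more = !instr_lo || (instr_lo >> 1) == 1;
                break;
            case 0x00:
                /* possible two-byte opcode: 0x0F 0x0D / 0x0F 0x18 */
                scan_more = 0;
                if (instr_lo == 0xF && i + 1 < len)
                    prefetch = (instr[i + 1] == 0x0D ||
                                instr[i + 1] == 0x18);
                break;
            default:
                scan_more = 0;
                break;
            }
        }
        return prefetch;
    }

    int main(void)
    {
        const unsigned char pf[] = { 0x0F, 0x18, 0x00 }; /* prefetchnta (%eax) */
        const unsigned char mv[] = { 0x89, 0xD8 };       /* mov %ebx, %eax */

        printf("%d %d\n", buffer_is_prefetch(pf, 3), buffer_is_prefetch(mv, 2));
        return 0;
    }
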
@@ -199,7 +224,7 @@ static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 		return __is_prefetch(regs, addr);
 	}
 	return 0;
-} 
+}
 
 static noinline void force_sig_info_fault(int si_signo, int si_code,
 					  unsigned long address, struct task_struct *tsk)
@@ -284,19 +309,12 @@ int show_unhandled_signals = 1;
  * This routine handles page faults. It determines the address,
  * and the problem, and then passes it off to one of the appropriate
  * routines.
- *
- * error_code:
- * bit 0 == 0 means no page found, 1 means protection fault
- * bit 1 == 0 means read, 1 means write
- * bit 2 == 0 means kernel, 1 means user-mode
- * bit 3 == 1 means use of reserved bit detected
- * bit 4 == 1 means fault was an instruction fetch
  */
 void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
 	struct task_struct *tsk;
 	struct mm_struct *mm;
-	struct vm_area_struct * vma;
+	struct vm_area_struct *vma;
 	unsigned long address;
 	int write, si_code;
 	int fault;
@@ -307,7 +325,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	trace_hardirqs_fixup();
 
 	/* get the address */
-	address = read_cr2(); 
+	address = read_cr2();
 
 	tsk = current;
 
@@ -350,7 +368,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 	/*
 	 * If we're in an interrupt, have no user context or are running in an
-	 * atomic region then we must not take the fault..
+	 * atomic region then we must not take the fault.
 	 */
 	if (in_atomic() || !mm)
 		goto bad_area_nosemaphore;
@@ -371,7 +389,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	 * thus avoiding the deadlock.
 	 */
 	if (!down_read_trylock(&mm->mmap_sem)) {
-		if ((error_code & 4) == 0 &&
+		if ((error_code & PF_USER) == 0 &&
 		    !search_exception_tables(regs->ip))
 			goto bad_area_nosemaphore;
 		down_read(&mm->mmap_sem);
@@ -384,7 +402,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		goto good_area;
 	if (!(vma->vm_flags & VM_GROWSDOWN))
 		goto bad_area;
-	if (error_code & 4) {
+	if (error_code & PF_USER) {
 		/*
 		 * Accessing the stack below %sp is always a bug.
 		 * The large cushion allows instructions like enter
@@ -403,19 +421,19 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 good_area:
 	si_code = SEGV_ACCERR;
 	write = 0;
-	switch (error_code & 3) {
+	switch (error_code & (PF_PROT|PF_WRITE)) {
 	default:	/* 3: write, present */
 		/* fall through */
-	case 2:		/* write, not present */
+	case PF_WRITE:		/* write, not present */
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
 		write++;
 		break;
-	case 1:		/* read, present */
+	case PF_PROT:		/* read, present */
+		goto bad_area;
+	case 0:			/* read, not present */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 			goto bad_area;
-	case 0:		/* read, not present */
-		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-			goto bad_area;
 	}
 
 survive:
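
The rewritten switch maps each PF_PROT/PF_WRITE combination onto the VMA
permission it requires: any write fault needs VM_WRITE, a read fault on a
present page can only be a protection violation, and a read of a not-present
page merely needs the VMA to permit some kind of access. The same decision
as a self-contained sketch (the VM_* values mirror the kernel's, but this is
illustrative code, not the kernel's implementation):

    #include <stdio.h>

    #define PF_PROT  (1 << 0)
    #define PF_WRITE (1 << 1)

    #define VM_READ  0x1
    #define VM_WRITE 0x2
    #define VM_EXEC  0x4

    /* Return 0 if the faulting access is compatible with vm_flags, -1 if
     * it should be treated as a bad area. */
    static int check_access(unsigned long error_code, unsigned long vm_flags)
    {
        switch (error_code & (PF_PROT | PF_WRITE)) {
        default:       /* 3: write to a present page; fall through */
        case PF_WRITE: /* write, not present */
            return (vm_flags & VM_WRITE) ? 0 : -1;
        case PF_PROT:  /* read, present: the PTE already forbids it */
            return -1;
        case 0:        /* read, not present */
            return (vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) ? 0 : -1;
        }
    }

    int main(void)
    {
        printf("%d\n", check_access(PF_WRITE, VM_READ));            /* -1 */
        printf("%d\n", check_access(0, VM_READ));                   /*  0 */
        printf("%d\n", check_access(PF_PROT | PF_WRITE, VM_WRITE)); /*  0 */
        return 0;
    }
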
@@ -457,14 +475,14 @@ bad_area:
 
 bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & 4) {
+	if (error_code & PF_USER) {
 		/*
 		 * It's possible to have interrupts off here.
 		 */
 		local_irq_enable();
 
 		/*
 		 * Valid to do another page fault here because this one came
 		 * from user space.
 		 */
 		if (is_prefetch(regs, address, error_code))
@@ -492,7 +510,7 @@ bad_area_nosemaphore:
 	 */
 	if (boot_cpu_data.f00f_bug) {
 		unsigned long nr;
- 
+
 		nr = (address - idt_descr.address) >> 3;
 
 		if (nr == 6) {
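
Background for the f00f path: the workaround for that erratum maps the IDT
read-only, so a hit turns into a page fault whose address lands inside the
IDT. Since i386 gate descriptors are 8 bytes, shifting the offset right by 3
recovers the vector number, and vector 6 is the invalid-opcode trap. A tiny
sketch of that arithmetic (the IDT base here is a made-up stand-in):

    #include <stdio.h>

    /* i386 IDT entries are 8 bytes: fault offset >> 3 gives the vector. */
    static unsigned long idt_vector(unsigned long fault_addr,
                                    unsigned long idt_base)
    {
        return (fault_addr - idt_base) >> 3;
    }

    int main(void)
    {
        unsigned long idt_base = 0xc0400000UL; /* stand-in address */

        /* A fault 48 bytes into the IDT is vector 6: invalid opcode. */
        printf("%lu\n", idt_vector(idt_base + 48, idt_base));
        return 0;
    }
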
@@ -507,13 +525,13 @@ no_context:
 	if (fixup_exception(regs))
 		return;
 
-	/* 
+	/*
 	 * Valid to do another page fault here, because if this fault
 	 * had been triggered by is_prefetch fixup_exception would have
 	 * handled it.
 	 */
 	if (is_prefetch(regs, address, error_code))
 		return;
 
 /*
  * Oops. The kernel tried to access some bad page. We'll have to
@@ -541,7 +559,7 @@ no_context:
 	else
 		printk(KERN_ALERT "BUG: unable to handle kernel paging"
 		       " request");
-	printk(" at virtual address %08lx\n",address);
+	printk(" at virtual address %08lx\n", address);
 	printk(KERN_ALERT "printing ip: %08lx ", regs->ip);
 
 	page = read_cr3();
@@ -605,7 +623,7 @@ do_sigbus:
 	up_read(&mm->mmap_sem);
 
 	/* Kernel mode? Handle exceptions or die */
-	if (!(error_code & 4))
+	if (!(error_code & PF_USER))
 		goto no_context;
 
 	/* User space => ok to do another page fault */
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
index 3a94941578fa..7e98a7691283 100644
--- a/arch/x86/mm/fault_64.c
+++ b/arch/x86/mm/fault_64.c
@@ -1,6 +1,4 @@
 /*
- * linux/arch/x86-64/mm/fault.c
- *
  * Copyright (C) 1995 Linus Torvalds
  * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
  */
@@ -33,16 +31,23 @@
 #include <asm/proto.h>
 #include <asm-generic/sections.h>
 
-/* Page fault error code bits */
-#define PF_PROT	(1<<0)		/* or no page found */
+/*
+ * Page fault error code bits
+ * bit 0 == 0 means no page found, 1 means protection fault
+ * bit 1 == 0 means read, 1 means write
+ * bit 2 == 0 means kernel, 1 means user-mode
+ * bit 3 == 1 means use of reserved bit detected
+ * bit 4 == 1 means fault was an instruction fetch
+ */
+#define PF_PROT	(1<<0)
 #define PF_WRITE	(1<<1)
 #define PF_USER	(1<<2)
 #define PF_RSVD	(1<<3)
 #define PF_INSTR	(1<<4)
 
-#ifdef CONFIG_KPROBES
 static inline int notify_page_fault(struct pt_regs *regs)
 {
+#ifdef CONFIG_KPROBES
 	int ret = 0;
 
 	/* kprobe_running() needs smp_processor_id() */
@@ -54,75 +59,75 @@ static inline int notify_page_fault(struct pt_regs *regs)
 	}
 
 	return ret;
-}
 #else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
 	return 0;
-}
 #endif
+}
 
 /* Sometimes the CPU reports invalid exceptions on prefetch.
    Check that here and ignore.
    Opcode checker based on code by Richard Brunner */
 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 				unsigned long error_code)
 {
 	unsigned char *instr;
 	int scan_more = 1;
 	int prefetch = 0;
 	unsigned char *max_instr;
 
 	/* If it was a exec fault ignore */
 	if (error_code & PF_INSTR)
 		return 0;
 
 	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
 	max_instr = instr + 15;
 
 	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
 		return 0;
 
 	while (scan_more && instr < max_instr) {
 		unsigned char opcode;
 		unsigned char instr_hi;
 		unsigned char instr_lo;
 
 		if (probe_kernel_address(instr, opcode))
 			break;
 
 		instr_hi = opcode & 0xf0;
 		instr_lo = opcode & 0x0f;
 		instr++;
 
 		switch (instr_hi) {
 		case 0x20:
 		case 0x30:
-			/* Values 0x26,0x2E,0x36,0x3E are valid x86
-			   prefixes. In long mode, the CPU will signal
-			   invalid opcode if some of these prefixes are
-			   present so we will never get here anyway */
+			/*
+			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+			 * In X86_64 long mode, the CPU will signal invalid
+			 * opcode if some of these prefixes are present so
+			 * X86_64 will never get here anyway
+			 */
 			scan_more = ((instr_lo & 7) == 0x6);
 			break;
-
+#ifdef CONFIG_X86_64
 		case 0x40:
-			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
-			   Need to figure out under what instruction mode the
-			   instruction was issued ... */
-			/* Could check the LDT for lm, but for now it's good
-			   enough to assume that long mode only uses well known
-			   segments or kernel. */
+			/*
+			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+			 * Need to figure out under what instruction mode the
+			 * instruction was issued. Could check the LDT for lm,
+			 * but for now it's good enough to assume that long
+			 * mode only uses well known segments or kernel.
+			 */
 			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
 			break;
-
+#endif
 		case 0x60:
 			/* 0x64 thru 0x67 are valid prefixes in all modes. */
 			scan_more = (instr_lo & 0xC) == 0x4;
 			break;
 		case 0xF0:
 			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
 			scan_more = !instr_lo || (instr_lo>>1) == 1;
 			break;
 		case 0x00:
 			/* Prefetch instruction is 0x0F0D or 0x0F18 */
 			scan_more = 0;
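
The new case 0x40 exists because 0x40..0x4F change meaning with the CPU
mode: in 64-bit mode they are REX prefixes, while in legacy 32-bit code the
same bytes are the one-byte inc/dec instructions. The kernel approximates
"was this long mode?" by accepting kernel context or the well-known
__USER_CS selector. The prefix test itself, as a sketch:

    #include <stdio.h>

    /* 0x40..0x4F are REX prefixes only when executing in 64-bit mode. */
    static int is_rex_prefix(unsigned char opcode, int long_mode)
    {
        return long_mode && (opcode & 0xf0) == 0x40;
    }

    int main(void)
    {
        /* 0x48 is REX.W in long mode but "dec %eax" in 32-bit code. */
        printf("long mode: %d, legacy mode: %d\n",
               is_rex_prefix(0x48, 1), is_rex_prefix(0x48, 0));
        return 0;
    }
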
@@ -130,20 +135,20 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 				break;
 			prefetch = (instr_lo == 0xF) &&
 				(opcode == 0x0D || opcode == 0x18);
-			break; 
+			break;
 		default:
 			scan_more = 0;
 			break;
 		}
 	}
 	return prefetch;
 }
 
 static int bad_address(void *p)
 {
 	unsigned long dummy;
 	return probe_kernel_address((unsigned long *)p, dummy);
 }
 
 void dump_pagetable(unsigned long address)
 {
@@ -154,11 +159,11 @@ void dump_pagetable(unsigned long address)
 
 	pgd = (pgd_t *)read_cr3();
 
-	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 
+	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
 	pgd += pgd_index(address);
 	if (bad_address(pgd)) goto bad;
 	printk("PGD %lx ", pgd_val(*pgd));
 	if (!pgd_present(*pgd)) goto ret;
 
 	pud = pud_offset(pgd, address);
 	if (bad_address(pud)) goto bad;
@@ -172,7 +177,7 @@ void dump_pagetable(unsigned long address)
 
 	pte = pte_offset_kernel(pmd, address);
 	if (bad_address(pte)) goto bad;
-	printk("PTE %lx", pte_val(*pte)); 
+	printk("PTE %lx", pte_val(*pte));
 ret:
 	printk("\n");
 	return;
@@ -180,7 +185,7 @@ bad:
 	printk("BAD\n");
 }
 
-static const char errata93_warning[] = 
+static const char errata93_warning[] =
 KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
 KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
 KERN_ERR "******* Please consider a BIOS update.\n"
@@ -188,31 +193,31 @@ KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
 
 /* Workaround for K8 erratum #93 & buggy BIOS.
    BIOS SMM functions are required to use a specific workaround
-   to avoid corruption of the 64bit RIP register on C stepping K8. 
+   to avoid corruption of the 64bit RIP register on C stepping K8.
    A lot of BIOS that didn't get tested properly miss this.
    The OS sees this as a page fault with the upper 32bits of RIP cleared.
    Try to work around it here.
    Note we only handle faults in kernel here. */
 
 static int is_errata93(struct pt_regs *regs, unsigned long address)
 {
 	static int warned;
 	if (address != regs->ip)
 		return 0;
 	if ((address >> 32) != 0)
 		return 0;
 	address |= 0xffffffffUL << 32;
 	if ((address >= (u64)_stext && address <= (u64)_etext) ||
 	    (address >= MODULES_VADDR && address <= MODULES_END)) {
 		if (!warned) {
 			printk(errata93_warning);
 			warned = 1;
 		}
 		regs->ip = address;
 		return 1;
 	}
 	return 0;
 }
 
 static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 				 unsigned long error_code)
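
is_errata93() recovers from faults where a buggy BIOS SMM path has zeroed
the upper 32 bits of RIP: if the fault address equals RIP, and filling the
upper half with ones lands back inside kernel text or the modules range, the
full address is restored and execution resumes. A standalone sketch of the
address test, with made-up bounds standing in for _stext/_etext and the
modules range (assumes 64-bit longs, as on x86-64):

    #include <stdio.h>

    #define KTEXT_START 0xffffffff80000000UL /* stand-in for _stext */
    #define KTEXT_END   0xffffffff80800000UL /* stand-in for _etext */

    /* If the upper 32 bits of a faulting RIP were cleared (K8 erratum #93),
     * refilling them with ones should land back in kernel text. */
    static int errata93_fixup(unsigned long *rip, unsigned long address)
    {
        if (address != *rip)
            return 0;
        if ((address >> 32) != 0) /* upper half intact: not the erratum */
            return 0;
        address |= 0xffffffffUL << 32;
        if (address >= KTEXT_START && address <= KTEXT_END) {
            *rip = address;
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        unsigned long rip = 0x80123456UL; /* truncated kernel text address */

        printf("fixed: %d, rip: %#lx\n", errata93_fixup(&rip, rip), rip);
        return 0;
    }
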
@@ -296,7 +301,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 {
 	struct task_struct *tsk;
 	struct mm_struct *mm;
-	struct vm_area_struct * vma;
+	struct vm_area_struct *vma;
 	unsigned long address;
 	int write, fault;
 	unsigned long flags;
@@ -360,8 +365,8 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 		pgtable_bad(address, regs, error_code);
 
 	/*
-	 * If we're in an interrupt or have no user
-	 * context, we must not take the fault..
+	 * If we're in an interrupt, have no user context or are running in an
+	 * atomic region then we must not take the fault.
 	 */
 	if (unlikely(in_atomic() || !mm))
 		goto bad_area_nosemaphore;
@@ -403,7 +408,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 		goto good_area;
 	if (!(vma->vm_flags & VM_GROWSDOWN))
 		goto bad_area;
-	if (error_code & 4) {
+	if (error_code & PF_USER) {
 		/* Allow userspace just enough access below the stack pointer
 		 * to let the 'enter' instruction work.
 		 */
@@ -420,18 +425,18 @@ good_area:
 	info.si_code = SEGV_ACCERR;
 	write = 0;
 	switch (error_code & (PF_PROT|PF_WRITE)) {
 	default:	/* 3: write, present */
 		/* fall through */
 	case PF_WRITE:		/* write, not present */
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
 		write++;
 		break;
 	case PF_PROT:		/* read, present */
+		goto bad_area;
+	case 0:			/* read, not present */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 			goto bad_area;
-	case 0:		/* read, not present */
-		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-			goto bad_area;
 	}
 
 	/*
@@ -491,7 +496,7 @@ bad_area_nosemaphore:
 			tsk->comm, tsk->pid, address, regs->ip,
 			regs->sp, error_code);
 	}
- 
+
 	tsk->thread.cr2 = address;
 	/* Kernel addresses are always protection faults */
 	tsk->thread.error_code = error_code | (address >= TASK_SIZE);
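
The "kernel addresses are always protection faults" line relies on PF_PROT
being bit 0: a C comparison yields 0 or 1, so OR-ing (address >= TASK_SIZE)
into the saved error code sets exactly that bit for kernel addresses. A tiny
sketch (the TASK_SIZE value is a stand-in, and 64-bit longs are assumed):

    #include <stdio.h>

    #define PF_PROT   (1 << 0)
    #define TASK_SIZE 0x7ffffffff000UL /* stand-in user/kernel split */

    int main(void)
    {
        unsigned long error_code = 0; /* not-present kernel read */
        unsigned long address = 0xffff810000000000UL;

        /* The comparison result is 0 or 1, i.e. exactly the PF_PROT bit. */
        error_code |= (address >= TASK_SIZE);
        printf("%#lx\n", error_code & PF_PROT); /* prints 0x1 */
        return 0;
    }
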
@@ -505,21 +510,19 @@ bad_area_nosemaphore:
 	}
 
 no_context:
-
 	/* Are we prepared to handle this kernel fault? */
-	if (fixup_exception(regs)) {
+	if (fixup_exception(regs))
 		return;
-	}
 
 	/*
 	 * Hall of shame of CPU/BIOS bugs.
 	 */
 
 	if (is_prefetch(regs, address, error_code))
 		return;
 
 	if (is_errata93(regs, address))
 		return;
 
 /*
  * Oops. The kernel tried to access some bad page. We'll have to
@@ -532,7 +535,7 @@ no_context:
 		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
 	else
 		printk(KERN_ALERT "Unable to handle kernel paging request");
-	printk(" at %016lx RIP: \n" KERN_ALERT,address);
+	printk(" at %016lx RIP: \n" KERN_ALERT, address);
 	printk_address(regs->ip);
 	dump_pagetable(address);
 	tsk->thread.cr2 = address;
@@ -582,7 +585,7 @@ LIST_HEAD(pgd_list);
 
 void vmalloc_sync_all(void)
 {
-	/* Note that races in the updates of insync and start aren't 
+	/* Note that races in the updates of insync and start aren't
 	   problematic:
 	   insync can only get set bits added, and updates to start are only
 	   improving performance (without affecting correctness if undone). */
@@ -614,6 +617,6 @@ void vmalloc_sync_all(void)
 	}
 	/* Check that there is no need to do the same for the modules area. */
 	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
-	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 
+	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
 				(__START_KERNEL & PGDIR_MASK)));
 }
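
The closing BUILD_BUG_ON() lines turn a layout assumption into a
compile-time check: vmalloc_sync_all() walks only the vmalloc range, and the
comment above notes that the modules area needs no equivalent pass, which
holds only while the modules area sits in the same top-level page-table slot
as the kernel image. The kernel's macro of this era is roughly the following
sketch; a true condition produces a negative array size and the build fails:

    /* Roughly the classic definition from include/linux/kernel.h. */
    #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))

    int main(void)
    {
        BUILD_BUG_ON(sizeof(int) < 2);     /* false: compiles fine */
        /* BUILD_BUG_ON(sizeof(int) < 8);     true: would break the build */
        return 0;
    }
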