Diffstat (limited to 'arch/i386/mm/fault.c')
 -rw-r--r--  arch/i386/mm/fault.c | 210
 1 file changed, 138 insertions(+), 72 deletions(-)
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index cf572d9a3b6e..7f0fcf219a26 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -214,6 +214,68 @@ static noinline void force_sig_info_fault(int si_signo, int si_code,
 
 fastcall void do_invalid_op(struct pt_regs *, unsigned long);
 
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+	unsigned index = pgd_index(address);
+	pgd_t *pgd_k;
+	pud_t *pud, *pud_k;
+	pmd_t *pmd, *pmd_k;
+
+	pgd += index;
+	pgd_k = init_mm.pgd + index;
+
+	if (!pgd_present(*pgd_k))
+		return NULL;
+
+	/*
+	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
+	 * and redundant with the set_pmd() on non-PAE. As would
+	 * set_pud.
+	 */
+
+	pud = pud_offset(pgd, address);
+	pud_k = pud_offset(pgd_k, address);
+	if (!pud_present(*pud_k))
+		return NULL;
+
+	pmd = pmd_offset(pud, address);
+	pmd_k = pmd_offset(pud_k, address);
+	if (!pmd_present(*pmd_k))
+		return NULL;
+	if (!pmd_present(*pmd))
+		set_pmd(pmd, *pmd_k);
+	else
+		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+	return pmd_k;
+}
+
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * This assumes no large pages in there.
+ */
+static inline int vmalloc_fault(unsigned long address)
+{
+	unsigned long pgd_paddr;
+	pmd_t *pmd_k;
+	pte_t *pte_k;
+	/*
+	 * Synchronize this task's top level page-table
+	 * with the 'reference' page table.
+	 *
+	 * Do _not_ use "current" here. We might be inside
+	 * an interrupt in the middle of a task switch..
+	 */
+	pgd_paddr = read_cr3();
+	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+	if (!pmd_k)
+		return -1;
+	pte_k = pte_offset_kernel(pmd_k, address);
+	if (!pte_present(*pte_k))
+		return -1;
+	return 0;
+}
+
 /*
  * This routine handles page faults. It determines the address,
  * and the problem, and then passes it off to one of the appropriate
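
The two helpers added above factor out what used to live under the inline
vmalloc_fault: label at the bottom of do_page_fault(): vmalloc_sync_one()
copies one missing kernel pmd entry from the reference page table
(init_mm.pgd) into a single task's page directory, and vmalloc_fault()
applies that to whatever CR3 currently points at. A minimal userspace model
of the lazy copy, purely for illustration -- ref_pgd, task_pgd and sync_one
are made-up names, not kernel APIs:

	#include <assert.h>
	#include <stdio.h>

	#define ENTRIES 8

	static long ref_pgd[ENTRIES];   /* stands in for init_mm.pgd */
	static long task_pgd[ENTRIES];  /* one task's private top-level table */

	/* Copy one missing kernel entry from the reference table.  NULL means
	 * the reference has no mapping either: a genuine bad fault. */
	static long *sync_one(long *pgd, unsigned index)
	{
		if (!ref_pgd[index])
			return NULL;
		if (!pgd[index])
			pgd[index] = ref_pgd[index];
		else    /* already mapped: must agree, like the patch's BUG_ON */
			assert(pgd[index] == ref_pgd[index]);
		return &pgd[index];
	}

	int main(void)
	{
		ref_pgd[3] = 0x1234;            /* kernel vmalloc creates a mapping */
		assert(sync_one(task_pgd, 3));  /* first touch: copied lazily */
		assert(task_pgd[3] == 0x1234);
		assert(!sync_one(task_pgd, 5)); /* nothing to sync: real fault */
		printf("lazy sync ok\n");
		return 0;
	}

The BUG_ON in the patch encodes the same invariant as the assert here: once
both tables map a kernel range, they must reference the same pmd page.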
@@ -223,6 +285,8 @@ fastcall void do_invalid_op(struct pt_regs *, unsigned long);
  * bit 0 == 0 means no page found, 1 means protection fault
  * bit 1 == 0 means read, 1 means write
  * bit 2 == 0 means kernel, 1 means user-mode
+ * bit 3 == 1 means use of reserved bit detected
+ * bit 4 == 1 means fault was an instruction fetch
  */
 fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 				      unsigned long error_code)
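
The five bits documented in that comment are the error code the CPU pushes
for a page fault; the hunk only extends the documentation to the reserved-bit
and instruction-fetch bits the code below starts using. A small standalone
decoder, for reference only (decode_fault is a hypothetical helper, not part
of the patch):

	#include <stdio.h>

	static void decode_fault(unsigned long error_code)
	{
		printf("%s fault, %s access, %s mode%s%s\n",
		       error_code & 1  ? "protection" : "not-present",
		       error_code & 2  ? "write" : "read",
		       error_code & 4  ? "user" : "kernel",
		       error_code & 8  ? ", reserved bit set" : "",
		       error_code & 16 ? ", instruction fetch" : "");
	}

	int main(void)
	{
		decode_fault(0x00); /* kernel read of a not-present page */
		decode_fault(0x07); /* user write hitting a protection fault */
		decode_fault(0x10); /* kernel instruction fetch: NX violation */
		return 0;
	}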
@@ -237,13 +301,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 	/* get the address */
 	address = read_cr2();
 
-	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
-					SIGSEGV) == NOTIFY_STOP)
-		return;
-	/* It's safe to allow irq's after cr2 has been saved */
-	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
-		local_irq_enable();
-
 	tsk = current;
 
 	si_code = SEGV_MAPERR;
@@ -259,17 +316,29 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 	 *
 	 * This verifies that the fault happens in kernel space
 	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 1) == 0.
+	 * protection error (error_code & 9) == 0.
 	 */
 	if (unlikely(address >= TASK_SIZE)) {
-		if (!(error_code & 5))
-			goto vmalloc_fault;
-		/*
+		if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
+			return;
+		if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+						SIGSEGV) == NOTIFY_STOP)
+			return;
+		/*
 		 * Don't take the mm semaphore here. If we fixup a prefetch
 		 * fault we could otherwise deadlock.
 		 */
 		goto bad_area_nosemaphore;
 	}
+
+	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+					SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
+	   fault has been handled. */
+	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
+		local_irq_enable();
 
 	mm = tsk->mm;
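
The literal 0x0000000d tested above is bits 0, 2 and 3 together: the lazy
vmalloc fixup is only attempted for a kernel-mode (bit 2 clear), not-present
(bit 0 clear) fault with no reserved-bit corruption (bit 3 clear), which is
exactly the updated "(error_code & 9) == 0" comment combined with the
kernel-space check. Note also the ordering this hunk establishes: the fixup
now runs before notify_die() and before interrupts are re-enabled, so a
vmalloc fault taken inside an NMI or mid task-switch never reaches the
notifier chain. A compile-time restatement of the mask -- the PF_* names are
illustrative (borrowed from later kernels), not defined by this patch:

	#include <assert.h>

	#define PF_PROT  0x1   /* bit 0: protection fault, page was present */
	#define PF_USER  0x4   /* bit 2: fault raised in user mode */
	#define PF_RSVD  0x8   /* bit 3: reserved bit set in a paging entry */

	int main(void)
	{
		/* all three must be clear for a fault to qualify as a lazy
		 * vmalloc fault worth fixing up from init_mm's tables */
		assert((PF_PROT | PF_USER | PF_RSVD) == 0x0000000d);
		return 0;
	}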
@@ -440,24 +509,31 @@ no_context:
 
 	bust_spinlocks(1);
 
-#ifdef CONFIG_X86_PAE
-	if (error_code & 16) {
-		pte_t *pte = lookup_address(address);
+	if (oops_may_print()) {
+#ifdef CONFIG_X86_PAE
+		if (error_code & 16) {
+			pte_t *pte = lookup_address(address);
 
-		if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
-			printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
+			if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
+				printk(KERN_CRIT "kernel tried to execute "
+					"NX-protected page - exploit attempt? "
+					"(uid: %d)\n", current->uid);
+		}
+#endif
+		if (address < PAGE_SIZE)
+			printk(KERN_ALERT "BUG: unable to handle kernel NULL "
+					"pointer dereference");
+		else
+			printk(KERN_ALERT "BUG: unable to handle kernel paging"
+					" request");
+		printk(" at virtual address %08lx\n",address);
+		printk(KERN_ALERT " printing eip:\n");
+		printk("%08lx\n", regs->eip);
 	}
-#endif
-	if (address < PAGE_SIZE)
-		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
-	else
-		printk(KERN_ALERT "Unable to handle kernel paging request");
-	printk(" at virtual address %08lx\n",address);
-	printk(KERN_ALERT " printing eip:\n");
-	printk("%08lx\n", regs->eip);
 	page = read_cr3();
 	page = ((unsigned long *) __va(page))[address >> 22];
-	printk(KERN_ALERT "*pde = %08lx\n", page);
+	if (oops_may_print())
+		printk(KERN_ALERT "*pde = %08lx\n", page);
 	/*
 	 * We must not directly access the pte in the highpte
 	 * case, the page table might be allocated in highmem.
@@ -465,7 +541,7 @@ no_context:
 	 * it's allocated already.
 	 */
 #ifndef CONFIG_HIGHPTE
-	if (page & 1) {
+	if ((page & 1) && oops_may_print()) {
 		page &= PAGE_MASK;
 		address &= 0x003ff000;
 		page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
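
All oops-time printks above are now gated on oops_may_print(), so that with
pause_on_oops in effect a follow-on oops does not scribble over the report
already being printed. A userspace model of the gating idiom, with the
caveat that the suppression policy here is a deliberate simplification of
what kernel/panic.c actually implements:

	#include <stdio.h>
	#include <stdbool.h>

	static bool oopsed;     /* set once the first report is out */

	static bool oops_may_print_model(void)
	{
		return !oopsed;
	}

	static void report_fault(unsigned long address)
	{
		if (oops_may_print_model())
			fprintf(stderr, "BUG: unable to handle kernel paging "
				"request at virtual address %08lx\n", address);
		oopsed = true;
	}

	int main(void)
	{
		report_fault(0xdeadbeefUL);     /* printed */
		report_fault(0xfeedfaceUL);     /* suppressed */
		return 0;
	}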
@@ -510,51 +586,41 @@ do_sigbus:
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_no = 14;
 	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
-	return;
-
-vmalloc_fault:
-	{
-		/*
-		 * Synchronize this task's top level page-table
-		 * with the 'reference' page table.
-		 *
-		 * Do _not_ use "tsk" here. We might be inside
-		 * an interrupt in the middle of a task switch..
-		 */
-		int index = pgd_index(address);
-		unsigned long pgd_paddr;
-		pgd_t *pgd, *pgd_k;
-		pud_t *pud, *pud_k;
-		pmd_t *pmd, *pmd_k;
-		pte_t *pte_k;
-
-		pgd_paddr = read_cr3();
-		pgd = index + (pgd_t *)__va(pgd_paddr);
-		pgd_k = init_mm.pgd + index;
-
-		if (!pgd_present(*pgd_k))
-			goto no_context;
-
-		/*
-		 * set_pgd(pgd, *pgd_k); here would be useless on PAE
-		 * and redundant with the set_pmd() on non-PAE. As would
-		 * set_pud.
-		 */
+}
 
-		pud = pud_offset(pgd, address);
-		pud_k = pud_offset(pgd_k, address);
-		if (!pud_present(*pud_k))
-			goto no_context;
-
-		pmd = pmd_offset(pud, address);
-		pmd_k = pmd_offset(pud_k, address);
-		if (!pmd_present(*pmd_k))
-			goto no_context;
-		set_pmd(pmd, *pmd_k);
+#ifndef CONFIG_X86_PAE
+void vmalloc_sync_all(void)
+{
+	/*
+	 * Note that races in the updates of insync and start aren't
+	 * problematic: insync can only get set bits added, and updates to
+	 * start are only improving performance (without affecting correctness
+	 * if undone).
+	 */
+	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+	static unsigned long start = TASK_SIZE;
+	unsigned long address;
 
-		pte_k = pte_offset_kernel(pmd_k, address);
-		if (!pte_present(*pte_k))
-			goto no_context;
-		return;
+	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
+	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+		if (!test_bit(pgd_index(address), insync)) {
+			unsigned long flags;
+			struct page *page;
+
+			spin_lock_irqsave(&pgd_lock, flags);
+			for (page = pgd_list; page; page =
+					(struct page *)page->index)
+				if (!vmalloc_sync_one(page_address(page),
+								address)) {
+					BUG_ON(page != pgd_list);
+					break;
+				}
+			spin_unlock_irqrestore(&pgd_lock, flags);
+			if (!page)
+				set_bit(pgd_index(address), insync);
+		}
+		if (address == start && test_bit(pgd_index(address), insync))
+			start = address + PGDIR_SIZE;
 	}
 }
+#endif
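
vmalloc_sync_all() performs eagerly, for every page directory on pgd_list
and every pgd slot above TASK_SIZE, the same copy that vmalloc_fault() does
lazily; the insync bitmap and the start cursor only skip ranges already
known to be synchronized everywhere. It is built for non-PAE only in this
patch. The diffstat above is limited to fault.c, so the caller is not shown
here; presumably the rest of the commit invokes it when registering
callbacks that can run in contexts (NMI, debug traps) which must never need
even the lazy fixup. A sketch of such a caller, under that assumption --
register_my_die_callback and my_die_chain are hypothetical, and the header
choices are my guess:

	#include <linux/notifier.h>
	#include <linux/vmalloc.h>

	static struct notifier_block *my_die_chain;	/* hypothetical chain head */

	int register_my_die_callback(struct notifier_block *nb)
	{
		/* The callback may live in module (vmalloc) space and fire
		 * in NMI/trap context, where taking the lazy fixup path is
		 * not an option: map it into every page directory now. */
		vmalloc_sync_all();
		return notifier_chain_register(&my_die_chain, nb);
	}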