diff options
Diffstat (limited to 'arch/i386/mm/fault.c')
| -rw-r--r-- | arch/i386/mm/fault.c | 210 |
1 files changed, 138 insertions, 72 deletions
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index cf572d9a3b6e..7f0fcf219a26 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c | |||
| @@ -214,6 +214,68 @@ static noinline void force_sig_info_fault(int si_signo, int si_code, | |||
| 214 | 214 | ||
| 215 | fastcall void do_invalid_op(struct pt_regs *, unsigned long); | 215 | fastcall void do_invalid_op(struct pt_regs *, unsigned long); |
| 216 | 216 | ||
| 217 | static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) /* Bring the pmd entry covering address, in the page table whose top level starts at pgd, in line with init_mm's entry. Returns init_mm's pmd entry (pmd_k), or NULL when init_mm has no present pgd, pud or pmd entry for address. */ | ||
| 218 | { | ||
| 219 | unsigned index = pgd_index(address); /* the same top-level slot is used in both tables below */ | ||
| 220 | pgd_t *pgd_k; | ||
| 221 | pud_t *pud, *pud_k; | ||
| 222 | pmd_t *pmd, *pmd_k; | ||
| 223 | |||
| 224 | pgd += index; /* slot for address in the table passed by the caller */ | ||
| 225 | pgd_k = init_mm.pgd + index; /* matching slot in init_mm, used as the reference (the _k variables) */ | ||
| 226 | |||
| 227 | if (!pgd_present(*pgd_k)) | ||
| 228 | return NULL; /* nothing in the reference table to sync from */ | ||
| 229 | |||
| 230 | /* | ||
| 231 | * set_pgd(pgd, *pgd_k); here would be useless on PAE | ||
| 232 | * and redundant with the set_pmd() on non-PAE. As would | ||
| 233 | * set_pud. | ||
| 234 | */ | ||
| 235 | |||
| 236 | pud = pud_offset(pgd, address); | ||
| 237 | pud_k = pud_offset(pgd_k, address); | ||
| 238 | if (!pud_present(*pud_k)) | ||
| 239 | return NULL; /* only the reference side is checked for presence at this level */ | ||
| 240 | |||
| 241 | pmd = pmd_offset(pud, address); | ||
| 242 | pmd_k = pmd_offset(pud_k, address); | ||
| 243 | if (!pmd_present(*pmd_k)) | ||
| 244 | return NULL; | ||
| 245 | if (!pmd_present(*pmd)) | ||
| 246 | set_pmd(pmd, *pmd_k); /* target has no entry yet: copy the reference entry */ | ||
| 247 | else | ||
| 248 | BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); /* an existing entry must refer to the same page as the reference */ | ||
| 249 | return pmd_k; /* reference pmd entry; vmalloc_fault() continues its walk from it */ | ||
| 250 | } | ||
| 251 | |||
| 252 | /* | ||
| 253 | * Handle a fault on the vmalloc or module mapping area | ||
| 254 | * | ||
| 255 | * This assumes no large pages in there. | ||
| 256 | */ | ||
| 257 | static inline int vmalloc_fault(unsigned long address) /* Returns 0 when, after syncing the pmd entry via vmalloc_sync_one(), the reference pte for address is present; returns -1 otherwise. do_page_fault() returns early when the result is >= 0. */ | ||
| 258 | { | ||
| 259 | unsigned long pgd_paddr; | ||
| 260 | pmd_t *pmd_k; | ||
| 261 | pte_t *pte_k; | ||
| 262 | /* | ||
| 263 | * Synchronize this task's top level page-table | ||
| 264 | * with the 'reference' page table. | ||
| 265 | * | ||
| 266 | * Do _not_ use "current" here. We might be inside | ||
| 267 | * an interrupt in the middle of a task switch.. | ||
| 268 | */ | ||
| 269 | pgd_paddr = read_cr3(); /* top-level table is taken from read_cr3() instead of from "current" */ | ||
| 270 | pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); | ||
| 271 | if (!pmd_k) | ||
| 272 | return -1; /* reference table has no present pgd/pud/pmd entry for address */ | ||
| 273 | pte_k = pte_offset_kernel(pmd_k, address); /* pte is looked up through the reference pmd entry */ | ||
| 274 | if (!pte_present(*pte_k)) | ||
| 275 | return -1; /* reference pte not present: not resolved here */ | ||
| 276 | return 0; | ||
| 277 | } | ||
| 278 | |||
| 217 | /* | 279 | /* |
| 218 | * This routine handles page faults. It determines the address, | 280 | * This routine handles page faults. It determines the address, |
| 219 | * and the problem, and then passes it off to one of the appropriate | 281 | * and the problem, and then passes it off to one of the appropriate |
| @@ -223,6 +285,8 @@ fastcall void do_invalid_op(struct pt_regs *, unsigned long); | |||
| 223 | * bit 0 == 0 means no page found, 1 means protection fault | 285 | * bit 0 == 0 means no page found, 1 means protection fault |
| 224 | * bit 1 == 0 means read, 1 means write | 286 | * bit 1 == 0 means read, 1 means write |
| 225 | * bit 2 == 0 means kernel, 1 means user-mode | 287 | * bit 2 == 0 means kernel, 1 means user-mode |
| 288 | * bit 3 == 1 means use of reserved bit detected | ||
| 289 | * bit 4 == 1 means fault was an instruction fetch | ||
| 226 | */ | 290 | */ |
| 227 | fastcall void __kprobes do_page_fault(struct pt_regs *regs, | 291 | fastcall void __kprobes do_page_fault(struct pt_regs *regs, |
| 228 | unsigned long error_code) | 292 | unsigned long error_code) |
| @@ -237,13 +301,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, | |||
| 237 | /* get the address */ | 301 | /* get the address */ |
| 238 | address = read_cr2(); | 302 | address = read_cr2(); |
| 239 | 303 | ||
| 240 | if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | ||
| 241 | SIGSEGV) == NOTIFY_STOP) | ||
| 242 | return; | ||
| 243 | /* It's safe to allow irq's after cr2 has been saved */ | ||
| 244 | if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) | ||
| 245 | local_irq_enable(); | ||
| 246 | |||
| 247 | tsk = current; | 304 | tsk = current; |
| 248 | 305 | ||
| 249 | si_code = SEGV_MAPERR; | 306 | si_code = SEGV_MAPERR; |
| @@ -259,17 +316,29 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, | |||
| 259 | * | 316 | * |
| 260 | * This verifies that the fault happens in kernel space | 317 | * This verifies that the fault happens in kernel space |
| 261 | * (error_code & 4) == 0, and that the fault was not a | 318 | * (error_code & 4) == 0, and that the fault was not a |
| 262 | * protection error (error_code & 1) == 0. | 319 | * protection error (error_code & 9) == 0. |
| 263 | */ | 320 | */ |
| 264 | if (unlikely(address >= TASK_SIZE)) { | 321 | if (unlikely(address >= TASK_SIZE)) { |
| 265 | if (!(error_code & 5)) | 322 | if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) |
| 266 | goto vmalloc_fault; | 323 | return; |
| 267 | /* | 324 | if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, |
| 325 | SIGSEGV) == NOTIFY_STOP) | ||
| 326 | return; | ||
| 327 | /* | ||
| 268 | * Don't take the mm semaphore here. If we fixup a prefetch | 328 | * Don't take the mm semaphore here. If we fixup a prefetch |
| 269 | * fault we could otherwise deadlock. | 329 | * fault we could otherwise deadlock. |
| 270 | */ | 330 | */ |
| 271 | goto bad_area_nosemaphore; | 331 | goto bad_area_nosemaphore; |
| 272 | } | 332 | } |
| 333 | |||
| 334 | if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | ||
| 335 | SIGSEGV) == NOTIFY_STOP) | ||
| 336 | return; | ||
| 337 | |||
| 338 | /* It's safe to allow irq's after cr2 has been saved and the vmalloc | ||
| 339 | fault has been handled. */ | ||
| 340 | if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) | ||
| 341 | local_irq_enable(); | ||
| 273 | 342 | ||
| 274 | mm = tsk->mm; | 343 | mm = tsk->mm; |
| 275 | 344 | ||
| @@ -440,24 +509,31 @@ no_context: | |||
| 440 | 509 | ||
| 441 | bust_spinlocks(1); | 510 | bust_spinlocks(1); |
| 442 | 511 | ||
| 443 | #ifdef CONFIG_X86_PAE | 512 | if (oops_may_print()) { |
| 444 | if (error_code & 16) { | 513 | #ifdef CONFIG_X86_PAE |
| 445 | pte_t *pte = lookup_address(address); | 514 | if (error_code & 16) { |
| 515 | pte_t *pte = lookup_address(address); | ||
| 446 | 516 | ||
| 447 | if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) | 517 | if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) |
| 448 | printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid); | 518 | printk(KERN_CRIT "kernel tried to execute " |
| 519 | "NX-protected page - exploit attempt? " | ||
| 520 | "(uid: %d)\n", current->uid); | ||
| 521 | } | ||
| 522 | #endif | ||
| 523 | if (address < PAGE_SIZE) | ||
| 524 | printk(KERN_ALERT "BUG: unable to handle kernel NULL " | ||
| 525 | "pointer dereference"); | ||
| 526 | else | ||
| 527 | printk(KERN_ALERT "BUG: unable to handle kernel paging" | ||
| 528 | " request"); | ||
| 529 | printk(" at virtual address %08lx\n",address); | ||
| 530 | printk(KERN_ALERT " printing eip:\n"); | ||
| 531 | printk("%08lx\n", regs->eip); | ||
| 449 | } | 532 | } |
| 450 | #endif | ||
| 451 | if (address < PAGE_SIZE) | ||
| 452 | printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); | ||
| 453 | else | ||
| 454 | printk(KERN_ALERT "Unable to handle kernel paging request"); | ||
| 455 | printk(" at virtual address %08lx\n",address); | ||
| 456 | printk(KERN_ALERT " printing eip:\n"); | ||
| 457 | printk("%08lx\n", regs->eip); | ||
| 458 | page = read_cr3(); | 533 | page = read_cr3(); |
| 459 | page = ((unsigned long *) __va(page))[address >> 22]; | 534 | page = ((unsigned long *) __va(page))[address >> 22]; |
| 460 | printk(KERN_ALERT "*pde = %08lx\n", page); | 535 | if (oops_may_print()) |
| 536 | printk(KERN_ALERT "*pde = %08lx\n", page); | ||
| 461 | /* | 537 | /* |
| 462 | * We must not directly access the pte in the highpte | 538 | * We must not directly access the pte in the highpte |
| 463 | * case, the page table might be allocated in highmem. | 539 | * case, the page table might be allocated in highmem. |
| @@ -465,7 +541,7 @@ no_context: | |||
| 465 | * it's allocated already. | 541 | * it's allocated already. |
| 466 | */ | 542 | */ |
| 467 | #ifndef CONFIG_HIGHPTE | 543 | #ifndef CONFIG_HIGHPTE |
| 468 | if (page & 1) { | 544 | if ((page & 1) && oops_may_print()) { |
| 469 | page &= PAGE_MASK; | 545 | page &= PAGE_MASK; |
| 470 | address &= 0x003ff000; | 546 | address &= 0x003ff000; |
| 471 | page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; | 547 | page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; |
| @@ -510,51 +586,41 @@ do_sigbus: | |||
| 510 | tsk->thread.error_code = error_code; | 586 | tsk->thread.error_code = error_code; |
| 511 | tsk->thread.trap_no = 14; | 587 | tsk->thread.trap_no = 14; |
| 512 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); | 588 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); |
| 513 | return; | 589 | } |
| 514 | |||
| 515 | vmalloc_fault: | ||
| 516 | { | ||
| 517 | /* | ||
| 518 | * Synchronize this task's top level page-table | ||
| 519 | * with the 'reference' page table. | ||
| 520 | * | ||
| 521 | * Do _not_ use "tsk" here. We might be inside | ||
| 522 | * an interrupt in the middle of a task switch.. | ||
| 523 | */ | ||
| 524 | int index = pgd_index(address); | ||
| 525 | unsigned long pgd_paddr; | ||
| 526 | pgd_t *pgd, *pgd_k; | ||
| 527 | pud_t *pud, *pud_k; | ||
| 528 | pmd_t *pmd, *pmd_k; | ||
| 529 | pte_t *pte_k; | ||
| 530 | |||
| 531 | pgd_paddr = read_cr3(); | ||
| 532 | pgd = index + (pgd_t *)__va(pgd_paddr); | ||
| 533 | pgd_k = init_mm.pgd + index; | ||
| 534 | |||
| 535 | if (!pgd_present(*pgd_k)) | ||
| 536 | goto no_context; | ||
| 537 | |||
| 538 | /* | ||
| 539 | * set_pgd(pgd, *pgd_k); here would be useless on PAE | ||
| 540 | * and redundant with the set_pmd() on non-PAE. As would | ||
| 541 | * set_pud. | ||
| 542 | */ | ||
| 543 | 590 | ||
| 544 | pud = pud_offset(pgd, address); | 591 | #ifndef CONFIG_X86_PAE |
| 545 | pud_k = pud_offset(pgd_k, address); | 592 | void vmalloc_sync_all(void) |
| 546 | if (!pud_present(*pud_k)) | 593 | { |
| 547 | goto no_context; | 594 | /* |
| 548 | 595 | * Note that races in the updates of insync and start aren't | |
| 549 | pmd = pmd_offset(pud, address); | 596 | * problematic: insync can only get set bits added, and updates to |
| 550 | pmd_k = pmd_offset(pud_k, address); | 597 | * start are only improving performance (without affecting correctness |
| 551 | if (!pmd_present(*pmd_k)) | 598 | * if undone). |
| 552 | goto no_context; | 599 | */ |
| 553 | set_pmd(pmd, *pmd_k); | 600 | static DECLARE_BITMAP(insync, PTRS_PER_PGD); |
| 601 | static unsigned long start = TASK_SIZE; | ||
| 602 | unsigned long address; | ||
| 554 | 603 | ||
| 555 | pte_k = pte_offset_kernel(pmd_k, address); | 604 | BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); |
| 556 | if (!pte_present(*pte_k)) | 605 | for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { |
| 557 | goto no_context; | 606 | if (!test_bit(pgd_index(address), insync)) { |
| 558 | return; | 607 | unsigned long flags; |
| 608 | struct page *page; | ||
| 609 | |||
| 610 | spin_lock_irqsave(&pgd_lock, flags); | ||
| 611 | for (page = pgd_list; page; page = | ||
| 612 | (struct page *)page->index) | ||
| 613 | if (!vmalloc_sync_one(page_address(page), | ||
| 614 | address)) { | ||
| 615 | BUG_ON(page != pgd_list); | ||
| 616 | break; | ||
| 617 | } | ||
| 618 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
| 619 | if (!page) | ||
| 620 | set_bit(pgd_index(address), insync); | ||
| 621 | } | ||
| 622 | if (address == start && test_bit(pgd_index(address), insync)) | ||
| 623 | start = address + PGDIR_SIZE; | ||
| 559 | } | 624 | } |
| 560 | } | 625 | } |
| 626 | #endif | ||
