From fb746d0e1365b7472ccc4c3d5b0672b34a092d0b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 21 Jan 2009 20:45:41 +0100 Subject: x86: optimise page fault entry, cleanup tsk is already assigned to current, drop the redundant second assignment. Signed-off-by: Johannes Weiner Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 033292dc9e21..8f4b859a04b3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -419,7 +419,6 @@ static noinline void pgtable_bad(struct pt_regs *regs, printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - tsk = current; tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; -- cgit v1.2.2 From 010060741ad35eacb504414bc6fb9bb575b15f62 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Jan 2009 16:02:12 +0100 Subject: x86: add might_sleep() to do_page_fault() Impact: widen debug checks VirtualBox calls do_page_fault() from an atomic context but runs into a might_sleep() way pas this point, cure that. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8f4b859a04b3..eb4d7fe05938 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -888,6 +888,12 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) return; } down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in which + * case we'll have missed the might_sleep() from down_read(). + */ + might_sleep(); } vma = find_vma(mm, address); -- cgit v1.2.2 From 0973a06cde8cc1522fbcf2baacb926f1ee3f4c79 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Feb 2009 15:24:09 -0800 Subject: x86: mm: introduce helper function in fault.c Impact: cleanup Introduce helper function fault_in_kernel_address() to make editors happy. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index eb4d7fe05938..8e9b0f1fd872 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -775,6 +775,15 @@ static inline int access_error(unsigned long error_code, int write, return 0; } +static int fault_in_kernel_space(unsigned long address) +{ +#ifdef CONFIG_X86_32 + return address >= TASK_SIZE; +#else /* !CONFIG_X86_32 */ + return address >= TASK_SIZE64; +#endif /* CONFIG_X86_32 */ +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -817,11 +826,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 9) == 0. */ -#ifdef CONFIG_X86_32 - if (unlikely(address >= TASK_SIZE)) { -#else - if (unlikely(address >= TASK_SIZE64)) { -#endif + if (unlikely(fault_in_kernel_space(address))) { if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && vmalloc_fault(address) >= 0) return; -- cgit v1.2.2 From 3c3e5694add02e665bbbd0fecfbbdcc0b903097a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Feb 2009 11:46:36 -0500 Subject: x86: check PMD in spurious_fault handler Impact: fix to prevent hard lockup on bad PMD permissions If the PMD does not have the correct permissions for a page access, but the PTE does, the spurious fault handler will mistake the fault as a lazy TLB transaction. This will result in an infinite loop of: fault -> spurious_fault check (pass) -> return to code -> fault This patch adds a check and a warn on if the PTE passes the permissions but the PMD does not. [ Updated: Ingo Molnar suggested using WARN_ONCE with some text ] Signed-off-by: Steven Rostedt --- arch/x86/mm/fault.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c76ef1d701c9..278d645d108a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -455,6 +455,7 @@ static int spurious_fault(unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + int ret; /* Reserved-bit violation or user access to kernel space? */ if (error_code & (PF_USER | PF_RSVD)) @@ -482,7 +483,17 @@ static int spurious_fault(unsigned long address, if (!pte_present(*pte)) return 0; - return spurious_fault_check(error_code, pte); + ret = spurious_fault_check(error_code, pte); + if (!ret) + return 0; + + /* + * Make sure we have permissions in PMD + * If not, then there's a bug in the page tables. + */ + ret = spurious_fault_check(error_code, (pte_t *) pmd); + WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + return ret; } /* -- cgit v1.2.2 From 2d4a71676f4d89418a0d53e60b89e8b804b390b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 19:56:40 +0100 Subject: x86, mm: fault.c cleanup Impact: cleanup, no code changed Clean up various small details, which can be correctness checked automatically: - tidy up the include file section - eliminate unnecessary includes - introduce show_signal_msg() to clean up code flow - standardize the code flow - standardize comments and other style details - more cleanups, pointed out by checkpatch No code changed on either 32-bit nor 64-bit: arch/x86/mm/fault.o: text data bss dec hex filename 4632 32 24 4688 1250 fault.o.before 4632 32 24 4688 1250 fault.o.after the md5 changed due to a change in a single instruction: 2e8a8241e7f0d69706776a5a26c90bc0 fault.o.before.asm c5c3d36e725586eb74f0e10692f0193e fault.o.after.asm Because a __LINE__ reference in a WARN_ONCE() has changed. On 32-bit a few stack offsets changed - no code size difference nor any functionality difference. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 546 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 331 insertions(+), 215 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e4b9fc5001c6..351d679bf977 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,56 +1,59 @@ /* * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include /* For unblank_screen() */ +#include +#include #include #include -#include /* for max_low_pfn */ -#include -#include #include #include +#include +#include +#include +#include +#include +#include +#include #include +#include #include +#include +#include +#include +#include +#include +#include +#include + +#include -#include -#include -#include -#include -#include #include +#include +#include +#include #include -#include #include +#include /* - * Page fault error code bits - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch */ -#define PF_PROT (1<<0) -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) +enum x86_pf_error_code { + + PF_PROT = 1 << 0, + PF_WRITE = 1 << 1, + PF_USER = 1 << 2, + PF_RSVD = 1 << 3, + PF_INSTR = 1 << 4, +}; static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { @@ -82,23 +85,27 @@ static inline int notify_page_fault(struct pt_regs *regs) } /* - * X86_32 - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. + * Prefetch quirks: + * + * 32-bit mode: + * + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. * - * X86_64 - * Sometimes the CPU reports invalid exceptions on prefetch. - * Check that here and ignore it. + * 64-bit mode: * - * Opcode checker based on code by Richard Brunner + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner. */ -static int is_prefetch(struct pt_regs *regs, unsigned long error_code, - unsigned long addr) +static int +is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { + unsigned char *max_instr; unsigned char *instr; int scan_more = 1; int prefetch = 0; - unsigned char *max_instr; /* * If it was a exec (instruction fetch) fault on NX page, then @@ -114,9 +121,9 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code, return 0; while (scan_more && instr < max_instr) { - unsigned char opcode; unsigned char instr_hi; unsigned char instr_lo; + unsigned char opcode; if (probe_kernel_address(instr, opcode)) break; @@ -173,15 +180,17 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code, return prefetch; } -static void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) +static void +force_sig_info_fault(int si_signo, int si_code, unsigned long address, + struct task_struct *tsk) { siginfo_t info; - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); } @@ -189,6 +198,7 @@ static void force_sig_info_fault(int si_signo, int si_code, static int bad_address(void *p) { unsigned long dummy; + return probe_kernel_address((unsigned long *)p, dummy); } #endif @@ -200,13 +210,14 @@ static void dump_pagetable(unsigned long address) page = read_cr3(); page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; + #ifdef CONFIG_X86_PAE printk("*pdpt = %016Lx ", page); if ((page >> PAGE_SHIFT) < max_low_pfn && page & _PAGE_PRESENT) { page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; + & (PTRS_PER_PMD - 1)]; printk(KERN_CONT "*pde = %016Lx ", page); page &= ~_PAGE_NX; } @@ -218,14 +229,15 @@ static void dump_pagetable(unsigned long address) * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case - * it's allocated already. + * it's allocated already: */ if ((page >> PAGE_SHIFT) < max_low_pfn && (page & _PAGE_PRESENT) && !(page & _PAGE_PSE)) { + page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; + & (PTRS_PER_PTE - 1)]; printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); } @@ -239,26 +251,38 @@ static void dump_pagetable(unsigned long address) pgd = (pgd_t *)read_cr3(); pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; + if (bad_address(pgd)) + goto bad; + printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; + + if (!pgd_present(*pgd)) + goto out; pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; + if (bad_address(pud)) + goto bad; + printk("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) - goto ret; + goto out; pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; + if (bad_address(pmd)) + goto bad; + printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; + if (bad_address(pte)) + goto bad; + printk("PTE %lx", pte_val(*pte)); -ret: +out: printk("\n"); return; bad: @@ -285,7 +309,6 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) * and redundant with the set_pmd() on non-PAE. As would * set_pud. */ - pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) @@ -295,11 +318,14 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) return NULL; + if (!pmd_present(*pmd)) { set_pmd(pmd, *pmd_k); arch_flush_lazy_mmu_mode(); - } else + } else { BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + return pmd_k; } #endif @@ -312,29 +338,37 @@ KERN_ERR "******* Please consider a BIOS update.\n" KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; #endif -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. - Does nothing for X86_32 +/* + * Workaround for K8 erratum #93 & buggy BIOS. + * + * BIOS SMM functions are required to use a specific workaround + * to avoid corruption of the 64bit RIP register on C stepping K8. + * + * A lot of BIOS that didn't get tested properly miss this. + * + * The OS sees this as a page fault with the upper 32bits of RIP cleared. + * Try to work around it here. + * + * Note we only handle faults in kernel here. + * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int warned; + static int once; + if (address != regs->ip) return 0; + if ((address >> 32) != 0) return 0; + address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { + if (!once) { printk(errata93_warning); - warned = 1; + once = 1; } regs->ip = address; return 1; @@ -344,16 +378,17 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) } /* - * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal - * addresses >4GB. We catch this in the page fault handler because these - * addresses are not reachable. Just detect this case and return. Any code + * Work around K8 erratum #100 K8 in compat mode occasionally jumps + * to illegal addresses >4GB. + * + * We catch this in the page fault handler because these addresses + * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; @@ -363,8 +398,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG unsigned long nr; + /* - * Pentium F0 0F C7 C8 bug workaround. + * Pentium F0 0F C7 C8 bug workaround: */ if (boot_cpu_data.f00f_bug) { nr = (address - idt_descr.address) >> 3; @@ -378,8 +414,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } -static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static void +show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { #ifdef CONFIG_X86_32 if (!oops_may_print()) @@ -389,12 +426,14 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, #ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; + pte_t *pte = lookup_address(address, &level); - if (pte && pte_present(*pte) && !pte_exec(*pte)) + if (pte && pte_present(*pte) && !pte_exec(*pte)) { printk(KERN_CRIT "kernel tried to execute " "NX-protected page - exploit attempt? " "(uid: %d)\n", current_uid()); + } } #endif @@ -403,34 +442,45 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "NULL pointer dereference"); else printk(KERN_CONT "paging request"); + printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); printk_address(regs->ip, 1); + dump_pagetable(address); } #ifdef CONFIG_X86_64 -static noinline void pgtable_bad(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +pgtable_bad(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - unsigned long flags = oops_begin(); - int sig = SIGKILL; - struct task_struct *tsk = current; + struct task_struct *tsk; + unsigned long flags; + int sig; + + flags = oops_begin(); + tsk = current; + sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + if (__die("Bad pagetable", regs, error_code)) sig = 0; + oops_end(flags, regs, sig); } #endif -static noinline void no_context(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { struct task_struct *tsk = current; unsigned long *stackend; @@ -440,18 +490,20 @@ static noinline void no_context(struct pt_regs *regs, int sig; #endif - /* Are we prepared to handle this kernel fault? */ + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return; /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. + * 32-bit: + * + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * 64-bit: * - * X86_64 - * Hall of shame of CPU/BIOS bugs. + * Hall of shame of CPU/BIOS bugs. */ if (is_prefetch(regs, error_code, address)) return; @@ -461,7 +513,7 @@ static noinline void no_context(struct pt_regs *regs, /* * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. + * terminate things with extreme prejudice: */ #ifdef CONFIG_X86_32 bust_spinlocks(1); @@ -471,7 +523,7 @@ static noinline void no_context(struct pt_regs *regs, show_fault_oops(regs, error_code, address); - stackend = end_of_stack(tsk); + stackend = end_of_stack(tsk); if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); @@ -487,28 +539,54 @@ static noinline void no_context(struct pt_regs *regs, sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; + /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, sig); #endif } -static void __bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +/* + * Print out info about fatal segfaults, if the show_unhandled_signals + * sysctl is set: + */ +static inline void +show_signal_msg(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct task_struct *tsk) +{ + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *)regs->ip, (void *)regs->sp, error_code); + + print_vma_addr(KERN_CONT " in ", regs->ip); + + printk(KERN_CONT "\n"); +} + +static void +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ if (error_code & PF_USER) { /* - * It's possible to have interrupts off here. + * It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came - * from user space. + * from user space: */ if (is_prefetch(regs, error_code, address)) return; @@ -516,22 +594,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, if (is_errata100(regs, address)) return; - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + if (unlikely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + /* Kernel addresses are always protection faults: */ + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; } @@ -541,15 +613,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, no_context(regs, error_code, address); } -static noinline void bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); } -static void __bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +static void +__bad_area(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct mm_struct *mm = current->mm; @@ -562,67 +635,77 @@ static void __bad_area(struct pt_regs *regs, __bad_area_nosemaphore(regs, error_code, address, si_code); } -static noinline void bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area(regs, error_code, address, SEGV_MAPERR); } -static noinline void bad_area_access_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_access_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area(regs, error_code, address, SEGV_ACCERR); } /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ -static void out_of_memory(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +out_of_memory(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { /* * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). + * (which will retry the fault, or kill us if we got oom-killed): */ up_read(¤t->mm->mmap_sem); + pagefault_out_of_memory(); } -static void do_sigbus(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; up_read(&mm->mmap_sem); - /* Kernel mode? Handle exceptions or die */ + /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) no_context(regs, error_code, address); + #ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ + /* User space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; #endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -static noinline void mm_fault_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address, unsigned int fault) +static noinline void +mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, unsigned int fault) { - if (fault & VM_FAULT_OOM) + if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); - else if (fault & VM_FAULT_SIGBUS) - do_sigbus(regs, error_code, address); - else - BUG(); + } else { + if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); + } } static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; @@ -630,16 +713,19 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) } /* - * Handle a spurious fault caused by a stale TLB entry. This allows - * us to lazily refresh the TLB when increasing the permissions of a - * kernel page (RO -> RW or NX -> X). Doing it eagerly is very - * expensive since that implies doing a full cross-processor TLB - * flush, even if no stale TLB entries exist on other processors. + * Handle a spurious fault caused by a stale TLB entry. + * + * This allows us to lazily refresh the TLB when increasing the + * permissions of a kernel page (RO -> RW or NX -> X). Doing it + * eagerly is very expensive since that implies doing a full + * cross-processor TLB flush, even if no stale TLB entries exist + * on other processors. + * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static noinline int spurious_fault(unsigned long error_code, - unsigned long address) +static noinline int +spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; pud_t *pud; @@ -678,20 +764,23 @@ static noinline int spurious_fault(unsigned long error_code, return 0; /* - * Make sure we have permissions in PMD - * If not, then there's a bug in the page tables. + * Make sure we have permissions in PMD. + * If not, then there's a bug in the page tables: */ ret = spurious_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + return ret; } /* - * X86_32 - * Handle a fault on the vmalloc or module mapping area + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area * - * X86_64 - * Handle a fault on the vmalloc area + * 64-bit: + * + * Handle a fault on the vmalloc area * * This assumes no large pages in there. */ @@ -702,7 +791,7 @@ static noinline int vmalloc_fault(unsigned long address) pmd_t *pmd_k; pte_t *pte_k; - /* Make sure we are in vmalloc area */ + /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; @@ -717,9 +806,11 @@ static noinline int vmalloc_fault(unsigned long address) pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; + pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; + return 0; #else pgd_t *pgd, *pgd_ref; @@ -727,69 +818,84 @@ static noinline int vmalloc_fault(unsigned long address) pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; - /* Make sure we are in vmalloc area */ + /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ pgd = pgd_offset(current->active_mm, address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; + if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - /* Below here mismatches are bugs because these lower tables - are shared */ + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ pud = pud_offset(pgd, address); pud_ref = pud_offset(pgd_ref, address); if (pud_none(*pud_ref)) return -1; + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) BUG(); + pmd = pmd_offset(pud, address); pmd_ref = pmd_offset(pud_ref, address); if (pmd_none(*pmd_ref)) return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); if (!pte_present(*pte_ref)) return -1; + pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) BUG(); + return 0; #endif } int show_unhandled_signals = 1; -static inline int access_error(unsigned long error_code, int write, - struct vm_area_struct *vma) +static inline int +access_error(unsigned long error_code, int write, struct vm_area_struct *vma) { if (write) { - /* write, present and write, not present */ + /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; - } else if (unlikely(error_code & PF_PROT)) { - /* read, present */ - return 1; - } else { - /* read, not present */ - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) - return 1; + return 0; } + /* read, present: */ + if (unlikely(error_code & PF_PROT)) + return 1; + + /* read, not present: */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + return 0; } @@ -797,9 +903,9 @@ static int fault_in_kernel_space(unsigned long address) { #ifdef CONFIG_X86_32 return address >= TASK_SIZE; -#else /* !CONFIG_X86_32 */ +#else return address >= TASK_SIZE64; -#endif /* CONFIG_X86_32 */ +#endif } /* @@ -812,18 +918,19 @@ asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { - unsigned long address; + struct vm_area_struct *vma; struct task_struct *tsk; + unsigned long address; struct mm_struct *mm; - struct vm_area_struct *vma; int write; int fault; tsk = current; mm = tsk->mm; + prefetchw(&mm->mmap_sem); - /* get the address */ + /* Get the faulting address: */ address = read_cr2(); if (unlikely(kmmio_fault(regs, address))) @@ -847,22 +954,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) vmalloc_fault(address) >= 0) return; - /* Can handle a stale RO->RW TLB */ + /* Can handle a stale RO->RW TLB: */ if (spurious_fault(error_code, address)) return; - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (notify_page_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. + * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, error_code, address); + return; } - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (unlikely(notify_page_fault(regs))) return; /* @@ -870,13 +978,15 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * vmalloc fault has been handled. * * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. + * potential system fault or CPU buglet: */ if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; - } else if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } #ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) @@ -884,8 +994,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #endif /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. + * If we're in an interrupt, have no user context or are running + * in an atomic region then we must not take the fault: */ if (unlikely(in_atomic() || !mm)) { bad_area_nosemaphore(regs, error_code, address); @@ -894,19 +1004,19 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. + * addresses in user space. All other faults represent errors in + * the kernel and should generate an OOPS. Unfortunately, in the + * case of an erroneous fault occurring in a code path which already + * holds mmap_sem we will deadlock attempting to validate the fault + * against the address space. Luckily the kernel only validly + * references user space from well defined areas of code, which are + * listed in the exceptions table. * * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * the source reference check when there is a possibility of a + * deadlock. Attempt to lock the address space, if we cannot we then + * validate the source. If this is invalid we can skip the address + * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && @@ -917,8 +1027,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) down_read(&mm->mmap_sem); } else { /* - * The above down_read_trylock() might have succeeded in which - * case we'll have missed the might_sleep() from down_read(). + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): */ might_sleep(); } @@ -938,7 +1049,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes + * and pusha to work. ("enter $65535, $31" pushes * 32 pointers and then decrements %sp by 65535.) */ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { @@ -957,6 +1068,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) */ good_area: write = error_code & PF_WRITE; + if (unlikely(access_error(error_code, write, vma))) { bad_area_access_error(regs, error_code, address); return; @@ -965,13 +1077,15 @@ good_area: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault. + * the fault: */ fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } + if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else @@ -1004,13 +1118,13 @@ void vmalloc_sync_all(void) for (address = VMALLOC_START & PMD_MASK; address >= TASK_SIZE && address < FIXADDR_TOP; address += PMD_SIZE) { + unsigned long flags; struct page *page; spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), - address)) + if (!vmalloc_sync_one(page_address(page), address)) break; } spin_unlock_irqrestore(&pgd_lock, flags); @@ -1018,12 +1132,14 @@ void vmalloc_sync_all(void) #else /* CONFIG_X86_64 */ for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; + spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; -- cgit v1.2.2 From 107a03678cac0dd6cf7095f81442a4fa477e4700 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 20:37:05 +0100 Subject: x86, mm: fault.c, refactor/simplify the is_prefetch() code Impact: no functionality changed Factor out the opcode checker into a helper inline. The code got a tiny bit smaller: text data bss dec hex filename 4632 32 24 4688 1250 fault.o.before 4618 32 24 4674 1242 fault.o.after And it got cleaner / easier to review as well. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 100 ++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 50 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 351d679bf977..72973c7682ba 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -99,12 +99,58 @@ static inline int notify_page_fault(struct pt_regs *regs) * * Opcode checker based on code by Richard Brunner. */ +static inline int +check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, + unsigned char opcode, int *prefetch) +{ + unsigned char instr_hi = opcode & 0xf0; + unsigned char instr_lo = opcode & 0x0f; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + return ((instr_lo & 7) == 0x6); +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + return (!user_mode(regs)) || (regs->cs == __USER_CS); +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + return (instr_lo & 0xC) == 0x4; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + return !instr_lo || (instr_lo>>1) == 1; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + if (probe_kernel_address(instr, opcode)) + return 0; + + *prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + return 0; + default: + return 0; + } +} + static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { unsigned char *max_instr; unsigned char *instr; - int scan_more = 1; int prefetch = 0; /* @@ -114,68 +160,22 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) if (error_code & PF_INSTR) return 0; - instr = (unsigned char *)convert_ip_to_linear(current, regs); + instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) return 0; - while (scan_more && instr < max_instr) { - unsigned char instr_hi; - unsigned char instr_lo; + while (instr < max_instr) { unsigned char opcode; if (probe_kernel_address(instr, opcode)) break; - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; instr++; - switch (instr_hi) { - case 0x20: - case 0x30: - /* - * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. - * In X86_64 long mode, the CPU will signal invalid - * opcode if some of these prefixes are present so - * X86_64 will never get here anyway - */ - scan_more = ((instr_lo & 7) == 0x6); - break; -#ifdef CONFIG_X86_64 - case 0x40: - /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. - */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; -#endif - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; + if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; - case 0xF0: - /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } } return prefetch; } -- cgit v1.2.2 From 8c938f9fae887f6e180bf802aa1c33cf74712aff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:12:18 +0100 Subject: x86, mm: fault.c, factor out the vm86 fault check Impact: cleanup Instead of an ugly, open-coded, #ifdef-ed vm86 related legacy check in do_page_fault(), put it into the check_v8086_mode() helper function and merge it with an existing #ifdef. Also, simplify the code flow a tiny bit in the helper. No code changed: arch/x86/mm/fault.o: text data bss dec hex filename 2711 12 12 2735 aaf fault.o.before 2711 12 12 2735 aaf fault.o.after Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 72973c7682ba..7dc0615c3cfe 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -328,14 +328,41 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -#endif -#ifdef CONFIG_X86_64 +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; +} + +#else /* CONFIG_X86_64: */ + static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" KERN_ERR "******* Please consider a BIOS update.\n" KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + #endif /* @@ -1091,16 +1118,8 @@ good_area: else tsk->min_flt++; -#ifdef CONFIG_X86_32 - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (v8086_mode(regs)) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } -#endif + check_v8086_mode(regs, address, tsk); + up_read(&mm->mmap_sem); } -- cgit v1.2.2 From 121d5d0a7e5808fbcfda484efd7ba840ac93450f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:18:08 +0100 Subject: x86, mm: fault.c, enable PF_RSVD checks on 32-bit too Impact: improve page fault handling robustness The 'PF_RSVD' flag (bit 3) of the page-fault error_code is a relatively recent addition to x86 CPUs, so the 32-bit do_fault() implementation never had it. This flag gets set when the CPU detects nonzero values in any reserved bits of the page directory entries. Extend the existing 64-bit check for PF_RSVD in do_page_fault() to 32-bit too. If we detect such a fault then we print a more informative oops and the pagetables. This unifies the code some more, removes an ugly #ifdef and improves the 32-bit page fault code robustness a bit. It slightly increases the 32-bit kernel text size. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7dc0615c3cfe..3e3661462739 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -477,7 +477,6 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, dump_pagetable(address); } -#ifdef CONFIG_X86_64 static noinline void pgtable_bad(struct pt_regs *regs, unsigned long error_code, unsigned long address) @@ -503,7 +502,6 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, oops_end(flags, regs, sig); } -#endif static noinline void no_context(struct pt_regs *regs, unsigned long error_code, @@ -1015,10 +1013,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) local_irq_enable(); } -#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); -#endif /* * If we're in an interrupt, have no user context or are running -- cgit v1.2.2 From b814d41f0987c7648d7ed07471258101c95c026b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:32:10 +0100 Subject: x86, mm: fault.c, simplify kmmio_fault() Impact: cleanup Remove an #ifdef from kmmio_fault() - we can do this by providing default implementations for is_kmmio_active() and kmmio_handler(). The compiler optimizes it all away in the !CONFIG_MMIOTRACE case. Also, while at it, clean up mmiotrace.h a bit: - standard header guards - standard vertical spaces for structure definitions No code changed (both with mmiotrace on and off in the config): text data bss dec hex filename 2947 12 12 2971 b9b fault.o.before 2947 12 12 2971 b9b fault.o.after Cc: Pekka Paalanen Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3e3661462739..fe99af4b86d9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -55,13 +55,14 @@ enum x86_pf_error_code { PF_INSTR = 1 << 4, }; +/* + * (returns 0 if mmiotrace is disabled) + */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; -#endif return 0; } -- cgit v1.2.2 From b18018126f422f5b706fd750373425e10e84b486 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:42:57 +0100 Subject: x86, mm, kprobes: fault.c, simplify notify_page_fault() Impact: cleanup Remove an #ifdef from notify_page_fault(). The function still compiles to nothing in the !CONFIG_KPROBES case. Introduce kprobes_built_in() and kprobe_fault_handler() helpers to allow this - they returns 0 if !CONFIG_KPROBES. No code changed: text data bss dec hex filename 4618 32 24 4674 1242 fault.o.before 4618 32 24 4674 1242 fault.o.after Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fe99af4b86d9..379beaec6caa 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -68,11 +68,10 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) static inline int notify_page_fault(struct pt_regs *regs) { -#ifdef CONFIG_KPROBES int ret = 0; /* kprobe_running() needs smp_processor_id() */ - if (!user_mode_vm(regs)) { + if (kprobes_built_in() && !user_mode_vm(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; @@ -80,9 +79,6 @@ static inline int notify_page_fault(struct pt_regs *regs) } return ret; -#else - return 0; -#endif } /* -- cgit v1.2.2 From f2f13a8535174dbb813a0607a9d4737cfba98f6c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:50:24 +0100 Subject: x86, mm: fault.c, reorder functions Impact: cleanup Avoid a couple more #ifdefs by moving fundamentally non-unifiable functions into a single #ifdef 32-bit / #else / #endif block in fault.c: vmalloc*(), dump_pagetable(), check_vm8086_mode(). No code changed: text data bss dec hex filename 4618 32 24 4674 1242 fault.o.before 4618 32 24 4674 1242 fault.o.after Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 474 ++++++++++++++++++++++++++-------------------------- 1 file changed, 239 insertions(+), 235 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 379beaec6caa..4ce62fb80da7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -191,18 +191,124 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, force_sig_info(si_signo, &info, tsk); } -#ifdef CONFIG_X86_64 -static int bad_address(void *p) +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { - unsigned long dummy; + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; - return probe_kernel_address((unsigned long *)p, dummy); + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + + if (!pmd_present(*pmd)) { + set_pmd(pmd, *pmd_k); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + + return pmd_k; +} + +void vmalloc_sync_all(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { + + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), address)) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area + */ +static noinline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} + +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; } -#endif static void dump_pagetable(unsigned long address) { -#ifdef CONFIG_X86_32 __typeof__(pte_val(__pte(0))) page; page = read_cr3(); @@ -239,7 +345,132 @@ static void dump_pagetable(unsigned long address) } printk("\n"); -#else /* CONFIG_X86_64 */ +} + +#else /* CONFIG_X86_64: */ + +void vmalloc_sync_all(void) +{ + unsigned long address; + + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { + + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 64-bit: + * + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ + pgd = pgd_offset(current->active_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + + pte = pte_offset_kernel(pmd, address); + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + + return 0; +} + +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)p, dummy); +} + +static void dump_pagetable(unsigned long address) +{ pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -284,83 +515,9 @@ out: return; bad: printk("BAD\n"); -#endif -} - -#ifdef CONFIG_X86_32 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - - if (!pmd_present(*pmd)) { - set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - } - - return pmd_k; -} - -/* - * Did it hit the DOS screen memory VA from vm86 mode? - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ - unsigned long bit; - - if (!v8086_mode(regs)) - return; - - bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; -} - -#else /* CONFIG_X86_64: */ - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* - * No vm86 mode in 64-bit mode: - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ } -#endif +#endif /* CONFIG_X86_64 */ /* * Workaround for K8 erratum #93 & buggy BIOS. @@ -795,109 +952,6 @@ spurious_fault(unsigned long error_code, unsigned long address) return ret; } -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - * - * 64-bit: - * - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static noinline int vmalloc_fault(unsigned long address) -{ -#ifdef CONFIG_X86_32 - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -#else - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = pgd_offset(current->active_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* - * Below here mismatches are bugs because these lower tables - * are shared: - */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - - pte = pte_offset_kernel(pmd, address); - - /* - * Don't use pte_page here, because the mappings can point - * outside mem_map, and the NUMA hash lookup cannot handle - * that: - */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - - return 0; -#endif -} - int show_unhandled_signals = 1; static inline int @@ -1115,53 +1169,3 @@ good_area: up_read(&mm->mmap_sem); } - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - unsigned long address; - -#ifdef CONFIG_X86_32 - if (SHARED_KERNEL_PMD) - return; - - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE && address < FIXADDR_TOP; - address += PMD_SIZE) { - - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#else /* CONFIG_X86_64 */ - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#endif -} -- cgit v1.2.2 From 8f7661496cece8320137d5e26808825498fd2b26 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:00:29 +0100 Subject: x86, mm: fault.c, unify oops printing Impact: refine/extend page fault related oops printing on 64-bit - honor the pause_on_oops logic on 64-bit too - print out NX fault warnings on 64-bit as well - factor out the NX fault message to make it git-greppable and readable Note that this means that we do the PF_INSTR check on 32-bit non-PAE as well where it should not occur ... normally. Cannot hurt. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4ce62fb80da7..ebfaca3bbb12 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -595,28 +595,24 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } +static const char nx_warning[] = KERN_CRIT +"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; + static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { -#ifdef CONFIG_X86_32 if (!oops_may_print()) return; -#endif -#ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; pte_t *pte = lookup_address(address, &level); - if (pte && pte_present(*pte) && !pte_exec(*pte)) { - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current_uid()); - } + if (pte && pte_present(*pte) && !pte_exec(*pte)) + printk(nx_warning, current_uid()); } -#endif printk(KERN_ALERT "BUG: unable to handle kernel "); if (address < PAGE_SIZE) -- cgit v1.2.2 From 1cc99544dde9e48602979f16b9309fade6e93051 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:07:48 +0100 Subject: x86, mm: fault.c, unify oops handling Impact: add oops-recursion check to 32-bit Unify the oops state-machine, to the 64-bit version. It is slightly more careful in that it does a recursion check in oops_begin(), and is thus more likely to show the relevant oops. It also means that 32-bit will print one more line at the end of pagefault triggered oopses: printk(KERN_EMERG "CR2: %016lx\n", address); Which is generally good information to be seen in partial-dump digital-camera jpegs ;-) The downside is the somewhat more complex critical path. Both variants have been tested well meanwhile by kernel developers crashing their boxes so i dont think this is a practical worry. This removes 3 ugly #ifdefs from no_context() and makes the function a lot nicer read. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ebfaca3bbb12..8fe2dd254df0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -659,11 +659,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, { struct task_struct *tsk = current; unsigned long *stackend; - -#ifdef CONFIG_X86_64 unsigned long flags; int sig; -#endif /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) @@ -690,11 +687,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else flags = oops_begin(); -#endif show_fault_oops(regs, error_code, address); @@ -702,15 +695,10 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; @@ -719,7 +707,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, printk(KERN_EMERG "CR2: %016lx\n", address); oops_end(flags, regs, sig); -#endif } /* -- cgit v1.2.2 From c3731c68668325abddee8665018c74c7156a57be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:22:34 +0100 Subject: x86, mm: fault.c, remove #ifdef from do_page_fault() Impact: cleanup do_page_fault() has this ugly #ifdef in its prototype: #ifdef CONFIG_X86_64 asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) Replace it with 'dotraplinkage' which maps to exactly the above construct: nothing on 32-bit and asmlinkage on 64-bit. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8fe2dd254df0..9c2dc5d79531 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -972,10 +972,8 @@ static int fault_in_kernel_space(unsigned long address) * and the problem, and then passes it off to one of the appropriate * routines. */ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct vm_area_struct *vma; struct task_struct *tsk; -- cgit v1.2.2 From d951734654f76a370a89b4e531af9b765bd13541 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:32:28 +0100 Subject: x86, mm: rename TASK_SIZE64 => TASK_SIZE_MAX Impact: cleanup Rename TASK_SIZE64 to TASK_SIZE_MAX, and provide the define on 32-bit too. (mapped to TASK_SIZE) This allows 32-bit code to make use of the (former-) TASK_SIZE64 symbol as well, in a clean way. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9c2dc5d79531..6fa9f175cba3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -963,7 +963,7 @@ static int fault_in_kernel_space(unsigned long address) #ifdef CONFIG_X86_32 return address >= TASK_SIZE; #else - return address >= TASK_SIZE64; + return address >= TASK_SIZE_MAX; #endif } -- cgit v1.2.2 From 7c178a26d3e753d2a4346d3e4b8aa549d387f698 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:27:29 +0100 Subject: x86, mm: fault.c, remove #ifdef from fault_in_kernel_space() Impact: cleanup Removal of an #ifdef in fault_in_kernel_space(), by making use of the new TASK_SIZE_MAX symbol which is now available on 32-bit too. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6fa9f175cba3..f195691ec26e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -960,11 +960,7 @@ access_error(unsigned long error_code, int write, struct vm_area_struct *vma) static int fault_in_kernel_space(unsigned long address) { -#ifdef CONFIG_X86_32 - return address >= TASK_SIZE; -#else return address >= TASK_SIZE_MAX; -#endif } /* -- cgit v1.2.2 From cd1b68f08f6328737928e5b8ba8ef80394680ff0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:39:02 +0100 Subject: x86, mm: fault.c, give another attempt at prefetch handing before SIGBUS Impact: extend prefetch handling on 64-bit Currently there's an extra is_prefetch() check done in do_sigbus(), which we only do on 32 bits. This is a last-ditch check before we terminate a task, so it's worth giving prefetch instructions another chance - should none of our existing quirks have caught a prefetch instruction related spurious fault. The only risk is if a prefetch causes a real sigbus, in that case we'll not OOM but try another fault. But this code has been on 32-bit for a long time, so it should be fine in practice. So do this on 64-bit too - and thus remove one more #ifdef. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f195691ec26e..413e835e4a80 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -836,11 +836,9 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) if (!(error_code & PF_USER)) no_context(regs, error_code, address); -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault: */ + /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; -#endif tsk->thread.cr2 = address; tsk->thread.error_code = error_code; -- cgit v1.2.2 From f8eeb2e6be367d79be3617f0a12646bced8b2fe6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:13:36 +0100 Subject: x86, mm: fault.c, update copyrights Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 413e835e4a80..9bb07d331c3b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,6 +1,7 @@ /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include #include -- cgit v1.2.2 From b319eed0aa0a6d710887350a3cb734c572aa64c4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 10:24:18 +0100 Subject: x86, mm: fault.c, simplify kmmio_fault(), cleanup Clarify the kmmio_fault() comment. Acked-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm/fault.c') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9bb07d331c3b..a03b7279efa0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -57,7 +57,8 @@ enum x86_pf_error_code { }; /* - * (returns 0 if mmiotrace is disabled) + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -- cgit v1.2.2