1 files changed, 636 insertions, 0 deletions
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
new file mode 100644
index 000000000000..54816adb8e93
--- /dev/null
+++ b/arch/x86/mm/fault_64.c
@@ -0,0 +1,636 @@
+/*
+ *  linux/arch/x86-64/mm/fault.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+ */
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>              /* For unblank_screen() */
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <asm/system.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm-generic/sections.h>
+/*
+ * Page fault error code bits, as they are interpreted by the handlers
+ * in this file.
+ */
+#define PF_PROT (1<<0)          /* set: page was present (protection fault); clear: no page found */
+#define PF_WRITE        (1<<1)  /* set: write access; clear: read access */
+#define PF_USER (1<<2)  /* set: access is treated as coming from user mode */
+#define PF_RSVD (1<<3)  /* set: handled as a corrupted page table (see pgtable_bad()) */
+#define PF_INSTR        (1<<4)  /* set: fault was an instruction fetch ("exec fault") */
+/* Callbacks run with DIE_PAGE_FAULT by notify_page_fault() on every fault. */
+static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+/*
+ * Hook to register for page fault notifications.
+ *
+ * vmalloc_sync_all() is run before @nb is linked into the chain.
+ * NOTE(review): presumably this keeps a notifier that lives in vmalloc
+ * space from taking a vmalloc fault of its own -- confirm.
+ *
+ * Returns whatever atomic_notifier_chain_register() returns.
+ */
+int register_page_fault_notifier(struct notifier_block *nb)
+{
+        int status;
+        vmalloc_sync_all();
+        status = atomic_notifier_chain_register(&notify_page_fault_chain, nb);
+        return status;
+}
+EXPORT_SYMBOL_GPL(register_page_fault_notifier);
+/*
+ * Remove @nb from the page fault notifier chain.
+ *
+ * Returns whatever atomic_notifier_chain_unregister() returns.
+ */
+int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+        int status;
+        status = atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
+        return status;
+}
+EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
+/*
+ * Run the page fault notifier chain with a DIE_PAGE_FAULT event.
+ *
+ * @regs: register state of the faulting context
+ * @err:  page fault error code
+ *
+ * Returns the notifier chain result; do_page_fault() stops handling the
+ * fault when this is NOTIFY_STOP.
+ */
+static inline int notify_page_fault(struct pt_regs *regs, long err)
+{
+        struct die_args fault_args = {
+                .regs   = regs,
+                .str    = "page fault",
+                .err    = err,
+                .trapnr = 14,           /* page fault vector */
+                .signr  = SIGSEGV
+        };
+        int verdict;
+        verdict = atomic_notifier_call_chain(&notify_page_fault_chain,
+                                             DIE_PAGE_FAULT, &fault_args);
+        return verdict;
+}
+/*
+ * Sometimes the CPU reports invalid exceptions on prefetch.
+ * Check that here and ignore.
+ * Opcode checker based on code by Richard Brunner.
+ *
+ * Decodes at most 15 bytes starting at the faulting instruction: prefix
+ * bytes are skipped, and the first non-prefix byte decides the result.
+ *
+ * Returns 1 when the instruction is a prefetch (0x0F 0x0D or 0x0F 0x18),
+ * 0 otherwise, including when the instruction bytes cannot be read.
+ */
+static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
+                                unsigned long error_code)
+{
+        unsigned char *ip;
+        unsigned char *limit;
+        /* An instruction fetch fault can never be a prefetch. */
+        if (error_code & PF_INSTR)
+                return 0;
+        ip = (unsigned char __user *)convert_rip_to_linear(current, regs);
+        limit = ip + 15;
+        if (user_mode(regs) && ip >= (unsigned char *)TASK_SIZE)
+                return 0;
+        while (ip < limit) {
+                unsigned char byte;
+                unsigned char next;
+                unsigned char low;
+                int is_prefix;
+                if (probe_kernel_address(ip, byte))
+                        return 0;
+                low = byte & 0x0f;
+                ip++;
+                switch (byte & 0xf0) {
+                case 0x20:
+                case 0x30:
+                        /*
+                         * Values 0x26,0x2E,0x36,0x3E are valid x86
+                         * prefixes.  In long mode, the CPU will signal
+                         * invalid opcode if some of these prefixes are
+                         * present so we will never get here anyway.
+                         */
+                        is_prefix = ((low & 7) == 0x6);
+                        break;
+                case 0x40:
+                        /*
+                         * In AMD64 long mode, 0x40 to 0x4F are valid REX
+                         * prefixes.  We would need to know the mode the
+                         * instruction was issued in; checking the LDT for
+                         * lm would do it, but for now it is good enough to
+                         * assume that long mode only uses well known
+                         * segments or the kernel.
+                         */
+                        is_prefix = (!user_mode(regs)) || (regs->cs == __USER_CS);
+                        break;
+                case 0x60:
+                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
+                        is_prefix = ((low & 0xC) == 0x4);
+                        break;
+                case 0xF0:
+                        /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
+                        is_prefix = !low || (low >> 1) == 1;
+                        break;
+                case 0x00:
+                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
+                        if (probe_kernel_address(ip, next))
+                                return 0;
+                        return (low == 0xF) &&
+                                (next == 0x0D || next == 0x18);
+                default:
+                        return 0;
+                }
+                /* First byte that is not a prefix ends the scan. */
+                if (!is_prefix)
+                        return 0;
+        }
+        return 0;
+}
+/*
+ * Probe whether one unsigned long can be read at @p.
+ * Returns the probe_kernel_address() result: non-zero means unreadable.
+ */
+static int bad_address(void *p)
+{
+        unsigned long *slot = p;
+        unsigned long scratch;
+        return probe_kernel_address(slot, scratch);
+}
+/*
+ * Print the page table entries that map @address, walking from the page
+ * table root read from CR3 down to the PTE.
+ *
+ * Each entry pointer is probed before it is dereferenced; an unreadable
+ * pointer ends the line with "BAD".  The walk stops at the first entry
+ * that is not present.
+ */
+void dump_pagetable(unsigned long address)
+{
+        unsigned long root = read_cr3();
+        pgd_t *pgd_entry;
+        pud_t *pud_entry;
+        pmd_t *pmd_entry;
+        pte_t *pte_entry;
+        pgd_entry = __va(root & PHYSICAL_PAGE_MASK);
+        pgd_entry += pgd_index(address);
+        if (bad_address(pgd_entry))
+                goto unreadable;
+        printk("PGD %lx ", pgd_val(*pgd_entry));
+        if (!pgd_present(*pgd_entry))
+                goto finish;
+        pud_entry = pud_offset(pgd_entry, address);
+        if (bad_address(pud_entry))
+                goto unreadable;
+        printk("PUD %lx ", pud_val(*pud_entry));
+        if (!pud_present(*pud_entry))
+                goto finish;
+        pmd_entry = pmd_offset(pud_entry, address);
+        if (bad_address(pmd_entry))
+                goto unreadable;
+        printk("PMD %lx ", pmd_val(*pmd_entry));
+        if (!pmd_present(*pmd_entry))
+                goto finish;
+        pte_entry = pte_offset_kernel(pmd_entry, address);
+        if (bad_address(pte_entry))
+                goto unreadable;
+        printk("PTE %lx", pte_val(*pte_entry));
+finish:
+        printk("\n");
+        return;
+unreadable:
+        printk("BAD\n");
+}
+/*
+ * Warning text printed by is_errata93() the first time the workaround
+ * triggers.  Every line carries its own KERN_ERR prefix.
+ */
+static const char errata93_warning[] = 
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+/*
+ * Workaround for K8 erratum #93 & buggy BIOS.
+ *
+ * BIOS SMM functions are required to use a specific workaround to avoid
+ * corruption of the 64bit RIP register on C stepping K8.  A lot of BIOS
+ * that didn't get tested properly miss this.  The OS sees this as a page
+ * fault with the upper 32bits of RIP cleared.  Try to work around it here.
+ * Note we only handle faults in kernel here.
+ *
+ * When the fault address equals RIP, has a zero upper half, and lands in
+ * kernel text or the module area once the upper 32 bits are set again,
+ * regs->rip is repaired and 1 is returned.  Otherwise returns 0 and
+ * leaves @regs untouched.
+ */
+static int is_errata93(struct pt_regs *regs, unsigned long address) 
+{
+        static int warned;
+        unsigned long repaired;
+        int in_kernel_text;
+        int in_modules;
+        if (address != regs->rip || (address >> 32) != 0)
+                return 0;
+        repaired = address | (0xffffffffUL << 32);
+        in_kernel_text = (repaired >= (u64)_stext && repaired <= (u64)_etext);
+        in_modules = (repaired >= MODULES_VADDR && repaired <= MODULES_END);
+        if (!in_kernel_text && !in_modules)
+                return 0;
+        /* Complain only once. */
+        if (!warned) {
+                printk(errata93_warning);
+                warned = 1;
+        }
+        regs->rip = repaired;
+        return 1;
+} 
+/*
+ * Report a corrupted page table and kill the current task.
+ *
+ * Prints the offending address and the page table entries for it,
+ * records the fault details in the task's thread state, reports through
+ * __die() and finally exits the task with SIGKILL.
+ */
+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+                                 unsigned long error_code)
+{
+        struct task_struct *tsk = current;
+        unsigned long flags;
+        flags = oops_begin();
+        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+               tsk->comm, address);
+        dump_pagetable(address);
+        /* Record the fault details in the task before dying. */
+        tsk->thread.error_code = error_code;
+        tsk->thread.trap_no = 14;
+        tsk->thread.cr2 = address;
+        __die("Bad pagetable", regs, error_code);
+        oops_end(flags);
+        do_exit(SIGKILL);
+}
+/*
+ * Handle a fault on the vmalloc area
+ *
+ * This assumes no large pages in there.
+ *
+ * The top-level entry for @address is copied from the reference page
+ * table (init_mm) when it is missing in the current one; all lower
+ * levels must already match.
+ *
+ * Returns 0 when the mapping is valid in the reference table, -1 when
+ * the reference table has no mapping for @address at some level.
+ */
+static int vmalloc_fault(unsigned long address)
+{
+        struct mm_struct *mm = current->mm;
+        pgd_t *pgd, *ref_pgd;
+        pud_t *pud, *ref_pud;
+        pmd_t *pmd, *ref_pmd;
+        pte_t *pte, *ref_pte;
+        /* Tasks without an mm of their own use the reference table. */
+        if (!mm)
+                mm = &init_mm;
+        /*
+         * Copy kernel mappings over when needed.  A fault can also be
+         * caused by a race with a page table update, in which case
+         * everything already matches and nothing is copied.
+         */
+        pgd = pgd_offset(mm, address);
+        ref_pgd = pgd_offset_k(address);
+        if (pgd_none(*ref_pgd))
+                return -1;
+        if (!pgd_none(*pgd)) {
+                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*ref_pgd));
+        } else {
+                set_pgd(pgd, *ref_pgd);
+        }
+        /*
+         * Below here mismatches are bugs because these lower tables
+         * are shared.
+         */
+        pud = pud_offset(pgd, address);
+        ref_pud = pud_offset(ref_pgd, address);
+        if (pud_none(*ref_pud))
+                return -1;
+        BUG_ON(pud_none(*pud) ||
+               pud_page_vaddr(*pud) != pud_page_vaddr(*ref_pud));
+        pmd = pmd_offset(pud, address);
+        ref_pmd = pmd_offset(ref_pud, address);
+        if (pmd_none(*ref_pmd))
+                return -1;
+        BUG_ON(pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*ref_pmd));
+        ref_pte = pte_offset_kernel(ref_pmd, address);
+        if (!pte_present(*ref_pte))
+                return -1;
+        pte = pte_offset_kernel(pmd, address);
+        /*
+         * Don't use pte_page here, because the mappings can point
+         * outside mem_map, and the NUMA hash lookup cannot handle
+         * that.
+         */
+        BUG_ON(!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*ref_pte));
+        return 0;
+}
+/* Non-zero: do_page_fault() prints a trace line; set by "pagefaulttrace". */
+static int page_fault_trace;
+/* Non-zero: do_page_fault() logs segfaults that the task does not handle. */
+int show_unhandled_signals = 1;
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ *
+ * @regs:       register state saved when the fault was taken
+ * @error_code: page fault error code (PF_* bits)
+ *
+ * Possible outcomes:
+ *  - kernel vmalloc-area fault fixed up from the reference page table;
+ *  - a page fault notifier claims the fault (NOTIFY_STOP);
+ *  - the fault is resolved by handle_mm_fault(), possibly after growing
+ *    the stack;
+ *  - user mode gets SIGSEGV or SIGBUS;
+ *  - a kernel fault is redirected through the exception tables, or
+ *    ignored as a spurious prefetch / K8 erratum fault;
+ *  - otherwise the kernel oopses and the task is killed.
+ */
+asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
+                                        unsigned long error_code)
+{
+        struct task_struct *tsk;
+        struct mm_struct *mm;
+        struct vm_area_struct * vma;
+        unsigned long address;
+        const struct exception_table_entry *fixup;
+        int write, fault;
+        unsigned long flags;
+        siginfo_t info;
+        tsk = current;
+        mm = tsk->mm;
+        prefetchw(&mm->mmap_sem);
+        /* get the address */
+        address = read_cr2();
+        /* Default signal code; switched to SEGV_ACCERR once a vma is found. */
+        info.si_code = SEGV_MAPERR;
+        /*
+         * We fault-in kernel-space virtual memory on-demand. The
+         * 'reference' page table is init_mm.pgd.
+         *
+         * NOTE! We MUST NOT take any locks for this case. We may
+         * be in an interrupt or a critical region, and should
+         * only copy the information from the master page table,
+         * nothing more.
+         *
+         * This verifies that the fault happens in kernel space
+         * (PF_USER clear), and that the fault was not a protection
+         * or reserved-bit error (PF_PROT and PF_RSVD clear).
+         */
+        if (unlikely(address >= TASK_SIZE64)) {
+                /*
+                 * Don't check for the module range here: its PML4
+                 * is always initialized because it's shared with the main
+                 * kernel text. Only vmalloc may need PML4 syncups.
+                 */
+                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+                      ((address >= VMALLOC_START && address < VMALLOC_END))) {
+                        if (vmalloc_fault(address) >= 0)
+                                return;
+                }
+                if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+                        return;
+                /*
+                 * Don't take the mm semaphore here. If we fixup a prefetch
+                 * fault we could otherwise deadlock.
+                 */
+                goto bad_area_nosemaphore;
+        }
+        if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+                return;
+        /* Re-enable interrupts only if the faulting context had them on. */
+        if (likely(regs->eflags & X86_EFLAGS_IF))
+                local_irq_enable();
+        if (unlikely(page_fault_trace))
+                printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
+                       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
+        if (unlikely(error_code & PF_RSVD))
+                pgtable_bad(address, regs, error_code);
+        /*
+         * If we're in an interrupt or have no user
+         * context, we must not take the fault..
+         */
+        if (unlikely(in_atomic() || !mm))
+                goto bad_area_nosemaphore;
+        /*
+         * User-mode registers count as a user access even for any
+         * potential system fault or CPU buglet.
+         */
+        if (user_mode_vm(regs))
+                error_code |= PF_USER;
+ again:
+        /* When running in the kernel we expect faults to occur only to
+         * addresses in user space.  All other faults represent errors in the
+         * kernel and should generate an OOPS.  Unfortunately, in the case of an
+         * erroneous fault occurring in a code path which already holds mmap_sem
+         * we will deadlock attempting to validate the fault against the
+         * address space.  Luckily the kernel only validly references user
+         * space from well defined areas of code, which are listed in the
+         * exceptions table.
+         *
+         * As the vast majority of faults will be valid we will only perform
+         * the source reference check when there is a possibility of a deadlock.
+         * Attempt to lock the address space, if we cannot we then validate the
+         * source.  If this is invalid we can skip the address space check,
+         * thus avoiding the deadlock.
+         */
+        if (!down_read_trylock(&mm->mmap_sem)) {
+                if ((error_code & PF_USER) == 0 &&
+                    !search_exception_tables(regs->rip))
+                        goto bad_area_nosemaphore;
+                down_read(&mm->mmap_sem);
+        }
+        vma = find_vma(mm, address);
+        if (!vma)
+                goto bad_area;
+        if (likely(vma->vm_start <= address))
+                goto good_area;
+        /* Address is below the vma: only acceptable for a growable stack. */
+        if (!(vma->vm_flags & VM_GROWSDOWN))
+                goto bad_area;
+        if (error_code & PF_USER) {
+                /* Allow userspace just enough access below the stack pointer
+                 * to let the 'enter' instruction work.
+                 */
+                if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
+                        goto bad_area;
+        }
+        if (expand_stack(vma, address))
+                goto bad_area;
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+        info.si_code = SEGV_ACCERR;
+        write = 0;
+        switch (error_code & (PF_PROT|PF_WRITE)) {
+                default:        /* 3: write, present */
+                        /* fall through */
+                case PF_WRITE:          /* write, not present */
+                        if (!(vma->vm_flags & VM_WRITE))
+                                goto bad_area;
+                        write++;
+                        break;
+                case PF_PROT:           /* read, present */
+                        goto bad_area;
+                case 0:                 /* read, not present */
+                        if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+                                goto bad_area;
+        }
+        /*
+         * If for any reason at all we couldn't handle the fault,
+         * make sure we exit gracefully rather than endlessly redo
+         * the fault.
+         */
+        fault = handle_mm_fault(mm, vma, address, write);
+        if (unlikely(fault & VM_FAULT_ERROR)) {
+                if (fault & VM_FAULT_OOM)
+                        goto out_of_memory;
+                else if (fault & VM_FAULT_SIGBUS)
+                        goto do_sigbus;
+                BUG();
+        }
+        if (fault & VM_FAULT_MAJOR)
+                tsk->maj_flt++;
+        else
+                tsk->min_flt++;
+        up_read(&mm->mmap_sem);
+        return;
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+        up_read(&mm->mmap_sem);
+bad_area_nosemaphore:
+        /* User mode accesses just cause a SIGSEGV */
+        if (error_code & PF_USER) {
+                /*
+                 * It's possible to have interrupts off here.
+                 */
+                local_irq_enable();
+                if (is_prefetch(regs, address, error_code))
+                        return;
+                /* Work around K8 erratum #100 K8 in compat mode
+                   occasionally jumps to illegal addresses >4GB.  We
+                   catch this here in the page fault handler because
+                   these addresses are not reachable. Just detect this
+                   case and return.  Any code segment in LDT is
+                   compatibility mode. */
+                if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
+                    (address >> 32))
+                        return;
+                if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+                    printk_ratelimit()) {
+                        printk(
+                       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
+                                        tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
+                                        tsk->comm, tsk->pid, address, regs->rip,
+                                        regs->rsp, error_code);
+                }
+       
+                tsk->thread.cr2 = address;
+                /* Kernel addresses are always protection faults */
+                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+                tsk->thread.trap_no = 14;
+                info.si_signo = SIGSEGV;
+                info.si_errno = 0;
+                /* info.si_code has been set above */
+                info.si_addr = (void __user *)address;
+                force_sig_info(SIGSEGV, &info, tsk);
+                return;
+        }
+no_context:
+        
+        /* Are we prepared to handle this kernel fault?  */
+        fixup = search_exception_tables(regs->rip);
+        if (fixup) {
+                regs->rip = fixup->fixup;
+                return;
+        }
+        /* 
+         * Hall of shame of CPU/BIOS bugs.
+         */
+        if (is_prefetch(regs, address, error_code))
+                return;
+        if (is_errata93(regs, address))
+                return; 
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+        flags = oops_begin();
+        if (address < PAGE_SIZE)
+                printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
+        else
+                printk(KERN_ALERT "Unable to handle kernel paging request");
+        printk(" at %016lx RIP: \n" KERN_ALERT,address);
+        printk_address(regs->rip);
+        dump_pagetable(address);
+        tsk->thread.cr2 = address;
+        tsk->thread.trap_no = 14;
+        tsk->thread.error_code = error_code;
+        __die("Oops", regs, error_code);
+        /* Executive summary in case the body of the oops scrolled away */
+        printk(KERN_EMERG "CR2: %016lx\n", address);
+        oops_end(flags);
+        do_exit(SIGKILL);
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+        up_read(&mm->mmap_sem);
+        /* init is never killed: give up the CPU and retry the fault. */
+        if (is_init(current)) {
+                yield();
+                goto again;
+        }
+        printk("VM: killing process %s\n", tsk->comm);
+        if (error_code & PF_USER)
+                do_group_exit(SIGKILL);
+        goto no_context;
+do_sigbus:
+        up_read(&mm->mmap_sem);
+        /* Kernel mode? Handle exceptions or die */
+        if (!(error_code & PF_USER))
+                goto no_context;
+        tsk->thread.cr2 = address;
+        tsk->thread.error_code = error_code;
+        tsk->thread.trap_no = 14;
+        info.si_signo = SIGBUS;
+        info.si_errno = 0;
+        info.si_code = BUS_ADRERR;
+        info.si_addr = (void __user *)address;
+        force_sig_info(SIGBUS, &info, tsk);
+        return;
+}
+/* Held by vmalloc_sync_all() while it walks pgd_list. */
+DEFINE_SPINLOCK(pgd_lock);
+/* Pages holding page-table roots, linked through page->lru. */
+LIST_HEAD(pgd_list);
+/*
+ * Copy the reference page table's top-level entries for the vmalloc
+ * area into every page table on pgd_list.
+ *
+ * Entries that were already propagated are remembered in a static
+ * bitmap and skipped on later calls.  An entry that is populated in a
+ * listed page table but differs from the reference one is a bug.
+ */
+void vmalloc_sync_all(void)
+{
+        /*
+         * Note that races in the updates of insync and start aren't
+         * problematic: insync can only get set bits added, and updates
+         * to start are only improving performance (without affecting
+         * correctness if undone).
+         */
+        static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+        static unsigned long start = VMALLOC_START & PGDIR_MASK;
+        unsigned long addr;
+        for (addr = start; addr <= VMALLOC_END; addr += PGDIR_SIZE) {
+                if (!test_bit(pgd_index(addr), insync)) {
+                        const pgd_t *ref = pgd_offset_k(addr);
+                        struct page *pg;
+                        /* Nothing to copy yet; look again on a later call. */
+                        if (pgd_none(*ref))
+                                continue;
+                        spin_lock(&pgd_lock);
+                        list_for_each_entry(pg, &pgd_list, lru) {
+                                pgd_t *slot = (pgd_t *)page_address(pg);
+                                slot += pgd_index(addr);
+                                if (!pgd_none(*slot)) {
+                                        BUG_ON(pgd_page_vaddr(*slot) != pgd_page_vaddr(*ref));
+                                        continue;
+                                }
+                                set_pgd(slot, *ref);
+                        }
+                        spin_unlock(&pgd_lock);
+                        set_bit(pgd_index(addr), insync);
+                }
+                /* Leading in-sync entries need not be revisited. */
+                if (addr == start)
+                        start = addr + PGDIR_SIZE;
+        }
+        /* Check that there is no need to do the same for the modules area. */
+        BUILD_BUG_ON(MODULES_VADDR <= __START_KERNEL);
+        BUILD_BUG_ON(((MODULES_END - 1) & PGDIR_MASK) !=
+                                (__START_KERNEL & PGDIR_MASK));
+}
+/*
+ * Handler for the "pagefaulttrace" boot parameter: turns on the
+ * per-fault trace line printed by do_page_fault().
+ * @str is not used.  Always returns 1.
+ */
+static int __init enable_pagefaulttrace(char *str)
+{
+        page_fault_trace = 1;
+        return 1;
+}
+__setup("pagefaulttrace", enable_pagefaulttrace);

diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c new file mode 100644 index 000000000000..54816adb8e93 --- /dev/null +++ b/arch/x86/mm/fault_64.c
@@ -0,0 +1,636 @@
	1	/*
	2	* linux/arch/x86-64/mm/fault.c
	3	*
	4	* Copyright (C) 1995 Linus Torvalds
	5	* Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
	6	*/
	7
	8	#include <linux/signal.h>
	9	#include <linux/sched.h>
	10	#include <linux/kernel.h>
	11	#include <linux/errno.h>
	12	#include <linux/string.h>
	13	#include <linux/types.h>
	14	#include <linux/ptrace.h>
	15	#include <linux/mman.h>
	16	#include <linux/mm.h>
	17	#include <linux/smp.h>
	18	#include <linux/interrupt.h>
	19	#include <linux/init.h>
	20	#include <linux/tty.h>
	21	#include <linux/vt_kern.h> /* For unblank_screen() */
	22	#include <linux/compiler.h>
	23	#include <linux/vmalloc.h>
	24	#include <linux/module.h>
	25	#include <linux/kprobes.h>
	26	#include <linux/uaccess.h>
	27	#include <linux/kdebug.h>
	28
	29	#include <asm/system.h>
	30	#include <asm/pgalloc.h>
	31	#include <asm/smp.h>
	32	#include <asm/tlbflush.h>
	33	#include <asm/proto.h>
	34	#include <asm-generic/sections.h>
	35
	36	/* Page fault error code bits */
	37	#define PF_PROT (1<<0) /* or no page found */
	38	#define PF_WRITE (1<<1)
	39	#define PF_USER (1<<2)
	40	#define PF_RSVD (1<<3)
	41	#define PF_INSTR (1<<4)
	42
	43	static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
	44
	45	/* Hook to register for page fault notifications */
	46	int register_page_fault_notifier(struct notifier_block *nb)
	47	{
	48	vmalloc_sync_all();
	49	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
	50	}
	51	EXPORT_SYMBOL_GPL(register_page_fault_notifier);
	52
	53	int unregister_page_fault_notifier(struct notifier_block *nb)
	54	{
	55	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
	56	}
	57	EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
	58
	59	static inline int notify_page_fault(struct pt_regs *regs, long err)
	60	{
	61	struct die_args args = {
	62	.regs = regs,
	63	.str = "page fault",
	64	.err = err,
	65	.trapnr = 14,
	66	.signr = SIGSEGV
	67	};
	68	return atomic_notifier_call_chain(&notify_page_fault_chain,
	69	DIE_PAGE_FAULT, &args);
	70	}
	71
	72	/* Sometimes the CPU reports invalid exceptions on prefetch.
	73	Check that here and ignore.
	74	Opcode checker based on code by Richard Brunner */
	75	static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
	76	unsigned long error_code)
	77	{
	78	unsigned char *instr;
	79	int scan_more = 1;
	80	int prefetch = 0;
	81	unsigned char *max_instr;
	82
	83	/* If it was a exec fault ignore */
	84	if (error_code & PF_INSTR)
	85	return 0;
	86
	87	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
	88	max_instr = instr + 15;
	89
	90	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
	91	return 0;
	92
	93	while (scan_more && instr < max_instr) {
	94	unsigned char opcode;
	95	unsigned char instr_hi;
	96	unsigned char instr_lo;
	97
	98	if (probe_kernel_address(instr, opcode))
	99	break;
	100
	101	instr_hi = opcode & 0xf0;
	102	instr_lo = opcode & 0x0f;
	103	instr++;
	104
	105	switch (instr_hi) {
	106	case 0x20:
	107	case 0x30:
	108	/* Values 0x26,0x2E,0x36,0x3E are valid x86
	109	prefixes. In long mode, the CPU will signal
	110	invalid opcode if some of these prefixes are
	111	present so we will never get here anyway */
	112	scan_more = ((instr_lo & 7) == 0x6);
	113	break;
	114
	115	case 0x40:
	116	/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
	117	Need to figure out under what instruction mode the
	118	instruction was issued ... */
	119	/* Could check the LDT for lm, but for now it's good
	120	enough to assume that long mode only uses well known
	121	segments or kernel. */
	122	scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
	123	break;
	124
	125	case 0x60:
	126	/* 0x64 thru 0x67 are valid prefixes in all modes. */
	127	scan_more = (instr_lo & 0xC) == 0x4;
	128	break;
	129	case 0xF0:
	130	/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
	131	scan_more = !instr_lo || (instr_lo>>1) == 1;
	132	break;
	133	case 0x00:
	134	/* Prefetch instruction is 0x0F0D or 0x0F18 */
	135	scan_more = 0;
	136	if (probe_kernel_address(instr, opcode))
	137	break;
	138	prefetch = (instr_lo == 0xF) &&
	139	(opcode == 0x0D || opcode == 0x18);
	140	break;
	141	default:
	142	scan_more = 0;
	143	break;
	144	}
	145	}
	146	return prefetch;
	147	}
	148
	149	static int bad_address(void *p)
	150	{
	151	unsigned long dummy;
	152	return probe_kernel_address((unsigned long *)p, dummy);
	153	}
	154
	155	void dump_pagetable(unsigned long address)
	156	{
	157	pgd_t *pgd;
	158	pud_t *pud;
	159	pmd_t *pmd;
	160	pte_t *pte;
	161
	162	pgd = (pgd_t *)read_cr3();
	163
	164	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	165	pgd += pgd_index(address);
	166	if (bad_address(pgd)) goto bad;
	167	printk("PGD %lx ", pgd_val(*pgd));
	168	if (!pgd_present(*pgd)) goto ret;
	169
	170	pud = pud_offset(pgd, address);
	171	if (bad_address(pud)) goto bad;
	172	printk("PUD %lx ", pud_val(*pud));
	173	if (!pud_present(*pud)) goto ret;
	174
	175	pmd = pmd_offset(pud, address);
	176	if (bad_address(pmd)) goto bad;
	177	printk("PMD %lx ", pmd_val(*pmd));
	178	if (!pmd_present(*pmd)) goto ret;
	179
	180	pte = pte_offset_kernel(pmd, address);
	181	if (bad_address(pte)) goto bad;
	182	printk("PTE %lx", pte_val(*pte));
	183	ret:
	184	printk("\n");
	185	return;
	186	bad:
	187	printk("BAD\n");
	188	}
	189
	190	static const char errata93_warning[] =
	191	KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
	192	KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
	193	KERN_ERR "******* Please consider a BIOS update.\n"
	194	KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
	195
	196	/* Workaround for K8 erratum #93 & buggy BIOS.
	197	BIOS SMM functions are required to use a specific workaround
	198	to avoid corruption of the 64bit RIP register on C stepping K8.
	199	A lot of BIOS that didn't get tested properly miss this.
	200	The OS sees this as a page fault with the upper 32bits of RIP cleared.
	201	Try to work around it here.
	202	Note we only handle faults in kernel here. */
	203
	204	static int is_errata93(struct pt_regs *regs, unsigned long address)
	205	{
	206	static int warned;
	207	if (address != regs->rip)
	208	return 0;
	209	if ((address >> 32) != 0)
	210	return 0;
	211	address |= 0xffffffffUL << 32;
	212	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	213	(address >= MODULES_VADDR && address <= MODULES_END)) {
	214	if (!warned) {
	215	printk(errata93_warning);
	216	warned = 1;
	217	}
	218	regs->rip = address;
	219	return 1;
	220	}
	221	return 0;
	222	}
	223
	224	static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
	225	unsigned long error_code)
	226	{
	227	unsigned long flags = oops_begin();
	228	struct task_struct *tsk;
	229
	230	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	231	current->comm, address);
	232	dump_pagetable(address);
	233	tsk = current;
	234	tsk->thread.cr2 = address;
	235	tsk->thread.trap_no = 14;
	236	tsk->thread.error_code = error_code;
	237	__die("Bad pagetable", regs, error_code);
	238	oops_end(flags);
	239	do_exit(SIGKILL);
	240	}
	241
	242	/*
	243	* Handle a fault on the vmalloc area
	244	*
	245	* This assumes no large pages in there.
	246	*/
	247	static int vmalloc_fault(unsigned long address)
	248	{
	249	pgd_t *pgd, *pgd_ref;
	250	pud_t *pud, *pud_ref;
	251	pmd_t *pmd, *pmd_ref;
	252	pte_t *pte, *pte_ref;
	253
	254	/* Copy kernel mappings over when needed. This can also
	255	happen within a race in page table update. In the later
	256	case just flush. */
	257
	258	pgd = pgd_offset(current->mm ?: &init_mm, address);
	259	pgd_ref = pgd_offset_k(address);
	260	if (pgd_none(*pgd_ref))
	261	return -1;
	262	if (pgd_none(*pgd))
	263	set_pgd(pgd, *pgd_ref);
	264	else
	265	BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
	266
	267	/* Below here mismatches are bugs because these lower tables
	268	are shared */
	269
	270	pud = pud_offset(pgd, address);
	271	pud_ref = pud_offset(pgd_ref, address);
	272	if (pud_none(*pud_ref))
	273	return -1;
	274	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
	275	BUG();
	276	pmd = pmd_offset(pud, address);
	277	pmd_ref = pmd_offset(pud_ref, address);
	278	if (pmd_none(*pmd_ref))
	279	return -1;
	280	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
	281	BUG();
	282	pte_ref = pte_offset_kernel(pmd_ref, address);
	283	if (!pte_present(*pte_ref))
	284	return -1;
	285	pte = pte_offset_kernel(pmd, address);
	286	/* Don't use pte_page here, because the mappings can point
	287	outside mem_map, and the NUMA hash lookup cannot handle
	288	that. */
	289	if (!pte_present(pte) \|\| pte_pfn(pte) != pte_pfn(*pte_ref))
	290	BUG();
	291	return 0;
	292	}
	293
	/*
	 * Set to 1 by the "pagefaulttrace" boot option; when non-zero,
	 * do_page_fault() prints a line for every fault it processes.
	 */
	294	static int page_fault_trace;
	/*
	 * When non-zero (the default), do_page_fault() may log a rate-limited
	 * "segfault at ..." message for user faults whose SIGSEGV is unhandled.
	 */
	295	int show_unhandled_signals = 1;
	296
	297	/*
	298	* This routine handles page faults. It determines the address,
	299	* and the problem, and then passes it off to one of the appropriate
	300	* routines.
	301	*/
	302	asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
	303	unsigned long error_code)
	304	{
	305	struct task_struct *tsk;
	306	struct mm_struct *mm;
	307	struct vm_area_struct * vma;
	308	unsigned long address;
	309	const struct exception_table_entry *fixup;
	310	int write, fault;
	311	unsigned long flags;
	312	siginfo_t info;
	313
	314	tsk = current;
	315	mm = tsk->mm;
	316	prefetchw(&mm->mmap_sem);
	317
	318	/* get the address */
	319	address = read_cr2();
	320
	321	info.si_code = SEGV_MAPERR;
	322
	323
	324	/*
	325	* We fault-in kernel-space virtual memory on-demand. The
	326	* 'reference' page table is init_mm.pgd.
	327	*
	328	* NOTE! We MUST NOT take any locks for this case. We may
	329	* be in an interrupt or a critical region, and should
	330	* only copy the information from the master page table,
	331	* nothing more.
	332	*
	333	* This verifies that the fault happens in kernel space
	334	* (error_code & 4) == 0, and that the fault was not a
	335	* protection error (error_code & 9) == 0.
	336	*/
	337	if (unlikely(address >= TASK_SIZE64)) {
	338	/*
	339	* Don't check for the module range here: its PML4
	340	* is always initialized because it's shared with the main
	341	* kernel text. Only vmalloc may need PML4 syncups.
	342	*/
	343	if (!(error_code & (PF_RSVD\|PF_USER\|PF_PROT)) &&
	344	((address >= VMALLOC_START && address < VMALLOC_END))) {
	345	if (vmalloc_fault(address) >= 0)
	346	return;
	347	}
	348	if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
	349	return;
	350	/*
	351	* Don't take the mm semaphore here. If we fixup a prefetch
	352	* fault we could otherwise deadlock.
	353	*/
	354	goto bad_area_nosemaphore;
	355	}
	356
	357	if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
	358	return;
	359
	360	if (likely(regs->eflags & X86_EFLAGS_IF))
	361	local_irq_enable();
	362
	363	if (unlikely(page_fault_trace))
	364	printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
	365	regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
	366
	367	if (unlikely(error_code & PF_RSVD))
	368	pgtable_bad(address, regs, error_code);
	369
	370	/*
	371	* If we're in an interrupt or have no user
	372	* context, we must not take the fault..
	373	*/
	374	if (unlikely(in_atomic() \|\| !mm))
	375	goto bad_area_nosemaphore;
	376
	377	/*
	378	* User-mode registers count as a user access even for any
	379	* potential system fault or CPU buglet.
	380	*/
	381	if (user_mode_vm(regs))
	382	error_code \|= PF_USER;
	383
	384	again:
	385	/* When running in the kernel we expect faults to occur only to
	386	* addresses in user space. All other faults represent errors in the
	387	* kernel and should generate an OOPS. Unfortunatly, in the case of an
	388	* erroneous fault occurring in a code path which already holds mmap_sem
	389	* we will deadlock attempting to validate the fault against the
	390	* address space. Luckily the kernel only validly references user
	391	* space from well defined areas of code, which are listed in the
	392	* exceptions table.
	393	*
	394	* As the vast majority of faults will be valid we will only perform
	395	* the source reference check when there is a possibilty of a deadlock.
	396	* Attempt to lock the address space, if we cannot we then validate the
	397	* source. If this is invalid we can skip the address space check,
	398	* thus avoiding the deadlock.
	399	*/
	400	if (!down_read_trylock(&mm->mmap_sem)) {
	401	if ((error_code & PF_USER) == 0 &&
	402	!search_exception_tables(regs->rip))
	403	goto bad_area_nosemaphore;
	404	down_read(&mm->mmap_sem);
	405	}
	406
	407	vma = find_vma(mm, address);
	408	if (!vma)
	409	goto bad_area;
	410	if (likely(vma->vm_start <= address))
	411	goto good_area;
	412	if (!(vma->vm_flags & VM_GROWSDOWN))
	413	goto bad_area;
	414	if (error_code & 4) {
	415	/* Allow userspace just enough access below the stack pointer
	416	* to let the 'enter' instruction work.
	417	*/
	418	if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
	419	goto bad_area;
	420	}
	421	if (expand_stack(vma, address))
	422	goto bad_area;
	423	/*
	424	* Ok, we have a good vm_area for this memory access, so
	425	* we can handle it..
	426	*/
	427	good_area:
	428	info.si_code = SEGV_ACCERR;
	429	write = 0;
	430	switch (error_code & (PF_PROT\|PF_WRITE)) {
	431	default: /* 3: write, present */
	432	/* fall through */
	433	case PF_WRITE: /* write, not present */
	434	if (!(vma->vm_flags & VM_WRITE))
	435	goto bad_area;
	436	write++;
	437	break;
	438	case PF_PROT: /* read, present */
	439	goto bad_area;
	440	case 0: /* read, not present */
	441	if (!(vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE)))
	442	goto bad_area;
	443	}
	444
	445	/*
	446	* If for any reason at all we couldn't handle the fault,
	447	* make sure we exit gracefully rather than endlessly redo
	448	* the fault.
	449	*/
	450	fault = handle_mm_fault(mm, vma, address, write);
	451	if (unlikely(fault & VM_FAULT_ERROR)) {
	452	if (fault & VM_FAULT_OOM)
	453	goto out_of_memory;
	454	else if (fault & VM_FAULT_SIGBUS)
	455	goto do_sigbus;
	456	BUG();
	457	}
	458	if (fault & VM_FAULT_MAJOR)
	459	tsk->maj_flt++;
	460	else
	461	tsk->min_flt++;
	462	up_read(&mm->mmap_sem);
	463	return;
	464
	465	/*
	466	* Something tried to access memory that isn't in our memory map..
	467	* Fix it, but check if it's kernel or user first..
	468	*/
	469	bad_area:
	470	up_read(&mm->mmap_sem);
	471
	472	bad_area_nosemaphore:
	473	/* User mode accesses just cause a SIGSEGV */
	474	if (error_code & PF_USER) {
	475
	476	/*
	477	* It's possible to have interrupts off here.
	478	*/
	479	local_irq_enable();
	480
	481	if (is_prefetch(regs, address, error_code))
	482	return;
	483
	484	/* Work around K8 erratum #100 K8 in compat mode
	485	occasionally jumps to illegal addresses >4GB. We
	486	catch this here in the page fault handler because
	487	these addresses are not reachable. Just detect this
	488	case and return. Any code segment in LDT is
	489	compatibility mode. */
	490	if ((regs->cs == __USER32_CS \|\| (regs->cs & (1<<2))) &&
	491	(address >> 32))
	492	return;
	493
	494	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
	495	printk_ratelimit()) {
	496	printk(
	497	"%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
	498	tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
	499	tsk->comm, tsk->pid, address, regs->rip,
	500	regs->rsp, error_code);
	501	}
	502
	503	tsk->thread.cr2 = address;
	504	/* Kernel addresses are always protection faults */
	505	tsk->thread.error_code = error_code \| (address >= TASK_SIZE);
	506	tsk->thread.trap_no = 14;
	507	info.si_signo = SIGSEGV;
	508	info.si_errno = 0;
	509	/* info.si_code has been set above */
	510	info.si_addr = (void __user *)address;
	511	force_sig_info(SIGSEGV, &info, tsk);
	512	return;
	513	}
	514
	515	no_context:
	516
	517	/* Are we prepared to handle this kernel fault? */
	518	fixup = search_exception_tables(regs->rip);
	519	if (fixup) {
	520	regs->rip = fixup->fixup;
	521	return;
	522	}
	523
	524	/*
	525	* Hall of shame of CPU/BIOS bugs.
	526	*/
	527
	528	if (is_prefetch(regs, address, error_code))
	529	return;
	530
	531	if (is_errata93(regs, address))
	532	return;
	533
	534	/*
	535	* Oops. The kernel tried to access some bad page. We'll have to
	536	* terminate things with extreme prejudice.
	537	*/
	538
	539	flags = oops_begin();
	540
	541	if (address < PAGE_SIZE)
	542	printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	543	else
	544	printk(KERN_ALERT "Unable to handle kernel paging request");
	545	printk(" at %016lx RIP: \n" KERN_ALERT,address);
	546	printk_address(regs->rip);
	547	dump_pagetable(address);
	548	tsk->thread.cr2 = address;
	549	tsk->thread.trap_no = 14;
	550	tsk->thread.error_code = error_code;
	551	__die("Oops", regs, error_code);
	552	/* Executive summary in case the body of the oops scrolled away */
	553	printk(KERN_EMERG "CR2: %016lx\n", address);
	554	oops_end(flags);
	555	do_exit(SIGKILL);
	556
	557	/*
	558	* We ran out of memory, or some other thing happened to us that made
	559	* us unable to handle the page fault gracefully.
	560	*/
	561	out_of_memory:
	562	up_read(&mm->mmap_sem);
	563	if (is_init(current)) {
	564	yield();
	565	goto again;
	566	}
	567	printk("VM: killing process %s\n", tsk->comm);
	568	if (error_code & 4)
	569	do_group_exit(SIGKILL);
	570	goto no_context;
	571
	572	do_sigbus:
	573	up_read(&mm->mmap_sem);
	574
	575	/* Kernel mode? Handle exceptions or die */
	576	if (!(error_code & PF_USER))
	577	goto no_context;
	578
	579	tsk->thread.cr2 = address;
	580	tsk->thread.error_code = error_code;
	581	tsk->thread.trap_no = 14;
	582	info.si_signo = SIGBUS;
	583	info.si_errno = 0;
	584	info.si_code = BUS_ADRERR;
	585	info.si_addr = (void __user *)address;
	586	force_sig_info(SIGBUS, &info, tsk);
	587	return;
	588	}
	589
	/* Spinlock held by vmalloc_sync_all() while it walks pgd_list. */
	590	DEFINE_SPINLOCK(pgd_lock);
	/*
	 * List of pages, linked through page->lru; vmalloc_sync_all() treats
	 * the address of each page as a top-level page table (pgd_t array).
	 */
	591	LIST_HEAD(pgd_list);
	592
	593	void vmalloc_sync_all(void)
	594	{
	595	/* Note that races in the updates of insync and start aren't
	596	problematic:
	597	insync can only get set bits added, and updates to start are only
	598	improving performance (without affecting correctness if undone). */
	599	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	600	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	601	unsigned long address;
	602
	603	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
	604	if (!test_bit(pgd_index(address), insync)) {
	605	const pgd_t *pgd_ref = pgd_offset_k(address);
	606	struct page *page;
	607
	608	if (pgd_none(*pgd_ref))
	609	continue;
	610	spin_lock(&pgd_lock);
	611	list_for_each_entry(page, &pgd_list, lru) {
	612	pgd_t *pgd;
	613	pgd = (pgd_t *)page_address(page) + pgd_index(address);
	614	if (pgd_none(*pgd))
	615	set_pgd(pgd, *pgd_ref);
	616	else
	617	BUG_ON(pgd_page_vaddr(pgd) != pgd_page_vaddr(pgd_ref));
	618	}
	619	spin_unlock(&pgd_lock);
	620	set_bit(pgd_index(address), insync);
	621	}
	622	if (address == start)
	623	start = address + PGDIR_SIZE;
	624	}
	625	/* Check that there is no need to do the same for the modules area. */
	626	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	627	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
	628	(__START_KERNEL & PGDIR_MASK)));
	629	}
	630
	631	static int __init enable_pagefaulttrace(char *str)
	632	{
	633	page_fault_trace = 1;
	634	return 1;
	635	}
	636	__setup("pagefaulttrace", enable_pagefaulttrace);