Merge commit 'remotes/tip/x86/paravirt' into x86/untangle2

* commit 'remotes/tip/x86/paravirt': (175 commits)
  xen: use direct ops on 64-bit
  xen: make direct versions of irq_enable/disable/save/restore to common code
  xen: setup percpu data pointers
  xen: fix 32-bit build resulting from mmu move
  x86/paravirt: return full 64-bit result
  x86, percpu: fix kexec with vmlinux
  x86/vmi: fix interrupt enable/disable/save/restore calling convention.
  x86/paravirt: don't restore second return reg
  xen: setup percpu data pointers
  x86: split loading percpu segments from loading gdt
  x86: pass in cpu number to switch_to_new_gdt()
  x86: UV fix uv_flush_send_and_wait()
  x86/paravirt: fix missing callee-save call on pud_val
  x86/paravirt: use callee-saved convention for pte_val/make_pte/etc
  x86/paravirt: implement PVOP_CALL macros for callee-save functions
  x86/paravirt: add register-saving thunks to reduce caller register pressure
  x86/paravirt: selectively save/restore regs around pvops calls
  x86: fix paravirt clobber in entry_64.S
  x86/pvops: add a paravirt_ident functions to allow special patching
  xen: move remaining mmu-related stuff into mmu.c
  ...

Conflicts:
  arch/x86/mach-voyager/voyager_smp.c
  arch/x86/mm/fault.c
author: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> 2009-02-11 14:52:22 -0500
committer: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> 2009-02-11 14:52:22 -0500
commit: 9049a11de73d3ecc623f1903100d099f82ede56c (patch)
tree: c03d130d58168e337a66fe999682452b7a02b42b /arch/x86/mm
parent: c47c1b1f3a9d6973108020df1dcab7604f7774dd (diff)
parent: e4d0407185cdbdcfd99fc23bde2e5454bbc46329 (diff)
6 files changed, 779 insertions, 182 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index d8cc96a2738f..9f05157220f5 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
 obj-y   :=  init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
            pat.o pgtable.o gup.o
+obj-$(CONFIG_X86_SMP)           += tlb.o
 obj-$(CONFIG_X86_32)            += pgtable_32.o iomap_32.o
 obj-$(CONFIG_HUGETLB_PAGE)      += hugetlbpage.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c76ef1d701c9..976b5a72ec30 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -26,6 +26,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/magic.h>
 #include <asm/system.h>
 #include <asm/desc.h>
@@ -91,8 +92,8 @@ static inline int notify_page_fault(struct pt_regs *regs)
 *
 * Opcode checker based on code by Richard Brunner
 */
-static int is_prefetch(struct pt_regs *regs, unsigned long addr,
+static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
-                       unsigned long error_code)
+                        unsigned long addr)
 {
        unsigned char *instr;
        int scan_more = 1;
@@ -409,15 +410,15 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 }
 #ifdef CONFIG_X86_64
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+static noinline void pgtable_bad(struct pt_regs *regs,
-                                 unsigned long error_code)
+                         unsigned long error_code, unsigned long address)
 {
        unsigned long flags = oops_begin();
        int sig = SIGKILL;
-        struct task_struct *tsk;
+        struct task_struct *tsk = current;
        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
-               current->comm, address);
+               tsk->comm, address);
        dump_pagetable(address);
        tsk = current;
        tsk->thread.cr2 = address;
@@ -429,6 +430,196 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 }
 #endif
+/*
+ * Handle a fault that cannot be resolved by signalling user space.
+ *
+ * Returns without further action if fixup_exception(), is_prefetch() or
+ * is_errata93() accepts the fault.  Otherwise it prints the oops, records
+ * the fault in tsk->thread and finishes through die()/do_exit(SIGKILL)
+ * on 32-bit or __die()/oops_end() on 64-bit.
+ */
+static noinline void no_context(struct pt_regs *regs,
+                        unsigned long error_code, unsigned long address)
+{
+        struct task_struct *tsk = current;
+        unsigned long *stackend;
+#ifdef CONFIG_X86_64
+        unsigned long flags;
+        int sig;
+#endif
+        /* Are we prepared to handle this kernel fault?  */
+        if (fixup_exception(regs))
+                return;
+        /*
+         * X86_32
+         * Valid to do another page fault here, because if this fault
+         * had been triggered by is_prefetch fixup_exception would have
+         * handled it.
+         *
+         * X86_64
+         * Hall of shame of CPU/BIOS bugs.
+         */
+        if (is_prefetch(regs, error_code, address))
+                return;
+        if (is_errata93(regs, address))
+                return;
+        /*
+         * Oops. The kernel tried to access some bad page. We'll have to
+         * terminate things with extreme prejudice.
+         */
+#ifdef CONFIG_X86_32
+        bust_spinlocks(1);
+#else
+        flags = oops_begin();
+#endif
+        show_fault_oops(regs, error_code, address);
+        /* Check the marker word at the end of the task's stack */
+        stackend = end_of_stack(tsk);
+        if (*stackend != STACK_END_MAGIC)
+                printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+        tsk->thread.cr2 = address;
+        tsk->thread.trap_no = 14;
+        tsk->thread.error_code = error_code;
+#ifdef CONFIG_X86_32
+        die("Oops", regs, error_code);
+        bust_spinlocks(0);
+        do_exit(SIGKILL);
+#else
+        sig = SIGKILL;
+        /* A non-zero __die() result means no signal is handed to oops_end() */
+        if (__die("Oops", regs, error_code))
+                sig = 0;
+        /* Executive summary in case the body of the oops scrolled away */
+        printk(KERN_EMERG "CR2: %016lx\n", address);
+        oops_end(flags, regs, sig);
+#endif
+}
+/*
+ * Common tail for bad-address faults; nothing in here touches mmap_sem
+ * (__bad_area() releases it before calling this).
+ *
+ * User-mode faults (PF_USER set) get SIGSEGV with the given si_code unless
+ * is_prefetch() or is_errata100() claims them.  Everything else is passed
+ * to no_context() unless is_f00f_bug() claims it.
+ */
+static void __bad_area_nosemaphore(struct pt_regs *regs,
+                        unsigned long error_code, unsigned long address,
+                        int si_code)
+{
+        struct task_struct *tsk = current;
+        /* User mode accesses just cause a SIGSEGV */
+        if (error_code & PF_USER) {
+                /*
+                 * It's possible to have interrupts off here.
+                 */
+                local_irq_enable();
+                /*
+                 * Valid to do another page fault here because this one came
+                 * from user space.
+                 */
+                if (is_prefetch(regs, error_code, address))
+                        return;
+                if (is_errata100(regs, address))
+                        return;
+                /* Rate-limited log line for segfaults nobody handles */
+                if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+                    printk_ratelimit()) {
+                        printk(
+                        "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+                        task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+                        tsk->comm, task_pid_nr(tsk), address,
+                        (void *) regs->ip, (void *) regs->sp, error_code);
+                        print_vma_addr(" in ", regs->ip);
+                        printk("\n");
+                }
+                tsk->thread.cr2 = address;
+                /* Kernel addresses are always protection faults */
+                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+                tsk->thread.trap_no = 14;
+                force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+                return;
+        }
+        if (is_f00f_bug(regs, address))
+                return;
+        no_context(regs, error_code, address);
+}
+/* Bad-address path that does not touch mmap_sem; reports SEGV_MAPERR. */
+static noinline void bad_area_nosemaphore(struct pt_regs *regs,
+                        unsigned long error_code, unsigned long address)
+{
+        __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+}
+/*
+ * Bad-address path for callers that still hold mmap_sem for reading:
+ * release it, then continue in __bad_area_nosemaphore() with si_code.
+ */
+static void __bad_area(struct pt_regs *regs,
+                        unsigned long error_code, unsigned long address,
+                        int si_code)
+{
+        /*
+         * The access hit something that isn't in our memory map.  Drop
+         * the semaphore first; the common handler then sorts out whether
+         * this was a kernel or a user access.
+         */
+        up_read(&current->mm->mmap_sem);
+        __bad_area_nosemaphore(regs, error_code, address, si_code);
+}
+/* Bad address while holding mmap_sem: release it and report SEGV_MAPERR. */
+static noinline void bad_area(struct pt_regs *regs,
+                        unsigned long error_code, unsigned long address)
+{
+        __bad_area(regs, error_code, address, SEGV_MAPERR);
+}
+/* Access-permission failure while holding mmap_sem: release it and report SEGV_ACCERR. */
+static noinline void bad_area_access_error(struct pt_regs *regs,
+                        unsigned long error_code, unsigned long address)
+{
+        __bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
+/*
+ * VM_FAULT_OOM handler: releases mmap_sem and calls
+ * pagefault_out_of_memory().  regs, error_code and address are accepted
+ * for symmetry with the other fault helpers but are not used.
+ */
+static void out_of_memory(struct pt_regs *regs,
+                        unsigned long error_code, unsigned long address)
+{
+        /*
+         * We ran out of memory, call the OOM killer, and return the userspace
+         * (which will retry the fault, or kill us if we got oom-killed).
+         */
+        up_read(&current->mm->mmap_sem);
+        pagefault_out_of_memory();
+}
+/*
+ * Handle a VM_FAULT_SIGBUS result from handle_mm_fault().
+ *
+ * Releases mmap_sem, then either hands a kernel-mode fault to
+ * no_context() or records the fault in tsk->thread and delivers
+ * SIGBUS/BUS_ADRERR to the current task.
+ */
+static void do_sigbus(struct pt_regs *regs,
+                        unsigned long error_code, unsigned long address)
+{
+        struct task_struct *tsk = current;
+        struct mm_struct *mm = tsk->mm;
+        up_read(&mm->mmap_sem);
+        /* Kernel mode? Handle exceptions or die */
+        if (!(error_code & PF_USER)) {
+                no_context(regs, error_code, address);
+                /*
+                 * no_context() comes back when the fault was fixed up.
+                 * A kernel-mode fault must not fall through to the
+                 * user-space SIGBUS delivery below.
+                 */
+                return;
+        }
+#ifdef CONFIG_X86_32
+        /* User space => ok to do another page fault */
+        if (is_prefetch(regs, error_code, address))
+                return;
+#endif
+        tsk->thread.cr2 = address;
+        tsk->thread.error_code = error_code;
+        tsk->thread.trap_no = 14;
+        force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+/*
+ * Dispatch an error result of handle_mm_fault(): VM_FAULT_OOM is checked
+ * first, then VM_FAULT_SIGBUS; any other value is a BUG().
+ */
+static noinline void mm_fault_error(struct pt_regs *regs,
+                unsigned long error_code, unsigned long address, unsigned int fault)
+{
+        if (fault & VM_FAULT_OOM) {
+                out_of_memory(regs, error_code, address);
+                return;
+        }
+        if (fault & VM_FAULT_SIGBUS) {
+                do_sigbus(regs, error_code, address);
+                return;
+        }
+        /* The caller reported an error we do not know how to handle */
+        BUG();
+}
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 {
        if ((error_code & PF_WRITE) && !pte_write(*pte))
@@ -448,8 +639,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
-static int spurious_fault(unsigned long address,
+static noinline int spurious_fault(unsigned long error_code,
-                          unsigned long error_code)
+                                unsigned long address)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -494,7 +685,7 @@ static int spurious_fault(unsigned long address,
 *
 * This assumes no large pages in there.
 */
-static int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
 {
 #ifdef CONFIG_X86_32
        unsigned long pgd_paddr;
@@ -573,6 +764,25 @@ static int vmalloc_fault(unsigned long address)
 int show_unhandled_signals = 1;
+/*
+ * Decide whether the faulting access is forbidden by the vma.
+ *
+ * Returns 1 if the access must be refused, 0 if it may proceed:
+ *  - writes (page present or not) need VM_WRITE;
+ *  - a read of a present page (PF_PROT set) is always refused;
+ *  - a read of a non-present page needs VM_READ, VM_EXEC or VM_WRITE.
+ */
+static inline int access_error(unsigned long error_code, int write,
+                                struct vm_area_struct *vma)
+{
+        /* write, present and write, not present */
+        if (write)
+                return unlikely(!(vma->vm_flags & VM_WRITE)) ? 1 : 0;
+        /* read, present */
+        if (unlikely(error_code & PF_PROT))
+                return 1;
+        /* read, not present */
+        if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+                return 1;
+        return 0;
+}
 /*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
@@ -583,16 +793,12 @@ asmlinkage
 #endif
 void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
+        unsigned long address;
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
-        unsigned long address;
+        int write;
-        int write, si_code;
        int fault;
-#ifdef CONFIG_X86_64
-        unsigned long flags;
-        int sig;
-#endif
        tsk = current;
        mm = tsk->mm;
@@ -601,8 +807,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
        /* get the address */
        address = read_cr2();
-        si_code = SEGV_MAPERR;
+        if (unlikely(notify_page_fault(regs)))
        if (unlikely(kmmio_fault(regs, address)))
                return;
@@ -629,7 +834,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
                        return;
                /* Can handle a stale RO->RW TLB */
-                if (spurious_fault(address, error_code))
+                if (spurious_fault(error_code, address))
                        return;
                /* kprobes don't want to hook the spurious faults. */
@@ -639,13 +844,13 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
-                goto bad_area_nosemaphore;
+                bad_area_nosemaphore(regs, error_code, address);
+                return;
        }
        /* kprobes don't want to hook the spurious faults. */
        if (notify_page_fault(regs))
                return;
        /*
         * It's safe to allow irq's after cr2 has been saved and the
         * vmalloc fault has been handled.
@@ -661,15 +866,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 #ifdef CONFIG_X86_64
        if (unlikely(error_code & PF_RSVD))
-                pgtable_bad(address, regs, error_code);
+                pgtable_bad(regs, error_code, address);
 #endif
        /*
         * If we're in an interrupt, have no user context or are running in an
         * atomic region then we must not take the fault.
         */
-        if (unlikely(in_atomic() || !mm))
+        if (unlikely(in_atomic() || !mm)) {
-                goto bad_area_nosemaphore;
+                bad_area_nosemaphore(regs, error_code, address);
+                return;
+        }
        /*
         * When running in the kernel we expect faults to occur only to
@@ -687,20 +894,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
         * source.  If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
-        if (!down_read_trylock(&mm->mmap_sem)) {
+        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                if ((error_code & PF_USER) == 0 &&
-                    !search_exception_tables(regs->ip))
+                    !search_exception_tables(regs->ip)) {
-                        goto bad_area_nosemaphore;
+                        bad_area_nosemaphore(regs, error_code, address);
+                        return;
+                }
                down_read(&mm->mmap_sem);
        }
        vma = find_vma(mm, address);
-        if (!vma)
+        if (unlikely(!vma)) {
-                goto bad_area;
+                bad_area(regs, error_code, address);
-        if (vma->vm_start <= address)
+                return;
+        }
+        if (likely(vma->vm_start <= address))
                goto good_area;
-        if (!(vma->vm_flags & VM_GROWSDOWN))
+        if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
-                goto bad_area;
+                bad_area(regs, error_code, address);
+                return;
+        }
        if (error_code & PF_USER) {
                /*
                 * Accessing the stack below %sp is always a bug.
@@ -708,31 +921,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
                 * and pusha to work.  ("enter $65535,$31" pushes
                 * 32 pointers and then decrements %sp by 65535.)
                 */
-                if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
+                if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
-                        goto bad_area;
+                        bad_area(regs, error_code, address);
+                        return;
+                }
        }
-        if (expand_stack(vma, address))
+        if (unlikely(expand_stack(vma, address))) {
-                goto bad_area;
+                bad_area(regs, error_code, address);
-/*
+                return;
- * Ok, we have a good vm_area for this memory access, so
+        }
- * we can handle it..
- */
+        /*
+         * Ok, we have a good vm_area for this memory access, so
+         * we can handle it..
+         */
 good_area:
-        si_code = SEGV_ACCERR;
+        write = error_code & PF_WRITE;
-        write = 0;
+        if (unlikely(access_error(error_code, write, vma))) {
-        switch (error_code & (PF_PROT|PF_WRITE)) {
+                bad_area_access_error(regs, error_code, address);
-        default:        /* 3: write, present */
+                return;
-                /* fall through */
-        case PF_WRITE:          /* write, not present */
-                if (!(vma->vm_flags & VM_WRITE))
-                        goto bad_area;
-                write++;
-                break;
-        case PF_PROT:           /* read, present */
-                goto bad_area;
-        case 0:                 /* read, not present */
-                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-                        goto bad_area;
        }
        /*
@@ -742,11 +949,8 @@ good_area:
         */
        fault = handle_mm_fault(mm, vma, address, write);
        if (unlikely(fault & VM_FAULT_ERROR)) {
-                if (fault & VM_FAULT_OOM)
+                mm_fault_error(regs, error_code, address, fault);
-                        goto out_of_memory;
+                return;
-                else if (fault & VM_FAULT_SIGBUS)
-                        goto do_sigbus;
-                BUG();
        }
        if (fault & VM_FAULT_MAJOR)
                tsk->maj_flt++;
@@ -764,128 +968,6 @@ good_area:
        }
 #endif
        up_read(&mm->mmap_sem);
-        return;
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-        up_read(&mm->mmap_sem);
-bad_area_nosemaphore:
-        /* User mode accesses just cause a SIGSEGV */
-        if (error_code & PF_USER) {
-                /*
-                 * It's possible to have interrupts off here.
-                 */
-                local_irq_enable();
-                /*
-                 * Valid to do another page fault here because this one came
-                 * from user space.
-                 */
-                if (is_prefetch(regs, address, error_code))
-                        return;
-                if (is_errata100(regs, address))
-                        return;
-                if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-                    printk_ratelimit()) {
-                        printk(
-                        "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
-                        task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-                        tsk->comm, task_pid_nr(tsk), address,
-                        (void *) regs->ip, (void *) regs->sp, error_code);
-                        print_vma_addr(" in ", regs->ip);
-                        printk("\n");
-                }
-                tsk->thread.cr2 = address;
-                /* Kernel addresses are always protection faults */
-                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-                tsk->thread.trap_no = 14;
-                force_sig_info_fault(SIGSEGV, si_code, address, tsk);
-                return;
-        }
-        if (is_f00f_bug(regs, address))
-                return;
-no_context:
-        /* Are we prepared to handle this kernel fault?  */
-        if (fixup_exception(regs))
-                return;
-        /*
-         * X86_32
-         * Valid to do another page fault here, because if this fault
-         * had been triggered by is_prefetch fixup_exception would have
-         * handled it.
-         *
-         * X86_64
-         * Hall of shame of CPU/BIOS bugs.
-         */
-        if (is_prefetch(regs, address, error_code))
-                return;
-        if (is_errata93(regs, address))
-                return;
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-#ifdef CONFIG_X86_32
-        bust_spinlocks(1);
-#else
-        flags = oops_begin();
-#endif
-        show_fault_oops(regs, error_code, address);
-        tsk->thread.cr2 = address;
-        tsk->thread.trap_no = 14;
-        tsk->thread.error_code = error_code;
-#ifdef CONFIG_X86_32
-        die("Oops", regs, error_code);
-        bust_spinlocks(0);
-        do_exit(SIGKILL);
-#else
-        sig = SIGKILL;
-        if (__die("Oops", regs, error_code))
-                sig = 0;
-        /* Executive summary in case the body of the oops scrolled away */
-        printk(KERN_EMERG "CR2: %016lx\n", address);
-        oops_end(flags, regs, sig);
-#endif
-out_of_memory:
-        /*
-         * We ran out of memory, call the OOM killer, and return the userspace
-         * (which will retry the fault, or kill us if we got oom-killed).
-         */
-        up_read(&mm->mmap_sem);
-        pagefault_out_of_memory();
-        return;
-do_sigbus:
-        up_read(&mm->mmap_sem);
-        /* Kernel mode? Handle exceptions or die */
-        if (!(error_code & PF_USER))
-                goto no_context;
-#ifdef CONFIG_X86_32
-        /* User space => ok to do another page fault */
-        if (is_prefetch(regs, address, error_code))
-                return;
-#endif
-        tsk->thread.cr2 = address;
-        tsk->thread.error_code = error_code;
-        tsk->thread.trap_no = 14;
-        force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
 }
 DEFINE_SPINLOCK(pgd_lock);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2cef05074413..00263bf07a88 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,7 +49,6 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
-#include <asm/smp.h>
 unsigned int __VMALLOC_RESERVE = 128 << 20;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 71a14f89f89e..08d140fbc31b 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,6 +20,12 @@
 #include <asm/acpi.h>
 #include <asm/k8.h>
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
+#endif
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -33,6 +39,21 @@ int numa_off __initdata;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
+DEFINE_PER_CPU(int, node_number) = 0;
+EXPORT_PER_CPU_SYMBOL(node_number);
+/*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+/*
+ * Which logical CPUs are on which nodes
+ */
+cpumask_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
 /*
 * Given a shift value, try to populate memnodemap[]
 * Returns :
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void)
 #endif
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: node_to_cpumask() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
+ */
+void __init setup_node_to_cpumask_map(void)
+{
+        unsigned int node, num = 0;
+        cpumask_t *map;
+        /* setup nr_node_ids if not done yet */
+        if (nr_node_ids == MAX_NUMNODES) {
+                /* num ends up as the last node id visited in node_possible_map */
+                for_each_node_mask(node, node_possible_map)
+                        num = node;
+                nr_node_ids = num + 1;
+        }
+        /* allocate the map */
+        map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+        /*
+         * NOTE(review): DBG() and pr_debug() below report the same
+         * information; one of them looks redundant.
+         */
+        DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
+        pr_debug("Node to cpumask map at %p for %d nodes\n",
+                 map, nr_node_ids);
+        /* node_to_cpumask() will now work */
+        node_to_cpumask_map = map;
+}
+/*
+ * Record that 'cpu' belongs to 'node'.
+ *
+ * While the early pointer for x86_cpu_to_node_map is still set, only that
+ * early array is updated.  Afterwards the per-cpu x86_cpu_to_node_map is
+ * written, and node_number is updated too unless node is NUMA_NO_NODE.
+ */
+void __cpuinit numa_set_node(int cpu, int node)
+{
+        int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+        /* early setting, no percpu area yet */
+        if (cpu_to_node_map) {
+                cpu_to_node_map[cpu] = node;
+                return;
+        }
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+        /* Debug builds refuse cpus that are out of range or lack a per-cpu offset */
+        if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
+                printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+                dump_stack();
+                return;
+        }
+#endif
+        per_cpu(x86_cpu_to_node_map, cpu) = node;
+        /* node_number keeps its previous value when the node is being cleared */
+        if (node != NUMA_NO_NODE)
+                per_cpu(node_number, cpu) = node;
+}
+/* Reset the node mapping of 'cpu' to NUMA_NO_NODE via numa_set_node(). */
+void __cpuinit numa_clear_node(int cpu)
+{
+        numa_set_node(cpu, NUMA_NO_NODE);
+}
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+/* Set 'cpu' in the cpumask of the node reported by early_cpu_to_node(). */
+void __cpuinit numa_add_cpu(int cpu)
+{
+        cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+/* Clear 'cpu' from the cpumask of the node reported by early_cpu_to_node(). */
+void __cpuinit numa_remove_cpu(int cpu)
+{
+        cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+/*
+ * --------- debug versions of the numa functions ---------
+ */
+/*
+ * Debug helper behind numa_add_cpu()/numa_remove_cpu(): sets (enable != 0)
+ * or clears 'cpu' in its node's cpumask and logs the resulting mask.
+ * Complains and does nothing while node_to_cpumask_map is still NULL.
+ */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+        int node = early_cpu_to_node(cpu);
+        cpumask_t *mask;
+        char buf[64];
+        if (node_to_cpumask_map == NULL) {
+                printk(KERN_ERR "node_to_cpumask_map NULL\n");
+                dump_stack();
+                return;
+        }
+        /*
+         * NOTE(review): node is used as an index without a range check,
+         * and early_cpu_to_node() can return NUMA_NO_NODE - confirm that
+         * callers rule this out.
+         */
+        mask = &node_to_cpumask_map[node];
+        if (enable)
+                cpu_set(cpu, *mask);
+        else
+                cpu_clear(cpu, *mask);
+        /* Format the updated mask as a cpu list for the debug message */
+        cpulist_scnprintf(buf, sizeof(buf), mask);
+        printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+                enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+}
+/* Debug version: add 'cpu' to its node's cpumask through numa_set_cpumask(). */
+void __cpuinit numa_add_cpu(int cpu)
+{
+        numa_set_cpumask(cpu, 1);
+}
+/* Debug version: remove 'cpu' from its node's cpumask through numa_set_cpumask(). */
+void __cpuinit numa_remove_cpu(int cpu)
+{
+        numa_set_cpumask(cpu, 0);
+}
+/*
+ * Debug version of cpu_to_node(): returns the node recorded for 'cpu'.
+ * If the early map is still in place, the call is reported as too early
+ * (warning plus stack dump) and answered from the early map.
+ */
+int cpu_to_node(int cpu)
+{
+        int *early_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+        if (!early_map)
+                return per_cpu(x86_cpu_to_node_map, cpu);
+        printk(KERN_WARNING
+                "cpu_to_node(%d): usage too early!\n", cpu);
+        dump_stack();
+        return early_map[cpu];
+}
+EXPORT_SYMBOL(cpu_to_node);
+/*
+ * Variant of cpu_to_node() for use before the per_cpu areas are set up.
+ * The early map is consulted while it exists; afterwards a cpu without a
+ * per_cpu area is reported and answered with NUMA_NO_NODE.
+ */
+int early_cpu_to_node(int cpu)
+{
+        int *early_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+        if (early_map)
+                return early_map[cpu];
+        if (per_cpu_offset(cpu))
+                return per_cpu(x86_cpu_to_node_map, cpu);
+        printk(KERN_WARNING
+                "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+        dump_stack();
+        return NUMA_NO_NODE;
+}
+/* empty cpumask */
+static const cpumask_t cpu_mask_none;
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ *
+ * Debug version: warns (with a stack dump) and falls back to
+ * cpu_online_map when node_to_cpumask_map is still NULL, or to the empty
+ * mask when node >= nr_node_ids.
+ */
+const cpumask_t *cpumask_of_node(int node)
+{
+        /* Map not set up yet: warn and fall back to the online map */
+        if (node_to_cpumask_map == NULL) {
+                printk(KERN_WARNING
+                        "cpumask_of_node(%d): no node_to_cpumask_map!\n",
+                        node);
+                dump_stack();
+                return (const cpumask_t *)&cpu_online_map;
+        }
+        /*
+         * Node id out of range: warn and return the empty mask.
+         * NOTE(review): the test is ">=" although the message prints ">".
+         */
+        if (node >= nr_node_ids) {
+                printk(KERN_WARNING
+                        "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+                        node, nr_node_ids);
+                dump_stack();
+                return &cpu_mask_none;
+        }
+        return &node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(cpumask_of_node);
+/*
+ * Returns a bitmask of CPUs on Node 'node'.
+ *
+ * Side note: this function creates the returned cpumask on the stack
+ * so with a high NR_CPUS count, excessive stack space is used.  The
+ * node_to_cpumask_ptr function should be used whenever possible.
+ *
+ * Debug version: warns (with a stack dump) and returns cpu_online_map
+ * when node_to_cpumask_map is still NULL, or the empty mask when
+ * node >= nr_node_ids.
+ */
+cpumask_t node_to_cpumask(int node)
+{
+        /* Map not set up yet: warn and fall back to the online map */
+        if (node_to_cpumask_map == NULL) {
+                printk(KERN_WARNING
+                        "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
+                dump_stack();
+                return cpu_online_map;
+        }
+        /*
+         * Node id out of range: warn and return the empty mask.
+         * NOTE(review): the test is ">=" although the message prints ">".
+         */
+        if (node >= nr_node_ids) {
+                printk(KERN_WARNING
+                        "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
+                        node, nr_node_ids);
+                dump_stack();
+                return cpu_mask_none;
+        }
+        /* Returned by value: the whole cpumask_t is copied to the caller */
+        return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(node_to_cpumask);
+/*
+ * --------- end of debug versions of the numa functions ---------
+ */
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 09737c8af074..15df1baee100 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -21,6 +21,7 @@
 #include <asm/numa.h>
 #include <asm/e820.h>
 #include <asm/genapic.h>
+#include <asm/uv/uv.h>
 int acpi_numa __initdata;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
new file mode 100644
index 000000000000..72a6d4ebe34d
--- /dev/null
+++ b/arch/x86/mm/tlb.c
@@ -0,0 +1,296 @@
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+                        = { &init_mm, 0, };
+#include <mach_ipi.h>
+/*
+ *      Smarter SMP flushing macros.
+ *              c/o Linus Torvalds.
+ *
+ *      These mean you can really definitely utterly forget about
+ *      writing to user space from interrupts. (It's not allowed anyway).
+ *
+ *      Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ *      More scalable flush, from Andi Kleen
+ *
+ *      To avoid global state use 8 different call vectors.
+ *      Each CPU uses a specific vector to trigger flushes on other
+ *      CPUs. Depending on the received vector the target CPUs look into
+ *      the right array slot for the flush data.
+ *
+ *      With more than 8 CPUs they are hashed to the 8 available
+ *      vectors. The limited global vector space forces us to this right now.
+ *      In future when interrupts are split into per CPU domains this could be
+ *      fixed, at the cost of triggering multiple IPIs in some cases.
+ */
+union smp_flush_state {
+        struct {
+                struct mm_struct *flush_mm;
+                unsigned long flush_va;
+                spinlock_t tlbstate_lock;
+                DECLARE_BITMAP(flush_cpumask, NR_CPUS);
+        };
+        char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
+} ____cacheline_internodealigned_in_smp;
+/* State lives in a static array with one slot per invalidate vector;
+   each slot is padded to a full internode cache line because other CPUs
+   access it and we don't want false sharing between slots. */
+static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
+/*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+ */
+void leave_mm(int cpu)
+{
+        if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+                BUG();
+        cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
+        load_cr3(swapper_pg_dir);
+}
+EXPORT_SYMBOL_GPL(leave_mm);
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ *      Stop ipi delivery for the old mm. This is not synchronized with
+ *      the other cpus, but smp_invalidate_interrupt ignores flush ipis
+ *      for the wrong mm, and in the worst case we perform a superfluous
+ *      tlb flush.
+ * 1a2) set cpu mmu_state to TLBSTATE_OK
+ *      Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ *      was in lazy tlb mode.
+ * 1a3) update cpu active_mm
+ *      Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ *      Now the other cpus will send tlb flush ipis.
+ * 1a5) change cr3.
+ * 1b) thread switch without mm change
+ *      cpu active_mm is correct, cpu0 already handles
+ *      flush ipis.
+ * 1b1) set cpu mmu_state to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ *      Atomically set the bit [other cpus will start sending flush ipis],
+ *      and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %esp, i.e. current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %esp, i.e. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ *   runs in kernel space, the cpu could load tlb entries for user space
+ *   pages.
+ *
+ * The good news is that cpu mmu_state is local to each cpu, no
+ * write/read ordering problems.
+ */
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ *
+ * Interrupts are disabled.
+ */
+/*
+ * FIXME: use of asmlinkage is not consistent.  On x86_64 it's a no-op,
+ * kept only for documentation purposes, and even that usage is slightly
+ * inconsistent.  On x86_32, asmlinkage is regparm(0) but interrupt
+ * entry calls in with the first parameter in %eax.  Maybe define
+ * intrlinkage?
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void smp_invalidate_interrupt(struct pt_regs *regs)
+{
+        unsigned int cpu;
+        unsigned int sender;
+        union smp_flush_state *f;
+        cpu = smp_processor_id();
+        /*
+         * orig_ax contains the bitwise-complemented interrupt vector.
+         * Use that to determine where the sender put the data.
+         */
+        sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
+        f = &flush_state[sender];
+        if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
+                goto out;
+                /*
+                 * This was a BUG() but until someone can quote me the
+                 * line from the Intel manual that guarantees an IPI to
+                 * multiple CPUs is retried _only_ on the erroring CPUs,
+                 * it's staying as a return
+                 *
+                 * BUG();
+                 */
+        if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+                if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+                        if (f->flush_va == TLB_FLUSH_ALL)
+                                local_flush_tlb();
+                        else
+                                __flush_tlb_one(f->flush_va);
+                } else
+                        leave_mm(cpu);
+        }
+out:
+        ack_APIC_irq();
+        smp_mb__before_clear_bit();
+        cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
+        smp_mb__after_clear_bit();
+        inc_irq_stat(irq_tlb_count);
+}
+static void flush_tlb_others_ipi(const struct cpumask *cpumask,
+                                 struct mm_struct *mm, unsigned long va)
+{
+        unsigned int sender;
+        union smp_flush_state *f;
+        /* Caller has disabled preemption */
+        sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+        f = &flush_state[sender];
+        /*
+         * Could avoid this lock when
+         * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+         * probably not worth checking this for a cache-hot lock.
+         */
+        spin_lock(&f->tlbstate_lock);
+        f->flush_mm = mm;
+        f->flush_va = va;
+        cpumask_andnot(to_cpumask(f->flush_cpumask),
+                       cpumask, cpumask_of(smp_processor_id()));
+        /*
+         * Make the above memory operations globally visible before
+         * sending the IPI.
+         */
+        smp_mb();
+        /*
+         * We have to send the IPI only to
+         * CPUs affected.
+         */
+        send_IPI_mask(to_cpumask(f->flush_cpumask),
+                      INVALIDATE_TLB_VECTOR_START + sender);
+        while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
+                cpu_relax();
+        f->flush_mm = NULL;
+        f->flush_va = 0;
+        spin_unlock(&f->tlbstate_lock);
+}
+void native_flush_tlb_others(const struct cpumask *cpumask,
+                             struct mm_struct *mm, unsigned long va)
+{
+        if (is_uv_system()) {
+                unsigned int cpu;
+                cpu = get_cpu();
+                cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
+                if (cpumask)
+                        flush_tlb_others_ipi(cpumask, mm, va);
+                put_cpu();
+                return;
+        }
+        flush_tlb_others_ipi(cpumask, mm, va);
+}
+static int __cpuinit init_smp_flush(void)
+{
+        int i;
+        for (i = 0; i < ARRAY_SIZE(flush_state); i++)
+                spin_lock_init(&flush_state[i].tlbstate_lock);
+        return 0;
+}
+core_initcall(init_smp_flush);
+void flush_tlb_current_task(void)
+{
+        struct mm_struct *mm = current->mm;
+        preempt_disable();
+        local_flush_tlb();
+        if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+                flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+        preempt_enable();
+}
+void flush_tlb_mm(struct mm_struct *mm)
+{
+        preempt_disable();
+        if (current->active_mm == mm) {
+                if (current->mm)
+                        local_flush_tlb();
+                else
+                        leave_mm(smp_processor_id());
+        }
+        if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+                flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+        preempt_enable();
+}
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+{
+        struct mm_struct *mm = vma->vm_mm;
+        preempt_disable();
+        if (current->active_mm == mm) {
+                if (current->mm)
+                        __flush_tlb_one(va);
+                else
+                        leave_mm(smp_processor_id());
+        }
+        if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+                flush_tlb_others(&mm->cpu_vm_mask, mm, va);
+        preempt_enable();
+}
+static void do_flush_tlb_all(void *info)
+{
+        unsigned long cpu = smp_processor_id();
+        __flush_tlb_all();
+        if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+                leave_mm(cpu);
+}
+void flush_tlb_all(void)
+{
+        on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
author	Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>	2009-02-11 14:52:22 -0500
committer	Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>	2009-02-11 14:52:22 -0500
commit	9049a11de73d3ecc623f1903100d099f82ede56c (patch)
tree	c03d130d58168e337a66fe999682452b7a02b42b /arch/x86/mm
parent	c47c1b1f3a9d6973108020df1dcab7604f7774dd (diff)
parent	e4d0407185cdbdcfd99fc23bde2e5454bbc46329 (diff)