Merge branch 'for-ingo' of git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux-sfi-2.6 into x86/apic

Merge reason: the SFI (Simple Firmware Interface) feature in the ACPI tree needs this cleanup; pull it into the APIC branch as well so that there are no interactions.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Ingo Molnar <mingo@elte.hu> 2009-08-29 03:30:41 -0400
committer: Ingo Molnar <mingo@elte.hu> 2009-08-29 03:31:47 -0400
commit: eebc57f73d42095b778e899f6aa90ad050c72655 (patch)
tree: 2ba80c75e9284093e6d7606dbb1b6a4bb752a2a5 /arch/x86
parent: d3a247bfb2c26f5b67367d58af7ad8c2efbbc6c1 (diff)
parent: 2a4ab640d3c28c2952967e5f63ea495555bf2a5f (diff)
61 files changed, 1287 insertions, 660 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 738bdc6b0f8b..13ffa5df37d7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,6 +24,7 @@ config X86
        select HAVE_UNSTABLE_SCHED_CLOCK
        select HAVE_IDE
        select HAVE_OPROFILE
+        select HAVE_PERF_COUNTERS if (!M386 && !M486)
        select HAVE_IOREMAP_PROT
        select HAVE_KPROBES
        select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -742,7 +743,6 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
        def_bool y
        depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
-        select HAVE_PERF_COUNTERS if (!M386 && !M486)
 config X86_IO_APIC
        def_bool y
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index e2ff504b4ddc..f8ed0658404c 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
 # create a compressed vmlinux image from the original vmlinux
 #
-targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o
+targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o
 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index edc90f23e708..8406ed7f9926 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
 #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)       \
        efi_call_virt(f, a1, a2, a3, a4, a5, a6)
-#define efi_ioremap(addr, size)                 ioremap_cache(addr, size)
+#define efi_ioremap(addr, size, type)           ioremap_cache(addr, size)
 #else /* !CONFIG_X86_32 */
@@ -84,7 +84,8 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
        efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
                  (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
-extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size);
+extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
+                                 u32 type);
 #endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index daf866ed0612..85232d32fcb8 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -150,17 +150,17 @@ extern int timer_through_8259;
 #define io_apic_assign_pci_irqs \
        (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
-#ifdef CONFIG_ACPI
+extern u8 io_apic_unique_id(u8 id);
 extern int io_apic_get_unique_id(int ioapic, int apic_id);
 extern int io_apic_get_version(int ioapic);
 extern int io_apic_get_redir_entries(int ioapic);
-#endif /* CONFIG_ACPI */
 struct io_apic_irq_attr;
 extern int io_apic_set_pci_routing(struct device *dev, int irq,
                 struct io_apic_irq_attr *irq_attr);
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
+extern void ioapic_insert_resources(void);
 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
 extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
@@ -176,10 +176,21 @@ extern int setup_ioapic_entry(int apic, int irq,
                              int polarity, int vector, int pin);
 extern void ioapic_write_entry(int apic, int pin,
                               struct IO_APIC_route_entry e);
+struct mp_ioapic_gsi{
+        int gsi_base;
+        int gsi_end;
+};
+extern struct mp_ioapic_gsi  mp_gsi_routing[];
+int mp_find_ioapic(int gsi);
+int mp_find_ioapic_pin(int ioapic, int gsi);
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
 #else  /* !CONFIG_X86_IO_APIC */
 #define io_apic_assign_pci_irqs 0
 static const int timer_through_8259 = 0;
 static inline void ioapic_init_mappings(void)   { }
+static inline void ioapic_insert_resources(void) { }
 static inline void probe_nr_irqs_gsi(void)      { }
 #endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 2bdab21f0898..c6ccbe7e81ad 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -12,9 +12,15 @@ static inline unsigned long native_save_fl(void)
 {
        unsigned long flags;
+        /*
+         * Note: this needs to be "=r" not "=rm", because we have the
+         * stack offset from what gcc expects at the time the "pop" is
+         * executed, and so a memory reference with respect to the stack
+         * would end up using the wrong address.
+         */
        asm volatile("# __raw_save_flags\n\t"
                     "pushf ; pop %0"
-                     : "=g" (flags)
+                     : "=r" (flags)
                     : /* no input */
                     : "memory");
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 313389cd50d2..5136dad57cbb 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,7 @@
 /* Pages for switcher itself, then two pages per cpu */
 #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
-/* We map at -4M (-2M when PAE is activated) for ease of mapping
+/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
- * into the guest (one PTE page). */
 #ifdef CONFIG_X86_PAE
 #define SWITCHER_ADDR 0xFFE00000
 #else
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index d31c4a684078..ba0eed8aa1a6 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -30,27 +30,27 @@
 #include <asm/hw_irq.h>
 #include <asm/kvm_para.h>
-/*G:031 But first, how does our Guest contact the Host to ask for privileged
+/*G:030
+ * But first, how does our Guest contact the Host to ask for privileged
 * operations?  There are two ways: the direct way is to make a "hypercall",
 * to make requests of the Host Itself.
 *
- * We use the KVM hypercall mechanism. Seventeen hypercalls are
+ * We use the KVM hypercall mechanism, though completely different hypercall
- * available: the hypercall number is put in the %eax register, and the
+ * numbers. Seventeen hypercalls are available: the hypercall number is put in
- * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
+ * the %eax register, and the arguments (when required) are placed in %ebx,
- * If a return value makes sense, it's returned in %eax.
+ * %ecx, %edx and %esi.  If a return value makes sense, it's returned in %eax.
 *
 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
 * Host, rather than returning failure.  This reflects Winston Churchill's
- * definition of a gentleman: "someone who is only rude intentionally". */
+ * definition of a gentleman: "someone who is only rude intentionally".
-/*:*/
+:*/
 /* Can't use our min() macro here: needs to be a constant */
 #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
 #define LHCALL_RING_SIZE 64
 struct hcall_args {
-        /* These map directly onto eax, ebx, ecx, edx and esi
+        /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
-         * in struct lguest_regs */
        unsigned long arg0, arg1, arg2, arg3, arg4;
 };
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index dd14c54ac718..0e8c2a0fd922 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte)
        __free_page(pte);
 }
-extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
+                                  unsigned long address)
+{
+        ___pte_free_tlb(tlb, pte);
+}
 static inline void pmd_populate_kernel(struct mm_struct *mm,
                                       pmd_t *pmd, pte_t *pte)
@@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
        free_page((unsigned long)pmd);
 }
-extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
+                                  unsigned long adddress)
+{
+        ___pmd_free_tlb(tlb, pmd);
+}
 #ifdef CONFIG_X86_PAE
 extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
@@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
        free_page((unsigned long)pud);
 }
-extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
+                                  unsigned long address)
+{
+        ___pud_free_tlb(tlb, pud);
+}
 #endif  /* PAGETABLE_LEVELS > 3 */
 #endif  /* PAGETABLE_LEVELS > 2 */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 3cc06e3fceb8..16748077559a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_PGTABLE_H
 #include <asm/page.h>
+#include <asm/e820.h>
 #include <asm/pgtable_types.h>
@@ -269,10 +270,17 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 #define canon_pgprot(p) __pgprot(massage_pgprot(p))
-static inline int is_new_memtype_allowed(unsigned long flags,
+static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
-                                                unsigned long new_flags)
+                                         unsigned long flags,
+                                         unsigned long new_flags)
 {
        /*
+         * PAT type is always WB for ISA. So no need to check.
+         */
+        if (is_ISA_range(paddr, paddr + size - 1))
+                return 1;
+        /*
         * Certain new memtypes are not allowed with certain
         * requested memtype:
         * - request is uncached, return cannot be write-back
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 20e6a795e160..d2c6c930b491 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -212,9 +212,9 @@ extern int __get_user_bad(void);
                     : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
 #else
 #define __put_user_asm_u64(x, ptr, retval, errret) \
-        __put_user_asm(x, ptr, retval, "q", "", "Zr", errret)
+        __put_user_asm(x, ptr, retval, "q", "", "er", errret)
 #define __put_user_asm_ex_u64(x, addr)  \
-        __put_user_asm_ex(x, addr, "q", "", "Zr")
+        __put_user_asm_ex(x, addr, "q", "", "er")
 #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
 #endif
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 8cc687326eb8..db24b215fc50 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -88,11 +88,11 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
                              ret, "l", "k", "ir", 4);
                return ret;
        case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
-                              ret, "q", "", "ir", 8);
+                              ret, "q", "", "er", 8);
                return ret;
        case 10:
                __put_user_asm(*(u64 *)src, (u64 __user *)dst,
-                               ret, "q", "", "ir", 10);
+                               ret, "q", "", "er", 10);
                if (unlikely(ret))
                        return ret;
                asm("":::"memory");
@@ -101,12 +101,12 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
                return ret;
        case 16:
                __put_user_asm(*(u64 *)src, (u64 __user *)dst,
-                               ret, "q", "", "ir", 16);
+                               ret, "q", "", "er", 16);
                if (unlikely(ret))
                        return ret;
                asm("":::"memory");
                __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
-                               ret, "q", "", "ir", 8);
+                               ret, "q", "", "er", 8);
                return ret;
        default:
                return copy_user_generic((__force void *)dst, src, size);
@@ -157,7 +157,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
                               ret, "q", "", "=r", 8);
                if (likely(!ret))
                        __put_user_asm(tmp, (u64 __user *)dst,
-                                       ret, "q", "", "ir", 8);
+                                       ret, "q", "", "er", 8);
                return ret;
        }
        default:
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index bddd44f2f0ab..80e2984f521c 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -133,7 +133,7 @@ struct bau_msg_payload {
 * see table 4.2.3.0.1 in broacast_assist spec.
 */
 struct bau_msg_header {
-        unsigned int dest_subnodeid:6;  /* must be zero */
+        unsigned int dest_subnodeid:6;  /* must be 0x10, for the LB */
        /* bits 5:0 */
        unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
        /* bits 20:6 */                   /* first bit in node_map */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 341070f7ad5c..77a68505419a 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -175,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define UV_GLOBAL_MMR32_PNODE_BITS(p)   ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
 #define UV_GLOBAL_MMR64_PNODE_BITS(p)                                   \
-        ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+        (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
 #define UV_APIC_PNODE_SHIFT     6
@@ -327,6 +327,7 @@ struct uv_blade_info {
        unsigned short  nr_possible_cpus;
        unsigned short  nr_online_cpus;
        unsigned short  pnode;
+        short           memory_nid;
 };
 extern struct uv_blade_info *uv_blade_info;
 extern short *uv_node_to_blade;
@@ -363,6 +364,12 @@ static inline int uv_blade_to_pnode(int bid)
        return uv_blade_info[bid].pnode;
 }
+/* Nid of memory node on blade. -1 if no blade-local memory */
+static inline int uv_blade_to_memory_nid(int bid)
+{
+        return uv_blade_info[bid].memory_nid;
+}
 /* Determine the number of possible cpus on a blade */
 static inline int uv_blade_nr_possible_cpus(int bid)
 {
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ce31c1af854f..67e929b89875 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -833,106 +833,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
 extern int es7000_plat;
 #endif
-static struct {
-        int gsi_base;
-        int gsi_end;
-} mp_ioapic_routing[MAX_IO_APICS];
-int mp_find_ioapic(int gsi)
-{
-        int i = 0;
-        /* Find the IOAPIC that manages this GSI. */
-        for (i = 0; i < nr_ioapics; i++) {
-                if ((gsi >= mp_ioapic_routing[i].gsi_base)
-                    && (gsi <= mp_ioapic_routing[i].gsi_end))
-                        return i;
-        }
-        printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-        return -1;
-}
-int mp_find_ioapic_pin(int ioapic, int gsi)
-{
-        if (WARN_ON(ioapic == -1))
-                return -1;
-        if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end))
-                return -1;
-        return gsi - mp_ioapic_routing[ioapic].gsi_base;
-}
-static u8 __init uniq_ioapic_id(u8 id)
-{
-#ifdef CONFIG_X86_32
-        if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
-            !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-                return io_apic_get_unique_id(nr_ioapics, id);
-        else
-                return id;
-#else
-        int i;
-        DECLARE_BITMAP(used, 256);
-        bitmap_zero(used, 256);
-        for (i = 0; i < nr_ioapics; i++) {
-                struct mpc_ioapic *ia = &mp_ioapics[i];
-                __set_bit(ia->apicid, used);
-        }
-        if (!test_bit(id, used))
-                return id;
-        return find_first_zero_bit(used, 256);
-#endif
-}
-static int bad_ioapic(unsigned long address)
-{
-        if (nr_ioapics >= MAX_IO_APICS) {
-                printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
-                panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-        }
-        if (!address) {
-                printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-                       " found in table, skipping!\n");
-                return 1;
-        }
-        return 0;
-}
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
-{
-        int idx = 0;
-        if (bad_ioapic(address))
-                return;
-        idx = nr_ioapics;
-        mp_ioapics[idx].type = MP_IOAPIC;
-        mp_ioapics[idx].flags = MPC_APIC_USABLE;
-        mp_ioapics[idx].apicaddr = address;
-        set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-        mp_ioapics[idx].apicid = uniq_ioapic_id(id);
-        mp_ioapics[idx].apicver = io_apic_get_version(idx);
-        /*
-         * Build basic GSI lookup table to facilitate gsi->io_apic lookups
-         * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
-         */
-        mp_ioapic_routing[idx].gsi_base = gsi_base;
-        mp_ioapic_routing[idx].gsi_end = gsi_base +
-            io_apic_get_redir_entries(idx);
-        printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
-               "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
-               mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
-               mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
-        nr_ioapics++;
-}
 int __init acpi_probe_gsi(void)
 {
        int idx;
@@ -947,7 +847,7 @@ int __init acpi_probe_gsi(void)
        max_gsi = 0;
        for (idx = 0; idx < nr_ioapics; idx++) {
-                gsi = mp_ioapic_routing[idx].gsi_end;
+                gsi = mp_gsi_routing[idx].gsi_end;
                if (gsi > max_gsi)
                        max_gsi = gsi;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 420f95da7bf6..89174f847b49 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
        return ret && es7000_apic_is_cluster();
 }
-struct apic apic_es7000_cluster = {
+/* We've been warned by a false positive warning.Use __refdata to keep calm. */
+struct apic __refdata apic_es7000_cluster = {
        .name                           = "es7000",
        .probe                          = probe_es7000,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index d836b4d347e6..3c8f9e75d038 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -87,6 +87,9 @@ int nr_ioapic_registers[MAX_IO_APICS];
 struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
+/* IO APIC gsi routing info */
+struct mp_ioapic_gsi  mp_gsi_routing[MAX_IO_APICS];
 /* MP IRQ source entries */
 struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
@@ -3736,6 +3739,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
        mmr_pnode = uv_blade_to_pnode(mmr_blade);
        uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+        if (cfg->move_in_progress)
+                send_cleanup_vector(cfg);
        return irq;
 }
@@ -3885,11 +3891,28 @@ int io_apic_set_pci_routing(struct device *dev, int irq,
        return __io_apic_set_pci_routing(dev, irq, irq_attr);
 }
-/* --------------------------------------------------------------------------
+u8 __init io_apic_unique_id(u8 id)
-                          ACPI-based IOAPIC Configuration
+{
-   -------------------------------------------------------------------------- */
+#ifdef CONFIG_X86_32
+        if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+            !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+                return io_apic_get_unique_id(nr_ioapics, id);
+        else
+                return id;
+#else
+        int i;
+        DECLARE_BITMAP(used, 256);
-#ifdef CONFIG_ACPI
+        bitmap_zero(used, 256);
+        for (i = 0; i < nr_ioapics; i++) {
+                struct mpc_ioapic *ia = &mp_ioapics[i];
+                __set_bit(ia->apicid, used);
+        }
+        if (!test_bit(id, used))
+                return id;
+        return find_first_zero_bit(used, 256);
+#endif
+}
 #ifdef CONFIG_X86_32
 int __init io_apic_get_unique_id(int ioapic, int apic_id)
@@ -3998,8 +4021,6 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
        return 0;
 }
-#endif /* CONFIG_ACPI */
 /*
 * This function currently is only a helper for the i386 smp boot process where
 * we need to reprogram the ioredtbls to cater for the cpus which have come online
@@ -4124,28 +4145,93 @@ fake_ioapic_page:
        }
 }
-static int __init ioapic_insert_resources(void)
+void __init ioapic_insert_resources(void)
 {
        int i;
        struct resource *r = ioapic_resources;
        if (!r) {
-                if (nr_ioapics > 0) {
+                if (nr_ioapics > 0)
                        printk(KERN_ERR
                                "IO APIC resources couldn't be allocated.\n");
-                        return -1;
+                return;
-                }
-                return 0;
        }
        for (i = 0; i < nr_ioapics; i++) {
                insert_resource(&iomem_resource, r);
                r++;
        }
+}
+int mp_find_ioapic(int gsi)
+{
+        int i = 0;
+        /* Find the IOAPIC that manages this GSI. */
+        for (i = 0; i < nr_ioapics; i++) {
+                if ((gsi >= mp_gsi_routing[i].gsi_base)
+                    && (gsi <= mp_gsi_routing[i].gsi_end))
+                        return i;
+        }
+        printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+        return -1;
+}
+int mp_find_ioapic_pin(int ioapic, int gsi)
+{
+        if (WARN_ON(ioapic == -1))
+                return -1;
+        if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end))
+                return -1;
+        return gsi - mp_gsi_routing[ioapic].gsi_base;
+}
+static int bad_ioapic(unsigned long address)
+{
+        if (nr_ioapics >= MAX_IO_APICS) {
+                printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded "
+                       "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
+                return 1;
+        }
+        if (!address) {
+                printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
+                       " found in table, skipping!\n");
+                return 1;
+        }
        return 0;
 }
-/* Insert the IO APIC resources after PCI initialization has occured to handle
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
- * IO APICS that are mapped in on a BAR in PCI space. */
+{
-late_initcall(ioapic_insert_resources);
+        int idx = 0;
+        if (bad_ioapic(address))
+                return;
+        idx = nr_ioapics;
+        mp_ioapics[idx].type = MP_IOAPIC;
+        mp_ioapics[idx].flags = MPC_APIC_USABLE;
+        mp_ioapics[idx].apicaddr = address;
+        set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+        mp_ioapics[idx].apicid = io_apic_unique_id(id);
+        mp_ioapics[idx].apicver = io_apic_get_version(idx);
+        /*
+         * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+         * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+         */
+        mp_gsi_routing[idx].gsi_base = gsi_base;
+        mp_gsi_routing[idx].gsi_end = gsi_base +
+            io_apic_get_redir_entries(idx);
+        printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+               "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
+               mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
+               mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end);
+        nr_ioapics++;
+}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index dbf5445727a9..6ef00ba4c886 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
        unsigned long mask = cpumask_bits(cpumask)[0];
        unsigned long flags;
+        if (WARN_ONCE(!mask, "empty IPI mask"))
+                return;
        local_irq_save(flags);
        WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
        __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 533e59c6fc82..ca96e68f0d23 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void)
                (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
 }
-struct apic apic_numaq = {
+/* Use __refdata to keep false positive warning calm.   */
+struct apic __refdata apic_numaq = {
        .name                           = "NUMAQ",
        .probe                          = probe_numaq,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8e4cbb255c38..a5371ec36776 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
        return x2apic_enabled();
 }
-/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
+/*
+ * need to use more than cpu 0, because we need more vectors when
+ * MSI-X are used.
+ */
 static const struct cpumask *x2apic_target_cpus(void)
 {
-        return cpumask_of(0);
+        return cpu_online_mask;
 }
 /*
@@ -170,7 +172,7 @@ static unsigned long set_apic_id(unsigned int id)
 static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
 {
-        return current_cpu_data.initial_apicid >> index_msb;
+        return initial_apicid >> index_msb;
 }
 static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a284359627e7..a8989aadc99a 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
                return 0;
 }
-/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
+/*
+ * need to use more than cpu 0, because we need more vectors when
+ * MSI-X are used.
+ */
 static const struct cpumask *x2apic_target_cpus(void)
 {
-        return cpumask_of(0);
+        return cpu_online_mask;
 }
 static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
@@ -162,7 +164,7 @@ static unsigned long set_apic_id(unsigned int id)
 static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
 {
-        return current_cpu_data.initial_apicid >> index_msb;
+        return initial_apicid >> index_msb;
 }
 static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 096d19aea2f7..601159374e87 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -46,7 +46,7 @@ static int early_get_nodeid(void)
        return node_id.s.node_id;
 }
-static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
        if (!strcmp(oem_id, "SGI")) {
                if (!strcmp(oem_table_id, "UVL"))
@@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector)
        apic_write(APIC_SELF_IPI, vector);
 }
-struct apic apic_x2apic_uv_x = {
+struct apic __refdata apic_x2apic_uv_x = {
        .name                           = "UV large system",
        .probe                          = NULL,
@@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = {
        .apic_id_registered             = uv_apic_id_registered,
        .irq_delivery_mode              = dest_Fixed,
-        .irq_dest_mode                  = 1, /* logical */
+        .irq_dest_mode                  = 0, /* physical */
        .target_cpus                    = uv_target_cpus,
        .disable_esr                    = 0,
@@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
        BUG();
 }
-static __init void map_low_mmrs(void)
-{
-        init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
-        init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
-}
 enum map_type {map_wb, map_uc};
 static __init void map_high(char *id, unsigned long base, int shift,
@@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode)
                map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
 }
-static __init void map_config_high(int max_pnode)
-{
-        union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
-        int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
-        cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
-        if (cfg.s.enable)
-                map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
-}
-static __init void map_mmr_high(int max_pnode)
-{
-        union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
-        int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
-        mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
-        if (mmr.s.enable)
-                map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
-}
 static __init void map_mmioh_high(int max_pnode)
 {
        union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -566,8 +540,6 @@ void __init uv_system_init(void)
        unsigned long mmr_base, present, paddr;
        unsigned short pnode_mask;
-        map_low_mmrs();
        m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
        m_val = m_n_config.s.m_skt;
        n_val = m_n_config.s.n_skt;
@@ -591,6 +563,8 @@ void __init uv_system_init(void)
        bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
        uv_blade_info = kmalloc(bytes, GFP_KERNEL);
        BUG_ON(!uv_blade_info);
+        for (blade = 0; blade < uv_num_possible_blades(); blade++)
+                uv_blade_info[blade].memory_nid = -1;
        get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
@@ -629,6 +603,9 @@ void __init uv_system_init(void)
                lcpu = uv_blade_info[blade].nr_possible_cpus;
                uv_blade_info[blade].nr_possible_cpus++;
+                /* Any node on the blade, else will contain -1. */
+                uv_blade_info[blade].memory_nid = nid;
                uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
                uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
                uv_cpu_hub_info(cpu)->m_val = m_val;
@@ -662,11 +639,10 @@ void __init uv_system_init(void)
                pnode = (paddr >> m_val) & pnode_mask;
                blade = boot_pnode_to_blade(pnode);
                uv_node_to_blade[nid] = blade;
+                max_pnode = max(pnode, max_pnode);
        }
        map_gru_high(max_pnode);
-        map_mmr_high(max_pnode);
-        map_config_high(max_pnode);
        map_mmioh_high(max_pnode);
        uv_cpu_init();
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 79302e9a33a4..442b5508893f 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -811,7 +811,7 @@ static int apm_do_idle(void)
        u8 ret = 0;
        int idled = 0;
        int polling;
-        int err;
+        int err = 0;
        polling = !!(current_thread_info()->status & TS_POLLING);
        if (polling) {
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3efcb2b96a15..c1f253dac155 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -7,6 +7,10 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_common.o = -pg
 endif
+# Make sure load_percpu_segment has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_common.o         := $(nostackp)
 obj-y                   := intel_cacheinfo.o addon_cpuid_features.o
 obj-y                   += proc.o capflags.o powerflags.o common.o
 obj-y                   += vmware.o hypervisor.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 28e5f5956042..63fddcd082cd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -356,7 +356,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 #endif
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
        /* check CPU config space for extended APIC ID */
-        if (c->x86 >= 0xf) {
+        if (cpu_has_apic && c->x86 >= 0xf) {
                unsigned int val;
                val = read_pci_config(0, 24, 0, 0x68);
                if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
@@ -400,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                level = cpuid_eax(1);
                if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
                        set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+                /*
+                 * Some BIOSes incorrectly force this feature, but only K8
+                 * revision D (model = 0x14) and later actually support it.
+                 */
+                if (c->x86_model < 0x14)
+                        clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
        }
        if (c->x86 == 0x10 || c->x86 == 0x11)
                set_cpu_cap(c, X86_FEATURE_REP_GOOD);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f1961c07af9a..5ce60a88027b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void)
        alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
 }
-static const struct cpu_dev *this_cpu __cpuinitdata;
+/*
+ * ->c_init hook of default_cpu (the "Unknown" vendor entry).
+ * 64-bit builds only call display_cacheinfo(); 32-bit builds name
+ * CPUs that lack CPUID as "486" or "386" according to c->x86.
+ */
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+        display_cacheinfo(c);
+#else
+        /* Only CPUs without cpuid (cpuid_level == -1) are handled here */
+        if (c->cpuid_level != -1)
+                return;
+        /* No cpuid. It must be an ancient CPU */
+        switch (c->x86) {
+        case 4:
+                strcpy(c->x86_model_id, "486");
+                break;
+        case 3:
+                strcpy(c->x86_model_id, "386");
+                break;
+        default:
+                break;
+        }
+#endif
+}
+/* Placeholder CPU descriptor: vendor "Unknown", init hook default_init(). */
+static const struct cpu_dev __cpuinitconst default_cpu = {
+        .c_init         = default_init,
+        .c_vendor       = "Unknown",
+        .c_x86_vendor   = X86_VENDOR_UNKNOWN,
+};
+/* Initially points at default_cpu. */
+static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
 DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
@@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu)
 static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_X86_64
-        display_cacheinfo(c);
-#else
-        /* Not much we can do here... */
-        /* Check if at least it has cpuid */
-        if (c->cpuid_level == -1) {
-                /* No cpuid. It must be an ancient CPU */
-                if (c->x86 == 4)
-                        strcpy(c->x86_model_id, "486");
-                else if (c->x86 == 3)
-                        strcpy(c->x86_model_id, "386");
-        }
-#endif
-}
-static const struct cpu_dev __cpuinitconst default_cpu = {
-        .c_init = default_init,
-        .c_vendor = "Unknown",
-        .c_x86_vendor = X86_VENDOR_UNKNOWN,
-};
 static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
 {
        unsigned int *v;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 484c1e5f658e..01213048f62f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1226,8 +1226,13 @@ static void mce_init(void)
 }
 /* Add per CPU specific workarounds here */
-static void mce_cpu_quirks(struct cpuinfo_x86 *c)
+static int mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
+        if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+                pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
+                return -EOPNOTSUPP;
+        }
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD) {
                if (c->x86 == 15 && banks > 4) {
@@ -1273,11 +1278,20 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
                if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
                        monarch_timeout < 0)
                        monarch_timeout = USEC_PER_SEC;
+                /*
+                 * There are also broken BIOSes on some Pentium M and
+                 * earlier systems:
+                 */
+                if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
+                        mce_bootlog = 0;
        }
        if (monarch_timeout < 0)
                monarch_timeout = 0;
        if (mce_bootlog != 0)
                mce_panic_timeout = 30;
+        return 0;
 }
 static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@ -1338,11 +1352,10 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
        if (!mce_available(c))
                return;
-        if (mce_cap_init() < 0) {
+        if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
                mce_disabled = 1;
                return;
        }
-        mce_cpu_quirks(c);
        machine_check_vector = do_machine_check;
@@ -1692,17 +1705,15 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
                                const char *buf, size_t siz)
 {
        char *p;
-        int len;
        strncpy(mce_helper, buf, sizeof(mce_helper));
        mce_helper[sizeof(mce_helper)-1] = 0;
-        len = strlen(mce_helper);
        p = strchr(mce_helper, '\n');
-        if (*p)
+        if (p)
                *p = 0;
-        return len;
+        return strlen(mce_helper) + !!p;
 }
 static ssize_t set_ignore_ce(struct sys_device *s,
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index bff8dd191dd5..5957a93e5173 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -36,6 +36,7 @@
 static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
 static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
+static DEFINE_PER_CPU(bool, thermal_throttle_active);
 static atomic_t therm_throt_en          = ATOMIC_INIT(0);
@@ -96,27 +97,33 @@ static int therm_throt_process(int curr)
 {
        unsigned int cpu = smp_processor_id();
        __u64 tmp_jiffs = get_jiffies_64();
+        bool was_throttled = __get_cpu_var(thermal_throttle_active);
+        bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr;
-        if (curr)
+        if (is_throttled)
                __get_cpu_var(thermal_throttle_count)++;
-        if (time_before64(tmp_jiffs, __get_cpu_var(next_check)))
+        if (!(was_throttled ^ is_throttled) &&
+            time_before64(tmp_jiffs, __get_cpu_var(next_check)))
                return 0;
        __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
        /* if we just entered the thermal event */
-        if (curr) {
+        if (is_throttled) {
                printk(KERN_CRIT "CPU%d: Temperature above threshold, "
-                       "cpu clock throttled (total events = %lu)\n", cpu,
+                       "cpu clock throttled (total events = %lu)\n",
-                       __get_cpu_var(thermal_throttle_count));
+                       cpu, __get_cpu_var(thermal_throttle_count));
                add_taint(TAINT_MACHINE_CHECK);
-        } else {
+                return 1;
-                printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu);
+        }
+        if (was_throttled) {
+                printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
+                return 1;
        }
-        return 1;
+        return 0;
 }
 #ifdef CONFIG_SYSFS
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 36c3dc7b8991..900332b800f8 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -55,6 +55,7 @@ struct x86_pmu {
        int             num_counters_fixed;
        int             counter_bits;
        u64             counter_mask;
+        int             apic;
        u64             max_period;
        u64             intel_ctrl;
 };
@@ -66,6 +67,52 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 };
 /*
+ * Not sure about some of these
+ */
+/*
+ * Generic PERF_COUNT_HW_* index -> P6 event-select value.
+ * Per P6_EVNTSEL_EVENT_MASK / P6_EVNTSEL_UNIT_MASK, bits 0-7 are the
+ * event code and bits 8-15 the unit mask.
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]            = 0x0079,
+  [PERF_COUNT_HW_INSTRUCTIONS]          = 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]      = 0x0f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]          = 0x012e,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]   = 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]         = 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]            = 0x0062,
+};
+/*
+ * Look up the P6 encoding of a generic hardware event index.
+ * No bounds check is done here; __hw_perf_counter_init() rejects
+ * indices >= x86_pmu.max_events before calling ->event_map().
+ */
+static u64 p6_pmu_event_map(int event)
+{
+        return p6_perfmon_event_map[event];
+}
+/*
+ * Counter setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_COUNTER                  0x0000002EULL
+/* Bit fields of a P6 event-select value that a raw event may set. */
+#define P6_EVNTSEL_EVENT_MASK           0x000000FFULL
+#define P6_EVNTSEL_UNIT_MASK            0x0000FF00ULL
+#define P6_EVNTSEL_EDGE_MASK            0x00040000ULL
+#define P6_EVNTSEL_INV_MASK             0x00800000ULL
+#define P6_EVNTSEL_COUNTER_MASK         0xFF000000ULL
+#define P6_EVNTSEL_MASK                 \
+        (P6_EVNTSEL_EVENT_MASK |        \
+         P6_EVNTSEL_UNIT_MASK  |        \
+         P6_EVNTSEL_EDGE_MASK  |        \
+         P6_EVNTSEL_INV_MASK   |        \
+         P6_EVNTSEL_COUNTER_MASK)
+/*
+ * Filter a raw event value: keep only the bits covered by
+ * P6_EVNTSEL_MASK and clear everything else.
+ */
+static u64 p6_pmu_raw_event(u64 event)
+{
+        const u64 allowed = P6_EVNTSEL_MASK;
+        return event & allowed;
+}
+/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
 static const u64 intel_perfmon_event_map[] =
@@ -567,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
 static bool reserve_pmc_hardware(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
        int i;
        if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -581,9 +629,11 @@ static bool reserve_pmc_hardware(void)
                if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
                        goto eventsel_fail;
        }
+#endif
        return true;
+#ifdef CONFIG_X86_LOCAL_APIC
 eventsel_fail:
        for (i--; i >= 0; i--)
                release_evntsel_nmi(x86_pmu.eventsel + i);
@@ -598,10 +648,12 @@ perfctr_fail:
                enable_lapic_nmi_watchdog();
        return false;
+#endif
 }
 static void release_pmc_hardware(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
        int i;
        for (i = 0; i < x86_pmu.num_counters; i++) {
@@ -611,6 +663,7 @@ static void release_pmc_hardware(void)
        if (nmi_watchdog == NMI_LOCAL_APIC)
                enable_lapic_nmi_watchdog();
+#endif
 }
 static void hw_perf_counter_destroy(struct perf_counter *counter)
@@ -666,6 +719,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 {
        struct perf_counter_attr *attr = &counter->attr;
        struct hw_perf_counter *hwc = &counter->hw;
+        u64 config;
        int err;
        if (!x86_pmu_initialized())
@@ -701,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
                hwc->sample_period = x86_pmu.max_period;
                hwc->last_period = hwc->sample_period;
                atomic64_set(&hwc->period_left, hwc->sample_period);
+        } else {
+                /*
+                 * If we have a PMU initialized but no APIC
+                 * interrupts, we cannot sample hardware
+                 * counters (user-space has to fall back and
+                 * sample via a hrtimer based software counter):
+                 */
+                if (!x86_pmu.apic)
+                        return -EOPNOTSUPP;
        }
        counter->destroy = hw_perf_counter_destroy;
@@ -718,14 +781,40 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
        if (attr->config >= x86_pmu.max_events)
                return -EINVAL;
        /*
         * The generic map:
         */
-        hwc->config |= x86_pmu.event_map(attr->config);
+        config = x86_pmu.event_map(attr->config);
+        if (config == 0)
+                return -ENOENT;
+        if (config == -1LL)
+                return -EINVAL;
+        hwc->config |= config;
        return 0;
 }
+/*
+ * Stop the P6 counters on this CPU: clear the per-cpu 'enabled' flag,
+ * then clear the enable bit in EVNTSEL0.  Does nothing when the flag
+ * is already clear.
+ */
+static void p6_pmu_disable_all(void)
+{
+        struct cpu_hw_counters *hw = &__get_cpu_var(cpu_hw_counters);
+        u64 evntsel0;
+        if (hw->enabled) {
+                hw->enabled = 0;
+                barrier();
+                /* p6 only has one enable register */
+                rdmsrl(MSR_P6_EVNTSEL0, evntsel0);
+                evntsel0 &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+                wrmsrl(MSR_P6_EVNTSEL0, evntsel0);
+        }
+}
 static void intel_pmu_disable_all(void)
 {
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@ -767,6 +856,23 @@ void hw_perf_disable(void)
        return x86_pmu.disable_all();
 }
+/*
+ * Restart the P6 counters on this CPU: set the per-cpu 'enabled' flag,
+ * then set the enable bit in EVNTSEL0.  Does nothing when the flag is
+ * already set.
+ */
+static void p6_pmu_enable_all(void)
+{
+        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+        /* u64 so the whole MSR value is kept, as in p6_pmu_disable_all() */
+        u64 val;
+        if (cpuc->enabled)
+                return;
+        cpuc->enabled = 1;
+        barrier();
+        /* p6 only has one enable register */
+        rdmsrl(MSR_P6_EVNTSEL0, val);
+        val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+        wrmsrl(MSR_P6_EVNTSEL0, val);
+}
 static void intel_pmu_enable_all(void)
 {
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
@@ -784,13 +890,13 @@ static void amd_pmu_enable_all(void)
        barrier();
        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+                struct perf_counter *counter = cpuc->counters[idx];
                u64 val;
                if (!test_bit(idx, cpuc->active_mask))
                        continue;
-                rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-                if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+                val = counter->hw.config;
-                        continue;
                val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
                wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
        }
@@ -819,16 +925,13 @@ static inline void intel_pmu_ack_status(u64 ack)
 static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
-        int err;
+        (void)checking_wrmsrl(hwc->config_base + idx,
-        err = checking_wrmsrl(hwc->config_base + idx,
                              hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
-        int err;
+        (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
-        err = checking_wrmsrl(hwc->config_base + idx,
-                              hwc->config);
 }
 static inline void
@@ -836,13 +939,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
 {
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, mask;
-        int err;
        mask = 0xfULL << (idx * 4);
        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
-        err = checking_wrmsrl(hwc->config_base, ctrl_val);
+        (void)checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+/*
+ * Disable one P6 counter by programming its event select with
+ * P6_NOP_COUNTER.  The enable bit is carried over from the per-cpu
+ * 'enabled' flag.
+ */
+static inline void
+p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+        u64 enable_bit = cpuc->enabled ? ARCH_PERFMON_EVENTSEL0_ENABLE : 0;
+        (void)checking_wrmsrl(hwc->config_base + idx,
+                              P6_NOP_COUNTER | enable_bit);
 }
 static inline void
@@ -943,6 +1057,19 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
 }
+/*
+ * Program one P6 counter with its configured event.  The enable bit is
+ * only set when the per-cpu 'enabled' flag is set.
+ */
+static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+        u64 enable_bit = cpuc->enabled ? ARCH_PERFMON_EVENTSEL0_ENABLE : 0;
+        (void)checking_wrmsrl(hwc->config_base + idx,
+                              hwc->config | enable_bit);
+}
 static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -959,8 +1086,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
        if (cpuc->enabled)
                x86_pmu_enable_counter(hwc, idx);
-        else
-                x86_pmu_disable_counter(hwc, idx);
 }
 static int
@@ -1176,6 +1301,49 @@ static void intel_pmu_reset(void)
        local_irq_restore(flags);
 }
+/*
+ * PMU interrupt handler for the P6 PMU.
+ * Walks every active counter on this CPU, updates it, and for each one
+ * whose top bit is clear treats it as overflowed: re-arms the period,
+ * reports the overflow, and disables the counter when
+ * perf_counter_overflow() returns non-zero.
+ * Returns 1 if at least one overflow was handled, 0 otherwise.
+ */
+static int p6_pmu_handle_irq(struct pt_regs *regs)
+{
+        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+        struct perf_sample_data data;
+        int idx, handled = 0;
+        data.regs = regs;
+        data.addr = 0;
+        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+                struct perf_counter *counter;
+                struct hw_perf_counter *hwc;
+                u64 val;
+                if (!test_bit(idx, cpuc->active_mask))
+                        continue;
+                counter = cpuc->counters[idx];
+                hwc = &counter->hw;
+                val = x86_perf_counter_update(counter, hwc, idx);
+                /* top bit still set: this counter did not overflow */
+                if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+                        continue;
+                /*
+                 * counter overflow
+                 */
+                handled = 1;
+                /* sample the period before it is re-armed below */
+                data.period = hwc->last_period;
+                if (x86_perf_counter_set_period(counter, hwc, idx) &&
+                    perf_counter_overflow(counter, 1, &data))
+                        p6_pmu_disable_counter(hwc, idx);
+        }
+        if (handled)
+                inc_irq_stat(apic_perf_irqs);
+        return handled;
+}
 /*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
@@ -1185,14 +1353,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 {
        struct perf_sample_data data;
        struct cpu_hw_counters *cpuc;
-        int bit, cpu, loops;
+        int bit, loops;
        u64 ack, status;
        data.regs = regs;
        data.addr = 0;
-        cpu = smp_processor_id();
+        cpuc = &__get_cpu_var(cpu_hw_counters);
-        cpuc = &per_cpu(cpu_hw_counters, cpu);
        perf_disable();
        status = intel_pmu_get_status();
@@ -1249,14 +1416,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
        struct cpu_hw_counters *cpuc;
        struct perf_counter *counter;
        struct hw_perf_counter *hwc;
-        int cpu, idx, handled = 0;
+        int idx, handled = 0;
        u64 val;
        data.regs = regs;
        data.addr = 0;
-        cpu = smp_processor_id();
+        cpuc = &__get_cpu_var(cpu_hw_counters);
-        cpuc = &per_cpu(cpu_hw_counters, cpu);
        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
                if (!test_bit(idx, cpuc->active_mask))
@@ -1299,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)
 void set_perf_counter_pending(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
        apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+#endif
 }
 void perf_counters_lapic_init(void)
 {
-        if (!x86_pmu_initialized())
+#ifdef CONFIG_X86_LOCAL_APIC
+        if (!x86_pmu.apic || !x86_pmu_initialized())
                return;
        /*
         * Always use NMI for PMU
         */
        apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
 }
 static int __kprobes
@@ -1334,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
        regs = args->regs;
+#ifdef CONFIG_X86_LOCAL_APIC
        apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
        /*
         * Can't rely on the handled return value to say it was our NMI, two
         * counters could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1353,6 +1525,33 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
        .priority               = 1
 };
+/*
+ * PMU description for P6-family CPUs; copied into x86_pmu by
+ * p6_pmu_init().  .apic defaults to 1 and is cleared there when the
+ * CPU has no APIC.
+ */
+static struct x86_pmu p6_pmu = {
+        .name                   = "p6",
+        .handle_irq             = p6_pmu_handle_irq,
+        .disable_all            = p6_pmu_disable_all,
+        .enable_all             = p6_pmu_enable_all,
+        .enable                 = p6_pmu_enable_counter,
+        .disable                = p6_pmu_disable_counter,
+        .eventsel               = MSR_P6_EVNTSEL0,
+        .perfctr                = MSR_P6_PERFCTR0,
+        .event_map              = p6_pmu_event_map,
+        .raw_event              = p6_pmu_raw_event,
+        .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
+        .apic                   = 1,
+        .max_period             = (1ULL << 31) - 1,
+        .version                = 0,
+        .num_counters           = 2,
+        /*
+         * Counters have 40 bits implemented. However they are designed such
+         * that bits [32-39] are sign extensions of bit 31. As such the
+         * effective width of a counter for P6-like PMU is 32 bits only.
+         *
+         * See IA-32 Intel Architecture Software developer manual Vol 3B
+         */
+        .counter_bits           = 32,
+        .counter_mask           = (1ULL << 32) - 1,
+};
 static struct x86_pmu intel_pmu = {
        .name                   = "Intel",
        .handle_irq             = intel_pmu_handle_irq,
@@ -1365,6 +1564,7 @@ static struct x86_pmu intel_pmu = {
        .event_map              = intel_pmu_event_map,
        .raw_event              = intel_pmu_raw_event,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+        .apic                   = 1,
        /*
         * Intel PMCs cannot be accessed sanely above 32 bit width,
         * so we install an artificial 1<<31 period regardless of
@@ -1388,10 +1588,43 @@ static struct x86_pmu amd_pmu = {
        .num_counters           = 4,
        .counter_bits           = 48,
        .counter_mask           = (1ULL << 48) - 1,
+        .apic                   = 1,
        /* use highest bit to detect overflow */
        .max_period             = (1ULL << 47) - 1,
 };
+/*
+ * Install the P6 PMU description if the boot CPU model is one of the
+ * listed P6 models.
+ * Returns 0 on success, -ENODEV for any other model.  When the CPU has
+ * no APIC the PMU is still installed, but x86_pmu.apic is cleared.
+ */
+static int p6_pmu_init(void)
+{
+        switch (boot_cpu_data.x86_model) {
+        case 1:
+        case 3:  /* Pentium Pro */
+        case 5:
+        case 6:  /* Pentium II */
+        case 7:
+        case 8:
+        case 11: /* Pentium III */
+                break;
+        case 9:
+        case 13:
+                /* Pentium M */
+                break;
+        default:
+                pr_cont("unsupported p6 CPU model %d ",
+                        boot_cpu_data.x86_model);
+                return -ENODEV;
+        }
+        x86_pmu = p6_pmu;
+        if (cpu_has_apic)
+                return 0;
+        /* No APIC: keep the PMU but flag that it cannot raise interrupts */
+        pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+        pr_info("no hardware sampling interrupt available.\n");
+        x86_pmu.apic = 0;
+        return 0;
+}
 static int intel_pmu_init(void)
 {
        union cpuid10_edx edx;
@@ -1400,8 +1633,14 @@ static int intel_pmu_init(void)
        unsigned int ebx;
        int version;
-        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+                /* check for P6 processor family */
+           if (boot_cpu_data.x86 == 6) {
+                return p6_pmu_init();
+           } else {
                return -ENODEV;
+           }
+        }
        /*
         * Check whether the Architectural PerfMon supports
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 96f7ac0bbf01..fe26ba3e3451 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -354,7 +354,7 @@ void __init efi_init(void)
         */
        c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
        if (c16) {
-                for (i = 0; i < sizeof(vendor) && *c16; ++i)
+                for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
                        vendor[i] = *c16++;
                vendor[i] = '\0';
        } else
@@ -512,7 +512,7 @@ void __init efi_enter_virtual_mode(void)
                        && end_pfn <= max_pfn_mapped))
                        va = __va(md->phys_addr);
                else
-                        va = efi_ioremap(md->phys_addr, size);
+                        va = efi_ioremap(md->phys_addr, size, md->type);
                md->virt_addr = (u64) (unsigned long) va;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 22c3b7828c50..ac0621a7ac3d 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void)
        early_runtime_code_mapping_set_exec(0);
 }
-void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
+void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
+                                 u32 type)
 {
        unsigned long last_map_pfn;
+        if (type == EFI_MEMORY_MAPPED_IO)
+                return ioremap(phys_addr, size);
        last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
        if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
                return NULL;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 8663afb56535..cc827ac9e8d3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -261,9 +261,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 * which will be freed later
 */
-#ifndef CONFIG_HOTPLUG_CPU
+__CPUINIT
-.section .init.text,"ax",@progbits
-#endif
 #ifdef CONFIG_SMP
 ENTRY(startup_32_smp)
@@ -602,7 +600,7 @@ ignore_int:
 #endif
        iret
-.section .cpuinit.data,"wa"
+        __REFDATA
 .align 4
 ENTRY(initial_code)
        .long i386_start_kernel
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 696f0e475c2d..92b7703d3d58 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -187,7 +187,7 @@ static void __init apic_intr_init(void)
 #ifdef CONFIG_X86_THERMAL_VECTOR
        alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
-#ifdef CONFIG_X86_THRESHOLD
+#ifdef CONFIG_X86_MCE_THRESHOLD
        alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 #endif
 #if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 846510b78a09..2a62d843f015 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id)
 static struct irqaction mfgptirq  = {
        .handler = mfgpt_tick,
-        .flags = IRQF_DISABLED | IRQF_NOBALANCING,
+        .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
        .name = "mfgpt-timer"
 };
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 994dd6a4a2a0..071166a4ba83 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -519,16 +519,12 @@ static void c1e_idle(void)
                if (!cpumask_test_cpu(cpu, c1e_mask)) {
                        cpumask_set_cpu(cpu, c1e_mask);
                        /*
-                         * Force broadcast so ACPI can not interfere. Needs
+                         * Force broadcast so ACPI can not interfere.
-                         * to run with interrupts enabled as it uses
-                         * smp_function_call.
                         */
-                        local_irq_enable();
                        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
                                           &cpu);
                        printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
                               cpu);
-                        local_irq_disable();
                }
                clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 4f9c55f3a7c0..03801f2f761f 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
                "adc  %5,%%edx ; "
                : "=A" (product), "=r" (tmp1), "=r" (tmp2)
                : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
+#elif defined(__x86_64__)
        __asm__ (
                "mul %%rdx ; shrd $32,%%rdx,%%rax"
                : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d2d1ce8170f0..a06e8d101844 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -3,6 +3,7 @@
 #include <linux/init.h>
 #include <linux/pm.h>
 #include <linux/efi.h>
+#include <linux/dmi.h>
 #include <acpi/reboot.h>
 #include <asm/io.h>
 #include <asm/apic.h>
@@ -17,7 +18,6 @@
 #include <asm/cpu.h>
 #ifdef CONFIG_X86_32
-# include <linux/dmi.h>
 # include <linux/ctype.h>
 # include <linux/mc146818rtc.h>
 #else
@@ -249,6 +249,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
                        DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
                },
        },
+        {       /* Handle problems with rebooting on CompuLab SBC-FITPC2 */
+                .callback = set_bios_reboot,
+                .ident = "CompuLab SBC-FITPC2",
+                .matches = {
+                        DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
+                        DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
+                },
+        },
        { }
 };
@@ -396,6 +404,46 @@ EXPORT_SYMBOL(machine_real_restart);
 #endif /* CONFIG_X86_32 */
+/*
+ * Some Apple MacBook and MacBookPro models need reboot=p to be able to
+ * reboot.  DMI callback: switch reboot_type to BOOT_CF9 unless it is
+ * already selected, and log the change.  Always returns 0.
+ */
+static int __init set_pci_reboot(const struct dmi_system_id *d)
+{
+        if (reboot_type == BOOT_CF9)
+                return 0;
+        reboot_type = BOOT_CF9;
+        printk(KERN_INFO "%s series board detected. "
+               "Selecting PCI-method for reboots.\n", d->ident);
+        return 0;
+}
+/*
+ * DMI quirk table checked by pci_reboot_init(): every entry invokes
+ * set_pci_reboot() for its vendor/product match.  The empty entry
+ * ends the table.
+ */
+static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
+        {       /* Handle problems with rebooting on Apple MacBook5 */
+                .callback = set_pci_reboot,
+                .ident = "Apple MacBook5",
+                .matches = {
+                        DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+                        DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
+                },
+        },
+        {       /* Handle problems with rebooting on Apple MacBookPro5 */
+                .callback = set_pci_reboot,
+                .ident = "Apple MacBookPro5",
+                .matches = {
+                        DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+                        DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
+                },
+        },
+        { }
+};
+/*
+ * Run the PCI-reboot DMI quirk table; registered as a core_initcall.
+ * Always returns 0.
+ */
+static int __init pci_reboot_init(void)
+{
+        dmi_check_system(pci_reboot_dmi_table);
+        return 0;
+}
+core_initcall(pci_reboot_init);
 static inline void kb_wait(void)
 {
        int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index de2cab132844..63f32d220ef2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -672,6 +672,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
                        DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
                },
        },
+        {
+        /*
+         * AMI BIOS with low memory corruption was found on Intel DG45ID board.
+         * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
+         * match only DMI_BOARD_NAME and see if there are more bad products
+         * with this vendor.
+         */
+                .callback = dmi_low_memory_corruption,
+                .ident = "AMI BIOS",
+                .matches = {
+                        DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
+                },
+        },
 #endif
        {}
 };
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 29a3eef7cf4a..07d81916f212 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -165,7 +165,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
        if (!chosen) {
                size_t vm_size = VMALLOC_END - VMALLOC_START;
-                size_t tot_size = num_possible_cpus() * PMD_SIZE;
+                size_t tot_size = nr_cpu_ids * PMD_SIZE;
                /* on non-NUMA, embedding is better */
                if (!pcpu_need_numa())
@@ -199,7 +199,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
        dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
        /* allocate pointer array and alloc large pages */
-        map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
+        map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
        pcpul_map = alloc_bootmem(map_size);
        for_each_possible_cpu(cpu) {
@@ -228,7 +228,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
        /* allocate address and map */
        pcpul_vm.flags = VM_ALLOC;
-        pcpul_vm.size = num_possible_cpus() * PMD_SIZE;
+        pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
        vm_area_register_early(&pcpul_vm, PMD_SIZE);
        for_each_possible_cpu(cpu) {
@@ -250,8 +250,8 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
                                     PMD_SIZE, pcpul_vm.addr, NULL);
        /* sort pcpul_map array for pcpu_lpage_remapped() */
-        for (i = 0; i < num_possible_cpus() - 1; i++)
+        for (i = 0; i < nr_cpu_ids - 1; i++)
-                for (j = i + 1; j < num_possible_cpus(); j++)
+                for (j = i + 1; j < nr_cpu_ids; j++)
                        if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
                                struct pcpul_ent tmp = pcpul_map[i];
                                pcpul_map[i] = pcpul_map[j];
@@ -288,7 +288,7 @@ void *pcpu_lpage_remapped(void *kaddr)
 {
        void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
        unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
-        int left = 0, right = num_possible_cpus() - 1;
+        int left = 0, right = nr_cpu_ids - 1;
        int pos;
        /* pcpul in use at all? */
@@ -377,7 +377,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
        pcpu4k_nr_static_pages = PFN_UP(static_size);
        /* unaligned allocations can't be freed, round up to page size */
-        pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+        pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
                               * sizeof(pcpu4k_pages[0]));
        pcpu4k_pages = alloc_bootmem(pages_size);
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 8ccabb8a2f6a..77b9689f8edb 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -744,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode)
                 * note that base_dest_nodeid is actually a nasid.
                 */
                ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
+                ad2->header.dest_subnodeid = 0x10; /* the LB */
                ad2->header.command = UV_NET_ENDPOINT_INTD;
                ad2->header.int_both = 1;
                /*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6e1a368d21d4..71f4368b357e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -275,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
 * use the TSC value at the transitions to calculate a pretty
 * good value for the TSC frequencty.
 */
+static inline int pit_verify_msb(unsigned char val)
+{
+        /* Ignore LSB */
+        inb(0x42);
+        return inb(0x42) == val;
+}
 static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
 {
        int count;
        u64 tsc = 0;
        for (count = 0; count < 50000; count++) {
-                /* Ignore LSB */
+                if (!pit_verify_msb(val))
-                inb(0x42);
-                if (inb(0x42) != val)
                        break;
                tsc = get_cycles();
        }
@@ -336,8 +341,7 @@ static unsigned long quick_pit_calibrate(void)
         * to do that is to just read back the 16-bit counter
         * once from the PIT.
         */
-        inb(0x42);
+        pit_verify_msb(0);
-        inb(0x42);
        if (pit_expect_msb(0xff, &tsc, &d1)) {
                for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
@@ -348,8 +352,19 @@ static unsigned long quick_pit_calibrate(void)
                         * Iterate until the error is less than 500 ppm
                         */
                        delta -= tsc;
-                        if (d1+d2 < delta >> 11)
+                        if (d1+d2 >= delta >> 11)
-                                goto success;
+                                continue;
+                        /*
+                         * Check the PIT one more time to verify that
+                         * all TSC reads were stable wrt the PIT.
+                         *
+                         * This also guarantees serialization of the
+                         * last cycle read ('d2') in pit_expect_msb.
+                         */
+                        if (!pit_verify_msb(0xfe - i))
+                                break;
+                        goto success;
                }
        }
        printk("Fast TSC calibration failed\n");
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b263423fbe2a..95a7289e4b0c 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
        ap.ds = __USER_DS;
        ap.es = __USER_DS;
        ap.fs = __KERNEL_PERCPU;
-        ap.gs = 0;
+        ap.gs = __KERNEL_STACK_CANARY;
        ap.eflags = 0;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 367e87882041..9fc178255c04 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -46,11 +46,10 @@ PHDRS {
        data PT_LOAD FLAGS(7);          /* RWE */
 #ifdef CONFIG_X86_64
        user PT_LOAD FLAGS(7);          /* RWE */
-        data.init PT_LOAD FLAGS(7);     /* RWE */
 #ifdef CONFIG_SMP
        percpu PT_LOAD FLAGS(7);        /* RWE */
 #endif
-        data.init2 PT_LOAD FLAGS(7);    /* RWE */
+        init PT_LOAD FLAGS(7);          /* RWE */
 #endif
        note PT_NOTE FLAGS(0);          /* ___ */
 }
@@ -103,72 +102,43 @@ SECTIONS
                __stop___ex_table = .;
        } :text = 0x9090
-        RODATA
+        RO_DATA(PAGE_SIZE)
        /* Data */
-        . = ALIGN(PAGE_SIZE);
        .data : AT(ADDR(.data) - LOAD_OFFSET) {
                /* Start of data section */
                _sdata = .;
-                DATA_DATA
-                CONSTRUCTORS
-#ifdef CONFIG_X86_64
+                /* init_task */
-                /* End of data section */
+                INIT_TASK_DATA(THREAD_SIZE)
-                _edata = .;
-#endif
-        } :data
 #ifdef CONFIG_X86_32
-        /* 32 bit has nosave before _edata */
+                /* 32 bit has nosave before _edata */
-        . = ALIGN(PAGE_SIZE);
+                NOSAVE_DATA
-        .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-                __nosave_begin = .;
-                *(.data.nosave)
-                . = ALIGN(PAGE_SIZE);
-                __nosave_end = .;
-        }
 #endif
-        . = ALIGN(PAGE_SIZE);
+                PAGE_ALIGNED_DATA(PAGE_SIZE)
-        .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-                *(.data.page_aligned)
                *(.data.idt)
-        }
-#ifdef CONFIG_X86_32
+                CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
-        . = ALIGN(32);
-#else
-        . = ALIGN(PAGE_SIZE);
-        . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-#endif
-        .data.cacheline_aligned :
-                AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-                *(.data.cacheline_aligned)
-        }
-        /* rarely changed data like cpu maps */
+                DATA_DATA
-#ifdef CONFIG_X86_32
+                CONSTRUCTORS
-        . = ALIGN(32);
-#else
+                /* rarely changed data like cpu maps */
-        . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
+                READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
-#endif
-        .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-                *(.data.read_mostly)
-#ifdef CONFIG_X86_32
                /* End of data section */
                _edata = .;
-#endif
+        } :data
-        }
 #ifdef CONFIG_X86_64
 #define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \
-                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+                            PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \
-                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+                            PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
 #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
 #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
@@ -234,35 +204,29 @@ SECTIONS
 #endif /* CONFIG_X86_64 */
-        /* init_task */
+        /* Init code and data - will be freed after init */
-        . = ALIGN(THREAD_SIZE);
+        . = ALIGN(PAGE_SIZE);
-        .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+        .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
-                *(.data.init_task)
+                __init_begin = .; /* paired with __init_end */
        }
-#ifdef CONFIG_X86_64
-         :data.init
-#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
        /*
-         * smp_locks might be freed after init
+         * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-         * start/end must be page aligned
+         * output PHDR, so the next output section - .init.text - should
+         * start another segment - init.
         */
-        . = ALIGN(PAGE_SIZE);
+        PERCPU_VADDR(0, :percpu)
-        .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+#endif
-                __smp_locks = .;
-                *(.smp_locks)
-                __smp_locks_end = .;
-                . = ALIGN(PAGE_SIZE);
-        }
-        /* Init code and data - will be freed after init */
-        . = ALIGN(PAGE_SIZE);
        .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-                __init_begin = .; /* paired with __init_end */
                _sinittext = .;
                INIT_TEXT
                _einittext = .;
        }
+#ifdef CONFIG_X86_64
+        :init
+#endif
        .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
                INIT_DATA
@@ -333,17 +297,7 @@ SECTIONS
        }
 #endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
-        /*
-         * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-         * output PHDR, so the next output section - __data_nosave - should
-         * start another section data.init2.  Also, pda should be at the head of
-         * percpu area.  Preallocate it and define the percpu offset symbol
-         * so that it can be accessed as a percpu variable.
-         */
-        . = ALIGN(PAGE_SIZE);
-        PERCPU_VADDR(0, :percpu)
-#else
        PERCPU(PAGE_SIZE)
 #endif
@@ -354,15 +308,22 @@ SECTIONS
                __init_end = .;
        }
+        /*
+         * smp_locks might be freed after init
+         * start/end must be page aligned
+         */
+        . = ALIGN(PAGE_SIZE);
+        .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+                __smp_locks = .;
+                *(.smp_locks)
+                __smp_locks_end = .;
+                . = ALIGN(PAGE_SIZE);
+        }
 #ifdef CONFIG_X86_64
        .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-                . = ALIGN(PAGE_SIZE);
+                NOSAVE_DATA
-                __nosave_begin = .;
+        }
-                *(.data.nosave)
-                . = ALIGN(PAGE_SIZE);
-                __nosave_end = .;
-        } :data.init2
-        /* use another section data.init2, see PERCPU_VADDR() above */
 #endif
        /* BSS */
@@ -400,8 +361,8 @@ SECTIONS
 #ifdef CONFIG_X86_32
-ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
-        "kernel image bigger than KERNEL_IMAGE_SIZE")
+           "kernel image bigger than KERNEL_IMAGE_SIZE");
 #else
 /*
 * Per-cpu symbols which need to be offset from __per_cpu_load
@@ -414,12 +375,12 @@ INIT_PER_CPU(irq_stack_union);
 /*
 * Build-time check on the image size:
 */
-ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
-        "kernel image bigger than KERNEL_IMAGE_SIZE")
+           "kernel image bigger than KERNEL_IMAGE_SIZE");
 #ifdef CONFIG_SMP
-ASSERT((per_cpu__irq_stack_union == 0),
+. = ASSERT((per_cpu__irq_stack_union == 0),
-        "irq_stack_union is not at start of per-cpu area");
+           "irq_stack_union is not at start of per-cpu area");
 #endif
 #endif /* CONFIG_X86_32 */
@@ -427,7 +388,7 @@ ASSERT((per_cpu__irq_stack_union == 0),
 #ifdef CONFIG_KEXEC
 #include <asm/kexec.h>
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-       "kexec control code size is too big")
+           "kexec control code size is too big");
 #endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4d6f0d293ee2..21f68e00524f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -104,6 +104,9 @@ static s64 __kpit_elapsed(struct kvm *kvm)
        ktime_t remaining;
        struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+        if (!ps->pit_timer.period)
+                return 0;
        /*
         * The Counter does not stop when it reaches zero. In
         * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7030b5f911bf..0ef5bb2b4043 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -489,16 +489,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
 *
 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
 * containing more mappings.
+ *
+ * Returns the number of rmap entries before the spte was added or zero if
+ * the spte was not added.
+ *
 */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
 {
        struct kvm_mmu_page *sp;
        struct kvm_rmap_desc *desc;
        unsigned long *rmapp;
-        int i;
+        int i, count = 0;
        if (!is_rmap_pte(*spte))
-                return;
+                return count;
        gfn = unalias_gfn(vcpu->kvm, gfn);
        sp = page_header(__pa(spte));
        sp->gfns[spte - sp->spt] = gfn;
@@ -515,8 +519,10 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
        } else {
                rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
                desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-                while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+                while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) {
                        desc = desc->more;
+                        count += RMAP_EXT;
+                }
                if (desc->shadow_ptes[RMAP_EXT-1]) {
                        desc->more = mmu_alloc_rmap_desc(vcpu);
                        desc = desc->more;
@@ -525,6 +531,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
                        ;
                desc->shadow_ptes[i] = spte;
        }
+        return count;
 }
 static void rmap_desc_remove_entry(unsigned long *rmapp,
@@ -754,6 +761,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
        return young;
 }
+#define RMAP_RECYCLE_THRESHOLD 1000
+static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage)
+{
+        unsigned long *rmapp;
+        gfn = unalias_gfn(vcpu->kvm, gfn);
+        rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
+        kvm_unmap_rmapp(vcpu->kvm, rmapp);
+        kvm_flush_remote_tlbs(vcpu->kvm);
+}
 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 {
        return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
@@ -1407,24 +1427,25 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 */
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 {
+        int used_pages;
+        used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
+        used_pages = max(0, used_pages);
        /*
         * If we set the number of mmu pages to be smaller be than the
         * number of actived pages , we must to free some mmu pages before we
         * change the value
         */
-        if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
+        if (used_pages > kvm_nr_mmu_pages) {
-            kvm_nr_mmu_pages) {
+                while (used_pages > kvm_nr_mmu_pages) {
-                int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
-                                       - kvm->arch.n_free_mmu_pages;
-                while (n_used_mmu_pages > kvm_nr_mmu_pages) {
                        struct kvm_mmu_page *page;
                        page = container_of(kvm->arch.active_mmu_pages.prev,
                                            struct kvm_mmu_page, link);
                        kvm_mmu_zap_page(kvm, page);
-                        n_used_mmu_pages--;
+                        used_pages--;
                }
                kvm->arch.n_free_mmu_pages = 0;
        }
@@ -1740,6 +1761,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 {
        int was_rmapped = 0;
        int was_writeble = is_writeble_pte(*shadow_pte);
+        int rmap_count;
        pgprintk("%s: spte %llx access %x write_fault %d"
                 " user_fault %d gfn %lx\n",
@@ -1781,9 +1803,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
        page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
        if (!was_rmapped) {
-                rmap_add(vcpu, shadow_pte, gfn, largepage);
+                rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage);
                if (!is_rmap_pte(*shadow_pte))
                        kvm_release_pfn_clean(pfn);
+                if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+                        rmap_recycle(vcpu, gfn, largepage);
        } else {
                if (was_writeble)
                        kvm_release_pfn_dirty(pfn);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 71510e07e69e..b1f658ad2f06 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -711,6 +711,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                svm->vmcb->control.tsc_offset += delta;
                vcpu->cpu = cpu;
                kvm_migrate_timers(vcpu);
+                svm->asid_generation = 0;
        }
        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
@@ -1031,7 +1032,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
        }
-        svm->vcpu.cpu = svm_data->cpu;
        svm->asid_generation = svm_data->asid_generation;
        svm->vmcb->control.asid = svm_data->next_asid++;
 }
@@ -2300,8 +2300,8 @@ static void pre_svm_run(struct vcpu_svm *svm)
        struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
-        if (svm->vcpu.cpu != cpu ||
+        /* FIXME: handle wraparound of asid_generation */
-            svm->asid_generation != svm_data->asid_generation)
+        if (svm->asid_generation != svm_data->asid_generation)
                new_asid(svm, svm_data);
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 356a0ce85c68..29f912927a58 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3157,8 +3157,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        enum emulation_result err = EMULATE_DONE;
-        preempt_enable();
        local_irq_enable();
+        preempt_enable();
        while (!guest_state_valid(vcpu)) {
                err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3168,7 +3168,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
                if (err != EMULATE_DONE) {
                        kvm_report_emulation_failure(vcpu, "emulation failure");
-                        return;
+                        break;
                }
                if (signal_pending(current))
@@ -3177,8 +3177,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
                        schedule();
        }
-        local_irq_disable();
        preempt_disable();
+        local_irq_disable();
        vmx->invalid_state_emulation_result = err;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fe5474aec41a..3d4529011828 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -704,11 +704,48 @@ static bool msr_mtrr_valid(unsigned msr)
        return false;
 }
+static bool valid_pat_type(unsigned t)
+{
+        return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
+}
+static bool valid_mtrr_type(unsigned t)
+{
+        return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+        int i;
+        if (!msr_mtrr_valid(msr))
+                return false;
+        if (msr == MSR_IA32_CR_PAT) {
+                for (i = 0; i < 8; i++)
+                        if (!valid_pat_type((data >> (i * 8)) & 0xff))
+                                return false;
+                return true;
+        } else if (msr == MSR_MTRRdefType) {
+                if (data & ~0xcff)
+                        return false;
+                return valid_mtrr_type(data & 0xff);
+        } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+                for (i = 0; i < 8 ; i++)
+                        if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+                                return false;
+                return true;
+        }
+        /* variable MTRRs */
+        return valid_mtrr_type(data & 0xff);
+}
 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
-        if (!msr_mtrr_valid(msr))
+        if (!mtrr_valid(vcpu, msr, data))
                return 1;
        if (msr == MSR_MTRRdefType) {
@@ -1079,14 +1116,13 @@ long kvm_arch_dev_ioctl(struct file *filp,
                if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
                        goto out;
                r = -E2BIG;
-                if (n < num_msrs_to_save)
+                if (n < msr_list.nmsrs)
                        goto out;
                r = -EFAULT;
                if (copy_to_user(user_msr_list->indices, &msrs_to_save,
                                 num_msrs_to_save * sizeof(u32)))
                        goto out;
-                if (copy_to_user(user_msr_list->indices
+                if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
-                                 + num_msrs_to_save * sizeof(u32),
                                 &emulated_msrs,
                                 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
                        goto out;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7bc65f0f62c4..d677fa9ca650 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -22,7 +22,8 @@
 *
 * So how does the kernel know it's a Guest?  We'll see that later, but let's
 * just say that we end up here where we replace the native functions various
- * "paravirt" structures with our Guest versions, then boot like normal. :*/
+ * "paravirt" structures with our Guest versions, then boot like normal.
+:*/
 /*
 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
@@ -74,7 +75,8 @@
 *
 * The Guest in our tale is a simple creature: identical to the Host but
 * behaving in simplified but equivalent ways.  In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code). :*/
+ * same kernel as the Host (or at least, built from the same source code).
+:*/
 struct lguest_data lguest_data = {
        .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
@@ -85,7 +87,8 @@ struct lguest_data lguest_data = {
        .syscall_vec = SYSCALL_VECTOR,
 };
-/*G:037 async_hcall() is pretty simple: I'm quite proud of it really.  We have a
+/*G:037
+ * async_hcall() is pretty simple: I'm quite proud of it really.  We have a
 * ring buffer of stored hypercalls which the Host will run though next time we
 * do a normal hypercall.  Each entry in the ring has 5 slots for the hypercall
 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
@@ -94,7 +97,8 @@ struct lguest_data lguest_data = {
 * If we come around to a slot which hasn't been finished, then the table is
 * full and we just make the hypercall directly.  This has the nice side
 * effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time! */
+ * which empties it for next time!
+ */
 static void async_hcall(unsigned long call, unsigned long arg1,
                        unsigned long arg2, unsigned long arg3,
                        unsigned long arg4)
@@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1,
        static unsigned int next_call;
        unsigned long flags;
-        /* Disable interrupts if not already disabled: we don't want an
+        /*
+         * Disable interrupts if not already disabled: we don't want an
         * interrupt handler making a hypercall while we're already doing
-         * one! */
+         * one!
+         */
        local_irq_save(flags);
        if (lguest_data.hcall_status[next_call] != 0xFF) {
                /* Table full, so do normal hcall which will flush table. */
@@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1,
        local_irq_restore(flags);
 }
-/*G:035 Notice the lazy_hcall() above, rather than hcall().  This is our first
+/*G:035
- * real optimization trick!
+ * Notice the lazy_hcall() above, rather than hcall().  This is our first real
+ * optimization trick!
 *
 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
 * them as a batch when lazy_mode is eventually turned off.  Because hypercalls
@@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1,
 * lguest_leave_lazy_mode().
 *
 * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing: */
+ * future processing:
+ */
 static void lazy_hcall1(unsigned long call,
                       unsigned long arg1)
 {
@@ -146,6 +154,7 @@ static void lazy_hcall1(unsigned long call,
                async_hcall(call, arg1, 0, 0, 0);
 }
+/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
 static void lazy_hcall2(unsigned long call,
                       unsigned long arg1,
                       unsigned long arg2)
@@ -181,8 +190,10 @@ static void lazy_hcall4(unsigned long call,
 }
 #endif
-/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
+/*G:036
- * issue the do-nothing hypercall to flush any stored calls. */
+ * When lazy mode is turned off reset the per-cpu lazy mode variable and then
+ * issue the do-nothing hypercall to flush any stored calls.
+:*/
 static void lguest_leave_lazy_mmu_mode(void)
 {
        kvm_hypercall0(LHCALL_FLUSH_ASYNC);
@@ -208,9 +219,11 @@ static void lguest_end_context_switch(struct task_struct *next)
 * check there before it tries to deliver an interrupt.
 */
-/* save_flags() is expected to return the processor state (ie. "flags").  The
+/*
+ * save_flags() is expected to return the processor state (ie. "flags").  The
 * flags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag.  Our "save_flags()" just returns that. */
+ * about the interrupt flag.  Our "save_flags()" just returns that.
+ */
 static unsigned long save_fl(void)
 {
        return lguest_data.irq_enabled;
@@ -222,13 +235,15 @@ static void irq_disable(void)
        lguest_data.irq_enabled = 0;
 }
-/* Let's pause a moment.  Remember how I said these are called so often?
+/*
+ * Let's pause a moment.  Remember how I said these are called so often?
 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
 * break some rules.  In particular, these functions are assumed to save their
 * own registers if they need to: normal C functions assume they can trash the
 * eax register.  To use normal C functions, we use
 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
- * C function, then restores it. */
+ * C function, then restores it.
+ */
 PV_CALLEE_SAVE_REGS_THUNK(save_fl);
 PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
 /*:*/
@@ -237,18 +252,18 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
 extern void lg_irq_enable(void);
 extern void lg_restore_fl(unsigned long flags);
-/*M:003 Note that we don't check for outstanding interrupts when we re-enable
+/*M:003
- * them (or when we unmask an interrupt).  This seems to work for the moment,
+ * We could be more efficient in our checking of outstanding interrupts, rather
- * since interrupts are rare and we'll just get the interrupt on the next timer
+ * than using a branch.  One way would be to put the "irq_enabled" field in a
- * tick, but now we can run with CONFIG_NO_HZ, we should revisit this.  One way
+ * page by itself, and have the Host write-protect it when an interrupt comes
- * would be to put the "irq_enabled" field in a page by itself, and have the
+ * in when irqs are disabled.  There will then be a page fault as soon as
- * Host write-protect it when an interrupt comes in when irqs are disabled.
+ * interrupts are re-enabled.
- * There will then be a page fault as soon as interrupts are re-enabled.
 *
 * A better method is to implement soft interrupt disable generally for x86:
 * instead of disabling interrupts, we set a flag.  If an interrupt does come
 * in, we then disable them for real.  This is uncommon, so we could simply use
- * a hypercall for interrupt control and not worry about efficiency. :*/
+ * a hypercall for interrupt control and not worry about efficiency.
+:*/
 /*G:034
 * The Interrupt Descriptor Table (IDT).
@@ -261,10 +276,12 @@ extern void lg_restore_fl(unsigned long flags);
 static void lguest_write_idt_entry(gate_desc *dt,
                                   int entrynum, const gate_desc *g)
 {
-        /* The gate_desc structure is 8 bytes long: we hand it to the Host in
+        /*
+         * The gate_desc structure is 8 bytes long: we hand it to the Host in
         * two 32-bit chunks.  The whole 32-bit kernel used to hand descriptors
         * around like this; typesafety wasn't a big concern in Linux's early
-         * years. */
+         * years.
+         */
        u32 *desc = (u32 *)g;
        /* Keep the local copy up to date. */
        native_write_idt_entry(dt, entrynum, g);
@@ -272,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt,
        kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
 }
-/* Changing to a different IDT is very rare: we keep the IDT up-to-date every
+/*
+ * Changing to a different IDT is very rare: we keep the IDT up-to-date every
 * time it is written, so we can simply loop through all entries and tell the
- * Host about them. */
+ * Host about them.
+ */
 static void lguest_load_idt(const struct desc_ptr *desc)
 {
        unsigned int i;
@@ -305,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
                kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
 }
-/* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
+/*
+ * For a single GDT entry which changes, we do the lazy thing: alter our GDT,
 * then tell the Host to reload the entire thing.  This operation is so rare
- * that this naive implementation is reasonable. */
+ * that this naive implementation is reasonable.
+ */
 static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
                                   const void *desc, int type)
 {
@@ -317,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
                       dt[entrynum].a, dt[entrynum].b);
 }
-/* OK, I lied.  There are three "thread local storage" GDT entries which change
+/*
+ * OK, I lied.  There are three "thread local storage" GDT entries which change
 * on every context switch (these three entries are how glibc implements
- * __thread variables).  So we have a hypercall specifically for this case. */
+ * __thread variables).  So we have a hypercall specifically for this case.
+ */
 static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
 {
-        /* There's one problem which normal hardware doesn't have: the Host
+        /*
+         * There's one problem which normal hardware doesn't have: the Host
         * can't handle us removing entries we're currently using.  So we clear
-         * the GS register here: if it's needed it'll be reloaded anyway. */
+         * the GS register here: if it's needed it'll be reloaded anyway.
+         */
        lazy_load_gs(0);
        lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
 }
-/*G:038 That's enough excitement for now, back to ploughing through each of
+/*G:038
- * the different pv_ops structures (we're about 1/3 of the way through).
+ * That's enough excitement for now, back to ploughing through each of the
+ * different pv_ops structures (we're about 1/3 of the way through).
 *
 * This is the Local Descriptor Table, another weird Intel thingy.  Linux only
 * uses this for some strange applications like Wine.  We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault. */
+ * here, so they'll get an informative and friendly Segmentation Fault.
+ */
 static void lguest_set_ldt(const void *addr, unsigned entries)
 {
 }
-/* This loads a GDT entry into the "Task Register": that entry points to a
+/*
+ * This loads a GDT entry into the "Task Register": that entry points to a
 * structure called the Task State Segment.  Some comments scattered though the
 * kernel code indicate that this used for task switching in ages past, along
 * with blood sacrifice and astrology.
@@ -347,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries)
 * Now there's nothing interesting in here that we don't get told elsewhere.
 * But the native version uses the "ltr" instruction, which makes the Host
 * complain to the Guest about a Segmentation Fault and it'll oops.  So we
- * override the native version with a do-nothing version. */
+ * override the native version with a do-nothing version.
+ */
 static void lguest_load_tr_desc(void)
 {
 }
-/* The "cpuid" instruction is a way of querying both the CPU identity
+/*
+ * The "cpuid" instruction is a way of querying both the CPU identity
 * (manufacturer, model, etc) and its features.  It was introduced before the
 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
 * As you might imagine, after a decade and a half this treatment, it is now a
 * giant ball of hair.  Its entry in the current Intel manual runs to 28 pages.
 *
 * This instruction even it has its own Wikipedia entry.  The Wikipedia entry
- * has been translated into 4 languages.  I am not making this up!
+ * has been translated into 5 languages.  I am not making this up!
 *
 * We could get funky here and identify ourselves as "GenuineLguest", but
 * instead we just use the real "cpuid" instruction.  Then I pretty much turned
@@ -371,7 +401,8 @@ static void lguest_load_tr_desc(void)
 * Replacing the cpuid so we can turn features off is great for the kernel, but
 * anyone (including userspace) can just use the raw "cpuid" instruction and
 * the Host won't even notice since it isn't privileged.  So we try not to get
- * too worked up about it. */
+ * too worked up about it.
+ */
 static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
                         unsigned int *cx, unsigned int *dx)
 {
@@ -379,38 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
        native_cpuid(ax, bx, cx, dx);
        switch (function) {
-        case 1: /* Basic feature request. */
+        /*
-                /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
+         * CPUID 0 gives the highest legal CPUID number (and the ID string).
+         * We futureproof our code a little by sticking to known CPUID values.
+         */
+        case 0:
+                if (*ax > 5)
+                        *ax = 5;
+                break;
+        /*
+         * CPUID 1 is a basic feature request.
+         *
+         * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
+         * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
+         */
+        case 1:
                *cx &= 0x00002201;
-                /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
                *dx &= 0x07808151;
-                /* The Host can do a nice optimization if it knows that the
+                /*
+                 * The Host can do a nice optimization if it knows that the
                 * kernel mappings (addresses above 0xC0000000 or whatever
                 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
                 * flush_tlb_user() for both user and kernel mappings unless
-                 * the Page Global Enable (PGE) feature bit is set. */
+                 * the Page Global Enable (PGE) feature bit is set.
+                 */
                *dx |= 0x00002000;
-                /* We also lie, and say we're family id 5.  6 or greater
+                /*
+                 * We also lie, and say we're family id 5.  6 or greater
                 * leads to a rdmsr in early_init_intel which we can't handle.
-                 * Family ID is returned as bits 8-12 in ax. */
+                 * Family ID is returned as bits 8-11 in ax.
+                 */
                *ax &= 0xFFFFF0FF;
                *ax |= 0x00000500;
                break;
+        /*
+         * 0x80000000 returns the highest Extended Function, so we futureproof
+         * like we do above by limiting it to known fields.
+         */
        case 0x80000000:
-                /* Futureproof this a little: if they ask how much extended
-                 * processor information there is, limit it to known fields. */
                if (*ax > 0x80000008)
                        *ax = 0x80000008;
                break;
+        /*
+         * PAE systems can mark pages as non-executable.  Linux calls this the
+         * NX bit.  Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
+         * Virus Protection).  We just switch it off here, since we don't
+         * support it.
+         */
        case 0x80000001:
-                /* Here we should fix nx cap depending on host. */
-                /* For this version of PAE, we just clear NX bit. */
                *dx &= ~(1 << 20);
                break;
        }
 }
-/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
+/*
+ * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
 * it.  The Host needs to know when the Guest wants to change them, so we have
 * a whole series of functions like read_cr0() and write_cr0().
@@ -425,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 * name like "FPUTRAP bit" be a little less cryptic?
 *
 * We store cr0 locally because the Host never changes it.  The Guest sometimes
- * wants to read it and we'd prefer not to bother the Host unnecessarily. */
+ * wants to read it and we'd prefer not to bother the Host unnecessarily.
+ */
 static unsigned long current_cr0;
 static void lguest_write_cr0(unsigned long val)
 {
@@ -438,18 +495,22 @@ static unsigned long lguest_read_cr0(void)
        return current_cr0;
 }
-/* Intel provided a special instruction to clear the TS bit for people too cool
+/*
+ * Intel provided a special instruction to clear the TS bit for people too cool
 * to use write_cr0() to do it.  This "clts" instruction is faster, because all
- * the vowels have been optimized out. */
+ * the vowels have been optimized out.
+ */
 static void lguest_clts(void)
 {
        lazy_hcall1(LHCALL_TS, 0);
        current_cr0 &= ~X86_CR0_TS;
 }
-/* cr2 is the virtual address of the last page fault, which the Guest only ever
+/*
+ * cr2 is the virtual address of the last page fault, which the Guest only ever
 * reads.  The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there. */
+ * just read it out of there.
+ */
 static unsigned long lguest_read_cr2(void)
 {
        return lguest_data.cr2;
@@ -458,10 +519,12 @@ static unsigned long lguest_read_cr2(void)
 /* See lguest_set_pte() below. */
 static bool cr3_changed = false;
-/* cr3 is the current toplevel pagetable page: the principle is the same as
+/*
+ * cr3 is the current toplevel pagetable page: the principle is the same as
 * cr0.  Keep a local copy, and tell the Host when it changes.  The only
 * difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall. */
+ * to set it upon our initial hypercall.
+ */
 static void lguest_write_cr3(unsigned long cr3)
 {
        lguest_data.pgdir = cr3;
@@ -506,7 +569,7 @@ static void lguest_write_cr4(unsigned long val)
 * cr3 ---> +---------+
 *          |      --------->+---------+
 *          |         |      | PADDR1  |
- *        Top-level   |      | PADDR2  |
+ *        Mid-level   |      | PADDR2  |
 *        (PMD) page  |      |         |
 *          |         |    Lower-level |
 *          |         |    (PTE) page  |
@@ -526,21 +589,62 @@ static void lguest_write_cr4(unsigned long val)
 *    Index into top     Index into second      Offset within page
 *  page directory page    pagetable page
 *
- * The kernel spends a lot of time changing both the top-level page directory
+ * Now, unfortunately, this isn't the whole story: Intel added Physical Address
- * and lower-level pagetable pages.  The Guest doesn't know physical addresses,
+ * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
- * so while it maintains these page tables exactly like normal, it also needs
+ * These are held in 64-bit page table entries, so we can now only fit 512
- * to keep the Host informed whenever it makes a change: the Host will create
+ * entries in a page, and the neat three-level tree breaks down.
- * the real page tables based on the Guests'.
+ *
+ * The result is a four level page table:
+ *
+ * cr3 --> [ 4 Upper  ]
+ *         [   Level  ]
+ *         [  Entries ]
+ *         [(PUD Page)]---> +---------+
+ *                          |      --------->+---------+
+ *                          |         |      | PADDR1  |
+ *                        Mid-level   |      | PADDR2  |
+ *                        (PMD) page  |      |         |
+ *                          |         |    Lower-level |
+ *                          |         |    (PTE) page  |
+ *                          |         |      |         |
+ *                            ....               ....
+ *
+ *
+ * And the virtual address is decoded as:
+ *
+ *         1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ *      |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
+ * Index into    Index into mid    Index into lower    Offset within page
+ * top entries   directory page     pagetable page
+ *
+ * It's too hard to switch between these two formats at runtime, so Linux only
+ * supports one or the other depending on whether CONFIG_X86_PAE is set.  Many
+ * distributions turn it on, and not just for people with silly amounts of
+ * memory: the larger PTE entries allow room for the NX bit, which lets the
+ * kernel disable execution of pages and increase security.
+ *
+ * This was a problem for lguest, which couldn't run on these distributions;
+ * then Matias Zabaljauregui figured it all out and implemented it, and only a
+ * handful of puppies were crushed in the process!
+ *
+ * Back to our point: the kernel spends a lot of time changing both the
+ * top-level page directory and lower-level pagetable pages.  The Guest doesn't
+ * know physical addresses, so while it maintains these page tables exactly
+ * like normal, it also needs to keep the Host informed whenever it makes a
+ * change: the Host will create the real page tables based on the Guests'.
 */
-/* The Guest calls this to set a second-level entry (pte), ie. to map a page
+/*
- * into a process' address space.  We set the entry then tell the Host the
+ * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * toplevel and address this corresponds to.  The Guest uses one pagetable per
+ * a page into a process' address space.  We tell the Host the toplevel and
- * process, so we need to tell the Host which one we're changing (mm->pgd). */
+ * address this corresponds to.  The Guest uses one pagetable per process, so
+ * we need to tell the Host which one we're changing (mm->pgd).
+ */
 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
                               pte_t *ptep)
 {
 #ifdef CONFIG_X86_PAE
+        /* PAE needs to hand a 64 bit page table entry, so it uses two args. */
        lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
                    ptep->pte_low, ptep->pte_high);
 #else
@@ -548,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
 #endif
 }
+/* This is the "set and update" combo-meal-deal version. */
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep, pte_t pteval)
 {
@@ -555,10 +660,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
        lguest_pte_update(mm, addr, ptep);
 }
-/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+/*
+ * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
 * to set a middle-level entry when PAE is activated.
+ *
 * Again, we set the entry then tell the Host which page we changed,
- * and the index of the entry we changed. */
+ * and the index of the entry we changed.
+ */
 #ifdef CONFIG_X86_PAE
 static void lguest_set_pud(pud_t *pudp, pud_t pudval)
 {
@@ -577,8 +685,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 }
 #else
-/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
+/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
- * activated. */
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
        native_set_pmd(pmdp, pmdval);
@@ -587,7 +694,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 }
 #endif
-/* There are a couple of legacy places where the kernel sets a PTE, but we
+/*
+ * There are a couple of legacy places where the kernel sets a PTE, but we
 * don't know the top level any more.  This is useless for us, since we don't
 * know which pagetable is changing or what address, so we just tell the Host
 * to forget all of them.  Fortunately, this is very rare.
@@ -595,7 +703,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 * ... except in early boot when the kernel sets up the initial pagetables,
 * which makes booting astonishingly slow: 1.83 seconds!  So we don't even tell
 * the Host anything changed until we've done the first page table switch,
- * which brings boot back to 0.25 seconds. */
+ * which brings boot back to 0.25 seconds.
+ */
 static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 {
        native_set_pte(ptep, pteval);
@@ -604,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 }
 #ifdef CONFIG_X86_PAE
+/*
+ * With 64-bit PTE values, we need to be careful setting them: if we set 32
+ * bits at a time, the hardware could see a weird half-set entry.  These
+ * versions ensure we update all 64 bits at once.
+ */
 static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
        native_set_pte_atomic(ptep, pte);
@@ -611,19 +725,21 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
                lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
-void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
+                             pte_t *ptep)
 {
        native_pte_clear(mm, addr, ptep);
        lguest_pte_update(mm, addr, ptep);
 }
-void lguest_pmd_clear(pmd_t *pmdp)
+static void lguest_pmd_clear(pmd_t *pmdp)
 {
        lguest_set_pmd(pmdp, __pmd(0));
 }
 #endif
-/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
+/*
+ * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
 * native page table operations.  On native hardware you can set a new page
 * table entry whenever you want, but if you want to remove one you have to do
 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@ -632,24 +748,29 @@ void lguest_pmd_clear(pmd_t *pmdp)
 * called when a valid entry is written, not when it's removed (ie. marked not
 * present).  Instead, this is where we come when the Guest wants to remove a
 * page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero). */
+ * bit is zero).
+ */
 static void lguest_flush_tlb_single(unsigned long addr)
 {
        /* Simply set it to zero: if it was not, it will fault back in. */
        lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
 }
-/* This is what happens after the Guest has removed a large number of entries.
+/*
+ * This is what happens after the Guest has removed a large number of entries.
 * This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET. */
+ * have changed, ie. virtual addresses below PAGE_OFFSET.
+ */
 static void lguest_flush_tlb_user(void)
 {
        lazy_hcall1(LHCALL_FLUSH_TLB, 0);
 }
-/* This is called when the kernel page tables have changed.  That's not very
+/*
+ * This is called when the kernel page tables have changed.  That's not very
 * common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above. */
+ * slow), so it's worth separating this from the user flushing above.
+ */
 static void lguest_flush_tlb_kernel(void)
 {
        lazy_hcall1(LHCALL_FLUSH_TLB, 1);
@@ -686,26 +807,38 @@ static struct irq_chip lguest_irq_controller = {
        .unmask         = enable_lguest_irq,
 };
-/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
+/*
+ * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
 * interrupt (except 128, which is used for system calls), and then tells the
 * Linux infrastructure that each interrupt is controlled by our level-based
- * lguest interrupt controller. */
+ * lguest interrupt controller.
+ */
 static void __init lguest_init_IRQ(void)
 {
        unsigned int i;
        for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-                /* Some systems map "vectors" to interrupts weirdly.  Lguest has
+                /* Some systems map "vectors" to interrupts weirdly.  Not us! */
-                 * a straightforward 1 to 1 mapping, so force that here. */
                __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
                if (i != SYSCALL_VECTOR)
                        set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
        }
-        /* This call is required to set up for 4k stacks, where we have
-         * separate stacks for hard and soft interrupts. */
+        /*
+         * This call is required to set up for 4k stacks, where we have
+         * separate stacks for hard and soft interrupts.
+         */
        irq_ctx_init(smp_processor_id());
 }
+/*
+ * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
+ * rather than set them in lguest_init_IRQ we are called here every time an
+ * lguest device needs an interrupt.
+ *
+ * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
+ * pass that up!
+ */
 void lguest_setup_irq(unsigned int irq)
 {
        irq_to_desc_alloc_node(irq, 0);
@@ -724,31 +857,39 @@ static unsigned long lguest_get_wallclock(void)
        return lguest_data.time.tv_sec;
 }
-/* The TSC is an Intel thing called the Time Stamp Counter.  The Host tells us
+/*
+ * The TSC is an Intel thing called the Time Stamp Counter.  The Host tells us
 * what speed it runs at, or 0 if it's unusable as a reliable clock source.
 * This matches what we want here: if we return 0 from this function, the x86
- * TSC clock will give up and not register itself. */
+ * TSC clock will give up and not register itself.
+ */
 static unsigned long lguest_tsc_khz(void)
 {
        return lguest_data.tsc_khz;
 }
-/* If we can't use the TSC, the kernel falls back to our lower-priority
+/*
- * "lguest_clock", where we read the time value given to us by the Host. */
+ * If we can't use the TSC, the kernel falls back to our lower-priority
+ * "lguest_clock", where we read the time value given to us by the Host.
+ */
 static cycle_t lguest_clock_read(struct clocksource *cs)
 {
        unsigned long sec, nsec;
-        /* Since the time is in two parts (seconds and nanoseconds), we risk
+        /*
+         * Since the time is in two parts (seconds and nanoseconds), we risk
         * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
         * and getting 99 and 0.  As Linux tends to come apart under the stress
-         * of time travel, we must be careful: */
+         * of time travel, we must be careful:
+         */
        do {
                /* First we read the seconds part. */
                sec = lguest_data.time.tv_sec;
-                /* This read memory barrier tells the compiler and the CPU that
+                /*
+                 * This read memory barrier tells the compiler and the CPU that
                 * this can't be reordered: we have to complete the above
-                 * before going on. */
+                 * before going on.
+                 */
                rmb();
                /* Now we read the nanoseconds part. */
                nsec = lguest_data.time.tv_nsec;
@@ -772,9 +913,11 @@ static struct clocksource lguest_clock = {
        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
 };
-/* We also need a "struct clock_event_device": Linux asks us to set it to go
+/*
+ * We also need a "struct clock_event_device": Linux asks us to set it to go
 * off some time in the future.  Actually, James Morris figured all this out, I
- * just applied the patch. */
+ * just applied the patch.
+ */
 static int lguest_clockevent_set_next_event(unsigned long delta,
                                           struct clock_event_device *evt)
 {
@@ -824,8 +967,10 @@ static struct clock_event_device lguest_clockevent = {
        .max_delta_ns           = LG_CLOCK_MAX_DELTA,
 };
-/* This is the Guest timer interrupt handler (hardware interrupt 0).  We just
+/*
- * call the clockevent infrastructure and it does whatever needs doing. */
+ * This is the Guest timer interrupt handler (hardware interrupt 0).  We just
+ * call the clockevent infrastructure and it does whatever needs doing.
+ */
 static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
 {
        unsigned long flags;
@@ -836,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
        local_irq_restore(flags);
 }
-/* At some point in the boot process, we get asked to set up our timing
+/*
+ * At some point in the boot process, we get asked to set up our timing
 * infrastructure.  The kernel doesn't expect timer interrupts before this, but
 * we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now. */
+ * lguest_data" so that timer interrupts were blocked until now.
+ */
 static void lguest_time_init(void)
 {
        /* Set up the timer interrupt (0) to go to our simple timer routine */
@@ -863,14 +1010,16 @@ static void lguest_time_init(void)
 * to work.  They're pretty simple.
 */
-/* The Guest needs to tell the Host what stack it expects traps to use.  For
+/*
+ * The Guest needs to tell the Host what stack it expects traps to use.  For
 * native hardware, this is part of the Task State Segment mentioned above in
 * lguest_load_tr_desc(), but to help hypervisors there's this special call.
 *
 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
 * segment), the privilege level (we're privilege level 1, the Host is 0 and
 * will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack. */
+ * of pages in the stack.
+ */
 static void lguest_load_sp0(struct tss_struct *tss,
                            struct thread_struct *thread)
 {
@@ -884,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value)
        /* FIXME: Implement */
 }
-/* There are times when the kernel wants to make sure that no memory writes are
+/*
+ * There are times when the kernel wants to make sure that no memory writes are
 * caught in the cache (that they've all reached real hardware devices).  This
 * doesn't matter for the Guest which has virtual hardware.
 *
@@ -898,11 +1048,13 @@ static void lguest_wbinvd(void)
 {
 }
-/* If the Guest expects to have an Advanced Programmable Interrupt Controller,
+/*
+ * If the Guest expects to have an Advanced Programmable Interrupt Controller,
 * we play dumb by ignoring writes and returning 0 for reads.  So it's no
 * longer Programmable nor Controlling anything, and I don't think 8 lines of
 * code qualifies for Advanced.  It will also never interrupt anything.  It
- * does, however, allow us to get through the Linux boot code. */
+ * does, however, allow us to get through the Linux boot code.
+ */
 #ifdef CONFIG_X86_LOCAL_APIC
 static void lguest_apic_write(u32 reg, u32 v)
 {
@@ -951,11 +1103,13 @@ static void lguest_safe_halt(void)
        kvm_hypercall0(LHCALL_HALT);
 }
-/* The SHUTDOWN hypercall takes a string to describe what's happening, and
+/*
+ * The SHUTDOWN hypercall takes a string to describe what's happening, and
 * an argument which says whether this to restart (reboot) the Guest or not.
 *
 * Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here. */
+ * rather than virtual addresses, so we use __pa() here.
+ */
 static void lguest_power_off(void)
 {
        kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
@@ -986,8 +1140,10 @@ static __init char *lguest_memory_setup(void)
         * nice to move it back to lguest_init.  Patch welcome... */
        atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-        /* The Linux bootloader header contains an "e820" memory map: the
+        /*
-         * Launcher populated the first entry with our memory limit. */
+         * The Linux bootloader header contains an "e820" memory map: the
+         * Launcher populated the first entry with our memory limit.
+         */
        e820_add_region(boot_params.e820_map[0].addr,
                          boot_params.e820_map[0].size,
                          boot_params.e820_map[0].type);
@@ -996,16 +1152,17 @@ static __init char *lguest_memory_setup(void)
        return "LGUEST";
 }
-/* We will eventually use the virtio console device to produce console output,
+/*
+ * We will eventually use the virtio console device to produce console output,
 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
- * console output. */
+ * console output.
+ */
 static __init int early_put_chars(u32 vtermno, const char *buf, int count)
 {
        char scratch[17];
        unsigned int len = count;
-        /* We use a nul-terminated string, so we have to make a copy.  Icky,
+        /* We use a nul-terminated string, so we make a copy.  Icky, huh? */
-         * huh? */
        if (len > sizeof(scratch) - 1)
                len = sizeof(scratch) - 1;
        scratch[len] = '\0';
@@ -1016,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
        return len;
 }
-/* Rebooting also tells the Host we're finished, but the RESTART flag tells the
+/*
- * Launcher to reboot us. */
+ * Rebooting also tells the Host we're finished, but the RESTART flag tells the
+ * Launcher to reboot us.
+ */
 static void lguest_restart(char *reason)
 {
        kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
@@ -1044,7 +1203,8 @@ static void lguest_restart(char *reason)
 * fit comfortably.
 *
 * First we need assembly templates of each of the patchable Guest operations,
- * and these are in i386_head.S. */
+ * and these are in i386_head.S.
+ */
 /*G:060 We construct a table from the assembler templates: */
 static const struct lguest_insns
@@ -1055,9 +1215,11 @@ static const struct lguest_insns
        [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
 };
-/* Now our patch routine is fairly simple (based on the native one in
+/*
+ * Now our patch routine is fairly simple (based on the native one in
 * paravirt.c).  If we have a replacement, we copy it in and return how much of
- * the available space we used. */
+ * the available space we used.
+ */
 static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
                             unsigned long addr, unsigned len)
 {
@@ -1069,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
        insn_len = lguest_insns[type].end - lguest_insns[type].start;
-        /* Similarly if we can't fit replacement (shouldn't happen, but let's
+        /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
-         * be thorough). */
        if (len < insn_len)
                return paravirt_patch_default(type, clobber, ibuf, addr, len);
@@ -1079,22 +1240,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
        return insn_len;
 }
-/*G:030 Once we get to lguest_init(), we know we're a Guest.  The various
+/*G:029
+ * Once we get to lguest_init(), we know we're a Guest.  The various
 * pv_ops structures in the kernel provide points for (almost) every routine we
- * have to override to avoid privileged instructions. */
+ * have to override to avoid privileged instructions.
+ */
 __init void lguest_init(void)
 {
-        /* We're under lguest, paravirt is enabled, and we're running at
+        /* We're under lguest. */
-         * privilege level 1, not 0 as normal. */
        pv_info.name = "lguest";
+        /* Paravirt is enabled. */
        pv_info.paravirt_enabled = 1;
+        /* We're running at privilege level 1, not 0 as normal. */
        pv_info.kernel_rpl = 1;
+        /* Everyone except Xen runs with this set. */
        pv_info.shared_kernel_pmd = 1;
-        /* We set up all the lguest overrides for sensitive operations.  These
+        /*
-         * are detailed with the operations themselves. */
+         * We set up all the lguest overrides for sensitive operations.  These
+         * are detailed with the operations themselves.
+         */
-        /* interrupt-related operations */
+        /* Interrupt-related operations */
        pv_irq_ops.init_IRQ = lguest_init_IRQ;
        pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
        pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
@@ -1102,11 +1269,11 @@ __init void lguest_init(void)
        pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
        pv_irq_ops.safe_halt = lguest_safe_halt;
-        /* init-time operations */
+        /* Setup operations */
        pv_init_ops.memory_setup = lguest_memory_setup;
        pv_init_ops.patch = lguest_patch;
-        /* Intercepts of various cpu instructions */
+        /* Intercepts of various CPU instructions */
        pv_cpu_ops.load_gdt = lguest_load_gdt;
        pv_cpu_ops.cpuid = lguest_cpuid;
        pv_cpu_ops.load_idt = lguest_load_idt;
@@ -1127,7 +1294,7 @@ __init void lguest_init(void)
        pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
        pv_cpu_ops.end_context_switch = lguest_end_context_switch;
-        /* pagetable management */
+        /* Pagetable management */
        pv_mmu_ops.write_cr3 = lguest_write_cr3;
        pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
        pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
@@ -1149,54 +1316,71 @@ __init void lguest_init(void)
        pv_mmu_ops.pte_update_defer = lguest_pte_update;
 #ifdef CONFIG_X86_LOCAL_APIC
-        /* apic read/write intercepts */
+        /* APIC read/write intercepts */
        set_lguest_basic_apic_ops();
 #endif
-        /* time operations */
+        /* Time operations */
        pv_time_ops.get_wallclock = lguest_get_wallclock;
        pv_time_ops.time_init = lguest_time_init;
        pv_time_ops.get_tsc_khz = lguest_tsc_khz;
-        /* Now is a good time to look at the implementations of these functions
+        /*
-         * before returning to the rest of lguest_init(). */
+         * Now is a good time to look at the implementations of these functions
+         * before returning to the rest of lguest_init().
+         */
-        /*G:070 Now we've seen all the paravirt_ops, we return to
+        /*G:070
+         * Now we've seen all the paravirt_ops, we return to
         * lguest_init() where the rest of the fairly chaotic boot setup
-         * occurs. */
+         * occurs.
+         */
-        /* The stack protector is a weird thing where gcc places a canary
+        /*
+         * The stack protector is a weird thing where gcc places a canary
         * value on the stack and then checks it on return.  This file is
         * compiled with -fno-stack-protector, so we got this far without
         * problems.  The value of the canary is kept at offset 20 from the
         * %gs register, so we need to set that up before calling C functions
-         * in other files. */
+         * in other files.
+         */
        setup_stack_canary_segment(0);
-        /* We could just call load_stack_canary_segment(), but we might as
-         * call switch_to_new_gdt() which loads the whole table and sets up
+        /*
-         * the per-cpu segment descriptor register %fs as well. */
+         * We could just call load_stack_canary_segment(), but we might as well
+         * call switch_to_new_gdt() which loads the whole table and sets up the
+         * per-cpu segment descriptor register %fs as well.
+         */
        switch_to_new_gdt(0);
-        /* As described in head_32.S, we map the first 128M of memory. */
+        /* We actually boot with all memory mapped, but let's say 128MB. */
        max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
-        /* The Host<->Guest Switcher lives at the top of our address space, and
+        /*
+         * The Host<->Guest Switcher lives at the top of our address space, and
         * the Host told us how big it is when we made LGUEST_INIT hypercall:
-         * it put the answer in lguest_data.reserve_mem  */
+         * it put the answer in lguest_data.reserve_mem
+         */
        reserve_top_address(lguest_data.reserve_mem);
-        /* If we don't initialize the lock dependency checker now, it crashes
+        /*
-         * paravirt_disable_iospace. */
+         * If we don't initialize the lock dependency checker now, it crashes
+         * paravirt_disable_iospace.
+         */
        lockdep_init();
-        /* The IDE code spends about 3 seconds probing for disks: if we reserve
+        /*
+         * The IDE code spends about 3 seconds probing for disks: if we reserve
         * all the I/O ports up front it can't get them and so doesn't probe.
         * Other device drivers are similar (but less severe).  This cuts the
-         * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */
+         * kernel boot time on my machine from 4.1 seconds to 0.45 seconds.
+         */
        paravirt_disable_iospace();
-        /* This is messy CPU setup stuff which the native boot code does before
+        /*
-         * start_kernel, so we have to do, too: */
+         * This is messy CPU setup stuff which the native boot code does before
+         * start_kernel, so we have to do, too:
+         */
        cpu_detect(&new_cpu_data);
        /* head.S usually sets up the first capability word, so do it here. */
        new_cpu_data.x86_capability[0] = cpuid_edx(1);
@@ -1213,22 +1397,28 @@ __init void lguest_init(void)
        acpi_ht = 0;
 #endif
-        /* We set the preferred console to "hvc".  This is the "hypervisor
+        /*
+         * We set the preferred console to "hvc".  This is the "hypervisor
         * virtual console" driver written by the PowerPC people, which we also
-         * adapted for lguest's use. */
+         * adapted for lguest's use.
+         */
        add_preferred_console("hvc", 0, NULL);
        /* Register our very early console. */
        virtio_cons_early_init(early_put_chars);
-        /* Last of all, we set the power management poweroff hook to point to
+        /*
+         * Last of all, we set the power management poweroff hook to point to
         * the Guest routine to power off, and the reboot hook to our restart
-         * routine. */
+         * routine.
+         */
        pm_power_off = lguest_power_off;
        machine_ops.restart = lguest_restart;
-        /* Now we're set up, call i386_start_kernel() in head32.c and we proceed
+        /*
-         * to boot as normal.  It never returns. */
+         * Now we're set up, call i386_start_kernel() in head32.c and we proceed
+         * to boot as normal.  It never returns.
+         */
        i386_start_kernel();
 }
 /*
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index a9c8cfe61cd4..27eac0faee48 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -5,7 +5,8 @@
 #include <asm/thread_info.h>
 #include <asm/processor-flags.h>
-/*G:020 Our story starts with the kernel booting into startup_32 in
+/*G:020
+ * Our story starts with the kernel booting into startup_32 in
 * arch/x86/kernel/head_32.S.  It expects a boot header, which is created by
 * the bootloader (the Launcher in our case).
 *
@@ -21,11 +22,14 @@
 * data without remembering to subtract __PAGE_OFFSET!
 *
 * The .section line puts this code in .init.text so it will be discarded after
- * boot. */
+ * boot.
+ */
 .section .init.text, "ax", @progbits
 ENTRY(lguest_entry)
-        /* We make the "initialization" hypercall now to tell the Host about
+        /*
-         * us, and also find out where it put our page tables. */
+         * We make the "initialization" hypercall now to tell the Host about
+         * us, and also find out where it put our page tables.
+         */
        movl $LHCALL_LGUEST_INIT, %eax
        movl $lguest_data - __PAGE_OFFSET, %ebx
        .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
@@ -33,13 +37,14 @@ ENTRY(lguest_entry)
        /* Set up the initial stack so we can run C code. */
        movl $(init_thread_union+THREAD_SIZE),%esp
-        /* Jumps are relative, and we're running __PAGE_OFFSET too low at the
+        /* Jumps are relative: we're running __PAGE_OFFSET too low. */
-         * moment. */
        jmp lguest_init+__PAGE_OFFSET
-/*G:055 We create a macro which puts the assembler code between lgstart_ and
+/*G:055
- * lgend_ markers.  These templates are put in the .text section: they can't be
+ * We create a macro which puts the assembler code between lgstart_ and lgend_
- * discarded after boot as we may need to patch modules, too. */
+ * markers.  These templates are put in the .text section: they can't be
+ * discarded after boot as we may need to patch modules, too.
+ */
 .text
 #define LGUEST_PATCH(name, insns...)                    \
        lgstart_##name: insns; lgend_##name:;           \
@@ -48,83 +53,103 @@ ENTRY(lguest_entry)
 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
+/*G:033
- * matter for save_fl and irq_disable later).  If we write our routines
+ * But using those wrappers is inefficient (we'll see why that doesn't matter
- * carefully in assembler, we can avoid clobbering any registers and avoid
+ * for save_fl and irq_disable later).  If we write our routines carefully in
- * jumping through the wrapper functions.
+ * assembler, we can avoid clobbering any registers and avoid jumping through
+ * the wrapper functions.
 *
 * I skipped over our first piece of assembler, but this one is worth studying
- * in a bit more detail so I'll describe in easy stages.  First, the routine
+ * in a bit more detail so I'll describe in easy stages.  First, the routine to
- * to enable interrupts: */
+ * enable interrupts:
+ */
 ENTRY(lg_irq_enable)
-        /* The reverse of irq_disable, this sets lguest_data.irq_enabled to
+        /*
-         * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
+         * The reverse of irq_disable, this sets lguest_data.irq_enabled to
+         * X86_EFLAGS_IF (ie. "Interrupts enabled").
+         */
        movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
-        /* But now we need to check if the Host wants to know: there might have
+        /*
+         * But now we need to check if the Host wants to know: there might have
         * been interrupts waiting to be delivered, in which case it will have
         * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
-         * jump to send_interrupts, otherwise we're done. */
+         * jump to send_interrupts, otherwise we're done.
+         */
        testl $0, lguest_data+LGUEST_DATA_irq_pending
        jnz send_interrupts
-        /* One cool thing about x86 is that you can do many things without using
+        /*
+         * One cool thing about x86 is that you can do many things without using
         * a register.  In this case, the normal path hasn't needed to save or
-         * restore any registers at all! */
+         * restore any registers at all!
+         */
        ret
 send_interrupts:
-        /* OK, now we need a register: eax is used for the hypercall number,
+        /*
+         * OK, now we need a register: eax is used for the hypercall number,
         * which is LHCALL_SEND_INTERRUPTS.
         *
         * We used not to bother with this pending detection at all, which was
         * much simpler.  Sooner or later the Host would realize it had to
         * send us an interrupt.  But that turns out to make performance 7
         * times worse on a simple tcp benchmark.  So now we do this the hard
-         * way. */
+         * way.
+         */
        pushl %eax
        movl $LHCALL_SEND_INTERRUPTS, %eax
-        /* This is a vmcall instruction (same thing that KVM uses).  Older
+        /*
+         * This is a vmcall instruction (same thing that KVM uses).  Older
         * assembler versions might not know the "vmcall" instruction, so we
-         * create one manually here. */
+         * create one manually here.
+         */
        .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+        /* Put eax back the way we found it. */
        popl %eax
        ret
-/* Finally, the "popf" or "restore flags" routine.  The %eax register holds the
+/*
+ * Finally, the "popf" or "restore flags" routine.  The %eax register holds the
 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
- * enabling interrupts again, if it's 0 we're leaving them off. */
+ * enabling interrupts again, if it's 0 we're leaving them off.
+ */
 ENTRY(lg_restore_fl)
        /* This is just "lguest_data.irq_enabled = flags;" */
        movl %eax, lguest_data+LGUEST_DATA_irq_enabled
-        /* Now, if the %eax value has enabled interrupts and
+        /*
+         * Now, if the %eax value has enabled interrupts and
         * lguest_data.irq_pending is set, we want to tell the Host so it can
         * deliver any outstanding interrupts.  Fortunately, both values will
         * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
         * instruction will AND them together for us.  If both are set, we
-         * jump to send_interrupts. */
+         * jump to send_interrupts.
+         */
        testl lguest_data+LGUEST_DATA_irq_pending, %eax
        jnz send_interrupts
        /* Again, the normal path has used no extra registers.  Clever, huh? */
        ret
+/*:*/
 /* These demark the EIP range where host should never deliver interrupts. */
 .global lguest_noirq_start
 .global lguest_noirq_end
-/*M:004 When the Host reflects a trap or injects an interrupt into the Guest,
+/*M:004
- * it sets the eflags interrupt bit on the stack based on
+ * When the Host reflects a trap or injects an interrupt into the Guest, it
- * lguest_data.irq_enabled, so the Guest iret logic does the right thing when
+ * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
- * restoring it.  However, when the Host sets the Guest up for direct traps,
+ * so the Guest iret logic does the right thing when restoring it.  However,
- * such as system calls, the processor is the one to push eflags onto the
+ * when the Host sets the Guest up for direct traps, such as system calls, the
- * stack, and the interrupt bit will be 1 (in reality, interrupts are always
+ * processor is the one to push eflags onto the stack, and the interrupt bit
- * enabled in the Guest).
+ * will be 1 (in reality, interrupts are always enabled in the Guest).
 *
 * This turns out to be harmless: the only trap which should happen under Linux
 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
 * regions), which has to be reflected through the Host anyway.  If another
 * trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret! :*/
+ * we'll never get to this iret!
+:*/
-/*G:045 There is one final paravirt_op that the Guest implements, and glancing
+/*G:045
- * at it you can see why I left it to last.  It's *cool*!  It's in *assembler*!
+ * There is one final paravirt_op that the Guest implements, and glancing at it
+ * you can see why I left it to last.  It's *cool*!  It's in *assembler*!
 *
 * The "iret" instruction is used to return from an interrupt or trap.  The
 * stack looks like this:
@@ -148,15 +173,18 @@ ENTRY(lg_restore_fl)
 * return to userspace or wherever.  Our solution to this is to surround the
 * code with lguest_noirq_start: and lguest_noirq_end: labels.  We tell the
 * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled. */
+ * enabled.
+ */
 ENTRY(lguest_iret)
        pushl   %eax
        movl    12(%esp), %eax
 lguest_noirq_start:
-        /* Note the %ss: segment prefix here.  Normal data accesses use the
+        /*
+         * Note the %ss: segment prefix here.  Normal data accesses use the
         * "ds" segment, but that will have already been restored for whatever
         * we're returning to (such as userspace): we can't trust it.  The %ss:
-         * prefix makes sure we use the stack segment, which is still valid. */
+         * prefix makes sure we use the stack segment, which is still valid.
+         */
        movl    %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
        popl    %eax
        iret
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 1440b9c0547e..caa24aca8115 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -89,16 +89,13 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
        rv.msrs   = msrs;
        rv.msr_no = msr_no;
-        preempt_disable();
+        this_cpu = get_cpu();
-        /*
-         * FIXME: handle the CPU we're executing on separately for now until
+        if (cpumask_test_cpu(this_cpu, mask))
-         * smp_call_function_many has been fixed to not skip it.
+                __rdmsr_on_cpu(&rv);
-         */
-        this_cpu = raw_smp_processor_id();
-        smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
        smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
-        preempt_enable();
+        put_cpu();
 }
 EXPORT_SYMBOL(rdmsr_on_cpus);
@@ -121,16 +118,13 @@ void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
        rv.msrs   = msrs;
        rv.msr_no = msr_no;
-        preempt_disable();
+        this_cpu = get_cpu();
-        /*
-         * FIXME: handle the CPU we're executing on separately for now until
+        if (cpumask_test_cpu(this_cpu, mask))
-         * smp_call_function_many has been fixed to not skip it.
+                __wrmsr_on_cpu(&rv);
-         */
-        this_cpu = raw_smp_processor_id();
-        smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
        smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
-        preempt_enable();
+        put_cpu();
 }
 EXPORT_SYMBOL(wrmsr_on_cpus);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 85307cc6e45f..bfae139182ff 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -697,7 +697,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
        if (!printk_ratelimit())
                return;
-        printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+        printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
                task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
                tsk->comm, task_pid_nr(tsk), address,
                (void *)regs->ip, (void *)regs->sp, error_code);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 58f621e81919..2112ed55e7ea 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmap);
 EXPORT_SYMBOL(kunmap);
 EXPORT_SYMBOL(kmap_atomic);
 EXPORT_SYMBOL(kunmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_prot);
 void __init set_highmem_pages_init(void)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 6176fe8f29e0..ea56b8cbb6a6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -796,7 +796,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
                return ret;
 #else
-        reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+        reserve_bootmem(phys, len, flags);
 #endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1b734d7a8966..7e600c1962db 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -591,9 +591,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
        unsigned int level;
        pte_t *kpte, old_pte;
-        if (cpa->flags & CPA_PAGES_ARRAY)
+        if (cpa->flags & CPA_PAGES_ARRAY) {
-                address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+                struct page *page = cpa->pages[cpa->curpage];
-        else if (cpa->flags & CPA_ARRAY)
+                if (unlikely(PageHighMem(page)))
+                        return 0;
+                address = (unsigned long)page_address(page);
+        } else if (cpa->flags & CPA_ARRAY)
                address = cpa->vaddr[cpa->curpage];
        else
                address = *cpa->vaddr;
@@ -697,9 +700,12 @@ static int cpa_process_alias(struct cpa_data *cpa)
         * No need to redo, when the primary call touched the direct
         * mapping already:
         */
-        if (cpa->flags & CPA_PAGES_ARRAY)
+        if (cpa->flags & CPA_PAGES_ARRAY) {
-                vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+                struct page *page = cpa->pages[cpa->curpage];
-        else if (cpa->flags & CPA_ARRAY)
+                if (unlikely(PageHighMem(page)))
+                        return 0;
+                vaddr = (unsigned long)page_address(page);
+        } else if (cpa->flags & CPA_ARRAY)
                vaddr = cpa->vaddr[cpa->curpage];
        else
                vaddr = *cpa->vaddr;
@@ -997,12 +1003,15 @@ EXPORT_SYMBOL(set_memory_array_uc);
 int _set_memory_wc(unsigned long addr, int numpages)
 {
        int ret;
+        unsigned long addr_copy = addr;
        ret = change_page_attr_set(&addr, numpages,
                                    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
        if (!ret) {
-                ret = change_page_attr_set(&addr, numpages,
+                ret = change_page_attr_set_clr(&addr_copy, numpages,
-                                    __pgprot(_PAGE_CACHE_WC), 0);
+                                               __pgprot(_PAGE_CACHE_WC),
+                                               __pgprot(_PAGE_CACHE_MASK),
+                                               0, 0, NULL);
        }
        return ret;
 }
@@ -1119,7 +1128,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
        int free_idx;
        for (i = 0; i < addrinarray; i++) {
-                start = (unsigned long)page_address(pages[i]);
+                if (PageHighMem(pages[i]))
+                        continue;
+                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
                        goto err_out;
@@ -1132,7 +1143,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
 err_out:
        free_idx = i;
        for (i = 0; i < free_idx; i++) {
-                start = (unsigned long)page_address(pages[i]);
+                if (PageHighMem(pages[i]))
+                        continue;
+                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                free_memtype(start, end);
        }
@@ -1161,7 +1174,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
                return retval;
        for (i = 0; i < addrinarray; i++) {
-                start = (unsigned long)page_address(pages[i]);
+                if (PageHighMem(pages[i]))
+                        continue;
+                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                free_memtype(start, end);
        }
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e6718bb28065..352aa9e927e2 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -623,7 +623,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
                return ret;
        if (flags != want_flags) {
-                if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) {
+                if (strict_prot ||
+                    !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
                        free_memtype(paddr, paddr + size);
                        printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
                                " for %Lx-%Lx, got %s\n",
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8e43bdd45456..ed34f5e35999 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -25,7 +25,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
        return pte;
 }
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 {
        pgtable_page_dtor(pte);
        paravirt_release_pte(page_to_pfn(pte));
@@ -33,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 }
 #if PAGETABLE_LEVELS > 2
-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        tlb_remove_page(tlb, virt_to_page(pmd));
 }
 #if PAGETABLE_LEVELS > 3
-void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 {
        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
        tlb_remove_page(tlb, virt_to_page(pud));
@@ -329,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve)
        printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
               (int)-reserve);
        __FIXADDR_TOP = -reserve - PAGE_SIZE;
-        __VMALLOC_RESERVE += reserve;
 #endif
 }
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 2dfcbf9df2ae..dbb5381f7b3b 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -79,8 +79,10 @@ static __init void bad_srat(void)
        acpi_numa = -1;
        for (i = 0; i < MAX_LOCAL_APIC; i++)
                apicid_to_node[i] = NUMA_NO_NODE;
-        for (i = 0; i < MAX_NUMNODES; i++)
+        for (i = 0; i < MAX_NUMNODES; i++) {
-                nodes_add[i].start = nodes[i].end = 0;
+                nodes[i].start = nodes[i].end = 0;
+                nodes_add[i].start = nodes_add[i].end = 0;
+        }
        remove_all_active_ranges();
 }
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 821e97017e95..c814e144a3f0 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -183,18 +183,17 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
        f->flush_mm = mm;
        f->flush_va = va;
-        cpumask_andnot(to_cpumask(f->flush_cpumask),
+        if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
-                       cpumask, cpumask_of(smp_processor_id()));
+                /*
+                 * We have to send the IPI only to
-        /*
+                 * CPUs affected.
-         * We have to send the IPI only to
+                 */
-         * CPUs affected.
+                apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
-         */
+                              INVALIDATE_TLB_VECTOR_START + sender);
-        apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
-                      INVALIDATE_TLB_VECTOR_START + sender);
-        while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
+                while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
-                cpu_relax();
+                        cpu_relax();
+        }
        f->flush_mm = NULL;
        f->flush_va = 0;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 0fb56db16d18..52e62e57fedd 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -35,6 +35,7 @@
 #include <asm/pat.h>
 #include <asm/e820.h>
 #include <asm/pci_x86.h>
+#include <asm/io_apic.h>
 static int
@@ -227,6 +228,12 @@ void __init pcibios_resource_survey(void)
        pcibios_allocate_resources(1);
        e820_reserve_resources_late();
+        /*
+         * Insert the IO APIC resources after PCI initialization has
+         * occurred to handle IO APICS that are mapped in on a BAR in
+         * PCI space, but before trying to assign unassigned pci res.
+         */
+        ioapic_insert_resources();
 }
 /**
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 172438f86a02..7410640db173 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -5,6 +5,10 @@ CFLAGS_REMOVE_time.o = -pg
 CFLAGS_REMOVE_irq.o = -pg
 endif
+# Make sure early boot has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_enlighten.o              := $(nostackp)
 obj-y           := enlighten.o setup.o multicalls.o mmu.o irq.o \
                        time.o xen-asm.o xen-asm_$(BITS).o \
                        grant-table.o suspend.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0a1700a2be9c..e90540a46a0b 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -974,10 +974,6 @@ asmlinkage void __init xen_start_kernel(void)
        xen_domain_type = XEN_PV_DOMAIN;
-        BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
-        xen_setup_features();
        /* Install Xen paravirt ops */
        pv_info = xen_info;
        pv_init_ops = xen_init_ops;
@@ -986,8 +982,15 @@ asmlinkage void __init xen_start_kernel(void)
        pv_apic_ops = xen_apic_ops;
        pv_mmu_ops = xen_mmu_ops;
-        xen_init_irq_ops();
+#ifdef CONFIG_X86_64
+        /*
+         * Setup percpu state.  We only need to do this for 64-bit
+         * because 32-bit already has %fs set properly.
+         */
+        load_percpu_segment(0);
+#endif
+        xen_init_irq_ops();
        xen_init_cpuid_mask();
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -997,6 +1000,8 @@ asmlinkage void __init xen_start_kernel(void)
        set_xen_basic_apic_ops();
 #endif
+        xen_setup_features();
        if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
                pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
                pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -1004,13 +1009,6 @@ asmlinkage void __init xen_start_kernel(void)
        machine_ops = xen_machine_ops;
-#ifdef CONFIG_X86_64
-        /*
-         * Setup percpu state.  We only need to do this for 64-bit
-         * because 32-bit already has %fs set properly.
-         */
-        load_percpu_segment(0);
-#endif
        /*
         * The only reliable way to retain the initial address of the
         * percpu gdt_page is to remember it here, so we can go and
author	Ingo Molnar <mingo@elte.hu>	2009-08-29 03:30:41 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-08-29 03:31:47 -0400
commit	eebc57f73d42095b778e899f6aa90ad050c72655 (patch)
tree	2ba80c75e9284093e6d7606dbb1b6a4bb752a2a5 /arch/x86
parent	d3a247bfb2c26f5b67367d58af7ad8c2efbbc6c1 (diff)
parent	2a4ab640d3c28c2952967e5f63ea495555bf2a5f (diff)