99 files changed, 2911 insertions, 1310 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 98a0ed52b5c3..249d1e0824b5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -247,6 +247,28 @@ config X86_HAS_BOOT_CPU_ID
        def_bool y
        depends on X86_VOYAGER
+config SPARSE_IRQ
+        bool "Support sparse irq numbering"
+        depends on PCI_MSI || HT_IRQ
+        help
+          This enables support for sparse irqs. This is useful for distro
+          kernels that want to define a high CONFIG_NR_CPUS value but still
+          want to have low kernel memory footprint on smaller machines.
+          ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
+            out the irq_desc[] array in a more NUMA-friendly way. )
+          If you don't know what to do here, say N.
+config NUMA_MIGRATE_IRQ_DESC
+        bool "Move irq desc when changing irq smp_affinity"
+        depends on SPARSE_IRQ && NUMA
+        default n
+        help
+          This enables moving irq_desc to the cpu/node that will handle the irq.
+          If you don't know what to do here, say N.
 config X86_FIND_SMP_CONFIG
        def_bool y
        depends on X86_MPPARSE || X86_VOYAGER
@@ -479,7 +501,7 @@ config HPET_TIMER
         The HPET provides a stable time base on SMP
         systems, unlike the TSC, but it is more expensive to access,
         as it is off-chip.  You can find the HPET spec at
-         <http://www.intel.com/hardwaredesign/hpetspec.htm>.
+         <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
         You can safely choose Y here.  However, HPET will only be
         activated if the platform and the BIOS support this feature.
@@ -579,19 +601,20 @@ config IOMMU_HELPER
 config MAXSMP
        bool "Configure Maximum number of SMP Processors and NUMA Nodes"
-        depends on X86_64 && SMP && BROKEN
+        depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+        select CPUMASK_OFFSTACK
        default n
        help
          Configure maximum number of CPUS and NUMA Nodes for this architecture.
          If unsure, say N.
 config NR_CPUS
-        int "Maximum number of CPUs (2-512)" if !MAXSMP
+        int "Maximum number of CPUs" if SMP && !MAXSMP
-        range 2 512
+        range 2 512 if SMP && !MAXSMP
-        depends on SMP
+        default "1" if !SMP
        default "4096" if MAXSMP
-        default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
+        default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
-        default "8"
+        default "8" if SMP
        help
          This allows you to specify the maximum number of CPUs which this
          kernel will support.  The maximum supported value is 512 and the
diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h
index ce547f24a1cd..d8dd9f537911 100644
--- a/arch/x86/include/asm/bigsmp/apic.h
+++ b/arch/x86/include/asm/bigsmp/apic.h
@@ -9,12 +9,12 @@ static inline int apic_id_registered(void)
        return (1);
 }
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus(void)
 {
 #ifdef CONFIG_SMP
-        return cpu_online_map;
+        return &cpu_online_map;
 #else
-        return cpumask_of_cpu(0);
+        return &cpumask_of_cpu(0);
 #endif
 }
@@ -79,7 +79,7 @@ static inline int apicid_to_node(int logical_apicid)
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
-        if (mps_cpu < NR_CPUS)
+        if (mps_cpu < nr_cpu_ids)
                return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
        return BAD_APICID;
@@ -94,7 +94,7 @@ extern u8 cpu_2_logical_apicid[];
 /* Mapping from cpu number to logical apicid */
 static inline int cpu_to_logical_apicid(int cpu)
 {
-        if (cpu >= NR_CPUS)
+        if (cpu >= nr_cpu_ids)
                return BAD_APICID;
        return cpu_physical_id(cpu);
 }
@@ -119,16 +119,34 @@ static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
 }
 /* As we are using single CPU as destination, pick only one CPU here */
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 {
        int cpu;
        int apicid;     
-        cpu = first_cpu(cpumask);
+        cpu = first_cpu(*cpumask);
        apicid = cpu_to_logical_apicid(cpu);
        return apicid;
 }
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                  const struct cpumask *andmask)
+{
+        int cpu;
+        /*
+         * We're using fixed IRQ delivery, so only one phys APIC ID can be
+         * returned: use the first online CPU set in both cpumask and andmask.
+         */
+        for_each_cpu_and(cpu, cpumask, andmask)
+                if (cpumask_test_cpu(cpu, cpu_online_mask))
+                        break;
+        if (cpu < nr_cpu_ids)
+                return cpu_to_logical_apicid(cpu);
+        return BAD_APICID;
+}
 static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
 {
        return cpuid_apic >> index_msb;
diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h
index 9404c535b7ec..27fcd01b3ae6 100644
--- a/arch/x86/include/asm/bigsmp/ipi.h
+++ b/arch/x86/include/asm/bigsmp/ipi.h
@@ -1,25 +1,22 @@
 #ifndef __ASM_MACH_IPI_H
 #define __ASM_MACH_IPI_H
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
 static inline void send_IPI_allbutself(int vector)
 {
-        cpumask_t mask = cpu_online_map;
+        send_IPI_mask_allbutself(cpu_online_mask, vector);
-        cpu_clear(smp_processor_id(), mask);
-        if (!cpus_empty(mask))
-                send_IPI_mask(mask, vector);
 }
 static inline void send_IPI_all(int vector)
 {
-        send_IPI_mask(cpu_online_map, vector);
+        send_IPI_mask(cpu_online_mask, vector);
 }
 #endif /* __ASM_MACH_IPI_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e6b82b17b072..dc27705f5443 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -320,16 +320,14 @@ static inline void set_intr_gate(unsigned int n, void *addr)
        _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
 }
-#define SYS_VECTOR_FREE         0
-#define SYS_VECTOR_ALLOCED      1
 extern int first_system_vector;
-extern char system_vectors[];
+/* used_vectors is a bitmap of irq vectors not managed by percpu vector_irq */
+extern unsigned long used_vectors[];
 static inline void alloc_system_vector(int vector)
 {
-        if (system_vectors[vector] == SYS_VECTOR_FREE) {
+        if (!test_bit(vector, used_vectors)) {
-                system_vectors[vector] = SYS_VECTOR_ALLOCED;
+                set_bit(vector, used_vectors);
                if (first_system_vector > vector)
                        first_system_vector = vector;
        } else
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index dc22c0733282..4035357f5b9d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -65,7 +65,7 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
                return dma_ops;
        else
                return dev->archdata.dma_ops;
-#endif /* _ASM_X86_DMA_MAPPING_H */
+#endif
 }
 /* Make sure we keep the same behaviour */
diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h
index e24ef876915f..51ac1230294e 100644
--- a/arch/x86/include/asm/es7000/apic.h
+++ b/arch/x86/include/asm/es7000/apic.h
@@ -9,14 +9,14 @@ static inline int apic_id_registered(void)
                return (1);
 }
-static inline cpumask_t target_cpus_cluster(void)
+static inline const cpumask_t *target_cpus_cluster(void)
 {
-        return CPU_MASK_ALL;
+        return cpu_all_mask;    /* static; &CPU_MASK_ALL would dangle */
 }
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus(void)
 {
-        return cpumask_of_cpu(smp_processor_id());
+        return &cpumask_of_cpu(smp_processor_id());
 }
 #define APIC_DFR_VALUE_CLUSTER          (APIC_DFR_CLUSTER)
@@ -80,9 +80,10 @@ extern int apic_version [MAX_APICS];
 static inline void setup_apic_routing(void)
 {
        int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
-        printk("Enabling APIC mode:  %s.  Using %d I/O APICs, target cpus %lx\n",
+        printk("Enabling APIC mode:  %s. Using %d I/O APICs, target cpus %lx\n",
                (apic_version[apic] == 0x14) ?
-                "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]);
+                        "Physical Cluster" : "Logical Cluster",
+                        nr_ioapics, cpus_addr(*target_cpus())[0]);
 }
 static inline int multi_timer_check(int apic, int irq)
@@ -100,7 +101,7 @@ static inline int cpu_present_to_apicid(int mps_cpu)
 {
        if (!mps_cpu)
                return boot_cpu_physical_apicid;
-        else if (mps_cpu < NR_CPUS)
+        else if (mps_cpu < nr_cpu_ids)
                return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
        else
                return BAD_APICID;
@@ -120,9 +121,9 @@ extern u8 cpu_2_logical_apicid[];
 static inline int cpu_to_logical_apicid(int cpu)
 {
 #ifdef CONFIG_SMP
-       if (cpu >= NR_CPUS)
+        if (cpu >= nr_cpu_ids)
-               return BAD_APICID;
+                return BAD_APICID;
-       return (int)cpu_2_logical_apicid[cpu];
+        return (int)cpu_2_logical_apicid[cpu];
 #else
        return logical_smp_processor_id();
 #endif
@@ -146,14 +147,15 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid)
        return (1);
 }
-static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask)
+static inline unsigned int
+cpu_mask_to_apicid_cluster(const struct cpumask *cpumask)
 {
        int num_bits_set;
        int cpus_found = 0;
        int cpu;
        int apicid;
-        num_bits_set = cpus_weight(cpumask);
+        num_bits_set = cpumask_weight(cpumask);
        /* Return id to all */
        if (num_bits_set == NR_CPUS)
                return 0xFF;
@@ -161,10 +163,10 @@ static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask)
         * The cpus in the mask must all be on the apic cluster.  If are not
         * on the same apicid cluster return default value of TARGET_CPUS.
         */
-        cpu = first_cpu(cpumask);
+        cpu = cpumask_first(cpumask);
        apicid = cpu_to_logical_apicid(cpu);
        while (cpus_found < num_bits_set) {
-                if (cpu_isset(cpu, cpumask)) {
+                if (cpumask_test_cpu(cpu, cpumask)) {
                        int new_apicid = cpu_to_logical_apicid(cpu);
                        if (apicid_cluster(apicid) !=
                                        apicid_cluster(new_apicid)){
@@ -179,14 +181,14 @@ static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask)
        return apicid;
 }
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 {
        int num_bits_set;
        int cpus_found = 0;
        int cpu;
        int apicid;
-        num_bits_set = cpus_weight(cpumask);
+        num_bits_set = cpus_weight(*cpumask);
        /* Return id to all */
        if (num_bits_set == NR_CPUS)
                return cpu_to_logical_apicid(0);
@@ -194,10 +196,52 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
         * The cpus in the mask must all be on the apic cluster.  If are not
         * on the same apicid cluster return default value of TARGET_CPUS.
         */
-        cpu = first_cpu(cpumask);
+        cpu = first_cpu(*cpumask);
+        apicid = cpu_to_logical_apicid(cpu);
+        while (cpus_found < num_bits_set) {
+                if (cpu_isset(cpu, *cpumask)) {
+                        int new_apicid = cpu_to_logical_apicid(cpu);
+                        if (apicid_cluster(apicid) !=
+                                        apicid_cluster(new_apicid)){
+                                printk ("%s: Not a valid mask!\n", __func__);
+                                return cpu_to_logical_apicid(0);
+                        }
+                        apicid = new_apicid;
+                        cpus_found++;
+                }
+                cpu++;
+        }
+        return apicid;
+}
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
+                                                  const struct cpumask *andmask)
+{
+        int num_bits_set;
+        int cpus_found = 0;
+        int cpu;
+        int apicid = cpu_to_logical_apicid(0);
+        cpumask_var_t cpumask;
+        if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
+                return apicid;
+        cpumask_and(cpumask, inmask, andmask);
+        cpumask_and(cpumask, cpumask, cpu_online_mask);
+        num_bits_set = cpumask_weight(cpumask);
+        /* Return id to all */
+        if (num_bits_set == NR_CPUS)
+                goto exit;
+        /*
+         * The cpus in the mask must all be on the apic cluster.  If are not
+         * on the same apicid cluster return default value of TARGET_CPUS.
+         */
+        cpu = cpumask_first(cpumask);
        apicid = cpu_to_logical_apicid(cpu);
        while (cpus_found < num_bits_set) {
-                if (cpu_isset(cpu, cpumask)) {
+                if (cpumask_test_cpu(cpu, cpumask)) {
                        int new_apicid = cpu_to_logical_apicid(cpu);
                        if (apicid_cluster(apicid) !=
                                        apicid_cluster(new_apicid)){
@@ -209,6 +253,8 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
                }
                cpu++;
        }
+exit:
+        free_cpumask_var(cpumask);
        return apicid;
 }
diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h
index 632a955fcc0a..7e8ed24d4b8a 100644
--- a/arch/x86/include/asm/es7000/ipi.h
+++ b/arch/x86/include/asm/es7000/ipi.h
@@ -1,24 +1,22 @@
 #ifndef __ASM_ES7000_IPI_H
 #define __ASM_ES7000_IPI_H
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
 static inline void send_IPI_allbutself(int vector)
 {
-        cpumask_t mask = cpu_online_map;
+        send_IPI_mask_allbutself(cpu_online_mask, vector);
-        cpu_clear(smp_processor_id(), mask);
-        if (!cpus_empty(mask))
-                send_IPI_mask(mask, vector);
 }
 static inline void send_IPI_all(int vector)
 {
-        send_IPI_mask(cpu_online_map, vector);
+        send_IPI_mask(cpu_online_mask, vector);
 }
 #endif /* __ASM_ES7000_IPI_H */
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
index 0ac17d33a8c7..746f37a7963a 100644
--- a/arch/x86/include/asm/genapic_32.h
+++ b/arch/x86/include/asm/genapic_32.h
@@ -24,7 +24,7 @@ struct genapic {
        int (*probe)(void);
        int (*apic_id_registered)(void);
-        cpumask_t (*target_cpus)(void);
+        const struct cpumask *(*target_cpus)(void);
        int int_delivery_mode;
        int int_dest_mode;
        int ESR_DISABLE;
@@ -57,12 +57,16 @@ struct genapic {
        unsigned (*get_apic_id)(unsigned long x);
        unsigned long apic_id_mask;
-        unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
+        unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
-        cpumask_t (*vector_allocation_domain)(int cpu);
+        unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
+                                               const struct cpumask *andmask);
+        void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
 #ifdef CONFIG_SMP
        /* ipi */
-        void (*send_IPI_mask)(cpumask_t mask, int vector);
+        void (*send_IPI_mask)(const struct cpumask *mask, int vector);
+        void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
+                                         int vector);
        void (*send_IPI_allbutself)(int vector);
        void (*send_IPI_all)(int vector);
 #endif
@@ -114,6 +118,7 @@ struct genapic {
        APICFUNC(get_apic_id)                           \
        .apic_id_mask = APIC_ID_MASK,                   \
        APICFUNC(cpu_mask_to_apicid)                    \
+        APICFUNC(cpu_mask_to_apicid_and)                \
        APICFUNC(vector_allocation_domain)              \
        APICFUNC(acpi_madt_oem_check)                   \
        IPIFUNC(send_IPI_mask)                          \
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
index 2cae011668b7..adf32fb56aa6 100644
--- a/arch/x86/include/asm/genapic_64.h
+++ b/arch/x86/include/asm/genapic_64.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_GENAPIC_64_H
 #define _ASM_X86_GENAPIC_64_H
+#include <linux/cpumask.h>
 /*
 * Copyright 2004 James Cleverdon, IBM.
 * Subject to the GNU Public License, v.2
@@ -18,16 +20,20 @@ struct genapic {
        u32 int_delivery_mode;
        u32 int_dest_mode;
        int (*apic_id_registered)(void);
-        cpumask_t (*target_cpus)(void);
+        const struct cpumask *(*target_cpus)(void);
-        cpumask_t (*vector_allocation_domain)(int cpu);
+        void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
        void (*init_apic_ldr)(void);
        /* ipi */
-        void (*send_IPI_mask)(cpumask_t mask, int vector);
+        void (*send_IPI_mask)(const struct cpumask *mask, int vector);
+        void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
+                                         int vector);
        void (*send_IPI_allbutself)(int vector);
        void (*send_IPI_all)(int vector);
        void (*send_IPI_self)(int vector);
        /* */
-        unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
+        unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
+        unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
+                                               const struct cpumask *andmask);
        unsigned int (*phys_pkg_id)(int index_msb);
        unsigned int (*get_apic_id)(unsigned long x);
        unsigned long (*set_apic_id)(unsigned int id);
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index e475e009ae5d..7a1f44ac1f17 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -198,17 +198,14 @@ extern void restore_IO_APIC_setup(void);
 extern void reinit_intr_remapped_IO_APIC(int);
 #endif
-extern int probe_nr_irqs(void);
+extern void probe_nr_irqs_gsi(void);
 #else  /* !CONFIG_X86_IO_APIC */
 #define io_apic_assign_pci_irqs 0
 static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void) { }
+static inline void ioapic_init_mappings(void)   { }
-static inline int probe_nr_irqs(void)
+static inline void probe_nr_irqs_gsi(void)      { }
-{
-        return NR_IRQS;
-}
 #endif
 #endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 295b13193f4d..a6ee9e6f530f 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -7,8 +7,6 @@ extern struct dma_mapping_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
-extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
 /* 10 seconds */
 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index f89dffb28aa9..c745a306f7d3 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -117,7 +117,8 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector,
        native_apic_mem_write(APIC_ICR, cfg);
 }
-static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
+static inline void send_IPI_mask_sequence(const struct cpumask *mask,
+                                          int vector)
 {
        unsigned long flags;
        unsigned long query_cpu;
@@ -128,11 +129,29 @@ static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
         * - mbligh
         */
        local_irq_save(flags);
-        for_each_cpu_mask_nr(query_cpu, mask) {
+        for_each_cpu(query_cpu, mask) {
                __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
                                      vector, APIC_DEST_PHYSICAL);
        }
        local_irq_restore(flags);
 }
+static inline void send_IPI_mask_allbutself(const struct cpumask *mask,
+                                            int vector)
+{
+        unsigned long flags;
+        unsigned int query_cpu;
+        unsigned int this_cpu = smp_processor_id();
+        /* See Hack comment in send_IPI_mask_sequence(); skips this_cpu */
+        local_irq_save(flags);
+        for_each_cpu(query_cpu, mask)
+                if (query_cpu != this_cpu)
+                        __send_IPI_dest_field(
+                                per_cpu(x86_cpu_to_apicid, query_cpu),
+                                vector, APIC_DEST_PHYSICAL);
+        local_irq_restore(flags);
+}
 #endif /* _ASM_X86_IPI_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 28e409fc73f3..592688ed04d3 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -33,7 +33,7 @@ static inline int irq_canonicalize(int irq)
 #ifdef CONFIG_HOTPLUG_CPU
 #include <linux/cpumask.h>
-extern void fixup_irqs(cpumask_t map);
+extern void fixup_irqs(void);
 #endif
 extern unsigned int do_IRQ(struct pt_regs *regs);
@@ -42,5 +42,6 @@ extern void native_init_IRQ(void);
 /* Interrupt vector management */
 extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
+extern int vector_used_by_percpu_irq(unsigned int vector);
 #endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f941..f7ff65032b9d 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -101,12 +101,23 @@
 #define LAST_VM86_IRQ           15
 #define invalid_vm86_irq(irq)   ((irq) < 3 || (irq) > 15)
+#define NR_IRQS_LEGACY          16
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
+#ifndef CONFIG_SPARSE_IRQ
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
 # else
 #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
 # endif
+#else
+# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
+#  define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
+# else
+#  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
+# endif
+#endif
 #elif defined(CONFIG_X86_VOYAGER)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8346be87cfa1..97215a458e5f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -21,6 +21,7 @@
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
+#include <asm/mtrr.h>
 #define KVM_MAX_VCPUS 16
 #define KVM_MEMORY_SLOTS 32
@@ -86,6 +87,7 @@
 #define KVM_MIN_FREE_MMU_PAGES 5
 #define KVM_REFILL_PAGES 25
 #define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_NR_FIXED_MTRR_REGION 88
 #define KVM_NR_VAR_MTRR 8
 extern spinlock_t kvm_lock;
@@ -180,6 +182,8 @@ struct kvm_mmu_page {
        struct list_head link;
        struct hlist_node hash_link;
+        struct list_head oos_link;
        /*
         * The following two entries are used to key the shadow page in the
         * hash table.
@@ -190,13 +194,16 @@ struct kvm_mmu_page {
        u64 *spt;
        /* hold the gfn of each spte inside spt */
        gfn_t *gfns;
-        unsigned long slot_bitmap; /* One bit set per slot which has memory
+        /*
-                                    * in this shadow page.
+         * One bit set per slot which has memory
-                                    */
+         * in this shadow page.
+         */
+        DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
        int multimapped;         /* More than one parent_pte? */
        int root_count;          /* Currently serving as active root */
        bool unsync;
-        bool unsync_children;
+        bool global;
+        unsigned int unsync_children;
        union {
                u64 *parent_pte;               /* !multimapped */
                struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
@@ -327,8 +334,10 @@ struct kvm_vcpu_arch {
        bool nmi_pending;
        bool nmi_injected;
+        bool nmi_window_open;
-        u64 mtrr[0x100];
+        struct mtrr_state_type mtrr_state;
+        u32 pat;
 };
 struct kvm_mem_alias {
@@ -350,11 +359,13 @@ struct kvm_arch{
         */
        struct list_head active_mmu_pages;
        struct list_head assigned_dev_head;
+        struct list_head oos_global_pages;
        struct dmar_domain *intel_iommu_domain;
        struct kvm_pic *vpic;
        struct kvm_ioapic *vioapic;
        struct kvm_pit *vpit;
        struct hlist_head irq_ack_notifier_list;
+        int vapics_in_nmi_mode;
        int round_robin_prev_vcpu;
        unsigned int tss_addr;
@@ -378,6 +389,7 @@ struct kvm_vm_stat {
        u32 mmu_recycled;
        u32 mmu_cache_miss;
        u32 mmu_unsync;
+        u32 mmu_unsync_global;
        u32 remote_tlb_flush;
        u32 lpages;
 };
@@ -397,6 +409,7 @@ struct kvm_vcpu_stat {
        u32 halt_exits;
        u32 halt_wakeup;
        u32 request_irq_exits;
+        u32 request_nmi_exits;
        u32 irq_exits;
        u32 host_state_reload;
        u32 efer_reload;
@@ -405,6 +418,7 @@ struct kvm_vcpu_stat {
        u32 insn_emulation_fail;
        u32 hypercalls;
        u32 irq_injections;
+        u32 nmi_injections;
 };
 struct descriptor_table {
@@ -477,6 +491,7 @@ struct kvm_x86_ops {
        int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
        int (*get_tdp_level)(void);
+        int (*get_mt_mask_shift)(void);
 };
 extern struct kvm_x86_ops *kvm_x86_ops;
@@ -490,7 +505,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 void kvm_mmu_set_base_ptes(u64 base_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-                u64 dirty_mask, u64 nx_mask, u64 x_mask);
+                u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -587,12 +602,14 @@ unsigned long segment_base(u16 selector);
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                       const u8 *new, int bytes);
+                       const u8 *new, int bytes,
+                       bool guest_initiated);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
@@ -607,6 +624,8 @@ void kvm_disable_tdp(void);
 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
 int complete_pio(struct kvm_vcpu *vcpu);
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
        struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -702,18 +721,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 }
-#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_INVEPT            ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
-#define ASM_VMX_INVVPID           ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
 #define MSR_IA32_TIME_STAMP_COUNTER             0x010
 #define TSS_IOPB_BASE_OFFSET 0x66
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 25179a29f208..6a159732881a 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -123,6 +123,7 @@ struct decode_cache {
        u8 ad_bytes;
        u8 rex_prefix;
        struct operand src;
+        struct operand src2;
        struct operand dst;
        bool has_seg_override;
        u8 seg_override;
@@ -146,22 +147,18 @@ struct x86_emulate_ctxt {
        /* Register state before/after emulation. */
        struct kvm_vcpu *vcpu;
-        /* Linear faulting address (if emulating a page-faulting instruction) */
        unsigned long eflags;
        /* Emulated execution mode, represented by an X86EMUL_MODE value. */
        int mode;
        u32 cs_base;
        /* decode cache */
        struct decode_cache decode;
 };
 /* Repeat String Operation Prefix */
-#define REPE_PREFIX  1
+#define REPE_PREFIX     1
-#define REPNE_PREFIX    2
+#define REPNE_PREFIX    2
 /* Execution mode, passed to the emulator. */
 #define X86EMUL_MODE_REAL     0 /* Real mode.             */
@@ -170,7 +167,7 @@ struct x86_emulate_ctxt {
 #define X86EMUL_MODE_PROT64   8 /* 64-bit (long) mode.    */
 /* Host execution mode. */
-#if defined(__i386__)
+#if defined(CONFIG_X86_32)
 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
 #elif defined(CONFIG_X86_64)
 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h
index 6cb3a467e067..cc09cbbee27e 100644
--- a/arch/x86/include/asm/mach-default/mach_apic.h
+++ b/arch/x86/include/asm/mach-default/mach_apic.h
@@ -8,12 +8,12 @@
 #define APIC_DFR_VALUE  (APIC_DFR_FLAT)
-static inline cpumask_t target_cpus(void)
+static inline const struct cpumask *target_cpus(void)
 { 
 #ifdef CONFIG_SMP
-        return cpu_online_map;
+        return cpu_online_mask;
 #else
-        return cpumask_of_cpu(0);
+        return cpumask_of(0);
 #endif
 } 
@@ -28,6 +28,7 @@ static inline cpumask_t target_cpus(void)
 #define apic_id_registered (genapic->apic_id_registered)
 #define init_apic_ldr (genapic->init_apic_ldr)
 #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
 #define phys_pkg_id     (genapic->phys_pkg_id)
 #define vector_allocation_domain    (genapic->vector_allocation_domain)
 #define read_apic_id()  (GET_APIC_ID(apic_read(APIC_ID)))
@@ -61,9 +62,19 @@ static inline int apic_id_registered(void)
        return physid_isset(read_apic_id(), phys_cpu_present_map);
 }
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-        return cpus_addr(cpumask)[0];
+        return cpumask_bits(cpumask)[0];
+}
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                  const struct cpumask *andmask)
+{
+        unsigned long mask1 = cpumask_bits(cpumask)[0];
+        unsigned long mask2 = cpumask_bits(andmask)[0];
+        unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
+        return (unsigned int)(mask1 & mask2 & mask3);
 }
 static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
@@ -88,7 +99,7 @@ static inline int apicid_to_node(int logical_apicid)
 #endif
 }
-static inline cpumask_t vector_allocation_domain(int cpu)
+static inline void vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -98,8 +109,7 @@ static inline cpumask_t vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-        cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+        *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
-        return domain;
 }
 #endif
@@ -131,7 +141,7 @@ static inline int cpu_to_logical_apicid(int cpu)
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
-        if (mps_cpu < NR_CPUS && cpu_present(mps_cpu))
+        if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
                return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
        else
                return BAD_APICID;
diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h
index fabca01ebacf..191312d155da 100644
--- a/arch/x86/include/asm/mach-default/mach_ipi.h
+++ b/arch/x86/include/asm/mach-default/mach_ipi.h
@@ -4,7 +4,8 @@
 /* Avoid include hell */
 #define NMI_VECTOR 0x02
-void send_IPI_mask_bitmask(cpumask_t mask, int vector);
+void send_IPI_mask_bitmask(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 void __send_IPI_shortcut(unsigned int shortcut, int vector);
 extern int no_broadcast;
@@ -12,28 +13,27 @@ extern int no_broadcast;
 #ifdef CONFIG_X86_64
 #include <asm/genapic.h>
 #define send_IPI_mask (genapic->send_IPI_mask)
+#define send_IPI_mask_allbutself (genapic->send_IPI_mask_allbutself)
 #else
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_bitmask(mask, vector);
 }
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 #endif
 static inline void __local_send_IPI_allbutself(int vector)
 {
-        if (no_broadcast || vector == NMI_VECTOR) {
+        if (no_broadcast || vector == NMI_VECTOR)
-                cpumask_t mask = cpu_online_map;
+                send_IPI_mask_allbutself(cpu_online_mask, vector);
+        else
-                cpu_clear(smp_processor_id(), mask);
-                send_IPI_mask(mask, vector);
-        } else
                __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
 }
 static inline void __local_send_IPI_all(int vector)
 {
        if (no_broadcast || vector == NMI_VECTOR)
-                send_IPI_mask(cpu_online_map, vector);
+                send_IPI_mask(cpu_online_mask, vector);
        else
                __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
 }
diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h
index e430f47df667..48553e958ad5 100644
--- a/arch/x86/include/asm/mach-generic/mach_apic.h
+++ b/arch/x86/include/asm/mach-generic/mach_apic.h
@@ -24,6 +24,7 @@
 #define check_phys_apicid_present (genapic->check_phys_apicid_present)
 #define check_apicid_used (genapic->check_apicid_used)
 #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
 #define vector_allocation_domain (genapic->vector_allocation_domain)
 #define enable_apic_mode (genapic->enable_apic_mode)
 #define phys_pkg_id (genapic->phys_pkg_id)
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 7c1e4258b31e..cb988aab716d 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -57,6 +57,31 @@ struct mtrr_gentry {
 };
 #endif /* !__i386__ */
+/* Element type of mtrr_state_type.var_ranges[]; the _lo/_hi pairs presumably
+   hold the low and high 32 bits of a range's base and mask (TODO confirm). */
+struct mtrr_var_range {
+        u32 base_lo;
+        u32 base_hi;
+        u32 mask_lo;
+        u32 mask_hi;
+};
+/* In the Intel processor's MTRR interface, the MTRR type is always held in
+   an 8 bit field: */
+typedef u8 mtrr_type;
+#define MTRR_NUM_FIXED_RANGES 88
+#define MTRR_MAX_VAR_RANGES 256
+/* MTRR state: MTRR_MAX_VAR_RANGES variable-range entries,
+   MTRR_NUM_FIXED_RANGES fixed-range types, two flag bytes and a default
+   type. */
+struct mtrr_state_type {
+        struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
+        mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
+        unsigned char enabled;
+        unsigned char have_fixed;
+        mtrr_type def_type;
+};
+/* MSR numbers for variable range 'reg': base at 0x200 + 2 * reg, mask at the
+   next number (reg 0 -> 0x200/0x201, reg 1 -> 0x202/0x203, ...). */
+#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
+#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
 /*  These are the various ioctls  */
 #define MTRRIOC_ADD_ENTRY        _IOW(MTRR_IOCTL_BASE,  0, struct mtrr_sentry)
 #define MTRRIOC_SET_ENTRY        _IOW(MTRR_IOCTL_BASE,  1, struct mtrr_sentry)
diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h
index 0bf2a06b7a4e..c80f00d29965 100644
--- a/arch/x86/include/asm/numaq/apic.h
+++ b/arch/x86/include/asm/numaq/apic.h
@@ -7,9 +7,9 @@
 #define APIC_DFR_VALUE  (APIC_DFR_CLUSTER)
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus(void)
 {
-        return CPU_MASK_ALL;
+        return &CPU_MASK_ALL;
 }
 #define NO_BALANCE_IRQ (1)
@@ -122,7 +122,13 @@ static inline void enable_apic_mode(void)
 * We use physical apicids here, not logical, so just return the default
 * physical broadcast to stop people from breaking us
 */
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+        return (int) 0xF;
+}
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                  const struct cpumask *andmask)
 {
        return (int) 0xF;
 }
diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h
index 935588d286cf..a8374c652778 100644
--- a/arch/x86/include/asm/numaq/ipi.h
+++ b/arch/x86/include/asm/numaq/ipi.h
@@ -1,25 +1,22 @@
 #ifndef __ASM_NUMAQ_IPI_H
 #define __ASM_NUMAQ_IPI_H
-void send_IPI_mask_sequence(cpumask_t, int vector);
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
 static inline void send_IPI_allbutself(int vector)
 {
-        cpumask_t mask = cpu_online_map;
+        send_IPI_mask_allbutself(cpu_online_mask, vector);
-        cpu_clear(smp_processor_id(), mask);
-        if (!cpus_empty(mask))
-                send_IPI_mask(mask, vector);
 }
 static inline void send_IPI_all(int vector)
 {
-        send_IPI_mask(cpu_online_map, vector);
+        send_IPI_mask(cpu_online_mask, vector);
 }
 #endif /* __ASM_NUMAQ_IPI_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 647781298e7e..66834c41c049 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -84,6 +84,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 static inline void early_quirks(void) { }
 #endif
+extern void pci_iommu_alloc(void);
 #endif  /* __KERNEL__ */
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index d02d936840a3..4da207982777 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -23,7 +23,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
                               int reg, int len, u32 value);
 extern void dma32_reserve_bootmem(void);
-extern void pci_iommu_alloc(void);
 /* The PCI address space does equal the physical memory
 * address space.  The networking and block device layers use
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index d12811ce51d9..830b9fcb6427 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -60,7 +60,7 @@ struct smp_ops {
        void (*cpu_die)(unsigned int cpu);
        void (*play_dead)(void);
-        void (*send_call_func_ipi)(cpumask_t mask);
+        void (*send_call_func_ipi)(const struct cpumask *mask);
        void (*send_call_func_single_ipi)(int cpu);
 };
@@ -125,7 +125,7 @@ static inline void arch_send_call_function_single_ipi(int cpu)
 static inline void arch_send_call_function_ipi(cpumask_t mask)
 {
-        smp_ops.send_call_func_ipi(mask);
+        smp_ops.send_call_func_ipi(&mask);
 }
 void cpu_disable_common(void);
@@ -138,7 +138,7 @@ void native_cpu_die(unsigned int cpu);
 void native_play_dead(void);
 void play_dead_common(void);
-void native_send_call_func_ipi(cpumask_t mask);
+void native_send_call_func_ipi(const struct cpumask *mask);
 void native_send_call_func_single_ipi(int cpu);
 extern void prefill_possible_map(void);
diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h
index 9b3070f1c2ac..99327d1be49f 100644
--- a/arch/x86/include/asm/summit/apic.h
+++ b/arch/x86/include/asm/summit/apic.h
@@ -14,13 +14,13 @@
 #define APIC_DFR_VALUE  (APIC_DFR_CLUSTER)
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus(void)
 {
        /* CPU_MASK_ALL (0xff) has undefined behaviour with
         * dest_LowestPrio mode logical clustered apic interrupt routing
         * Just start on cpu 0.  IRQ balancing will spread load
         */
-        return cpumask_of_cpu(0);
+        return &cpumask_of_cpu(0);
 }
 #define INT_DELIVERY_MODE (dest_LowestPrio)
@@ -137,14 +137,14 @@ static inline void enable_apic_mode(void)
 {
 }
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 {
        int num_bits_set;
        int cpus_found = 0;
        int cpu;
        int apicid;
-        num_bits_set = cpus_weight(cpumask);
+        num_bits_set = cpus_weight(*cpumask);
        /* Return id to all */
        if (num_bits_set == NR_CPUS)
                return (int) 0xFF;
@@ -152,10 +152,10 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
         * The cpus in the mask must all be on the apic cluster.  If are not
         * on the same apicid cluster return default value of TARGET_CPUS.
         */
-        cpu = first_cpu(cpumask);
+        cpu = first_cpu(*cpumask);
        apicid = cpu_to_logical_apicid(cpu);
        while (cpus_found < num_bits_set) {
-                if (cpu_isset(cpu, cpumask)) {
+                if (cpu_isset(cpu, *cpumask)) {
                        int new_apicid = cpu_to_logical_apicid(cpu);
                        if (apicid_cluster(apicid) !=
                                        apicid_cluster(new_apicid)){
@@ -170,6 +170,49 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
        return apicid;
 }
+/*
+ * Compute the logical APIC destination for the CPUs that are set in both
+ * @inmask and @andmask and are also set in cpu_online_mask.
+ *
+ * Returns 0xFF when the temporary cpumask cannot be allocated, when the
+ * intersection holds nr_cpu_ids CPUs, or when the selected CPUs are not all
+ * on the same APIC cluster; otherwise returns the OR of the logical APIC
+ * ids of the selected CPUs.  The temporary cpumask is freed on every path
+ * that allocated it.
+ */
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
+                                                  const struct cpumask *andmask)
+{
+        int num_bits_set;
+        int cpus_found = 0;
+        int cpu;
+        int apicid = 0xFF;
+        cpumask_var_t cpumask;
+        if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
+                return (int) 0xFF;
+        cpumask_and(cpumask, inmask, andmask);
+        cpumask_and(cpumask, cpumask, cpu_online_mask);
+        num_bits_set = cpumask_weight(cpumask);
+        /* Return id to all */
+        if (num_bits_set == nr_cpu_ids)
+                goto exit;
+        /*
+         * The cpus in the mask must all be on the apic cluster.  If they are
+         * not on the same apicid cluster return default value of TARGET_CPUS.
+         */
+        cpu = cpumask_first(cpumask);
+        apicid = cpu_to_logical_apicid(cpu);
+        while (cpus_found < num_bits_set) {
+                if (cpumask_test_cpu(cpu, cpumask)) {
+                        int new_apicid = cpu_to_logical_apicid(cpu);
+                        if (apicid_cluster(apicid) !=
+                                        apicid_cluster(new_apicid)){
+                                printk ("%s: Not a valid mask!\n", __func__);
+                                /* Leave through exit so cpumask is freed. */
+                                apicid = 0xFF;
+                                goto exit;
+                        }
+                        apicid = apicid | new_apicid;
+                        cpus_found++;
+                }
+                cpu++;
+        }
+exit:
+        free_cpumask_var(cpumask);
+        return apicid;
+}
 /* cpuid returns the value latched in the HW at reset, not the APIC ID
 * register's value.  For any box whose BIOS changes APIC IDs, like
 * clustered APIC systems, we must use hard_smp_processor_id.
diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h
index 53bd1e7bd7b4..a8a2c24f50cc 100644
--- a/arch/x86/include/asm/summit/ipi.h
+++ b/arch/x86/include/asm/summit/ipi.h
@@ -1,9 +1,10 @@
 #ifndef __ASM_SUMMIT_IPI_H
 #define __ASM_SUMMIT_IPI_H
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
+void send_IPI_mask_sequence(const cpumask_t *mask, int vector);
+void send_IPI_mask_allbutself(const cpumask_t *mask, int vector);
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const cpumask_t *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
@@ -14,12 +15,12 @@ static inline void send_IPI_allbutself(int vector)
        cpu_clear(smp_processor_id(), mask);
        if (!cpus_empty(mask))
-                send_IPI_mask(mask, vector);
+                send_IPI_mask(&mask, vector);
 }
 static inline void send_IPI_all(int vector)
 {
-        send_IPI_mask(cpu_online_map, vector);
+        send_IPI_mask(&cpu_online_map, vector);
 }
 #endif /* __ASM_SUMMIT_IPI_H */
diff --git a/arch/x86/kvm/svm.h b/arch/x86/include/asm/svm.h
index 1b8afa78e869..1b8afa78e869 100644
--- a/arch/x86/kvm/svm.h
+++ b/arch/x86/include/asm/svm.h
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index ff386ff50ed7..79e31e9dcdda 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -226,6 +226,8 @@ extern cpumask_t cpu_coregroup_map(int cpu);
 #define topology_core_id(cpu)                   (cpu_data(cpu).cpu_core_id)
 #define topology_core_siblings(cpu)             (per_cpu(cpu_core_map, cpu))
 #define topology_thread_siblings(cpu)           (per_cpu(cpu_sibling_map, cpu))
+#define topology_core_cpumask(cpu)              (&per_cpu(cpu_core_map, cpu))
+#define topology_thread_cpumask(cpu)            (&per_cpu(cpu_sibling_map, cpu))
 /* indicates that pointers to the topology cpumask_t maps are valid */
 #define arch_provides_topology_pointers         yes
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 580c3ee6c58c..4340055b7559 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -157,6 +157,7 @@ extern int __get_user_bad(void);
        int __ret_gu;                                                   \
        unsigned long __val_gu;                                         \
        __chk_user_ptr(ptr);                                            \
+        might_fault();                                                  \
        switch (sizeof(*(ptr))) {                                       \
        case 1:                                                         \
                __get_user_x(1, __ret_gu, __val_gu, ptr);               \
@@ -241,6 +242,7 @@ extern void __put_user_8(void);
        int __ret_pu;                                           \
        __typeof__(*(ptr)) __pu_val;                            \
        __chk_user_ptr(ptr);                                    \
+        might_fault();                                          \
        __pu_val = x;                                           \
        switch (sizeof(*(ptr))) {                               \
        case 1:                                                 \
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index d095a3aeea1b..5e06259e90e5 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 static __always_inline unsigned long __must_check
 __copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       might_sleep();
+        might_fault();
-       return __copy_to_user_inatomic(to, from, n);
+        return __copy_to_user_inatomic(to, from, n);
 }
 static __always_inline unsigned long
@@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 static __always_inline unsigned long
 __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-        might_sleep();
+        might_fault();
        if (__builtin_constant_p(n)) {
                unsigned long ret;
@@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
 static __always_inline unsigned long __copy_from_user_nocache(void *to,
                                const void __user *from, unsigned long n)
 {
-        might_sleep();
+        might_fault();
        if (__builtin_constant_p(n)) {
                unsigned long ret;
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f8cfd00db450..84210c479fca 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -29,6 +29,8 @@ static __always_inline __must_check
 int __copy_from_user(void *dst, const void __user *src, unsigned size)
 {
        int ret = 0;
+        might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic(dst, (__force void *)src, size);
        switch (size) {
@@ -71,6 +73,8 @@ static __always_inline __must_check
 int __copy_to_user(void __user *dst, const void *src, unsigned size)
 {
        int ret = 0;
+        might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic((__force void *)dst, src, size);
        switch (size) {
@@ -113,6 +117,8 @@ static __always_inline __must_check
 int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 {
        int ret = 0;
+        might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic((__force void *)dst,
                                         (__force void *)src, size);
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
new file mode 100644
index 000000000000..593636275238
--- /dev/null
+++ b/arch/x86/include/asm/virtext.h
@@ -0,0 +1,132 @@
+/* CPU virtualization extensions handling
+ *
+ * This should carry the code for handling CPU virtualization extensions
+ * that needs to live in the kernel core.
+ *
+ * Author: Eduardo Habkost <ehabkost@redhat.com>
+ *
+ * Copyright (C) 2008, Red Hat Inc.
+ *
+ * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+#ifndef _ASM_X86_VIRTEX_H
+#define _ASM_X86_VIRTEX_H
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/vmx.h>
+#include <asm/svm.h>
+/*
+ * VMX functions:
+ */
+/* Test bit 5 of the ECX value returned by cpuid_ecx(1). */
+static inline int cpu_has_vmx(void)
+{
+        unsigned long features = cpuid_ecx(1);
+        /* CPUID.1:ECX.VMX[bit 5] -> VT */
+        return test_bit(5, &features);
+}
+/** Disable VMX on the current CPU
+ *
+ * vmxoff causes an undefined-opcode exception if vmxon was not run
+ * on the CPU previously. Only call this function if you know VMX
+ * is enabled.
+ *
+ * After executing vmxoff, the X86_CR4_VMXE bit is cleared in CR4.
+ */
+static inline void cpu_vmxoff(void)
+{
+        asm volatile (ASM_VMX_VMXOFF : : : "cc");
+        write_cr4(read_cr4() & ~X86_CR4_VMXE);
+}
+/* Nonzero when the X86_CR4_VMXE bit is set in the value read from CR4. */
+static inline int cpu_vmx_enabled(void)
+{
+        unsigned long cr4 = read_cr4();
+        return cr4 & X86_CR4_VMXE;
+}
+/** Disable VMX if it is enabled on the current CPU
+ *
+ * No check for VMX support is made here: you shouldn't call this
+ * if cpu_has_vmx() returns 0.
+ */
+static inline void __cpu_emergency_vmxoff(void)
+{
+        if (!cpu_vmx_enabled())
+                return;
+        cpu_vmxoff();
+}
+/** Disable VMX if it is supported and enabled on the current CPU
+ *
+ * Does nothing when cpu_has_vmx() reports no VMX support.
+ */
+static inline void cpu_emergency_vmxoff(void)
+{
+        if (!cpu_has_vmx())
+                return;
+        __cpu_emergency_vmxoff();
+}
+/*
+ * SVM functions:
+ */
+/** Check if the CPU has SVM support
+ *
+ * You can use the 'msg' arg to get a message describing the problem,
+ * if the function returns zero. Simply pass NULL if you are not interested
+ * in the messages; gcc should take care of not generating code for
+ * the messages in this case.
+ *
+ * Returns 1 if all checks pass, 0 otherwise. The checks, in order:
+ * the boot CPU vendor is AMD, eax returned by cpuid 0x80000000 is at
+ * least SVM_CPUID_FUNC, and bit SVM_CPUID_FEATURE_SHIFT is set in ecx
+ * returned by cpuid 0x80000001.
+ */
+static inline int cpu_has_svm(const char **msg)
+{
+        uint32_t eax, ebx, ecx, edx;
+        if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+                if (msg)
+                        *msg = "not amd";
+                return 0;
+        }
+        cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+        if (eax < SVM_CPUID_FUNC) {
+                if (msg)
+                        *msg = "can't execute cpuid_8000000a";
+                return 0;
+        }
+        cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+        if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+                if (msg)
+                        *msg = "svm not available";
+                return 0;
+        }
+        return 1;
+}
+/** Disable SVM on the current CPU
+ *
+ * You should call this only if cpu_has_svm() returned true.
+ *
+ * Writes 0 to MSR_VM_HSAVE_PA, then reads MSR_EFER and writes it back
+ * with the MSR_EFER_SVME_MASK bits cleared.
+ */
+static inline void cpu_svm_disable(void)
+{
+        uint64_t efer;
+        wrmsrl(MSR_VM_HSAVE_PA, 0);
+        rdmsrl(MSR_EFER, efer);
+        wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+}
+/** Makes sure SVM is disabled, if it is supported on the CPU
+ *
+ * Support is checked with cpu_has_svm(NULL), so no message is
+ * requested.
+ */
+static inline void cpu_emergency_svm_disable(void)
+{
+        if (!cpu_has_svm(NULL))
+                return;
+        cpu_svm_disable();
+}
+#endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/include/asm/vmx.h
index ec5edc339da6..d0238e6151d8 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -63,10 +63,13 @@
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
 #define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
+#define VM_EXIT_SAVE_IA32_PAT                   0x00040000
+#define VM_EXIT_LOAD_IA32_PAT                   0x00080000
 #define VM_ENTRY_IA32E_MODE                     0x00000200
 #define VM_ENTRY_SMM                            0x00000400
 #define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
+#define VM_ENTRY_LOAD_IA32_PAT                  0x00004000
 /* VMCS Encodings */
 enum vmcs_field {
@@ -112,6 +115,8 @@ enum vmcs_field {
        VMCS_LINK_POINTER_HIGH          = 0x00002801,
        GUEST_IA32_DEBUGCTL             = 0x00002802,
        GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+        GUEST_IA32_PAT                  = 0x00002804,
+        GUEST_IA32_PAT_HIGH             = 0x00002805,
        GUEST_PDPTR0                    = 0x0000280a,
        GUEST_PDPTR0_HIGH               = 0x0000280b,
        GUEST_PDPTR1                    = 0x0000280c,
@@ -120,6 +125,8 @@ enum vmcs_field {
        GUEST_PDPTR2_HIGH               = 0x0000280f,
        GUEST_PDPTR3                    = 0x00002810,
        GUEST_PDPTR3_HIGH               = 0x00002811,
+        HOST_IA32_PAT                   = 0x00002c00,
+        HOST_IA32_PAT_HIGH              = 0x00002c01,
        PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
        CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
        EXCEPTION_BITMAP                = 0x00004004,
@@ -331,8 +338,9 @@ enum vmcs_field {
 #define AR_RESERVD_MASK 0xfffe0f00
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT        9
+#define TSS_PRIVATE_MEMSLOT                     (KVM_MEMORY_SLOTS + 0)
-#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT      10
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT        (KVM_MEMORY_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT      (KVM_MEMORY_SLOTS + 2)
 #define VMX_NR_VPIDS                            (1 << 16)
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT          1
@@ -356,4 +364,19 @@ enum vmcs_field {
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR         0xfffbc000ul
+#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT            ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
+#define ASM_VMX_INVVPID           ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
 #endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 88dd768eab6d..d364df03c1d6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -109,6 +109,8 @@ obj-$(CONFIG_MICROCODE)			+= microcode.o
 obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+obj-$(CONFIG_SWIOTLB)                   += pci-swiotlb_64.o # NB rename without _64
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
@@ -122,7 +124,6 @@ ifeq ($(CONFIG_X86_64),y)
        obj-$(CONFIG_GART_IOMMU)        += pci-gart_64.o aperture_64.o
        obj-$(CONFIG_CALGARY_IOMMU)     += pci-calgary_64.o tce_64.o
        obj-$(CONFIG_AMD_IOMMU)         += amd_iommu_init.o amd_iommu.o
-        obj-$(CONFIG_SWIOTLB)           += pci-swiotlb_64.o
        obj-$(CONFIG_PCI_MMCONFIG)      += mmconf-fam10h_64.o
 endif
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 66198cbe464d..d652515e2855 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -119,8 +119,6 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 int first_system_vector = 0xfe;
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
 /*
 * Debug level, exported for io_apic.c
 */
@@ -142,7 +140,7 @@ static int lapic_next_event(unsigned long delta,
                            struct clock_event_device *evt);
 static void lapic_timer_setup(enum clock_event_mode mode,
                              struct clock_event_device *evt);
-static void lapic_timer_broadcast(cpumask_t mask);
+static void lapic_timer_broadcast(const cpumask_t *mask);
 static void apic_pm_activate(void);
 /*
@@ -455,7 +453,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
 /*
 * Local APIC timer broadcast function
 */
-static void lapic_timer_broadcast(cpumask_t mask)
+static void lapic_timer_broadcast(const cpumask_t *mask)
 {
 #ifdef CONFIG_SMP
        send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
@@ -471,7 +469,7 @@ static void __cpuinit setup_APIC_timer(void)
        struct clock_event_device *levt = &__get_cpu_var(lapic_events);
        memcpy(levt, &lapic_clockevent, sizeof(*levt));
-        levt->cpumask = cpumask_of_cpu(smp_processor_id());
+        levt->cpumask = cpumask_of(smp_processor_id());
        clockevents_register_device(levt);
 }
@@ -1807,28 +1805,32 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 void __cpuinit generic_processor_info(int apicid, int version)
 {
        int cpu;
-        cpumask_t tmp_map;
        /*
         * Validate version
         */
        if (version == 0x0) {
                pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
-                        "fixing up to 0x10. (tell your hw vendor)\n",
+                           "fixing up to 0x10. (tell your hw vendor)\n",
-                        version);
+                                version);
                version = 0x10;
        }
        apic_version[apicid] = version;
-        if (num_processors >= NR_CPUS) {
+        if (num_processors >= nr_cpu_ids) {
-                pr_warning("WARNING: NR_CPUS limit of %i reached."
+                int max = nr_cpu_ids;
-                        "  Processor ignored.\n", NR_CPUS);
+                int thiscpu = max + disabled_cpus;
+                pr_warning(
+                        "ACPI: NR_CPUS/possible_cpus limit of %i reached."
+                        "  Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+                disabled_cpus++;
                return;
        }
        num_processors++;
-        cpus_complement(tmp_map, cpu_present_map);
+        cpu = cpumask_next_zero(-1, cpu_present_mask);
-        cpu = first_cpu(tmp_map);
        physid_set(apicid, phys_cpu_present_map);
        if (apicid == boot_cpu_physical_apicid) {
@@ -1878,8 +1880,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
        }
 #endif
-        cpu_set(cpu, cpu_possible_map);
+        set_cpu_possible(cpu, true);
-        cpu_set(cpu, cpu_present_map);
+        set_cpu_present(cpu, true);
 }
 #ifdef CONFIG_X86_64
@@ -2081,7 +2083,7 @@ __cpuinit int apic_is_clustered_box(void)
        bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
        bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-        for (i = 0; i < NR_CPUS; i++) {
+        for (i = 0; i < nr_cpu_ids; i++) {
                /* are we being called early in kernel startup? */
                if (bios_cpu_apicid) {
                        id = bios_cpu_apicid[i];
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 68b5d8681cbb..c6ecda64f5f1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -534,31 +534,16 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
        per_cpu(cpuid4_info, cpu) = NULL;
 }
-static int __cpuinit detect_cache_attributes(unsigned int cpu)
+static void get_cpu_leaves(void *_retval)
 {
-        struct _cpuid4_info     *this_leaf;
+        int j, *retval = _retval, cpu = smp_processor_id();
-        unsigned long           j;
-        int                     retval;
-        cpumask_t               oldmask;
-        if (num_cache_leaves == 0)
-                return -ENOENT;
-        per_cpu(cpuid4_info, cpu) = kzalloc(
-            sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
-        if (per_cpu(cpuid4_info, cpu) == NULL)
-                return -ENOMEM;
-        oldmask = current->cpus_allowed;
-        retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-        if (retval)
-                goto out;
        /* Do cpuid and store the results */
        for (j = 0; j < num_cache_leaves; j++) {
+                struct _cpuid4_info *this_leaf;
                this_leaf = CPUID4_INFO_IDX(cpu, j);
-                retval = cpuid4_cache_lookup(j, this_leaf);
+                *retval = cpuid4_cache_lookup(j, this_leaf);
-                if (unlikely(retval < 0)) {
+                if (unlikely(*retval < 0)) {
                        int i;
                        for (i = 0; i < j; i++)
@@ -567,9 +552,21 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
                }
                cache_shared_cpu_map_setup(cpu, j);
        }
-        set_cpus_allowed_ptr(current, &oldmask);
+}
+static int __cpuinit detect_cache_attributes(unsigned int cpu)
+{
+        int                     retval;
+        if (num_cache_leaves == 0)
+                return -ENOENT;
+        per_cpu(cpuid4_info, cpu) = kzalloc(
+            sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
+        if (per_cpu(cpuid4_info, cpu) == NULL)
+                return -ENOMEM;
-out:
+        smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
        if (retval) {
                kfree(per_cpu(cpuid4_info, cpu));
                per_cpu(cpuid4_info, cpu) = NULL;
@@ -626,8 +623,8 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
                cpumask_t *mask = &this_leaf->shared_cpu_map;
                n = type?
-                        cpulist_scnprintf(buf, len-2, *mask):
+                        cpulist_scnprintf(buf, len-2, mask) :
-                        cpumask_scnprintf(buf, len-2, *mask);
+                        cpumask_scnprintf(buf, len-2, mask);
                buf[n++] = '\n';
                buf[n] = '\0';
        }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 748c8f9e7a05..a5a5e0530370 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -83,34 +83,41 @@ static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
 * CPU Initialization
 */
+struct thresh_restart {
+        struct threshold_block *b;
+        int reset;
+        u16 old_limit;
+};
 /* must be called with correct cpu affinity */
-static void threshold_restart_bank(struct threshold_block *b,
+static long threshold_restart_bank(void *_tr)
-                                   int reset, u16 old_limit)
 {
+        struct thresh_restart *tr = _tr;
        u32 mci_misc_hi, mci_misc_lo;
-        rdmsr(b->address, mci_misc_lo, mci_misc_hi);
+        rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
-        if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+        if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
-                reset = 1;      /* limit cannot be lower than err count */
+                tr->reset = 1;  /* limit cannot be lower than err count */
-        if (reset) {            /* reset err count and overflow bit */
+        if (tr->reset) {                /* reset err count and overflow bit */
                mci_misc_hi =
                    (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
-                    (THRESHOLD_MAX - b->threshold_limit);
+                    (THRESHOLD_MAX - tr->b->threshold_limit);
-        } else if (old_limit) { /* change limit w/o reset */
+        } else if (tr->old_limit) {     /* change limit w/o reset */
                int new_count = (mci_misc_hi & THRESHOLD_MAX) +
-                    (old_limit - b->threshold_limit);
+                    (tr->old_limit - tr->b->threshold_limit);
                mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
                    (new_count & THRESHOLD_MAX);
        }
-        b->interrupt_enable ?
+        tr->b->interrupt_enable ?
            (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
            (mci_misc_hi &= ~MASK_INT_TYPE_HI);
        mci_misc_hi |= MASK_COUNT_EN_HI;
-        wrmsr(b->address, mci_misc_lo, mci_misc_hi);
+        wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+        return 0;
 }
 /* cpu init entry point, called from mce.c with preempt off */
@@ -120,6 +127,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
        unsigned int cpu = smp_processor_id();
        u8 lvt_off;
        u32 low = 0, high = 0, address = 0;
+        struct thresh_restart tr;
        for (bank = 0; bank < NR_BANKS; ++bank) {
                for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,7 +170,10 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
                        wrmsr(address, low, high);
                        threshold_defaults.address = address;
-                        threshold_restart_bank(&threshold_defaults, 0, 0);
+                        tr.b = &threshold_defaults;
+                        tr.reset = 0;
+                        tr.old_limit = 0;
+                        threshold_restart_bank(&tr);
                }
        }
 }
@@ -251,20 +262,6 @@ struct threshold_attr {
        ssize_t(*store) (struct threshold_block *, const char *, size_t count);
 };
-static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
-                                           cpumask_t *newmask)
-{
-        *oldmask = current->cpus_allowed;
-        cpus_clear(*newmask);
-        cpu_set(cpu, *newmask);
-        set_cpus_allowed_ptr(current, newmask);
-}
-static void affinity_restore(const cpumask_t *oldmask)
-{
-        set_cpus_allowed_ptr(current, oldmask);
-}
 #define SHOW_FIELDS(name)                                           \
 static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
 {                                                                   \
@@ -277,15 +274,16 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
                                      const char *buf, size_t count)
 {
        char *end;
-        cpumask_t oldmask, newmask;
+        struct thresh_restart tr;
        unsigned long new = simple_strtoul(buf, &end, 0);
        if (end == buf)
                return -EINVAL;
        b->interrupt_enable = !!new;
-        affinity_set(b->cpu, &oldmask, &newmask);
+        tr.b = b;
-        threshold_restart_bank(b, 0, 0);
+        tr.reset = 0;
-        affinity_restore(&oldmask);
+        tr.old_limit = 0;
+        work_on_cpu(b->cpu, threshold_restart_bank, &tr);
        return end - buf;
 }
@@ -294,8 +292,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
                                     const char *buf, size_t count)
 {
        char *end;
-        cpumask_t oldmask, newmask;
+        struct thresh_restart tr;
-        u16 old;
        unsigned long new = simple_strtoul(buf, &end, 0);
        if (end == buf)
                return -EINVAL;
@@ -303,34 +300,36 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
                new = THRESHOLD_MAX;
        if (new < 1)
                new = 1;
-        old = b->threshold_limit;
+        tr.old_limit = b->threshold_limit;
        b->threshold_limit = new;
+        tr.b = b;
+        tr.reset = 0;
-        affinity_set(b->cpu, &oldmask, &newmask);
+        work_on_cpu(b->cpu, threshold_restart_bank, &tr);
-        threshold_restart_bank(b, 0, old);
-        affinity_restore(&oldmask);
        return end - buf;
 }
-static ssize_t show_error_count(struct threshold_block *b, char *buf)
+static long local_error_count(void *_b)
 {
-        u32 high, low;
+        struct threshold_block *b = _b;
-        cpumask_t oldmask, newmask;
+        u32 low, high;
-        affinity_set(b->cpu, &oldmask, &newmask);
        rdmsr(b->address, low, high);
-        affinity_restore(&oldmask);
+        return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
-        return sprintf(buf, "%x\n",
+}
-                       (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+        return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
 }
 static ssize_t store_error_count(struct threshold_block *b,
                                 const char *buf, size_t count)
 {
-        cpumask_t oldmask, newmask;
+        struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
-        affinity_set(b->cpu, &oldmask, &newmask);
-        threshold_restart_bank(b, 1, 0);
+        work_on_cpu(b->cpu, threshold_restart_bank, &tr);
-        affinity_restore(&oldmask);
        return 1;
 }
@@ -463,12 +462,19 @@ out_free:
        return err;
 }
+static long local_allocate_threshold_blocks(void *_bank)
+{
+        unsigned int *bank = _bank;
+        return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
+                                         MSR_IA32_MC0_MISC + *bank * 4);
+}
 /* symlinks sibling shared banks to first core.  first core owns dir/files. */
 static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
        int i, err = 0;
        struct threshold_bank *b = NULL;
-        cpumask_t oldmask, newmask;
        char name[32];
        sprintf(name, "threshold_bank%i", bank);
@@ -519,11 +525,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
        per_cpu(threshold_banks, cpu)[bank] = b;
-        affinity_set(cpu, &oldmask, &newmask);
+        err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
-        err = allocate_threshold_blocks(cpu, bank, 0,
-                                        MSR_IA32_MC0_MISC + bank * 4);
-        affinity_restore(&oldmask);
        if (err)
                goto out_free;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 4e8d77f01eeb..b59ddcc88cd8 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -14,14 +14,6 @@
 #include <asm/pat.h>
 #include "mtrr.h"
-struct mtrr_state {
-        struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
-        mtrr_type fixed_ranges[NUM_FIXED_RANGES];
-        unsigned char enabled;
-        unsigned char have_fixed;
-        mtrr_type def_type;
-};
 struct fixed_range_block {
        int base_msr; /* start address of an MTRR block */
        int ranges;   /* number of MTRRs in this block  */
@@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = {
 };
 static unsigned long smp_changes_mask;
-static struct mtrr_state mtrr_state = {};
 static int mtrr_state_set;
 u64 mtrr_tom2;
+struct mtrr_state_type mtrr_state = {};
+EXPORT_SYMBOL_GPL(mtrr_state);
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX "mtrr."
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 44fcb237bd52..d259e5d2e054 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -49,7 +49,7 @@
 u32 num_var_ranges = 0;
-unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 static DEFINE_MUTEX(mtrr_mutex);
 u64 size_or_mask, size_and_mask;
@@ -574,7 +574,7 @@ struct mtrr_value {
        unsigned long   lsize;
 };
-static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
+static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
 static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 {
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2dc4ec656b23..ffd60409cc6d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -8,11 +8,6 @@
 #define MTRRcap_MSR     0x0fe
 #define MTRRdefType_MSR 0x2ff
-#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
-#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
-#define NUM_FIXED_RANGES 88
-#define MAX_VAR_RANGES 256
 #define MTRRfix64K_00000_MSR 0x250
 #define MTRRfix16K_80000_MSR 0x258
 #define MTRRfix16K_A0000_MSR 0x259
@@ -29,11 +24,7 @@
 #define MTRR_CHANGE_MASK_VARIABLE  0x02
 #define MTRR_CHANGE_MASK_DEFTYPE   0x04
-/* In the Intel processor's MTRR interface, the MTRR type is always held in
+extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
-   an 8 bit field: */
-typedef u8 mtrr_type;
-extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
 struct mtrr_ops {
        u32     vendor;
@@ -70,13 +61,6 @@ struct set_mtrr_context {
        u32 ccr3;
 };
-struct mtrr_var_range {
-        u32 base_lo;
-        u32 base_hi;
-        u32 mask_lo;
-        u32 mask_hi;
-};
 void set_mtrr_done(struct set_mtrr_context *ctxt);
 void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
 void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index d84a852e4cd7..c689d19e35ab 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -26,6 +26,7 @@
 #include <linux/kdebug.h>
 #include <asm/smp.h>
 #include <asm/reboot.h>
+#include <asm/virtext.h>
 #include <mach_ipi.h>
@@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
 #endif
        crash_save_cpu(regs, cpu);
+        /* Disable VMX or SVM if needed.
+         *
+         * We need to disable virtualization on all CPUs.
+         * Having VMX or SVM enabled on any CPU may break rebooting
+         * after the kdump kernel has finished its task.
+         */
+        cpu_emergency_vmxoff();
+        cpu_emergency_svm_disable();
        disable_local_APIC();
 }
@@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
        local_irq_disable();
        kdump_nmi_shootdown_cpus();
+        /* Booting the kdump kernel with VMX or SVM enabled won't work,
+         * because (among other limitations) we can't disable paging
+         * while the virtualization flags are still set.
+         */
+        cpu_emergency_vmxoff();
+        cpu_emergency_svm_disable();
        lapic_shutdown();
 #if defined(CONFIG_X86_IO_APIC)
        disable_IO_APIC();
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index c0262791bda4..34185488e4fb 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -30,12 +30,12 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
        return 1;
 }
-static cpumask_t flat_target_cpus(void)
+static const struct cpumask *flat_target_cpus(void)
 {
-        return cpu_online_map;
+        return cpu_online_mask;
 }
-static cpumask_t flat_vector_allocation_domain(int cpu)
+static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -45,8 +45,8 @@ static cpumask_t flat_vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-        cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+        cpumask_clear(retmask);
-        return domain;
+        cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
 }
 /*
@@ -69,9 +69,8 @@ static void flat_init_apic_ldr(void)
        apic_write(APIC_LDR, val);
 }
-static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
+static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
 {
-        unsigned long mask = cpus_addr(cpumask)[0];
        unsigned long flags;
        local_irq_save(flags);
@@ -79,20 +78,41 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
        local_irq_restore(flags);
 }
+static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
+{
+        unsigned long mask = cpumask_bits(cpumask)[0];
+        _flat_send_IPI_mask(mask, vector);
+}
+static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
+                                          int vector)
+{
+        unsigned long mask = cpumask_bits(cpumask)[0];
+        int cpu = smp_processor_id();
+        if (cpu < BITS_PER_LONG)
+                clear_bit(cpu, &mask);
+        _flat_send_IPI_mask(mask, vector);
+}
 static void flat_send_IPI_allbutself(int vector)
 {
+        int cpu = smp_processor_id();
 #ifdef  CONFIG_HOTPLUG_CPU
        int hotplug = 1;
 #else
        int hotplug = 0;
 #endif
        if (hotplug || vector == NMI_VECTOR) {
-                cpumask_t allbutme = cpu_online_map;
+                if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
+                        unsigned long mask = cpumask_bits(cpu_online_mask)[0];
-                cpu_clear(smp_processor_id(), allbutme);
+                        if (cpu < BITS_PER_LONG)
+                                clear_bit(cpu, &mask);
-                if (!cpus_empty(allbutme))
+                        _flat_send_IPI_mask(mask, vector);
-                        flat_send_IPI_mask(allbutme, vector);
+                }
        } else if (num_online_cpus() > 1) {
                __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
        }
@@ -101,7 +121,7 @@ static void flat_send_IPI_allbutself(int vector)
 static void flat_send_IPI_all(int vector)
 {
        if (vector == NMI_VECTOR)
-                flat_send_IPI_mask(cpu_online_map, vector);
+                flat_send_IPI_mask(cpu_online_mask, vector);
        else
                __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
 }
@@ -135,9 +155,18 @@ static int flat_apic_id_registered(void)
        return physid_isset(read_xapic_id(), phys_cpu_present_map);
 }
-static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+        return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+}
+static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                const struct cpumask *andmask)
 {
-        return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
+        unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+        unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
+        return mask1 & mask2;
 }
 static unsigned int phys_pkg_id(int index_msb)
@@ -157,8 +186,10 @@ struct genapic apic_flat =  {
        .send_IPI_all = flat_send_IPI_all,
        .send_IPI_allbutself = flat_send_IPI_allbutself,
        .send_IPI_mask = flat_send_IPI_mask,
+        .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
        .send_IPI_self = apic_send_IPI_self,
        .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+        .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
@@ -188,35 +219,39 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
        return 0;
 }
-static cpumask_t physflat_target_cpus(void)
+static const struct cpumask *physflat_target_cpus(void)
 {
-        return cpu_online_map;
+        return cpu_online_mask;
 }
-static cpumask_t physflat_vector_allocation_domain(int cpu)
+static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-        return cpumask_of_cpu(cpu);
+        cpumask_clear(retmask);
+        cpumask_set_cpu(cpu, retmask);
 }
-static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
+static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
 {
        send_IPI_mask_sequence(cpumask, vector);
 }
-static void physflat_send_IPI_allbutself(int vector)
+static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
+                                              int vector)
 {
-        cpumask_t allbutme = cpu_online_map;
+        send_IPI_mask_allbutself(cpumask, vector);
+}
-        cpu_clear(smp_processor_id(), allbutme);
+static void physflat_send_IPI_allbutself(int vector)
-        physflat_send_IPI_mask(allbutme, vector);
+{
+        send_IPI_mask_allbutself(cpu_online_mask, vector);
 }
 static void physflat_send_IPI_all(int vector)
 {
-        physflat_send_IPI_mask(cpu_online_map, vector);
+        physflat_send_IPI_mask(cpu_online_mask, vector);
 }
-static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
@@ -224,13 +259,31 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
         * We're using fixed IRQ delivery, can only return one phys APIC ID.
         * May as well be the first.
         */
-        cpu = first_cpu(cpumask);
+        cpu = cpumask_first(cpumask);
        if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_apicid, cpu);
        else
                return BAD_APICID;
 }
+static unsigned int
+physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                const struct cpumask *andmask)
+{
+        int cpu;
+        /*
+         * We're using fixed IRQ delivery, can only return one phys APIC ID.
+         * May as well be that of the first online CPU set in both masks.
+         */
+        for_each_cpu_and(cpu, cpumask, andmask)
+                if (cpumask_test_cpu(cpu, cpu_online_mask))
+                        break;
+        if (cpu < nr_cpu_ids)
+                return per_cpu(x86_cpu_to_apicid, cpu);
+        return BAD_APICID;
+}
 struct genapic apic_physflat =  {
        .name = "physical flat",
        .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
@@ -243,8 +296,10 @@ struct genapic apic_physflat =  {
        .send_IPI_all = physflat_send_IPI_all,
        .send_IPI_allbutself = physflat_send_IPI_allbutself,
        .send_IPI_mask = physflat_send_IPI_mask,
+        .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
        .send_IPI_self = apic_send_IPI_self,
        .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
+        .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
index f6a2c8eb48a6..6ce497cc372d 100644
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -22,19 +22,18 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
-static cpumask_t x2apic_target_cpus(void)
+static const struct cpumask *x2apic_target_cpus(void)
 {
-        return cpumask_of_cpu(0);
+        return cpumask_of(0);
 }
 /*
 * for now each logical cpu is in its own vector allocation domain.
 */
-static cpumask_t x2apic_vector_allocation_domain(int cpu)
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-        cpumask_t domain = CPU_MASK_NONE;
+        cpumask_clear(retmask);
-        cpu_set(cpu, domain);
+        cpumask_set_cpu(cpu, retmask);
-        return domain;
 }
 static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -56,32 +55,53 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
 * at once. We have 16 cpu's in a cluster. This will minimize IPI register
 * writes.
 */
-static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 {
        unsigned long flags;
        unsigned long query_cpu;
        local_irq_save(flags);
-        for_each_cpu_mask(query_cpu, mask) {
+        for_each_cpu(query_cpu, mask)
-                __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+                __x2apic_send_IPI_dest(
-                                       vector, APIC_DEST_LOGICAL);
+                        per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-        }
+                        vector, APIC_DEST_LOGICAL);
        local_irq_restore(flags);
 }
-static void x2apic_send_IPI_allbutself(int vector)
+static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
+                                            int vector)
 {
-        cpumask_t mask = cpu_online_map;
+        unsigned long flags;
+        unsigned long query_cpu;
+        unsigned long this_cpu = smp_processor_id();
-        cpu_clear(smp_processor_id(), mask);
+        local_irq_save(flags);
+        for_each_cpu(query_cpu, mask)
+                if (query_cpu != this_cpu)
+                        __x2apic_send_IPI_dest(
+                                per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+                                vector, APIC_DEST_LOGICAL);
+        local_irq_restore(flags);
+}
+static void x2apic_send_IPI_allbutself(int vector)
+{
+        unsigned long flags;
+        unsigned long query_cpu;
+        unsigned long this_cpu = smp_processor_id();
-        if (!cpus_empty(mask))
+        local_irq_save(flags);
-                x2apic_send_IPI_mask(mask, vector);
+        for_each_online_cpu(query_cpu)
+                if (query_cpu != this_cpu)
+                        __x2apic_send_IPI_dest(
+                                per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+                                vector, APIC_DEST_LOGICAL);
+        local_irq_restore(flags);
 }
 static void x2apic_send_IPI_all(int vector)
 {
-        x2apic_send_IPI_mask(cpu_online_map, vector);
+        x2apic_send_IPI_mask(cpu_online_mask, vector);
 }
 static int x2apic_apic_id_registered(void)
@@ -89,21 +109,38 @@ static int x2apic_apic_id_registered(void)
        return 1;
 }
-static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
        /*
-         * We're using fixed IRQ delivery, can only return one phys APIC ID.
+         * We're using fixed IRQ delivery, can only return one logical APIC ID.
         * May as well be the first.
         */
-        cpu = first_cpu(cpumask);
+        cpu = cpumask_first(cpumask);
-        if ((unsigned)cpu < NR_CPUS)
+        if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_logical_apicid, cpu);
        else
                return BAD_APICID;
 }
+static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                  const struct cpumask *andmask)
+{
+        int cpu;
+        /*
+         * We're using fixed IRQ delivery, can only return one logical APIC ID.
+         * May as well be that of the first online CPU set in both masks.
+         */
+        for_each_cpu_and(cpu, cpumask, andmask)
+                if (cpumask_test_cpu(cpu, cpu_online_mask))
+                        break;
+        if (cpu < nr_cpu_ids)
+                return per_cpu(x86_cpu_to_logical_apicid, cpu);
+        return BAD_APICID;
+}
 static unsigned int get_apic_id(unsigned long x)
 {
        unsigned int id;
@@ -150,8 +187,10 @@ struct genapic apic_x2apic_cluster = {
        .send_IPI_all = x2apic_send_IPI_all,
        .send_IPI_allbutself = x2apic_send_IPI_allbutself,
        .send_IPI_mask = x2apic_send_IPI_mask,
+        .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
        .send_IPI_self = x2apic_send_IPI_self,
        .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+        .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index a177c7880ab5..21bcc0e098ba 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -29,16 +29,15 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
-static cpumask_t x2apic_target_cpus(void)
+static const struct cpumask *x2apic_target_cpus(void)
 {
-        return cpumask_of_cpu(0);
+        return cpumask_of(0);
 }
-static cpumask_t x2apic_vector_allocation_domain(int cpu)
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-        cpumask_t domain = CPU_MASK_NONE;
+        cpumask_clear(retmask);
-        cpu_set(cpu, domain);
+        cpumask_set_cpu(cpu, retmask);
-        return domain;
 }
 static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -54,32 +53,54 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
        x2apic_icr_write(cfg, apicid);
 }
-static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 {
        unsigned long flags;
        unsigned long query_cpu;
        local_irq_save(flags);
-        for_each_cpu_mask(query_cpu, mask) {
+        for_each_cpu(query_cpu, mask) {
                __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
                                       vector, APIC_DEST_PHYSICAL);
        }
        local_irq_restore(flags);
 }
-static void x2apic_send_IPI_allbutself(int vector)
+static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
+                                            int vector)
 {
-        cpumask_t mask = cpu_online_map;
+        unsigned long flags;
+        unsigned long query_cpu;
+        unsigned long this_cpu = smp_processor_id();
+        local_irq_save(flags);
+        for_each_cpu(query_cpu, mask) {
+                if (query_cpu != this_cpu)
+                        __x2apic_send_IPI_dest(
+                                per_cpu(x86_cpu_to_apicid, query_cpu),
+                                vector, APIC_DEST_PHYSICAL);
+        }
+        local_irq_restore(flags);
+}
-        cpu_clear(smp_processor_id(), mask);
+static void x2apic_send_IPI_allbutself(int vector)
+{
+        unsigned long flags;
+        unsigned long query_cpu;
+        unsigned long this_cpu = smp_processor_id();
-        if (!cpus_empty(mask))
+        local_irq_save(flags);
-                x2apic_send_IPI_mask(mask, vector);
+        for_each_online_cpu(query_cpu)
+                if (query_cpu != this_cpu)
+                        __x2apic_send_IPI_dest(
+                                per_cpu(x86_cpu_to_apicid, query_cpu),
+                                vector, APIC_DEST_PHYSICAL);
+        local_irq_restore(flags);
 }
 static void x2apic_send_IPI_all(int vector)
 {
-        x2apic_send_IPI_mask(cpu_online_map, vector);
+        x2apic_send_IPI_mask(cpu_online_mask, vector);
 }
 static int x2apic_apic_id_registered(void)
@@ -87,7 +108,7 @@ static int x2apic_apic_id_registered(void)
        return 1;
 }
-static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
@@ -95,13 +116,30 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
         * We're using fixed IRQ delivery, can only return one phys APIC ID.
         * May as well be the first.
         */
-        cpu = first_cpu(cpumask);
+        cpu = cpumask_first(cpumask);
-        if ((unsigned)cpu < NR_CPUS)
+        if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_apicid, cpu);
        else
                return BAD_APICID;
 }
+static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                  const struct cpumask *andmask)
+{
+        int cpu;
+        /*
+         * We're using fixed IRQ delivery, can only return one phys APIC ID.
+         * May as well be that of the first online CPU set in both masks.
+         */
+        for_each_cpu_and(cpu, cpumask, andmask)
+                if (cpumask_test_cpu(cpu, cpu_online_mask))
+                        break;
+        if (cpu < nr_cpu_ids)
+                return per_cpu(x86_cpu_to_apicid, cpu);
+        return BAD_APICID;
+}
 static unsigned int get_apic_id(unsigned long x)
 {
        unsigned int id;
@@ -145,8 +183,10 @@ struct genapic apic_x2apic_phys = {
        .send_IPI_all = x2apic_send_IPI_all,
        .send_IPI_allbutself = x2apic_send_IPI_allbutself,
        .send_IPI_mask = x2apic_send_IPI_mask,
+        .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
        .send_IPI_self = x2apic_send_IPI_self,
        .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+        .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index dece17289731..b193e082f6ce 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -79,16 +79,15 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
-static cpumask_t uv_target_cpus(void)
+static const struct cpumask *uv_target_cpus(void)
 {
-        return cpumask_of_cpu(0);
+        return cpumask_of(0);
 }
-static cpumask_t uv_vector_allocation_domain(int cpu)
+static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-        cpumask_t domain = CPU_MASK_NONE;
+        cpumask_clear(retmask);
-        cpu_set(cpu, domain);
+        cpumask_set_cpu(cpu, retmask);
-        return domain;
 }
 int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
@@ -127,28 +126,37 @@ static void uv_send_IPI_one(int cpu, int vector)
        uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 }
-static void uv_send_IPI_mask(cpumask_t mask, int vector)
+static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
 {
        unsigned int cpu;
-        for_each_possible_cpu(cpu)
+        for_each_cpu(cpu, mask)
-                if (cpu_isset(cpu, mask))
+                uv_send_IPI_one(cpu, vector);
+}
+static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+        unsigned int cpu;
+        unsigned int this_cpu = smp_processor_id();
+        for_each_cpu(cpu, mask)
+                if (cpu != this_cpu)
                        uv_send_IPI_one(cpu, vector);
 }
 static void uv_send_IPI_allbutself(int vector)
 {
-        cpumask_t mask = cpu_online_map;
+        unsigned int cpu;
+        unsigned int this_cpu = smp_processor_id();
-        cpu_clear(smp_processor_id(), mask);
-        if (!cpus_empty(mask))
+        for_each_online_cpu(cpu)
-                uv_send_IPI_mask(mask, vector);
+                if (cpu != this_cpu)
+                        uv_send_IPI_one(cpu, vector);
 }
 static void uv_send_IPI_all(int vector)
 {
-        uv_send_IPI_mask(cpu_online_map, vector);
+        uv_send_IPI_mask(cpu_online_mask, vector);
 }
 static int uv_apic_id_registered(void)
@@ -160,7 +168,7 @@ static void uv_init_apic_ldr(void)
 {
 }
-static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
@@ -168,13 +176,30 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
         * We're using fixed IRQ delivery, can only return one phys APIC ID.
         * May as well be the first.
         */
-        cpu = first_cpu(cpumask);
+        cpu = cpumask_first(cpumask);
        if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_apicid, cpu);
        else
                return BAD_APICID;
 }
+/*
+ * Map the intersection of @cpumask and @andmask to a single physical
+ * APIC ID, or BAD_APICID when no online CPU is present in both masks.
+ */
+static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                              const struct cpumask *andmask)
+{
+        int cpu;
+        /*
+         * Fixed IRQ delivery targets exactly one phys APIC ID, so the
+         * first online CPU found in both masks is used.
+         */
+        for_each_cpu_and(cpu, cpumask, andmask) {
+                if (cpumask_test_cpu(cpu, cpu_online_mask))
+                        return per_cpu(x86_cpu_to_apicid, cpu);
+        }
+        return BAD_APICID;
+}
 static unsigned int get_apic_id(unsigned long x)
 {
        unsigned int id;
@@ -222,8 +247,10 @@ struct genapic apic_x2apic_uv_x = {
        .send_IPI_all = uv_send_IPI_all,
        .send_IPI_allbutself = uv_send_IPI_allbutself,
        .send_IPI_mask = uv_send_IPI_mask,
+        .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
        .send_IPI_self = uv_send_IPI_self,
        .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
+        .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 3f0a3edf0a57..cd759ad90690 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -248,7 +248,7 @@ static void hpet_legacy_clockevent_register(void)
         * Start hpet with the boot cpu mask and make it
         * global after the IO_APIC has been initialized.
         */
-        hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+        hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
        clockevents_register_device(&hpet_clockevent);
        global_clock_event = &hpet_clockevent;
        printk(KERN_DEBUG "hpet clockevent registered\n");
@@ -303,7 +303,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
                        struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
                        hpet_setup_msi_irq(hdev->irq);
                        disable_irq(hdev->irq);
-                        irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
+                        irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
                        enable_irq(hdev->irq);
                }
                break;
@@ -451,7 +451,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)
                return -1;
        disable_irq(dev->irq);
-        irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
+        irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
        enable_irq(dev->irq);
        printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
@@ -502,7 +502,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
        /* 5 usec minimum reprogramming delta. */
        evt->min_delta_ns = 5000;
-        evt->cpumask = cpumask_of_cpu(hdev->cpu);
+        evt->cpumask = cpumask_of(hdev->cpu);
        clockevents_register_device(evt);
 }
@@ -813,7 +813,7 @@ int __init hpet_enable(void)
 out_nohpet:
        hpet_clear_mapping();
-        boot_hpet_disable = 1;
+        hpet_address = 0;
        return 0;
 }
@@ -836,10 +836,11 @@ static __init int hpet_late_init(void)
                hpet_address = force_hpet_address;
                hpet_enable();
-                if (!hpet_virt_address)
-                        return -ENODEV;
        }
+        if (!hpet_virt_address)
+                return -ENODEV;
        hpet_reserve_platform_timers(hpet_readl(HPET_ID));
        for_each_online_cpu(cpu) {
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c1b5e3ece1f2..10f92fb532f3 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -114,7 +114,7 @@ void __init setup_pit_timer(void)
         * Start pit with the boot cpu mask and make it global after the
         * IO_APIC has been initialized.
         */
-        pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+        pit_clockevent.cpumask = cpumask_of(smp_processor_id());
        pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
                                     pit_clockevent.shift);
        pit_clockevent.max_delta_ns =
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index d39918076bb4..df3bf269beab 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -10,7 +10,6 @@
 #include <asm/pgtable.h>
 #include <asm/desc.h>
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index b8c8a8e99341..69911722b9d3 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -108,94 +108,277 @@ static int __init parse_noapic(char *str)
 early_param("noapic", parse_noapic);
 struct irq_pin_list;
+/*
+ * One (I/O APIC, pin) pair that is routed to an IRQ.
+ *
+ * Entries form a singly linked list through ->next; the head of the list
+ * is stored in irq_cfg.irq_2_pin.  Entries are allocated one at a time by
+ * get_one_free_irq_2_pin().
+ */
+struct irq_pin_list {
+        int apic, pin;
+        struct irq_pin_list *next;
+};
+/*
+ * Allocate one zeroed irq_pin_list entry on the memory node of @cpu,
+ * using GFP_ATOMIC.  Callers must check the result for NULL.
+ */
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+        int nid = cpu_to_node(cpu);
+        struct irq_pin_list *entry;
+        entry = kzalloc_node(sizeof(*entry), GFP_ATOMIC, nid);
+        /* Emitted whether or not the allocation succeeded. */
+        printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, nid);
+        return entry;
+}
 struct irq_cfg {
-        unsigned int irq;
        struct irq_pin_list *irq_2_pin;
-        cpumask_t domain;
+        cpumask_var_t domain;
-        cpumask_t old_domain;
+        cpumask_var_t old_domain;
        unsigned move_cleanup_count;
        u8 vector;
        u8 move_in_progress : 1;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+        u8 move_desc_pending : 1;
+#endif
 };
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
 static struct irq_cfg irq_cfgx[NR_IRQS] = {
-        [0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+#endif
-        [1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+        [0]  = { .vector = IRQ0_VECTOR,  },
-        [2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+        [1]  = { .vector = IRQ1_VECTOR,  },
-        [3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+        [2]  = { .vector = IRQ2_VECTOR,  },
-        [4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+        [3]  = { .vector = IRQ3_VECTOR,  },
-        [5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+        [4]  = { .vector = IRQ4_VECTOR,  },
-        [6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+        [5]  = { .vector = IRQ5_VECTOR,  },
-        [7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+        [6]  = { .vector = IRQ6_VECTOR,  },
-        [8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+        [7]  = { .vector = IRQ7_VECTOR,  },
-        [9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+        [8]  = { .vector = IRQ8_VECTOR,  },
-        [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+        [9]  = { .vector = IRQ9_VECTOR,  },
-        [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+        [10] = { .vector = IRQ10_VECTOR, },
-        [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+        [11] = { .vector = IRQ11_VECTOR, },
-        [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+        [12] = { .vector = IRQ12_VECTOR, },
-        [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+        [13] = { .vector = IRQ13_VECTOR, },
-        [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+        [14] = { .vector = IRQ14_VECTOR, },
+        [15] = { .vector = IRQ15_VECTOR, },
 };
-#define for_each_irq_cfg(irq, cfg)              \
+int __init arch_early_irq_init(void)
-        for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+{
+        struct irq_cfg *cfg;
+        struct irq_desc *desc;
+        int count;
+        int i;
+        cfg = irq_cfgx;
+        count = ARRAY_SIZE(irq_cfgx);
+        for (i = 0; i < count; i++) {
+                desc = irq_to_desc(i);
+                desc->chip_data = &cfg[i];
+                alloc_bootmem_cpumask_var(&cfg[i].domain);
+                alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+                if (i < NR_IRQS_LEGACY)
+                        cpumask_setall(cfg[i].domain);
+        }
+        return 0;
+}
+#ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-        return irq < nr_irqs ? irq_cfgx + irq : NULL;
+        struct irq_cfg *cfg = NULL;
+        struct irq_desc *desc;
+        desc = irq_to_desc(irq);
+        if (desc)
+                cfg = desc->chip_data;
+        return cfg;
 }
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
 {
-        return irq_cfg(irq);
+        struct irq_cfg *cfg;
+        int node;
+        node = cpu_to_node(cpu);
+        cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+        if (cfg) {
+                /* FIXME: needs alloc_cpumask_var_node() */
+                if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) {
+                        kfree(cfg);
+                        cfg = NULL;
+                } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) {
+                        free_cpumask_var(cfg->domain);
+                        kfree(cfg);
+                        cfg = NULL;
+                } else {
+                        cpumask_clear(cfg->domain);
+                        cpumask_clear(cfg->old_domain);
+                }
+        }
+        printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
+        return cfg;
 }
-/*
+int arch_init_chip_data(struct irq_desc *desc, int cpu)
- * Rough estimation of how many shared IRQs there are, can be changed
+{
- * anytime.
+        struct irq_cfg *cfg;
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
-/*
+        cfg = desc->chip_data;
- * This is performance-critical, we want to do it O(1)
+        if (!cfg) {
- *
+                desc->chip_data = get_one_free_irq_cfg(cpu);
- * the indexing order of this array favors 1:1 mappings
+                if (!desc->chip_data) {
- * between pins and IRQs.
+                        printk(KERN_ERR "can not alloc irq_cfg\n");
- */
+                        BUG_ON(1);
+                }
+        }
-struct irq_pin_list {
+        return 0;
-        int apic, pin;
+}
-        struct irq_pin_list *next;
-};
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static struct irq_pin_list *irq_2_pin_ptr;
-static void __init irq_2_pin_init(void)
+static void
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
 {
-        struct irq_pin_list *pin = irq_2_pin_head;
+        struct irq_pin_list *old_entry, *head, *tail, *entry;
-        int i;
+        cfg->irq_2_pin = NULL;
+        old_entry = old_cfg->irq_2_pin;
+        if (!old_entry)
+                return;
+        entry = get_one_free_irq_2_pin(cpu);
+        if (!entry)
+                return;
-        for (i = 1; i < PIN_MAP_SIZE; i++)
+        entry->apic     = old_entry->apic;
-                pin[i-1].next = &pin[i];
+        entry->pin      = old_entry->pin;
+        head            = entry;
+        tail            = entry;
+        old_entry       = old_entry->next;
+        while (old_entry) {
+                entry = get_one_free_irq_2_pin(cpu);
+                if (!entry) {
+                        entry = head;
+                        while (entry) {
+                                head = entry->next;
+                                kfree(entry);
+                                entry = head;
+                        }
+                        /* still use the old one */
+                        return;
+                }
+                entry->apic     = old_entry->apic;
+                entry->pin      = old_entry->pin;
+                tail->next      = entry;
+                tail            = entry;
+                old_entry       = old_entry->next;
+        }
-        irq_2_pin_ptr = &pin[0];
+        tail->next = NULL;
+        cfg->irq_2_pin = head;
 }
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
 {
-        struct irq_pin_list *pin = irq_2_pin_ptr;
+        struct irq_pin_list *entry, *next;
-        if (!pin)
+        if (old_cfg->irq_2_pin == cfg->irq_2_pin)
-                panic("can not get more irq_2_pin\n");
+                return;
-        irq_2_pin_ptr = pin->next;
+        entry = old_cfg->irq_2_pin;
-        pin->next = NULL;
-        return pin;
+        while (entry) {
+                next = entry->next;
+                kfree(entry);
+                entry = next;
+        }
+        old_cfg->irq_2_pin = NULL;
+}
+/*
+ * Give @desc its own irq_cfg, allocated on the node of @cpu and filled in
+ * from the irq_cfg attached to @old_desc.  If the allocation fails,
+ * @desc->chip_data is left untouched.
+ */
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+                                 struct irq_desc *desc, int cpu)
+{
+        struct irq_cfg *cfg;
+        struct irq_cfg *old_cfg;
+        cfg = get_one_free_irq_cfg(cpu);
+        if (!cfg)
+                return;
+        desc->chip_data = cfg;
+        old_cfg = old_desc->chip_data;
+        /*
+         * Copy field by field.  A memcpy() of the whole struct would
+         * overwrite the cpumask_var_t members that get_one_free_irq_cfg()
+         * just allocated: with CONFIG_CPUMASK_OFFSTACK those are pointers,
+         * so the new masks would leak and both cfgs would share the old
+         * ones.
+         */
+        cfg->vector = old_cfg->vector;
+        cfg->move_cleanup_count = old_cfg->move_cleanup_count;
+        cfg->move_in_progress = old_cfg->move_in_progress;
+        cfg->move_desc_pending = old_cfg->move_desc_pending;
+        cpumask_copy(cfg->domain, old_cfg->domain);
+        cpumask_copy(cfg->old_domain, old_cfg->old_domain);
+        /* Sets cfg->irq_2_pin to a private copy of the old pin list. */
+        init_copy_irq_2_pin(old_cfg, cfg, cpu);
 }
+/*
+ * Release an irq_cfg obtained from get_one_free_irq_cfg(), including the
+ * two cpumasks it owns.
+ */
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+        free_cpumask_var(old_cfg->domain);
+        free_cpumask_var(old_cfg->old_domain);
+        kfree(old_cfg);
+}
+/*
+ * Release the irq_cfg still attached to @old_desc, unless @desc refers to
+ * that very same cfg.  The old pin list is handed to free_irq_2_pin()
+ * first, then the cfg itself is freed and the stale pointer cleared.
+ */
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+        struct irq_cfg *stale_cfg = old_desc->chip_data;
+        struct irq_cfg *new_cfg = desc->chip_data;
+        /* Nothing attached, or the cfg is still in use by @desc. */
+        if (!stale_cfg || stale_cfg == new_cfg)
+                return;
+        free_irq_2_pin(stale_cfg, new_cfg);
+        free_irq_cfg(stale_cfg);
+        old_desc->chip_data = NULL;
+}
+/*
+ * Set cfg->move_desc_pending when no vector move is in progress and the
+ * current desc->affinity shares no CPU with @mask.
+ */
+static void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+        struct irq_cfg *cfg = desc->chip_data;
+        /* A vector move is under way; leave the flag alone. */
+        if (cfg->move_in_progress)
+                return;
+        /* No move in progress means that the domain is not changed. */
+        if (cpumask_intersects(&desc->affinity, mask))
+                return;
+        cfg->move_desc_pending = 1;
+}
+#endif
+#else
+/*
+ * Non-sparse lookup: return the irq_cfgx[] slot for @irq, or NULL when
+ * @irq is not below nr_irqs.
+ */
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+        if (irq >= nr_irqs)
+                return NULL;
+        return &irq_cfgx[irq];
+}
+#endif
+#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
+/* Empty stand-in used when CONFIG_NUMA_MIGRATE_IRQ_DESC is not defined. */
+static inline void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+}
+#endif
 struct io_apic {
        unsigned int index;
        unsigned int unused[3];
@@ -237,11 +420,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
        writel(value, &io_apic->data);
 }
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
 {
        struct irq_pin_list *entry;
        unsigned long flags;
-        struct irq_cfg *cfg = irq_cfg(irq);
        spin_lock_irqsave(&ioapic_lock, flags);
        entry = cfg->irq_2_pin;
@@ -323,13 +505,32 @@ static void ioapic_mask_entry(int apic, int pin)
 }
 #ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+/*
+ * Send IRQ_MOVE_CLEANUP_VECTOR to every online CPU in cfg->old_domain.
+ *
+ * cfg->move_cleanup_count is set to the number of targeted CPUs before any
+ * IPI is sent, and cfg->move_in_progress is cleared at the end.
+ */
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+        cpumask_var_t cleanup_mask;
+        if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+                /* No temporary mask available: IPI the CPUs one at a time. */
+                unsigned int i;
+                cfg->move_cleanup_count = 0;
+                /*
+                 * Count every target first, then send.
+                 * NOTE(review): presumably the cleanup handler consumes
+                 * move_cleanup_count, so it must be final before the
+                 * first IPI - confirm against the handler.
+                 */
+                for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+                        cfg->move_cleanup_count++;
+                for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+                        send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+        } else {
+                /* Common case: build the target mask once, send one IPI. */
+                cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+                cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+                send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+                free_cpumask_var(cleanup_mask);
+        }
+        cfg->move_in_progress = 0;
+}
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
 {
        int apic, pin;
-        struct irq_cfg *cfg;
        struct irq_pin_list *entry;
+        u8 vector = cfg->vector;
-        cfg = irq_cfg(irq);
        entry = cfg->irq_2_pin;
        for (;;) {
                unsigned int reg;
@@ -359,36 +560,61 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
        }
 }
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+/*
+ * Try to retarget @desc at the CPUs in @mask.
+ *
+ * On success desc->affinity becomes (cfg->domain & @mask) and the value of
+ * cpu_mask_to_apicid_and() for that affinity and the online CPUs is
+ * returned.  If @mask contains no online CPU, or no vector can be
+ * assigned, BAD_APICID is returned and desc->affinity is left untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+        struct irq_cfg *cfg = desc->chip_data;
+        /* The vector is only assigned when @mask has an online CPU. */
+        if (!cpumask_intersects(mask, cpu_online_mask) ||
+            assign_irq_vector(desc->irq, cfg, mask))
+                return BAD_APICID;
+        cpumask_and(&desc->affinity, cfg->domain, mask);
+        set_extra_move_desc(desc, mask);
+        return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+}
+static void
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
        struct irq_cfg *cfg;
        unsigned long flags;
        unsigned int dest;
-        cpumask_t tmp;
+        unsigned int irq;
-        struct irq_desc *desc;
-        cpus_and(tmp, mask, cpu_online_map);
+        irq = desc->irq;
-        if (cpus_empty(tmp))
+        cfg = desc->chip_data;
-                return;
-        cfg = irq_cfg(irq);
+        spin_lock_irqsave(&ioapic_lock, flags);
-        if (assign_irq_vector(irq, mask))
+        dest = set_desc_affinity(desc, mask);
-                return;
+        if (dest != BAD_APICID) {
+                /* Only the high 8 bits are valid. */
+                dest = SET_APIC_LOGICAL_ID(dest);
+                __target_IO_APIC_irq(irq, dest, cfg);
+        }
+        spin_unlock_irqrestore(&ioapic_lock, flags);
+}
-        cpus_and(tmp, cfg->domain, mask);
+static void
-        dest = cpu_mask_to_apicid(tmp);
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-        /*
+{
-         * Only the high 8 bits are valid.
+        struct irq_desc *desc;
-         */
-        dest = SET_APIC_LOGICAL_ID(dest);
        desc = irq_to_desc(irq);
-        spin_lock_irqsave(&ioapic_lock, flags);
-        __target_IO_APIC_irq(irq, dest, cfg->vector);
+        set_ioapic_affinity_irq_desc(desc, mask);
-        desc->affinity = mask;
-        spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 #endif /* CONFIG_SMP */
@@ -397,16 +623,18 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 * shared ISA-space IRQs, so we have to support them. We are super
 * fast in the common case, and fast for shared ISA-space IRQs.
 */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 {
-        struct irq_cfg *cfg;
        struct irq_pin_list *entry;
-        /* first time to refer irq_cfg, so with new */
-        cfg = irq_cfg_alloc(irq);
        entry = cfg->irq_2_pin;
        if (!entry) {
-                entry = get_one_free_irq_2_pin();
+                entry = get_one_free_irq_2_pin(cpu);
+                if (!entry) {
+                        printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
+                                        apic, pin);
+                        return;
+                }
                cfg->irq_2_pin = entry;
                entry->apic = apic;
                entry->pin = pin;
@@ -421,7 +649,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
                entry = entry->next;
        }
-        entry->next = get_one_free_irq_2_pin();
+        entry->next = get_one_free_irq_2_pin(cpu);
        entry = entry->next;
        entry->apic = apic;
        entry->pin = pin;
@@ -430,11 +658,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 /*
 * Reroute an IRQ to a different pin.
 */
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
                                      int oldapic, int oldpin,
                                      int newapic, int newpin)
 {
-        struct irq_cfg *cfg = irq_cfg(irq);
        struct irq_pin_list *entry = cfg->irq_2_pin;
        int replaced = 0;
@@ -451,18 +678,16 @@ static void __init replace_pin_at_irq(unsigned int irq,
        /* why? call replace before add? */
        if (!replaced)
-                add_pin_to_irq(irq, newapic, newpin);
+                add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
 }
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
                                int mask_and, int mask_or,
                                void (*final)(struct irq_pin_list *entry))
 {
        int pin;
-        struct irq_cfg *cfg;
        struct irq_pin_list *entry;
-        cfg = irq_cfg(irq);
        for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
                unsigned int reg;
                pin = entry->pin;
@@ -475,9 +700,9 @@ static inline void io_apic_modify_irq(unsigned int irq,
        }
 }
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-        io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+        io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 #ifdef CONFIG_X86_64
@@ -492,47 +717,64 @@ static void io_apic_sync(struct irq_pin_list *entry)
        readl(&io_apic->data);
 }
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-        io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+        io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 }
 #else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-        io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+        io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
 }
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
 {
-        io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+        io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
                        IO_APIC_REDIR_MASKED, NULL);
 }
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
 {
-        io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+        io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
                        IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
 }
 #endif /* CONFIG_X86_32 */
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
 {
+        struct irq_cfg *cfg = desc->chip_data;
        unsigned long flags;
+        BUG_ON(!cfg);
        spin_lock_irqsave(&ioapic_lock, flags);
-        __mask_IO_APIC_irq(irq);
+        __mask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
 }
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
 {
+        struct irq_cfg *cfg = desc->chip_data;
        unsigned long flags;
        spin_lock_irqsave(&ioapic_lock, flags);
-        __unmask_IO_APIC_irq(irq);
+        __unmask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
 }
+/* irq-number wrapper: look up the descriptor for @irq and mask it. */
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+        mask_IO_APIC_irq_desc(irq_to_desc(irq));
+}
+/* irq-number wrapper: look up the descriptor for @irq and unmask it. */
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+        unmask_IO_APIC_irq_desc(irq_to_desc(irq));
+}
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
        struct IO_APIC_route_entry entry;
@@ -809,7 +1051,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
 */
 static int EISA_ELCR(unsigned int irq)
 {
-        if (irq < 16) {
+        if (irq < NR_IRQS_LEGACY) {
                unsigned int port = 0x4d0 + (irq >> 3);
                return (inb(port) >> (irq & 7)) & 1;
        }
@@ -1034,7 +1276,8 @@ void unlock_vector_lock(void)
        spin_unlock(&vector_lock);
 }
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int
+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 {
        /*
         * NOTE! The local APIC isn't very good at handling
@@ -1049,52 +1292,49 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
         */
        static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
        unsigned int old_vector;
-        int cpu;
+        int cpu, err;
-        struct irq_cfg *cfg;
+        cpumask_var_t tmp_mask;
-        cfg = irq_cfg(irq);
-        /* Only try and allocate irqs on cpus that are present */
-        cpus_and(mask, mask, cpu_online_map);
        if ((cfg->move_in_progress) || cfg->move_cleanup_count)
                return -EBUSY;
+        if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+                return -ENOMEM;
        old_vector = cfg->vector;
        if (old_vector) {
-                cpumask_t tmp;
+                cpumask_and(tmp_mask, mask, cpu_online_mask);
-                cpus_and(tmp, cfg->domain, mask);
+                cpumask_and(tmp_mask, cfg->domain, tmp_mask);
-                if (!cpus_empty(tmp))
+                if (!cpumask_empty(tmp_mask)) {
+                        free_cpumask_var(tmp_mask);
                        return 0;
+                }
        }
-        for_each_cpu_mask_nr(cpu, mask) {
+        /* Only try and allocate irqs on cpus that are present */
-                cpumask_t domain, new_mask;
+        err = -ENOSPC;
+        for_each_cpu_and(cpu, mask, cpu_online_mask) {
                int new_cpu;
                int vector, offset;
-                domain = vector_allocation_domain(cpu);
+                vector_allocation_domain(cpu, tmp_mask);
-                cpus_and(new_mask, domain, cpu_online_map);
                vector = current_vector;
                offset = current_offset;
 next:
                vector += 8;
                if (vector >= first_system_vector) {
-                        /* If we run out of vectors on large boxen, must share them. */
+                        /* If out of vectors on large boxen, must share them. */
                        offset = (offset + 1) % 8;
                        vector = FIRST_DEVICE_VECTOR + offset;
                }
                if (unlikely(current_vector == vector))
                        continue;
-#ifdef CONFIG_X86_64
-                if (vector == IA32_SYSCALL_VECTOR)
+                if (test_bit(vector, used_vectors))
-                        goto next;
-#else
-                if (vector == SYSCALL_VECTOR)
                        goto next;
-#endif
-                for_each_cpu_mask_nr(new_cpu, new_mask)
+                for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
                        if (per_cpu(vector_irq, new_cpu)[vector] != -1)
                                goto next;
                /* Found one! */
@@ -1102,49 +1342,47 @@ next:
                current_offset = offset;
                if (old_vector) {
                        cfg->move_in_progress = 1;
-                        cfg->old_domain = cfg->domain;
+                        cpumask_copy(cfg->old_domain, cfg->domain);
                }
-                for_each_cpu_mask_nr(new_cpu, new_mask)
+                for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
                        per_cpu(vector_irq, new_cpu)[vector] = irq;
                cfg->vector = vector;
-                cfg->domain = domain;
+                cpumask_copy(cfg->domain, tmp_mask);
-                return 0;
+                err = 0;
+                break;
        }
-        return -ENOSPC;
+        free_cpumask_var(tmp_mask);
+        return err;
 }
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 {
        int err;
        unsigned long flags;
        spin_lock_irqsave(&vector_lock, flags);
-        err = __assign_irq_vector(irq, mask);
+        err = __assign_irq_vector(irq, cfg, mask);
        spin_unlock_irqrestore(&vector_lock, flags);
        return err;
 }
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-        struct irq_cfg *cfg;
-        cpumask_t mask;
        int cpu, vector;
-        cfg = irq_cfg(irq);
        BUG_ON(!cfg->vector);
        vector = cfg->vector;
-        cpus_and(mask, cfg->domain, cpu_online_map);
+        for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
-        for_each_cpu_mask_nr(cpu, mask)
                per_cpu(vector_irq, cpu)[vector] = -1;
        cfg->vector = 0;
-        cpus_clear(cfg->domain);
+        cpumask_clear(cfg->domain);
        if (likely(!cfg->move_in_progress))
                return;
-        cpus_and(mask, cfg->old_domain, cpu_online_map);
+        for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
-        for_each_cpu_mask_nr(cpu, mask) {
                for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
                                                                vector++) {
                        if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -1162,10 +1400,12 @@ void __setup_vector_irq(int cpu)
        /* This function must be called with vector_lock held */
        int irq, vector;
        struct irq_cfg *cfg;
+        struct irq_desc *desc;
        /* Mark the inuse vectors */
-        for_each_irq_cfg(irq, cfg) {
+        for_each_irq_desc(irq, desc) {
-                if (!cpu_isset(cpu, cfg->domain))
+                cfg = desc->chip_data;
+                if (!cpumask_test_cpu(cpu, cfg->domain))
                        continue;
                vector = cfg->vector;
                per_cpu(vector_irq, cpu)[vector] = irq;
@@ -1177,7 +1417,7 @@ void __setup_vector_irq(int cpu)
                        continue;
                cfg = irq_cfg(irq);
-                if (!cpu_isset(cpu, cfg->domain))
+                if (!cpumask_test_cpu(cpu, cfg->domain))
                        per_cpu(vector_irq, cpu)[vector] = -1;
        }
 }
@@ -1215,11 +1455,8 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 #endif
-static void ioapic_register_intr(int irq, unsigned long trigger)
+static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
 {
-        struct irq_desc *desc;
-        desc = irq_to_desc(irq);
        if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
            trigger == IOAPIC_LEVEL)
@@ -1311,23 +1548,22 @@ static int setup_ioapic_entry(int apic, int irq,
        return 0;
 }
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
                              int trigger, int polarity)
 {
        struct irq_cfg *cfg;
        struct IO_APIC_route_entry entry;
-        cpumask_t mask;
+        unsigned int dest;
        if (!IO_APIC_IRQ(irq))
                return;
-        cfg = irq_cfg(irq);
+        cfg = desc->chip_data;
-        mask = TARGET_CPUS;
+        if (assign_irq_vector(irq, cfg, TARGET_CPUS))
-        if (assign_irq_vector(irq, mask))
                return;
-        cpus_and(mask, cfg->domain, mask);
+        dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
        apic_printk(APIC_VERBOSE,KERN_DEBUG
                    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1337,16 +1573,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
        if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
-                               cpu_mask_to_apicid(mask), trigger, polarity,
+                               dest, trigger, polarity, cfg->vector)) {
-                               cfg->vector)) {
                printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
                       mp_ioapics[apic].mp_apicid, pin);
-                __clear_irq_vector(irq);
+                __clear_irq_vector(irq, cfg);
                return;
        }
-        ioapic_register_intr(irq, trigger);
+        ioapic_register_intr(irq, desc, trigger);
-        if (irq < 16)
+        if (irq < NR_IRQS_LEGACY)
                disable_8259A_irq(irq);
        ioapic_write_entry(apic, pin, entry);
@@ -1356,6 +1591,9 @@ static void __init setup_IO_APIC_irqs(void)
 {
        int apic, pin, idx, irq;
        int notcon = 0;
+        struct irq_desc *desc;
+        struct irq_cfg *cfg;
+        int cpu = boot_cpu_id;
        apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
@@ -1387,9 +1625,15 @@ static void __init setup_IO_APIC_irqs(void)
                        if (multi_timer_check(apic, irq))
                                continue;
 #endif
-                        add_pin_to_irq(irq, apic, pin);
+                        desc = irq_to_desc_alloc_cpu(irq, cpu);
+                        if (!desc) {
+                                printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+                                continue;
+                        }
+                        cfg = desc->chip_data;
+                        add_pin_to_irq_cpu(cfg, cpu, apic, pin);
-                        setup_IO_APIC_irq(apic, pin, irq,
+                        setup_IO_APIC_irq(apic, pin, irq, desc,
                                        irq_trigger(idx), irq_polarity(idx));
                }
        }
@@ -1448,6 +1692,7 @@ __apicdebuginit(void) print_IO_APIC(void)
        union IO_APIC_reg_03 reg_03;
        unsigned long flags;
        struct irq_cfg *cfg;
+        struct irq_desc *desc;
        unsigned int irq;
        if (apic_verbosity == APIC_QUIET)
@@ -1537,8 +1782,11 @@ __apicdebuginit(void) print_IO_APIC(void)
        }
        }
        printk(KERN_DEBUG "IRQ to pin mappings:\n");
-        for_each_irq_cfg(irq, cfg) {
+        for_each_irq_desc(irq, desc) {
-                struct irq_pin_list *entry = cfg->irq_2_pin;
+                struct irq_pin_list *entry;
+                cfg = desc->chip_data;
+                entry = cfg->irq_2_pin;
                if (!entry)
                        continue;
                printk(KERN_DEBUG "IRQ%d ", irq);
@@ -2022,14 +2270,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
 {
        int was_pending = 0;
        unsigned long flags;
+        struct irq_cfg *cfg;
        spin_lock_irqsave(&ioapic_lock, flags);
-        if (irq < 16) {
+        if (irq < NR_IRQS_LEGACY) {
                disable_8259A_irq(irq);
                if (i8259A_irq_pending(irq))
                        was_pending = 1;
        }
-        __unmask_IO_APIC_irq(irq);
+        cfg = irq_cfg(irq);
+        __unmask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
        return was_pending;
@@ -2043,7 +2293,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
        unsigned long flags;
        spin_lock_irqsave(&vector_lock, flags);
-        send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
+        send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
        spin_unlock_irqrestore(&vector_lock, flags);
        return 1;
@@ -2092,35 +2342,35 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
 * as simple as edge triggered migration and we can do the irq migration
 * with a simple atomic update to IO-APIC RTE.
 */
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void
+migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
        struct irq_cfg *cfg;
-        struct irq_desc *desc;
-        cpumask_t tmp, cleanup_mask;
        struct irte irte;
        int modify_ioapic_rte;
        unsigned int dest;
        unsigned long flags;
+        unsigned int irq;
-        cpus_and(tmp, mask, cpu_online_map);
+        if (!cpumask_intersects(mask, cpu_online_mask))
-        if (cpus_empty(tmp))
                return;
+        irq = desc->irq;
        if (get_irte(irq, &irte))
                return;
-        if (assign_irq_vector(irq, mask))
+        cfg = desc->chip_data;
+        if (assign_irq_vector(irq, cfg, mask))
                return;
-        cfg = irq_cfg(irq);
+        set_extra_move_desc(desc, mask);
-        cpus_and(tmp, cfg->domain, mask);
-        dest = cpu_mask_to_apicid(tmp);
+        dest = cpu_mask_to_apicid_and(cfg->domain, mask);
-        desc = irq_to_desc(irq);
        modify_ioapic_rte = desc->status & IRQ_LEVEL;
        if (modify_ioapic_rte) {
                spin_lock_irqsave(&ioapic_lock, flags);
-                __target_IO_APIC_irq(irq, dest, cfg->vector);
+                __target_IO_APIC_irq(irq, dest, cfg);
                spin_unlock_irqrestore(&ioapic_lock, flags);
        }
@@ -2132,24 +2382,20 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
         */
        modify_irte(irq, &irte);
-        if (cfg->move_in_progress) {
+        if (cfg->move_in_progress)
-                cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+                send_cleanup_vector(cfg);
-                cfg->move_cleanup_count = cpus_weight(cleanup_mask);
-                send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-                cfg->move_in_progress = 0;
-        }
-        desc->affinity = mask;
+        cpumask_copy(&desc->affinity, mask);
 }
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
 {
        int ret = -1;
-        struct irq_desc *desc = irq_to_desc(irq);
+        struct irq_cfg *cfg = desc->chip_data;
-        mask_IO_APIC_irq(irq);
+        mask_IO_APIC_irq_desc(desc);
-        if (io_apic_level_ack_pending(irq)) {
+        if (io_apic_level_ack_pending(cfg)) {
                /*
                 * Interrupt in progress. Migrating irq now will change the
                 * vector information in the IO-APIC RTE and that will confuse
@@ -2161,14 +2407,15 @@ static int migrate_irq_remapped_level(int irq)
        }
        /* everthing is clear. we have right of way */
-        migrate_ioapic_irq(irq, desc->pending_mask);
+        migrate_ioapic_irq_desc(desc, &desc->pending_mask);
        ret = 0;
        desc->status &= ~IRQ_MOVE_PENDING;
-        cpus_clear(desc->pending_mask);
+        cpumask_clear(&desc->pending_mask);
 unmask:
-        unmask_IO_APIC_irq(irq);
+        unmask_IO_APIC_irq_desc(desc);
        return ret;
 }
@@ -2189,7 +2436,7 @@ static void ir_irq_migration(struct work_struct *work)
                                continue;
                        }
-                        desc->chip->set_affinity(irq, desc->pending_mask);
+                        desc->chip->set_affinity(irq, &desc->pending_mask);
                        spin_unlock_irqrestore(&desc->lock, flags);
                }
        }
@@ -2198,18 +2445,24 @@ static void ir_irq_migration(struct work_struct *work)
 /*
 * Migrates the IRQ destination in the process context.
 */
-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+                                            const struct cpumask *mask)
 {
-        struct irq_desc *desc = irq_to_desc(irq);
        if (desc->status & IRQ_LEVEL) {
                desc->status |= IRQ_MOVE_PENDING;
-                desc->pending_mask = mask;
+                cpumask_copy(&desc->pending_mask, mask);
-                migrate_irq_remapped_level(irq);
+                migrate_irq_remapped_level_desc(desc);
                return;
        }
-        migrate_ioapic_irq(irq, mask);
+        migrate_ioapic_irq_desc(desc, mask);
+}
+static void set_ir_ioapic_affinity_irq(unsigned int irq,
+                                       const struct cpumask *mask)
+{
+        struct irq_desc *desc = irq_to_desc(irq);
+        set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
 #endif
@@ -2228,6 +2481,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
                struct irq_cfg *cfg;
                irq = __get_cpu_var(vector_irq)[vector];
+                if (irq == -1)
+                        continue;
                desc = irq_to_desc(irq);
                if (!desc)
                        continue;
@@ -2237,7 +2493,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
                if (!cfg->move_cleanup_count)
                        goto unlock;
-                if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
+                if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
                        goto unlock;
                __get_cpu_var(vector_irq)[vector] = -1;
@@ -2249,28 +2505,44 @@ unlock:
        irq_exit();
 }
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
 {
-        struct irq_cfg *cfg = irq_cfg(irq);
+        struct irq_desc *desc = *descp;
+        struct irq_cfg *cfg = desc->chip_data;
        unsigned vector, me;
-        if (likely(!cfg->move_in_progress))
+        if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+                if (likely(!cfg->move_desc_pending))
+                        return;
+                /* domain has not changed, but affinity did */
+                me = smp_processor_id();
+                if (cpu_isset(me, desc->affinity)) {
+                        *descp = desc = move_irq_desc(desc, me);
+                        /* get the new one */
+                        cfg = desc->chip_data;
+                        cfg->move_desc_pending = 0;
+                }
+#endif
                return;
+        }
        vector = ~get_irq_regs()->orig_ax;
        me = smp_processor_id();
-        if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-                cpumask_t cleanup_mask;
+                *descp = desc = move_irq_desc(desc, me);
+                /* get the new one */
+                cfg = desc->chip_data;
+#endif
-                cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+        if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
-                cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+                send_cleanup_vector(cfg);
-                send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-                cfg->move_in_progress = 0;
-        }
 }
 #else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
 #ifdef CONFIG_INTR_REMAP
 static void ack_x2apic_level(unsigned int irq)
 {
@@ -2281,11 +2553,14 @@ static void ack_x2apic_edge(unsigned int irq)
 {
        ack_x2APIC_irq();
 }
 #endif
 static void ack_apic_edge(unsigned int irq)
 {
-        irq_complete_move(irq);
+        struct irq_desc *desc = irq_to_desc(irq);
+        irq_complete_move(&desc);
        move_native_irq(irq);
        ack_APIC_irq();
 }
@@ -2294,18 +2569,21 @@ atomic_t irq_mis_count;
 static void ack_apic_level(unsigned int irq)
 {
+        struct irq_desc *desc = irq_to_desc(irq);
 #ifdef CONFIG_X86_32
        unsigned long v;
        int i;
 #endif
+        struct irq_cfg *cfg;
        int do_unmask_irq = 0;
-        irq_complete_move(irq);
+        irq_complete_move(&desc);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
        /* If we are moving the irq we need to mask it */
-        if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+        if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
                do_unmask_irq = 1;
-                mask_IO_APIC_irq(irq);
+                mask_IO_APIC_irq_desc(desc);
        }
 #endif
@@ -2329,7 +2607,8 @@ static void ack_apic_level(unsigned int irq)
        * operation to prevent an edge-triggered interrupt escaping meanwhile.
        * The idea is from Manfred Spraul.  --macro
        */
-        i = irq_cfg(irq)->vector;
+        cfg = desc->chip_data;
+        i = cfg->vector;
        v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 #endif
@@ -2368,17 +2647,18 @@ static void ack_apic_level(unsigned int irq)
                 * accurate and is causing problems then it is a hardware bug
                 * and you can go talk to the chipset vendor about it.
                 */
-                if (!io_apic_level_ack_pending(irq))
+                cfg = desc->chip_data;
+                if (!io_apic_level_ack_pending(cfg))
                        move_masked_irq(irq);
-                unmask_IO_APIC_irq(irq);
+                unmask_IO_APIC_irq_desc(desc);
        }
 #ifdef CONFIG_X86_32
        if (!(v & (1 << (i & 0x1f)))) {
                atomic_inc(&irq_mis_count);
                spin_lock(&ioapic_lock);
-                __mask_and_edge_IO_APIC_irq(irq);
+                __mask_and_edge_IO_APIC_irq(cfg);
-                __unmask_and_level_IO_APIC_irq(irq);
+                __unmask_and_level_IO_APIC_irq(cfg);
                spin_unlock(&ioapic_lock);
        }
 #endif
@@ -2429,20 +2709,19 @@ static inline void init_IO_APIC_traps(void)
         * Also, we've got to be careful not to trash gate
         * 0x80, because int 0x80 is hm, kind of importantish. ;)
         */
-        for_each_irq_cfg(irq, cfg) {
+        for_each_irq_desc(irq, desc) {
-                if (IO_APIC_IRQ(irq) && !cfg->vector) {
+                cfg = desc->chip_data;
+                if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
                        /*
                         * Hmm.. We don't have an entry for this,
                         * so default to an old-fashioned 8259
                         * interrupt if we can..
                         */
-                        if (irq < 16)
+                        if (irq < NR_IRQS_LEGACY)
                                make_8259A_irq(irq);
-                        else {
+                        else
-                                desc = irq_to_desc(irq);
                                /* Strange. Oh, well.. */
                                desc->chip = &no_irq_chip;
-                        }
                }
        }
 }
@@ -2467,7 +2746,7 @@ static void unmask_lapic_irq(unsigned int irq)
        apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
 {
        ack_APIC_irq();
 }
@@ -2479,11 +2758,8 @@ static struct irq_chip lapic_chip __read_mostly = {
        .ack            = ack_lapic_irq,
 };
-static void lapic_register_intr(int irq)
+static void lapic_register_intr(int irq, struct irq_desc *desc)
 {
-        struct irq_desc *desc;
-        desc = irq_to_desc(irq);
        desc->status &= ~IRQ_LEVEL;
        set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
                                      "edge");
@@ -2587,7 +2863,9 @@ int timer_through_8259 __initdata;
 */
 static inline void __init check_timer(void)
 {
-        struct irq_cfg *cfg = irq_cfg(0);
+        struct irq_desc *desc = irq_to_desc(0);
+        struct irq_cfg *cfg = desc->chip_data;
+        int cpu = boot_cpu_id;
        int apic1, pin1, apic2, pin2;
        unsigned long flags;
        unsigned int ver;
@@ -2602,7 +2880,7 @@ static inline void __init check_timer(void)
         * get/set the timer IRQ vector:
         */
        disable_8259A_irq(0);
-        assign_irq_vector(0, TARGET_CPUS);
+        assign_irq_vector(0, cfg, TARGET_CPUS);
        /*
         * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2653,10 +2931,10 @@ static inline void __init check_timer(void)
                 * Ok, does IRQ0 through the IOAPIC work?
                 */
                if (no_pin1) {
-                        add_pin_to_irq(0, apic1, pin1);
+                        add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
                        setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
                }
-                unmask_IO_APIC_irq(0);
+                unmask_IO_APIC_irq_desc(desc);
                if (timer_irq_works()) {
                        if (nmi_watchdog == NMI_IO_APIC) {
                                setup_nmi();
@@ -2682,9 +2960,9 @@ static inline void __init check_timer(void)
                /*
                 * legacy devices should be connected to IO APIC #0
                 */
-                replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+                replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
                setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-                unmask_IO_APIC_irq(0);
+                unmask_IO_APIC_irq_desc(desc);
                enable_8259A_irq(0);
                if (timer_irq_works()) {
                        apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2716,7 +2994,7 @@ static inline void __init check_timer(void)
        apic_printk(APIC_QUIET, KERN_INFO
                    "...trying to set up timer as Virtual Wire IRQ...\n");
-        lapic_register_intr(0);
+        lapic_register_intr(0, desc);
        apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
        enable_8259A_irq(0);
@@ -2901,22 +3179,26 @@ unsigned int create_irq_nr(unsigned int irq_want)
        unsigned int irq;
        unsigned int new;
        unsigned long flags;
-        struct irq_cfg *cfg_new;
+        struct irq_cfg *cfg_new = NULL;
+        int cpu = boot_cpu_id;
-        irq_want = nr_irqs - 1;
+        struct irq_desc *desc_new = NULL;
        irq = 0;
        spin_lock_irqsave(&vector_lock, flags);
-        for (new = irq_want; new > 0; new--) {
+        for (new = irq_want; new < NR_IRQS; new++) {
                if (platform_legacy_irq(new))
                        continue;
-                cfg_new = irq_cfg(new);
-                if (cfg_new && cfg_new->vector != 0)
+                desc_new = irq_to_desc_alloc_cpu(new, cpu);
+                if (!desc_new) {
+                        printk(KERN_INFO "can not get irq_desc for %d\n", new);
                        continue;
-                /* check if need to create one */
+                }
-                if (!cfg_new)
+                cfg_new = desc_new->chip_data;
-                        cfg_new = irq_cfg_alloc(new);
-                if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+                if (cfg_new->vector != 0)
+                        continue;
+                if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
                        irq = new;
                break;
        }
@@ -2924,15 +3206,21 @@ unsigned int create_irq_nr(unsigned int irq_want)
        if (irq > 0) {
                dynamic_irq_init(irq);
+                /* restore chip_data in case dynamic_irq_init() cleared it */
+                if (desc_new)
+                        desc_new->chip_data = cfg_new;
        }
        return irq;
 }
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
 int create_irq(void)
 {
+        unsigned int irq_want;
        int irq;
-        irq = create_irq_nr(nr_irqs - 1);
+        irq_want = nr_irqs_gsi;
+        irq = create_irq_nr(irq_want);
        if (irq == 0)
                irq = -1;
@@ -2943,14 +3231,22 @@ int create_irq(void)
 void destroy_irq(unsigned int irq)
 {
        unsigned long flags;
+        struct irq_cfg *cfg;
+        struct irq_desc *desc;
+        /* save chip_data in case dynamic_irq_cleanup() clears it */
+        desc = irq_to_desc(irq);
+        cfg = desc->chip_data;
        dynamic_irq_cleanup(irq);
+        /* reattach the saved irq_cfg to the descriptor */
+        if (desc)
+                desc->chip_data = cfg;
 #ifdef CONFIG_INTR_REMAP
        free_irte(irq);
 #endif
        spin_lock_irqsave(&vector_lock, flags);
-        __clear_irq_vector(irq);
+        __clear_irq_vector(irq, cfg);
        spin_unlock_irqrestore(&vector_lock, flags);
 }
@@ -2963,16 +3259,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
        struct irq_cfg *cfg;
        int err;
        unsigned dest;
-        cpumask_t tmp;
-        tmp = TARGET_CPUS;
+        cfg = irq_cfg(irq);
-        err = assign_irq_vector(irq, tmp);
+        err = assign_irq_vector(irq, cfg, TARGET_CPUS);
        if (err)
                return err;
-        cfg = irq_cfg(irq);
+        dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
-        cpus_and(tmp, cfg->domain, tmp);
-        dest = cpu_mask_to_apicid(tmp);
 #ifdef CONFIG_INTR_REMAP
        if (irq_remapped(irq)) {
@@ -3026,64 +3319,48 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 }
 #ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
+        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        struct msi_msg msg;
        unsigned int dest;
-        cpumask_t tmp;
-        struct irq_desc *desc;
-        cpus_and(tmp, mask, cpu_online_map);
+        dest = set_desc_affinity(desc, mask);
-        if (cpus_empty(tmp))
+        if (dest == BAD_APICID)
                return;
-        if (assign_irq_vector(irq, mask))
+        cfg = desc->chip_data;
-                return;
-        cfg = irq_cfg(irq);
+        read_msi_msg_desc(desc, &msg);
-        cpus_and(tmp, cfg->domain, mask);
-        dest = cpu_mask_to_apicid(tmp);
-        read_msi_msg(irq, &msg);
        msg.data &= ~MSI_DATA_VECTOR_MASK;
        msg.data |= MSI_DATA_VECTOR(cfg->vector);
        msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-        write_msi_msg(irq, &msg);
+        write_msi_msg_desc(desc, &msg);
-        desc = irq_to_desc(irq);
-        desc->affinity = mask;
 }
 #ifdef CONFIG_INTR_REMAP
 /*
 * Migrate the MSI irq to another cpumask. This migration is
 * done in the process context using interrupt-remapping hardware.
 */
-static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void
+ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
-        struct irq_cfg *cfg;
+        struct irq_desc *desc = irq_to_desc(irq);
+        struct irq_cfg *cfg = desc->chip_data;
        unsigned int dest;
-        cpumask_t tmp, cleanup_mask;
        struct irte irte;
-        struct irq_desc *desc;
-        cpus_and(tmp, mask, cpu_online_map);
-        if (cpus_empty(tmp))
-                return;
        if (get_irte(irq, &irte))
                return;
-        if (assign_irq_vector(irq, mask))
+        dest = set_desc_affinity(desc, mask);
+        if (dest == BAD_APICID)
                return;
-        cfg = irq_cfg(irq);
-        cpus_and(tmp, cfg->domain, mask);
-        dest = cpu_mask_to_apicid(tmp);
        irte.vector = cfg->vector;
        irte.dest_id = IRTE_DEST(dest);
@@ -3097,16 +3374,10 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
         * at the new destination. So, time to cleanup the previous
         * vector allocation.
         */
-        if (cfg->move_in_progress) {
+        if (cfg->move_in_progress)
-                cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+                send_cleanup_vector(cfg);
-                cfg->move_cleanup_count = cpus_weight(cleanup_mask);
-                send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-                cfg->move_in_progress = 0;
-        }
-        desc = irq_to_desc(irq);
-        desc->affinity = mask;
 }
 #endif
 #endif /* CONFIG_SMP */
@@ -3165,7 +3436,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
 }
 #endif
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
        int ret;
        struct msi_msg msg;
@@ -3174,7 +3445,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
        if (ret < 0)
                return ret;
-        set_irq_msi(irq, desc);
+        set_irq_msi(irq, msidesc);
        write_msi_msg(irq, &msg);
 #ifdef CONFIG_INTR_REMAP
@@ -3194,26 +3465,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
        return 0;
 }
-static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
-{
-        unsigned int irq;
-        irq = dev->bus->number;
-        irq <<= 8;
-        irq |= dev->devfn;
-        irq <<= 12;
-        return irq;
-}
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
 {
        unsigned int irq;
        int ret;
        unsigned int irq_want;
-        irq_want = build_irq_for_pci_dev(dev) + 0x100;
+        irq_want = nr_irqs_gsi;
        irq = create_irq_nr(irq_want);
        if (irq == 0)
                return -1;
@@ -3227,7 +3485,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
                goto error;
 no_ir:
 #endif
-        ret = setup_msi_irq(dev, desc, irq);
+        ret = setup_msi_irq(dev, msidesc, irq);
        if (ret < 0) {
                destroy_irq(irq);
                return ret;
@@ -3245,7 +3503,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
        unsigned int irq;
        int ret, sub_handle;
-        struct msi_desc *desc;
+        struct msi_desc *msidesc;
        unsigned int irq_want;
 #ifdef CONFIG_INTR_REMAP
@@ -3253,10 +3511,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
        int index = 0;
 #endif
-        irq_want = build_irq_for_pci_dev(dev) + 0x100;
+        irq_want = nr_irqs_gsi;
        sub_handle = 0;
-        list_for_each_entry(desc, &dev->msi_list, list) {
+        list_for_each_entry(msidesc, &dev->msi_list, list) {
-                irq = create_irq_nr(irq_want--);
+                irq = create_irq_nr(irq_want);
+                irq_want++;
                if (irq == 0)
                        return -1;
 #ifdef CONFIG_INTR_REMAP
@@ -3288,7 +3547,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
                }
 no_ir:
 #endif
-                ret = setup_msi_irq(dev, desc, irq);
+                ret = setup_msi_irq(dev, msidesc, irq);
                if (ret < 0)
                        goto error;
                sub_handle++;
@@ -3307,24 +3566,18 @@ void arch_teardown_msi_irq(unsigned int irq)
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
+        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        struct msi_msg msg;
        unsigned int dest;
-        cpumask_t tmp;
-        struct irq_desc *desc;
-        cpus_and(tmp, mask, cpu_online_map);
+        dest = set_desc_affinity(desc, mask);
-        if (cpus_empty(tmp))
+        if (dest == BAD_APICID)
                return;
-        if (assign_irq_vector(irq, mask))
+        cfg = desc->chip_data;
-                return;
-        cfg = irq_cfg(irq);
-        cpus_and(tmp, cfg->domain, mask);
-        dest = cpu_mask_to_apicid(tmp);
        dmar_msi_read(irq, &msg);
@@ -3334,9 +3587,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
        dmar_msi_write(irq, &msg);
-        desc = irq_to_desc(irq);
-        desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
 struct irq_chip dmar_msi_type = {
@@ -3368,24 +3620,18 @@ int arch_setup_dmar_msi(unsigned int irq)
 #ifdef CONFIG_HPET_TIMER
 #ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
+        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
-        struct irq_desc *desc;
        struct msi_msg msg;
        unsigned int dest;
-        cpumask_t tmp;
-        cpus_and(tmp, mask, cpu_online_map);
+        dest = set_desc_affinity(desc, mask);
-        if (cpus_empty(tmp))
+        if (dest == BAD_APICID)
                return;
-        if (assign_irq_vector(irq, mask))
+        cfg = desc->chip_data;
-                return;
-        cfg = irq_cfg(irq);
-        cpus_and(tmp, cfg->domain, mask);
-        dest = cpu_mask_to_apicid(tmp);
        hpet_msi_read(irq, &msg);
@@ -3395,9 +3641,8 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
        hpet_msi_write(irq, &msg);
-        desc = irq_to_desc(irq);
-        desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
 struct irq_chip hpet_msi_type = {
@@ -3450,28 +3695,21 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
        write_ht_irq_msg(irq, &msg);
 }
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
+        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        unsigned int dest;
-        cpumask_t tmp;
-        struct irq_desc *desc;
-        cpus_and(tmp, mask, cpu_online_map);
+        dest = set_desc_affinity(desc, mask);
-        if (cpus_empty(tmp))
+        if (dest == BAD_APICID)
                return;
-        if (assign_irq_vector(irq, mask))
+        cfg = desc->chip_data;
-                return;
-        cfg = irq_cfg(irq);
-        cpus_and(tmp, cfg->domain, mask);
-        dest = cpu_mask_to_apicid(tmp);
        target_ht_irq(irq, dest, cfg->vector);
-        desc = irq_to_desc(irq);
-        desc->affinity = mask;
 }
 #endif
 static struct irq_chip ht_irq_chip = {
@@ -3489,17 +3727,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 {
        struct irq_cfg *cfg;
        int err;
-        cpumask_t tmp;
-        tmp = TARGET_CPUS;
+        cfg = irq_cfg(irq);
-        err = assign_irq_vector(irq, tmp);
+        err = assign_irq_vector(irq, cfg, TARGET_CPUS);
        if (!err) {
                struct ht_irq_msg msg;
                unsigned dest;
-                cfg = irq_cfg(irq);
+                dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
-                cpus_and(tmp, cfg->domain, tmp);
-                dest = cpu_mask_to_apicid(tmp);
                msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
@@ -3535,7 +3770,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
                       unsigned long mmr_offset)
 {
-        const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+        const struct cpumask *eligible_cpu = cpumask_of(cpu);
        struct irq_cfg *cfg;
        int mmr_pnode;
        unsigned long mmr_value;
@@ -3543,7 +3778,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
        unsigned long flags;
        int err;
-        err = assign_irq_vector(irq, *eligible_cpu);
+        cfg = irq_cfg(irq);
+        err = assign_irq_vector(irq, cfg, eligible_cpu);
        if (err != 0)
                return err;
@@ -3552,8 +3789,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
                                      irq_name);
        spin_unlock_irqrestore(&vector_lock, flags);
-        cfg = irq_cfg(irq);
        mmr_value = 0;
        entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
        BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3564,7 +3799,7 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
        entry->polarity = 0;
        entry->trigger = 0;
        entry->mask = 0;
-        entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+        entry->dest = cpu_mask_to_apicid(eligible_cpu);
        mmr_pnode = uv_blade_to_pnode(mmr_blade);
        uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3605,9 +3840,16 @@ int __init io_apic_get_redir_entries (int ioapic)
        return reg_01.bits.entries;
 }
-int __init probe_nr_irqs(void)
+void __init probe_nr_irqs_gsi(void)
 {
-        return NR_IRQS;
+        int idx;
+        int nr = 0;
+        for (idx = 0; idx < nr_ioapics; idx++)
+                nr += io_apic_get_redir_entries(idx) + 1;
+        if (nr > nr_irqs_gsi)
+                nr_irqs_gsi = nr;
 }
 /* --------------------------------------------------------------------------
@@ -3706,19 +3948,31 @@ int __init io_apic_get_version(int ioapic)
 int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
 {
+        struct irq_desc *desc;
+        struct irq_cfg *cfg;
+        int cpu = boot_cpu_id;
        if (!IO_APIC_IRQ(irq)) {
                apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
                        ioapic);
                return -EINVAL;
        }
+        desc = irq_to_desc_alloc_cpu(irq, cpu);
+        if (!desc) {
+                printk(KERN_INFO "can not get irq_desc %d\n", irq);
+                return 0;
+        }
        /*
         * IRQs < 16 are already in the irq_2_pin[] map
         */
-        if (irq >= 16)
+        if (irq >= NR_IRQS_LEGACY) {
-                add_pin_to_irq(irq, ioapic, pin);
+                cfg = desc->chip_data;
+                add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+        }
-        setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+        setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
        return 0;
 }
@@ -3756,7 +4010,7 @@ void __init setup_ioapic_dest(void)
        int pin, ioapic, irq, irq_entry;
        struct irq_desc *desc;
        struct irq_cfg *cfg;
-        cpumask_t mask;
+        const struct cpumask *mask;
        if (skip_ioapic_setup == 1)
                return;
@@ -3772,9 +4026,10 @@ void __init setup_ioapic_dest(void)
                         * when you have too many devices, because at that time only boot
                         * cpu is online.
                         */
-                        cfg = irq_cfg(irq);
+                        desc = irq_to_desc(irq);
+                        cfg = desc->chip_data;
                        if (!cfg->vector) {
-                                setup_IO_APIC_irq(ioapic, pin, irq,
+                                setup_IO_APIC_irq(ioapic, pin, irq, desc,
                                                  irq_trigger(irq_entry),
                                                  irq_polarity(irq_entry));
                                continue;
@@ -3784,19 +4039,18 @@ void __init setup_ioapic_dest(void)
                        /*
                         * Honour affinities which have been set in early boot
                         */
-                        desc = irq_to_desc(irq);
                        if (desc->status &
                            (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-                                mask = desc->affinity;
+                                mask = &desc->affinity;
                        else
                                mask = TARGET_CPUS;
 #ifdef CONFIG_INTR_REMAP
                        if (intr_remapping_enabled)
-                                set_ir_ioapic_affinity_irq(irq, mask);
+                                set_ir_ioapic_affinity_irq_desc(desc, mask);
                        else
 #endif
-                                set_ioapic_affinity_irq(irq, mask);
+                                set_ioapic_affinity_irq_desc(desc, mask);
                }
        }
@@ -3845,7 +4099,6 @@ void __init ioapic_init_mappings(void)
        struct resource *ioapic_res;
        int i;
-        irq_2_pin_init();
        ioapic_res = ioapic_setup_resources();
        for (i = 0; i < nr_ioapics; i++) {
                if (smp_found_config) {
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index f1c688e46f35..285bbf8831fa 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -116,18 +116,18 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
 /*
 * This is only used on smaller machines.
 */
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
 {
-        unsigned long mask = cpus_addr(cpumask)[0];
+        unsigned long mask = cpumask_bits(cpumask)[0];
        unsigned long flags;
        local_irq_save(flags);
-        WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+        WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
        __send_IPI_dest_field(mask, vector);
        local_irq_restore(flags);
 }
-void send_IPI_mask_sequence(cpumask_t mask, int vector)
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
 {
        unsigned long flags;
        unsigned int query_cpu;
@@ -139,12 +139,24 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
         */
        local_irq_save(flags);
-        for_each_possible_cpu(query_cpu) {
+        for_each_cpu(query_cpu, mask)
-                if (cpu_isset(query_cpu, mask)) {
+                __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector);
+        local_irq_restore(flags);
+}
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+        unsigned long flags;
+        unsigned int query_cpu;
+        unsigned int this_cpu = smp_processor_id();
+        /* See Hack comment above */
+        local_irq_save(flags);
+        for_each_cpu(query_cpu, mask)
+                if (query_cpu != this_cpu)
                        __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
                                              vector);
-                }
-        }
        local_irq_restore(flags);
 }
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f649..bce53e1352a0 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -9,6 +9,7 @@
 #include <asm/apic.h>
 #include <asm/io_apic.h>
 #include <asm/smp.h>
+#include <asm/irq.h>
 atomic_t irq_err_count;
@@ -118,6 +119,9 @@ int show_interrupts(struct seq_file *p, void *v)
        }
        desc = irq_to_desc(i);
+        if (!desc)
+                return 0;
        spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
        any_count = kstat_irqs(i);
@@ -187,3 +191,5 @@ u64 arch_irq_stat(void)
 #endif
        return sum;
 }
+EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a51382672de0..9dc5588f336a 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -233,25 +233,28 @@ unsigned int do_IRQ(struct pt_regs *regs)
 #ifdef CONFIG_HOTPLUG_CPU
 #include <mach_apic.h>
-void fixup_irqs(cpumask_t map)
+/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
+void fixup_irqs(void)
 {
        unsigned int irq;
        static int warned;
        struct irq_desc *desc;
        for_each_irq_desc(irq, desc) {
-                cpumask_t mask;
+                const struct cpumask *affinity;
+                if (!desc)
+                        continue;
                if (irq == 2)
                        continue;
-                cpus_and(mask, desc->affinity, map);
+                affinity = &desc->affinity;
-                if (any_online_cpu(mask) == NR_CPUS) {
+                if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
                        printk("Breaking affinity for irq %i\n", irq);
-                        mask = map;
+                        affinity = cpu_all_mask;
                }
                if (desc->chip->set_affinity)
-                        desc->chip->set_affinity(irq, mask);
+                        desc->chip->set_affinity(irq, affinity);
                else if (desc->action && !(warned++))
                        printk("Cannot set affinity for irq %i\n", irq);
        }
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1df869e5bd0b..6383d50f82ea 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -80,40 +80,43 @@ asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 }
 #ifdef CONFIG_HOTPLUG_CPU
-void fixup_irqs(cpumask_t map)
+/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
+void fixup_irqs(void)
 {
        unsigned int irq;
        static int warned;
        struct irq_desc *desc;
        for_each_irq_desc(irq, desc) {
-                cpumask_t mask;
                int break_affinity = 0;
                int set_affinity = 1;
+                const struct cpumask *affinity;
+                if (!desc)
+                        continue;
                if (irq == 2)
                        continue;
                /* interrupt's are disabled at this point */
                spin_lock(&desc->lock);
+                affinity = &desc->affinity;
                if (!irq_has_action(irq) ||
-                    cpus_equal(desc->affinity, map)) {
+                    cpumask_equal(affinity, cpu_online_mask)) {
                        spin_unlock(&desc->lock);
                        continue;
                }
-                cpus_and(mask, desc->affinity, map);
+                if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
-                if (cpus_empty(mask)) {
                        break_affinity = 1;
-                        mask = map;
+                        affinity = cpu_all_mask;
                }
                if (desc->chip->mask)
                        desc->chip->mask(irq);
                if (desc->chip->set_affinity)
-                        desc->chip->set_affinity(irq, mask);
+                        desc->chip->set_affinity(irq, affinity);
                else if (!(warned++))
                        set_affinity = 0;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 607db63044a5..84723295f88a 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
        /*
         * 16 old-style INTA-cycle interrupts:
         */
-        for (i = 0; i < 16; i++) {
+        for (i = 0; i < NR_IRQS_LEGACY; i++) {
-                /* first time call this irq_desc */
                struct irq_desc *desc = irq_to_desc(i);
                desc->status = IRQ_DISABLED;
@@ -111,6 +110,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
        [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
 };
+int vector_used_by_percpu_irq(unsigned int vector)
+{
+        int cpu;
+        for_each_online_cpu(cpu) {
+                if (per_cpu(vector_irq, cpu)[vector] != -1)
+                        return 1;
+        }
+        return 0;
+}
 /* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
@@ -147,10 +158,12 @@ void __init native_init_IRQ(void)
        alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
        /* IPI for single call function */
-        set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
+        alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+                                 call_function_single_interrupt);
        /* Low priority IPI to cleanup after moving an irq */
        set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+        set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 #endif
 #ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8670b3ce626e..31ebfe38e96c 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -69,6 +69,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
        [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
 };
+int vector_used_by_percpu_irq(unsigned int vector)
+{
+        int cpu;
+        for_each_online_cpu(cpu) {
+                if (per_cpu(vector_irq, cpu)[vector] != -1)
+                        return 1;
+        }
+        return 0;
+}
 void __init init_ISA_irqs(void)
 {
        int i;
@@ -76,8 +88,7 @@ void __init init_ISA_irqs(void)
        init_bsp_APIC();
        init_8259A(0);
-        for (i = 0; i < 16; i++) {
+        for (i = 0; i < NR_IRQS_LEGACY; i++) {
-                /* first time call this irq_desc */
                struct irq_desc *desc = irq_to_desc(i);
                desc->status = IRQ_DISABLED;
@@ -122,6 +133,7 @@ static void __init smp_intr_init(void)
        /* Low priority IPI to cleanup after moving an irq */
        set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+        set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 #endif
 }
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e169ae9b6a62..652fce6d2cce 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void)
 */
 static unsigned long kvm_get_tsc_khz(void)
 {
-        return preset_lpj;
+        struct pvclock_vcpu_time_info *src;
+        src = &per_cpu(hv_clock, 0);
+        return pvclock_tsc_khz(src);
 }
 static void kvm_get_preset_lpj(void)
 {
-        struct pvclock_vcpu_time_info *src;
        unsigned long khz;
        u64 lpj;
-        src = &per_cpu(hv_clock, 0);
+        khz = kvm_get_tsc_khz();
-        khz = pvclock_tsc_khz(src);
        lpj = ((u64)khz * 1000);
        do_div(lpj, HZ);
@@ -194,5 +194,7 @@ void __init kvmclock_init(void)
 #endif
                kvm_get_preset_lpj();
                clocksource_register(&kvm_clock);
+                pv_info.paravirt_enabled = 1;
+                pv_info.name = "KVM";
        }
 }
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 3b599518c322..c12314c9e86f 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = {
        .set_mode = mfgpt_set_mode,
        .set_next_event = mfgpt_next_event,
        .rating = 250,
-        .cpumask = CPU_MASK_ALL,
+        .cpumask = cpu_all_mask,
        .shift = 32
 };
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 7a3dfceb90e4..19a1044a0cd9 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -101,11 +101,15 @@ static void __init dma32_free_bootmem(void)
        dma32_bootmem_ptr = NULL;
        dma32_bootmem_size = 0;
 }
+#endif
 void __init pci_iommu_alloc(void)
 {
+#ifdef CONFIG_X86_64
        /* free the range so iommu could get some range less than 4G */
        dma32_free_bootmem();
+#endif
        /*
         * The order of these functions is important for
         * fall-back/fail-over reasons
@@ -121,15 +125,6 @@ void __init pci_iommu_alloc(void)
        pci_swiotlb_init();
 }
-unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
-{
-        unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
-        return size >> PAGE_SHIFT;
-}
-EXPORT_SYMBOL(iommu_nr_pages);
-#endif
 void *dma_generic_alloc_coherent(struct device *dev, size_t size,
                                 dma_addr_t *dma_addr, gfp_t flag)
 {
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 3c539d111abb..242c3440687f 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -3,6 +3,8 @@
 #include <linux/pci.h>
 #include <linux/cache.h>
 #include <linux/module.h>
+#include <linux/swiotlb.h>
+#include <linux/bootmem.h>
 #include <linux/dma-mapping.h>
 #include <asm/iommu.h>
@@ -11,6 +13,31 @@
 int swiotlb __read_mostly;
+void *swiotlb_alloc_boot(size_t size, unsigned long nslabs)
+{
+        return alloc_bootmem_low_pages(size);
+}
+void *swiotlb_alloc(unsigned order, unsigned long nslabs)
+{
+        return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
+}
+dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr)
+{
+        return paddr;
+}
+phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+{
+        return baddr;
+}
+int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
+{
+        return 0;
+}
 static dma_addr_t
 swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
                        int direction)
@@ -50,8 +77,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
 void __init pci_swiotlb_init(void)
 {
        /* don't initialize swiotlb if iommu=off (no_iommu=1) */
+#ifdef CONFIG_X86_64
        if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
               swiotlb = 1;
+#endif
        if (swiotlb_force)
                swiotlb = 1;
        if (swiotlb) {
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 67465ed89310..309949e9e1c1 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -168,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
                         ich_force_enable_hpet);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
                         ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
+                         ich_force_enable_hpet);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
                         ich_force_enable_hpet);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a90913cccfb7..bf088c61fa40 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -13,6 +13,7 @@
 #include <asm/reboot_fixups.h>
 #include <asm/reboot.h>
 #include <asm/pci_x86.h>
+#include <asm/virtext.h>
 #ifdef CONFIG_X86_32
 # include <linux/dmi.h>
@@ -39,6 +40,12 @@ int reboot_force;
 static int reboot_cpu = -1;
 #endif
+/* This is set if we need to go through the 'emergency' path.
+ * When machine_emergency_restart() is called, we may be in
+ * an inconsistent state and won't be able to do a clean cleanup
+ */
+static int reboot_emergency;
 /* This is set by the PCI code if either type 1 or type 2 PCI is detected */
 bool port_cf9_safe = false;
@@ -368,6 +375,48 @@ static inline void kb_wait(void)
        }
 }
+static void vmxoff_nmi(int cpu, struct die_args *args)
+{
+        cpu_emergency_vmxoff();
+}
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization
+ */
+static void emergency_vmx_disable_all(void)
+{
+        /* Just make sure we won't change CPUs while doing this */
+        local_irq_disable();
+        /* We need to disable VMX on all CPUs before rebooting, otherwise
+         * we risk hanging up the machine, because the CPU ignores INIT
+         * signals when VMX is enabled.
+         *
+         * We can't take any locks and we may be in an inconsistent
+         * state, so we use NMIs as IPIs to tell the other CPUs to disable
+         * VMX and halt.
+         *
+         * For safety, we will avoid running the nmi_shootdown_cpus()
+         * stuff unnecessarily, but we don't have a way to check
+         * if other CPUs have VMX enabled. So we will call it only if the
+         * CPU we are running on has VMX enabled.
+         *
+         * We will miss cases where VMX is not enabled on all CPUs. This
+         * shouldn't do much harm because KVM always enables VMX on all
+         * CPUs anyway. But we can miss it in the small window where KVM
+         * is still enabling VMX.
+         */
+        if (cpu_has_vmx() && cpu_vmx_enabled()) {
+                /* Disable VMX on this CPU.
+                 */
+                cpu_vmxoff();
+                /* Halt and disable VMX on the other CPUs */
+                nmi_shootdown_cpus(vmxoff_nmi);
+        }
+}
 void __attribute__((weak)) mach_reboot_fixups(void)
 {
 }
@@ -376,6 +425,9 @@ static void native_machine_emergency_restart(void)
 {
        int i;
+        if (reboot_emergency)
+                emergency_vmx_disable_all();
        /* Tell the BIOS if we want cold or warm reboot */
        *((unsigned short *)__va(0x472)) = reboot_mode;
@@ -482,13 +534,19 @@ void native_machine_shutdown(void)
 #endif
 }
+static void __machine_emergency_restart(int emergency)
+{
+        reboot_emergency = emergency;
+        machine_ops.emergency_restart();
+}
 static void native_machine_restart(char *__unused)
 {
        printk("machine restart\n");
        if (!reboot_force)
                machine_shutdown();
-        machine_emergency_restart();
+        __machine_emergency_restart(0);
 }
 static void native_machine_halt(void)
@@ -532,7 +590,7 @@ void machine_shutdown(void)
 void machine_emergency_restart(void)
 {
-        machine_ops.emergency_restart();
+        __machine_emergency_restart(1);
 }
 void machine_restart(char *cmd)
@@ -592,10 +650,7 @@ static int crash_nmi_callback(struct notifier_block *self,
 static void smp_send_nmi_allbutself(void)
 {
-        cpumask_t mask = cpu_online_map;
+        send_IPI_allbutself(NMI_VECTOR);
-        cpu_clear(safe_smp_processor_id(), mask);
-        if (!cpus_empty(mask))
-                send_IPI_mask(mask, NMI_VECTOR);
 }
 static struct notifier_block crash_nmi_nb = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 08e02e8453c9..ae0d8042cf69 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -953,7 +953,7 @@ void __init setup_arch(char **cmdline_p)
        ioapic_init_mappings();
        /* need to wait for io_apic is mapped */
-        nr_irqs = probe_nr_irqs();
+        probe_nr_irqs_gsi();
        kvm_guest_init();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ae0c0d3bb770..0b63b08e7530 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -152,6 +152,11 @@ void __init setup_per_cpu_areas(void)
        old_size = PERCPU_ENOUGH_ROOM;
        align = max_t(unsigned long, PAGE_SIZE, align);
        size = roundup(old_size, align);
+        printk(KERN_INFO
+                "NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+                NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
        printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
                          size);
@@ -168,24 +173,24 @@ void __init setup_per_cpu_areas(void)
                               "cpu %d has no node %d or node-local memory\n",
                                cpu, node);
                        if (ptr)
-                                printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
+                                printk(KERN_DEBUG
+                                        "per cpu data for cpu%d at %016lx\n",
                                         cpu, __pa(ptr));
                }
                else {
                        ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
                                                        __pa(MAX_DMA_ADDRESS));
                        if (ptr)
-                                printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
+                                printk(KERN_DEBUG
-                                         cpu, node, __pa(ptr));
+                                        "per cpu data for cpu%d on node%d "
+                                        "at %016lx\n",
+                                        cpu, node, __pa(ptr));
                }
 #endif
                per_cpu_offset(cpu) = ptr - __per_cpu_start;
                memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
        }
-        printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
-                NR_CPUS, nr_cpu_ids, nr_node_ids);
        /* Setup percpu data maps */
        setup_per_cpu_maps();
@@ -282,7 +287,7 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
        else
                cpu_clear(cpu, *mask);
-        cpulist_scnprintf(buf, sizeof(buf), *mask);
+        cpulist_scnprintf(buf, sizeof(buf), mask);
        printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
                enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
 }
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 7e558db362c1..beea2649a240 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -118,22 +118,22 @@ static void native_smp_send_reschedule(int cpu)
                WARN_ON(1);
                return;
        }
-        send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+        send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
 }
 void native_send_call_func_single_ipi(int cpu)
 {
-        send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
+        send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
 }
-void native_send_call_func_ipi(cpumask_t mask)
+void native_send_call_func_ipi(const struct cpumask *mask)
 {
        cpumask_t allbutself;
        allbutself = cpu_online_map;
        cpu_clear(smp_processor_id(), allbutself);
-        if (cpus_equal(mask, allbutself) &&
+        if (cpus_equal(*mask, allbutself) &&
            cpus_equal(cpu_online_map, cpu_callout_map))
                send_IPI_allbutself(CALL_FUNCTION_VECTOR);
        else
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f8500c969442..31869bf5fabd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -102,14 +102,8 @@ EXPORT_SYMBOL(smp_num_siblings);
 /* Last level cache ID of each logical CPU */
 DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
-/* bitmap of online cpus */
-cpumask_t cpu_online_map __read_mostly;
-EXPORT_SYMBOL(cpu_online_map);
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
@@ -1260,6 +1254,15 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
        check_nmi_watchdog();
 }
+static int __initdata setup_possible_cpus = -1;
+static int __init _setup_possible_cpus(char *str)
+{
+        get_option(&str, &setup_possible_cpus);
+        return 0;
+}
+early_param("possible_cpus", _setup_possible_cpus);
 /*
 * cpu_possible_map should be static, it cannot change as cpu's
 * are onlined, or offlined. The reason is per-cpu data-structures
@@ -1272,7 +1275,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 *
 * Three ways to find out the number of additional hotplug CPUs:
 * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- * - The user can overwrite it with additional_cpus=NUM
+ * - The user can override it with possible_cpus=NUM
 * - Otherwise don't reserve additional CPUs.
 * We do this because additional CPUs waste a lot of memory.
 * -AK
@@ -1285,9 +1288,17 @@ __init void prefill_possible_map(void)
        if (!num_processors)
                num_processors = 1;
-        possible = num_processors + disabled_cpus;
+        if (setup_possible_cpus == -1)
-        if (possible > NR_CPUS)
+                possible = num_processors + disabled_cpus;
-                possible = NR_CPUS;
+        else
+                possible = setup_possible_cpus;
+        if (possible > CONFIG_NR_CPUS) {
+                printk(KERN_WARNING
+                        "%d Processors exceeds NR_CPUS limit of %d\n",
+                        possible, CONFIG_NR_CPUS);
+                possible = CONFIG_NR_CPUS;
+        }
        printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
                possible, max_t(int, possible - num_processors, 0));
@@ -1352,7 +1363,7 @@ void cpu_disable_common(void)
        lock_vector_lock();
        remove_cpu_from_maps(cpu);
        unlock_vector_lock();
-        fixup_irqs(cpu_online_map);
+        fixup_irqs();
 }
 int native_cpu_disable(void)
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index 8da059f949be..ce5054642247 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -163,7 +163,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
         * We have to send the IPI only to
         * CPUs affected.
         */
-        send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+        send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
        while (!cpus_empty(flush_cpumask))
                /* nothing. lockup detection does not belong here */
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index 29887d7081a9..f8be6f1d2e48 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -191,7 +191,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
         * We have to send the IPI only to
         * CPUs affected.
         */
-        send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
+        send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
        while (!cpus_empty(f->flush_cpumask))
                cpu_relax();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 961e26a69d55..ce6650eb64e9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -72,9 +72,6 @@
 #include "cpu/mcheck/mce.h"
-DECLARE_BITMAP(used_vectors, NR_VECTORS);
-EXPORT_SYMBOL_GPL(used_vectors);
 asmlinkage int system_call(void);
 /* Do we ignore FPU interrupts ? */
@@ -89,6 +86,9 @@ gate_desc idt_table[256]
        __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
 #endif
+DECLARE_BITMAP(used_vectors, NR_VECTORS);
+EXPORT_SYMBOL_GPL(used_vectors);
 static int ignore_nmis;
 static inline void conditional_sti(struct pt_regs *regs)
@@ -946,9 +946,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 void __init trap_init(void)
 {
-#ifdef CONFIG_X86_32
        int i;
-#endif
 #ifdef CONFIG_EISA
        void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1005,11 +1003,15 @@ void __init trap_init(void)
        }
        set_system_trap_gate(SYSCALL_VECTOR, &system_call);
+#endif
        /* Reserve all the builtin and the syscall vector: */
        for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
                set_bit(i, used_vectors);
+#ifdef CONFIG_X86_64
+        set_bit(IA32_SYSCALL_VECTOR, used_vectors);
+#else
        set_bit(SYSCALL_VECTOR, used_vectors);
 #endif
        /*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 254ee07f8635..c4c1f9e09402 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
        /* Upper bound is clockevent's use of ulong for cycle deltas. */
        evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
        evt->min_delta_ns = clockevent_delta2ns(1, evt);
-        evt->cpumask = cpumask_of_cpu(cpu);
+        evt->cpumask = cpumask_of(cpu);
        printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
               evt->name, evt->mult, evt->shift);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 59ebd37ad79e..e665d1c623ca 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -603,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm)
 static void __inject_pit_timer_intr(struct kvm *kvm)
 {
+        struct kvm_vcpu *vcpu;
+        int i;
        mutex_lock(&kvm->lock);
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
        mutex_unlock(&kvm->lock);
+        /*
+         * Provides NMI watchdog support via Virtual Wire mode.
+         * The route is: PIT -> PIC -> LVT0 in NMI mode.
+         *
+         * Note: Our Virtual Wire implementation is simplified, only
+         * propagating PIT interrupts to all VCPUs when they have set
+         * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+         * VCPU0, and only if its LVT0 is in EXTINT mode.
+         */
+        if (kvm->arch.vapics_in_nmi_mode > 0)
+                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+                        vcpu = kvm->vcpus[i];
+                        if (vcpu)
+                                kvm_apic_nmi_wd_deliver(vcpu);
+                }
 }
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 17e41e165f1a..179dcb0103fd 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,10 +26,40 @@
 *   Port from Qemu.
 */
 #include <linux/mm.h>
+#include <linux/bitops.h>
 #include "irq.h"
 #include <linux/kvm_host.h>
+static void pic_lock(struct kvm_pic *s)
+{
+        spin_lock(&s->lock);
+}
+static void pic_unlock(struct kvm_pic *s)
+{
+        struct kvm *kvm = s->kvm;
+        unsigned acks = s->pending_acks;
+        bool wakeup = s->wakeup_needed;
+        struct kvm_vcpu *vcpu;
+        s->pending_acks = 0;
+        s->wakeup_needed = false;
+        spin_unlock(&s->lock);
+        while (acks) {
+                kvm_notify_acked_irq(kvm, __ffs(acks));
+                acks &= acks - 1;
+        }
+        if (wakeup) {
+                vcpu = s->kvm->vcpus[0];
+                if (vcpu)
+                        kvm_vcpu_kick(vcpu);
+        }
+}
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
        s->isr &= ~(1 << irq);
@@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s)
 void kvm_pic_update_irq(struct kvm_pic *s)
 {
+        pic_lock(s);
        pic_update_irq(s);
+        pic_unlock(s);
 }
 void kvm_pic_set_irq(void *opaque, int irq, int level)
 {
        struct kvm_pic *s = opaque;
+        pic_lock(s);
        if (irq >= 0 && irq < PIC_NUM_PINS) {
                pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
                pic_update_irq(s);
        }
+        pic_unlock(s);
 }
 /*
@@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
        int irq, irq2, intno;
        struct kvm_pic *s = pic_irqchip(kvm);
+        pic_lock(s);
        irq = pic_get_irq(&s->pics[0]);
        if (irq >= 0) {
                pic_intack(&s->pics[0], irq);
@@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
                intno = s->pics[0].irq_base + irq;
        }
        pic_update_irq(s);
+        pic_unlock(s);
        kvm_notify_acked_irq(kvm, irq);
        return intno;
@@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-        int irq, irqbase;
+        int irq, irqbase, n;
        struct kvm *kvm = s->pics_state->irq_request_opaque;
        struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
@@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
        for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
                if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
-                        if (s->irr & (1 << irq) || s->isr & (1 << irq))
+                        if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
-                                kvm_notify_acked_irq(kvm, irq+irqbase);
+                                n = irq + irqbase;
+                                s->pics_state->pending_acks |= 1 << n;
+                        }
        }
        s->last_irr = 0;
        s->irr = 0;
@@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this,
                        printk(KERN_ERR "PIC: non byte write\n");
                return;
        }
+        pic_lock(s);
        switch (addr) {
        case 0x20:
        case 0x21:
@@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this,
                elcr_ioport_write(&s->pics[addr & 1], addr, data);
                break;
        }
+        pic_unlock(s);
 }
 static void picdev_read(struct kvm_io_device *this,
@@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this,
                        printk(KERN_ERR "PIC: non byte read\n");
                return;
        }
+        pic_lock(s);
        switch (addr) {
        case 0x20:
        case 0x21:
@@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this,
                break;
        }
        *(unsigned char *)val = data;
+        pic_unlock(s);
 }
 /*
@@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level)
        s->output = level;
        if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
                s->pics[0].isr_ack &= ~(1 << irq);
-                kvm_vcpu_kick(vcpu);
+                s->wakeup_needed = true;
        }
 }
@@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
        s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
        if (!s)
                return NULL;
+        spin_lock_init(&s->lock);
+        s->kvm = kvm;
        s->pics[0].elcr_mask = 0xf8;
        s->pics[1].elcr_mask = 0xde;
        s->irq_request = pic_irq_request;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index f17c8f5bbf31..2bf32a03ceec 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -25,6 +25,7 @@
 #include <linux/mm_types.h>
 #include <linux/hrtimer.h>
 #include <linux/kvm_host.h>
+#include <linux/spinlock.h>
 #include "iodev.h"
 #include "ioapic.h"
@@ -59,6 +60,10 @@ struct kvm_kpic_state {
 };
 struct kvm_pic {
+        spinlock_t lock;
+        bool wakeup_needed;
+        unsigned pending_acks;
+        struct kvm *kvm;
        struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
        irq_request_func *irq_request;
        void *irq_request_opaque;
@@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index 65ef0fc2c036..8e5ee99551f6 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -7,7 +7,7 @@
 #include <linux/kvm_host.h>
 #include <asm/msr.h>
-#include "svm.h"
+#include <asm/svm.h>
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0fc3cab48943..afac68c0815c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic)
        return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
 }
+static inline int apic_lvt_nmi_mode(u32 lvt_val)
+{
+        return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
+}
 static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
        LVT_MASK | APIC_LVT_TIMER_PERIODIC,     /* LVTT */
        LVT_MASK | APIC_MODE_MASK,      /* LVTTHMR */
@@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
        case APIC_DM_NMI:
                kvm_inject_nmi(vcpu);
+                kvm_vcpu_kick(vcpu);
                break;
        case APIC_DM_INIT:
@@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                }
                break;
+        case APIC_DM_EXTINT:
+                /*
+                 * Should only be called by kvm_apic_local_deliver() with LVT0,
+                 * before NMI watchdog was enabled. Already handled by
+                 * kvm_apic_accept_pic_intr().
+                 */
+                break;
        default:
                printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
                       delivery_mode);
@@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic)
                                        apic->timer.period)));
 }
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+        int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+        if (apic_lvt_nmi_mode(lvt0_val)) {
+                if (!nmi_wd_enabled) {
+                        apic_debug("Receive NMI setting on APIC_LVT0 "
+                                   "for cpu %d\n", apic->vcpu->vcpu_id);
+                        apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
+                }
+        } else if (nmi_wd_enabled)
+                apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
+}
 static void apic_mmio_write(struct kvm_io_device *this,
                            gpa_t address, int len, const void *data)
 {
@@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this,
                apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
                break;
+        case APIC_LVT0:
+                apic_manage_nmi_watchdog(apic, val);
        case APIC_LVTT:
        case APIC_LVTTHMR:
        case APIC_LVTPC:
-        case APIC_LVT0:
        case APIC_LVT1:
        case APIC_LVTERR:
                /* TODO: Check vector */
@@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
        return 0;
 }
-static int __inject_apic_timer_irq(struct kvm_lapic *apic)
+static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+{
+        u32 reg = apic_get_reg(apic, lvt_type);
+        int vector, mode, trig_mode;
+        if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+                vector = reg & APIC_VECTOR_MASK;
+                mode = reg & APIC_MODE_MASK;
+                trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
+                return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
+        }
+        return 0;
+}
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
 {
-        int vector;
+        struct kvm_lapic *apic = vcpu->arch.apic;
-        vector = apic_lvt_vector(apic, APIC_LVTT);
+        if (apic)
-        return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
+                kvm_apic_local_deliver(apic, APIC_LVT0);
 }
 static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
@@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
-        if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
+        if (apic && atomic_read(&apic->timer.pending) > 0) {
-                atomic_read(&apic->timer.pending) > 0) {
+                if (kvm_apic_local_deliver(apic, APIC_LVTT))
-                if (__inject_apic_timer_irq(apic))
                        atomic_dec(&apic->timer.pending);
        }
 }
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 410ddbc1aa2e..83f11c7474a1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -17,7 +17,6 @@
 *
 */
-#include "vmx.h"
 #include "mmu.h"
 #include <linux/kvm_host.h>
@@ -33,6 +32,7 @@
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
+#include <asm/vmx.h>
 /*
 * When setting this variable to true it enables Two-Dimensional-Paging
@@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mt_mask;
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 {
@@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
 EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-                u64 dirty_mask, u64 nx_mask, u64 x_mask)
+                u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
 {
        shadow_user_mask = user_mask;
        shadow_accessed_mask = accessed_mask;
        shadow_dirty_mask = dirty_mask;
        shadow_nx_mask = nx_mask;
        shadow_x_mask = x_mask;
+        shadow_mt_mask = mt_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 {
        int *write_count;
-        write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+        gfn = unalias_gfn(kvm, gfn);
+        write_count = slot_largepage_idx(gfn,
+                                         gfn_to_memslot_unaliased(kvm, gfn));
        *write_count += 1;
 }
@@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 {
        int *write_count;
-        write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+        gfn = unalias_gfn(kvm, gfn);
+        write_count = slot_largepage_idx(gfn,
+                                         gfn_to_memslot_unaliased(kvm, gfn));
        *write_count -= 1;
        WARN_ON(*write_count < 0);
 }
 static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
 {
-        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+        struct kvm_memory_slot *slot;
        int *largepage_idx;
+        gfn = unalias_gfn(kvm, gfn);
+        slot = gfn_to_memslot_unaliased(kvm, gfn);
        if (slot) {
                largepage_idx = slot_largepage_idx(gfn, slot);
                return *largepage_idx;
@@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
        return NULL;
 }
-static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
        unsigned long *rmapp;
        u64 *spte;
@@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
                spte = rmap_next(kvm, rmapp, spte);
        }
-        if (write_protected)
+        return write_protected;
-                kvm_flush_remote_tlbs(kvm);
 }
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
        sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
        list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+        INIT_LIST_HEAD(&sp->oos_link);
        ASSERT(is_empty_shadow_page(sp->spt));
-        sp->slot_bitmap = 0;
+        bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
        sp->multimapped = 0;
+        sp->global = 1;
        sp->parent_pte = parent_pte;
        --vcpu->kvm->arch.n_free_mmu_pages;
        return sp;
@@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte)
        struct kvm_mmu_page *sp = page_header(__pa(spte));
        index = spte - sp->spt;
-        __set_bit(index, sp->unsync_child_bitmap);
+        if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
-        sp->unsync_children = 1;
+                sp->unsync_children++;
+        WARN_ON(!sp->unsync_children);
 }
 static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
@@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
 static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-        sp->unsync_children = 1;
        kvm_mmu_update_parents_unsync(sp);
        return 1;
 }
@@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
 }
+#define KVM_PAGE_ARRAY_NR 16
+struct kvm_mmu_pages {
+        struct mmu_page_and_offset {
+                struct kvm_mmu_page *sp;
+                unsigned int idx;
+        } page[KVM_PAGE_ARRAY_NR];
+        unsigned int nr;
+};
 #define for_each_unsync_children(bitmap, idx)           \
        for (idx = find_first_bit(bitmap, 512);         \
             idx < 512;                                 \
             idx = find_next_bit(bitmap, 512, idx+1))
-static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
-                           struct kvm_unsync_walk *walker)
+                   int idx)
 {
-        int i, ret;
+        int i;
-        if (!sp->unsync_children)
+        if (sp->unsync)
-                return 0;
+                for (i=0; i < pvec->nr; i++)
+                        if (pvec->page[i].sp == sp)
+                                return 0;
+        pvec->page[pvec->nr].sp = sp;
+        pvec->page[pvec->nr].idx = idx;
+        pvec->nr++;
+        return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+                           struct kvm_mmu_pages *pvec)
+{
+        int i, ret, nr_unsync_leaf = 0;
        for_each_unsync_children(sp->unsync_child_bitmap, i) {
                u64 ent = sp->spt[i];
-                if (is_shadow_present_pte(ent)) {
+                if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
                        struct kvm_mmu_page *child;
                        child = page_header(ent & PT64_BASE_ADDR_MASK);
                        if (child->unsync_children) {
-                                ret = mmu_unsync_walk(child, walker);
+                                if (mmu_pages_add(pvec, child, i))
-                                if (ret)
+                                        return -ENOSPC;
+                                ret = __mmu_unsync_walk(child, pvec);
+                                if (!ret)
+                                        __clear_bit(i, sp->unsync_child_bitmap);
+                                else if (ret > 0)
+                                        nr_unsync_leaf += ret;
+                                else
                                        return ret;
-                                __clear_bit(i, sp->unsync_child_bitmap);
                        }
                        if (child->unsync) {
-                                ret = walker->entry(child, walker);
+                                nr_unsync_leaf++;
-                                __clear_bit(i, sp->unsync_child_bitmap);
+                                if (mmu_pages_add(pvec, child, i))
-                                if (ret)
+                                        return -ENOSPC;
-                                        return ret;
                        }
                }
        }
@@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
        if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
                sp->unsync_children = 0;
-        return 0;
+        return nr_unsync_leaf;
+}
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+                           struct kvm_mmu_pages *pvec)
+{
+        if (!sp->unsync_children)
+                return 0;
+        mmu_pages_add(pvec, sp, 0);
+        return __mmu_unsync_walk(sp, pvec);
 }
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
@@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
        return NULL;
 }
+static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+        list_del(&sp->oos_link);
+        --kvm->stat.mmu_unsync_global;
+}
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        WARN_ON(!sp->unsync);
        sp->unsync = 0;
+        if (sp->global)
+                kvm_unlink_unsync_global(kvm, sp);
        --kvm->stat.mmu_unsync;
 }
@@ -1037,7 +1092,8 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                return 1;
        }
-        rmap_write_protect(vcpu->kvm, sp->gfn);
+        if (rmap_write_protect(vcpu->kvm, sp->gfn))
+                kvm_flush_remote_tlbs(vcpu->kvm);
        kvm_unlink_unsync_page(vcpu->kvm, sp);
        if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
                kvm_mmu_zap_page(vcpu->kvm, sp);
@@ -1048,30 +1104,89 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
        return 0;
 }
-struct sync_walker {
+struct mmu_page_path {
-        struct kvm_vcpu *vcpu;
+        struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
-        struct kvm_unsync_walk walker;
+        unsigned int idx[PT64_ROOT_LEVEL-1];
 };
-static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+#define for_each_sp(pvec, sp, parents, i)                       \
+                for (i = mmu_pages_next(&pvec, &parents, -1),   \
+                        sp = pvec.page[i].sp;                   \
+                        i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
+                        i = mmu_pages_next(&pvec, &parents, i))
+int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
+                   int i)
 {
-        struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
+        int n;
-                                                     walker);
-        struct kvm_vcpu *vcpu = sync_walk->vcpu;
-        kvm_sync_page(vcpu, sp);
+        for (n = i+1; n < pvec->nr; n++) {
-        return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+                struct kvm_mmu_page *sp = pvec->page[n].sp;
+                if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+                        parents->idx[0] = pvec->page[n].idx;
+                        return n;
+                }
+                parents->parent[sp->role.level-2] = sp;
+                parents->idx[sp->role.level-1] = pvec->page[n].idx;
+        }
+        return n;
 }
-static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+void mmu_pages_clear_parents(struct mmu_page_path *parents)
 {
-        struct sync_walker walker = {
+        struct kvm_mmu_page *sp;
-                .walker = { .entry = mmu_sync_fn, },
+        unsigned int level = 0;
-                .vcpu = vcpu,
-        };
+        do {
+                unsigned int idx = parents->idx[level];
+                sp = parents->parent[level];
+                if (!sp)
+                        return;
+                --sp->unsync_children;
+                WARN_ON((int)sp->unsync_children < 0);
+                __clear_bit(idx, sp->unsync_child_bitmap);
+                level++;
+        } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
+}
+static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
+                               struct mmu_page_path *parents,
+                               struct kvm_mmu_pages *pvec)
+{
+        parents->parent[parent->role.level-1] = NULL;
+        pvec->nr = 0;
+}
+static void mmu_sync_children(struct kvm_vcpu *vcpu,
+                              struct kvm_mmu_page *parent)
+{
+        int i;
+        struct kvm_mmu_page *sp;
+        struct mmu_page_path parents;
+        struct kvm_mmu_pages pages;
+        kvm_mmu_pages_init(parent, &parents, &pages);
+        while (mmu_unsync_walk(parent, &pages)) {
+                int protected = 0;
-        while (mmu_unsync_walk(sp, &walker.walker))
+                for_each_sp(pages, sp, parents, i)
+                        protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
+                if (protected)
+                        kvm_flush_remote_tlbs(vcpu->kvm);
+                for_each_sp(pages, sp, parents, i) {
+                        kvm_sync_page(vcpu, sp);
+                        mmu_pages_clear_parents(&parents);
+                }
                cond_resched_lock(&vcpu->kvm->mmu_lock);
+                kvm_mmu_pages_init(parent, &parents, &pages);
+        }
 }
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        sp->role = role;
        hlist_add_head(&sp->hash_link, bucket);
        if (!metaphysical) {
-                rmap_write_protect(vcpu->kvm, gfn);
+                if (rmap_write_protect(vcpu->kvm, gfn))
+                        kvm_flush_remote_tlbs(vcpu->kvm);
                account_shadowed(vcpu->kvm, gfn);
        }
        if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker,
        if (level == PT32E_ROOT_LEVEL) {
                shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
                shadow_addr &= PT64_BASE_ADDR_MASK;
+                if (!shadow_addr)
+                        return 1;
                --level;
        }
@@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
        }
 }
-struct zap_walker {
+static int mmu_zap_unsync_children(struct kvm *kvm,
-        struct kvm_unsync_walk walker;
+                                   struct kvm_mmu_page *parent)
-        struct kvm *kvm;
-        int zapped;
-};
-static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
 {
-        struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
+        int i, zapped = 0;
-                                                     walker);
+        struct mmu_page_path parents;
-        kvm_mmu_zap_page(zap_walk->kvm, sp);
+        struct kvm_mmu_pages pages;
-        zap_walk->zapped = 1;
-        return 0;
-}
-static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
+        if (parent->role.level == PT_PAGE_TABLE_LEVEL)
-{
-        struct zap_walker walker = {
-                .walker = { .entry = mmu_zap_fn, },
-                .kvm = kvm,
-                .zapped = 0,
-        };
-        if (sp->role.level == PT_PAGE_TABLE_LEVEL)
                return 0;
-        mmu_unsync_walk(sp, &walker.walker);
-        return walker.zapped;
+        kvm_mmu_pages_init(parent, &parents, &pages);
+        while (mmu_unsync_walk(parent, &pages)) {
+                struct kvm_mmu_page *sp;
+                for_each_sp(pages, sp, parents, i) {
+                        kvm_mmu_zap_page(kvm, sp);
+                        mmu_pages_clear_parents(&parents);
+                }
+                zapped += pages.nr;
+                kvm_mmu_pages_init(parent, &parents, &pages);
+        }
+        return zapped;
 }
 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
        int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
        struct kvm_mmu_page *sp = page_header(__pa(pte));
-        __set_bit(slot, &sp->slot_bitmap);
+        __set_bit(slot, sp->slot_bitmap);
 }
 static void mmu_convert_notrap(struct kvm_mmu_page *sp)
@@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
        return page;
 }
+/*
+ * The function is based on mtrr_type_lookup() in
+ * arch/x86/kernel/cpu/mtrr/generic.c
+ */
+static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
+                         u64 start, u64 end)
+{
+        int i;
+        u64 base, mask;
+        u8 prev_match, curr_match;
+        int num_var_ranges = KVM_NR_VAR_MTRR;
+        if (!mtrr_state->enabled)
+                return 0xFF;
+        /* Make end inclusive instead of exclusive */
+        end--;
+        /* Look in fixed ranges. Just return the type as per start */
+        if (mtrr_state->have_fixed && (start < 0x100000)) {
+                int idx;
+                if (start < 0x80000) {
+                        idx = 0;
+                        idx += (start >> 16);
+                        return mtrr_state->fixed_ranges[idx];
+                } else if (start < 0xC0000) {
+                        idx = 1 * 8;
+                        idx += ((start - 0x80000) >> 14);
+                        return mtrr_state->fixed_ranges[idx];
+                } else if (start < 0x1000000) {
+                        idx = 3 * 8;
+                        idx += ((start - 0xC0000) >> 12);
+                        return mtrr_state->fixed_ranges[idx];
+                }
+        }
+        /*
+         * Look in variable ranges
+         * Look for multiple ranges matching this address and pick type
+         * as per MTRR precedence
+         */
+        if (!(mtrr_state->enabled & 2))
+                return mtrr_state->def_type;
+        prev_match = 0xFF;
+        for (i = 0; i < num_var_ranges; ++i) {
+                unsigned short start_state, end_state;
+                if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
+                        continue;
+                base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
+                       (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
+                mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
+                       (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
+                start_state = ((start & mask) == (base & mask));
+                end_state = ((end & mask) == (base & mask));
+                if (start_state != end_state)
+                        return 0xFE;
+                if ((start & mask) != (base & mask))
+                        continue;
+                curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
+                if (prev_match == 0xFF) {
+                        prev_match = curr_match;
+                        continue;
+                }
+                if (prev_match == MTRR_TYPE_UNCACHABLE ||
+                    curr_match == MTRR_TYPE_UNCACHABLE)
+                        return MTRR_TYPE_UNCACHABLE;
+                if ((prev_match == MTRR_TYPE_WRBACK &&
+                     curr_match == MTRR_TYPE_WRTHROUGH) ||
+                    (prev_match == MTRR_TYPE_WRTHROUGH &&
+                     curr_match == MTRR_TYPE_WRBACK)) {
+                        prev_match = MTRR_TYPE_WRTHROUGH;
+                        curr_match = MTRR_TYPE_WRTHROUGH;
+                }
+                if (prev_match != curr_match)
+                        return MTRR_TYPE_UNCACHABLE;
+        }
+        if (prev_match != 0xFF)
+                return prev_match;
+        return mtrr_state->def_type;
+}
+static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+        u8 mtrr;
+        mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
+                             (gfn << PAGE_SHIFT) + PAGE_SIZE);
+        if (mtrr == 0xfe || mtrr == 0xff)
+                mtrr = MTRR_TYPE_WRBACK;
+        return mtrr;
+}
 static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
        unsigned index;
@@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                if (s->role.word != sp->role.word)
                        return 1;
        }
-        kvm_mmu_mark_parents_unsync(vcpu, sp);
        ++vcpu->kvm->stat.mmu_unsync;
        sp->unsync = 1;
+        if (sp->global) {
+                list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
+                ++vcpu->kvm->stat.mmu_unsync_global;
+        } else
+                kvm_mmu_mark_parents_unsync(vcpu, sp);
        mmu_convert_notrap(sp);
        return 0;
 }
@@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                    unsigned pte_access, int user_fault,
                    int write_fault, int dirty, int largepage,
-                    gfn_t gfn, pfn_t pfn, bool speculative,
+                    int global, gfn_t gfn, pfn_t pfn, bool speculative,
                    bool can_unsync)
 {
        u64 spte;
        int ret = 0;
+        u64 mt_mask = shadow_mt_mask;
+        struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
+        if (!(vcpu->arch.cr4 & X86_CR4_PGE))
+                global = 0;
+        if (!global && sp->global) {
+                sp->global = 0;
+                if (sp->unsync) {
+                        kvm_unlink_unsync_global(vcpu->kvm, sp);
+                        kvm_mmu_mark_parents_unsync(vcpu, sp);
+                }
+        }
        /*
         * We don't set the accessed bit, since we sometimes want to see
         * whether the guest actually used the pte (in order to detect
@@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                spte |= shadow_user_mask;
        if (largepage)
                spte |= PT_PAGE_SIZE_MASK;
+        if (mt_mask) {
+                mt_mask = get_memory_type(vcpu, gfn) <<
+                          kvm_x86_ops->get_mt_mask_shift();
+                spte |= mt_mask;
+        }
        spte |= (u64)pfn << PAGE_SHIFT;
@@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                spte |= PT_WRITABLE_MASK;
+                /*
+                 * Optimization: for pte sync, if spte was writable the hash
+                 * lookup is unnecessary (and expensive). Write protection
+                 * is the responsibility of mmu_get_page / kvm_sync_page.
+                 * Same reasoning can be applied to dirty page accounting.
+                 */
+                if (!can_unsync && is_writeble_pte(*shadow_pte))
+                        goto set_pte;
                if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
                        pgprintk("%s: found shadow page for %lx, marking ro\n",
                                 __func__, gfn);
@@ -1495,8 +1746,8 @@ set_pte:
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                         unsigned pt_access, unsigned pte_access,
                         int user_fault, int write_fault, int dirty,
-                         int *ptwrite, int largepage, gfn_t gfn,
+                         int *ptwrite, int largepage, int global,
-                         pfn_t pfn, bool speculative)
+                         gfn_t gfn, pfn_t pfn, bool speculative)
 {
        int was_rmapped = 0;
        int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                }
        }
        if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-                      dirty, largepage, gfn, pfn, speculative, true)) {
+                      dirty, largepage, global, gfn, pfn, speculative, true)) {
                if (write_fault)
                        *ptwrite = 1;
                kvm_x86_ops->tlb_flush(vcpu);
@@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
            || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
                mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
                             0, walk->write, 1, &walk->pt_write,
-                             walk->largepage, gfn, walk->pfn, false);
+                             walk->largepage, 0, gfn, walk->pfn, false);
                ++vcpu->stat.pf_fixed;
                return 1;
        }
@@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
        }
 }
+static void mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+        struct kvm *kvm = vcpu->kvm;
+        struct kvm_mmu_page *sp, *n;
+        list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
+                kvm_sync_page(vcpu, sp);
+}
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
        spin_lock(&vcpu->kvm->mmu_lock);
@@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        spin_unlock(&vcpu->kvm->mmu_lock);
 }
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+        spin_lock(&vcpu->kvm->mmu_lock);
+        mmu_sync_global(vcpu);
+        spin_unlock(&vcpu->kvm->mmu_lock);
+}
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
        return vaddr;
@@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                       const u8 *new, int bytes)
+                       const u8 *new, int bytes,
+                       bool guest_initiated)
 {
        gfn_t gfn = gpa >> PAGE_SHIFT;
        struct kvm_mmu_page *sp;
@@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        kvm_mmu_free_some_pages(vcpu);
        ++vcpu->kvm->stat.mmu_pte_write;
        kvm_mmu_audit(vcpu, "pre pte write");
-        if (gfn == vcpu->arch.last_pt_write_gfn
+        if (guest_initiated) {
-            && !last_updated_pte_accessed(vcpu)) {
+                if (gfn == vcpu->arch.last_pt_write_gfn
-                ++vcpu->arch.last_pt_write_count;
+                    && !last_updated_pte_accessed(vcpu)) {
-                if (vcpu->arch.last_pt_write_count >= 3)
+                        ++vcpu->arch.last_pt_write_count;
-                        flooded = 1;
+                        if (vcpu->arch.last_pt_write_count >= 3)
-        } else {
+                                flooded = 1;
-                vcpu->arch.last_pt_write_gfn = gfn;
+                } else {
-                vcpu->arch.last_pt_write_count = 1;
+                        vcpu->arch.last_pt_write_gfn = gfn;
-                vcpu->arch.last_pte_updated = NULL;
+                        vcpu->arch.last_pt_write_count = 1;
+                        vcpu->arch.last_pte_updated = NULL;
+                }
        }
        index = kvm_page_table_hashfn(gfn);
        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
@@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
-        spin_lock(&vcpu->kvm->mmu_lock);
        vcpu->arch.mmu.invlpg(vcpu, gva);
-        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_mmu_flush_tlb(vcpu);
        ++vcpu->stat.invlpg;
 }
@@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
                int i;
                u64 *pt;
-                if (!test_bit(slot, &sp->slot_bitmap))
+                if (!test_bit(slot, sp->slot_bitmap))
                        continue;
                pt = sp->spt;
@@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
                if (sp->role.metaphysical)
                        continue;
-                slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
                gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+                slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
                rmapp = &slot->rmap[gfn - slot->base_gfn];
                if (*rmapp)
                        printk(KERN_ERR "%s: (%s) shadow page has writable"
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 84eee43bbe74..9fd78b6e17ad 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -82,6 +82,7 @@ struct shadow_walker {
        int *ptwrite;
        pfn_t pfn;
        u64 *sptep;
+        gpa_t pte_gpa;
 };
 static gfn_t gpte_to_gfn(pt_element_t gpte)
@@ -222,7 +223,7 @@ walk:
                if (ret)
                        goto walk;
                pte |= PT_DIRTY_MASK;
-                kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+                kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
                walker->ptes[walker->level - 1] = pte;
        }
@@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
                return;
        kvm_get_pfn(pfn);
        mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-                     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+                     gpte & PT_DIRTY_MASK, NULL, largepage,
+                     gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
                     pfn, true);
 }
@@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
                mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
                             sw->user_fault, sw->write_fault,
                             gw->ptes[gw->level-1] & PT_DIRTY_MASK,
-                             sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
+                             sw->ptwrite, sw->largepage,
-                             false);
+                             gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
+                             gw->gfn, sw->pfn, false);
                sw->sptep = sptep;
                return 1;
        }
@@ -466,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
                                      struct kvm_vcpu *vcpu, u64 addr,
                                      u64 *sptep, int level)
 {
+        struct shadow_walker *sw =
+                container_of(_sw, struct shadow_walker, walker);
-        if (level == PT_PAGE_TABLE_LEVEL) {
+        /* FIXME: properly handle invlpg on large guest pages */
-                if (is_shadow_present_pte(*sptep))
+        if (level == PT_PAGE_TABLE_LEVEL ||
+            ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
+                struct kvm_mmu_page *sp = page_header(__pa(sptep));
+                sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
+                sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+                if (is_shadow_present_pte(*sptep)) {
                        rmap_remove(vcpu->kvm, sptep);
+                        if (is_large_pte(*sptep))
+                                --vcpu->kvm->stat.lpages;
+                }
                set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
                return 1;
        }
@@ -480,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
+        pt_element_t gpte;
        struct shadow_walker walker = {
                .walker = { .entry = FNAME(shadow_invlpg_entry), },
+                .pte_gpa = -1,
        };
+        spin_lock(&vcpu->kvm->mmu_lock);
        walk_shadow(&walker.walker, vcpu, gva);
+        spin_unlock(&vcpu->kvm->mmu_lock);
+        if (walker.pte_gpa == -1)
+                return;
+        if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
+                                  sizeof(pt_element_t)))
+                return;
+        if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
+                if (mmu_topup_memory_caches(vcpu))
+                        return;
+                kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
+                                  sizeof(pt_element_t), 0);
+        }
 }
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -580,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                nr_present++;
                pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
                set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-                         is_dirty_pte(gpte), 0, gfn,
+                         is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
                         spte_to_pfn(sp->spt[i]), true, false);
        }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9c4ce657d963..1452851ae258 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,8 @@
 #include <asm/desc.h>
+#include <asm/virtext.h>
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 MODULE_AUTHOR("Qumranet");
@@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 static int has_svm(void)
 {
-        uint32_t eax, ebx, ecx, edx;
+        const char *msg;
-        if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
-                printk(KERN_INFO "has_svm: not amd\n");
-                return 0;
-        }
-        cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+        if (!cpu_has_svm(&msg)) {
-        if (eax < SVM_CPUID_FUNC) {
+                printk(KERN_INFO "has_svm: %s\n", msg);
-                printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
                return 0;
        }
-        cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
-        if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
-                printk(KERN_DEBUG "has_svm: svm not available\n");
-                return 0;
-        }
        return 1;
 }
 static void svm_hardware_disable(void *garbage)
 {
-        uint64_t efer;
+        cpu_svm_disable();
-        wrmsrl(MSR_VM_HSAVE_PA, 0);
-        rdmsrl(MSR_EFER, efer);
-        wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
 }
 static void svm_hardware_enable(void *garbage)
@@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
        var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
        var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
        var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+        /*
+         * SVM always stores 0 for the 'G' bit in the CS selector in
+         * the VMCB on a VMEXIT. This hurts cross-vendor migration:
+         * Intel's VMENTRY has a check on the 'G' bit.
+         */
+        if (seg == VCPU_SREG_CS)
+                var->g = s->limit > 0xfffff;
+        /*
+         * Work around a bug where the busy flag in the tr selector
+         * isn't exposed
+         */
+        if (seg == VCPU_SREG_TR)
+                var->type |= 0x2;
        var->unusable = !var->present;
 }
@@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        rep = (io_info & SVM_IOIO_REP_MASK) != 0;
        down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
+        skip_emulated_instruction(&svm->vcpu);
        return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
 }
@@ -1912,6 +1916,11 @@ static int get_npt_level(void)
 #endif
 }
+static int svm_get_mt_mask_shift(void)
+{
+        return 0;
+}
 static struct kvm_x86_ops svm_x86_ops = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
@@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .set_tss_addr = svm_set_tss_addr,
        .get_tdp_level = get_npt_level,
+        .get_mt_mask_shift = svm_get_mt_mask_shift,
 };
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a4018b01e1f9..6259d7467648 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -16,7 +16,6 @@
 */
 #include "irq.h"
-#include "vmx.h"
 #include "mmu.h"
 #include <linux/kvm_host.h>
@@ -31,6 +30,8 @@
 #include <asm/io.h>
 #include <asm/desc.h>
+#include <asm/vmx.h>
+#include <asm/virtext.h>
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
@@ -90,6 +91,11 @@ struct vcpu_vmx {
        } rmode;
        int vpid;
        bool emulation_required;
+        /* Support for vnmi-less CPUs */
+        int soft_vnmi_blocked;
+        ktime_t entry_time;
+        s64 vnmi_blocked_time;
 };
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -122,7 +128,7 @@ static struct vmcs_config {
        u32 vmentry_ctrl;
 } vmcs_config;
-struct vmx_capability {
+static struct vmx_capability {
        u32 ept;
        u32 vpid;
 } vmx_capability;
@@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
                break;
+        case MSR_IA32_CR_PAT:
+                if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+                        vmcs_write64(GUEST_IA32_PAT, data);
+                        vcpu->arch.pat = data;
+                        break;
+                }
+                /* Otherwise falls through to kvm_set_msr_common */
        default:
                vmx_load_host_state(vmx);
                msr = find_msr_entry(vmx, msr_index);
@@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu)
 static __init int cpu_has_kvm_support(void)
 {
-        unsigned long ecx = cpuid_ecx(1);
+        return cpu_has_vmx();
-        return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
 }
 static __init int vmx_disabled_by_bios(void)
@@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void)
                __vcpu_clear(vmx);
 }
-static void hardware_disable(void *garbage)
+/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
+ * tricks.
+ */
+static void kvm_cpu_vmxoff(void)
 {
-        vmclear_local_vcpus();
        asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
        write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
+static void hardware_disable(void *garbage)
+{
+        vmclear_local_vcpus();
+        kvm_cpu_vmxoff();
+}
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
                                      u32 msr, u32 *result)
 {
@@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 #ifdef CONFIG_X86_64
        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
-        opt = 0;
+        opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0)
                return -EIO;
-        min = opt = 0;
+        min = 0;
+        opt = VM_ENTRY_LOAD_IA32_PAT;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
                                &_vmentry_control) < 0)
                return -EIO;
@@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
 */
 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
-        u32 host_sysenter_cs;
+        u32 host_sysenter_cs, msr_low, msr_high;
        u32 junk;
+        u64 host_pat;
        unsigned long a;
        struct descriptor_table dt;
        int i;
@@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        rdmsrl(MSR_IA32_SYSENTER_EIP, a);
        vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
+        if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+                rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+                host_pat = msr_low | ((u64) msr_high << 32);
+                vmcs_write64(HOST_IA32_PAT, host_pat);
+        }
+        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+                rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+                host_pat = msr_low | ((u64) msr_high << 32);
+                /* Default the guest PAT to the host PAT value */
+                vmcs_write64(GUEST_IA32_PAT, host_pat);
+                /* Keep arch.pat in sync with GUEST_IA32_PAT */
+                vmx->vcpu.arch.pat = host_pat;
+        }
        for (i = 0; i < NR_VMX_MSR; ++i) {
                u32 index = vmx_msr_index[i];
                u32 data_low, data_high;
@@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmx->vcpu.arch.rmode.active = 0;
+        vmx->soft_vnmi_blocked = 0;
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
        kvm_set_cr8(&vmx->vcpu, 0);
        msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -2335,6 +2374,29 @@ out:
        return ret;
 }
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+        u32 cpu_based_vm_exec_control;
+        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+        u32 cpu_based_vm_exec_control;
+        if (!cpu_has_virtual_nmis()) {
+                enable_irq_window(vcpu);
+                return;
+        }
+        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
 static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+        if (!cpu_has_virtual_nmis()) {
+                /*
+                 * Tracking the NMI-blocked state in software is built upon
+                 * finding the next open IRQ window. This, in turn, depends on
+                 * well-behaving guests: They have to keep IRQs disabled at
+                 * least as long as the NMI handler runs. Otherwise we may
+                 * cause NMI nesting, maybe breaking the guest. But as this is
+                 * highly unlikely, we can live with the residual risk.
+                 */
+                vmx->soft_vnmi_blocked = 1;
+                vmx->vnmi_blocked_time = 0;
+        }
+        ++vcpu->stat.nmi_injections;
+        if (vcpu->arch.rmode.active) {
+                vmx->rmode.irq.pending = true;
+                vmx->rmode.irq.vector = NMI_VECTOR;
+                vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                             NMI_VECTOR | INTR_TYPE_SOFT_INTR |
+                             INTR_INFO_VALID_MASK);
+                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+                kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+                return;
+        }
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
+static void vmx_update_window_states(struct kvm_vcpu *vcpu)
+{
+        u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+        vcpu->arch.nmi_window_open =
+                !(guest_intr & (GUEST_INTR_STATE_STI |
+                                GUEST_INTR_STATE_MOV_SS |
+                                GUEST_INTR_STATE_NMI));
+        if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
+                vcpu->arch.nmi_window_open = 0;
+        vcpu->arch.interrupt_window_open =
+                ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+                 !(guest_intr & (GUEST_INTR_STATE_STI |
+                                 GUEST_INTR_STATE_MOV_SS)));
+}
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 {
        int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
        kvm_queue_interrupt(vcpu, irq);
 }
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
                                       struct kvm_run *kvm_run)
 {
-        u32 cpu_based_vm_exec_control;
+        vmx_update_window_states(vcpu);
-        vcpu->arch.interrupt_window_open =
-                ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-                 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
-        if (vcpu->arch.interrupt_window_open &&
+        if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-            vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
+                if (vcpu->arch.interrupt.pending) {
-                kvm_do_inject_irq(vcpu);
+                        enable_nmi_window(vcpu);
+                } else if (vcpu->arch.nmi_window_open) {
+                        vcpu->arch.nmi_pending = false;
+                        vcpu->arch.nmi_injected = true;
+                } else {
+                        enable_nmi_window(vcpu);
+                        return;
+                }
+        }
+        if (vcpu->arch.nmi_injected) {
+                vmx_inject_nmi(vcpu);
+                if (vcpu->arch.nmi_pending)
+                        enable_nmi_window(vcpu);
+                else if (vcpu->arch.irq_summary
+                         || kvm_run->request_interrupt_window)
+                        enable_irq_window(vcpu);
+                return;
+        }
-        if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
+        if (vcpu->arch.interrupt_window_open) {
-                vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+                if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
+                        kvm_do_inject_irq(vcpu);
-        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+                if (vcpu->arch.interrupt.pending)
+                        vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+        }
        if (!vcpu->arch.interrupt_window_open &&
            (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
-                /*
+                enable_irq_window(vcpu);
-                 * Interrupts blocked.  Wait for unblock.
-                 */
-                cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-        else
-                cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 }
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 {
        int ret;
        struct kvm_userspace_memory_region tss_mem = {
-                .slot = 8,
+                .slot = TSS_PRIVATE_MEMSLOT,
                .guest_phys_addr = addr,
                .memory_size = PAGE_SIZE * 3,
                .flags = 0,
@@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
        }
-        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
                return 1;  /* already handled by vmx_vcpu_run() */
        if (is_no_device(intr_info)) {
@@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        rep = (exit_qualification & 32) != 0;
        port = exit_qualification >> 16;
+        skip_emulated_instruction(vcpu);
        return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
 }
@@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
        KVMTRACE_0D(PEND_INTR, vcpu, handler);
+        ++vcpu->stat.irq_window_exits;
        /*
         * If the user space waits to inject interrupts, exit as soon as
@@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
        if (kvm_run->request_interrupt_window &&
            !vcpu->arch.irq_summary) {
                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-                ++vcpu->stat.irq_window_exits;
                return 0;
        }
        return 1;
@@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qualification;
        u16 tss_selector;
        int reason;
@@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        reason = (u32)exit_qualification >> 30;
+        if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
+            (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+            (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
+            == INTR_TYPE_NMI_INTR) {
+                vcpu->arch.nmi_injected = false;
+                if (cpu_has_virtual_nmis())
+                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+                                      GUEST_INTR_STATE_NMI);
+        }
        tss_selector = exit_qualification;
        return kvm_task_switch(vcpu, tss_selector, reason);
@@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
        while (!guest_state_valid(vcpu)) {
                err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
-                switch (err) {
+                if (err == EMULATE_DO_MMIO)
-                        case EMULATE_DONE:
+                        break;
-                                break;
-                        case EMULATE_DO_MMIO:
+                if (err != EMULATE_DONE) {
-                                kvm_report_emulation_failure(vcpu, "mmio");
+                        kvm_report_emulation_failure(vcpu, "emulation failure");
-                                /* TODO: Handle MMIO */
+                        return;
-                                return;
-                        default:
-                                kvm_report_emulation_failure(vcpu, "emulation failure");
-                                return;
                }
                if (signal_pending(current))
@@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
        local_irq_disable();
        preempt_disable();
-        /* Guest state should be valid now, no more emulation should be needed */
+        /* Guest state should be valid now except if we need to
-        vmx->emulation_required = 0;
+         * emulate an MMIO */
+        if (guest_state_valid(vcpu))
+                vmx->emulation_required = 0;
 }
 /*
@@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
                    (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
+        /* If we need to emulate an MMIO from handle_invalid_guest_state
+         * we just return 0 */
+        if (vmx->emulation_required && emulate_invalid_guest_state)
+                return 0;
        /* Access CR3 don't cause VMExit in paging mode, so we need
         * to sync with guest real CR3. */
        if (vm_need_ept() && is_paging(vcpu)) {
@@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
-                        exit_reason != EXIT_REASON_EPT_VIOLATION))
+                        exit_reason != EXIT_REASON_EPT_VIOLATION &&
-                printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
+                        exit_reason != EXIT_REASON_TASK_SWITCH))
-                       "exit reason is 0x%x\n", __func__, exit_reason);
+                printk(KERN_WARNING "%s: unexpected, valid vectoring info "
+                       "(0x%x) and exit reason is 0x%x\n",
+                       __func__, vectoring_info, exit_reason);
+        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+                if (vcpu->arch.interrupt_window_open) {
+                        vmx->soft_vnmi_blocked = 0;
+                        vcpu->arch.nmi_window_open = 1;
+                } else if (vmx->vnmi_blocked_time > 1000000000LL &&
+                           vcpu->arch.nmi_pending) {
+                        /*
+                         * This CPU doesn't support us in finding the end of an
+                         * NMI-blocked window if the guest runs with IRQs
+                         * disabled. So we pull the trigger after 1 s of
+                         * futile waiting, but inform the user about this.
+                         */
+                        printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+                               "state on VCPU %d after 1 s timeout\n",
+                               __func__, vcpu->vcpu_id);
+                        vmx->soft_vnmi_blocked = 0;
+                        vmx->vcpu.arch.nmi_window_open = 1;
+                }
+        }
        if (exit_reason < kvm_vmx_max_exit_handlers
            && kvm_vmx_exit_handlers[exit_reason])
                return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu)
        vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
 }
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
-        u32 cpu_based_vm_exec_control;
-        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
-        u32 cpu_based_vm_exec_control;
-        if (!cpu_has_virtual_nmis())
-                return;
-        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
-        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
-{
-        u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-        return !(guest_intr & (GUEST_INTR_STATE_NMI |
-                               GUEST_INTR_STATE_MOV_SS |
-                               GUEST_INTR_STATE_STI));
-}
-static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
-{
-        u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-        return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
-                               GUEST_INTR_STATE_STI)) &&
-                (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
-}
-static void enable_intr_window(struct kvm_vcpu *vcpu)
-{
-        if (vcpu->arch.nmi_pending)
-                enable_nmi_window(vcpu);
-        else if (kvm_cpu_has_interrupt(vcpu))
-                enable_irq_window(vcpu);
-}
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
                if (unblock_nmi && vector != DF_VECTOR)
                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
                                      GUEST_INTR_STATE_NMI);
-        }
+        } else if (unlikely(vmx->soft_vnmi_blocked))
+                vmx->vnmi_blocked_time +=
+                        ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
        idt_vectoring_info = vmx->idt_vectoring_info;
        idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3147,26 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
        update_tpr_threshold(vcpu);
-        if (cpu_has_virtual_nmis()) {
+        vmx_update_window_states(vcpu);
-                if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-                        if (vcpu->arch.interrupt.pending) {
+        if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-                                enable_nmi_window(vcpu);
+                if (vcpu->arch.interrupt.pending) {
-                        } else if (vmx_nmi_enabled(vcpu)) {
+                        enable_nmi_window(vcpu);
-                                vcpu->arch.nmi_pending = false;
+                } else if (vcpu->arch.nmi_window_open) {
-                                vcpu->arch.nmi_injected = true;
+                        vcpu->arch.nmi_pending = false;
-                        } else {
+                        vcpu->arch.nmi_injected = true;
-                                enable_intr_window(vcpu);
+                } else {
-                                return;
+                        enable_nmi_window(vcpu);
-                        }
-                }
-                if (vcpu->arch.nmi_injected) {
-                        vmx_inject_nmi(vcpu);
-                        enable_intr_window(vcpu);
                        return;
                }
        }
+        if (vcpu->arch.nmi_injected) {
+                vmx_inject_nmi(vcpu);
+                if (vcpu->arch.nmi_pending)
+                        enable_nmi_window(vcpu);
+                else if (kvm_cpu_has_interrupt(vcpu))
+                        enable_irq_window(vcpu);
+                return;
+        }
        if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
-                if (vmx_irq_enabled(vcpu))
+                if (vcpu->arch.interrupt_window_open)
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
                else
                        enable_irq_window(vcpu);
@@ -3174,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
        if (vcpu->arch.interrupt.pending) {
                vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
                kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+                if (kvm_cpu_has_interrupt(vcpu))
+                        enable_irq_window(vcpu);
        }
 }
@@ -3213,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 intr_info;
+        /* Record the guest's net vcpu time for enforced NMI injections. */
+        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+                vmx->entry_time = ktime_get();
        /* Handle invalid guest state instead of entering VMX */
        if (vmx->emulation_required && emulate_invalid_guest_state) {
                handle_invalid_guest_state(vcpu, kvm_run);
@@ -3327,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        if (vmx->rmode.irq.pending)
                fixup_rmode_irq(vmx);
-        vcpu->arch.interrupt_window_open =
+        vmx_update_window_states(vcpu);
-                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-                 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
        asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
        vmx->launched = 1;
@@ -3337,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
        /* We need to handle NMIs before interrupts are enabled */
-        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
+        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
            (intr_info & INTR_INFO_VALID_MASK)) {
                KVMTRACE_0D(NMI, vcpu, handler);
                asm("int $2");
@@ -3455,6 +3571,11 @@ static int get_ept_level(void)
        return VMX_EPT_DEFAULT_GAW + 1;
 }
+static int vmx_get_mt_mask_shift(void)
+{
+        return VMX_EPT_MT_EPTE_SHIFT;
+}
 static struct kvm_x86_ops vmx_x86_ops = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
@@ -3510,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .set_tss_addr = vmx_set_tss_addr,
        .get_tdp_level = get_ept_level,
+        .get_mt_mask_shift = vmx_get_mt_mask_shift,
 };
 static int __init vmx_init(void)
@@ -3566,10 +3688,10 @@ static int __init vmx_init(void)
                bypass_guest_pf = 0;
                kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
                        VMX_EPT_WRITABLE_MASK |
-                        VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT |
                        VMX_EPT_IGMT_BIT);
                kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
-                                VMX_EPT_EXECUTABLE_MASK);
+                                VMX_EPT_EXECUTABLE_MASK,
+                                VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
                kvm_enable_tdp();
        } else
                kvm_disable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1f8ff2f1fa2..0e6aa8141dcd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -39,6 +39,7 @@
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
+#include <asm/mtrr.h>
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS                                               \
@@ -86,6 +87,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
        { "hypercalls", VCPU_STAT(hypercalls) },
        { "request_irq", VCPU_STAT(request_irq_exits) },
+        { "request_nmi", VCPU_STAT(request_nmi_exits) },
        { "irq_exits", VCPU_STAT(irq_exits) },
        { "host_state_reload", VCPU_STAT(host_state_reload) },
        { "efer_reload", VCPU_STAT(efer_reload) },
@@ -93,6 +95,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "insn_emulation", VCPU_STAT(insn_emulation) },
        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
        { "irq_injections", VCPU_STAT(irq_injections) },
+        { "nmi_injections", VCPU_STAT(nmi_injections) },
        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -101,6 +104,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "mmu_recycled", VM_STAT(mmu_recycled) },
        { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
        { "mmu_unsync", VM_STAT(mmu_unsync) },
+        { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
        { "largepages", VM_STAT(lpages) },
        { NULL }
@@ -312,6 +316,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        kvm_x86_ops->set_cr0(vcpu, cr0);
        vcpu->arch.cr0 = cr0;
+        kvm_mmu_sync_global(vcpu);
        kvm_mmu_reset_context(vcpu);
        return;
 }
@@ -355,6 +360,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        }
        kvm_x86_ops->set_cr4(vcpu, cr4);
        vcpu->arch.cr4 = cr4;
+        kvm_mmu_sync_global(vcpu);
        kvm_mmu_reset_context(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -449,7 +455,7 @@ static u32 msrs_to_save[] = {
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
        MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
-        MSR_IA32_PERF_STATUS,
+        MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT
 };
 static unsigned num_msrs_to_save;
@@ -648,10 +654,38 @@ static bool msr_mtrr_valid(unsigned msr)
 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
+        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
        if (!msr_mtrr_valid(msr))
                return 1;
-        vcpu->arch.mtrr[msr - 0x200] = data;
+        if (msr == MSR_MTRRdefType) {
+                vcpu->arch.mtrr_state.def_type = data;
+                vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
+        } else if (msr == MSR_MTRRfix64K_00000)
+                p[0] = data;
+        else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+                p[1 + msr - MSR_MTRRfix16K_80000] = data;
+        else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+                p[3 + msr - MSR_MTRRfix4K_C0000] = data;
+        else if (msr == MSR_IA32_CR_PAT)
+                vcpu->arch.pat = data;
+        else {  /* Variable MTRRs */
+                int idx, is_mtrr_mask;
+                u64 *pt;
+                idx = (msr - 0x200) / 2;
+                is_mtrr_mask = msr - 0x200 - 2 * idx;
+                if (!is_mtrr_mask)
+                        pt =
+                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+                else
+                        pt =
+                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+                *pt = data;
+        }
+        kvm_mmu_reset_context(vcpu);
        return 0;
 }
@@ -747,10 +781,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
+        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
        if (!msr_mtrr_valid(msr))
                return 1;
-        *pdata = vcpu->arch.mtrr[msr - 0x200];
+        if (msr == MSR_MTRRdefType)
+                *pdata = vcpu->arch.mtrr_state.def_type +
+                         (vcpu->arch.mtrr_state.enabled << 10);
+        else if (msr == MSR_MTRRfix64K_00000)
+                *pdata = p[0];
+        else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+                *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
+        else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+                *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
+        else if (msr == MSR_IA32_CR_PAT)
+                *pdata = vcpu->arch.pat;
+        else {  /* Variable MTRRs */
+                int idx, is_mtrr_mask;
+                u64 *pt;
+                idx = (msr - 0x200) / 2;
+                is_mtrr_mask = msr - 0x200 - 2 * idx;
+                if (!is_mtrr_mask)
+                        pt =
+                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+                else
+                        pt =
+                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+                *pdata = *pt;
+        }
        return 0;
 }
@@ -903,7 +964,6 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_IRQCHIP:
        case KVM_CAP_HLT:
        case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
-        case KVM_CAP_USER_MEMORY:
        case KVM_CAP_SET_TSS_ADDR:
        case KVM_CAP_EXT_CPUID:
        case KVM_CAP_CLOCKSOURCE:
@@ -1188,6 +1248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                int t, times = entry->eax & 0xff;
                entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+                entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
                for (t = 1; t < times && *nent < maxnent; ++t) {
                        do_cpuid_1_ent(&entry[t], function, 0);
                        entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
@@ -1218,7 +1279,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                /* read more entries until level_type is zero */
                for (i = 1; *nent < maxnent; ++i) {
-                        level_type = entry[i - 1].ecx & 0xff;
+                        level_type = entry[i - 1].ecx & 0xff00;
                        if (!level_type)
                                break;
                        do_cpuid_1_ent(&entry[i], function, i);
@@ -1318,6 +1379,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
        return 0;
 }
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+        vcpu_load(vcpu);
+        kvm_inject_nmi(vcpu);
+        vcpu_put(vcpu);
+        return 0;
+}
 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
                                           struct kvm_tpr_access_ctl *tac)
 {
@@ -1377,6 +1447,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = 0;
                break;
        }
+        case KVM_NMI: {
+                r = kvm_vcpu_ioctl_nmi(vcpu);
+                if (r)
+                        goto out;
+                r = 0;
+                break;
+        }
        case KVM_SET_CPUID: {
                struct kvm_cpuid __user *cpuid_arg = argp;
                struct kvm_cpuid cpuid;
@@ -1968,7 +2045,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
        ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
        if (ret < 0)
                return 0;
-        kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+        kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
        return 1;
 }
@@ -2404,8 +2481,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
        val = kvm_register_read(vcpu, VCPU_REGS_RAX);
        memcpy(vcpu->arch.pio_data, &val, 4);
-        kvm_x86_ops->skip_emulated_instruction(vcpu);
        pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
        if (pio_dev) {
                kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
@@ -2541,7 +2616,7 @@ int kvm_arch_init(void *opaque)
        kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
        kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
+                        PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
        return 0;
 out:
@@ -2729,7 +2804,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
        e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
        /* when no next entry is found, the current entry[i] is reselected */
-        for (j = i + 1; j == i; j = (j + 1) % nent) {
+        for (j = i + 1; ; j = (j + 1) % nent) {
                struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
                if (ej->function == e->function) {
                        ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
@@ -2973,7 +3048,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                pr_debug("vcpu %d received sipi with vector # %x\n",
                         vcpu->vcpu_id, vcpu->arch.sipi_vector);
                kvm_lapic_reset(vcpu);
-                r = kvm_x86_ops->vcpu_reset(vcpu);
+                r = kvm_arch_vcpu_reset(vcpu);
                if (r)
                        return r;
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3275,9 +3350,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
        kvm_desct->padding = 0;
 }
-static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
+static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
-                                           u16 selector,
+                                          u16 selector,
-                                           struct descriptor_table *dtable)
+                                          struct descriptor_table *dtable)
 {
        if (selector & 1 << 2) {
                struct kvm_segment kvm_seg;
@@ -3302,7 +3377,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
        struct descriptor_table dtable;
        u16 index = selector >> 3;
-        get_segment_descritptor_dtable(vcpu, selector, &dtable);
+        get_segment_descriptor_dtable(vcpu, selector, &dtable);
        if (dtable.limit < index * 8 + 7) {
                kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
@@ -3321,7 +3396,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
        struct descriptor_table dtable;
        u16 index = selector >> 3;
-        get_segment_descritptor_dtable(vcpu, selector, &dtable);
+        get_segment_descriptor_dtable(vcpu, selector, &dtable);
        if (dtable.limit < index * 8 + 7)
                return 1;
@@ -3900,6 +3975,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        /* We do fxsave: this must be aligned. */
        BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
+        vcpu->arch.mtrr_state.have_fixed = 1;
        vcpu_load(vcpu);
        r = kvm_arch_vcpu_reset(vcpu);
        if (r == 0)
@@ -3925,6 +4001,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 {
+        vcpu->arch.nmi_pending = false;
+        vcpu->arch.nmi_injected = false;
        return kvm_x86_ops->vcpu_reset(vcpu);
 }
@@ -4012,6 +4091,7 @@ struct  kvm *kvm_arch_create_vm(void)
                return ERR_PTR(-ENOMEM);
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+        INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4048,8 +4128,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
-        kvm_iommu_unmap_guest(kvm);
        kvm_free_all_assigned_devices(kvm);
+        kvm_iommu_unmap_guest(kvm);
        kvm_free_pit(kvm);
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
@@ -4127,7 +4207,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
-               || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
+               || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+               || vcpu->arch.nmi_pending;
 }
 static void vcpu_kick_intr(void *info)
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ea051173b0da..d174db7a3370 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -58,6 +58,7 @@
 #define SrcMem32    (4<<4)      /* Memory operand (32-bit). */
 #define SrcImm      (5<<4)      /* Immediate operand. */
 #define SrcImmByte  (6<<4)      /* 8-bit sign-extended immediate operand. */
+#define SrcOne      (7<<4)      /* Implied '1' */
 #define SrcMask     (7<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<7)
@@ -70,17 +71,23 @@
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
+/* Source 2 operand type */
+#define Src2None    (0<<29)
+#define Src2CL      (1<<29)
+#define Src2ImmByte (2<<29)
+#define Src2One     (3<<29)
+#define Src2Mask    (7<<29)
 enum {
        Group1_80, Group1_81, Group1_82, Group1_83,
        Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
 };
-static u16 opcode_table[256] = {
+static u32 opcode_table[256] = {
        /* 0x00 - 0x07 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-        0, 0, 0, 0,
+        ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
        /* 0x08 - 0x0F */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +202,7 @@ static u16 opcode_table[256] = {
        ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
 };
-static u16 twobyte_table[256] = {
+static u32 twobyte_table[256] = {
        /* 0x00 - 0x0F */
        0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
        ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
@@ -230,9 +237,14 @@ static u16 twobyte_table[256] = {
        /* 0x90 - 0x9F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xA0 - 0xA7 */
-        0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+        0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+        DstMem | SrcReg | Src2ImmByte | ModRM,
+        DstMem | SrcReg | Src2CL | ModRM, 0, 0,
        /* 0xA8 - 0xAF */
-        0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
+        0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+        DstMem | SrcReg | Src2ImmByte | ModRM,
+        DstMem | SrcReg | Src2CL | ModRM,
+        ModRM, 0,
        /* 0xB0 - 0xB7 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
            DstMem | SrcReg | ModRM | BitOp,
@@ -253,7 +265,7 @@ static u16 twobyte_table[256] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
-static u16 group_table[] = {
+static u32 group_table[] = {
        [Group1_80*8] =
        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
@@ -297,9 +309,9 @@ static u16 group_table[] = {
        SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
 };
-static u16 group2_table[] = {
+static u32 group2_table[] = {
        [Group7*8] =
-        SrcNone | ModRM, 0, 0, 0,
+        SrcNone | ModRM, 0, 0, SrcNone | ModRM,
        SrcNone | ModRM | DstMem | Mov, 0,
        SrcMem16 | ModRM | Mov, 0,
 };
@@ -359,49 +371,48 @@ static u16 group2_table[] = {
        "andl %"_msk",%"_LO32 _tmp"; "          \
        "orl  %"_LO32 _tmp",%"_sav"; "
+#ifdef CONFIG_X86_64
+#define ON64(x) x
+#else
+#define ON64(x)
+#endif
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix)      \
+        do {                                                            \
+                __asm__ __volatile__ (                                  \
+                        _PRE_EFLAGS("0", "4", "2")                      \
+                        _op _suffix " %"_x"3,%1; "                      \
+                        _POST_EFLAGS("0", "4", "2")                     \
+                        : "=m" (_eflags), "=m" ((_dst).val),            \
+                          "=&r" (_tmp)                                  \
+                        : _y ((_src).val), "i" (EFLAGS_MASK));          \
+        } while (0)
 /* Raw emulation: instruction has two explicit operands. */
 #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
-        do {                                                                \
+        do {                                                            \
-                unsigned long _tmp;                                         \
+                unsigned long _tmp;                                     \
-                                                                            \
+                                                                        \
-                switch ((_dst).bytes) {                                     \
+                switch ((_dst).bytes) {                                 \
-                case 2:                                                     \
+                case 2:                                                 \
-                        __asm__ __volatile__ (                              \
+                        ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
-                                _PRE_EFLAGS("0", "4", "2")                  \
+                        break;                                          \
-                                _op"w %"_wx"3,%1; "                         \
+                case 4:                                                 \
-                                _POST_EFLAGS("0", "4", "2")                 \
+                        ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
-                                : "=m" (_eflags), "=m" ((_dst).val),        \
+                        break;                                          \
-                                  "=&r" (_tmp)                              \
+                case 8:                                                 \
-                                : _wy ((_src).val), "i" (EFLAGS_MASK));     \
+                        ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
-                        break;                                              \
+                        break;                                          \
-                case 4:                                                     \
+                }                                                       \
-                        __asm__ __volatile__ (                              \
-                                _PRE_EFLAGS("0", "4", "2")                  \
-                                _op"l %"_lx"3,%1; "                         \
-                                _POST_EFLAGS("0", "4", "2")                 \
-                                : "=m" (_eflags), "=m" ((_dst).val),        \
-                                  "=&r" (_tmp)                              \
-                                : _ly ((_src).val), "i" (EFLAGS_MASK));     \
-                        break;                                              \
-                case 8:                                                     \
-                        __emulate_2op_8byte(_op, _src, _dst,                \
-                                            _eflags, _qx, _qy);             \
-                        break;                                              \
-                }                                                           \
        } while (0)
 #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
        do {                                                                 \
-                unsigned long __tmp;                                         \
+                unsigned long _tmp;                                          \
                switch ((_dst).bytes) {                                      \
                case 1:                                                      \
-                        __asm__ __volatile__ (                               \
+                        ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b");  \
-                                _PRE_EFLAGS("0", "4", "2")                   \
-                                _op"b %"_bx"3,%1; "                          \
-                                _POST_EFLAGS("0", "4", "2")                  \
-                                : "=m" (_eflags), "=m" ((_dst).val),         \
-                                  "=&r" (__tmp)                              \
-                                : _by ((_src).val), "i" (EFLAGS_MASK));      \
                        break;                                               \
                default:                                                     \
                        __emulate_2op_nobyte(_op, _src, _dst, _eflags,       \
@@ -425,71 +436,68 @@ static u16 group2_table[] = {
        __emulate_2op_nobyte(_op, _src, _dst, _eflags,                  \
                             "w", "r", _LO32, "r", "", "r")
-/* Instruction has only one explicit operand (no source operand). */
+/* Instruction has three operands and one operand is stored in ECX register */
-#define emulate_1op(_op, _dst, _eflags)                                    \
+#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type)         \
-        do {                                                            \
+        do {                                                                    \
-                unsigned long _tmp;                                     \
+                unsigned long _tmp;                                             \
-                                                                        \
+                _type _clv  = (_cl).val;                                        \
-                switch ((_dst).bytes) {                                 \
+                _type _srcv = (_src).val;                                       \
-                case 1:                                                 \
+                _type _dstv = (_dst).val;                                       \
-                        __asm__ __volatile__ (                          \
+                                                                                \
-                                _PRE_EFLAGS("0", "3", "2")              \
+                __asm__ __volatile__ (                                          \
-                                _op"b %1; "                             \
+                        _PRE_EFLAGS("0", "5", "2")                              \
-                                _POST_EFLAGS("0", "3", "2")             \
+                        _op _suffix " %4,%1 \n"                                 \
-                                : "=m" (_eflags), "=m" ((_dst).val),    \
+                        _POST_EFLAGS("0", "5", "2")                             \
-                                  "=&r" (_tmp)                          \
+                        : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp)            \
-                                : "i" (EFLAGS_MASK));                   \
+                        : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK)           \
-                        break;                                          \
+                        );                                                      \
-                case 2:                                                 \
+                                                                                \
-                        __asm__ __volatile__ (                          \
+                (_cl).val  = (unsigned long) _clv;                              \
-                                _PRE_EFLAGS("0", "3", "2")              \
+                (_src).val = (unsigned long) _srcv;                             \
-                                _op"w %1; "                             \
+                (_dst).val = (unsigned long) _dstv;                             \
-                                _POST_EFLAGS("0", "3", "2")             \
-                                : "=m" (_eflags), "=m" ((_dst).val),    \
-                                  "=&r" (_tmp)                          \
-                                : "i" (EFLAGS_MASK));                   \
-                        break;                                          \
-                case 4:                                                 \
-                        __asm__ __volatile__ (                          \
-                                _PRE_EFLAGS("0", "3", "2")              \
-                                _op"l %1; "                             \
-                                _POST_EFLAGS("0", "3", "2")             \
-                                : "=m" (_eflags), "=m" ((_dst).val),    \
-                                  "=&r" (_tmp)                          \
-                                : "i" (EFLAGS_MASK));                   \
-                        break;                                          \
-                case 8:                                                 \
-                        __emulate_1op_8byte(_op, _dst, _eflags);        \
-                        break;                                          \
-                }                                                       \
        } while (0)
-/* Emulate an instruction with quadword operands (x86/64 only). */
+#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags)                           \
-#if defined(CONFIG_X86_64)
+        do {                                                                    \
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
+                switch ((_dst).bytes) {                                         \
-        do {                                                              \
+                case 2:                                                         \
-                __asm__ __volatile__ (                                    \
+                        __emulate_2op_cl(_op, _cl, _src, _dst, _eflags,         \
-                        _PRE_EFLAGS("0", "4", "2")                        \
+                                                "w", unsigned short);           \
-                        _op"q %"_qx"3,%1; "                               \
+                        break;                                                  \
-                        _POST_EFLAGS("0", "4", "2")                       \
+                case 4:                                                         \
-                        : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+                        __emulate_2op_cl(_op, _cl, _src, _dst, _eflags,         \
-                        : _qy ((_src).val), "i" (EFLAGS_MASK));         \
+                                                "l", unsigned int);             \
+                        break;                                                  \
+                case 8:                                                         \
+                        ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags,    \
+                                                "q", unsigned long));           \
+                        break;                                                  \
+                }                                                               \
        } while (0)
-#define __emulate_1op_8byte(_op, _dst, _eflags)                           \
+#define __emulate_1op(_op, _dst, _eflags, _suffix)                      \
-        do {                                                              \
+        do {                                                            \
-                __asm__ __volatile__ (                                    \
+                unsigned long _tmp;                                     \
-                        _PRE_EFLAGS("0", "3", "2")                        \
+                                                                        \
-                        _op"q %1; "                                       \
+                __asm__ __volatile__ (                                  \
-                        _POST_EFLAGS("0", "3", "2")                       \
+                        _PRE_EFLAGS("0", "3", "2")                      \
-                        : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+                        _op _suffix " %1; "                             \
-                        : "i" (EFLAGS_MASK));                             \
+                        _POST_EFLAGS("0", "3", "2")                     \
+                        : "=m" (_eflags), "+m" ((_dst).val),            \
+                          "=&r" (_tmp)                                  \
+                        : "i" (EFLAGS_MASK));                           \
        } while (0)
-#elif defined(__i386__)
+/* Instruction has only one explicit operand (no source operand). */
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
+#define emulate_1op(_op, _dst, _eflags)                                    \
-#define __emulate_1op_8byte(_op, _dst, _eflags)
+        do {                                                            \
-#endif                          /* __i386__ */
+                switch ((_dst).bytes) {                                 \
+                case 1: __emulate_1op(_op, _dst, _eflags, "b"); break;  \
+                case 2: __emulate_1op(_op, _dst, _eflags, "w"); break;  \
+                case 4: __emulate_1op(_op, _dst, _eflags, "l"); break;  \
+                case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
+                }                                                       \
+        } while (0)
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch(_type, _size, _eip)                                  \
@@ -1041,6 +1049,33 @@ done_prefixes:
                c->src.bytes = 1;
                c->src.val = insn_fetch(s8, 1, c->eip);
                break;
+        case SrcOne:
+                c->src.bytes = 1;
+                c->src.val = 1;
+                break;
+        }
+        /*
+         * Decode and fetch the second source operand: register, memory
+         * or immediate.
+         */
+        switch (c->d & Src2Mask) {
+        case Src2None:
+                break;
+        case Src2CL:    /* shift count: all of CL (RCX bits 7:0) */
+                c->src2.bytes = 1;
+                c->src2.val = c->regs[VCPU_REGS_RCX] & 0xff;
+                break;
+        case Src2ImmByte:
+                c->src2.type = OP_IMM;
+                c->src2.ptr = (unsigned long *)c->eip;
+                c->src2.bytes = 1;
+                c->src2.val = insn_fetch(u8, 1, c->eip);
+                break;
+        case Src2One:
+                c->src2.bytes = 1;
+                c->src2.val = 1;
+                break;
        }
        /* Decode and fetch the destination operand: register or memory. */
@@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
                                               c->regs[VCPU_REGS_RSP]);
 }
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
-                                struct x86_emulate_ops *ops)
+                       struct x86_emulate_ops *ops)
 {
        struct decode_cache *c = &ctxt->decode;
        int rc;
-        rc = ops->read_std(register_address(c, ss_base(ctxt),
+        rc = ops->read_emulated(register_address(c, ss_base(ctxt),
-                                            c->regs[VCPU_REGS_RSP]),
+                                                 c->regs[VCPU_REGS_RSP]),
-                           &c->dst.val, c->dst.bytes, ctxt->vcpu);
+                                &c->src.val, c->src.bytes, ctxt->vcpu);
        if (rc != 0)
                return rc;
-        register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes);
+        register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes);
+        return rc;
+}
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+                                struct x86_emulate_ops *ops)
+{
+        struct decode_cache *c = &ctxt->decode;
+        int rc;
+        c->src.bytes = c->dst.bytes;
+        rc = emulate_pop(ctxt, ops);
+        if (rc != 0)
+                return rc;
+        c->dst.val = c->src.val;
        return 0;
 }
@@ -1415,24 +1463,15 @@ special_insn:
                emulate_1op("dec", c->dst, ctxt->eflags);
                break;
        case 0x50 ... 0x57:  /* push reg */
-                c->dst.type  = OP_MEM;
+                emulate_push(ctxt);
-                c->dst.bytes = c->op_bytes;
-                c->dst.val = c->src.val;
-                register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-                                           -c->op_bytes);
-                c->dst.ptr = (void *) register_address(
-                        c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
                break;
        case 0x58 ... 0x5f: /* pop reg */
        pop_instruction:
-                if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
+                c->src.bytes = c->op_bytes;
-                        c->regs[VCPU_REGS_RSP]), c->dst.ptr,
+                rc = emulate_pop(ctxt, ops);
-                        c->op_bytes, ctxt->vcpu)) != 0)
+                if (rc != 0)
                        goto done;
+                c->dst.val = c->src.val;
-                register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-                                           c->op_bytes);
-                c->dst.type = OP_NONE;  /* Disable writeback. */
                break;
        case 0x63:              /* movsxd */
                if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1591,7 +1630,9 @@ special_insn:
                emulate_push(ctxt);
                break;
        case 0x9d: /* popf */
+                c->dst.type = OP_REG;
                c->dst.ptr = (unsigned long *) &ctxt->eflags;
+                c->dst.bytes = c->op_bytes;
                goto pop_instruction;
        case 0xa0 ... 0xa1:     /* mov */
                c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
@@ -1689,7 +1730,9 @@ special_insn:
                emulate_grp2(ctxt);
                break;
        case 0xc3: /* ret */
+                c->dst.type = OP_REG;
                c->dst.ptr = &c->eip;
+                c->dst.bytes = c->op_bytes;
                goto pop_instruction;
        case 0xc6 ... 0xc7:     /* mov (sole member of Grp11) */
        mov:
@@ -1778,7 +1821,7 @@ special_insn:
                        c->eip = saved_eip;
                        goto cannot_emulate;
                }
-                return 0;
+                break;
        case 0xf4:              /* hlt */
                ctxt->vcpu->arch.halt_request = 1;
                break;
@@ -1999,12 +2042,20 @@ twobyte_insn:
                c->src.val &= (c->dst.bytes << 3) - 1;
                emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
                break;
+        case 0xa4: /* shld imm8, r, r/m */
+        case 0xa5: /* shld cl, r, r/m */
+                emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+                break;
        case 0xab:
              bts:              /* bts */
                /* only subword offset */
                c->src.val &= (c->dst.bytes << 3) - 1;
                emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
                break;
+        case 0xac: /* shrd imm8, r, r/m */
+        case 0xad: /* shrd cl, r, r/m */
+                emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+                break;
        case 0xae:              /* clflush */
                break;
        case 0xb0 ... 0xb1:     /* cmpxchg */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 50a779264bb1..a7ed208f81e3 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -738,7 +738,7 @@ static void lguest_time_init(void)
        /* We can't set cpumask in the initializer: damn C limitations!  Set it
         * here and register our timer device. */
-        lguest_clockevent.cpumask = cpumask_of_cpu(0);
+        lguest_clockevent.cpumask = cpumask_of(0);
        clockevents_register_device(&lguest_clockevent);
        /* Finally, we unblock the timer interrupt. */
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 5c7cef34c9e7..10b9bd35a8ff 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -30,21 +30,6 @@ ENTRY(lguest_entry)
        movl $lguest_data - __PAGE_OFFSET, %edx
        int $LGUEST_TRAP_ENTRY
-        /* The Host put the toplevel pagetable in lguest_data.pgdir.  The movsl
-         * instruction uses %esi implicitly as the source for the copy we're
-         * about to do. */
-        movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
-        /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
-         * This means the first 128M of kernel memory will be mapped at
-         * PAGE_OFFSET where the kernel expects to run.  This will get it far
-         * enough through boot to switch to its own pagetables. */
-        movl $32, %ecx
-        movl %esi, %edi
-        addl $((__PAGE_OFFSET >> 22) * 4), %edi
-        rep
-        movsl
        /* Set up the initial stack so we can run C code. */
        movl $(init_thread_union+THREAD_SIZE),%esp
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 9e68075544f6..4a20b2f9a381 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
 #define __do_strncpy_from_user(dst, src, count, res)                       \
 do {                                                                       \
        int __d0, __d1, __d2;                                              \
-        might_sleep();                                                     \
+        might_fault();                                                     \
        __asm__ __volatile__(                                              \
                "       testl %1,%1\n"                                     \
                "       jz 2f\n"                                           \
@@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user);
 #define __do_clear_user(addr,size)                                      \
 do {                                                                    \
        int __d0;                                                       \
-        might_sleep();                                                  \
+        might_fault();                                                  \
        __asm__ __volatile__(                                           \
                "0:     rep; stosl\n"                                   \
                "       movl %2,%0\n"                                   \
@@ -155,7 +155,7 @@ do {									\
 unsigned long
 clear_user(void __user *to, unsigned long n)
 {
-        might_sleep();
+        might_fault();
        if (access_ok(VERIFY_WRITE, to, n))
                __do_clear_user(to, n);
        return n;
@@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n)
        unsigned long mask = -__addr_ok(s);
        unsigned long res, tmp;
-        might_sleep();
+        might_fault();
        __asm__ __volatile__(
                "       testl %0, %0\n"
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index f4df6e7c718b..64d6c84e6353 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -15,7 +15,7 @@
 #define __do_strncpy_from_user(dst,src,count,res)                          \
 do {                                                                       \
        long __d0, __d1, __d2;                                             \
-        might_sleep();                                                     \
+        might_fault();                                                     \
        __asm__ __volatile__(                                              \
                "       testq %1,%1\n"                                     \
                "       jz 2f\n"                                           \
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);
 unsigned long __clear_user(void __user *addr, unsigned long size)
 {
        long __d0;
-        might_sleep();
+        might_fault();
        /* no memory constraint because it doesn't change any memory gcc knows
           about */
        asm volatile(
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 3624a364b7f3..bc4c7840b2a8 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -42,9 +42,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
         { }
 };
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
-        return cpumask_of_cpu(cpu);
+        cpus_clear(*retmask);
+        cpu_set(cpu, *retmask);
 }
 static int probe_bigsmp(void)
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 7b4e6d0d1690..4ba5ccaa1584 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -87,7 +87,7 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 }
 #endif
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -97,8 +97,7 @@ static cpumask_t vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-        cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+        *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
-        return domain;
 }
 struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 71a309b122e6..511d7941364f 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -38,7 +38,7 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
        return 0;
 }
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -48,8 +48,7 @@ static cpumask_t vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-        cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+        *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
-        return domain;
 }
 struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index 2c6d234e0009..2821ffc188b5 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -24,7 +24,7 @@ static int probe_summit(void)
        return 0;
 }
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -34,8 +34,7 @@ static cpumask_t vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-        cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+        *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
-        return domain;
 }
 struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 52145007bd7e..a5bc05492b1e 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -63,11 +63,6 @@ static int voyager_extended_cpus = 1;
 /* Used for the invalidate map that's also checked in the spinlock */
 static volatile unsigned long smp_invalidate_needed;
-/* Bitmask of currently online CPUs - used by setup.c for
-   /proc/cpuinfo, visible externally but still physical */
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_online_map);
 /* Bitmask of CPUs present in the system - exported by i386_syms.c, used
 * by scheduler but indexed physically */
 cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
@@ -218,8 +213,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
 /* This is for the new dynamic CPU boot code */
 cpumask_t cpu_callin_map = CPU_MASK_NONE;
 cpumask_t cpu_callout_map = CPU_MASK_NONE;
-cpumask_t cpu_possible_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_possible_map);
 /* The per processor IRQ masks (these are usually kept in sync) */
 static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
@@ -679,7 +672,7 @@ void __init smp_boot_cpus(void)
        /* loop over all the extended VIC CPUs and boot them.  The
         * Quad CPUs must be bootstrapped by their extended VIC cpu */
-        for (i = 0; i < NR_CPUS; i++) {
+        for (i = 0; i < nr_cpu_ids; i++) {
                if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
                        continue;
                do_boot_cpu(i);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ad98b18f2b48..f99a6c6c432e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/pci.h>
 #include <linux/pfn.h>
 #include <linux/poison.h>
 #include <linux/bootmem.h>
@@ -971,6 +972,8 @@ void __init mem_init(void)
        int codesize, reservedpages, datasize, initsize;
        int tmp;
+        pci_iommu_alloc();
 #ifdef CONFIG_FLATMEM
        BUG_ON(!mem_map);
 #endif
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index cebcbf152d46..71a14f89f89e 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -278,7 +278,7 @@ void __init numa_init_array(void)
        int rr, i;
        rr = first_node(node_online_map);
-        for (i = 0; i < NR_CPUS; i++) {
+        for (i = 0; i < nr_cpu_ids; i++) {
                if (early_cpu_to_node(i) != NUMA_NO_NODE)
                        continue;
                numa_set_node(i, rr);
@@ -549,7 +549,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
        memnodemap[0] = 0;
        node_set_online(0);
        node_set(0, node_possible_map);
-        for (i = 0; i < NR_CPUS; i++)
+        for (i = 0; i < nr_cpu_ids; i++)
                numa_set_node(i, 0);
        e820_register_active_regions(0, start_pfn, last_pfn);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 51c0a2fc14fe..09737c8af074 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -382,7 +382,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
                if (!node_online(i))
                        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-        for (i = 0; i < NR_CPUS; i++) {
+        for (i = 0; i < nr_cpu_ids; i++) {
                int node = early_cpu_to_node(i);
                if (node == NUMA_NO_NODE)
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 509513760a6e..98658f25f542 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -65,11 +65,13 @@ static unsigned long reset_value[NUM_COUNTERS];
 #define IBS_FETCH_BEGIN 3
 #define IBS_OP_BEGIN    4
-/* The function interface needs to be fixed, something like add
+/*
-   data. Should then be added to linux/oprofile.h. */
+ * The function interface needs to be fixed, something like add
+ * data. Should then be added to linux/oprofile.h.
+ */
 extern void
-oprofile_add_ibs_sample(struct pt_regs *const regs,
+oprofile_add_ibs_sample(struct pt_regs * const regs,
-                        unsigned int *const ibs_sample, int ibs_code);
+                        unsigned int * const ibs_sample, int ibs_code);
 struct ibs_fetch_sample {
        /* MSRC001_1031 IBS Fetch Linear Address Register */
@@ -104,11 +106,6 @@ struct ibs_op_sample {
        unsigned int ibs_dc_phys_high;
 };
-/*
- * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+
-*/
-static void clear_ibs_nmi(void);
 static int ibs_allowed; /* AMD Family10h and later */
 struct op_ibs_config {
@@ -223,7 +220,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
                                                (unsigned int *)&ibs_fetch,
                                                IBS_FETCH_BEGIN);
-                        /*reenable the IRQ */
+                        /* reenable the IRQ */
                        rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
                        high &= ~IBS_FETCH_HIGH_VALID_BIT;
                        high |= IBS_FETCH_HIGH_ENABLE;
@@ -331,8 +328,10 @@ static void op_amd_stop(struct op_msrs const * const msrs)
        unsigned int low, high;
        int i;
-        /* Subtle: stop on all counters to avoid race with
+        /*
-         * setting our pm callback */
+         * Subtle: stop on all counters to avoid race with setting our
+         * pm callback
+         */
        for (i = 0 ; i < NUM_COUNTERS ; ++i) {
                if (!reset_value[i])
                        continue;
@@ -343,13 +342,15 @@ static void op_amd_stop(struct op_msrs const * const msrs)
 #ifdef CONFIG_OPROFILE_IBS
        if (ibs_allowed && ibs_config.fetch_enabled) {
-                low = 0;                /* clear max count and enable */
+                /* clear max count and enable */
+                low = 0;
                high = 0;
                wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
        }
        if (ibs_allowed && ibs_config.op_enabled) {
-                low = 0;                /* clear max count and enable */
+                /* clear max count and enable */
+                low = 0;
                high = 0;
                wrmsr(MSR_AMD64_IBSOPCTL, low, high);
        }
@@ -370,18 +371,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
        }
 }
-#ifndef CONFIG_OPROFILE_IBS
+#ifdef CONFIG_OPROFILE_IBS
-/* no IBS support */
-static int op_amd_init(struct oprofile_operations *ops)
-{
-        return 0;
-}
-static void op_amd_exit(void) {}
-#else
 static u8 ibs_eilvt_off;
@@ -395,7 +385,7 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
        setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
 }
-static int pfm_amd64_setup_eilvt(void)
+static int init_ibs_nmi(void)
 {
 #define IBSCTL_LVTOFFSETVAL             (1 << 8)
 #define IBSCTL                          0x1cc
@@ -443,18 +433,22 @@ static int pfm_amd64_setup_eilvt(void)
        return 0;
 }
-/*
+/* uninitialize the APIC for the IBS interrupts if needed */
- * initialize the APIC for the IBS interrupts
+static void clear_ibs_nmi(void)
- * if available (AMD Family10h rev B0 and later)
+{
- */
+        if (ibs_allowed)
-static void setup_ibs(void)
+                on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
+}
+/* initialize the APIC for the IBS interrupts if available */
+static void ibs_init(void)
 {
        ibs_allowed = boot_cpu_has(X86_FEATURE_IBS);
        if (!ibs_allowed)
                return;
-        if (pfm_amd64_setup_eilvt()) {
+        if (init_ibs_nmi()) {
                ibs_allowed = 0;
                return;
        }
@@ -462,14 +456,12 @@ static void setup_ibs(void)
        printk(KERN_INFO "oprofile: AMD IBS detected\n");
 }
+static void ibs_exit(void)
-/*
- * unitialize the APIC for the IBS interrupts if needed on AMD Family10h
- * rev B0 and later */
-static void clear_ibs_nmi(void)
 {
-        if (ibs_allowed)
+        if (!ibs_allowed)
-                on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
+                return;
+        clear_ibs_nmi();
 }
 static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
@@ -519,7 +511,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
 static int op_amd_init(struct oprofile_operations *ops)
 {
-        setup_ibs();
+        ibs_init();
        create_arch_files = ops->create_files;
        ops->create_files = setup_ibs_files;
        return 0;
@@ -527,10 +519,21 @@ static int op_amd_init(struct oprofile_operations *ops)
 static void op_amd_exit(void)
 {
-        clear_ibs_nmi();
+        ibs_exit();
 }
-#endif
+#else
+/* no IBS support */
+static int op_amd_init(struct oprofile_operations *ops)
+{
+        return 0;
+}
+static void op_amd_exit(void) {}
+#endif /* CONFIG_OPROFILE_IBS */
 struct op_x86_model_spec const op_amd_spec = {
        .init                   = op_amd_init,
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 773d68d3e912..503c240e26c7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1082,7 +1082,7 @@ static void drop_other_mm_ref(void *info)
 static void xen_drop_mm_ref(struct mm_struct *mm)
 {
-        cpumask_t mask;
+        cpumask_var_t mask;
        unsigned cpu;
        if (current->active_mm == mm) {
@@ -1094,7 +1094,16 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
        }
        /* Get the "official" set of cpus referring to our pagetable. */
-        mask = mm->cpu_vm_mask;
+        if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
+                for_each_online_cpu(cpu) {
+                        if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
+                            && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+                                continue;
+                        smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
+                }
+                return;
+        }
+        cpumask_copy(mask, &mm->cpu_vm_mask);
        /* It's possible that a vcpu may have a stale reference to our
           cr3, because its in lazy mode, and it hasn't yet flushed
@@ -1103,11 +1112,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
           if needed. */
        for_each_online_cpu(cpu) {
                if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
-                        cpu_set(cpu, mask);
+                        cpumask_set_cpu(cpu, mask);
        }
-        if (!cpus_empty(mask))
+        if (!cpumask_empty(mask))
-                smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
+                smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
+        free_cpumask_var(mask);
 }
 #else
 static void xen_drop_mm_ref(struct mm_struct *mm)
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index acd9b6705e02..c44e2069c7c7 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -33,7 +33,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
-cpumask_t xen_cpu_initialized_map;
+cpumask_var_t xen_cpu_initialized_map;
 static DEFINE_PER_CPU(int, resched_irq);
 static DEFINE_PER_CPU(int, callfunc_irq);
@@ -158,7 +158,7 @@ static void __init xen_fill_possible_map(void)
 {
        int i, rc;
-        for (i = 0; i < NR_CPUS; i++) {
+        for (i = 0; i < nr_cpu_ids; i++) {
                rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
                if (rc >= 0) {
                        num_processors++;
@@ -192,11 +192,14 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
        if (xen_smp_intr_init(0))
                BUG();
-        xen_cpu_initialized_map = cpumask_of_cpu(0);
+        if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
+                panic("could not allocate xen_cpu_initialized_map\n");
+        cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
        /* Restrict the possible_map according to max_cpus. */
        while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
-                for (cpu = NR_CPUS - 1; !cpu_possible(cpu); cpu--)
+                for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
                        continue;
                cpu_clear(cpu, cpu_possible_map);
        }
@@ -221,7 +224,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        struct vcpu_guest_context *ctxt;
        struct desc_struct *gdt;
-        if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
+        if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
                return 0;
        ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
@@ -408,24 +411,23 @@ static void xen_smp_send_reschedule(int cpu)
        xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
 }
-static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
+static void xen_send_IPI_mask(const struct cpumask *mask,
+                              enum ipi_vector vector)
 {
        unsigned cpu;
-        cpus_and(mask, mask, cpu_online_map);
+        for_each_cpu_and(cpu, mask, cpu_online_mask)
-        for_each_cpu_mask_nr(cpu, mask)
                xen_send_IPI_one(cpu, vector);
 }
-static void xen_smp_send_call_function_ipi(cpumask_t mask)
+static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 {
        int cpu;
        xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
        /* Make sure other vcpus get a chance to run if they need to. */
-        for_each_cpu_mask_nr(cpu, mask) {
+        for_each_cpu(cpu, mask) {
                if (xen_vcpu_stolen(cpu)) {
                        HYPERVISOR_sched_op(SCHEDOP_yield, 0);
                        break;
@@ -435,7 +437,8 @@ static void xen_smp_send_call_function_ipi(cpumask_t mask)
 static void xen_smp_send_call_function_single_ipi(int cpu)
 {
-        xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
+        xen_send_IPI_mask(cpumask_of(cpu),
+                          XEN_CALL_FUNCTION_SINGLE_VECTOR);
 }
 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 2a234db5949b..212ffe012b76 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -35,7 +35,8 @@ void xen_post_suspend(int suspend_cancelled)
                        pfn_to_mfn(xen_start_info->console.domU.mfn);
        } else {
 #ifdef CONFIG_SMP
-                xen_cpu_initialized_map = cpu_online_map;
+                BUG_ON(xen_cpu_initialized_map == NULL);
+                cpumask_copy(xen_cpu_initialized_map, cpu_online_mask);
 #endif
                xen_vcpu_restore();
        }
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c9f7cda48ed7..65d75a6be0ba 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -437,7 +437,7 @@ void xen_setup_timer(int cpu)
        evt = &per_cpu(xen_clock_events, cpu);
        memcpy(evt, xen_clockevent, sizeof(*evt));
-        evt->cpumask = cpumask_of_cpu(cpu);
+        evt->cpumask = cpumask_of(cpu);
        evt->irq = irq;
        setup_runstate_info(cpu);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9e1afae8461f..c1f8faf0a2c5 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -58,7 +58,7 @@ void __init xen_init_spinlocks(void);
 __cpuinit void xen_init_lock_cpu(int cpu);
 void xen_uninit_lock_cpu(int cpu);
-extern cpumask_t xen_cpu_initialized_map;
+extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
 #endif