Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 14
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 1
-rw-r--r--  arch/x86/kernel/apic.c | 50
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 11
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 34
-rw-r--r--  arch/x86/kernel/cpu/common.c | 202
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 20
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 25
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 63
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 21
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 12
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 35
-rw-r--r--  arch/x86/kernel/efi.c | 2
-rw-r--r--  arch/x86/kernel/efi_64.c | 1
-rw-r--r--  arch/x86/kernel/entry_32.S | 8
-rw-r--r--  arch/x86/kernel/entry_64.S | 45
-rw-r--r--  arch/x86/kernel/genapic_64.c | 2
-rw-r--r--  arch/x86/kernel/genx2apic_uv_x.c | 1
-rw-r--r--  arch/x86/kernel/head64.c | 23
-rw-r--r--  arch/x86/kernel/head_32.S | 19
-rw-r--r--  arch/x86/kernel/head_64.S | 36
-rw-r--r--  arch/x86/kernel/hpet.c | 3
-rw-r--r--  arch/x86/kernel/io_apic.c | 46
-rw-r--r--  arch/x86/kernel/irq.c | 6
-rw-r--r--  arch/x86/kernel/irq_32.c | 2
-rw-r--r--  arch/x86/kernel/irq_64.c | 9
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 11
-rw-r--r--  arch/x86/kernel/kprobes.c | 2
-rw-r--r--  arch/x86/kernel/mpparse.c | 1
-rw-r--r--  arch/x86/kernel/nmi.c | 10
-rw-r--r--  arch/x86/kernel/paravirt.c | 1
-rw-r--r--  arch/x86/kernel/process_32.c | 6
-rw-r--r--  arch/x86/kernel/process_64.c | 43
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 411
-rw-r--r--  arch/x86/kernel/signal.c | 11
-rw-r--r--  arch/x86/kernel/smpboot.c | 73
-rw-r--r--  arch/x86/kernel/smpcommon.c | 30
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 2
-rw-r--r--  arch/x86/kernel/tlb_32.c | 256
-rw-r--r--  arch/x86/kernel/tlb_64.c | 284
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 69
-rw-r--r--  arch/x86/kernel/traps.c | 1
-rw-r--r--  arch/x86/kernel/vmi_32.c | 2
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S | 9
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S | 26
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 2
47 files changed, 589 insertions(+), 1353 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d364df03c1d6..37fa30bada17 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,11 +23,12 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o := $(nostackp)
 CFLAGS_tsc.o := $(nostackp)
+CFLAGS_paravirt.o := $(nostackp)
 
 obj-y := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
+obj-y += setup.o i8259.o irqinit_$(BITS).o
 obj-$(CONFIG_X86_VISWS) += visws_quirks.o
 obj-$(CONFIG_X86_32) += probe_roms_32.o
 obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -57,9 +58,9 @@ obj-$(CONFIG_PCI) += early-quirks.o
 apm-y := apm_32.o
 obj-$(CONFIG_APM) += apm.o
 obj-$(CONFIG_X86_SMP) += smp.o
-obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o
-obj-$(CONFIG_X86_32_SMP) += smpcommon.o
-obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
+obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o
+obj-$(CONFIG_SMP) += setup_percpu.o
+obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE) += mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
@@ -114,10 +115,11 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-        obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
-        obj-y += bios_uv.o uv_irq.o uv_sysfs.o
+        obj-y += genapic_64.o genapic_flat_64.o
         obj-y += genx2apic_cluster.o
         obj-y += genx2apic_phys.o
+        obj-$(CONFIG_X86_UV) += genx2apic_uv_x.o tlb_uv.o
+        obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_sysfs.o
         obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
         obj-$(CONFIG_AUDIT) += audit_64.o
 
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 707c1f6f95fa..4abff454c55b 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,6 +101,7 @@ int acpi_save_state_mem(void)
 	stack_start.sp = temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_table(smp_processor_id());
+	initial_gs = per_cpu_offset(smp_processor_id());
 #endif
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 566a08466b19..c6f15647eba9 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -47,6 +47,7 @@
 #include <asm/proto.h>
 #include <asm/apic.h>
 #include <asm/i8259.h>
+#include <asm/smp.h>
 
 #include <mach_apic.h>
 #include <mach_apicdef.h>
@@ -59,6 +60,24 @@
 # error SPURIOUS_APIC_VECTOR definition error
 #endif
 
+unsigned int num_processors;
+unsigned disabled_cpus __cpuinitdata;
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+EXPORT_SYMBOL(boot_cpu_physical_apicid);
+unsigned int max_physical_apicid;
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map;
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+
 #ifdef CONFIG_X86_32
 /*
  * Knob to control our willingness to enable the local APIC.
@@ -894,6 +913,10 @@ void disable_local_APIC(void)
 {
 	unsigned int value;
 
+	/* APIC hasn't been mapped yet */
+	if (!apic_phys)
+		return;
+
 	clear_local_APIC();
 
 	/*
@@ -1125,6 +1148,13 @@ void __cpuinit setup_local_APIC(void)
 	unsigned int value;
 	int i, j;
 
+	if (disable_apic) {
+#ifdef CONFIG_X86_IO_APIC
+		disable_ioapic_setup();
+#endif
+		return;
+	}
+
 #ifdef CONFIG_X86_32
 	/* Pound the ESR really hard over the head with a big hammer - mbligh */
 	if (lapic_is_integrated() && esr_disable) {
@@ -1565,11 +1595,11 @@ int apic_version[MAX_APICS];
 
 int __init APIC_init_uniprocessor(void)
 {
-#ifdef CONFIG_X86_64
 	if (disable_apic) {
 		pr_info("Apic disabled\n");
 		return -1;
 	}
+#ifdef CONFIG_X86_64
 	if (!cpu_has_apic) {
 		disable_apic = 1;
 		pr_info("Apic disabled by BIOS\n");
@@ -1832,6 +1862,11 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	num_processors++;
 	cpu = cpumask_next_zero(-1, cpu_present_mask);
 
+	if (version != apic_version[boot_cpu_physical_apicid])
+		WARN_ONCE(1,
+			"ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
+			apic_version[boot_cpu_physical_apicid], cpu, version);
+
 	physid_set(apicid, phys_cpu_present_map);
 	if (apicid == boot_cpu_physical_apicid) {
 		/*
@@ -1867,17 +1902,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
 #endif
 
 #if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
-	/* are we being called early in kernel startup? */
-	if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
-		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
-		u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-
-		cpu_to_apicid[cpu] = apicid;
-		bios_cpu_apicid[cpu] = apicid;
-	} else {
-		per_cpu(x86_cpu_to_apicid, cpu) = apicid;
-		per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
-	}
+	early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+	early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
 #endif
 
 	set_cpu_possible(cpu, true);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1d41d3f1edbc..8793ab33e2c1 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -11,7 +11,6 @@
 #include <linux/hardirq.h>
 #include <linux/suspend.h>
 #include <linux/kbuild.h>
-#include <asm/pda.h>
 #include <asm/processor.h>
 #include <asm/segment.h>
 #include <asm/thread_info.h>
@@ -48,16 +47,6 @@ int main(void)
 #endif
 	BLANK();
 #undef ENTRY
-#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
-	ENTRY(kernelstack);
-	ENTRY(oldrsp);
-	ENTRY(pcurrent);
-	ENTRY(irqcount);
-	ENTRY(cpunumber);
-	ENTRY(irqstackptr);
-	ENTRY(data_offset);
-	BLANK();
-#undef ENTRY
 #ifdef CONFIG_PARAVIRT
 	BLANK();
 	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 2cf23634b6d9..4e581fdc0a5a 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -143,37 +143,3 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
 	return;
 #endif
 }
-
-#ifdef CONFIG_X86_PAT
-void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
-{
-	if (!cpu_has_pat)
-		pat_disable("PAT not supported by CPU.");
-
-	switch (c->x86_vendor) {
-	case X86_VENDOR_INTEL:
-		/*
-		 * There is a known erratum on Pentium III and Core Solo
-		 * and Core Duo CPUs.
-		 * " Page with PAT set to WC while associated MTRR is UC
-		 * may consolidate to UC "
-		 * Because of this erratum, it is better to stick with
-		 * setting WC in MTRR rather than using PAT on these CPUs.
-		 *
-		 * Enable PAT WC only on P4, Core 2 or later CPUs.
-		 */
-		if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
-			return;
-
-		pat_disable("PAT WC disabled due to known CPU erratum.");
-		return;
-
-	case X86_VENDOR_AMD:
-	case X86_VENDOR_CENTAUR:
-	case X86_VENDOR_TRANSMETA:
-		return;
-	}
-
-	pat_disable("PAT disabled. Not yet verified on this CPU type.");
-}
-#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f00258462444..275e2cb43b91 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -28,9 +28,9 @@
 #include <asm/apic.h>
 #include <mach_apic.h>
 #include <asm/genapic.h>
+#include <asm/uv/uv.h>
 #endif
 
-#include <asm/pda.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
@@ -52,6 +52,15 @@ cpumask_var_t cpu_initialized_mask;
 /* representing cpus for which sibling maps can be computed */
 cpumask_var_t cpu_sibling_setup_mask;
 
+/* correctly size the local cpu masks */
+void __init setup_cpu_local_masks(void)
+{
+	alloc_bootmem_cpumask_var(&cpu_initialized_mask);
+	alloc_bootmem_cpumask_var(&cpu_callin_mask);
+	alloc_bootmem_cpumask_var(&cpu_callout_mask);
+	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+}
+
 #else /* CONFIG_X86_32 */
 
 cpumask_t cpu_callin_map;
@@ -64,23 +73,23 @@ cpumask_t cpu_sibling_setup_map;
 
 static struct cpu_dev *this_cpu __cpuinitdata;
 
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-/* The TLS descriptors are currently at a different place compared to i386.
-   Hopefully nobody expects them at a fixed place (Wine?) */
-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+	/*
+	 * We need valid kernel segments for data and code in long mode too
+	 * IRET will check the segment types kkeil 2000/10/28
+	 * Also sysret mandates a special GDT layout
+	 *
+	 * The TLS descriptors are currently at a different place compared to i386.
+	 * Hopefully nobody expects them at a fixed place (Wine?)
+	 */
 	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
 	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
 	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
 	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
 	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
 	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
-} };
 #else
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
 	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
 	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -112,9 +121,9 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
 
 	[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
-	[GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
-} };
+	[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
 #endif
+} };
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
 #ifdef CONFIG_X86_32
@@ -215,6 +224,49 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
 #endif
 
 /*
+ * Some CPU features depend on higher CPUID levels, which may not always
+ * be available due to CPUID level capping or broken virtualization
+ * software. Add those features to this table to auto-disable them.
+ */
+struct cpuid_dependent_feature {
+	u32 feature;
+	u32 level;
+};
+static const struct cpuid_dependent_feature __cpuinitconst
+cpuid_dependent_features[] = {
+	{ X86_FEATURE_MWAIT, 0x00000005 },
+	{ X86_FEATURE_DCA, 0x00000009 },
+	{ X86_FEATURE_XSAVE, 0x0000000d },
+	{ 0, 0 }
+};
+
+static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+{
+	const struct cpuid_dependent_feature *df;
+	for (df = cpuid_dependent_features; df->feature; df++) {
+		/*
+		 * Note: cpuid_level is set to -1 if unavailable, but
+		 * extended_extended_level is set to 0 if unavailable
+		 * and the legitimate extended levels are all negative
+		 * when signed; hence the weird messing around with
+		 * signs here...
+		 */
+		if (cpu_has(c, df->feature) &&
+		    ((s32)df->feature < 0 ?
+		     (u32)df->feature > (u32)c->extended_cpuid_level :
+		     (s32)df->feature > (s32)c->cpuid_level)) {
+			clear_cpu_cap(c, df->feature);
+			if (warn)
+				printk(KERN_WARNING
+				       "CPU: CPU feature %s disabled "
+				       "due to lack of CPUID level 0x%x\n",
+				       x86_cap_flags[df->feature],
+				       df->level);
+		}
+	}
+}
+
+/*
  * Naming convention should be: <Name> [(<Codename>)]
  * This table only is used unless init_<vendor>() below doesn't set it;
  * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
@@ -249,12 +301,17 @@ __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
 void switch_to_new_gdt(void)
 {
 	struct desc_ptr gdt_descr;
+	int cpu = smp_processor_id();
 
-	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+	gdt_descr.address = (long)get_cpu_gdt_table(cpu);
 	gdt_descr.size = GDT_SIZE - 1;
 	load_gdt(&gdt_descr);
+	/* Reload the per-cpu base */
 #ifdef CONFIG_X86_32
-	asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
+	loadsegment(fs, __KERNEL_PERCPU);
+#else
+	loadsegment(gs, 0);
+	wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
 #endif
 }
 
@@ -572,11 +629,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	if (this_cpu->c_early_init)
 		this_cpu->c_early_init(c);
 
-	validate_pat_support(c);
-
 #ifdef CONFIG_SMP
 	c->cpu_index = boot_cpu_id;
 #endif
+	filter_cpuid_features(c, false);
 }
 
 void __init early_cpu_init(void)
@@ -710,6 +766,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	 * we do "generic changes."
 	 */
 
+	/* Filter out anything that depends on CPUID levels we don't have */
+	filter_cpuid_features(c, true);
+
 	/* If the model name is still unset, do table lookup. */
 	if (!c->x86_model_id[0]) {
 		char *p;
@@ -879,54 +938,26 @@ static __init int setup_disablecpuid(char *arg)
 __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
 struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
-static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
-
-void __cpuinit pda_init(int cpu)
-{
-	struct x8664_pda *pda = cpu_pda(cpu);
+DEFINE_PER_CPU_FIRST(union irq_stack_union,
		     irq_stack_union) __aligned(PAGE_SIZE);
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(char *, irq_stack_ptr); /* will be set during per cpu init */
+#else
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+	per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+#endif
 
-	/* Setup up data that may be needed in __get_free_pages early */
-	loadsegment(fs, 0);
-	loadsegment(gs, 0);
-	/* Memory clobbers used to order PDA accessed */
-	mb();
-	wrmsrl(MSR_GS_BASE, pda);
-	mb();
-
-	pda->cpunumber = cpu;
-	pda->irqcount = -1;
-	pda->kernelstack = (unsigned long)stack_thread_info() -
-		PDA_STACKOFFSET + THREAD_SIZE;
-	pda->active_mm = &init_mm;
-	pda->mmu_state = 0;
-
-	if (cpu == 0) {
-		/* others are initialized in smpboot.c */
-		pda->pcurrent = &init_task;
-		pda->irqstackptr = boot_cpu_stack;
-		pda->irqstackptr += IRQSTACKSIZE - 64;
-	} else {
-		if (!pda->irqstackptr) {
-			pda->irqstackptr = (char *)
-				__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
-			if (!pda->irqstackptr)
-				panic("cannot allocate irqstack for cpu %d",
-				      cpu);
-			pda->irqstackptr += IRQSTACKSIZE - 64;
-		}
+DEFINE_PER_CPU(unsigned long, kernel_stack) =
+	(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
-	if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
-		pda->nodenumber = cpu_to_node(cpu);
-	}
-}
+DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
-static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
-				  DEBUG_STKSZ] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
+	__aligned(PAGE_SIZE);
 
 extern asmlinkage void ignore_sysret(void);
 
@@ -984,15 +1015,14 @@ void __cpuinit cpu_init(void)
 	struct tss_struct *t = &per_cpu(init_tss, cpu);
 	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
 	unsigned long v;
-	char *estacks = NULL;
 	struct task_struct *me;
 	int i;
 
-	/* CPU 0 is initialised in head64.c */
-	if (cpu != 0)
-		pda_init(cpu);
-	else
-		estacks = boot_exception_stacks;
+#ifdef CONFIG_NUMA
+	if (cpu != 0 && percpu_read(node_number) == 0 &&
+	    cpu_to_node(cpu) != NUMA_NO_NODE)
+		percpu_write(node_number, cpu_to_node(cpu));
+#endif
 
 	me = current;
 
@@ -1009,6 +1039,8 @@ void __cpuinit cpu_init(void)
 	 */
 
 	switch_to_new_gdt();
+	loadsegment(fs, 0);
+
 	load_idt((const struct desc_ptr *)&idt_descr);
 
 	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
@@ -1026,18 +1058,13 @@ void __cpuinit cpu_init(void)
 	 * set up and load the per-CPU TSS
 	 */
 	if (!orig_ist->ist[0]) {
-		static const unsigned int order[N_EXCEPTION_STACKS] = {
-		  [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
-		  [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+		static const unsigned int sizes[N_EXCEPTION_STACKS] = {
+		  [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+		  [DEBUG_STACK - 1] = DEBUG_STKSZ
 		};
+		char *estacks = per_cpu(exception_stacks, cpu);
 		for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-			if (cpu) {
-				estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
-				if (!estacks)
-					panic("Cannot allocate exception "
-					      "stack %ld %d\n", v, cpu);
-			}
-			estacks += PAGE_SIZE << order[v];
+			estacks += sizes[v];
 			orig_ist->ist[v] = t->x86_tss.ist[v] =
 					(unsigned long)estacks;
 		}
@@ -1071,22 +1098,19 @@ void __cpuinit cpu_init(void)
 	 */
 	if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
 		arch_kgdb_ops.correct_hw_break();
-	else {
+	else
 #endif
-	/*
-	 * Clear all 6 debug registers:
-	 */
-
-	set_debugreg(0UL, 0);
-	set_debugreg(0UL, 1);
-	set_debugreg(0UL, 2);
-	set_debugreg(0UL, 3);
-	set_debugreg(0UL, 6);
-	set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
-	/* If the kgdb is connected no debug regs should be altered. */
+	{
+		/*
+		 * Clear all 6 debug registers:
+		 */
+		set_debugreg(0UL, 0);
+		set_debugreg(0UL, 1);
+		set_debugreg(0UL, 2);
+		set_debugreg(0UL, 3);
+		set_debugreg(0UL, 6);
+		set_debugreg(0UL, 7);
 	}
-#endif
 
 	fpu_init();
 
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 06fcd8f9323c..4b1c319d30c3 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -145,7 +145,7 @@ typedef union {
 
 struct drv_cmd {
 	unsigned int type;
-	cpumask_var_t mask;
+	const struct cpumask *mask;
 	drv_addr_union addr;
 	u32 val;
 };
@@ -231,15 +231,9 @@ static u32 get_cur_val(const struct cpumask *mask)
 		return 0;
 	}
 
-	if (unlikely(!alloc_cpumask_var(&cmd.mask, GFP_KERNEL)))
-		return 0;
-
-	cpumask_copy(cmd.mask, mask);
-
+	cmd.mask = mask;
 	drv_read(&cmd);
 
-	free_cpumask_var(cmd.mask);
-
 	dprintk("get_cur_val = %u\n", cmd.val);
 
 	return cmd.val;
@@ -369,7 +363,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 	return freq;
 }
 
-static unsigned int check_freqs(const cpumask_t *mask, unsigned int freq,
+static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
				struct acpi_cpufreq_data *data)
 {
 	unsigned int cur_freq;
@@ -404,9 +398,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		return -ENODEV;
 	}
 
-	if (unlikely(!alloc_cpumask_var(&cmd.mask, GFP_KERNEL)))
-		return -ENOMEM;
-
 	perf = data->acpi_data;
 	result = cpufreq_frequency_table_target(policy,
						data->freq_table,
@@ -451,9 +442,9 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 
 	/* cpufreq holds the hotplug lock, so we are safe from here on */
 	if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
-		cpumask_and(cmd.mask, cpu_online_mask, policy->cpus);
+		cmd.mask = policy->cpus;
 	else
-		cpumask_copy(cmd.mask, cpumask_of(policy->cpu));
+		cmd.mask = cpumask_of(policy->cpu);
 
 	freqs.old = perf->states[perf->state].core_frequency * 1000;
 	freqs.new = data->freq_table[next_state].frequency;
@@ -480,7 +471,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		perf->state = next_perf_state;
 
 out:
-	free_cpumask_var(cmd.mask);
 	return result;
 }
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8ea6929e974c..5deefae9064d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,6 +29,19 @@
 
 static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 {
+	/* Unmask CPUID levels if masked: */
+	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+		u64 misc_enable;
+
+		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+
+		if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
+			misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
+			wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+			c->cpuid_level = cpuid_eax(0);
+		}
+	}
+
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
@@ -50,6 +63,18 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 	}
 
+	/*
+	 * There is a known erratum on Pentium III and Core Solo
+	 * and Core Duo CPUs.
+	 * " Page with PAT set to WC while associated MTRR is UC
+	 * may consolidate to UC "
+	 * Because of this erratum, it is better to stick with
+	 * setting WC in MTRR rather than using PAT on these CPUs.
+	 *
+	 * Enable PAT WC only on P4, Core 2 or later CPUs.
+	 */
+	if (c->x86 == 6 && c->x86_model < 15)
+		clear_cpu_cap(c, X86_FEATURE_PAT);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 48533d77be78..58527a9fc404 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -132,7 +132,16 @@ struct _cpuid4_info {
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
 	unsigned long can_disable;
-	cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
+	DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
+};
+
+/* subset of above _cpuid4_info w/o shared_cpu_map */
+struct _cpuid4_info_regs {
+	union _cpuid4_leaf_eax eax;
+	union _cpuid4_leaf_ebx ebx;
+	union _cpuid4_leaf_ecx ecx;
+	unsigned long size;
+	unsigned long can_disable;
 };
 
 #ifdef CONFIG_PCI
@@ -263,7 +272,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
 }
 
 static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
+amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
 	if (index < 3)
 		return;
@@ -271,7 +280,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
 }
 
 static int
-__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+__cpuinit cpuid4_cache_lookup_regs(int index,
				   struct _cpuid4_info_regs *this_leaf)
 {
 	union _cpuid4_leaf_eax eax;
 	union _cpuid4_leaf_ebx ebx;
@@ -299,6 +309,15 @@ __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
 	return 0;
 }
 
+static int
+__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+{
+	struct _cpuid4_info_regs *leaf_regs =
+		(struct _cpuid4_info_regs *)this_leaf;
+
+	return cpuid4_cache_lookup_regs(index, leaf_regs);
+}
+
 static int __cpuinit find_num_cache_leaves(void)
 {
 	unsigned int eax, ebx, ecx, edx;
@@ -338,11 +357,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 	 * parameters cpuid leaf to find the cache details
 	 */
 	for (i = 0; i < num_cache_leaves; i++) {
-		struct _cpuid4_info this_leaf;
-
+		struct _cpuid4_info_regs this_leaf;
 		int retval;
 
-		retval = cpuid4_cache_lookup(i, &this_leaf);
+		retval = cpuid4_cache_lookup_regs(i, &this_leaf);
 		if (retval >= 0) {
 			switch(this_leaf.eax.split.level) {
 			case 1:
@@ -491,17 +509,20 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
 	num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
 
 	if (num_threads_sharing == 1)
-		cpu_set(cpu, this_leaf->shared_cpu_map);
+		cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
 	else {
 		index_msb = get_count_order(num_threads_sharing);
 
 		for_each_online_cpu(i) {
 			if (cpu_data(i).apicid >> index_msb ==
 			    c->apicid >> index_msb) {
-				cpu_set(i, this_leaf->shared_cpu_map);
+				cpumask_set_cpu(i,
					to_cpumask(this_leaf->shared_cpu_map));
 				if (i != cpu && per_cpu(cpuid4_info, i)) {
-					sibling_leaf = CPUID4_INFO_IDX(i, index);
-					cpu_set(cpu, sibling_leaf->shared_cpu_map);
+					sibling_leaf =
						CPUID4_INFO_IDX(i, index);
+					cpumask_set_cpu(cpu, to_cpumask(
						sibling_leaf->shared_cpu_map));
 				}
 			}
 		}
@@ -513,9 +534,10 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
 	int sibling;
 
 	this_leaf = CPUID4_INFO_IDX(cpu, index);
-	for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
+	for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
 		sibling_leaf = CPUID4_INFO_IDX(sibling, index);
-		cpu_clear(cpu, sibling_leaf->shared_cpu_map);
+		cpumask_clear_cpu(cpu,
				  to_cpumask(sibling_leaf->shared_cpu_map));
 	}
 }
 #else
@@ -620,8 +642,9 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
 	int n = 0;
 
 	if (len > 1) {
-		cpumask_t *mask = &this_leaf->shared_cpu_map;
+		const struct cpumask *mask;
 
+		mask = to_cpumask(this_leaf->shared_cpu_map);
 		n = type?
 			cpulist_scnprintf(buf, len-2, mask) :
 			cpumask_scnprintf(buf, len-2, mask);
@@ -684,7 +707,8 @@ static struct pci_dev *get_k8_northbridge(int node)
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
 {
-	int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+	int node = cpu_to_node(cpumask_first(mask));
 	struct pci_dev *dev = NULL;
 	ssize_t ret = 0;
 	int i;
@@ -718,7 +742,8 @@ static ssize_t
 store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
		    size_t count)
 {
-	int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+	int node = cpu_to_node(cpumask_first(mask));
 	struct pci_dev *dev = NULL;
 	unsigned int ret, index, val;
 
@@ -863,7 +888,7 @@ err_out:
 	return -ENOMEM;
 }
 
-static cpumask_t cache_dev_map = CPU_MASK_NONE;
+static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
 
 /* Add/Remove cache interface for CPU device */
 static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
@@ -903,7 +928,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 		}
 		kobject_uevent(&(this_object->kobj), KOBJ_ADD);
 	}
-	cpu_set(cpu, cache_dev_map);
+	cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
 
 	kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
 	return 0;
@@ -916,9 +941,9 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
 
 	if (per_cpu(cpuid4_info, cpu) == NULL)
 		return;
-	if (!cpu_isset(cpu, cache_dev_map))
+	if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
 		return;
-	cpu_clear(cpu, cache_dev_map);
+	cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
 
 	for (i = 0; i < num_cache_leaves; i++)
 		kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 8ae8c4ff094d..4772e91e8246 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -67,7 +67,7 @@ static struct threshold_block threshold_defaults = {
 struct threshold_bank {
 	struct kobject *kobj;
 	struct threshold_block *blocks;
-	cpumask_t cpus;
+	cpumask_var_t cpus;
 };
 static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
 
@@ -481,7 +481,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 #ifdef CONFIG_SMP
 	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
-		i = first_cpu(per_cpu(cpu_core_map, cpu));
+		i = cpumask_first(&per_cpu(cpu_core_map, cpu));
 
 		/* first core not up yet */
 		if (cpu_data(i).cpu_core_id)
@@ -501,7 +501,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (err)
 			goto out;
 
-		b->cpus = per_cpu(cpu_core_map, cpu);
+		cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
 		per_cpu(threshold_banks, cpu)[bank] = b;
 		goto out;
 	}
@@ -512,15 +512,20 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		err = -ENOMEM;
 		goto out;
 	}
+	if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
+		kfree(b);
+		err = -ENOMEM;
+		goto out;
+	}
 
 	b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
 	if (!b->kobj)
 		goto out_free;
 
 #ifndef CONFIG_SMP
-	b->cpus = CPU_MASK_ALL;
+	cpumask_setall(b->cpus);
 #else
-	b->cpus = per_cpu(cpu_core_map, cpu);
+	cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
 #endif
 
 	per_cpu(threshold_banks, cpu)[bank] = b;
@@ -529,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 	if (err)
 		goto out_free;
 
-	for_each_cpu_mask_nr(i, b->cpus) {
+	for_each_cpu(i, b->cpus) {
 		if (i == cpu)
 			continue;
 
@@ -545,6 +550,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 out_free:
 	per_cpu(threshold_banks, cpu)[bank] = NULL;
+	free_cpumask_var(b->cpus);
 	kfree(b);
 out:
 	return err;
@@ -619,7 +625,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #endif
 
 	/* remove all sibling symlinks before unregistering */
-	for_each_cpu_mask_nr(i, b->cpus) {
+	for_each_cpu(i, b->cpus) {
 		if (i == cpu)
 			continue;
 
@@ -632,6 +638,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 free_out:
 	kobject_del(b->kobj);
 	kobject_put(b->kobj);
+	free_cpumask_var(b->cpus);
 	kfree(b);
 	per_cpu(threshold_banks, cpu)[bank] = NULL;
 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 4b48f251fd39..5e8c79e748a6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -7,6 +7,7 @@
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <asm/processor.h>
+#include <asm/apic.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
 #include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index b59ddcc88cd8..0c0a455fe95c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,11 +33,13 @@ u64 mtrr_tom2;
 struct mtrr_state_type mtrr_state = {};
 EXPORT_SYMBOL_GPL(mtrr_state);
 
-#undef MODULE_PARAM_PREFIX
-#define MODULE_PARAM_PREFIX "mtrr."
-
-static int mtrr_show;
-module_param_named(show, mtrr_show, bool, 0);
+static int __initdata mtrr_show;
+static int __init mtrr_debug(char *opt)
+{
+	mtrr_show = 1;
+	return 0;
+}
+early_param("mtrr.show", mtrr_debug);
 
 /*
  * Returns the effective MTRR type for the region
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index c302d0707048..d35db5993fd6 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -106,7 +106,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
		const struct stacktrace_ops *ops, void *data)
 {
 	const unsigned cpu = get_cpu();
-	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+	unsigned long *irq_stack_end =
+		(unsigned long *)per_cpu(irq_stack_ptr, cpu);
 	unsigned used = 0;
 	struct thread_info *tinfo;
 	int graph = 0;
@@ -160,23 +161,23 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
			stack = (unsigned long *) estack_end[-2];
			continue;
		}
-		if (irqstack_end) {
-			unsigned long *irqstack;
-			irqstack = irqstack_end -
-				(IRQSTACKSIZE - 64) / sizeof(*irqstack);
+		if (irq_stack_end) {
+			unsigned long *irq_stack;
+			irq_stack = irq_stack_end -
+				(IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
 
-			if (stack >= irqstack && stack < irqstack_end) {
+			if (stack >= irq_stack && stack < irq_stack_end) {
				if (ops->stack(data, "IRQ") < 0)
					break;
				bp = print_context_stack(tinfo, stack, bp,
-					ops, data, irqstack_end, &graph);
+					ops, data, irq_stack_end, &graph);
				/*
				 * We link to the next stack (which would be
				 * the process stack normally) the last
				 * pointer (index -1 to end) in the IRQ stack:
				 */
-				stack = (unsigned long *) (irqstack_end[-1]);
-				irqstack_end = NULL;
+				stack = (unsigned long *) (irq_stack_end[-1]);
+				irq_stack_end = NULL;
				ops->stack(data, "EOI");
				continue;
			}
@@ -199,10 +200,10 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	unsigned long *stack;
 	int i;
 	const int cpu = smp_processor_id();
-	unsigned long *irqstack_end =
-		(unsigned long *) (cpu_pda(cpu)->irqstackptr);
-	unsigned long *irqstack =
-		(unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+	unsigned long *irq_stack_end =
+		(unsigned long *)(per_cpu(irq_stack_ptr, cpu));
+	unsigned long *irq_stack =
+		(unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
 
 	/*
	 * debugging aid: "show_stack(NULL, NULL);" prints the
@@ -218,9 +219,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
 	stack = sp;
 	for (i = 0; i < kstack_depth_to_print; i++) {
-		if (stack >= irqstack && stack <= irqstack_end) {
-			if (stack == irqstack_end) {
-				stack = (unsigned long *) (irqstack_end[-1]);
+		if (stack >= irq_stack && stack <= irq_stack_end) {
+			if (stack == irq_stack_end) {
+				stack = (unsigned long *) (irq_stack_end[-1]);
				printk(" <EOI> ");
			}
		} else {
@@ -241,7 +242,7 @@ void show_registers(struct pt_regs *regs)
 	int i;
 	unsigned long sp;
 	const int cpu = smp_processor_id();
-	struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+	struct task_struct *cur = current;
 
 	sp = regs->sp;
 	printk("CPU %d ", cpu);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1119d247fe11..b205272ad394 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -366,10 +366,12 @@ void __init efi_init(void)
			   SMBIOS_TABLE_GUID)) {
			efi.smbios = config_tables[i].table;
			printk(" SMBIOS=0x%lx ", config_tables[i].table);
+#ifdef CONFIG_X86_UV
		} else if (!efi_guidcmp(config_tables[i].guid,
					UV_SYSTEM_TABLE_GUID)) {
			efi.uv_systab = config_tables[i].table;
			printk(" UVsystab=0x%lx ", config_tables[i].table);
+#endif
		} else if (!efi_guidcmp(config_tables[i].guid,
					HCDP_TABLE_GUID)) {
			efi.hcdp = config_tables[i].table;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 652c5287215f..a4ee29127fdf 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -36,6 +36,7 @@
 #include <asm/proto.h>
 #include <asm/efi.h>
 #include <asm/cacheflush.h>
+#include <asm/fixmap.h>
 
 static pgd_t save_pgd __initdata;
 static unsigned long efi_flags __initdata;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index d6f0490a7391..a0b91aac72a1 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -672,7 +672,7 @@ common_interrupt:
 ENDPROC(common_interrupt)
	CFI_ENDPROC
 
-#define BUILD_INTERRUPT(name, nr)	\
+#define BUILD_INTERRUPT3(name, nr, fn)	\
 ENTRY(name)				\
	RING0_INT_FRAME;		\
	pushl $~(nr);			\
@@ -680,11 +680,13 @@ ENTRY(name) \
	SAVE_ALL;			\
	TRACE_IRQS_OFF			\
	movl %esp,%eax;			\
-	call smp_##name;		\
+	call fn;			\
	jmp ret_from_intr;		\
	CFI_ENDPROC;			\
 ENDPROC(name)
 
+#define BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(name, nr, smp_##name)
+
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
@@ -1203,7 +1205,6 @@ nmi_stack_correct:
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
-	TRACE_IRQS_OFF
	xorl %edx,%edx		# zero error code
	movl %esp,%eax		# pt_regs pointer
	call do_nmi
@@ -1244,7 +1245,6 @@ nmi_espfix_stack:
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
-	TRACE_IRQS_OFF
	FIXUP_ESPFIX_STACK		# %eax == %esp
	xorl %edx,%edx			# zero error code
	call do_nmi
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e28c7a987793..82801fd2e931 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -52,6 +52,7 @@
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/percpu.h>
55 56
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h> 58#include <linux/elf-em.h>
@@ -209,7 +210,7 @@ ENTRY(native_usergs_sysret64)
209 210
210 /* %rsp:at FRAMEEND */ 211 /* %rsp:at FRAMEEND */
211 .macro FIXUP_TOP_OF_STACK tmp offset=0 212 .macro FIXUP_TOP_OF_STACK tmp offset=0
212 movq %gs:pda_oldrsp,\tmp 213 movq PER_CPU_VAR(old_rsp),\tmp
213 movq \tmp,RSP+\offset(%rsp) 214 movq \tmp,RSP+\offset(%rsp)
214 movq $__USER_DS,SS+\offset(%rsp) 215 movq $__USER_DS,SS+\offset(%rsp)
215 movq $__USER_CS,CS+\offset(%rsp) 216 movq $__USER_CS,CS+\offset(%rsp)
@@ -220,7 +221,7 @@ ENTRY(native_usergs_sysret64)
220 221
221 .macro RESTORE_TOP_OF_STACK tmp offset=0 222 .macro RESTORE_TOP_OF_STACK tmp offset=0
222 movq RSP+\offset(%rsp),\tmp 223 movq RSP+\offset(%rsp),\tmp
223 movq \tmp,%gs:pda_oldrsp 224 movq \tmp,PER_CPU_VAR(old_rsp)
224 movq EFLAGS+\offset(%rsp),\tmp 225 movq EFLAGS+\offset(%rsp),\tmp
225 movq \tmp,R11+\offset(%rsp) 226 movq \tmp,R11+\offset(%rsp)
226 .endm 227 .endm
@@ -336,15 +337,15 @@ ENTRY(save_args)
336 je 1f 337 je 1f
337 SWAPGS 338 SWAPGS
338 /* 339 /*
339 * irqcount is used to check if a CPU is already on an interrupt stack 340 * irq_count is used to check if a CPU is already on an interrupt stack
340 * or not. While this is essentially redundant with preempt_count it is 341 * or not. While this is essentially redundant with preempt_count it is
341 * a little cheaper to use a separate counter in the PDA (short of 342 * a little cheaper to use a separate counter in the PDA (short of
342 * moving irq_enter into assembly, which would be too much work) 343 * moving irq_enter into assembly, which would be too much work)
343 */ 344 */
3441: incl %gs:pda_irqcount 3451: incl PER_CPU_VAR(irq_count)
345 jne 2f 346 jne 2f
346 popq_cfi %rax /* move return address... */ 347 popq_cfi %rax /* move return address... */
347 mov %gs:pda_irqstackptr,%rsp 348 mov PER_CPU_VAR(irq_stack_ptr),%rsp
348 EMPTY_FRAME 0 349 EMPTY_FRAME 0
349 pushq_cfi %rax /* ... to the new stack */ 350 pushq_cfi %rax /* ... to the new stack */
350 /* 351 /*
@@ -408,6 +409,8 @@ END(save_paranoid)
408ENTRY(ret_from_fork) 409ENTRY(ret_from_fork)
409 DEFAULT_FRAME 410 DEFAULT_FRAME
410 411
412 LOCK ; btr $TIF_FORK,TI_flags(%r8)
413
411 push kernel_eflags(%rip) 414 push kernel_eflags(%rip)
412 CFI_ADJUST_CFA_OFFSET 8 415 CFI_ADJUST_CFA_OFFSET 8
413 popf # reset kernel eflags 416 popf # reset kernel eflags
@@ -467,7 +470,7 @@ END(ret_from_fork)
467ENTRY(system_call) 470ENTRY(system_call)
468 CFI_STARTPROC simple 471 CFI_STARTPROC simple
469 CFI_SIGNAL_FRAME 472 CFI_SIGNAL_FRAME
470 CFI_DEF_CFA rsp,PDA_STACKOFFSET 473 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
471 CFI_REGISTER rip,rcx 474 CFI_REGISTER rip,rcx
472 /*CFI_REGISTER rflags,r11*/ 475 /*CFI_REGISTER rflags,r11*/
473 SWAPGS_UNSAFE_STACK 476 SWAPGS_UNSAFE_STACK
@@ -478,8 +481,8 @@ ENTRY(system_call)
478 */ 481 */
479ENTRY(system_call_after_swapgs) 482ENTRY(system_call_after_swapgs)
480 483
481 movq %rsp,%gs:pda_oldrsp 484 movq %rsp,PER_CPU_VAR(old_rsp)
482 movq %gs:pda_kernelstack,%rsp 485 movq PER_CPU_VAR(kernel_stack),%rsp
483 /* 486 /*
484 * No need to follow this irqs off/on section - it's straight 487 * No need to follow this irqs off/on section - it's straight
485 * and short: 488 * and short:
@@ -522,7 +525,7 @@ sysret_check:
522 CFI_REGISTER rip,rcx 525 CFI_REGISTER rip,rcx
523 RESTORE_ARGS 0,-ARG_SKIP,1 526 RESTORE_ARGS 0,-ARG_SKIP,1
524 /*CFI_REGISTER rflags,r11*/ 527 /*CFI_REGISTER rflags,r11*/
525 movq %gs:pda_oldrsp, %rsp 528 movq PER_CPU_VAR(old_rsp), %rsp
526 USERGS_SYSRET64 529 USERGS_SYSRET64
527 530
528 CFI_RESTORE_STATE 531 CFI_RESTORE_STATE
@@ -832,11 +835,11 @@ common_interrupt:
832 XCPT_FRAME 835 XCPT_FRAME
833 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 836 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
834 interrupt do_IRQ 837 interrupt do_IRQ
835 /* 0(%rsp): oldrsp-ARGOFFSET */ 838 /* 0(%rsp): old_rsp-ARGOFFSET */
836ret_from_intr: 839ret_from_intr:
837 DISABLE_INTERRUPTS(CLBR_NONE) 840 DISABLE_INTERRUPTS(CLBR_NONE)
838 TRACE_IRQS_OFF 841 TRACE_IRQS_OFF
839 decl %gs:pda_irqcount 842 decl PER_CPU_VAR(irq_count)
840 leaveq 843 leaveq
841 CFI_DEF_CFA_REGISTER rsp 844 CFI_DEF_CFA_REGISTER rsp
842 CFI_ADJUST_CFA_OFFSET -8 845 CFI_ADJUST_CFA_OFFSET -8
@@ -981,8 +984,10 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
981 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 984 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
982#endif 985#endif
983 986
987#ifdef CONFIG_X86_UV
984apicinterrupt UV_BAU_MESSAGE \ 988apicinterrupt UV_BAU_MESSAGE \
985 uv_bau_message_intr1 uv_bau_message_interrupt 989 uv_bau_message_intr1 uv_bau_message_interrupt
990#endif
986apicinterrupt LOCAL_TIMER_VECTOR \ 991apicinterrupt LOCAL_TIMER_VECTOR \
987 apic_timer_interrupt smp_apic_timer_interrupt 992 apic_timer_interrupt smp_apic_timer_interrupt
988 993
@@ -1072,10 +1077,10 @@ ENTRY(\sym)
1072 TRACE_IRQS_OFF 1077 TRACE_IRQS_OFF
1073 movq %rsp,%rdi /* pt_regs pointer */ 1078 movq %rsp,%rdi /* pt_regs pointer */
1074 xorl %esi,%esi /* no error code */ 1079 xorl %esi,%esi /* no error code */
1075 movq %gs:pda_data_offset, %rbp 1080 PER_CPU(init_tss, %rbp)
1076 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1081 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1077 call \do_sym 1082 call \do_sym
1078 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1083 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1079 jmp paranoid_exit /* %ebx: no swapgs flag */ 1084 jmp paranoid_exit /* %ebx: no swapgs flag */
1080 CFI_ENDPROC 1085 CFI_ENDPROC
1081END(\sym) 1086END(\sym)
@@ -1259,14 +1264,14 @@ ENTRY(call_softirq)
1259 CFI_REL_OFFSET rbp,0 1264 CFI_REL_OFFSET rbp,0
1260 mov %rsp,%rbp 1265 mov %rsp,%rbp
1261 CFI_DEF_CFA_REGISTER rbp 1266 CFI_DEF_CFA_REGISTER rbp
1262 incl %gs:pda_irqcount 1267 incl PER_CPU_VAR(irq_count)
1263 cmove %gs:pda_irqstackptr,%rsp 1268 cmove PER_CPU_VAR(irq_stack_ptr),%rsp
1264 push %rbp # backlink for old unwinder 1269 push %rbp # backlink for old unwinder
1265 call __do_softirq 1270 call __do_softirq
1266 leaveq 1271 leaveq
1267 CFI_DEF_CFA_REGISTER rsp 1272 CFI_DEF_CFA_REGISTER rsp
1268 CFI_ADJUST_CFA_OFFSET -8 1273 CFI_ADJUST_CFA_OFFSET -8
1269 decl %gs:pda_irqcount 1274 decl PER_CPU_VAR(irq_count)
1270 ret 1275 ret
1271 CFI_ENDPROC 1276 CFI_ENDPROC
1272END(call_softirq) 1277END(call_softirq)
@@ -1296,15 +1301,15 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1296 movq %rdi, %rsp # we don't return, adjust the stack frame 1301 movq %rdi, %rsp # we don't return, adjust the stack frame
1297 CFI_ENDPROC 1302 CFI_ENDPROC
1298 DEFAULT_FRAME 1303 DEFAULT_FRAME
129911: incl %gs:pda_irqcount 130411: incl PER_CPU_VAR(irq_count)
1300 movq %rsp,%rbp 1305 movq %rsp,%rbp
1301 CFI_DEF_CFA_REGISTER rbp 1306 CFI_DEF_CFA_REGISTER rbp
1302 cmovzq %gs:pda_irqstackptr,%rsp 1307 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
1303 pushq %rbp # backlink for old unwinder 1308 pushq %rbp # backlink for old unwinder
1304 call xen_evtchn_do_upcall 1309 call xen_evtchn_do_upcall
1305 popq %rsp 1310 popq %rsp
1306 CFI_DEF_CFA_REGISTER rsp 1311 CFI_DEF_CFA_REGISTER rsp
1307 decl %gs:pda_irqcount 1312 decl PER_CPU_VAR(irq_count)
1308 jmp error_exit 1313 jmp error_exit
1309 CFI_ENDPROC 1314 CFI_ENDPROC
1310END(do_hypervisor_callback) 1315END(do_hypervisor_callback)
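
The entry_64.S hunks above replace the old PDA fields (%gs:pda_irqcount, %gs:pda_irqstackptr, %gs:pda_oldrsp, ...) with ordinary per-cpu variables accessed through PER_CPU_VAR(), i.e. a %gs-relative offset into that CPU's private copy of the per-cpu area. Below is a minimal user-space sketch of the underlying idea (one template, one private copy per CPU, one offset per CPU); the struct fields, NR_CPUS and the malloc-based allocation are illustrative, not the kernel's implementation.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define NR_CPUS 4

struct percpu_template {
        unsigned int irq_count;
        unsigned long old_rsp;
};

static struct percpu_template template;         /* stands in for the per-cpu section image */
static intptr_t per_cpu_offset[NR_CPUS];

/* per_cpu(var, cpu): the template address plus that CPU's offset */
#define per_cpu_ptr(cpu) \
        ((struct percpu_template *)((intptr_t)&template + per_cpu_offset[cpu]))

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                void *area = malloc(sizeof(template));

                memcpy(area, &template, sizeof(template));      /* copy the template */
                per_cpu_offset[cpu] = (intptr_t)area - (intptr_t)&template;
        }

        per_cpu_ptr(2)->irq_count++;    /* like "incl PER_CPU_VAR(irq_count)" on cpu 2 */

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d irq_count=%u\n", cpu, per_cpu_ptr(cpu)->irq_count);
        return 0;
}
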
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 2bced78b0b8e..e656c2721154 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -32,7 +32,9 @@ extern struct genapic apic_x2apic_cluster;
32struct genapic __read_mostly *genapic = &apic_flat; 32struct genapic __read_mostly *genapic = &apic_flat;
33 33
34static struct genapic *apic_probe[] __initdata = { 34static struct genapic *apic_probe[] __initdata = {
35#ifdef CONFIG_X86_UV
35 &apic_x2apic_uv_x, 36 &apic_x2apic_uv_x,
37#endif
36 &apic_x2apic_phys, 38 &apic_x2apic_phys,
37 &apic_x2apic_cluster, 39 &apic_x2apic_cluster,
38 &apic_physflat, 40 &apic_physflat,
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index b193e082f6ce..bfe36249145c 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -25,6 +25,7 @@
25#include <asm/ipi.h> 25#include <asm/ipi.h>
26#include <asm/genapic.h> 26#include <asm/genapic.h>
27#include <asm/pgtable.h> 27#include <asm/pgtable.h>
28#include <asm/uv/uv.h>
28#include <asm/uv/uv_mmrs.h> 29#include <asm/uv/uv_mmrs.h>
29#include <asm/uv/uv_hub.h> 30#include <asm/uv/uv_hub.h>
30#include <asm/uv/bios.h> 31#include <asm/uv/bios.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b9a4d8c4b935..f5b272247690 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,27 +26,6 @@
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h> 27#include <asm/trampoline.h>
28 28
29/* boot cpu pda */
30static struct x8664_pda _boot_cpu_pda;
31
32#ifdef CONFIG_SMP
33/*
34 * We install an empty cpu_pda pointer table to indicate to early users
35 * (numa_set_node) that the cpu_pda pointer table for cpus other than
36 * the boot cpu is not yet setup.
37 */
38static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39#else
40static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
41#endif
42
43void __init x86_64_init_pda(void)
44{
45 _cpu_pda = __cpu_pda;
46 cpu_pda(0) = &_boot_cpu_pda;
47 pda_init(0);
48}
49
50static void __init zap_identity_mappings(void) 29static void __init zap_identity_mappings(void)
51{ 30{
52 pgd_t *pgd = pgd_offset_k(0UL); 31 pgd_t *pgd = pgd_offset_k(0UL);
@@ -112,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
112 if (console_loglevel == 10) 91 if (console_loglevel == 10)
113 early_printk("Kernel alive\n"); 92 early_printk("Kernel alive\n");
114 93
115 x86_64_init_pda();
116
117 x86_64_start_reservations(real_mode_data); 94 x86_64_start_reservations(real_mode_data);
118} 95}
119 96
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index e835b4eea70b..722464c520cf 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -429,12 +429,14 @@ is386: movl $2,%ecx # set MP
429 ljmp $(__KERNEL_CS),$1f 429 ljmp $(__KERNEL_CS),$1f
4301: movl $(__KERNEL_DS),%eax # reload all the segment registers 4301: movl $(__KERNEL_DS),%eax # reload all the segment registers
431 movl %eax,%ss # after changing gdt. 431 movl %eax,%ss # after changing gdt.
432 movl %eax,%fs # gets reset once there's real percpu
433 432
434 movl $(__USER_DS),%eax # DS/ES contains default USER segment 433 movl $(__USER_DS),%eax # DS/ES contains default USER segment
435 movl %eax,%ds 434 movl %eax,%ds
436 movl %eax,%es 435 movl %eax,%es
437 436
437 movl $(__KERNEL_PERCPU), %eax
438 movl %eax,%fs # set this cpu's percpu
439
438 xorl %eax,%eax # Clear GS and LDT 440 xorl %eax,%eax # Clear GS and LDT
439 movl %eax,%gs 441 movl %eax,%gs
440 lldt %ax 442 lldt %ax
@@ -446,8 +448,6 @@ is386: movl $2,%ecx # set MP
446 movb $1, ready 448 movb $1, ready
447 cmpb $0,%cl # the first CPU calls start_kernel 449 cmpb $0,%cl # the first CPU calls start_kernel
448 je 1f 450 je 1f
449 movl $(__KERNEL_PERCPU), %eax
450 movl %eax,%fs # set this cpu's percpu
451 movl (stack_start), %esp 451 movl (stack_start), %esp
4521: 4521:
453#endif /* CONFIG_SMP */ 453#endif /* CONFIG_SMP */
@@ -548,12 +548,8 @@ early_fault:
548 pushl %eax 548 pushl %eax
549 pushl %edx /* trapno */ 549 pushl %edx /* trapno */
550 pushl $fault_msg 550 pushl $fault_msg
551#ifdef CONFIG_EARLY_PRINTK
552 call early_printk
553#else
554 call printk 551 call printk
555#endif 552#endif
556#endif
557 call dump_stack 553 call dump_stack
558hlt_loop: 554hlt_loop:
559 hlt 555 hlt
@@ -580,11 +576,10 @@ ignore_int:
580 pushl 32(%esp) 576 pushl 32(%esp)
581 pushl 40(%esp) 577 pushl 40(%esp)
582 pushl $int_msg 578 pushl $int_msg
583#ifdef CONFIG_EARLY_PRINTK
584 call early_printk
585#else
586 call printk 579 call printk
587#endif 580
581 call dump_stack
582
588 addl $(5*4),%esp 583 addl $(5*4),%esp
589 popl %ds 584 popl %ds
590 popl %es 585 popl %es
@@ -660,7 +655,7 @@ early_recursion_flag:
660 .long 0 655 .long 0
661 656
662int_msg: 657int_msg:
663 .asciz "Unknown interrupt or fault at EIP %p %p %p\n" 658 .asciz "Unknown interrupt or fault at: %p %p %p\n"
664 659
665fault_msg: 660fault_msg:
666/* fault info: */ 661/* fault info: */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0e275d495563..a0a2b5ca9b7d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,7 @@
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21#include <asm/processor-flags.h> 21#include <asm/processor-flags.h>
22#include <asm/percpu.h>
22 23
23#ifdef CONFIG_PARAVIRT 24#ifdef CONFIG_PARAVIRT
24#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
@@ -204,6 +205,19 @@ ENTRY(secondary_startup_64)
204 pushq $0 205 pushq $0
205 popfq 206 popfq
206 207
208#ifdef CONFIG_SMP
209 /*
210 * Fix up static pointers that need __per_cpu_load added. The assembler
211 * is unable to do this directly. This is only needed for the boot cpu.
212 * These values are set up with the correct base addresses by C code for
213 * secondary cpus.
214 */
215 movq initial_gs(%rip), %rax
216 cmpl $0, per_cpu__cpu_number(%rax)
217 jne 1f
218 addq %rax, early_gdt_descr_base(%rip)
2191:
220#endif
207 /* 221 /*
208 * We must switch to a new descriptor in kernel space for the GDT 222 * We must switch to a new descriptor in kernel space for the GDT
209 * because soon the kernel won't have access anymore to the userspace 223 * because soon the kernel won't have access anymore to the userspace
@@ -226,12 +240,15 @@ ENTRY(secondary_startup_64)
226 movl %eax,%fs 240 movl %eax,%fs
227 movl %eax,%gs 241 movl %eax,%gs
228 242
229 /* 243 /* Set up %gs.
230 * Setup up a dummy PDA. this is just for some early bootup code 244 *
231 * that does in_interrupt() 245 * The base of %gs always points to the bottom of the irqstack
232 */ 246 * union. If the stack protector canary is enabled, it is
247 * located at %gs:40. Note that, on SMP, the boot cpu uses
248 * init data section till per cpu areas are set up.
249 */
233 movl $MSR_GS_BASE,%ecx 250 movl $MSR_GS_BASE,%ecx
234 movq $empty_zero_page,%rax 251 movq initial_gs(%rip),%rax
235 movq %rax,%rdx 252 movq %rax,%rdx
236 shrq $32,%rdx 253 shrq $32,%rdx
237 wrmsr 254 wrmsr
@@ -257,6 +274,12 @@ ENTRY(secondary_startup_64)
257 .align 8 274 .align 8
258 ENTRY(initial_code) 275 ENTRY(initial_code)
259 .quad x86_64_start_kernel 276 .quad x86_64_start_kernel
277 ENTRY(initial_gs)
278#ifdef CONFIG_SMP
279 .quad __per_cpu_load
280#else
281 .quad PER_CPU_VAR(irq_stack_union)
282#endif
260 __FINITDATA 283 __FINITDATA
261 284
262 ENTRY(stack_start) 285 ENTRY(stack_start)
@@ -401,7 +424,8 @@ NEXT_PAGE(level2_spare_pgt)
401 .globl early_gdt_descr 424 .globl early_gdt_descr
402early_gdt_descr: 425early_gdt_descr:
403 .word GDT_ENTRIES*8-1 426 .word GDT_ENTRIES*8-1
404 .quad per_cpu__gdt_page 427early_gdt_descr_base:
428 .quad per_cpu__gdt_page
405 429
406ENTRY(phys_base) 430ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 431 /* This must match the first entry in level2_kernel_pgt */
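
secondary_startup_64 now loads initial_gs into MSR_GS_BASE; wrmsr takes the 64-bit value split across EDX:EAX, which is exactly what the movq %rax,%rdx / shrq $32,%rdx pair prepares. A tiny sketch of that split, using a made-up address.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t initial_gs = 0xffffffff81a00000ULL;    /* illustrative address */
        uint32_t eax = (uint32_t)initial_gs;            /* low 32 bits  */
        uint32_t edx = (uint32_t)(initial_gs >> 32);    /* high 32 bits */

        printf("eax=0x%08x edx=0x%08x\n", eax, edx);
        printf("reassembled=0x%016llx\n",
               (unsigned long long)(((uint64_t)edx << 32) | eax));
        return 0;
}
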
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index cd759ad90690..64d5ad0b8add 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -628,11 +628,12 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
628 628
629 switch (action & 0xf) { 629 switch (action & 0xf) {
630 case CPU_ONLINE: 630 case CPU_ONLINE:
631 INIT_DELAYED_WORK(&work.work, hpet_work); 631 INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work);
632 init_completion(&work.complete); 632 init_completion(&work.complete);
633 /* FIXME: add schedule_work_on() */ 633 /* FIXME: add schedule_work_on() */
634 schedule_delayed_work_on(cpu, &work.work, 0); 634 schedule_delayed_work_on(cpu, &work.work, 0);
635 wait_for_completion(&work.complete); 635 wait_for_completion(&work.complete);
636 destroy_timer_on_stack(&work.work.timer);
636 break; 637 break;
637 case CPU_DEAD: 638 case CPU_DEAD:
638 if (hdev) { 639 if (hdev) {
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 157aafa45583..bfb7d734062a 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -357,7 +357,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
357 357
358 if (!cfg->move_in_progress) { 358 if (!cfg->move_in_progress) {
359 /* it means that domain is not changed */ 359 /* it means that domain is not changed */
360 if (!cpumask_intersects(&desc->affinity, mask)) 360 if (!cpumask_intersects(desc->affinity, mask))
361 cfg->move_desc_pending = 1; 361 cfg->move_desc_pending = 1;
362 } 362 }
363} 363}
@@ -580,9 +580,9 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
580 if (assign_irq_vector(irq, cfg, mask)) 580 if (assign_irq_vector(irq, cfg, mask))
581 return BAD_APICID; 581 return BAD_APICID;
582 582
583 cpumask_and(&desc->affinity, cfg->domain, mask); 583 cpumask_and(desc->affinity, cfg->domain, mask);
584 set_extra_move_desc(desc, mask); 584 set_extra_move_desc(desc, mask);
585 return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); 585 return cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
586} 586}
587 587
588static void 588static void
@@ -2382,7 +2382,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2382 if (cfg->move_in_progress) 2382 if (cfg->move_in_progress)
2383 send_cleanup_vector(cfg); 2383 send_cleanup_vector(cfg);
2384 2384
2385 cpumask_copy(&desc->affinity, mask); 2385 cpumask_copy(desc->affinity, mask);
2386} 2386}
2387 2387
2388static int migrate_irq_remapped_level_desc(struct irq_desc *desc) 2388static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@ -2404,11 +2404,11 @@ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2404 } 2404 }
2405 2405
2406 /* everthing is clear. we have right of way */ 2406 /* everthing is clear. we have right of way */
2407 migrate_ioapic_irq_desc(desc, &desc->pending_mask); 2407 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2408 2408
2409 ret = 0; 2409 ret = 0;
2410 desc->status &= ~IRQ_MOVE_PENDING; 2410 desc->status &= ~IRQ_MOVE_PENDING;
2411 cpumask_clear(&desc->pending_mask); 2411 cpumask_clear(desc->pending_mask);
2412 2412
2413unmask: 2413unmask:
2414 unmask_IO_APIC_irq_desc(desc); 2414 unmask_IO_APIC_irq_desc(desc);
@@ -2433,7 +2433,7 @@ static void ir_irq_migration(struct work_struct *work)
2433 continue; 2433 continue;
2434 } 2434 }
2435 2435
2436 desc->chip->set_affinity(irq, &desc->pending_mask); 2436 desc->chip->set_affinity(irq, desc->pending_mask);
2437 spin_unlock_irqrestore(&desc->lock, flags); 2437 spin_unlock_irqrestore(&desc->lock, flags);
2438 } 2438 }
2439 } 2439 }
@@ -2447,7 +2447,7 @@ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2447{ 2447{
2448 if (desc->status & IRQ_LEVEL) { 2448 if (desc->status & IRQ_LEVEL) {
2449 desc->status |= IRQ_MOVE_PENDING; 2449 desc->status |= IRQ_MOVE_PENDING;
2450 cpumask_copy(&desc->pending_mask, mask); 2450 cpumask_copy(desc->pending_mask, mask);
2451 migrate_irq_remapped_level_desc(desc); 2451 migrate_irq_remapped_level_desc(desc);
2452 return; 2452 return;
2453 } 2453 }
@@ -2515,7 +2515,7 @@ static void irq_complete_move(struct irq_desc **descp)
2515 2515
2516 /* domain has not changed, but affinity did */ 2516 /* domain has not changed, but affinity did */
2517 me = smp_processor_id(); 2517 me = smp_processor_id();
2518 if (cpu_isset(me, desc->affinity)) { 2518 if (cpumask_test_cpu(me, desc->affinity)) {
2519 *descp = desc = move_irq_desc(desc, me); 2519 *descp = desc = move_irq_desc(desc, me);
2520 /* get the new one */ 2520 /* get the new one */
2521 cfg = desc->chip_data; 2521 cfg = desc->chip_data;
@@ -3182,7 +3182,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
3182 3182
3183 irq = 0; 3183 irq = 0;
3184 spin_lock_irqsave(&vector_lock, flags); 3184 spin_lock_irqsave(&vector_lock, flags);
3185 for (new = irq_want; new < NR_IRQS; new++) { 3185 for (new = irq_want; new < nr_irqs; new++) {
3186 if (platform_legacy_irq(new)) 3186 if (platform_legacy_irq(new))
3187 continue; 3187 continue;
3188 3188
@@ -3257,6 +3257,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3257 int err; 3257 int err;
3258 unsigned dest; 3258 unsigned dest;
3259 3259
3260 if (disable_apic)
3261 return -ENXIO;
3262
3260 cfg = irq_cfg(irq); 3263 cfg = irq_cfg(irq);
3261 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3264 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
3262 if (err) 3265 if (err)
@@ -3691,6 +3694,9 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3691 struct irq_cfg *cfg; 3694 struct irq_cfg *cfg;
3692 int err; 3695 int err;
3693 3696
3697 if (disable_apic)
3698 return -ENXIO;
3699
3694 cfg = irq_cfg(irq); 3700 cfg = irq_cfg(irq);
3695 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3701 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
3696 if (!err) { 3702 if (!err) {
@@ -3725,7 +3731,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3725} 3731}
3726#endif /* CONFIG_HT_IRQ */ 3732#endif /* CONFIG_HT_IRQ */
3727 3733
3728#ifdef CONFIG_X86_64 3734#ifdef CONFIG_X86_UV
3729/* 3735/*
3730 * Re-target the irq to the specified CPU and enable the specified MMR located 3736 * Re-target the irq to the specified CPU and enable the specified MMR located
3731 * on the specified blade to allow the sending of MSIs to the specified CPU. 3737 * on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3815,6 +3821,22 @@ void __init probe_nr_irqs_gsi(void)
3815 nr_irqs_gsi = nr; 3821 nr_irqs_gsi = nr;
3816} 3822}
3817 3823
3824#ifdef CONFIG_SPARSE_IRQ
3825int __init arch_probe_nr_irqs(void)
3826{
3827 int nr;
3828
3829 nr = ((8 * nr_cpu_ids) > (32 * nr_ioapics) ?
3830 (NR_VECTORS + (8 * nr_cpu_ids)) :
3831 (NR_VECTORS + (32 * nr_ioapics)));
3832
3833 if (nr < nr_irqs && nr > nr_irqs_gsi)
3834 nr_irqs = nr;
3835
3836 return 0;
3837}
3838#endif
3839
3818/* -------------------------------------------------------------------------- 3840/* --------------------------------------------------------------------------
3819 ACPI-based IOAPIC Configuration 3841 ACPI-based IOAPIC Configuration
3820 -------------------------------------------------------------------------- */ 3842 -------------------------------------------------------------------------- */
@@ -4004,7 +4026,7 @@ void __init setup_ioapic_dest(void)
4004 */ 4026 */
4005 if (desc->status & 4027 if (desc->status &
4006 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 4028 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4007 mask = &desc->affinity; 4029 mask = desc->affinity;
4008 else 4030 else
4009 mask = TARGET_CPUS; 4031 mask = TARGET_CPUS;
4010 4032
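
The new arch_probe_nr_irqs() caps nr_irqs at NR_VECTORS plus the larger of 8 vectors per CPU and 32 per I/O APIC, as long as that still exceeds the GSI count. The same computation, run with made-up platform numbers.

#include <stdio.h>

#define NR_VECTORS 256

int main(void)
{
        int nr_cpu_ids = 16, nr_ioapics = 2;    /* made-up platform */
        int nr_irqs = 4096, nr_irqs_gsi = 48;   /* made-up current values */
        int nr;

        nr = ((8 * nr_cpu_ids) > (32 * nr_ioapics) ?
              (NR_VECTORS + (8 * nr_cpu_ids)) :
              (NR_VECTORS + (32 * nr_ioapics)));

        if (nr < nr_irqs && nr > nr_irqs_gsi)
                nr_irqs = nr;

        printf("nr=%d -> nr_irqs=%d\n", nr, nr_irqs);
        return 0;
}
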
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3973e2df7f87..8b30d0c2512c 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -36,11 +36,7 @@ void ack_bad_irq(unsigned int irq)
36#endif 36#endif
37} 37}
38 38
39#ifdef CONFIG_X86_32 39#define irq_stats(x) (&per_cpu(irq_stat, x))
40# define irq_stats(x) (&per_cpu(irq_stat, x))
41#else
42# define irq_stats(x) cpu_pda(x)
43#endif
44/* 40/*
45 * /proc/interrupts printing: 41 * /proc/interrupts printing:
46 */ 42 */
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 74b9ff7341e9..e0f29be8ab0b 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -248,7 +248,7 @@ void fixup_irqs(void)
248 if (irq == 2) 248 if (irq == 2)
249 continue; 249 continue;
250 250
251 affinity = &desc->affinity; 251 affinity = desc->affinity;
252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { 252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
253 printk("Breaking affinity for irq %i\n", irq); 253 printk("Breaking affinity for irq %i\n", irq);
254 affinity = cpu_all_mask; 254 affinity = cpu_all_mask;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 63c88e6ec025..018963aa6ee3 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,13 @@
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <asm/io_apic.h> 19#include <asm/io_apic.h>
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/apic.h>
22
23DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
24EXPORT_PER_CPU_SYMBOL(irq_stat);
25
26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs);
21 28
22/* 29/*
23 * Probabilistic stack overflow check: 30 * Probabilistic stack overflow check:
@@ -100,7 +107,7 @@ void fixup_irqs(void)
100 /* interrupt's are disabled at this point */ 107 /* interrupt's are disabled at this point */
101 spin_lock(&desc->lock); 108 spin_lock(&desc->lock);
102 109
103 affinity = &desc->affinity; 110 affinity = desc->affinity;
104 if (!irq_has_action(irq) || 111 if (!irq_has_action(irq) ||
105 cpumask_equal(affinity, cpu_online_mask)) { 112 cpumask_equal(affinity, cpu_online_mask)) {
106 spin_unlock(&desc->lock); 113 spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 1507ad4e674d..bf629cadec1a 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -149,8 +149,15 @@ void __init native_init_IRQ(void)
149 */ 149 */
150 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 150 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
151 151
152 /* IPI for invalidation */ 152 /* IPIs for invalidation */
153 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); 153 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
154 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
155 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
156 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
157 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
158 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
159 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
160 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
154 161
155 /* IPI for generic function call */ 162 /* IPI for generic function call */
156 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 163 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
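
32-bit now registers eight TLB-invalidate vectors instead of one, matching the 64-bit scheme where each concurrent flush sender picks its own vector (and flush-state slot) from its CPU number, so senders do not serialize on a single global lock. A sketch of that selection; the vector base value here is illustrative.

#include <stdio.h>

#define INVALIDATE_TLB_VECTOR_START     0xe0    /* illustrative base */
#define NUM_INVALIDATE_TLB_VECTORS      8

static int flush_vector_for(int sender_cpu)
{
        return INVALIDATE_TLB_VECTOR_START +
               (sender_cpu % NUM_INVALIDATE_TLB_VECTORS);
}

int main(void)
{
        for (int cpu = 0; cpu < 12; cpu++)
                printf("cpu%2d sends flush IPIs on vector 0x%02x\n",
                       cpu, flush_vector_for(cpu));
        return 0;
}
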
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 884d985b8b82..e948b28a5a9a 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -446,7 +446,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
446static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, 446static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
447 struct kprobe_ctlblk *kcb) 447 struct kprobe_ctlblk *kcb)
448{ 448{
449#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) 449#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER)
450 if (p->ainsn.boostable == 1 && !p->post_handler) { 450 if (p->ainsn.boostable == 1 && !p->post_handler) {
451 /* Boost up -- we can execute copied instructions directly */ 451 /* Boost up -- we can execute copied instructions directly */
452 reset_current_kprobe(); 452 reset_current_kprobe();
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index ad36377dc935..fa6bb263892e 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
27#include <asm/e820.h> 27#include <asm/e820.h>
28#include <asm/trampoline.h> 28#include <asm/trampoline.h>
29#include <asm/setup.h> 29#include <asm/setup.h>
30#include <asm/smp.h>
30 31
31#include <mach_apic.h> 32#include <mach_apic.h>
32#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 7228979f1e7f..23b6d9e6e4f5 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -61,11 +61,7 @@ static int endflag __initdata;
61 61
62static inline unsigned int get_nmi_count(int cpu) 62static inline unsigned int get_nmi_count(int cpu)
63{ 63{
64#ifdef CONFIG_X86_64 64 return per_cpu(irq_stat, cpu).__nmi_count;
65 return cpu_pda(cpu)->__nmi_count;
66#else
67 return nmi_count(cpu);
68#endif
69} 65}
70 66
71static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
@@ -82,12 +78,8 @@ static inline int mce_in_progress(void)
82 */ 78 */
83static inline unsigned int get_timer_irqs(int cpu) 79static inline unsigned int get_timer_irqs(int cpu)
84{ 80{
85#ifdef CONFIG_X86_64
86 return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
87#else
88 return per_cpu(irq_stat, cpu).apic_timer_irqs + 81 return per_cpu(irq_stat, cpu).apic_timer_irqs +
89 per_cpu(irq_stat, cpu).irq0_irqs; 82 per_cpu(irq_stat, cpu).irq0_irqs;
90#endif
91} 83}
92 84
93#ifdef CONFIG_SMP 85#ifdef CONFIG_SMP
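
With the PDA gone, the NMI watchdog and /proc/interrupts accounting read a per-cpu irq_stat structure on 64-bit just as 32-bit always did. A user-space model of per-CPU counters read per CPU; the field names follow the kernel's, the values are made up.

#include <stdio.h>

#define NR_CPUS 4

struct irq_cpustat {
        unsigned int __nmi_count;
        unsigned int apic_timer_irqs;
        unsigned int irq0_irqs;
};

static struct irq_cpustat irq_stat[NR_CPUS];    /* stands in for per_cpu(irq_stat, cpu) */

static unsigned int get_timer_irqs(int cpu)
{
        return irq_stat[cpu].apic_timer_irqs + irq_stat[cpu].irq0_irqs;
}

int main(void)
{
        /* pretend each CPU handled a few interrupts */
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                irq_stat[cpu].apic_timer_irqs = 10 * (cpu + 1);
                irq_stat[cpu].irq0_irqs = cpu;
        }

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d timer irqs seen: %u\n", cpu, get_timer_irqs(cpu));
        return 0;
}
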
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e4c8fb608873..202514be5923 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -435,7 +435,6 @@ struct pv_mmu_ops pv_mmu_ops = {
435#endif /* PAGETABLE_LEVELS >= 3 */ 435#endif /* PAGETABLE_LEVELS >= 3 */
436 436
437 .pte_val = native_pte_val, 437 .pte_val = native_pte_val,
438 .pte_flags = native_pte_flags,
439 .pgd_val = native_pgd_val, 438 .pgd_val = native_pgd_val,
440 439
441 .make_pte = native_make_pte, 440 .make_pte = native_make_pte,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a546f55c77b4..1a1ae8edc40c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -66,9 +66,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
67EXPORT_PER_CPU_SYMBOL(current_task); 67EXPORT_PER_CPU_SYMBOL(current_task);
68 68
69DEFINE_PER_CPU(int, cpu_number);
70EXPORT_PER_CPU_SYMBOL(cpu_number);
71
72/* 69/*
73 * Return saved PC of a blocked thread. 70 * Return saved PC of a blocked thread.
74 */ 71 */
@@ -111,7 +108,6 @@ void cpu_idle(void)
111 play_dead(); 108 play_dead();
112 109
113 local_irq_disable(); 110 local_irq_disable();
114 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
115 /* Don't trace irqs off for idle */ 111 /* Don't trace irqs off for idle */
116 stop_critical_timings(); 112 stop_critical_timings();
117 pm_idle(); 113 pm_idle();
@@ -591,7 +587,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
591 if (prev->gs | next->gs) 587 if (prev->gs | next->gs)
592 loadsegment(gs, next->gs); 588 loadsegment(gs, next->gs);
593 589
594 x86_write_percpu(current_task, next_p); 590 percpu_write(current_task, next_p);
595 591
596 return prev_p; 592 return prev_p;
597} 593}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 416fb9282f4f..c422eebb0c58 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -16,6 +16,7 @@
16 16
17#include <stdarg.h> 17#include <stdarg.h>
18 18
19#include <linux/stackprotector.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/errno.h> 21#include <linux/errno.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
@@ -46,7 +47,6 @@
46#include <asm/processor.h> 47#include <asm/processor.h>
47#include <asm/i387.h> 48#include <asm/i387.h>
48#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
49#include <asm/pda.h>
50#include <asm/prctl.h> 50#include <asm/prctl.h>
51#include <asm/desc.h> 51#include <asm/desc.h>
52#include <asm/proto.h> 52#include <asm/proto.h>
@@ -57,6 +57,12 @@
57 57
58asmlinkage extern void ret_from_fork(void); 58asmlinkage extern void ret_from_fork(void);
59 59
60DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
61EXPORT_PER_CPU_SYMBOL(current_task);
62
63DEFINE_PER_CPU(unsigned long, old_rsp);
64static DEFINE_PER_CPU(unsigned char, is_idle);
65
60unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; 66unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
61 67
62static ATOMIC_NOTIFIER_HEAD(idle_notifier); 68static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -75,13 +81,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
75 81
76void enter_idle(void) 82void enter_idle(void)
77{ 83{
78 write_pda(isidle, 1); 84 percpu_write(is_idle, 1);
79 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 85 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
80} 86}
81 87
82static void __exit_idle(void) 88static void __exit_idle(void)
83{ 89{
84 if (test_and_clear_bit_pda(0, isidle) == 0) 90 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
85 return; 91 return;
86 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); 92 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
87} 93}
@@ -111,6 +117,17 @@ static inline void play_dead(void)
111void cpu_idle(void) 117void cpu_idle(void)
112{ 118{
113 current_thread_info()->status |= TS_POLLING; 119 current_thread_info()->status |= TS_POLLING;
120
121 /*
122 * If we're the non-boot CPU, nothing set the PDA stack
123 * canary up for us - and if we are the boot CPU we have
124 * a 0 stack canary. This is a good place for updating
125 * it, as we wont ever return from this function (so the
126 * invalid canaries already on the stack wont ever
127 * trigger):
128 */
129 boot_init_stack_canary();
130
114 /* endless idle loop with no priority at all */ 131 /* endless idle loop with no priority at all */
115 while (1) { 132 while (1) {
116 tick_nohz_stop_sched_tick(1); 133 tick_nohz_stop_sched_tick(1);
@@ -392,7 +409,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
392 load_gs_index(0); 409 load_gs_index(0);
393 regs->ip = new_ip; 410 regs->ip = new_ip;
394 regs->sp = new_sp; 411 regs->sp = new_sp;
395 write_pda(oldrsp, new_sp); 412 percpu_write(old_rsp, new_sp);
396 regs->cs = __USER_CS; 413 regs->cs = __USER_CS;
397 regs->ss = __USER_DS; 414 regs->ss = __USER_DS;
398 regs->flags = 0x200; 415 regs->flags = 0x200;
@@ -613,21 +630,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
613 /* 630 /*
614 * Switch the PDA and FPU contexts. 631 * Switch the PDA and FPU contexts.
615 */ 632 */
616 prev->usersp = read_pda(oldrsp); 633 prev->usersp = percpu_read(old_rsp);
617 write_pda(oldrsp, next->usersp); 634 percpu_write(old_rsp, next->usersp);
618 write_pda(pcurrent, next_p); 635 percpu_write(current_task, next_p);
619 636
620 write_pda(kernelstack, 637 percpu_write(kernel_stack,
621 (unsigned long)task_stack_page(next_p) + 638 (unsigned long)task_stack_page(next_p) +
622 THREAD_SIZE - PDA_STACKOFFSET); 639 THREAD_SIZE - KERNEL_STACK_OFFSET);
623#ifdef CONFIG_CC_STACKPROTECTOR
624 write_pda(stack_canary, next_p->stack_canary);
625 /*
626 * Build time only check to make sure the stack_canary is at
627 * offset 40 in the pda; this is a gcc ABI requirement
628 */
629 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
630#endif
631 640
632 /* 641 /*
633 * Now maybe reload the debug registers and handle I/O bitmaps 642 * Now maybe reload the debug registers and handle I/O bitmaps
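
__switch_to() now stores the per-cpu kernel_stack value directly: the top of the next task's stack minus KERNEL_STACK_OFFSET (the old PDA_STACKOFFSET), which the syscall entry path loads into %rsp. The arithmetic, with illustrative sizes and offset.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define THREAD_SIZE             (8 * 1024)      /* illustrative */
#define KERNEL_STACK_OFFSET     (5 * 8)         /* illustrative */

int main(void)
{
        /* stand-in for task_stack_page(next_p) */
        void *stack_page = aligned_alloc(THREAD_SIZE, THREAD_SIZE);
        uintptr_t kernel_stack;

        kernel_stack = (uintptr_t)stack_page + THREAD_SIZE - KERNEL_STACK_OFFSET;

        printf("stack page   %p\n", stack_page);
        printf("kernel_stack 0x%lx (top minus %d bytes)\n",
               (unsigned long)kernel_stack, KERNEL_STACK_OFFSET);
        free(stack_page);
        return 0;
}
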
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index bf63de72b643..0d1e7ac439f4 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -13,146 +13,46 @@
13#include <asm/mpspec.h> 13#include <asm/mpspec.h>
14#include <asm/apicdef.h> 14#include <asm/apicdef.h>
15#include <asm/highmem.h> 15#include <asm/highmem.h>
16#include <asm/proto.h>
16#include <asm/cpumask.h> 17#include <asm/cpumask.h>
18#include <asm/cpu.h>
17 19
18#ifdef CONFIG_X86_LOCAL_APIC 20#ifdef CONFIG_DEBUG_PER_CPU_MAPS
19unsigned int num_processors; 21# define DBG(x...) printk(KERN_DEBUG x)
20unsigned disabled_cpus __cpuinitdata;
21/* Processor that is doing the boot up */
22unsigned int boot_cpu_physical_apicid = -1U;
23EXPORT_SYMBOL(boot_cpu_physical_apicid);
24unsigned int max_physical_apicid;
25
26/* Bitmask of physically existing CPUs */
27physid_mask_t phys_cpu_present_map;
28#endif
29
30/* map cpu index to physical APIC ID */
31DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
32DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
33EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
34EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
35
36#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
37#define X86_64_NUMA 1
38
39/* map cpu index to node index */
40DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
41EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
42
43/* which logical CPUs are on which nodes */
44cpumask_t *node_to_cpumask_map;
45EXPORT_SYMBOL(node_to_cpumask_map);
46
47/* setup node_to_cpumask_map */
48static void __init setup_node_to_cpumask_map(void);
49
50#else 22#else
51static inline void setup_node_to_cpumask_map(void) { } 23# define DBG(x...)
52#endif 24#endif
53 25
54#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) 26DEFINE_PER_CPU(int, cpu_number);
55/* 27EXPORT_PER_CPU_SYMBOL(cpu_number);
56 * Copy data used in early init routines from the initial arrays to the
57 * per cpu data areas. These arrays then become expendable and the
58 * *_early_ptr's are zeroed indicating that the static arrays are gone.
59 */
60static void __init setup_per_cpu_maps(void)
61{
62 int cpu;
63 28
64 for_each_possible_cpu(cpu) { 29#ifdef CONFIG_X86_64
65 per_cpu(x86_cpu_to_apicid, cpu) = 30#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
66 early_per_cpu_map(x86_cpu_to_apicid, cpu); 31#else
67 per_cpu(x86_bios_cpu_apicid, cpu) = 32#define BOOT_PERCPU_OFFSET 0
68 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
69#ifdef X86_64_NUMA
70 per_cpu(x86_cpu_to_node_map, cpu) =
71 early_per_cpu_map(x86_cpu_to_node_map, cpu);
72#endif 33#endif
73 }
74 34
75 /* indicate the early static arrays will soon be gone */ 35DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
76 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 36EXPORT_PER_CPU_SYMBOL(this_cpu_off);
77 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
78#ifdef X86_64_NUMA
79 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
80#endif
81}
82 37
83#ifdef CONFIG_X86_32 38unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
84/* 39 [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
85 * Great future not-so-futuristic plan: make i386 and x86_64 do it 40};
86 * the same way
87 */
88unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
89EXPORT_SYMBOL(__per_cpu_offset); 41EXPORT_SYMBOL(__per_cpu_offset);
90static inline void setup_cpu_pda_map(void) { }
91
92#elif !defined(CONFIG_SMP)
93static inline void setup_cpu_pda_map(void) { }
94
95#else /* CONFIG_SMP && CONFIG_X86_64 */
96
97/*
98 * Allocate cpu_pda pointer table and array via alloc_bootmem.
99 */
100static void __init setup_cpu_pda_map(void)
101{
102 char *pda;
103 struct x8664_pda **new_cpu_pda;
104 unsigned long size;
105 int cpu;
106
107 size = roundup(sizeof(struct x8664_pda), cache_line_size());
108
109 /* allocate cpu_pda array and pointer table */
110 {
111 unsigned long tsize = nr_cpu_ids * sizeof(void *);
112 unsigned long asize = size * (nr_cpu_ids - 1);
113 42
114 tsize = roundup(tsize, cache_line_size()); 43static inline void setup_percpu_segment(int cpu)
115 new_cpu_pda = alloc_bootmem(tsize + asize);
116 pda = (char *)new_cpu_pda + tsize;
117 }
118
119 /* initialize pointer table to static pda's */
120 for_each_possible_cpu(cpu) {
121 if (cpu == 0) {
122 /* leave boot cpu pda in place */
123 new_cpu_pda[0] = cpu_pda(0);
124 continue;
125 }
126 new_cpu_pda[cpu] = (struct x8664_pda *)pda;
127 new_cpu_pda[cpu]->in_bootmem = 1;
128 pda += size;
129 }
130
131 /* point to new pointer table */
132 _cpu_pda = new_cpu_pda;
133}
134
135#endif /* CONFIG_SMP && CONFIG_X86_64 */
136
137#ifdef CONFIG_X86_64
138
139/* correctly size the local cpu masks */
140static void setup_cpu_local_masks(void)
141{ 44{
142 alloc_bootmem_cpumask_var(&cpu_initialized_mask); 45#ifdef CONFIG_X86_32
143 alloc_bootmem_cpumask_var(&cpu_callin_mask); 46 struct desc_struct gdt;
144 alloc_bootmem_cpumask_var(&cpu_callout_mask);
145 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
146}
147
148#else /* CONFIG_X86_32 */
149 47
150static inline void setup_cpu_local_masks(void) 48 pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
151{ 49 0x2 | DESCTYPE_S, 0x8);
50 gdt.s = 1;
51 write_gdt_entry(get_cpu_gdt_table(cpu),
52 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
53#endif
152} 54}
153 55
154#endif /* CONFIG_X86_32 */
155
156/* 56/*
157 * Great future plan: 57 * Great future plan:
158 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 58 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
@@ -160,18 +60,12 @@ static inline void setup_cpu_local_masks(void)
160 */ 60 */
161void __init setup_per_cpu_areas(void) 61void __init setup_per_cpu_areas(void)
162{ 62{
163 ssize_t size, old_size; 63 ssize_t size;
164 char *ptr; 64 char *ptr;
165 int cpu; 65 int cpu;
166 unsigned long align = 1;
167
168 /* Setup cpu_pda map */
169 setup_cpu_pda_map();
170 66
171 /* Copy section for each CPU (we discard the original) */ 67 /* Copy section for each CPU (we discard the original) */
172 old_size = PERCPU_ENOUGH_ROOM; 68 size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
173 align = max_t(unsigned long, PAGE_SIZE, align);
174 size = roundup(old_size, align);
175 69
176 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 70 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
177 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 71 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
@@ -180,30 +74,67 @@ void __init setup_per_cpu_areas(void)
180 74
181 for_each_possible_cpu(cpu) { 75 for_each_possible_cpu(cpu) {
182#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
183 ptr = __alloc_bootmem(size, align, 77 ptr = alloc_bootmem_pages(size);
184 __pa(MAX_DMA_ADDRESS));
185#else 78#else
186 int node = early_cpu_to_node(cpu); 79 int node = early_cpu_to_node(cpu);
187 if (!node_online(node) || !NODE_DATA(node)) { 80 if (!node_online(node) || !NODE_DATA(node)) {
188 ptr = __alloc_bootmem(size, align, 81 ptr = alloc_bootmem_pages(size);
189 __pa(MAX_DMA_ADDRESS));
190 pr_info("cpu %d has no node %d or node-local memory\n", 82 pr_info("cpu %d has no node %d or node-local memory\n",
191 cpu, node); 83 cpu, node);
192 pr_debug("per cpu data for cpu%d at %016lx\n", 84 pr_debug("per cpu data for cpu%d at %016lx\n",
193 cpu, __pa(ptr)); 85 cpu, __pa(ptr));
194 } else { 86 } else {
195 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, 87 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
196 __pa(MAX_DMA_ADDRESS));
197 pr_debug("per cpu data for cpu%d on node%d at %016lx\n", 88 pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
198 cpu, node, __pa(ptr)); 89 cpu, node, __pa(ptr));
199 } 90 }
200#endif 91#endif
92
93 memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
201 per_cpu_offset(cpu) = ptr - __per_cpu_start; 94 per_cpu_offset(cpu) = ptr - __per_cpu_start;
202 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 95 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
96 per_cpu(cpu_number, cpu) = cpu;
97 setup_percpu_segment(cpu);
98 /*
99 * Copy data used in early init routines from the
100 * initial arrays to the per cpu data areas. These
101 * arrays then become expendable and the *_early_ptr's
102 * are zeroed indicating that the static arrays are
103 * gone.
104 */
105#ifdef CONFIG_X86_LOCAL_APIC
106 per_cpu(x86_cpu_to_apicid, cpu) =
107 early_per_cpu_map(x86_cpu_to_apicid, cpu);
108 per_cpu(x86_bios_cpu_apicid, cpu) =
109 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
110#endif
111#ifdef CONFIG_X86_64
112 per_cpu(irq_stack_ptr, cpu) =
113 per_cpu(irq_stack_union.irq_stack, cpu) +
114 IRQ_STACK_SIZE - 64;
115#ifdef CONFIG_NUMA
116 per_cpu(x86_cpu_to_node_map, cpu) =
117 early_per_cpu_map(x86_cpu_to_node_map, cpu);
118#endif
119#endif
120 /*
121 * Up to this point, the boot CPU has been using .data.init
122 * area. Reload any changed state for the boot CPU.
123 */
124 if (cpu == boot_cpu_id)
125 switch_to_new_gdt();
126
127 DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
203 } 128 }
204 129
205 /* Setup percpu data maps */ 130 /* indicate the early static arrays will soon be gone */
206 setup_per_cpu_maps(); 131#ifdef CONFIG_X86_LOCAL_APIC
132 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
133 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
134#endif
135#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
136 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
137#endif
207 138
208 /* Setup node to cpumask map */ 139 /* Setup node to cpumask map */
209 setup_node_to_cpumask_map(); 140 setup_node_to_cpumask_map();
@@ -211,199 +142,3 @@ void __init setup_per_cpu_areas(void)
211 /* Setup cpu initialized, callin, callout masks */ 142 /* Setup cpu initialized, callin, callout masks */
212 setup_cpu_local_masks(); 143 setup_cpu_local_masks();
213} 144}
214
215#endif
216
217#ifdef X86_64_NUMA
218
219/*
220 * Allocate node_to_cpumask_map based on number of available nodes
221 * Requires node_possible_map to be valid.
222 *
223 * Note: node_to_cpumask() is not valid until after this is done.
224 */
225static void __init setup_node_to_cpumask_map(void)
226{
227 unsigned int node, num = 0;
228 cpumask_t *map;
229
230 /* setup nr_node_ids if not done yet */
231 if (nr_node_ids == MAX_NUMNODES) {
232 for_each_node_mask(node, node_possible_map)
233 num = node;
234 nr_node_ids = num + 1;
235 }
236
237 /* allocate the map */
238 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
239
240 pr_debug("Node to cpumask map at %p for %d nodes\n",
241 map, nr_node_ids);
242
243 /* node_to_cpumask() will now work */
244 node_to_cpumask_map = map;
245}
246
247void __cpuinit numa_set_node(int cpu, int node)
248{
249 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
250
251 if (cpu_pda(cpu) && node != NUMA_NO_NODE)
252 cpu_pda(cpu)->nodenumber = node;
253
254 if (cpu_to_node_map)
255 cpu_to_node_map[cpu] = node;
256
257 else if (per_cpu_offset(cpu))
258 per_cpu(x86_cpu_to_node_map, cpu) = node;
259
260 else
261 pr_debug("Setting node for non-present cpu %d\n", cpu);
262}
263
264void __cpuinit numa_clear_node(int cpu)
265{
266 numa_set_node(cpu, NUMA_NO_NODE);
267}
268
269#ifndef CONFIG_DEBUG_PER_CPU_MAPS
270
271void __cpuinit numa_add_cpu(int cpu)
272{
273 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
274}
275
276void __cpuinit numa_remove_cpu(int cpu)
277{
278 cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
279}
280
281#else /* CONFIG_DEBUG_PER_CPU_MAPS */
282
283/*
284 * --------- debug versions of the numa functions ---------
285 */
286static void __cpuinit numa_set_cpumask(int cpu, int enable)
287{
288 int node = cpu_to_node(cpu);
289 cpumask_t *mask;
290 char buf[64];
291
292 if (node_to_cpumask_map == NULL) {
293 printk(KERN_ERR "node_to_cpumask_map NULL\n");
294 dump_stack();
295 return;
296 }
297
298 mask = &node_to_cpumask_map[node];
299 if (enable)
300 cpu_set(cpu, *mask);
301 else
302 cpu_clear(cpu, *mask);
303
304 cpulist_scnprintf(buf, sizeof(buf), mask);
305 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
306 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
307}
308
309void __cpuinit numa_add_cpu(int cpu)
310{
311 numa_set_cpumask(cpu, 1);
312}
313
314void __cpuinit numa_remove_cpu(int cpu)
315{
316 numa_set_cpumask(cpu, 0);
317}
318
319int cpu_to_node(int cpu)
320{
321 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
322 printk(KERN_WARNING
323 "cpu_to_node(%d): usage too early!\n", cpu);
324 dump_stack();
325 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
326 }
327 return per_cpu(x86_cpu_to_node_map, cpu);
328}
329EXPORT_SYMBOL(cpu_to_node);
330
331/*
332 * Same function as cpu_to_node() but used if called before the
333 * per_cpu areas are setup.
334 */
335int early_cpu_to_node(int cpu)
336{
337 if (early_per_cpu_ptr(x86_cpu_to_node_map))
338 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
339
340 if (!per_cpu_offset(cpu)) {
341 printk(KERN_WARNING
342 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
343 dump_stack();
344 return NUMA_NO_NODE;
345 }
346 return per_cpu(x86_cpu_to_node_map, cpu);
347}
348
349
350/* empty cpumask */
351static const cpumask_t cpu_mask_none;
352
353/*
354 * Returns a pointer to the bitmask of CPUs on Node 'node'.
355 */
356const cpumask_t *cpumask_of_node(int node)
357{
358 if (node_to_cpumask_map == NULL) {
359 printk(KERN_WARNING
360 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
361 node);
362 dump_stack();
363 return (const cpumask_t *)&cpu_online_map;
364 }
365 if (node >= nr_node_ids) {
366 printk(KERN_WARNING
367 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
368 node, nr_node_ids);
369 dump_stack();
370 return &cpu_mask_none;
371 }
372 return &node_to_cpumask_map[node];
373}
374EXPORT_SYMBOL(cpumask_of_node);
375
376/*
377 * Returns a bitmask of CPUs on Node 'node'.
378 *
379 * Side note: this function creates the returned cpumask on the stack
380 * so with a high NR_CPUS count, excessive stack space is used. The
381 * node_to_cpumask_ptr function should be used whenever possible.
382 */
383cpumask_t node_to_cpumask(int node)
384{
385 if (node_to_cpumask_map == NULL) {
386 printk(KERN_WARNING
387 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
388 dump_stack();
389 return cpu_online_map;
390 }
391 if (node >= nr_node_ids) {
392 printk(KERN_WARNING
393 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
394 node, nr_node_ids);
395 dump_stack();
396 return cpu_mask_none;
397 }
398 return node_to_cpumask_map[node];
399}
400EXPORT_SYMBOL(node_to_cpumask);
401
402/*
403 * --------- end of debug versions of the numa functions ---------
404 */
405
406#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
407
408#endif /* X86_64_NUMA */
409
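
Besides copying the __per_cpu_load template into each CPU's area and recording per_cpu_offset(), the new setup_per_cpu_areas() points irq_stack_ptr 64 bytes below the top of each CPU's IRQ stack, as in the hunk above. A sketch of that pointer arithmetic only; NR_CPUS and IRQ_STACK_SIZE here are illustrative.

#include <stdio.h>

#define NR_CPUS         2
#define IRQ_STACK_SIZE  (16 * 1024)     /* illustrative */

static char irq_stack[NR_CPUS][IRQ_STACK_SIZE];

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                char *irq_stack_ptr = irq_stack[cpu] + IRQ_STACK_SIZE - 64;

                printf("cpu%d irq_stack %p .. %p, irq_stack_ptr %p\n",
                       cpu, (void *)irq_stack[cpu],
                       (void *)(irq_stack[cpu] + IRQ_STACK_SIZE),
                       (void *)irq_stack_ptr);
        }
        return 0;
}
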
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cf34eb37fbee..7fc78b019815 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -649,9 +649,16 @@ badframe:
649} 649}
650 650
651#ifdef CONFIG_X86_32 651#ifdef CONFIG_X86_32
652asmlinkage int sys_rt_sigreturn(struct pt_regs regs) 652/*
653 * Note: do not pass in pt_regs directly as with tail-call optimization
654 * GCC will incorrectly stomp on the caller's frame and corrupt user-space
655 * register state:
656 */
657asmlinkage int sys_rt_sigreturn(unsigned long __unused)
653{ 658{
654 return do_rt_sigreturn(&regs); 659 struct pt_regs *regs = (struct pt_regs *)&__unused;
660
661 return do_rt_sigreturn(regs);
655} 662}
656#else /* !CONFIG_X86_32 */ 663#else /* !CONFIG_X86_32 */
657asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) 664asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6c2b8444b830..f9dbcff43546 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,6 +62,7 @@
62#include <asm/vmi.h> 62#include <asm/vmi.h>
63#include <asm/genapic.h> 63#include <asm/genapic.h>
64#include <asm/setup.h> 64#include <asm/setup.h>
65#include <asm/uv/uv.h>
65#include <linux/mc146818rtc.h> 66#include <linux/mc146818rtc.h>
66 67
67#include <mach_apic.h> 68#include <mach_apic.h>
@@ -744,52 +745,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
744 complete(&c_idle->done); 745 complete(&c_idle->done);
745} 746}
746 747
747#ifdef CONFIG_X86_64
748
749/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
750static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
751{
752 if (!after_bootmem)
753 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
754}
755
756/*
757 * Allocate node local memory for the AP pda.
758 *
759 * Must be called after the _cpu_pda pointer table is initialized.
760 */
761int __cpuinit get_local_pda(int cpu)
762{
763 struct x8664_pda *oldpda, *newpda;
764 unsigned long size = sizeof(struct x8664_pda);
765 int node = cpu_to_node(cpu);
766
767 if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
768 return 0;
769
770 oldpda = cpu_pda(cpu);
771 newpda = kmalloc_node(size, GFP_ATOMIC, node);
772 if (!newpda) {
773 printk(KERN_ERR "Could not allocate node local PDA "
774 "for CPU %d on node %d\n", cpu, node);
775
776 if (oldpda)
777 return 0; /* have a usable pda */
778 else
779 return -1;
780 }
781
782 if (oldpda) {
783 memcpy(newpda, oldpda, size);
784 free_bootmem_pda(oldpda);
785 }
786
787 newpda->in_bootmem = 0;
788 cpu_pda(cpu) = newpda;
789 return 0;
790}
791#endif /* CONFIG_X86_64 */
792
793static int __cpuinit do_boot_cpu(int apicid, int cpu) 748static int __cpuinit do_boot_cpu(int apicid, int cpu)
794/* 749/*
795 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 750 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -807,16 +762,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
807 }; 762 };
808 INIT_WORK(&c_idle.work, do_fork_idle); 763 INIT_WORK(&c_idle.work, do_fork_idle);
809 764
810#ifdef CONFIG_X86_64
811 /* Allocate node local memory for AP pdas */
812 if (cpu > 0) {
813 boot_error = get_local_pda(cpu);
814 if (boot_error)
815 goto restore_state;
816 /* if can't get pda memory, can't start cpu */
817 }
818#endif
819
820 alternatives_smp_switch(1); 765 alternatives_smp_switch(1);
821 766
822 c_idle.idle = get_idle_for_cpu(cpu); 767 c_idle.idle = get_idle_for_cpu(cpu);
@@ -846,14 +791,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
846 791
847 set_idle_for_cpu(cpu, c_idle.idle); 792 set_idle_for_cpu(cpu, c_idle.idle);
848do_rest: 793do_rest:
849#ifdef CONFIG_X86_32
850 per_cpu(current_task, cpu) = c_idle.idle; 794 per_cpu(current_task, cpu) = c_idle.idle;
851 init_gdt(cpu); 795#ifdef CONFIG_X86_32
852 /* Stack for startup_32 can be just as for start_secondary onwards */ 796 /* Stack for startup_32 can be just as for start_secondary onwards */
853 irq_ctx_init(cpu); 797 irq_ctx_init(cpu);
854#else 798#else
855 cpu_pda(cpu)->pcurrent = c_idle.idle;
856 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 799 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
800 initial_gs = per_cpu_offset(cpu);
801 per_cpu(kernel_stack, cpu) =
802 (unsigned long)task_stack_page(c_idle.idle) -
803 KERNEL_STACK_OFFSET + THREAD_SIZE;
857#endif 804#endif
858 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 805 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
859 initial_code = (unsigned long)start_secondary; 806 initial_code = (unsigned long)start_secondary;
@@ -930,9 +877,7 @@ do_rest:
930 inquire_remote_apic(apicid); 877 inquire_remote_apic(apicid);
931 } 878 }
932 } 879 }
933#ifdef CONFIG_X86_64 880
934restore_state:
935#endif
936 if (boot_error) { 881 if (boot_error) {
937 /* Try to put things back the way they were before ... */ 882 /* Try to put things back the way they were before ... */
938 numa_remove_cpu(cpu); /* was set by numa_add_cpu */ 883 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
@@ -1124,6 +1069,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1124 printk(KERN_ERR "... forcing use of dummy APIC emulation." 1069 printk(KERN_ERR "... forcing use of dummy APIC emulation."
1125 "(tell your hw vendor)\n"); 1070 "(tell your hw vendor)\n");
1126 smpboot_clear_io_apic(); 1071 smpboot_clear_io_apic();
1072 disable_ioapic_setup();
1127 return -1; 1073 return -1;
1128 } 1074 }
1129 1075
@@ -1239,9 +1185,6 @@ out:
1239void __init native_smp_prepare_boot_cpu(void) 1185void __init native_smp_prepare_boot_cpu(void)
1240{ 1186{
1241 int me = smp_processor_id(); 1187 int me = smp_processor_id();
1242#ifdef CONFIG_X86_32
1243 init_gdt(me);
1244#endif
1245 switch_to_new_gdt(); 1188 switch_to_new_gdt();
1246 /* already set me in cpu_online_mask in boot_cpu_init() */ 1189 /* already set me in cpu_online_mask in boot_cpu_init() */
1247 cpumask_set_cpu(me, cpu_callout_mask); 1190 cpumask_set_cpu(me, cpu_callout_mask);
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
deleted file mode 100644
index 397e309839dd..000000000000
--- a/arch/x86/kernel/smpcommon.c
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * SMP stuff which is common to all sub-architectures.
3 */
4#include <linux/module.h>
5#include <asm/smp.h>
6
7#ifdef CONFIG_X86_32
8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10
11/*
12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
16__cpuinit void init_gdt(int cpu)
17{
18 struct desc_struct gdt;
19
20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
23
24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
26
27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
28 per_cpu(cpu_number, cpu) = cpu;
29}
30#endif
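
The deleted init_gdt() (its work now lives in setup_percpu_segment() above) packed a ring-0 data segment whose base is the CPU's per-cpu offset, so 32-bit %fs-relative accesses land in that CPU's per-cpu area. Below is a sketch of the standard 8-byte GDT descriptor packing; the base address and the access/flag values are illustrative rather than the kernel's exact ones.

#include <stdio.h>
#include <stdint.h>

static uint64_t pack_gdt_entry(uint32_t base, uint32_t limit,
                               uint8_t access, uint8_t flags)
{
        uint64_t d = 0;

        d |= limit & 0xffffULL;                         /* limit[15:0]  */
        d |= (uint64_t)(base & 0xffffff) << 16;         /* base[23:0]   */
        d |= (uint64_t)access << 40;                    /* P/DPL/S/type */
        d |= (uint64_t)((limit >> 16) & 0xf) << 48;     /* limit[19:16] */
        d |= (uint64_t)(flags & 0xf) << 52;             /* G/D/L/AVL    */
        d |= (uint64_t)(base >> 24) << 56;              /* base[31:24]  */
        return d;
}

int main(void)
{
        uint32_t percpu_base = 0x01a00000;      /* made-up per-cpu offset */

        /* access 0x92: present, ring 0, writable data; flags 0xc: 4K granularity, 32-bit */
        printf("percpu GDT entry = 0x%016llx\n",
               (unsigned long long)pack_gdt_entry(percpu_base, 0xfffff, 0x92, 0xc));
        return 0;
}
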
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c3..e2e86a08f31d 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -88,7 +88,7 @@ ENTRY(sys_call_table)
88 .long sys_uselib 88 .long sys_uselib
89 .long sys_swapon 89 .long sys_swapon
90 .long sys_reboot 90 .long sys_reboot
91 .long old_readdir 91 .long sys_old_readdir
92 .long old_mmap /* 90 */ 92 .long old_mmap /* 90 */
93 .long sys_munmap 93 .long sys_munmap
94 .long sys_truncate 94 .long sys_truncate
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644
index ce5054642247..000000000000
--- a/arch/x86/kernel/tlb_32.c
+++ /dev/null
@@ -1,256 +0,0 @@
1#include <linux/spinlock.h>
2#include <linux/cpu.h>
3#include <linux/interrupt.h>
4
5#include <asm/tlbflush.h>
6
7DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
8 ____cacheline_aligned = { &init_mm, 0, };
9
10/* must come after the send_IPI functions above for inlining */
11#include <mach_ipi.h>
12
13/*
14 * Smarter SMP flushing macros.
15 * c/o Linus Torvalds.
16 *
17 * These mean you can really definitely utterly forget about
18 * writing to user space from interrupts. (Its not allowed anyway).
19 *
20 * Optimizations Manfred Spraul <manfred@colorfullife.com>
21 */
22
23static cpumask_t flush_cpumask;
24static struct mm_struct *flush_mm;
25static unsigned long flush_va;
26static DEFINE_SPINLOCK(tlbstate_lock);
27
28/*
29 * We cannot call mmdrop() because we are in interrupt context,
30 * instead update mm->cpu_vm_mask.
31 *
32 * We need to reload %cr3 since the page tables may be going
33 * away from under us..
34 */
35void leave_mm(int cpu)
36{
37 BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
38 cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
39 load_cr3(swapper_pg_dir);
40}
41EXPORT_SYMBOL_GPL(leave_mm);
42
43/*
44 *
45 * The flush IPI assumes that a thread switch happens in this order:
46 * [cpu0: the cpu that switches]
47 * 1) switch_mm() either 1a) or 1b)
48 * 1a) thread switch to a different mm
49 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
50 * Stop ipi delivery for the old mm. This is not synchronized with
51 * the other cpus, but smp_invalidate_interrupt ignore flush ipis
52 * for the wrong mm, and in the worst case we perform a superfluous
53 * tlb flush.
54 * 1a2) set cpu_tlbstate to TLBSTATE_OK
55 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
56 * was in lazy tlb mode.
57 * 1a3) update cpu_tlbstate[].active_mm
58 * Now cpu0 accepts tlb flushes for the new mm.
59 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
60 * Now the other cpus will send tlb flush ipis.
61 * 1a4) change cr3.
62 * 1b) thread switch without mm change
63 * cpu_tlbstate[].active_mm is correct, cpu0 already handles
64 * flush ipis.
65 * 1b1) set cpu_tlbstate to TLBSTATE_OK
66 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
67 * Atomically set the bit [other cpus will start sending flush ipis],
68 * and test the bit.
69 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
70 * 2) switch %%esp, ie current
71 *
72 * The interrupt must handle 2 special cases:
73 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
74 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
75 * runs in kernel space, the cpu could load tlb entries for user space
76 * pages.
77 *
78 * The good news is that cpu_tlbstate is local to each cpu, no
79 * write/read ordering problems.
80 */
81
82/*
83 * TLB flush IPI:
84 *
85 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
86 * 2) Leave the mm if we are in the lazy tlb mode.
87 */
88
89void smp_invalidate_interrupt(struct pt_regs *regs)
90{
91 unsigned long cpu;
92
93 cpu = get_cpu();
94
95 if (!cpu_isset(cpu, flush_cpumask))
96 goto out;
97 /*
98 * This was a BUG() but until someone can quote me the
99 * line from the intel manual that guarantees an IPI to
100 * multiple CPUs is retried _only_ on the erroring CPUs
101 * its staying as a return
102 *
103 * BUG();
104 */
105
106 if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
107 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
108 if (flush_va == TLB_FLUSH_ALL)
109 local_flush_tlb();
110 else
111 __flush_tlb_one(flush_va);
112 } else
113 leave_mm(cpu);
114 }
115 ack_APIC_irq();
116 smp_mb__before_clear_bit();
117 cpu_clear(cpu, flush_cpumask);
118 smp_mb__after_clear_bit();
119out:
120 put_cpu_no_resched();
121 inc_irq_stat(irq_tlb_count);
122}
123
124void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
125 unsigned long va)
126{
127 cpumask_t cpumask = *cpumaskp;
128
129 /*
130 * A couple of (to be removed) sanity checks:
131 *
132 * - current CPU must not be in mask
133 * - mask must exist :)
134 */
135 BUG_ON(cpus_empty(cpumask));
136 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
137 BUG_ON(!mm);
138
139#ifdef CONFIG_HOTPLUG_CPU
140 /* If a CPU which we ran on has gone down, OK. */
141 cpus_and(cpumask, cpumask, cpu_online_map);
142 if (unlikely(cpus_empty(cpumask)))
143 return;
144#endif
145
146 /*
 147 * I'm not happy about this global shared spinlock in the
148 * MM hot path, but we'll see how contended it is.
149 * AK: x86-64 has a faster method that could be ported.
150 */
151 spin_lock(&tlbstate_lock);
152
153 flush_mm = mm;
154 flush_va = va;
155 cpus_or(flush_cpumask, cpumask, flush_cpumask);
156
157 /*
158 * Make the above memory operations globally visible before
159 * sending the IPI.
160 */
161 smp_mb();
162 /*
163 * We have to send the IPI only to
 164 * the CPUs affected.
165 */
166 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
167
168 while (!cpus_empty(flush_cpumask))
169 /* nothing. lockup detection does not belong here */
170 cpu_relax();
171
172 flush_mm = NULL;
173 flush_va = 0;
174 spin_unlock(&tlbstate_lock);
175}
176
177void flush_tlb_current_task(void)
178{
179 struct mm_struct *mm = current->mm;
180 cpumask_t cpu_mask;
181
182 preempt_disable();
183 cpu_mask = mm->cpu_vm_mask;
184 cpu_clear(smp_processor_id(), cpu_mask);
185
186 local_flush_tlb();
187 if (!cpus_empty(cpu_mask))
188 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
189 preempt_enable();
190}
191
192void flush_tlb_mm(struct mm_struct *mm)
193{
194 cpumask_t cpu_mask;
195
196 preempt_disable();
197 cpu_mask = mm->cpu_vm_mask;
198 cpu_clear(smp_processor_id(), cpu_mask);
199
200 if (current->active_mm == mm) {
201 if (current->mm)
202 local_flush_tlb();
203 else
204 leave_mm(smp_processor_id());
205 }
206 if (!cpus_empty(cpu_mask))
207 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
208
209 preempt_enable();
210}
211
212void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
213{
214 struct mm_struct *mm = vma->vm_mm;
215 cpumask_t cpu_mask;
216
217 preempt_disable();
218 cpu_mask = mm->cpu_vm_mask;
219 cpu_clear(smp_processor_id(), cpu_mask);
220
221 if (current->active_mm == mm) {
222 if (current->mm)
223 __flush_tlb_one(va);
224 else
225 leave_mm(smp_processor_id());
226 }
227
228 if (!cpus_empty(cpu_mask))
229 flush_tlb_others(cpu_mask, mm, va);
230
231 preempt_enable();
232}
233EXPORT_SYMBOL(flush_tlb_page);
234
235static void do_flush_tlb_all(void *info)
236{
237 unsigned long cpu = smp_processor_id();
238
239 __flush_tlb_all();
240 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
241 leave_mm(cpu);
242}
243
244void flush_tlb_all(void)
245{
246 on_each_cpu(do_flush_tlb_all, NULL, 1);
247}
248
249void reset_lazy_tlbstate(void)
250{
251 int cpu = raw_smp_processor_id();
252
253 per_cpu(cpu_tlbstate, cpu).state = 0;
254 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
255}
256
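
The 32-bit flusher above keeps exactly one flush in flight at a time: flush_mm, flush_va and flush_cpumask form a single globally shared slot, guarded by tlbstate_lock and drained through the single INVALIDATE_TLB_VECTOR. The following standalone program is a minimal userspace model of that handshake (NCPUS, cpu_thread() and the printed output are inventions of the sketch, not kernel interfaces): the sender publishes the flush data, raises one mask bit per target, and spins until every target has acknowledged by clearing its bit.

/*
 * Minimal userspace model of the single-slot flush handshake in
 * tlb_32.c above.  NCPUS, cpu_thread() and the printed output are
 * inventions of this sketch, not kernel interfaces.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static pthread_mutex_t tlbstate_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long flush_va;          /* shared flush data, one slot  */
static atomic_uint flush_cpumask;       /* bit n set == CPU n not done  */

static void *cpu_thread(void *arg)
{
    unsigned int cpu = (unsigned int)(unsigned long)arg;

    /* wait for our bit, "flush", then acknowledge by clearing it */
    while (!(atomic_load(&flush_cpumask) & (1u << cpu)))
        ;                                       /* cpu_relax() */
    printf("cpu%u: flushing va=%#lx\n", cpu, flush_va);
    atomic_fetch_and(&flush_cpumask, ~(1u << cpu));
    return NULL;
}

int main(void)
{
    pthread_t tid[NCPUS];
    unsigned int cpu;

    for (cpu = 1; cpu < NCPUS; cpu++)
        pthread_create(&tid[cpu], NULL, cpu_thread,
                       (void *)(unsigned long)cpu);

    /* the sender: only one flush can be in flight, as in tlb_32.c */
    pthread_mutex_lock(&tlbstate_lock);
    flush_va = 0x1000;                          /* publish flush data */
    atomic_fetch_or(&flush_cpumask, 0xe);       /* "IPI" CPUs 1-3     */
    while (atomic_load(&flush_cpumask))
        ;                                       /* wait for all acks  */
    pthread_mutex_unlock(&tlbstate_lock);

    for (cpu = 1; cpu < NCPUS; cpu++)
        pthread_join(tid[cpu], NULL);
    return 0;
}

Because there is only the one slot, concurrent flushers serialize on tlbstate_lock, which is the hot-path contention the deleted 64-bit code below avoided with per-sender state.
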
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
deleted file mode 100644
index f8be6f1d2e48..000000000000
--- a/arch/x86/kernel/tlb_64.c
+++ /dev/null
@@ -1,284 +0,0 @@
1#include <linux/init.h>
2
3#include <linux/mm.h>
4#include <linux/delay.h>
5#include <linux/spinlock.h>
6#include <linux/smp.h>
7#include <linux/kernel_stat.h>
8#include <linux/mc146818rtc.h>
9#include <linux/interrupt.h>
10
11#include <asm/mtrr.h>
12#include <asm/pgalloc.h>
13#include <asm/tlbflush.h>
14#include <asm/mmu_context.h>
15#include <asm/proto.h>
16#include <asm/apicdef.h>
17#include <asm/idle.h>
18#include <asm/uv/uv_hub.h>
19#include <asm/uv/uv_bau.h>
20
21#include <mach_ipi.h>
22/*
23 * Smarter SMP flushing macros.
24 * c/o Linus Torvalds.
25 *
26 * These mean you can really definitely utterly forget about
 27 * writing to user space from interrupts. (It's not allowed anyway).
28 *
29 * Optimizations Manfred Spraul <manfred@colorfullife.com>
30 *
31 * More scalable flush, from Andi Kleen
32 *
33 * To avoid global state use 8 different call vectors.
34 * Each CPU uses a specific vector to trigger flushes on other
35 * CPUs. Depending on the received vector the target CPUs look into
36 * the right per cpu variable for the flush data.
37 *
38 * With more than 8 CPUs they are hashed to the 8 available
39 * vectors. The limited global vector space forces us to this right now.
40 * In future when interrupts are split into per CPU domains this could be
41 * fixed, at the cost of triggering multiple IPIs in some cases.
42 */
43
44union smp_flush_state {
45 struct {
46 cpumask_t flush_cpumask;
47 struct mm_struct *flush_mm;
48 unsigned long flush_va;
49 spinlock_t tlbstate_lock;
50 };
51 char pad[SMP_CACHE_BYTES];
52} ____cacheline_aligned;
53
54/* State is put into the per CPU data section, but padded
55 to a full cache line because other CPUs can access it and we don't
56 want false sharing in the per cpu data segment. */
57static DEFINE_PER_CPU(union smp_flush_state, flush_state);
58
59/*
60 * We cannot call mmdrop() because we are in interrupt context,
61 * instead update mm->cpu_vm_mask.
62 */
63void leave_mm(int cpu)
64{
65 if (read_pda(mmu_state) == TLBSTATE_OK)
66 BUG();
67 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
68 load_cr3(swapper_pg_dir);
69}
70EXPORT_SYMBOL_GPL(leave_mm);
71
72/*
73 *
74 * The flush IPI assumes that a thread switch happens in this order:
75 * [cpu0: the cpu that switches]
76 * 1) switch_mm() either 1a) or 1b)
77 * 1a) thread switch to a different mm
78 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
79 * Stop ipi delivery for the old mm. This is not synchronized with
 80 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
81 * for the wrong mm, and in the worst case we perform a superfluous
82 * tlb flush.
83 * 1a2) set cpu mmu_state to TLBSTATE_OK
84 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
85 * was in lazy tlb mode.
86 * 1a3) update cpu active_mm
87 * Now cpu0 accepts tlb flushes for the new mm.
88 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
89 * Now the other cpus will send tlb flush ipis.
 90 * 1a5) change cr3.
91 * 1b) thread switch without mm change
92 * cpu active_mm is correct, cpu0 already handles
93 * flush ipis.
94 * 1b1) set cpu mmu_state to TLBSTATE_OK
95 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
96 * Atomically set the bit [other cpus will start sending flush ipis],
97 * and test the bit.
98 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
99 * 2) switch %%esp, ie current
100 *
101 * The interrupt must handle 2 special cases:
102 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
103 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
104 * runs in kernel space, the cpu could load tlb entries for user space
105 * pages.
106 *
107 * The good news is that cpu mmu_state is local to each cpu, no
108 * write/read ordering problems.
109 */
110
111/*
112 * TLB flush IPI:
113 *
114 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
115 * 2) Leave the mm if we are in the lazy tlb mode.
116 *
117 * Interrupts are disabled.
118 */
119
120asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
121{
122 int cpu;
123 int sender;
124 union smp_flush_state *f;
125
126 cpu = smp_processor_id();
127 /*
128 * orig_rax contains the negated interrupt vector.
129 * Use that to determine where the sender put the data.
130 */
131 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
132 f = &per_cpu(flush_state, sender);
133
134 if (!cpu_isset(cpu, f->flush_cpumask))
135 goto out;
136 /*
137 * This was a BUG() but until someone can quote me the
138 * line from the intel manual that guarantees an IPI to
139 * multiple CPUs is retried _only_ on the erroring CPUs
 140 * it's staying as a return
141 *
142 * BUG();
143 */
144
145 if (f->flush_mm == read_pda(active_mm)) {
146 if (read_pda(mmu_state) == TLBSTATE_OK) {
147 if (f->flush_va == TLB_FLUSH_ALL)
148 local_flush_tlb();
149 else
150 __flush_tlb_one(f->flush_va);
151 } else
152 leave_mm(cpu);
153 }
154out:
155 ack_APIC_irq();
156 cpu_clear(cpu, f->flush_cpumask);
157 inc_irq_stat(irq_tlb_count);
158}
159
160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
161 unsigned long va)
162{
163 int sender;
164 union smp_flush_state *f;
165 cpumask_t cpumask = *cpumaskp;
166
167 if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
168 return;
169
170 /* Caller has disabled preemption */
171 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
172 f = &per_cpu(flush_state, sender);
173
174 /*
175 * Could avoid this lock when
176 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
177 * probably not worth checking this for a cache-hot lock.
178 */
179 spin_lock(&f->tlbstate_lock);
180
181 f->flush_mm = mm;
182 f->flush_va = va;
183 cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
184
185 /*
186 * Make the above memory operations globally visible before
187 * sending the IPI.
188 */
189 smp_mb();
190 /*
191 * We have to send the IPI only to
 192 * the CPUs affected.
193 */
194 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
195
196 while (!cpus_empty(f->flush_cpumask))
197 cpu_relax();
198
199 f->flush_mm = NULL;
200 f->flush_va = 0;
201 spin_unlock(&f->tlbstate_lock);
202}
203
204static int __cpuinit init_smp_flush(void)
205{
206 int i;
207
208 for_each_possible_cpu(i)
209 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
210
211 return 0;
212}
213core_initcall(init_smp_flush);
214
215void flush_tlb_current_task(void)
216{
217 struct mm_struct *mm = current->mm;
218 cpumask_t cpu_mask;
219
220 preempt_disable();
221 cpu_mask = mm->cpu_vm_mask;
222 cpu_clear(smp_processor_id(), cpu_mask);
223
224 local_flush_tlb();
225 if (!cpus_empty(cpu_mask))
226 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
227 preempt_enable();
228}
229
230void flush_tlb_mm(struct mm_struct *mm)
231{
232 cpumask_t cpu_mask;
233
234 preempt_disable();
235 cpu_mask = mm->cpu_vm_mask;
236 cpu_clear(smp_processor_id(), cpu_mask);
237
238 if (current->active_mm == mm) {
239 if (current->mm)
240 local_flush_tlb();
241 else
242 leave_mm(smp_processor_id());
243 }
244 if (!cpus_empty(cpu_mask))
245 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
246
247 preempt_enable();
248}
249
250void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
251{
252 struct mm_struct *mm = vma->vm_mm;
253 cpumask_t cpu_mask;
254
255 preempt_disable();
256 cpu_mask = mm->cpu_vm_mask;
257 cpu_clear(smp_processor_id(), cpu_mask);
258
259 if (current->active_mm == mm) {
260 if (current->mm)
261 __flush_tlb_one(va);
262 else
263 leave_mm(smp_processor_id());
264 }
265
266 if (!cpus_empty(cpu_mask))
267 flush_tlb_others(cpu_mask, mm, va);
268
269 preempt_enable();
270}
271
272static void do_flush_tlb_all(void *info)
273{
274 unsigned long cpu = smp_processor_id();
275
276 __flush_tlb_all();
277 if (read_pda(mmu_state) == TLBSTATE_LAZY)
278 leave_mm(cpu);
279}
280
281void flush_tlb_all(void)
282{
283 on_each_cpu(do_flush_tlb_all, NULL, 1);
284}
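
The 64-bit code removed here spreads that contention over NUM_INVALIDATE_TLB_VECTORS (8) per-sender flush_state slots: each sender hashes itself onto a slot with smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS and raises the matching vector, and the handler recovers the slot from the negated vector number saved in orig_ax. The small program below only checks that round-trip arithmetic; the 0xf0 base vector is an assumption made for the sketch, not something taken from this patch.

/*
 * Round-trip check of the sender-slot hashing used by the deleted
 * tlb_64.c: the sender picks slot = cpu % 8 and raises vector
 * START + slot; the handler recovers the slot from the bitwise-negated
 * vector saved in orig_ax.  The 0xf0 base is an assumption made for
 * this sketch.
 */
#include <assert.h>
#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS  8
#define INVALIDATE_TLB_VECTOR_START 0xf0    /* assumed base vector */

int main(void)
{
    int cpu;

    for (cpu = 0; cpu < 32; cpu++) {
        int sender = cpu % NUM_INVALIDATE_TLB_VECTORS;
        int vector = INVALIDATE_TLB_VECTOR_START + sender;
        long orig_ax = ~(long)vector;   /* as saved by the entry code */
        int slot = (int)(~orig_ax - INVALIDATE_TLB_VECTOR_START);

        assert(slot == sender);
        if (cpu < NUM_INVALIDATE_TLB_VECTORS)
            printf("cpu %2d -> vector %#x, slot %d\n",
                   cpu, (unsigned int)vector, slot);
    }
    return 0;
}

With more than eight CPUs several senders share a slot, so the per-slot tlbstate_lock still serializes them, but the contention is spread over eight cache-aligned slots instead of one global lock.
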
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index f885023167e0..89fce1b6d01f 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -11,6 +11,7 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12 12
13#include <asm/mmu_context.h> 13#include <asm/mmu_context.h>
14#include <asm/uv/uv.h>
14#include <asm/uv/uv_mmrs.h> 15#include <asm/uv/uv_mmrs.h>
15#include <asm/uv/uv_hub.h> 16#include <asm/uv/uv_hub.h>
16#include <asm/uv/uv_bau.h> 17#include <asm/uv/uv_bau.h>
@@ -200,6 +201,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
200 destination_timeouts = 0; 201 destination_timeouts = 0;
201 } 202 }
202 } 203 }
204 cpu_relax();
203 } 205 }
204 return FLUSH_COMPLETE; 206 return FLUSH_COMPLETE;
205} 207}
@@ -209,14 +211,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
209 * 211 *
210 * Send a broadcast and wait for a broadcast message to complete. 212 * Send a broadcast and wait for a broadcast message to complete.
211 * 213 *
212 * The cpumaskp mask contains the cpus the broadcast was sent to. 214 * The flush_mask contains the cpus the broadcast was sent to.
213 * 215 *
214 * Returns 1 if all remote flushing was done. The mask is zeroed. 216 * Returns NULL if all remote flushing was done. The mask is zeroed.
215 * Returns 0 if some remote flushing remains to be done. The mask is left 217 * Returns @flush_mask if some remote flushing remains to be done. The
216 * unchanged. 218 * mask will have some bits still set.
217 */ 219 */
218int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, 220const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
219 cpumask_t *cpumaskp) 221 struct bau_desc *bau_desc,
222 struct cpumask *flush_mask)
220{ 223{
221 int completion_status = 0; 224 int completion_status = 0;
222 int right_shift; 225 int right_shift;
@@ -263,59 +266,69 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
263 * Success, so clear the remote cpu's from the mask so we don't 266 * Success, so clear the remote cpu's from the mask so we don't
264 * use the IPI method of shootdown on them. 267 * use the IPI method of shootdown on them.
265 */ 268 */
266 for_each_cpu_mask(bit, *cpumaskp) { 269 for_each_cpu(bit, flush_mask) {
267 blade = uv_cpu_to_blade_id(bit); 270 blade = uv_cpu_to_blade_id(bit);
268 if (blade == this_blade) 271 if (blade == this_blade)
269 continue; 272 continue;
270 cpu_clear(bit, *cpumaskp); 273 cpumask_clear_cpu(bit, flush_mask);
271 } 274 }
272 if (!cpus_empty(*cpumaskp)) 275 if (!cpumask_empty(flush_mask))
273 return 0; 276 return flush_mask;
274 return 1; 277 return NULL;
275} 278}
276 279
277/** 280/**
278 * uv_flush_tlb_others - globally purge translation cache of a virtual 281 * uv_flush_tlb_others - globally purge translation cache of a virtual
279 * address or all TLB's 282 * address or all TLB's
280 * @cpumaskp: mask of all cpu's in which the address is to be removed 283 * @cpumask: mask of all cpu's in which the address is to be removed
281 * @mm: mm_struct containing virtual address range 284 * @mm: mm_struct containing virtual address range
282 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) 285 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
286 * @cpu: the current cpu
283 * 287 *
284 * This is the entry point for initiating any UV global TLB shootdown. 288 * This is the entry point for initiating any UV global TLB shootdown.
285 * 289 *
286 * Purges the translation caches of all specified processors of the given 290 * Purges the translation caches of all specified processors of the given
287 * virtual address, or purges all TLB's on specified processors. 291 * virtual address, or purges all TLB's on specified processors.
288 * 292 *
289 * The caller has derived the cpumaskp from the mm_struct and has subtracted 293 * The caller has derived the cpumask from the mm_struct. This function
290 * the local cpu from the mask. This function is called only if there 294 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
291 * are bits set in the mask. (e.g. flush_tlb_page())
292 * 295 *
293 * The cpumaskp is converted into a nodemask of the nodes containing 296 * The cpumask is converted into a nodemask of the nodes containing
294 * the cpus. 297 * the cpus.
295 * 298 *
296 * Returns 1 if all remote flushing was done. 299 * Note that this function should be called with preemption disabled.
297 * Returns 0 if some remote flushing remains to be done. 300 *
301 * Returns NULL if all remote flushing was done.
302 * Returns pointer to cpumask if some remote flushing remains to be
303 * done. The returned pointer is valid till preemption is re-enabled.
298 */ 304 */
299int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, 305const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
300 unsigned long va) 306 struct mm_struct *mm,
307 unsigned long va, unsigned int cpu)
301{ 308{
309 static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
310 struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
302 int i; 311 int i;
303 int bit; 312 int bit;
304 int blade; 313 int blade;
305 int cpu; 314 int uv_cpu;
306 int this_blade; 315 int this_blade;
307 int locals = 0; 316 int locals = 0;
308 struct bau_desc *bau_desc; 317 struct bau_desc *bau_desc;
309 318
310 cpu = uv_blade_processor_id(); 319 WARN_ON(!in_atomic());
320
321 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
322
323 uv_cpu = uv_blade_processor_id();
311 this_blade = uv_numa_blade_id(); 324 this_blade = uv_numa_blade_id();
312 bau_desc = __get_cpu_var(bau_control).descriptor_base; 325 bau_desc = __get_cpu_var(bau_control).descriptor_base;
313 bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; 326 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
314 327
315 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 328 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
316 329
317 i = 0; 330 i = 0;
318 for_each_cpu_mask(bit, *cpumaskp) { 331 for_each_cpu(bit, flush_mask) {
319 blade = uv_cpu_to_blade_id(bit); 332 blade = uv_cpu_to_blade_id(bit);
320 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); 333 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
321 if (blade == this_blade) { 334 if (blade == this_blade) {
@@ -330,17 +343,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
330 * no off_node flushing; return status for local node 343 * no off_node flushing; return status for local node
331 */ 344 */
332 if (locals) 345 if (locals)
333 return 0; 346 return flush_mask;
334 else 347 else
335 return 1; 348 return NULL;
336 } 349 }
337 __get_cpu_var(ptcstats).requestor++; 350 __get_cpu_var(ptcstats).requestor++;
338 __get_cpu_var(ptcstats).ntargeted += i; 351 __get_cpu_var(ptcstats).ntargeted += i;
339 352
340 bau_desc->payload.address = va; 353 bau_desc->payload.address = va;
341 bau_desc->payload.sending_cpu = smp_processor_id(); 354 bau_desc->payload.sending_cpu = cpu;
342 355
343 return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); 356 return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
344} 357}
345 358
346/* 359/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 98c2d055284b..ed5aee5f3fcc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -59,7 +59,6 @@
59#ifdef CONFIG_X86_64 59#ifdef CONFIG_X86_64
60#include <asm/pgalloc.h> 60#include <asm/pgalloc.h>
61#include <asm/proto.h> 61#include <asm/proto.h>
62#include <asm/pda.h>
63#else 62#else
64#include <asm/processor-flags.h> 63#include <asm/processor-flags.h>
65#include <asm/arch_hooks.h> 64#include <asm/arch_hooks.h>
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 23206ba16874..1d3302cc2ddf 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -858,7 +858,7 @@ void __init vmi_init(void)
858#endif 858#endif
859} 859}
860 860
861void vmi_activate(void) 861void __init vmi_activate(void)
862{ 862{
863 unsigned long flags; 863 unsigned long flags;
864 864
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 82c67559dde7..3eba7f7bac05 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -178,14 +178,7 @@ SECTIONS
178 __initramfs_end = .; 178 __initramfs_end = .;
179 } 179 }
180#endif 180#endif
181 . = ALIGN(PAGE_SIZE); 181 PERCPU(PAGE_SIZE)
182 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
183 __per_cpu_start = .;
184 *(.data.percpu.page_aligned)
185 *(.data.percpu)
186 *(.data.percpu.shared_aligned)
187 __per_cpu_end = .;
188 }
189 . = ALIGN(PAGE_SIZE); 182 . = ALIGN(PAGE_SIZE);
190 /* freed after init ends here */ 183 /* freed after init ends here */
191 184
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 1a614c0e6bef..c9740996430a 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,6 +5,7 @@
5#define LOAD_OFFSET __START_KERNEL_map 5#define LOAD_OFFSET __START_KERNEL_map
6 6
7#include <asm-generic/vmlinux.lds.h> 7#include <asm-generic/vmlinux.lds.h>
8#include <asm/asm-offsets.h>
8#include <asm/page.h> 9#include <asm/page.h>
9 10
10#undef i386 /* in case the preprocessor is a 32bit one */ 11#undef i386 /* in case the preprocessor is a 32bit one */
@@ -13,12 +14,14 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
13OUTPUT_ARCH(i386:x86-64) 14OUTPUT_ARCH(i386:x86-64)
14ENTRY(phys_startup_64) 15ENTRY(phys_startup_64)
15jiffies_64 = jiffies; 16jiffies_64 = jiffies;
16_proxy_pda = 1;
17PHDRS { 17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */ 18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */ 19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */ 20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */ 21 data.init PT_LOAD FLAGS(7); /* RWE */
22#ifdef CONFIG_SMP
23 percpu PT_LOAD FLAGS(7); /* RWE */
24#endif
22 note PT_NOTE FLAGS(0); /* ___ */ 25 note PT_NOTE FLAGS(0); /* ___ */
23} 26}
24SECTIONS 27SECTIONS
@@ -208,14 +211,28 @@ SECTIONS
208 __initramfs_end = .; 211 __initramfs_end = .;
209#endif 212#endif
210 213
214#ifdef CONFIG_SMP
215 /*
216 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
217 * output PHDR, so the next output section - __data_nosave - should
218 * switch it back to data.init. Also, pda should be at the head of
219 * percpu area. Preallocate it and define the percpu offset symbol
220 * so that it can be accessed as a percpu variable.
221 */
222 . = ALIGN(PAGE_SIZE);
223 PERCPU_VADDR(0, :percpu)
224#else
211 PERCPU(PAGE_SIZE) 225 PERCPU(PAGE_SIZE)
226#endif
212 227
213 . = ALIGN(PAGE_SIZE); 228 . = ALIGN(PAGE_SIZE);
214 __init_end = .; 229 __init_end = .;
215 230
216 . = ALIGN(PAGE_SIZE); 231 . = ALIGN(PAGE_SIZE);
217 __nosave_begin = .; 232 __nosave_begin = .;
218 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } 233 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
234 *(.data.nosave)
235 } :data.init /* switch back to data.init, see PERCPU_VADDR() above */
219 . = ALIGN(PAGE_SIZE); 236 . = ALIGN(PAGE_SIZE);
220 __nosave_end = .; 237 __nosave_end = .;
221 238
@@ -244,3 +261,8 @@ SECTIONS
244 */ 261 */
245ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), 262ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
246 "kernel image bigger than KERNEL_IMAGE_SIZE") 263 "kernel image bigger than KERNEL_IMAGE_SIZE")
264
265#ifdef CONFIG_SMP
266ASSERT((per_cpu__irq_stack_union == 0),
267 "irq_stack_union is not at start of per-cpu area");
268#endif
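
The comment and ASSERT added to vmlinux_64.lds.S encode two requirements: on SMP the per-cpu section is linked at virtual address 0 (PERCPU_VADDR(0, :percpu)), so every per-cpu symbol's link address is simply its offset into the area, and irq_stack_union must be the first object so that its offset is 0, mirroring the comment's requirement that the pda replacement sit at the head of the per-cpu area. The toy program below models only that addressing scheme; the names, sizes and offsets are invented for the sketch.

/*
 * Toy model of the zero-based per-cpu addressing described in the
 * PERCPU_VADDR() comment above.  Every "link address" is just an
 * offset into the per-cpu area; the real kernel adds it to a per-cpu
 * base held in %gs.  Names, sizes and offsets are invented for the
 * sketch.
 */
#include <stdio.h>
#include <stdlib.h>

#define NCPUS       4
#define PERCPU_SIZE 64

/* zero-based offsets inside the per-cpu section */
enum { off_irq_stack_union = 0, off_tlbstate = 32 };

static char *percpu_base[NCPUS];        /* what %gs would hold per cpu */

static void *toy_per_cpu_ptr(int cpu, int off)
{
    return percpu_base[cpu] + off;      /* base + zero-based offset */
}

int main(void)
{
    int cpu;

    for (cpu = 0; cpu < NCPUS; cpu++) {
        percpu_base[cpu] = calloc(1, PERCPU_SIZE);
        /* what the linker-script ASSERT checks: first object at offset 0 */
        if (toy_per_cpu_ptr(cpu, off_irq_stack_union) != (void *)percpu_base[cpu])
            abort();
        sprintf(toy_per_cpu_ptr(cpu, off_tlbstate), "cpu%d state", cpu);
    }
    for (cpu = 0; cpu < NCPUS; cpu++) {
        printf("%s at base+%d\n",
               (char *)toy_per_cpu_ptr(cpu, off_tlbstate), off_tlbstate);
        free(percpu_base[cpu]);
    }
    return 0;
}

In the real kernel the per-cpu base lives in %gs rather than an array, but the arithmetic is the same: base plus zero-based offset, with the ASSERT guaranteeing that the first offset really is 0.
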
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 695e426aa354..3909e3ba5ce3 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
58EXPORT_SYMBOL(empty_zero_page); 58EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt); 59EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 60EXPORT_SYMBOL(load_gs_index);
61
62EXPORT_SYMBOL(_proxy_pda);