Diffstat (limited to 'arch/x86/kernel')
27 files changed, 149 insertions, 692 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d364df03c1d6..a99437c965cc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,6 +23,7 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o := $(nostackp)
 CFLAGS_tsc.o := $(nostackp)
+CFLAGS_paravirt.o := $(nostackp)
 
 obj-y := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
@@ -57,7 +58,7 @@ obj-$(CONFIG_PCI) += early-quirks.o
 apm-y := apm_32.o
 obj-$(CONFIG_APM) += apm.o
 obj-$(CONFIG_X86_SMP) += smp.o
-obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o
+obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o
 obj-$(CONFIG_X86_32_SMP) += smpcommon.o
 obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
@@ -114,10 +115,11 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-        obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
-        obj-y += bios_uv.o uv_irq.o uv_sysfs.o
+        obj-y += genapic_64.o genapic_flat_64.o
         obj-y += genx2apic_cluster.o
         obj-y += genx2apic_phys.o
+        obj-$(CONFIG_X86_UV) += genx2apic_uv_x.o tlb_uv.o
+        obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_sysfs.o
         obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
         obj-$(CONFIG_AUDIT) += audit_64.o
 
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index e9af14f748ea..7b434e5b14c9 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -1132,7 +1132,9 @@ void __cpuinit setup_local_APIC(void)
 	int i, j;
 
 	if (disable_apic) {
+#ifdef CONFIG_X86_IO_APIC
 		disable_ioapic_setup();
+#endif
 		return;
 	}
 
@@ -1844,6 +1846,11 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	num_processors++;
 	cpu = cpumask_next_zero(-1, cpu_present_mask);
 
+	if (version != apic_version[boot_cpu_physical_apicid])
+		WARN_ONCE(1,
+			"ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
+			apic_version[boot_cpu_physical_apicid], cpu, version);
+
 	physid_set(apicid, phys_cpu_present_map);
 	if (apicid == boot_cpu_physical_apicid) {
 		/*
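Annotation: the WARN_ONCE() added to generic_processor_info() fires at most once per boot, so a machine with many mismatched CPUs logs a single warning with a backtrace rather than one per CPU. A minimal sketch of the pattern, using the same apic_version[] array as the hunk (the helper name is made up for illustration):

	/*
	 * Warn (once, with a backtrace) when a secondary CPU reports a
	 * different APIC version than the boot CPU did.
	 */
	static void note_apic_version(int cpu, int version, int boot_version)
	{
		WARN_ONCE(version != boot_version,
			  "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
			  boot_version, cpu, version);
	}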
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 64c834a39aa8..8793ab33e2c1 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -11,7 +11,6 @@
 #include <linux/hardirq.h>
 #include <linux/suspend.h>
 #include <linux/kbuild.h>
-#include <asm/pda.h>
 #include <asm/processor.h>
 #include <asm/segment.h>
 #include <asm/thread_info.h>
@@ -48,10 +47,6 @@ int main(void)
 #endif
 	BLANK();
 #undef ENTRY
-#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
-	DEFINE(pda_size, sizeof(struct x8664_pda));
-	BLANK();
-#undef ENTRY
 #ifdef CONFIG_PARAVIRT
 	BLANK();
 	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 95eb30e1e677..6fd316689c47 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -29,9 +29,9 @@
 #include <asm/apic.h>
 #include <mach_apic.h>
 #include <asm/genapic.h>
+#include <asm/uv/uv.h>
 #endif
 
-#include <asm/pda.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
@@ -65,23 +65,23 @@ cpumask_t cpu_sibling_setup_map;
 
 static struct cpu_dev *this_cpu __cpuinitdata;
 
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types  kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-/* The TLS descriptors are currently at a different place compared to i386.
-   Hopefully nobody expects them at a fixed place (Wine?) */
-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+	/*
+	 * We need valid kernel segments for data and code in long mode too
+	 * IRET will check the segment types  kkeil 2000/10/28
+	 * Also sysret mandates a special GDT layout
+	 *
+	 * The TLS descriptors are currently at a different place compared to i386.
+	 * Hopefully nobody expects them at a fixed place (Wine?)
+	 */
 	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
 	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
 	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
 	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
 	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
 	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
-} };
 #else
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
 	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
 	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -113,9 +113,9 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
 
 	[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
-	[GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
-} };
+	[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
 #endif
+} };
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
 #ifdef CONFIG_X86_32
@@ -883,12 +883,13 @@ __setup("clearcpuid=", setup_disablecpuid);
 #ifdef CONFIG_X86_64
 struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
-DEFINE_PER_CPU_PAGE_ALIGNED(char[IRQ_STACK_SIZE], irq_stack);
+DEFINE_PER_CPU_FIRST(union irq_stack_union,
+		     irq_stack_union) __aligned(PAGE_SIZE);
 #ifdef CONFIG_SMP
 DEFINE_PER_CPU(char *, irq_stack_ptr);	/* will be set during per cpu init */
 #else
 DEFINE_PER_CPU(char *, irq_stack_ptr) =
-	per_cpu_var(irq_stack) + IRQ_STACK_SIZE - 64;
+	per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
 #endif
 
 DEFINE_PER_CPU(unsigned long, kernel_stack) =
@@ -897,15 +898,6 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
-void __cpuinit pda_init(int cpu)
-{
-	/* Setup up data that may be needed in __get_free_pages early */
-	loadsegment(fs, 0);
-	loadsegment(gs, 0);
-
-	load_pda_offset(cpu);
-}
-
 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
 	__aligned(PAGE_SIZE);
@@ -969,9 +961,9 @@ void __cpuinit cpu_init(void)
 	struct task_struct *me;
 	int i;
 
-	/* CPU 0 is initialised in head64.c */
-	if (cpu != 0)
-		pda_init(cpu);
+	loadsegment(fs, 0);
+	loadsegment(gs, 0);
+	load_gs_base(cpu);
 
 #ifdef CONFIG_NUMA
 	if (cpu != 0 && percpu_read(node_number) == 0 &&
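Annotation: the common.c hunks above replace the old x8664_pda with a per-cpu area whose very first object is irq_stack_union, so %gs can double as the per-cpu base. A rough sketch of the union's shape (the real definition lives in a header; the layout shown is illustrative): the first 40 bytes are reserved so the stack-protector canary lands at the fixed %gs:40 slot gcc's -fstack-protector code expects, and the same storage doubles as the bottom of the per-cpu IRQ stack.

	union irq_stack_union {
		char irq_stack[IRQ_STACK_SIZE];
		struct {
			char gs_base[40];		/* pad: %gs base points here */
			unsigned long stack_canary;	/* gcc reads this at %gs:40 */
		};
	};

DEFINE_PER_CPU_FIRST() then guarantees this union sits at offset 0 of each CPU's per-cpu area.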
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8f3c95c7e61f..4b1c319d30c3 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -145,13 +145,14 @@ typedef union {
 
 struct drv_cmd {
 	unsigned int type;
-	cpumask_var_t mask;
+	const struct cpumask *mask;
 	drv_addr_union addr;
 	u32 val;
 };
 
-static void do_drv_read(struct drv_cmd *cmd)
+static long do_drv_read(void *_cmd)
 {
+	struct drv_cmd *cmd = _cmd;
 	u32 h;
 
 	switch (cmd->type) {
@@ -166,10 +167,12 @@ static void do_drv_read(struct drv_cmd *cmd)
 	default:
 		break;
 	}
+	return 0;
 }
 
-static void do_drv_write(struct drv_cmd *cmd)
+static long do_drv_write(void *_cmd)
 {
+	struct drv_cmd *cmd = _cmd;
 	u32 lo, hi;
 
 	switch (cmd->type) {
@@ -186,30 +189,23 @@ static void do_drv_write(struct drv_cmd *cmd)
 	default:
 		break;
 	}
+	return 0;
 }
 
 static void drv_read(struct drv_cmd *cmd)
 {
-	cpumask_t saved_mask = current->cpus_allowed;
 	cmd->val = 0;
 
-	set_cpus_allowed_ptr(current, cmd->mask);
-	do_drv_read(cmd);
-	set_cpus_allowed_ptr(current, &saved_mask);
+	work_on_cpu(cpumask_any(cmd->mask), do_drv_read, cmd);
 }
 
 static void drv_write(struct drv_cmd *cmd)
 {
-	cpumask_t saved_mask = current->cpus_allowed;
 	unsigned int i;
 
 	for_each_cpu(i, cmd->mask) {
-		set_cpus_allowed_ptr(current, cpumask_of(i));
-		do_drv_write(cmd);
+		work_on_cpu(i, do_drv_write, cmd);
 	}
-
-	set_cpus_allowed_ptr(current, &saved_mask);
-	return;
 }
 
 static u32 get_cur_val(const struct cpumask *mask)
@@ -235,6 +231,7 @@ static u32 get_cur_val(const struct cpumask *mask)
 		return 0;
 	}
 
+	cmd.mask = mask;
 	drv_read(&cmd);
 
 	dprintk("get_cur_val = %u\n", cmd.val);
@@ -366,7 +363,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 	return freq;
 }
 
-static unsigned int check_freqs(const cpumask_t *mask, unsigned int freq,
+static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
 				struct acpi_cpufreq_data *data)
 {
 	unsigned int cur_freq;
@@ -401,9 +398,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		return -ENODEV;
 	}
 
-	if (unlikely(!alloc_cpumask_var(&cmd.mask, GFP_KERNEL)))
-		return -ENOMEM;
-
 	perf = data->acpi_data;
 	result = cpufreq_frequency_table_target(policy,
 						data->freq_table,
@@ -448,9 +442,9 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 
 	/* cpufreq holds the hotplug lock, so we are safe from here on */
 	if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
-		cpumask_and(cmd.mask, cpu_online_mask, policy->cpus);
+		cmd.mask = policy->cpus;
 	else
-		cpumask_copy(cmd.mask, cpumask_of(policy->cpu));
+		cmd.mask = cpumask_of(policy->cpu);
 
 	freqs.old = perf->states[perf->state].core_frequency * 1000;
 	freqs.new = data->freq_table[next_state].frequency;
@@ -477,7 +471,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 	perf->state = next_perf_state;
 
 out:
-	free_cpumask_var(cmd.mask);
 	return result;
 }
 
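Annotation: the acpi-cpufreq conversion above drops the old trick of rewriting current->cpus_allowed to migrate the caller onto the target CPU; instead the register access is packaged as a callback and handed to work_on_cpu(), which runs it from the target CPU's workqueue and waits for completion. A standalone sketch of the pattern under stated assumptions (read_perf_status() is a hypothetical callback; the MSR access loosely mirrors the driver's MSR-capable path):

	static long read_perf_status(void *data)
	{
		u32 lo, hi;

		/* runs on the CPU that was passed to work_on_cpu() */
		rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
		*(u32 *)data = lo;
		return 0;
	}

	static u32 sample_perf_status(unsigned int cpu)
	{
		u32 val = 0;

		work_on_cpu(cpu, read_perf_status, &val);	/* queues and waits */
		return val;
	}

Because work_on_cpu() sleeps, it is only usable from process context, which drv_read() and drv_write() are.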
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 4b48f251fd39..5e8c79e748a6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -7,6 +7,7 @@
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <asm/processor.h>
+#include <asm/apic.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
 #include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1119d247fe11..b205272ad394 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -366,10 +366,12 @@ void __init efi_init(void)
 					SMBIOS_TABLE_GUID)) {
 			efi.smbios = config_tables[i].table;
 			printk(" SMBIOS=0x%lx ", config_tables[i].table);
+#ifdef CONFIG_X86_UV
 		} else if (!efi_guidcmp(config_tables[i].guid,
 					UV_SYSTEM_TABLE_GUID)) {
 			efi.uv_systab = config_tables[i].table;
 			printk(" UVsystab=0x%lx ", config_tables[i].table);
+#endif
 		} else if (!efi_guidcmp(config_tables[i].guid,
 					HCDP_TABLE_GUID)) {
 			efi.hcdp = config_tables[i].table;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 652c5287215f..a4ee29127fdf 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -36,6 +36,7 @@
 #include <asm/proto.h>
 #include <asm/efi.h>
 #include <asm/cacheflush.h>
+#include <asm/fixmap.h>
 
 static pgd_t save_pgd __initdata;
 static unsigned long efi_flags __initdata;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 46469029e9d3..a0b91aac72a1 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -672,7 +672,7 @@ common_interrupt:
 ENDPROC(common_interrupt)
 	CFI_ENDPROC
 
-#define BUILD_INTERRUPT(name, nr)	\
+#define BUILD_INTERRUPT3(name, nr, fn)	\
 ENTRY(name)				\
 	RING0_INT_FRAME;		\
 	pushl $~(nr);			\
@@ -680,11 +680,13 @@ ENTRY(name)				\
 	SAVE_ALL;			\
 	TRACE_IRQS_OFF			\
 	movl %esp,%eax;			\
-	call smp_##name;		\
+	call fn;			\
 	jmp ret_from_intr;		\
 	CFI_ENDPROC;			\
 ENDPROC(name)
 
+#define BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(name, nr, smp_##name)
+
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
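Annotation: BUILD_INTERRUPT3 exists so that a family of stubs can share one C handler instead of each stub calling smp_<name>. A plausible usage sketch (in entry_arch.h, consistent with the irqinit_32.c hunk below that registers eight invalidate stubs; the exact lines are assumed, not quoted from the commit):

	#ifdef CONFIG_SMP
	BUILD_INTERRUPT3(invalidate_interrupt0, INVALIDATE_TLB_VECTOR_START+0,
			 smp_invalidate_interrupt)
	BUILD_INTERRUPT3(invalidate_interrupt1, INVALIDATE_TLB_VECTOR_START+1,
			 smp_invalidate_interrupt)
	/* ... and so on through invalidate_interrupt7 */
	#endif

That is, eight entry points that all funnel into smp_invalidate_interrupt(), which recovers the sender slot from the vector number.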
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c092e7d2686d..eb0a0703f4c9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -982,8 +982,10 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
 	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
 #endif
 
+#ifdef CONFIG_X86_UV
 apicinterrupt UV_BAU_MESSAGE \
 	uv_bau_message_intr1 uv_bau_message_interrupt
+#endif
 apicinterrupt LOCAL_TIMER_VECTOR \
 	apic_timer_interrupt smp_apic_timer_interrupt
 
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 2bced78b0b8e..e656c2721154 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -32,7 +32,9 @@ extern struct genapic apic_x2apic_cluster;
 struct genapic __read_mostly *genapic = &apic_flat;
 
 static struct genapic *apic_probe[] __initdata = {
+#ifdef CONFIG_X86_UV
 	&apic_x2apic_uv_x,
+#endif
 	&apic_x2apic_phys,
 	&apic_x2apic_cluster,
 	&apic_physflat,
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index b193e082f6ce..bfe36249145c 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -25,6 +25,7 @@
 #include <asm/ipi.h>
 #include <asm/genapic.h>
 #include <asm/pgtable.h>
+#include <asm/uv/uv.h>
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/bios.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index af67d3227ea6..f5b272247690 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -91,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
-	pda_init(0);
-
 	x86_64_start_reservations(real_mode_data);
 }
 
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index e835b4eea70b..24c0e5cd71e3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -429,12 +429,14 @@ is386:	movl $2,%ecx		# set MP
 	ljmp $(__KERNEL_CS),$1f
 1:	movl $(__KERNEL_DS),%eax	# reload all the segment registers
 	movl %eax,%ss			# after changing gdt.
-	movl %eax,%fs			# gets reset once there's real percpu
 
 	movl $(__USER_DS),%eax		# DS/ES contains default USER segment
 	movl %eax,%ds
 	movl %eax,%es
 
+	movl $(__KERNEL_PERCPU), %eax
+	movl %eax,%fs			# set this cpu's percpu
+
 	xorl %eax,%eax			# Clear GS and LDT
 	movl %eax,%gs
 	lldt %ax
@@ -446,8 +448,6 @@ is386:	movl $2,%ecx		# set MP
 	movb $1, ready
 	cmpb $0,%cl			# the first CPU calls start_kernel
 	je 1f
-	movl $(__KERNEL_PERCPU), %eax
-	movl %eax,%fs			# set this cpu's percpu
 	movl (stack_start), %esp
 1:
 #endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index c8ace880661b..a0a2b5ca9b7d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -207,19 +207,15 @@ ENTRY(secondary_startup_64)
 
 #ifdef CONFIG_SMP
 	/*
-	 * early_gdt_base should point to the gdt_page in static percpu init
-	 * data area.  Computing this requires two symbols - __per_cpu_load
-	 * and per_cpu__gdt_page.  As linker can't do no such relocation, do
-	 * it by hand.  As early_gdt_descr is manipulated by C code for
-	 * secondary CPUs, this should be done only once for the boot CPU
-	 * when early_gdt_descr_base contains zero.
+	 * Fix up static pointers that need __per_cpu_load added.  The assembler
+	 * is unable to do this directly.  This is only needed for the boot cpu.
+	 * These values are set up with the correct base addresses by C code for
+	 * secondary cpus.
 	 */
-	movq	early_gdt_descr_base(%rip), %rax
-	testq	%rax, %rax
-	jnz	1f
-	movq	$__per_cpu_load, %rax
-	addq	$per_cpu__gdt_page, %rax
-	movq	%rax, early_gdt_descr_base(%rip)
+	movq	initial_gs(%rip), %rax
+	cmpl	$0, per_cpu__cpu_number(%rax)
+	jne	1f
+	addq	%rax, early_gdt_descr_base(%rip)
 1:
 #endif
 	/*
@@ -246,13 +242,10 @@ ENTRY(secondary_startup_64)
 
 	/* Set up %gs.
 	 *
-	 * On SMP, %gs should point to the per-cpu area.  For initial
-	 * boot, make %gs point to the init data section.  For a
-	 * secondary CPU,initial_gs should be set to its pda address
-	 * before the CPU runs this code.
-	 *
-	 * On UP, initial_gs points to PER_CPU_VAR(__pda) and doesn't
-	 * change.
+	 * The base of %gs always points to the bottom of the irqstack
+	 * union.  If the stack protector canary is enabled, it is
+	 * located at %gs:40.  Note that, on SMP, the boot cpu uses
+	 * init data section till per cpu areas are set up.
 	 */
 	movl	$MSR_GS_BASE,%ecx
 	movq	initial_gs(%rip),%rax
@@ -285,7 +278,7 @@ ENTRY(secondary_startup_64)
 #ifdef CONFIG_SMP
 	.quad	__per_cpu_load
 #else
-	.quad	PER_CPU_VAR(__pda)
+	.quad	PER_CPU_VAR(irq_stack_union)
 #endif
 	__FINITDATA
 
@@ -431,12 +424,8 @@ NEXT_PAGE(level2_spare_pgt)
 	.globl early_gdt_descr
 early_gdt_descr:
 	.word	GDT_ENTRIES*8-1
-#ifdef CONFIG_SMP
 early_gdt_descr_base:
-	.quad	0x0000000000000000
-#else
 	.quad	per_cpu__gdt_page
-#endif
 
 ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index f79660390724..e4d36bd56b62 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -3765,7 +3765,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 }
 #endif /* CONFIG_HT_IRQ */
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_UV
 /*
  * Re-target the irq to the specified CPU and enable the specified MMR located
  * on the specified blade to allow the sending of MSIs to the specified CPU.
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1db05247b47f..018963aa6ee3 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,10 +18,14 @@
 #include <linux/smp.h>
 #include <asm/io_apic.h>
 #include <asm/idle.h>
+#include <asm/apic.h>
 
 DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
 EXPORT_PER_CPU_SYMBOL(irq_stat);
 
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
 /*
  * Probabilistic stack overflow check:
  *
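Annotation: with the PDA gone, irq_regs becomes an ordinary per-cpu pointer (added above), and the accessors built on top of it reduce to single %gs-relative moves. A sketch of what they look like after the conversion (the real versions live in asm/irq_regs.h; shown here for orientation, not quoted from the commit):

	static inline struct pt_regs *get_irq_regs(void)
	{
		return percpu_read(irq_regs);
	}

	static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
	{
		struct pt_regs *old_regs = get_irq_regs();

		percpu_write(irq_regs, new_regs);
		return old_regs;
	}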
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 0bef6280f30c..c56496f8c6fc 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -149,8 +149,15 @@ void __init native_init_IRQ(void)
 	 */
 	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
-	/* IPI for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+	/* IPIs for invalidation */
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
 
 	/* IPI for generic function call */
 	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
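Annotation: the eight gates registered above give 32-bit the same scheme 64-bit already uses (visible in the deleted tlb_64.c further down): each flushing CPU hashes itself onto one of eight vectors, and the handler inverts the mapping to find the sender's flush data. A compact sketch of both directions, using the same constants as the hunk (the helper names are illustrative):

	static int pick_invalidate_vector(void)
	{
		/* sender side: hash this CPU onto one of the 8 slots */
		int sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;

		return INVALIDATE_TLB_VECTOR_START + sender;
	}

	static int vector_to_sender(struct pt_regs *regs)
	{
		/* handler side: orig_ax holds the negated vector number */
		return ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
	}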
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 2c00a57ccb90..1a1ae8edc40c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -108,7 +108,6 @@ void cpu_idle(void)
 			play_dead();
 
 		local_irq_disable();
-		__get_cpu_var(irq_stat).idle_timestamp = jiffies;
 		/* Don't trace irqs off for idle */
 		stop_critical_timings();
 		pm_idle();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4523ff88a69d..c422eebb0c58 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -16,6 +16,7 @@
 
 #include <stdarg.h>
 
+#include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
@@ -46,7 +47,6 @@
 #include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/mmu_context.h>
-#include <asm/pda.h>
 #include <asm/prctl.h>
 #include <asm/desc.h>
 #include <asm/proto.h>
@@ -117,6 +117,17 @@ static inline void play_dead(void)
 void cpu_idle(void)
 {
 	current_thread_info()->status |= TS_POLLING;
+
+	/*
+	 * If we're the non-boot CPU, nothing set the PDA stack
+	 * canary up for us - and if we are the boot CPU we have
+	 * a 0 stack canary. This is a good place for updating
+	 * it, as we wont ever return from this function (so the
+	 * invalid canaries already on the stack wont ever
+	 * trigger):
+	 */
+	boot_init_stack_canary();
+
 	/* endless idle loop with no priority at all */
 	while (1) {
 		tick_nohz_stop_sched_tick(1);
@@ -626,14 +637,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	percpu_write(kernel_stack,
 		     (unsigned long)task_stack_page(next_p) +
 		     THREAD_SIZE - KERNEL_STACK_OFFSET);
-#ifdef CONFIG_CC_STACKPROTECTOR
-	write_pda(stack_canary, next_p->stack_canary);
-	/*
-	 * Build time only check to make sure the stack_canary is at
-	 * offset 40 in the pda; this is a gcc ABI requirement
-	 */
-	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
-#endif
 
 	/*
 	 * Now maybe reload the debug registers and handle I/O bitmaps
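Annotation: the cpu_idle() hunk above seeds the canary from a function that never returns, so stale canaries further up the abandoned stack can never be checked against the new value. A rough sketch of what boot_init_stack_canary() does in this series (the real helper sits in asm/stackprotector.h; the body here is an assumption for illustration): mix some randomness with TSC bits, then store the result both in the task and in the per-cpu slot gcc reads at %gs:40.

	static __always_inline void init_canary_sketch(void)
	{
		u64 canary, tsc;

		/* illustrative; the exact mixing is the header's business */
		get_random_bytes(&canary, sizeof(canary));
		tsc = native_read_tsc();
		canary += tsc + (tsc << 32UL);	/* fold in timing entropy */

		current->stack_canary = canary;
		percpu_write(irq_stack_union.stack_canary, canary);
	}

The canary's home thus moves from pda.stack_canary to irq_stack_union.stack_canary, preserving the gcc-mandated offset-40 slot that the deleted BUILD_BUG_ON in __switch_to() used to assert for the PDA.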
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index efbafbbff584..90b8e154bb53 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -77,30 +77,6 @@ static void __init setup_node_to_cpumask_map(void);
 static inline void setup_node_to_cpumask_map(void) { }
 #endif
 
-/*
- * Define load_pda_offset() and per-cpu __pda for x86_64.
- * load_pda_offset() is responsible for loading the offset of pda into
- * %gs.
- *
- * On SMP, pda offset also duals as percpu base address and thus it
- * should be at the start of per-cpu area.  To achieve this, it's
- * preallocated in vmlinux_64.lds.S directly instead of using
- * DEFINE_PER_CPU().
- */
-#ifdef CONFIG_X86_64
-void __cpuinit load_pda_offset(int cpu)
-{
-	/* Memory clobbers used to order pda/percpu accesses */
-	mb();
-	wrmsrl(MSR_GS_BASE, cpu_pda(cpu));
-	mb();
-}
-#ifndef CONFIG_SMP
-DEFINE_PER_CPU(struct x8664_pda, __pda);
-#endif
-EXPORT_PER_CPU_SYMBOL(__pda);
-#endif /* CONFIG_SMP && CONFIG_X86_64 */
-
 #ifdef CONFIG_X86_64
 
 /* correctly size the local cpu masks */
@@ -207,15 +183,13 @@ void __init setup_per_cpu_areas(void)
 		per_cpu(cpu_number, cpu) = cpu;
 #ifdef CONFIG_X86_64
 		per_cpu(irq_stack_ptr, cpu) =
-			(char *)per_cpu(irq_stack, cpu) + IRQ_STACK_SIZE - 64;
+			per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64;
 		/*
-		 * CPU0 modified pda in the init data area, reload pda
-		 * offset for CPU0 and clear the area for others.
+		 * Up to this point, CPU0 has been using .data.init
+		 * area.  Reload %gs offset for CPU0.
 		 */
 		if (cpu == 0)
-			load_pda_offset(0);
-		else
-			memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu)));
+			load_gs_base(cpu);
#endif
 
 		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
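Annotation: setup_per_cpu_areas() now only has to point %gs at the per-cpu area; load_pda_offset() and the separate __pda object disappear. A sketch of the load_gs_base() replacement called above (the real function lands in cpu/common.c in this series; the body shown is an assumption, patterned on the deleted load_pda_offset()):

	void load_gs_base(int cpu)
	{
		/* memory clobbers to order percpu accesses, as before */
		mb();
		wrmsrl(MSR_GS_BASE,
		       (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
		mb();
	}

Since irq_stack_union is the first object in the per-cpu area, MSR_GS_BASE ends up holding the per-cpu base itself, which is exactly what the %gs-relative percpu_read()/percpu_write() ops assume.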
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 869b98840fd0..def770b57b5a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,6 +62,7 @@
 #include <asm/vmi.h>
 #include <asm/genapic.h>
 #include <asm/setup.h>
+#include <asm/uv/uv.h>
 #include <linux/mc146818rtc.h>
 
 #include <mach_apic.h>
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644
index abf0808d6fc4..000000000000
--- a/arch/x86/kernel/tlb_32.c
+++ /dev/null
@@ -1,239 +0,0 @@
-#include <linux/spinlock.h>
-#include <linux/cpu.h>
-#include <linux/interrupt.h>
-
-#include <asm/tlbflush.h>
-
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
-			= { &init_mm, 0, };
-
-/* must come after the send_IPI functions above for inlining */
-#include <mach_ipi.h>
-
-/*
- *	Smarter SMP flushing macros.
- *		c/o Linus Torvalds.
- *
- *	These mean you can really definitely utterly forget about
- *	writing to user space from interrupts. (Its not allowed anyway).
- *
- *	Optimizations Manfred Spraul <manfred@colorfullife.com>
- */
-
-static cpumask_var_t flush_cpumask;
-static struct mm_struct *flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- *
- * We need to reload %cr3 since the page tables may be going
- * away from under us..
- */
-void leave_mm(int cpu)
-{
-	BUG_ON(percpu_read(cpu_tlbstate.state) == TLBSTATE_OK);
-	cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
-	load_cr3(swapper_pg_dir);
-}
-EXPORT_SYMBOL_GPL(leave_mm);
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- *	Stop ipi delivery for the old mm. This is not synchronized with
- *	the other cpus, but smp_invalidate_interrupt ignore flush ipis
- *	for the wrong mm, and in the worst case we perform a superfluous
- *	tlb flush.
- * 1a2) set cpu_tlbstate to TLBSTATE_OK
- *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- *	was in lazy tlb mode.
- * 1a3) update cpu_tlbstate[].active_mm
- *	Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- *	Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- *	cpu_tlbstate[].active_mm is correct, cpu0 already handles
- *	flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- *	Atomically set the bit [other cpus will start sending flush ipis],
- *	and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- *   runs in kernel space, the cpu could load tlb entries for user space
- *   pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-
-void smp_invalidate_interrupt(struct pt_regs *regs)
-{
-	unsigned long cpu;
-
-	cpu = get_cpu();
-
-	if (!cpumask_test_cpu(cpu, flush_cpumask))
-		goto out;
-		/*
-		 * This was a BUG() but until someone can quote me the
-		 * line from the intel manual that guarantees an IPI to
-		 * multiple CPUs is retried _only_ on the erroring CPUs
-		 * its staying as a return
-		 *
-		 * BUG();
-		 */
-
-	if (flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
-		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
-			if (flush_va == TLB_FLUSH_ALL)
-				local_flush_tlb();
-			else
-				__flush_tlb_one(flush_va);
-		} else
-			leave_mm(cpu);
-	}
-	ack_APIC_irq();
-	smp_mb__before_clear_bit();
-	cpumask_clear_cpu(cpu, flush_cpumask);
-	smp_mb__after_clear_bit();
-out:
-	put_cpu_no_resched();
-	inc_irq_stat(irq_tlb_count);
-}
-
-void native_flush_tlb_others(const struct cpumask *cpumask,
-			     struct mm_struct *mm, unsigned long va)
-{
-	/*
-	 * - mask must exist :)
-	 */
-	BUG_ON(cpumask_empty(cpumask));
-	BUG_ON(!mm);
-
-	/*
-	 * i'm not happy about this global shared spinlock in the
-	 * MM hot path, but we'll see how contended it is.
-	 * AK: x86-64 has a faster method that could be ported.
-	 */
-	spin_lock(&tlbstate_lock);
-
-	cpumask_andnot(flush_cpumask, cpumask, cpumask_of(smp_processor_id()));
-#ifdef CONFIG_HOTPLUG_CPU
-	/* If a CPU which we ran on has gone down, OK. */
-	cpumask_and(flush_cpumask, flush_cpumask, cpu_online_mask);
-	if (unlikely(cpumask_empty(flush_cpumask))) {
-		spin_unlock(&tlbstate_lock);
-		return;
-	}
-#endif
-	flush_mm = mm;
-	flush_va = va;
-
-	/*
-	 * Make the above memory operations globally visible before
-	 * sending the IPI.
-	 */
-	smp_mb();
-	/*
-	 * We have to send the IPI only to
-	 * CPUs affected.
-	 */
-	send_IPI_mask(flush_cpumask, INVALIDATE_TLB_VECTOR);
-
-	while (!cpumask_empty(flush_cpumask))
-		/* nothing. lockup detection does not belong here */
-		cpu_relax();
-
-	flush_mm = NULL;
-	flush_va = 0;
-	spin_unlock(&tlbstate_lock);
-}
-
-void flush_tlb_current_task(void)
-{
-	struct mm_struct *mm = current->mm;
-
-	preempt_disable();
-
-	local_flush_tlb();
-	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
-	preempt_enable();
-}
-
-void flush_tlb_mm(struct mm_struct *mm)
-{
-
-	preempt_disable();
-
-	if (current->active_mm == mm) {
-		if (current->mm)
-			local_flush_tlb();
-		else
-			leave_mm(smp_processor_id());
-	}
-	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
-
-	preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	preempt_disable();
-
-	if (current->active_mm == mm) {
-		if (current->mm)
-			__flush_tlb_one(va);
-		else
-			leave_mm(smp_processor_id());
-	}
-
-	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(&mm->cpu_vm_mask, mm, va);
-	preempt_enable();
-}
-EXPORT_SYMBOL(flush_tlb_page);
-
-static void do_flush_tlb_all(void *info)
-{
-	unsigned long cpu = smp_processor_id();
-
-	__flush_tlb_all();
-	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
-		leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{
-	on_each_cpu(do_flush_tlb_all, NULL, 1);
-}
-
-static int init_flush_cpumask(void)
-{
-	alloc_cpumask_var(&flush_cpumask, GFP_KERNEL);
-	return 0;
-}
-early_initcall(init_flush_cpumask);
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c deleted file mode 100644 index e64a32c48825..000000000000 --- a/arch/x86/kernel/tlb_64.c +++ /dev/null | |||
@@ -1,294 +0,0 @@ | |||
1 | #include <linux/init.h> | ||
2 | |||
3 | #include <linux/mm.h> | ||
4 | #include <linux/delay.h> | ||
5 | #include <linux/spinlock.h> | ||
6 | #include <linux/smp.h> | ||
7 | #include <linux/kernel_stat.h> | ||
8 | #include <linux/mc146818rtc.h> | ||
9 | #include <linux/interrupt.h> | ||
10 | |||
11 | #include <asm/mtrr.h> | ||
12 | #include <asm/pgalloc.h> | ||
13 | #include <asm/tlbflush.h> | ||
14 | #include <asm/mmu_context.h> | ||
15 | #include <asm/proto.h> | ||
16 | #include <asm/apicdef.h> | ||
17 | #include <asm/idle.h> | ||
18 | #include <asm/uv/uv_hub.h> | ||
19 | #include <asm/uv/uv_bau.h> | ||
20 | |||
21 | DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) | ||
22 | = { &init_mm, 0, }; | ||
23 | |||
24 | #include <mach_ipi.h> | ||
25 | /* | ||
26 | * Smarter SMP flushing macros. | ||
27 | * c/o Linus Torvalds. | ||
28 | * | ||
29 | * These mean you can really definitely utterly forget about | ||
30 | * writing to user space from interrupts. (Its not allowed anyway). | ||
31 | * | ||
32 | * Optimizations Manfred Spraul <manfred@colorfullife.com> | ||
33 | * | ||
34 | * More scalable flush, from Andi Kleen | ||
35 | * | ||
36 | * To avoid global state use 8 different call vectors. | ||
37 | * Each CPU uses a specific vector to trigger flushes on other | ||
38 | * CPUs. Depending on the received vector the target CPUs look into | ||
39 | * the right per cpu variable for the flush data. | ||
40 | * | ||
41 | * With more than 8 CPUs they are hashed to the 8 available | ||
42 | * vectors. The limited global vector space forces us to this right now. | ||
43 | * In future when interrupts are split into per CPU domains this could be | ||
44 | * fixed, at the cost of triggering multiple IPIs in some cases. | ||
45 | */ | ||
46 | |||
47 | union smp_flush_state { | ||
48 | struct { | ||
49 | struct mm_struct *flush_mm; | ||
50 | unsigned long flush_va; | ||
51 | spinlock_t tlbstate_lock; | ||
52 | DECLARE_BITMAP(flush_cpumask, NR_CPUS); | ||
53 | }; | ||
54 | char pad[SMP_CACHE_BYTES]; | ||
55 | } ____cacheline_aligned; | ||
56 | |||
57 | /* State is put into the per CPU data section, but padded | ||
58 | to a full cache line because other CPUs can access it and we don't | ||
59 | want false sharing in the per cpu data segment. */ | ||
60 | static DEFINE_PER_CPU(union smp_flush_state, flush_state); | ||
61 | |||
62 | /* | ||
63 | * We cannot call mmdrop() because we are in interrupt context, | ||
64 | * instead update mm->cpu_vm_mask. | ||
65 | */ | ||
66 | void leave_mm(int cpu) | ||
67 | { | ||
68 | if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) | ||
69 | BUG(); | ||
70 | cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); | ||
71 | load_cr3(swapper_pg_dir); | ||
72 | } | ||
73 | EXPORT_SYMBOL_GPL(leave_mm); | ||
74 | |||
75 | /* | ||
76 | * | ||
77 | * The flush IPI assumes that a thread switch happens in this order: | ||
78 | * [cpu0: the cpu that switches] | ||
79 | * 1) switch_mm() either 1a) or 1b) | ||
80 | * 1a) thread switch to a different mm | ||
81 | * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); | ||
82 | * Stop ipi delivery for the old mm. This is not synchronized with | ||
83 | * the other cpus, but smp_invalidate_interrupt ignore flush ipis | ||
84 | * for the wrong mm, and in the worst case we perform a superfluous | ||
85 | * tlb flush. | ||
86 | * 1a2) set cpu mmu_state to TLBSTATE_OK | ||
87 | * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 | ||
88 | * was in lazy tlb mode. | ||
89 | * 1a3) update cpu active_mm | ||
90 | * Now cpu0 accepts tlb flushes for the new mm. | ||
91 | * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); | ||
92 | * Now the other cpus will send tlb flush ipis. | ||
93 | * 1a4) change cr3. | ||
94 | * 1b) thread switch without mm change | ||
95 | * cpu active_mm is correct, cpu0 already handles | ||
96 | * flush ipis. | ||
97 | * 1b1) set cpu mmu_state to TLBSTATE_OK | ||
98 | * 1b2) test_and_set the cpu bit in cpu_vm_mask. | ||
99 | * Atomically set the bit [other cpus will start sending flush ipis], | ||
100 | * and test the bit. | ||
101 | * 1b3) if the bit was 0: leave_mm was called, flush the tlb. | ||
102 | * 2) switch %%esp, ie current | ||
103 | * | ||
104 | * The interrupt must handle 2 special cases: | ||
105 | * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. | ||
106 | * - the cpu performs speculative tlb reads, i.e. even if the cpu only | ||
107 | * runs in kernel space, the cpu could load tlb entries for user space | ||
108 | * pages. | ||
109 | * | ||
110 | * The good news is that cpu mmu_state is local to each cpu, no | ||
111 | * write/read ordering problems. | ||
112 | */ | ||
113 | |||
114 | /* | ||
115 | * TLB flush IPI: | ||
116 | * | ||
117 | * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. | ||
118 | * 2) Leave the mm if we are in the lazy tlb mode. | ||
119 | * | ||
120 | * Interrupts are disabled. | ||
121 | */ | ||
122 | |||
123 | asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) | ||
124 | { | ||
125 | int cpu; | ||
126 | int sender; | ||
127 | union smp_flush_state *f; | ||
128 | |||
129 | cpu = smp_processor_id(); | ||
130 | /* | ||
131 | * orig_rax contains the negated interrupt vector. | ||
132 | * Use that to determine where the sender put the data. | ||
133 | */ | ||
134 | sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; | ||
135 | f = &per_cpu(flush_state, sender); | ||
136 | |||
137 | if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) | ||
138 | goto out; | ||
139 | /* | ||
140 | * This was a BUG() but until someone can quote me the | ||
141 | * line from the intel manual that guarantees an IPI to | ||
142 | * multiple CPUs is retried _only_ on the erroring CPUs | ||
143 | * its staying as a return | ||
144 | * | ||
145 | * BUG(); | ||
146 | */ | ||
147 | |||
148 | if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { | ||
149 | if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { | ||
150 | if (f->flush_va == TLB_FLUSH_ALL) | ||
151 | local_flush_tlb(); | ||
152 | else | ||
153 | __flush_tlb_one(f->flush_va); | ||
154 | } else | ||
155 | leave_mm(cpu); | ||
156 | } | ||
157 | out: | ||
158 | ack_APIC_irq(); | ||
159 | cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); | ||
160 | inc_irq_stat(irq_tlb_count); | ||
161 | } | ||
162 | |||
163 | static void flush_tlb_others_ipi(const struct cpumask *cpumask, | ||
164 | struct mm_struct *mm, unsigned long va) | ||
165 | { | ||
166 | int sender; | ||
167 | union smp_flush_state *f; | ||
168 | |||
169 | /* Caller has disabled preemption */ | ||
170 | sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; | ||
171 | f = &per_cpu(flush_state, sender); | ||
172 | |||
173 | /* | ||
174 | * Could avoid this lock when | ||
175 | * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is | ||
176 | * probably not worth checking this for a cache-hot lock. | ||
177 | */ | ||
178 | spin_lock(&f->tlbstate_lock); | ||
179 | |||
180 | f->flush_mm = mm; | ||
181 | f->flush_va = va; | ||
182 | cpumask_andnot(to_cpumask(f->flush_cpumask), | ||
183 | cpumask, cpumask_of(smp_processor_id())); | ||
184 | |||
185 | /* | ||
186 | * Make the above memory operations globally visible before | ||
187 | * sending the IPI. | ||
188 | */ | ||
189 | smp_mb(); | ||
190 | /* | ||
191 | * We have to send the IPI only to | ||
192 | * CPUs affected. | ||
193 | */ | ||
194 | send_IPI_mask(to_cpumask(f->flush_cpumask), | ||
195 | INVALIDATE_TLB_VECTOR_START + sender); | ||
196 | |||
197 | while (!cpumask_empty(to_cpumask(f->flush_cpumask))) | ||
198 | cpu_relax(); | ||
199 | |||
200 | f->flush_mm = NULL; | ||
201 | f->flush_va = 0; | ||
202 | spin_unlock(&f->tlbstate_lock); | ||
203 | } | ||
204 | |||
205 | void native_flush_tlb_others(const struct cpumask *cpumask, | ||
206 | struct mm_struct *mm, unsigned long va) | ||
207 | { | ||
208 | if (is_uv_system()) { | ||
209 | /* FIXME: could be an percpu_alloc'd thing */ | ||
210 | static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask); | ||
211 | struct cpumask *after_uv_flush = &get_cpu_var(flush_tlb_mask); | ||
212 | |||
213 | cpumask_andnot(after_uv_flush, cpumask, | ||
214 | cpumask_of(smp_processor_id())); | ||
215 | if (!uv_flush_tlb_others(after_uv_flush, mm, va)) | ||
216 | flush_tlb_others_ipi(after_uv_flush, mm, va); | ||
217 | |||
218 | put_cpu_var(flush_tlb_uv_cpumask); | ||
219 | return; | ||
220 | } | ||
221 | flush_tlb_others_ipi(cpumask, mm, va); | ||
222 | } | ||
223 | |||
224 | static int __cpuinit init_smp_flush(void) | ||
225 | { | ||
226 | int i; | ||
227 | |||
228 | for_each_possible_cpu(i) | ||
229 | spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); | ||
230 | |||
231 | return 0; | ||
232 | } | ||
233 | core_initcall(init_smp_flush); | ||
234 | |||
235 | void flush_tlb_current_task(void) | ||
236 | { | ||
237 | struct mm_struct *mm = current->mm; | ||
238 | |||
239 | preempt_disable(); | ||
240 | |||
241 | local_flush_tlb(); | ||
242 | if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) | ||
243 | flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); | ||
244 | preempt_enable(); | ||
245 | } | ||
246 | |||
247 | void flush_tlb_mm(struct mm_struct *mm) | ||
248 | { | ||
249 | preempt_disable(); | ||
250 | |||
251 | if (current->active_mm == mm) { | ||
252 | if (current->mm) | ||
253 | local_flush_tlb(); | ||
254 | else | ||
255 | leave_mm(smp_processor_id()); | ||
256 | } | ||
257 | if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) | ||
258 | flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); | ||
259 | |||
260 | preempt_enable(); | ||
261 | } | ||
262 | |||
263 | void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) | ||
264 | { | ||
265 | struct mm_struct *mm = vma->vm_mm; | ||
266 | |||
267 | preempt_disable(); | ||
268 | |||
269 | if (current->active_mm == mm) { | ||
270 | if (current->mm) | ||
271 | __flush_tlb_one(va); | ||
272 | else | ||
273 | leave_mm(smp_processor_id()); | ||
274 | } | ||
275 | |||
276 | if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) | ||
277 | flush_tlb_others(&mm->cpu_vm_mask, mm, va); | ||
278 | |||
279 | preempt_enable(); | ||
280 | } | ||
281 | |||
282 | static void do_flush_tlb_all(void *info) | ||
283 | { | ||
284 | unsigned long cpu = smp_processor_id(); | ||
285 | |||
286 | __flush_tlb_all(); | ||
287 | if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) | ||
288 | leave_mm(cpu); | ||
289 | } | ||
290 | |||
291 | void flush_tlb_all(void) | ||
292 | { | ||
293 | on_each_cpu(do_flush_tlb_all, NULL, 1); | ||
294 | } | ||
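
on_each_cpu() with wait == 1 runs the callback on every online CPU, including the caller, and returns only after all of them have finished, so flush_tlb_all() guarantees globally clean TLBs on return. A hypothetical demonstration of that contract, not part of the patch:

    static void bump(void *info)
    {
            atomic_inc((atomic_t *)info);   /* runs on each cpu, IRQs off */
    }

    static void on_each_cpu_demo(void)
    {
            atomic_t n = ATOMIC_INIT(0);

            /* wait == 1: stack data is safe; no cpu touches it later */
            on_each_cpu(bump, &n, 1);
            pr_info("callback ran on %d cpus\n", atomic_read(&n));
    }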
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 690dcf1a27d4..aae15dd72604 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | 12 | ||
13 | #include <asm/mmu_context.h> | 13 | #include <asm/mmu_context.h> |
14 | #include <asm/uv/uv.h> | ||
14 | #include <asm/uv/uv_mmrs.h> | 15 | #include <asm/uv/uv_mmrs.h> |
15 | #include <asm/uv/uv_hub.h> | 16 | #include <asm/uv/uv_hub.h> |
16 | #include <asm/uv/uv_bau.h> | 17 | #include <asm/uv/uv_bau.h> |
@@ -209,14 +210,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc, | |||
209 | * | 210 | * |
210 | * Send a broadcast and wait for the broadcast message to complete. | 211 | * Send a broadcast and wait for the broadcast message to complete. |
211 | * | 212 | * |
212 | * The cpumaskp mask contains the cpus the broadcast was sent to. | 213 | * The flush_mask contains the cpus the broadcast was sent to. |
213 | * | 214 | * |
214 | * Returns 1 if all remote flushing was done. The mask is zeroed. | 215 | * Returns NULL if all remote flushing was done. The mask is zeroed. |
215 | * Returns 0 if some remote flushing remains to be done. The mask will have | 216 | * Returns @flush_mask if some remote flushing remains to be done. The |
216 | * some bits still set. | 217 | * mask will have some bits still set. |
217 | */ | 218 | */ |
218 | int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, | 219 | const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade, |
219 | struct cpumask *cpumaskp) | 220 | struct bau_desc *bau_desc, |
221 | struct cpumask *flush_mask) | ||
220 | { | 222 | { |
221 | int completion_status = 0; | 223 | int completion_status = 0; |
222 | int right_shift; | 224 | int right_shift; |
@@ -263,59 +265,69 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, | |||
263 | * Success, so clear the remote cpus from the mask so we don't | 265 | * Success, so clear the remote cpus from the mask so we don't |
264 | * use the IPI method of shootdown on them. | 266 | * use the IPI method of shootdown on them. |
265 | */ | 267 | */ |
266 | for_each_cpu(bit, cpumaskp) { | 268 | for_each_cpu(bit, flush_mask) { |
267 | blade = uv_cpu_to_blade_id(bit); | 269 | blade = uv_cpu_to_blade_id(bit); |
268 | if (blade == this_blade) | 270 | if (blade == this_blade) |
269 | continue; | 271 | continue; |
270 | cpumask_clear_cpu(bit, cpumaskp); | 272 | cpumask_clear_cpu(bit, flush_mask); |
271 | } | 273 | } |
272 | if (!cpumask_empty(cpumaskp)) | 274 | if (!cpumask_empty(flush_mask)) |
273 | return 0; | 275 | return flush_mask; |
274 | return 1; | 276 | return NULL; |
275 | } | 277 | } |
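
After the clearing loop above, any bit still set in flush_mask belongs to this_blade: remote blades were covered by the BAU broadcast, and only the local blade is left for the IPI fallback. A hypothetical assertion of that invariant, for illustration only:

    /* Every cpu left in a non-NULL returned mask must be local. */
    for_each_cpu(bit, flush_mask)
            WARN_ON(uv_cpu_to_blade_id(bit) != this_blade);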
276 | 278 | ||
277 | /** | 279 | /** |
278 | * uv_flush_tlb_others - globally purge translation cache of a virtual | 280 | * uv_flush_tlb_others - globally purge translation cache of a virtual |
279 | * address or all TLBs | 281 | * address or all TLBs |
280 | * @cpumaskp: mask of all cpu's in which the address is to be removed | 282 | * @cpumask: mask of all cpu's in which the address is to be removed |
281 | * @mm: mm_struct containing virtual address range | 283 | * @mm: mm_struct containing virtual address range |
282 | * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLBs on cpu) | 284 | * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLBs on cpu) |
285 | * @cpu: the current cpu | ||
283 | * | 286 | * |
284 | * This is the entry point for initiating any UV global TLB shootdown. | 287 | * This is the entry point for initiating any UV global TLB shootdown. |
285 | * | 288 | * |
286 | * Purges the translation caches of all specified processors of the given | 289 | * Purges the translation caches of all specified processors of the given |
287 | * virtual address, or purges all TLB's on specified processors. | 290 | * virtual address, or purges all TLB's on specified processors. |
288 | * | 291 | * |
289 | * The caller has derived the cpumaskp from the mm_struct and has subtracted | 292 | * The caller has derived the cpumask from the mm_struct. This function |
290 | * the local cpu from the mask. This function is called only if there | 293 | * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) |
291 | * are bits set in the mask. (e.g. flush_tlb_page()) | ||
292 | * | 294 | * |
293 | * The cpumaskp is converted into a nodemask of the nodes containing | 295 | * The cpumask is converted into a nodemask of the nodes containing |
294 | * the cpus. | 296 | * the cpus. |
295 | * | 297 | * |
296 | * Returns 1 if all remote flushing was done. | 298 | * Note that this function should be called with preemption disabled. |
297 | * Returns 0 if some remote flushing remains to be done. | 299 | * |
300 | * Returns NULL if all remote flushing was done. | ||
301 | * Returns pointer to cpumask if some remote flushing remains to be | ||
302 | * done. The returned pointer is valid until preemption is re-enabled. | ||
298 | */ | 303 | */ |
299 | int uv_flush_tlb_others(struct cpumask *cpumaskp, struct mm_struct *mm, | 304 | const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, |
300 | unsigned long va) | 305 | struct mm_struct *mm, |
306 | unsigned long va, unsigned int cpu) | ||
301 | { | 307 | { |
308 | static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask); | ||
309 | struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask); | ||
302 | int i; | 310 | int i; |
303 | int bit; | 311 | int bit; |
304 | int blade; | 312 | int blade; |
305 | int cpu; | 313 | int uv_cpu; |
306 | int this_blade; | 314 | int this_blade; |
307 | int locals = 0; | 315 | int locals = 0; |
308 | struct bau_desc *bau_desc; | 316 | struct bau_desc *bau_desc; |
309 | 317 | ||
310 | cpu = uv_blade_processor_id(); | 318 | WARN_ON(!in_atomic()); |
319 | |||
320 | cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); | ||
321 | |||
322 | uv_cpu = uv_blade_processor_id(); | ||
311 | this_blade = uv_numa_blade_id(); | 323 | this_blade = uv_numa_blade_id(); |
312 | bau_desc = __get_cpu_var(bau_control).descriptor_base; | 324 | bau_desc = __get_cpu_var(bau_control).descriptor_base; |
313 | bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; | 325 | bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu; |
314 | 326 | ||
315 | bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); | 327 | bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); |
316 | 328 | ||
317 | i = 0; | 329 | i = 0; |
318 | for_each_cpu(bit, cpumaskp) { | 330 | for_each_cpu(bit, flush_mask) { |
319 | blade = uv_cpu_to_blade_id(bit); | 331 | blade = uv_cpu_to_blade_id(bit); |
320 | BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); | 332 | BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); |
321 | if (blade == this_blade) { | 333 | if (blade == this_blade) { |
@@ -330,17 +342,17 @@ int uv_flush_tlb_others(struct cpumask *cpumaskp, struct mm_struct *mm, | |||
330 | * no off_node flushing; return status for local node | 342 | * no off_node flushing; return status for local node |
331 | */ | 343 | */ |
332 | if (locals) | 344 | if (locals) |
333 | return 0; | 345 | return flush_mask; |
334 | else | 346 | else |
335 | return 1; | 347 | return NULL; |
336 | } | 348 | } |
337 | __get_cpu_var(ptcstats).requestor++; | 349 | __get_cpu_var(ptcstats).requestor++; |
338 | __get_cpu_var(ptcstats).ntargeted += i; | 350 | __get_cpu_var(ptcstats).ntargeted += i; |
339 | 351 | ||
340 | bau_desc->payload.address = va; | 352 | bau_desc->payload.address = va; |
341 | bau_desc->payload.sending_cpu = smp_processor_id(); | 353 | bau_desc->payload.sending_cpu = cpu; |
342 | 354 | ||
343 | return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); | 355 | return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask); |
344 | } | 356 | } |
345 | 357 | ||
346 | /* | 358 | /* |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 98c2d055284b..ed5aee5f3fcc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -59,7 +59,6 @@ | |||
59 | #ifdef CONFIG_X86_64 | 59 | #ifdef CONFIG_X86_64 |
60 | #include <asm/pgalloc.h> | 60 | #include <asm/pgalloc.h> |
61 | #include <asm/proto.h> | 61 | #include <asm/proto.h> |
62 | #include <asm/pda.h> | ||
63 | #else | 62 | #else |
64 | #include <asm/processor-flags.h> | 63 | #include <asm/processor-flags.h> |
65 | #include <asm/arch_hooks.h> | 64 | #include <asm/arch_hooks.h> |
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index a09abb8fb97f..c9740996430a 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S | |||
@@ -220,8 +220,7 @@ SECTIONS | |||
220 | * so that it can be accessed as a percpu variable. | 220 | * so that it can be accessed as a percpu variable. |
221 | */ | 221 | */ |
222 | . = ALIGN(PAGE_SIZE); | 222 | . = ALIGN(PAGE_SIZE); |
223 | PERCPU_VADDR_PREALLOC(0, :percpu, pda_size) | 223 | PERCPU_VADDR(0, :percpu) |
224 | per_cpu____pda = __per_cpu_start; | ||
225 | #else | 224 | #else |
226 | PERCPU(PAGE_SIZE) | 225 | PERCPU(PAGE_SIZE) |
227 | #endif | 226 | #endif |
@@ -262,3 +261,8 @@ SECTIONS | |||
262 | */ | 261 | */ |
263 | ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), | 262 | ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), |
264 | "kernel image bigger than KERNEL_IMAGE_SIZE") | 263 | "kernel image bigger than KERNEL_IMAGE_SIZE") |
264 | |||
265 | #ifdef CONFIG_SMP | ||
266 | ASSERT((per_cpu__irq_stack_union == 0), | ||
267 | "irq_stack_union is not at start of per-cpu area"); | ||
268 | #endif | ||
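
The new assertion pins per_cpu__irq_stack_union to offset 0 of the per-cpu area so that %gs-relative addressing reaches the IRQ stack, and the stack-protector canary, at fixed small offsets before the per-cpu machinery is fully up. A plausible C-side counterpart, where the type and macro names are assumptions rather than part of this patch:

    union irq_stack_union {
            char irq_stack[IRQ_STACK_SIZE];
            /* gcc hardcodes the stack canary at %gs:40, so carve it out */
            struct {
                    char gs_base[40];
                    unsigned long stack_canary;
            };
    };

    /* Must be linked first in the per-cpu area for the ASSERT to hold. */
    DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);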