author     Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/kernel
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent     6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 44
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 150
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 11
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S | 35
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.h | 11
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.lds.S | 28
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 86
-rw-r--r--  arch/x86/kernel/acpi/sleep.h | 5
-rw-r--r--  arch/x86/kernel/acpi/wakeup_rm.S | 12
-rw-r--r--  arch/x86/kernel/alternative.c | 242
-rw-r--r--  arch/x86/kernel/amd_gart_64.c (renamed from arch/x86/kernel/pci-gart_64.c) | 84
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 581
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 221
-rw-r--r--  arch/x86/kernel/amd_nb.c | 255
-rw-r--r--  arch/x86/kernel/apb_timer.c | 133
-rw-r--r--  arch/x86/kernel/aperture_64.c | 150
-rw-r--r--  arch/x86/kernel/apic/Makefile | 22
-rw-r--r--  arch/x86/kernel/apic/apic.c | 554
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 30
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 17
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 45
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 46
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 48
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 1639
-rw-r--r--  arch/x86/kernel/apic/ipi.c | 12
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 567
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 62
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 120
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 69
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 50
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 224
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 117
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 219
-rw-r--r--  arch/x86/kernel/apm_32.c | 35
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 65
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 71
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 90
-rw-r--r--  arch/x86/kernel/bios_uv.c | 215
-rw-r--r--  arch/x86/kernel/check.c | 24
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 167
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 1
-rw-r--r--  arch/x86/kernel/cpu/common.c | 66
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Kconfig | 266
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Makefile | 21
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 775
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | 446
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/e_powersaver.c | 367
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/elanfreq.c | 309
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 517
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 1029
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.h | 353
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longrun.c | 327
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.c | 51
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.h | 9
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 331
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 626
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 261
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 752
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.h | 43
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 1601
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 224
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/sc520_freq.c | 194
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 636
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 452
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 481
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.h | 49
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 467
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 40
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 261
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c | 42
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 7
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 71
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 122
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 65
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 130
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 40
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 615
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 229
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 537
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 312
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 368
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 4
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 649
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 6
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 2
-rw-r--r--  arch/x86/kernel/cpuid.c | 1
-rw-r--r--  arch/x86/kernel/crash_dump_32.c | 5
-rw-r--r--  arch/x86/kernel/crash_dump_64.c | 6
-rw-r--r--  arch/x86/kernel/devicetree.c | 452
-rw-r--r--  arch/x86/kernel/dumpstack.c | 63
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 22
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 26
-rw-r--r--  arch/x86/kernel/e820.c | 211
-rw-r--r--  arch/x86/kernel/early-quirks.c | 23
-rw-r--r--  arch/x86/kernel/early_printk.c | 12
-rw-r--r--  arch/x86/kernel/efi.c | 612
-rw-r--r--  arch/x86/kernel/efi_32.c | 112
-rw-r--r--  arch/x86/kernel/efi_64.c | 114
-rw-r--r--  arch/x86/kernel/efi_stub_32.S | 123
-rw-r--r--  arch/x86/kernel/efi_stub_64.S | 116
-rw-r--r--  arch/x86/kernel/entry_32.S | 325
-rw-r--r--  arch/x86/kernel/entry_64.S | 186
-rw-r--r--  arch/x86/kernel/ftrace.c | 101
-rw-r--r--  arch/x86/kernel/head.c | 3
-rw-r--r--  arch/x86/kernel/head32.c | 20
-rw-r--r--  arch/x86/kernel/head64.c | 10
-rw-r--r--  arch/x86/kernel/head_32.S | 188
-rw-r--r--  arch/x86/kernel/head_64.S | 3
-rw-r--r--  arch/x86/kernel/hpet.c | 147
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 16
-rw-r--r--  arch/x86/kernel/i387.c | 61
-rw-r--r--  arch/x86/kernel/i8237.c | 30
-rw-r--r--  arch/x86/kernel/i8253.c | 86
-rw-r--r--  arch/x86/kernel/i8259.c | 98
-rw-r--r--  arch/x86/kernel/ioport.c | 20
-rw-r--r--  arch/x86/kernel/irq.c | 102
-rw-r--r--  arch/x86/kernel/irq_32.c | 38
-rw-r--r--  arch/x86/kernel/irq_work.c | 30
-rw-r--r--  arch/x86/kernel/irqinit.c | 113
-rw-r--r--  arch/x86/kernel/jump_label.c | 51
-rw-r--r--  arch/x86/kernel/k8.c | 137
-rw-r--r--  arch/x86/kernel/kdebugfs.c | 1
-rw-r--r--  arch/x86/kernel/kgdb.c | 48
-rw-r--r--  arch/x86/kernel/kprobes.c | 154
-rw-r--r--  arch/x86/kernel/kvm.c | 317
-rw-r--r--  arch/x86/kernel/kvmclock.c | 25
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 4
-rw-r--r--  arch/x86/kernel/mca_32.c | 2
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 208
-rw-r--r--  arch/x86/kernel/microcode_core.c | 44
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 18
-rw-r--r--  arch/x86/kernel/mmconf-fam10h_64.c | 71
-rw-r--r--  arch/x86/kernel/module.c | 21
-rw-r--r--  arch/x86/kernel/mpparse.c | 139
-rw-r--r--  arch/x86/kernel/mrst.c | 311
-rw-r--r--  arch/x86/kernel/msr.c | 1
-rw-r--r--  arch/x86/kernel/olpc.c | 260
-rw-r--r--  arch/x86/kernel/olpc_ofw.c | 106
-rw-r--r--  arch/x86/kernel/paravirt.c | 4
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 22
-rw-r--r--  arch/x86/kernel/pci-dma.c | 110
-rw-r--r--  arch/x86/kernel/pci-iommu_table.c | 79
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 44
-rw-r--r--  arch/x86/kernel/pmtimer_64.c | 69
-rw-r--r--  arch/x86/kernel/probe_roms.c (renamed from arch/x86/kernel/probe_roms_32.c) | 101
-rw-r--r--  arch/x86/kernel/process.c | 105
-rw-r--r--  arch/x86/kernel/process_32.c | 5
-rw-r--r--  arch/x86/kernel/process_64.c | 15
-rw-r--r--  arch/x86/kernel/ptrace.c | 57
-rw-r--r--  arch/x86/kernel/pvclock.c | 46
-rw-r--r--  arch/x86/kernel/quirks.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 204
-rw-r--r--  arch/x86/kernel/reboot_32.S | 135
-rw-r--r--  arch/x86/kernel/reboot_fixups_32.c | 16
-rw-r--r--  arch/x86/kernel/resource.c | 48
-rw-r--r--  arch/x86/kernel/rtc.c | 5
-rw-r--r--  arch/x86/kernel/scx200_32.c | 131
-rw-r--r--  arch/x86/kernel/setup.c | 327
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 19
-rw-r--r--  arch/x86/kernel/sfi.c | 120
-rw-r--r--  arch/x86/kernel/signal.c | 14
-rw-r--r--  arch/x86/kernel/smp.c | 24
-rw-r--r--  arch/x86/kernel/smpboot.c | 325
-rw-r--r--  arch/x86/kernel/stacktrace.c | 17
-rw-r--r--  arch/x86/kernel/step.c | 2
-rw-r--r--  arch/x86/kernel/sys_i386_32.c | 4
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 12
-rw-r--r--  arch/x86/kernel/tboot.c | 3
-rw-r--r--  arch/x86/kernel/test_nx.c | 2
-rw-r--r--  arch/x86/kernel/time.c | 20
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 1655
-rw-r--r--  arch/x86/kernel/topology.c | 2
-rw-r--r--  arch/x86/kernel/trampoline.c | 62
-rw-r--r--  arch/x86/kernel/trampoline_32.S | 15
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 30
-rw-r--r--  arch/x86/kernel/traps.c | 167
-rw-r--r--  arch/x86/kernel/tsc.c | 183
-rw-r--r--  arch/x86/kernel/uv_irq.c | 302
-rw-r--r--  arch/x86/kernel/uv_sysfs.c | 76
-rw-r--r--  arch/x86/kernel/uv_time.c | 423
-rw-r--r--  arch/x86/kernel/verify_cpu.S (renamed from arch/x86/kernel/verify_cpu_64.S) | 49
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 666
-rw-r--r--  arch/x86/kernel/vm86_32.c | 11
-rw-r--r--  arch/x86/kernel/vmi_32.c | 893
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 317
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 95
-rw-r--r--  arch/x86/kernel/vread_tsc_64.c | 36
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 48
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 1
-rw-r--r--  arch/x86/kernel/x86_init.c | 14
-rw-r--r--  arch/x86/kernel/xsave.c | 5
196 files changed, 8552 insertions, 24695 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 6890dbb9ac15..d727f8f94333 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,28 +24,34 @@ endif
24nostackp := $(call cc-option, -fno-stack-protector) 24nostackp := $(call cc-option, -fno-stack-protector)
25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
26CFLAGS_hpet.o := $(nostackp) 26CFLAGS_hpet.o := $(nostackp)
27CFLAGS_tsc.o := $(nostackp) 27CFLAGS_vread_tsc_64.o := $(nostackp)
28CFLAGS_paravirt.o := $(nostackp) 28CFLAGS_paravirt.o := $(nostackp)
29GCOV_PROFILE_vsyscall_64.o := n 29GCOV_PROFILE_vsyscall_64.o := n
30GCOV_PROFILE_hpet.o := n 30GCOV_PROFILE_hpet.o := n
31GCOV_PROFILE_tsc.o := n 31GCOV_PROFILE_tsc.o := n
32GCOV_PROFILE_vread_tsc_64.o := n
32GCOV_PROFILE_paravirt.o := n 33GCOV_PROFILE_paravirt.o := n
33 34
35# vread_tsc_64 is hot and should be fully optimized:
36CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
37
34obj-y := process_$(BITS).o signal.o entry_$(BITS).o 38obj-y := process_$(BITS).o signal.o entry_$(BITS).o
35obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 39obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
36obj-y += time.o ioport.o ldt.o dumpstack.o 40obj-y += time.o ioport.o ldt.o dumpstack.o
37obj-y += setup.o x86_init.o i8259.o irqinit.o 41obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
38obj-$(CONFIG_X86_VISWS) += visws_quirks.o 42obj-$(CONFIG_IRQ_WORK) += irq_work.o
39obj-$(CONFIG_X86_32) += probe_roms_32.o 43obj-y += probe_roms.o
40obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 44obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 45obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 46obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
43obj-y += bootflag.o e820.o 47obj-y += bootflag.o e820.o
44obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 48obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 49obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
46obj-y += tsc.o io_delay.o rtc.o 50obj-y += tsc.o io_delay.o rtc.o
51obj-y += pci-iommu_table.o
52obj-y += resource.o
47 53
48obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 54obj-y += trampoline.o trampoline_$(BITS).o
49obj-y += process.o 55obj-y += process.o
50obj-y += i387.o xsave.o 56obj-y += i387.o xsave.o
51obj-y += ptrace.o 57obj-y += ptrace.o
@@ -53,11 +59,12 @@ obj-$(CONFIG_X86_32) += tls.o
53obj-$(CONFIG_IA32_EMULATION) += tls.o 59obj-$(CONFIG_IA32_EMULATION) += tls.o
54obj-y += step.o 60obj-y += step.o
55obj-$(CONFIG_INTEL_TXT) += tboot.o 61obj-$(CONFIG_INTEL_TXT) += tboot.o
62obj-$(CONFIG_ISA_DMA_API) += i8237.o
56obj-$(CONFIG_STACKTRACE) += stacktrace.o 63obj-$(CONFIG_STACKTRACE) += stacktrace.o
57obj-y += cpu/ 64obj-y += cpu/
58obj-y += acpi/ 65obj-y += acpi/
59obj-$(CONFIG_SFI) += sfi.o
60obj-y += reboot.o 66obj-y += reboot.o
67obj-$(CONFIG_X86_32) += reboot_32.o
61obj-$(CONFIG_MCA) += mca_32.o 68obj-$(CONFIG_MCA) += mca_32.o
62obj-$(CONFIG_X86_MSR) += msr.o 69obj-$(CONFIG_X86_MSR) += msr.o
63obj-$(CONFIG_X86_CPUID) += cpuid.o 70obj-$(CONFIG_X86_CPUID) += cpuid.o
@@ -65,10 +72,9 @@ obj-$(CONFIG_PCI) += early-quirks.o
65apm-y := apm_32.o 72apm-y := apm_32.o
66obj-$(CONFIG_APM) += apm.o 73obj-$(CONFIG_APM) += apm.o
67obj-$(CONFIG_SMP) += smp.o 74obj-$(CONFIG_SMP) += smp.o
68obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o 75obj-$(CONFIG_SMP) += smpboot.o
76obj-$(CONFIG_SMP) += tsc_sync.o
69obj-$(CONFIG_SMP) += setup_percpu.o 77obj-$(CONFIG_SMP) += setup_percpu.o
70obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
71obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
72obj-$(CONFIG_X86_MPPARSE) += mpparse.o 78obj-$(CONFIG_X86_MPPARSE) += mpparse.o
73obj-y += apic/ 79obj-y += apic/
74obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 80obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
@@ -80,7 +86,6 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
80obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 86obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
81obj-$(CONFIG_KPROBES) += kprobes.o 87obj-$(CONFIG_KPROBES) += kprobes.o
82obj-$(CONFIG_MODULES) += module.o 88obj-$(CONFIG_MODULES) += module.o
83obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
84obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o 89obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
85obj-$(CONFIG_KGDB) += kgdb.o 90obj-$(CONFIG_KGDB) += kgdb.o
86obj-$(CONFIG_VM86) += vm86_32.o 91obj-$(CONFIG_VM86) += vm86_32.o
@@ -89,11 +94,10 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
89obj-$(CONFIG_HPET_TIMER) += hpet.o 94obj-$(CONFIG_HPET_TIMER) += hpet.o
90obj-$(CONFIG_APB_TIMER) += apb_timer.o 95obj-$(CONFIG_APB_TIMER) += apb_timer.o
91 96
92obj-$(CONFIG_K8_NB) += k8.o 97obj-$(CONFIG_AMD_NB) += amd_nb.o
93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 98obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
94obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 99obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
95 100
96obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
97obj-$(CONFIG_KVM_GUEST) += kvm.o 101obj-$(CONFIG_KVM_GUEST) += kvm.o
98obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 102obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
99obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 103obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
@@ -102,13 +106,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
102 106
103obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 107obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
104 108
105obj-$(CONFIG_SCx200) += scx200.o
106scx200-y += scx200_32.o
107
108obj-$(CONFIG_OLPC) += olpc.o
109obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
110obj-$(CONFIG_X86_MRST) += mrst.o
111
112microcode-y := microcode_core.o 109microcode-y := microcode_core.o
113microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o 110microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
114microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o 111microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
@@ -117,17 +114,16 @@ obj-$(CONFIG_MICROCODE) += microcode.o
117obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 114obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
118 115
119obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 116obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
117obj-$(CONFIG_OF) += devicetree.o
120 118
121obj-$(CONFIG_FEATHER_TRACE) += ft_event.o 119obj-$(CONFIG_FEATHER_TRACE) += ft_event.o
122 120
123### 121###
124# 64 bit specific files 122# 64 bit specific files
125ifeq ($(CONFIG_X86_64),y) 123ifeq ($(CONFIG_X86_64),y)
126 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
127 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
128 obj-$(CONFIG_AUDIT) += audit_64.o 124 obj-$(CONFIG_AUDIT) += audit_64.o
129 125
130 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 126 obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
131 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 127 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
132 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o 128 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
133 129
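
The Makefile changes above split the vsyscall TSC read path into its own object, vread_tsc_64.o, give it the same no-stack-protector treatment as vsyscall_64.o, and strip -pg and -fno-optimize-sibling-calls from it because, as the added comment says, it is hot and should be fully optimized. As a rough illustration of what such a reader does at its core -- this is a user-space sketch, not the kernel's vread_tsc implementation, and rdtsc_demo is a made-up name -- reading the time-stamp counter boils down to the RDTSC instruction:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: read the time-stamp counter with RDTSC.
 * The real kernel vread path adds barriers and clocksource checks. */
static inline uint64_t rdtsc_demo(void)
{
	uint32_t lo, hi;
	__asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	uint64_t t0 = rdtsc_demo();
	uint64_t t1 = rdtsc_demo();
	printf("delta cycles: %llu\n", (unsigned long long)(t1 - t0));
	return 0;
}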
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c05872aa3ce0..4558f0d0822d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -72,6 +72,7 @@ u8 acpi_sci_flags __initdata;
72int acpi_sci_override_gsi __initdata; 72int acpi_sci_override_gsi __initdata;
73int acpi_skip_timer_override __initdata; 73int acpi_skip_timer_override __initdata;
74int acpi_use_timer_override __initdata; 74int acpi_use_timer_override __initdata;
75int acpi_fix_pin2_polarity __initdata;
75 76
76#ifdef CONFIG_X86_LOCAL_APIC 77#ifdef CONFIG_X86_LOCAL_APIC
77static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; 78static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
@@ -198,6 +199,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
198{ 199{
199 unsigned int ver = 0; 200 unsigned int ver = 0;
200 201
202 if (id >= (MAX_LOCAL_APIC-1)) {
203 printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
204 return;
205 }
206
201 if (!enabled) { 207 if (!enabled) {
202 ++disabled_cpus; 208 ++disabled_cpus;
203 return; 209 return;
@@ -410,10 +416,15 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
410 return 0; 416 return 0;
411 } 417 }
412 418
413 if (acpi_skip_timer_override && 419 if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
414 intsrc->source_irq == 0 && intsrc->global_irq == 2) { 420 if (acpi_skip_timer_override) {
415 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); 421 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
416 return 0; 422 return 0;
423 }
424 if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
425 intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
426 printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
427 }
417 } 428 }
418 429
419 mp_override_legacy_irq(intsrc->source_irq, 430 mp_override_legacy_irq(intsrc->source_irq,
@@ -504,6 +515,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
504 515
505 return 0; 516 return 0;
506} 517}
518EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
507 519
508int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) 520int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
509{ 521{
@@ -513,35 +525,62 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
513 return 0; 525 return 0;
514} 526}
515 527
516/* 528static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
517 * success: return IRQ number (>=0) 529 int trigger, int polarity)
518 * failure: return < 0
519 */
520int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
521{ 530{
522 unsigned int irq;
523 unsigned int plat_gsi = gsi;
524
525#ifdef CONFIG_PCI 531#ifdef CONFIG_PCI
526 /* 532 /*
527 * Make sure all (legacy) PCI IRQs are set as level-triggered. 533 * Make sure all (legacy) PCI IRQs are set as level-triggered.
528 */ 534 */
529 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { 535 if (trigger == ACPI_LEVEL_SENSITIVE)
530 if (trigger == ACPI_LEVEL_SENSITIVE) 536 eisa_set_level_irq(gsi);
531 eisa_set_level_irq(gsi);
532 }
533#endif 537#endif
534 538
539 return gsi;
540}
541
542static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
543 int trigger, int polarity)
544{
535#ifdef CONFIG_X86_IO_APIC 545#ifdef CONFIG_X86_IO_APIC
536 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { 546 gsi = mp_register_gsi(dev, gsi, trigger, polarity);
537 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
538 }
539#endif 547#endif
548
549 return gsi;
550}
551
552int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
553 int trigger, int polarity) = acpi_register_gsi_pic;
554
555/*
556 * success: return IRQ number (>=0)
557 * failure: return < 0
558 */
559int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
560{
561 unsigned int irq;
562 unsigned int plat_gsi = gsi;
563
564 plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity);
540 irq = gsi_to_irq(plat_gsi); 565 irq = gsi_to_irq(plat_gsi);
541 566
542 return irq; 567 return irq;
543} 568}
544 569
570void __init acpi_set_irq_model_pic(void)
571{
572 acpi_irq_model = ACPI_IRQ_MODEL_PIC;
573 __acpi_register_gsi = acpi_register_gsi_pic;
574 acpi_ioapic = 0;
575}
576
577void __init acpi_set_irq_model_ioapic(void)
578{
579 acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
580 __acpi_register_gsi = acpi_register_gsi_ioapic;
581 acpi_ioapic = 1;
582}
583
545/* 584/*
546 * ACPI based hotplug support for CPU 585 * ACPI based hotplug support for CPU
547 */ 586 */
@@ -556,14 +595,8 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
556 nid = acpi_get_node(handle); 595 nid = acpi_get_node(handle);
557 if (nid == -1 || !node_online(nid)) 596 if (nid == -1 || !node_online(nid))
558 return; 597 return;
559#ifdef CONFIG_X86_64 598 set_apicid_to_node(physid, nid);
560 apicid_to_node[physid] = nid;
561 numa_set_node(cpu, nid); 599 numa_set_node(cpu, nid);
562#else /* CONFIG_X86_32 */
563 apicid_2_node[physid] = nid;
564 cpu_to_node_map[cpu] = nid;
565#endif
566
567#endif 600#endif
568} 601}
569 602
@@ -820,18 +853,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
820 * returns 0 on success, < 0 on error 853 * returns 0 on success, < 0 on error
821 */ 854 */
822 855
823static void __init acpi_register_lapic_address(unsigned long address)
824{
825 mp_lapic_addr = address;
826
827 set_fixmap_nocache(FIX_APIC_BASE, address);
828 if (boot_cpu_physical_apicid == -1U) {
829 boot_cpu_physical_apicid = read_apic_id();
830 apic_version[boot_cpu_physical_apicid] =
831 GET_APIC_VERSION(apic_read(APIC_LVR));
832 }
833}
834
835static int __init early_acpi_parse_madt_lapic_addr_ovr(void) 856static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
836{ 857{
837 int count; 858 int count;
@@ -853,7 +874,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
853 return count; 874 return count;
854 } 875 }
855 876
856 acpi_register_lapic_address(acpi_lapic_addr); 877 register_lapic_address(acpi_lapic_addr);
857 878
858 return count; 879 return count;
859} 880}
@@ -880,16 +901,16 @@ static int __init acpi_parse_madt_lapic_entries(void)
880 return count; 901 return count;
881 } 902 }
882 903
883 acpi_register_lapic_address(acpi_lapic_addr); 904 register_lapic_address(acpi_lapic_addr);
884 905
885 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, 906 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
886 acpi_parse_sapic, MAX_APICS); 907 acpi_parse_sapic, MAX_LOCAL_APIC);
887 908
888 if (!count) { 909 if (!count) {
889 x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, 910 x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
890 acpi_parse_x2apic, MAX_APICS); 911 acpi_parse_x2apic, MAX_LOCAL_APIC);
891 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, 912 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
892 acpi_parse_lapic, MAX_APICS); 913 acpi_parse_lapic, MAX_LOCAL_APIC);
893 } 914 }
894 if (!count && !x2count) { 915 if (!count && !x2count) {
895 printk(KERN_ERR PREFIX "No LAPIC entries present\n"); 916 printk(KERN_ERR PREFIX "No LAPIC entries present\n");
@@ -922,32 +943,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
922extern int es7000_plat; 943extern int es7000_plat;
923#endif 944#endif
924 945
925static void assign_to_mp_irq(struct mpc_intsrc *m,
926 struct mpc_intsrc *mp_irq)
927{
928 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
929}
930
931static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
932 struct mpc_intsrc *m)
933{
934 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
935}
936
937static void save_mp_irq(struct mpc_intsrc *m)
938{
939 int i;
940
941 for (i = 0; i < mp_irq_entries; i++) {
942 if (!mp_irq_cmp(&mp_irqs[i], m))
943 return;
944 }
945
946 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
947 if (++mp_irq_entries == MAX_IRQ_SOURCES)
948 panic("Max # of irq sources exceeded!!\n");
949}
950
951void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) 946void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
952{ 947{
953 int ioapic; 948 int ioapic;
@@ -975,10 +970,10 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
975 mp_irq.irqflag = (trigger << 2) | polarity; 970 mp_irq.irqflag = (trigger << 2) | polarity;
976 mp_irq.srcbus = MP_ISA_BUS; 971 mp_irq.srcbus = MP_ISA_BUS;
977 mp_irq.srcbusirq = bus_irq; /* IRQ */ 972 mp_irq.srcbusirq = bus_irq; /* IRQ */
978 mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ 973 mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
979 mp_irq.dstirq = pin; /* INTIN# */ 974 mp_irq.dstirq = pin; /* INTIN# */
980 975
981 save_mp_irq(&mp_irq); 976 mp_save_irq(&mp_irq);
982 977
983 isa_irq_to_gsi[bus_irq] = gsi; 978 isa_irq_to_gsi[bus_irq] = gsi;
984} 979}
@@ -1026,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1026 if (ioapic < 0) 1021 if (ioapic < 0)
1027 continue; 1022 continue;
1028 pin = mp_find_ioapic_pin(ioapic, gsi); 1023 pin = mp_find_ioapic_pin(ioapic, gsi);
1029 dstapic = mp_ioapics[ioapic].apicid; 1024 dstapic = mpc_ioapic_id(ioapic);
1030 1025
1031 for (idx = 0; idx < mp_irq_entries; idx++) { 1026 for (idx = 0; idx < mp_irq_entries; idx++) {
1032 struct mpc_intsrc *irq = mp_irqs + idx; 1027 struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1053,7 +1048,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1053 mp_irq.srcbusirq = i; /* Identity mapped */ 1048 mp_irq.srcbusirq = i; /* Identity mapped */
1054 mp_irq.dstirq = pin; 1049 mp_irq.dstirq = pin;
1055 1050
1056 save_mp_irq(&mp_irq); 1051 mp_save_irq(&mp_irq);
1057 } 1052 }
1058} 1053}
1059 1054
@@ -1087,10 +1082,10 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
1087 mp_irq.srcbus = number; 1082 mp_irq.srcbus = number;
1088 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); 1083 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1089 ioapic = mp_find_ioapic(gsi); 1084 ioapic = mp_find_ioapic(gsi);
1090 mp_irq.dstapic = mp_ioapics[ioapic].apicid; 1085 mp_irq.dstapic = mpc_ioapic_id(ioapic);
1091 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); 1086 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1092 1087
1093 save_mp_irq(&mp_irq); 1088 mp_save_irq(&mp_irq);
1094#endif 1089#endif
1095 return 0; 1090 return 0;
1096} 1091}
@@ -1118,7 +1113,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1118 1113
1119 if (ioapic_pin > MP_MAX_IOAPIC_PIN) { 1114 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1120 printk(KERN_ERR "Invalid reference to IOAPIC pin " 1115 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1121 "%d-%d\n", mp_ioapics[ioapic].apicid, 1116 "%d-%d\n", mpc_ioapic_id(ioapic),
1122 ioapic_pin); 1117 ioapic_pin);
1123 return gsi; 1118 return gsi;
1124 } 1119 }
@@ -1259,8 +1254,7 @@ static void __init acpi_process_madt(void)
1259 */ 1254 */
1260 error = acpi_parse_madt_ioapic_entries(); 1255 error = acpi_parse_madt_ioapic_entries();
1261 if (!error) { 1256 if (!error) {
1262 acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; 1257 acpi_set_irq_model_ioapic();
1263 acpi_ioapic = 1;
1264 1258
1265 smp_found_config = 1; 1259 smp_found_config = 1;
1266 } 1260 }
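
A notable pattern in the boot.c hunks above: instead of testing acpi_irq_model on every call, acpi_register_gsi() now dispatches through the __acpi_register_gsi function pointer, which acpi_set_irq_model_pic() or acpi_set_irq_model_ioapic() installs once during boot. Reduced to a self-contained sketch (the names register_gsi_pic/register_gsi_ioapic and the printf output below are mine, not kernel code), the pattern looks like this:

#include <stdio.h>

/* Illustrative sketch of the function-pointer dispatch used above. */
typedef int (*register_gsi_fn)(unsigned int gsi, int trigger, int polarity);

static int register_gsi_pic(unsigned int gsi, int trigger, int polarity)
{
	(void)trigger; (void)polarity;
	printf("PIC route for GSI %u\n", gsi);
	return (int)gsi;
}

static int register_gsi_ioapic(unsigned int gsi, int trigger, int polarity)
{
	(void)trigger; (void)polarity;
	printf("IO-APIC route for GSI %u\n", gsi);
	return (int)gsi;
}

/* Default to the legacy PIC handler, as the kernel code above does. */
static register_gsi_fn register_gsi = register_gsi_pic;

static void set_irq_model_ioapic(void)
{
	register_gsi = register_gsi_ioapic;	/* switched once at "boot" */
}

int main(void)
{
	register_gsi(9, 0, 0);		/* dispatched to the PIC variant */
	set_irq_model_ioapic();
	register_gsi(9, 0, 0);		/* now dispatched to the IO-APIC variant */
	return 0;
}

Installing the handler once keeps the per-call path branch-free and keeps the PIC and IO-APIC variants from growing interleaved special cases.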
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb16f17e59be..5812404a0d4c 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
13 13
14#include <acpi/processor.h> 14#include <acpi/processor.h>
15#include <asm/acpi.h> 15#include <asm/acpi.h>
16#include <asm/mwait.h>
16 17
17/* 18/*
18 * Initialize bm_flags based on the CPU cache properties 19 * Initialize bm_flags based on the CPU cache properties
@@ -65,16 +66,6 @@ static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
65 66
66static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; 67static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
67 68
68#define MWAIT_SUBSTATE_MASK (0xf)
69#define MWAIT_CSTATE_MASK (0xf)
70#define MWAIT_SUBSTATE_SIZE (4)
71
72#define CPUID_MWAIT_LEAF (5)
73#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
74#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
75
76#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
77
78#define NATIVE_CSTATE_BEYOND_HALT (2) 69#define NATIVE_CSTATE_BEYOND_HALT (2)
79 70
80static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) 71static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
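
The cstate.c change above drops the locally duplicated MWAIT constants and picks them up from <asm/mwait.h> instead. For context, those constants describe CPUID leaf 5 (the MONITOR/MWAIT leaf): EDX packs the number of MWAIT sub-states per C-state into 4-bit fields, and ECX advertises the interrupt-break-event extension. A hedged user-space sketch of decoding that leaf, using the constant values visible in the removed lines (cpuid_demo is an illustrative wrapper, not a kernel interface):

#include <stdint.h>
#include <stdio.h>

#define CPUID_MWAIT_LEAF		5
#define CPUID5_ECX_INTERRUPT_BREAK	0x2
#define MWAIT_SUBSTATE_MASK		0xf
#define MWAIT_SUBSTATE_SIZE		4

/* Illustrative CPUID wrapper (x86 only). */
static void cpuid_demo(uint32_t leaf, uint32_t *a, uint32_t *b,
		       uint32_t *c, uint32_t *d)
{
	__asm__ volatile("cpuid"
			 : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
			 : "a" (leaf), "c" (0));
}

int main(void)
{
	uint32_t eax, ebx, ecx, edx;

	cpuid_demo(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
	printf("monitor line size: %u..%u bytes\n", eax & 0xffff, ebx & 0xffff);
	printf("interrupt break-event: %s\n",
	       (ecx & CPUID5_ECX_INTERRUPT_BREAK) ? "yes" : "no");

	/* EDX holds the number of MWAIT sub-states, 4 bits per C-state. */
	for (int cstate = 0; cstate < 8; cstate++)
		printf("C%d: %u sub-state(s)\n", cstate,
		       (edx >> (cstate * MWAIT_SUBSTATE_SIZE)) & MWAIT_SUBSTATE_MASK);
	return 0;
}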
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 28595d6df47c..b4fd836e4053 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -6,11 +6,17 @@
6#include <asm/page_types.h> 6#include <asm/page_types.h>
7#include <asm/pgtable_types.h> 7#include <asm/pgtable_types.h>
8#include <asm/processor-flags.h> 8#include <asm/processor-flags.h>
9#include "wakeup.h"
9 10
10 .code16 11 .code16
11 .section ".header", "a" 12 .section ".jump", "ax"
13 .globl _start
14_start:
15 cli
16 jmp wakeup_code
12 17
13/* This should match the structure in wakeup.h */ 18/* This should match the structure in wakeup.h */
19 .section ".header", "a"
14 .globl wakeup_header 20 .globl wakeup_header
15wakeup_header: 21wakeup_header:
16video_mode: .short 0 /* Video mode number */ 22video_mode: .short 0 /* Video mode number */
@@ -22,6 +28,8 @@ pmode_cr3: .long 0 /* Saved %cr3 */
22pmode_cr4: .long 0 /* Saved %cr4 */ 28pmode_cr4: .long 0 /* Saved %cr4 */
23pmode_efer: .quad 0 /* Saved EFER */ 29pmode_efer: .quad 0 /* Saved EFER */
24pmode_gdt: .quad 0 30pmode_gdt: .quad 0
31pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */
32pmode_behavior: .long 0 /* Wakeup behavior flags */
25realmode_flags: .long 0 33realmode_flags: .long 0
26real_magic: .long 0 34real_magic: .long 0
27trampoline_segment: .word 0 35trampoline_segment: .word 0
@@ -30,14 +38,11 @@ wakeup_jmp: .byte 0xea /* ljmpw */
30wakeup_jmp_off: .word 3f 38wakeup_jmp_off: .word 3f
31wakeup_jmp_seg: .word 0 39wakeup_jmp_seg: .word 0
32wakeup_gdt: .quad 0, 0, 0 40wakeup_gdt: .quad 0, 0, 0
33signature: .long 0x51ee1111 41signature: .long WAKEUP_HEADER_SIGNATURE
34 42
35 .text 43 .text
36 .globl _start
37 .code16 44 .code16
38wakeup_code: 45wakeup_code:
39_start:
40 cli
41 cld 46 cld
42 47
43 /* Apparently some dimwit BIOS programmers don't know how to 48 /* Apparently some dimwit BIOS programmers don't know how to
@@ -77,17 +82,29 @@ _start:
77 82
78 /* Check header signature... */ 83 /* Check header signature... */
79 movl signature, %eax 84 movl signature, %eax
80 cmpl $0x51ee1111, %eax 85 cmpl $WAKEUP_HEADER_SIGNATURE, %eax
81 jne bogus_real_magic 86 jne bogus_real_magic
82 87
83 /* Check we really have everything... */ 88 /* Check we really have everything... */
84 movl end_signature, %eax 89 movl end_signature, %eax
85 cmpl $0x65a22c82, %eax 90 cmpl $WAKEUP_END_SIGNATURE, %eax
86 jne bogus_real_magic 91 jne bogus_real_magic
87 92
88 /* Call the C code */ 93 /* Call the C code */
89 calll main 94 calll main
90 95
96 /* Restore MISC_ENABLE before entering protected mode, in case
97 BIOS decided to clear XD_DISABLE during S3. */
98 movl pmode_behavior, %eax
99 btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
100 jnc 1f
101
102 movl pmode_misc_en, %eax
103 movl pmode_misc_en + 4, %edx
104 movl $MSR_IA32_MISC_ENABLE, %ecx
105 wrmsr
1061:
107
91 /* Do any other stuff... */ 108 /* Do any other stuff... */
92 109
93#ifndef CONFIG_64BIT 110#ifndef CONFIG_64BIT
@@ -147,3 +164,7 @@ wakeup_heap:
147wakeup_stack: 164wakeup_stack:
148 .space 2048 165 .space 2048
149wakeup_stack_end: 166wakeup_stack_end:
167
168 .section ".signature","a"
169end_signature:
170 .long WAKEUP_END_SIGNATURE
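
The new code in wakeup.S checks a behavior flag and, when it is set, restores the MISC_ENABLE MSR saved before suspend -- per the added comment, some BIOSes clear XD_DISABLE during S3. Expressed in C rather than real-mode assembly (wrmsr_demo is a mock; the real code executes WRMSR with the saved value in EDX:EAX), the flag-guarded restore is just:

#include <stdint.h>
#include <stdio.h>

#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE	0	/* bit number, as in wakeup.h */

/* Mocked MSR write for illustration only. */
static void wrmsr_demo(uint32_t msr, uint32_t lo, uint32_t hi)
{
	printf("wrmsr(0x%x) <- %08x:%08x\n", msr, hi, lo);
}

static void maybe_restore_misc_enable(uint32_t behavior,
				       uint32_t misc_lo, uint32_t misc_hi)
{
	/* Only restore if the header said the saved value is valid. */
	if (behavior & (1u << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE))
		wrmsr_demo(0x1a0 /* MSR_IA32_MISC_ENABLE */, misc_lo, misc_hi);
}

int main(void)
{
	maybe_restore_misc_enable(1u << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE,
				  0x00850089, 0x0);
	return 0;
}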
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index 69d38d0b2b64..97a29e1430e3 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -21,6 +21,9 @@ struct wakeup_header {
21 u32 pmode_efer_low; /* Protected mode EFER */ 21 u32 pmode_efer_low; /* Protected mode EFER */
22 u32 pmode_efer_high; 22 u32 pmode_efer_high;
23 u64 pmode_gdt; 23 u64 pmode_gdt;
24 u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */
25 u32 pmode_misc_en_high;
26 u32 pmode_behavior; /* Wakeup routine behavior flags */
24 u32 realmode_flags; 27 u32 realmode_flags;
25 u32 real_magic; 28 u32 real_magic;
26 u16 trampoline_segment; /* segment with trampoline code, 64-bit only */ 29 u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
@@ -35,7 +38,11 @@ struct wakeup_header {
35extern struct wakeup_header wakeup_header; 38extern struct wakeup_header wakeup_header;
36#endif 39#endif
37 40
38#define HEADER_OFFSET 0x3f00 41#define WAKEUP_HEADER_OFFSET 8
39#define WAKEUP_SIZE 0x4000 42#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
43#define WAKEUP_END_SIGNATURE 0x65a22c82
44
45/* Wakeup behavior bits */
46#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0
40 47
41#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ 48#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
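
wakeup.S carries the comment "This should match the structure in wakeup.h", so every field added here (pmode_misc_en_low/high, pmode_behavior) has to land at the same offset as its assembly-side label. A trimmed sketch of how one could eyeball such a layout with offsetof -- the struct below is abbreviated and its offsets are illustrative only, not those of the real struct wakeup_header:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Abbreviated stand-in for struct wakeup_header, for illustration. */
struct demo_wakeup_header {
	uint16_t video_mode;
	uint32_t pmode_entry;
	uint32_t pmode_cr0;
	uint64_t pmode_misc_en;		/* saved MISC_ENABLE MSR */
	uint32_t pmode_behavior;	/* wakeup behavior flags */
	uint32_t signature;
} __attribute__((packed));

int main(void)
{
	/* Print the offsets the assembly labels would have to match. */
	printf("pmode_misc_en  at %zu\n", offsetof(struct demo_wakeup_header, pmode_misc_en));
	printf("pmode_behavior at %zu\n", offsetof(struct demo_wakeup_header, pmode_behavior));
	printf("signature      at %zu\n", offsetof(struct demo_wakeup_header, signature));
	return 0;
}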
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 060fff8f5c5b..d4f8010a5b1b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -13,9 +13,19 @@ ENTRY(_start)
13SECTIONS 13SECTIONS
14{ 14{
15 . = 0; 15 . = 0;
16 .jump : {
17 *(.jump)
18 } = 0x90909090
19
20 . = WAKEUP_HEADER_OFFSET;
21 .header : {
22 *(.header)
23 }
24
25 . = ALIGN(16);
16 .text : { 26 .text : {
17 *(.text*) 27 *(.text*)
18 } 28 } = 0x90909090
19 29
20 . = ALIGN(16); 30 . = ALIGN(16);
21 .rodata : { 31 .rodata : {
@@ -33,11 +43,6 @@ SECTIONS
33 *(.data*) 43 *(.data*)
34 } 44 }
35 45
36 .signature : {
37 end_signature = .;
38 LONG(0x65a22c82)
39 }
40
41 . = ALIGN(16); 46 . = ALIGN(16);
42 .bss : { 47 .bss : {
43 __bss_start = .; 48 __bss_start = .;
@@ -45,20 +50,13 @@ SECTIONS
45 __bss_end = .; 50 __bss_end = .;
46 } 51 }
47 52
48 . = HEADER_OFFSET; 53 .signature : {
49 .header : { 54 *(.signature)
50 *(.header)
51 } 55 }
52 56
53 . = ALIGN(16);
54 _end = .; 57 _end = .;
55 58
56 /DISCARD/ : { 59 /DISCARD/ : {
57 *(.note*) 60 *(.note*)
58 } 61 }
59
60 /*
61 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
62 */
63 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
64} 62}
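
The linker script now places a tiny .jump stub at offset 0, the header section at WAKEUP_HEADER_OFFSET (8), and the end signature in a dedicated .signature section emitted last, instead of pinning the header at the old 0x3f00 offset and asserting a fixed WAKEUP_SIZE. Purely as a hypothetical host-side sanity check (the file name wakeup.bin and the idea of an external checker are assumptions of mine; the magic value comes from wakeup.h above), one could verify the trailing signature of the produced blob like so:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define WAKEUP_END_SIGNATURE	0x65a22c82u

/* Schematic check: the .signature section is emitted last, so the final
 * four bytes of the flat binary should hold WAKEUP_END_SIGNATURE. */
int main(int argc, char **argv)
{
	unsigned char buf[64 * 1024];
	FILE *f = fopen(argc > 1 ? argv[1] : "wakeup.bin", "rb");
	size_t n;
	uint32_t sig;

	if (!f)
		return 1;
	n = fread(buf, 1, sizeof(buf), f);
	fclose(f);
	if (n < sizeof(sig))
		return 1;

	memcpy(&sig, buf + n - sizeof(sig), sizeof(sig));
	printf("end signature: 0x%08x (%s)\n", sig,
	       sig == WAKEUP_END_SIGNATURE ? "ok" : "mismatch");
	return sig == WAKEUP_END_SIGNATURE ? 0 : 1;
}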
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 33cec152070d..103b6ab368d3 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -7,45 +7,39 @@
7 7
8#include <linux/acpi.h> 8#include <linux/acpi.h>
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/memblock.h>
10#include <linux/dmi.h> 11#include <linux/dmi.h>
11#include <linux/cpumask.h> 12#include <linux/cpumask.h>
12#include <asm/segment.h> 13#include <asm/segment.h>
13#include <asm/desc.h> 14#include <asm/desc.h>
15#include <asm/pgtable.h>
16#include <asm/cacheflush.h>
14 17
15#include "realmode/wakeup.h" 18#include "realmode/wakeup.h"
16#include "sleep.h" 19#include "sleep.h"
17 20
18unsigned long acpi_wakeup_address;
19unsigned long acpi_realmode_flags; 21unsigned long acpi_realmode_flags;
20 22
21/* address in low memory of the wakeup routine. */
22static unsigned long acpi_realmode;
23
24#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) 23#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
25static char temp_stack[4096]; 24static char temp_stack[4096];
26#endif 25#endif
27 26
28/** 27/**
29 * acpi_save_state_mem - save kernel state 28 * acpi_suspend_lowlevel - save kernel state
30 * 29 *
31 * Create an identity mapped page table and copy the wakeup routine to 30 * Create an identity mapped page table and copy the wakeup routine to
32 * low memory. 31 * low memory.
33 *
34 * Note that this is too late to change acpi_wakeup_address.
35 */ 32 */
36int acpi_save_state_mem(void) 33int acpi_suspend_lowlevel(void)
37{ 34{
38 struct wakeup_header *header; 35 struct wakeup_header *header;
36 /* address in low memory of the wakeup routine. */
37 char *acpi_realmode;
39 38
40 if (!acpi_realmode) { 39 acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
41 printk(KERN_ERR "Could not allocate memory during boot, "
42 "S3 disabled\n");
43 return -ENOMEM;
44 }
45 memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
46 40
47 header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); 41 header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
48 if (header->signature != 0x51ee1111) { 42 if (header->signature != WAKEUP_HEADER_SIGNATURE) {
49 printk(KERN_ERR "wakeup header does not match\n"); 43 printk(KERN_ERR "wakeup header does not match\n");
50 return -EINVAL; 44 return -EINVAL;
51 } 45 }
@@ -65,9 +59,7 @@ int acpi_save_state_mem(void)
65 /* GDT[0]: GDT self-pointer */ 59 /* GDT[0]: GDT self-pointer */
66 header->wakeup_gdt[0] = 60 header->wakeup_gdt[0] =
67 (u64)(sizeof(header->wakeup_gdt) - 1) + 61 (u64)(sizeof(header->wakeup_gdt) - 1) +
68 ((u64)(acpi_wakeup_address + 62 ((u64)__pa(&header->wakeup_gdt) << 16);
69 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
70 << 16);
71 /* GDT[1]: big real mode-like code segment */ 63 /* GDT[1]: big real mode-like code segment */
72 header->wakeup_gdt[1] = 64 header->wakeup_gdt[1] =
73 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); 65 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
@@ -85,17 +77,23 @@ int acpi_save_state_mem(void)
85 77
86 header->pmode_cr0 = read_cr0(); 78 header->pmode_cr0 = read_cr0();
87 header->pmode_cr4 = read_cr4_safe(); 79 header->pmode_cr4 = read_cr4_safe();
80 header->pmode_behavior = 0;
81 if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
82 &header->pmode_misc_en_low,
83 &header->pmode_misc_en_high))
84 header->pmode_behavior |=
85 (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
88 header->realmode_flags = acpi_realmode_flags; 86 header->realmode_flags = acpi_realmode_flags;
89 header->real_magic = 0x12345678; 87 header->real_magic = 0x12345678;
90 88
91#ifndef CONFIG_64BIT 89#ifndef CONFIG_64BIT
92 header->pmode_entry = (u32)&wakeup_pmode_return; 90 header->pmode_entry = (u32)&wakeup_pmode_return;
93 header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET); 91 header->pmode_cr3 = (u32)__pa(&initial_page_table);
94 saved_magic = 0x12345678; 92 saved_magic = 0x12345678;
95#else /* CONFIG_64BIT */ 93#else /* CONFIG_64BIT */
96 header->trampoline_segment = setup_trampoline() >> 4; 94 header->trampoline_segment = trampoline_address() >> 4;
97#ifdef CONFIG_SMP 95#ifdef CONFIG_SMP
98 stack_start.sp = temp_stack + sizeof(temp_stack); 96 stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
99 early_gdt_descr.address = 97 early_gdt_descr.address =
100 (unsigned long)get_cpu_gdt_table(smp_processor_id()); 98 (unsigned long)get_cpu_gdt_table(smp_processor_id());
101 initial_gs = per_cpu_offset(smp_processor_id()); 99 initial_gs = per_cpu_offset(smp_processor_id());
@@ -104,47 +102,10 @@ int acpi_save_state_mem(void)
104 saved_magic = 0x123456789abcdef0L; 102 saved_magic = 0x123456789abcdef0L;
105#endif /* CONFIG_64BIT */ 103#endif /* CONFIG_64BIT */
106 104
105 do_suspend_lowlevel();
107 return 0; 106 return 0;
108} 107}
109 108
110/*
111 * acpi_restore_state - undo effects of acpi_save_state_mem
112 */
113void acpi_restore_state_mem(void)
114{
115}
116
117
118/**
119 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
120 *
121 * We allocate a page from the first 1MB of memory for the wakeup
122 * routine for when we come back from a sleep state. The
123 * runtime allocator allows specification of <16MB pages, but not
124 * <1MB pages.
125 */
126void __init acpi_reserve_wakeup_memory(void)
127{
128 unsigned long mem;
129
130 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
131 printk(KERN_ERR
132 "ACPI: Wakeup code way too big, S3 disabled.\n");
133 return;
134 }
135
136 mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
137
138 if (mem == -1L) {
139 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
140 return;
141 }
142 acpi_realmode = (unsigned long) phys_to_virt(mem);
143 acpi_wakeup_address = mem;
144 reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
145}
146
147
148static int __init acpi_sleep_setup(char *str) 109static int __init acpi_sleep_setup(char *str)
149{ 110{
150 while ((str != NULL) && (*str != '\0')) { 111 while ((str != NULL) && (*str != '\0')) {
@@ -157,11 +118,6 @@ static int __init acpi_sleep_setup(char *str)
157#ifdef CONFIG_HIBERNATION 118#ifdef CONFIG_HIBERNATION
158 if (strncmp(str, "s4_nohwsig", 10) == 0) 119 if (strncmp(str, "s4_nohwsig", 10) == 0)
159 acpi_no_s4_hw_signature(); 120 acpi_no_s4_hw_signature();
160 if (strncmp(str, "s4_nonvs", 8) == 0) {
161 pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
162 "please use acpi_sleep=nonvs instead");
163 acpi_nvs_nosave();
164 }
165#endif 121#endif
166 if (strncmp(str, "nonvs", 5) == 0) 122 if (strncmp(str, "nonvs", 5) == 0)
167 acpi_nvs_nosave(); 123 acpi_nvs_nosave();
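
On the save side, acpi_suspend_lowlevel() above reads MSR_IA32_MISC_ENABLE with rdmsr_safe() and only sets the restore bit in pmode_behavior when that read succeeds, so the wakeup stub never writes back junk on a CPU that faults on the MSR. The guarded-save shape, as a standalone sketch (rdmsr_safe_demo is a stand-in that can report failure; 0x1a0 is the architectural MSR number):

#include <stdint.h>
#include <stdio.h>

#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE	0

struct demo_header {
	uint32_t pmode_misc_en_low;
	uint32_t pmode_misc_en_high;
	uint32_t pmode_behavior;
};

/* Stand-in for rdmsr_safe(): returns 0 on success, nonzero on fault. */
static int rdmsr_safe_demo(uint32_t msr, uint32_t *lo, uint32_t *hi)
{
	(void)msr;
	*lo = 0x00850089;
	*hi = 0;
	return 0;
}

static void save_misc_enable(struct demo_header *h)
{
	h->pmode_behavior = 0;
	if (!rdmsr_safe_demo(0x1a0 /* MSR_IA32_MISC_ENABLE */,
			     &h->pmode_misc_en_low, &h->pmode_misc_en_high))
		h->pmode_behavior |= 1u << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE;
}

int main(void)
{
	struct demo_header h;

	save_misc_enable(&h);
	printf("behavior flags: 0x%x\n", h.pmode_behavior);
	return 0;
}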
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index adbcbaa6f1df..416d4be13fef 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -4,13 +4,12 @@
4 4
5#include <asm/trampoline.h> 5#include <asm/trampoline.h>
6 6
7extern char wakeup_code_start, wakeup_code_end;
8
9extern unsigned long saved_video_mode; 7extern unsigned long saved_video_mode;
10extern long saved_magic; 8extern long saved_magic;
11 9
12extern int wakeup_pmode_return; 10extern int wakeup_pmode_return;
13extern char swsusp_pg_dir[PAGE_SIZE];
14 11
15extern unsigned long acpi_copy_wakeup_routine(unsigned long); 12extern unsigned long acpi_copy_wakeup_routine(unsigned long);
16extern void wakeup_long64(void); 13extern void wakeup_long64(void);
14
15extern void do_suspend_lowlevel(void);
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
index 6ff3b5730575..63b8ab524f2c 100644
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ b/arch/x86/kernel/acpi/wakeup_rm.S
@@ -2,9 +2,11 @@
2 * Wrapper script for the realmode binary as a transport object 2 * Wrapper script for the realmode binary as a transport object
3 * before copying to low memory. 3 * before copying to low memory.
4 */ 4 */
5 .section ".rodata","a" 5#include <asm/page_types.h>
6 .globl wakeup_code_start, wakeup_code_end 6
7wakeup_code_start: 7 .section ".x86_trampoline","a"
8 .balign PAGE_SIZE
9 .globl acpi_wakeup_code
10acpi_wakeup_code:
8 .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" 11 .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
9wakeup_code_end: 12 .size acpi_wakeup_code, .-acpi_wakeup_code
10 .size wakeup_code_start, .-wakeup_code_start
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f65ab8b014c4..a81f2d52f869 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -67,17 +67,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
67#define DPRINTK(fmt, args...) if (debug_alternative) \ 67#define DPRINTK(fmt, args...) if (debug_alternative) \
68 printk(KERN_DEBUG fmt, args) 68 printk(KERN_DEBUG fmt, args)
69 69
70/*
71 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
72 * that correspond to that nop. Getting from one nop to the next, we
73 * add to the array the offset that is equal to the sum of all sizes of
74 * nops preceding the one we are after.
75 *
76 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
77 * nice symmetry of sizes of the previous nops.
78 */
70#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) 79#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
71/* Use inline assembly to define this because the nops are defined 80static const unsigned char intelnops[] =
72 as inline assembly strings in the include files and we cannot 81{
73 get them easily into strings. */ 82 GENERIC_NOP1,
74asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " 83 GENERIC_NOP2,
75 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 84 GENERIC_NOP3,
76 GENERIC_NOP7 GENERIC_NOP8 85 GENERIC_NOP4,
77 "\t.previous"); 86 GENERIC_NOP5,
78extern const unsigned char intelnops[]; 87 GENERIC_NOP6,
79static const unsigned char *const __initconst_or_module 88 GENERIC_NOP7,
80intel_nops[ASM_NOP_MAX+1] = { 89 GENERIC_NOP8,
90 GENERIC_NOP5_ATOMIC
91};
92static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
93{
81 NULL, 94 NULL,
82 intelnops, 95 intelnops,
83 intelnops + 1, 96 intelnops + 1,
@@ -87,17 +100,25 @@ intel_nops[ASM_NOP_MAX+1] = {
87 intelnops + 1 + 2 + 3 + 4 + 5, 100 intelnops + 1 + 2 + 3 + 4 + 5,
88 intelnops + 1 + 2 + 3 + 4 + 5 + 6, 101 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
89 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 102 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
103 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
90}; 104};
91#endif 105#endif
92 106
93#ifdef K8_NOP1 107#ifdef K8_NOP1
94asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " 108static const unsigned char k8nops[] =
95 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 109{
96 K8_NOP7 K8_NOP8 110 K8_NOP1,
97 "\t.previous"); 111 K8_NOP2,
98extern const unsigned char k8nops[]; 112 K8_NOP3,
99static const unsigned char *const __initconst_or_module 113 K8_NOP4,
100k8_nops[ASM_NOP_MAX+1] = { 114 K8_NOP5,
115 K8_NOP6,
116 K8_NOP7,
117 K8_NOP8,
118 K8_NOP5_ATOMIC
119};
120static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
121{
101 NULL, 122 NULL,
102 k8nops, 123 k8nops,
103 k8nops + 1, 124 k8nops + 1,
@@ -107,17 +128,25 @@ k8_nops[ASM_NOP_MAX+1] = {
107 k8nops + 1 + 2 + 3 + 4 + 5, 128 k8nops + 1 + 2 + 3 + 4 + 5,
108 k8nops + 1 + 2 + 3 + 4 + 5 + 6, 129 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
109 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 130 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
131 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
110}; 132};
111#endif 133#endif
112 134
113#if defined(K7_NOP1) && !defined(CONFIG_X86_64) 135#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
114asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " 136static const unsigned char k7nops[] =
115 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 137{
116 K7_NOP7 K7_NOP8 138 K7_NOP1,
117 "\t.previous"); 139 K7_NOP2,
118extern const unsigned char k7nops[]; 140 K7_NOP3,
119static const unsigned char *const __initconst_or_module 141 K7_NOP4,
120k7_nops[ASM_NOP_MAX+1] = { 142 K7_NOP5,
143 K7_NOP6,
144 K7_NOP7,
145 K7_NOP8,
146 K7_NOP5_ATOMIC
147};
148static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
149{
121 NULL, 150 NULL,
122 k7nops, 151 k7nops,
123 k7nops + 1, 152 k7nops + 1,
@@ -127,17 +156,25 @@ k7_nops[ASM_NOP_MAX+1] = {
127 k7nops + 1 + 2 + 3 + 4 + 5, 156 k7nops + 1 + 2 + 3 + 4 + 5,
128 k7nops + 1 + 2 + 3 + 4 + 5 + 6, 157 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
129 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 158 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
159 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
130}; 160};
131#endif 161#endif
132 162
133#ifdef P6_NOP1 163#ifdef P6_NOP1
134asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " 164static const unsigned char __initconst_or_module p6nops[] =
135 P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 165{
136 P6_NOP7 P6_NOP8 166 P6_NOP1,
137 "\t.previous"); 167 P6_NOP2,
138extern const unsigned char p6nops[]; 168 P6_NOP3,
139static const unsigned char *const __initconst_or_module 169 P6_NOP4,
140p6_nops[ASM_NOP_MAX+1] = { 170 P6_NOP5,
171 P6_NOP6,
172 P6_NOP7,
173 P6_NOP8,
174 P6_NOP5_ATOMIC
175};
176static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
177{
141 NULL, 178 NULL,
142 p6nops, 179 p6nops,
143 p6nops + 1, 180 p6nops + 1,
@@ -147,47 +184,65 @@ p6_nops[ASM_NOP_MAX+1] = {
147 p6nops + 1 + 2 + 3 + 4 + 5, 184 p6nops + 1 + 2 + 3 + 4 + 5,
148 p6nops + 1 + 2 + 3 + 4 + 5 + 6, 185 p6nops + 1 + 2 + 3 + 4 + 5 + 6,
149 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 186 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
187 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
150}; 188};
151#endif 189#endif
152 190
191/* Initialize these to a safe default */
153#ifdef CONFIG_X86_64 192#ifdef CONFIG_X86_64
193const unsigned char * const *ideal_nops = p6_nops;
194#else
195const unsigned char * const *ideal_nops = intel_nops;
196#endif
154 197
155extern char __vsyscall_0; 198void __init arch_init_ideal_nops(void)
156static const unsigned char *const *__init_or_module find_nop_table(void)
157{ 199{
158 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 200 switch (boot_cpu_data.x86_vendor) {
159 boot_cpu_has(X86_FEATURE_NOPL)) 201 case X86_VENDOR_INTEL:
160 return p6_nops; 202 /*
161 else 203 * Due to a decoder implementation quirk, some
162 return k8_nops; 204 * specific Intel CPUs actually perform better with
163} 205 * the "k8_nops" than with the SDM-recommended NOPs.
164 206 */
165#else /* CONFIG_X86_64 */ 207 if (boot_cpu_data.x86 == 6 &&
208 boot_cpu_data.x86_model >= 0x0f &&
209 boot_cpu_data.x86_model != 0x1c &&
210 boot_cpu_data.x86_model != 0x26 &&
211 boot_cpu_data.x86_model != 0x27 &&
212 boot_cpu_data.x86_model < 0x30) {
213 ideal_nops = k8_nops;
214 } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
215 ideal_nops = p6_nops;
216 } else {
217#ifdef CONFIG_X86_64
218 ideal_nops = k8_nops;
219#else
220 ideal_nops = intel_nops;
221#endif
222 }
166 223
167static const unsigned char *const *__init_or_module find_nop_table(void) 224 default:
168{ 225#ifdef CONFIG_X86_64
169 if (boot_cpu_has(X86_FEATURE_K8)) 226 ideal_nops = k8_nops;
170 return k8_nops; 227#else
171 else if (boot_cpu_has(X86_FEATURE_K7)) 228 if (boot_cpu_has(X86_FEATURE_K8))
172 return k7_nops; 229 ideal_nops = k8_nops;
173 else if (boot_cpu_has(X86_FEATURE_NOPL)) 230 else if (boot_cpu_has(X86_FEATURE_K7))
174 return p6_nops; 231 ideal_nops = k7_nops;
175 else 232 else
176 return intel_nops; 233 ideal_nops = intel_nops;
234#endif
235 }
177} 236}
178 237
179#endif /* CONFIG_X86_64 */
180
181/* Use this to add nops to a buffer, then text_poke the whole buffer. */ 238/* Use this to add nops to a buffer, then text_poke the whole buffer. */
182static void __init_or_module add_nops(void *insns, unsigned int len) 239static void __init_or_module add_nops(void *insns, unsigned int len)
183{ 240{
184 const unsigned char *const *noptable = find_nop_table();
185
186 while (len > 0) { 241 while (len > 0) {
187 unsigned int noplen = len; 242 unsigned int noplen = len;
188 if (noplen > ASM_NOP_MAX) 243 if (noplen > ASM_NOP_MAX)
189 noplen = ASM_NOP_MAX; 244 noplen = ASM_NOP_MAX;
190 memcpy(insns, noptable[noplen], noplen); 245 memcpy(insns, ideal_nops[noplen], noplen);
191 insns += noplen; 246 insns += noplen;
192 len -= noplen; 247 len -= noplen;
193 } 248 }
@@ -195,11 +250,12 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
195 250
196extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 251extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
197extern s32 __smp_locks[], __smp_locks_end[]; 252extern s32 __smp_locks[], __smp_locks_end[];
198static void *text_poke_early(void *addr, const void *opcode, size_t len); 253extern char __vsyscall_0;
254void *text_poke_early(void *addr, const void *opcode, size_t len);
199 255
200/* Replace instructions with better alternatives for this CPU type. 256/* Replace instructions with better alternatives for this CPU type.
201 This runs before SMP is initialized to avoid SMP problems with 257 This runs before SMP is initialized to avoid SMP problems with
202 self modifying code. This implies that assymetric systems where 258 self modifying code. This implies that asymmetric systems where
203 APs have less capabilities than the boot processor are not handled. 259 APs have less capabilities than the boot processor are not handled.
204 Tough. Make sure you disable such features by hand. */ 260 Tough. Make sure you disable such features by hand. */
205 261
@@ -210,6 +266,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
210 u8 insnbuf[MAX_PATCH_LEN]; 266 u8 insnbuf[MAX_PATCH_LEN];
211 267
212 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 268 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
269 /*
270 * The scan order should be from start to end. A later scanned
271 * alternative code can overwrite a previous scanned alternative code.
272 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
273 * patch code.
274 *
275 * So be careful if you want to change the scan order to any other
276 * order.
277 */
213 for (a = start; a < end; a++) { 278 for (a = start; a < end; a++) {
214 u8 *instr = a->instr; 279 u8 *instr = a->instr;
215 BUG_ON(a->replacementlen > a->instrlen); 280 BUG_ON(a->replacementlen > a->instrlen);
@@ -353,6 +418,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
353 mutex_unlock(&smp_alt); 418 mutex_unlock(&smp_alt);
354} 419}
355 420
421bool skip_smp_alternatives;
356void alternatives_smp_switch(int smp) 422void alternatives_smp_switch(int smp)
357{ 423{
358 struct smp_alt_module *mod; 424 struct smp_alt_module *mod;
@@ -368,7 +434,7 @@ void alternatives_smp_switch(int smp)
368 printk("lockdep: fixing up alternatives.\n"); 434 printk("lockdep: fixing up alternatives.\n");
369#endif 435#endif
370 436
371 if (noreplace_smp || smp_alt_once) 437 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
372 return; 438 return;
373 BUG_ON(!smp && (num_online_cpus() > 1)); 439 BUG_ON(!smp && (num_online_cpus() > 1));
374 440
@@ -522,7 +588,7 @@ void __init alternative_instructions(void)
522 * instructions. And on the local CPU you need to be protected again NMI or MCE 588 * instructions. And on the local CPU you need to be protected again NMI or MCE
523 * handlers seeing an inconsistent instruction while you patch. 589 * handlers seeing an inconsistent instruction while you patch.
524 */ 590 */
525static void *__init_or_module text_poke_early(void *addr, const void *opcode, 591void *__init_or_module text_poke_early(void *addr, const void *opcode,
526 size_t len) 592 size_t len)
527{ 593{
528 unsigned long flags; 594 unsigned long flags;
@@ -591,17 +657,21 @@ static atomic_t stop_machine_first;
591static int wrote_text; 657static int wrote_text;
592 658
593struct text_poke_params { 659struct text_poke_params {
594 void *addr; 660 struct text_poke_param *params;
595 const void *opcode; 661 int nparams;
596 size_t len;
597}; 662};
598 663
599static int __kprobes stop_machine_text_poke(void *data) 664static int __kprobes stop_machine_text_poke(void *data)
600{ 665{
601 struct text_poke_params *tpp = data; 666 struct text_poke_params *tpp = data;
667 struct text_poke_param *p;
668 int i;
602 669
603 if (atomic_dec_and_test(&stop_machine_first)) { 670 if (atomic_dec_and_test(&stop_machine_first)) {
604 text_poke(tpp->addr, tpp->opcode, tpp->len); 671 for (i = 0; i < tpp->nparams; i++) {
672 p = &tpp->params[i];
673 text_poke(p->addr, p->opcode, p->len);
674 }
605 smp_wmb(); /* Make sure other cpus see that this has run */ 675 smp_wmb(); /* Make sure other cpus see that this has run */
606 wrote_text = 1; 676 wrote_text = 1;
607 } else { 677 } else {
@@ -610,8 +680,17 @@ static int __kprobes stop_machine_text_poke(void *data)
610 smp_mb(); /* Load wrote_text before following execution */ 680 smp_mb(); /* Load wrote_text before following execution */
611 } 681 }
612 682
613 flush_icache_range((unsigned long)tpp->addr, 683 for (i = 0; i < tpp->nparams; i++) {
614 (unsigned long)tpp->addr + tpp->len); 684 p = &tpp->params[i];
685 flush_icache_range((unsigned long)p->addr,
686 (unsigned long)p->addr + p->len);
687 }
688 /*
689 * Intel Archiecture Software Developer's Manual section 7.1.3 specifies
690 * that a core serializing instruction such as "cpuid" should be
691 * executed on _each_ core before the new instruction is made visible.
692 */
693 sync_core();
615 return 0; 694 return 0;
616} 695}
617 696
@@ -631,13 +710,36 @@ static int __kprobes stop_machine_text_poke(void *data)
631void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) 710void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
632{ 711{
633 struct text_poke_params tpp; 712 struct text_poke_params tpp;
713 struct text_poke_param p;
634 714
635 tpp.addr = addr; 715 p.addr = addr;
636 tpp.opcode = opcode; 716 p.opcode = opcode;
637 tpp.len = len; 717 p.len = len;
718 tpp.params = &p;
719 tpp.nparams = 1;
638 atomic_set(&stop_machine_first, 1); 720 atomic_set(&stop_machine_first, 1);
639 wrote_text = 0; 721 wrote_text = 0;
640 stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); 722 /* Use __stop_machine() because the caller already got online_cpus. */
723 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
641 return addr; 724 return addr;
642} 725}
643 726
727/**
728 * text_poke_smp_batch - Update instructions on a live kernel on SMP
729 * @params: an array of text_poke parameters
730 * @n: the number of elements in params.
731 *
732 * Modify multi-byte instructions by using stop_machine() on SMP. Since
733 * stop_machine() is a heavy operation, it is better to aggregate text_poke
734 * requests and issue them in a single batch if possible.
735 *
736 * Note: Must be called under get_online_cpus() and text_mutex.
737 */
738void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
739{
740 struct text_poke_params tpp = {.params = params, .nparams = n};
741
742 atomic_set(&stop_machine_first, 1);
743 wrote_text = 0;
744 __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
745}
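
To make the intended use of the new batch interface concrete, here is a minimal caller-side sketch (not taken from this patch): the jump_sites[] array and NR_SITES are hypothetical, and the locking mirrors the requirement stated in the kerneldoc above.

/* Hypothetical caller of text_poke_smp_batch(); jump_sites[] and
 * NR_SITES are invented for this sketch. */
#include <linux/cpu.h>
#include <linux/memory.h>	/* text_mutex */
#include <asm/alternative.h>	/* struct text_poke_param */

static struct text_poke_param params[NR_SITES];

static void patch_all_sites(void)
{
	int i;

	for (i = 0; i < NR_SITES; i++) {
		params[i].addr   = jump_sites[i].addr;
		params[i].opcode = jump_sites[i].new_insn;
		params[i].len    = jump_sites[i].len;
	}

	get_online_cpus();			/* required by text_poke_smp_batch() */
	mutex_lock(&text_mutex);
	text_poke_smp_batch(params, NR_SITES);	/* one stop_machine() for all sites */
	mutex_unlock(&text_mutex);
	put_online_cpus();
}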
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 0f7f130caa67..b117efd24f71 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -27,7 +27,7 @@
27#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
30#include <linux/sysdev.h> 30#include <linux/syscore_ops.h>
31#include <linux/io.h> 31#include <linux/io.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <asm/atomic.h> 33#include <asm/atomic.h>
@@ -39,8 +39,9 @@
39#include <asm/cacheflush.h> 39#include <asm/cacheflush.h>
40#include <asm/swiotlb.h> 40#include <asm/swiotlb.h>
41#include <asm/dma.h> 41#include <asm/dma.h>
42#include <asm/k8.h> 42#include <asm/amd_nb.h>
43#include <asm/x86_init.h> 43#include <asm/x86_init.h>
44#include <asm/iommu_table.h>
44 45
45static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 46static unsigned long iommu_bus_base; /* GART remapping area (physical) */
46static unsigned long iommu_size; /* size of remapping area bytes */ 47static unsigned long iommu_size; /* size of remapping area bytes */
@@ -80,6 +81,9 @@ static u32 gart_unmapped_entry;
80#define AGPEXTERN 81#define AGPEXTERN
81#endif 82#endif
82 83
84/* GART can only remap to physical addresses < 1TB */
85#define GART_MAX_PHYS_ADDR (1ULL << 40)
86
83/* backdoor interface to AGP driver */ 87/* backdoor interface to AGP driver */
84AGPEXTERN int agp_memory_reserved; 88AGPEXTERN int agp_memory_reserved;
85AGPEXTERN __u32 *agp_gatt_table; 89AGPEXTERN __u32 *agp_gatt_table;
@@ -142,7 +146,7 @@ static void flush_gart(void)
142 146
143 spin_lock_irqsave(&iommu_bitmap_lock, flags); 147 spin_lock_irqsave(&iommu_bitmap_lock, flags);
144 if (need_flush) { 148 if (need_flush) {
145 k8_flush_garts(); 149 amd_flush_garts();
146 need_flush = false; 150 need_flush = false;
147 } 151 }
148 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 152 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -211,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
211 size_t size, int dir, unsigned long align_mask) 215 size_t size, int dir, unsigned long align_mask)
212{ 216{
213 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); 217 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
214 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); 218 unsigned long iommu_page;
215 int i; 219 int i;
216 220
221 if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
222 return bad_dma_addr;
223
224 iommu_page = alloc_iommu(dev, npages, align_mask);
217 if (iommu_page == -1) { 225 if (iommu_page == -1) {
218 if (!nonforced_iommu(dev, phys_mem, size)) 226 if (!nonforced_iommu(dev, phys_mem, size))
219 return phys_mem; 227 return phys_mem;
@@ -560,14 +568,17 @@ static void enable_gart_translations(void)
560{ 568{
561 int i; 569 int i;
562 570
563 for (i = 0; i < num_k8_northbridges; i++) { 571 if (!amd_nb_has_feature(AMD_NB_GART))
564 struct pci_dev *dev = k8_northbridges[i]; 572 return;
573
574 for (i = 0; i < amd_nb_num(); i++) {
575 struct pci_dev *dev = node_to_amd_nb(i)->misc;
565 576
566 enable_gart_translation(dev, __pa(agp_gatt_table)); 577 enable_gart_translation(dev, __pa(agp_gatt_table));
567 } 578 }
568 579
569 /* Flush the GART-TLB to remove stale entries */ 580 /* Flush the GART-TLB to remove stale entries */
570 k8_flush_garts(); 581 amd_flush_garts();
571} 582}
572 583
573/* 584/*
@@ -585,72 +596,62 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
585 aperture_alloc = aper_alloc; 596 aperture_alloc = aper_alloc;
586} 597}
587 598
588static void gart_fixup_northbridges(struct sys_device *dev) 599static void gart_fixup_northbridges(void)
589{ 600{
590 int i; 601 int i;
591 602
592 if (!fix_up_north_bridges) 603 if (!fix_up_north_bridges)
593 return; 604 return;
594 605
606 if (!amd_nb_has_feature(AMD_NB_GART))
607 return;
608
595 pr_info("PCI-DMA: Restoring GART aperture settings\n"); 609 pr_info("PCI-DMA: Restoring GART aperture settings\n");
596 610
597 for (i = 0; i < num_k8_northbridges; i++) { 611 for (i = 0; i < amd_nb_num(); i++) {
598 struct pci_dev *dev = k8_northbridges[i]; 612 struct pci_dev *dev = node_to_amd_nb(i)->misc;
599 613
600 /* 614 /*
601 * Don't enable translations just yet. That is the next 615 * Don't enable translations just yet. That is the next
602 * step. Restore the pre-suspend aperture settings. 616 * step. Restore the pre-suspend aperture settings.
603 */ 617 */
604 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); 618 gart_set_size_and_enable(dev, aperture_order);
605 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); 619 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
606 } 620 }
607} 621}
608 622
609static int gart_resume(struct sys_device *dev) 623static void gart_resume(void)
610{ 624{
611 pr_info("PCI-DMA: Resuming GART IOMMU\n"); 625 pr_info("PCI-DMA: Resuming GART IOMMU\n");
612 626
613 gart_fixup_northbridges(dev); 627 gart_fixup_northbridges();
614 628
615 enable_gart_translations(); 629 enable_gart_translations();
616
617 return 0;
618} 630}
619 631
620static int gart_suspend(struct sys_device *dev, pm_message_t state) 632static struct syscore_ops gart_syscore_ops = {
621{
622 return 0;
623}
624
625static struct sysdev_class gart_sysdev_class = {
626 .name = "gart",
627 .suspend = gart_suspend,
628 .resume = gart_resume, 633 .resume = gart_resume,
629 634
630}; 635};
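
For readers unfamiliar with the interface this hunk migrates to, a minimal syscore_ops sketch follows; the foo names are placeholders, not code from this patch. Unlike the removed sysdev pair, syscore callbacks take no device argument, the resume hook returns void, and registration cannot fail.

#include <linux/syscore_ops.h>

static int foo_suspend(void)
{
	/* save any state the hardware loses across suspend */
	return 0;
}

static void foo_resume(void)
{
	/* reprogram the hardware from the saved state */
}

static struct syscore_ops foo_syscore_ops = {
	.suspend = foo_suspend,		/* optional, may be omitted */
	.resume  = foo_resume,
};

static int __init foo_init(void)
{
	register_syscore_ops(&foo_syscore_ops);	/* returns void, cannot fail */
	return 0;
}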
631 636
632static struct sys_device device_gart = {
633 .cls = &gart_sysdev_class,
634};
635
636/* 637/*
637 * Private Northbridge GATT initialization in case we cannot use the 638 * Private Northbridge GATT initialization in case we cannot use the
638 * AGP driver for some reason. 639 * AGP driver for some reason.
639 */ 640 */
640static __init int init_k8_gatt(struct agp_kern_info *info) 641static __init int init_amd_gatt(struct agp_kern_info *info)
641{ 642{
642 unsigned aper_size, gatt_size, new_aper_size; 643 unsigned aper_size, gatt_size, new_aper_size;
643 unsigned aper_base, new_aper_base; 644 unsigned aper_base, new_aper_base;
644 struct pci_dev *dev; 645 struct pci_dev *dev;
645 void *gatt; 646 void *gatt;
646 int i, error; 647 int i;
647 648
648 pr_info("PCI-DMA: Disabling AGP.\n"); 649 pr_info("PCI-DMA: Disabling AGP.\n");
649 650
650 aper_size = aper_base = info->aper_size = 0; 651 aper_size = aper_base = info->aper_size = 0;
651 dev = NULL; 652 dev = NULL;
652 for (i = 0; i < num_k8_northbridges; i++) { 653 for (i = 0; i < amd_nb_num(); i++) {
653 dev = k8_northbridges[i]; 654 dev = node_to_amd_nb(i)->misc;
654 new_aper_base = read_aperture(dev, &new_aper_size); 655 new_aper_base = read_aperture(dev, &new_aper_size);
655 if (!new_aper_base) 656 if (!new_aper_base)
656 goto nommu; 657 goto nommu;
@@ -678,12 +679,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
678 679
679 agp_gatt_table = gatt; 680 agp_gatt_table = gatt;
680 681
681 error = sysdev_class_register(&gart_sysdev_class); 682 register_syscore_ops(&gart_syscore_ops);
682 if (!error)
683 error = sysdev_register(&device_gart);
684 if (error)
685 panic("Could not register gart_sysdev -- "
686 "would corrupt data on next suspend");
687 683
688 flush_gart(); 684 flush_gart();
689 685
@@ -718,10 +714,13 @@ static void gart_iommu_shutdown(void)
718 if (!no_agp) 714 if (!no_agp)
719 return; 715 return;
720 716
721 for (i = 0; i < num_k8_northbridges; i++) { 717 if (!amd_nb_has_feature(AMD_NB_GART))
718 return;
719
720 for (i = 0; i < amd_nb_num(); i++) {
722 u32 ctl; 721 u32 ctl;
723 722
724 dev = k8_northbridges[i]; 723 dev = node_to_amd_nb(i)->misc;
725 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 724 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
726 725
727 ctl &= ~GARTEN; 726 ctl &= ~GARTEN;
@@ -739,14 +738,14 @@ int __init gart_iommu_init(void)
739 unsigned long scratch; 738 unsigned long scratch;
740 long i; 739 long i;
741 740
742 if (num_k8_northbridges == 0) 741 if (!amd_nb_has_feature(AMD_NB_GART))
743 return 0; 742 return 0;
744 743
745#ifndef CONFIG_AGP_AMD64 744#ifndef CONFIG_AGP_AMD64
746 no_agp = 1; 745 no_agp = 1;
747#else 746#else
748 /* Makefile puts PCI initialization via subsys_initcall first. */ 747 /* Makefile puts PCI initialization via subsys_initcall first. */
749 /* Add other K8 AGP bridge drivers here */ 748 /* Add other AMD AGP bridge drivers here */
750 no_agp = no_agp || 749 no_agp = no_agp ||
751 (agp_amd64_init() < 0) || 750 (agp_amd64_init() < 0) ||
752 (agp_copy_info(agp_bridge, &info) < 0); 751 (agp_copy_info(agp_bridge, &info) < 0);
@@ -755,7 +754,7 @@ int __init gart_iommu_init(void)
755 if (no_iommu || 754 if (no_iommu ||
756 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 755 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
757 !gart_iommu_aperture || 756 !gart_iommu_aperture ||
758 (no_agp && init_k8_gatt(&info) < 0)) { 757 (no_agp && init_amd_gatt(&info) < 0)) {
759 if (max_pfn > MAX_DMA32_PFN) { 758 if (max_pfn > MAX_DMA32_PFN) {
760 pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); 759 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
761 pr_warning("falling back to iommu=soft.\n"); 760 pr_warning("falling back to iommu=soft.\n");
@@ -896,3 +895,4 @@ void __init gart_parse_options(char *p)
896 } 895 }
897 } 896 }
898} 897}
898IOMMU_INIT_POST(gart_iommu_hole_init);
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 679b6450382b..7c3a95e54ec5 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/pci-ats.h>
21#include <linux/bitmap.h> 22#include <linux/bitmap.h>
22#include <linux/slab.h> 23#include <linux/slab.h>
23#include <linux/debugfs.h> 24#include <linux/debugfs.h>
@@ -25,16 +26,18 @@
25#include <linux/dma-mapping.h> 26#include <linux/dma-mapping.h>
26#include <linux/iommu-helper.h> 27#include <linux/iommu-helper.h>
27#include <linux/iommu.h> 28#include <linux/iommu.h>
29#include <linux/delay.h>
28#include <asm/proto.h> 30#include <asm/proto.h>
29#include <asm/iommu.h> 31#include <asm/iommu.h>
30#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/dma.h>
31#include <asm/amd_iommu_proto.h> 34#include <asm/amd_iommu_proto.h>
32#include <asm/amd_iommu_types.h> 35#include <asm/amd_iommu_types.h>
33#include <asm/amd_iommu.h> 36#include <asm/amd_iommu.h>
34 37
35#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 38#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
36 39
37#define EXIT_LOOP_COUNT 10000000 40#define LOOP_TIMEOUT 100000
38 41
39static DEFINE_RWLOCK(amd_iommu_devtable_lock); 42static DEFINE_RWLOCK(amd_iommu_devtable_lock);
40 43
@@ -57,7 +60,6 @@ struct iommu_cmd {
57 u32 data[4]; 60 u32 data[4];
58}; 61};
59 62
60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
61static void update_domain(struct protection_domain *domain); 63static void update_domain(struct protection_domain *domain);
62 64
63/**************************************************************************** 65/****************************************************************************
@@ -153,6 +155,10 @@ static int iommu_init_device(struct device *dev)
153 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); 155 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
154 if (pdev) 156 if (pdev)
155 dev_data->alias = &pdev->dev; 157 dev_data->alias = &pdev->dev;
158 else {
159 kfree(dev_data);
160 return -ENOTSUPP;
161 }
156 162
157 atomic_set(&dev_data->bind, 0); 163 atomic_set(&dev_data->bind, 0);
158 164
@@ -162,6 +168,20 @@ static int iommu_init_device(struct device *dev)
162 return 0; 168 return 0;
163} 169}
164 170
171static void iommu_ignore_device(struct device *dev)
172{
173 u16 devid, alias;
174
175 devid = get_device_id(dev);
176 alias = amd_iommu_alias_table[devid];
177
178 memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
179 memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
180
181 amd_iommu_rlookup_table[devid] = NULL;
182 amd_iommu_rlookup_table[alias] = NULL;
183}
184
165static void iommu_uninit_device(struct device *dev) 185static void iommu_uninit_device(struct device *dev)
166{ 186{
167 kfree(dev->archdata.iommu); 187 kfree(dev->archdata.iommu);
@@ -191,7 +211,9 @@ int __init amd_iommu_init_devices(void)
191 continue; 211 continue;
192 212
193 ret = iommu_init_device(&pdev->dev); 213 ret = iommu_init_device(&pdev->dev);
194 if (ret) 214 if (ret == -ENOTSUPP)
215 iommu_ignore_device(&pdev->dev);
216 else if (ret)
195 goto out_free; 217 goto out_free;
196 } 218 }
197 219
@@ -322,8 +344,6 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
322 break; 344 break;
323 case EVENT_TYPE_ILL_CMD: 345 case EVENT_TYPE_ILL_CMD:
324 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 346 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
325 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 dump_command(address); 347 dump_command(address);
328 break; 348 break;
329 case EVENT_TYPE_CMD_HARD_ERR: 349 case EVENT_TYPE_CMD_HARD_ERR:
@@ -367,7 +387,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
367 spin_unlock_irqrestore(&iommu->lock, flags); 387 spin_unlock_irqrestore(&iommu->lock, flags);
368} 388}
369 389
370irqreturn_t amd_iommu_int_handler(int irq, void *data) 390irqreturn_t amd_iommu_int_thread(int irq, void *data)
371{ 391{
372 struct amd_iommu *iommu; 392 struct amd_iommu *iommu;
373 393
@@ -377,192 +397,300 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
377 return IRQ_HANDLED; 397 return IRQ_HANDLED;
378} 398}
379 399
400irqreturn_t amd_iommu_int_handler(int irq, void *data)
401{
402 return IRQ_WAKE_THREAD;
403}
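
The hard handler above only wakes the threaded handler; the heavy event polling now runs in amd_iommu_int_thread(). A generic sketch of this split is shown below with made-up foo names; the actual request_threaded_irq() wiring for AMD-Vi appears in the amd_iommu_init.c hunk later in this patch.

#include <linux/interrupt.h>

static irqreturn_t foo_hardirq(int irq, void *data)
{
	/* hard interrupt context: defer all real work to the thread */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t foo_thread_fn(int irq, void *data)
{
	/* runs in a kernel thread and may therefore sleep */
	return IRQ_HANDLED;
}

static int foo_setup_irq(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, foo_hardirq, foo_thread_fn,
				    0, "foo", dev);
}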
404
380/**************************************************************************** 405/****************************************************************************
381 * 406 *
382 * IOMMU command queuing functions 407 * IOMMU command queuing functions
383 * 408 *
384 ****************************************************************************/ 409 ****************************************************************************/
385 410
386/* 411static int wait_on_sem(volatile u64 *sem)
387 * Writes the command to the IOMMUs command buffer and informs the 412{
388 * hardware about the new command. Must be called with iommu->lock held. 413 int i = 0;
389 */ 414
390static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 415 while (*sem == 0 && i < LOOP_TIMEOUT) {
416 udelay(1);
417 i += 1;
418 }
419
420 if (i == LOOP_TIMEOUT) {
421 pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
422 return -EIO;
423 }
424
425 return 0;
426}
427
428static void copy_cmd_to_buffer(struct amd_iommu *iommu,
429 struct iommu_cmd *cmd,
430 u32 tail)
391{ 431{
392 u32 tail, head;
393 u8 *target; 432 u8 *target;
394 433
395 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
396 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
397 target = iommu->cmd_buf + tail; 434 target = iommu->cmd_buf + tail;
398 memcpy_toio(target, cmd, sizeof(*cmd)); 435 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
399 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; 436
400 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 437 /* Copy command to buffer */
401 if (tail == head) 438 memcpy(target, cmd, sizeof(*cmd));
402 return -ENOMEM; 439
440 /* Tell the IOMMU about it */
403 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 441 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
442}
404 443
405 return 0; 444static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
445{
446 WARN_ON(address & 0x7ULL);
447
448 memset(cmd, 0, sizeof(*cmd));
449 cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
450 cmd->data[1] = upper_32_bits(__pa(address));
451 cmd->data[2] = 1;
452 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
453}
454
455static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
456{
457 memset(cmd, 0, sizeof(*cmd));
458 cmd->data[0] = devid;
459 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
460}
461
462static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
463 size_t size, u16 domid, int pde)
464{
465 u64 pages;
466 int s;
467
468 pages = iommu_num_pages(address, size, PAGE_SIZE);
469 s = 0;
470
471 if (pages > 1) {
472 /*
473 * If we have to flush more than one page, flush all
474 * TLB entries for this domain
475 */
476 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
477 s = 1;
478 }
479
480 address &= PAGE_MASK;
481
482 memset(cmd, 0, sizeof(*cmd));
483 cmd->data[1] |= domid;
484 cmd->data[2] = lower_32_bits(address);
485 cmd->data[3] = upper_32_bits(address);
486 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
487 if (s) /* size bit - we flush more than one 4kb page */
488 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
489 if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
490 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
491}
492
493static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
494 u64 address, size_t size)
495{
496 u64 pages;
497 int s;
498
499 pages = iommu_num_pages(address, size, PAGE_SIZE);
500 s = 0;
501
502 if (pages > 1) {
503 /*
504 * If we have to flush more than one page, flush all
505 * TLB entries for this domain
506 */
507 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
508 s = 1;
509 }
510
511 address &= PAGE_MASK;
512
513 memset(cmd, 0, sizeof(*cmd));
514 cmd->data[0] = devid;
515 cmd->data[0] |= (qdep & 0xff) << 24;
516 cmd->data[1] = devid;
517 cmd->data[2] = lower_32_bits(address);
518 cmd->data[3] = upper_32_bits(address);
519 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
520 if (s)
521 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
522}
523
524static void build_inv_all(struct iommu_cmd *cmd)
525{
526 memset(cmd, 0, sizeof(*cmd));
527 CMD_SET_TYPE(cmd, CMD_INV_ALL);
406} 528}
407 529
408/* 530/*
409 * General queuing function for commands. Takes iommu->lock and calls 531 * Writes the command to the IOMMUs command buffer and informs the
410 * __iommu_queue_command(). 532 * hardware about the new command.
411 */ 533 */
412static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 534static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
413{ 535{
536 u32 left, tail, head, next_tail;
414 unsigned long flags; 537 unsigned long flags;
415 int ret;
416 538
539 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
540
541again:
417 spin_lock_irqsave(&iommu->lock, flags); 542 spin_lock_irqsave(&iommu->lock, flags);
418 ret = __iommu_queue_command(iommu, cmd);
419 if (!ret)
420 iommu->need_sync = true;
421 spin_unlock_irqrestore(&iommu->lock, flags);
422 543
423 return ret; 544 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
424} 545 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
546 next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
547 left = (head - next_tail) % iommu->cmd_buf_size;
425 548
426/* 549 if (left <= 2) {
427 * This function waits until an IOMMU has completed a completion 550 struct iommu_cmd sync_cmd;
428 * wait command 551 volatile u64 sem = 0;
429 */ 552 int ret;
430static void __iommu_wait_for_completion(struct amd_iommu *iommu)
431{
432 int ready = 0;
433 unsigned status = 0;
434 unsigned long i = 0;
435 553
436 INC_STATS_COUNTER(compl_wait); 554 build_completion_wait(&sync_cmd, (u64)&sem);
555 copy_cmd_to_buffer(iommu, &sync_cmd, tail);
437 556
438 while (!ready && (i < EXIT_LOOP_COUNT)) { 557 spin_unlock_irqrestore(&iommu->lock, flags);
439 ++i; 558
440 /* wait for the bit to become one */ 559 if ((ret = wait_on_sem(&sem)) != 0)
441 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); 560 return ret;
442 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; 561
562 goto again;
443 } 563 }
444 564
445 /* set bit back to zero */ 565 copy_cmd_to_buffer(iommu, cmd, tail);
446 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 566
447 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 567 /* We need to sync now to make sure all commands are processed */
568 iommu->need_sync = true;
448 569
449 if (unlikely(i == EXIT_LOOP_COUNT)) 570 spin_unlock_irqrestore(&iommu->lock, flags);
450 iommu->reset_in_progress = true; 571
572 return 0;
451} 573}
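
A worked example of the free-space check above, for illustration only; the numbers assume a 512-byte command buffer and 16-byte commands.

/*
 * head = 0x020, tail = 0x1f0:  next_tail = (0x1f0 + 0x10) % 0x200 = 0x000,
 *                              left = (0x020 - 0x000) % 0x200 = 0x20
 *                              -> plenty of room, the command is copied.
 *
 * head = 0x010, tail = 0x000:  next_tail = 0x010,
 *                              left = (0x010 - 0x010) % 0x200 = 0
 *                              -> left <= 2, so a COMPLETION_WAIT is queued
 *                                 instead, the CPU spins in wait_on_sem()
 *                                 until the IOMMU drains the ring, and the
 *                                 write is retried from the "again" label.
 */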
452 574
453/* 575/*
454 * This function queues a completion wait command into the command 576 * This function queues a completion wait command into the command
455 * buffer of an IOMMU 577 * buffer of an IOMMU
456 */ 578 */
457static int __iommu_completion_wait(struct amd_iommu *iommu) 579static int iommu_completion_wait(struct amd_iommu *iommu)
458{ 580{
459 struct iommu_cmd cmd; 581 struct iommu_cmd cmd;
582 volatile u64 sem = 0;
583 int ret;
460 584
461 memset(&cmd, 0, sizeof(cmd)); 585 if (!iommu->need_sync)
462 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; 586 return 0;
463 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
464 587
465 return __iommu_queue_command(iommu, &cmd); 588 build_completion_wait(&cmd, (u64)&sem);
589
590 ret = iommu_queue_command(iommu, &cmd);
591 if (ret)
592 return ret;
593
594 return wait_on_sem(&sem);
466} 595}
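
Putting the helpers above together, a hedged sketch (not part of the patch) of flushing a small range for a domain; a three-page size forces the S bit, so the whole domain TLB is invalidated rather than iterating page by page.

static void example_flush_range(struct amd_iommu *iommu, u16 domid)
{
	struct iommu_cmd cmd;

	/* 3 pages -> the address is replaced by CMD_INV_IOMMU_ALL_PAGES_ADDRESS
	 * and CMD_INV_IOMMU_PAGES_SIZE_MASK is set by build_inv_iommu_pages() */
	build_inv_iommu_pages(&cmd, 0x1000, 3 * PAGE_SIZE, domid, 0);

	iommu_queue_command(iommu, &cmd);
	iommu_completion_wait(iommu);	/* wait until the IOMMU has processed it */
}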
467 596
468/* 597static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
469 * This function is called whenever we need to ensure that the IOMMU has
470 * completed execution of all commands we sent. It sends a
471 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
472 * us about that by writing a value to a physical address we pass with
473 * the command.
474 */
475static int iommu_completion_wait(struct amd_iommu *iommu)
476{ 598{
477 int ret = 0; 599 struct iommu_cmd cmd;
478 unsigned long flags;
479
480 spin_lock_irqsave(&iommu->lock, flags);
481 600
482 if (!iommu->need_sync) 601 build_inv_dte(&cmd, devid);
483 goto out;
484 602
485 ret = __iommu_completion_wait(iommu); 603 return iommu_queue_command(iommu, &cmd);
604}
486 605
487 iommu->need_sync = false; 606static void iommu_flush_dte_all(struct amd_iommu *iommu)
607{
608 u32 devid;
488 609
489 if (ret) 610 for (devid = 0; devid <= 0xffff; ++devid)
490 goto out; 611 iommu_flush_dte(iommu, devid);
491 612
492 __iommu_wait_for_completion(iommu); 613 iommu_completion_wait(iommu);
614}
493 615
494out: 616/*
495 spin_unlock_irqrestore(&iommu->lock, flags); 617 * This function uses heavy locking and may disable irqs for some time. But
618 * this is no issue because it is only called during resume.
619 */
620static void iommu_flush_tlb_all(struct amd_iommu *iommu)
621{
622 u32 dom_id;
496 623
497 if (iommu->reset_in_progress) 624 for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
498 reset_iommu_command_buffer(iommu); 625 struct iommu_cmd cmd;
626 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
627 dom_id, 1);
628 iommu_queue_command(iommu, &cmd);
629 }
499 630
500 return 0; 631 iommu_completion_wait(iommu);
501} 632}
502 633
503static void iommu_flush_complete(struct protection_domain *domain) 634static void iommu_flush_all(struct amd_iommu *iommu)
504{ 635{
505 int i; 636 struct iommu_cmd cmd;
506 637
507 for (i = 0; i < amd_iommus_present; ++i) { 638 build_inv_all(&cmd);
508 if (!domain->dev_iommu[i])
509 continue;
510 639
511 /* 640 iommu_queue_command(iommu, &cmd);
512 * Devices of this domain are behind this IOMMU 641 iommu_completion_wait(iommu);
513 * We need to wait for completion of all commands. 642}
514 */ 643
515 iommu_completion_wait(amd_iommus[i]); 644void iommu_flush_all_caches(struct amd_iommu *iommu)
645{
646 if (iommu_feature(iommu, FEATURE_IA)) {
647 iommu_flush_all(iommu);
648 } else {
649 iommu_flush_dte_all(iommu);
650 iommu_flush_tlb_all(iommu);
516 } 651 }
517} 652}
518 653
519/* 654/*
520 * Command send function for invalidating a device table entry 655 * Command send function for flushing on-device TLB
521 */ 656 */
522static int iommu_flush_device(struct device *dev) 657static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
523{ 658{
659 struct pci_dev *pdev = to_pci_dev(dev);
524 struct amd_iommu *iommu; 660 struct amd_iommu *iommu;
525 struct iommu_cmd cmd; 661 struct iommu_cmd cmd;
526 u16 devid; 662 u16 devid;
663 int qdep;
527 664
665 qdep = pci_ats_queue_depth(pdev);
528 devid = get_device_id(dev); 666 devid = get_device_id(dev);
529 iommu = amd_iommu_rlookup_table[devid]; 667 iommu = amd_iommu_rlookup_table[devid];
530 668
531 /* Build command */ 669 build_inv_iotlb_pages(&cmd, devid, qdep, address, size);
532 memset(&cmd, 0, sizeof(cmd));
533 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
534 cmd.data[0] = devid;
535 670
536 return iommu_queue_command(iommu, &cmd); 671 return iommu_queue_command(iommu, &cmd);
537} 672}
538 673
539static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
540 u16 domid, int pde, int s)
541{
542 memset(cmd, 0, sizeof(*cmd));
543 address &= PAGE_MASK;
544 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
545 cmd->data[1] |= domid;
546 cmd->data[2] = lower_32_bits(address);
547 cmd->data[3] = upper_32_bits(address);
548 if (s) /* size bit - we flush more than one 4kb page */
549 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
550 if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
551 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
552}
553
554/* 674/*
555 * Generic command send function for invalidating TLB entries 675 * Command send function for invalidating a device table entry
556 */ 676 */
557static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, 677static int device_flush_dte(struct device *dev)
558 u64 address, u16 domid, int pde, int s)
559{ 678{
560 struct iommu_cmd cmd; 679 struct amd_iommu *iommu;
680 struct pci_dev *pdev;
681 u16 devid;
561 int ret; 682 int ret;
562 683
563 __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); 684 pdev = to_pci_dev(dev);
685 devid = get_device_id(dev);
686 iommu = amd_iommu_rlookup_table[devid];
564 687
565 ret = iommu_queue_command(iommu, &cmd); 688 ret = iommu_flush_dte(iommu, devid);
689 if (ret)
690 return ret;
691
692 if (pci_ats_enabled(pdev))
693 ret = device_flush_iotlb(dev, 0, ~0UL);
566 694
567 return ret; 695 return ret;
568} 696}
@@ -572,23 +700,14 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
572 * It invalidates a single PTE if the range to flush is within a single 700 * It invalidates a single PTE if the range to flush is within a single
573 * page. Otherwise it flushes the whole TLB of the IOMMU. 701 * page. Otherwise it flushes the whole TLB of the IOMMU.
574 */ 702 */
575static void __iommu_flush_pages(struct protection_domain *domain, 703static void __domain_flush_pages(struct protection_domain *domain,
576 u64 address, size_t size, int pde) 704 u64 address, size_t size, int pde)
577{ 705{
578 int s = 0, i; 706 struct iommu_dev_data *dev_data;
579 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); 707 struct iommu_cmd cmd;
580 708 int ret = 0, i;
581 address &= PAGE_MASK;
582
583 if (pages > 1) {
584 /*
585 * If we have to flush more than one page, flush all
586 * TLB entries for this domain
587 */
588 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
589 s = 1;
590 }
591 709
710 build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
592 711
593 for (i = 0; i < amd_iommus_present; ++i) { 712 for (i = 0; i < amd_iommus_present; ++i) {
594 if (!domain->dev_iommu[i]) 713 if (!domain->dev_iommu[i])
@@ -598,101 +717,70 @@ static void __iommu_flush_pages(struct protection_domain *domain,
598 * Devices of this domain are behind this IOMMU 717 * Devices of this domain are behind this IOMMU
599 * We need a TLB flush 718 * We need a TLB flush
600 */ 719 */
601 iommu_queue_inv_iommu_pages(amd_iommus[i], address, 720 ret |= iommu_queue_command(amd_iommus[i], &cmd);
602 domain->id, pde, s); 721 }
722
723 list_for_each_entry(dev_data, &domain->dev_list, list) {
724 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
725
726 if (!pci_ats_enabled(pdev))
727 continue;
728
729 ret |= device_flush_iotlb(dev_data->dev, address, size);
603 } 730 }
604 731
605 return; 732 WARN_ON(ret);
606} 733}
607 734
608static void iommu_flush_pages(struct protection_domain *domain, 735static void domain_flush_pages(struct protection_domain *domain,
609 u64 address, size_t size) 736 u64 address, size_t size)
610{ 737{
611 __iommu_flush_pages(domain, address, size, 0); 738 __domain_flush_pages(domain, address, size, 0);
612} 739}
613 740
614/* Flush the whole IO/TLB for a given protection domain */ 741/* Flush the whole IO/TLB for a given protection domain */
615static void iommu_flush_tlb(struct protection_domain *domain) 742static void domain_flush_tlb(struct protection_domain *domain)
616{ 743{
617 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); 744 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
618} 745}
619 746
620/* Flush the whole IO/TLB for a given protection domain - including PDE */ 747/* Flush the whole IO/TLB for a given protection domain - including PDE */
621static void iommu_flush_tlb_pde(struct protection_domain *domain) 748static void domain_flush_tlb_pde(struct protection_domain *domain)
622{
623 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
624}
625
626
627/*
628 * This function flushes the DTEs for all devices in domain
629 */
630static void iommu_flush_domain_devices(struct protection_domain *domain)
631{ 749{
632 struct iommu_dev_data *dev_data; 750 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
633 unsigned long flags;
634
635 spin_lock_irqsave(&domain->lock, flags);
636
637 list_for_each_entry(dev_data, &domain->dev_list, list)
638 iommu_flush_device(dev_data->dev);
639
640 spin_unlock_irqrestore(&domain->lock, flags);
641} 751}
642 752
643static void iommu_flush_all_domain_devices(void) 753static void domain_flush_complete(struct protection_domain *domain)
644{ 754{
645 struct protection_domain *domain; 755 int i;
646 unsigned long flags;
647 756
648 spin_lock_irqsave(&amd_iommu_pd_lock, flags); 757 for (i = 0; i < amd_iommus_present; ++i) {
758 if (!domain->dev_iommu[i])
759 continue;
649 760
650 list_for_each_entry(domain, &amd_iommu_pd_list, list) { 761 /*
651 iommu_flush_domain_devices(domain); 762 * Devices of this domain are behind this IOMMU
652 iommu_flush_complete(domain); 763 * We need to wait for completion of all commands.
764 */
765 iommu_completion_wait(amd_iommus[i]);
653 } 766 }
654
655 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
656} 767}
657 768
658void amd_iommu_flush_all_devices(void)
659{
660 iommu_flush_all_domain_devices();
661}
662 769
663/* 770/*
664 * This function uses heavy locking and may disable irqs for some time. But 771 * This function flushes the DTEs for all devices in domain
665 * this is no issue because it is only called during resume.
666 */ 772 */
667void amd_iommu_flush_all_domains(void) 773static void domain_flush_devices(struct protection_domain *domain)
668{ 774{
669 struct protection_domain *domain; 775 struct iommu_dev_data *dev_data;
670 unsigned long flags; 776 unsigned long flags;
671 777
672 spin_lock_irqsave(&amd_iommu_pd_lock, flags); 778 spin_lock_irqsave(&domain->lock, flags);
673
674 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
675 spin_lock(&domain->lock);
676 iommu_flush_tlb_pde(domain);
677 iommu_flush_complete(domain);
678 spin_unlock(&domain->lock);
679 }
680
681 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
682}
683
684static void reset_iommu_command_buffer(struct amd_iommu *iommu)
685{
686 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
687
688 if (iommu->reset_in_progress)
689 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
690 779
691 amd_iommu_reset_cmd_buffer(iommu); 780 list_for_each_entry(dev_data, &domain->dev_list, list)
692 amd_iommu_flush_all_devices(); 781 device_flush_dte(dev_data->dev);
693 amd_iommu_flush_all_domains();
694 782
695 iommu->reset_in_progress = false; 783 spin_unlock_irqrestore(&domain->lock, flags);
696} 784}
697 785
698/**************************************************************************** 786/****************************************************************************
@@ -1086,7 +1174,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
1086 1174
1087 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1175 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
1088 1176
1089 /* Intialize the exclusion range if necessary */ 1177 /* Initialize the exclusion range if necessary */
1090 for_each_iommu(iommu) { 1178 for_each_iommu(iommu) {
1091 if (iommu->exclusion_start && 1179 if (iommu->exclusion_start &&
1092 iommu->exclusion_start >= dma_dom->aperture[index]->offset 1180 iommu->exclusion_start >= dma_dom->aperture[index]->offset
@@ -1353,7 +1441,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1353 1441
1354/* 1442/*
1355 * Allocates a new protection domain usable for the dma_ops functions. 1443 * Allocates a new protection domain usable for the dma_ops functions.
1356 * It also intializes the page table and the address allocator data 1444 * It also initializes the page table and the address allocator data
1357 * structures required for the dma_ops interface 1445 * structures required for the dma_ops interface
1358 */ 1446 */
1359static struct dma_ops_domain *dma_ops_domain_alloc(void) 1447static struct dma_ops_domain *dma_ops_domain_alloc(void)
@@ -1410,17 +1498,22 @@ static bool dma_ops_domain(struct protection_domain *domain)
1410 return domain->flags & PD_DMA_OPS_MASK; 1498 return domain->flags & PD_DMA_OPS_MASK;
1411} 1499}
1412 1500
1413static void set_dte_entry(u16 devid, struct protection_domain *domain) 1501static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
1414{ 1502{
1415 u64 pte_root = virt_to_phys(domain->pt_root); 1503 u64 pte_root = virt_to_phys(domain->pt_root);
1504 u32 flags = 0;
1416 1505
1417 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 1506 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1418 << DEV_ENTRY_MODE_SHIFT; 1507 << DEV_ENTRY_MODE_SHIFT;
1419 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; 1508 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1420 1509
1421 amd_iommu_dev_table[devid].data[2] = domain->id; 1510 if (ats)
1422 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1511 flags |= DTE_FLAG_IOTLB;
1423 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1512
1513 amd_iommu_dev_table[devid].data[3] |= flags;
1514 amd_iommu_dev_table[devid].data[2] = domain->id;
1515 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1516 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1424} 1517}
1425 1518
1426static void clear_dte_entry(u16 devid) 1519static void clear_dte_entry(u16 devid)
@@ -1437,23 +1530,29 @@ static void do_attach(struct device *dev, struct protection_domain *domain)
1437{ 1530{
1438 struct iommu_dev_data *dev_data; 1531 struct iommu_dev_data *dev_data;
1439 struct amd_iommu *iommu; 1532 struct amd_iommu *iommu;
1533 struct pci_dev *pdev;
1534 bool ats = false;
1440 u16 devid; 1535 u16 devid;
1441 1536
1442 devid = get_device_id(dev); 1537 devid = get_device_id(dev);
1443 iommu = amd_iommu_rlookup_table[devid]; 1538 iommu = amd_iommu_rlookup_table[devid];
1444 dev_data = get_dev_data(dev); 1539 dev_data = get_dev_data(dev);
1540 pdev = to_pci_dev(dev);
1541
1542 if (amd_iommu_iotlb_sup)
1543 ats = pci_ats_enabled(pdev);
1445 1544
1446 /* Update data structures */ 1545 /* Update data structures */
1447 dev_data->domain = domain; 1546 dev_data->domain = domain;
1448 list_add(&dev_data->list, &domain->dev_list); 1547 list_add(&dev_data->list, &domain->dev_list);
1449 set_dte_entry(devid, domain); 1548 set_dte_entry(devid, domain, ats);
1450 1549
1451 /* Do reference counting */ 1550 /* Do reference counting */
1452 domain->dev_iommu[iommu->index] += 1; 1551 domain->dev_iommu[iommu->index] += 1;
1453 domain->dev_cnt += 1; 1552 domain->dev_cnt += 1;
1454 1553
1455 /* Flush the DTE entry */ 1554 /* Flush the DTE entry */
1456 iommu_flush_device(dev); 1555 device_flush_dte(dev);
1457} 1556}
1458 1557
1459static void do_detach(struct device *dev) 1558static void do_detach(struct device *dev)
@@ -1476,7 +1575,7 @@ static void do_detach(struct device *dev)
1476 clear_dte_entry(devid); 1575 clear_dte_entry(devid);
1477 1576
1478 /* Flush the DTE entry */ 1577 /* Flush the DTE entry */
1479 iommu_flush_device(dev); 1578 device_flush_dte(dev);
1480} 1579}
1481 1580
1482/* 1581/*
@@ -1539,9 +1638,13 @@ out_unlock:
1539static int attach_device(struct device *dev, 1638static int attach_device(struct device *dev,
1540 struct protection_domain *domain) 1639 struct protection_domain *domain)
1541{ 1640{
1641 struct pci_dev *pdev = to_pci_dev(dev);
1542 unsigned long flags; 1642 unsigned long flags;
1543 int ret; 1643 int ret;
1544 1644
1645 if (amd_iommu_iotlb_sup)
1646 pci_enable_ats(pdev, PAGE_SHIFT);
1647
1545 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1648 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1546 ret = __attach_device(dev, domain); 1649 ret = __attach_device(dev, domain);
1547 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1650 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
@@ -1551,7 +1654,7 @@ static int attach_device(struct device *dev,
1551 * left the caches in the IOMMU dirty. So we have to flush 1654 * left the caches in the IOMMU dirty. So we have to flush
1552 * here to evict all dirty stuff. 1655 * here to evict all dirty stuff.
1553 */ 1656 */
1554 iommu_flush_tlb_pde(domain); 1657 domain_flush_tlb_pde(domain);
1555 1658
1556 return ret; 1659 return ret;
1557} 1660}
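
A condensed sketch (not from this patch) of the ATS lifecycle these hunks wire up, using only the <linux/pci-ats.h> helpers the patch itself calls; pdev stands for any PCI device behind the IOMMU.

#include <linux/pci-ats.h>

static void ats_lifecycle_sketch(struct pci_dev *pdev)
{
	/* attach_device(): turn on ATS if the IOMMUs support IOTLBs */
	if (amd_iommu_iotlb_sup)
		pci_enable_ats(pdev, PAGE_SHIFT);

	/* device_flush_iotlb(): invalidate the on-device IOTLB, sized by
	 * the device's invalidate queue depth */
	if (pci_ats_enabled(pdev)) {
		int qdep = pci_ats_queue_depth(pdev);
		(void)qdep;	/* passed into build_inv_iotlb_pages() */
	}

	/* detach_device(): tear ATS down again */
	if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
		pci_disable_ats(pdev);
}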
@@ -1598,12 +1701,16 @@ static void __detach_device(struct device *dev)
1598 */ 1701 */
1599static void detach_device(struct device *dev) 1702static void detach_device(struct device *dev)
1600{ 1703{
1704 struct pci_dev *pdev = to_pci_dev(dev);
1601 unsigned long flags; 1705 unsigned long flags;
1602 1706
1603 /* lock device table */ 1707 /* lock device table */
1604 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1708 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1605 __detach_device(dev); 1709 __detach_device(dev);
1606 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1710 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1711
1712 if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
1713 pci_disable_ats(pdev);
1607} 1714}
1608 1715
1609/* 1716/*
@@ -1615,10 +1722,9 @@ static struct protection_domain *domain_for_device(struct device *dev)
1615 struct protection_domain *dom; 1722 struct protection_domain *dom;
1616 struct iommu_dev_data *dev_data, *alias_data; 1723 struct iommu_dev_data *dev_data, *alias_data;
1617 unsigned long flags; 1724 unsigned long flags;
1618 u16 devid, alias; 1725 u16 devid;
1619 1726
1620 devid = get_device_id(dev); 1727 devid = get_device_id(dev);
1621 alias = amd_iommu_alias_table[devid];
1622 dev_data = get_dev_data(dev); 1728 dev_data = get_dev_data(dev);
1623 alias_data = get_dev_data(dev_data->alias); 1729 alias_data = get_dev_data(dev_data->alias);
1624 if (!alias_data) 1730 if (!alias_data)
@@ -1692,7 +1798,7 @@ static int device_change_notifier(struct notifier_block *nb,
1692 goto out; 1798 goto out;
1693 } 1799 }
1694 1800
1695 iommu_flush_device(dev); 1801 device_flush_dte(dev);
1696 iommu_completion_wait(iommu); 1802 iommu_completion_wait(iommu);
1697 1803
1698out: 1804out:
@@ -1753,8 +1859,9 @@ static void update_device_table(struct protection_domain *domain)
1753 struct iommu_dev_data *dev_data; 1859 struct iommu_dev_data *dev_data;
1754 1860
1755 list_for_each_entry(dev_data, &domain->dev_list, list) { 1861 list_for_each_entry(dev_data, &domain->dev_list, list) {
1862 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
1756 u16 devid = get_device_id(dev_data->dev); 1863 u16 devid = get_device_id(dev_data->dev);
1757 set_dte_entry(devid, domain); 1864 set_dte_entry(devid, domain, pci_ats_enabled(pdev));
1758 } 1865 }
1759} 1866}
1760 1867
@@ -1764,8 +1871,9 @@ static void update_domain(struct protection_domain *domain)
1764 return; 1871 return;
1765 1872
1766 update_device_table(domain); 1873 update_device_table(domain);
1767 iommu_flush_domain_devices(domain); 1874
1768 iommu_flush_tlb_pde(domain); 1875 domain_flush_devices(domain);
1876 domain_flush_tlb_pde(domain);
1769 1877
1770 domain->updated = false; 1878 domain->updated = false;
1771} 1879}
@@ -1924,10 +2032,10 @@ retry:
1924 ADD_STATS_COUNTER(alloced_io_mem, size); 2032 ADD_STATS_COUNTER(alloced_io_mem, size);
1925 2033
1926 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 2034 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1927 iommu_flush_tlb(&dma_dom->domain); 2035 domain_flush_tlb(&dma_dom->domain);
1928 dma_dom->need_flush = false; 2036 dma_dom->need_flush = false;
1929 } else if (unlikely(amd_iommu_np_cache)) 2037 } else if (unlikely(amd_iommu_np_cache))
1930 iommu_flush_pages(&dma_dom->domain, address, size); 2038 domain_flush_pages(&dma_dom->domain, address, size);
1931 2039
1932out: 2040out:
1933 return address; 2041 return address;
@@ -1976,7 +2084,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
1976 dma_ops_free_addresses(dma_dom, dma_addr, pages); 2084 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1977 2085
1978 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 2086 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1979 iommu_flush_pages(&dma_dom->domain, flush_addr, size); 2087 domain_flush_pages(&dma_dom->domain, flush_addr, size);
1980 dma_dom->need_flush = false; 2088 dma_dom->need_flush = false;
1981 } 2089 }
1982} 2090}
@@ -2012,7 +2120,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
2012 if (addr == DMA_ERROR_CODE) 2120 if (addr == DMA_ERROR_CODE)
2013 goto out; 2121 goto out;
2014 2122
2015 iommu_flush_complete(domain); 2123 domain_flush_complete(domain);
2016 2124
2017out: 2125out:
2018 spin_unlock_irqrestore(&domain->lock, flags); 2126 spin_unlock_irqrestore(&domain->lock, flags);
@@ -2039,7 +2147,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
2039 2147
2040 __unmap_single(domain->priv, dma_addr, size, dir); 2148 __unmap_single(domain->priv, dma_addr, size, dir);
2041 2149
2042 iommu_flush_complete(domain); 2150 domain_flush_complete(domain);
2043 2151
2044 spin_unlock_irqrestore(&domain->lock, flags); 2152 spin_unlock_irqrestore(&domain->lock, flags);
2045} 2153}
@@ -2104,7 +2212,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
2104 goto unmap; 2212 goto unmap;
2105 } 2213 }
2106 2214
2107 iommu_flush_complete(domain); 2215 domain_flush_complete(domain);
2108 2216
2109out: 2217out:
2110 spin_unlock_irqrestore(&domain->lock, flags); 2218 spin_unlock_irqrestore(&domain->lock, flags);
@@ -2150,7 +2258,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
2150 s->dma_address = s->dma_length = 0; 2258 s->dma_address = s->dma_length = 0;
2151 } 2259 }
2152 2260
2153 iommu_flush_complete(domain); 2261 domain_flush_complete(domain);
2154 2262
2155 spin_unlock_irqrestore(&domain->lock, flags); 2263 spin_unlock_irqrestore(&domain->lock, flags);
2156} 2264}
@@ -2200,7 +2308,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
2200 goto out_free; 2308 goto out_free;
2201 } 2309 }
2202 2310
2203 iommu_flush_complete(domain); 2311 domain_flush_complete(domain);
2204 2312
2205 spin_unlock_irqrestore(&domain->lock, flags); 2313 spin_unlock_irqrestore(&domain->lock, flags);
2206 2314
@@ -2232,7 +2340,7 @@ static void free_coherent(struct device *dev, size_t size,
2232 2340
2233 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2341 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2234 2342
2235 iommu_flush_complete(domain); 2343 domain_flush_complete(domain);
2236 2344
2237 spin_unlock_irqrestore(&domain->lock, flags); 2345 spin_unlock_irqrestore(&domain->lock, flags);
2238 2346
@@ -2296,6 +2404,23 @@ static struct dma_map_ops amd_iommu_dma_ops = {
2296 .dma_supported = amd_iommu_dma_supported, 2404 .dma_supported = amd_iommu_dma_supported,
2297}; 2405};
2298 2406
2407static unsigned device_dma_ops_init(void)
2408{
2409 struct pci_dev *pdev = NULL;
2410 unsigned unhandled = 0;
2411
2412 for_each_pci_dev(pdev) {
2413 if (!check_device(&pdev->dev)) {
2414 unhandled += 1;
2415 continue;
2416 }
2417
2418 pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
2419 }
2420
2421 return unhandled;
2422}
2423
2299/* 2424/*
2300 * The function which clues the AMD IOMMU driver into dma_ops. 2425 * The function which clues the AMD IOMMU driver into dma_ops.
2301 */ 2426 */
@@ -2308,7 +2433,7 @@ void __init amd_iommu_init_api(void)
2308int __init amd_iommu_init_dma_ops(void) 2433int __init amd_iommu_init_dma_ops(void)
2309{ 2434{
2310 struct amd_iommu *iommu; 2435 struct amd_iommu *iommu;
2311 int ret; 2436 int ret, unhandled;
2312 2437
2313 /* 2438 /*
2314 * first allocate a default protection domain for every IOMMU we 2439 * first allocate a default protection domain for every IOMMU we
@@ -2334,7 +2459,11 @@ int __init amd_iommu_init_dma_ops(void)
2334 swiotlb = 0; 2459 swiotlb = 0;
2335 2460
2336 /* Make the driver finally visible to the drivers */ 2461 /* Make the driver finally visible to the drivers */
2337 dma_ops = &amd_iommu_dma_ops; 2462 unhandled = device_dma_ops_init();
2463 if (unhandled && max_pfn > MAX_DMA32_PFN) {
2464 /* There are unhandled devices - initialize swiotlb for them */
2465 swiotlb = 1;
2466 }
2338 2467
2339 amd_iommu_stats_init(); 2468 amd_iommu_stats_init();
2340 2469
@@ -2476,7 +2605,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom,
2476 if (!iommu) 2605 if (!iommu)
2477 return; 2606 return;
2478 2607
2479 iommu_flush_device(dev); 2608 device_flush_dte(dev);
2480 iommu_completion_wait(iommu); 2609 iommu_completion_wait(iommu);
2481} 2610}
2482 2611
@@ -2542,7 +2671,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2542 unmap_size = iommu_unmap_page(domain, iova, page_size); 2671 unmap_size = iommu_unmap_page(domain, iova, page_size);
2543 mutex_unlock(&domain->api_lock); 2672 mutex_unlock(&domain->api_lock);
2544 2673
2545 iommu_flush_tlb_pde(domain); 2674 domain_flush_tlb_pde(domain);
2546 2675
2547 return get_order(unmap_size); 2676 return get_order(unmap_size);
2548} 2677}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 5a170cbbbed8..bfc8453bd98d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -21,7 +21,7 @@
21#include <linux/acpi.h> 21#include <linux/acpi.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/sysdev.h> 24#include <linux/syscore_ops.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
@@ -31,7 +31,7 @@
31#include <asm/iommu.h> 31#include <asm/iommu.h>
32#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h> 33#include <asm/x86_init.h>
34 34#include <asm/iommu_table.h>
35/* 35/*
36 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
37 */ 37 */
@@ -137,6 +137,7 @@ int amd_iommus_present;
137 137
138/* IOMMUs have a non-present cache? */ 138/* IOMMUs have a non-present cache? */
139bool amd_iommu_np_cache __read_mostly; 139bool amd_iommu_np_cache __read_mostly;
140bool amd_iommu_iotlb_sup __read_mostly = true;
140 141
141/* 142/*
142 * The ACPI table parsing functions set this variable on an error 143 * The ACPI table parsing functions set this variable on an error
@@ -180,6 +181,12 @@ static u32 dev_table_size; /* size of the device table */
180static u32 alias_table_size; /* size of the alias table */ 181static u32 alias_table_size; /* size of the alias table */
181static u32 rlookup_table_size; /* size of the rlookup table */ 182static u32 rlookup_table_size; /* size of the rlookup table */
182 183
184/*
185 * This function flushes all internal caches of
186 * the IOMMU used by this driver.
187 */
188extern void iommu_flush_all_caches(struct amd_iommu *iommu);
189
183static inline void update_last_devid(u16 devid) 190static inline void update_last_devid(u16 devid)
184{ 191{
185 if (devid > amd_iommu_last_bdf) 192 if (devid > amd_iommu_last_bdf)
@@ -194,6 +201,39 @@ static inline unsigned long tbl_size(int entry_size)
194 return 1UL << shift; 201 return 1UL << shift;
195} 202}
196 203
204/* Access to l1 and l2 indexed register spaces */
205
206static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
207{
208 u32 val;
209
210 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
211 pci_read_config_dword(iommu->dev, 0xfc, &val);
212 return val;
213}
214
215static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
216{
217 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
218 pci_write_config_dword(iommu->dev, 0xfc, val);
219 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
220}
221
222static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
223{
224 u32 val;
225
226 pci_write_config_dword(iommu->dev, 0xf0, address);
227 pci_read_config_dword(iommu->dev, 0xf4, &val);
228 return val;
229}
230
231static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
232{
233 pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
234 pci_write_config_dword(iommu->dev, 0xf4, val);
235}
236
197/**************************************************************************** 237/****************************************************************************
198 * 238 *
199 * AMD IOMMU MMIO register space handling functions 239 * AMD IOMMU MMIO register space handling functions
@@ -260,9 +300,23 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
260/* Function to enable the hardware */ 300/* Function to enable the hardware */
261static void iommu_enable(struct amd_iommu *iommu) 301static void iommu_enable(struct amd_iommu *iommu)
262{ 302{
263 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", 303 static const char * const feat_str[] = {
304 "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
305 "IA", "GA", "HE", "PC", NULL
306 };
307 int i;
308
309 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
264 dev_name(&iommu->dev->dev), iommu->cap_ptr); 310 dev_name(&iommu->dev->dev), iommu->cap_ptr);
265 311
312 if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
313 printk(KERN_CONT " extended features: ");
314 for (i = 0; feat_str[i]; ++i)
315 if (iommu_feature(iommu, (1ULL << i)))
316 printk(KERN_CONT " %s", feat_str[i]);
317 }
318 printk(KERN_CONT "\n");
319
266 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 320 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
267} 321}
268 322
@@ -618,7 +672,8 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
618static void __init init_iommu_from_pci(struct amd_iommu *iommu) 672static void __init init_iommu_from_pci(struct amd_iommu *iommu)
619{ 673{
620 int cap_ptr = iommu->cap_ptr; 674 int cap_ptr = iommu->cap_ptr;
621 u32 range, misc; 675 u32 range, misc, low, high;
676 int i, j;
622 677
623 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, 678 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
624 &iommu->cap); 679 &iommu->cap);
@@ -633,12 +688,38 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
633 MMIO_GET_LD(range)); 688 MMIO_GET_LD(range));
634 iommu->evt_msi_num = MMIO_MSI_NUM(misc); 689 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
635 690
636 if (is_rd890_iommu(iommu->dev)) { 691 if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
637 pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]); 692 amd_iommu_iotlb_sup = false;
638 pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]); 693
639 pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]); 694 /* read extended feature bits */
640 pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]); 695 low = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
641 } 696 high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
697
698 iommu->features = ((u64)high << 32) | low;
699
700 if (!is_rd890_iommu(iommu->dev))
701 return;
702
703 /*
704 * Some rd890 systems may not be fully reconfigured by the BIOS, so
705 * it's necessary for us to store this information so it can be
706 * reprogrammed on resume
707 */
708
709 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
710 &iommu->stored_addr_lo);
711 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
712 &iommu->stored_addr_hi);
713
714 /* Low bit locks writes to configuration space */
715 iommu->stored_addr_lo &= ~1;
716
717 for (i = 0; i < 6; i++)
718 for (j = 0; j < 0x12; j++)
719 iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
720
721 for (i = 0; i < 0x83; i++)
722 iommu->stored_l2[i] = iommu_read_l2(iommu, i);
642} 723}
643 724
644/* 725/*
@@ -650,8 +731,8 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
650{ 731{
651 u8 *p = (u8 *)h; 732 u8 *p = (u8 *)h;
652 u8 *end = p, flags = 0; 733 u8 *end = p, flags = 0;
653 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; 734 u16 devid = 0, devid_start = 0, devid_to = 0;
654 u32 ext_flags = 0; 735 u32 dev_i, ext_flags = 0;
655 bool alias = false; 736 bool alias = false;
656 struct ivhd_entry *e; 737 struct ivhd_entry *e;
657 738
@@ -806,7 +887,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
806/* Initializes the device->iommu mapping for the driver */ 887/* Initializes the device->iommu mapping for the driver */
807static int __init init_iommu_devices(struct amd_iommu *iommu) 888static int __init init_iommu_devices(struct amd_iommu *iommu)
808{ 889{
809 u16 i; 890 u32 i;
810 891
811 for (i = iommu->first_device; i <= iommu->last_device; ++i) 892 for (i = iommu->first_device; i <= iommu->last_device; ++i)
812 set_iommu_for_device(iommu, i); 893 set_iommu_for_device(iommu, i);
@@ -953,10 +1034,11 @@ static int iommu_setup_msi(struct amd_iommu *iommu)
953 if (pci_enable_msi(iommu->dev)) 1034 if (pci_enable_msi(iommu->dev))
954 return 1; 1035 return 1;
955 1036
956 r = request_irq(iommu->dev->irq, amd_iommu_int_handler, 1037 r = request_threaded_irq(iommu->dev->irq,
957 IRQF_SAMPLE_RANDOM, 1038 amd_iommu_int_handler,
958 "AMD-Vi", 1039 amd_iommu_int_thread,
959 NULL); 1040 0, "AMD-Vi",
1041 iommu->dev);
960 1042
961 if (r) { 1043 if (r) {
962 pci_disable_msi(iommu->dev); 1044 pci_disable_msi(iommu->dev);
@@ -1095,7 +1177,7 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
1095 */ 1177 */
1096static void init_device_table(void) 1178static void init_device_table(void)
1097{ 1179{
1098 u16 devid; 1180 u32 devid;
1099 1181
1100 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { 1182 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
1101 set_dev_entry_bit(devid, DEV_ENTRY_VALID); 1183 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
@@ -1127,14 +1209,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
1127 iommu_feature_enable(iommu, CONTROL_COHERENT_EN); 1209 iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
1128} 1210}
1129 1211
1130static void iommu_apply_quirks(struct amd_iommu *iommu)
1131{
1132	if (is_rd890_iommu(iommu->dev)) {
1133		pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
1134		pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
1135		pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
1136		pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
1137	}
 1212static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
 1213{
 1214	int i, j;
 1215	u32 ioc_feature_control;
 1216	struct pci_dev *pdev = NULL;
 1217
 1218	/* RD890 BIOSes may not have completely reconfigured the iommu */
 1219	if (!is_rd890_iommu(iommu->dev))
1220 return;
1221
1222 /*
1223 * First, we need to ensure that the iommu is enabled. This is
1224 * controlled by a register in the northbridge
1225 */
1226 pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
1227
1228 if (!pdev)
1229 return;
1230
1231 /* Select Northbridge indirect register 0x75 and enable writing */
1232 pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
1233 pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
1234
1235 /* Enable the iommu */
1236 if (!(ioc_feature_control & 0x1))
1237 pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
1238
1239 pci_dev_put(pdev);
1240
1241 /* Restore the iommu BAR */
1242 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1243 iommu->stored_addr_lo);
1244 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
1245 iommu->stored_addr_hi);
1246
1247 /* Restore the l1 indirect regs for each of the 6 l1s */
1248 for (i = 0; i < 6; i++)
1249 for (j = 0; j < 0x12; j++)
1250 iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
1251
1252 /* Restore the l2 indirect regs */
1253 for (i = 0; i < 0x83; i++)
1254 iommu_write_l2(iommu, i, iommu->stored_l2[i]);
1255
1256 /* Lock PCI setup registers */
1257 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1258 iommu->stored_addr_lo | 1);
1138} 1259}
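iommu_apply_resume_quirks() relies on the RD890 northbridge's indirect register pair: the register index is written to config offset 0x60 (with bit 7 acting as a write enable) and the data is accessed through offset 0x64. A hedged kernel-style sketch of that access pattern; the helper names are illustrative and assume pdev points at the bus 0, device 0, function 0 northbridge as in the quirk above:

    #include <linux/pci.h>

    /* Indirect read: select the register via 0x60, read the data via 0x64. */
    static u32 nb_indirect_read(struct pci_dev *pdev, u32 index)
    {
    	u32 val;

    	pci_write_config_dword(pdev, 0x60, index);
    	pci_read_config_dword(pdev, 0x64, &val);
    	return val;
    }

    /* Indirect write: bit 7 of the index acts as the write enable. */
    static void nb_indirect_write(struct pci_dev *pdev, u32 index, u32 val)
    {
    	pci_write_config_dword(pdev, 0x60, index | (1 << 7));
    	pci_write_config_dword(pdev, 0x64, val);
    }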
1139 1260
1140/* 1261/*
@@ -1147,7 +1268,6 @@ static void enable_iommus(void)
1147 1268
1148 for_each_iommu(iommu) { 1269 for_each_iommu(iommu) {
1149 iommu_disable(iommu); 1270 iommu_disable(iommu);
1150 iommu_apply_quirks(iommu);
1151 iommu_init_flags(iommu); 1271 iommu_init_flags(iommu);
1152 iommu_set_device_table(iommu); 1272 iommu_set_device_table(iommu);
1153 iommu_enable_command_buffer(iommu); 1273 iommu_enable_command_buffer(iommu);
@@ -1155,6 +1275,7 @@ static void enable_iommus(void)
1155 iommu_set_exclusion_range(iommu); 1275 iommu_set_exclusion_range(iommu);
1156 iommu_init_msi(iommu); 1276 iommu_init_msi(iommu);
1157 iommu_enable(iommu); 1277 iommu_enable(iommu);
1278 iommu_flush_all_caches(iommu);
1158 } 1279 }
1159} 1280}
1160 1281
@@ -1171,8 +1292,13 @@ static void disable_iommus(void)
1171 * disable suspend until real resume implemented 1292 * disable suspend until real resume implemented
1172 */ 1293 */
1173 1294
1174static int amd_iommu_resume(struct sys_device *dev) 1295static void amd_iommu_resume(void)
1175{ 1296{
1297 struct amd_iommu *iommu;
1298
1299 for_each_iommu(iommu)
1300 iommu_apply_resume_quirks(iommu);
1301
1176 /* re-load the hardware */ 1302 /* re-load the hardware */
1177 enable_iommus(); 1303 enable_iommus();
1178 1304
@@ -1180,13 +1306,11 @@ static int amd_iommu_resume(struct sys_device *dev)
1180 * we have to flush after the IOMMUs are enabled because a 1306 * we have to flush after the IOMMUs are enabled because a
1181 * disabled IOMMU will never execute the commands we send 1307 * disabled IOMMU will never execute the commands we send
1182 */ 1308 */
1183	amd_iommu_flush_all_devices();
1184	amd_iommu_flush_all_domains();
1185
1186	return 0;
 1309	for_each_iommu(iommu)
 1310		iommu_flush_all_caches(iommu);
1187} 1311}
1188 1312
1189static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) 1313static int amd_iommu_suspend(void)
1190{ 1314{
1191 /* disable IOMMUs to go out of the way for BIOS */ 1315 /* disable IOMMUs to go out of the way for BIOS */
1192 disable_iommus(); 1316 disable_iommus();
@@ -1194,17 +1318,11 @@ static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
1194 return 0; 1318 return 0;
1195} 1319}
1196 1320
1197static struct sysdev_class amd_iommu_sysdev_class = { 1321static struct syscore_ops amd_iommu_syscore_ops = {
1198 .name = "amd_iommu",
1199 .suspend = amd_iommu_suspend, 1322 .suspend = amd_iommu_suspend,
1200 .resume = amd_iommu_resume, 1323 .resume = amd_iommu_resume,
1201}; 1324};
1202 1325
1203static struct sys_device device_amd_iommu = {
1204 .id = 0,
1205 .cls = &amd_iommu_sysdev_class,
1206};
1207
1208/* 1326/*
1209 * This is the core init function for AMD IOMMU hardware in the system. 1327 * This is the core init function for AMD IOMMU hardware in the system.
1210 * This function is called from the generic x86 DMA layer initialization 1328 * This function is called from the generic x86 DMA layer initialization
@@ -1321,14 +1439,6 @@ static int __init amd_iommu_init(void)
1321 goto free; 1439 goto free;
1322 } 1440 }
1323 1441
1324 ret = sysdev_class_register(&amd_iommu_sysdev_class);
1325 if (ret)
1326 goto free;
1327
1328 ret = sysdev_register(&device_amd_iommu);
1329 if (ret)
1330 goto free;
1331
1332 ret = amd_iommu_init_devices(); 1442 ret = amd_iommu_init_devices();
1333 if (ret) 1443 if (ret)
1334 goto free; 1444 goto free;
@@ -1347,6 +1457,8 @@ static int __init amd_iommu_init(void)
1347 1457
1348 amd_iommu_init_notifier(); 1458 amd_iommu_init_notifier();
1349 1459
1460 register_syscore_ops(&amd_iommu_syscore_ops);
1461
1350 if (iommu_pass_through) 1462 if (iommu_pass_through)
1351 goto out; 1463 goto out;
1352 1464
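The suspend/resume hunks above replace the sysdev class with syscore_ops, whose callbacks run on one CPU with interrupts disabled, late in suspend and early in resume, and take no device argument. A minimal kernel-style sketch of the registration pattern (demo_* names are illustrative):

    #include <linux/syscore_ops.h>

    static int demo_suspend(void)
    {
    	/* quiesce the hardware; returning nonzero aborts suspend */
    	return 0;
    }

    static void demo_resume(void)
    {
    	/* reprogram the hardware from saved state */
    }

    static struct syscore_ops demo_syscore_ops = {
    	.suspend = demo_suspend,
    	.resume  = demo_resume,
    };

    static void demo_register(void)
    {
    	register_syscore_ops(&demo_syscore_ops);
    }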
@@ -1405,13 +1517,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1405 return 0; 1517 return 0;
1406} 1518}
1407 1519
1408void __init amd_iommu_detect(void) 1520int __init amd_iommu_detect(void)
1409{ 1521{
1410 if (no_iommu || (iommu_detected && !gart_iommu_aperture)) 1522 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1411 return; 1523 return -ENODEV;
1412 1524
1413 if (amd_iommu_disabled) 1525 if (amd_iommu_disabled)
1414 return; 1526 return -ENODEV;
1415 1527
1416 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1528 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1417 iommu_detected = 1; 1529 iommu_detected = 1;
@@ -1420,7 +1532,9 @@ void __init amd_iommu_detect(void)
1420 1532
1421 /* Make sure ACS will be enabled */ 1533 /* Make sure ACS will be enabled */
1422 pci_request_acs(); 1534 pci_request_acs();
1535 return 1;
1423 } 1536 }
1537 return -ENODEV;
1424} 1538}
1425 1539
1426/**************************************************************************** 1540/****************************************************************************
@@ -1451,3 +1565,8 @@ static int __init parse_amd_iommu_options(char *str)
1451 1565
1452__setup("amd_iommu_dump", parse_amd_iommu_dump); 1566__setup("amd_iommu_dump", parse_amd_iommu_dump);
1453__setup("amd_iommu=", parse_amd_iommu_options); 1567__setup("amd_iommu=", parse_amd_iommu_options);
1568
1569IOMMU_INIT_FINISH(amd_iommu_detect,
1570 gart_iommu_hole_init,
1571 0,
1572 0);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
new file mode 100644
index 000000000000..4c39baa8facc
--- /dev/null
+++ b/arch/x86/kernel/amd_nb.c
@@ -0,0 +1,255 @@
1/*
2 * Shared support code for AMD K8 northbridges and derivates.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */
5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <asm/amd_nb.h>
12
13static u32 *flush_words;
14
15const struct pci_device_id amd_nb_misc_ids[] = {
16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
19 {}
20};
21EXPORT_SYMBOL(amd_nb_misc_ids);
22
23static struct pci_device_id amd_nb_link_ids[] = {
24 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
25 {}
26};
27
28const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
29 { 0x00, 0x18, 0x20 },
30 { 0xff, 0x00, 0x20 },
31 { 0xfe, 0x00, 0x20 },
32 { }
33};
34
35struct amd_northbridge_info amd_northbridges;
36EXPORT_SYMBOL(amd_northbridges);
37
38static struct pci_dev *next_northbridge(struct pci_dev *dev,
39 const struct pci_device_id *ids)
40{
41 do {
42 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
43 if (!dev)
44 break;
45 } while (!pci_match_id(ids, dev));
46 return dev;
47}
48
49int amd_cache_northbridges(void)
50{
51 u16 i = 0;
52 struct amd_northbridge *nb;
53 struct pci_dev *misc, *link;
54
55 if (amd_nb_num())
56 return 0;
57
58 misc = NULL;
59 while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
60 i++;
61
62 if (i == 0)
63 return 0;
64
65 nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
66 if (!nb)
67 return -ENOMEM;
68
69 amd_northbridges.nb = nb;
70 amd_northbridges.num = i;
71
72 link = misc = NULL;
73 for (i = 0; i != amd_nb_num(); i++) {
74 node_to_amd_nb(i)->misc = misc =
75 next_northbridge(misc, amd_nb_misc_ids);
76 node_to_amd_nb(i)->link = link =
77 next_northbridge(link, amd_nb_link_ids);
78 }
79
80 /* some CPU families (e.g. family 0x11) do not support GART */
81 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
82 boot_cpu_data.x86 == 0x15)
83 amd_northbridges.flags |= AMD_NB_GART;
84
85 /*
86 * Some CPU families support L3 Cache Index Disable. There are some
87 * limitations because of E382 and E388 on family 0x10.
88 */
89 if (boot_cpu_data.x86 == 0x10 &&
90 boot_cpu_data.x86_model >= 0x8 &&
91 (boot_cpu_data.x86_model > 0x9 ||
92 boot_cpu_data.x86_mask >= 0x1))
93 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
94
95 if (boot_cpu_data.x86 == 0x15)
96 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
97
98 /* L3 cache partitioning is supported on family 0x15 */
99 if (boot_cpu_data.x86 == 0x15)
100 amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
101
102 return 0;
103}
104EXPORT_SYMBOL_GPL(amd_cache_northbridges);
105
106/*
107 * Ignores subdevice/subvendor but as far as I can figure out
108 * they're useless anyways
109 */
110bool __init early_is_amd_nb(u32 device)
111{
112 const struct pci_device_id *id;
113 u32 vendor = device & 0xffff;
114
115 device >>= 16;
116 for (id = amd_nb_misc_ids; id->vendor; id++)
117 if (vendor == id->vendor && device == id->device)
118 return true;
119 return false;
120}
121
122int amd_get_subcaches(int cpu)
123{
124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
125 unsigned int mask;
126 int cuid = 0;
127
128 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
129 return 0;
130
131 pci_read_config_dword(link, 0x1d4, &mask);
132
133#ifdef CONFIG_SMP
134 cuid = cpu_data(cpu).compute_unit_id;
135#endif
136 return (mask >> (4 * cuid)) & 0xf;
137}
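amd_get_subcaches() treats the L3 partitioning register at 0x1d4 as an array of 4-bit enable masks, one nibble per compute unit, and extracts the caller's nibble with (mask >> (4 * cuid)) & 0xf. A userspace sketch of that packing and unpacking with a made-up register value; it models only the per-unit nibble, not the BAN-mode handling done by amd_set_subcaches():

    #include <stdint.h>
    #include <stdio.h>

    /* One 4-bit subcache mask per compute unit, packed into a 32-bit word. */
    static unsigned int get_subcache_mask(uint32_t reg, unsigned int cuid)
    {
    	return (reg >> (4 * cuid)) & 0xf;
    }

    static uint32_t set_subcache_mask(uint32_t reg, unsigned int cuid,
    				  unsigned int mask)
    {
    	reg &= ~(0xfu << (4 * cuid));
    	return reg | ((mask & 0xfu) << (4 * cuid));
    }

    int main(void)
    {
    	uint32_t reg = 0x0000fff3;	/* hypothetical register contents */

    	printf("CU0 mask: %#x\n", get_subcache_mask(reg, 0));	/* 0x3 */
    	printf("CU2 mask: %#x\n", get_subcache_mask(reg, 2));	/* 0xf */
    	reg = set_subcache_mask(reg, 0, 0xf);
    	printf("updated:  %#010x\n", (unsigned int)reg);	/* 0x0000ffff */
    	return 0;
    }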
138
139int amd_set_subcaches(int cpu, int mask)
140{
141 static unsigned int reset, ban;
142 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
143 unsigned int reg;
144 int cuid = 0;
145
146 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
147 return -EINVAL;
148
149 /* if necessary, collect reset state of L3 partitioning and BAN mode */
150 if (reset == 0) {
151 pci_read_config_dword(nb->link, 0x1d4, &reset);
152 pci_read_config_dword(nb->misc, 0x1b8, &ban);
153 ban &= 0x180000;
154 }
155
156 /* deactivate BAN mode if any subcaches are to be disabled */
157 if (mask != 0xf) {
158 pci_read_config_dword(nb->misc, 0x1b8, &reg);
159 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
160 }
161
162#ifdef CONFIG_SMP
163 cuid = cpu_data(cpu).compute_unit_id;
164#endif
165 mask <<= 4 * cuid;
166 mask |= (0xf ^ (1 << cuid)) << 26;
167
168 pci_write_config_dword(nb->link, 0x1d4, mask);
169
170 /* reset BAN mode if L3 partitioning returned to reset state */
171 pci_read_config_dword(nb->link, 0x1d4, &reg);
172 if (reg == reset) {
173 pci_read_config_dword(nb->misc, 0x1b8, &reg);
174 reg &= ~0x180000;
175 pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
176 }
177
178 return 0;
179}
180
181static int amd_cache_gart(void)
182{
183 u16 i;
184
185 if (!amd_nb_has_feature(AMD_NB_GART))
186 return 0;
187
188 flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
189 if (!flush_words) {
190 amd_northbridges.flags &= ~AMD_NB_GART;
191 return -ENOMEM;
192 }
193
194 for (i = 0; i != amd_nb_num(); i++)
195 pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
196 &flush_words[i]);
197
198 return 0;
199}
200
201void amd_flush_garts(void)
202{
203 int flushed, i;
204 unsigned long flags;
205 static DEFINE_SPINLOCK(gart_lock);
206
207 if (!amd_nb_has_feature(AMD_NB_GART))
208 return;
209
210 /* Avoid races between AGP and IOMMU. In theory it's not needed
211 but I'm not sure if the hardware won't lose flush requests
212 when another is pending. This whole thing is so expensive anyways
213 that it doesn't matter to serialize more. -AK */
214 spin_lock_irqsave(&gart_lock, flags);
215 flushed = 0;
216 for (i = 0; i < amd_nb_num(); i++) {
217 pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
218 flush_words[i] | 1);
219 flushed++;
220 }
221 for (i = 0; i < amd_nb_num(); i++) {
222 u32 w;
223 /* Make sure the hardware actually executed the flush*/
224 for (;;) {
225 pci_read_config_dword(node_to_amd_nb(i)->misc,
226 0x9c, &w);
227 if (!(w & 1))
228 break;
229 cpu_relax();
230 }
231 }
232 spin_unlock_irqrestore(&gart_lock, flags);
233 if (!flushed)
234 printk("nothing to flush?\n");
235}
236EXPORT_SYMBOL_GPL(amd_flush_garts);
237
238static __init int init_amd_nbs(void)
239{
240 int err = 0;
241
242 err = amd_cache_northbridges();
243
244 if (err < 0)
245 printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
246
247 if (amd_cache_gart() < 0)
248 printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
249 "GART support disabled.\n");
250
251 return err;
252}
253
254/* This has to go after the PCI subsystem */
255fs_initcall(init_amd_nbs);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 8dd77800ff5d..289e92862fd9 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = {
177 .rating = APBT_CLOCKSOURCE_RATING, 177 .rating = APBT_CLOCKSOURCE_RATING,
178 .read = apbt_read_clocksource, 178 .read = apbt_read_clocksource,
179 .mask = APBT_MASK, 179 .mask = APBT_MASK,
180 .shift = APBT_SHIFT,
181 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 180 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
182 .resume = apbt_restart_clocksource, 181 .resume = apbt_restart_clocksource,
183}; 182};
@@ -231,34 +230,6 @@ static void apbt_restart_clocksource(struct clocksource *cs)
231 apbt_start_counter(phy_cs_timer_id); 230 apbt_start_counter(phy_cs_timer_id);
232} 231}
233 232
234/* Setup IRQ routing via IOAPIC */
235#ifdef CONFIG_SMP
236static void apbt_setup_irq(struct apbt_dev *adev)
237{
238 struct irq_chip *chip;
239 struct irq_desc *desc;
240
241 /* timer0 irq has been setup early */
242 if (adev->irq == 0)
243 return;
244 desc = irq_to_desc(adev->irq);
245 chip = get_irq_chip(adev->irq);
246 disable_irq(adev->irq);
247 desc->status |= IRQ_MOVE_PCNTXT;
248 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
249 /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
250 set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
251 enable_irq(adev->irq);
252 if (system_state == SYSTEM_BOOTING)
253 if (request_irq(adev->irq, apbt_interrupt_handler,
254 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
255 adev->name, adev)) {
256 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
257 adev->num);
258 }
259}
260#endif
261
262static void apbt_enable_int(int n) 233static void apbt_enable_int(int n)
263{ 234{
264 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); 235 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
@@ -312,7 +283,7 @@ static int __init apbt_clockevent_register(void)
312 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); 283 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
313 284
314 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { 285 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
315 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; 286 adev->evt.rating = APBT_CLOCKEVENT_RATING - 100;
316 global_clock_event = &adev->evt; 287 global_clock_event = &adev->evt;
317 printk(KERN_DEBUG "%s clockevent registered as global\n", 288 printk(KERN_DEBUG "%s clockevent registered as global\n",
318 global_clock_event->name); 289 global_clock_event->name);
@@ -334,6 +305,30 @@ static int __init apbt_clockevent_register(void)
334} 305}
335 306
336#ifdef CONFIG_SMP 307#ifdef CONFIG_SMP
308
309static void apbt_setup_irq(struct apbt_dev *adev)
310{
311 /* timer0 irq has been setup early */
312 if (adev->irq == 0)
313 return;
314
315 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
316 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
317 /* APB timer irqs are set up as mp_irqs, timer is edge type */
318 __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
319
320 if (system_state == SYSTEM_BOOTING) {
321 if (request_irq(adev->irq, apbt_interrupt_handler,
322 IRQF_TIMER | IRQF_DISABLED |
323 IRQF_NOBALANCING,
324 adev->name, adev)) {
325 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
326 adev->num);
327 }
328 } else
329 enable_irq(adev->irq);
330}
331
337/* Should be called with per cpu */ 332/* Should be called with per cpu */
338void apbt_setup_secondary_clock(void) 333void apbt_setup_secondary_clock(void)
339{ 334{
@@ -343,7 +338,7 @@ void apbt_setup_secondary_clock(void)
343 338
344 /* Don't register boot CPU clockevent */ 339 /* Don't register boot CPU clockevent */
345 cpu = smp_processor_id(); 340 cpu = smp_processor_id();
346 if (cpu == boot_cpu_id) 341 if (!cpu)
347 return; 342 return;
348 /* 343 /*
349 * We need to calculate the scaled math multiplication factor for 344 * We need to calculate the scaled math multiplication factor for
@@ -389,16 +384,17 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
389 384
390 switch (action & 0xf) { 385 switch (action & 0xf) {
391 case CPU_DEAD: 386 case CPU_DEAD:
387 disable_irq(adev->irq);
392 apbt_disable_int(cpu); 388 apbt_disable_int(cpu);
393 if (system_state == SYSTEM_RUNNING) 389 if (system_state == SYSTEM_RUNNING) {
394 pr_debug("skipping APBT CPU %lu offline\n", cpu); 390 pr_debug("skipping APBT CPU %lu offline\n", cpu);
395 else if (adev) { 391 } else if (adev) {
396 pr_debug("APBT clockevent for cpu %lu offline\n", cpu); 392 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
397 free_irq(adev->irq, adev); 393 free_irq(adev->irq, adev);
398 } 394 }
399 break; 395 break;
400 default: 396 default:
401 pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); 397 pr_debug("APBT notified %lu, no action\n", action);
402 } 398 }
403 return NOTIFY_OK; 399 return NOTIFY_OK;
404} 400}
@@ -511,64 +507,12 @@ static int apbt_next_event(unsigned long delta,
511 return 0; 507 return 0;
512} 508}
513 509
514/*
515 * APB timer clock is not in sync with pclk on Langwell, which translates to
516 * unreliable read value caused by sampling error. the error does not add up
517 * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
518 * would go backwards. the following code is trying to prevent time traveling
519 * backwards. little bit paranoid.
520 */
521static cycle_t apbt_read_clocksource(struct clocksource *cs) 510static cycle_t apbt_read_clocksource(struct clocksource *cs)
522{ 511{
523 unsigned long t0, t1, t2; 512 unsigned long current_count;
524 static unsigned long last_read; 513
525 514 current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
526bad_count: 515 return (cycle_t)~current_count;
527 t1 = apbt_readl(phy_cs_timer_id,
528 APBTMR_N_CURRENT_VALUE);
529 t2 = apbt_readl(phy_cs_timer_id,
530 APBTMR_N_CURRENT_VALUE);
531 if (unlikely(t1 < t2)) {
532 pr_debug("APBT: read current count error %lx:%lx:%lx\n",
533 t1, t2, t2 - t1);
534 goto bad_count;
535 }
536 /*
537 * check against cached last read, makes sure time does not go back.
538 * it could be a normal rollover but we will do tripple check anyway
539 */
540 if (unlikely(t2 > last_read)) {
541 /* check if we have a normal rollover */
542 unsigned long raw_intr_status =
543 apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
544 /*
545 * cs timer interrupt is masked but raw intr bit is set if
546 * rollover occurs. then we read EOI reg to clear it.
547 */
548 if (raw_intr_status & (1 << phy_cs_timer_id)) {
549 apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
550 goto out;
551 }
552 pr_debug("APB CS going back %lx:%lx:%lx ",
553 t2, last_read, t2 - last_read);
554bad_count_x3:
555 pr_debug(KERN_INFO "tripple check enforced\n");
556 t0 = apbt_readl(phy_cs_timer_id,
557 APBTMR_N_CURRENT_VALUE);
558 udelay(1);
559 t1 = apbt_readl(phy_cs_timer_id,
560 APBTMR_N_CURRENT_VALUE);
561 udelay(1);
562 t2 = apbt_readl(phy_cs_timer_id,
563 APBTMR_N_CURRENT_VALUE);
564 if ((t2 > t1) || (t1 > t0)) {
565 printk(KERN_ERR "Error: APB CS tripple check failed\n");
566 goto bad_count_x3;
567 }
568 }
569out:
570 last_read = t2;
571 return (cycle_t)~t2;
572} 516}
573 517
574static int apbt_clocksource_register(void) 518static int apbt_clocksource_register(void)
@@ -598,14 +542,7 @@ static int apbt_clocksource_register(void)
598 if (t1 == apbt_read_clocksource(&clocksource_apbt)) 542 if (t1 == apbt_read_clocksource(&clocksource_apbt))
599 panic("APBT counter not counting. APBT disabled\n"); 543 panic("APBT counter not counting. APBT disabled\n");
600 544
601 /* 545 clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000);
602 * initialize and register APBT clocksource
603 * convert that to ns/clock cycle
604 * mult = (ns/c) * 2^APBT_SHIFT
605 */
606 clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
607 (unsigned long) apbt_freq, APBT_SHIFT);
608 clocksource_register(&clocksource_apbt);
609 546
610 return 0; 547 return 0;
611} 548}
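Two simplifications drive this file: the clocksource read path now returns ~current_count, which turns the APB timer's down-counter into a monotonically increasing cycle value, and clocksource_register_khz() derives mult/shift internally instead of the driver hand-computing them. A tiny userspace sketch of why the bitwise inversion is enough (assuming a 32-bit counter, as here):

    #include <stdint.h>
    #include <stdio.h>

    /* A free-running down-counter read back as an up-counting cycle value. */
    static uint32_t cycles_from_downcount(uint32_t current_count)
    {
    	return ~current_count;
    }

    int main(void)
    {
    	/* hypothetical successive hardware reads of a down-counter */
    	uint32_t reads[] = { 0xFFFFFFF0u, 0xFFFFFF00u, 0x00000010u };

    	for (unsigned int i = 0; i < 3; i++)
    		printf("raw %#010x -> cycles %#010x\n",
    		       (unsigned int)reads[i],
    		       (unsigned int)cycles_from_downcount(reads[i]));
    	return 0;
    }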
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index a2e0caf26e17..3d2661ca6542 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -13,7 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/bootmem.h> 16#include <linux/memblock.h>
17#include <linux/mmzone.h> 17#include <linux/mmzone.h>
18#include <linux/pci_ids.h> 18#include <linux/pci_ids.h>
19#include <linux/pci.h> 19#include <linux/pci.h>
@@ -27,9 +27,25 @@
27#include <asm/gart.h> 27#include <asm/gart.h>
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/amd_nb.h>
31#include <asm/x86_init.h> 31#include <asm/x86_init.h>
32 32
33/*
34 * Using 512M as goal, in case kexec will load kernel_big
35 * that will do the on-position decompress, and could overlap with
36 * with the gart aperture that is used.
37 * Sequence:
38 * kernel_small
39 * ==> kexec (with kdump trigger path or gart still enabled)
40 * ==> kernel_small (gart area become e820_reserved)
41 * ==> kexec (with kdump trigger path or gart still enabled)
42 * ==> kerne_big (uncompressed size will be big than 64M or 128M)
43 * So don't use 512M below as gart iommu, leave the space for kernel
44 * code for safe.
45 */
46#define GART_MIN_ADDR (512ULL << 20)
47#define GART_MAX_ADDR (1ULL << 32)
48
33int gart_iommu_aperture; 49int gart_iommu_aperture;
34int gart_iommu_aperture_disabled __initdata; 50int gart_iommu_aperture_disabled __initdata;
35int gart_iommu_aperture_allowed __initdata; 51int gart_iommu_aperture_allowed __initdata;
@@ -39,18 +55,6 @@ int fallback_aper_force __initdata;
39 55
40int fix_aperture __initdata = 1; 56int fix_aperture __initdata = 1;
41 57
42struct bus_dev_range {
43 int bus;
44 int dev_base;
45 int dev_limit;
46};
47
48static struct bus_dev_range bus_dev_ranges[] __initdata = {
49 { 0x00, 0x18, 0x20},
50 { 0xff, 0x00, 0x20},
51 { 0xfe, 0x00, 0x20}
52};
53
54static struct resource gart_resource = { 58static struct resource gart_resource = {
55 .name = "GART", 59 .name = "GART",
56 .flags = IORESOURCE_MEM, 60 .flags = IORESOURCE_MEM,
@@ -69,7 +73,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
69static u32 __init allocate_aperture(void) 73static u32 __init allocate_aperture(void)
70{ 74{
71 u32 aper_size; 75 u32 aper_size;
72 void *p; 76 unsigned long addr;
73 77
74 /* aper_size should <= 1G */ 78 /* aper_size should <= 1G */
75 if (fallback_aper_order > 5) 79 if (fallback_aper_order > 5)
@@ -82,40 +86,27 @@ static u32 __init allocate_aperture(void)
82 * memory. Unfortunately we cannot move it up because that would 86 * memory. Unfortunately we cannot move it up because that would
83 * make the IOMMU useless. 87 * make the IOMMU useless.
84 */ 88 */
85	/*
86	 * using 512M as goal, in case kexec will load kernel_big
87	 * that will do the on position decompress, and could overlap with
88	 * that positon with gart that is used.
89	 * sequende:
90	 * kernel_small
91	 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
92	 * ==> kernel_small(gart area become e820_reserved)
93	 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
94	 * ==> kerne_big (uncompressed size will be big than 64M or 128M)
95	 * so don't use 512M below as gart iommu, leave the space for kernel
96	 * code for safe
97	 */
98	p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
 89	addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
 90				      aper_size, aper_size);
 91	if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
 92		printk(KERN_ERR
 93			"Cannot allocate aperture memory hole (%lx,%uK)\n",
 94			addr, aper_size>>10);
 95		return 0;
 96	}
 97	memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
99 /* 98 /*
100 * Kmemleak should not scan this block as it may not be mapped via the 99 * Kmemleak should not scan this block as it may not be mapped via the
101 * kernel direct mapping. 100 * kernel direct mapping.
102 */ 101 */
103 kmemleak_ignore(p); 102 kmemleak_ignore(phys_to_virt(addr));
104 if (!p || __pa(p)+aper_size > 0xffffffff) {
105 printk(KERN_ERR
106 "Cannot allocate aperture memory hole (%p,%uK)\n",
107 p, aper_size>>10);
108 if (p)
109 free_bootmem(__pa(p), aper_size);
110 return 0;
111 }
112 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", 103 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
113 aper_size >> 10, __pa(p)); 104 aper_size >> 10, addr);
114 insert_aperture_resource((u32)__pa(p), aper_size); 105 insert_aperture_resource((u32)addr, aper_size);
115 register_nosave_region((u32)__pa(p) >> PAGE_SHIFT, 106 register_nosave_region(addr >> PAGE_SHIFT,
116 (u32)__pa(p+aper_size) >> PAGE_SHIFT); 107 (addr+aper_size) >> PAGE_SHIFT);
117 108
118 return (u32)__pa(p); 109 return (u32)addr;
119} 110}
120 111
121 112
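The rewritten allocate_aperture() asks memblock for an aperture-sized, aperture-aligned block between GART_MIN_ADDR (512 MiB) and GART_MAX_ADDR (4 GiB) and rejects anything that would spill past 4 GiB. A userspace sketch of the size and placement arithmetic; aperture_fits() is an illustrative helper that models the constraints the request encodes, not a kernel function:

    #include <stdint.h>
    #include <stdio.h>

    #define GART_MIN_ADDR	(512ULL << 20)	/* keep low RAM free for kexec'd kernels */
    #define GART_MAX_ADDR	(1ULL << 32)	/* GART aperture base is a 32-bit address */

    /* Aperture size encoding used by the GART: 32 MiB << order. */
    static uint64_t aper_size_from_order(unsigned int order)
    {
    	return (32ULL * 1024 * 1024) << order;
    }

    static int aperture_fits(uint64_t addr, uint64_t size)
    {
    	return addr >= GART_MIN_ADDR && addr + size <= GART_MAX_ADDR &&
    	       (addr & (size - 1)) == 0;	/* naturally aligned */
    }

    int main(void)
    {
    	uint64_t size = aper_size_from_order(1);	/* 64 MiB */

    	printf("size = %llu MiB\n", (unsigned long long)(size >> 20));
    	printf("0x20000000 ok? %d\n", aperture_fits(0x20000000ULL, size));
    	printf("0xFFC00000 ok? %d\n", aperture_fits(0xFFC00000ULL, size));
    	return 0;
    }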
@@ -206,7 +197,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
206 * Do an PCI bus scan by hand because we're running before the PCI 197 * Do an PCI bus scan by hand because we're running before the PCI
207 * subsystem. 198 * subsystem.
208 * 199 *
209 * All K8 AGP bridges are AGPv3 compliant, so we can do this scan 200 * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
210 * generically. It's probably overkill to always scan all slots because 201 * generically. It's probably overkill to always scan all slots because
211 * the AGP bridges should be always an own bus on the HT hierarchy, 202 * the AGP bridges should be always an own bus on the HT hierarchy,
212 * but do it here for future safety. 203 * but do it here for future safety.
@@ -294,20 +285,20 @@ void __init early_gart_iommu_check(void)
294 search_agp_bridge(&agp_aper_order, &valid_agp); 285 search_agp_bridge(&agp_aper_order, &valid_agp);
295 286
296 fix = 0; 287 fix = 0;
297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 288 for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
298 int bus; 289 int bus;
299 int dev_base, dev_limit; 290 int dev_base, dev_limit;
300 291
301 bus = bus_dev_ranges[i].bus; 292 bus = amd_nb_bus_dev_ranges[i].bus;
302 dev_base = bus_dev_ranges[i].dev_base; 293 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
303 dev_limit = bus_dev_ranges[i].dev_limit; 294 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
304 295
305 for (slot = dev_base; slot < dev_limit; slot++) { 296 for (slot = dev_base; slot < dev_limit; slot++) {
306 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 297 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
307 continue; 298 continue;
308 299
309 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 300 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
310 aper_enabled = ctl & AMD64_GARTEN; 301 aper_enabled = ctl & GARTEN;
311 aper_order = (ctl >> 1) & 7; 302 aper_order = (ctl >> 1) & 7;
312 aper_size = (32 * 1024 * 1024) << aper_order; 303 aper_size = (32 * 1024 * 1024) << aper_order;
313 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; 304 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -349,20 +340,20 @@ void __init early_gart_iommu_check(void)
349 return; 340 return;
350 341
351 /* disable them all at first */ 342 /* disable them all at first */
352 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 343 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
353 int bus; 344 int bus;
354 int dev_base, dev_limit; 345 int dev_base, dev_limit;
355 346
356 bus = bus_dev_ranges[i].bus; 347 bus = amd_nb_bus_dev_ranges[i].bus;
357 dev_base = bus_dev_ranges[i].dev_base; 348 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
358 dev_limit = bus_dev_ranges[i].dev_limit; 349 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
359 350
360 for (slot = dev_base; slot < dev_limit; slot++) { 351 for (slot = dev_base; slot < dev_limit; slot++) {
361 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 352 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
362 continue; 353 continue;
363 354
364 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 355 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
365 ctl &= ~AMD64_GARTEN; 356 ctl &= ~GARTEN;
366 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); 357 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
367 } 358 }
368 } 359 }
@@ -371,7 +362,7 @@ void __init early_gart_iommu_check(void)
371 362
372static int __initdata printed_gart_size_msg; 363static int __initdata printed_gart_size_msg;
373 364
374void __init gart_iommu_hole_init(void) 365int __init gart_iommu_hole_init(void)
375{ 366{
376 u32 agp_aper_base = 0, agp_aper_order = 0; 367 u32 agp_aper_base = 0, agp_aper_order = 0;
377 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; 368 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
@@ -381,7 +372,7 @@ void __init gart_iommu_hole_init(void)
381 372
382 if (gart_iommu_aperture_disabled || !fix_aperture || 373 if (gart_iommu_aperture_disabled || !fix_aperture ||
383 !early_pci_allowed()) 374 !early_pci_allowed())
384 return; 375 return -ENODEV;
385 376
386 printk(KERN_INFO "Checking aperture...\n"); 377 printk(KERN_INFO "Checking aperture...\n");
387 378
@@ -390,17 +381,17 @@ void __init gart_iommu_hole_init(void)
390 381
391 fix = 0; 382 fix = 0;
392 node = 0; 383 node = 0;
393 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 384 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
394 int bus; 385 int bus;
395 int dev_base, dev_limit; 386 int dev_base, dev_limit;
396 u32 ctl; 387 u32 ctl;
397 388
398 bus = bus_dev_ranges[i].bus; 389 bus = amd_nb_bus_dev_ranges[i].bus;
399 dev_base = bus_dev_ranges[i].dev_base; 390 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
400 dev_limit = bus_dev_ranges[i].dev_limit; 391 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
401 392
402 for (slot = dev_base; slot < dev_limit; slot++) { 393 for (slot = dev_base; slot < dev_limit; slot++) {
403 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 394 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
404 continue; 395 continue;
405 396
406 iommu_detected = 1; 397 iommu_detected = 1;
@@ -463,8 +454,9 @@ out:
463 unsigned long n = (32 * 1024 * 1024) << last_aper_order; 454 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
464 455
465 insert_aperture_resource((u32)last_aper_base, n); 456 insert_aperture_resource((u32)last_aper_base, n);
457 return 1;
466 } 458 }
467 return; 459 return 0;
468 } 460 }
469 461
470 if (!fallback_aper_force) { 462 if (!fallback_aper_force) {
@@ -500,28 +492,32 @@ out:
500 panic("Not enough memory for aperture"); 492 panic("Not enough memory for aperture");
501 } 493 }
502 } else { 494 } else {
503 return; 495 return 0;
504 } 496 }
505 497
506 /* Fix up the north bridges */ 498 /* Fix up the north bridges */
507 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 499 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
508 int bus; 500 int bus, dev_base, dev_limit;
509 int dev_base, dev_limit; 501
510 502 /*
511 bus = bus_dev_ranges[i].bus; 503 * Don't enable translation yet but enable GART IO and CPU
512 dev_base = bus_dev_ranges[i].dev_base; 504 * accesses and set DISTLBWALKPRB since GART table memory is UC.
513 dev_limit = bus_dev_ranges[i].dev_limit; 505 */
506 u32 ctl = aper_order << 1;
507
508 bus = amd_nb_bus_dev_ranges[i].bus;
509 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
510 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
514 for (slot = dev_base; slot < dev_limit; slot++) { 511 for (slot = dev_base; slot < dev_limit; slot++) {
515 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 512 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
516 continue; 513 continue;
517 514
518 /* Don't enable translation yet. That is done later. 515 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
519 Assume this BIOS didn't initialise the GART so
520 just overwrite all previous bits */
521 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
522 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); 516 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
523 } 517 }
524 } 518 }
525 519
526 set_up_gart_resume(aper_order, aper_alloc); 520 set_up_gart_resume(aper_order, aper_alloc);
521
522 return 1;
527} 523}
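Every loop in this file decodes AMD64_GARTAPERTURECTL the same way: bit 0 (GARTEN) enables the aperture and bits 3:1 hold the size order, so the aperture covers (32 MiB << order). A userspace sketch of that decoding with a hypothetical control value:

    #include <stdint.h>
    #include <stdio.h>

    #define GARTEN	0x1u		/* aperture enable, bit 0 */

    struct gart_ctl {
    	int enabled;
    	unsigned int order;
    	uint64_t size;
    };

    static struct gart_ctl decode_gart_ctl(uint32_t ctl)
    {
    	struct gart_ctl g;

    	g.enabled = (ctl & GARTEN) != 0;
    	g.order = (ctl >> 1) & 7;
    	g.size = (32ULL * 1024 * 1024) << g.order;
    	return g;
    }

    int main(void)
    {
    	struct gart_ctl g = decode_gart_ctl(0x0b);	/* hypothetical: enabled, order 5 */

    	printf("enabled=%d order=%u size=%llu MiB\n",
    	       g.enabled, g.order, (unsigned long long)(g.size >> 20));
    	return 0;
    }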
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 910f20b457c4..767fd04f2843 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,23 +2,25 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o
6ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) 6obj-y += hw_nmi.o
7obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
8endif
9obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
10 7
11obj-$(CONFIG_X86_IO_APIC) += io_apic.o 8obj-$(CONFIG_X86_IO_APIC) += io_apic.o
12obj-$(CONFIG_SMP) += ipi.o 9obj-$(CONFIG_SMP) += ipi.o
13 10
14ifeq ($(CONFIG_X86_64),y) 11ifeq ($(CONFIG_X86_64),y)
15obj-y += apic_flat_64.o 12# APIC probe will depend on the listing order here
16obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
17obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
18obj-$(CONFIG_X86_UV) += x2apic_uv_x.o 13obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
14obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
15obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
16obj-y += apic_flat_64.o
19endif 17endif
20 18
21obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o 19# APIC probe will depend on the listing order here
22obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 20obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
23obj-$(CONFIG_X86_ES7000) += es7000_32.o
24obj-$(CONFIG_X86_SUMMIT) += summit_32.o 21obj-$(CONFIG_X86_SUMMIT) += summit_32.o
22obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
23obj-$(CONFIG_X86_ES7000) += es7000_32.o
24
25# For 32bit, probe_32 need to be listed last
26obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e3b534cda49a..b9338b8cf420 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -24,14 +24,13 @@
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/ioport.h> 25#include <linux/ioport.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/sysdev.h> 27#include <linux/syscore_ops.h>
28#include <linux/delay.h> 28#include <linux/delay.h>
29#include <linux/timex.h> 29#include <linux/timex.h>
30#include <linux/dmar.h> 30#include <linux/dmar.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/cpu.h> 32#include <linux/cpu.h>
33#include <linux/dmi.h> 33#include <linux/dmi.h>
34#include <linux/nmi.h>
35#include <linux/smp.h> 34#include <linux/smp.h>
36#include <linux/mm.h> 35#include <linux/mm.h>
37 36
@@ -44,14 +43,15 @@
44#include <asm/i8259.h> 43#include <asm/i8259.h>
45#include <asm/proto.h> 44#include <asm/proto.h>
46#include <asm/apic.h> 45#include <asm/apic.h>
46#include <asm/io_apic.h>
47#include <asm/desc.h> 47#include <asm/desc.h>
48#include <asm/hpet.h> 48#include <asm/hpet.h>
49#include <asm/idle.h> 49#include <asm/idle.h>
50#include <asm/mtrr.h> 50#include <asm/mtrr.h>
51#include <asm/smp.h> 51#include <asm/smp.h>
52#include <asm/mce.h> 52#include <asm/mce.h>
53#include <asm/kvm_para.h>
54#include <asm/tsc.h> 53#include <asm/tsc.h>
54#include <asm/hypervisor.h>
55 55
56unsigned int num_processors; 56unsigned int num_processors;
57 57
@@ -79,12 +79,21 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
79EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); 79EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
80 80
81#ifdef CONFIG_X86_32 81#ifdef CONFIG_X86_32
82
83/*
84 * On x86_32, the mapping between cpu and logical apicid may vary
85 * depending on apic in use. The following early percpu variable is
86 * used for the mapping. This is where the behaviors of x86_64 and 32
87 * actually diverge. Let's keep it ugly for now.
88 */
89DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
90
82/* 91/*
83 * Knob to control our willingness to enable the local APIC. 92 * Knob to control our willingness to enable the local APIC.
84 * 93 *
85 * +1=force-enable 94 * +1=force-enable
86 */ 95 */
87static int force_enable_local_apic; 96static int force_enable_local_apic __initdata;
88/* 97/*
89 * APIC command line parameters 98 * APIC command line parameters
90 */ 99 */
@@ -154,7 +163,7 @@ early_param("nox2apic", setup_nox2apic);
154unsigned long mp_lapic_addr; 163unsigned long mp_lapic_addr;
155int disable_apic; 164int disable_apic;
156/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 165/* Disable local APIC timer from the kernel commandline or via dmi quirk */
157static int disable_apic_timer __cpuinitdata; 166static int disable_apic_timer __initdata;
158/* Local APIC timer works in C2 */ 167/* Local APIC timer works in C2 */
159int local_apic_timer_c2_ok; 168int local_apic_timer_c2_ok;
160EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 169EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -178,29 +187,8 @@ static struct resource lapic_resource = {
178 187
179static unsigned int calibration_result; 188static unsigned int calibration_result;
180 189
181static int lapic_next_event(unsigned long delta,
182 struct clock_event_device *evt);
183static void lapic_timer_setup(enum clock_event_mode mode,
184 struct clock_event_device *evt);
185static void lapic_timer_broadcast(const struct cpumask *mask);
186static void apic_pm_activate(void); 190static void apic_pm_activate(void);
187 191
188/*
189 * The local apic timer can be used for any function which is CPU local.
190 */
191static struct clock_event_device lapic_clockevent = {
192 .name = "lapic",
193 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
194 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
195 .shift = 32,
196 .set_mode = lapic_timer_setup,
197 .set_next_event = lapic_next_event,
198 .broadcast = lapic_timer_broadcast,
199 .rating = 100,
200 .irq = -1,
201};
202static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
203
204static unsigned long apic_phys; 192static unsigned long apic_phys;
205 193
206/* 194/*
@@ -239,7 +227,7 @@ static int modern_apic(void)
239 * right after this call apic become NOOP driven 227 * right after this call apic become NOOP driven
240 * so apic->write/read doesn't do anything 228 * so apic->write/read doesn't do anything
241 */ 229 */
242void apic_disable(void) 230static void __init apic_disable(void)
243{ 231{
244 pr_info("APIC: switched to apic NOOP\n"); 232 pr_info("APIC: switched to apic NOOP\n");
245 apic = &apic_noop; 233 apic = &apic_noop;
@@ -283,23 +271,6 @@ u64 native_apic_icr_read(void)
283 return icr1 | ((u64)icr2 << 32); 271 return icr1 | ((u64)icr2 << 32);
284} 272}
285 273
286/**
287 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
288 */
289void __cpuinit enable_NMI_through_LVT0(void)
290{
291 unsigned int v;
292
293 /* unmask and set to NMI */
294 v = APIC_DM_NMI;
295
296 /* Level triggered for 82489DX (32bit mode) */
297 if (!lapic_is_integrated())
298 v |= APIC_LVT_LEVEL_TRIGGER;
299
300 apic_write(APIC_LVT0, v);
301}
302
303#ifdef CONFIG_X86_32 274#ifdef CONFIG_X86_32
304/** 275/**
305 * get_physical_broadcast - Get number of physical broadcast IDs 276 * get_physical_broadcast - Get number of physical broadcast IDs
@@ -370,38 +341,89 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
370} 341}
371 342
372/* 343/*
373 * Setup extended LVT, AMD specific (K8, family 10h) 344 * Setup extended LVT, AMD specific
374 * 345 *
375 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and 346 * Software should use the LVT offsets the BIOS provides. The offsets
376 * MCE interrupts are supported. Thus MCE offset must be set to 0. 347 * are determined by the subsystems using it like those for MCE
348 * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
349 * are supported. Beginning with family 10h at least 4 offsets are
350 * available.
377 * 351 *
378 * If mask=1, the LVT entry does not generate interrupts while mask=0 352 * Since the offsets must be consistent for all cores, we keep track
379 * enables the vector. See also the BKDGs. 353 * of the LVT offsets in software and reserve the offset for the same
354 * vector also to be used on other cores. An offset is freed by
355 * setting the entry to APIC_EILVT_MASKED.
356 *
357 * If the BIOS is right, there should be no conflicts. Otherwise a
358 * "[Firmware Bug]: ..." error message is generated. However, if
359 * software does not properly determines the offsets, it is not
360 * necessarily a BIOS bug.
380 */ 361 */
381 362
382#define APIC_EILVT_LVTOFF_MCE 0 363static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
383#define APIC_EILVT_LVTOFF_IBS 1
384 364
385static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
386{
387	unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
388	unsigned int v = (mask << 16) | (msg_type << 8) | vector;
389
390	apic_write(reg, v);
 365static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
 366{
 367	return (old & APIC_EILVT_MASKED)
 368		|| (new == APIC_EILVT_MASKED)
 369		|| ((new & ~APIC_EILVT_MASKED) == old);
391} 370}
392 371
393u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
394{
395	setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
396	return APIC_EILVT_LVTOFF_MCE;
 372static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
 373{
 374	unsigned int rsvd;		/* 0: uninitialized */
 375
376 if (offset >= APIC_EILVT_NR_MAX)
377 return ~0;
378
379 rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
380 do {
381 if (rsvd &&
382 !eilvt_entry_is_changeable(rsvd, new))
383 /* may not change if vectors are different */
384 return rsvd;
385 rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
386 } while (rsvd != new);
387
388 return new;
397} 389}
398 390
399u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) 391/*
392 * If mask=1, the LVT entry does not generate interrupts while mask=0
393 * enables the vector. See also the BKDGs. Must be called with
394 * preemption disabled.
395 */
396
397int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
400{ 398{
401 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); 399 unsigned long reg = APIC_EILVTn(offset);
402 return APIC_EILVT_LVTOFF_IBS; 400 unsigned int new, old, reserved;
401
402 new = (mask << 16) | (msg_type << 8) | vector;
403 old = apic_read(reg);
404 reserved = reserve_eilvt_offset(offset, new);
405
406 if (reserved != new) {
407 pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
408 "vector 0x%x, but the register is already in use for "
409 "vector 0x%x on another cpu\n",
410 smp_processor_id(), reg, offset, new, reserved);
411 return -EINVAL;
412 }
413
414 if (!eilvt_entry_is_changeable(old, new)) {
415 pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
416 "vector 0x%x, but the register is already in use for "
417 "vector 0x%x on this cpu\n",
418 smp_processor_id(), reg, offset, new, old);
419 return -EBUSY;
420 }
421
422 apic_write(reg, new);
423
424 return 0;
403} 425}
404EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); 426EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
405 427
406/* 428/*
407 * Program the next event, relative to now 429 * Program the next event, relative to now
@@ -459,6 +481,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
459#endif 481#endif
460} 482}
461 483
484
485/*
486 * The local apic timer can be used for any function which is CPU local.
487 */
488static struct clock_event_device lapic_clockevent = {
489 .name = "lapic",
490 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
491 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
492 .shift = 32,
493 .set_mode = lapic_timer_setup,
494 .set_next_event = lapic_next_event,
495 .broadcast = lapic_timer_broadcast,
496 .rating = 100,
497 .irq = -1,
498};
499static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
500
462/* 501/*
463 * Setup the local APIC timer for this CPU. Copy the initialized values 502 * Setup the local APIC timer for this CPU. Copy the initialized values
464 * of the boot CPU and register the clock event in the framework. 503 * of the boot CPU and register the clock event in the framework.
@@ -467,7 +506,7 @@ static void __cpuinit setup_APIC_timer(void)
467{ 506{
468 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 507 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
469 508
470 if (cpu_has(&current_cpu_data, X86_FEATURE_ARAT)) { 509 if (this_cpu_has(X86_FEATURE_ARAT)) {
471 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; 510 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
472 /* Make LAPIC timer preferrable over percpu HPET */ 511 /* Make LAPIC timer preferrable over percpu HPET */
473 lapic_clockevent.rating = 150; 512 lapic_clockevent.rating = 150;
@@ -635,7 +674,7 @@ static int __init calibrate_APIC_clock(void)
635 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 674 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
636 lapic_clockevent.shift); 675 lapic_clockevent.shift);
637 lapic_clockevent.max_delta_ns = 676 lapic_clockevent.max_delta_ns =
638 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); 677 clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
639 lapic_clockevent.min_delta_ns = 678 lapic_clockevent.min_delta_ns =
640 clockevent_delta2ns(0xF, &lapic_clockevent); 679 clockevent_delta2ns(0xF, &lapic_clockevent);
641 680
@@ -750,11 +789,7 @@ void __init setup_boot_APIC_clock(void)
750 * PIT/HPET going. Otherwise register lapic as a dummy 789 * PIT/HPET going. Otherwise register lapic as a dummy
751 * device. 790 * device.
752 */ 791 */
753 if (nmi_watchdog != NMI_IO_APIC) 792 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
754 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
755 else
756 pr_warning("APIC timer registered as dummy,"
757 " due to nmi_watchdog=%d!\n", nmi_watchdog);
758 793
759 /* Setup the lapic or request the broadcast */ 794 /* Setup the lapic or request the broadcast */
760 setup_APIC_timer(); 795 setup_APIC_timer();
@@ -1146,12 +1181,15 @@ static void __cpuinit lapic_setup_esr(void)
1146 oldvalue, value); 1181 oldvalue, value);
1147} 1182}
1148 1183
1149
1150/** 1184/**
1151 * setup_local_APIC - setup the local APIC 1185 * setup_local_APIC - setup the local APIC
1186 *
1187 * Used to setup local APIC while initializing BSP or bringin up APs.
1188 * Always called with preemption disabled.
1152 */ 1189 */
1153void __cpuinit setup_local_APIC(void) 1190void __cpuinit setup_local_APIC(void)
1154{ 1191{
1192 int cpu = smp_processor_id();
1155 unsigned int value, queued; 1193 unsigned int value, queued;
1156 int i, j, acked = 0; 1194 int i, j, acked = 0;
1157 unsigned long long tsc = 0, ntsc; 1195 unsigned long long tsc = 0, ntsc;
@@ -1161,7 +1199,7 @@ void __cpuinit setup_local_APIC(void)
1161 rdtscll(tsc); 1199 rdtscll(tsc);
1162 1200
1163 if (disable_apic) { 1201 if (disable_apic) {
1164 arch_disable_smp_support(); 1202 disable_ioapic_support();
1165 return; 1203 return;
1166 } 1204 }
1167 1205
@@ -1176,8 +1214,6 @@ void __cpuinit setup_local_APIC(void)
1176#endif 1214#endif
1177 perf_events_lapic_init(); 1215 perf_events_lapic_init();
1178 1216
1179 preempt_disable();
1180
1181 /* 1217 /*
1182 * Double-check whether this APIC is really registered. 1218 * Double-check whether this APIC is really registered.
1183 * This is meaningless in clustered apic mode, so we skip it. 1219 * This is meaningless in clustered apic mode, so we skip it.
@@ -1191,6 +1227,30 @@ void __cpuinit setup_local_APIC(void)
1191 */ 1227 */
1192 apic->init_apic_ldr(); 1228 apic->init_apic_ldr();
1193 1229
1230#ifdef CONFIG_X86_32
1231 /*
1232 * APIC LDR is initialized. If logical_apicid mapping was
1233 * initialized during get_smp_config(), make sure it matches the
1234 * actual value.
1235 */
1236 i = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
1237 WARN_ON(i != BAD_APICID && i != logical_smp_processor_id());
1238 /* always use the value from LDR */
1239 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1240 logical_smp_processor_id();
1241
1242 /*
1243 * Some NUMA implementations (NUMAQ) don't initialize apicid to
1244 * node mapping during NUMA init. Now that logical apicid is
1245 * guaranteed to be known, give it another chance. This is already
1246 * a bit too late - percpu allocation has already happened without
1247 * proper NUMA affinity.
1248 */
1249 if (apic->x86_32_numa_cpu_node)
1250 set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
1251 apic->x86_32_numa_cpu_node(cpu));
1252#endif
1253
1194 /* 1254 /*
1195 * Set Task Priority to 'accept all'. We never change this 1255 * Set Task Priority to 'accept all'. We never change this
1196 * later on. 1256 * later on.
@@ -1293,21 +1353,19 @@ void __cpuinit setup_local_APIC(void)
1293 * TODO: set up through-local-APIC from through-I/O-APIC? --macro 1353 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
1294 */ 1354 */
1295 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; 1355 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
1296 if (!smp_processor_id() && (pic_mode || !value)) { 1356 if (!cpu && (pic_mode || !value)) {
1297 value = APIC_DM_EXTINT; 1357 value = APIC_DM_EXTINT;
1298 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", 1358 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
1299 smp_processor_id());
1300 } else { 1359 } else {
1301 value = APIC_DM_EXTINT | APIC_LVT_MASKED; 1360 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
1302 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1361 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu);
1303 smp_processor_id());
1304 } 1362 }
1305 apic_write(APIC_LVT0, value); 1363 apic_write(APIC_LVT0, value);
1306 1364
1307 /* 1365 /*
1308 * only the BP should see the LINT1 NMI signal, obviously. 1366 * only the BP should see the LINT1 NMI signal, obviously.
1309 */ 1367 */
1310 if (!smp_processor_id()) 1368 if (!cpu)
1311 value = APIC_DM_NMI; 1369 value = APIC_DM_NMI;
1312 else 1370 else
1313 value = APIC_DM_NMI | APIC_LVT_MASKED; 1371 value = APIC_DM_NMI | APIC_LVT_MASKED;
@@ -1315,11 +1373,9 @@ void __cpuinit setup_local_APIC(void)
1315 value |= APIC_LVT_LEVEL_TRIGGER; 1373 value |= APIC_LVT_LEVEL_TRIGGER;
1316 apic_write(APIC_LVT1, value); 1374 apic_write(APIC_LVT1, value);
1317 1375
1318 preempt_enable();
1319
1320#ifdef CONFIG_X86_MCE_INTEL 1376#ifdef CONFIG_X86_MCE_INTEL
1321 /* Recheck CMCI information after local APIC is up on CPU #0 */ 1377 /* Recheck CMCI information after local APIC is up on CPU #0 */
1322 if (smp_processor_id() == 0) 1378 if (!cpu)
1323 cmci_recheck(); 1379 cmci_recheck();
1324#endif 1380#endif
1325} 1381}
@@ -1338,10 +1394,22 @@ void __cpuinit end_local_APIC_setup(void)
1338 } 1394 }
1339#endif 1395#endif
1340 1396
1341 setup_apic_nmi_watchdog(NULL);
1342 apic_pm_activate(); 1397 apic_pm_activate();
1343} 1398}
1344 1399
1400void __init bsp_end_local_APIC_setup(void)
1401{
1402 end_local_APIC_setup();
1403
1404 /*
1405 * Now that local APIC setup is completed for BP, configure the fault
1406 * handling for interrupt remapping.
1407 */
1408 if (intr_remapping_enabled)
1409 enable_drhd_fault_handling();
1410
1411}
1412
1345#ifdef CONFIG_X86_X2APIC 1413#ifdef CONFIG_X86_X2APIC
1346void check_x2apic(void) 1414void check_x2apic(void)
1347{ 1415{
@@ -1394,7 +1462,6 @@ int __init enable_IR(void)
1394void __init enable_IR_x2apic(void) 1462void __init enable_IR_x2apic(void)
1395{ 1463{
1396 unsigned long flags; 1464 unsigned long flags;
1397 struct IO_APIC_route_entry **ioapic_entries = NULL;
1398 int ret, x2apic_enabled = 0; 1465 int ret, x2apic_enabled = 0;
1399 int dmar_table_init_ret; 1466 int dmar_table_init_ret;
1400 1467
@@ -1402,13 +1469,7 @@ void __init enable_IR_x2apic(void)
1402 if (dmar_table_init_ret && !x2apic_supported()) 1469 if (dmar_table_init_ret && !x2apic_supported())
1403 return; 1470 return;
1404 1471
1405 ioapic_entries = alloc_ioapic_entries(); 1472 ret = save_ioapic_entries();
1406 if (!ioapic_entries) {
1407 pr_err("Allocate ioapic_entries failed\n");
1408 goto out;
1409 }
1410
1411 ret = save_IO_APIC_setup(ioapic_entries);
1412 if (ret) { 1473 if (ret) {
1413 pr_info("Saving IO-APIC state failed: %d\n", ret); 1474 pr_info("Saving IO-APIC state failed: %d\n", ret);
1414 goto out; 1475 goto out;
@@ -1416,7 +1477,7 @@ void __init enable_IR_x2apic(void)
1416 1477
1417 local_irq_save(flags); 1478 local_irq_save(flags);
1418 legacy_pic->mask_all(); 1479 legacy_pic->mask_all();
1419 mask_IO_APIC_setup(ioapic_entries); 1480 mask_ioapic_entries();
1420 1481
1421 if (dmar_table_init_ret) 1482 if (dmar_table_init_ret)
1422 ret = 0; 1483 ret = 0;
@@ -1427,7 +1488,8 @@ void __init enable_IR_x2apic(void)
1427 /* IR is required if there is APIC ID > 255 even when running 1488 /* IR is required if there is APIC ID > 255 even when running
1428 * under KVM 1489 * under KVM
1429 */ 1490 */
1430 if (max_physical_apicid > 255 || !kvm_para_available()) 1491 if (max_physical_apicid > 255 ||
1492 !hypervisor_x2apic_available())
1431 goto nox2apic; 1493 goto nox2apic;
1432 /* 1494 /*
1433 * without IR all CPUs can be addressed by IOAPIC/MSI 1495 * without IR all CPUs can be addressed by IOAPIC/MSI
@@ -1446,14 +1508,11 @@ void __init enable_IR_x2apic(void)
1446 1508
1447nox2apic: 1509nox2apic:
1448 if (!ret) /* IR enabling failed */ 1510 if (!ret) /* IR enabling failed */
1449 restore_IO_APIC_setup(ioapic_entries); 1511 restore_ioapic_entries();
1450 legacy_pic->restore_mask(); 1512 legacy_pic->restore_mask();
1451 local_irq_restore(flags); 1513 local_irq_restore(flags);
1452 1514
1453out: 1515out:
1454 if (ioapic_entries)
1455 free_ioapic_entries(ioapic_entries);
1456
1457 if (x2apic_enabled) 1516 if (x2apic_enabled)
1458 return; 1517 return;
1459 1518
@@ -1481,13 +1540,60 @@ static int __init detect_init_APIC(void)
1481 return 0; 1540 return 0;
1482} 1541}
1483#else 1542#else
1543
1544static int __init apic_verify(void)
1545{
1546 u32 features, h, l;
1547
1548 /*
1549 * The APIC feature bit should now be enabled
1550 * in `cpuid'
1551 */
1552 features = cpuid_edx(1);
1553 if (!(features & (1 << X86_FEATURE_APIC))) {
1554 pr_warning("Could not enable APIC!\n");
1555 return -1;
1556 }
1557 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1558 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1559
1560 /* The BIOS may have set up the APIC at some other address */
1561 rdmsr(MSR_IA32_APICBASE, l, h);
1562 if (l & MSR_IA32_APICBASE_ENABLE)
1563 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1564
1565 pr_info("Found and enabled local APIC!\n");
1566 return 0;
1567}
1568
1569int __init apic_force_enable(unsigned long addr)
1570{
1571 u32 h, l;
1572
1573 if (disable_apic)
1574 return -1;
1575
1576 /*
1577 * Some BIOSes disable the local APIC in the APIC_BASE
1578 * MSR. This can only be done in software for Intel P6 or later
1579 * and AMD K7 (Model > 1) or later.
1580 */
1581 rdmsr(MSR_IA32_APICBASE, l, h);
1582 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1583 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1584 l &= ~MSR_IA32_APICBASE_BASE;
1585 l |= MSR_IA32_APICBASE_ENABLE | addr;
1586 wrmsr(MSR_IA32_APICBASE, l, h);
1587 enabled_via_apicbase = 1;
1588 }
1589 return apic_verify();
1590}
1591
1484/* 1592/*
1485 * Detect and initialize APIC 1593 * Detect and initialize APIC
1486 */ 1594 */
1487static int __init detect_init_APIC(void) 1595static int __init detect_init_APIC(void)
1488{ 1596{
1489 u32 h, l, features;
1490
1491 /* Disabled by kernel option? */ 1597 /* Disabled by kernel option? */
1492 if (disable_apic) 1598 if (disable_apic)
1493 return -1; 1599 return -1;
@@ -1517,38 +1623,12 @@ static int __init detect_init_APIC(void)
1517 "you can enable it with \"lapic\"\n"); 1623 "you can enable it with \"lapic\"\n");
1518 return -1; 1624 return -1;
1519 } 1625 }
1520 /* 1626 if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
1521 * Some BIOSes disable the local APIC in the APIC_BASE 1627 return -1;
1522 * MSR. This can only be done in software for Intel P6 or later 1628 } else {
1523 * and AMD K7 (Model > 1) or later. 1629 if (apic_verify())
1524 */ 1630 return -1;
1525 rdmsr(MSR_IA32_APICBASE, l, h);
1526 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1527 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1528 l &= ~MSR_IA32_APICBASE_BASE;
1529 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
1530 wrmsr(MSR_IA32_APICBASE, l, h);
1531 enabled_via_apicbase = 1;
1532 }
1533 }
1534 /*
1535 * The APIC feature bit should now be enabled
1536 * in `cpuid'
1537 */
1538 features = cpuid_edx(1);
1539 if (!(features & (1 << X86_FEATURE_APIC))) {
1540 pr_warning("Could not enable APIC!\n");
1541 return -1;
1542 } 1631 }
1543 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1544 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1545
1546 /* The BIOS may have set up the APIC at some other address */
1547 rdmsr(MSR_IA32_APICBASE, l, h);
1548 if (l & MSR_IA32_APICBASE_ENABLE)
1549 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1550
1551 pr_info("Found and enabled local APIC!\n");
1552 1632
1553 apic_pm_activate(); 1633 apic_pm_activate();
1554 1634
@@ -1560,28 +1640,6 @@ no_apic:
1560} 1640}
1561#endif 1641#endif
1562 1642
1563#ifdef CONFIG_X86_64
1564void __init early_init_lapic_mapping(void)
1565{
1566 /*
1567 * If no local APIC can be found then go out
1568 * : it means there is no mpatable and MADT
1569 */
1570 if (!smp_found_config)
1571 return;
1572
1573 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
1574 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1575 APIC_BASE, mp_lapic_addr);
1576
1577 /*
1578 * Fetch the APIC ID of the BSP in case we have a
1579 * default configuration (or the MP table is broken).
1580 */
1581 boot_cpu_physical_apicid = read_apic_id();
1582}
1583#endif
1584
1585/** 1643/**
1586 * init_apic_mappings - initialize APIC mappings 1644 * init_apic_mappings - initialize APIC mappings
1587 */ 1645 */
@@ -1607,10 +1665,7 @@ void __init init_apic_mappings(void)
1607 * acpi_register_lapic_address() 1665 * acpi_register_lapic_address()
1608 */ 1666 */
1609 if (!acpi_lapic && !smp_found_config) 1667 if (!acpi_lapic && !smp_found_config)
1610 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1668 register_lapic_address(apic_phys);
1611
1612 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1613 APIC_BASE, apic_phys);
1614 } 1669 }
1615 1670
1616 /* 1671 /*
@@ -1632,11 +1687,27 @@ void __init init_apic_mappings(void)
1632 } 1687 }
1633} 1688}
1634 1689
1690void __init register_lapic_address(unsigned long address)
1691{
1692 mp_lapic_addr = address;
1693
1694 if (!x2apic_mode) {
1695 set_fixmap_nocache(FIX_APIC_BASE, address);
1696 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1697 APIC_BASE, mp_lapic_addr);
1698 }
1699 if (boot_cpu_physical_apicid == -1U) {
1700 boot_cpu_physical_apicid = read_apic_id();
1701 apic_version[boot_cpu_physical_apicid] =
1702 GET_APIC_VERSION(apic_read(APIC_LVR));
1703 }
1704}
1705
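Editor's note: register_lapic_address() now centralises the fixmap mapping and BSP-id capture that early_init_lapic_mapping() and init_apic_mappings() used to open-code. A hedged sketch of a caller (the function name below is hypothetical; the real callers are the MP-table/ACPI parsers):

static void __init example_register_lapic(unsigned long lapic_phys)
{
	/* fall back to the architectural default, 0xfee00000 */
	if (!lapic_phys)
		lapic_phys = APIC_DEFAULT_PHYS_BASE;

	/* maps the LAPIC (unless x2apic is in use) and records the BSP's apic id */
	register_lapic_address(lapic_phys);
}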
1635/* 1706/*
1636 * This initializes the IO-APIC and APIC hardware if this is 1707 * This initializes the IO-APIC and APIC hardware if this is
1637 * a UP kernel. 1708 * a UP kernel.
1638 */ 1709 */
1639int apic_version[MAX_APICS]; 1710int apic_version[MAX_LOCAL_APIC];
1640 1711
1641int __init APIC_init_uniprocessor(void) 1712int __init APIC_init_uniprocessor(void)
1642{ 1713{
@@ -1665,10 +1736,7 @@ int __init APIC_init_uniprocessor(void)
1665 } 1736 }
1666#endif 1737#endif
1667 1738
1668#ifndef CONFIG_SMP
1669 enable_IR_x2apic();
1670 default_setup_apic_routing(); 1739 default_setup_apic_routing();
1671#endif
1672 1740
1673 verify_local_APIC(); 1741 verify_local_APIC();
1674 connect_bsp_APIC(); 1742 connect_bsp_APIC();
@@ -1697,24 +1765,17 @@ int __init APIC_init_uniprocessor(void)
1697 enable_IO_APIC(); 1765 enable_IO_APIC();
1698#endif 1766#endif
1699 1767
1700 end_local_APIC_setup(); 1768 bsp_end_local_APIC_setup();
1701 1769
1702#ifdef CONFIG_X86_IO_APIC 1770#ifdef CONFIG_X86_IO_APIC
1703 if (smp_found_config && !skip_ioapic_setup && nr_ioapics) 1771 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1704 setup_IO_APIC(); 1772 setup_IO_APIC();
1705 else { 1773 else {
1706 nr_ioapics = 0; 1774 nr_ioapics = 0;
1707 localise_nmi_watchdog();
1708 } 1775 }
1709#else
1710 localise_nmi_watchdog();
1711#endif 1776#endif
1712 1777
1713 x86_init.timers.setup_percpu_clockev(); 1778 x86_init.timers.setup_percpu_clockev();
1714#ifdef CONFIG_X86_64
1715 check_nmi_watchdog();
1716#endif
1717
1718 return 0; 1779 return 0;
1719} 1780}
1720 1781
@@ -1753,30 +1814,41 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1753 */ 1814 */
1754void smp_error_interrupt(struct pt_regs *regs) 1815void smp_error_interrupt(struct pt_regs *regs)
1755{ 1816{
1756 u32 v, v1; 1817 u32 v0, v1;
1818 u32 i = 0;
1819 static const char * const error_interrupt_reason[] = {
1820 "Send CS error", /* APIC Error Bit 0 */
1821 "Receive CS error", /* APIC Error Bit 1 */
1822 "Send accept error", /* APIC Error Bit 2 */
1823 "Receive accept error", /* APIC Error Bit 3 */
1824 "Redirectable IPI", /* APIC Error Bit 4 */
1825 "Send illegal vector", /* APIC Error Bit 5 */
1826 "Received illegal vector", /* APIC Error Bit 6 */
1827 "Illegal register address", /* APIC Error Bit 7 */
1828 };
1757 1829
1758 exit_idle(); 1830 exit_idle();
1759 irq_enter(); 1831 irq_enter();
1760 /* First tickle the hardware, only then report what went on. -- REW */ 1832 /* First tickle the hardware, only then report what went on. -- REW */
1761 v = apic_read(APIC_ESR); 1833 v0 = apic_read(APIC_ESR);
1762 apic_write(APIC_ESR, 0); 1834 apic_write(APIC_ESR, 0);
1763 v1 = apic_read(APIC_ESR); 1835 v1 = apic_read(APIC_ESR);
1764 ack_APIC_irq(); 1836 ack_APIC_irq();
1765 atomic_inc(&irq_err_count); 1837 atomic_inc(&irq_err_count);
1766 1838
1767 /* 1839 apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
1768 * Here is what the APIC error bits mean: 1840 smp_processor_id(), v0 , v1);
1769 * 0: Send CS error 1841
1770 * 1: Receive CS error 1842 v1 = v1 & 0xff;
1771 * 2: Send accept error 1843 while (v1) {
1772 * 3: Receive accept error 1844 if (v1 & 0x1)
1773 * 4: Reserved 1845 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
1774 * 5: Send illegal vector 1846 i++;
1775 * 6: Received illegal vector 1847 v1 >>= 1;
1776 * 7: Illegal register address 1848 };
1777 */ 1849
1778 pr_debug("APIC error on CPU%d: %02x(%02x)\n", 1850 apic_printk(APIC_DEBUG, KERN_CONT "\n");
1779 smp_processor_id(), v , v1); 1851
1780 irq_exit(); 1852 irq_exit();
1781} 1853}
1782 1854
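Editor's note: the rewritten smp_error_interrupt() above walks the low byte of the ESR and prints a human-readable name for each set bit. A condensed, standalone sketch of that decode (helper name hypothetical):

#include <linux/kernel.h>

static void example_decode_esr(u32 esr)
{
	static const char * const reason[] = {
		"Send CS error", "Receive CS error",
		"Send accept error", "Receive accept error",
		"Redirectable IPI", "Send illegal vector",
		"Received illegal vector", "Illegal register address",
	};
	unsigned int bit;

	for (bit = 0; bit < 8; bit++)
		if (esr & (1u << bit))
			pr_cont(" : %s", reason[bit]);
	pr_cont("\n");
}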
@@ -1873,17 +1945,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1873{ 1945{
1874 int cpu; 1946 int cpu;
1875 1947
1876 /*
1877 * Validate version
1878 */
1879 if (version == 0x0) {
1880 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
1881 "fixing up to 0x10. (tell your hw vendor)\n",
1882 version);
1883 version = 0x10;
1884 }
1885 apic_version[apicid] = version;
1886
1887 if (num_processors >= nr_cpu_ids) { 1948 if (num_processors >= nr_cpu_ids) {
1888 int max = nr_cpu_ids; 1949 int max = nr_cpu_ids;
1889 int thiscpu = max + disabled_cpus; 1950 int thiscpu = max + disabled_cpus;
@@ -1897,22 +1958,34 @@ void __cpuinit generic_processor_info(int apicid, int version)
1897 } 1958 }
1898 1959
1899 num_processors++; 1960 num_processors++;
1900 cpu = cpumask_next_zero(-1, cpu_present_mask);
1901
1902 if (version != apic_version[boot_cpu_physical_apicid])
1903 WARN_ONCE(1,
1904 "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
1905 apic_version[boot_cpu_physical_apicid], cpu, version);
1906
1907 physid_set(apicid, phys_cpu_present_map);
1908 if (apicid == boot_cpu_physical_apicid) { 1961 if (apicid == boot_cpu_physical_apicid) {
1909 /* 1962 /*
1910 * x86_bios_cpu_apicid is required to have processors listed 1963 * x86_bios_cpu_apicid is required to have processors listed
1911 * in same order as logical cpu numbers. Hence the first 1964 * in same order as logical cpu numbers. Hence the first
1912 * entry is BSP, and so on. 1965 * entry is BSP, and so on.
 1966 * boot_cpu_init() already holds bit 0 in cpu_present_mask
1967 * for BSP.
1913 */ 1968 */
1914 cpu = 0; 1969 cpu = 0;
1970 } else
1971 cpu = cpumask_next_zero(-1, cpu_present_mask);
1972
1973 /*
1974 * Validate version
1975 */
1976 if (version == 0x0) {
1977 pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
1978 cpu, apicid);
1979 version = 0x10;
1915 } 1980 }
1981 apic_version[apicid] = version;
1982
1983 if (version != apic_version[boot_cpu_physical_apicid]) {
1984 pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
1985 apic_version[boot_cpu_physical_apicid], cpu, version);
1986 }
1987
1988 physid_set(apicid, phys_cpu_present_map);
1916 if (apicid > max_physical_apicid) 1989 if (apicid > max_physical_apicid)
1917 max_physical_apicid = apicid; 1990 max_physical_apicid = apicid;
1918 1991
@@ -1920,7 +1993,10 @@ void __cpuinit generic_processor_info(int apicid, int version)
1920 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1993 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1921 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1994 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1922#endif 1995#endif
1923 1996#ifdef CONFIG_X86_32
1997 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1998 apic->x86_32_early_logical_apicid(cpu);
1999#endif
1924 set_cpu_possible(cpu, true); 2000 set_cpu_possible(cpu, true);
1925 set_cpu_present(cpu, true); 2001 set_cpu_present(cpu, true);
1926} 2002}
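Editor's note: the reordered generic_processor_info() now picks the cpu number before validating the APIC version: the BSP keeps cpu 0 (boot_cpu_init() has already marked it present), everyone else takes the first free present-mask slot. A condensed sketch of just that selection:

static int example_pick_cpu_number(int apicid)
{
	if (apicid == boot_cpu_physical_apicid)
		return 0;	/* BSP: slot 0 is already reserved */

	return cpumask_next_zero(-1, cpu_present_mask);
}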
@@ -1940,17 +2016,6 @@ void default_init_apic_ldr(void)
1940 apic_write(APIC_LDR, val); 2016 apic_write(APIC_LDR, val);
1941} 2017}
1942 2018
1943#ifdef CONFIG_X86_32
1944int default_apicid_to_node(int logical_apicid)
1945{
1946#ifdef CONFIG_SMP
1947 return apicid_2_node[hard_smp_processor_id()];
1948#else
1949 return 0;
1950#endif
1951}
1952#endif
1953
1954/* 2019/*
1955 * Power management 2020 * Power management
1956 */ 2021 */
@@ -1979,7 +2044,7 @@ static struct {
1979 unsigned int apic_thmr; 2044 unsigned int apic_thmr;
1980} apic_pm_state; 2045} apic_pm_state;
1981 2046
1982static int lapic_suspend(struct sys_device *dev, pm_message_t state) 2047static int lapic_suspend(void)
1983{ 2048{
1984 unsigned long flags; 2049 unsigned long flags;
1985 int maxlvt; 2050 int maxlvt;
@@ -2017,34 +2082,24 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
2017 return 0; 2082 return 0;
2018} 2083}
2019 2084
2020static int lapic_resume(struct sys_device *dev) 2085static void lapic_resume(void)
2021{ 2086{
2022 unsigned int l, h; 2087 unsigned int l, h;
2023 unsigned long flags; 2088 unsigned long flags;
2024 int maxlvt; 2089 int maxlvt;
2025 int ret = 0;
2026 struct IO_APIC_route_entry **ioapic_entries = NULL;
2027 2090
2028 if (!apic_pm_state.active) 2091 if (!apic_pm_state.active)
2029 return 0; 2092 return;
2030 2093
2031 local_irq_save(flags); 2094 local_irq_save(flags);
2032 if (intr_remapping_enabled) { 2095 if (intr_remapping_enabled) {
2033 ioapic_entries = alloc_ioapic_entries(); 2096 /*
2034 if (!ioapic_entries) { 2097 * IO-APIC and PIC have their own resume routines.
2035 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2098 * We just mask them here to make sure the interrupt
2036 ret = -ENOMEM; 2099 * subsystem is completely quiet while we enable x2apic
2037 goto restore; 2100 * and interrupt-remapping.
2038 } 2101 */
2039 2102 mask_ioapic_entries();
2040 ret = save_IO_APIC_setup(ioapic_entries);
2041 if (ret) {
2042 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
2043 free_ioapic_entries(ioapic_entries);
2044 goto restore;
2045 }
2046
2047 mask_IO_APIC_setup(ioapic_entries);
2048 legacy_pic->mask_all(); 2103 legacy_pic->mask_all();
2049 } 2104 }
2050 2105
@@ -2087,16 +2142,10 @@ static int lapic_resume(struct sys_device *dev)
2087 apic_write(APIC_ESR, 0); 2142 apic_write(APIC_ESR, 0);
2088 apic_read(APIC_ESR); 2143 apic_read(APIC_ESR);
2089 2144
2090 if (intr_remapping_enabled) { 2145 if (intr_remapping_enabled)
2091 reenable_intr_remapping(x2apic_mode); 2146 reenable_intr_remapping(x2apic_mode);
2092 legacy_pic->restore_mask();
2093 restore_IO_APIC_setup(ioapic_entries);
2094 free_ioapic_entries(ioapic_entries);
2095 }
2096restore:
2097 local_irq_restore(flags);
2098 2147
2099 return ret; 2148 local_irq_restore(flags);
2100} 2149}
2101 2150
2102/* 2151/*
@@ -2104,17 +2153,11 @@ restore:
2104 * are needed on every CPU up until machine_halt/restart/poweroff. 2153 * are needed on every CPU up until machine_halt/restart/poweroff.
2105 */ 2154 */
2106 2155
2107static struct sysdev_class lapic_sysclass = { 2156static struct syscore_ops lapic_syscore_ops = {
2108 .name = "lapic",
2109 .resume = lapic_resume, 2157 .resume = lapic_resume,
2110 .suspend = lapic_suspend, 2158 .suspend = lapic_suspend,
2111}; 2159};
2112 2160
2113static struct sys_device device_lapic = {
2114 .id = 0,
2115 .cls = &lapic_sysclass,
2116};
2117
2118static void __cpuinit apic_pm_activate(void) 2161static void __cpuinit apic_pm_activate(void)
2119{ 2162{
2120 apic_pm_state.active = 1; 2163 apic_pm_state.active = 1;
@@ -2122,16 +2165,11 @@ static void __cpuinit apic_pm_activate(void)
2122 2165
2123static int __init init_lapic_sysfs(void) 2166static int __init init_lapic_sysfs(void)
2124{ 2167{
2125 int error;
2126
2127 if (!cpu_has_apic)
2128 return 0;
2129 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ 2168 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
2169 if (cpu_has_apic)
2170 register_syscore_ops(&lapic_syscore_ops);
2130 2171
2131 error = sysdev_class_register(&lapic_sysclass); 2172 return 0;
2132 if (!error)
2133 error = sysdev_register(&device_lapic);
2134 return error;
2135} 2173}
2136 2174
2137/* local apic needs to resume before other devices access its registers. */ 2175/* local apic needs to resume before other devices access its registers. */
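Editor's note: the suspend/resume path above moves from the old sysdev class to syscore_ops. A minimal sketch of that pattern, assuming nothing beyond <linux/syscore_ops.h>: suspend returns an int (non-zero aborts suspend), resume returns void, and both run with interrupts disabled on the boot CPU:

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int example_suspend(void)
{
	/* save hardware state here */
	return 0;
}

static void example_resume(void)
{
	/* restore hardware state here; cannot fail */
}

static struct syscore_ops example_syscore_ops = {
	.suspend = example_suspend,
	.resume  = example_resume,
};

static int __init example_init(void)
{
	register_syscore_ops(&example_syscore_ops);
	return 0;
}
device_initcall(example_init);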
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 09d3b17ce0c2..f7a41e4cae47 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -16,6 +16,7 @@
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/module.h>
19#include <asm/smp.h> 20#include <asm/smp.h>
20#include <asm/apic.h> 21#include <asm/apic.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
@@ -24,6 +25,12 @@
24#include <acpi/acpi_bus.h> 25#include <acpi/acpi_bus.h>
25#endif 26#endif
26 27
28static struct apic apic_physflat;
29static struct apic apic_flat;
30
31struct apic __read_mostly *apic = &apic_flat;
32EXPORT_SYMBOL_GPL(apic);
33
27static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 34static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
28{ 35{
29 return 1; 36 return 1;
@@ -164,7 +171,7 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
164 return initial_apic_id >> index_msb; 171 return initial_apic_id >> index_msb;
165} 172}
166 173
167struct apic apic_flat = { 174static struct apic apic_flat = {
168 .name = "flat", 175 .name = "flat",
169 .probe = NULL, 176 .probe = NULL,
170 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 177 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
@@ -185,8 +192,6 @@ struct apic apic_flat = {
185 .ioapic_phys_id_map = NULL, 192 .ioapic_phys_id_map = NULL,
186 .setup_apic_routing = NULL, 193 .setup_apic_routing = NULL,
187 .multi_timer_check = NULL, 194 .multi_timer_check = NULL,
188 .apicid_to_node = NULL,
189 .cpu_to_logical_apicid = NULL,
190 .cpu_present_to_apicid = default_cpu_present_to_apicid, 195 .cpu_present_to_apicid = default_cpu_present_to_apicid,
191 .apicid_to_cpu_present = NULL, 196 .apicid_to_cpu_present = NULL,
192 .setup_portio_remap = NULL, 197 .setup_portio_remap = NULL,
@@ -314,10 +319,18 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
314 return per_cpu(x86_cpu_to_apicid, cpu); 319 return per_cpu(x86_cpu_to_apicid, cpu);
315} 320}
316 321
317struct apic apic_physflat = { 322static int physflat_probe(void)
323{
324 if (apic == &apic_physflat || num_possible_cpus() > 8)
325 return 1;
326
327 return 0;
328}
329
330static struct apic apic_physflat = {
318 331
319 .name = "physical flat", 332 .name = "physical flat",
320 .probe = NULL, 333 .probe = physflat_probe,
321 .acpi_madt_oem_check = physflat_acpi_madt_oem_check, 334 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
322 .apic_id_registered = flat_apic_id_registered, 335 .apic_id_registered = flat_apic_id_registered,
323 336
@@ -337,8 +350,6 @@ struct apic apic_physflat = {
337 .ioapic_phys_id_map = NULL, 350 .ioapic_phys_id_map = NULL,
338 .setup_apic_routing = NULL, 351 .setup_apic_routing = NULL,
339 .multi_timer_check = NULL, 352 .multi_timer_check = NULL,
340 .apicid_to_node = NULL,
341 .cpu_to_logical_apicid = NULL,
342 .cpu_present_to_apicid = default_cpu_present_to_apicid, 353 .cpu_present_to_apicid = default_cpu_present_to_apicid,
343 .apicid_to_cpu_present = NULL, 354 .apicid_to_cpu_present = NULL,
344 .setup_portio_remap = NULL, 355 .setup_portio_remap = NULL,
@@ -373,3 +384,8 @@ struct apic apic_physflat = {
373 .wait_icr_idle = native_apic_wait_icr_idle, 384 .wait_icr_idle = native_apic_wait_icr_idle,
374 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 385 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
375}; 386};
387
388/*
389 * We need to check for physflat first, so this order is important.
390 */
391apic_drivers(apic_physflat, apic_flat);
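Editor's note: apic_flat and apic_physflat are no longer exported globals; they are registered through apic_drivers(), and physflat_probe() claims the machine when more than 8 CPUs are possible. A rough sketch of how a probe loop could walk such a driver table -- the section symbols below (__apicdrivers/__apicdrivers_end) are an assumption, shown only to illustrate why listing apic_physflat first matters (the first successful ->probe() wins):

extern struct apic *__apicdrivers[], *__apicdrivers_end[];

static struct apic * __init example_probe_apic(void)
{
	struct apic **drv;

	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++)
		if ((*drv)->probe && (*drv)->probe())
			return *drv;	/* first match wins */

	return NULL;
}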
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e31b9ffe25f5..775b82bc655c 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void)
54 return 0; 54 return 0;
55} 55}
56 56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb) 57static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{ 58{
64 return 0; 59 return 0;
@@ -113,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
113 cpumask_set_cpu(cpu, retmask); 108 cpumask_set_cpu(cpu, retmask);
114} 109}
115 110
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg) 111static u32 noop_apic_read(u32 reg)
123{ 112{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic)); 113 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
@@ -153,9 +142,7 @@ struct apic apic_noop = {
153 .ioapic_phys_id_map = default_ioapic_phys_id_map, 142 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL, 143 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL, 144 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157 145
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid, 146 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid, 147 .apicid_to_cpu_present = physid_set_mask_of_physid,
161 148
@@ -197,4 +184,8 @@ struct apic apic_noop = {
197 .icr_write = noop_apic_icr_write, 184 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle, 185 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, 186 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
187
188#ifdef CONFIG_X86_32
189 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
190#endif
200}; 191};
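Editor's note: each 32-bit apic driver now supplies an ->x86_32_early_logical_apicid() callback in place of apicid_to_node/cpu_to_logical_apicid; generic_processor_info() (see the apic.c hunk earlier) caches its result in the early per-cpu map. A small sketch of how that cached value is read back later (headers omitted; early_per_cpu() and the map come from the arch headers):

static int example_logical_apicid(int cpu)
{
	if (cpu >= nr_cpu_ids)
		return BAD_APICID;

	return early_per_cpu(x86_cpu_to_logical_apicid, cpu);
}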
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index cb804c5091b9..efd737e827f4 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit)
45 return 1; 45 return 1;
46} 46}
47 47
48static int bigsmp_early_logical_apicid(int cpu)
49{
50 /* on bigsmp, logical apicid is the same as physical */
51 return early_per_cpu(x86_cpu_to_apicid, cpu);
52}
53
48static inline unsigned long calculate_ldr(int cpu) 54static inline unsigned long calculate_ldr(int cpu)
49{ 55{
50 unsigned long val, id; 56 unsigned long val, id;
@@ -80,11 +86,6 @@ static void bigsmp_setup_apic_routing(void)
80 nr_ioapics); 86 nr_ioapics);
81} 87}
82 88
83static int bigsmp_apicid_to_node(int logical_apicid)
84{
85 return apicid_2_node[hard_smp_processor_id()];
86}
87
88static int bigsmp_cpu_present_to_apicid(int mps_cpu) 89static int bigsmp_cpu_present_to_apicid(int mps_cpu)
89{ 90{
90 if (mps_cpu < nr_cpu_ids) 91 if (mps_cpu < nr_cpu_ids)
@@ -93,14 +94,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 94 return BAD_APICID;
94} 95}
95 96
96/* Mapping from cpu number to logical apicid */
97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
98{
99 if (cpu >= nr_cpu_ids)
100 return BAD_APICID;
101 return cpu_physical_id(cpu);
102}
103
104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 97static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
105{ 98{
106 /* For clustered we don't have a good way to do this yet - hack */ 99 /* For clustered we don't have a good way to do this yet - hack */
@@ -115,7 +108,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
115/* As we are using single CPU as destination, pick only one CPU here */ 108/* As we are using single CPU as destination, pick only one CPU here */
116static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) 109static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
117{ 110{
118 return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); 111 int cpu = cpumask_first(cpumask);
112
113 if (cpu < nr_cpu_ids)
114 return cpu_physical_id(cpu);
115 return BAD_APICID;
119} 116}
120 117
121static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 118static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
@@ -129,9 +126,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
129 */ 126 */
130 for_each_cpu_and(cpu, cpumask, andmask) { 127 for_each_cpu_and(cpu, cpumask, andmask) {
131 if (cpumask_test_cpu(cpu, cpu_online_mask)) 128 if (cpumask_test_cpu(cpu, cpu_online_mask))
132 break; 129 return cpu_physical_id(cpu);
133 } 130 }
134 return bigsmp_cpu_to_logical_apicid(cpu); 131 return BAD_APICID;
135} 132}
136 133
137static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 134static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -196,7 +193,7 @@ static int probe_bigsmp(void)
196 return dmi_bigsmp; 193 return dmi_bigsmp;
197} 194}
198 195
199struct apic apic_bigsmp = { 196static struct apic apic_bigsmp = {
200 197
201 .name = "bigsmp", 198 .name = "bigsmp",
202 .probe = probe_bigsmp, 199 .probe = probe_bigsmp,
@@ -219,8 +216,6 @@ struct apic apic_bigsmp = {
219 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, 216 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
220 .setup_apic_routing = bigsmp_setup_apic_routing, 217 .setup_apic_routing = bigsmp_setup_apic_routing,
221 .multi_timer_check = NULL, 218 .multi_timer_check = NULL,
222 .apicid_to_node = bigsmp_apicid_to_node,
223 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
224 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 219 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
225 .apicid_to_cpu_present = physid_set_mask_of_physid, 220 .apicid_to_cpu_present = physid_set_mask_of_physid,
226 .setup_portio_remap = NULL, 221 .setup_portio_remap = NULL,
@@ -256,4 +251,16 @@ struct apic apic_bigsmp = {
256 .icr_write = native_apic_icr_write, 251 .icr_write = native_apic_icr_write,
257 .wait_icr_idle = native_apic_wait_icr_idle, 252 .wait_icr_idle = native_apic_wait_icr_idle,
258 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
254
255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
259}; 256};
257
258struct apic * __init generic_bigsmp_probe(void)
259{
260 if (probe_bigsmp())
261 return &apic_bigsmp;
262
263 return NULL;
264}
265
266apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 8593582d8022..9536b3fe43f8 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit)
460 return physid_isset(bit, phys_cpu_present_map); 460 return physid_isset(bit, phys_cpu_present_map);
461} 461}
462 462
463static int es7000_early_logical_apicid(int cpu)
464{
465 /* on es7000, logical apicid is the same as physical */
466 return early_per_cpu(x86_bios_cpu_apicid, cpu);
467}
468
463static unsigned long calculate_ldr(int cpu) 469static unsigned long calculate_ldr(int cpu)
464{ 470{
465 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); 471 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
@@ -504,12 +510,6 @@ static void es7000_setup_apic_routing(void)
504 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
505} 511}
506 512
507static int es7000_apicid_to_node(int logical_apicid)
508{
509 return 0;
510}
511
512
513static int es7000_cpu_present_to_apicid(int mps_cpu) 513static int es7000_cpu_present_to_apicid(int mps_cpu)
514{ 514{
515 if (!mps_cpu) 515 if (!mps_cpu)
@@ -528,18 +528,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
528 ++cpu_id; 528 ++cpu_id;
529} 529}
530 530
531/* Mapping from cpu number to logical apicid */
532static int es7000_cpu_to_logical_apicid(int cpu)
533{
534#ifdef CONFIG_SMP
535 if (cpu >= nr_cpu_ids)
536 return BAD_APICID;
537 return cpu_2_logical_apicid[cpu];
538#else
539 return logical_smp_processor_id();
540#endif
541}
542
543static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 531static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
544{ 532{
545 /* For clustered we don't have a good way to do this yet - hack */ 533 /* For clustered we don't have a good way to do this yet - hack */
@@ -561,7 +549,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
561 * The cpus in the mask must all be on the apic cluster. 549 * The cpus in the mask must all be on the apic cluster.
562 */ 550 */
563 for_each_cpu(cpu, cpumask) { 551 for_each_cpu(cpu, cpumask) {
564 int new_apicid = es7000_cpu_to_logical_apicid(cpu); 552 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
565 553
566 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 554 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
567 WARN(1, "Not a valid mask!"); 555 WARN(1, "Not a valid mask!");
@@ -578,7 +566,7 @@ static unsigned int
578es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, 566es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
579 const struct cpumask *andmask) 567 const struct cpumask *andmask)
580{ 568{
581 int apicid = es7000_cpu_to_logical_apicid(0); 569 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
582 cpumask_var_t cpumask; 570 cpumask_var_t cpumask;
583 571
584 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 572 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -632,7 +620,7 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
632} 620}
633 621
 634/* We've been warned by a false positive warning. Use __refdata to keep calm. */ 622/* We've been warned by a false positive warning. Use __refdata to keep calm. */
635struct apic __refdata apic_es7000_cluster = { 623static struct apic __refdata apic_es7000_cluster = {
636 624
637 .name = "es7000", 625 .name = "es7000",
638 .probe = probe_es7000, 626 .probe = probe_es7000,
@@ -655,8 +643,6 @@ struct apic __refdata apic_es7000_cluster = {
655 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 643 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
656 .setup_apic_routing = es7000_setup_apic_routing, 644 .setup_apic_routing = es7000_setup_apic_routing,
657 .multi_timer_check = NULL, 645 .multi_timer_check = NULL,
658 .apicid_to_node = es7000_apicid_to_node,
659 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
660 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 646 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
661 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 647 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
662 .setup_portio_remap = NULL, 648 .setup_portio_remap = NULL,
@@ -695,9 +681,11 @@ struct apic __refdata apic_es7000_cluster = {
695 .icr_write = native_apic_icr_write, 681 .icr_write = native_apic_icr_write,
696 .wait_icr_idle = native_apic_wait_icr_idle, 682 .wait_icr_idle = native_apic_wait_icr_idle,
697 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 683 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
684
685 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
698}; 686};
699 687
700struct apic __refdata apic_es7000 = { 688static struct apic __refdata apic_es7000 = {
701 689
702 .name = "es7000", 690 .name = "es7000",
703 .probe = probe_es7000, 691 .probe = probe_es7000,
@@ -720,8 +708,6 @@ struct apic __refdata apic_es7000 = {
720 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 708 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
721 .setup_apic_routing = es7000_setup_apic_routing, 709 .setup_apic_routing = es7000_setup_apic_routing,
722 .multi_timer_check = NULL, 710 .multi_timer_check = NULL,
723 .apicid_to_node = es7000_apicid_to_node,
724 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
725 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 711 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
726 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 712 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
727 .setup_portio_remap = NULL, 713 .setup_portio_remap = NULL,
@@ -758,4 +744,12 @@ struct apic __refdata apic_es7000 = {
758 .icr_write = native_apic_icr_write, 744 .icr_write = native_apic_icr_write,
759 .wait_icr_idle = native_apic_wait_icr_idle, 745 .wait_icr_idle = native_apic_wait_icr_idle,
760 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 746 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
747
748 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
761}; 749};
750
751/*
752 * Need to check for es7000 followed by es7000_cluster, so this order
753 * in apic_drivers is important.
754 */
755apic_drivers(apic_es7000, apic_es7000_cluster);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index cefd6942f0e9..d5e57db0f7be 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -16,20 +16,33 @@
16#include <linux/kprobes.h> 16#include <linux/kprobes.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/delay.h>
19 20
21#ifdef CONFIG_HARDLOCKUP_DETECTOR
22u64 hw_nmi_get_sample_period(int watchdog_thresh)
23{
24 return (u64)(cpu_khz) * 1000 * watchdog_thresh;
25}
26#endif
27
28#ifdef arch_trigger_all_cpu_backtrace
20/* For reliability, we're prepared to waste bits here. */ 29/* For reliability, we're prepared to waste bits here. */
21static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; 30static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
22 31
23u64 hw_nmi_get_sample_period(void) 32/* "in progress" flag of arch_trigger_all_cpu_backtrace */
24{ 33static unsigned long backtrace_flag;
25 return (u64)(cpu_khz) * 1000 * 60;
26}
27 34
28#ifdef ARCH_HAS_NMI_WATCHDOG
29void arch_trigger_all_cpu_backtrace(void) 35void arch_trigger_all_cpu_backtrace(void)
30{ 36{
31 int i; 37 int i;
32 38
39 if (test_and_set_bit(0, &backtrace_flag))
40 /*
41 * If there is already a trigger_all_cpu_backtrace() in progress
42 * (backtrace_flag == 1), don't output double cpu dump infos.
43 */
44 return;
45
33 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); 46 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
34 47
35 printk(KERN_INFO "sending NMI to all CPUs:\n"); 48 printk(KERN_INFO "sending NMI to all CPUs:\n");
@@ -41,6 +54,9 @@ void arch_trigger_all_cpu_backtrace(void)
41 break; 54 break;
42 mdelay(1); 55 mdelay(1);
43 } 56 }
57
58 clear_bit(0, &backtrace_flag);
59 smp_mb__after_clear_bit();
44} 60}
45 61
46static int __kprobes 62static int __kprobes
@@ -49,11 +65,10 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
49{ 65{
50 struct die_args *args = __args; 66 struct die_args *args = __args;
51 struct pt_regs *regs; 67 struct pt_regs *regs;
52 int cpu = smp_processor_id(); 68 int cpu;
53 69
54 switch (cmd) { 70 switch (cmd) {
55 case DIE_NMI: 71 case DIE_NMI:
56 case DIE_NMI_IPI:
57 break; 72 break;
58 73
59 default: 74 default:
@@ -61,6 +76,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
61 } 76 }
62 77
63 regs = args->regs; 78 regs = args->regs;
79 cpu = smp_processor_id();
64 80
65 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { 81 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
66 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; 82 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -68,7 +84,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
68 arch_spin_lock(&lock); 84 arch_spin_lock(&lock);
69 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 85 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
70 show_regs(regs); 86 show_regs(regs);
71 dump_stack();
72 arch_spin_unlock(&lock); 87 arch_spin_unlock(&lock);
73 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); 88 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
74 return NOTIFY_STOP; 89 return NOTIFY_STOP;
@@ -80,7 +95,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
80static __read_mostly struct notifier_block backtrace_notifier = { 95static __read_mostly struct notifier_block backtrace_notifier = {
81 .notifier_call = arch_trigger_all_cpu_backtrace_handler, 96 .notifier_call = arch_trigger_all_cpu_backtrace_handler,
82 .next = NULL, 97 .next = NULL,
83 .priority = 1 98 .priority = NMI_LOCAL_LOW_PRIOR,
84}; 99};
85 100
86static int __init register_trigger_all_cpu_backtrace(void) 101static int __init register_trigger_all_cpu_backtrace(void)
@@ -90,18 +105,3 @@ static int __init register_trigger_all_cpu_backtrace(void)
90} 105}
91early_initcall(register_trigger_all_cpu_backtrace); 106early_initcall(register_trigger_all_cpu_backtrace);
92#endif 107#endif
93
94/* STUB calls to mimic old nmi_watchdog behaviour */
95#if defined(CONFIG_X86_LOCAL_APIC)
96unsigned int nmi_watchdog = NMI_NONE;
97EXPORT_SYMBOL(nmi_watchdog);
98void acpi_nmi_enable(void) { return; }
99void acpi_nmi_disable(void) { return; }
100#endif
101atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
102EXPORT_SYMBOL(nmi_active);
103int unknown_nmi_panic;
104void cpu_nmi_set_wd_enabled(void) { return; }
105void stop_apic_nmi_watchdog(void *unused) { return; }
106void setup_apic_nmi_watchdog(void *unused) { return; }
107int __init check_nmi_watchdog(void) { return 0; }
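Editor's note: besides dropping the old nmi_watchdog stubs, hw_nmi.c now guards arch_trigger_all_cpu_backtrace() with a single "in progress" bit so concurrent callers do not interleave dumps. A minimal sketch of that guard pattern:

#include <linux/bitops.h>

static unsigned long example_flag;

static void example_run_exclusively(void)
{
	if (test_and_set_bit(0, &example_flag))
		return;			/* someone else is already dumping */

	/* ... send the NMIs and wait for the backtrace mask to drain ... */

	clear_bit(0, &example_flag);
	smp_mb__after_clear_bit();	/* order the clear after the work above */
}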
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5c5b8f3dddb5..e5293394b548 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -30,7 +30,7 @@
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/acpi.h> 31#include <linux/acpi.h>
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/sysdev.h> 33#include <linux/syscore_ops.h>
34#include <linux/msi.h> 34#include <linux/msi.h>
35#include <linux/htirq.h> 35#include <linux/htirq.h>
36#include <linux/freezer.h> 36#include <linux/freezer.h>
@@ -54,7 +54,6 @@
54#include <asm/dma.h> 54#include <asm/dma.h>
55#include <asm/timer.h> 55#include <asm/timer.h>
56#include <asm/i8259.h> 56#include <asm/i8259.h>
57#include <asm/nmi.h>
58#include <asm/msidef.h> 57#include <asm/msidef.h>
59#include <asm/hypertransport.h> 58#include <asm/hypertransport.h>
60#include <asm/setup.h> 59#include <asm/setup.h>
@@ -77,17 +76,40 @@ int sis_apic_bug = -1;
77static DEFINE_RAW_SPINLOCK(ioapic_lock); 76static DEFINE_RAW_SPINLOCK(ioapic_lock);
78static DEFINE_RAW_SPINLOCK(vector_lock); 77static DEFINE_RAW_SPINLOCK(vector_lock);
79 78
80/* 79static struct ioapic {
81 * # of IRQ routing registers 80 /*
82 */ 81 * # of IRQ routing registers
83int nr_ioapic_registers[MAX_IO_APICS]; 82 */
83 int nr_registers;
84 /*
85 * Saved state during suspend/resume, or while enabling intr-remap.
86 */
87 struct IO_APIC_route_entry *saved_registers;
88 /* I/O APIC config */
89 struct mpc_ioapic mp_config;
90 /* IO APIC gsi routing info */
91 struct mp_ioapic_gsi gsi_config;
92 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
93} ioapics[MAX_IO_APICS];
84 94
85/* I/O APIC entries */ 95#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver
86struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
87int nr_ioapics;
88 96
89/* IO APIC gsi routing info */ 97int mpc_ioapic_id(int id)
90struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; 98{
99 return ioapics[id].mp_config.apicid;
100}
101
102unsigned int mpc_ioapic_addr(int id)
103{
104 return ioapics[id].mp_config.apicaddr;
105}
106
107struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id)
108{
109 return &ioapics[id].gsi_config;
110}
111
112int nr_ioapics;
91 113
92/* The one past the highest gsi number used */ 114/* The one past the highest gsi number used */
93u32 gsi_top; 115u32 gsi_top;
@@ -109,7 +131,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
109 131
110int skip_ioapic_setup; 132int skip_ioapic_setup;
111 133
112void arch_disable_smp_support(void) 134/**
135 * disable_ioapic_support() - disables ioapic support at runtime
136 */
137void disable_ioapic_support(void)
113{ 138{
114#ifdef CONFIG_PCI 139#ifdef CONFIG_PCI
115 noioapicquirk = 1; 140 noioapicquirk = 1;
@@ -121,25 +146,45 @@ void arch_disable_smp_support(void)
121static int __init parse_noapic(char *str) 146static int __init parse_noapic(char *str)
122{ 147{
123 /* disable IO-APIC */ 148 /* disable IO-APIC */
124 arch_disable_smp_support(); 149 disable_ioapic_support();
125 return 0; 150 return 0;
126} 151}
127early_param("noapic", parse_noapic); 152early_param("noapic", parse_noapic);
128 153
154static int io_apic_setup_irq_pin(unsigned int irq, int node,
155 struct io_apic_irq_attr *attr);
156
157/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
158void mp_save_irq(struct mpc_intsrc *m)
159{
160 int i;
161
162 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
163 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
164 m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
165 m->srcbusirq, m->dstapic, m->dstirq);
166
167 for (i = 0; i < mp_irq_entries; i++) {
168 if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
169 return;
170 }
171
172 memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
173 if (++mp_irq_entries == MAX_IRQ_SOURCES)
174 panic("Max # of irq sources exceeded!!\n");
175}
176
129struct irq_pin_list { 177struct irq_pin_list {
130 int apic, pin; 178 int apic, pin;
131 struct irq_pin_list *next; 179 struct irq_pin_list *next;
132}; 180};
133 181
134static struct irq_pin_list *get_one_free_irq_2_pin(int node) 182static struct irq_pin_list *alloc_irq_pin_list(int node)
135{ 183{
136 struct irq_pin_list *pin; 184 return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
137
138 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
139
140 return pin;
141} 185}
142 186
187
143/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 188/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
144#ifdef CONFIG_SPARSE_IRQ 189#ifdef CONFIG_SPARSE_IRQ
145static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; 190static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
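Editor's note: mp_save_irq() above gives the mpparse/ACPI/SFI paths one common, de-duplicating way to record interrupt sources. A hypothetical caller (names and field values below are illustrative only; the declarations are assumed to come from the mpspec headers):

#include <asm/mpspec.h>
#include <asm/mpspec_def.h>

static void __init example_save_isa_irq(int bus, int irq, int ioapic_id, int pin)
{
	struct mpc_intsrc mp_irq;

	mp_irq.type	 = MP_INTSRC;
	mp_irq.irqtype	 = mp_INT;
	mp_irq.irqflag	 = 0;		/* conforming polarity/trigger */
	mp_irq.srcbus	 = bus;
	mp_irq.srcbusirq = irq;
	mp_irq.dstapic	 = ioapic_id;
	mp_irq.dstirq	 = pin;

	mp_save_irq(&mp_irq);
}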
@@ -150,25 +195,32 @@ static struct irq_cfg irq_cfgx[NR_IRQS];
150int __init arch_early_irq_init(void) 195int __init arch_early_irq_init(void)
151{ 196{
152 struct irq_cfg *cfg; 197 struct irq_cfg *cfg;
153 struct irq_desc *desc; 198 int count, node, i;
154 int count;
155 int node;
156 int i;
157 199
158 if (!legacy_pic->nr_legacy_irqs) { 200 if (!legacy_pic->nr_legacy_irqs) {
159 nr_irqs_gsi = 0; 201 nr_irqs_gsi = 0;
160 io_apic_irqs = ~0UL; 202 io_apic_irqs = ~0UL;
161 } 203 }
162 204
205 for (i = 0; i < nr_ioapics; i++) {
206 ioapics[i].saved_registers =
207 kzalloc(sizeof(struct IO_APIC_route_entry) *
208 ioapics[i].nr_registers, GFP_KERNEL);
209 if (!ioapics[i].saved_registers)
210 pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
211 }
212
163 cfg = irq_cfgx; 213 cfg = irq_cfgx;
164 count = ARRAY_SIZE(irq_cfgx); 214 count = ARRAY_SIZE(irq_cfgx);
165 node= cpu_to_node(boot_cpu_id); 215 node = cpu_to_node(0);
216
217 /* Make sure the legacy interrupts are marked in the bitmap */
218 irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
166 219
167 for (i = 0; i < count; i++) { 220 for (i = 0; i < count; i++) {
168 desc = irq_to_desc(i); 221 irq_set_chip_data(i, &cfg[i]);
169 desc->chip_data = &cfg[i]; 222 zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
170 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 223 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
171 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
172 /* 224 /*
173 * For legacy IRQ's, start with assigning irq0 to irq15 to 225 * For legacy IRQ's, start with assigning irq0 to irq15 to
174 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. 226 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
@@ -183,170 +235,88 @@ int __init arch_early_irq_init(void)
183} 235}
184 236
185#ifdef CONFIG_SPARSE_IRQ 237#ifdef CONFIG_SPARSE_IRQ
186struct irq_cfg *irq_cfg(unsigned int irq) 238static struct irq_cfg *irq_cfg(unsigned int irq)
187{ 239{
188 struct irq_cfg *cfg = NULL; 240 return irq_get_chip_data(irq);
189 struct irq_desc *desc;
190
191 desc = irq_to_desc(irq);
192 if (desc)
193 cfg = desc->chip_data;
194
195 return cfg;
196} 241}
197 242
198static struct irq_cfg *get_one_free_irq_cfg(int node) 243static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
199{ 244{
200 struct irq_cfg *cfg; 245 struct irq_cfg *cfg;
201 246
202 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); 247 cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
203 if (cfg) { 248 if (!cfg)
204 if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { 249 return NULL;
205 kfree(cfg); 250 if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
206 cfg = NULL; 251 goto out_cfg;
207 } else if (!zalloc_cpumask_var_node(&cfg->old_domain, 252 if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
208 GFP_ATOMIC, node)) { 253 goto out_domain;
209 free_cpumask_var(cfg->domain);
210 kfree(cfg);
211 cfg = NULL;
212 }
213 }
214
215 return cfg; 254 return cfg;
255out_domain:
256 free_cpumask_var(cfg->domain);
257out_cfg:
258 kfree(cfg);
259 return NULL;
216} 260}
217 261
218int arch_init_chip_data(struct irq_desc *desc, int node) 262static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
219{
220 struct irq_cfg *cfg;
221
222 cfg = desc->chip_data;
223 if (!cfg) {
224 desc->chip_data = get_one_free_irq_cfg(node);
225 if (!desc->chip_data) {
226 printk(KERN_ERR "can not alloc irq_cfg\n");
227 BUG_ON(1);
228 }
229 }
230
231 return 0;
232}
233
234/* for move_irq_desc */
235static void
236init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
237{ 263{
238 struct irq_pin_list *old_entry, *head, *tail, *entry; 264 if (!cfg)
239
240 cfg->irq_2_pin = NULL;
241 old_entry = old_cfg->irq_2_pin;
242 if (!old_entry)
243 return;
244
245 entry = get_one_free_irq_2_pin(node);
246 if (!entry)
247 return; 265 return;
266 irq_set_chip_data(at, NULL);
267 free_cpumask_var(cfg->domain);
268 free_cpumask_var(cfg->old_domain);
269 kfree(cfg);
270}
248 271
249 entry->apic = old_entry->apic; 272#else
250 entry->pin = old_entry->pin;
251 head = entry;
252 tail = entry;
253 old_entry = old_entry->next;
254 while (old_entry) {
255 entry = get_one_free_irq_2_pin(node);
256 if (!entry) {
257 entry = head;
258 while (entry) {
259 head = entry->next;
260 kfree(entry);
261 entry = head;
262 }
263 /* still use the old one */
264 return;
265 }
266 entry->apic = old_entry->apic;
267 entry->pin = old_entry->pin;
268 tail->next = entry;
269 tail = entry;
270 old_entry = old_entry->next;
271 }
272 273
273 tail->next = NULL; 274struct irq_cfg *irq_cfg(unsigned int irq)
274 cfg->irq_2_pin = head; 275{
276 return irq < nr_irqs ? irq_cfgx + irq : NULL;
275} 277}
276 278
277static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) 279static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
278{ 280{
279 struct irq_pin_list *entry, *next; 281 return irq_cfgx + irq;
280 282}
281 if (old_cfg->irq_2_pin == cfg->irq_2_pin)
282 return;
283 283
284 entry = old_cfg->irq_2_pin; 284static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { }
285 285
286 while (entry) { 286#endif
287 next = entry->next;
288 kfree(entry);
289 entry = next;
290 }
291 old_cfg->irq_2_pin = NULL;
292}
293 287
294void arch_init_copy_chip_data(struct irq_desc *old_desc, 288static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
295 struct irq_desc *desc, int node)
296{ 289{
290 int res = irq_alloc_desc_at(at, node);
297 struct irq_cfg *cfg; 291 struct irq_cfg *cfg;
298 struct irq_cfg *old_cfg;
299 292
300 cfg = get_one_free_irq_cfg(node); 293 if (res < 0) {
301 294 if (res != -EEXIST)
302 if (!cfg) 295 return NULL;
303 return; 296 cfg = irq_get_chip_data(at);
304 297 if (cfg)
305 desc->chip_data = cfg; 298 return cfg;
306 299 }
307 old_cfg = old_desc->chip_data;
308
309 cfg->vector = old_cfg->vector;
310 cfg->move_in_progress = old_cfg->move_in_progress;
311 cpumask_copy(cfg->domain, old_cfg->domain);
312 cpumask_copy(cfg->old_domain, old_cfg->old_domain);
313
314 init_copy_irq_2_pin(old_cfg, cfg, node);
315}
316 300
317static void free_irq_cfg(struct irq_cfg *cfg) 301 cfg = alloc_irq_cfg(at, node);
318{ 302 if (cfg)
319 free_cpumask_var(cfg->domain); 303 irq_set_chip_data(at, cfg);
320 free_cpumask_var(cfg->old_domain); 304 else
321 kfree(cfg); 305 irq_free_desc(at);
306 return cfg;
322} 307}
323 308
324void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) 309static int alloc_irq_from(unsigned int from, int node)
325{ 310{
326 struct irq_cfg *old_cfg, *cfg; 311 return irq_alloc_desc_from(from, node);
327
328 old_cfg = old_desc->chip_data;
329 cfg = desc->chip_data;
330
331 if (old_cfg == cfg)
332 return;
333
334 if (old_cfg) {
335 free_irq_2_pin(old_cfg, cfg);
336 free_irq_cfg(old_cfg);
337 old_desc->chip_data = NULL;
338 }
339} 312}
340/* end for move_irq_desc */
341 313
342#else 314static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
343struct irq_cfg *irq_cfg(unsigned int irq)
344{ 315{
345 return irq < nr_irqs ? irq_cfgx + irq : NULL; 316 free_irq_cfg(at, cfg);
317 irq_free_desc(at);
346} 318}
347 319
348#endif
349
350struct io_apic { 320struct io_apic {
351 unsigned int index; 321 unsigned int index;
352 unsigned int unused[3]; 322 unsigned int unused[3];
@@ -358,7 +328,7 @@ struct io_apic {
358static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 328static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
359{ 329{
360 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 330 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
361 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); 331 + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
362} 332}
363 333
364static inline void io_apic_eoi(unsigned int apic, unsigned int vector) 334static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
@@ -451,7 +421,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
451 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 421 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
452} 422}
453 423
454void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 424static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
455{ 425{
456 unsigned long flags; 426 unsigned long flags;
457 raw_spin_lock_irqsave(&ioapic_lock, flags); 427 raw_spin_lock_irqsave(&ioapic_lock, flags);
@@ -481,7 +451,7 @@ static void ioapic_mask_entry(int apic, int pin)
481 * fast in the common case, and fast for shared ISA-space IRQs. 451 * fast in the common case, and fast for shared ISA-space IRQs.
482 */ 452 */
483static int 453static int
484add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) 454__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
485{ 455{
486 struct irq_pin_list **last, *entry; 456 struct irq_pin_list **last, *entry;
487 457
@@ -493,7 +463,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
493 last = &entry->next; 463 last = &entry->next;
494 } 464 }
495 465
496 entry = get_one_free_irq_2_pin(node); 466 entry = alloc_irq_pin_list(node);
497 if (!entry) { 467 if (!entry) {
498 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", 468 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
499 node, apic, pin); 469 node, apic, pin);
@@ -508,7 +478,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
508 478
509static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) 479static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
510{ 480{
511 if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) 481 if (__add_pin_to_irq_node(cfg, node, apic, pin))
512 panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); 482 panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
513} 483}
514 484
@@ -571,11 +541,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
571 IO_APIC_REDIR_LEVEL_TRIGGER, NULL); 541 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
572} 542}
573 543
574static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
575{
576 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
577}
578
579static void io_apic_sync(struct irq_pin_list *entry) 544static void io_apic_sync(struct irq_pin_list *entry)
580{ 545{
581 /* 546 /*
@@ -587,44 +552,37 @@ static void io_apic_sync(struct irq_pin_list *entry)
587 readl(&io_apic->data); 552 readl(&io_apic->data);
588} 553}
589 554
590static void __mask_IO_APIC_irq(struct irq_cfg *cfg) 555static void mask_ioapic(struct irq_cfg *cfg)
591{ 556{
557 unsigned long flags;
558
559 raw_spin_lock_irqsave(&ioapic_lock, flags);
592 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 560 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
561 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
593} 562}
594 563
595static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 564static void mask_ioapic_irq(struct irq_data *data)
596{ 565{
597 struct irq_cfg *cfg = desc->chip_data; 566 mask_ioapic(data->chip_data);
598 unsigned long flags; 567}
599
600 BUG_ON(!cfg);
601 568
602 raw_spin_lock_irqsave(&ioapic_lock, flags); 569static void __unmask_ioapic(struct irq_cfg *cfg)
603 __mask_IO_APIC_irq(cfg); 570{
604 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 571 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
605} 572}
606 573
607static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) 574static void unmask_ioapic(struct irq_cfg *cfg)
608{ 575{
609 struct irq_cfg *cfg = desc->chip_data;
610 unsigned long flags; 576 unsigned long flags;
611 577
612 raw_spin_lock_irqsave(&ioapic_lock, flags); 578 raw_spin_lock_irqsave(&ioapic_lock, flags);
613 __unmask_IO_APIC_irq(cfg); 579 __unmask_ioapic(cfg);
614 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 580 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
615} 581}
616 582
617static void mask_IO_APIC_irq(unsigned int irq) 583static void unmask_ioapic_irq(struct irq_data *data)
618{ 584{
619 struct irq_desc *desc = irq_to_desc(irq); 585 unmask_ioapic(data->chip_data);
620
621 mask_IO_APIC_irq_desc(desc);
622}
623static void unmask_IO_APIC_irq(unsigned int irq)
624{
625 struct irq_desc *desc = irq_to_desc(irq);
626
627 unmask_IO_APIC_irq_desc(desc);
628} 586}
629 587
630static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 588static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
@@ -646,7 +604,7 @@ static void clear_IO_APIC (void)
646 int apic, pin; 604 int apic, pin;
647 605
648 for (apic = 0; apic < nr_ioapics; apic++) 606 for (apic = 0; apic < nr_ioapics; apic++)
649 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 607 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
650 clear_IO_APIC_pin(apic, pin); 608 clear_IO_APIC_pin(apic, pin);
651} 609}
652 610
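Editor's note: per-IO-APIC state is now consolidated in the static ioapics[] array, with the mpc_ioapic_id()/mpc_ioapic_addr() accessors replacing direct pokes at mp_ioapics[] and nr_ioapic_registers[]. A small sketch of the accessor style:

static void __init example_dump_ioapics(void)
{
	int apic;

	for (apic = 0; apic < nr_ioapics; apic++)
		apic_printk(APIC_VERBOSE, "IO-APIC %d: id %d at %#x\n",
			    apic, mpc_ioapic_id(apic), mpc_ioapic_addr(apic));
}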
@@ -688,74 +646,43 @@ static int __init ioapic_pirq_setup(char *str)
688__setup("pirq=", ioapic_pirq_setup); 646__setup("pirq=", ioapic_pirq_setup);
689#endif /* CONFIG_X86_32 */ 647#endif /* CONFIG_X86_32 */
690 648
691struct IO_APIC_route_entry **alloc_ioapic_entries(void)
692{
693 int apic;
694 struct IO_APIC_route_entry **ioapic_entries;
695
696 ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
697 GFP_ATOMIC);
698 if (!ioapic_entries)
699 return 0;
700
701 for (apic = 0; apic < nr_ioapics; apic++) {
702 ioapic_entries[apic] =
703 kzalloc(sizeof(struct IO_APIC_route_entry) *
704 nr_ioapic_registers[apic], GFP_ATOMIC);
705 if (!ioapic_entries[apic])
706 goto nomem;
707 }
708
709 return ioapic_entries;
710
711nomem:
712 while (--apic >= 0)
713 kfree(ioapic_entries[apic]);
714 kfree(ioapic_entries);
715
716 return 0;
717}
718
719/* 649/*
720 * Saves all the IO-APIC RTE's 650 * Saves all the IO-APIC RTE's
721 */ 651 */
722int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 652int save_ioapic_entries(void)
723{ 653{
724 int apic, pin; 654 int apic, pin;
725 655 int err = 0;
726 if (!ioapic_entries)
727 return -ENOMEM;
728 656
729 for (apic = 0; apic < nr_ioapics; apic++) { 657 for (apic = 0; apic < nr_ioapics; apic++) {
730 if (!ioapic_entries[apic]) 658 if (!ioapics[apic].saved_registers) {
731 return -ENOMEM; 659 err = -ENOMEM;
660 continue;
661 }
732 662
733 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 663 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
734 ioapic_entries[apic][pin] = 664 ioapics[apic].saved_registers[pin] =
735 ioapic_read_entry(apic, pin); 665 ioapic_read_entry(apic, pin);
736 } 666 }
737 667
738 return 0; 668 return err;
739} 669}
740 670
741/* 671/*
742 * Mask all IO APIC entries. 672 * Mask all IO APIC entries.
743 */ 673 */
744void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 674void mask_ioapic_entries(void)
745{ 675{
746 int apic, pin; 676 int apic, pin;
747 677
748 if (!ioapic_entries)
749 return;
750
751 for (apic = 0; apic < nr_ioapics; apic++) { 678 for (apic = 0; apic < nr_ioapics; apic++) {
752 if (!ioapic_entries[apic]) 679 if (!ioapics[apic].saved_registers)
753 break; 680 continue;
754 681
755 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 682 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
756 struct IO_APIC_route_entry entry; 683 struct IO_APIC_route_entry entry;
757 684
758 entry = ioapic_entries[apic][pin]; 685 entry = ioapics[apic].saved_registers[pin];
759 if (!entry.mask) { 686 if (!entry.mask) {
760 entry.mask = 1; 687 entry.mask = 1;
761 ioapic_write_entry(apic, pin, entry); 688 ioapic_write_entry(apic, pin, entry);
@@ -765,36 +692,23 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
765} 692}
766 693
767/* 694/*
768 * Restore IO APIC entries which were saved in ioapic_entries. 695 * Restore IO APIC entries which were saved in the ioapic structure.
769 */ 696 */
770int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 697int restore_ioapic_entries(void)
771{ 698{
772 int apic, pin; 699 int apic, pin;
773 700
774 if (!ioapic_entries)
775 return -ENOMEM;
776
777 for (apic = 0; apic < nr_ioapics; apic++) { 701 for (apic = 0; apic < nr_ioapics; apic++) {
778 if (!ioapic_entries[apic]) 702 if (!ioapics[apic].saved_registers)
779 return -ENOMEM; 703 continue;
780 704
781 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 705 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
782 ioapic_write_entry(apic, pin, 706 ioapic_write_entry(apic, pin,
783 ioapic_entries[apic][pin]); 707 ioapics[apic].saved_registers[pin]);
784 } 708 }
785 return 0; 709 return 0;
786} 710}
787 711
788void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
789{
790 int apic;
791
792 for (apic = 0; apic < nr_ioapics; apic++)
793 kfree(ioapic_entries[apic]);
794
795 kfree(ioapic_entries);
796}
797
798/* 712/*
799 * Find the IRQ entry number of a certain pin. 713 * Find the IRQ entry number of a certain pin.
800 */ 714 */
@@ -804,7 +718,7 @@ static int find_irq_entry(int apic, int pin, int type)
804 718
805 for (i = 0; i < mp_irq_entries; i++) 719 for (i = 0; i < mp_irq_entries; i++)
806 if (mp_irqs[i].irqtype == type && 720 if (mp_irqs[i].irqtype == type &&
807 (mp_irqs[i].dstapic == mp_ioapics[apic].apicid || 721 (mp_irqs[i].dstapic == mpc_ioapic_id(apic) ||
808 mp_irqs[i].dstapic == MP_APIC_ALL) && 722 mp_irqs[i].dstapic == MP_APIC_ALL) &&
809 mp_irqs[i].dstirq == pin) 723 mp_irqs[i].dstirq == pin)
810 return i; 724 return i;
@@ -846,7 +760,7 @@ static int __init find_isa_irq_apic(int irq, int type)
846 if (i < mp_irq_entries) { 760 if (i < mp_irq_entries) {
847 int apic; 761 int apic;
848 for(apic = 0; apic < nr_ioapics; apic++) { 762 for(apic = 0; apic < nr_ioapics; apic++) {
849 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic) 763 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic)
850 return apic; 764 return apic;
851 } 765 }
852 } 766 }
@@ -897,7 +811,7 @@ static int EISA_ELCR(unsigned int irq)
897#define default_MCA_trigger(idx) (1) 811#define default_MCA_trigger(idx) (1)
898#define default_MCA_polarity(idx) default_ISA_polarity(idx) 812#define default_MCA_polarity(idx) default_ISA_polarity(idx)
899 813
900static int MPBIOS_polarity(int idx) 814static int irq_polarity(int idx)
901{ 815{
902 int bus = mp_irqs[idx].srcbus; 816 int bus = mp_irqs[idx].srcbus;
903 int polarity; 817 int polarity;
@@ -939,7 +853,7 @@ static int MPBIOS_polarity(int idx)
939 return polarity; 853 return polarity;
940} 854}
941 855
942static int MPBIOS_trigger(int idx) 856static int irq_trigger(int idx)
943{ 857{
944 int bus = mp_irqs[idx].srcbus; 858 int bus = mp_irqs[idx].srcbus;
945 int trigger; 859 int trigger;
@@ -1011,20 +925,11 @@ static int MPBIOS_trigger(int idx)
1011 return trigger; 925 return trigger;
1012} 926}
1013 927
1014static inline int irq_polarity(int idx)
1015{
1016 return MPBIOS_polarity(idx);
1017}
1018
1019static inline int irq_trigger(int idx)
1020{
1021 return MPBIOS_trigger(idx);
1022}
1023
1024static int pin_2_irq(int idx, int apic, int pin) 928static int pin_2_irq(int idx, int apic, int pin)
1025{ 929{
1026 int irq; 930 int irq;
1027 int bus = mp_irqs[idx].srcbus; 931 int bus = mp_irqs[idx].srcbus;
932 struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
1028 933
1029 /* 934 /*
1030 * Debugging check, we are in big trouble if this message pops up! 935 * Debugging check, we are in big trouble if this message pops up!
@@ -1035,7 +940,7 @@ static int pin_2_irq(int idx, int apic, int pin)
1035 if (test_bit(bus, mp_bus_not_pci)) { 940 if (test_bit(bus, mp_bus_not_pci)) {
1036 irq = mp_irqs[idx].srcbusirq; 941 irq = mp_irqs[idx].srcbusirq;
1037 } else { 942 } else {
1038 u32 gsi = mp_gsi_routing[apic].gsi_base + pin; 943 u32 gsi = gsi_cfg->gsi_base + pin;
1039 944
1040 if (gsi >= NR_IRQS_LEGACY) 945 if (gsi >= NR_IRQS_LEGACY)
1041 irq = gsi; 946 irq = gsi;
@@ -1086,7 +991,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1086 int lbus = mp_irqs[i].srcbus; 991 int lbus = mp_irqs[i].srcbus;
1087 992
1088 for (apic = 0; apic < nr_ioapics; apic++) 993 for (apic = 0; apic < nr_ioapics; apic++)
1089 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || 994 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic ||
1090 mp_irqs[i].dstapic == MP_APIC_ALL) 995 mp_irqs[i].dstapic == MP_APIC_ALL)
1091 break; 996 break;
1092 997
@@ -1259,7 +1164,6 @@ void __setup_vector_irq(int cpu)
1259 /* Initialize vector_irq on a new cpu */ 1164 /* Initialize vector_irq on a new cpu */
1260 int irq, vector; 1165 int irq, vector;
1261 struct irq_cfg *cfg; 1166 struct irq_cfg *cfg;
1262 struct irq_desc *desc;
1263 1167
1264 /* 1168 /*
1265 * vector_lock will make sure that we don't run into irq vector 1169 * vector_lock will make sure that we don't run into irq vector
@@ -1268,9 +1172,10 @@ void __setup_vector_irq(int cpu)
1268 */ 1172 */
1269 raw_spin_lock(&vector_lock); 1173 raw_spin_lock(&vector_lock);
1270 /* Mark the inuse vectors */ 1174 /* Mark the inuse vectors */
1271 for_each_irq_desc(irq, desc) { 1175 for_each_active_irq(irq) {
1272 cfg = desc->chip_data; 1176 cfg = irq_get_chip_data(irq);
1273 1177 if (!cfg)
1178 continue;
1274 /* 1179 /*
1275 * If it is a legacy IRQ handled by the legacy PIC, this cpu 1180 * If it is a legacy IRQ handled by the legacy PIC, this cpu
1276 * will be part of the irq_cfg's domain. 1181 * will be part of the irq_cfg's domain.
@@ -1299,17 +1204,13 @@ void __setup_vector_irq(int cpu)
1299static struct irq_chip ioapic_chip; 1204static struct irq_chip ioapic_chip;
1300static struct irq_chip ir_ioapic_chip; 1205static struct irq_chip ir_ioapic_chip;
1301 1206
1302#define IOAPIC_AUTO -1
1303#define IOAPIC_EDGE 0
1304#define IOAPIC_LEVEL 1
1305
1306#ifdef CONFIG_X86_32 1207#ifdef CONFIG_X86_32
1307static inline int IO_APIC_irq_trigger(int irq) 1208static inline int IO_APIC_irq_trigger(int irq)
1308{ 1209{
1309 int apic, idx, pin; 1210 int apic, idx, pin;
1310 1211
1311 for (apic = 0; apic < nr_ioapics; apic++) { 1212 for (apic = 0; apic < nr_ioapics; apic++) {
1312 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1213 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
1313 idx = find_irq_entry(apic, pin, mp_INT); 1214 idx = find_irq_entry(apic, pin, mp_INT);
1314 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) 1215 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1315 return irq_trigger(idx); 1216 return irq_trigger(idx);
@@ -1327,41 +1228,37 @@ static inline int IO_APIC_irq_trigger(int irq)
1327} 1228}
1328#endif 1229#endif
1329 1230
1330static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) 1231static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
1232 unsigned long trigger)
1331{ 1233{
1234 struct irq_chip *chip = &ioapic_chip;
1235 irq_flow_handler_t hdl;
1236 bool fasteoi;
1332 1237
1333 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1238 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1334 trigger == IOAPIC_LEVEL) 1239 trigger == IOAPIC_LEVEL) {
1335 desc->status |= IRQ_LEVEL; 1240 irq_set_status_flags(irq, IRQ_LEVEL);
1336 else 1241 fasteoi = true;
1337 desc->status &= ~IRQ_LEVEL; 1242 } else {
1338 1243 irq_clear_status_flags(irq, IRQ_LEVEL);
1339 if (irq_remapped(irq)) { 1244 fasteoi = false;
1340 desc->status |= IRQ_MOVE_PCNTXT;
1341 if (trigger)
1342 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1343 handle_fasteoi_irq,
1344 "fasteoi");
1345 else
1346 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1347 handle_edge_irq, "edge");
1348 return;
1349 } 1245 }
1350 1246
1351 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1247 if (irq_remapped(cfg)) {
1352 trigger == IOAPIC_LEVEL) 1248 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
1353 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1249 chip = &ir_ioapic_chip;
1354 handle_fasteoi_irq, 1250 fasteoi = trigger != 0;
1355 "fasteoi"); 1251 }
1356 else 1252
1357 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1253 hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
1358 handle_edge_irq, "edge"); 1254 irq_set_chip_and_handler_name(irq, chip, hdl,
1255 fasteoi ? "fasteoi" : "edge");
1359} 1256}
1360 1257
1361int setup_ioapic_entry(int apic_id, int irq, 1258static int setup_ioapic_entry(int apic_id, int irq,
1362 struct IO_APIC_route_entry *entry, 1259 struct IO_APIC_route_entry *entry,
1363 unsigned int destination, int trigger, 1260 unsigned int destination, int trigger,
1364 int polarity, int vector, int pin) 1261 int polarity, int vector, int pin)
1365{ 1262{
1366 /* 1263 /*
1367 * add it to the IO-APIC irq-routing table: 1264 * add it to the IO-APIC irq-routing table:
@@ -1382,21 +1279,7 @@ int setup_ioapic_entry(int apic_id, int irq,
1382 if (index < 0) 1279 if (index < 0)
1383 panic("Failed to allocate IRTE for ioapic %d\n", apic_id); 1280 panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
1384 1281
1385 memset(&irte, 0, sizeof(irte)); 1282 prepare_irte(&irte, vector, destination);
1386
1387 irte.present = 1;
1388 irte.dst_mode = apic->irq_dest_mode;
1389 /*
1390 * Trigger mode in the IRTE will always be edge, and the
1391 * actual level or edge trigger will be setup in the IO-APIC
1392 * RTE. This will help simplify level triggered irq migration.
1394 * For more details, see the comments above explaining IO-APIC
1394 * irq migration in the presence of interrupt-remapping.
1395 */
1396 irte.trigger_mode = 0;
1397 irte.dlvry_mode = apic->irq_delivery_mode;
1398 irte.vector = vector;
1399 irte.dest_id = IRTE_DEST(destination);
1400 1283
1401 /* Set source-id of interrupt request */ 1284 /* Set source-id of interrupt request */
1402 set_ioapic_sid(&irte, apic_id); 1285 set_ioapic_sid(&irte, apic_id);
@@ -1431,18 +1314,14 @@ int setup_ioapic_entry(int apic_id, int irq,
1431 return 0; 1314 return 0;
1432} 1315}
1433 1316
1434static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc, 1317static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1435 int trigger, int polarity) 1318 struct irq_cfg *cfg, int trigger, int polarity)
1436{ 1319{
1437 struct irq_cfg *cfg;
1438 struct IO_APIC_route_entry entry; 1320 struct IO_APIC_route_entry entry;
1439 unsigned int dest; 1321 unsigned int dest;
1440 1322
1441 if (!IO_APIC_IRQ(irq)) 1323 if (!IO_APIC_IRQ(irq))
1442 return; 1324 return;
1443
1444 cfg = desc->chip_data;
1445
1446 /* 1325 /*
1447 * For legacy irqs, cfg->domain starts with cpu 0 for legacy 1326 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1448 * controllers like 8259. Now that IO-APIC can handle this irq, update 1327 * controllers like 8259. Now that IO-APIC can handle this irq, update
@@ -1459,58 +1338,45 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1459 apic_printk(APIC_VERBOSE,KERN_DEBUG 1338 apic_printk(APIC_VERBOSE,KERN_DEBUG
1460 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1339 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1461 "IRQ %d Mode:%i Active:%i)\n", 1340 "IRQ %d Mode:%i Active:%i)\n",
1462 apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector, 1341 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
1463 irq, trigger, polarity); 1342 irq, trigger, polarity);
1464 1343
1465 1344
1466 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, 1345 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
1467 dest, trigger, polarity, cfg->vector, pin)) { 1346 dest, trigger, polarity, cfg->vector, pin)) {
1468 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1347 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1469 mp_ioapics[apic_id].apicid, pin); 1348 mpc_ioapic_id(apic_id), pin);
1470 __clear_irq_vector(irq, cfg); 1349 __clear_irq_vector(irq, cfg);
1471 return; 1350 return;
1472 } 1351 }
1473 1352
1474 ioapic_register_intr(irq, desc, trigger); 1353 ioapic_register_intr(irq, cfg, trigger);
1475 if (irq < legacy_pic->nr_legacy_irqs) 1354 if (irq < legacy_pic->nr_legacy_irqs)
1476 legacy_pic->chip->mask(irq); 1355 legacy_pic->mask(irq);
1477 1356
1478 ioapic_write_entry(apic_id, pin, entry); 1357 ioapic_write_entry(apic_id, pin, entry);
1479} 1358}
1480 1359
1481static struct { 1360static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
1482 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1483} mp_ioapic_routing[MAX_IO_APICS];
1484
1485static void __init setup_IO_APIC_irqs(void)
1486{ 1361{
1487 int apic_id, pin, idx, irq; 1362 if (idx != -1)
1488 int notcon = 0; 1363 return false;
1489 struct irq_desc *desc;
1490 struct irq_cfg *cfg;
1491 int node = cpu_to_node(boot_cpu_id);
1492 1364
1493 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1365 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
1366 mpc_ioapic_id(apic_id), pin);
1367 return true;
1368}
1494 1369
1495 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) 1370static void __init __io_apic_setup_irqs(unsigned int apic_id)
1496 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1371{
1372 int idx, node = cpu_to_node(0);
1373 struct io_apic_irq_attr attr;
1374 unsigned int pin, irq;
1375
1376 for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) {
1497 idx = find_irq_entry(apic_id, pin, mp_INT); 1377 idx = find_irq_entry(apic_id, pin, mp_INT);
1498 if (idx == -1) { 1378 if (io_apic_pin_not_connected(idx, apic_id, pin))
1499 if (!notcon) {
1500 notcon = 1;
1501 apic_printk(APIC_VERBOSE,
1502 KERN_DEBUG " %d-%d",
1503 mp_ioapics[apic_id].apicid, pin);
1504 } else
1505 apic_printk(APIC_VERBOSE, " %d-%d",
1506 mp_ioapics[apic_id].apicid, pin);
1507 continue; 1379 continue;
1508 }
1509 if (notcon) {
1510 apic_printk(APIC_VERBOSE,
1511 " (apicid-pin) not connected\n");
1512 notcon = 0;
1513 }
1514 1380
1515 irq = pin_2_irq(idx, apic_id, pin); 1381 irq = pin_2_irq(idx, apic_id, pin);
1516 1382
@@ -1522,27 +1388,24 @@ static void __init setup_IO_APIC_irqs(void)
1522 * installed and if it returns 1: 1388 * installed and if it returns 1:
1523 */ 1389 */
1524 if (apic->multi_timer_check && 1390 if (apic->multi_timer_check &&
1525 apic->multi_timer_check(apic_id, irq)) 1391 apic->multi_timer_check(apic_id, irq))
1526 continue; 1392 continue;
1527 1393
1528 desc = irq_to_desc_alloc_node(irq, node); 1394 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1529 if (!desc) { 1395 irq_polarity(idx));
1530 printk(KERN_INFO "can not get irq_desc for %d\n", irq); 1396
1531 continue; 1397 io_apic_setup_irq_pin(irq, node, &attr);
1532 }
1533 cfg = desc->chip_data;
1534 add_pin_to_irq_node(cfg, node, apic_id, pin);
1535 /*
1536 * don't mark it in pin_programmed, so later acpi could
1537 * set it correctly when irq < 16
1538 */
1539 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1540 irq_trigger(idx), irq_polarity(idx));
1541 } 1398 }
1399}
1542 1400
1543 if (notcon) 1401static void __init setup_IO_APIC_irqs(void)
1544 apic_printk(APIC_VERBOSE, 1402{
1545 " (apicid-pin) not connected\n"); 1403 unsigned int apic_id;
1404
1405 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1406
1407 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1408 __io_apic_setup_irqs(apic_id);
1546} 1409}
1547 1410
1548/* 1411/*
@@ -1552,10 +1415,8 @@ static void __init setup_IO_APIC_irqs(void)
1552 */ 1415 */
1553void setup_IO_APIC_irq_extra(u32 gsi) 1416void setup_IO_APIC_irq_extra(u32 gsi)
1554{ 1417{
1555 int apic_id = 0, pin, idx, irq; 1418 int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
1556 int node = cpu_to_node(boot_cpu_id); 1419 struct io_apic_irq_attr attr;
1557 struct irq_desc *desc;
1558 struct irq_cfg *cfg;
1559 1420
1560 /* 1421 /*
1561 * Convert 'gsi' to 'ioapic.pin'. 1422 * Convert 'gsi' to 'ioapic.pin'.
@@ -1570,29 +1431,15 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1570 return; 1431 return;
1571 1432
1572 irq = pin_2_irq(idx, apic_id, pin); 1433 irq = pin_2_irq(idx, apic_id, pin);
1573#ifdef CONFIG_SPARSE_IRQ
1574 desc = irq_to_desc(irq);
1575 if (desc)
1576 return;
1577#endif
1578 desc = irq_to_desc_alloc_node(irq, node);
1579 if (!desc) {
1580 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1581 return;
1582 }
1583 1434
1584 cfg = desc->chip_data; 1435 /* Only handle the non legacy irqs on secondary ioapics */
1585 add_pin_to_irq_node(cfg, node, apic_id, pin); 1436 if (apic_id == 0 || irq < NR_IRQS_LEGACY)
1586
1587 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
1588 pr_debug("Pin %d-%d already programmed\n",
1589 mp_ioapics[apic_id].apicid, pin);
1590 return; 1437 return;
1591 }
1592 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1593 1438
1594 setup_IO_APIC_irq(apic_id, pin, irq, desc, 1439 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1595 irq_trigger(idx), irq_polarity(idx)); 1440 irq_polarity(idx));
1441
1442 io_apic_setup_irq_pin_once(irq, node, &attr);
1596} 1443}
1597 1444
1598/* 1445/*
@@ -1624,7 +1471,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1624 * The timer IRQ doesn't have to know that behind the 1471 * The timer IRQ doesn't have to know that behind the
1625 * scene we may have a 8259A-master in AEOI mode ... 1472 * scene we may have a 8259A-master in AEOI mode ...
1626 */ 1473 */
1627 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 1474 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
1475 "edge");
1628 1476
1629 /* 1477 /*
1630 * Add it to the IO-APIC irq-routing table: 1478 * Add it to the IO-APIC irq-routing table:
@@ -1642,13 +1490,12 @@ __apicdebuginit(void) print_IO_APIC(void)
1642 union IO_APIC_reg_03 reg_03; 1490 union IO_APIC_reg_03 reg_03;
1643 unsigned long flags; 1491 unsigned long flags;
1644 struct irq_cfg *cfg; 1492 struct irq_cfg *cfg;
1645 struct irq_desc *desc;
1646 unsigned int irq; 1493 unsigned int irq;
1647 1494
1648 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1495 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1649 for (i = 0; i < nr_ioapics; i++) 1496 for (i = 0; i < nr_ioapics; i++)
1650 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1497 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1651 mp_ioapics[i].apicid, nr_ioapic_registers[i]); 1498 mpc_ioapic_id(i), ioapics[i].nr_registers);
1652 1499
1653 /* 1500 /*
1654 * We are a bit conservative about what we expect. We have to 1501 * We are a bit conservative about what we expect. We have to
@@ -1668,7 +1515,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1668 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1515 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1669 1516
1670 printk("\n"); 1517 printk("\n");
1671 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); 1518 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic));
1672 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1519 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1673 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1520 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1674 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1521 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1729,10 +1576,10 @@ __apicdebuginit(void) print_IO_APIC(void)
1729 } 1576 }
1730 } 1577 }
1731 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1578 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1732 for_each_irq_desc(irq, desc) { 1579 for_each_active_irq(irq) {
1733 struct irq_pin_list *entry; 1580 struct irq_pin_list *entry;
1734 1581
1735 cfg = desc->chip_data; 1582 cfg = irq_get_chip_data(irq);
1736 if (!cfg) 1583 if (!cfg)
1737 continue; 1584 continue;
1738 entry = cfg->irq_2_pin; 1585 entry = cfg->irq_2_pin;
@@ -1962,7 +1809,7 @@ void __init enable_IO_APIC(void)
1962 for(apic = 0; apic < nr_ioapics; apic++) { 1809 for(apic = 0; apic < nr_ioapics; apic++) {
1963 int pin; 1810 int pin;
1964 /* See if any of the pins is in ExtINT mode */ 1811 /* See if any of the pins is in ExtINT mode */
1965 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1812 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
1966 struct IO_APIC_route_entry entry; 1813 struct IO_APIC_route_entry entry;
1967 entry = ioapic_read_entry(apic, pin); 1814 entry = ioapic_read_entry(apic, pin);
1968 1815
@@ -2023,7 +1870,7 @@ void disable_IO_APIC(void)
2023 * 1870 *
2024 * With interrupt-remapping, for now we will use virtual wire A mode, 1871 * With interrupt-remapping, for now we will use virtual wire A mode,
2025 * as virtual wire B is a little complex (need to configure both 1872 * as virtual wire B is a little complex (need to configure both
2026 * IOAPIC RTE aswell as interrupt-remapping table entry). 1873 * IOAPIC RTE as well as interrupt-remapping table entry).
2027 * As this gets called during crash dump, keep this simple for now. 1874 * As this gets called during crash dump, keep this simple for now.
2028 */ 1875 */
2029 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { 1876 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
@@ -2061,8 +1908,7 @@ void disable_IO_APIC(void)
2061 * 1908 *
2062 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 1909 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
2063 */ 1910 */
2064 1911void __init setup_ioapic_ids_from_mpc_nocheck(void)
2065void __init setup_ioapic_ids_from_mpc(void)
2066{ 1912{
2067 union IO_APIC_reg_00 reg_00; 1913 union IO_APIC_reg_00 reg_00;
2068 physid_mask_t phys_id_present_map; 1914 physid_mask_t phys_id_present_map;
@@ -2071,15 +1917,6 @@ void __init setup_ioapic_ids_from_mpc(void)
2071 unsigned char old_id; 1917 unsigned char old_id;
2072 unsigned long flags; 1918 unsigned long flags;
2073 1919
2074 if (acpi_ioapic)
2075 return;
2076 /*
2077 * Don't check I/O APIC IDs for xAPIC systems. They have
2078 * no meaning without the serial APIC bus.
2079 */
2080 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2081 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
2082 return;
2083 /* 1920 /*
2084 * This is broken; anything with a real cpu count has to 1921 * This is broken; anything with a real cpu count has to
2085 * circumvent this idiocy regardless. 1922 * circumvent this idiocy regardless.
@@ -2096,14 +1933,14 @@ void __init setup_ioapic_ids_from_mpc(void)
2096 reg_00.raw = io_apic_read(apic_id, 0); 1933 reg_00.raw = io_apic_read(apic_id, 0);
2097 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1934 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2098 1935
2099 old_id = mp_ioapics[apic_id].apicid; 1936 old_id = mpc_ioapic_id(apic_id);
2100 1937
2101 if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) { 1938 if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) {
2102 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 1939 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
2103 apic_id, mp_ioapics[apic_id].apicid); 1940 apic_id, mpc_ioapic_id(apic_id));
2104 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1941 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2105 reg_00.bits.ID); 1942 reg_00.bits.ID);
2106 mp_ioapics[apic_id].apicid = reg_00.bits.ID; 1943 ioapics[apic_id].mp_config.apicid = reg_00.bits.ID;
2107 } 1944 }
2108 1945
2109 /* 1946 /*
@@ -2112,9 +1949,9 @@ void __init setup_ioapic_ids_from_mpc(void)
2112 * 'stuck on smp_invalidate_needed IPI wait' messages. 1949 * 'stuck on smp_invalidate_needed IPI wait' messages.
2113 */ 1950 */
2114 if (apic->check_apicid_used(&phys_id_present_map, 1951 if (apic->check_apicid_used(&phys_id_present_map,
2115 mp_ioapics[apic_id].apicid)) { 1952 mpc_ioapic_id(apic_id))) {
2116 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 1953 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2117 apic_id, mp_ioapics[apic_id].apicid); 1954 apic_id, mpc_ioapic_id(apic_id));
2118 for (i = 0; i < get_physical_broadcast(); i++) 1955 for (i = 0; i < get_physical_broadcast(); i++)
2119 if (!physid_isset(i, phys_id_present_map)) 1956 if (!physid_isset(i, phys_id_present_map))
2120 break; 1957 break;
@@ -2123,36 +1960,39 @@ void __init setup_ioapic_ids_from_mpc(void)
2123 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1960 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2124 i); 1961 i);
2125 physid_set(i, phys_id_present_map); 1962 physid_set(i, phys_id_present_map);
2126 mp_ioapics[apic_id].apicid = i; 1963 ioapics[apic_id].mp_config.apicid = i;
2127 } else { 1964 } else {
2128 physid_mask_t tmp; 1965 physid_mask_t tmp;
2129 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); 1966 apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id),
1967 &tmp);
2130 apic_printk(APIC_VERBOSE, "Setting %d in the " 1968 apic_printk(APIC_VERBOSE, "Setting %d in the "
2131 "phys_id_present_map\n", 1969 "phys_id_present_map\n",
2132 mp_ioapics[apic_id].apicid); 1970 mpc_ioapic_id(apic_id));
2133 physids_or(phys_id_present_map, phys_id_present_map, tmp); 1971 physids_or(phys_id_present_map, phys_id_present_map, tmp);
2134 } 1972 }
2135 1973
2136
2137 /* 1974 /*
2138 * We need to adjust the IRQ routing table 1975 * We need to adjust the IRQ routing table
2139 * if the ID changed. 1976 * if the ID changed.
2140 */ 1977 */
2141 if (old_id != mp_ioapics[apic_id].apicid) 1978 if (old_id != mpc_ioapic_id(apic_id))
2142 for (i = 0; i < mp_irq_entries; i++) 1979 for (i = 0; i < mp_irq_entries; i++)
2143 if (mp_irqs[i].dstapic == old_id) 1980 if (mp_irqs[i].dstapic == old_id)
2144 mp_irqs[i].dstapic 1981 mp_irqs[i].dstapic
2145 = mp_ioapics[apic_id].apicid; 1982 = mpc_ioapic_id(apic_id);
2146 1983
2147 /* 1984 /*
2148 * Read the right value from the MPC table and 1985 * Update the ID register according to the right value
2149 * write it into the ID register. 1986 * from the MPC table if they are different.
2150 */ 1987 */
1988 if (mpc_ioapic_id(apic_id) == reg_00.bits.ID)
1989 continue;
1990
2151 apic_printk(APIC_VERBOSE, KERN_INFO 1991 apic_printk(APIC_VERBOSE, KERN_INFO
2152 "...changing IO-APIC physical APIC ID to %d ...", 1992 "...changing IO-APIC physical APIC ID to %d ...",
2153 mp_ioapics[apic_id].apicid); 1993 mpc_ioapic_id(apic_id));
2154 1994
2155 reg_00.bits.ID = mp_ioapics[apic_id].apicid; 1995 reg_00.bits.ID = mpc_ioapic_id(apic_id);
2156 raw_spin_lock_irqsave(&ioapic_lock, flags); 1996 raw_spin_lock_irqsave(&ioapic_lock, flags);
2157 io_apic_write(apic_id, 0, reg_00.raw); 1997 io_apic_write(apic_id, 0, reg_00.raw);
2158 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1998 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2163,12 +2003,27 @@ void __init setup_ioapic_ids_from_mpc(void)
2163 raw_spin_lock_irqsave(&ioapic_lock, flags); 2003 raw_spin_lock_irqsave(&ioapic_lock, flags);
2164 reg_00.raw = io_apic_read(apic_id, 0); 2004 reg_00.raw = io_apic_read(apic_id, 0);
2165 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2005 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2166 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) 2006 if (reg_00.bits.ID != mpc_ioapic_id(apic_id))
2167 printk("could not set ID!\n"); 2007 printk("could not set ID!\n");
2168 else 2008 else
2169 apic_printk(APIC_VERBOSE, " ok.\n"); 2009 apic_printk(APIC_VERBOSE, " ok.\n");
2170 } 2010 }
2171} 2011}
2012
2013void __init setup_ioapic_ids_from_mpc(void)
2014{
2015
2016 if (acpi_ioapic)
2017 return;
2018 /*
2019 * Don't check I/O APIC IDs for xAPIC systems. They have
2020 * no meaning without the serial APIC bus.
2021 */
2022 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2023 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
2024 return;
2025 setup_ioapic_ids_from_mpc_nocheck();
2026}
2172#endif 2027#endif
2173 2028
2174int no_timer_check __initdata; 2029int no_timer_check __initdata;
@@ -2239,29 +2094,26 @@ static int __init timer_irq_works(void)
2239 * an edge even if it isn't on the 8259A... 2094 * an edge even if it isn't on the 8259A...
2240 */ 2095 */
2241 2096
2242static unsigned int startup_ioapic_irq(unsigned int irq) 2097static unsigned int startup_ioapic_irq(struct irq_data *data)
2243{ 2098{
2244 int was_pending = 0; 2099 int was_pending = 0, irq = data->irq;
2245 unsigned long flags; 2100 unsigned long flags;
2246 struct irq_cfg *cfg;
2247 2101
2248 raw_spin_lock_irqsave(&ioapic_lock, flags); 2102 raw_spin_lock_irqsave(&ioapic_lock, flags);
2249 if (irq < legacy_pic->nr_legacy_irqs) { 2103 if (irq < legacy_pic->nr_legacy_irqs) {
2250 legacy_pic->chip->mask(irq); 2104 legacy_pic->mask(irq);
2251 if (legacy_pic->irq_pending(irq)) 2105 if (legacy_pic->irq_pending(irq))
2252 was_pending = 1; 2106 was_pending = 1;
2253 } 2107 }
2254 cfg = irq_cfg(irq); 2108 __unmask_ioapic(data->chip_data);
2255 __unmask_IO_APIC_irq(cfg);
2256 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2109 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2257 2110
2258 return was_pending; 2111 return was_pending;
2259} 2112}
2260 2113
2261static int ioapic_retrigger_irq(unsigned int irq) 2114static int ioapic_retrigger_irq(struct irq_data *data)
2262{ 2115{
2263 2116 struct irq_cfg *cfg = data->chip_data;
2264 struct irq_cfg *cfg = irq_cfg(irq);
2265 unsigned long flags; 2117 unsigned long flags;
2266 2118
2267 raw_spin_lock_irqsave(&vector_lock, flags); 2119 raw_spin_lock_irqsave(&vector_lock, flags);
@@ -2312,7 +2164,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2312 * With interrupt-remapping, destination information comes 2164 * With interrupt-remapping, destination information comes
2313 * from interrupt-remapping table entry. 2165 * from interrupt-remapping table entry.
2314 */ 2166 */
2315 if (!irq_remapped(irq)) 2167 if (!irq_remapped(cfg))
2316 io_apic_write(apic, 0x11 + pin*2, dest); 2168 io_apic_write(apic, 0x11 + pin*2, dest);
2317 reg = io_apic_read(apic, 0x10 + pin*2); 2169 reg = io_apic_read(apic, 0x10 + pin*2);
2318 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 2170 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
@@ -2322,65 +2174,46 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2322} 2174}
2323 2175
2324/* 2176/*
2325 * Either sets desc->affinity to a valid value, and returns 2177 * Either sets data->affinity to a valid value, and returns
2326 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and 2178 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2327 * leaves desc->affinity untouched. 2179 * leaves data->affinity untouched.
2328 */ 2180 */
2329unsigned int 2181int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2330set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, 2182 unsigned int *dest_id)
2331 unsigned int *dest_id)
2332{ 2183{
2333 struct irq_cfg *cfg; 2184 struct irq_cfg *cfg = data->chip_data;
2334 unsigned int irq;
2335 2185
2336 if (!cpumask_intersects(mask, cpu_online_mask)) 2186 if (!cpumask_intersects(mask, cpu_online_mask))
2337 return -1; 2187 return -1;
2338 2188
2339 irq = desc->irq; 2189 if (assign_irq_vector(data->irq, data->chip_data, mask))
2340 cfg = desc->chip_data;
2341 if (assign_irq_vector(irq, cfg, mask))
2342 return -1; 2190 return -1;
2343 2191
2344 cpumask_copy(desc->affinity, mask); 2192 cpumask_copy(data->affinity, mask);
2345 2193
2346 *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); 2194 *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
2347 return 0; 2195 return 0;
2348} 2196}
2349 2197
2350static int 2198static int
2351set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2199ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2200 bool force)
2352{ 2201{
2353 struct irq_cfg *cfg; 2202 unsigned int dest, irq = data->irq;
2354 unsigned long flags; 2203 unsigned long flags;
2355 unsigned int dest; 2204 int ret;
2356 unsigned int irq;
2357 int ret = -1;
2358
2359 irq = desc->irq;
2360 cfg = desc->chip_data;
2361 2205
2362 raw_spin_lock_irqsave(&ioapic_lock, flags); 2206 raw_spin_lock_irqsave(&ioapic_lock, flags);
2363 ret = set_desc_affinity(desc, mask, &dest); 2207 ret = __ioapic_set_affinity(data, mask, &dest);
2364 if (!ret) { 2208 if (!ret) {
2365 /* Only the high 8 bits are valid. */ 2209 /* Only the high 8 bits are valid. */
2366 dest = SET_APIC_LOGICAL_ID(dest); 2210 dest = SET_APIC_LOGICAL_ID(dest);
2367 __target_IO_APIC_irq(irq, dest, cfg); 2211 __target_IO_APIC_irq(irq, dest, data->chip_data);
2368 } 2212 }
2369 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2213 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2370
2371 return ret; 2214 return ret;
2372} 2215}
2373 2216
2374static int
2375set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2376{
2377 struct irq_desc *desc;
2378
2379 desc = irq_to_desc(irq);
2380
2381 return set_ioapic_affinity_irq_desc(desc, mask);
2382}
2383
2384#ifdef CONFIG_INTR_REMAP 2217#ifdef CONFIG_INTR_REMAP
2385 2218
2386/* 2219/*
@@ -2395,24 +2228,21 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2395 * the interrupt-remapping table entry. 2228 * the interrupt-remapping table entry.
2396 */ 2229 */
2397static int 2230static int
2398migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2231ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2232 bool force)
2399{ 2233{
2400 struct irq_cfg *cfg; 2234 struct irq_cfg *cfg = data->chip_data;
2235 unsigned int dest, irq = data->irq;
2401 struct irte irte; 2236 struct irte irte;
2402 unsigned int dest;
2403 unsigned int irq;
2404 int ret = -1;
2405 2237
2406 if (!cpumask_intersects(mask, cpu_online_mask)) 2238 if (!cpumask_intersects(mask, cpu_online_mask))
2407 return ret; 2239 return -EINVAL;
2408 2240
2409 irq = desc->irq;
2410 if (get_irte(irq, &irte)) 2241 if (get_irte(irq, &irte))
2411 return ret; 2242 return -EBUSY;
2412 2243
2413 cfg = desc->chip_data;
2414 if (assign_irq_vector(irq, cfg, mask)) 2244 if (assign_irq_vector(irq, cfg, mask))
2415 return ret; 2245 return -EBUSY;
2416 2246
2417 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2247 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2418 2248
@@ -2427,29 +2257,14 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2427 if (cfg->move_in_progress) 2257 if (cfg->move_in_progress)
2428 send_cleanup_vector(cfg); 2258 send_cleanup_vector(cfg);
2429 2259
2430 cpumask_copy(desc->affinity, mask); 2260 cpumask_copy(data->affinity, mask);
2431
2432 return 0; 2261 return 0;
2433} 2262}
2434 2263
2435/*
2436 * Migrates the IRQ destination in the process context.
2437 */
2438static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2439 const struct cpumask *mask)
2440{
2441 return migrate_ioapic_irq_desc(desc, mask);
2442}
2443static int set_ir_ioapic_affinity_irq(unsigned int irq,
2444 const struct cpumask *mask)
2445{
2446 struct irq_desc *desc = irq_to_desc(irq);
2447
2448 return set_ir_ioapic_affinity_irq_desc(desc, mask);
2449}
2450#else 2264#else
2451static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2265static inline int
2452 const struct cpumask *mask) 2266ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2267 bool force)
2453{ 2268{
2454 return 0; 2269 return 0;
2455} 2270}
@@ -2469,7 +2284,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2469 unsigned int irr; 2284 unsigned int irr;
2470 struct irq_desc *desc; 2285 struct irq_desc *desc;
2471 struct irq_cfg *cfg; 2286 struct irq_cfg *cfg;
2472 irq = __get_cpu_var(vector_irq)[vector]; 2287 irq = __this_cpu_read(vector_irq[vector]);
2473 2288
2474 if (irq == -1) 2289 if (irq == -1)
2475 continue; 2290 continue;
@@ -2503,7 +2318,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2503 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); 2318 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
2504 goto unlock; 2319 goto unlock;
2505 } 2320 }
2506 __get_cpu_var(vector_irq)[vector] = -1; 2321 __this_cpu_write(vector_irq[vector], -1);
2507unlock: 2322unlock:
2508 raw_spin_unlock(&desc->lock); 2323 raw_spin_unlock(&desc->lock);
2509 } 2324 }
@@ -2511,10 +2326,8 @@ unlock:
2511 irq_exit(); 2326 irq_exit();
2512} 2327}
2513 2328
2514static void __irq_complete_move(struct irq_desc **descp, unsigned vector) 2329static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
2515{ 2330{
2516 struct irq_desc *desc = *descp;
2517 struct irq_cfg *cfg = desc->chip_data;
2518 unsigned me; 2331 unsigned me;
2519 2332
2520 if (likely(!cfg->move_in_progress)) 2333 if (likely(!cfg->move_in_progress))
@@ -2526,31 +2339,28 @@ static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
2526 send_cleanup_vector(cfg); 2339 send_cleanup_vector(cfg);
2527} 2340}
2528 2341
2529static void irq_complete_move(struct irq_desc **descp) 2342static void irq_complete_move(struct irq_cfg *cfg)
2530{ 2343{
2531 __irq_complete_move(descp, ~get_irq_regs()->orig_ax); 2344 __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
2532} 2345}
2533 2346
2534void irq_force_complete_move(int irq) 2347void irq_force_complete_move(int irq)
2535{ 2348{
2536 struct irq_desc *desc = irq_to_desc(irq); 2349 struct irq_cfg *cfg = irq_get_chip_data(irq);
2537 struct irq_cfg *cfg = desc->chip_data;
2538 2350
2539 if (!cfg) 2351 if (!cfg)
2540 return; 2352 return;
2541 2353
2542 __irq_complete_move(&desc, cfg->vector); 2354 __irq_complete_move(cfg, cfg->vector);
2543} 2355}
2544#else 2356#else
2545static inline void irq_complete_move(struct irq_desc **descp) {} 2357static inline void irq_complete_move(struct irq_cfg *cfg) { }
2546#endif 2358#endif
2547 2359
2548static void ack_apic_edge(unsigned int irq) 2360static void ack_apic_edge(struct irq_data *data)
2549{ 2361{
2550 struct irq_desc *desc = irq_to_desc(irq); 2362 irq_complete_move(data->chip_data);
2551 2363 irq_move_irq(data);
2552 irq_complete_move(&desc);
2553 move_native_irq(irq);
2554 ack_APIC_irq(); 2364 ack_APIC_irq();
2555} 2365}
2556 2366
@@ -2572,19 +2382,21 @@ atomic_t irq_mis_count;
2572 * Otherwise, we simulate the EOI message manually by changing the trigger 2382 * Otherwise, we simulate the EOI message manually by changing the trigger
2573 * mode to edge and then back to level, with RTE being masked during this. 2383 * mode to edge and then back to level, with RTE being masked during this.
2574*/ 2384*/
2575static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) 2385static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2576{ 2386{
2577 struct irq_pin_list *entry; 2387 struct irq_pin_list *entry;
2388 unsigned long flags;
2578 2389
2390 raw_spin_lock_irqsave(&ioapic_lock, flags);
2579 for_each_irq_pin(entry, cfg->irq_2_pin) { 2391 for_each_irq_pin(entry, cfg->irq_2_pin) {
2580 if (mp_ioapics[entry->apic].apicver >= 0x20) { 2392 if (mpc_ioapic_ver(entry->apic) >= 0x20) {
2581 /* 2393 /*
2582 * Intr-remapping uses pin number as the virtual vector 2394 * Intr-remapping uses pin number as the virtual vector
2583 * in the RTE. Actual vector is programmed in 2395 * in the RTE. Actual vector is programmed in
2584 * intr-remapping table entry. Hence for the io-apic 2396 * intr-remapping table entry. Hence for the io-apic
2585 * EOI we use the pin number. 2397 * EOI we use the pin number.
2586 */ 2398 */
2587 if (irq_remapped(irq)) 2399 if (irq_remapped(cfg))
2588 io_apic_eoi(entry->apic, entry->pin); 2400 io_apic_eoi(entry->apic, entry->pin);
2589 else 2401 else
2590 io_apic_eoi(entry->apic, cfg->vector); 2402 io_apic_eoi(entry->apic, cfg->vector);
@@ -2593,36 +2405,21 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2593 __unmask_and_level_IO_APIC_irq(entry); 2405 __unmask_and_level_IO_APIC_irq(entry);
2594 } 2406 }
2595 } 2407 }
2596}
2597
2598static void eoi_ioapic_irq(struct irq_desc *desc)
2599{
2600 struct irq_cfg *cfg;
2601 unsigned long flags;
2602 unsigned int irq;
2603
2604 irq = desc->irq;
2605 cfg = desc->chip_data;
2606
2607 raw_spin_lock_irqsave(&ioapic_lock, flags);
2608 __eoi_ioapic_irq(irq, cfg);
2609 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2408 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2610} 2409}
2611 2410
2612static void ack_apic_level(unsigned int irq) 2411static void ack_apic_level(struct irq_data *data)
2613{ 2412{
2614 struct irq_desc *desc = irq_to_desc(irq); 2413 struct irq_cfg *cfg = data->chip_data;
2414 int i, do_unmask_irq = 0, irq = data->irq;
2615 unsigned long v; 2415 unsigned long v;
2616 int i;
2617 struct irq_cfg *cfg;
2618 int do_unmask_irq = 0;
2619 2416
2620 irq_complete_move(&desc); 2417 irq_complete_move(cfg);
2621#ifdef CONFIG_GENERIC_PENDING_IRQ 2418#ifdef CONFIG_GENERIC_PENDING_IRQ
2622 /* If we are moving the irq we need to mask it */ 2419 /* If we are moving the irq we need to mask it */
2623 if (unlikely(desc->status & IRQ_MOVE_PENDING)) { 2420 if (unlikely(irqd_is_setaffinity_pending(data))) {
2624 do_unmask_irq = 1; 2421 do_unmask_irq = 1;
2625 mask_IO_APIC_irq_desc(desc); 2422 mask_ioapic(cfg);
2626 } 2423 }
2627#endif 2424#endif
2628 2425
@@ -2658,7 +2455,6 @@ static void ack_apic_level(unsigned int irq)
2658 * we use the above logic (mask+edge followed by unmask+level) from 2455 * we use the above logic (mask+edge followed by unmask+level) from
2659 * Manfred Spraul to clear the remote IRR. 2456 * Manfred Spraul to clear the remote IRR.
2660 */ 2457 */
2661 cfg = desc->chip_data;
2662 i = cfg->vector; 2458 i = cfg->vector;
2663 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); 2459 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2664 2460
@@ -2678,7 +2474,7 @@ static void ack_apic_level(unsigned int irq)
2678 if (!(v & (1 << (i & 0x1f)))) { 2474 if (!(v & (1 << (i & 0x1f)))) {
2679 atomic_inc(&irq_mis_count); 2475 atomic_inc(&irq_mis_count);
2680 2476
2681 eoi_ioapic_irq(desc); 2477 eoi_ioapic_irq(irq, cfg);
2682 } 2478 }
2683 2479
2684 /* Now we can move and re-enable the irq */ 2480 /* Now we can move and re-enable the irq */
@@ -2709,61 +2505,57 @@ static void ack_apic_level(unsigned int irq)
2709 * accurate and is causing problems then it is a hardware bug 2505 * accurate and is causing problems then it is a hardware bug
2710 * and you can go talk to the chipset vendor about it. 2506 * and you can go talk to the chipset vendor about it.
2711 */ 2507 */
2712 cfg = desc->chip_data;
2713 if (!io_apic_level_ack_pending(cfg)) 2508 if (!io_apic_level_ack_pending(cfg))
2714 move_masked_irq(irq); 2509 irq_move_masked_irq(data);
2715 unmask_IO_APIC_irq_desc(desc); 2510 unmask_ioapic(cfg);
2716 } 2511 }
2717} 2512}
2718 2513
2719#ifdef CONFIG_INTR_REMAP 2514#ifdef CONFIG_INTR_REMAP
2720static void ir_ack_apic_edge(unsigned int irq) 2515static void ir_ack_apic_edge(struct irq_data *data)
2721{ 2516{
2722 ack_APIC_irq(); 2517 ack_APIC_irq();
2723} 2518}
2724 2519
2725static void ir_ack_apic_level(unsigned int irq) 2520static void ir_ack_apic_level(struct irq_data *data)
2726{ 2521{
2727 struct irq_desc *desc = irq_to_desc(irq);
2728
2729 ack_APIC_irq(); 2522 ack_APIC_irq();
2730 eoi_ioapic_irq(desc); 2523 eoi_ioapic_irq(data->irq, data->chip_data);
2731} 2524}
2732#endif /* CONFIG_INTR_REMAP */ 2525#endif /* CONFIG_INTR_REMAP */
2733 2526
2734static struct irq_chip ioapic_chip __read_mostly = { 2527static struct irq_chip ioapic_chip __read_mostly = {
2735 .name = "IO-APIC", 2528 .name = "IO-APIC",
2736 .startup = startup_ioapic_irq, 2529 .irq_startup = startup_ioapic_irq,
2737 .mask = mask_IO_APIC_irq, 2530 .irq_mask = mask_ioapic_irq,
2738 .unmask = unmask_IO_APIC_irq, 2531 .irq_unmask = unmask_ioapic_irq,
2739 .ack = ack_apic_edge, 2532 .irq_ack = ack_apic_edge,
2740 .eoi = ack_apic_level, 2533 .irq_eoi = ack_apic_level,
2741#ifdef CONFIG_SMP 2534#ifdef CONFIG_SMP
2742 .set_affinity = set_ioapic_affinity_irq, 2535 .irq_set_affinity = ioapic_set_affinity,
2743#endif 2536#endif
2744 .retrigger = ioapic_retrigger_irq, 2537 .irq_retrigger = ioapic_retrigger_irq,
2745}; 2538};
2746 2539
2747static struct irq_chip ir_ioapic_chip __read_mostly = { 2540static struct irq_chip ir_ioapic_chip __read_mostly = {
2748 .name = "IR-IO-APIC", 2541 .name = "IR-IO-APIC",
2749 .startup = startup_ioapic_irq, 2542 .irq_startup = startup_ioapic_irq,
2750 .mask = mask_IO_APIC_irq, 2543 .irq_mask = mask_ioapic_irq,
2751 .unmask = unmask_IO_APIC_irq, 2544 .irq_unmask = unmask_ioapic_irq,
2752#ifdef CONFIG_INTR_REMAP 2545#ifdef CONFIG_INTR_REMAP
2753 .ack = ir_ack_apic_edge, 2546 .irq_ack = ir_ack_apic_edge,
2754 .eoi = ir_ack_apic_level, 2547 .irq_eoi = ir_ack_apic_level,
2755#ifdef CONFIG_SMP 2548#ifdef CONFIG_SMP
2756 .set_affinity = set_ir_ioapic_affinity_irq, 2549 .irq_set_affinity = ir_ioapic_set_affinity,
2757#endif 2550#endif
2758#endif 2551#endif
2759 .retrigger = ioapic_retrigger_irq, 2552 .irq_retrigger = ioapic_retrigger_irq,
2760}; 2553};
2761 2554
2762static inline void init_IO_APIC_traps(void) 2555static inline void init_IO_APIC_traps(void)
2763{ 2556{
2764 int irq;
2765 struct irq_desc *desc;
2766 struct irq_cfg *cfg; 2557 struct irq_cfg *cfg;
2558 unsigned int irq;
2767 2559
2768 /* 2560 /*
2769 * NOTE! The local APIC isn't very good at handling 2561 * NOTE! The local APIC isn't very good at handling
@@ -2776,8 +2568,8 @@ static inline void init_IO_APIC_traps(void)
2776 * Also, we've got to be careful not to trash gate 2568 * Also, we've got to be careful not to trash gate
2777 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2569 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2778 */ 2570 */
2779 for_each_irq_desc(irq, desc) { 2571 for_each_active_irq(irq) {
2780 cfg = desc->chip_data; 2572 cfg = irq_get_chip_data(irq);
2781 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { 2573 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2782 /* 2574 /*
2783 * Hmm.. We don't have an entry for this, 2575 * Hmm.. We don't have an entry for this,
@@ -2788,7 +2580,7 @@ static inline void init_IO_APIC_traps(void)
2788 legacy_pic->make_irq(irq); 2580 legacy_pic->make_irq(irq);
2789 else 2581 else
2790 /* Strange. Oh, well.. */ 2582 /* Strange. Oh, well.. */
2791 desc->chip = &no_irq_chip; 2583 irq_set_chip(irq, &no_irq_chip);
2792 } 2584 }
2793 } 2585 }
2794} 2586}
@@ -2797,7 +2589,7 @@ static inline void init_IO_APIC_traps(void)
2797 * The local APIC irq-chip implementation: 2589 * The local APIC irq-chip implementation:
2798 */ 2590 */
2799 2591
2800static void mask_lapic_irq(unsigned int irq) 2592static void mask_lapic_irq(struct irq_data *data)
2801{ 2593{
2802 unsigned long v; 2594 unsigned long v;
2803 2595
@@ -2805,7 +2597,7 @@ static void mask_lapic_irq(unsigned int irq)
2805 apic_write(APIC_LVT0, v | APIC_LVT_MASKED); 2597 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2806} 2598}
2807 2599
2808static void unmask_lapic_irq(unsigned int irq) 2600static void unmask_lapic_irq(struct irq_data *data)
2809{ 2601{
2810 unsigned long v; 2602 unsigned long v;
2811 2603
@@ -2813,43 +2605,25 @@ static void unmask_lapic_irq(unsigned int irq)
2813 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2605 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2814} 2606}
2815 2607
2816static void ack_lapic_irq(unsigned int irq) 2608static void ack_lapic_irq(struct irq_data *data)
2817{ 2609{
2818 ack_APIC_irq(); 2610 ack_APIC_irq();
2819} 2611}
2820 2612
2821static struct irq_chip lapic_chip __read_mostly = { 2613static struct irq_chip lapic_chip __read_mostly = {
2822 .name = "local-APIC", 2614 .name = "local-APIC",
2823 .mask = mask_lapic_irq, 2615 .irq_mask = mask_lapic_irq,
2824 .unmask = unmask_lapic_irq, 2616 .irq_unmask = unmask_lapic_irq,
2825 .ack = ack_lapic_irq, 2617 .irq_ack = ack_lapic_irq,
2826}; 2618};
2827 2619
2828static void lapic_register_intr(int irq, struct irq_desc *desc) 2620static void lapic_register_intr(int irq)
2829{ 2621{
2830 desc->status &= ~IRQ_LEVEL; 2622 irq_clear_status_flags(irq, IRQ_LEVEL);
2831 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2623 irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2832 "edge"); 2624 "edge");
2833} 2625}
2834 2626
2835static void __init setup_nmi(void)
2836{
2837 /*
2838 * Dirty trick to enable the NMI watchdog ...
2839 * We put the 8259A master into AEOI mode and
2840 * unmask on all local APICs LVT0 as NMI.
2841 *
2842 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2843 * is from Maciej W. Rozycki - so we do not have to EOI from
2844 * the NMI handler or the timer interrupt.
2845 */
2846 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2847
2848 enable_NMI_through_LVT0();
2849
2850 apic_printk(APIC_VERBOSE, " done.\n");
2851}
2852
2853/* 2627/*
2854 * This looks a bit hackish but it's about the only way of sending 2628 * This looks a bit hackish but it's about the only way of sending
2855 * a few INTA cycles to 8259As and any associated glue logic. ICR does 2629 * a few INTA cycles to 8259As and any associated glue logic. ICR does
@@ -2930,9 +2704,8 @@ int timer_through_8259 __initdata;
2930 */ 2704 */
2931static inline void __init check_timer(void) 2705static inline void __init check_timer(void)
2932{ 2706{
2933 struct irq_desc *desc = irq_to_desc(0); 2707 struct irq_cfg *cfg = irq_get_chip_data(0);
2934 struct irq_cfg *cfg = desc->chip_data; 2708 int node = cpu_to_node(0);
2935 int node = cpu_to_node(boot_cpu_id);
2936 int apic1, pin1, apic2, pin2; 2709 int apic1, pin1, apic2, pin2;
2937 unsigned long flags; 2710 unsigned long flags;
2938 int no_pin1 = 0; 2711 int no_pin1 = 0;
@@ -2942,7 +2715,7 @@ static inline void __init check_timer(void)
2942 /* 2715 /*
2943 * get/set the timer IRQ vector: 2716 * get/set the timer IRQ vector:
2944 */ 2717 */
2945 legacy_pic->chip->mask(0); 2718 legacy_pic->mask(0);
2946 assign_irq_vector(0, cfg, apic->target_cpus()); 2719 assign_irq_vector(0, cfg, apic->target_cpus());
2947 2720
2948 /* 2721 /*
@@ -2956,15 +2729,6 @@ static inline void __init check_timer(void)
2956 */ 2729 */
2957 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2730 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2958 legacy_pic->init(1); 2731 legacy_pic->init(1);
2959#ifdef CONFIG_X86_32
2960 {
2961 unsigned int ver;
2962
2963 ver = apic_read(APIC_LVR);
2964 ver = GET_APIC_VERSION(ver);
2965 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2966 }
2967#endif
2968 2732
2969 pin1 = find_isa_irq_pin(0, mp_INT); 2733 pin1 = find_isa_irq_pin(0, mp_INT);
2970 apic1 = find_isa_irq_apic(0, mp_INT); 2734 apic1 = find_isa_irq_apic(0, mp_INT);
@@ -3001,7 +2765,7 @@ static inline void __init check_timer(void)
3001 add_pin_to_irq_node(cfg, node, apic1, pin1); 2765 add_pin_to_irq_node(cfg, node, apic1, pin1);
3002 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2766 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
3003 } else { 2767 } else {
3004 /* for edge trigger, setup_IO_APIC_irq already 2768 /* for edge trigger, setup_ioapic_irq already
3005 * leave it unmasked. 2769 * leave it unmasked.
3006 * so only need to unmask if it is level-trigger 2770 * so only need to unmask if it is level-trigger
3007 * do we really have level trigger timer? 2771 * do we really have level trigger timer?
@@ -3009,13 +2773,9 @@ static inline void __init check_timer(void)
3009 int idx; 2773 int idx;
3010 idx = find_irq_entry(apic1, pin1, mp_INT); 2774 idx = find_irq_entry(apic1, pin1, mp_INT);
3011 if (idx != -1 && irq_trigger(idx)) 2775 if (idx != -1 && irq_trigger(idx))
3012 unmask_IO_APIC_irq_desc(desc); 2776 unmask_ioapic(cfg);
3013 } 2777 }
3014 if (timer_irq_works()) { 2778 if (timer_irq_works()) {
3015 if (nmi_watchdog == NMI_IO_APIC) {
3016 setup_nmi();
3017 legacy_pic->chip->unmask(0);
3018 }
3019 if (disable_timer_pin_1 > 0) 2779 if (disable_timer_pin_1 > 0)
3020 clear_IO_APIC_pin(0, pin1); 2780 clear_IO_APIC_pin(0, pin1);
3021 goto out; 2781 goto out;
@@ -3037,48 +2797,34 @@ static inline void __init check_timer(void)
3037 */ 2797 */
3038 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); 2798 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
3039 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2799 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
3040 legacy_pic->chip->unmask(0); 2800 legacy_pic->unmask(0);
3041 if (timer_irq_works()) { 2801 if (timer_irq_works()) {
3042 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2802 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
3043 timer_through_8259 = 1; 2803 timer_through_8259 = 1;
3044 if (nmi_watchdog == NMI_IO_APIC) {
3045 legacy_pic->chip->mask(0);
3046 setup_nmi();
3047 legacy_pic->chip->unmask(0);
3048 }
3049 goto out; 2804 goto out;
3050 } 2805 }
3051 /* 2806 /*
3052 * Cleanup, just in case ... 2807 * Cleanup, just in case ...
3053 */ 2808 */
3054 local_irq_disable(); 2809 local_irq_disable();
3055 legacy_pic->chip->mask(0); 2810 legacy_pic->mask(0);
3056 clear_IO_APIC_pin(apic2, pin2); 2811 clear_IO_APIC_pin(apic2, pin2);
3057 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 2812 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
3058 } 2813 }
3059 2814
3060 if (nmi_watchdog == NMI_IO_APIC) {
3061 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
3062 "through the IO-APIC - disabling NMI Watchdog!\n");
3063 nmi_watchdog = NMI_NONE;
3064 }
3065#ifdef CONFIG_X86_32
3066 timer_ack = 0;
3067#endif
3068
3069 apic_printk(APIC_QUIET, KERN_INFO 2815 apic_printk(APIC_QUIET, KERN_INFO
3070 "...trying to set up timer as Virtual Wire IRQ...\n"); 2816 "...trying to set up timer as Virtual Wire IRQ...\n");
3071 2817
3072 lapic_register_intr(0, desc); 2818 lapic_register_intr(0);
3073 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 2819 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
3074 legacy_pic->chip->unmask(0); 2820 legacy_pic->unmask(0);
3075 2821
3076 if (timer_irq_works()) { 2822 if (timer_irq_works()) {
3077 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 2823 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3078 goto out; 2824 goto out;
3079 } 2825 }
3080 local_irq_disable(); 2826 local_irq_disable();
3081 legacy_pic->chip->mask(0); 2827 legacy_pic->mask(0);
3082 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 2828 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
3083 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 2829 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
3084 2830
@@ -3144,7 +2890,7 @@ void __init setup_IO_APIC(void)
3144} 2890}
3145 2891
3146/* 2892/*
3147 * Called after all the initialization is done. If we didnt find any 2893 * Called after all the initialization is done. If we didn't find any
3148 * APIC bugs then we can allow the modify fast path 2894 * APIC bugs then we can allow the modify fast path
3149 */ 2895 */
3150 2896
@@ -3157,136 +2903,84 @@ static int __init io_apic_bug_finalize(void)
3157 2903
3158late_initcall(io_apic_bug_finalize); 2904late_initcall(io_apic_bug_finalize);
3159 2905
3160struct sysfs_ioapic_data { 2906static void resume_ioapic_id(int ioapic_id)
3161 struct sys_device dev;
3162 struct IO_APIC_route_entry entry[0];
3163};
3164static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
3165
3166static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
3167{ 2907{
3168 struct IO_APIC_route_entry *entry;
3169 struct sysfs_ioapic_data *data;
3170 int i;
3171
3172 data = container_of(dev, struct sysfs_ioapic_data, dev);
3173 entry = data->entry;
3174 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
3175 *entry = ioapic_read_entry(dev->id, i);
3176
3177 return 0;
3178}
3179
3180static int ioapic_resume(struct sys_device *dev)
3181{
3182 struct IO_APIC_route_entry *entry;
3183 struct sysfs_ioapic_data *data;
3184 unsigned long flags; 2908 unsigned long flags;
3185 union IO_APIC_reg_00 reg_00; 2909 union IO_APIC_reg_00 reg_00;
3186 int i;
3187 2910
3188 data = container_of(dev, struct sysfs_ioapic_data, dev);
3189 entry = data->entry;
3190 2911
3191 raw_spin_lock_irqsave(&ioapic_lock, flags); 2912 raw_spin_lock_irqsave(&ioapic_lock, flags);
3192 reg_00.raw = io_apic_read(dev->id, 0); 2913 reg_00.raw = io_apic_read(ioapic_id, 0);
3193 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { 2914 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) {
3194 reg_00.bits.ID = mp_ioapics[dev->id].apicid; 2915 reg_00.bits.ID = mpc_ioapic_id(ioapic_id);
3195 io_apic_write(dev->id, 0, reg_00.raw); 2916 io_apic_write(ioapic_id, 0, reg_00.raw);
3196 } 2917 }
3197 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2918 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3198 for (i = 0; i < nr_ioapic_registers[dev->id]; i++) 2919}
3199 ioapic_write_entry(dev->id, i, entry[i]);
3200 2920
3201 return 0; 2921static void ioapic_resume(void)
2922{
2923 int ioapic_id;
2924
2925 for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
2926 resume_ioapic_id(ioapic_id);
2927
2928 restore_ioapic_entries();
3202} 2929}
3203 2930
3204static struct sysdev_class ioapic_sysdev_class = { 2931static struct syscore_ops ioapic_syscore_ops = {
3205 .name = "ioapic", 2932 .suspend = save_ioapic_entries,
3206 .suspend = ioapic_suspend,
3207 .resume = ioapic_resume, 2933 .resume = ioapic_resume,
3208}; 2934};
3209 2935
3210static int __init ioapic_init_sysfs(void) 2936static int __init ioapic_init_ops(void)
3211{ 2937{
3212 struct sys_device * dev; 2938 register_syscore_ops(&ioapic_syscore_ops);
3213 int i, size, error;
3214
3215 error = sysdev_class_register(&ioapic_sysdev_class);
3216 if (error)
3217 return error;
3218
3219 for (i = 0; i < nr_ioapics; i++ ) {
3220 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
3221 * sizeof(struct IO_APIC_route_entry);
3222 mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
3223 if (!mp_ioapic_data[i]) {
3224 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
3225 continue;
3226 }
3227 dev = &mp_ioapic_data[i]->dev;
3228 dev->id = i;
3229 dev->cls = &ioapic_sysdev_class;
3230 error = sysdev_register(dev);
3231 if (error) {
3232 kfree(mp_ioapic_data[i]);
3233 mp_ioapic_data[i] = NULL;
3234 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
3235 continue;
3236 }
3237 }
3238 2939
3239 return 0; 2940 return 0;
3240} 2941}
3241 2942
3242device_initcall(ioapic_init_sysfs); 2943device_initcall(ioapic_init_ops);
3243 2944
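For reference, the suspend/resume path above now follows the generic syscore_ops pattern (one global hook set, invoked with only the boot CPU running and interrupts off) instead of a per-device sysdev object. A minimal sketch of that pattern, with placeholder names not taken from the patch:

        #include <linux/init.h>
        #include <linux/syscore_ops.h>

        static int example_suspend(void)
        {
                /* snapshot hardware state; must not sleep or take sleeping locks */
                return 0;
        }

        static void example_resume(void)
        {
                /* reprogram the hardware from the snapshot taken in example_suspend() */
        }

        static struct syscore_ops example_syscore_ops = {
                .suspend        = example_suspend,
                .resume         = example_resume,
        };

        static int __init example_init(void)
        {
                register_syscore_ops(&example_syscore_ops);
                return 0;
        }
        device_initcall(example_init);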
3244/* 2945/*
3245 * Dynamic irq allocate and deallocation 2946 * Dynamic irq allocate and deallocation
3246 */ 2947 */
3247unsigned int create_irq_nr(unsigned int irq_want, int node) 2948unsigned int create_irq_nr(unsigned int from, int node)
3248{ 2949{
3249 /* Allocate an unused irq */ 2950 struct irq_cfg *cfg;
3250 unsigned int irq;
3251 unsigned int new;
3252 unsigned long flags; 2951 unsigned long flags;
3253 struct irq_cfg *cfg_new = NULL; 2952 unsigned int ret = 0;
3254 struct irq_desc *desc_new = NULL; 2953 int irq;
3255
3256 irq = 0;
3257 if (irq_want < nr_irqs_gsi)
3258 irq_want = nr_irqs_gsi;
3259
3260 raw_spin_lock_irqsave(&vector_lock, flags);
3261 for (new = irq_want; new < nr_irqs; new++) {
3262 desc_new = irq_to_desc_alloc_node(new, node);
3263 if (!desc_new) {
3264 printk(KERN_INFO "can not get irq_desc for %d\n", new);
3265 continue;
3266 }
3267 cfg_new = desc_new->chip_data;
3268
3269 if (cfg_new->vector != 0)
3270 continue;
3271 2954
3272 desc_new = move_irq_desc(desc_new, node); 2955 if (from < nr_irqs_gsi)
3273 cfg_new = desc_new->chip_data; 2956 from = nr_irqs_gsi;
3274 2957
3275 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 2958 irq = alloc_irq_from(from, node);
3276 irq = new; 2959 if (irq < 0)
3277 break; 2960 return 0;
2961 cfg = alloc_irq_cfg(irq, node);
2962 if (!cfg) {
2963 free_irq_at(irq, NULL);
2964 return 0;
3278 } 2965 }
3279 raw_spin_unlock_irqrestore(&vector_lock, flags);
3280 2966
3281 if (irq > 0) 2967 raw_spin_lock_irqsave(&vector_lock, flags);
3282 dynamic_irq_init_keep_chip_data(irq); 2968 if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
2969 ret = irq;
2970 raw_spin_unlock_irqrestore(&vector_lock, flags);
3283 2971
3284 return irq; 2972 if (ret) {
2973 irq_set_chip_data(irq, cfg);
2974 irq_clear_status_flags(irq, IRQ_NOREQUEST);
2975 } else {
2976 free_irq_at(irq, cfg);
2977 }
2978 return ret;
3285} 2979}
3286 2980
3287int create_irq(void) 2981int create_irq(void)
3288{ 2982{
3289 int node = cpu_to_node(boot_cpu_id); 2983 int node = cpu_to_node(0);
3290 unsigned int irq_want; 2984 unsigned int irq_want;
3291 int irq; 2985 int irq;
3292 2986
@@ -3301,14 +2995,17 @@ int create_irq(void)
3301 2995
3302void destroy_irq(unsigned int irq) 2996void destroy_irq(unsigned int irq)
3303{ 2997{
2998 struct irq_cfg *cfg = irq_get_chip_data(irq);
3304 unsigned long flags; 2999 unsigned long flags;
3305 3000
3306 dynamic_irq_cleanup_keep_chip_data(irq); 3001 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
3307 3002
3308 free_irte(irq); 3003 if (irq_remapped(cfg))
3004 free_irte(irq);
3309 raw_spin_lock_irqsave(&vector_lock, flags); 3005 raw_spin_lock_irqsave(&vector_lock, flags);
3310 __clear_irq_vector(irq, get_irq_chip_data(irq)); 3006 __clear_irq_vector(irq, cfg);
3311 raw_spin_unlock_irqrestore(&vector_lock, flags); 3007 raw_spin_unlock_irqrestore(&vector_lock, flags);
3008 free_irq_at(irq, cfg);
3312} 3009}
3313 3010
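The create_irq()/destroy_irq() pair above is the dynamic IRQ allocation API introduced earlier in this file; a hypothetical caller pairs them like this (assuming the usual convention that a negative return from create_irq() means no free vector was available):

        static int example_grab_irq(void)
        {
                int irq = create_irq();         /* allocate an irq number plus vector */

                if (irq < 0)
                        return irq;             /* allocation failed */

                /* ... request_irq(), program the interrupt source, use it ... */

                destroy_irq(irq);               /* clear the vector, free the irq again */
                return 0;
        }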
3314/* 3011/*
@@ -3332,7 +3029,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3332 3029
3333 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3030 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3334 3031
3335 if (irq_remapped(irq)) { 3032 if (irq_remapped(cfg)) {
3336 struct irte irte; 3033 struct irte irte;
3337 int ir_index; 3034 int ir_index;
3338 u16 sub_handle; 3035 u16 sub_handle;
@@ -3340,14 +3037,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3340 ir_index = map_irq_to_irte_handle(irq, &sub_handle); 3037 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
3341 BUG_ON(ir_index == -1); 3038 BUG_ON(ir_index == -1);
3342 3039
3343 memset (&irte, 0, sizeof(irte)); 3040 prepare_irte(&irte, cfg->vector, dest);
3344
3345 irte.present = 1;
3346 irte.dst_mode = apic->irq_dest_mode;
3347 irte.trigger_mode = 0; /* edge */
3348 irte.dlvry_mode = apic->irq_delivery_mode;
3349 irte.vector = cfg->vector;
3350 irte.dest_id = IRTE_DEST(dest);
3351 3041
3352 /* Set source-id of interrupt request */ 3042 /* Set source-id of interrupt request */
3353 if (pdev) 3043 if (pdev)
@@ -3392,26 +3082,24 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3392} 3082}
3393 3083
3394#ifdef CONFIG_SMP 3084#ifdef CONFIG_SMP
3395static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3085static int
3086msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3396{ 3087{
3397 struct irq_desc *desc = irq_to_desc(irq); 3088 struct irq_cfg *cfg = data->chip_data;
3398 struct irq_cfg *cfg;
3399 struct msi_msg msg; 3089 struct msi_msg msg;
3400 unsigned int dest; 3090 unsigned int dest;
3401 3091
3402 if (set_desc_affinity(desc, mask, &dest)) 3092 if (__ioapic_set_affinity(data, mask, &dest))
3403 return -1; 3093 return -1;
3404 3094
3405 cfg = desc->chip_data; 3095 __get_cached_msi_msg(data->msi_desc, &msg);
3406
3407 get_cached_msi_msg_desc(desc, &msg);
3408 3096
3409 msg.data &= ~MSI_DATA_VECTOR_MASK; 3097 msg.data &= ~MSI_DATA_VECTOR_MASK;
3410 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3098 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3411 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3099 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3412 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3100 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3413 3101
3414 write_msi_msg_desc(desc, &msg); 3102 __write_msi_msg(data->msi_desc, &msg);
3415 3103
3416 return 0; 3104 return 0;
3417} 3105}
@@ -3421,17 +3109,17 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3421 * done in the process context using interrupt-remapping hardware. 3109 * done in the process context using interrupt-remapping hardware.
3422 */ 3110 */
3423static int 3111static int
3424ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3112ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3113 bool force)
3425{ 3114{
3426 struct irq_desc *desc = irq_to_desc(irq); 3115 struct irq_cfg *cfg = data->chip_data;
3427 struct irq_cfg *cfg = desc->chip_data; 3116 unsigned int dest, irq = data->irq;
3428 unsigned int dest;
3429 struct irte irte; 3117 struct irte irte;
3430 3118
3431 if (get_irte(irq, &irte)) 3119 if (get_irte(irq, &irte))
3432 return -1; 3120 return -1;
3433 3121
3434 if (set_desc_affinity(desc, mask, &dest)) 3122 if (__ioapic_set_affinity(data, mask, &dest))
3435 return -1; 3123 return -1;
3436 3124
3437 irte.vector = cfg->vector; 3125 irte.vector = cfg->vector;
@@ -3461,27 +3149,27 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3461 * which implement the MSI or MSI-X Capability Structure. 3149 * which implement the MSI or MSI-X Capability Structure.
3462 */ 3150 */
3463static struct irq_chip msi_chip = { 3151static struct irq_chip msi_chip = {
3464 .name = "PCI-MSI", 3152 .name = "PCI-MSI",
3465 .unmask = unmask_msi_irq, 3153 .irq_unmask = unmask_msi_irq,
3466 .mask = mask_msi_irq, 3154 .irq_mask = mask_msi_irq,
3467 .ack = ack_apic_edge, 3155 .irq_ack = ack_apic_edge,
3468#ifdef CONFIG_SMP 3156#ifdef CONFIG_SMP
3469 .set_affinity = set_msi_irq_affinity, 3157 .irq_set_affinity = msi_set_affinity,
3470#endif 3158#endif
3471 .retrigger = ioapic_retrigger_irq, 3159 .irq_retrigger = ioapic_retrigger_irq,
3472}; 3160};
3473 3161
3474static struct irq_chip msi_ir_chip = { 3162static struct irq_chip msi_ir_chip = {
3475 .name = "IR-PCI-MSI", 3163 .name = "IR-PCI-MSI",
3476 .unmask = unmask_msi_irq, 3164 .irq_unmask = unmask_msi_irq,
3477 .mask = mask_msi_irq, 3165 .irq_mask = mask_msi_irq,
3478#ifdef CONFIG_INTR_REMAP 3166#ifdef CONFIG_INTR_REMAP
3479 .ack = ir_ack_apic_edge, 3167 .irq_ack = ir_ack_apic_edge,
3480#ifdef CONFIG_SMP 3168#ifdef CONFIG_SMP
3481 .set_affinity = ir_set_msi_irq_affinity, 3169 .irq_set_affinity = ir_msi_set_affinity,
3482#endif 3170#endif
3483#endif 3171#endif
3484 .retrigger = ioapic_retrigger_irq, 3172 .irq_retrigger = ioapic_retrigger_irq,
3485}; 3173};
3486 3174
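The two chips above illustrate the irq_chip conversion applied throughout this patch: the callbacks gain an irq_ prefix and take a struct irq_data instead of a bare IRQ number, so per-IRQ state is read from data->chip_data rather than looked up through irq_to_desc(). A stripped-down sketch of the new-style chip, with illustrative names:

        static void example_irq_mask(struct irq_data *data)
        {
                struct irq_cfg *cfg = data->chip_data; /* installed via irq_set_chip_data() */

                pr_debug("masking irq %u (vector %u)\n", data->irq, cfg->vector);
                /* ... actually mask the interrupt source here ... */
        }

        static int example_irq_set_affinity(struct irq_data *data,
                                            const struct cpumask *mask, bool force)
        {
                /* pick a destination CPU from 'mask', reprogram the route, 0 on success */
                return 0;
        }

        static struct irq_chip example_chip = {
                .name                   = "EXAMPLE",
                .irq_mask               = example_irq_mask,
                .irq_set_affinity       = example_irq_set_affinity,
        };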
3487/* 3175/*
@@ -3513,40 +3201,35 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3513 3201
3514static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3202static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3515{ 3203{
3516 int ret; 3204 struct irq_chip *chip = &msi_chip;
3517 struct msi_msg msg; 3205 struct msi_msg msg;
3206 int ret;
3518 3207
3519 ret = msi_compose_msg(dev, irq, &msg, -1); 3208 ret = msi_compose_msg(dev, irq, &msg, -1);
3520 if (ret < 0) 3209 if (ret < 0)
3521 return ret; 3210 return ret;
3522 3211
3523 set_irq_msi(irq, msidesc); 3212 irq_set_msi_desc(irq, msidesc);
3524 write_msi_msg(irq, &msg); 3213 write_msi_msg(irq, &msg);
3525 3214
3526 if (irq_remapped(irq)) { 3215 if (irq_remapped(irq_get_chip_data(irq))) {
3527 struct irq_desc *desc = irq_to_desc(irq); 3216 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3528 /* 3217 chip = &msi_ir_chip;
3529 * irq migration in process context 3218 }
3530 */ 3219
3531 desc->status |= IRQ_MOVE_PCNTXT; 3220 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3532 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3533 } else
3534 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
3535 3221
3536 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); 3222 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
3537 3223
3538 return 0; 3224 return 0;
3539} 3225}
3540 3226
3541int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 3227int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3542{ 3228{
3543 unsigned int irq; 3229 int node, ret, sub_handle, index = 0;
3544 int ret, sub_handle; 3230 unsigned int irq, irq_want;
3545 struct msi_desc *msidesc; 3231 struct msi_desc *msidesc;
3546 unsigned int irq_want;
3547 struct intel_iommu *iommu = NULL; 3232 struct intel_iommu *iommu = NULL;
3548 int index = 0;
3549 int node;
3550 3233
3551 /* x86 doesn't support multiple MSI yet */ 3234 /* x86 doesn't support multiple MSI yet */
3552 if (type == PCI_CAP_ID_MSI && nvec > 1) 3235 if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3599,31 +3282,31 @@ error:
3599 return ret; 3282 return ret;
3600} 3283}
3601 3284
3602void arch_teardown_msi_irq(unsigned int irq) 3285void native_teardown_msi_irq(unsigned int irq)
3603{ 3286{
3604 destroy_irq(irq); 3287 destroy_irq(irq);
3605} 3288}
3606 3289
3607#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) 3290#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3608#ifdef CONFIG_SMP 3291#ifdef CONFIG_SMP
3609static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3292static int
3293dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3294 bool force)
3610{ 3295{
3611 struct irq_desc *desc = irq_to_desc(irq); 3296 struct irq_cfg *cfg = data->chip_data;
3612 struct irq_cfg *cfg; 3297 unsigned int dest, irq = data->irq;
3613 struct msi_msg msg; 3298 struct msi_msg msg;
3614 unsigned int dest;
3615 3299
3616 if (set_desc_affinity(desc, mask, &dest)) 3300 if (__ioapic_set_affinity(data, mask, &dest))
3617 return -1; 3301 return -1;
3618 3302
3619 cfg = desc->chip_data;
3620
3621 dmar_msi_read(irq, &msg); 3303 dmar_msi_read(irq, &msg);
3622 3304
3623 msg.data &= ~MSI_DATA_VECTOR_MASK; 3305 msg.data &= ~MSI_DATA_VECTOR_MASK;
3624 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3306 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3625 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3307 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3626 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3308 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3309 msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
3627 3310
3628 dmar_msi_write(irq, &msg); 3311 dmar_msi_write(irq, &msg);
3629 3312
@@ -3633,14 +3316,14 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3633#endif /* CONFIG_SMP */ 3316#endif /* CONFIG_SMP */
3634 3317
3635static struct irq_chip dmar_msi_type = { 3318static struct irq_chip dmar_msi_type = {
3636 .name = "DMAR_MSI", 3319 .name = "DMAR_MSI",
3637 .unmask = dmar_msi_unmask, 3320 .irq_unmask = dmar_msi_unmask,
3638 .mask = dmar_msi_mask, 3321 .irq_mask = dmar_msi_mask,
3639 .ack = ack_apic_edge, 3322 .irq_ack = ack_apic_edge,
3640#ifdef CONFIG_SMP 3323#ifdef CONFIG_SMP
3641 .set_affinity = dmar_msi_set_affinity, 3324 .irq_set_affinity = dmar_msi_set_affinity,
3642#endif 3325#endif
3643 .retrigger = ioapic_retrigger_irq, 3326 .irq_retrigger = ioapic_retrigger_irq,
3644}; 3327};
3645 3328
3646int arch_setup_dmar_msi(unsigned int irq) 3329int arch_setup_dmar_msi(unsigned int irq)
@@ -3652,8 +3335,8 @@ int arch_setup_dmar_msi(unsigned int irq)
3652 if (ret < 0) 3335 if (ret < 0)
3653 return ret; 3336 return ret;
3654 dmar_msi_write(irq, &msg); 3337 dmar_msi_write(irq, &msg);
3655 set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, 3338 irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
3656 "edge"); 3339 "edge");
3657 return 0; 3340 return 0;
3658} 3341}
3659#endif 3342#endif
@@ -3661,26 +3344,24 @@ int arch_setup_dmar_msi(unsigned int irq)
3661#ifdef CONFIG_HPET_TIMER 3344#ifdef CONFIG_HPET_TIMER
3662 3345
3663#ifdef CONFIG_SMP 3346#ifdef CONFIG_SMP
3664static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3347static int hpet_msi_set_affinity(struct irq_data *data,
3348 const struct cpumask *mask, bool force)
3665{ 3349{
3666 struct irq_desc *desc = irq_to_desc(irq); 3350 struct irq_cfg *cfg = data->chip_data;
3667 struct irq_cfg *cfg;
3668 struct msi_msg msg; 3351 struct msi_msg msg;
3669 unsigned int dest; 3352 unsigned int dest;
3670 3353
3671 if (set_desc_affinity(desc, mask, &dest)) 3354 if (__ioapic_set_affinity(data, mask, &dest))
3672 return -1; 3355 return -1;
3673 3356
3674 cfg = desc->chip_data; 3357 hpet_msi_read(data->handler_data, &msg);
3675
3676 hpet_msi_read(irq, &msg);
3677 3358
3678 msg.data &= ~MSI_DATA_VECTOR_MASK; 3359 msg.data &= ~MSI_DATA_VECTOR_MASK;
3679 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3360 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3680 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3361 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3681 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3362 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3682 3363
3683 hpet_msi_write(irq, &msg); 3364 hpet_msi_write(data->handler_data, &msg);
3684 3365
3685 return 0; 3366 return 0;
3686} 3367}
@@ -3688,34 +3369,34 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3688#endif /* CONFIG_SMP */ 3369#endif /* CONFIG_SMP */
3689 3370
3690static struct irq_chip ir_hpet_msi_type = { 3371static struct irq_chip ir_hpet_msi_type = {
3691 .name = "IR-HPET_MSI", 3372 .name = "IR-HPET_MSI",
3692 .unmask = hpet_msi_unmask, 3373 .irq_unmask = hpet_msi_unmask,
3693 .mask = hpet_msi_mask, 3374 .irq_mask = hpet_msi_mask,
3694#ifdef CONFIG_INTR_REMAP 3375#ifdef CONFIG_INTR_REMAP
3695 .ack = ir_ack_apic_edge, 3376 .irq_ack = ir_ack_apic_edge,
3696#ifdef CONFIG_SMP 3377#ifdef CONFIG_SMP
3697 .set_affinity = ir_set_msi_irq_affinity, 3378 .irq_set_affinity = ir_msi_set_affinity,
3698#endif 3379#endif
3699#endif 3380#endif
3700 .retrigger = ioapic_retrigger_irq, 3381 .irq_retrigger = ioapic_retrigger_irq,
3701}; 3382};
3702 3383
3703static struct irq_chip hpet_msi_type = { 3384static struct irq_chip hpet_msi_type = {
3704 .name = "HPET_MSI", 3385 .name = "HPET_MSI",
3705 .unmask = hpet_msi_unmask, 3386 .irq_unmask = hpet_msi_unmask,
3706 .mask = hpet_msi_mask, 3387 .irq_mask = hpet_msi_mask,
3707 .ack = ack_apic_edge, 3388 .irq_ack = ack_apic_edge,
3708#ifdef CONFIG_SMP 3389#ifdef CONFIG_SMP
3709 .set_affinity = hpet_msi_set_affinity, 3390 .irq_set_affinity = hpet_msi_set_affinity,
3710#endif 3391#endif
3711 .retrigger = ioapic_retrigger_irq, 3392 .irq_retrigger = ioapic_retrigger_irq,
3712}; 3393};
3713 3394
3714int arch_setup_hpet_msi(unsigned int irq, unsigned int id) 3395int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3715{ 3396{
3716 int ret; 3397 struct irq_chip *chip = &hpet_msi_type;
3717 struct msi_msg msg; 3398 struct msi_msg msg;
3718 struct irq_desc *desc = irq_to_desc(irq); 3399 int ret;
3719 3400
3720 if (intr_remapping_enabled) { 3401 if (intr_remapping_enabled) {
3721 struct intel_iommu *iommu = map_hpet_to_ir(id); 3402 struct intel_iommu *iommu = map_hpet_to_ir(id);
@@ -3733,15 +3414,12 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3733 if (ret < 0) 3414 if (ret < 0)
3734 return ret; 3415 return ret;
3735 3416
3736 hpet_msi_write(irq, &msg); 3417 hpet_msi_write(irq_get_handler_data(irq), &msg);
3737 desc->status |= IRQ_MOVE_PCNTXT; 3418 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3738 if (irq_remapped(irq)) 3419 if (irq_remapped(irq_get_chip_data(irq)))
3739 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, 3420 chip = &ir_hpet_msi_type;
3740 handle_edge_irq, "edge");
3741 else
3742 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3743 handle_edge_irq, "edge");
3744 3421
3422 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3745 return 0; 3423 return 0;
3746} 3424}
3747#endif 3425#endif
@@ -3768,33 +3446,30 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3768 write_ht_irq_msg(irq, &msg); 3446 write_ht_irq_msg(irq, &msg);
3769} 3447}
3770 3448
3771static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) 3449static int
3450ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3772{ 3451{
3773 struct irq_desc *desc = irq_to_desc(irq); 3452 struct irq_cfg *cfg = data->chip_data;
3774 struct irq_cfg *cfg;
3775 unsigned int dest; 3453 unsigned int dest;
3776 3454
3777 if (set_desc_affinity(desc, mask, &dest)) 3455 if (__ioapic_set_affinity(data, mask, &dest))
3778 return -1; 3456 return -1;
3779 3457
3780 cfg = desc->chip_data; 3458 target_ht_irq(data->irq, dest, cfg->vector);
3781
3782 target_ht_irq(irq, dest, cfg->vector);
3783
3784 return 0; 3459 return 0;
3785} 3460}
3786 3461
3787#endif 3462#endif
3788 3463
3789static struct irq_chip ht_irq_chip = { 3464static struct irq_chip ht_irq_chip = {
3790 .name = "PCI-HT", 3465 .name = "PCI-HT",
3791 .mask = mask_ht_irq, 3466 .irq_mask = mask_ht_irq,
3792 .unmask = unmask_ht_irq, 3467 .irq_unmask = unmask_ht_irq,
3793 .ack = ack_apic_edge, 3468 .irq_ack = ack_apic_edge,
3794#ifdef CONFIG_SMP 3469#ifdef CONFIG_SMP
3795 .set_affinity = set_ht_irq_affinity, 3470 .irq_set_affinity = ht_set_affinity,
3796#endif 3471#endif
3797 .retrigger = ioapic_retrigger_irq, 3472 .irq_retrigger = ioapic_retrigger_irq,
3798}; 3473};
3799 3474
3800int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3475int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
@@ -3831,7 +3506,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3831 3506
3832 write_ht_irq_msg(irq, &msg); 3507 write_ht_irq_msg(irq, &msg);
3833 3508
3834 set_irq_chip_and_handler_name(irq, &ht_irq_chip, 3509 irq_set_chip_and_handler_name(irq, &ht_irq_chip,
3835 handle_edge_irq, "edge"); 3510 handle_edge_irq, "edge");
3836 3511
3837 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); 3512 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
@@ -3840,7 +3515,40 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3840} 3515}
3841#endif /* CONFIG_HT_IRQ */ 3516#endif /* CONFIG_HT_IRQ */
3842 3517
3843int __init io_apic_get_redir_entries (int ioapic) 3518static int
3519io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
3520{
3521 struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
3522 int ret;
3523
3524 if (!cfg)
3525 return -EINVAL;
3526 ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
3527 if (!ret)
3528 setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg,
3529 attr->trigger, attr->polarity);
3530 return ret;
3531}
3532
3533int io_apic_setup_irq_pin_once(unsigned int irq, int node,
3534 struct io_apic_irq_attr *attr)
3535{
3536 unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
3537 int ret;
3538
3539 /* Avoid redundant programming */
3540 if (test_bit(pin, ioapics[id].pin_programmed)) {
3541 pr_debug("Pin %d-%d already programmed\n",
3542 mpc_ioapic_id(id), pin);
3543 return 0;
3544 }
3545 ret = io_apic_setup_irq_pin(irq, node, attr);
3546 if (!ret)
3547 set_bit(pin, ioapics[id].pin_programmed);
3548 return ret;
3549}
3550
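io_apic_setup_irq_pin_once() above funnels pin programming through one place and uses the per-IO-APIC pin_programmed bitmap so that redundant PRT entries (same pin, different PCI device) do not reprogram a pin twice. A hedged sketch of a call site; the attribute values are illustrative and use the IO-APIC encoding (0 = edge trigger, 0 = active-high polarity):

        static int example_route_pin(unsigned int irq, int node, int ioapic, int pin)
        {
                struct io_apic_irq_attr attr = {
                        .ioapic         = ioapic,
                        .ioapic_pin     = pin,
                        .trigger        = 0,    /* edge */
                        .polarity       = 0,    /* active high */
                };

                /* safe to call repeatedly: already-programmed pins are skipped */
                return io_apic_setup_irq_pin_once(irq, node, &attr);
        }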
3551static int __init io_apic_get_redir_entries(int ioapic)
3844{ 3552{
3845 union IO_APIC_reg_01 reg_01; 3553 union IO_APIC_reg_01 reg_01;
3846 unsigned long flags; 3554 unsigned long flags;
@@ -3856,7 +3564,7 @@ int __init io_apic_get_redir_entries (int ioapic)
3856 return reg_01.bits.entries + 1; 3564 return reg_01.bits.entries + 1;
3857} 3565}
3858 3566
3859void __init probe_nr_irqs_gsi(void) 3567static void __init probe_nr_irqs_gsi(void)
3860{ 3568{
3861 int nr; 3569 int nr;
3862 3570
@@ -3867,6 +3575,11 @@ void __init probe_nr_irqs_gsi(void)
3867 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); 3575 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3868} 3576}
3869 3577
3578int get_nr_irqs_gsi(void)
3579{
3580 return nr_irqs_gsi;
3581}
3582
3870#ifdef CONFIG_SPARSE_IRQ 3583#ifdef CONFIG_SPARSE_IRQ
3871int __init arch_probe_nr_irqs(void) 3584int __init arch_probe_nr_irqs(void)
3872{ 3585{
@@ -3885,104 +3598,28 @@ int __init arch_probe_nr_irqs(void)
3885 if (nr < nr_irqs) 3598 if (nr < nr_irqs)
3886 nr_irqs = nr; 3599 nr_irqs = nr;
3887 3600
3888 return 0; 3601 return NR_IRQS_LEGACY;
3889} 3602}
3890#endif 3603#endif
3891 3604
3892static int __io_apic_set_pci_routing(struct device *dev, int irq, 3605int io_apic_set_pci_routing(struct device *dev, int irq,
3893 struct io_apic_irq_attr *irq_attr) 3606 struct io_apic_irq_attr *irq_attr)
3894{ 3607{
3895 struct irq_desc *desc;
3896 struct irq_cfg *cfg;
3897 int node; 3608 int node;
3898 int ioapic, pin;
3899 int trigger, polarity;
3900 3609
3901 ioapic = irq_attr->ioapic;
3902 if (!IO_APIC_IRQ(irq)) { 3610 if (!IO_APIC_IRQ(irq)) {
3903 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3611 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3904 ioapic); 3612 irq_attr->ioapic);
3905 return -EINVAL; 3613 return -EINVAL;
3906 } 3614 }
3907 3615
3908 if (dev) 3616 node = dev ? dev_to_node(dev) : cpu_to_node(0);
3909 node = dev_to_node(dev);
3910 else
3911 node = cpu_to_node(boot_cpu_id);
3912
3913 desc = irq_to_desc_alloc_node(irq, node);
3914 if (!desc) {
3915 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3916 return 0;
3917 }
3918
3919 pin = irq_attr->ioapic_pin;
3920 trigger = irq_attr->trigger;
3921 polarity = irq_attr->polarity;
3922 3617
3923 /* 3618 return io_apic_setup_irq_pin_once(irq, node, irq_attr);
3924 * IRQs < 16 are already in the irq_2_pin[] map
3925 */
3926 if (irq >= legacy_pic->nr_legacy_irqs) {
3927 cfg = desc->chip_data;
3928 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3929 printk(KERN_INFO "can not add pin %d for irq %d\n",
3930 pin, irq);
3931 return 0;
3932 }
3933 }
3934
3935 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
3936
3937 return 0;
3938}
3939
3940int io_apic_set_pci_routing(struct device *dev, int irq,
3941 struct io_apic_irq_attr *irq_attr)
3942{
3943 int ioapic, pin;
3944 /*
3945 * Avoid pin reprogramming. PRTs typically include entries
3946 * with redundant pin->gsi mappings (but unique PCI devices);
3947 * we only program the IOAPIC on the first.
3948 */
3949 ioapic = irq_attr->ioapic;
3950 pin = irq_attr->ioapic_pin;
3951 if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3952 pr_debug("Pin %d-%d already programmed\n",
3953 mp_ioapics[ioapic].apicid, pin);
3954 return 0;
3955 }
3956 set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
3957
3958 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3959}
3960
3961u8 __init io_apic_unique_id(u8 id)
3962{
3963#ifdef CONFIG_X86_32
3964 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3965 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3966 return io_apic_get_unique_id(nr_ioapics, id);
3967 else
3968 return id;
3969#else
3970 int i;
3971 DECLARE_BITMAP(used, 256);
3972
3973 bitmap_zero(used, 256);
3974 for (i = 0; i < nr_ioapics; i++) {
3975 struct mpc_ioapic *ia = &mp_ioapics[i];
3976 __set_bit(ia->apicid, used);
3977 }
3978 if (!test_bit(id, used))
3979 return id;
3980 return find_first_zero_bit(used, 256);
3981#endif
3982} 3619}
3983 3620
3984#ifdef CONFIG_X86_32 3621#ifdef CONFIG_X86_32
3985int __init io_apic_get_unique_id(int ioapic, int apic_id) 3622static int __init io_apic_get_unique_id(int ioapic, int apic_id)
3986{ 3623{
3987 union IO_APIC_reg_00 reg_00; 3624 union IO_APIC_reg_00 reg_00;
3988 static physid_mask_t apic_id_map = PHYSID_MASK_NONE; 3625 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -4055,9 +3692,32 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
4055 3692
4056 return apic_id; 3693 return apic_id;
4057} 3694}
3695
3696static u8 __init io_apic_unique_id(u8 id)
3697{
3698 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3699 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3700 return io_apic_get_unique_id(nr_ioapics, id);
3701 else
3702 return id;
3703}
3704#else
3705static u8 __init io_apic_unique_id(u8 id)
3706{
3707 int i;
3708 DECLARE_BITMAP(used, 256);
3709
3710 bitmap_zero(used, 256);
3711 for (i = 0; i < nr_ioapics; i++) {
3712 __set_bit(mpc_ioapic_id(i), used);
3713 }
3714 if (!test_bit(id, used))
3715 return id;
3716 return find_first_zero_bit(used, 256);
3717}
4058#endif 3718#endif
4059 3719
4060int __init io_apic_get_version(int ioapic) 3720static int __init io_apic_get_version(int ioapic)
4061{ 3721{
4062 union IO_APIC_reg_01 reg_01; 3722 union IO_APIC_reg_01 reg_01;
4063 unsigned long flags; 3723 unsigned long flags;
@@ -4102,14 +3762,14 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
4102void __init setup_ioapic_dest(void) 3762void __init setup_ioapic_dest(void)
4103{ 3763{
4104 int pin, ioapic, irq, irq_entry; 3764 int pin, ioapic, irq, irq_entry;
4105 struct irq_desc *desc;
4106 const struct cpumask *mask; 3765 const struct cpumask *mask;
3766 struct irq_data *idata;
4107 3767
4108 if (skip_ioapic_setup == 1) 3768 if (skip_ioapic_setup == 1)
4109 return; 3769 return;
4110 3770
4111 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) 3771 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
4112 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 3772 for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
4113 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 3773 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4114 if (irq_entry == -1) 3774 if (irq_entry == -1)
4115 continue; 3775 continue;
@@ -4118,21 +3778,20 @@ void __init setup_ioapic_dest(void)
4118 if ((ioapic > 0) && (irq > 16)) 3778 if ((ioapic > 0) && (irq > 16))
4119 continue; 3779 continue;
4120 3780
4121 desc = irq_to_desc(irq); 3781 idata = irq_get_irq_data(irq);
4122 3782
4123 /* 3783 /*
4124 * Honour affinities which have been set in early boot 3784 * Honour affinities which have been set in early boot
4125 */ 3785 */
4126 if (desc->status & 3786 if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
4127 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 3787 mask = idata->affinity;
4128 mask = desc->affinity;
4129 else 3788 else
4130 mask = apic->target_cpus(); 3789 mask = apic->target_cpus();
4131 3790
4132 if (intr_remapping_enabled) 3791 if (intr_remapping_enabled)
4133 set_ir_ioapic_affinity_irq_desc(desc, mask); 3792 ir_ioapic_set_affinity(idata, mask, false);
4134 else 3793 else
4135 set_ioapic_affinity_irq_desc(desc, mask); 3794 ioapic_set_affinity(idata, mask, false);
4136 } 3795 }
4137 3796
4138} 3797}
@@ -4172,7 +3831,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4172 return res; 3831 return res;
4173} 3832}
4174 3833
4175void __init ioapic_init_mappings(void) 3834void __init ioapic_and_gsi_init(void)
4176{ 3835{
4177 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; 3836 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
4178 struct resource *ioapic_res; 3837 struct resource *ioapic_res;
@@ -4181,7 +3840,7 @@ void __init ioapic_init_mappings(void)
4181 ioapic_res = ioapic_setup_resources(nr_ioapics); 3840 ioapic_res = ioapic_setup_resources(nr_ioapics);
4182 for (i = 0; i < nr_ioapics; i++) { 3841 for (i = 0; i < nr_ioapics; i++) {
4183 if (smp_found_config) { 3842 if (smp_found_config) {
4184 ioapic_phys = mp_ioapics[i].apicaddr; 3843 ioapic_phys = mpc_ioapic_addr(i);
4185#ifdef CONFIG_X86_32 3844#ifdef CONFIG_X86_32
4186 if (!ioapic_phys) { 3845 if (!ioapic_phys) {
4187 printk(KERN_ERR 3846 printk(KERN_ERR
@@ -4210,6 +3869,8 @@ fake_ioapic_page:
4210 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; 3869 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
4211 ioapic_res++; 3870 ioapic_res++;
4212 } 3871 }
3872
3873 probe_nr_irqs_gsi();
4213} 3874}
4214 3875
4215void __init ioapic_insert_resources(void) 3876void __init ioapic_insert_resources(void)
@@ -4234,10 +3895,14 @@ int mp_find_ioapic(u32 gsi)
4234{ 3895{
4235 int i = 0; 3896 int i = 0;
4236 3897
3898 if (nr_ioapics == 0)
3899 return -1;
3900
4237 /* Find the IOAPIC that manages this GSI. */ 3901 /* Find the IOAPIC that manages this GSI. */
4238 for (i = 0; i < nr_ioapics; i++) { 3902 for (i = 0; i < nr_ioapics; i++) {
4239 if ((gsi >= mp_gsi_routing[i].gsi_base) 3903 struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
4240 && (gsi <= mp_gsi_routing[i].gsi_end)) 3904 if ((gsi >= gsi_cfg->gsi_base)
3905 && (gsi <= gsi_cfg->gsi_end))
4241 return i; 3906 return i;
4242 } 3907 }
4243 3908
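mp_find_ioapic() and mp_find_ioapic_pin() together map a GSI to an (IO-APIC index, pin) pair using the gsi_base/gsi_end ranges registered for each IO-APIC at boot. A small illustrative helper, not part of the patch:

        static int example_gsi_to_ioapic_pin(u32 gsi, int *pin)
        {
                int ioapic = mp_find_ioapic(gsi);

                if (ioapic < 0)                 /* no IO-APIC covers this GSI */
                        return -ENODEV;

                *pin = mp_find_ioapic_pin(ioapic, gsi); /* offset from that IO-APIC's gsi_base */
                return ioapic;
        }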
@@ -4247,18 +3912,22 @@ int mp_find_ioapic(u32 gsi)
4247 3912
4248int mp_find_ioapic_pin(int ioapic, u32 gsi) 3913int mp_find_ioapic_pin(int ioapic, u32 gsi)
4249{ 3914{
3915 struct mp_ioapic_gsi *gsi_cfg;
3916
4250 if (WARN_ON(ioapic == -1)) 3917 if (WARN_ON(ioapic == -1))
4251 return -1; 3918 return -1;
4252 if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) 3919
3920 gsi_cfg = mp_ioapic_gsi_routing(ioapic);
3921 if (WARN_ON(gsi > gsi_cfg->gsi_end))
4253 return -1; 3922 return -1;
4254 3923
4255 return gsi - mp_gsi_routing[ioapic].gsi_base; 3924 return gsi - gsi_cfg->gsi_base;
4256} 3925}
4257 3926
4258static int bad_ioapic(unsigned long address) 3927static __init int bad_ioapic(unsigned long address)
4259{ 3928{
4260 if (nr_ioapics >= MAX_IO_APICS) { 3929 if (nr_ioapics >= MAX_IO_APICS) {
4261 printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " 3930 printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
4262 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); 3931 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
4263 return 1; 3932 return 1;
4264 } 3933 }
@@ -4274,40 +3943,42 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4274{ 3943{
4275 int idx = 0; 3944 int idx = 0;
4276 int entries; 3945 int entries;
3946 struct mp_ioapic_gsi *gsi_cfg;
4277 3947
4278 if (bad_ioapic(address)) 3948 if (bad_ioapic(address))
4279 return; 3949 return;
4280 3950
4281 idx = nr_ioapics; 3951 idx = nr_ioapics;
4282 3952
4283 mp_ioapics[idx].type = MP_IOAPIC; 3953 ioapics[idx].mp_config.type = MP_IOAPIC;
4284 mp_ioapics[idx].flags = MPC_APIC_USABLE; 3954 ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
4285 mp_ioapics[idx].apicaddr = address; 3955 ioapics[idx].mp_config.apicaddr = address;
4286 3956
4287 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 3957 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
4288 mp_ioapics[idx].apicid = io_apic_unique_id(id); 3958 ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
4289 mp_ioapics[idx].apicver = io_apic_get_version(idx); 3959 ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
4290 3960
4291 /* 3961 /*
4292 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 3962 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4293 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 3963 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4294 */ 3964 */
4295 entries = io_apic_get_redir_entries(idx); 3965 entries = io_apic_get_redir_entries(idx);
4296 mp_gsi_routing[idx].gsi_base = gsi_base; 3966 gsi_cfg = mp_ioapic_gsi_routing(idx);
4297 mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1; 3967 gsi_cfg->gsi_base = gsi_base;
3968 gsi_cfg->gsi_end = gsi_base + entries - 1;
4298 3969
4299 /* 3970 /*
4300 * The number of IO-APIC IRQ registers (== #pins): 3971 * The number of IO-APIC IRQ registers (== #pins):
4301 */ 3972 */
4302 nr_ioapic_registers[idx] = entries; 3973 ioapics[idx].nr_registers = entries;
4303 3974
4304 if (mp_gsi_routing[idx].gsi_end >= gsi_top) 3975 if (gsi_cfg->gsi_end >= gsi_top)
4305 gsi_top = mp_gsi_routing[idx].gsi_end + 1; 3976 gsi_top = gsi_cfg->gsi_end + 1;
4306 3977
4307 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 3978 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4308 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, 3979 "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
4309 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, 3980 mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
4310 mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); 3981 gsi_cfg->gsi_base, gsi_cfg->gsi_end);
4311 3982
4312 nr_ioapics++; 3983 nr_ioapics++;
4313} 3984}
@@ -4315,20 +3986,16 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4315/* Enable IOAPIC early just for system timer */ 3986/* Enable IOAPIC early just for system timer */
4316void __init pre_init_apic_IRQ0(void) 3987void __init pre_init_apic_IRQ0(void)
4317{ 3988{
4318 struct irq_cfg *cfg; 3989 struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
4319 struct irq_desc *desc;
4320 3990
4321 printk(KERN_INFO "Early APIC setup for system timer0\n"); 3991 printk(KERN_INFO "Early APIC setup for system timer0\n");
4322#ifndef CONFIG_SMP 3992#ifndef CONFIG_SMP
4323 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 3993 physid_set_mask_of_physid(boot_cpu_physical_apicid,
3994 &phys_cpu_present_map);
4324#endif 3995#endif
4325 desc = irq_to_desc_alloc_node(0, 0);
4326
4327 setup_local_APIC(); 3996 setup_local_APIC();
4328 3997
4329 cfg = irq_cfg(0); 3998 io_apic_setup_irq_pin(0, 0, &attr);
4330 add_pin_to_irq_node(cfg, 0, 0, 0); 3999 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
4331 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 4000 "edge");
4332
4333 setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
4334} 4001}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 08385e090a6f..cce91bf26676 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
56 local_irq_restore(flags); 56 local_irq_restore(flags);
57} 57}
58 58
59#ifdef CONFIG_X86_32
60
59void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, 61void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
60 int vector) 62 int vector)
61{ 63{
@@ -71,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
71 local_irq_save(flags); 73 local_irq_save(flags);
72 for_each_cpu(query_cpu, mask) 74 for_each_cpu(query_cpu, mask)
73 __default_send_IPI_dest_field( 75 __default_send_IPI_dest_field(
74 apic->cpu_to_logical_apicid(query_cpu), vector, 76 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
75 apic->dest_logical); 77 vector, apic->dest_logical);
76 local_irq_restore(flags); 78 local_irq_restore(flags);
77} 79}
78 80
@@ -90,14 +92,12 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
90 if (query_cpu == this_cpu) 92 if (query_cpu == this_cpu)
91 continue; 93 continue;
92 __default_send_IPI_dest_field( 94 __default_send_IPI_dest_field(
93 apic->cpu_to_logical_apicid(query_cpu), vector, 95 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
94 apic->dest_logical); 96 vector, apic->dest_logical);
95 } 97 }
96 local_irq_restore(flags); 98 local_irq_restore(flags);
97} 99}
98 100
99#ifdef CONFIG_X86_32
100
101/* 101/*
102 * This is only used on smaller machines. 102 * This is only used on smaller machines.
103 */ 103 */
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
deleted file mode 100644
index a43f71cb30f8..000000000000
--- a/arch/x86/kernel/apic/nmi.c
+++ /dev/null
@@ -1,567 +0,0 @@
1/*
2 * NMI watchdog support on APIC systems
3 *
4 * Started by Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes:
7 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
8 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
9 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
10 * Pavel Machek and
11 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
12 */
13
14#include <asm/apic.h>
15
16#include <linux/nmi.h>
17#include <linux/mm.h>
18#include <linux/delay.h>
19#include <linux/interrupt.h>
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/sysdev.h>
23#include <linux/sysctl.h>
24#include <linux/percpu.h>
25#include <linux/kprobes.h>
26#include <linux/cpumask.h>
27#include <linux/kernel_stat.h>
28#include <linux/kdebug.h>
29#include <linux/smp.h>
30
31#include <asm/i8259.h>
32#include <asm/io_apic.h>
33#include <asm/proto.h>
34#include <asm/timer.h>
35
36#include <asm/mce.h>
37
38#include <asm/mach_traps.h>
39
40int unknown_nmi_panic;
41int nmi_watchdog_enabled;
42
43/* For reliability, we're prepared to waste bits here. */
44static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
45
46/* nmi_active:
47 * >0: the lapic NMI watchdog is active, but can be disabled
48 * <0: the lapic NMI watchdog has not been set up, and cannot
49 * be enabled
50 * 0: the lapic NMI watchdog is disabled, but can be enabled
51 */
52atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
53EXPORT_SYMBOL(nmi_active);
54
55unsigned int nmi_watchdog = NMI_NONE;
56EXPORT_SYMBOL(nmi_watchdog);
57
58static int panic_on_timeout;
59
60static unsigned int nmi_hz = HZ;
61static DEFINE_PER_CPU(short, wd_enabled);
62static int endflag __initdata;
63
64static inline unsigned int get_nmi_count(int cpu)
65{
66 return per_cpu(irq_stat, cpu).__nmi_count;
67}
68
69static inline int mce_in_progress(void)
70{
71#if defined(CONFIG_X86_MCE)
72 return atomic_read(&mce_entry) > 0;
73#endif
74 return 0;
75}
76
77/*
78 * Take the local apic timer and PIT/HPET into account. We don't
79 * know which one is active, when we have highres/dyntick on
80 */
81static inline unsigned int get_timer_irqs(int cpu)
82{
83 return per_cpu(irq_stat, cpu).apic_timer_irqs +
84 per_cpu(irq_stat, cpu).irq0_irqs;
85}
86
87#ifdef CONFIG_SMP
88/*
89 * The performance counters used by NMI_LOCAL_APIC don't trigger when
90 * the CPU is idle. To make sure the NMI watchdog really ticks on all
91 * CPUs during the test make them busy.
92 */
93static __init void nmi_cpu_busy(void *data)
94{
95 local_irq_enable_in_hardirq();
96 /*
97 * Intentionally don't use cpu_relax here. This is
98 * to make sure that the performance counter really ticks,
99 * even if there is a simulator or similar that catches the
100 * pause instruction. On a real HT machine this is fine because
101 * all other CPUs are busy with "useless" delay loops and don't
102 * care if they get somewhat less cycles.
103 */
104 while (endflag == 0)
105 mb();
106}
107#endif
108
109static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
110{
111 printk(KERN_CONT "\n");
112
113 printk(KERN_WARNING
114 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
115 cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
116
117 printk(KERN_WARNING
118 "Please report this to bugzilla.kernel.org,\n");
119 printk(KERN_WARNING
120 "and attach the output of the 'dmesg' command.\n");
121
122 per_cpu(wd_enabled, cpu) = 0;
123 atomic_dec(&nmi_active);
124}
125
126static void __acpi_nmi_disable(void *__unused)
127{
128 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
129}
130
131int __init check_nmi_watchdog(void)
132{
133 unsigned int *prev_nmi_count;
134 int cpu;
135
136 if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
137 return 0;
138
139 prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
140 if (!prev_nmi_count)
141 goto error;
142
143 printk(KERN_INFO "Testing NMI watchdog ... ");
144
145#ifdef CONFIG_SMP
146 if (nmi_watchdog == NMI_LOCAL_APIC)
147 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
148#endif
149
150 for_each_possible_cpu(cpu)
151 prev_nmi_count[cpu] = get_nmi_count(cpu);
152 local_irq_enable();
153 mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
154
155 for_each_online_cpu(cpu) {
156 if (!per_cpu(wd_enabled, cpu))
157 continue;
158 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
159 report_broken_nmi(cpu, prev_nmi_count);
160 }
161 endflag = 1;
162 if (!atomic_read(&nmi_active)) {
163 kfree(prev_nmi_count);
164 atomic_set(&nmi_active, -1);
165 goto error;
166 }
167 printk("OK.\n");
168
169 /*
170 * now that we know it works we can reduce NMI frequency to
171 * something more reasonable; makes a difference in some configs
172 */
173 if (nmi_watchdog == NMI_LOCAL_APIC)
174 nmi_hz = lapic_adjust_nmi_hz(1);
175
176 kfree(prev_nmi_count);
177 return 0;
178error:
179 if (nmi_watchdog == NMI_IO_APIC) {
180 if (!timer_through_8259)
181 legacy_pic->chip->mask(0);
182 on_each_cpu(__acpi_nmi_disable, NULL, 1);
183 }
184
185#ifdef CONFIG_X86_32
186 timer_ack = 0;
187#endif
188 return -1;
189}
190
191static int __init setup_nmi_watchdog(char *str)
192{
193 unsigned int nmi;
194
195 if (!strncmp(str, "panic", 5)) {
196 panic_on_timeout = 1;
197 str = strchr(str, ',');
198 if (!str)
199 return 1;
200 ++str;
201 }
202
203 if (!strncmp(str, "lapic", 5))
204 nmi_watchdog = NMI_LOCAL_APIC;
205 else if (!strncmp(str, "ioapic", 6))
206 nmi_watchdog = NMI_IO_APIC;
207 else {
208 get_option(&str, &nmi);
209 if (nmi >= NMI_INVALID)
210 return 0;
211 nmi_watchdog = nmi;
212 }
213
214 return 1;
215}
216__setup("nmi_watchdog=", setup_nmi_watchdog);
217
218/*
219 * Suspend/resume support
220 */
221#ifdef CONFIG_PM
222
223static int nmi_pm_active; /* nmi_active before suspend */
224
225static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
226{
227 /* only CPU0 goes here, other CPUs should be offline */
228 nmi_pm_active = atomic_read(&nmi_active);
229 stop_apic_nmi_watchdog(NULL);
230 BUG_ON(atomic_read(&nmi_active) != 0);
231 return 0;
232}
233
234static int lapic_nmi_resume(struct sys_device *dev)
235{
236 /* only CPU0 goes here, other CPUs should be offline */
237 if (nmi_pm_active > 0) {
238 setup_apic_nmi_watchdog(NULL);
239 touch_nmi_watchdog();
240 }
241 return 0;
242}
243
244static struct sysdev_class nmi_sysclass = {
245 .name = "lapic_nmi",
246 .resume = lapic_nmi_resume,
247 .suspend = lapic_nmi_suspend,
248};
249
250static struct sys_device device_lapic_nmi = {
251 .id = 0,
252 .cls = &nmi_sysclass,
253};
254
255static int __init init_lapic_nmi_sysfs(void)
256{
257 int error;
258
259 /*
260 * should really be a BUG_ON but b/c this is an
261 * init call, it just doesn't work. -dcz
262 */
263 if (nmi_watchdog != NMI_LOCAL_APIC)
264 return 0;
265
266 if (atomic_read(&nmi_active) < 0)
267 return 0;
268
269 error = sysdev_class_register(&nmi_sysclass);
270 if (!error)
271 error = sysdev_register(&device_lapic_nmi);
272 return error;
273}
274
275/* must come after the local APIC's device_initcall() */
276late_initcall(init_lapic_nmi_sysfs);
277
278#endif /* CONFIG_PM */
279
280static void __acpi_nmi_enable(void *__unused)
281{
282 apic_write(APIC_LVT0, APIC_DM_NMI);
283}
284
285/*
286 * Enable timer based NMIs on all CPUs:
287 */
288void acpi_nmi_enable(void)
289{
290 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
291 on_each_cpu(__acpi_nmi_enable, NULL, 1);
292}
293
294/*
295 * Disable timer based NMIs on all CPUs:
296 */
297void acpi_nmi_disable(void)
298{
299 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
300 on_each_cpu(__acpi_nmi_disable, NULL, 1);
301}
302
303/*
304 * This function is called as soon the LAPIC NMI watchdog driver has everything
305 * in place and it's ready to check if the NMIs belong to the NMI watchdog
306 */
307void cpu_nmi_set_wd_enabled(void)
308{
309 __get_cpu_var(wd_enabled) = 1;
310}
311
312void setup_apic_nmi_watchdog(void *unused)
313{
314 if (__get_cpu_var(wd_enabled))
315 return;
316
317 /* cheap hack to support suspend/resume */
318 /* if cpu0 is not active neither should the other cpus */
319 if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
320 return;
321
322 switch (nmi_watchdog) {
323 case NMI_LOCAL_APIC:
324 if (lapic_watchdog_init(nmi_hz) < 0) {
325 __get_cpu_var(wd_enabled) = 0;
326 return;
327 }
328 /* FALL THROUGH */
329 case NMI_IO_APIC:
330 __get_cpu_var(wd_enabled) = 1;
331 atomic_inc(&nmi_active);
332 }
333}
334
335void stop_apic_nmi_watchdog(void *unused)
336{
337 /* only support LOCAL and IO APICs for now */
338 if (!nmi_watchdog_active())
339 return;
340 if (__get_cpu_var(wd_enabled) == 0)
341 return;
342 if (nmi_watchdog == NMI_LOCAL_APIC)
343 lapic_watchdog_stop();
344 else
345 __acpi_nmi_disable(NULL);
346 __get_cpu_var(wd_enabled) = 0;
347 atomic_dec(&nmi_active);
348}
349
350/*
351 * the best way to detect whether a CPU has a 'hard lockup' problem
 352 * is to check its local APIC timer IRQ counts. If they are not
353 * changing then that CPU has some problem.
354 *
355 * as these watchdog NMI IRQs are generated on every CPU, we only
356 * have to check the current processor.
357 *
358 * since NMIs don't listen to _any_ locks, we have to be extremely
359 * careful not to rely on unsafe variables. The printk might lock
360 * up though, so we have to break up any console locks first ...
361 * [when there will be more tty-related locks, break them up here too!]
362 */
363
364static DEFINE_PER_CPU(unsigned, last_irq_sum);
365static DEFINE_PER_CPU(long, alert_counter);
366static DEFINE_PER_CPU(int, nmi_touch);
367
368void touch_nmi_watchdog(void)
369{
370 if (nmi_watchdog_active()) {
371 unsigned cpu;
372
373 /*
374 * Tell other CPUs to reset their alert counters. We cannot
375 * do it ourselves because the alert count increase is not
376 * atomic.
377 */
378 for_each_present_cpu(cpu) {
379 if (per_cpu(nmi_touch, cpu) != 1)
380 per_cpu(nmi_touch, cpu) = 1;
381 }
382 }
383
384 /*
385 * Tickle the softlockup detector too:
386 */
387 touch_softlockup_watchdog();
388}
389EXPORT_SYMBOL(touch_nmi_watchdog);
390
391notrace __kprobes int
392nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
393{
394 /*
395 * Since current_thread_info()-> is always on the stack, and we
396 * always switch the stack NMI-atomically, it's safe to use
397 * smp_processor_id().
398 */
399 unsigned int sum;
400 int touched = 0;
401 int cpu = smp_processor_id();
402 int rc = 0;
403
404 sum = get_timer_irqs(cpu);
405
406 if (__get_cpu_var(nmi_touch)) {
407 __get_cpu_var(nmi_touch) = 0;
408 touched = 1;
409 }
410
411 /* We can be called before check_nmi_watchdog, hence NULL check. */
412 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
413 static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
414
415 raw_spin_lock(&lock);
416 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
417 show_regs(regs);
418 dump_stack();
419 raw_spin_unlock(&lock);
420 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
421
422 rc = 1;
423 }
424
425 /* Could check oops_in_progress here too, but it's safer not to */
426 if (mce_in_progress())
427 touched = 1;
428
 429 /* if none of the timers is firing, this cpu isn't doing much */
430 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
431 /*
432 * Ayiee, looks like this CPU is stuck ...
433 * wait a few IRQs (5 seconds) before doing the oops ...
434 */
435 __this_cpu_inc(alert_counter);
436 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
437 /*
438 * die_nmi will return ONLY if NOTIFY_STOP happens..
439 */
440 die_nmi("BUG: NMI Watchdog detected LOCKUP",
441 regs, panic_on_timeout);
442 } else {
443 __get_cpu_var(last_irq_sum) = sum;
444 __this_cpu_write(alert_counter, 0);
445 }
446
447 /* see if the nmi watchdog went off */
448 if (!__get_cpu_var(wd_enabled))
449 return rc;
450 switch (nmi_watchdog) {
451 case NMI_LOCAL_APIC:
452 rc |= lapic_wd_event(nmi_hz);
453 break;
454 case NMI_IO_APIC:
455 /*
456 * don't know how to accurately check for this.
457 * just assume it was a watchdog timer interrupt
458 * This matches the old behaviour.
459 */
460 rc = 1;
461 break;
462 }
463 return rc;
464}
465
466#ifdef CONFIG_SYSCTL
467
468static void enable_ioapic_nmi_watchdog_single(void *unused)
469{
470 __get_cpu_var(wd_enabled) = 1;
471 atomic_inc(&nmi_active);
472 __acpi_nmi_enable(NULL);
473}
474
475static void enable_ioapic_nmi_watchdog(void)
476{
477 on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
478 touch_nmi_watchdog();
479}
480
481static void disable_ioapic_nmi_watchdog(void)
482{
483 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
484}
485
486static int __init setup_unknown_nmi_panic(char *str)
487{
488 unknown_nmi_panic = 1;
489 return 1;
490}
491__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
492
493static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
494{
495 unsigned char reason = get_nmi_reason();
496 char buf[64];
497
498 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
499 die_nmi(buf, regs, 1); /* Always panic here */
500 return 0;
501}
502
503/*
504 * proc handler for /proc/sys/kernel/nmi
505 */
506int proc_nmi_enabled(struct ctl_table *table, int write,
507 void __user *buffer, size_t *length, loff_t *ppos)
508{
509 int old_state;
510
511 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
512 old_state = nmi_watchdog_enabled;
513 proc_dointvec(table, write, buffer, length, ppos);
514 if (!!old_state == !!nmi_watchdog_enabled)
515 return 0;
516
517 if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
518 printk(KERN_WARNING
519 "NMI watchdog is permanently disabled\n");
520 return -EIO;
521 }
522
523 if (nmi_watchdog == NMI_LOCAL_APIC) {
524 if (nmi_watchdog_enabled)
525 enable_lapic_nmi_watchdog();
526 else
527 disable_lapic_nmi_watchdog();
528 } else if (nmi_watchdog == NMI_IO_APIC) {
529 if (nmi_watchdog_enabled)
530 enable_ioapic_nmi_watchdog();
531 else
532 disable_ioapic_nmi_watchdog();
533 } else {
534 printk(KERN_WARNING
535 "NMI watchdog doesn't know what hardware to touch\n");
536 return -EIO;
537 }
538 return 0;
539}
540
541#endif /* CONFIG_SYSCTL */
542
543int do_nmi_callback(struct pt_regs *regs, int cpu)
544{
545#ifdef CONFIG_SYSCTL
546 if (unknown_nmi_panic)
547 return unknown_nmi_panic_callback(regs, cpu);
548#endif
549 return 0;
550}
551
552void arch_trigger_all_cpu_backtrace(void)
553{
554 int i;
555
556 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
557
558 printk(KERN_INFO "sending NMI to all CPUs:\n");
559 apic->send_IPI_all(NMI_VECTOR);
560
561 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
562 for (i = 0; i < 10 * 1000; i++) {
563 if (cpumask_empty(to_cpumask(backtrace_mask)))
564 break;
565 mdelay(1);
566 }
567}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 3e28401f161c..c4a61ca1349a 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -26,6 +26,7 @@
26#include <linux/nodemask.h> 26#include <linux/nodemask.h>
27#include <linux/topology.h> 27#include <linux/topology.h>
28#include <linux/bootmem.h> 28#include <linux/bootmem.h>
29#include <linux/memblock.h>
29#include <linux/threads.h> 30#include <linux/threads.h>
30#include <linux/cpumask.h> 31#include <linux/cpumask.h>
31#include <linux/kernel.h> 32#include <linux/kernel.h>
@@ -47,8 +48,6 @@
47#include <asm/e820.h> 48#include <asm/e820.h>
48#include <asm/ipi.h> 49#include <asm/ipi.h>
49 50
50#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
51
52int found_numaq; 51int found_numaq;
53 52
54/* 53/*
@@ -78,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4];
78static inline void numaq_register_node(int node, struct sys_cfg_data *scd) 77static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
79{ 78{
80 struct eachquadmem *eq = scd->eq + node; 79 struct eachquadmem *eq = scd->eq + node;
80 u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
81 u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
82 int ret;
81 83
82 node_set_online(node); 84 node_set(node, numa_nodes_parsed);
83 85 ret = numa_add_memblk(node, start, end);
84 /* Convert to pages */ 86 BUG_ON(ret < 0);
85 node_start_pfn[node] =
86 MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
87
88 node_end_pfn[node] =
89 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
90
91 e820_register_active_regions(node, node_start_pfn[node],
92 node_end_pfn[node]);
93
94 memory_present(node, node_start_pfn[node], node_end_pfn[node]);
95
96 node_remap_size[node] = node_memmap_size_bytes(node,
97 node_start_pfn[node],
98 node_end_pfn[node]);
99} 87}
100 88
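The unit change in numaq_register_node() above is easy to misread: the quad config table stores megabyte values, so the new code shifts by 20 to hand byte addresses to numa_add_memblk(), whereas the removed MB_TO_PAGES() shifted by (20 - PAGE_SHIFT) to produce page frame numbers. A quick worked check, assuming 4 KiB pages (PAGE_SHIFT == 12):

        u64 start_bytes   = (u64)256 << 20;        /* 256 MB -> 0x10000000 bytes */
        unsigned long pfn = 256 << (20 - 12);      /* the same boundary as a PFN: 0x10000 */
        /* 0x10000 pages * 4096 bytes/page == 0x10000000 bytes, so the two forms agree */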
101/* 89/*
102 * Function: smp_dump_qct() 90 * Function: smp_dump_qct()
103 * 91 *
104 * Description: gets memory layout from the quad config table. This 92 * Description: gets memory layout from the quad config table. This
105 * function also updates node_online_map with the nodes (quads) present. 93 * function also updates numa_nodes_parsed with the nodes (quads) present.
106 */ 94 */
107static void __init smp_dump_qct(void) 95static void __init smp_dump_qct(void)
108{ 96{
@@ -111,7 +99,6 @@ static void __init smp_dump_qct(void)
111 99
112 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); 100 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
113 101
114 nodes_clear(node_online_map);
115 for_each_node(node) { 102 for_each_node(node) {
116 if (scd->quads_present31_0 & (1 << node)) 103 if (scd->quads_present31_0 & (1 << node))
117 numaq_register_node(node, scd); 104 numaq_register_node(node, scd);
@@ -281,14 +268,14 @@ static __init void early_check_numaq(void)
281 } 268 }
282} 269}
283 270
284int __init get_memcfg_numaq(void) 271int __init numaq_numa_init(void)
285{ 272{
286 early_check_numaq(); 273 early_check_numaq();
287 if (!found_numaq) 274 if (!found_numaq)
288 return 0; 275 return -ENOENT;
289 smp_dump_qct(); 276 smp_dump_qct();
290 277
291 return 1; 278 return 0;
292} 279}
293 280
294#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) 281#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
@@ -372,13 +359,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask
372 return physids_promote(0xFUL, retmap); 359 return physids_promote(0xFUL, retmap);
373} 360}
374 361
375static inline int numaq_cpu_to_logical_apicid(int cpu)
376{
377 if (cpu >= nr_cpu_ids)
378 return BAD_APICID;
379 return cpu_2_logical_apicid[cpu];
380}
381
382/* 362/*
383 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent 363 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
384 * cpu to APIC ID relation to properly interact with the intelligent 364 * cpu to APIC ID relation to properly interact with the intelligent
@@ -397,6 +377,15 @@ static inline int numaq_apicid_to_node(int logical_apicid)
397 return logical_apicid >> 4; 377 return logical_apicid >> 4;
398} 378}
399 379
380static int numaq_numa_cpu_node(int cpu)
381{
382 int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
383
384 if (logical_apicid != BAD_APICID)
385 return numaq_apicid_to_node(logical_apicid);
386 return NUMA_NO_NODE;
387}
388
400static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) 389static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
401{ 390{
402 int node = numaq_apicid_to_node(logical_apicid); 391 int node = numaq_apicid_to_node(logical_apicid);
@@ -483,8 +472,8 @@ static void numaq_setup_portio_remap(void)
483 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); 472 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
484} 473}
485 474
486/* Use __refdata to keep false positive warning calm. */ 475/* Use __refdata to keep false positive warning calm. */
487struct apic __refdata apic_numaq = { 476static struct apic __refdata apic_numaq = {
488 477
489 .name = "NUMAQ", 478 .name = "NUMAQ",
490 .probe = probe_numaq, 479 .probe = probe_numaq,
@@ -507,8 +496,6 @@ struct apic __refdata apic_numaq = {
507 .ioapic_phys_id_map = numaq_ioapic_phys_id_map, 496 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
508 .setup_apic_routing = numaq_setup_apic_routing, 497 .setup_apic_routing = numaq_setup_apic_routing,
509 .multi_timer_check = numaq_multi_timer_check, 498 .multi_timer_check = numaq_multi_timer_check,
510 .apicid_to_node = numaq_apicid_to_node,
511 .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
512 .cpu_present_to_apicid = numaq_cpu_present_to_apicid, 499 .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
513 .apicid_to_cpu_present = numaq_apicid_to_cpu_present, 500 .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
514 .setup_portio_remap = numaq_setup_portio_remap, 501 .setup_portio_remap = numaq_setup_portio_remap,
@@ -546,4 +533,9 @@ struct apic __refdata apic_numaq = {
546 .icr_write = native_apic_icr_write, 533 .icr_write = native_apic_icr_write,
547 .wait_icr_idle = native_apic_wait_icr_idle, 534 .wait_icr_idle = native_apic_wait_icr_idle,
548 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 535 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
536
537 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
538 .x86_32_numa_cpu_node = numaq_numa_cpu_node,
549}; 539};
540
541apic_driver(apic_numaq);
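Note on the numaq_32.c hunk above: the rewritten numaq_register_node() stops tracking start/end pfns by hand and instead hands a byte range straight to numa_add_memblk(), converting the quad config table's megabyte-granular fields with a 64-bit shift rather than the removed MB_TO_PAGES() macro. A minimal standalone sketch of that conversion, with an illustrative struct and made-up values rather than a real QCT entry:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the per-quad memory fields read from the QCT. */
struct quad_mem {
	uint32_t hi_shrd_mem_start;	/* in MB */
	uint32_t hi_shrd_mem_size;	/* in MB */
	uint32_t priv_mem_size;		/* in MB */
};

int main(void)
{
	struct quad_mem eq = {
		.hi_shrd_mem_start = 4096,
		.hi_shrd_mem_size  = 2048,
		.priv_mem_size     = 256,
	};

	/* Promote to 64 bits before shifting so ranges above 4 GiB survive. */
	uint64_t start = (uint64_t)(eq.hi_shrd_mem_start - eq.priv_mem_size) << 20;
	uint64_t end   = (uint64_t)(eq.hi_shrd_mem_start + eq.hi_shrd_mem_size) << 20;

	printf("node memory: [%#llx, %#llx)\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}

Promoting to u64 before the shift matters because shared memory above 4 GiB would otherwise overflow a 32-bit intermediate.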
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 99d2fe016084..b5254ad044ab 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,29 +52,9 @@ static int __init print_ipi_mode(void)
52} 52}
53late_initcall(print_ipi_mode); 53late_initcall(print_ipi_mode);
54 54
55void __init default_setup_apic_routing(void) 55static int default_x86_32_early_logical_apicid(int cpu)
56{ 56{
57 int version = apic_version[boot_cpu_physical_apicid]; 57 return 1 << cpu;
58
59 if (num_possible_cpus() > 8) {
60 switch (boot_cpu_data.x86_vendor) {
61 case X86_VENDOR_INTEL:
62 if (!APIC_XAPIC(version)) {
63 def_to_bigsmp = 0;
64 break;
65 }
66 /* If P4 and above fall through */
67 case X86_VENDOR_AMD:
68 def_to_bigsmp = 1;
69 }
70 }
71
72#ifdef CONFIG_X86_BIGSMP
73 generic_bigsmp_probe();
74#endif
75
76 if (apic->setup_apic_routing)
77 apic->setup_apic_routing();
78} 58}
79 59
80static void setup_apic_flat_routing(void) 60static void setup_apic_flat_routing(void)
@@ -107,7 +87,7 @@ static int probe_default(void)
107 return 1; 87 return 1;
108} 88}
109 89
110struct apic apic_default = { 90static struct apic apic_default = {
111 91
112 .name = "default", 92 .name = "default",
113 .probe = probe_default, 93 .probe = probe_default,
@@ -130,8 +110,6 @@ struct apic apic_default = {
130 .ioapic_phys_id_map = default_ioapic_phys_id_map, 110 .ioapic_phys_id_map = default_ioapic_phys_id_map,
131 .setup_apic_routing = setup_apic_flat_routing, 111 .setup_apic_routing = setup_apic_flat_routing,
132 .multi_timer_check = NULL, 112 .multi_timer_check = NULL,
133 .apicid_to_node = default_apicid_to_node,
134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
135 .cpu_present_to_apicid = default_cpu_present_to_apicid, 113 .cpu_present_to_apicid = default_cpu_present_to_apicid,
136 .apicid_to_cpu_present = physid_set_mask_of_physid, 114 .apicid_to_cpu_present = physid_set_mask_of_physid,
137 .setup_portio_remap = NULL, 115 .setup_portio_remap = NULL,
@@ -167,46 +145,26 @@ struct apic apic_default = {
167 .icr_write = native_apic_icr_write, 145 .icr_write = native_apic_icr_write,
168 .wait_icr_idle = native_apic_wait_icr_idle, 146 .wait_icr_idle = native_apic_wait_icr_idle,
169 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 147 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
148
149 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
170}; 150};
171 151
172extern struct apic apic_numaq; 152apic_driver(apic_default);
173extern struct apic apic_summit;
174extern struct apic apic_bigsmp;
175extern struct apic apic_es7000;
176extern struct apic apic_es7000_cluster;
177 153
178struct apic *apic = &apic_default; 154struct apic *apic = &apic_default;
179EXPORT_SYMBOL_GPL(apic); 155EXPORT_SYMBOL_GPL(apic);
180 156
181static struct apic *apic_probe[] __initdata = {
182#ifdef CONFIG_X86_NUMAQ
183 &apic_numaq,
184#endif
185#ifdef CONFIG_X86_SUMMIT
186 &apic_summit,
187#endif
188#ifdef CONFIG_X86_BIGSMP
189 &apic_bigsmp,
190#endif
191#ifdef CONFIG_X86_ES7000
192 &apic_es7000,
193 &apic_es7000_cluster,
194#endif
195 &apic_default, /* must be last */
196 NULL,
197};
198
199static int cmdline_apic __initdata; 157static int cmdline_apic __initdata;
200static int __init parse_apic(char *arg) 158static int __init parse_apic(char *arg)
201{ 159{
202 int i; 160 struct apic **drv;
203 161
204 if (!arg) 162 if (!arg)
205 return -EINVAL; 163 return -EINVAL;
206 164
207 for (i = 0; apic_probe[i]; i++) { 165 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
208 if (!strcmp(apic_probe[i]->name, arg)) { 166 if (!strcmp((*drv)->name, arg)) {
209 apic = apic_probe[i]; 167 apic = *drv;
210 cmdline_apic = 1; 168 cmdline_apic = 1;
211 return 0; 169 return 0;
212 } 170 }
@@ -217,38 +175,58 @@ static int __init parse_apic(char *arg)
217} 175}
218early_param("apic", parse_apic); 176early_param("apic", parse_apic);
219 177
220void __init generic_bigsmp_probe(void) 178void __init default_setup_apic_routing(void)
221{ 179{
180 int version = apic_version[boot_cpu_physical_apicid];
181
182 if (num_possible_cpus() > 8) {
183 switch (boot_cpu_data.x86_vendor) {
184 case X86_VENDOR_INTEL:
185 if (!APIC_XAPIC(version)) {
186 def_to_bigsmp = 0;
187 break;
188 }
189 /* If P4 and above fall through */
190 case X86_VENDOR_AMD:
191 def_to_bigsmp = 1;
192 }
193 }
194
222#ifdef CONFIG_X86_BIGSMP 195#ifdef CONFIG_X86_BIGSMP
223 /* 196 /*
224 * This routine is used to switch to bigsmp mode when 197 * This is used to switch to bigsmp mode when
225 * - There is no apic= option specified by the user 198 * - There is no apic= option specified by the user
226 * - generic_apic_probe() has chosen apic_default as the sub_arch 199 * - generic_apic_probe() has chosen apic_default as the sub_arch
227 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support 200 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
228 */ 201 */
229 202
230 if (!cmdline_apic && apic == &apic_default) { 203 if (!cmdline_apic && apic == &apic_default) {
231 if (apic_bigsmp.probe()) { 204 struct apic *bigsmp = generic_bigsmp_probe();
232 apic = &apic_bigsmp; 205 if (bigsmp) {
206 apic = bigsmp;
233 printk(KERN_INFO "Overriding APIC driver with %s\n", 207 printk(KERN_INFO "Overriding APIC driver with %s\n",
234 apic->name); 208 apic->name);
235 } 209 }
236 } 210 }
237#endif 211#endif
212
213 if (apic->setup_apic_routing)
214 apic->setup_apic_routing();
238} 215}
239 216
240void __init generic_apic_probe(void) 217void __init generic_apic_probe(void)
241{ 218{
242 if (!cmdline_apic) { 219 if (!cmdline_apic) {
243 int i; 220 struct apic **drv;
244 for (i = 0; apic_probe[i]; i++) { 221
245 if (apic_probe[i]->probe()) { 222 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
246 apic = apic_probe[i]; 223 if ((*drv)->probe()) {
224 apic = *drv;
247 break; 225 break;
248 } 226 }
249 } 227 }
250 /* Not visible without early console */ 228 /* Not visible without early console */
251 if (!apic_probe[i]) 229 if (drv == __apicdrivers_end)
252 panic("Didn't find an APIC driver"); 230 panic("Didn't find an APIC driver");
253 } 231 }
254 printk(KERN_INFO "Using APIC driver %s\n", apic->name); 232 printk(KERN_INFO "Using APIC driver %s\n", apic->name);
@@ -259,16 +237,16 @@ void __init generic_apic_probe(void)
259int __init 237int __init
260generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) 238generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
261{ 239{
262 int i; 240 struct apic **drv;
263 241
264 for (i = 0; apic_probe[i]; ++i) { 242 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
265 if (!apic_probe[i]->mps_oem_check) 243 if (!((*drv)->mps_oem_check))
266 continue; 244 continue;
267 if (!apic_probe[i]->mps_oem_check(mpc, oem, productid)) 245 if (!(*drv)->mps_oem_check(mpc, oem, productid))
268 continue; 246 continue;
269 247
270 if (!cmdline_apic) { 248 if (!cmdline_apic) {
271 apic = apic_probe[i]; 249 apic = *drv;
272 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 250 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
273 apic->name); 251 apic->name);
274 } 252 }
@@ -279,16 +257,16 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
279 257
280int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 258int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
281{ 259{
282 int i; 260 struct apic **drv;
283 261
284 for (i = 0; apic_probe[i]; ++i) { 262 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
285 if (!apic_probe[i]->acpi_madt_oem_check) 263 if (!(*drv)->acpi_madt_oem_check)
286 continue; 264 continue;
287 if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) 265 if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
288 continue; 266 continue;
289 267
290 if (!cmdline_apic) { 268 if (!cmdline_apic) {
291 apic = apic_probe[i]; 269 apic = *drv;
292 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 270 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
293 apic->name); 271 apic->name);
294 } 272 }
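Note on the probe_32.c hunk above: the hand-maintained apic_probe[] array is gone; each driver now registers itself with apic_driver(), the registrations land in a linker-built table bounded by __apicdrivers and __apicdrivers_end, and generic_apic_probe() takes the first driver whose ->probe() returns nonzero. A hedged userspace sketch of that first-match table walk, using an ordinary array in place of the linker section (all names here are illustrative):

#include <stdio.h>

struct drv {
	const char *name;
	int (*probe)(void);
};

static int probe_fancy(void)   { return 0; }	/* hardware not present */
static int probe_default(void) { return 1; }	/* always matches, kept last */

/* Stand-in for the __apicdrivers..__apicdrivers_end linker-built table. */
static struct drv drivers[] = {
	{ "fancy",   probe_fancy   },
	{ "default", probe_default },
};

int main(void)
{
	struct drv *drv, *end = drivers + sizeof(drivers) / sizeof(drivers[0]);
	struct drv *chosen = NULL;

	for (drv = drivers; drv < end; drv++) {
		if (drv->probe()) {		/* first successful probe wins */
			chosen = drv;
			break;
		}
	}
	if (!chosen) {
		fprintf(stderr, "no driver found\n");
		return 1;
	}
	printf("Using driver %s\n", chosen->name);
	return 0;
}

The kernel keeps a driver whose probe always succeeds (apic_default, whose probe_default() returns 1), so the panic path in the hunk is only reached if the table itself is broken.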
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 83e9be4778e2..3fe986698929 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,27 +23,6 @@
23#include <asm/ipi.h> 23#include <asm/ipi.h>
24#include <asm/setup.h> 24#include <asm/setup.h>
25 25
26extern struct apic apic_flat;
27extern struct apic apic_physflat;
28extern struct apic apic_x2xpic_uv_x;
29extern struct apic apic_x2apic_phys;
30extern struct apic apic_x2apic_cluster;
31
32struct apic __read_mostly *apic = &apic_flat;
33EXPORT_SYMBOL_GPL(apic);
34
35static struct apic *apic_probe[] __initdata = {
36#ifdef CONFIG_X86_UV
37 &apic_x2apic_uv_x,
38#endif
39#ifdef CONFIG_X86_X2APIC
40 &apic_x2apic_phys,
41 &apic_x2apic_cluster,
42#endif
43 &apic_physflat,
44 NULL,
45};
46
47static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) 26static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
48{ 27{
49 return hard_smp_processor_id() >> index_msb; 28 return hard_smp_processor_id() >> index_msb;
@@ -54,35 +33,25 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
54 */ 33 */
55void __init default_setup_apic_routing(void) 34void __init default_setup_apic_routing(void)
56{ 35{
57#ifdef CONFIG_X86_X2APIC 36 struct apic **drv;
58 if (x2apic_mode
59#ifdef CONFIG_X86_UV
60 && apic != &apic_x2apic_uv_x
61#endif
62 ) {
63 if (x2apic_phys)
64 apic = &apic_x2apic_phys;
65 else
66 apic = &apic_x2apic_cluster;
67 }
68#endif
69 37
70 if (apic == &apic_flat && num_possible_cpus() > 8) 38 enable_IR_x2apic();
71 apic = &apic_physflat;
72 39
73 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 40 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
41 if ((*drv)->probe && (*drv)->probe()) {
42 if (apic != *drv) {
43 apic = *drv;
44 pr_info("Switched APIC routing to %s.\n",
45 apic->name);
46 }
47 break;
48 }
49 }
74 50
75 if (is_vsmp_box()) { 51 if (is_vsmp_box()) {
76 /* need to update phys_pkg_id */ 52 /* need to update phys_pkg_id */
77 apic->phys_pkg_id = apicid_phys_pkg_id; 53 apic->phys_pkg_id = apicid_phys_pkg_id;
78 } 54 }
79
80 /*
81 * Now that apic routing model is selected, configure the
82 * fault handling for intr remapping.
83 */
84 if (intr_remapping_enabled)
85 enable_drhd_fault_handling();
86} 55}
87 56
88/* Same for both flat and physical. */ 57/* Same for both flat and physical. */
@@ -94,13 +63,15 @@ void apic_send_IPI_self(int vector)
94 63
95int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 64int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
96{ 65{
97 int i; 66 struct apic **drv;
98 67
99 for (i = 0; apic_probe[i]; ++i) { 68 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
100 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { 69 if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
101 apic = apic_probe[i]; 70 if (apic != *drv) {
102 printk(KERN_INFO "Setting APIC routing to %s.\n", 71 apic = *drv;
103 apic->name); 72 pr_info("Setting APIC routing to %s.\n",
73 apic->name);
74 }
104 return 1; 75 return 1;
105 } 76 }
106 } 77 }
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9b419263d90d..19114423c58c 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit)
194 return 1; 194 return 1;
195} 195}
196 196
197static void summit_init_apic_ldr(void) 197static int summit_early_logical_apicid(int cpu)
198{ 198{
199 unsigned long val, id;
200 int count = 0; 199 int count = 0;
201 u8 my_id = (u8)hard_smp_processor_id(); 200 u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
202 u8 my_cluster = APIC_CLUSTER(my_id); 201 u8 my_cluster = APIC_CLUSTER(my_id);
203#ifdef CONFIG_SMP 202#ifdef CONFIG_SMP
204 u8 lid; 203 u8 lid;
@@ -206,7 +205,7 @@ static void summit_init_apic_ldr(void)
206 205
207 /* Create logical APIC IDs by counting CPUs already in cluster. */ 206 /* Create logical APIC IDs by counting CPUs already in cluster. */
208 for (count = 0, i = nr_cpu_ids; --i >= 0; ) { 207 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
209 lid = cpu_2_logical_apicid[i]; 208 lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
210 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) 209 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
211 ++count; 210 ++count;
212 } 211 }
@@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void)
214 /* We only have a 4 wide bitmap in cluster mode. If a deranged 213 /* We only have a 4 wide bitmap in cluster mode. If a deranged
215 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ 214 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
216 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); 215 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
217 id = my_cluster | (1UL << count); 216 return my_cluster | (1UL << count);
217}
218
219static void summit_init_apic_ldr(void)
220{
221 int cpu = smp_processor_id();
222 unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
223 unsigned long val;
224
218 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); 225 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
219 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; 226 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
220 val |= SET_APIC_LOGICAL_ID(id); 227 val |= SET_APIC_LOGICAL_ID(id);
@@ -232,27 +239,6 @@ static void summit_setup_apic_routing(void)
232 nr_ioapics); 239 nr_ioapics);
233} 240}
234 241
235static int summit_apicid_to_node(int logical_apicid)
236{
237#ifdef CONFIG_SMP
238 return apicid_2_node[hard_smp_processor_id()];
239#else
240 return 0;
241#endif
242}
243
244/* Mapping from cpu number to logical apicid */
245static inline int summit_cpu_to_logical_apicid(int cpu)
246{
247#ifdef CONFIG_SMP
248 if (cpu >= nr_cpu_ids)
249 return BAD_APICID;
250 return cpu_2_logical_apicid[cpu];
251#else
252 return logical_smp_processor_id();
253#endif
254}
255
256static int summit_cpu_present_to_apicid(int mps_cpu) 242static int summit_cpu_present_to_apicid(int mps_cpu)
257{ 243{
258 if (mps_cpu < nr_cpu_ids) 244 if (mps_cpu < nr_cpu_ids)
@@ -286,7 +272,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
286 * The cpus in the mask must all be on the apic cluster. 272 * The cpus in the mask must all be on the apic cluster.
287 */ 273 */
288 for_each_cpu(cpu, cpumask) { 274 for_each_cpu(cpu, cpumask) {
289 int new_apicid = summit_cpu_to_logical_apicid(cpu); 275 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
290 276
291 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 277 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
292 printk("%s: Not a valid mask!\n", __func__); 278 printk("%s: Not a valid mask!\n", __func__);
@@ -301,7 +287,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
301static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, 287static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
302 const struct cpumask *andmask) 288 const struct cpumask *andmask)
303{ 289{
304 int apicid = summit_cpu_to_logical_apicid(0); 290 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
305 cpumask_var_t cpumask; 291 cpumask_var_t cpumask;
306 292
307 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 293 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -505,7 +491,7 @@ void setup_summit(void)
505} 491}
506#endif 492#endif
507 493
508struct apic apic_summit = { 494static struct apic apic_summit = {
509 495
510 .name = "summit", 496 .name = "summit",
511 .probe = probe_summit, 497 .probe = probe_summit,
@@ -528,8 +514,6 @@ struct apic apic_summit = {
528 .ioapic_phys_id_map = summit_ioapic_phys_id_map, 514 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
529 .setup_apic_routing = summit_setup_apic_routing, 515 .setup_apic_routing = summit_setup_apic_routing,
530 .multi_timer_check = NULL, 516 .multi_timer_check = NULL,
531 .apicid_to_node = summit_apicid_to_node,
532 .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
533 .cpu_present_to_apicid = summit_cpu_present_to_apicid, 517 .cpu_present_to_apicid = summit_cpu_present_to_apicid,
534 .apicid_to_cpu_present = summit_apicid_to_cpu_present, 518 .apicid_to_cpu_present = summit_apicid_to_cpu_present,
535 .setup_portio_remap = NULL, 519 .setup_portio_remap = NULL,
@@ -565,4 +549,8 @@ struct apic apic_summit = {
565 .icr_write = native_apic_icr_write, 549 .icr_write = native_apic_icr_write,
566 .wait_icr_idle = native_apic_wait_icr_idle, 550 .wait_icr_idle = native_apic_wait_icr_idle,
567 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
552
553 .x86_32_early_logical_apicid = summit_early_logical_apicid,
568}; 554};
555
556apic_driver(apic_summit);
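Note on the summit_32.c hunk above: summit_early_logical_apicid() computes a CPU's logical APIC ID by counting how many CPUs already occupy the same APIC cluster and claiming the next bit of the 4-wide in-cluster bitmap (my_cluster | (1UL << count)). A small self-contained sketch of that counting scheme over a toy per-CPU table (the 0xF0 cluster mask and the IDs are illustrative stand-ins):

#include <stdio.h>

#define BAD_ID		0xFF
#define CLUSTER(id)	((id) & 0xF0)	/* upper nibble = cluster (assumed here) */

/* Toy per-CPU logical APIC ID table, all unassigned to start with. */
static unsigned char logical_id[8] = {
	BAD_ID, BAD_ID, BAD_ID, BAD_ID, BAD_ID, BAD_ID, BAD_ID, BAD_ID,
};

static unsigned char assign_logical_id(int cpu, unsigned char phys_id)
{
	unsigned char cluster = CLUSTER(phys_id);
	int count = 0, i;

	/* Count CPUs already placed in this cluster... */
	for (i = 0; i < 8; i++)
		if (logical_id[i] != BAD_ID && CLUSTER(logical_id[i]) == cluster)
			count++;

	/* ...and claim the next bit of the 4-wide in-cluster bitmap. */
	logical_id[cpu] = cluster | (1u << count);
	return logical_id[cpu];
}

int main(void)
{
	printf("cpu0 -> %#x\n", (unsigned)assign_logical_id(0, 0x11)); /* 0x11 */
	printf("cpu1 -> %#x\n", (unsigned)assign_logical_id(1, 0x12)); /* 0x12 */
	printf("cpu2 -> %#x\n", (unsigned)assign_logical_id(2, 0x21)); /* 0x21 */
	return 0;
}

A fifth CPU landing in one cluster would overflow the 4-wide bitmap, which is exactly the condition the BUG_ON in the hunk guards against.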
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cf69c59f4910..500795875827 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,118 +5,95 @@
5#include <linux/ctype.h> 5#include <linux/ctype.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8#include <linux/cpu.h>
8 9
9#include <asm/smp.h> 10#include <asm/smp.h>
10#include <asm/apic.h> 11#include <asm/x2apic.h>
11#include <asm/ipi.h>
12 12
13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
15static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
14 16
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 17static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 18{
17 return x2apic_enabled(); 19 return x2apic_enabled();
18} 20}
19 21
20/* 22static inline u32 x2apic_cluster(int cpu)
21 * need to use more than cpu 0, because we need more vectors when
22 * MSI-X are used.
23 */
24static const struct cpumask *x2apic_target_cpus(void)
25{ 23{
26 return cpu_online_mask; 24 return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
27}
28
29/*
30 * for now each logical cpu is in its own vector allocation domain.
31 */
32static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
33{
34 cpumask_clear(retmask);
35 cpumask_set_cpu(cpu, retmask);
36} 25}
37 26
38static void 27static void
39 __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) 28__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
40{ 29{
41 unsigned long cfg; 30 struct cpumask *cpus_in_cluster_ptr;
31 struct cpumask *ipi_mask_ptr;
32 unsigned int cpu, this_cpu;
33 unsigned long flags;
34 u32 dest;
35
36 x2apic_wrmsr_fence();
37
38 local_irq_save(flags);
42 39
43 cfg = __prepare_ICR(0, vector, dest); 40 this_cpu = smp_processor_id();
44 41
45 /* 42 /*
 46 * send the IPI. 43 * Since we modify the mask, we need our own copy
 44 * and must manipulate it with irqs off.
47 */ 45 */
48 native_x2apic_icr_write(cfg, apicid); 46 ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
49} 47 cpumask_copy(ipi_mask_ptr, mask);
50 48
51/* 49 /*
52 * for now, we send the IPI's one by one in the cpumask. 50 * The idea is to send one IPI per cluster.
53 * TBD: Based on the cpu mask, we can send the IPI's to the cluster group 51 */
54 * at once. We have 16 cpu's in a cluster. This will minimize IPI register 52 for_each_cpu(cpu, ipi_mask_ptr) {
55 * writes. 53 unsigned long i;
56 */
57static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58{
59 unsigned long query_cpu;
60 unsigned long flags;
61 54
62 x2apic_wrmsr_fence(); 55 cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
56 dest = 0;
63 57
64 local_irq_save(flags); 58 /* Collect cpus in cluster. */
65 for_each_cpu(query_cpu, mask) { 59 for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
66 __x2apic_send_IPI_dest( 60 if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
67 per_cpu(x86_cpu_to_logical_apicid, query_cpu), 61 dest |= per_cpu(x86_cpu_to_logical_apicid, i);
68 vector, apic->dest_logical); 62 }
63
64 if (!dest)
65 continue;
66
67 __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
68 /*
69 * Cluster sibling cpus should be discared now so
70 * we would not send IPI them second time.
71 */
72 cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
69 } 73 }
74
70 local_irq_restore(flags); 75 local_irq_restore(flags);
71} 76}
72 77
78static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
79{
80 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
81}
82
73static void 83static void
74 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 84 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
75{ 85{
76 unsigned long this_cpu = smp_processor_id(); 86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
77 unsigned long query_cpu;
78 unsigned long flags;
79
80 x2apic_wrmsr_fence();
81
82 local_irq_save(flags);
83 for_each_cpu(query_cpu, mask) {
84 if (query_cpu == this_cpu)
85 continue;
86 __x2apic_send_IPI_dest(
87 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
88 vector, apic->dest_logical);
89 }
90 local_irq_restore(flags);
91} 87}
92 88
93static void x2apic_send_IPI_allbutself(int vector) 89static void x2apic_send_IPI_allbutself(int vector)
94{ 90{
95 unsigned long this_cpu = smp_processor_id(); 91 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
96 unsigned long query_cpu;
97 unsigned long flags;
98
99 x2apic_wrmsr_fence();
100
101 local_irq_save(flags);
102 for_each_online_cpu(query_cpu) {
103 if (query_cpu == this_cpu)
104 continue;
105 __x2apic_send_IPI_dest(
106 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
107 vector, apic->dest_logical);
108 }
109 local_irq_restore(flags);
110} 92}
111 93
112static void x2apic_send_IPI_all(int vector) 94static void x2apic_send_IPI_all(int vector)
113{ 95{
114 x2apic_send_IPI_mask(cpu_online_mask, vector); 96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
115}
116
117static int x2apic_apic_id_registered(void)
118{
119 return 1;
120} 97}
121 98
122static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 99static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -151,43 +128,90 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
151 return per_cpu(x86_cpu_to_logical_apicid, cpu); 128 return per_cpu(x86_cpu_to_logical_apicid, cpu);
152} 129}
153 130
154static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) 131static void init_x2apic_ldr(void)
155{ 132{
156 unsigned int id; 133 unsigned int this_cpu = smp_processor_id();
134 unsigned int cpu;
157 135
158 id = x; 136 per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
159 return id; 137
138 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
139 for_each_online_cpu(cpu) {
140 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
141 continue;
142 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
143 __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
144 }
160} 145}
161 146
162static unsigned long set_apic_id(unsigned int id) 147 /*
148 * At CPU state changes, update the x2apic cluster sibling info.
149 */
150static int __cpuinit
151update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
163{ 152{
164 unsigned long x; 153 unsigned int this_cpu = (unsigned long)hcpu;
154 unsigned int cpu;
155 int err = 0;
156
157 switch (action) {
158 case CPU_UP_PREPARE:
159 if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
160 GFP_KERNEL)) {
161 err = -ENOMEM;
162 } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
163 GFP_KERNEL)) {
164 free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
165 err = -ENOMEM;
166 }
167 break;
168 case CPU_UP_CANCELED:
169 case CPU_UP_CANCELED_FROZEN:
170 case CPU_DEAD:
171 for_each_online_cpu(cpu) {
172 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
173 continue;
174 __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
175 __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
176 }
177 free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
178 free_cpumask_var(per_cpu(ipi_mask, this_cpu));
179 break;
180 }
165 181
166 x = id; 182 return notifier_from_errno(err);
167 return x;
168} 183}
169 184
170static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) 185static struct notifier_block __refdata x2apic_cpu_notifier = {
171{ 186 .notifier_call = update_clusterinfo,
172 return initial_apicid >> index_msb; 187};
173}
174 188
175static void x2apic_send_IPI_self(int vector) 189static int x2apic_init_cpu_notifier(void)
176{ 190{
177 apic_write(APIC_SELF_IPI, vector); 191 int cpu = smp_processor_id();
192
193 zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
194 zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
195
196 BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
197
198 __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
199 register_hotcpu_notifier(&x2apic_cpu_notifier);
200 return 1;
178} 201}
179 202
180static void init_x2apic_ldr(void) 203static int x2apic_cluster_probe(void)
181{ 204{
182 int cpu = smp_processor_id(); 205 if (x2apic_mode)
183 206 return x2apic_init_cpu_notifier();
184 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); 207 else
208 return 0;
185} 209}
186 210
187struct apic apic_x2apic_cluster = { 211static struct apic apic_x2apic_cluster = {
188 212
189 .name = "cluster x2apic", 213 .name = "cluster x2apic",
190 .probe = NULL, 214 .probe = x2apic_cluster_probe,
191 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 215 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
192 .apic_id_registered = x2apic_apic_id_registered, 216 .apic_id_registered = x2apic_apic_id_registered,
193 217
@@ -206,18 +230,16 @@ struct apic apic_x2apic_cluster = {
206 .ioapic_phys_id_map = NULL, 230 .ioapic_phys_id_map = NULL,
207 .setup_apic_routing = NULL, 231 .setup_apic_routing = NULL,
208 .multi_timer_check = NULL, 232 .multi_timer_check = NULL,
209 .apicid_to_node = NULL,
210 .cpu_to_logical_apicid = NULL,
211 .cpu_present_to_apicid = default_cpu_present_to_apicid, 233 .cpu_present_to_apicid = default_cpu_present_to_apicid,
212 .apicid_to_cpu_present = NULL, 234 .apicid_to_cpu_present = NULL,
213 .setup_portio_remap = NULL, 235 .setup_portio_remap = NULL,
214 .check_phys_apicid_present = default_check_phys_apicid_present, 236 .check_phys_apicid_present = default_check_phys_apicid_present,
215 .enable_apic_mode = NULL, 237 .enable_apic_mode = NULL,
216 .phys_pkg_id = x2apic_cluster_phys_pkg_id, 238 .phys_pkg_id = x2apic_phys_pkg_id,
217 .mps_oem_check = NULL, 239 .mps_oem_check = NULL,
218 240
219 .get_apic_id = x2apic_cluster_phys_get_apic_id, 241 .get_apic_id = x2apic_get_apic_id,
220 .set_apic_id = set_apic_id, 242 .set_apic_id = x2apic_set_apic_id,
221 .apic_id_mask = 0xFFFFFFFFu, 243 .apic_id_mask = 0xFFFFFFFFu,
222 244
223 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 245 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -242,3 +264,5 @@ struct apic apic_x2apic_cluster = {
242 .wait_icr_idle = native_x2apic_wait_icr_idle, 264 .wait_icr_idle = native_x2apic_wait_icr_idle,
243 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, 265 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
244}; 266};
267
268apic_driver(apic_x2apic_cluster);
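Note on the x2apic_cluster.c hunk above: the rewritten __x2apic_send_IPI_mask() sends one IPI per x2APIC cluster instead of one per CPU. For each CPU still left in a private copy of the mask it ORs the logical IDs of every requested CPU in the same cluster into a single destination (skipping the sender in the ALLBUT case), sends once, and then removes that whole cluster from the working mask. A simplified userspace sketch of the grouping, with plain bitmasks standing in for cpumasks and made-up logical IDs (cluster number in bits 16 and up, one member bit per CPU below):

#include <stdint.h>
#include <stdio.h>

#define NCPUS 8

/* Illustrative logical x2APIC IDs: cluster in bits 16+, member bit below. */
static const uint32_t logical_apicid[NCPUS] = {
	0x00010001, 0x00010002, 0x00010004,	/* CPUs 0-2: cluster 1 */
	0x00020001, 0x00020002,			/* CPUs 3-4: cluster 2 */
	0x00030001, 0x00030002, 0x00030004,	/* CPUs 5-7: cluster 3 */
};

static uint32_t cluster_of(int cpu)
{
	return logical_apicid[cpu] >> 16;
}

/* Send one "IPI" per cluster covering every CPU set in mask. */
static void send_ipi_mask(uint32_t mask)
{
	while (mask) {
		int first = __builtin_ctz(mask); /* lowest remaining CPU (GCC/Clang builtin) */
		uint32_t cluster = cluster_of(first);
		uint32_t dest = 0, covered = 0;

		/* Collect every requested CPU that shares this cluster. */
		for (int i = 0; i < NCPUS; i++) {
			if ((mask & (1u << i)) && cluster_of(i) == cluster) {
				dest |= logical_apicid[i] & 0xFFFF;
				covered |= 1u << i;
			}
		}
		printf("IPI to cluster %u, member bits %#x\n", cluster, dest);

		mask &= ~covered;	/* don't IPI these CPUs a second time */
	}
}

int main(void)
{
	send_ipi_mask(0xFF);	/* all eight toy CPUs: three IPIs, not eight */
	return 0;
}

The per-cluster sibling masks maintained by the hotplug notifier in the hunk serve the same purpose as the cluster_of() scan here, only precomputed so the send path stays cheap.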
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 8972f38c5ced..f5373dfde21e 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,11 +7,12 @@
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8 8
9#include <asm/smp.h> 9#include <asm/smp.h>
10#include <asm/apic.h> 10#include <asm/x2apic.h>
11#include <asm/ipi.h>
12 11
13int x2apic_phys; 12int x2apic_phys;
14 13
14static struct apic apic_x2apic_phys;
15
15static int set_x2apic_phys_mode(char *arg) 16static int set_x2apic_phys_mode(char *arg)
16{ 17{
17 x2apic_phys = 1; 18 x2apic_phys = 1;
@@ -27,94 +28,46 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
27 return 0; 28 return 0;
28} 29}
29 30
30/* 31static void
31 * need to use more than cpu 0, because we need more vectors when 32__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
32 * MSI-X are used.
33 */
34static const struct cpumask *x2apic_target_cpus(void)
35{
36 return cpu_online_mask;
37}
38
39static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
40{
41 cpumask_clear(retmask);
42 cpumask_set_cpu(cpu, retmask);
43}
44
45static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
46 unsigned int dest)
47{
48 unsigned long cfg;
49
50 cfg = __prepare_ICR(0, vector, dest);
51
52 /*
53 * send the IPI.
54 */
55 native_x2apic_icr_write(cfg, apicid);
56}
57
58static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
59{ 33{
60 unsigned long query_cpu; 34 unsigned long query_cpu;
35 unsigned long this_cpu;
61 unsigned long flags; 36 unsigned long flags;
62 37
63 x2apic_wrmsr_fence(); 38 x2apic_wrmsr_fence();
64 39
65 local_irq_save(flags); 40 local_irq_save(flags);
41
42 this_cpu = smp_processor_id();
66 for_each_cpu(query_cpu, mask) { 43 for_each_cpu(query_cpu, mask) {
44 if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
45 continue;
67 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 46 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
68 vector, APIC_DEST_PHYSICAL); 47 vector, APIC_DEST_PHYSICAL);
69 } 48 }
70 local_irq_restore(flags); 49 local_irq_restore(flags);
71} 50}
72 51
52static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
53{
54 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
55}
56
73static void 57static void
74 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 58 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
75{ 59{
76 unsigned long this_cpu = smp_processor_id(); 60 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
77 unsigned long query_cpu;
78 unsigned long flags;
79
80 x2apic_wrmsr_fence();
81
82 local_irq_save(flags);
83 for_each_cpu(query_cpu, mask) {
84 if (query_cpu != this_cpu)
85 __x2apic_send_IPI_dest(
86 per_cpu(x86_cpu_to_apicid, query_cpu),
87 vector, APIC_DEST_PHYSICAL);
88 }
89 local_irq_restore(flags);
90} 61}
91 62
92static void x2apic_send_IPI_allbutself(int vector) 63static void x2apic_send_IPI_allbutself(int vector)
93{ 64{
94 unsigned long this_cpu = smp_processor_id(); 65 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
95 unsigned long query_cpu;
96 unsigned long flags;
97
98 x2apic_wrmsr_fence();
99
100 local_irq_save(flags);
101 for_each_online_cpu(query_cpu) {
102 if (query_cpu == this_cpu)
103 continue;
104 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
105 vector, APIC_DEST_PHYSICAL);
106 }
107 local_irq_restore(flags);
108} 66}
109 67
110static void x2apic_send_IPI_all(int vector) 68static void x2apic_send_IPI_all(int vector)
111{ 69{
112 x2apic_send_IPI_mask(cpu_online_mask, vector); 70 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
113}
114
115static int x2apic_apic_id_registered(void)
116{
117 return 1;
118} 71}
119 72
120static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 73static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -149,34 +102,22 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
149 return per_cpu(x86_cpu_to_apicid, cpu); 102 return per_cpu(x86_cpu_to_apicid, cpu);
150} 103}
151 104
152static unsigned int x2apic_phys_get_apic_id(unsigned long x) 105static void init_x2apic_ldr(void)
153{
154 return x;
155}
156
157static unsigned long set_apic_id(unsigned int id)
158{
159 return id;
160}
161
162static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
163{ 106{
164 return initial_apicid >> index_msb;
165} 107}
166 108
167static void x2apic_send_IPI_self(int vector) 109static int x2apic_phys_probe(void)
168{ 110{
169 apic_write(APIC_SELF_IPI, vector); 111 if (x2apic_mode && x2apic_phys)
170} 112 return 1;
171 113
172static void init_x2apic_ldr(void) 114 return apic == &apic_x2apic_phys;
173{
174} 115}
175 116
176struct apic apic_x2apic_phys = { 117static struct apic apic_x2apic_phys = {
177 118
178 .name = "physical x2apic", 119 .name = "physical x2apic",
179 .probe = NULL, 120 .probe = x2apic_phys_probe,
180 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 121 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
181 .apic_id_registered = x2apic_apic_id_registered, 122 .apic_id_registered = x2apic_apic_id_registered,
182 123
@@ -195,8 +136,6 @@ struct apic apic_x2apic_phys = {
195 .ioapic_phys_id_map = NULL, 136 .ioapic_phys_id_map = NULL,
196 .setup_apic_routing = NULL, 137 .setup_apic_routing = NULL,
197 .multi_timer_check = NULL, 138 .multi_timer_check = NULL,
198 .apicid_to_node = NULL,
199 .cpu_to_logical_apicid = NULL,
200 .cpu_present_to_apicid = default_cpu_present_to_apicid, 139 .cpu_present_to_apicid = default_cpu_present_to_apicid,
201 .apicid_to_cpu_present = NULL, 140 .apicid_to_cpu_present = NULL,
202 .setup_portio_remap = NULL, 141 .setup_portio_remap = NULL,
@@ -205,8 +144,8 @@ struct apic apic_x2apic_phys = {
205 .phys_pkg_id = x2apic_phys_pkg_id, 144 .phys_pkg_id = x2apic_phys_pkg_id,
206 .mps_oem_check = NULL, 145 .mps_oem_check = NULL,
207 146
208 .get_apic_id = x2apic_phys_get_apic_id, 147 .get_apic_id = x2apic_get_apic_id,
209 .set_apic_id = set_apic_id, 148 .set_apic_id = x2apic_set_apic_id,
210 .apic_id_mask = 0xFFFFFFFFu, 149 .apic_id_mask = 0xFFFFFFFFu,
211 150
212 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 151 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -231,3 +170,5 @@ struct apic apic_x2apic_phys = {
231 .wait_icr_idle = native_x2apic_wait_icr_idle, 170 .wait_icr_idle = native_x2apic_wait_icr_idle,
232 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, 171 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
233}; 172};
173
174apic_driver(apic_x2apic_phys);
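Note on the x2apic_phys.c hunk above: three nearly identical send loops collapse into one __x2apic_send_IPI_mask() helper that takes a destination flag and skips the sending CPU only in the ALLBUT case. A small sketch of that consolidation pattern (the enum and names are illustrative, not the kernel's):

#include <stdio.h>

enum dest_mode { DEST_ALLINC, DEST_ALLBUT };

static void send_one(int cpu, int vector)
{
	printf("IPI vector %#x -> cpu %d\n", vector, cpu);
}

/* One helper replaces separate "mask", "all-but-self" and "all" loops. */
static void send_mask(const int *cpus, int ncpus, int vector,
		      enum dest_mode mode, int this_cpu)
{
	for (int i = 0; i < ncpus; i++) {
		if (mode == DEST_ALLBUT && cpus[i] == this_cpu)
			continue;
		send_one(cpus[i], vector);
	}
}

int main(void)
{
	int online[] = { 0, 1, 2, 3 };

	send_mask(online, 4, 0xfd, DEST_ALLINC, 0);	/* includes cpu 0 */
	send_mask(online, 4, 0xfd, DEST_ALLBUT, 0);	/* skips cpu 0    */
	return 0;
}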
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index f744f54cb248..adc66c3a1fef 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
@@ -23,6 +23,8 @@
23#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/pci.h> 24#include <linux/pci.h>
25#include <linux/kdebug.h> 25#include <linux/kdebug.h>
26#include <linux/delay.h>
27#include <linux/crash_dump.h>
26 28
27#include <asm/uv/uv_mmrs.h> 29#include <asm/uv/uv_mmrs.h>
28#include <asm/uv/uv_hub.h> 30#include <asm/uv/uv_hub.h>
@@ -34,6 +36,14 @@
34#include <asm/ipi.h> 36#include <asm/ipi.h>
35#include <asm/smp.h> 37#include <asm/smp.h>
36#include <asm/x86_init.h> 38#include <asm/x86_init.h>
39#include <asm/emergency-restart.h>
40#include <asm/nmi.h>
41
 42/* The BMC sets a bit in this MMR to non-zero before sending an NMI */
43#define UVH_NMI_MMR UVH_SCRATCH5
44#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8)
45#define UV_NMI_PENDING_MASK (1UL << 63)
46DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
37 47
38DEFINE_PER_CPU(int, x2apic_extra_bits); 48DEFINE_PER_CPU(int, x2apic_extra_bits);
39 49
@@ -41,10 +51,25 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
41 51
42static enum uv_system_type uv_system_type; 52static enum uv_system_type uv_system_type;
43static u64 gru_start_paddr, gru_end_paddr; 53static u64 gru_start_paddr, gru_end_paddr;
54static union uvh_apicid uvh_apicid;
44int uv_min_hub_revision_id; 55int uv_min_hub_revision_id;
45EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); 56EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
57unsigned int uv_apicid_hibits;
58EXPORT_SYMBOL_GPL(uv_apicid_hibits);
46static DEFINE_SPINLOCK(uv_nmi_lock); 59static DEFINE_SPINLOCK(uv_nmi_lock);
47 60
61static struct apic apic_x2apic_uv_x;
62
63static unsigned long __init uv_early_read_mmr(unsigned long addr)
64{
65 unsigned long val, *mmr;
66
67 mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
68 val = *mmr;
69 early_iounmap(mmr, sizeof(*mmr));
70 return val;
71}
72
48static inline bool is_GRU_range(u64 start, u64 end) 73static inline bool is_GRU_range(u64 start, u64 end)
49{ 74{
50 return start >= gru_start_paddr && end <= gru_end_paddr; 75 return start >= gru_start_paddr && end <= gru_end_paddr;
@@ -55,27 +80,63 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
55 return is_ISA_range(start, end) || is_GRU_range(start, end); 80 return is_ISA_range(start, end) || is_GRU_range(start, end);
56} 81}
57 82
58static int early_get_nodeid(void) 83static int __init early_get_pnodeid(void)
59{ 84{
60 union uvh_node_id_u node_id; 85 union uvh_node_id_u node_id;
61 unsigned long *mmr; 86 union uvh_rh_gam_config_mmr_u m_n_config;
62 87 int pnode;
63 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
64 node_id.v = *mmr;
65 early_iounmap(mmr, sizeof(*mmr));
66 88
67 /* Currently, all blades have same revision number */ 89 /* Currently, all blades have same revision number */
90 node_id.v = uv_early_read_mmr(UVH_NODE_ID);
91 m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
68 uv_min_hub_revision_id = node_id.s.revision; 92 uv_min_hub_revision_id = node_id.s.revision;
69 93
70 return node_id.s.node_id; 94 if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
95 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
96
97 uv_hub_info->hub_revision = uv_min_hub_revision_id;
98 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
99 return pnode;
100}
101
102static void __init early_get_apic_pnode_shift(void)
103{
104 uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
105 if (!uvh_apicid.v)
106 /*
107 * Old bios, use default value
108 */
109 uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT;
110}
111
112/*
113 * Add an extra bit as dictated by bios to the destination apicid of
114 * interrupts potentially passing through the UV HUB. This prevents
115 * a deadlock between interrupts and IO port operations.
116 */
117static void __init uv_set_apicid_hibit(void)
118{
119 union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
120
121 if (is_uv1_hub()) {
122 apicid_mask.v =
123 uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
124 uv_apicid_hibits =
125 apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
126 }
71} 127}
72 128
73static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 129static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
74{ 130{
75 int nodeid; 131 int pnodeid, is_uv1, is_uv2;
76 132
77 if (!strcmp(oem_id, "SGI")) { 133 is_uv1 = !strcmp(oem_id, "SGI");
78 nodeid = early_get_nodeid(); 134 is_uv2 = !strcmp(oem_id, "SGI2");
135 if (is_uv1 || is_uv2) {
136 uv_hub_info->hub_revision =
137 is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
138 pnodeid = early_get_pnodeid();
139 early_get_apic_pnode_shift();
79 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 140 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
80 x86_platform.nmi_init = uv_nmi_init; 141 x86_platform.nmi_init = uv_nmi_init;
81 if (!strcmp(oem_table_id, "UVL")) 142 if (!strcmp(oem_table_id, "UVL"))
@@ -83,9 +144,10 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
83 else if (!strcmp(oem_table_id, "UVX")) 144 else if (!strcmp(oem_table_id, "UVX"))
84 uv_system_type = UV_X2APIC; 145 uv_system_type = UV_X2APIC;
85 else if (!strcmp(oem_table_id, "UVH")) { 146 else if (!strcmp(oem_table_id, "UVH")) {
86 __get_cpu_var(x2apic_extra_bits) = 147 __this_cpu_write(x2apic_extra_bits,
87 nodeid << (UV_APIC_PNODE_SHIFT - 1); 148 pnodeid << uvh_apicid.s.pnode_shift);
88 uv_system_type = UV_NON_UNIQUE_APIC; 149 uv_system_type = UV_NON_UNIQUE_APIC;
150 uv_set_apicid_hibit();
89 return 1; 151 return 1;
90 } 152 }
91 } 153 }
@@ -139,6 +201,7 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri
139 int pnode; 201 int pnode;
140 202
141 pnode = uv_apicid_to_pnode(phys_apicid); 203 pnode = uv_apicid_to_pnode(phys_apicid);
204 phys_apicid |= uv_apicid_hibits;
142 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 205 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
143 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 206 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
144 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 207 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
@@ -220,7 +283,7 @@ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
220 int cpu = cpumask_first(cpumask); 283 int cpu = cpumask_first(cpumask);
221 284
222 if ((unsigned)cpu < nr_cpu_ids) 285 if ((unsigned)cpu < nr_cpu_ids)
223 return per_cpu(x86_cpu_to_apicid, cpu); 286 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
224 else 287 else
225 return BAD_APICID; 288 return BAD_APICID;
226} 289}
@@ -239,7 +302,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
239 if (cpumask_test_cpu(cpu, cpu_online_mask)) 302 if (cpumask_test_cpu(cpu, cpu_online_mask))
240 break; 303 break;
241 } 304 }
242 return per_cpu(x86_cpu_to_apicid, cpu); 305 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
243} 306}
244 307
245static unsigned int x2apic_get_apic_id(unsigned long x) 308static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -247,7 +310,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x)
247 unsigned int id; 310 unsigned int id;
248 311
249 WARN_ON(preemptible() && num_online_cpus() > 1); 312 WARN_ON(preemptible() && num_online_cpus() > 1);
250 id = x | __get_cpu_var(x2apic_extra_bits); 313 id = x | __this_cpu_read(x2apic_extra_bits);
251 314
252 return id; 315 return id;
253} 316}
@@ -277,10 +340,15 @@ static void uv_send_IPI_self(int vector)
277 apic_write(APIC_SELF_IPI, vector); 340 apic_write(APIC_SELF_IPI, vector);
278} 341}
279 342
280struct apic __refdata apic_x2apic_uv_x = { 343static int uv_probe(void)
344{
345 return apic == &apic_x2apic_uv_x;
346}
347
348static struct apic __refdata apic_x2apic_uv_x = {
281 349
282 .name = "UV large system", 350 .name = "UV large system",
283 .probe = NULL, 351 .probe = uv_probe,
284 .acpi_madt_oem_check = uv_acpi_madt_oem_check, 352 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
285 .apic_id_registered = uv_apic_id_registered, 353 .apic_id_registered = uv_apic_id_registered,
286 354
@@ -299,8 +367,6 @@ struct apic __refdata apic_x2apic_uv_x = {
299 .ioapic_phys_id_map = NULL, 367 .ioapic_phys_id_map = NULL,
300 .setup_apic_routing = NULL, 368 .setup_apic_routing = NULL,
301 .multi_timer_check = NULL, 369 .multi_timer_check = NULL,
302 .apicid_to_node = NULL,
303 .cpu_to_logical_apicid = NULL,
304 .cpu_present_to_apicid = default_cpu_present_to_apicid, 370 .cpu_present_to_apicid = default_cpu_present_to_apicid,
305 .apicid_to_cpu_present = NULL, 371 .apicid_to_cpu_present = NULL,
306 .setup_portio_remap = NULL, 372 .setup_portio_remap = NULL,
@@ -339,7 +405,7 @@ struct apic __refdata apic_x2apic_uv_x = {
339 405
340static __cpuinit void set_x2apic_extra_bits(int pnode) 406static __cpuinit void set_x2apic_extra_bits(int pnode)
341{ 407{
342 __get_cpu_var(x2apic_extra_bits) = (pnode << 6); 408 __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
343} 409}
344 410
345/* 411/*
@@ -363,14 +429,14 @@ struct redir_addr {
363#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 429#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
364 430
365static __initdata struct redir_addr redir_addrs[] = { 431static __initdata struct redir_addr redir_addrs[] = {
366 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, 432 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR},
367 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, 433 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR},
368 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, 434 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR},
369}; 435};
370 436
371static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) 437static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
372{ 438{
373 union uvh_si_alias0_overlay_config_u alias; 439 union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
374 union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; 440 union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
375 int i; 441 int i;
376 442
@@ -430,12 +496,19 @@ static __init void map_mmr_high(int max_pnode)
430static __init void map_mmioh_high(int max_pnode) 496static __init void map_mmioh_high(int max_pnode)
431{ 497{
432 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 498 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
433 int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; 499 int shift;
434 500
435 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 501 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
436 if (mmioh.s.enable) 502 if (is_uv1_hub() && mmioh.s1.enable) {
437 map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io, 503 shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
504 map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io,
438 max_pnode, map_uc); 505 max_pnode, map_uc);
506 }
507 if (is_uv2_hub() && mmioh.s2.enable) {
508 shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
509 map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io,
510 max_pnode, map_uc);
511 }
439} 512}
440 513
441static __init void map_low_mmrs(void) 514static __init void map_low_mmrs(void)
@@ -559,14 +632,14 @@ late_initcall(uv_init_heartbeat);
559 632
560/* Direct Legacy VGA I/O traffic to designated IOH */ 633/* Direct Legacy VGA I/O traffic to designated IOH */
561int uv_set_vga_state(struct pci_dev *pdev, bool decode, 634int uv_set_vga_state(struct pci_dev *pdev, bool decode,
562 unsigned int command_bits, bool change_bridge) 635 unsigned int command_bits, u32 flags)
563{ 636{
564 int domain, bus, rc; 637 int domain, bus, rc;
565 638
566 PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n", 639 PR_DEVEL("devfn %x decode %d cmd %x flags %d\n",
567 pdev->devfn, decode, command_bits, change_bridge); 640 pdev->devfn, decode, command_bits, flags);
568 641
569 if (!change_bridge) 642 if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
570 return 0; 643 return 0;
571 644
572 if ((command_bits & PCI_COMMAND_IO) == 0) 645 if ((command_bits & PCI_COMMAND_IO) == 0)
@@ -602,18 +675,46 @@ void __cpuinit uv_cpu_init(void)
602 */ 675 */
603int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) 676int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
604{ 677{
605 if (reason != DIE_NMI_IPI) 678 unsigned long real_uv_nmi;
679 int bid;
680
681 if (reason != DIE_NMIUNKNOWN)
606 return NOTIFY_OK; 682 return NOTIFY_OK;
607 683
608 if (in_crash_kexec) 684 if (in_crash_kexec)
609 /* do nothing if entering the crash kernel */ 685 /* do nothing if entering the crash kernel */
610 return NOTIFY_OK; 686 return NOTIFY_OK;
687
688 /*
689 * Each blade has an MMR that indicates when an NMI has been sent
690 * to cpus on the blade. If an NMI is detected, atomically
691 * clear the MMR and update a per-blade NMI count used to
692 * cause each cpu on the blade to notice a new NMI.
693 */
694 bid = uv_numa_blade_id();
695 real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
696
697 if (unlikely(real_uv_nmi)) {
698 spin_lock(&uv_blade_info[bid].nmi_lock);
699 real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
700 if (real_uv_nmi) {
701 uv_blade_info[bid].nmi_count++;
702 uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
703 }
704 spin_unlock(&uv_blade_info[bid].nmi_lock);
705 }
706
707 if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
708 return NOTIFY_DONE;
709
710 __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
711
611 /* 712 /*
612 * Use a lock so only one cpu prints at a time 713 * Use a lock so only one cpu prints at a time.
613 * to prevent intermixed output. 714 * This prevents intermixed output.
614 */ 715 */
615 spin_lock(&uv_nmi_lock); 716 spin_lock(&uv_nmi_lock);
616 pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); 717 pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
617 dump_stack(); 718 dump_stack();
618 spin_unlock(&uv_nmi_lock); 719 spin_unlock(&uv_nmi_lock);
619 720
@@ -621,7 +722,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
621} 722}
622 723
623static struct notifier_block uv_dump_stack_nmi_nb = { 724static struct notifier_block uv_dump_stack_nmi_nb = {
624 .notifier_call = uv_handle_nmi 725 .notifier_call = uv_handle_nmi,
726 .priority = NMI_LOCAL_LOW_PRIOR - 1,
625}; 727};
626 728
627void uv_register_nmi_notifier(void) 729void uv_register_nmi_notifier(void)
@@ -644,28 +746,34 @@ void uv_nmi_init(void)
644 746
645void __init uv_system_init(void) 747void __init uv_system_init(void)
646{ 748{
647 union uvh_si_addr_map_config_u m_n_config; 749 union uvh_rh_gam_config_mmr_u m_n_config;
750 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
648 union uvh_node_id_u node_id; 751 union uvh_node_id_u node_id;
649 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 752 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
650 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 753 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
651 int gnode_extra, max_pnode = 0; 754 int gnode_extra, max_pnode = 0;
652 unsigned long mmr_base, present, paddr; 755 unsigned long mmr_base, present, paddr;
653 unsigned short pnode_mask; 756 unsigned short pnode_mask, pnode_io_mask;
654 757
758 printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2");
655 map_low_mmrs(); 759 map_low_mmrs();
656 760
657 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 761 m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
658 m_val = m_n_config.s.m_skt; 762 m_val = m_n_config.s.m_skt;
659 n_val = m_n_config.s.n_skt; 763 n_val = m_n_config.s.n_skt;
764 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
765 n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
660 mmr_base = 766 mmr_base =
661 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 767 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
662 ~UV_MMR_ENABLE; 768 ~UV_MMR_ENABLE;
663 pnode_mask = (1 << n_val) - 1; 769 pnode_mask = (1 << n_val) - 1;
770 pnode_io_mask = (1 << n_io) - 1;
771
664 node_id.v = uv_read_local_mmr(UVH_NODE_ID); 772 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
665 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; 773 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
666 gnode_upper = ((unsigned long)gnode_extra << m_val); 774 gnode_upper = ((unsigned long)gnode_extra << m_val);
667 printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", 775 printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
668 n_val, m_val, gnode_upper, gnode_extra); 776 n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
669 777
670 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 778 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
671 779
@@ -675,8 +783,9 @@ void __init uv_system_init(void)
675 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); 783 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
676 784
677 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 785 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
678 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 786 uv_blade_info = kzalloc(bytes, GFP_KERNEL);
679 BUG_ON(!uv_blade_info); 787 BUG_ON(!uv_blade_info);
788
680 for (blade = 0; blade < uv_num_possible_blades(); blade++) 789 for (blade = 0; blade < uv_num_possible_blades(); blade++)
681 uv_blade_info[blade].memory_nid = -1; 790 uv_blade_info[blade].memory_nid = -1;
682 791
@@ -698,10 +807,11 @@ void __init uv_system_init(void)
698 for (j = 0; j < 64; j++) { 807 for (j = 0; j < 64; j++) {
699 if (!test_bit(j, &present)) 808 if (!test_bit(j, &present))
700 continue; 809 continue;
701 pnode = (i * 64 + j); 810 pnode = (i * 64 + j) & pnode_mask;
702 uv_blade_info[blade].pnode = pnode; 811 uv_blade_info[blade].pnode = pnode;
703 uv_blade_info[blade].nr_possible_cpus = 0; 812 uv_blade_info[blade].nr_possible_cpus = 0;
704 uv_blade_info[blade].nr_online_cpus = 0; 813 uv_blade_info[blade].nr_online_cpus = 0;
814 spin_lock_init(&uv_blade_info[blade].nmi_lock);
705 max_pnode = max(pnode, max_pnode); 815 max_pnode = max(pnode, max_pnode);
706 blade++; 816 blade++;
707 } 817 }
@@ -716,6 +826,13 @@ void __init uv_system_init(void)
716 int apicid = per_cpu(x86_cpu_to_apicid, cpu); 826 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
717 827
718 nid = cpu_to_node(cpu); 828 nid = cpu_to_node(cpu);
829 /*
830 * apic_pnode_shift must be set before calling uv_apicid_to_pnode();
831 */
832 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
833 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
834 uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
835
719 pnode = uv_apicid_to_pnode(apicid); 836 pnode = uv_apicid_to_pnode(apicid);
720 blade = boot_pnode_to_blade(pnode); 837 blade = boot_pnode_to_blade(pnode);
721 lcpu = uv_blade_info[blade].nr_possible_cpus; 838 lcpu = uv_blade_info[blade].nr_possible_cpus;
@@ -731,7 +848,6 @@ void __init uv_system_init(void)
731 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 848 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
732 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; 849 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
733 uv_cpu_hub_info(cpu)->pnode = pnode; 850 uv_cpu_hub_info(cpu)->pnode = pnode;
734 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
735 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; 851 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
736 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 852 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
737 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 853 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
@@ -755,7 +871,7 @@ void __init uv_system_init(void)
755 871
756 map_gru_high(max_pnode); 872 map_gru_high(max_pnode);
757 map_mmr_high(max_pnode); 873 map_mmr_high(max_pnode);
758 map_mmioh_high(max_pnode); 874 map_mmioh_high(max_pnode & pnode_io_mask);
759 875
760 uv_cpu_init(); 876 uv_cpu_init();
761 uv_scir_register_cpu_notifier(); 877 uv_scir_register_cpu_notifier();
@@ -764,4 +880,13 @@ void __init uv_system_init(void)
764 880
765 /* register Legacy VGA I/O redirection handler */ 881 /* register Legacy VGA I/O redirection handler */
766 pci_register_set_vga_state(uv_set_vga_state); 882 pci_register_set_vga_state(uv_set_vga_state);
883
884 /*
885 * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
886 * EFI is not enabled in the kdump kernel.
887 */
888 if (is_kdump_kernel())
889 reboot_type = BOOT_ACPI;
767} 890}
891
892apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 4c9c67bf09b7..965a7666c283 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -66,7 +66,7 @@
66 * 1.5: Fix segment register reloading (in case of bad segments saved 66 * 1.5: Fix segment register reloading (in case of bad segments saved
67 * across BIOS call). 67 * across BIOS call).
68 * Stephen Rothwell 68 * Stephen Rothwell
69 * 1.6: Cope with complier/assembler differences. 69 * 1.6: Cope with compiler/assembler differences.
70 * Only try to turn off the first display device. 70 * Only try to turn off the first display device.
71 * Fix OOPS at power off with no APM BIOS by Jan Echternach 71 * Fix OOPS at power off with no APM BIOS by Jan Echternach
72 * <echter@informatik.uni-rostock.de> 72 * <echter@informatik.uni-rostock.de>
@@ -189,8 +189,8 @@
189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. 189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
190 * 190 *
191 * [This document is available free from Intel by calling 800.628.8686 (fax 191 * [This document is available free from Intel by calling 800.628.8686 (fax
192 * 916.356.6100) or 800.548.4725; or via anonymous ftp from 192 * 916.356.6100) or 800.548.4725; or from
193 * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also 193 * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also
194 * available from Microsoft by calling 206.882.8080.] 194 * available from Microsoft by calling 206.882.8080.]
195 * 195 *
196 * APM 1.2 Reference: 196 * APM 1.2 Reference:
@@ -227,6 +227,8 @@
227#include <linux/suspend.h> 227#include <linux/suspend.h>
228#include <linux/kthread.h> 228#include <linux/kthread.h>
229#include <linux/jiffies.h> 229#include <linux/jiffies.h>
230#include <linux/acpi.h>
231#include <linux/syscore_ops.h>
230 232
231#include <asm/system.h> 233#include <asm/system.h>
232#include <asm/uaccess.h> 234#include <asm/uaccess.h>
@@ -359,6 +361,7 @@ struct apm_user {
359 * idle percentage above which bios idle calls are done 361 * idle percentage above which bios idle calls are done
360 */ 362 */
361#ifdef CONFIG_APM_CPU_IDLE 363#ifdef CONFIG_APM_CPU_IDLE
364#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
362#define DEFAULT_IDLE_THRESHOLD 95 365#define DEFAULT_IDLE_THRESHOLD 95
363#else 366#else
364#define DEFAULT_IDLE_THRESHOLD 100 367#define DEFAULT_IDLE_THRESHOLD 100
@@ -902,6 +905,7 @@ static void apm_cpu_idle(void)
902 unsigned int jiffies_since_last_check = jiffies - last_jiffies; 905 unsigned int jiffies_since_last_check = jiffies - last_jiffies;
903 unsigned int bucket; 906 unsigned int bucket;
904 907
908 WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
905recalc: 909recalc:
906 if (jiffies_since_last_check > IDLE_CALC_LIMIT) { 910 if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
907 use_apm_idle = 0; 911 use_apm_idle = 0;
@@ -975,20 +979,10 @@ recalc:
975 979
976static void apm_power_off(void) 980static void apm_power_off(void)
977{ 981{
978 unsigned char po_bios_call[] = {
979 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
980 0x8e, 0xd0, /* movw ax,ss */
981 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
982 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
983 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
984 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
985 0xcd, 0x15 /* int $0x15 */
986 };
987
988 /* Some bioses don't like being called from CPU != 0 */ 982 /* Some bioses don't like being called from CPU != 0 */
989 if (apm_info.realmode_power_off) { 983 if (apm_info.realmode_power_off) {
990 set_cpus_allowed_ptr(current, cpumask_of(0)); 984 set_cpus_allowed_ptr(current, cpumask_of(0));
991 machine_real_restart(po_bios_call, sizeof(po_bios_call)); 985 machine_real_restart(MRR_APM);
992 } else { 986 } else {
993 (void)set_system_power_state(APM_STATE_OFF); 987 (void)set_system_power_state(APM_STATE_OFF);
994 } 988 }
@@ -1246,7 +1240,7 @@ static int suspend(int vetoable)
1246 dpm_suspend_noirq(PMSG_SUSPEND); 1240 dpm_suspend_noirq(PMSG_SUSPEND);
1247 1241
1248 local_irq_disable(); 1242 local_irq_disable();
1249 sysdev_suspend(PMSG_SUSPEND); 1243 syscore_suspend();
1250 1244
1251 local_irq_enable(); 1245 local_irq_enable();
1252 1246
@@ -1264,7 +1258,7 @@ static int suspend(int vetoable)
1264 apm_error("suspend", err); 1258 apm_error("suspend", err);
1265 err = (err == APM_SUCCESS) ? 0 : -EIO; 1259 err = (err == APM_SUCCESS) ? 0 : -EIO;
1266 1260
1267 sysdev_resume(); 1261 syscore_resume();
1268 local_irq_enable(); 1262 local_irq_enable();
1269 1263
1270 dpm_resume_noirq(PMSG_RESUME); 1264 dpm_resume_noirq(PMSG_RESUME);
@@ -1288,7 +1282,7 @@ static void standby(void)
1288 dpm_suspend_noirq(PMSG_SUSPEND); 1282 dpm_suspend_noirq(PMSG_SUSPEND);
1289 1283
1290 local_irq_disable(); 1284 local_irq_disable();
1291 sysdev_suspend(PMSG_SUSPEND); 1285 syscore_suspend();
1292 local_irq_enable(); 1286 local_irq_enable();
1293 1287
1294 err = set_system_power_state(APM_STATE_STANDBY); 1288 err = set_system_power_state(APM_STATE_STANDBY);
@@ -1296,7 +1290,7 @@ static void standby(void)
1296 apm_error("standby", err); 1290 apm_error("standby", err);
1297 1291
1298 local_irq_disable(); 1292 local_irq_disable();
1299 sysdev_resume(); 1293 syscore_resume();
1300 local_irq_enable(); 1294 local_irq_enable();
1301 1295
1302 dpm_resume_noirq(PMSG_RESUME); 1296 dpm_resume_noirq(PMSG_RESUME);
@@ -1926,6 +1920,7 @@ static const struct file_operations apm_bios_fops = {
1926 .unlocked_ioctl = do_ioctl, 1920 .unlocked_ioctl = do_ioctl,
1927 .open = do_open, 1921 .open = do_open,
1928 .release = do_release, 1922 .release = do_release,
1923 .llseek = noop_llseek,
1929}; 1924};
1930 1925
1931static struct miscdevice apm_device = { 1926static struct miscdevice apm_device = {
@@ -2330,12 +2325,11 @@ static int __init apm_init(void)
2330 apm_info.disabled = 1; 2325 apm_info.disabled = 1;
2331 return -ENODEV; 2326 return -ENODEV;
2332 } 2327 }
2333 if (pm_flags & PM_ACPI) { 2328 if (!acpi_disabled) {
2334 printk(KERN_NOTICE "apm: overridden by ACPI.\n"); 2329 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2335 apm_info.disabled = 1; 2330 apm_info.disabled = 1;
2336 return -ENODEV; 2331 return -ENODEV;
2337 } 2332 }
2338 pm_flags |= PM_APM;
2339 2333
2340 /* 2334 /*
2341 * Set up the long jump entry point to the APM BIOS, which is called 2335 * Set up the long jump entry point to the APM BIOS, which is called
@@ -2427,7 +2421,6 @@ static void __exit apm_exit(void)
2427 kthread_stop(kapmd_task); 2421 kthread_stop(kapmd_task);
2428 kapmd_task = NULL; 2422 kapmd_task = NULL;
2429 } 2423 }
2430 pm_flags &= ~PM_APM;
2431} 2424}
2432 2425
2433module_init(apm_init); 2426module_init(apm_init);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cfa82c899f47..4f13fafc5264 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -1,5 +1,70 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/hardirq.h>
12#include <linux/suspend.h>
13#include <linux/kbuild.h>
14#include <asm/processor.h>
15#include <asm/thread_info.h>
16#include <asm/sigframe.h>
17#include <asm/bootparam.h>
18#include <asm/suspend.h>
19
20#ifdef CONFIG_XEN
21#include <xen/interface/xen.h>
22#endif
23
1#ifdef CONFIG_X86_32 24#ifdef CONFIG_X86_32
2# include "asm-offsets_32.c" 25# include "asm-offsets_32.c"
3#else 26#else
4# include "asm-offsets_64.c" 27# include "asm-offsets_64.c"
5#endif 28#endif
29
30void common(void) {
31 BLANK();
32 OFFSET(TI_flags, thread_info, flags);
33 OFFSET(TI_status, thread_info, status);
34 OFFSET(TI_addr_limit, thread_info, addr_limit);
35 OFFSET(TI_preempt_count, thread_info, preempt_count);
36
37 BLANK();
38 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
39
40 BLANK();
41 OFFSET(pbe_address, pbe, address);
42 OFFSET(pbe_orig_address, pbe, orig_address);
43 OFFSET(pbe_next, pbe, next);
44
45#ifdef CONFIG_PARAVIRT
46 BLANK();
47 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
48 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
49 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
50 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
51 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
52 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
53 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
54 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
55 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
56#endif
57
58#ifdef CONFIG_XEN
59 BLANK();
60 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
61 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
62#endif
63
64 BLANK();
65 OFFSET(BP_scratch, boot_params, scratch);
66 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
68 OFFSET(BP_version, boot_params, hdr.version);
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70}
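[Editor's aside: the OFFSET()/DEFINE() helpers consolidated into common() above come from include/linux/kbuild.h; they work by emitting marker strings into the compiler's assembly output, which the build then post-processes into the generated asm-offsets header, exactly as the file's own comment says ("raw asm output which is post-processed"). A minimal user-space sketch of that mechanism follows; the struct layout and symbol names here are illustrative, not the kernel's.]

    #include <stddef.h>

    /* A stand-in structure; the kernel uses thread_info, pt_regs, etc. */
    struct thread_info {
            unsigned long flags;
            int preempt_count;
    };

    /* Emit "->SYM VALUE comment" markers into the compiler's asm output. */
    #define DEFINE(sym, val) \
            asm volatile("\n->" #sym " %0 " #val : : "i" (val))
    #define BLANK() asm volatile("\n->" : : )
    #define OFFSET(sym, str, mem) DEFINE(sym, offsetof(struct str, mem))

    void common_offsets(void)
    {
            BLANK();
            OFFSET(TI_flags, thread_info, flags);
            OFFSET(TI_preempt_count, thread_info, preempt_count);
    }

    /*
     * Compiling this with "gcc -S" and grepping for "->" yields lines such as
     * "->TI_flags $0 offsetof(struct thread_info, flags)", which a small sed
     * pass rewrites into "#define TI_flags 0" in the generated header.
     */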
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dfdbf6403895..c29d631af6fc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,26 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/signal.h>
10#include <linux/personality.h>
11#include <linux/suspend.h>
12#include <linux/kbuild.h>
13#include <asm/ucontext.h> 1#include <asm/ucontext.h>
14#include <asm/sigframe.h>
15#include <asm/pgtable.h>
16#include <asm/fixmap.h>
17#include <asm/processor.h>
18#include <asm/thread_info.h>
19#include <asm/bootparam.h>
20#include <asm/elf.h>
21#include <asm/suspend.h>
22
23#include <xen/interface/xen.h>
24 2
25#include <linux/lguest.h> 3#include <linux/lguest.h>
26#include "../../../drivers/lguest/lg.h" 4#include "../../../drivers/lguest/lg.h"
@@ -51,21 +29,10 @@ void foo(void)
51 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); 29 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
52 BLANK(); 30 BLANK();
53 31
54 OFFSET(TI_task, thread_info, task);
55 OFFSET(TI_exec_domain, thread_info, exec_domain);
56 OFFSET(TI_flags, thread_info, flags);
57 OFFSET(TI_status, thread_info, status);
58 OFFSET(TI_preempt_count, thread_info, preempt_count);
59 OFFSET(TI_addr_limit, thread_info, addr_limit);
60 OFFSET(TI_restart_block, thread_info, restart_block);
61 OFFSET(TI_sysenter_return, thread_info, sysenter_return); 32 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
62 OFFSET(TI_cpu, thread_info, cpu); 33 OFFSET(TI_cpu, thread_info, cpu);
63 BLANK(); 34 BLANK();
64 35
65 OFFSET(GDS_size, desc_ptr, size);
66 OFFSET(GDS_address, desc_ptr, address);
67 BLANK();
68
69 OFFSET(PT_EBX, pt_regs, bx); 36 OFFSET(PT_EBX, pt_regs, bx);
70 OFFSET(PT_ECX, pt_regs, cx); 37 OFFSET(PT_ECX, pt_regs, cx);
71 OFFSET(PT_EDX, pt_regs, dx); 38 OFFSET(PT_EDX, pt_regs, dx);
@@ -85,44 +52,13 @@ void foo(void)
85 OFFSET(PT_OLDSS, pt_regs, ss); 52 OFFSET(PT_OLDSS, pt_regs, ss);
86 BLANK(); 53 BLANK();
87 54
88 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
89 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 55 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
90 BLANK(); 56 BLANK();
91 57
92 OFFSET(pbe_address, pbe, address);
93 OFFSET(pbe_orig_address, pbe, orig_address);
94 OFFSET(pbe_next, pbe, next);
95
96 /* Offset from the sysenter stack to tss.sp0 */ 58 /* Offset from the sysenter stack to tss.sp0 */
97 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 59 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
98 sizeof(struct tss_struct)); 60 sizeof(struct tss_struct));
99 61
100 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
101 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
102 DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
103 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
104 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
105
106 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
107
108#ifdef CONFIG_PARAVIRT
109 BLANK();
110 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
111 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
112 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
113 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
114 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
115 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
116 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
117 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
118#endif
119
120#ifdef CONFIG_XEN
121 BLANK();
122 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
123 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
124#endif
125
126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 62#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
127 BLANK(); 63 BLANK();
128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
@@ -141,11 +77,4 @@ void foo(void)
141 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 77 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
142 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 78 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
143#endif 79#endif
144
145 BLANK();
146 OFFSET(BP_scratch, boot_params, scratch);
147 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
148 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
149 OFFSET(BP_version, boot_params, hdr.version);
150 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
151} 80}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4a6aeedcd965..e72a1194af22 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,27 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/errno.h>
12#include <linux/hardirq.h>
13#include <linux/suspend.h>
14#include <linux/kbuild.h>
15#include <asm/processor.h>
16#include <asm/segment.h>
17#include <asm/thread_info.h>
18#include <asm/ia32.h> 1#include <asm/ia32.h>
19#include <asm/bootparam.h>
20#include <asm/suspend.h>
21
22#include <xen/interface/xen.h>
23
24#include <asm/sigframe.h>
25 2
26#define __NO_STUBS 1 3#define __NO_STUBS 1
27#undef __SYSCALL 4#undef __SYSCALL
@@ -33,41 +10,19 @@ static char syscalls[] = {
33 10
34int main(void) 11int main(void)
35{ 12{
36#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
37 ENTRY(state);
38 ENTRY(flags);
39 ENTRY(pid);
40 BLANK();
41#undef ENTRY
42#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
43 ENTRY(flags);
44 ENTRY(addr_limit);
45 ENTRY(preempt_count);
46 ENTRY(status);
47#ifdef CONFIG_IA32_EMULATION
48 ENTRY(sysenter_return);
49#endif
50 BLANK();
51#undef ENTRY
52#ifdef CONFIG_PARAVIRT 13#ifdef CONFIG_PARAVIRT
53 BLANK();
54 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
55 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
56 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
57 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
58 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
59 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); 14 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
60 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
61 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); 15 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
62 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); 16 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
63 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
64 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); 17 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
65 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); 18 BLANK();
66#endif 19#endif
67 20
68
69#ifdef CONFIG_IA32_EMULATION 21#ifdef CONFIG_IA32_EMULATION
70#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) 22 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
23 BLANK();
24
25#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
71 ENTRY(ax); 26 ENTRY(ax);
72 ENTRY(bx); 27 ENTRY(bx);
73 ENTRY(cx); 28 ENTRY(cx);
@@ -79,15 +34,12 @@ int main(void)
79 ENTRY(ip); 34 ENTRY(ip);
80 BLANK(); 35 BLANK();
81#undef ENTRY 36#undef ENTRY
82 DEFINE(IA32_RT_SIGFRAME_sigcontext, 37
83 offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); 38 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
84 BLANK(); 39 BLANK();
85#endif 40#endif
86 DEFINE(pbe_address, offsetof(struct pbe, address)); 41
87 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); 42#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
88 DEFINE(pbe_next, offsetof(struct pbe, next));
89 BLANK();
90#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
91 ENTRY(bx); 43 ENTRY(bx);
92 ENTRY(bx); 44 ENTRY(bx);
93 ENTRY(cx); 45 ENTRY(cx);
@@ -107,7 +59,8 @@ int main(void)
107 ENTRY(flags); 59 ENTRY(flags);
108 BLANK(); 60 BLANK();
109#undef ENTRY 61#undef ENTRY
110#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) 62
63#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
111 ENTRY(cr0); 64 ENTRY(cr0);
112 ENTRY(cr2); 65 ENTRY(cr2);
113 ENTRY(cr3); 66 ENTRY(cr3);
@@ -115,26 +68,11 @@ int main(void)
115 ENTRY(cr8); 68 ENTRY(cr8);
116 BLANK(); 69 BLANK();
117#undef ENTRY 70#undef ENTRY
118 DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
119 BLANK();
120 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
121 BLANK();
122 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
123 71
72 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
124 BLANK(); 73 BLANK();
125 OFFSET(BP_scratch, boot_params, scratch);
126 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
127 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
128 OFFSET(BP_version, boot_params, hdr.version);
129 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
130 74
131 BLANK(); 75 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
132 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 76
133#ifdef CONFIG_XEN
134 BLANK();
135 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
136 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
137#undef ENTRY
138#endif
139 return 0; 77 return 0;
140} 78}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
deleted file mode 100644
index 8bc57baaa9ad..000000000000
--- a/arch/x86/kernel/bios_uv.c
+++ /dev/null
@@ -1,215 +0,0 @@
1/*
2 * BIOS run time interface routines.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson <rja@sgi.com>
20 */
21
22#include <linux/efi.h>
23#include <asm/efi.h>
24#include <linux/io.h>
25#include <asm/uv/bios.h>
26#include <asm/uv/uv_hub.h>
27
28static struct uv_systab uv_systab;
29
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{
32 struct uv_systab *tab = &uv_systab;
33 s64 ret;
34
35 if (!tab->function)
36 /*
37 * BIOS does not support UV systab
38 */
39 return BIOS_STATUS_UNIMPLEMENTED;
40
41 ret = efi_call6((void *)__va(tab->function), (u64)which,
42 a1, a2, a3, a4, a5);
43 return ret;
44}
45EXPORT_SYMBOL_GPL(uv_bios_call);
46
47s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
48 u64 a4, u64 a5)
49{
50 unsigned long bios_flags;
51 s64 ret;
52
53 local_irq_save(bios_flags);
54 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
55 local_irq_restore(bios_flags);
56
57 return ret;
58}
59
60s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
61 u64 a4, u64 a5)
62{
63 s64 ret;
64
65 preempt_disable();
66 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
67 preempt_enable();
68
69 return ret;
70}
71
72
73long sn_partition_id;
74EXPORT_SYMBOL_GPL(sn_partition_id);
75long sn_coherency_id;
76EXPORT_SYMBOL_GPL(sn_coherency_id);
77long sn_region_size;
78EXPORT_SYMBOL_GPL(sn_region_size);
79long system_serial_number;
80EXPORT_SYMBOL_GPL(system_serial_number);
81int uv_type;
82EXPORT_SYMBOL_GPL(uv_type);
83
84
85s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
86 long *region, long *ssn)
87{
88 s64 ret;
89 u64 v0, v1;
90 union partition_info_u part;
91
92 ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
93 (u64)(&v0), (u64)(&v1), 0, 0);
94 if (ret != BIOS_STATUS_SUCCESS)
95 return ret;
96
97 part.val = v0;
98 if (uvtype)
99 *uvtype = part.hub_version;
100 if (partid)
101 *partid = part.partition_id;
102 if (coher)
103 *coher = part.coherence_id;
104 if (region)
105 *region = part.region_size;
106 if (ssn)
107 *ssn = v1;
108 return ret;
109}
110EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
111
112int
113uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
114 unsigned long *intr_mmr_offset)
115{
116 u64 watchlist;
117 s64 ret;
118
119 /*
120 * bios returns watchlist number or negative error number.
121 */
122 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
123 mq_size, (u64)intr_mmr_offset,
124 (u64)&watchlist, 0);
125 if (ret < BIOS_STATUS_SUCCESS)
126 return ret;
127
128 return watchlist;
129}
130EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc);
131
132int
133uv_bios_mq_watchlist_free(int blade, int watchlist_num)
134{
135 return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE,
136 blade, watchlist_num, 0, 0, 0);
137}
138EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free);
139
140s64
141uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms)
142{
143 return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len,
144 perms, 0, 0);
145}
146EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
147
148s64
149uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
150{
151 s64 ret;
152
153 ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
154 (u64)addr, buf, (u64)len, 0);
155 return ret;
156}
157EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
158
159s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
160{
161 return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
162 (u64)ticks_per_second, 0, 0, 0);
163}
164EXPORT_SYMBOL_GPL(uv_bios_freq_base);
165
166/*
167 * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
168 * @decode: true to enable target, false to disable target
169 * @domain: PCI domain number
170 * @bus: PCI bus number
171 *
172 * Returns:
173 * 0: Success
174 * -EINVAL: Invalid domain or bus number
175 * -ENOSYS: Capability not available
176 * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
177 */
178int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
179{
180 return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
181 (u64)decode, (u64)domain, (u64)bus, 0, 0);
182}
183EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
184
185
186#ifdef CONFIG_EFI
187void uv_bios_init(void)
188{
189 struct uv_systab *tab;
190
191 if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
192 (efi.uv_systab == (unsigned long)NULL)) {
193 printk(KERN_CRIT "No EFI UV System Table.\n");
194 uv_systab.function = (unsigned long)NULL;
195 return;
196 }
197
198 tab = (struct uv_systab *)ioremap(efi.uv_systab,
199 sizeof(struct uv_systab));
200 if (strncmp(tab->signature, "UVST", 4) != 0)
201 printk(KERN_ERR "bad signature in UV system table!");
202
203 /*
204 * Copy table to permanent spot for later use.
205 */
206 memcpy(&uv_systab, tab, sizeof(struct uv_systab));
207 iounmap(tab);
208
209 printk(KERN_INFO "EFI UV System Table Revision %d\n",
210 uv_systab.revision);
211}
212#else /* !CONFIG_EFI */
213
214void uv_bios_init(void) { }
215#endif
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index fc999e6fc46a..452932d34730 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -2,7 +2,8 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/kthread.h> 3#include <linux/kthread.h>
4#include <linux/workqueue.h> 4#include <linux/workqueue.h>
5#include <asm/e820.h> 5#include <linux/memblock.h>
6
6#include <asm/proto.h> 7#include <asm/proto.h>
7 8
8/* 9/*
@@ -18,10 +19,12 @@ static int __read_mostly memory_corruption_check = -1;
18static unsigned __read_mostly corruption_check_size = 64*1024; 19static unsigned __read_mostly corruption_check_size = 64*1024;
19static unsigned __read_mostly corruption_check_period = 60; /* seconds */ 20static unsigned __read_mostly corruption_check_period = 60; /* seconds */
20 21
21static struct e820entry scan_areas[MAX_SCAN_AREAS]; 22static struct scan_area {
23 u64 addr;
24 u64 size;
25} scan_areas[MAX_SCAN_AREAS];
22static int num_scan_areas; 26static int num_scan_areas;
23 27
24
25static __init int set_corruption_check(char *arg) 28static __init int set_corruption_check(char *arg)
26{ 29{
27 char *end; 30 char *end;
@@ -81,9 +84,9 @@ void __init setup_bios_corruption_check(void)
81 84
82 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { 85 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
83 u64 size; 86 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE); 87 addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE);
85 88
86 if (!(addr + 1)) 89 if (addr == MEMBLOCK_ERROR)
87 break; 90 break;
88 91
89 if (addr >= corruption_check_size) 92 if (addr >= corruption_check_size)
@@ -92,7 +95,7 @@ void __init setup_bios_corruption_check(void)
92 if ((addr + size) > corruption_check_size) 95 if ((addr + size) > corruption_check_size)
93 size = corruption_check_size - addr; 96 size = corruption_check_size - addr;
94 97
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED); 98 memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
96 scan_areas[num_scan_areas].addr = addr; 99 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size; 100 scan_areas[num_scan_areas].size = size;
98 num_scan_areas++; 101 num_scan_areas++;
@@ -103,9 +106,8 @@ void __init setup_bios_corruption_check(void)
103 addr += size; 106 addr += size;
104 } 107 }
105 108
106 printk(KERN_INFO "Scanning %d areas for low memory corruption\n", 109 if (num_scan_areas)
107 num_scan_areas); 110 printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas);
108 update_e820();
109} 111}
110 112
111 113
@@ -141,12 +143,12 @@ static void check_corruption(struct work_struct *dummy)
141{ 143{
142 check_for_bios_corruption(); 144 check_for_bios_corruption();
143 schedule_delayed_work(&bios_check_work, 145 schedule_delayed_work(&bios_check_work,
144 round_jiffies_relative(corruption_check_period*HZ)); 146 round_jiffies_relative(corruption_check_period*HZ));
145} 147}
146 148
147static int start_periodic_check_for_corruption(void) 149static int start_periodic_check_for_corruption(void)
148{ 150{
149 if (!memory_corruption_check || corruption_check_period == 0) 151 if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0)
150 return 0; 152 return 0;
151 153
152 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", 154 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3f0ebe429a01..6042981d0309 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
30 30
31obj-$(CONFIG_X86_MCE) += mcheck/ 31obj-$(CONFIG_X86_MCE) += mcheck/
32obj-$(CONFIG_MTRR) += mtrr/ 32obj-$(CONFIG_MTRR) += mtrr/
33obj-$(CONFIG_CPU_FREQ) += cpufreq/
34 33
35obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 34obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
36 35
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ba5f62f45f01..b13ed393dfce 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
148{ 148{
149#ifdef CONFIG_SMP 149#ifdef CONFIG_SMP
150 /* calling is from identify_secondary_cpu() ? */ 150 /* calling is from identify_secondary_cpu() ? */
151 if (c->cpu_index == boot_cpu_id) 151 if (!c->cpu_index)
152 return; 152 return;
153 153
154 /* 154 /*
@@ -233,18 +233,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
233} 233}
234#endif 234#endif
235 235
236#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 236#ifdef CONFIG_NUMA
237/*
238 * To workaround broken NUMA config. Read the comment in
239 * srat_detect_node().
240 */
237static int __cpuinit nearby_node(int apicid) 241static int __cpuinit nearby_node(int apicid)
238{ 242{
239 int i, node; 243 int i, node;
240 244
241 for (i = apicid - 1; i >= 0; i--) { 245 for (i = apicid - 1; i >= 0; i--) {
242 node = apicid_to_node[i]; 246 node = __apicid_to_node[i];
243 if (node != NUMA_NO_NODE && node_online(node)) 247 if (node != NUMA_NO_NODE && node_online(node))
244 return node; 248 return node;
245 } 249 }
246 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { 250 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
247 node = apicid_to_node[i]; 251 node = __apicid_to_node[i];
248 if (node != NUMA_NO_NODE && node_online(node)) 252 if (node != NUMA_NO_NODE && node_online(node))
249 return node; 253 return node;
250 } 254 }
@@ -253,37 +257,55 @@ static int __cpuinit nearby_node(int apicid)
253#endif 257#endif
254 258
255/* 259/*
256 * Fixup core topology information for AMD multi-node processors. 260 * Fixup core topology information for
257 * Assumption: Number of cores in each internal node is the same. 261 * (1) AMD multi-node processors
262 * Assumption: Number of cores in each internal node is the same.
263 * (2) AMD processors supporting compute units
258 */ 264 */
259#ifdef CONFIG_X86_HT 265#ifdef CONFIG_X86_HT
260static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) 266static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
261{ 267{
262 unsigned long long value; 268 u32 nodes, cores_per_cu = 1;
263 u32 nodes, cores_per_node; 269 u8 node_id;
264 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
265 271
266 if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) 272 /* get information required for multi-node processors */
267 return; 273 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
268 274 u32 eax, ebx, ecx, edx;
269 /* fixup topology information only once for a core */ 275
270 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 276 cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
277 nodes = ((ecx >> 8) & 7) + 1;
278 node_id = ecx & 7;
279
280 /* get compute unit information */
281 smp_num_siblings = ((ebx >> 8) & 3) + 1;
282 c->compute_unit_id = ebx & 0xff;
283 cores_per_cu += ((ebx >> 8) & 3);
284 } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
285 u64 value;
286
287 rdmsrl(MSR_FAM10H_NODE_ID, value);
288 nodes = ((value >> 3) & 7) + 1;
289 node_id = value & 7;
290 } else
271 return; 291 return;
272 292
273 rdmsrl(MSR_FAM10H_NODE_ID, value); 293 /* fixup multi-node processor information */
274 294 if (nodes > 1) {
275 nodes = ((value >> 3) & 7) + 1; 295 u32 cores_per_node;
276 if (nodes == 1) 296 u32 cus_per_node;
277 return;
278 297
279 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 298 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
280 cores_per_node = c->x86_max_cores / nodes; 299 cores_per_node = c->x86_max_cores / nodes;
300 cus_per_node = cores_per_node / cores_per_cu;
281 301
282 /* store NodeID, use llc_shared_map to store sibling info */ 302 /* store NodeID, use llc_shared_map to store sibling info */
283 per_cpu(cpu_llc_id, cpu) = value & 7; 303 per_cpu(cpu_llc_id, cpu) = node_id;
284 304
285 /* fixup core id to be in range from 0 to (cores_per_node - 1) */ 305 /* core id has to be in the [0 .. cores_per_node - 1] range */
286 c->cpu_core_id = c->cpu_core_id % cores_per_node; 306 c->cpu_core_id %= cores_per_node;
307 c->compute_unit_id %= cus_per_node;
308 }
287} 309}
288#endif 310#endif
289 311
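[Editor's aside: the compute-unit fields that the new amd_get_topology() reads from CPUID leaf 0x8000001e can be inspected from user space with the same bit arithmetic. A hedged sketch follows; the field widths are taken from the hunk above, the program itself is illustrative and requires a CPU advertising X86_FEATURE_TOPOEXT, otherwise the leaf is unavailable.]

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* __get_cpuid() returns 0 if leaf 0x8000001e is not supported. */
            if (!__get_cpuid(0x8000001e, &eax, &ebx, &ecx, &edx))
                    return 1;

            printf("nodes per processor    : %u\n", ((ecx >> 8) & 7) + 1);
            printf("node id                : %u\n", ecx & 7);
            printf("cores per compute unit : %u\n", ((ebx >> 8) & 3) + 1);
            printf("compute unit id        : %u\n", ebx & 0xff);
            return 0;
    }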
@@ -304,9 +326,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
304 c->phys_proc_id = c->initial_apicid >> bits; 326 c->phys_proc_id = c->initial_apicid >> bits;
305 /* use socket ID also for last level cache */ 327 /* use socket ID also for last level cache */
306 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; 328 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
307 /* fixup topology information on multi-node processors */ 329 amd_get_topology(c);
308 if ((c->x86 == 0x10) && (c->x86_model == 9))
309 amd_fixup_dcm(c);
310#endif 330#endif
311} 331}
312 332
@@ -322,31 +342,40 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id);
322 342
323static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 343static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
324{ 344{
325#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 345#ifdef CONFIG_NUMA
326 int cpu = smp_processor_id(); 346 int cpu = smp_processor_id();
327 int node; 347 int node;
328 unsigned apicid = c->apicid; 348 unsigned apicid = c->apicid;
329 349
330 node = per_cpu(cpu_llc_id, cpu); 350 node = numa_cpu_node(cpu);
351 if (node == NUMA_NO_NODE)
352 node = per_cpu(cpu_llc_id, cpu);
331 353
332 if (apicid_to_node[apicid] != NUMA_NO_NODE)
333 node = apicid_to_node[apicid];
334 if (!node_online(node)) { 354 if (!node_online(node)) {
335 /* Two possibilities here: 355 /*
336 - The CPU is missing memory and no node was created. 356 * Two possibilities here:
337 In that case try picking one from a nearby CPU 357 *
338 - The APIC IDs differ from the HyperTransport node IDs 358 * - The CPU is missing memory and no node was created. In
339 which the K8 northbridge parsing fills in. 359 * that case try picking one from a nearby CPU.
340 Assume they are all increased by a constant offset, 360 *
341 but in the same order as the HT nodeids. 361 * - The APIC IDs differ from the HyperTransport node IDs
342 If that doesn't result in a usable node fall back to the 362 * which the K8 northbridge parsing fills in. Assume
343 path for the previous case. */ 363 * they are all increased by a constant offset, but in
344 364 * the same order as the HT nodeids. If that doesn't
365 * result in a usable node fall back to the path for the
366 * previous case.
367 *
368 * This workaround operates directly on the mapping between
369 * APIC ID and NUMA node, assuming certain relationship
370 * between APIC ID, HT node ID and NUMA topology. As going
371 * through CPU mapping may alter the outcome, directly
372 * access __apicid_to_node[].
373 */
345 int ht_nodeid = c->initial_apicid; 374 int ht_nodeid = c->initial_apicid;
346 375
347 if (ht_nodeid >= 0 && 376 if (ht_nodeid >= 0 &&
348 apicid_to_node[ht_nodeid] != NUMA_NO_NODE) 377 __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
349 node = apicid_to_node[ht_nodeid]; 378 node = __apicid_to_node[ht_nodeid];
350 /* Pick a nearby node */ 379 /* Pick a nearby node */
351 if (!node_online(node)) 380 if (!node_online(node))
352 node = nearby_node(apicid); 381 node = nearby_node(apicid);
@@ -412,6 +441,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
412 set_cpu_cap(c, X86_FEATURE_EXTD_APICID); 441 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
413 } 442 }
414#endif 443#endif
444
445 /* We need to do the following only once */
446 if (c != &boot_cpu_data)
447 return;
448
449 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
450
451 if (c->x86 > 0x10 ||
452 (c->x86 == 0x10 && c->x86_model >= 0x2)) {
453 u64 val;
454
455 rdmsrl(MSR_K7_HWCR, val);
456 if (!(val & BIT(24)))
457 printk(KERN_WARNING FW_BUG "TSC doesn't count "
458 "with P0 frequency!\n");
459 }
460 }
415} 461}
416 462
417static void __cpuinit init_amd(struct cpuinfo_x86 *c) 463static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -523,7 +569,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
523#endif 569#endif
524 570
525 if (c->extended_cpuid_level >= 0x80000006) { 571 if (c->extended_cpuid_level >= 0x80000006) {
526 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) 572 if (cpuid_edx(0x80000006) & 0xf000)
527 num_cache_leaves = 4; 573 num_cache_leaves = 4;
528 else 574 else
529 num_cache_leaves = 3; 575 num_cache_leaves = 3;
@@ -565,6 +611,35 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
565 } 611 }
566 } 612 }
567#endif 613#endif
614
615 /*
616 * Family 0x12 and above processors have APIC timer
617 * running in deep C states.
618 */
619 if (c->x86 > 0x11)
620 set_cpu_cap(c, X86_FEATURE_ARAT);
621
622 /*
623 * Disable GART TLB Walk Errors on Fam10h. We do this here
624 * because this is always needed when GART is enabled, even in a
625 * kernel which has no MCE support built in.
626 */
627 if (c->x86 == 0x10) {
628 /*
629 * BIOS should disable GartTlbWlk Errors itself. If 
630 * it doesn't, do it here, as suggested by the BKDG. 
631 *
632 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
633 */
634 u64 mask;
635 int err;
636
637 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
638 if (err == 0) {
639 mask |= (1 << 10);
640 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
641 }
642 }
568} 643}
569 644
570#ifdef CONFIG_X86_32 645#ifdef CONFIG_X86_32
@@ -639,7 +714,7 @@ EXPORT_SYMBOL_GPL(amd_erratum_383);
639 714
640bool cpu_has_amd_erratum(const int *erratum) 715bool cpu_has_amd_erratum(const int *erratum)
641{ 716{
642 struct cpuinfo_x86 *cpu = &current_cpu_data; 717 struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info);
643 int osvw_id = *erratum++; 718 int osvw_id = *erratum++;
644 u32 range; 719 u32 range;
645 u32 ms; 720 u32 ms;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c39576cb3018..525514cf33c3 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -19,6 +19,7 @@
19 19
20static int __init no_halt(char *s) 20static int __init no_halt(char *s)
21{ 21{
22 WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
22 boot_cpu_data.hlt_works_ok = 0; 23 boot_cpu_data.hlt_works_ok = 0;
23 return 1; 24 return 1;
24} 25}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f2f9ac7da25c..22a073d7fbff 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
254} 254}
255#endif 255#endif
256 256
257static int disable_smep __cpuinitdata;
258static __init int setup_disable_smep(char *arg)
259{
260 disable_smep = 1;
261 return 1;
262}
263__setup("nosmep", setup_disable_smep);
264
265static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
266{
267 if (cpu_has(c, X86_FEATURE_SMEP)) {
268 if (unlikely(disable_smep)) {
269 setup_clear_cpu_cap(X86_FEATURE_SMEP);
270 clear_in_cr4(X86_CR4_SMEP);
271 } else
272 set_in_cr4(X86_CR4_SMEP);
273 }
274}
275
257/* 276/*
258 * Some CPU features depend on higher CPUID levels, which may not always 277 * Some CPU features depend on higher CPUID levels, which may not always
259 * be available due to CPUID level capping or broken virtualization 278 * be available due to CPUID level capping or broken virtualization
@@ -458,13 +477,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
458 if (smp_num_siblings <= 1) 477 if (smp_num_siblings <= 1)
459 goto out; 478 goto out;
460 479
461 if (smp_num_siblings > nr_cpu_ids) {
462 pr_warning("CPU: Unsupported number of siblings %d",
463 smp_num_siblings);
464 smp_num_siblings = 1;
465 return;
466 }
467
468 index_msb = get_count_order(smp_num_siblings); 480 index_msb = get_count_order(smp_num_siblings);
469 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); 481 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
470 482
@@ -565,8 +577,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
565 577
566 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); 578 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
567 579
568 if (eax > 0) 580 c->x86_capability[9] = ebx;
569 c->x86_capability[9] = ebx;
570 } 581 }
571 582
572 /* AMD-defined flags: level 0x80000001 */ 583 /* AMD-defined flags: level 0x80000001 */
@@ -665,9 +676,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
665 this_cpu->c_early_init(c); 676 this_cpu->c_early_init(c);
666 677
667#ifdef CONFIG_SMP 678#ifdef CONFIG_SMP
668 c->cpu_index = boot_cpu_id; 679 c->cpu_index = 0;
669#endif 680#endif
670 filter_cpuid_features(c, false); 681 filter_cpuid_features(c, false);
682
683 setup_smep(c);
671} 684}
672 685
673void __init early_cpu_init(void) 686void __init early_cpu_init(void)
@@ -675,7 +688,7 @@ void __init early_cpu_init(void)
675 const struct cpu_dev *const *cdev; 688 const struct cpu_dev *const *cdev;
676 int count = 0; 689 int count = 0;
677 690
678#ifdef PROCESSOR_SELECT 691#ifdef CONFIG_PROCESSOR_SELECT
679 printk(KERN_INFO "KERNEL supported cpus:\n"); 692 printk(KERN_INFO "KERNEL supported cpus:\n");
680#endif 693#endif
681 694
@@ -687,7 +700,7 @@ void __init early_cpu_init(void)
687 cpu_devs[count] = cpudev; 700 cpu_devs[count] = cpudev;
688 count++; 701 count++;
689 702
690#ifdef PROCESSOR_SELECT 703#ifdef CONFIG_PROCESSOR_SELECT
691 { 704 {
692 unsigned int j; 705 unsigned int j;
693 706
@@ -704,16 +717,21 @@ void __init early_cpu_init(void)
704} 717}
705 718
706/* 719/*
707 * The NOPL instruction is supposed to exist on all CPUs with 720 * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
708 * family >= 6; unfortunately, that's not true in practice because 721 * unfortunately, that's not true in practice because of early VIA
709 * of early VIA chips and (more importantly) broken virtualizers that 722 * chips and (more importantly) broken virtualizers that are not easy
710 * are not easy to detect. In the latter case it doesn't even *fail* 723 * to detect. In the latter case it doesn't even *fail* reliably, so
711 * reliably, so probing for it doesn't even work. Disable it completely 724 * probing for it doesn't even work. Disable it completely on 32-bit
712 * unless we can find a reliable way to detect all the broken cases. 725 * unless we can find a reliable way to detect all the broken cases.
726 * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
713 */ 727 */
714static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) 728static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
715{ 729{
730#ifdef CONFIG_X86_32
716 clear_cpu_cap(c, X86_FEATURE_NOPL); 731 clear_cpu_cap(c, X86_FEATURE_NOPL);
732#else
733 set_cpu_cap(c, X86_FEATURE_NOPL);
734#endif
717} 735}
718 736
719static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 737static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -748,6 +766,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
748#endif 766#endif
749 } 767 }
750 768
769 setup_smep(c);
770
751 get_model_name(c); /* Default name */ 771 get_model_name(c); /* Default name */
752 772
753 detect_nopl(c); 773 detect_nopl(c);
@@ -864,7 +884,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
864 884
865 select_idle_routine(c); 885 select_idle_routine(c);
866 886
867#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 887#ifdef CONFIG_NUMA
868 numa_add_cpu(smp_processor_id()); 888 numa_add_cpu(smp_processor_id());
869#endif 889#endif
870} 890}
@@ -882,14 +902,13 @@ static void vgetcpu_set_mode(void)
882void __init identify_boot_cpu(void) 902void __init identify_boot_cpu(void)
883{ 903{
884 identify_cpu(&boot_cpu_data); 904 identify_cpu(&boot_cpu_data);
885 init_c1e_mask(); 905 init_amd_e400_c1e_mask();
886#ifdef CONFIG_X86_32 906#ifdef CONFIG_X86_32
887 sysenter_setup(); 907 sysenter_setup();
888 enable_sep_cpu(); 908 enable_sep_cpu();
889#else 909#else
890 vgetcpu_set_mode(); 910 vgetcpu_set_mode();
891#endif 911#endif
892 init_hw_perf_events();
893} 912}
894 913
895void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 914void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1264,13 +1283,6 @@ void __cpuinit cpu_init(void)
1264 clear_all_debug_regs(); 1283 clear_all_debug_regs();
1265 dbg_restore_debug_regs(); 1284 dbg_restore_debug_regs();
1266 1285
1267 /*
1268 * Force FPU initialization:
1269 */
1270 current_thread_info()->status = 0;
1271 clear_used_math();
1272 mxcsr_feature_mask_init();
1273
1274 fpu_init(); 1286 fpu_init();
1275 xsave_init(); 1287 xsave_init();
1276} 1288}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f668bb1f7d43..e765633f210e 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,7 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void get_cpu_cap(struct cpuinfo_x86 *c);
35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); 36extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36extern void get_cpu_cap(struct cpuinfo_x86 *c); 37extern void get_cpu_cap(struct cpuinfo_x86 *c);
37 38
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
deleted file mode 100644
index 870e6cc6ad28..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ /dev/null
@@ -1,266 +0,0 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
17 This driver adds support for the PCC interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
27config X86_ACPI_CPUFREQ
28 tristate "ACPI Processor P-States driver"
29 select CPU_FREQ_TABLE
30 depends on ACPI_PROCESSOR
31 help
32 This driver adds a CPUFreq driver which utilizes the ACPI
33 Processor Performance States.
34 This driver also supports Intel Enhanced Speedstep.
35
36 To compile this driver as a module, choose M here: the
37 module will be called acpi-cpufreq.
38
39 For details, take a look at <file:Documentation/cpu-freq/>.
40
41 If in doubt, say N.
42
43config ELAN_CPUFREQ
44 tristate "AMD Elan SC400 and SC410"
45 select CPU_FREQ_TABLE
46 depends on X86_ELAN
47 ---help---
48 This adds the CPUFreq driver for AMD Elan SC400 and SC410
49 processors.
50
51 You need to specify the processor maximum speed as boot
52 parameter: elanfreq=maxspeed (in kHz) or as module
53 parameter "max_freq".
54
55 For details, take a look at <file:Documentation/cpu-freq/>.
56
57 If in doubt, say N.
58
59config SC520_CPUFREQ
60 tristate "AMD Elan SC520"
61 select CPU_FREQ_TABLE
62 depends on X86_ELAN
63 ---help---
64 This adds the CPUFreq driver for AMD Elan SC520 processor.
65
66 For details, take a look at <file:Documentation/cpu-freq/>.
67
68 If in doubt, say N.
69
70
71config X86_POWERNOW_K6
72 tristate "AMD Mobile K6-2/K6-3 PowerNow!"
73 select CPU_FREQ_TABLE
74 depends on X86_32
75 help
76 This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
77 AMD K6-3+ processors.
78
79 For details, take a look at <file:Documentation/cpu-freq/>.
80
81 If in doubt, say N.
82
83config X86_POWERNOW_K7
84 tristate "AMD Mobile Athlon/Duron PowerNow!"
85 select CPU_FREQ_TABLE
86 depends on X86_32
87 help
88 This adds the CPUFreq driver for mobile AMD K7 mobile processors.
89
90 For details, take a look at <file:Documentation/cpu-freq/>.
91
92 If in doubt, say N.
93
94config X86_POWERNOW_K7_ACPI
95 bool
96 depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
97 depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
98 depends on X86_32
99 default y
100
101config X86_POWERNOW_K8
102 tristate "AMD Opteron/Athlon64 PowerNow!"
103 select CPU_FREQ_TABLE
104 depends on ACPI && ACPI_PROCESSOR
105 help
106 This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
107
108 To compile this driver as a module, choose M here: the
109 module will be called powernow-k8.
110
111 For details, take a look at <file:Documentation/cpu-freq/>.
112
113config X86_GX_SUSPMOD
114 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
115 depends on X86_32 && PCI
116 help
117 This add the CPUFreq driver for NatSemi Geode processors which
118 support suspend modulation.
119
120 For details, take a look at <file:Documentation/cpu-freq/>.
121
122 If in doubt, say N.
123
124config X86_SPEEDSTEP_CENTRINO
125 tristate "Intel Enhanced SpeedStep (deprecated)"
126 select CPU_FREQ_TABLE
127 select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32
128 depends on X86_32 || (X86_64 && ACPI_PROCESSOR)
129 help
130 This is deprecated and this functionality is now merged into
131 acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
132 speedstep_centrino.
133 This adds the CPUFreq driver for Enhanced SpeedStep enabled
134 mobile CPUs. This means Intel Pentium M (Centrino) CPUs
135 or 64bit enabled Intel Xeons.
136
137 To compile this driver as a module, choose M here: the
138 module will be called speedstep-centrino.
139
140 For details, take a look at <file:Documentation/cpu-freq/>.
141
142 If in doubt, say N.
143
144config X86_SPEEDSTEP_CENTRINO_TABLE
145 bool "Built-in tables for Banias CPUs"
146 depends on X86_32 && X86_SPEEDSTEP_CENTRINO
147 default y
148 help
149 Use built-in tables for Banias CPUs if ACPI encoding
150 is not available.
151
152 If in doubt, say N.
153
154config X86_SPEEDSTEP_ICH
155 tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
156 select CPU_FREQ_TABLE
157 depends on X86_32
158 help
159 This adds the CPUFreq driver for certain mobile Intel Pentium III
160 (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
161 mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
162 ICH3 or ICH4 southbridge.
163
164 For details, take a look at <file:Documentation/cpu-freq/>.
165
166 If in doubt, say N.
167
168config X86_SPEEDSTEP_SMI
169 tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
170 select CPU_FREQ_TABLE
171 depends on X86_32 && EXPERIMENTAL
172 help
173 This adds the CPUFreq driver for certain mobile Intel Pentium III
174 (Coppermine), all mobile Intel Pentium III-M (Tualatin)
175 on systems which have an Intel 440BX/ZX/MX southbridge.
176
177 For details, take a look at <file:Documentation/cpu-freq/>.
178
179 If in doubt, say N.
180
181config X86_P4_CLOCKMOD
182 tristate "Intel Pentium 4 clock modulation"
183 select CPU_FREQ_TABLE
184 help
185 This adds the CPUFreq driver for Intel Pentium 4 / XEON
186 processors. When enabled it will lower CPU temperature by skipping
187 clocks.
188
189 This driver should be only used in exceptional
190 circumstances when very low power is needed because it causes severe
191 slowdowns and noticeable latencies. Normally Speedstep should be used
192 instead.
193
194 To compile this driver as a module, choose M here: the
195 module will be called p4-clockmod.
196
197 For details, take a look at <file:Documentation/cpu-freq/>.
198
199 Unless you are absolutely sure say N.
200
201config X86_CPUFREQ_NFORCE2
202 tristate "nVidia nForce2 FSB changing"
203 depends on X86_32 && EXPERIMENTAL
204 help
205 This adds the CPUFreq driver for FSB changing on nVidia nForce2
206 platforms.
207
208 For details, take a look at <file:Documentation/cpu-freq/>.
209
210 If in doubt, say N.
211
212config X86_LONGRUN
213 tristate "Transmeta LongRun"
214 depends on X86_32
215 help
216 This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
217 which support LongRun.
218
219 For details, take a look at <file:Documentation/cpu-freq/>.
220
221 If in doubt, say N.
222
223config X86_LONGHAUL
224 tristate "VIA Cyrix III Longhaul"
225 select CPU_FREQ_TABLE
226 depends on X86_32 && ACPI_PROCESSOR
227 help
228 This adds the CPUFreq driver for VIA Samuel/CyrixIII,
229 VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
230 processors.
231
232 For details, take a look at <file:Documentation/cpu-freq/>.
233
234 If in doubt, say N.
235
236config X86_E_POWERSAVER
237 tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
238 select CPU_FREQ_TABLE
239 depends on X86_32 && EXPERIMENTAL
240 help
241 This adds the CPUFreq driver for VIA C7 processors. However, this driver
242 does not have any safeguards to prevent operating the CPU out of spec
243 and is thus considered dangerous. Please use the regular ACPI cpufreq
244 driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
245
246 If in doubt, say N.
247
248comment "shared options"
249
250config X86_SPEEDSTEP_LIB
251 tristate
252 default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
253
254config X86_SPEEDSTEP_RELAXED_CAP_CHECK
255 bool "Relaxed speedstep capability checks"
256 depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
257 help
258 Don't perform all checks for a speedstep capable system which would
259 normally be done. Some ancient or strange systems, though speedstep
260 capable, don't always indicate that they are speedstep capable. This
261 option lets the probing code bypass some of those checks if the
262 parameter "relaxed_check=1" is passed to the module.
263
264endif # CPU_FREQ
265
266endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
deleted file mode 100644
index bd54bf67e6fb..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
1# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod.
4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
11obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
12obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
13obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
14obj-$(CONFIG_X86_LONGRUN) += longrun.o
15obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
16obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
17obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
18obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
19obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
20obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
21obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
deleted file mode 100644
index cd8da247dda1..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ /dev/null
@@ -1,775 +0,0 @@
1/*
2 * acpi-cpufreq.c - ACPI Processor P-States Driver
3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
7 * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
8 *
9 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or (at
14 * your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write to the Free Software Foundation, Inc.,
23 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
24 *
25 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 */
27
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/smp.h>
32#include <linux/sched.h>
33#include <linux/cpufreq.h>
34#include <linux/compiler.h>
35#include <linux/dmi.h>
36#include <linux/slab.h>
37
38#include <linux/acpi.h>
39#include <linux/io.h>
40#include <linux/delay.h>
41#include <linux/uaccess.h>
42
43#include <acpi/processor.h>
44
45#include <asm/msr.h>
46#include <asm/processor.h>
47#include <asm/cpufeature.h>
48#include "mperf.h"
49
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "acpi-cpufreq", msg)
52
53MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
54MODULE_DESCRIPTION("ACPI Processor P-States Driver");
55MODULE_LICENSE("GPL");
56
57enum {
58 UNDEFINED_CAPABLE = 0,
59 SYSTEM_INTEL_MSR_CAPABLE,
60 SYSTEM_IO_CAPABLE,
61};
62
63#define INTEL_MSR_RANGE (0xffff)
64
65struct acpi_cpufreq_data {
66 struct acpi_processor_performance *acpi_data;
67 struct cpufreq_frequency_table *freq_table;
68 unsigned int resume;
69 unsigned int cpu_feature;
70};
71
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
73
74/* acpi_perf_data is a pointer to percpu data. */
75static struct acpi_processor_performance __percpu *acpi_perf_data;
76
77static struct cpufreq_driver acpi_cpufreq_driver;
78
79static unsigned int acpi_pstate_strict;
80
81static int check_est_cpu(unsigned int cpuid)
82{
83 struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
84
85 return cpu_has(cpu, X86_FEATURE_EST);
86}
87
88static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
89{
90 struct acpi_processor_performance *perf;
91 int i;
92
93 perf = data->acpi_data;
94
95 for (i = 0; i < perf->state_count; i++) {
96 if (value == perf->states[i].status)
97 return data->freq_table[i].frequency;
98 }
99 return 0;
100}
101
102static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
103{
104 int i;
105 struct acpi_processor_performance *perf;
106
107 msr &= INTEL_MSR_RANGE;
108 perf = data->acpi_data;
109
110 for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
111 if (msr == perf->states[data->freq_table[i].index].status)
112 return data->freq_table[i].frequency;
113 }
114 return data->freq_table[0].frequency;
115}
116
117static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
118{
119 switch (data->cpu_feature) {
120 case SYSTEM_INTEL_MSR_CAPABLE:
121 return extract_msr(val, data);
122 case SYSTEM_IO_CAPABLE:
123 return extract_io(val, data);
124 default:
125 return 0;
126 }
127}
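/*
 * Editor's note: a standalone userspace sketch (not driver code) of the
 * lookup that extract_msr()/extract_io() perform above. The table contents
 * and the raw status value are hypothetical, and the table here pairs
 * status values directly with frequencies instead of going through
 * cpufreq_frequency_table indices; it only illustrates the masking and the
 * different fallbacks of the two paths.
 */
#include <stdio.h>

struct demo_state { unsigned int status; unsigned int khz; };

static unsigned int demo_extract(unsigned int raw, int is_msr,
				 const struct demo_state *tbl, int n)
{
	int i;

	if (is_msr)
		raw &= 0xffff;			/* INTEL_MSR_RANGE */
	for (i = 0; i < n; i++)
		if (raw == tbl[i].status)
			return tbl[i].khz;
	/* extract_msr() falls back to entry 0, extract_io() to "unknown" */
	return is_msr ? tbl[0].khz : 0;
}

int main(void)
{
	/* hypothetical P-state table: status value -> frequency in kHz */
	const struct demo_state tbl[] = {
		{ 0x0a1f, 2400000 }, { 0x081c, 2000000 }, { 0x0613, 1600000 },
	};

	/* the high MSR bits are ignored, so this still resolves to 1600000 */
	printf("%u kHz\n", demo_extract(0x12340613, 1, tbl, 3));
	return 0;
}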
128
129struct msr_addr {
130 u32 reg;
131};
132
133struct io_addr {
134 u16 port;
135 u8 bit_width;
136};
137
138struct drv_cmd {
139 unsigned int type;
140 const struct cpumask *mask;
141 union {
142 struct msr_addr msr;
143 struct io_addr io;
144 } addr;
145 u32 val;
146};
147
148/* Called via smp_call_function_single(), on the target CPU */
149static void do_drv_read(void *_cmd)
150{
151 struct drv_cmd *cmd = _cmd;
152 u32 h;
153
154 switch (cmd->type) {
155 case SYSTEM_INTEL_MSR_CAPABLE:
156 rdmsr(cmd->addr.msr.reg, cmd->val, h);
157 break;
158 case SYSTEM_IO_CAPABLE:
159 acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
160 &cmd->val,
161 (u32)cmd->addr.io.bit_width);
162 break;
163 default:
164 break;
165 }
166}
167
168/* Called via smp_call_function_many(), on the target CPUs */
169static void do_drv_write(void *_cmd)
170{
171 struct drv_cmd *cmd = _cmd;
172 u32 lo, hi;
173
174 switch (cmd->type) {
175 case SYSTEM_INTEL_MSR_CAPABLE:
176 rdmsr(cmd->addr.msr.reg, lo, hi);
177 lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
178 wrmsr(cmd->addr.msr.reg, lo, hi);
179 break;
180 case SYSTEM_IO_CAPABLE:
181 acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
182 cmd->val,
183 (u32)cmd->addr.io.bit_width);
184 break;
185 default:
186 break;
187 }
188}
189
190static void drv_read(struct drv_cmd *cmd)
191{
192 int err;
193 cmd->val = 0;
194
195 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
196 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
197}
198
199static void drv_write(struct drv_cmd *cmd)
200{
201 int this_cpu;
202
203 this_cpu = get_cpu();
204 if (cpumask_test_cpu(this_cpu, cmd->mask))
205 do_drv_write(cmd);
206 smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
207 put_cpu();
208}
209
210static u32 get_cur_val(const struct cpumask *mask)
211{
212 struct acpi_processor_performance *perf;
213 struct drv_cmd cmd;
214
215 if (unlikely(cpumask_empty(mask)))
216 return 0;
217
218 switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
219 case SYSTEM_INTEL_MSR_CAPABLE:
220 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
221 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
222 break;
223 case SYSTEM_IO_CAPABLE:
224 cmd.type = SYSTEM_IO_CAPABLE;
225 perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
226 cmd.addr.io.port = perf->control_register.address;
227 cmd.addr.io.bit_width = perf->control_register.bit_width;
228 break;
229 default:
230 return 0;
231 }
232
233 cmd.mask = mask;
234 drv_read(&cmd);
235
236 dprintk("get_cur_val = %u\n", cmd.val);
237
238 return cmd.val;
239}
240
241static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
242{
243 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
244 unsigned int freq;
245 unsigned int cached_freq;
246
247 dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
248
249 if (unlikely(data == NULL ||
250 data->acpi_data == NULL || data->freq_table == NULL)) {
251 return 0;
252 }
253
254 cached_freq = data->freq_table[data->acpi_data->state].frequency;
255 freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
256 if (freq != cached_freq) {
257 /*
258 * The dreaded BIOS frequency change behind our back.
259 * Force set the frequency on next target call.
260 */
261 data->resume = 1;
262 }
263
264 dprintk("cur freq = %u\n", freq);
265
266 return freq;
267}
268
269static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
270 struct acpi_cpufreq_data *data)
271{
272 unsigned int cur_freq;
273 unsigned int i;
274
275 for (i = 0; i < 100; i++) {
276 cur_freq = extract_freq(get_cur_val(mask), data);
277 if (cur_freq == freq)
278 return 1;
279 udelay(10);
280 }
281 return 0;
282}
283
284static int acpi_cpufreq_target(struct cpufreq_policy *policy,
285 unsigned int target_freq, unsigned int relation)
286{
287 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
288 struct acpi_processor_performance *perf;
289 struct cpufreq_freqs freqs;
290 struct drv_cmd cmd;
291 unsigned int next_state = 0; /* Index into freq_table */
292 unsigned int next_perf_state = 0; /* Index into perf table */
293 unsigned int i;
294 int result = 0;
295
296 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
297
298 if (unlikely(data == NULL ||
299 data->acpi_data == NULL || data->freq_table == NULL)) {
300 return -ENODEV;
301 }
302
303 perf = data->acpi_data;
304 result = cpufreq_frequency_table_target(policy,
305 data->freq_table,
306 target_freq,
307 relation, &next_state);
308 if (unlikely(result)) {
309 result = -ENODEV;
310 goto out;
311 }
312
313 next_perf_state = data->freq_table[next_state].index;
314 if (perf->state == next_perf_state) {
315 if (unlikely(data->resume)) {
316 dprintk("Called after resume, resetting to P%d\n",
317 next_perf_state);
318 data->resume = 0;
319 } else {
320 dprintk("Already at target state (P%d)\n",
321 next_perf_state);
322 goto out;
323 }
324 }
325
326 switch (data->cpu_feature) {
327 case SYSTEM_INTEL_MSR_CAPABLE:
328 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
329 cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
330 cmd.val = (u32) perf->states[next_perf_state].control;
331 break;
332 case SYSTEM_IO_CAPABLE:
333 cmd.type = SYSTEM_IO_CAPABLE;
334 cmd.addr.io.port = perf->control_register.address;
335 cmd.addr.io.bit_width = perf->control_register.bit_width;
336 cmd.val = (u32) perf->states[next_perf_state].control;
337 break;
338 default:
339 result = -ENODEV;
340 goto out;
341 }
342
343 /* cpufreq holds the hotplug lock, so we are safe from here on */
344 if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
345 cmd.mask = policy->cpus;
346 else
347 cmd.mask = cpumask_of(policy->cpu);
348
349 freqs.old = perf->states[perf->state].core_frequency * 1000;
350 freqs.new = data->freq_table[next_state].frequency;
351 for_each_cpu(i, policy->cpus) {
352 freqs.cpu = i;
353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
354 }
355
356 drv_write(&cmd);
357
358 if (acpi_pstate_strict) {
359 if (!check_freqs(cmd.mask, freqs.new, data)) {
360 dprintk("acpi_cpufreq_target failed (%d)\n",
361 policy->cpu);
362 result = -EAGAIN;
363 goto out;
364 }
365 }
366
367 for_each_cpu(i, policy->cpus) {
368 freqs.cpu = i;
369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
370 }
371 perf->state = next_perf_state;
372
373out:
374 return result;
375}
376
377static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
378{
379 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
380
381 dprintk("acpi_cpufreq_verify\n");
382
383 return cpufreq_frequency_table_verify(policy, data->freq_table);
384}
385
386static unsigned long
387acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
388{
389 struct acpi_processor_performance *perf = data->acpi_data;
390
391 if (cpu_khz) {
392 /* search the closest match to cpu_khz */
393 unsigned int i;
394 unsigned long freq;
395 unsigned long freqn = perf->states[0].core_frequency * 1000;
396
397 for (i = 0; i < (perf->state_count-1); i++) {
398 freq = freqn;
399 freqn = perf->states[i+1].core_frequency * 1000;
400 if ((2 * cpu_khz) > (freqn + freq)) {
401 perf->state = i;
402 return freq;
403 }
404 }
405 perf->state = perf->state_count-1;
406 return freqn;
407 } else {
408 /* assume CPU is at P0... */
409 perf->state = 0;
410 return perf->states[0].core_frequency * 1000;
411 }
412}
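/*
 * Editor's note: a standalone userspace sketch (not driver code) of the
 * midpoint test used in acpi_cpufreq_guess_freq() above, with hypothetical
 * numbers. With P-states at 2000000, 1500000 and 1000000 kHz and
 * cpu_khz = 1700000, the first comparison (3400000 > 3500000) fails and the
 * second (3400000 > 2500000) succeeds, so the guess settles on 1500000 kHz,
 * the entry closest to the measured TSC frequency.
 */
#include <stdio.h>

/* same "closest to cpu_khz" walk, on a plain descending array */
static unsigned long demo_guess(const unsigned long *khz_tbl, int n,
				unsigned long cpu_khz)
{
	int i;

	for (i = 0; i < n - 1; i++)
		if (2 * cpu_khz > khz_tbl[i] + khz_tbl[i + 1])
			return khz_tbl[i];	/* closer to this entry */
	return khz_tbl[n - 1];
}

int main(void)
{
	const unsigned long tbl[] = { 2000000, 1500000, 1000000 };

	printf("%lu\n", demo_guess(tbl, 3, 1700000));	/* prints 1500000 */
	return 0;
}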
413
414static void free_acpi_perf_data(void)
415{
416 unsigned int i;
417
418 /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
419 for_each_possible_cpu(i)
420 free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
421 ->shared_cpu_map);
422 free_percpu(acpi_perf_data);
423}
424
425/*
426 * acpi_cpufreq_early_init - initialize ACPI P-States library
427 *
428 * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
429 * in order to determine correct frequency and voltage pairings. We can
430 * do _PDC and _PSD and find out the processor dependency for the
431 * actual init that will happen later...
432 */
433static int __init acpi_cpufreq_early_init(void)
434{
435 unsigned int i;
436 dprintk("acpi_cpufreq_early_init\n");
437
438 acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
439 if (!acpi_perf_data) {
440 dprintk("Memory allocation error for acpi_perf_data.\n");
441 return -ENOMEM;
442 }
443 for_each_possible_cpu(i) {
444 if (!zalloc_cpumask_var_node(
445 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
446 GFP_KERNEL, cpu_to_node(i))) {
447
448 /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
449 free_acpi_perf_data();
450 return -ENOMEM;
451 }
452 }
453
454 /* Do initialization in ACPI core */
455 acpi_processor_preregister_performance(acpi_perf_data);
456 return 0;
457}
458
459#ifdef CONFIG_SMP
460/*
461 * Some BIOSes do SW_ANY coordination internally, either setting it up in
462 * hardware or doing it in BIOS firmware, without informing the OS. If not
463 * detected, this has the side effect of making the CPU run at a different
464 * speed than the OS intended. Detect it and handle it cleanly.
465 */
466static int bios_with_sw_any_bug;
467
468static int sw_any_bug_found(const struct dmi_system_id *d)
469{
470 bios_with_sw_any_bug = 1;
471 return 0;
472}
473
474static const struct dmi_system_id sw_any_bug_dmi_table[] = {
475 {
476 .callback = sw_any_bug_found,
477 .ident = "Supermicro Server X6DLP",
478 .matches = {
479 DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
480 DMI_MATCH(DMI_BIOS_VERSION, "080010"),
481 DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
482 },
483 },
484 { }
485};
486
487static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
488{
489 /* Intel Xeon Processor 7100 Series Specification Update
490 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
491 * AL30: A Machine Check Exception (MCE) Occurring during an
492 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
493 * Both Processor Cores to Lock Up. */
494 if (c->x86_vendor == X86_VENDOR_INTEL) {
495 if ((c->x86 == 15) &&
496 (c->x86_model == 6) &&
497 (c->x86_mask == 8)) {
498 printk(KERN_INFO "acpi-cpufreq: Intel(R) "
499 "Xeon(R) 7100 Errata AL30, processors may "
500 "lock up on frequency changes: disabling "
501 "acpi-cpufreq.\n");
502 return -ENODEV;
503 }
504 }
505 return 0;
506}
507#endif
508
509static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
510{
511 unsigned int i;
512 unsigned int valid_states = 0;
513 unsigned int cpu = policy->cpu;
514 struct acpi_cpufreq_data *data;
515 unsigned int result = 0;
516 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
517 struct acpi_processor_performance *perf;
518#ifdef CONFIG_SMP
519 static int blacklisted;
520#endif
521
522 dprintk("acpi_cpufreq_cpu_init\n");
523
524#ifdef CONFIG_SMP
525 if (blacklisted)
526 return blacklisted;
527 blacklisted = acpi_cpufreq_blacklist(c);
528 if (blacklisted)
529 return blacklisted;
530#endif
531
532 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
533 if (!data)
534 return -ENOMEM;
535
536 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
537 per_cpu(acfreq_data, cpu) = data;
538
539 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
540 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
541
542 result = acpi_processor_register_performance(data->acpi_data, cpu);
543 if (result)
544 goto err_free;
545
546 perf = data->acpi_data;
547 policy->shared_type = perf->shared_type;
548
549 /*
550 * Will let policy->cpus know about dependency only when software
551 * coordination is required.
552 */
553 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
554 policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
555 cpumask_copy(policy->cpus, perf->shared_cpu_map);
556 }
557 cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
558
559#ifdef CONFIG_SMP
560 dmi_check_system(sw_any_bug_dmi_table);
561 if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
562 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
563 cpumask_copy(policy->cpus, cpu_core_mask(cpu));
564 }
565#endif
566
567 /* capability check */
568 if (perf->state_count <= 1) {
569 dprintk("No P-States\n");
570 result = -ENODEV;
571 goto err_unreg;
572 }
573
574 if (perf->control_register.space_id != perf->status_register.space_id) {
575 result = -ENODEV;
576 goto err_unreg;
577 }
578
579 switch (perf->control_register.space_id) {
580 case ACPI_ADR_SPACE_SYSTEM_IO:
581 dprintk("SYSTEM IO addr space\n");
582 data->cpu_feature = SYSTEM_IO_CAPABLE;
583 break;
584 case ACPI_ADR_SPACE_FIXED_HARDWARE:
585 dprintk("HARDWARE addr space\n");
586 if (!check_est_cpu(cpu)) {
587 result = -ENODEV;
588 goto err_unreg;
589 }
590 data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
591 break;
592 default:
593 dprintk("Unknown addr space %d\n",
594 (u32) (perf->control_register.space_id));
595 result = -ENODEV;
596 goto err_unreg;
597 }
598
599 data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
600 (perf->state_count+1), GFP_KERNEL);
601 if (!data->freq_table) {
602 result = -ENOMEM;
603 goto err_unreg;
604 }
605
606 /* detect transition latency */
607 policy->cpuinfo.transition_latency = 0;
608 for (i = 0; i < perf->state_count; i++) {
609 if ((perf->states[i].transition_latency * 1000) >
610 policy->cpuinfo.transition_latency)
611 policy->cpuinfo.transition_latency =
612 perf->states[i].transition_latency * 1000;
613 }
614
615 /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
616 if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
617 policy->cpuinfo.transition_latency > 20 * 1000) {
618 policy->cpuinfo.transition_latency = 20 * 1000;
619 printk_once(KERN_INFO
620 "P-state transition latency capped at 20 uS\n");
621 }
622
623 /* table init */
624 for (i = 0; i < perf->state_count; i++) {
625 if (i > 0 && perf->states[i].core_frequency >=
626 data->freq_table[valid_states-1].frequency / 1000)
627 continue;
628
629 data->freq_table[valid_states].index = i;
630 data->freq_table[valid_states].frequency =
631 perf->states[i].core_frequency * 1000;
632 valid_states++;
633 }
634 data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
635 perf->state = 0;
636
637 result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
638 if (result)
639 goto err_freqfree;
640
641 if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
642 printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
643
644 switch (perf->control_register.space_id) {
645 case ACPI_ADR_SPACE_SYSTEM_IO:
646 /* Current speed is unknown and not detectable by IO port */
647 policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
648 break;
649 case ACPI_ADR_SPACE_FIXED_HARDWARE:
650 acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
651 policy->cur = get_cur_freq_on_cpu(cpu);
652 break;
653 default:
654 break;
655 }
656
657 /* notify BIOS that we exist */
658 acpi_processor_notify_smm(THIS_MODULE);
659
660 /* Check for APERF/MPERF support in hardware */
661 if (cpu_has(c, X86_FEATURE_APERFMPERF))
662 acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
663
664 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
665 for (i = 0; i < perf->state_count; i++)
666 dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
667 (i == perf->state ? '*' : ' '), i,
668 (u32) perf->states[i].core_frequency,
669 (u32) perf->states[i].power,
670 (u32) perf->states[i].transition_latency);
671
672 cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
673
674 /*
675 * the first call to ->target() should result in us actually
676 * writing something to the appropriate registers.
677 */
678 data->resume = 1;
679
680 return result;
681
682err_freqfree:
683 kfree(data->freq_table);
684err_unreg:
685 acpi_processor_unregister_performance(perf, cpu);
686err_free:
687 kfree(data);
688 per_cpu(acfreq_data, cpu) = NULL;
689
690 return result;
691}
692
693static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
694{
695 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
696
697 dprintk("acpi_cpufreq_cpu_exit\n");
698
699 if (data) {
700 cpufreq_frequency_table_put_attr(policy->cpu);
701 per_cpu(acfreq_data, policy->cpu) = NULL;
702 acpi_processor_unregister_performance(data->acpi_data,
703 policy->cpu);
704 kfree(data);
705 }
706
707 return 0;
708}
709
710static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
711{
712 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
713
714 dprintk("acpi_cpufreq_resume\n");
715
716 data->resume = 1;
717
718 return 0;
719}
720
721static struct freq_attr *acpi_cpufreq_attr[] = {
722 &cpufreq_freq_attr_scaling_available_freqs,
723 NULL,
724};
725
726static struct cpufreq_driver acpi_cpufreq_driver = {
727 .verify = acpi_cpufreq_verify,
728 .target = acpi_cpufreq_target,
729 .bios_limit = acpi_processor_get_bios_limit,
730 .init = acpi_cpufreq_cpu_init,
731 .exit = acpi_cpufreq_cpu_exit,
732 .resume = acpi_cpufreq_resume,
733 .name = "acpi-cpufreq",
734 .owner = THIS_MODULE,
735 .attr = acpi_cpufreq_attr,
736};
737
738static int __init acpi_cpufreq_init(void)
739{
740 int ret;
741
742 if (acpi_disabled)
743 return 0;
744
745 dprintk("acpi_cpufreq_init\n");
746
747 ret = acpi_cpufreq_early_init();
748 if (ret)
749 return ret;
750
751 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
752 if (ret)
753 free_acpi_perf_data();
754
755 return ret;
756}
757
758static void __exit acpi_cpufreq_exit(void)
759{
760 dprintk("acpi_cpufreq_exit\n");
761
762 cpufreq_unregister_driver(&acpi_cpufreq_driver);
763
764 free_percpu(acpi_perf_data);
765}
766
767module_param(acpi_pstate_strict, uint, 0644);
768MODULE_PARM_DESC(acpi_pstate_strict,
769 "value 0 or non-zero. non-zero -> strict ACPI checks are "
770 "performed during frequency changes.");
771
772late_initcall(acpi_cpufreq_init);
773module_exit(acpi_cpufreq_exit);
774
775MODULE_ALIAS("acpi");
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
deleted file mode 100644
index 733093d60436..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ /dev/null
@@ -1,446 +0,0 @@
1/*
2 * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 * Based upon reverse engineered information
6 *
7 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/moduleparam.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/pci.h>
16#include <linux/delay.h>
17
18#define NFORCE2_XTAL 25
19#define NFORCE2_BOOTFSB 0x48
20#define NFORCE2_PLLENABLE 0xa8
21#define NFORCE2_PLLREG 0xa4
22#define NFORCE2_PLLADR 0xa0
23#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
24
25#define NFORCE2_MIN_FSB 50
26#define NFORCE2_SAFE_DISTANCE 50
27
28/* Delay in ms between FSB changes */
29/* #define NFORCE2_DELAY 10 */
30
31/*
32 * nforce2_chipset:
33 * FSB is changed using the chipset
34 */
35static struct pci_dev *nforce2_dev;
36
37/* fid:
38 * multiplier * 10
39 */
40static int fid;
41
42/* min_fsb, max_fsb:
43 * minimum and maximum FSB (= FSB at boot time)
44 */
45static int min_fsb;
46static int max_fsb;
47
48MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
49MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
50MODULE_LICENSE("GPL");
51
52module_param(fid, int, 0444);
53module_param(min_fsb, int, 0444);
54
55MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
56MODULE_PARM_DESC(min_fsb,
57 "Minimum FSB to use, if not defined: current FSB - 50");
58
59#define PFX "cpufreq-nforce2: "
60#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
61 "cpufreq-nforce2", msg)
62
63/**
64 * nforce2_calc_fsb - calculate FSB
65 * @pll: PLL value
66 *
67 * Calculates FSB from PLL value
68 */
69static int nforce2_calc_fsb(int pll)
70{
71 unsigned char mul, div;
72
73 mul = (pll >> 8) & 0xff;
74 div = pll & 0xff;
75
76 if (div > 0)
77 return NFORCE2_XTAL * mul / div;
78
79 return 0;
80}
81
82/**
83 * nforce2_calc_pll - calculate PLL value
84 * @fsb: FSB
85 *
86 * Calculate PLL value for given FSB
87 */
88static int nforce2_calc_pll(unsigned int fsb)
89{
90 unsigned char xmul, xdiv;
91 unsigned char mul = 0, div = 0;
92 int tried = 0;
93
94 /* Try to calculate multiplier and divider up to 4 times */
95 while (((mul == 0) || (div == 0)) && (tried <= 3)) {
96 for (xdiv = 2; xdiv <= 0x80; xdiv++)
97 for (xmul = 1; xmul <= 0xfe; xmul++)
98 if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
99 fsb + tried) {
100 mul = xmul;
101 div = xdiv;
102 }
103 tried++;
104 }
105
106 if ((mul == 0) || (div == 0))
107 return -1;
108
109 return NFORCE2_PLL(mul, div);
110}
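/*
 * Editor's note: a standalone userspace sketch (not driver code) of the PLL
 * encoding used by nforce2_calc_fsb()/nforce2_calc_pll() above, with
 * made-up numbers: mul=16, div=3 and the 25 MHz crystal give
 * 25 * 16 / 3 = 133 MHz.
 */
#include <stdio.h>

#define DEMO_XTAL	25			/* NFORCE2_XTAL */
#define DEMO_PLL(mul, div) (0x100000 | ((mul) << 8) | (div))

int main(void)
{
	int pll = DEMO_PLL(16, 3);
	int mul = (pll >> 8) & 0xff;
	int div = pll & 0xff;

	printf("FSB = %d MHz\n", DEMO_XTAL * mul / div);	/* 133 */
	return 0;
}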
111
112/**
113 * nforce2_write_pll - write PLL value to chipset
114 * @pll: PLL value
115 *
116 * Writes new FSB PLL value to chipset
117 */
118static void nforce2_write_pll(int pll)
119{
120 int temp;
121
122 /* Set the pll addr. to 0x00 */
123 pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
124
125 /* Now write the value in all 64 registers */
126 for (temp = 0; temp <= 0x3f; temp++)
127 pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
128
129 return;
130}
131
132/**
133 * nforce2_fsb_read - Read FSB
134 *
135 * Read FSB from chipset
136 * If bootfsb != 0, return FSB at boot-time
137 */
138static unsigned int nforce2_fsb_read(int bootfsb)
139{
140 struct pci_dev *nforce2_sub5;
141 u32 fsb, temp = 0;
142
143 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
144 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
145 PCI_ANY_ID, PCI_ANY_ID, NULL);
146 if (!nforce2_sub5)
147 return 0;
148
149 pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
150 fsb /= 1000000;
151
152 /* Check if PLL register is already set */
153 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
154
155 if (bootfsb || !temp)
156 return fsb;
157
158 /* Use PLL register FSB value */
159 pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
160 fsb = nforce2_calc_fsb(temp);
161
162 return fsb;
163}
164
165/**
166 * nforce2_set_fsb - set new FSB
167 * @fsb: New FSB
168 *
169 * Sets new FSB
170 */
171static int nforce2_set_fsb(unsigned int fsb)
172{
173 u32 temp = 0;
174 unsigned int tfsb;
175 int diff;
176 int pll = 0;
177
178 if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
179 printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
180 return -EINVAL;
181 }
182
183 tfsb = nforce2_fsb_read(0);
184 if (!tfsb) {
185 printk(KERN_ERR PFX "Error while reading the FSB\n");
186 return -EINVAL;
187 }
188
189 /* First write? Then set actual value */
190 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
191 if (!temp) {
192 pll = nforce2_calc_pll(tfsb);
193
194 if (pll < 0)
195 return -EINVAL;
196
197 nforce2_write_pll(pll);
198 }
199
200 /* Enable write access */
201 temp = 0x01;
202 pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
203
204 diff = tfsb - fsb;
205
206 if (!diff)
207 return 0;
208
209 while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
210 if (diff < 0)
211 tfsb++;
212 else
213 tfsb--;
214
215 /* Calculate the PLL reg. value */
216 pll = nforce2_calc_pll(tfsb);
217 if (pll == -1)
218 return -EINVAL;
219
220 nforce2_write_pll(pll);
221#ifdef NFORCE2_DELAY
222 mdelay(NFORCE2_DELAY);
223#endif
224 }
225
226 temp = 0x40;
227 pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
228
229 return 0;
230}
231
232/**
233 * nforce2_get - get the CPU frequency
234 * @cpu: CPU number
235 *
236 * Returns the CPU frequency
237 */
238static unsigned int nforce2_get(unsigned int cpu)
239{
240 if (cpu)
241 return 0;
242 return nforce2_fsb_read(0) * fid * 100;
243}
244
245/**
246 * nforce2_target - set a new CPUFreq policy
247 * @policy: new policy
248 * @target_freq: the target frequency
249 * @relation: how that frequency relates to achieved frequency
250 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
251 *
252 * Sets a new CPUFreq policy.
253 */
254static int nforce2_target(struct cpufreq_policy *policy,
255 unsigned int target_freq, unsigned int relation)
256{
257/* unsigned long flags; */
258 struct cpufreq_freqs freqs;
259 unsigned int target_fsb;
260
261 if ((target_freq > policy->max) || (target_freq < policy->min))
262 return -EINVAL;
263
264 target_fsb = target_freq / (fid * 100);
265
266 freqs.old = nforce2_get(policy->cpu);
267 freqs.new = target_fsb * fid * 100;
268 freqs.cpu = 0; /* Only one CPU on nForce2 platforms */
269
270 if (freqs.old == freqs.new)
271 return 0;
272
273 dprintk("Old CPU frequency %d kHz, new %d kHz\n",
274 freqs.old, freqs.new);
275
276 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
277
278 /* Disable IRQs */
279 /* local_irq_save(flags); */
280
281 if (nforce2_set_fsb(target_fsb) < 0)
282 printk(KERN_ERR PFX "Changing FSB to %d failed\n",
283 target_fsb);
284 else
285 dprintk("Changed FSB successfully to %d\n",
286 target_fsb);
287
288 /* Enable IRQs */
289 /* local_irq_restore(flags); */
290
291 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
292
293 return 0;
294}
295
296/**
297 * nforce2_verify - verifies a new CPUFreq policy
298 * @policy: new policy
299 */
300static int nforce2_verify(struct cpufreq_policy *policy)
301{
302 unsigned int fsb_pol_max;
303
304 fsb_pol_max = policy->max / (fid * 100);
305
306 if (policy->min < (fsb_pol_max * fid * 100))
307 policy->max = (fsb_pol_max + 1) * fid * 100;
308
309 cpufreq_verify_within_limits(policy,
310 policy->cpuinfo.min_freq,
311 policy->cpuinfo.max_freq);
312 return 0;
313}
314
315static int nforce2_cpu_init(struct cpufreq_policy *policy)
316{
317 unsigned int fsb;
318 unsigned int rfid;
319
320 /* capability check */
321 if (policy->cpu != 0)
322 return -ENODEV;
323
324 /* Get current FSB */
325 fsb = nforce2_fsb_read(0);
326
327 if (!fsb)
328 return -EIO;
329
330 /* FIX: Get FID from CPU */
331 if (!fid) {
332 if (!cpu_khz) {
333 printk(KERN_WARNING PFX
334 "cpu_khz not set, can't calculate multiplier!\n");
335 return -ENODEV;
336 }
337
338 fid = cpu_khz / (fsb * 100);
339 rfid = fid % 5;
340
341 if (rfid) {
342 if (rfid > 2)
343 fid += 5 - rfid;
344 else
345 fid -= rfid;
346 }
347 }
348
349 printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
350 fid / 10, fid % 10);
351
352 /* Set maximum FSB to FSB at boot time */
353 max_fsb = nforce2_fsb_read(1);
354
355 if (!max_fsb)
356 return -EIO;
357
358 if (!min_fsb)
359 min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
360
361 if (min_fsb < NFORCE2_MIN_FSB)
362 min_fsb = NFORCE2_MIN_FSB;
363
364 /* cpuinfo and default policy values */
365 policy->cpuinfo.min_freq = min_fsb * fid * 100;
366 policy->cpuinfo.max_freq = max_fsb * fid * 100;
367 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
368 policy->cur = nforce2_get(policy->cpu);
369 policy->min = policy->cpuinfo.min_freq;
370 policy->max = policy->cpuinfo.max_freq;
371
372 return 0;
373}
374
375static int nforce2_cpu_exit(struct cpufreq_policy *policy)
376{
377 return 0;
378}
379
380static struct cpufreq_driver nforce2_driver = {
381 .name = "nforce2",
382 .verify = nforce2_verify,
383 .target = nforce2_target,
384 .get = nforce2_get,
385 .init = nforce2_cpu_init,
386 .exit = nforce2_cpu_exit,
387 .owner = THIS_MODULE,
388};
389
390/**
391 * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
392 *
393 * Detects nForce2 A2 and C1 stepping
394 *
395 */
396static unsigned int nforce2_detect_chipset(void)
397{
398 nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
399 PCI_DEVICE_ID_NVIDIA_NFORCE2,
400 PCI_ANY_ID, PCI_ANY_ID, NULL);
401
402 if (nforce2_dev == NULL)
403 return -ENODEV;
404
405 printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
406 nforce2_dev->revision);
407 printk(KERN_INFO PFX
408	       "FSB changing may be unstable and can lead to "
409 "crashes and data loss.\n");
410
411 return 0;
412}
413
414/**
415 * nforce2_init - initializes the nForce2 CPUFreq driver
416 *
417 * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
418 * devices, -EINVAL on problems during initialization, and zero on
419 * success.
420 */
421static int __init nforce2_init(void)
422{
423 /* TODO: do we need to detect the processor? */
424
425 /* detect chipset */
426 if (nforce2_detect_chipset()) {
427 printk(KERN_INFO PFX "No nForce2 chipset.\n");
428 return -ENODEV;
429 }
430
431 return cpufreq_register_driver(&nforce2_driver);
432}
433
434/**
435 * nforce2_exit - unregisters cpufreq module
436 *
437 * Unregisters nForce2 FSB change support.
438 */
439static void __exit nforce2_exit(void)
440{
441 cpufreq_unregister_driver(&nforce2_driver);
442}
443
444module_init(nforce2_init);
445module_exit(nforce2_exit);
446
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
deleted file mode 100644
index 35a257dd4bb7..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ /dev/null
@@ -1,367 +0,0 @@
1/*
2 * Based on documentation provided by Dave Jones. Thanks!
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/ioport.h>
14#include <linux/slab.h>
15#include <linux/timex.h>
16#include <linux/io.h>
17#include <linux/delay.h>
18
19#include <asm/msr.h>
20#include <asm/tsc.h>
21
22#define EPS_BRAND_C7M 0
23#define EPS_BRAND_C7 1
24#define EPS_BRAND_EDEN 2
25#define EPS_BRAND_C3 3
26#define EPS_BRAND_C7D 4
27
28struct eps_cpu_data {
29 u32 fsb;
30 struct cpufreq_frequency_table freq_table[];
31};
32
33static struct eps_cpu_data *eps_cpu[NR_CPUS];
34
35
36static unsigned int eps_get(unsigned int cpu)
37{
38 struct eps_cpu_data *centaur;
39 u32 lo, hi;
40
41 if (cpu)
42 return 0;
43 centaur = eps_cpu[cpu];
44 if (centaur == NULL)
45 return 0;
46
47 /* Return current frequency */
48 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
49 return centaur->fsb * ((lo >> 8) & 0xff);
50}
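/*
 * Editor's note: a standalone userspace sketch (not driver code) of the
 * IA32_PERF_STATUS decoding used throughout this file, with a hypothetical
 * raw value. For lo = 0x0810 the multiplier field is (lo >> 8) & 0xff = 8
 * and the voltage field is lo & 0xff = 16, so a (hypothetical) 100000 kHz
 * FSB yields 800000 kHz and 16 * 16 + 700 = 956 mV.
 */
#include <stdio.h>

int main(void)
{
	unsigned int lo = 0x0810;	/* hypothetical PERF_STATUS low word */
	unsigned int fsb = 100000;	/* hypothetical FSB in kHz */

	printf("freq    = %u kHz\n", fsb * ((lo >> 8) & 0xff));	/* 800000 */
	printf("voltage = %u mV\n", (lo & 0xff) * 16 + 700);	/* 956 */
	return 0;
}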
51
52static int eps_set_state(struct eps_cpu_data *centaur,
53 unsigned int cpu,
54 u32 dest_state)
55{
56 struct cpufreq_freqs freqs;
57 u32 lo, hi;
58 int err = 0;
59 int i;
60
61 freqs.old = eps_get(cpu);
62 freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
63 freqs.cpu = cpu;
64 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
65
66 /* Wait while CPU is busy */
67 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
68 i = 0;
69 while (lo & ((1 << 16) | (1 << 17))) {
70 udelay(16);
71 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
72 i++;
73 if (unlikely(i > 64)) {
74 err = -ENODEV;
75 goto postchange;
76 }
77 }
78 /* Set new multiplier and voltage */
79 wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
80 /* Wait until transition end */
81 i = 0;
82 do {
83 udelay(16);
84 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
85 i++;
86 if (unlikely(i > 64)) {
87 err = -ENODEV;
88 goto postchange;
89 }
90 } while (lo & ((1 << 16) | (1 << 17)));
91
92 /* Return current frequency */
93postchange:
94 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
95 freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
96
97#ifdef DEBUG
98 {
99 u8 current_multiplier, current_voltage;
100
101 /* Print voltage and multiplier */
102 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
103 current_voltage = lo & 0xff;
104 printk(KERN_INFO "eps: Current voltage = %dmV\n",
105 current_voltage * 16 + 700);
106 current_multiplier = (lo >> 8) & 0xff;
107 printk(KERN_INFO "eps: Current multiplier = %d\n",
108 current_multiplier);
109 }
110#endif
111 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
112 return err;
113}
114
115static int eps_target(struct cpufreq_policy *policy,
116 unsigned int target_freq,
117 unsigned int relation)
118{
119 struct eps_cpu_data *centaur;
120 unsigned int newstate = 0;
121 unsigned int cpu = policy->cpu;
122 unsigned int dest_state;
123 int ret;
124
125 if (unlikely(eps_cpu[cpu] == NULL))
126 return -ENODEV;
127 centaur = eps_cpu[cpu];
128
129 if (unlikely(cpufreq_frequency_table_target(policy,
130 &eps_cpu[cpu]->freq_table[0],
131 target_freq,
132 relation,
133 &newstate))) {
134 return -EINVAL;
135 }
136
137 /* Make frequency transition */
138 dest_state = centaur->freq_table[newstate].index & 0xffff;
139 ret = eps_set_state(centaur, cpu, dest_state);
140 if (ret)
141 printk(KERN_ERR "eps: Timeout!\n");
142 return ret;
143}
144
145static int eps_verify(struct cpufreq_policy *policy)
146{
147 return cpufreq_frequency_table_verify(policy,
148 &eps_cpu[policy->cpu]->freq_table[0]);
149}
150
151static int eps_cpu_init(struct cpufreq_policy *policy)
152{
153 unsigned int i;
154 u32 lo, hi;
155 u64 val;
156 u8 current_multiplier, current_voltage;
157 u8 max_multiplier, max_voltage;
158 u8 min_multiplier, min_voltage;
159 u8 brand = 0;
160 u32 fsb;
161 struct eps_cpu_data *centaur;
162 struct cpuinfo_x86 *c = &cpu_data(0);
163 struct cpufreq_frequency_table *f_table;
164 int k, step, voltage;
165 int ret;
166 int states;
167
168 if (policy->cpu != 0)
169 return -ENODEV;
170
171 /* Check brand */
172 printk(KERN_INFO "eps: Detected VIA ");
173
174 switch (c->x86_model) {
175 case 10:
176 rdmsr(0x1153, lo, hi);
177 brand = (((lo >> 2) ^ lo) >> 18) & 3;
178 printk(KERN_CONT "Model A ");
179 break;
180 case 13:
181 rdmsr(0x1154, lo, hi);
182 brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff;
183 printk(KERN_CONT "Model D ");
184 break;
185 }
186
187 switch (brand) {
188 case EPS_BRAND_C7M:
189 printk(KERN_CONT "C7-M\n");
190 break;
191 case EPS_BRAND_C7:
192 printk(KERN_CONT "C7\n");
193 break;
194 case EPS_BRAND_EDEN:
195 printk(KERN_CONT "Eden\n");
196 break;
197 case EPS_BRAND_C7D:
198 printk(KERN_CONT "C7-D\n");
199 break;
200 case EPS_BRAND_C3:
201 printk(KERN_CONT "C3\n");
202 return -ENODEV;
203 break;
204 }
205 /* Enable Enhanced PowerSaver */
206 rdmsrl(MSR_IA32_MISC_ENABLE, val);
207 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
208 val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
209 wrmsrl(MSR_IA32_MISC_ENABLE, val);
210 /* Can be locked at 0 */
211 rdmsrl(MSR_IA32_MISC_ENABLE, val);
212 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
213 printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
214 return -ENODEV;
215 }
216 }
217
218 /* Print voltage and multiplier */
219 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
220 current_voltage = lo & 0xff;
221 printk(KERN_INFO "eps: Current voltage = %dmV\n",
222 current_voltage * 16 + 700);
223 current_multiplier = (lo >> 8) & 0xff;
224 printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
225
226 /* Print limits */
227 max_voltage = hi & 0xff;
228 printk(KERN_INFO "eps: Highest voltage = %dmV\n",
229 max_voltage * 16 + 700);
230 max_multiplier = (hi >> 8) & 0xff;
231 printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
232 min_voltage = (hi >> 16) & 0xff;
233 printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
234 min_voltage * 16 + 700);
235 min_multiplier = (hi >> 24) & 0xff;
236 printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
237
238 /* Sanity checks */
239 if (current_multiplier == 0 || max_multiplier == 0
240 || min_multiplier == 0)
241 return -EINVAL;
242 if (current_multiplier > max_multiplier
243 || max_multiplier <= min_multiplier)
244 return -EINVAL;
245 if (current_voltage > 0x1f || max_voltage > 0x1f)
246 return -EINVAL;
247 if (max_voltage < min_voltage)
248 return -EINVAL;
249
250 /* Calc FSB speed */
251 fsb = cpu_khz / current_multiplier;
252 /* Calc number of p-states supported */
253 if (brand == EPS_BRAND_C7M)
254 states = max_multiplier - min_multiplier + 1;
255 else
256 states = 2;
257
258 /* Allocate private data and frequency table for current cpu */
259 centaur = kzalloc(sizeof(struct eps_cpu_data)
260 + (states + 1) * sizeof(struct cpufreq_frequency_table),
261 GFP_KERNEL);
262 if (!centaur)
263 return -ENOMEM;
264 eps_cpu[0] = centaur;
265
266 /* Copy basic values */
267 centaur->fsb = fsb;
268
269 /* Fill frequency and MSR value table */
270 f_table = &centaur->freq_table[0];
271 if (brand != EPS_BRAND_C7M) {
272 f_table[0].frequency = fsb * min_multiplier;
273 f_table[0].index = (min_multiplier << 8) | min_voltage;
274 f_table[1].frequency = fsb * max_multiplier;
275 f_table[1].index = (max_multiplier << 8) | max_voltage;
276 f_table[2].frequency = CPUFREQ_TABLE_END;
277 } else {
278 k = 0;
279 step = ((max_voltage - min_voltage) * 256)
280 / (max_multiplier - min_multiplier);
281 for (i = min_multiplier; i <= max_multiplier; i++) {
282 voltage = (k * step) / 256 + min_voltage;
283 f_table[k].frequency = fsb * i;
284 f_table[k].index = (i << 8) | voltage;
285 k++;
286 }
287 f_table[k].frequency = CPUFREQ_TABLE_END;
288 }
289
290 policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
291 policy->cur = fsb * current_multiplier;
292
293 ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
294 if (ret) {
295 kfree(centaur);
296 return ret;
297 }
298
299 cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
300 return 0;
301}
302
303static int eps_cpu_exit(struct cpufreq_policy *policy)
304{
305 unsigned int cpu = policy->cpu;
306 struct eps_cpu_data *centaur;
307 u32 lo, hi;
308
309 if (eps_cpu[cpu] == NULL)
310 return -ENODEV;
311 centaur = eps_cpu[cpu];
312
313 /* Get max frequency */
314 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
315 /* Set max frequency */
316 eps_set_state(centaur, cpu, hi & 0xffff);
317 /* Bye */
318 cpufreq_frequency_table_put_attr(policy->cpu);
319 kfree(eps_cpu[cpu]);
320 eps_cpu[cpu] = NULL;
321 return 0;
322}
323
324static struct freq_attr *eps_attr[] = {
325 &cpufreq_freq_attr_scaling_available_freqs,
326 NULL,
327};
328
329static struct cpufreq_driver eps_driver = {
330 .verify = eps_verify,
331 .target = eps_target,
332 .init = eps_cpu_init,
333 .exit = eps_cpu_exit,
334 .get = eps_get,
335 .name = "e_powersaver",
336 .owner = THIS_MODULE,
337 .attr = eps_attr,
338};
339
340static int __init eps_init(void)
341{
342 struct cpuinfo_x86 *c = &cpu_data(0);
343
344 /* This driver will work only on Centaur C7 processors with
345 * Enhanced SpeedStep/PowerSaver registers */
346 if (c->x86_vendor != X86_VENDOR_CENTAUR
347 || c->x86 != 6 || c->x86_model < 10)
348 return -ENODEV;
349 if (!cpu_has(c, X86_FEATURE_EST))
350 return -ENODEV;
351
352 if (cpufreq_register_driver(&eps_driver))
353 return -EINVAL;
354 return 0;
355}
356
357static void __exit eps_exit(void)
358{
359 cpufreq_unregister_driver(&eps_driver);
360}
361
362MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
363MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPUs.");
364MODULE_LICENSE("GPL");
365
366module_init(eps_init);
367module_exit(eps_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
deleted file mode 100644
index c587db472a75..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ /dev/null
@@ -1,309 +0,0 @@
1/*
2 * elanfreq: cpufreq driver for the AMD ELAN family
3 *
4 * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
5 *
6 * Parts of this code are (c) Sven Geggus <sven@geggus.net>
7 *
8 * All Rights Reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/module.h>
21#include <linux/init.h>
22
23#include <linux/delay.h>
24#include <linux/cpufreq.h>
25
26#include <asm/msr.h>
27#include <linux/timex.h>
28#include <linux/io.h>
29
30#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
31#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
32
33/* Module parameter */
34static int max_freq;
35
36struct s_elan_multiplier {
37 int clock; /* frequency in kHz */
38 int val40h; /* PMU Force Mode register */
39 int val80h; /* CPU Clock Speed Register */
40};
41
42/*
43 * It is important that the frequencies
44 * are listed in ascending order here!
45 */
46static struct s_elan_multiplier elan_multiplier[] = {
47 {1000, 0x02, 0x18},
48 {2000, 0x02, 0x10},
49 {4000, 0x02, 0x08},
50 {8000, 0x00, 0x00},
51 {16000, 0x00, 0x02},
52 {33000, 0x00, 0x04},
53 {66000, 0x01, 0x04},
54 {99000, 0x01, 0x05}
55};
56
57static struct cpufreq_frequency_table elanfreq_table[] = {
58 {0, 1000},
59 {1, 2000},
60 {2, 4000},
61 {3, 8000},
62 {4, 16000},
63 {5, 33000},
64 {6, 66000},
65 {7, 99000},
66 {0, CPUFREQ_TABLE_END},
67};
68
69
70/**
71 * elanfreq_get_cpu_frequency: determine current cpu speed
72 *
73 * Finds out at which frequency the CPU of the Elan SOC runs
74 * at the moment. Frequencies from 1 to 33 MHz are generated
75 * the normal way; 66 and 99 MHz are called "Hyperspeed Mode"
76 * and have the rest of the chip running at 33 MHz.
77 */
78
79static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
80{
81 u8 clockspeed_reg; /* Clock Speed Register */
82
83 local_irq_disable();
84 outb_p(0x80, REG_CSCIR);
85 clockspeed_reg = inb_p(REG_CSCDR);
86 local_irq_enable();
87
88 if ((clockspeed_reg & 0xE0) == 0xE0)
89 return 0;
90
91 /* Are we in CPU clock multiplied mode (66/99 MHz)? */
92 if ((clockspeed_reg & 0xE0) == 0xC0) {
93 if ((clockspeed_reg & 0x01) == 0)
94 return 66000;
95 else
96 return 99000;
97 }
98
99 /* 33 MHz is not 32 MHz... */
100 if ((clockspeed_reg & 0xE0) == 0xA0)
101 return 33000;
102
103 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
104}
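/*
 * Editor's note: a standalone userspace sketch (not driver code) of the
 * clock speed register decode above, with made-up register values. 0x40
 * takes the generic path: (1 << ((0x40 & 0xE0) >> 5)) * 1000 = 4000 kHz,
 * i.e. a 4 MHz core clock; 0xC1 hits the Hyperspeed branch and reports
 * 99000 kHz.
 */
#include <stdio.h>

static unsigned int demo_decode(unsigned char reg)
{
	if ((reg & 0xE0) == 0xE0)
		return 0;
	if ((reg & 0xE0) == 0xC0)		/* Hyperspeed Mode */
		return (reg & 0x01) ? 99000 : 66000;
	if ((reg & 0xE0) == 0xA0)		/* 33 MHz is not 32 MHz... */
		return 33000;
	return (1 << ((reg & 0xE0) >> 5)) * 1000;
}

int main(void)
{
	printf("%u %u\n", demo_decode(0x40), demo_decode(0xC1)); /* 4000 99000 */
	return 0;
}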
105
106
107/**
108 * elanfreq_set_cpu_state: Change the CPU core frequency
109 * @state: index into the elan_multiplier table
110 * (the target frequency in kHz is elan_multiplier[state].clock)
111 *
112 * This function takes a table index and changes the CPU frequency
113 * accordingly. Note that the requested frequency has to be checked
114 * by elanfreq_verify() for correctness!
115 *
116 * There is no return value.
117 */
118
119static void elanfreq_set_cpu_state(unsigned int state)
120{
121 struct cpufreq_freqs freqs;
122
123 freqs.old = elanfreq_get_cpu_frequency(0);
124 freqs.new = elan_multiplier[state].clock;
125 freqs.cpu = 0; /* elanfreq.c is UP only driver */
126
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128
129 printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
130 elan_multiplier[state].clock);
131
132
133 /*
134 * Access to the Elan's internal registers is indexed via
135 * 0x22: Chip Setup & Control Register Index Register (CSCI)
136 * 0x23: Chip Setup & Control Register Data Register (CSCD)
137 *
138 */
139
140 /*
141 * 0x40 is the Power Management Unit's Force Mode Register.
142 * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency)
143 */
144
145 local_irq_disable();
146 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
147 outb_p(0x00, REG_CSCDR);
148 local_irq_enable(); /* wait till internal pipelines and */
149 udelay(1000); /* buffers have cleaned up */
150
151 local_irq_disable();
152
153 /* now, set the CPU clock speed register (0x80) */
154 outb_p(0x80, REG_CSCIR);
155 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
156
157 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
158 outb_p(0x40, REG_CSCIR);
159 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
160 udelay(10000);
161 local_irq_enable();
162
163 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
164};
165
166
167/**
168 * elanfreq_verify: test if a frequency range is valid
169 * @policy: the policy to validate
170 *
171 * This function checks if a given frequency range in kHz is valid
172 * for the hardware supported by the driver.
173 */
174
175static int elanfreq_verify(struct cpufreq_policy *policy)
176{
177 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
178}
179
180static int elanfreq_target(struct cpufreq_policy *policy,
181 unsigned int target_freq,
182 unsigned int relation)
183{
184 unsigned int newstate = 0;
185
186 if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
187 target_freq, relation, &newstate))
188 return -EINVAL;
189
190 elanfreq_set_cpu_state(newstate);
191
192 return 0;
193}
194
195
196/*
197 * Module init and exit code
198 */
199
200static int elanfreq_cpu_init(struct cpufreq_policy *policy)
201{
202 struct cpuinfo_x86 *c = &cpu_data(0);
203 unsigned int i;
204 int result;
205
206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV;
210
211 /* max freq */
212 if (!max_freq)
213 max_freq = elanfreq_get_cpu_frequency(0);
214
215 /* table init */
216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 }
220
221 /* cpuinfo and default policy values */
222 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
223 policy->cur = elanfreq_get_cpu_frequency(0);
224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result)
227 return result;
228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0;
231}
232
233
234static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
235{
236 cpufreq_frequency_table_put_attr(policy->cpu);
237 return 0;
238}
239
240
241#ifndef MODULE
242/**
243 * elanfreq_setup - elanfreq command line parameter parsing
244 *
245 * elanfreq command line parameter. Use:
246 * elanfreq=66000
247 * to set the maximum CPU frequency to 66 MHz. Note that in
248 * case you do not give this boot parameter, the maximum
249 * frequency will fall back to the _current_ CPU frequency, which
250 * might be lower. If you build this as a module, use the
251 * max_freq module parameter instead.
252 */
253static int __init elanfreq_setup(char *str)
254{
255 max_freq = simple_strtoul(str, &str, 0);
256 printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
257 return 1;
258}
259__setup("elanfreq=", elanfreq_setup);
260#endif
261
262
263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL,
266};
267
268
269static struct cpufreq_driver elanfreq_driver = {
270 .get = elanfreq_get_cpu_frequency,
271 .verify = elanfreq_verify,
272 .target = elanfreq_target,
273 .init = elanfreq_cpu_init,
274 .exit = elanfreq_cpu_exit,
275 .name = "elanfreq",
276 .owner = THIS_MODULE,
277 .attr = elanfreq_attr,
278};
279
280
281static int __init elanfreq_init(void)
282{
283 struct cpuinfo_x86 *c = &cpu_data(0);
284
285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV;
290 }
291 return cpufreq_register_driver(&elanfreq_driver);
292}
293
294
295static void __exit elanfreq_exit(void)
296{
297 cpufreq_unregister_driver(&elanfreq_driver);
298}
299
300
301module_param(max_freq, int, 0444);
302
303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
305 "Sven Geggus <sven@geggus.net>");
306MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
307
308module_init(elanfreq_init);
309module_exit(elanfreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
deleted file mode 100644
index 32974cf84232..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ /dev/null
@@ -1,517 +0,0 @@
1/*
2 * Cyrix MediaGX and NatSemi Geode Suspend Modulation
3 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
4 * (C) 2002 Hiroshi Miura <miura@da-cha.org>
5 * All Rights Reserved
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation
10 *
11 * The author(s) of this software shall not be held liable for damages
12 * of any nature resulting due to the use of this software. This
13 * software is provided AS-IS with no warranties.
14 *
15 * Theoretical note:
16 *
17 * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
18 *
19 * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0
20 * are based on Suspend Modulation.
21 *
22 * Suspend Modulation works by asserting and de-asserting the SUSP# pin
23 * to the CPU (GX1/GXLV) for configurable durations. While SUSP# is
24 * asserted the CPU enters an idle state; the GX1 stops its core clock,
25 * so power consumption is reduced.
26 *
27 * Suspend Modulation's OFF/ON durations are configurable
28 * with the 'Suspend Modulation OFF Count Register'
29 * and the 'Suspend Modulation ON Count Register'.
30 * These registers are 8-bit counters that represent the number of
31 * 32us intervals for which the SUSP# pin is asserted (ON)/de-asserted
32 * (OFF) to the processor.
33 *
34 * These counters define a ratio which is the effective frequency
35 * of operation of the system.
36 *
37 * OFF Count
38 * F_eff = Fgx * ----------------------
39 * OFF Count + ON Count
40 *
41 * 0 <= On Count, Off Count <= 255
42 *
43 * From these limits, we can get register values
44 *
45 * off_duration + on_duration <= MAX_DURATION
46 * on_duration = off_duration * (stock_freq - freq) / freq
47 *
48 * off_duration = (freq * DURATION) / stock_freq
49 * on_duration = DURATION - off_duration
50 *
51 *
52 *---------------------------------------------------------------------------
53 *
54 * ChangeLog:
55 * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
56 * - fix on/off register mistake
57 * - fix cpu_khz calc when it stops cpu modulation.
58 *
59 * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
60 * - rewrite for Cyrix MediaGX Cx5510/5520 and
61 * NatSemi Geode Cs5530(A).
62 *
63 * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
64 * - cs5530_mod patch for 2.4.19-rc1.
65 *
66 *---------------------------------------------------------------------------
67 *
68 * Todo
69 * Test on machines with 5510, 5530, 5530A
70 */
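/*
 * Editor's note: a standalone userspace sketch (not driver code) putting
 * hypothetical numbers into the formulas above. Asking a 200000 kHz part
 * for 100000 kHz with DURATION = 255 gives off_duration = 127,
 * on_duration = 128 and an effective frequency of 200000 * 127 / 255 =
 * 99607 kHz; the integer rounding is why gx_validate_speed() below searches
 * all durations for the closest attainable value.
 */
#include <stdio.h>

int main(void)
{
	int stock_khz = 200000, want_khz = 100000, max_duration = 255;
	int off = want_khz * max_duration / stock_khz;	/* 127 */
	int on  = max_duration - off;			/* 128 */

	printf("off=%d on=%d F_eff=%d kHz\n",
	       off, on, stock_khz * off / (off + on));	/* 99607 kHz */
	return 0;
}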
71
72/************************************************************************
73 * Suspend Modulation - Definitions *
74 ************************************************************************/
75
76#include <linux/kernel.h>
77#include <linux/module.h>
78#include <linux/init.h>
79#include <linux/smp.h>
80#include <linux/cpufreq.h>
81#include <linux/pci.h>
82#include <linux/errno.h>
83#include <linux/slab.h>
84
85#include <asm/processor-cyrix.h>
86
87/* PCI config registers, all at F0 */
88#define PCI_PMER1 0x80 /* power management enable register 1 */
89#define PCI_PMER2 0x81 /* power management enable register 2 */
90#define PCI_PMER3 0x82 /* power management enable register 3 */
91#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
92#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
93#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
94#define PCI_MODON 0x95 /* suspend modulation ON counter register */
95#define PCI_SUSCFG 0x96 /* suspend configuration register */
96
97/* PMER1 bits */
98#define GPM (1<<0) /* global power management */
99#define GIT (1<<1) /* globally enable PM device idle timers */
100#define GTR (1<<2) /* globally enable IO traps */
101#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
102#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
103
104/* SUSCFG bits */
105#define SUSMOD (1<<0) /* enable/disable suspend modulation */
106/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */
107#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
108 /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
109#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
110/* the below is supported only with cs5530A */
111#define PWRSVE_ISA (1<<3) /* stop ISA clock */
112#define PWRSVE (1<<4) /* active idle */
113
114struct gxfreq_params {
115 u8 on_duration;
116 u8 off_duration;
117 u8 pci_suscfg;
118 u8 pci_pmer1;
119 u8 pci_pmer2;
120 struct pci_dev *cs55x0;
121};
122
123static struct gxfreq_params *gx_params;
124static int stock_freq;
125
126/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
127static int pci_busclk;
128module_param(pci_busclk, int, 0444);
129
130/* maximum duration for which the cpu may be suspended
131 * (32us * MAX_DURATION). If no parameter is given, this defaults
132 * to 255.
133 * Note that this leads to a maximum of 8 ms(!) where the CPU clock
134 * is suspended -- processing power is just 0.39% of what it used to be,
135 * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
136static int max_duration = 255;
137module_param(max_duration, int, 0444);
138
139/* For the default policy, we want at least some processing power
140 * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
141 */
142#define POLICY_MIN_DIV 20
143
144
145#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
146 "gx-suspmod", msg)
147
148/**
149 * we can detect a core multiplier from dir0_lsb
150 * from GX1 datasheet p.56,
151 * MULT[3:0]:
152 * 0000 = SYSCLK multiplied by 4 (test only)
153 * 0001 = SYSCLK multiplied by 10
154 * 0010 = SYSCLK multiplied by 4
155 * 0011 = SYSCLK multiplied by 6
156 * 0100 = SYSCLK multiplied by 9
157 * 0101 = SYSCLK multiplied by 5
158 * 0110 = SYSCLK multiplied by 7
159 * 0111 = SYSCLK multiplied by 8
160 * where SYSCLK is 33.3 MHz
161 **/
162static int gx_freq_mult[16] = {
163 4, 10, 4, 6, 9, 5, 7, 8,
164 0, 0, 0, 0, 0, 0, 0, 0
165};
166
167
168/****************************************************************
169 * Low Level chipset interface *
170 ****************************************************************/
171static struct pci_device_id gx_chipset_tbl[] __initdata = {
172 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
173 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
174 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
175 { 0, },
176};
177
178static void gx_write_byte(int reg, int value)
179{
180 pci_write_config_byte(gx_params->cs55x0, reg, value);
181}
182
183/**
184 * gx_detect_chipset:
185 * Detect which Cyrix/NatSemi companion chip (CS5510/5520/5530) is present.
186 **/
187static __init struct pci_dev *gx_detect_chipset(void)
188{
189 struct pci_dev *gx_pci = NULL;
190
191 /* check if CPU is a MediaGX or a Geode. */
192 if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
193 (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
194 dprintk("error: no MediaGX/Geode processor found!\n");
195 return NULL;
196 }
197
198 /* detect which companion chip is used */
199 for_each_pci_dev(gx_pci) {
200 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
201 return gx_pci;
202 }
203
204 dprintk("error: no supported chipset found!\n");
205 return NULL;
206}
207
208/**
209 * gx_get_cpuspeed:
210 *
211 * Finds out at which effective frequency the Cyrix MediaGX/NatSemi
212 * Geode CPU runs.
213 */
214static unsigned int gx_get_cpuspeed(unsigned int cpu)
215{
216 if ((gx_params->pci_suscfg & SUSMOD) == 0)
217 return stock_freq;
218
219 return (stock_freq * gx_params->off_duration)
220 / (gx_params->on_duration + gx_params->off_duration);
221}
222
223/**
224 * gx_validate_speed:
225 * find the closest achievable modulated speed to the requested khz and
226 * the corresponding on/off durations
227 **/
228
229static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
230 u8 *off_duration)
231{
232 unsigned int i;
233 u8 tmp_on, tmp_off;
234 int old_tmp_freq = stock_freq;
235 int tmp_freq;
236
237 *off_duration = 1;
238 *on_duration = 0;
239
240 for (i = max_duration; i > 0; i--) {
241 tmp_off = ((khz * i) / stock_freq) & 0xff;
242 tmp_on = i - tmp_off;
243 tmp_freq = (stock_freq * tmp_off) / i;
244 /* if this relation is closer to khz, use this. If it's equal,
245 * prefer it, too - lower latency */
246 if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
247 *on_duration = tmp_on;
248 *off_duration = tmp_off;
249 old_tmp_freq = tmp_freq;
250 }
251 }
252
253 return old_tmp_freq;
254}
255
256
257/**
258 * gx_set_cpuspeed:
259 * set cpu speed in khz.
260 **/
261
262static void gx_set_cpuspeed(unsigned int khz)
263{
264 u8 suscfg, pmer1;
265 unsigned int new_khz;
266 unsigned long flags;
267 struct cpufreq_freqs freqs;
268
269 freqs.cpu = 0;
270 freqs.old = gx_get_cpuspeed(0);
271
272 new_khz = gx_validate_speed(khz, &gx_params->on_duration,
273 &gx_params->off_duration);
274
275 freqs.new = new_khz;
276
277 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
278 local_irq_save(flags);
279
280
281
282 if (new_khz != stock_freq) {
283 /* new khz == 100% of CPU speed is a special case, handled in the else branch */
284 switch (gx_params->cs55x0->device) {
285 case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
286 pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
287 /* FIXME: need to test other values -- Zwane,Miura */
288 /* typical 2 to 4ms */
289 gx_write_byte(PCI_IRQTC, 4);
290 /* typical 50 to 100ms */
291 gx_write_byte(PCI_VIDTC, 100);
292 gx_write_byte(PCI_PMER1, pmer1);
293
294 if (gx_params->cs55x0->revision < 0x10) {
295 /* CS5530(rev 1.2, 1.3) */
296 suscfg = gx_params->pci_suscfg|SUSMOD;
297 } else {
298 /* CS5530A,B.. */
299 suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
300 }
301 break;
302 case PCI_DEVICE_ID_CYRIX_5520:
303 case PCI_DEVICE_ID_CYRIX_5510:
304 suscfg = gx_params->pci_suscfg | SUSMOD;
305 break;
306 default:
307 local_irq_restore(flags);
308 dprintk("fatal: try to set unknown chipset.\n");
309 return;
310 }
311 } else {
312 suscfg = gx_params->pci_suscfg & ~(SUSMOD);
313 gx_params->off_duration = 0;
314 gx_params->on_duration = 0;
315 dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
316 }
317
318 gx_write_byte(PCI_MODOFF, gx_params->off_duration);
319 gx_write_byte(PCI_MODON, gx_params->on_duration);
320
321 gx_write_byte(PCI_SUSCFG, suscfg);
322 pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
323
324 local_irq_restore(flags);
325
326 gx_params->pci_suscfg = suscfg;
327
328 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
329
330 dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
331 gx_params->on_duration * 32, gx_params->off_duration * 32);
332 dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
333}
334
335/****************************************************************
336 * High level functions *
337 ****************************************************************/
338
339/*
340 * cpufreq_gx_verify: test if frequency range is valid
341 *
342 * This function checks if a given frequency range in kHz is valid
343 * for the hardware supported by the driver.
344 */
345
346static int cpufreq_gx_verify(struct cpufreq_policy *policy)
347{
348 unsigned int tmp_freq = 0;
349 u8 tmp1, tmp2;
350
351 if (!stock_freq || !policy)
352 return -EINVAL;
353
354 policy->cpu = 0;
355 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
356 stock_freq);
357
358 /* it needs to be assured that at least one supported frequency is
359 * within policy->min and policy->max. If it is not, policy->max
360 * needs to be increased until one frequency is supported.
361 * policy->min may not be decreased, though. This way we guarantee a
362 * specific processing capacity.
363 */
364 tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
365 if (tmp_freq < policy->min)
366 tmp_freq += stock_freq / max_duration;
367 policy->min = tmp_freq;
368 if (policy->min > policy->max)
369 policy->max = tmp_freq;
370 tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
371 if (tmp_freq > policy->max)
372 tmp_freq -= stock_freq / max_duration;
373 policy->max = tmp_freq;
374 if (policy->max < policy->min)
375 policy->max = policy->min;
376 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
377 stock_freq);
378
379 return 0;
380}
381
382/*
383 * cpufreq_gx_target:
384 *
385 */
386static int cpufreq_gx_target(struct cpufreq_policy *policy,
387 unsigned int target_freq,
388 unsigned int relation)
389{
390 u8 tmp1, tmp2;
391 unsigned int tmp_freq;
392
393 if (!stock_freq || !policy)
394 return -EINVAL;
395
396 policy->cpu = 0;
397
398 tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
399 while (tmp_freq < policy->min) {
400 tmp_freq += stock_freq / max_duration;
401 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
402 }
403 while (tmp_freq > policy->max) {
404 tmp_freq -= stock_freq / max_duration;
405 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
406 }
407
408 gx_set_cpuspeed(tmp_freq);
409
410 return 0;
411}
412
413static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
414{
415 unsigned int maxfreq, curfreq;
416
417 if (!policy || policy->cpu != 0)
418 return -ENODEV;
419
420 /* determine maximum frequency */
421 if (pci_busclk)
422 maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
423 else if (cpu_khz)
424 maxfreq = cpu_khz;
425 else
426 maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
427
428 stock_freq = maxfreq;
429 curfreq = gx_get_cpuspeed(0);
430
431 dprintk("cpu max frequency is %d.\n", maxfreq);
432 dprintk("cpu current frequency is %dkHz.\n", curfreq);
433
434 /* setup basic struct for cpufreq API */
435 policy->cpu = 0;
436
437 if (max_duration < POLICY_MIN_DIV)
438 policy->min = maxfreq / max_duration;
439 else
440 policy->min = maxfreq / POLICY_MIN_DIV;
441 policy->max = maxfreq;
442 policy->cur = curfreq;
443 policy->cpuinfo.min_freq = maxfreq / max_duration;
444 policy->cpuinfo.max_freq = maxfreq;
445 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
446
447 return 0;
448}
449
450/*
451 * gx_suspmod_driver:
452 * cpufreq driver for MediaGX/Geode GX suspend modulation
453 */
454static struct cpufreq_driver gx_suspmod_driver = {
455 .get = gx_get_cpuspeed,
456 .verify = cpufreq_gx_verify,
457 .target = cpufreq_gx_target,
458 .init = cpufreq_gx_cpu_init,
459 .name = "gx-suspmod",
460 .owner = THIS_MODULE,
461};
462
463static int __init cpufreq_gx_init(void)
464{
465 int ret;
466 struct gxfreq_params *params;
467 struct pci_dev *gx_pci;
468
469 /* Test if we have the right hardware */
470 gx_pci = gx_detect_chipset();
471 if (gx_pci == NULL)
472 return -ENODEV;
473
474 /* check whether module parameters are sane */
475 if (max_duration > 0xff)
476 max_duration = 0xff;
477
478 dprintk("geode suspend modulation available.\n");
479
480 params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
481 if (params == NULL)
482 return -ENOMEM;
483
484 params->cs55x0 = gx_pci;
485 gx_params = params;
486
487 /* keep cs55x0 configurations */
488 pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
489 pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
490 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
491 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
492 pci_read_config_byte(params->cs55x0, PCI_MODOFF,
493 &(params->off_duration));
494
495 ret = cpufreq_register_driver(&gx_suspmod_driver);
496 if (ret) {
497 kfree(params);
498 return ret; /* register error! */
499 }
500
501 return 0;
502}
503
504static void __exit cpufreq_gx_exit(void)
505{
506 cpufreq_unregister_driver(&gx_suspmod_driver);
507 pci_dev_put(gx_params->cs55x0);
508 kfree(gx_params);
509}
510
511MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
512MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
513MODULE_LICENSE("GPL");
514
515module_init(cpufreq_gx_init);
516module_exit(cpufreq_gx_exit);
517
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
deleted file mode 100644
index 03162dac6271..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ /dev/null
@@ -1,1029 +0,0 @@
1/*
2 * (C) 2001-2004 Dave Jones. <davej@redhat.com>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon datasheets & sample CPUs kindly provided by VIA.
7 *
8 * VIA have currently 3 different versions of Longhaul.
9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
11 * Version 2 of longhaul is backward compatible with v1, but adds
12 * LONGHAUL MSR for purpose of both frequency and voltage scaling.
13 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
14 * Version 3 of longhaul got renamed to Powersaver and redesigned
15 * to use only the POWERSAVER MSR at 0x110a.
16 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
17 * Feature-wise it is pretty much the same as longhaul v2, though
18 * there is provision for scaling the FSB too; that doesn't work
19 * well in practice, so we don't even try to use it.
20 *
21 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
22 */
23
24#include <linux/kernel.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/init.h>
28#include <linux/cpufreq.h>
29#include <linux/pci.h>
30#include <linux/slab.h>
31#include <linux/string.h>
32#include <linux/delay.h>
33#include <linux/timex.h>
34#include <linux/io.h>
35#include <linux/acpi.h>
36
37#include <asm/msr.h>
38#include <acpi/processor.h>
39
40#include "longhaul.h"
41
42#define PFX "longhaul: "
43
44#define TYPE_LONGHAUL_V1 1
45#define TYPE_LONGHAUL_V2 2
46#define TYPE_POWERSAVER 3
47
48#define CPU_SAMUEL 1
49#define CPU_SAMUEL2 2
50#define CPU_EZRA 3
51#define CPU_EZRA_T 4
52#define CPU_NEHEMIAH 5
53#define CPU_NEHEMIAH_C 6
54
55/* Flags */
56#define USE_ACPI_C3 (1 << 1)
57#define USE_NORTHBRIDGE (1 << 2)
58
59static int cpu_model;
60static unsigned int numscales = 16;
61static unsigned int fsb;
62
63static const struct mV_pos *vrm_mV_table;
64static const unsigned char *mV_vrm_table;
65
66static unsigned int highest_speed, lowest_speed; /* kHz */
67static unsigned int minmult, maxmult;
68static int can_scale_voltage;
69static struct acpi_processor *pr;
70static struct acpi_processor_cx *cx;
71static u32 acpi_regs_addr;
72static u8 longhaul_flags;
73static unsigned int longhaul_index;
74
75/* Module parameters */
76static int scale_voltage;
77static int disable_acpi_c3;
78static int revid_errata;
79
80#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
81 "longhaul", msg)
82
83
84/* Clock ratios multiplied by 10 */
85static int mults[32];
86static int eblcr[32];
87static int longhaul_version;
88static struct cpufreq_frequency_table *longhaul_table;
89
90#ifdef CONFIG_CPU_FREQ_DEBUG
91static char speedbuffer[8];
92
93static char *print_speed(int speed)
94{
95 if (speed < 1000) {
96 snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
97 return speedbuffer;
98 }
99
100 if (speed%1000 == 0)
101 snprintf(speedbuffer, sizeof(speedbuffer),
102 "%dGHz", speed/1000);
103 else
104 snprintf(speedbuffer, sizeof(speedbuffer),
105 "%d.%dGHz", speed/1000, (speed%1000)/100);
106
107 return speedbuffer;
108}
109#endif
110
111
112static unsigned int calc_speed(int mult)
113{
114 int khz;
115 khz = (mult/10)*fsb;
116 if (mult%10)
117 khz += fsb/2;
118 khz *= 1000;
119 return khz;
120}
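For illustration, the same ratio-to-kHz arithmetic can be exercised standalone; the multiplier and FSB values below are invented examples, and the half-FSB term covers the .5x ratios:

#include <stdio.h>

/* Standalone copy of the calc_speed() arithmetic: mult is the clock ratio
 * times 10, fsb is in MHz, the result is in kHz. */
static unsigned int calc_speed_example(int mult, int fsb)
{
	int khz = (mult / 10) * fsb;	/* whole part of the multiplier */
	if (mult % 10)
		khz += fsb / 2;		/* add half an FSB for .5x ratios */
	return khz * 1000;		/* MHz -> kHz */
}

int main(void)
{
	/* 4.5x on a 133 MHz FSB and 6.0x on a 100 MHz FSB */
	printf("%u kHz\n", calc_speed_example(45, 133));	/* 598000 (integer math) */
	printf("%u kHz\n", calc_speed_example(60, 100));	/* 600000 */
	return 0;
}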
121
122
123static int longhaul_get_cpu_mult(void)
124{
125 unsigned long invalue = 0, lo, hi;
126
127 rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
128 invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
129 if (longhaul_version == TYPE_LONGHAUL_V2 ||
130 longhaul_version == TYPE_POWERSAVER) {
131 if (lo & (1<<27))
132 invalue += 16;
133 }
134 return eblcr[invalue];
135}
136
137/* For processor with BCR2 MSR */
138
139static void do_longhaul1(unsigned int mults_index)
140{
141 union msr_bcr2 bcr2;
142
143 rdmsrl(MSR_VIA_BCR2, bcr2.val);
144 /* Enable software clock multiplier */
145 bcr2.bits.ESOFTBF = 1;
146 bcr2.bits.CLOCKMUL = mults_index & 0xff;
147
148 /* Sync to timer tick */
149 safe_halt();
150 /* Change frequency on next halt or sleep */
151 wrmsrl(MSR_VIA_BCR2, bcr2.val);
152 /* Invoke transition */
153 ACPI_FLUSH_CPU_CACHE();
154 halt();
155
156 /* Disable software clock multiplier */
157 local_irq_disable();
158 rdmsrl(MSR_VIA_BCR2, bcr2.val);
159 bcr2.bits.ESOFTBF = 0;
160 wrmsrl(MSR_VIA_BCR2, bcr2.val);
161}
162
163/* For processor with Longhaul MSR */
164
165static void do_powersaver(int cx_address, unsigned int mults_index,
166 unsigned int dir)
167{
168 union msr_longhaul longhaul;
169 u32 t;
170
171 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
172 /* Setup new frequency */
173 if (!revid_errata)
174 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
175 else
176 longhaul.bits.RevisionKey = 0;
177 longhaul.bits.SoftBusRatio = mults_index & 0xf;
178 longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
179 /* Setup new voltage */
180 if (can_scale_voltage)
181 longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
182 /* Sync to timer tick */
183 safe_halt();
184 /* Raise voltage if necessary */
185 if (can_scale_voltage && dir) {
186 longhaul.bits.EnableSoftVID = 1;
187 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
188 /* Change voltage */
189 if (!cx_address) {
190 ACPI_FLUSH_CPU_CACHE();
191 halt();
192 } else {
193 ACPI_FLUSH_CPU_CACHE();
194 /* Invoke C3 */
195 inb(cx_address);
196 /* Dummy op - must do something useless after P_LVL3
197 * read */
198 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
199 }
200 longhaul.bits.EnableSoftVID = 0;
201 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
202 }
203
204 /* Change frequency on next halt or sleep */
205 longhaul.bits.EnableSoftBusRatio = 1;
206 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
207 if (!cx_address) {
208 ACPI_FLUSH_CPU_CACHE();
209 halt();
210 } else {
211 ACPI_FLUSH_CPU_CACHE();
212 /* Invoke C3 */
213 inb(cx_address);
214 /* Dummy op - must do something useless after P_LVL3 read */
215 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
216 }
217 /* Disable bus ratio bit */
218 longhaul.bits.EnableSoftBusRatio = 0;
219 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
220
221 /* Reduce voltage if necessary */
222 if (can_scale_voltage && !dir) {
223 longhaul.bits.EnableSoftVID = 1;
224 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
225 /* Change voltage */
226 if (!cx_address) {
227 ACPI_FLUSH_CPU_CACHE();
228 halt();
229 } else {
230 ACPI_FLUSH_CPU_CACHE();
231 /* Invoke C3 */
232 inb(cx_address);
233 /* Dummy op - must do something useless after P_LVL3
234 * read */
235 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
236 }
237 longhaul.bits.EnableSoftVID = 0;
238 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
239 }
240}
241
242/**
243 * longhaul_setstate()
244 * @table_index : index into longhaul_table of the new frequency/multiplier.
245 *
246 * Sets a new clock ratio.
247 */
248
249static void longhaul_setstate(unsigned int table_index)
250{
251 unsigned int mults_index;
252 int speed, mult;
253 struct cpufreq_freqs freqs;
254 unsigned long flags;
255 unsigned int pic1_mask, pic2_mask;
256 u16 bm_status = 0;
257 u32 bm_timeout = 1000;
258 unsigned int dir = 0;
259
260 mults_index = longhaul_table[table_index].index;
261 /* Safety precautions */
262 mult = mults[mults_index & 0x1f];
263 if (mult == -1)
264 return;
265 speed = calc_speed(mult);
266 if ((speed > highest_speed) || (speed < lowest_speed))
267 return;
268 /* Voltage transition before frequency transition? */
269 if (can_scale_voltage && longhaul_index < table_index)
270 dir = 1;
271
272 freqs.old = calc_speed(longhaul_get_cpu_mult());
273 freqs.new = speed;
274 freqs.cpu = 0; /* longhaul.c is UP only driver */
275
276 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
277
278 dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
279 fsb, mult/10, mult%10, print_speed(speed/1000));
280retry_loop:
281 preempt_disable();
282 local_irq_save(flags);
283
284 pic2_mask = inb(0xA1);
285 pic1_mask = inb(0x21); /* works on C3. save mask. */
286 outb(0xFF, 0xA1); /* Overkill */
287 outb(0xFE, 0x21); /* TMR0 only */
288
289 /* Wait while PCI bus is busy. */
290 if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
291 || ((pr != NULL) && pr->flags.bm_control))) {
292 bm_status = inw(acpi_regs_addr);
293 bm_status &= 1 << 4;
294 while (bm_status && bm_timeout) {
295 outw(1 << 4, acpi_regs_addr);
296 bm_timeout--;
297 bm_status = inw(acpi_regs_addr);
298 bm_status &= 1 << 4;
299 }
300 }
301
302 if (longhaul_flags & USE_NORTHBRIDGE) {
303 /* Disable AGP and PCI arbiters */
304 outb(3, 0x22);
305 } else if ((pr != NULL) && pr->flags.bm_control) {
306 /* Disable bus master arbitration */
307 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
308 }
309 switch (longhaul_version) {
310
311 /*
312 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
313 * Software controlled multipliers only.
314 */
315 case TYPE_LONGHAUL_V1:
316 do_longhaul1(mults_index);
317 break;
318
319 /*
320 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
321 *
322 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
323 * Nehemiah can do FSB scaling too, but this has never been proven
324 * to work in practice.
325 */
326 case TYPE_LONGHAUL_V2:
327 case TYPE_POWERSAVER:
328 if (longhaul_flags & USE_ACPI_C3) {
329 /* Don't allow wakeup */
330 acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
331 do_powersaver(cx->address, mults_index, dir);
332 } else {
333 do_powersaver(0, mults_index, dir);
334 }
335 break;
336 }
337
338 if (longhaul_flags & USE_NORTHBRIDGE) {
339 /* Enable arbiters */
340 outb(0, 0x22);
341 } else if ((pr != NULL) && pr->flags.bm_control) {
342 /* Enable bus master arbitration */
343 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
344 }
345 outb(pic2_mask, 0xA1); /* restore mask */
346 outb(pic1_mask, 0x21);
347
348 local_irq_restore(flags);
349 preempt_enable();
350
351 freqs.new = calc_speed(longhaul_get_cpu_mult());
352 /* Check if requested frequency is set. */
353 if (unlikely(freqs.new != speed)) {
354 printk(KERN_INFO PFX "Failed to set requested frequency!\n");
355 /* Revision ID = 1, but the processor is expecting a revision key
356 * equal to 0. Jumpers at the bottom of the processor will change the
357 * multiplier and FSB, but will not change bits in the Longhaul
358 * MSR nor enable voltage scaling. */
359 if (!revid_errata) {
360 printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
361 "option.\n");
362 revid_errata = 1;
363 msleep(200);
364 goto retry_loop;
365 }
366 /* Why ACPI C3 sometimes doesn't work is a mystery to me,
367 * but it does happen: the processor enters the ACPI C3 state,
368 * yet it doesn't change frequency. I tried poking various
369 * bits in the northbridge registers, but without success. */
370 if (longhaul_flags & USE_ACPI_C3) {
371 printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
372 longhaul_flags &= ~USE_ACPI_C3;
373 if (revid_errata) {
374 printk(KERN_INFO PFX "Disabling \"Ignore "
375 "Revision ID\" option.\n");
376 revid_errata = 0;
377 }
378 msleep(200);
379 goto retry_loop;
380 }
381 /* This shouldn't happen. Longhaul ver. 2 was reported not
382 * working on processors without voltage scaling, but with
383 * RevID = 1. The RevID errata will make things right; this is
384 * just to be 100% sure. */
385 if (longhaul_version == TYPE_LONGHAUL_V2) {
386 printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
387 longhaul_version = TYPE_LONGHAUL_V1;
388 msleep(200);
389 goto retry_loop;
390 }
391 }
392 /* Report true CPU frequency */
393 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
394
395 if (!bm_timeout)
396 printk(KERN_INFO PFX "Warning: Timeout while waiting for "
397 "idle PCI bus.\n");
398}
399
400/*
401 * Centaur decided to make life a little more tricky.
402 * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
403 * Samuel2 and above have to try and guess what the FSB is.
404 * We do this by assuming we booted at the maximum multiplier, and interpolating
405 * between that value multiplied by the possible FSBs and cpu_mhz, which
406 * was calculated at boot time. Really ugly, but there is no other way to do this.
407 */
408
409#define ROUNDING 0xf
410
411static int guess_fsb(int mult)
412{
413 int speed = cpu_khz / 1000;
414 int i;
415 int speeds[] = { 666, 1000, 1333, 2000 };
416 int f_max, f_min;
417
418 for (i = 0; i < 4; i++) {
419 f_max = ((speeds[i] * mult) + 50) / 100;
420 f_max += (ROUNDING / 2);
421 f_min = f_max - ROUNDING;
422 if ((speed <= f_max) && (speed >= f_min))
423 return speeds[i] / 10;
424 }
425 return 0;
426}
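A standalone re-run of this matching logic (the cpu_mhz and multiplier inputs are invented examples) shows how exactly one FSB candidate falls inside the ±ROUNDING window:

#include <stdio.h>

#define ROUNDING 0xf

/* Copy of the guess_fsb() matching loop; speeds[] holds the candidate
 * FSBs times 10 (66.6, 100, 133.3 and 200 MHz). */
static int guess_fsb_example(int cpu_mhz, int mult)
{
	int speeds[] = { 666, 1000, 1333, 2000 };
	int i, f_max, f_min;

	for (i = 0; i < 4; i++) {
		f_max = ((speeds[i] * mult) + 50) / 100;	/* expected MHz at this FSB */
		f_max += (ROUNDING / 2);
		f_min = f_max - ROUNDING;
		if (cpu_mhz <= f_max && cpu_mhz >= f_min)
			return speeds[i] / 10;
	}
	return 0;	/* no plausible FSB */
}

int main(void)
{
	/* an 800 MHz part booted at 6.0x should match the 133 MHz FSB */
	printf("guessed FSB: %d MHz\n", guess_fsb_example(800, 60));
	return 0;
}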
427
428
429static int __cpuinit longhaul_get_ranges(void)
430{
431 unsigned int i, j, k = 0;
432 unsigned int ratio;
433 int mult;
434
435 /* Get current frequency */
436 mult = longhaul_get_cpu_mult();
437 if (mult == -1) {
438 printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
439 return -EINVAL;
440 }
441 fsb = guess_fsb(mult);
442 if (fsb == 0) {
443 printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
444 return -EINVAL;
445 }
446 /* Get the max multiplier - as we always did.
447 * The Longhaul MSR is useful only when voltage scaling is enabled;
448 * the C3 boots at max anyway. */
449 maxmult = mult;
450 /* Get min multiplier */
451 switch (cpu_model) {
452 case CPU_NEHEMIAH:
453 minmult = 50;
454 break;
455 case CPU_NEHEMIAH_C:
456 minmult = 40;
457 break;
458 default:
459 minmult = 30;
460 break;
461 }
462
463 dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
464 minmult/10, minmult%10, maxmult/10, maxmult%10);
465
466 highest_speed = calc_speed(maxmult);
467 lowest_speed = calc_speed(minmult);
468 dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
469 print_speed(lowest_speed/1000),
470 print_speed(highest_speed/1000));
471
472 if (lowest_speed == highest_speed) {
473 printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
474 return -EINVAL;
475 }
476 if (lowest_speed > highest_speed) {
477 printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
478 lowest_speed, highest_speed);
479 return -EINVAL;
480 }
481
482 longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
483 GFP_KERNEL);
484 if (!longhaul_table)
485 return -ENOMEM;
486
487 for (j = 0; j < numscales; j++) {
488 ratio = mults[j];
489 if (ratio == -1)
490 continue;
491 if (ratio > maxmult || ratio < minmult)
492 continue;
493 longhaul_table[k].frequency = calc_speed(ratio);
494 longhaul_table[k].index = j;
495 k++;
496 }
497 if (k <= 1) {
498 kfree(longhaul_table);
499 return -ENODEV;
500 }
501 /* Sort */
502 for (j = 0; j < k - 1; j++) {
503 unsigned int min_f, min_i;
504 min_f = longhaul_table[j].frequency;
505 min_i = j;
506 for (i = j + 1; i < k; i++) {
507 if (longhaul_table[i].frequency < min_f) {
508 min_f = longhaul_table[i].frequency;
509 min_i = i;
510 }
511 }
512 if (min_i != j) {
513 swap(longhaul_table[j].frequency,
514 longhaul_table[min_i].frequency);
515 swap(longhaul_table[j].index,
516 longhaul_table[min_i].index);
517 }
518 }
519
520 longhaul_table[k].frequency = CPUFREQ_TABLE_END;
521
522 /* Find index we are running on */
523 for (j = 0; j < k; j++) {
524 if (mults[longhaul_table[j].index & 0x1f] == mult) {
525 longhaul_index = j;
526 break;
527 }
528 }
529 return 0;
530}
531
532
533static void __cpuinit longhaul_setup_voltagescaling(void)
534{
535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid;
537 unsigned int j, speed, pos, kHz_step, numvscales;
538 int min_vid_speed;
539
540 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
541 if (!(longhaul.bits.RevisionID & 1)) {
542 printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
543 return;
544 }
545
546 if (!longhaul.bits.VRMRev) {
547 printk(KERN_INFO PFX "VRM 8.5\n");
548 vrm_mV_table = &vrm85_mV[0];
549 mV_vrm_table = &mV_vrm85[0];
550 } else {
551 printk(KERN_INFO PFX "Mobile VRM\n");
552 if (cpu_model < CPU_NEHEMIAH)
553 return;
554 vrm_mV_table = &mobilevrm_mV[0];
555 mV_vrm_table = &mV_mobilevrm[0];
556 }
557
558 minvid = vrm_mV_table[longhaul.bits.MinimumVID];
559 maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
560
561 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
562 printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
563 "Voltage scaling disabled.\n",
564 minvid.mV/1000, minvid.mV%1000,
565 maxvid.mV/1000, maxvid.mV%1000);
566 return;
567 }
568
569 if (minvid.mV == maxvid.mV) {
570 printk(KERN_INFO PFX "Claims to support voltage scaling but "
571 "min & max are both %d.%03d. "
572 "Voltage scaling disabled\n",
573 maxvid.mV/1000, maxvid.mV%1000);
574 return;
575 }
576
577 /* How many voltage steps */
578 numvscales = maxvid.pos - minvid.pos + 1;
579 printk(KERN_INFO PFX
580 "Max VID=%d.%03d "
581 "Min VID=%d.%03d, "
582 "%d possible voltage scales\n",
583 maxvid.mV/1000, maxvid.mV%1000,
584 minvid.mV/1000, minvid.mV%1000,
585 numvscales);
586
587 /* Calculate max frequency at min voltage */
588 j = longhaul.bits.MinMHzBR;
589 if (longhaul.bits.MinMHzBR4)
590 j += 16;
591 min_vid_speed = eblcr[j];
592 if (min_vid_speed == -1)
593 return;
594 switch (longhaul.bits.MinMHzFSB) {
595 case 0:
596 min_vid_speed *= 13333;
597 break;
598 case 1:
599 min_vid_speed *= 10000;
600 break;
601 case 3:
602 min_vid_speed *= 6666;
603 break;
604 default:
605 return;
606 break;
607 }
608 if (min_vid_speed >= highest_speed)
609 return;
610 /* Calculate kHz for one voltage step */
611 kHz_step = (highest_speed - min_vid_speed) / numvscales;
612
613 j = 0;
614 while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
615 speed = longhaul_table[j].frequency;
616 if (speed > min_vid_speed)
617 pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
618 else
619 pos = minvid.pos;
620 longhaul_table[j].index |= mV_vrm_table[pos] << 8;
621 vid = vrm_mV_table[mV_vrm_table[pos]];
622 printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
623 speed, j, vid.mV);
624 j++;
625 }
626
627 can_scale_voltage = 1;
628 printk(KERN_INFO PFX "Voltage scaling enabled.\n");
629}
630
631
632static int longhaul_verify(struct cpufreq_policy *policy)
633{
634 return cpufreq_frequency_table_verify(policy, longhaul_table);
635}
636
637
638static int longhaul_target(struct cpufreq_policy *policy,
639 unsigned int target_freq, unsigned int relation)
640{
641 unsigned int table_index = 0;
642 unsigned int i;
643 unsigned int dir = 0;
644 u8 vid, current_vid;
645
646 if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
647 relation, &table_index))
648 return -EINVAL;
649
650 /* Don't set same frequency again */
651 if (longhaul_index == table_index)
652 return 0;
653
654 if (!can_scale_voltage)
655 longhaul_setstate(table_index);
656 else {
657 /* On the test system, voltage transitions exceeding a single
658 * step up or down were turning the motherboard off. Both
659 * "ondemand" and "userspace" are unsafe. The C7 does
660 * this in hardware; the C3 is old and we need to do it
661 * in software. */
662 i = longhaul_index;
663 current_vid = (longhaul_table[longhaul_index].index >> 8);
664 current_vid &= 0x1f;
665 if (table_index > longhaul_index)
666 dir = 1;
667 while (i != table_index) {
668 vid = (longhaul_table[i].index >> 8) & 0x1f;
669 if (vid != current_vid) {
670 longhaul_setstate(i);
671 current_vid = vid;
672 msleep(200);
673 }
674 if (dir)
675 i++;
676 else
677 i--;
678 }
679 longhaul_setstate(table_index);
680 }
681 longhaul_index = table_index;
682 return 0;
683}
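The single-VID-step walk above can be illustrated with a toy table; the entries and VID values below are made up, whereas real entries come from longhaul_get_ranges() and longhaul_setup_voltagescaling():

#include <stdio.h>

/* Toy version of the stepping loop in longhaul_target(): walk the table one
 * index at a time and "transition" only when the VID byte (bits 12:8 of the
 * index field) changes, then finish with the target index itself. */
int main(void)
{
	unsigned int table[] = { 0x0303, 0x0307, 0x0509, 0x0701 };	/* invented */
	unsigned int cur = 0, target = 3;
	unsigned int i = cur;
	int step = (target > cur) ? 1 : -1;
	unsigned int vid, current_vid = (table[cur] >> 8) & 0x1f;

	while (i != target) {
		vid = (table[i] >> 8) & 0x1f;
		if (vid != current_vid) {
			printf("intermediate transition at index %u (VID %u)\n", i, vid);
			current_vid = vid;
		}
		i += step;
	}
	printf("final transition to index %u\n", target);
	return 0;
}

This mirrors the intent described in the comment: walk toward the target one table entry at a time, only issuing a transition when the VID actually changes.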
684
685
686static unsigned int longhaul_get(unsigned int cpu)
687{
688 if (cpu)
689 return 0;
690 return calc_speed(longhaul_get_cpu_mult());
691}
692
693static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
694 u32 nesting_level,
695 void *context, void **return_value)
696{
697 struct acpi_device *d;
698
699 if (acpi_bus_get_device(obj_handle, &d))
700 return 0;
701
702 *return_value = acpi_driver_data(d);
703 return 1;
704}
705
706/* VIA doesn't support the PM2 register, but has something similar */
707static int enable_arbiter_disable(void)
708{
709 struct pci_dev *dev;
710 int status = 1;
711 int reg;
712 u8 pci_cmd;
713
714 /* Find PLE133 host bridge */
715 reg = 0x78;
716 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
717 NULL);
718 /* Find PM133/VT8605 host bridge */
719 if (dev == NULL)
720 dev = pci_get_device(PCI_VENDOR_ID_VIA,
721 PCI_DEVICE_ID_VIA_8605_0, NULL);
722 /* Find CLE266 host bridge */
723 if (dev == NULL) {
724 reg = 0x76;
725 dev = pci_get_device(PCI_VENDOR_ID_VIA,
726 PCI_DEVICE_ID_VIA_862X_0, NULL);
727 /* Find CN400 V-Link host bridge */
728 if (dev == NULL)
729 dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
730 }
731 if (dev != NULL) {
732 /* Enable access to port 0x22 */
733 pci_read_config_byte(dev, reg, &pci_cmd);
734 if (!(pci_cmd & 1<<7)) {
735 pci_cmd |= 1<<7;
736 pci_write_config_byte(dev, reg, pci_cmd);
737 pci_read_config_byte(dev, reg, &pci_cmd);
738 if (!(pci_cmd & 1<<7)) {
739 printk(KERN_ERR PFX
740 "Can't enable access to port 0x22.\n");
741 status = 0;
742 }
743 }
744 pci_dev_put(dev);
745 return status;
746 }
747 return 0;
748}
749
750static int longhaul_setup_southbridge(void)
751{
752 struct pci_dev *dev;
753 u8 pci_cmd;
754
755 /* Find VT8235 southbridge */
756 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
757 if (dev == NULL)
758 /* Find VT8237 southbridge */
759 dev = pci_get_device(PCI_VENDOR_ID_VIA,
760 PCI_DEVICE_ID_VIA_8237, NULL);
761 if (dev != NULL) {
762 /* Set transition time to max */
763 pci_read_config_byte(dev, 0xec, &pci_cmd);
764 pci_cmd &= ~(1 << 2);
765 pci_write_config_byte(dev, 0xec, pci_cmd);
766 pci_read_config_byte(dev, 0xe4, &pci_cmd);
767 pci_cmd &= ~(1 << 7);
768 pci_write_config_byte(dev, 0xe4, pci_cmd);
769 pci_read_config_byte(dev, 0xe5, &pci_cmd);
770 pci_cmd |= 1 << 7;
771 pci_write_config_byte(dev, 0xe5, pci_cmd);
772 /* Get address of ACPI registers block */
773 pci_read_config_byte(dev, 0x81, &pci_cmd);
774 if (pci_cmd & 1 << 7) {
775 pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
776 acpi_regs_addr &= 0xff00;
777 printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
778 acpi_regs_addr);
779 }
780
781 pci_dev_put(dev);
782 return 1;
783 }
784 return 0;
785}
786
787static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
788{
789 struct cpuinfo_x86 *c = &cpu_data(0);
790 char *cpuname = NULL;
791 int ret;
792 u32 lo, hi;
793
794 /* Check what we have on this motherboard */
795 switch (c->x86_model) {
796 case 6:
797 cpu_model = CPU_SAMUEL;
798 cpuname = "C3 'Samuel' [C5A]";
799 longhaul_version = TYPE_LONGHAUL_V1;
800 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
801 memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
802 break;
803
804 case 7:
805 switch (c->x86_mask) {
806 case 0:
807 longhaul_version = TYPE_LONGHAUL_V1;
808 cpu_model = CPU_SAMUEL2;
809 cpuname = "C3 'Samuel 2' [C5B]";
810 /* Note, this is not a typo, early Samuel2's had
811 * Samuel1 ratios. */
812 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
814 break;
815 case 1 ... 15:
816 longhaul_version = TYPE_LONGHAUL_V2;
817 if (c->x86_mask < 8) {
818 cpu_model = CPU_SAMUEL2;
819 cpuname = "C3 'Samuel 2' [C5B]";
820 } else {
821 cpu_model = CPU_EZRA;
822 cpuname = "C3 'Ezra' [C5C]";
823 }
824 memcpy(mults, ezra_mults, sizeof(ezra_mults));
825 memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
826 break;
827 }
828 break;
829
830 case 8:
831 cpu_model = CPU_EZRA_T;
832 cpuname = "C3 'Ezra-T' [C5M]";
833 longhaul_version = TYPE_POWERSAVER;
834 numscales = 32;
835 memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
836 memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
837 break;
838
839 case 9:
840 longhaul_version = TYPE_POWERSAVER;
841 numscales = 32;
842 memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
843 memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
844 switch (c->x86_mask) {
845 case 0 ... 1:
846 cpu_model = CPU_NEHEMIAH;
847 cpuname = "C3 'Nehemiah A' [C5XLOE]";
848 break;
849 case 2 ... 4:
850 cpu_model = CPU_NEHEMIAH;
851 cpuname = "C3 'Nehemiah B' [C5XLOH]";
852 break;
853 case 5 ... 15:
854 cpu_model = CPU_NEHEMIAH_C;
855 cpuname = "C3 'Nehemiah C' [C5P]";
856 break;
857 }
858 break;
859
860 default:
861 cpuname = "Unknown";
862 break;
863 }
864 /* Check Longhaul ver. 2 */
865 if (longhaul_version == TYPE_LONGHAUL_V2) {
866 rdmsr(MSR_VIA_LONGHAUL, lo, hi);
867 if (lo == 0 && hi == 0)
868 /* Looks like MSR isn't present */
869 longhaul_version = TYPE_LONGHAUL_V1;
870 }
871
872 printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
873 switch (longhaul_version) {
874 case TYPE_LONGHAUL_V1:
875 case TYPE_LONGHAUL_V2:
876 printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
877 break;
878 case TYPE_POWERSAVER:
879 printk(KERN_CONT "Powersaver supported.\n");
880 break;
881 }
882
883 /* Doesn't hurt */
884 longhaul_setup_southbridge();
885
886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr);
890
891 /* Check ACPI support for C3 state */
892 if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
893 cx = &pr->power.states[ACPI_STATE_C3];
894 if (cx->address > 0 && cx->latency <= 1000)
895 longhaul_flags |= USE_ACPI_C3;
896 }
897 /* Disable if it isn't working */
898 if (disable_acpi_c3)
899 longhaul_flags &= ~USE_ACPI_C3;
900 /* Check if northbridge is friendly */
901 if (enable_arbiter_disable())
902 longhaul_flags |= USE_NORTHBRIDGE;
903
904 /* Check ACPI support for bus master arbiter disable */
905 if (!(longhaul_flags & USE_ACPI_C3
906 || longhaul_flags & USE_NORTHBRIDGE)
907 && ((pr == NULL) || !(pr->flags.bm_control))) {
908 printk(KERN_ERR PFX
909 "No ACPI support. Unsupported northbridge.\n");
910 return -ENODEV;
911 }
912
913 if (longhaul_flags & USE_NORTHBRIDGE)
914 printk(KERN_INFO PFX "Using northbridge support.\n");
915 if (longhaul_flags & USE_ACPI_C3)
916 printk(KERN_INFO PFX "Using ACPI support.\n");
917
918 ret = longhaul_get_ranges();
919 if (ret != 0)
920 return ret;
921
922 if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
923 longhaul_setup_voltagescaling();
924
925 policy->cpuinfo.transition_latency = 200000; /* nsec */
926 policy->cur = calc_speed(longhaul_get_cpu_mult());
927
928 ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
929 if (ret)
930 return ret;
931
932 cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
933
934 return 0;
935}
936
937static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
938{
939 cpufreq_frequency_table_put_attr(policy->cpu);
940 return 0;
941}
942
943static struct freq_attr *longhaul_attr[] = {
944 &cpufreq_freq_attr_scaling_available_freqs,
945 NULL,
946};
947
948static struct cpufreq_driver longhaul_driver = {
949 .verify = longhaul_verify,
950 .target = longhaul_target,
951 .get = longhaul_get,
952 .init = longhaul_cpu_init,
953 .exit = __devexit_p(longhaul_cpu_exit),
954 .name = "longhaul",
955 .owner = THIS_MODULE,
956 .attr = longhaul_attr,
957};
958
959
960static int __init longhaul_init(void)
961{
962 struct cpuinfo_x86 *c = &cpu_data(0);
963
964 if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
965 return -ENODEV;
966
967#ifdef CONFIG_SMP
968 if (num_online_cpus() > 1) {
969 printk(KERN_ERR PFX "More than 1 CPU detected, "
970 "longhaul disabled.\n");
971 return -ENODEV;
972 }
973#endif
974#ifdef CONFIG_X86_IO_APIC
975 if (cpu_has_apic) {
976 printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
977 "broken in this configuration.\n");
978 return -ENODEV;
979 }
980#endif
981 switch (c->x86_model) {
982 case 6 ... 9:
983 return cpufreq_register_driver(&longhaul_driver);
984 case 10:
985 printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
986 default:
987 ;
988 }
989
990 return -ENODEV;
991}
992
993
994static void __exit longhaul_exit(void)
995{
996 int i;
997
998 for (i = 0; i < numscales; i++) {
999 if (mults[i] == maxmult) {
1000 longhaul_setstate(i);
1001 break;
1002 }
1003 }
1004
1005 cpufreq_unregister_driver(&longhaul_driver);
1006 kfree(longhaul_table);
1007}
1008
1009/* Even if the BIOS exports an ACPI C3 state, and it is used
1010 * successfully when the CPU is idle, this state doesn't
1011 * trigger the frequency transition in some cases. */
1012module_param(disable_acpi_c3, int, 0644);
1013MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
1014/* Change the CPU voltage along with the frequency. Very useful for
1015 * saving power, but most VIA C3 processors don't support it. */
1016module_param(scale_voltage, int, 0644);
1017MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1018/* Force the revision key to 0 for processors which don't
1019 * support voltage scaling but present themselves as if
1020 * they did. */
1021module_param(revid_errata, int, 0644);
1022MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1023
1024MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
1025MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
1026MODULE_LICENSE("GPL");
1027
1028late_initcall(longhaul_init);
1029module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
deleted file mode 100644
index cbf48fbca881..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ /dev/null
@@ -1,353 +0,0 @@
1/*
2 * longhaul.h
3 * (C) 2003 Dave Jones.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * VIA-specific information
8 */
9
10union msr_bcr2 {
11 struct {
12 unsigned Reserved:19, // 18:0
13 ESOFTBF:1, // 19
14 Reserved2:3, // 22:20
15 CLOCKMUL:4, // 26:23
16 Reserved3:5; // 31:27
17 } bits;
18 unsigned long val;
19};
20
21union msr_longhaul {
22 struct {
23 unsigned RevisionID:4, // 3:0
24 RevisionKey:4, // 7:4
25 EnableSoftBusRatio:1, // 8
26 EnableSoftVID:1, // 9
27 EnableSoftBSEL:1, // 10
28 Reserved:3, // 13:11
29 SoftBusRatio4:1, // 14
30 VRMRev:1, // 15
31 SoftBusRatio:4, // 19:16
32 SoftVID:5, // 24:20
33 Reserved2:3, // 27:25
34 SoftBSEL:2, // 29:28
35 Reserved3:2, // 31:30
36 MaxMHzBR:4, // 35:32
37 MaximumVID:5, // 40:36
38 MaxMHzFSB:2, // 42:41
39 MaxMHzBR4:1, // 43
40 Reserved4:4, // 47:44
41 MinMHzBR:4, // 51:48
42 MinimumVID:5, // 56:52
43 MinMHzFSB:2, // 58:57
44 MinMHzBR4:1, // 59
45 Reserved5:4; // 63:60
46 } bits;
47 unsigned long long val;
48};
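For illustration, do_powersaver() in longhaul.c splits a 5-bit table index across SoftBusRatio and SoftBusRatio4 exactly as the standalone sketch below does; it uses plain shifts and masks rather than this union (bitfield layout is compiler-dependent), and the index value is an invented example:

#include <stdio.h>

/* Show how a 5-bit multiplier index maps onto the 4-bit SoftBusRatio field
 * (MSR bits 19:16) plus the SoftBusRatio4 extension bit (MSR bit 14). */
int main(void)
{
	unsigned int mults_index = 25;	/* example index, 0b11001 */

	unsigned int soft_bus_ratio  = mults_index & 0xf;		/* -> 9 */
	unsigned int soft_bus_ratio4 = (mults_index & 0x10) >> 4;	/* -> 1 */

	printf("index %u -> SoftBusRatio=%u SoftBusRatio4=%u\n",
	       mults_index, soft_bus_ratio, soft_bus_ratio4);
	return 0;
}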
49
50/*
51 * Clock ratio tables. Div/Mod by 10 to get ratio.
52 * The eblcr values specify the ratio read from the CPU.
53 * The mults values specify what to write to the CPU.
54 */
55
56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */
59static const int __cpuinitdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */
63 -1, /* 0011 -> RESERVED */
64 -1, /* 0100 -> RESERVED */
65 35, /* 0101 -> 3.5x */
66 45, /* 0110 -> 4.5x */
67 55, /* 0111 -> 5.5x */
68 60, /* 1000 -> 6.0x */
69 70, /* 1001 -> 7.0x */
70 80, /* 1010 -> 8.0x */
71 50, /* 1011 -> 5.0x */
72 65, /* 1100 -> 6.5x */
73 75, /* 1101 -> 7.5x */
74 -1, /* 1110 -> RESERVED */
75 -1, /* 1111 -> RESERVED */
76};
77
78static const int __cpuinitdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */
82 -1, /* 0011 -> RESERVED */
83 55, /* 0100 -> 5.5x */
84 35, /* 0101 -> 3.5x */
85 45, /* 0110 -> 4.5x */
86 -1, /* 0111 -> RESERVED */
87 -1, /* 1000 -> RESERVED */
88 70, /* 1001 -> 7.0x */
89 80, /* 1010 -> 8.0x */
90 60, /* 1011 -> 6.0x */
91 -1, /* 1100 -> RESERVED */
92 75, /* 1101 -> 7.5x */
93 -1, /* 1110 -> RESERVED */
94 65, /* 1111 -> 6.5x */
95};
96
97/*
98 * VIA C3 Samuel2 Stepping 1->15
99 */
100static const int __cpuinitdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */
104 100, /* 0011 -> 10.0x */
105 55, /* 0100 -> 5.5x */
106 35, /* 0101 -> 3.5x */
107 45, /* 0110 -> 4.5x */
108 110, /* 0111 -> 11.0x */
109 90, /* 1000 -> 9.0x */
110 70, /* 1001 -> 7.0x */
111 80, /* 1010 -> 8.0x */
112 60, /* 1011 -> 6.0x */
113 120, /* 1100 -> 12.0x */
114 75, /* 1101 -> 7.5x */
115 130, /* 1110 -> 13.0x */
116 65, /* 1111 -> 6.5x */
117};
118
119/*
120 * VIA C3 Ezra
121 */
122static const int __cpuinitdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */
126 90, /* 0011 -> 9.0x */
127 95, /* 0100 -> 9.5x */
128 35, /* 0101 -> 3.5x */
129 45, /* 0110 -> 4.5x */
130 55, /* 0111 -> 5.5x */
131 60, /* 1000 -> 6.0x */
132 70, /* 1001 -> 7.0x */
133 80, /* 1010 -> 8.0x */
134 50, /* 1011 -> 5.0x */
135 65, /* 1100 -> 6.5x */
136 75, /* 1101 -> 7.5x */
137 85, /* 1110 -> 8.5x */
138 120, /* 1111 -> 12.0x */
139};
140
141static const int __cpuinitdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */
145 100, /* 0011 -> 10.0x */
146 55, /* 0100 -> 5.5x */
147 35, /* 0101 -> 3.5x */
148 45, /* 0110 -> 4.5x */
149 95, /* 0111 -> 9.5x */
150 90, /* 1000 -> 9.0x */
151 70, /* 1001 -> 7.0x */
152 80, /* 1010 -> 8.0x */
153 60, /* 1011 -> 6.0x */
154 120, /* 1100 -> 12.0x */
155 75, /* 1101 -> 7.5x */
156 85, /* 1110 -> 8.5x */
157 65, /* 1111 -> 6.5x */
158};
159
160/*
161 * VIA C3 (Ezra-T) [C5M].
162 */
163static const int __cpuinitdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */
167 90, /* 0011 -> 9.0x */
168 95, /* 0100 -> 9.5x */
169 35, /* 0101 -> 3.5x */
170 45, /* 0110 -> 4.5x */
171 55, /* 0111 -> 5.5x */
172 60, /* 1000 -> 6.0x */
173 70, /* 1001 -> 7.0x */
174 80, /* 1010 -> 8.0x */
175 50, /* 1011 -> 5.0x */
176 65, /* 1100 -> 6.5x */
177 75, /* 1101 -> 7.5x */
178 85, /* 1110 -> 8.5x */
179 120, /* 1111 -> 12.0x */
180
181 -1, /* 0000 -> RESERVED (10.0x) */
182 110, /* 0001 -> 11.0x */
183 -1, /* 0010 -> 12.0x */
184 -1, /* 0011 -> RESERVED (9.0x)*/
185 105, /* 0100 -> 10.5x */
186 115, /* 0101 -> 11.5x */
187 125, /* 0110 -> 12.5x */
188 135, /* 0111 -> 13.5x */
189 140, /* 1000 -> 14.0x */
190 150, /* 1001 -> 15.0x */
191 160, /* 1010 -> 16.0x */
192 130, /* 1011 -> 13.0x */
193 145, /* 1100 -> 14.5x */
194 155, /* 1101 -> 15.5x */
195 -1, /* 1110 -> RESERVED (13.0x) */
196 -1, /* 1111 -> RESERVED (12.0x) */
197};
198
199static const int __cpuinitdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */
203 100, /* 0011 -> 10.0x */
204 55, /* 0100 -> 5.5x */
205 35, /* 0101 -> 3.5x */
206 45, /* 0110 -> 4.5x */
207 95, /* 0111 -> 9.5x */
208 90, /* 1000 -> 9.0x */
209 70, /* 1001 -> 7.0x */
210 80, /* 1010 -> 8.0x */
211 60, /* 1011 -> 6.0x */
212 120, /* 1100 -> 12.0x */
213 75, /* 1101 -> 7.5x */
214 85, /* 1110 -> 8.5x */
215 65, /* 1111 -> 6.5x */
216
217 -1, /* 0000 -> RESERVED (9.0x) */
218 110, /* 0001 -> 11.0x */
219 120, /* 0010 -> 12.0x */
220 -1, /* 0011 -> RESERVED (10.0x)*/
221 135, /* 0100 -> 13.5x */
222 115, /* 0101 -> 11.5x */
223 125, /* 0110 -> 12.5x */
224 105, /* 0111 -> 10.5x */
225 130, /* 1000 -> 13.0x */
226 150, /* 1001 -> 15.0x */
227 160, /* 1010 -> 16.0x */
228 140, /* 1011 -> 14.0x */
229 -1, /* 1100 -> RESERVED (12.0x) */
230 155, /* 1101 -> 15.5x */
231 -1, /* 1110 -> RESERVED (13.0x) */
232 145, /* 1111 -> 14.5x */
233};
234
235/*
236 * VIA C3 Nehemiah
237 */
238static const int __cpuinitdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */
242 90, /* 0011 -> 9.0x */
243 95, /* 0100 -> 9.5x */
244 -1, /* 0101 -> RESERVED */
245 45, /* 0110 -> 4.5x */
246 55, /* 0111 -> 5.5x */
247 60, /* 1000 -> 6.0x */
248 70, /* 1001 -> 7.0x */
249 80, /* 1010 -> 8.0x */
250 50, /* 1011 -> 5.0x */
251 65, /* 1100 -> 6.5x */
252 75, /* 1101 -> 7.5x */
253 85, /* 1110 -> 8.5x */
254 120, /* 1111 -> 12.0x */
255 -1, /* 0000 -> 10.0x */
256 110, /* 0001 -> 11.0x */
257 -1, /* 0010 -> 12.0x */
258 -1, /* 0011 -> 9.0x */
259 105, /* 0100 -> 10.5x */
260 115, /* 0101 -> 11.5x */
261 125, /* 0110 -> 12.5x */
262 135, /* 0111 -> 13.5x */
263 140, /* 1000 -> 14.0x */
264 150, /* 1001 -> 15.0x */
265 160, /* 1010 -> 16.0x */
266 130, /* 1011 -> 13.0x */
267 145, /* 1100 -> 14.5x */
268 155, /* 1101 -> 15.5x */
269 -1, /* 1110 -> RESERVED (13.0x) */
270 -1, /* 1111 -> 12.0x */
271};
272
273static const int __cpuinitdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */
277 100, /* 0011 -> 10.0x */
278 55, /* 0100 -> 5.5x */
279 -1, /* 0101 -> RESERVED */
280 45, /* 0110 -> 4.5x */
281 95, /* 0111 -> 9.5x */
282 90, /* 1000 -> 9.0x */
283 70, /* 1001 -> 7.0x */
284 80, /* 1010 -> 8.0x */
285 60, /* 1011 -> 6.0x */
286 120, /* 1100 -> 12.0x */
287 75, /* 1101 -> 7.5x */
288 85, /* 1110 -> 8.5x */
289 65, /* 1111 -> 6.5x */
290 90, /* 0000 -> 9.0x */
291 110, /* 0001 -> 11.0x */
292 120, /* 0010 -> 12.0x */
293 100, /* 0011 -> 10.0x */
294 135, /* 0100 -> 13.5x */
295 115, /* 0101 -> 11.5x */
296 125, /* 0110 -> 12.5x */
297 105, /* 0111 -> 10.5x */
298 130, /* 1000 -> 13.0x */
299 150, /* 1001 -> 15.0x */
300 160, /* 1010 -> 16.0x */
301 140, /* 1011 -> 14.0x */
302 120, /* 1100 -> 12.0x */
303 155, /* 1101 -> 15.5x */
304 -1, /* 1110 -> RESERVED (13.0x) */
305 145 /* 1111 -> 14.5x */
306};
307
308/*
309 * Voltage scales. Div/Mod by 1000 to get actual voltage.
310 * Which scale to use depends on the VRM type in use.
311 */
312
313struct mV_pos {
314 unsigned short mV;
315 unsigned short pos;
316};
317
318static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
322 {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
323 {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
324 {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
325 {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327};
328
329static const unsigned char __cpuinitdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334};
335
336static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
340 {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
341 {975, 15}, {950, 14}, {925, 13}, {900, 12},
342 {875, 11}, {850, 10}, {825, 9}, {800, 8},
343 {775, 7}, {750, 6}, {725, 5}, {700, 4},
344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345};
346
347static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
351 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
352};
353
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
deleted file mode 100644
index fc09f142d94d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ /dev/null
@@ -1,327 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/timex.h>
14
15#include <asm/msr.h>
16#include <asm/processor.h>
17
18#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
19 "longrun", msg)
20
21static struct cpufreq_driver longrun_driver;
22
23/**
24 * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
25 * values into per cent values. In TMTA microcode, the following is valid:
26 * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
27 */
28static unsigned int longrun_low_freq, longrun_high_freq;
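A minimal standalone sketch of that percentage-to-kHz conversion, as longrun_get_policy() below applies it; the low/high frequencies and the MSR window percentages are invented example values:

#include <stdio.h>

/* Convert the percentage window values read from MSR_TMTA_LONGRUN_CTRL into
 * a kHz policy range, inverting the performance_pctg definition above. */
int main(void)
{
	unsigned int low_freq = 300000;		/* kHz, assumed LongRun minimum */
	unsigned int high_freq = 1000000;	/* kHz, assumed LongRun maximum */
	unsigned int msr_lo = 20, msr_hi = 80;	/* lower/upper window, percent */

	unsigned int min = low_freq + msr_lo * ((high_freq - low_freq) / 100);
	unsigned int max = low_freq + msr_hi * ((high_freq - low_freq) / 100);

	printf("policy window: %u - %u kHz\n", min, max);	/* 440000 - 860000 */
	return 0;
}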
29
30
31/**
32 * longrun_get_policy - get the current LongRun policy
33 * @policy: struct cpufreq_policy where current policy is written into
34 *
35 * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
36 * and MSR_TMTA_LONGRUN_CTRL
37 */
38static void __init longrun_get_policy(struct cpufreq_policy *policy)
39{
40 u32 msr_lo, msr_hi;
41
42 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
43 dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
44 if (msr_lo & 0x01)
45 policy->policy = CPUFREQ_POLICY_PERFORMANCE;
46 else
47 policy->policy = CPUFREQ_POLICY_POWERSAVE;
48
49 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
50 dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
51 msr_lo &= 0x0000007F;
52 msr_hi &= 0x0000007F;
53
54 if (longrun_high_freq <= longrun_low_freq) {
55 /* Assume degenerate Longrun table */
56 policy->min = policy->max = longrun_high_freq;
57 } else {
58 policy->min = longrun_low_freq + msr_lo *
59 ((longrun_high_freq - longrun_low_freq) / 100);
60 policy->max = longrun_low_freq + msr_hi *
61 ((longrun_high_freq - longrun_low_freq) / 100);
62 }
63 policy->cpu = 0;
64}
65
66
67/**
68 * longrun_set_policy - sets a new CPUFreq policy
69 * @policy: new policy
70 *
71 * Sets a new CPUFreq policy on LongRun-capable processors. This function
72 * has to be called with cpufreq_driver locked.
73 */
74static int longrun_set_policy(struct cpufreq_policy *policy)
75{
76 u32 msr_lo, msr_hi;
77 u32 pctg_lo, pctg_hi;
78
79 if (!policy)
80 return -EINVAL;
81
82 if (longrun_high_freq <= longrun_low_freq) {
83 /* Assume degenerate Longrun table */
84 pctg_lo = pctg_hi = 100;
85 } else {
86 pctg_lo = (policy->min - longrun_low_freq) /
87 ((longrun_high_freq - longrun_low_freq) / 100);
88 pctg_hi = (policy->max - longrun_low_freq) /
89 ((longrun_high_freq - longrun_low_freq) / 100);
90 }
91
92 if (pctg_hi > 100)
93 pctg_hi = 100;
94 if (pctg_lo > pctg_hi)
95 pctg_lo = pctg_hi;
96
97 /* performance or economy mode */
98 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
99 msr_lo &= 0xFFFFFFFE;
100 switch (policy->policy) {
101 case CPUFREQ_POLICY_PERFORMANCE:
102 msr_lo |= 0x00000001;
103 break;
104 case CPUFREQ_POLICY_POWERSAVE:
105 break;
106 }
107 wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
108
109 /* lower and upper boundary */
110 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
111 msr_lo &= 0xFFFFFF80;
112 msr_hi &= 0xFFFFFF80;
113 msr_lo |= pctg_lo;
114 msr_hi |= pctg_hi;
115 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
116
117 return 0;
118}
119
120
121/**
122 * longrun_verify_policy - verifies a new CPUFreq policy
123 * @policy: the policy to verify
124 *
125 * Validates a new CPUFreq policy. This function has to be called with
126 * cpufreq_driver locked.
127 */
128static int longrun_verify_policy(struct cpufreq_policy *policy)
129{
130 if (!policy)
131 return -EINVAL;
132
133 policy->cpu = 0;
134 cpufreq_verify_within_limits(policy,
135 policy->cpuinfo.min_freq,
136 policy->cpuinfo.max_freq);
137
138 if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
139 (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
140 return -EINVAL;
141
142 return 0;
143}
144
145static unsigned int longrun_get(unsigned int cpu)
146{
147 u32 eax, ebx, ecx, edx;
148
149 if (cpu)
150 return 0;
151
152 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
153 dprintk("cpuid eax is %u\n", eax);
154
155 return eax * 1000;
156}
157
158/**
159 * longrun_determine_freqs - determines the lowest and highest possible core frequencies
160 * @low_freq: an int to put the lowest frequency into
161 * @high_freq: an int to put the highest frequency into
162 *
163 * Determines the lowest and highest possible core frequencies on this CPU.
164 * This is necessary to calculate the performance percentage according to
165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */
168static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq)
170{
171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi;
173 u32 eax, ebx, ecx, edx;
174 u32 try_hi;
175 struct cpuinfo_x86 *c = &cpu_data(0);
176
177 if (!low_freq || !high_freq)
178 return -EINVAL;
179
180 if (cpu_has(c, X86_FEATURE_LRTI)) {
181 /* if the LongRun Table Interface is present, the
182 * detection is a bit easier:
183 * For minimum frequency, read out the maximum
184 * level (msr_hi), write that into "currently
185 * selected level", and read out the frequency.
186 * For maximum frequency, read out level zero.
187 */
188 /* minimum */
189 rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
190 wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
191 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
192 *low_freq = msr_lo * 1000; /* to kHz */
193
194 /* maximum */
195 wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
196 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
197 *high_freq = msr_lo * 1000; /* to kHz */
198
199 dprintk("longrun table interface told %u - %u kHz\n",
200 *low_freq, *high_freq);
201
202 if (*low_freq > *high_freq)
203 *low_freq = *high_freq;
204 return 0;
205 }
206
207 /* set the upper border to the value determined during TSC init */
208 *high_freq = (cpu_khz / 1000);
209 *high_freq = *high_freq * 1000;
210 dprintk("high frequency is %u kHz\n", *high_freq);
211
212 /* get current borders */
213 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
214 save_lo = msr_lo & 0x0000007F;
215 save_hi = msr_hi & 0x0000007F;
216
217 /* if current perf_pctg is larger than 90%, we need to decrease the
218 * upper limit to make the calculation more accurate.
219 */
220 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
221 /* try decreasing in 10% steps; some processors react only
222 * at certain threshold values */
223 for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
224 /* set to 0 to try_hi perf_pctg */
225 msr_lo &= 0xFFFFFF80;
226 msr_hi &= 0xFFFFFF80;
227 msr_hi |= try_hi;
228 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
229
230 /* read out current core MHz and current perf_pctg */
231 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
232
233 /* restore values */
234 wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
235 }
236 dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
237
238 /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
239 * equals
240 * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
241 *
242 * high_freq * perf_pctg is stored temporarily in "ebx".
243 */
244 ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
245
246 if ((ecx > 95) || (ecx == 0) || (eax < ebx))
247 return -EIO;
248
249 edx = ((eax - ebx) * 100) / (100 - ecx);
250 *low_freq = edx * 1000; /* back to kHz */
251
252 dprintk("low frequency is %u kHz\n", *low_freq);
253
254 if (*low_freq > *high_freq)
255 *low_freq = *high_freq;
256
257 return 0;
258}
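The low-frequency recovery above is just the TMTA rule rearranged for low_freq; as a rough illustration, the same arithmetic in isolation might look like the following standalone sketch (the helper name is hypothetical, not part of the driver):

/* Rearranging perf_pctg = (cur - low)/(high - low) gives
 * low = (cur - high * perf_pctg) / (1 - perf_pctg); as in the driver,
 * "cur" is in MHz, perf_pctg in percent, and the result in kHz.
 */
static unsigned int longrun_low_freq_khz(unsigned int cur_mhz,
					 unsigned int pctg,
					 unsigned int high_khz)
{
	unsigned int high_part = ((high_khz / 1000) * pctg) / 100;

	if (pctg == 0 || pctg > 95 || cur_mhz < high_part)
		return 0;	/* same sanity checks as above */

	return (((cur_mhz - high_part) * 100) / (100 - pctg)) * 1000;
}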
259
260
261static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
262{
263 int result = 0;
264
265 /* capability check */
266 if (policy->cpu != 0)
267 return -ENODEV;
268
269 /* detect low and high frequency */
270 result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
271 if (result)
272 return result;
273
274 /* cpuinfo and default policy values */
275 policy->cpuinfo.min_freq = longrun_low_freq;
276 policy->cpuinfo.max_freq = longrun_high_freq;
277 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
278 longrun_get_policy(policy);
279
280 return 0;
281}
282
283
284static struct cpufreq_driver longrun_driver = {
285 .flags = CPUFREQ_CONST_LOOPS,
286 .verify = longrun_verify_policy,
287 .setpolicy = longrun_set_policy,
288 .get = longrun_get,
289 .init = longrun_cpu_init,
290 .name = "longrun",
291 .owner = THIS_MODULE,
292};
293
294
295/**
296 * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
297 *
298 * Initializes the LongRun support.
299 */
300static int __init longrun_init(void)
301{
302 struct cpuinfo_x86 *c = &cpu_data(0);
303
304 if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
305 !cpu_has(c, X86_FEATURE_LONGRUN))
306 return -ENODEV;
307
308 return cpufreq_register_driver(&longrun_driver);
309}
310
311
312/**
313 * longrun_exit - unregisters LongRun support
314 */
315static void __exit longrun_exit(void)
316{
317 cpufreq_unregister_driver(&longrun_driver);
318}
319
320
321MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
322MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
323 "Efficeon processors.");
324MODULE_LICENSE("GPL");
325
326module_init(longrun_init);
327module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
deleted file mode 100644
index 911e193018ae..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.c
+++ /dev/null
@@ -1,51 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/smp.h>
3#include <linux/module.h>
4#include <linux/init.h>
5#include <linux/cpufreq.h>
6#include <linux/slab.h>
7
8#include "mperf.h"
9
10static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
11
12/* Called via smp_call_function_single(), on the target CPU */
13static void read_measured_perf_ctrs(void *_cur)
14{
15 struct aperfmperf *am = _cur;
16
17 get_aperfmperf(am);
18}
19
20/*
21 * Return the measured active (C0) frequency on this CPU since last call
22 * to this function.
23 * Input: cpu number
24 * Return: Average CPU frequency in terms of max frequency (zero on error)
25 *
26 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
27 * over a period of time, while CPU is in C0 state.
28 * IA32_MPERF counts at the rate of max advertised frequency
29 * IA32_APERF counts at the rate of actual CPU frequency
30 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
31 * no meaning should be associated with absolute values of these MSRs.
32 */
33unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
34 unsigned int cpu)
35{
36 struct aperfmperf perf;
37 unsigned long ratio;
38 unsigned int retval;
39
40 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
41 return 0;
42
43 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
44 per_cpu(acfreq_old_perf, cpu) = perf;
45
46 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
47
48 return retval;
49}
50EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
51MODULE_LICENSE("GPL");
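For reference, the measurement performed by the file above reduces to scaling the advertised maximum frequency by the APERF/MPERF delta ratio; a minimal standalone sketch follows (the snapshot struct and helper name are assumptions for illustration, not the kernel's aperfmperf helpers):

struct perf_snap { unsigned long long aperf, mperf; };

/* Average C0 frequency between two snapshots:
 * freq = max_freq * d(APERF) / d(MPERF).
 */
static unsigned int estimate_khz(const struct perf_snap *old,
				 const struct perf_snap *cur,
				 unsigned int max_freq_khz)
{
	unsigned long long da = cur->aperf - old->aperf;
	unsigned long long dm = cur->mperf - old->mperf;

	if (!dm)
		return 0;	/* no C0 time elapsed, nothing to report */

	return (unsigned int)((max_freq_khz * da) / dm);
}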
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
deleted file mode 100644
index 5dbf2950dc22..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.h
+++ /dev/null
@@ -1,9 +0,0 @@
1/*
2 * (c) 2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
9 unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
deleted file mode 100644
index bd1cac747f67..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ /dev/null
@@ -1,331 +0,0 @@
1/*
2 * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
5 * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
6 * (C) 2002 Tora T. Engstad
7 * All Rights Reserved
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * The author(s) of this software shall not be held liable for damages
15 * of any nature resulting due to the use of this software. This
16 * software is provided AS-IS with no warranties.
17 *
18 * Date Errata Description
19 * 20020525 N44, O17 12.5% or 25% DC causes lockup
20 *
21 */
22
23#include <linux/kernel.h>
24#include <linux/module.h>
25#include <linux/init.h>
26#include <linux/smp.h>
27#include <linux/cpufreq.h>
28#include <linux/cpumask.h>
29#include <linux/timex.h>
30
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/timer.h>
34
35#include "speedstep-lib.h"
36
37#define PFX "p4-clockmod: "
38#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
39 "p4-clockmod", msg)
40
41/*
 42 * Duty Cycle (3 bits). Note that DC_DISABLE is not specified in the
 43 * Intel docs; it is used here simply to mean "modulation disabled"
44 */
45enum {
46 DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
47 DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
48};
49
50#define DC_ENTRIES 8
51
52
53static int has_N44_O17_errata[NR_CPUS];
54static unsigned int stock_freq;
55static struct cpufreq_driver p4clockmod_driver;
56static unsigned int cpufreq_p4_get(unsigned int cpu);
57
58static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
59{
60 u32 l, h;
61
62 if (!cpu_online(cpu) ||
63 (newstate > DC_DISABLE) || (newstate == DC_RESV))
64 return -EINVAL;
65
66 rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
67
68 if (l & 0x01)
69 dprintk("CPU#%d currently thermal throttled\n", cpu);
70
71 if (has_N44_O17_errata[cpu] &&
72 (newstate == DC_25PT || newstate == DC_DFLT))
73 newstate = DC_38PT;
74
75 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
76 if (newstate == DC_DISABLE) {
77 dprintk("CPU#%d disabling modulation\n", cpu);
78 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
79 } else {
80 dprintk("CPU#%d setting duty cycle to %d%%\n",
81 cpu, ((125 * newstate) / 10));
82 /* bits 63 - 5 : reserved
83 * bit 4 : enable/disable
84 * bits 3-1 : duty cycle
85 * bit 0 : reserved
86 */
87 l = (l & ~14);
88 l = l | (1<<4) | ((newstate & 0x7)<<1);
89 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
90 }
91
92 return 0;
93}
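The MSR update in cpufreq_p4_setdc() boils down to a small bit-field encode; a rough sketch of just that step is shown below (DC_DISABLE is 8 in the enum above; the helper name is illustrative only):

/* IA32_THERM_CONTROL: bit 4 enables on-demand modulation,
 * bits 3:1 hold the duty-cycle code.
 */
static unsigned int p4_encode_dc(unsigned int old, unsigned int state)
{
	if (state == 8)				/* DC_DISABLE */
		return old & ~(1u << 4);	/* clear the enable bit only */

	old &= ~0xEu;				/* clear bits 3:1 */
	return old | (1u << 4) | ((state & 0x7) << 1);
}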
94
95
96static struct cpufreq_frequency_table p4clockmod_table[] = {
97 {DC_RESV, CPUFREQ_ENTRY_INVALID},
98 {DC_DFLT, 0},
99 {DC_25PT, 0},
100 {DC_38PT, 0},
101 {DC_50PT, 0},
102 {DC_64PT, 0},
103 {DC_75PT, 0},
104 {DC_88PT, 0},
105 {DC_DISABLE, 0},
106 {DC_RESV, CPUFREQ_TABLE_END},
107};
108
109
110static int cpufreq_p4_target(struct cpufreq_policy *policy,
111 unsigned int target_freq,
112 unsigned int relation)
113{
114 unsigned int newstate = DC_RESV;
115 struct cpufreq_freqs freqs;
116 int i;
117
118 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
119 target_freq, relation, &newstate))
120 return -EINVAL;
121
122 freqs.old = cpufreq_p4_get(policy->cpu);
123 freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
124
125 if (freqs.new == freqs.old)
126 return 0;
127
128 /* notifiers */
129 for_each_cpu(i, policy->cpus) {
130 freqs.cpu = i;
131 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
132 }
133
134 /* run on each logical CPU,
135 * see section 13.15.3 of IA32 Intel Architecture Software
136 * Developer's Manual, Volume 3
137 */
138 for_each_cpu(i, policy->cpus)
139 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
140
141 /* notifiers */
142 for_each_cpu(i, policy->cpus) {
143 freqs.cpu = i;
144 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
145 }
146
147 return 0;
148}
149
150
151static int cpufreq_p4_verify(struct cpufreq_policy *policy)
152{
153 return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
154}
155
156
157static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
158{
159 if (c->x86 == 0x06) {
160 if (cpu_has(c, X86_FEATURE_EST))
161 printk(KERN_WARNING PFX "Warning: EST-capable CPU "
162 "detected. The acpi-cpufreq module offers "
 163					"voltage scaling in addition to frequency "
164 "scaling. You should use that instead of "
165 "p4-clockmod, if possible.\n");
166 switch (c->x86_model) {
167 case 0x0E: /* Core */
168 case 0x0F: /* Core Duo */
169 case 0x16: /* Celeron Core */
170 case 0x1C: /* Atom */
171 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
172 return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
173 case 0x0D: /* Pentium M (Dothan) */
174 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
175 /* fall through */
176 case 0x09: /* Pentium M (Banias) */
177 return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
178 }
179 }
180
181 if (c->x86 != 0xF)
182 return 0;
183
 184	/* on P-4s, the TSC runs at a constant frequency regardless of
 185	 * whether throttling is active or not. */
186 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
187
188 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
189 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
190 "The speedstep-ich or acpi cpufreq modules offer "
 191			"voltage scaling in addition to frequency scaling. "
192 "You should use either one instead of p4-clockmod, "
193 "if possible.\n");
194 return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
195 }
196
197 return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
198}
199
200
201
202static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
203{
204 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
205 int cpuid = 0;
206 unsigned int i;
207
208#ifdef CONFIG_SMP
209 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
210#endif
211
212 /* Errata workaround */
213 cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
214 switch (cpuid) {
215 case 0x0f07:
216 case 0x0f0a:
217 case 0x0f11:
218 case 0x0f12:
219 has_N44_O17_errata[policy->cpu] = 1;
220 dprintk("has errata -- disabling low frequencies\n");
221 }
222
223 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
224 c->x86_model < 2) {
225 /* switch to maximum frequency and measure result */
226 cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
227 recalibrate_cpu_khz();
228 }
229 /* get max frequency */
230 stock_freq = cpufreq_p4_get_frequency(c);
231 if (!stock_freq)
232 return -EINVAL;
233
234 /* table init */
235 for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
236 if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
237 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
238 else
239 p4clockmod_table[i].frequency = (stock_freq * i)/8;
240 }
241 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
242
243 /* cpuinfo and default policy values */
244
 245	/* the transition latency is set 1 ns above the maximum transition
 246	 * latency that the ondemand governor will accept */
247 policy->cpuinfo.transition_latency = 10000001;
248 policy->cur = stock_freq;
249
250 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
251}
252
253
254static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
255{
256 cpufreq_frequency_table_put_attr(policy->cpu);
257 return 0;
258}
259
260static unsigned int cpufreq_p4_get(unsigned int cpu)
261{
262 u32 l, h;
263
264 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
265
266 if (l & 0x10) {
267 l = l >> 1;
268 l &= 0x7;
269 } else
270 l = DC_DISABLE;
271
272 if (l != DC_DISABLE)
273 return stock_freq * l / 8;
274
275 return stock_freq;
276}
277
278static struct freq_attr *p4clockmod_attr[] = {
279 &cpufreq_freq_attr_scaling_available_freqs,
280 NULL,
281};
282
283static struct cpufreq_driver p4clockmod_driver = {
284 .verify = cpufreq_p4_verify,
285 .target = cpufreq_p4_target,
286 .init = cpufreq_p4_cpu_init,
287 .exit = cpufreq_p4_cpu_exit,
288 .get = cpufreq_p4_get,
289 .name = "p4-clockmod",
290 .owner = THIS_MODULE,
291 .attr = p4clockmod_attr,
292};
293
294
295static int __init cpufreq_p4_init(void)
296{
297 struct cpuinfo_x86 *c = &cpu_data(0);
298 int ret;
299
300 /*
301 * THERM_CONTROL is architectural for IA32 now, so
302 * we can rely on the capability checks
303 */
304 if (c->x86_vendor != X86_VENDOR_INTEL)
305 return -ENODEV;
306
307 if (!test_cpu_cap(c, X86_FEATURE_ACPI) ||
308 !test_cpu_cap(c, X86_FEATURE_ACC))
309 return -ENODEV;
310
311 ret = cpufreq_register_driver(&p4clockmod_driver);
312 if (!ret)
313 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
314 "Modulation available\n");
315
316 return ret;
317}
318
319
320static void __exit cpufreq_p4_exit(void)
321{
322 cpufreq_unregister_driver(&p4clockmod_driver);
323}
324
325
326MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
327MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
328MODULE_LICENSE("GPL");
329
330late_initcall(cpufreq_p4_init);
331module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
deleted file mode 100644
index 4f6f679f2799..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ /dev/null
@@ -1,626 +0,0 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33#include <linux/slab.h>
34
35#include <linux/acpi.h>
36#include <linux/io.h>
37#include <linux/spinlock.h>
38#include <linux/uaccess.h>
39
40#include <acpi/processor.h>
41
42#define PCC_VERSION "1.00.00"
43#define POLL_LOOPS 300
44
45#define CMD_COMPLETE 0x1
46#define CMD_GET_FREQ 0x0
47#define CMD_SET_FREQ 0x1
48
49#define BUF_SZ 4
50
51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
52 "pcc-cpufreq", msg)
53
54struct pcc_register_resource {
55 u8 descriptor;
56 u16 length;
57 u8 space_id;
58 u8 bit_width;
59 u8 bit_offset;
60 u8 access_size;
61 u64 address;
62} __attribute__ ((packed));
63
64struct pcc_memory_resource {
65 u8 descriptor;
66 u16 length;
67 u8 space_id;
68 u8 resource_usage;
69 u8 type_specific;
70 u64 granularity;
71 u64 minimum;
72 u64 maximum;
73 u64 translation_offset;
74 u64 address_length;
75} __attribute__ ((packed));
76
77static struct cpufreq_driver pcc_cpufreq_driver;
78
79struct pcc_header {
80 u32 signature;
81 u16 length;
82 u8 major;
83 u8 minor;
84 u32 features;
85 u16 command;
86 u16 status;
87 u32 latency;
88 u32 minimum_time;
89 u32 maximum_time;
90 u32 nominal;
91 u32 throttled_frequency;
92 u32 minimum_frequency;
93};
94
95static void __iomem *pcch_virt_addr;
96static struct pcc_header __iomem *pcch_hdr;
97
98static DEFINE_SPINLOCK(pcc_lock);
99
100static struct acpi_generic_address doorbell;
101
102static u64 doorbell_preserve;
103static u64 doorbell_write;
104
105static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
106 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
107
108struct pcc_cpu {
109 u32 input_offset;
110 u32 output_offset;
111};
112
113static struct pcc_cpu __percpu *pcc_cpu_info;
114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{
117 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
118 policy->cpuinfo.max_freq);
119 return 0;
120}
121
122static inline void pcc_cmd(void)
123{
124 u64 doorbell_value;
125 int i;
126
127 acpi_read(&doorbell_value, &doorbell);
128 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
129 &doorbell);
130
131 for (i = 0; i < POLL_LOOPS; i++) {
132 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
133 break;
134 }
135}
136
137static inline void pcc_clear_mapping(void)
138{
139 if (pcch_virt_addr)
140 iounmap(pcch_virt_addr);
141 pcch_virt_addr = NULL;
142}
143
144static unsigned int pcc_get_freq(unsigned int cpu)
145{
146 struct pcc_cpu *pcc_cpu_data;
147 unsigned int curr_freq;
148 unsigned int freq_limit;
149 u16 status;
150 u32 input_buffer;
151 u32 output_buffer;
152
153 spin_lock(&pcc_lock);
154
155 dprintk("get: get_freq for CPU %d\n", cpu);
156 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
157
158 input_buffer = 0x1;
159 iowrite32(input_buffer,
160 (pcch_virt_addr + pcc_cpu_data->input_offset));
161 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
162
163 pcc_cmd();
164
165 output_buffer =
166 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
167
168 /* Clear the input buffer - we are done with the current command */
169 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
170
171 status = ioread16(&pcch_hdr->status);
172 if (status != CMD_COMPLETE) {
173 dprintk("get: FAILED: for CPU %d, status is %d\n",
174 cpu, status);
175 goto cmd_incomplete;
176 }
177 iowrite16(0, &pcch_hdr->status);
178 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
179 / 100) * 1000);
180
181 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
 182		"0x%x, contains a value of: 0x%x. Speed is: %d kHz\n",
183 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
184 output_buffer, curr_freq);
185
186 freq_limit = (output_buffer >> 8) & 0xff;
187 if (freq_limit != 0xff) {
188 dprintk("get: frequency for cpu %d is being temporarily"
189 " capped at %d\n", cpu, curr_freq);
190 }
191
192 spin_unlock(&pcc_lock);
193 return curr_freq;
194
195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock);
198 return -EINVAL;
199}
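The frequency decode in pcc_get_freq() is worth spelling out: the low byte of the output buffer is the current speed as a percentage of the nominal MHz value, and the result is reported in kHz. A standalone restatement (illustrative helper only):

/* e.g. nominal = 2000 MHz, low byte = 50  ->  (2000 * 50 / 100) * 1000
 * = 1000000 kHz.
 */
static unsigned int pcc_decode_freq_khz(unsigned int nominal_mhz,
					unsigned int output_buffer)
{
	return ((nominal_mhz * (output_buffer & 0xff)) / 100) * 1000;
}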
200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
202 unsigned int target_freq,
203 unsigned int relation)
204{
205 struct pcc_cpu *pcc_cpu_data;
206 struct cpufreq_freqs freqs;
207 u16 status;
208 u32 input_buffer;
209 int cpu;
210
211 spin_lock(&pcc_lock);
212 cpu = policy->cpu;
213 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
214
215 dprintk("target: CPU %d should go to target freq: %d "
216 "(virtual) input_offset is 0x%x\n",
217 cpu, target_freq,
218 (pcch_virt_addr + pcc_cpu_data->input_offset));
219
220 freqs.new = target_freq;
221 freqs.cpu = cpu;
222 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
223
224 input_buffer = 0x1 | (((target_freq * 100)
225 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
226 iowrite32(input_buffer,
227 (pcch_virt_addr + pcc_cpu_data->input_offset));
228 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
229
230 pcc_cmd();
231
232 /* Clear the input buffer - we are done with the current command */
233 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
234
235 status = ioread16(&pcch_hdr->status);
236 if (status != CMD_COMPLETE) {
237 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
238 cpu, status);
239 goto cmd_incomplete;
240 }
241 iowrite16(0, &pcch_hdr->status);
242
243 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
244 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
245 spin_unlock(&pcc_lock);
246
247 return 0;
248
249cmd_incomplete:
250 iowrite16(0, &pcch_hdr->status);
251 spin_unlock(&pcc_lock);
252 return -EINVAL;
253}
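pcc_cpufreq_target() uses the inverse encoding: the requested kHz value is converted back to a percentage of nominal and placed in bits 15:8 of the input buffer, with bit 0 set as the command flag. A standalone restatement (illustrative helper only):

static unsigned int pcc_encode_target(unsigned int target_khz,
				      unsigned int nominal_mhz)
{
	return 0x1 | (((target_khz * 100) / (nominal_mhz * 1000)) << 8);
}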
254
255static int pcc_get_offset(int cpu)
256{
257 acpi_status status;
258 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
259 union acpi_object *pccp, *offset;
260 struct pcc_cpu *pcc_cpu_data;
261 struct acpi_processor *pr;
262 int ret = 0;
263
264 pr = per_cpu(processors, cpu);
265 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
266
267 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
268 if (ACPI_FAILURE(status))
269 return -ENODEV;
270
271 pccp = buffer.pointer;
272 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
273 ret = -ENODEV;
274 goto out_free;
275 };
276
277 offset = &(pccp->package.elements[0]);
278 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
279 ret = -ENODEV;
280 goto out_free;
281 }
282
283 pcc_cpu_data->input_offset = offset->integer.value;
284
285 offset = &(pccp->package.elements[1]);
286 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
287 ret = -ENODEV;
288 goto out_free;
289 }
290
291 pcc_cpu_data->output_offset = offset->integer.value;
292
293 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
294 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
295
296 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
297 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
298 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
299out_free:
300 kfree(buffer.pointer);
301 return ret;
302}
303
304static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
305{
306 acpi_status status;
307 struct acpi_object_list input;
308 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
309 union acpi_object in_params[4];
310 union acpi_object *out_obj;
311 u32 capabilities[2];
312 u32 errors;
313 u32 supported;
314 int ret = 0;
315
316 input.count = 4;
317 input.pointer = in_params;
318 input.count = 4;
319 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID;
323 in_params[1].type = ACPI_TYPE_INTEGER;
324 in_params[1].integer.value = 1;
325 in_params[2].type = ACPI_TYPE_INTEGER;
326 in_params[2].integer.value = 2;
327 in_params[3].type = ACPI_TYPE_BUFFER;
328 in_params[3].buffer.length = 8;
329 in_params[3].buffer.pointer = (u8 *)&capabilities;
330
331 capabilities[0] = OSC_QUERY_ENABLE;
332 capabilities[1] = 0x1;
333
334 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
335 if (ACPI_FAILURE(status))
336 return -ENODEV;
337
338 if (!output.length)
339 return -ENODEV;
340
341 out_obj = output.pointer;
342 if (out_obj->type != ACPI_TYPE_BUFFER) {
343 ret = -ENODEV;
344 goto out_free;
345 }
346
347 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
348 if (errors) {
349 ret = -ENODEV;
350 goto out_free;
351 }
352
353 supported = *((u32 *)(out_obj->buffer.pointer + 4));
354 if (!(supported & 0x1)) {
355 ret = -ENODEV;
356 goto out_free;
357 }
358
359 kfree(output.pointer);
360 capabilities[0] = 0x0;
361 capabilities[1] = 0x1;
362
363 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
364 if (ACPI_FAILURE(status))
365 return -ENODEV;
366
367 if (!output.length)
368 return -ENODEV;
369
370 out_obj = output.pointer;
371 if (out_obj->type != ACPI_TYPE_BUFFER) {
372 ret = -ENODEV;
373 goto out_free;
374 }
375
376 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
377 if (errors) {
378 ret = -ENODEV;
379 goto out_free;
380 }
381
382 supported = *((u32 *)(out_obj->buffer.pointer + 4));
383 if (!(supported & 0x1)) {
384 ret = -ENODEV;
385 goto out_free;
386 }
387
388out_free:
389 kfree(output.pointer);
390 return ret;
391}
392
393static int __init pcc_cpufreq_probe(void)
394{
395 acpi_status status;
396 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
397 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle, pcch_handle;
401 int ret = 0;
402
403 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status))
405 return -ENODEV;
406
407 status = acpi_get_handle(handle, "PCCH", &pcch_handle);
408 if (ACPI_FAILURE(status))
409 return -ENODEV;
410
411 status = acpi_get_handle(handle, "_OSC", &osc_handle);
412 if (ACPI_SUCCESS(status)) {
413 ret = pcc_cpufreq_do_osc(&osc_handle);
414 if (ret)
415 dprintk("probe: _OSC evaluation did not succeed\n");
416 /* Firmware's use of _OSC is optional */
417 ret = 0;
418 }
419
420 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
421 if (ACPI_FAILURE(status))
422 return -ENODEV;
423
424 out_obj = output.pointer;
425 if (out_obj->type != ACPI_TYPE_PACKAGE) {
426 ret = -ENODEV;
427 goto out_free;
428 }
429
430 member = &out_obj->package.elements[0];
431 if (member->type != ACPI_TYPE_BUFFER) {
432 ret = -ENODEV;
433 goto out_free;
434 }
435
436 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
437
438 dprintk("probe: mem_resource descriptor: 0x%x,"
439 " length: %d, space_id: %d, resource_usage: %d,"
440 " type_specific: %d, granularity: 0x%llx,"
441 " minimum: 0x%llx, maximum: 0x%llx,"
442 " translation_offset: 0x%llx, address_length: 0x%llx\n",
443 mem_resource->descriptor, mem_resource->length,
444 mem_resource->space_id, mem_resource->resource_usage,
445 mem_resource->type_specific, mem_resource->granularity,
446 mem_resource->minimum, mem_resource->maximum,
447 mem_resource->translation_offset,
448 mem_resource->address_length);
449
450 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
451 ret = -ENODEV;
452 goto out_free;
453 }
454
455 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
456 mem_resource->address_length);
457 if (pcch_virt_addr == NULL) {
458 dprintk("probe: could not map shared mem region\n");
459 goto out_free;
460 }
461 pcch_hdr = pcch_virt_addr;
462
463 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
464 dprintk("probe: PCCH header is at physical address: 0x%llx,"
465 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
466 " supported features: 0x%x, command field: 0x%x,"
467 " status field: 0x%x, nominal latency: %d us\n",
468 mem_resource->minimum, ioread32(&pcch_hdr->signature),
469 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
470 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
471 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
472 ioread32(&pcch_hdr->latency));
473
474 dprintk("probe: min time between commands: %d us,"
475 " max time between commands: %d us,"
476 " nominal CPU frequency: %d MHz,"
477 " minimum CPU frequency: %d MHz,"
478 " minimum CPU frequency without throttling: %d MHz\n",
479 ioread32(&pcch_hdr->minimum_time),
480 ioread32(&pcch_hdr->maximum_time),
481 ioread32(&pcch_hdr->nominal),
482 ioread32(&pcch_hdr->throttled_frequency),
483 ioread32(&pcch_hdr->minimum_frequency));
484
485 member = &out_obj->package.elements[1];
486 if (member->type != ACPI_TYPE_BUFFER) {
487 ret = -ENODEV;
488 goto pcch_free;
489 }
490
491 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
492
493 doorbell.space_id = reg_resource->space_id;
494 doorbell.bit_width = reg_resource->bit_width;
495 doorbell.bit_offset = reg_resource->bit_offset;
496 doorbell.access_width = 64;
497 doorbell.address = reg_resource->address;
498
499 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
500 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
501 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
502 doorbell.access_width, reg_resource->address);
503
504 member = &out_obj->package.elements[2];
505 if (member->type != ACPI_TYPE_INTEGER) {
506 ret = -ENODEV;
507 goto pcch_free;
508 }
509
510 doorbell_preserve = member->integer.value;
511
512 member = &out_obj->package.elements[3];
513 if (member->type != ACPI_TYPE_INTEGER) {
514 ret = -ENODEV;
515 goto pcch_free;
516 }
517
518 doorbell_write = member->integer.value;
519
520 dprintk("probe: doorbell_preserve: 0x%llx,"
521 " doorbell_write: 0x%llx\n",
522 doorbell_preserve, doorbell_write);
523
524 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
525 if (!pcc_cpu_info) {
526 ret = -ENOMEM;
527 goto pcch_free;
528 }
529
530 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
531 " limits: %d MHz, %d MHz\n", PCC_VERSION,
532 ioread32(&pcch_hdr->minimum_frequency),
533 ioread32(&pcch_hdr->nominal));
534 kfree(output.pointer);
535 return ret;
536pcch_free:
537 pcc_clear_mapping();
538out_free:
539 kfree(output.pointer);
540 return ret;
541}
542
543static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
544{
545 unsigned int cpu = policy->cpu;
546 unsigned int result = 0;
547
548 if (!pcch_virt_addr) {
549 result = -1;
550 goto out;
551 }
552
553 result = pcc_get_offset(cpu);
554 if (result) {
555 dprintk("init: PCCP evaluation failed\n");
556 goto out;
557 }
558
559 policy->max = policy->cpuinfo.max_freq =
560 ioread32(&pcch_hdr->nominal) * 1000;
561 policy->min = policy->cpuinfo.min_freq =
562 ioread32(&pcch_hdr->minimum_frequency) * 1000;
563 policy->cur = pcc_get_freq(cpu);
564
565 if (!policy->cur) {
566 dprintk("init: Unable to get current CPU frequency\n");
567 result = -EINVAL;
568 goto out;
569 }
570
571 dprintk("init: policy->max is %d, policy->min is %d\n",
572 policy->max, policy->min);
573out:
574 return result;
575}
576
577static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
578{
579 return 0;
580}
581
582static struct cpufreq_driver pcc_cpufreq_driver = {
583 .flags = CPUFREQ_CONST_LOOPS,
584 .get = pcc_get_freq,
585 .verify = pcc_cpufreq_verify,
586 .target = pcc_cpufreq_target,
587 .init = pcc_cpufreq_cpu_init,
588 .exit = pcc_cpufreq_cpu_exit,
589 .name = "pcc-cpufreq",
590 .owner = THIS_MODULE,
591};
592
593static int __init pcc_cpufreq_init(void)
594{
595 int ret;
596
597 if (acpi_disabled)
598 return 0;
599
600 ret = pcc_cpufreq_probe();
601 if (ret) {
602 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
603 return ret;
604 }
605
606 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
607
608 return ret;
609}
610
611static void __exit pcc_cpufreq_exit(void)
612{
613 cpufreq_unregister_driver(&pcc_cpufreq_driver);
614
615 pcc_clear_mapping();
616
617 free_percpu(pcc_cpu_info);
618}
619
620MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
621MODULE_VERSION(PCC_VERSION);
622MODULE_DESCRIPTION("Processor Clocking Control interface driver");
623MODULE_LICENSE("GPL");
624
625late_initcall(pcc_cpufreq_init);
626module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
deleted file mode 100644
index b3379d6a5c57..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ /dev/null
@@ -1,261 +0,0 @@
1/*
2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä,
4 * Dominik Brodowski.
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/ioport.h>
16#include <linux/timex.h>
17#include <linux/io.h>
18
19#include <asm/msr.h>
20
21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22 as it is unused */
23
24#define PFX "powernow-k6: "
25static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier;
27
28
29/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
30static struct cpufreq_frequency_table clock_ratio[] = {
31 {45, /* 000 -> 4.5x */ 0},
32 {50, /* 001 -> 5.0x */ 0},
33 {40, /* 010 -> 4.0x */ 0},
34 {55, /* 011 -> 5.5x */ 0},
35 {20, /* 100 -> 2.0x */ 0},
36 {30, /* 101 -> 3.0x */ 0},
37 {60, /* 110 -> 6.0x */ 0},
38 {35, /* 111 -> 3.5x */ 0},
39 {0, CPUFREQ_TABLE_END}
40};
41
42
43/**
44 * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
45 *
 46 * Returns the current setting of the frequency multiplier. The core
 47 * clock speed is the Front-Side Bus frequency multiplied by this value.
48 */
49static int powernow_k6_get_cpu_multiplier(void)
50{
51 u64 invalue = 0;
52 u32 msrval;
53
54 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59
60 return clock_ratio[(invalue >> 5)&7].index;
61}
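Since clock_ratio[] stores the multiplier scaled by ten and busfreq is kept in units of 10 kHz, the product of the two is already a kHz value; a minimal restatement of that conversion (illustrative helper only):

/* e.g. busfreq = 10000 (a 100 MHz FSB in 10 kHz units) and a 4.5x
 * multiplier stored as 45  ->  450000 kHz.
 */
static unsigned int k6_freq_khz(unsigned int busfreq_10khz,
				unsigned int ratio_x10)
{
	return busfreq_10khz * ratio_x10;
}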
62
63
64/**
65 * powernow_k6_set_state - set the PowerNow! multiplier
66 * @best_i: clock_ratio[best_i] is the target multiplier
67 *
68 * Tries to change the PowerNow! multiplier
69 */
70static void powernow_k6_set_state(unsigned int best_i)
71{
72 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval;
74 struct cpufreq_freqs freqs;
75
76 if (clock_ratio[best_i].index > max_multiplier) {
77 printk(KERN_ERR PFX "invalid target frequency\n");
78 return;
79 }
80
81 freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
82 freqs.new = busfreq * clock_ratio[best_i].index;
83 freqs.cpu = 0; /* powernow-k6.c is UP only driver */
84
85 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
86
87 /* we now need to transform best_i to the BVC format, see AMD#23446 */
88
89 outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
90
91 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue;
96 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99
100 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
101
102 return;
103}
104
105
106/**
107 * powernow_k6_verify - verifies a new CPUfreq policy
108 * @policy: new policy
109 *
110 * Policy must be within lowest and highest possible CPU Frequency,
111 * and at least one possible state must be within min and max.
112 */
113static int powernow_k6_verify(struct cpufreq_policy *policy)
114{
115 return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
116}
117
118
119/**
120 * powernow_k6_setpolicy - sets a new CPUFreq policy
121 * @policy: new policy
122 * @target_freq: the target frequency
123 * @relation: how that frequency relates to achieved frequency
124 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
125 *
126 * sets a new CPUFreq policy
127 */
128static int powernow_k6_target(struct cpufreq_policy *policy,
129 unsigned int target_freq,
130 unsigned int relation)
131{
132 unsigned int newstate = 0;
133
134 if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
135 target_freq, relation, &newstate))
136 return -EINVAL;
137
138 powernow_k6_set_state(newstate);
139
140 return 0;
141}
142
143
144static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
145{
146 unsigned int i, f;
147 int result;
148
149 if (policy->cpu != 0)
150 return -ENODEV;
151
152 /* get frequencies */
153 max_multiplier = powernow_k6_get_cpu_multiplier();
154 busfreq = cpu_khz / max_multiplier;
155
156 /* table init */
157 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
158 f = clock_ratio[i].index;
159 if (f > max_multiplier)
160 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
161 else
162 clock_ratio[i].frequency = busfreq * f;
163 }
164
165 /* cpuinfo and default policy values */
166 policy->cpuinfo.transition_latency = 200000;
167 policy->cur = busfreq * max_multiplier;
168
169 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
170 if (result)
171 return result;
172
173 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
174
175 return 0;
176}
177
178
179static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
180{
181 unsigned int i;
182 for (i = 0; i < 8; i++) {
183 if (i == max_multiplier)
184 powernow_k6_set_state(i);
185 }
186 cpufreq_frequency_table_put_attr(policy->cpu);
187 return 0;
188}
189
190static unsigned int powernow_k6_get(unsigned int cpu)
191{
192 unsigned int ret;
193 ret = (busfreq * powernow_k6_get_cpu_multiplier());
194 return ret;
195}
196
197static struct freq_attr *powernow_k6_attr[] = {
198 &cpufreq_freq_attr_scaling_available_freqs,
199 NULL,
200};
201
202static struct cpufreq_driver powernow_k6_driver = {
203 .verify = powernow_k6_verify,
204 .target = powernow_k6_target,
205 .init = powernow_k6_cpu_init,
206 .exit = powernow_k6_cpu_exit,
207 .get = powernow_k6_get,
208 .name = "powernow-k6",
209 .owner = THIS_MODULE,
210 .attr = powernow_k6_attr,
211};
212
213
214/**
215 * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
216 *
 217 * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
 218 * devices, -EIO or -EINVAL on problems during initialization, and zero
 219 * on success.
220 */
221static int __init powernow_k6_init(void)
222{
223 struct cpuinfo_x86 *c = &cpu_data(0);
224
225 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
226 ((c->x86_model != 12) && (c->x86_model != 13)))
227 return -ENODEV;
228
229 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
230 printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
231 return -EIO;
232 }
233
234 if (cpufreq_register_driver(&powernow_k6_driver)) {
235 release_region(POWERNOW_IOPORT, 16);
236 return -EINVAL;
237 }
238
239 return 0;
240}
241
242
243/**
244 * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
245 *
246 * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
247 */
248static void __exit powernow_k6_exit(void)
249{
250 cpufreq_unregister_driver(&powernow_k6_driver);
251 release_region(POWERNOW_IOPORT, 16);
252}
253
254
255MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
256 "Dominik Brodowski <linux@brodo.de>");
257MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
258MODULE_LICENSE("GPL");
259
260module_init(powernow_k6_init);
261module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
deleted file mode 100644
index 4a45fd6e41ba..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ /dev/null
@@ -1,752 +0,0 @@
1/*
2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 * Based upon datasheets & sample CPUs kindly provided by AMD.
8 *
9 * Errata 5:
10 * CPU may fail to execute a FID/VID change in presence of interrupt.
11 * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
12 * Errata 15:
13 * CPU with half frequency multipliers may hang upon wakeup from disconnect.
14 * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
15 */
16
17#include <linux/kernel.h>
18#include <linux/module.h>
19#include <linux/moduleparam.h>
20#include <linux/init.h>
21#include <linux/cpufreq.h>
22#include <linux/slab.h>
23#include <linux/string.h>
24#include <linux/dmi.h>
25#include <linux/timex.h>
26#include <linux/io.h>
27
28#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */
29#include <asm/msr.h>
30#include <asm/system.h>
31
32#ifdef CONFIG_X86_POWERNOW_K7_ACPI
33#include <linux/acpi.h>
34#include <acpi/processor.h>
35#endif
36
37#include "powernow-k7.h"
38
39#define PFX "powernow: "
40
41
42struct psb_s {
43 u8 signature[10];
44 u8 tableversion;
45 u8 flags;
46 u16 settlingtime;
47 u8 reserved1;
48 u8 numpst;
49};
50
51struct pst_s {
52 u32 cpuid;
53 u8 fsbspeed;
54 u8 maxfid;
55 u8 startvid;
56 u8 numpstates;
57};
58
59#ifdef CONFIG_X86_POWERNOW_K7_ACPI
60union powernow_acpi_control_t {
61 struct {
62 unsigned long fid:5,
63 vid:5,
64 sgtc:20,
65 res1:2;
66 } bits;
67 unsigned long val;
68};
69#endif
70
71#ifdef CONFIG_CPU_FREQ_DEBUG
72/* divide by 1000 to get VCore voltage in V. */
73static const int mobile_vid_table[32] = {
74 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
75 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
76 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
77 1075, 1050, 1025, 1000, 975, 950, 925, 0,
78};
79#endif
80
81/* divide by 10 to get FID. */
82static const int fid_codes[32] = {
83 110, 115, 120, 125, 50, 55, 60, 65,
84 70, 75, 80, 85, 90, 95, 100, 105,
85 30, 190, 40, 200, 130, 135, 140, 210,
86 150, 225, 160, 165, 170, 180, -1, -1,
87};
88
 89/* This parameter forces use of ACPI instead of the legacy (PSB/PST)
 90 * method for configuration.
 91 */
92
93static int acpi_force;
94
95static struct cpufreq_frequency_table *powernow_table;
96
97static unsigned int can_scale_bus;
98static unsigned int can_scale_vid;
99static unsigned int minimum_speed = -1;
100static unsigned int maximum_speed;
101static unsigned int number_scales;
102static unsigned int fsb;
103static unsigned int latency;
104static char have_a0;
105
106#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
107 "powernow-k7", msg)
108
109static int check_fsb(unsigned int fsbspeed)
110{
111 int delta;
112 unsigned int f = fsb / 1000;
113
114 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
115 return delta < 5;
116}
117
118static int check_powernow(void)
119{
120 struct cpuinfo_x86 *c = &cpu_data(0);
121 unsigned int maxei, eax, ebx, ecx, edx;
122
123 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
124#ifdef MODULE
125 printk(KERN_INFO PFX "This module only works with "
126 "AMD K7 CPUs\n");
127#endif
128 return 0;
129 }
130
131 /* Get maximum capabilities */
132 maxei = cpuid_eax(0x80000000);
133 if (maxei < 0x80000007) { /* Any powernow info ? */
134#ifdef MODULE
135 printk(KERN_INFO PFX "No powernow capabilities detected\n");
136#endif
137 return 0;
138 }
139
140 if ((c->x86_model == 6) && (c->x86_mask == 0)) {
141 printk(KERN_INFO PFX "K7 660[A0] core detected, "
142 "enabling errata workarounds\n");
143 have_a0 = 1;
144 }
145
146 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
147
148 /* Check we can actually do something before we say anything.*/
149 if (!(edx & (1 << 1 | 1 << 2)))
150 return 0;
151
152 printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
153
154 if (edx & 1 << 1) {
155 printk("frequency");
156 can_scale_bus = 1;
157 }
158
159 if ((edx & (1 << 1 | 1 << 2)) == 0x6)
160 printk(" and ");
161
162 if (edx & 1 << 2) {
163 printk("voltage");
164 can_scale_vid = 1;
165 }
166
167 printk(".\n");
168 return 1;
169}
170
171#ifdef CONFIG_X86_POWERNOW_K7_ACPI
172static void invalidate_entry(unsigned int entry)
173{
174 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
175}
176#endif
177
178static int get_ranges(unsigned char *pst)
179{
180 unsigned int j;
181 unsigned int speed;
182 u8 fid, vid;
183
184 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
185 (number_scales + 1)), GFP_KERNEL);
186 if (!powernow_table)
187 return -ENOMEM;
188
189 for (j = 0 ; j < number_scales; j++) {
190 fid = *pst++;
191
192 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
193 powernow_table[j].index = fid; /* lower 8 bits */
194
195 speed = powernow_table[j].frequency;
196
197 if ((fid_codes[fid] % 10) == 5) {
198#ifdef CONFIG_X86_POWERNOW_K7_ACPI
199 if (have_a0 == 1)
200 invalidate_entry(j);
201#endif
202 }
203
204 if (speed < minimum_speed)
205 minimum_speed = speed;
206 if (speed > maximum_speed)
207 maximum_speed = speed;
208
209 vid = *pst++;
210 powernow_table[j].index |= (vid << 8); /* upper 8 bits */
211
212 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
213 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
214 fid_codes[fid] % 10, speed/1000, vid,
215 mobile_vid_table[vid]/1000,
216 mobile_vid_table[vid]%1000);
217 }
218 powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
219 powernow_table[number_scales].index = 0;
220
221 return 0;
222}
223
224
225static void change_FID(int fid)
226{
227 union msr_fidvidctl fidvidctl;
228
229 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
230 if (fidvidctl.bits.FID != fid) {
231 fidvidctl.bits.SGTC = latency;
232 fidvidctl.bits.FID = fid;
233 fidvidctl.bits.VIDC = 0;
234 fidvidctl.bits.FIDC = 1;
235 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
236 }
237}
238
239
240static void change_VID(int vid)
241{
242 union msr_fidvidctl fidvidctl;
243
244 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
245 if (fidvidctl.bits.VID != vid) {
246 fidvidctl.bits.SGTC = latency;
247 fidvidctl.bits.VID = vid;
248 fidvidctl.bits.FIDC = 0;
249 fidvidctl.bits.VIDC = 1;
250 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
251 }
252}
253
254
255static void change_speed(unsigned int index)
256{
257 u8 fid, vid;
258 struct cpufreq_freqs freqs;
259 union msr_fidvidstatus fidvidstatus;
260 int cfid;
261
 262	/* the fid is stored in the lower 8 bits of the index we wrote
 263	 * into the cpufreq frequency table in powernow_decode_bios,
 264	 * the vid in the upper 8 bits.
 265	 */
266
267 fid = powernow_table[index].index & 0xFF;
268 vid = (powernow_table[index].index & 0xFF00) >> 8;
269
270 freqs.cpu = 0;
271
272 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
273 cfid = fidvidstatus.bits.CFID;
274 freqs.old = fsb * fid_codes[cfid] / 10;
275
276 freqs.new = powernow_table[index].frequency;
277
278 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
279
280 /* Now do the magic poking into the MSRs. */
281
282 if (have_a0 == 1) /* A0 errata 5 */
283 local_irq_disable();
284
285 if (freqs.old > freqs.new) {
286 /* Going down, so change FID first */
287 change_FID(fid);
288 change_VID(vid);
289 } else {
290 /* Going up, so change VID first */
291 change_VID(vid);
292 change_FID(fid);
293 }
294
295
296 if (have_a0 == 1)
297 local_irq_enable();
298
299 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
300}
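The index packing that change_speed() unpacks above (FID in bits 7:0, VID in bits 15:8, written by get_ranges() and powernow_acpi_init()) can be restated as a pair of trivial helpers (illustrative only):

static unsigned int k7_pack_index(unsigned char fid, unsigned char vid)
{
	return fid | (vid << 8);
}

static void k7_unpack_index(unsigned int index,
			    unsigned char *fid, unsigned char *vid)
{
	*fid = index & 0xFF;
	*vid = (index & 0xFF00) >> 8;
}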
301
302
303#ifdef CONFIG_X86_POWERNOW_K7_ACPI
304
305static struct acpi_processor_performance *acpi_processor_perf;
306
307static int powernow_acpi_init(void)
308{
309 int i;
310 int retval = 0;
311 union powernow_acpi_control_t pc;
312
313 if (acpi_processor_perf != NULL && powernow_table != NULL) {
314 retval = -EINVAL;
315 goto err0;
316 }
317
318 acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
319 GFP_KERNEL);
320 if (!acpi_processor_perf) {
321 retval = -ENOMEM;
322 goto err0;
323 }
324
325 if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
326 GFP_KERNEL)) {
327 retval = -ENOMEM;
328 goto err05;
329 }
330
331 if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
332 retval = -EIO;
333 goto err1;
334 }
335
336 if (acpi_processor_perf->control_register.space_id !=
337 ACPI_ADR_SPACE_FIXED_HARDWARE) {
338 retval = -ENODEV;
339 goto err2;
340 }
341
342 if (acpi_processor_perf->status_register.space_id !=
343 ACPI_ADR_SPACE_FIXED_HARDWARE) {
344 retval = -ENODEV;
345 goto err2;
346 }
347
348 number_scales = acpi_processor_perf->state_count;
349
350 if (number_scales < 2) {
351 retval = -ENODEV;
352 goto err2;
353 }
354
355 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
356 (number_scales + 1)), GFP_KERNEL);
357 if (!powernow_table) {
358 retval = -ENOMEM;
359 goto err2;
360 }
361
362 pc.val = (unsigned long) acpi_processor_perf->states[0].control;
363 for (i = 0; i < number_scales; i++) {
364 u8 fid, vid;
365 struct acpi_processor_px *state =
366 &acpi_processor_perf->states[i];
367 unsigned int speed, speed_mhz;
368
369 pc.val = (unsigned long) state->control;
370 dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
371 i,
372 (u32) state->core_frequency,
373 (u32) state->power,
374 (u32) state->transition_latency,
375 (u32) state->control,
376 pc.bits.sgtc);
377
378 vid = pc.bits.vid;
379 fid = pc.bits.fid;
380
381 powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
382 powernow_table[i].index = fid; /* lower 8 bits */
383 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
384
385 speed = powernow_table[i].frequency;
386 speed_mhz = speed / 1000;
387
388 /* processor_perflib will multiply the MHz value by 1000 to
 389		 * get a kHz value (e.g. 1266000). However, powernow-k7 works
 390		 * with true kHz values (e.g. 1266768). To ensure that all
 391		 * powernow frequencies are available, we must ensure that
 392		 * ACPI doesn't restrict them, so we round up the MHz value
 393		 * to ensure that perflib's computed kHz value is greater than
 394		 * or equal to powernow's kHz value.
395 */
396 if (speed % 1000 > 0)
397 speed_mhz++;
398
399 if ((fid_codes[fid] % 10) == 5) {
400 if (have_a0 == 1)
401 invalidate_entry(i);
402 }
403
404 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
405 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
406 fid_codes[fid] % 10, speed_mhz, vid,
407 mobile_vid_table[vid]/1000,
408 mobile_vid_table[vid]%1000);
409
410 if (state->core_frequency != speed_mhz) {
411 state->core_frequency = speed_mhz;
412 dprintk(" Corrected ACPI frequency to %d\n",
413 speed_mhz);
414 }
415
416 if (latency < pc.bits.sgtc)
417 latency = pc.bits.sgtc;
418
419 if (speed < minimum_speed)
420 minimum_speed = speed;
421 if (speed > maximum_speed)
422 maximum_speed = speed;
423 }
424
425 powernow_table[i].frequency = CPUFREQ_TABLE_END;
426 powernow_table[i].index = 0;
427
428 /* notify BIOS that we exist */
429 acpi_processor_notify_smm(THIS_MODULE);
430
431 return 0;
432
433err2:
434 acpi_processor_unregister_performance(acpi_processor_perf, 0);
435err1:
436 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
437err05:
438 kfree(acpi_processor_perf);
439err0:
440 printk(KERN_WARNING PFX "ACPI perflib can not be used on "
441 "this platform\n");
442 acpi_processor_perf = NULL;
443 return retval;
444}
445#else
446static int powernow_acpi_init(void)
447{
448 printk(KERN_INFO PFX "no support for ACPI processor found."
 449		" Please recompile your kernel with ACPI processor support\n");
450 return -EINVAL;
451}
452#endif
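The MHz round-up in powernow_acpi_init() exists because perflib stores MHz and later multiplies by 1000; rounding up guarantees that perflib's kHz value is never below powernow's exact kHz value (1266768 kHz becomes 1267 MHz, i.e. 1267000 kHz >= 1266768 kHz). A one-line restatement (illustrative only):

static unsigned int khz_to_mhz_round_up(unsigned int khz)
{
	return khz / 1000 + (khz % 1000 ? 1 : 0);
}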
453
454static void print_pst_entry(struct pst_s *pst, unsigned int j)
455{
456 dprintk("PST:%d (@%p)\n", j, pst);
457 dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
458 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
459}
460
461static int powernow_decode_bios(int maxfid, int startvid)
462{
463 struct psb_s *psb;
464 struct pst_s *pst;
465 unsigned int i, j;
466 unsigned char *p;
467 unsigned int etuple;
468 unsigned int ret;
469
470 etuple = cpuid_eax(0x80000001);
471
472 for (i = 0xC0000; i < 0xffff0 ; i += 16) {
473
474 p = phys_to_virt(i);
475
476 if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
477 dprintk("Found PSB header at %p\n", p);
478 psb = (struct psb_s *) p;
479 dprintk("Table version: 0x%x\n", psb->tableversion);
480 if (psb->tableversion != 0x12) {
481 printk(KERN_INFO PFX "Sorry, only v1.2 tables"
482 " supported right now\n");
483 return -ENODEV;
484 }
485
486 dprintk("Flags: 0x%x\n", psb->flags);
487 if ((psb->flags & 1) == 0)
488 dprintk("Mobile voltage regulator\n");
489 else
490 dprintk("Desktop voltage regulator\n");
491
492 latency = psb->settlingtime;
493 if (latency < 100) {
494 printk(KERN_INFO PFX "BIOS set settling time "
495 "to %d microseconds. "
496 "Should be at least 100. "
497 "Correcting.\n", latency);
498 latency = 100;
499 }
500 dprintk("Settling Time: %d microseconds.\n",
501 psb->settlingtime);
502 dprintk("Has %d PST tables. (Only dumping ones "
503 "relevant to this CPU).\n",
504 psb->numpst);
505
506 p += sizeof(struct psb_s);
507
508 pst = (struct pst_s *) p;
509
510 for (j = 0; j < psb->numpst; j++) {
511 pst = (struct pst_s *) p;
512 number_scales = pst->numpstates;
513
514 if ((etuple == pst->cpuid) &&
515 check_fsb(pst->fsbspeed) &&
516 (maxfid == pst->maxfid) &&
517 (startvid == pst->startvid)) {
518 print_pst_entry(pst, j);
519 p = (char *)pst + sizeof(struct pst_s);
520 ret = get_ranges(p);
521 return ret;
522 } else {
523 unsigned int k;
524 p = (char *)pst + sizeof(struct pst_s);
525 for (k = 0; k < number_scales; k++)
526 p += 2;
527 }
528 }
529 printk(KERN_INFO PFX "No PST tables match this cpuid "
530 "(0x%x)\n", etuple);
531 printk(KERN_INFO PFX "This is indicative of a broken "
532 "BIOS.\n");
533
534 return -EINVAL;
535 }
536 p++;
537 }
538
539 return -ENODEV;
540}
541
542
543static int powernow_target(struct cpufreq_policy *policy,
544 unsigned int target_freq,
545 unsigned int relation)
546{
547 unsigned int newstate;
548
549 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
550 relation, &newstate))
551 return -EINVAL;
552
553 change_speed(newstate);
554
555 return 0;
556}
557
558
559static int powernow_verify(struct cpufreq_policy *policy)
560{
561 return cpufreq_frequency_table_verify(policy, powernow_table);
562}
563
 564/*
 565 * We use the fact that the bus frequency is roughly a multiple of
 566 * 100000/3 kHz, and compute SGTC according to this multiple.
 567 *
 568 * That way the calculation matches more closely how AMD intends it
 569 * to work, and we end up with the same kind of behaviour already
 570 * observed under the "well-known" other OS.
 571 */
572static int __cpuinit fixup_sgtc(void)
573{
574 unsigned int sgtc;
575 unsigned int m;
576
577 m = fsb / 3333;
578 if ((m % 10) >= 5)
579 m += 5;
580
581 m /= 10;
582
583 sgtc = 100 * m * latency;
584 sgtc = sgtc / 3;
585 if (sgtc > 0xfffff) {
586 printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
587 sgtc = 0xfffff;
588 }
589 return sgtc;
590}
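Working through fixup_sgtc() for a concrete FSB helps: with fsb = 100000 kHz (100 MHz), m = 100000 / 3333 = 30, which rounds to 3 multiples of 100000/3 kHz, so sgtc = 100 * 3 * latency / 3 = 100 * latency bus clocks, clamped to the 20-bit SGTC field. The same arithmetic as a standalone sketch (illustrative only):

static unsigned int sgtc_for(unsigned int fsb_khz, unsigned int latency)
{
	unsigned int m = fsb_khz / 3333;
	unsigned int sgtc;

	if ((m % 10) >= 5)
		m += 5;
	m /= 10;

	sgtc = (100 * m * latency) / 3;
	return (sgtc > 0xfffff) ? 0xfffff : sgtc;
}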
591
592static unsigned int powernow_get(unsigned int cpu)
593{
594 union msr_fidvidstatus fidvidstatus;
595 unsigned int cfid;
596
597 if (cpu)
598 return 0;
599 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
600 cfid = fidvidstatus.bits.CFID;
601
602 return fsb * fid_codes[cfid] / 10;
603}
604
605
606static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
607{
608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n",
610 d->ident);
611 printk(KERN_WARNING PFX
612 "You need to downgrade to 3A21 (09/09/2002), or try a newer "
613 "BIOS than 3A71 (01/20/2003)\n");
614 printk(KERN_WARNING PFX
615 "cpufreq scaling has been disabled as a result of this.\n");
616 return 0;
617}
618
619/*
 620 * Some Athlon laptops have severely broken PST tables.
621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq.
623 */
624static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
625 {
626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire",
628 .matches = {
629 DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
630 DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
631 },
632 },
633 { }
634};
635
636static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
637{
638 union msr_fidvidstatus fidvidstatus;
639 int result;
640
641 if (policy->cpu != 0)
642 return -ENODEV;
643
644 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
645
646 recalibrate_cpu_khz();
647
648 fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
649 if (!fsb) {
650 printk(KERN_WARNING PFX "can not determine bus frequency\n");
651 return -EINVAL;
652 }
653 dprintk("FSB: %3dMHz\n", fsb/1000);
654
655 if (dmi_check_system(powernow_dmi_table) || acpi_force) {
656 printk(KERN_INFO PFX "PSB/PST known to be broken. "
657 "Trying ACPI instead\n");
658 result = powernow_acpi_init();
659 } else {
660 result = powernow_decode_bios(fidvidstatus.bits.MFID,
661 fidvidstatus.bits.SVID);
662 if (result) {
663 printk(KERN_INFO PFX "Trying ACPI perflib\n");
664 maximum_speed = 0;
665 minimum_speed = -1;
666 latency = 0;
667 result = powernow_acpi_init();
668 if (result) {
669 printk(KERN_INFO PFX
670 "ACPI and legacy methods failed\n");
671 }
672 } else {
673 /* SGTC use the bus clock as timer */
674 latency = fixup_sgtc();
675 printk(KERN_INFO PFX "SGTC: %d\n", latency);
676 }
677 }
678
679 if (result)
680 return result;
681
682 printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
683 minimum_speed/1000, maximum_speed/1000);
684
685 policy->cpuinfo.transition_latency =
686 cpufreq_scale(2000000UL, fsb, latency);
687
688 policy->cur = powernow_get(0);
689
690 cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
691
692 return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
693}
694
695static int powernow_cpu_exit(struct cpufreq_policy *policy)
696{
697 cpufreq_frequency_table_put_attr(policy->cpu);
698
699#ifdef CONFIG_X86_POWERNOW_K7_ACPI
700 if (acpi_processor_perf) {
701 acpi_processor_unregister_performance(acpi_processor_perf, 0);
702 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
703 kfree(acpi_processor_perf);
704 }
705#endif
706
707 kfree(powernow_table);
708 return 0;
709}
710
711static struct freq_attr *powernow_table_attr[] = {
712 &cpufreq_freq_attr_scaling_available_freqs,
713 NULL,
714};
715
716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify,
718 .target = powernow_target,
719 .get = powernow_get,
720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .bios_limit = acpi_processor_get_bios_limit,
722#endif
723 .init = powernow_cpu_init,
724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
728};
729
730static int __init powernow_init(void)
731{
732 if (check_powernow() == 0)
733 return -ENODEV;
734 return cpufreq_register_driver(&powernow_driver);
735}
736
737
738static void __exit powernow_exit(void)
739{
740 cpufreq_unregister_driver(&powernow_driver);
741}
742
743module_param(acpi_force, int, 0444);
744MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
745
746MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
747MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
748MODULE_LICENSE("GPL");
749
750late_initcall(powernow_init);
751module_exit(powernow_exit);
752
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
deleted file mode 100644
index 35fb4eaf6e1c..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ /dev/null
@@ -1,43 +0,0 @@
1/*
2 * (C) 2003 Dave Jones.
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * AMD-specific information
7 *
8 */
9
10union msr_fidvidctl {
11 struct {
12 unsigned FID:5, // 4:0
13 reserved1:3, // 7:5
14 VID:5, // 12:8
15 reserved2:3, // 15:13
16 FIDC:1, // 16
17 VIDC:1, // 17
18 reserved3:2, // 19:18
19 FIDCHGRATIO:1, // 20
20		 reserved4:11,	// 31:21
21		 SGTC:20,	// 51:32
22 reserved5:12; // 63:52
23 } bits;
24 unsigned long long val;
25};
26
27union msr_fidvidstatus {
28 struct {
29 unsigned CFID:5, // 4:0
30 reserved1:3, // 7:5
31 SFID:5, // 12:8
32 reserved2:3, // 15:13
33 MFID:5, // 20:16
34 reserved3:11, // 31:21
35 CVID:5, // 36:32
36 reserved4:3, // 39:37
37 SVID:5, // 44:40
38 reserved5:3, // 47:45
39 MVID:5, // 52:48
40 reserved6:11; // 63:53
41 } bits;
42 unsigned long long val;
43};
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
deleted file mode 100644
index 491977baf6c0..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ /dev/null
@@ -1,1601 +0,0 @@
1/*
2 * (c) 2003-2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Support : mark.langsdorf@amd.com
8 *
9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@ucw.cz>
13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 *
16 * Valuable input gratefully received from Dave Jones, Pavel Machek,
17 * Dominik Brodowski, Jacob Shin, and others.
18 * Originally developed by Paul Devriendt.
19 * Processor information obtained from Chapter 9 (Power and Thermal Management)
20 * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
21 * Opteron Processors" available for download from www.amd.com
22 *
23 * Tables for specific CPUs can be inferred from
24 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
25 */
26
27#include <linux/kernel.h>
28#include <linux/smp.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/cpufreq.h>
32#include <linux/slab.h>
33#include <linux/string.h>
34#include <linux/cpumask.h>
35#include <linux/sched.h> /* for current / set_cpus_allowed() */
36#include <linux/io.h>
37#include <linux/delay.h>
38
39#include <asm/msr.h>
40
41#include <linux/acpi.h>
42#include <linux/mutex.h>
43#include <acpi/processor.h>
44
45#define PFX "powernow-k8: "
46#define VERSION "version 2.20.00"
47#include "powernow-k8.h"
48#include "mperf.h"
49
50/* serialize freq changes */
51static DEFINE_MUTEX(fidvid_mutex);
52
53static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
54
55static int cpu_family = CPU_OPTERON;
56
57/* core performance boost */
58static bool cpb_capable, cpb_enabled;
59static struct msr __percpu *msrs;
60
61static struct cpufreq_driver cpufreq_amd64_driver;
62
63#ifndef CONFIG_SMP
64static inline const struct cpumask *cpu_core_mask(int cpu)
65{
66 return cpumask_of(0);
67}
68#endif
69
70/* Return a frequency in MHz, given an input fid */
71static u32 find_freq_from_fid(u32 fid)
72{
73 return 800 + (fid * 100);
74}
75
76/* Return a frequency in KHz, given an input fid */
77static u32 find_khz_freq_from_fid(u32 fid)
78{
79 return 1000 * find_freq_from_fid(fid);
80}
81
82static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
83 u32 pstate)
84{
85 return data[pstate].frequency;
86}
87
88/* Return the vco fid for an input fid
89 *
90 * Each "low" fid has a corresponding "high" fid, and "low" fids can be reached
91 * only from their corresponding high fids. This returns the "high" fid
92 * corresponding to a "low" one.
93 */
94static u32 convert_fid_to_vco_fid(u32 fid)
95{
96 if (fid < HI_FID_TABLE_BOTTOM)
97 return 8 + (2 * fid);
98 else
99 return fid;
100}
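
(Editorial illustration: the helpers above encode "100 MHz steps starting at 800 MHz" and the low-to-high VCO fid mapping. A standalone sketch with a hypothetical low-table fid:)

	/* Sketch only: same formulas as find_freq_from_fid()/convert_fid_to_vco_fid(). */
	#include <stdio.h>

	#define HI_FID_TABLE_BOTTOM 8			/* boundary copied from powernow-k8.h */

	int main(void)
	{
		unsigned int fid = 2;				/* hypothetical low-table fid */
		unsigned int mhz = 800 + (fid * 100);		/* 1000 MHz */
		unsigned int vco = (fid < HI_FID_TABLE_BOTTOM) ? 8 + (2 * fid) : fid;

		printf("fid %u -> %u MHz, vco fid %u (%u MHz)\n",
		       fid, mhz, vco, 800 + (vco * 100));	/* vco fid 12 -> 2000 MHz */
		return 0;
	}
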
101
102/*
103 * Return 1 if the pending bit is set. Unless we just instructed the processor
104 * to transition to a new state, seeing this bit set is really bad news.
105 */
106static int pending_bit_stuck(void)
107{
108 u32 lo, hi;
109
110 if (cpu_family == CPU_HW_PSTATE)
111 return 0;
112
113 rdmsr(MSR_FIDVID_STATUS, lo, hi);
114 return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
115}
116
117/*
118 * Update the global current fid / vid values from the status msr.
119 * Returns 1 on error.
120 */
121static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
122{
123 u32 lo, hi;
124 u32 i = 0;
125
126 if (cpu_family == CPU_HW_PSTATE) {
127 rdmsr(MSR_PSTATE_STATUS, lo, hi);
128 i = lo & HW_PSTATE_MASK;
129 data->currpstate = i;
130
131 /*
132 * a workaround for family 11h erratum 311 might cause
133		 * an "out-of-range" Pstate if the core is in Pstate-0
134 */
135 if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
136 data->currpstate = HW_PSTATE_0;
137
138 return 0;
139 }
140 do {
141 if (i++ > 10000) {
142 dprintk("detected change pending stuck\n");
143 return 1;
144 }
145 rdmsr(MSR_FIDVID_STATUS, lo, hi);
146 } while (lo & MSR_S_LO_CHANGE_PENDING);
147
148 data->currvid = hi & MSR_S_HI_CURRENT_VID;
149 data->currfid = lo & MSR_S_LO_CURRENT_FID;
150
151 return 0;
152}
153
154/* the isochronous relief time */
155static void count_off_irt(struct powernow_k8_data *data)
156{
157 udelay((1 << data->irt) * 10);
158 return;
159}
160
161/* the voltage stabilization time */
162static void count_off_vst(struct powernow_k8_data *data)
163{
164 udelay(data->vstable * VST_UNITS_20US);
165 return;
166}
167
168/* need to init the control msr to a safe value (for each cpu) */
169static void fidvid_msr_init(void)
170{
171 u32 lo, hi;
172 u8 fid, vid;
173
174 rdmsr(MSR_FIDVID_STATUS, lo, hi);
175 vid = hi & MSR_S_HI_CURRENT_VID;
176 fid = lo & MSR_S_LO_CURRENT_FID;
177 lo = fid | (vid << MSR_C_LO_VID_SHIFT);
178 hi = MSR_C_HI_STP_GNT_BENIGN;
179 dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
180 wrmsr(MSR_FIDVID_CTL, lo, hi);
181}
182
183/* write the new fid value along with the other control fields to the msr */
184static int write_new_fid(struct powernow_k8_data *data, u32 fid)
185{
186 u32 lo;
187 u32 savevid = data->currvid;
188 u32 i = 0;
189
190 if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
191 printk(KERN_ERR PFX "internal error - overflow on fid write\n");
192 return 1;
193 }
194
195 lo = fid;
196 lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
197 lo |= MSR_C_LO_INIT_FID_VID;
198
199 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
200 fid, lo, data->plllock * PLL_LOCK_CONVERSION);
201
202 do {
203 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
204 if (i++ > 100) {
205 printk(KERN_ERR PFX
206 "Hardware error - pending bit very stuck - "
207 "no further pstate changes possible\n");
208 return 1;
209 }
210 } while (query_current_values_with_pending_wait(data));
211
212 count_off_irt(data);
213
214 if (savevid != data->currvid) {
215 printk(KERN_ERR PFX
216 "vid change on fid trans, old 0x%x, new 0x%x\n",
217 savevid, data->currvid);
218 return 1;
219 }
220
221 if (fid != data->currfid) {
222 printk(KERN_ERR PFX
223 "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
224 data->currfid);
225 return 1;
226 }
227
228 return 0;
229}
230
231/* Write a new vid to the hardware */
232static int write_new_vid(struct powernow_k8_data *data, u32 vid)
233{
234 u32 lo;
235 u32 savefid = data->currfid;
236 int i = 0;
237
238 if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
239 printk(KERN_ERR PFX "internal error - overflow on vid write\n");
240 return 1;
241 }
242
243 lo = data->currfid;
244 lo |= (vid << MSR_C_LO_VID_SHIFT);
245 lo |= MSR_C_LO_INIT_FID_VID;
246
247 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
248 vid, lo, STOP_GRANT_5NS);
249
250 do {
251 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
252 if (i++ > 100) {
253 printk(KERN_ERR PFX "internal error - pending bit "
254 "very stuck - no further pstate "
255 "changes possible\n");
256 return 1;
257 }
258 } while (query_current_values_with_pending_wait(data));
259
260 if (savefid != data->currfid) {
261 printk(KERN_ERR PFX "fid changed on vid trans, old "
262 "0x%x new 0x%x\n",
263 savefid, data->currfid);
264 return 1;
265 }
266
267 if (vid != data->currvid) {
268 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
269 "curr 0x%x\n",
270 vid, data->currvid);
271 return 1;
272 }
273
274 return 0;
275}
276
277/*
278 * Reduce the vid code toward reqvid, stepping by at most step codes at a time.
279 * Decreasing vid codes represent increasing voltages:
280 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
281 */
282static int decrease_vid_code_by_step(struct powernow_k8_data *data,
283 u32 reqvid, u32 step)
284{
285 if ((data->currvid - reqvid) > step)
286 reqvid = data->currvid - step;
287
288 if (write_new_vid(data, reqvid))
289 return 1;
290
291 count_off_vst(data);
292
293 return 0;
294}
295
296/* Change hardware pstate by single MSR write */
297static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
298{
299 wrmsr(MSR_PSTATE_CTRL, pstate, 0);
300 data->currpstate = pstate;
301 return 0;
302}
303
304/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
305static int transition_fid_vid(struct powernow_k8_data *data,
306 u32 reqfid, u32 reqvid)
307{
308 if (core_voltage_pre_transition(data, reqvid, reqfid))
309 return 1;
310
311 if (core_frequency_transition(data, reqfid))
312 return 1;
313
314 if (core_voltage_post_transition(data, reqvid))
315 return 1;
316
317 if (query_current_values_with_pending_wait(data))
318 return 1;
319
320 if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
321 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
322 "curr 0x%x 0x%x\n",
323 smp_processor_id(),
324 reqfid, reqvid, data->currfid, data->currvid);
325 return 1;
326 }
327
328 dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
329 smp_processor_id(), data->currfid, data->currvid);
330
331 return 0;
332}
333
334/* Phase 1 - core voltage transition ... setup voltage */
335static int core_voltage_pre_transition(struct powernow_k8_data *data,
336 u32 reqvid, u32 reqfid)
337{
338 u32 rvosteps = data->rvo;
339 u32 savefid = data->currfid;
340 u32 maxvid, lo, rvomult = 1;
341
342 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
343 "reqvid 0x%x, rvo 0x%x\n",
344 smp_processor_id(),
345 data->currfid, data->currvid, reqvid, data->rvo);
346
347 if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
348 rvomult = 2;
349 rvosteps *= rvomult;
350 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
351 maxvid = 0x1f & (maxvid >> 16);
352 dprintk("ph1 maxvid=0x%x\n", maxvid);
353 if (reqvid < maxvid) /* lower numbers are higher voltages */
354 reqvid = maxvid;
355
356 while (data->currvid > reqvid) {
357 dprintk("ph1: curr 0x%x, req vid 0x%x\n",
358 data->currvid, reqvid);
359 if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
360 return 1;
361 }
362
363 while ((rvosteps > 0) &&
364 ((rvomult * data->rvo + data->currvid) > reqvid)) {
365 if (data->currvid == maxvid) {
366 rvosteps = 0;
367 } else {
368 dprintk("ph1: changing vid for rvo, req 0x%x\n",
369 data->currvid - 1);
370 if (decrease_vid_code_by_step(data, data->currvid-1, 1))
371 return 1;
372 rvosteps--;
373 }
374 }
375
376 if (query_current_values_with_pending_wait(data))
377 return 1;
378
379 if (savefid != data->currfid) {
380 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
381 data->currfid);
382 return 1;
383 }
384
385 dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
386 data->currfid, data->currvid);
387
388 return 0;
389}
390
391/* Phase 2 - core frequency transition */
392static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
393{
394 u32 vcoreqfid, vcocurrfid, vcofiddiff;
395 u32 fid_interval, savevid = data->currvid;
396
397 if (data->currfid == reqfid) {
398 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
399 data->currfid);
400 return 0;
401 }
402
403 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
404 "reqfid 0x%x\n",
405 smp_processor_id(),
406 data->currfid, data->currvid, reqfid);
407
408 vcoreqfid = convert_fid_to_vco_fid(reqfid);
409 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
410 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
411 : vcoreqfid - vcocurrfid;
412
413 if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
414 vcofiddiff = 0;
415
416 while (vcofiddiff > 2) {
417 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
418
419 if (reqfid > data->currfid) {
420 if (data->currfid > LO_FID_TABLE_TOP) {
421 if (write_new_fid(data,
422 data->currfid + fid_interval))
423 return 1;
424 } else {
425 if (write_new_fid
426 (data,
427 2 + convert_fid_to_vco_fid(data->currfid)))
428 return 1;
429 }
430 } else {
431 if (write_new_fid(data, data->currfid - fid_interval))
432 return 1;
433 }
434
435 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
436 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
437 : vcoreqfid - vcocurrfid;
438 }
439
440 if (write_new_fid(data, reqfid))
441 return 1;
442
443 if (query_current_values_with_pending_wait(data))
444 return 1;
445
446 if (data->currfid != reqfid) {
447 printk(KERN_ERR PFX
448 "ph2: mismatch, failed fid transition, "
449 "curr 0x%x, req 0x%x\n",
450 data->currfid, reqfid);
451 return 1;
452 }
453
454 if (savevid != data->currvid) {
455 printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
456 savevid, data->currvid);
457 return 1;
458 }
459
460 dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
461 data->currfid, data->currvid);
462
463 return 0;
464}
465
466/* Phase 3 - core voltage transition flow ... jump to the final vid. */
467static int core_voltage_post_transition(struct powernow_k8_data *data,
468 u32 reqvid)
469{
470 u32 savefid = data->currfid;
471 u32 savereqvid = reqvid;
472
473 dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
474 smp_processor_id(),
475 data->currfid, data->currvid);
476
477 if (reqvid != data->currvid) {
478 if (write_new_vid(data, reqvid))
479 return 1;
480
481 if (savefid != data->currfid) {
482 printk(KERN_ERR PFX
483 "ph3: bad fid change, save 0x%x, curr 0x%x\n",
484 savefid, data->currfid);
485 return 1;
486 }
487
488 if (data->currvid != reqvid) {
489 printk(KERN_ERR PFX
490			       "ph3: failed vid transition, "
491			       "req 0x%x, curr 0x%x\n",
492 reqvid, data->currvid);
493 return 1;
494 }
495 }
496
497 if (query_current_values_with_pending_wait(data))
498 return 1;
499
500 if (savereqvid != data->currvid) {
501 dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
502 return 1;
503 }
504
505 if (savefid != data->currfid) {
506 dprintk("ph3 failed, currfid changed 0x%x\n",
507 data->currfid);
508 return 1;
509 }
510
511 dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
512 data->currfid, data->currvid);
513
514 return 0;
515}
516
517static void check_supported_cpu(void *_rc)
518{
519 u32 eax, ebx, ecx, edx;
520 int *rc = _rc;
521
522 *rc = -ENODEV;
523
524 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
525 return;
526
527 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
528 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
529 ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
530 return;
531
532 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
533 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
534 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
535 printk(KERN_INFO PFX
536 "Processor cpuid %x not supported\n", eax);
537 return;
538 }
539
540 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
541 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
542 printk(KERN_INFO PFX
543 "No frequency change capabilities detected\n");
544 return;
545 }
546
547 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
548 if ((edx & P_STATE_TRANSITION_CAPABLE)
549 != P_STATE_TRANSITION_CAPABLE) {
550 printk(KERN_INFO PFX
551 "Power state transitions not supported\n");
552 return;
553 }
554 } else { /* must be a HW Pstate capable processor */
555 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
556 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
557 cpu_family = CPU_HW_PSTATE;
558 else
559 return;
560 }
561
562 *rc = 0;
563}
564
565static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
566 u8 maxvid)
567{
568 unsigned int j;
569 u8 lastfid = 0xff;
570
571 for (j = 0; j < data->numps; j++) {
572 if (pst[j].vid > LEAST_VID) {
573 printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
574 j, pst[j].vid);
575 return -EINVAL;
576 }
577 if (pst[j].vid < data->rvo) {
578 /* vid + rvo >= 0 */
579 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
580 " %d\n", j);
581 return -ENODEV;
582 }
583 if (pst[j].vid < maxvid + data->rvo) {
584 /* vid + rvo >= maxvid */
585 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
586 " %d\n", j);
587 return -ENODEV;
588 }
589 if (pst[j].fid > MAX_FID) {
590 printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
591 " %d\n", j);
592 return -ENODEV;
593 }
594 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
595 /* Only first fid is allowed to be in "low" range */
596 printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
597 "0x%x\n", j, pst[j].fid);
598 return -EINVAL;
599 }
600 if (pst[j].fid < lastfid)
601 lastfid = pst[j].fid;
602 }
603 if (lastfid & 1) {
604 printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
605 return -EINVAL;
606 }
607 if (lastfid > LO_FID_TABLE_TOP)
608 printk(KERN_INFO FW_BUG PFX
609 "first fid not from lo freq table\n");
610
611 return 0;
612}
613
614static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
615 unsigned int entry)
616{
617 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
618}
619
620static void print_basics(struct powernow_k8_data *data)
621{
622 int j;
623 for (j = 0; j < data->numps; j++) {
624 if (data->powernow_table[j].frequency !=
625 CPUFREQ_ENTRY_INVALID) {
626 if (cpu_family == CPU_HW_PSTATE) {
627 printk(KERN_INFO PFX
628 " %d : pstate %d (%d MHz)\n", j,
629 data->powernow_table[j].index,
630 data->powernow_table[j].frequency/1000);
631 } else {
632 printk(KERN_INFO PFX
633 " %d : fid 0x%x (%d MHz), vid 0x%x\n",
634 j,
635 data->powernow_table[j].index & 0xff,
636 data->powernow_table[j].frequency/1000,
637 data->powernow_table[j].index >> 8);
638 }
639 }
640 }
641 if (data->batps)
642 printk(KERN_INFO PFX "Only %d pstates on battery\n",
643 data->batps);
644}
645
646static u32 freq_from_fid_did(u32 fid, u32 did)
647{
648 u32 mhz = 0;
649
650 if (boot_cpu_data.x86 == 0x10)
651 mhz = (100 * (fid + 0x10)) >> did;
652 else if (boot_cpu_data.x86 == 0x11)
653 mhz = (100 * (fid + 8)) >> did;
654 else
655 BUG();
656
657 return mhz * 1000;
658}
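
(Editorial note: a worked reading of the family-0x10 formula above, with made-up fid/did values.)

	/*
	 * Family 0x10: mhz = (100 * (fid + 0x10)) >> did
	 *   fid = 0x10, did = 0  ->  100 * 0x20       = 3200 MHz
	 *   fid = 0x10, did = 1  -> (100 * 0x20) >> 1 = 1600 MHz
	 * freq_from_fid_did() then returns that value in kHz (mhz * 1000).
	 */
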
659
660static int fill_powernow_table(struct powernow_k8_data *data,
661 struct pst_s *pst, u8 maxvid)
662{
663 struct cpufreq_frequency_table *powernow_table;
664 unsigned int j;
665
666 if (data->batps) {
667 /* use ACPI support to get full speed on mains power */
668 printk(KERN_WARNING PFX
669 "Only %d pstates usable (use ACPI driver for full "
670			"range)\n", data->batps);
671 data->numps = data->batps;
672 }
673
674 for (j = 1; j < data->numps; j++) {
675 if (pst[j-1].fid >= pst[j].fid) {
676 printk(KERN_ERR PFX "PST out of sequence\n");
677 return -EINVAL;
678 }
679 }
680
681 if (data->numps < 2) {
682 printk(KERN_ERR PFX "no p states to transition\n");
683 return -ENODEV;
684 }
685
686 if (check_pst_table(data, pst, maxvid))
687 return -EINVAL;
688
689 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
690 * (data->numps + 1)), GFP_KERNEL);
691 if (!powernow_table) {
692 printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
693 return -ENOMEM;
694 }
695
696 for (j = 0; j < data->numps; j++) {
697 int freq;
698 powernow_table[j].index = pst[j].fid; /* lower 8 bits */
699 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
700 freq = find_khz_freq_from_fid(pst[j].fid);
701 powernow_table[j].frequency = freq;
702 }
703 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
704 powernow_table[data->numps].index = 0;
705
706 if (query_current_values_with_pending_wait(data)) {
707 kfree(powernow_table);
708 return -EIO;
709 }
710
711 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
712 data->powernow_table = powernow_table;
713 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
714 print_basics(data);
715
716 for (j = 0; j < data->numps; j++)
717 if ((pst[j].fid == data->currfid) &&
718 (pst[j].vid == data->currvid))
719 return 0;
720
721 dprintk("currfid/vid do not match PST, ignoring\n");
722 return 0;
723}
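
(Editorial illustration: fill_powernow_table() packs the fid into the low byte of the cpufreq table index and the vid into the next byte; the fid/vid pair below is hypothetical.)

	/* Sketch only: pack and unpack a cpufreq table index as the driver does. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int fid = 0x0e, vid = 0x12;		/* hypothetical fid/vid pair */
		unsigned int index = fid | (vid << 8);		/* 0x120e */

		printf("index 0x%x -> fid 0x%x, vid 0x%x, %u kHz\n",
		       index, index & 0xff, (index >> 8) & 0xff,
		       1000 * (800 + ((index & 0xff) * 100)));	/* 2200000 kHz */
		return 0;
	}
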
724
725/* Find and validate the PSB/PST table in BIOS. */
726static int find_psb_table(struct powernow_k8_data *data)
727{
728 struct psb_s *psb;
729 unsigned int i;
730 u32 mvs;
731 u8 maxvid;
732 u32 cpst = 0;
733 u32 thiscpuid;
734
735 for (i = 0xc0000; i < 0xffff0; i += 0x10) {
736 /* Scan BIOS looking for the signature. */
737 /* It can not be at ffff0 - it is too big. */
738
739 psb = phys_to_virt(i);
740 if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
741 continue;
742
743 dprintk("found PSB header at 0x%p\n", psb);
744
745 dprintk("table vers: 0x%x\n", psb->tableversion);
746 if (psb->tableversion != PSB_VERSION_1_4) {
747 printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
748 return -ENODEV;
749 }
750
751 dprintk("flags: 0x%x\n", psb->flags1);
752 if (psb->flags1) {
753 printk(KERN_ERR FW_BUG PFX "unknown flags\n");
754 return -ENODEV;
755 }
756
757 data->vstable = psb->vstable;
758 dprintk("voltage stabilization time: %d(*20us)\n",
759 data->vstable);
760
761 dprintk("flags2: 0x%x\n", psb->flags2);
762 data->rvo = psb->flags2 & 3;
763 data->irt = ((psb->flags2) >> 2) & 3;
764 mvs = ((psb->flags2) >> 4) & 3;
765 data->vidmvs = 1 << mvs;
766 data->batps = ((psb->flags2) >> 6) & 3;
767
768 dprintk("ramp voltage offset: %d\n", data->rvo);
769 dprintk("isochronous relief time: %d\n", data->irt);
770 dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
771
772 dprintk("numpst: 0x%x\n", psb->num_tables);
773 cpst = psb->num_tables;
774 if ((psb->cpuid == 0x00000fc0) ||
775 (psb->cpuid == 0x00000fe0)) {
776 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
777 if ((thiscpuid == 0x00000fc0) ||
778 (thiscpuid == 0x00000fe0))
779 cpst = 1;
780 }
781 if (cpst != 1) {
782 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
783 return -ENODEV;
784 }
785
786 data->plllock = psb->plllocktime;
787 dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
788 dprintk("maxfid: 0x%x\n", psb->maxfid);
789 dprintk("maxvid: 0x%x\n", psb->maxvid);
790 maxvid = psb->maxvid;
791
792 data->numps = psb->numps;
793 dprintk("numpstates: 0x%x\n", data->numps);
794 return fill_powernow_table(data,
795 (struct pst_s *)(psb+1), maxvid);
796 }
797 /*
798 * If you see this message, complain to BIOS manufacturer. If
799 * he tells you "we do not support Linux" or some similar
800 * nonsense, remember that Windows 2000 uses the same legacy
801 * mechanism that the old Linux PSB driver uses. Tell them it
802 * is broken with Windows 2000.
803 *
804 * The reference to the AMD documentation is chapter 9 in the
805 * BIOS and Kernel Developer's Guide, which is available on
806 * www.amd.com
807 */
808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
809 printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
810 " and Cool'N'Quiet support is enabled in BIOS setup\n");
811 return -ENODEV;
812}
813
814static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
815 unsigned int index)
816{
817 u64 control;
818
819 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
820 return;
821
822 control = data->acpi_data.states[index].control;
823 data->irt = (control >> IRT_SHIFT) & IRT_MASK;
824 data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
825 data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
826 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
827 data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
828 data->vstable = (control >> VST_SHIFT) & VST_MASK;
829}
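
(Editorial illustration: powernow_k8_acpi_pst_values() above slices one ACPI _PSS "control" dword using the shift/mask constants from powernow-k8.h; a standalone decode of a made-up control value:)

	/* Sketch only: constants copied from powernow-k8.h, control value is invented. */
	#include <stdio.h>

	#define IRT_SHIFT 30
	#define RVO_SHIFT 28
	#define VST_SHIFT 11
	#define IRT_MASK  3
	#define RVO_MASK  3
	#define VST_MASK  0x7f

	int main(void)
	{
		unsigned long long control = 0x80002800ULL;	/* hypothetical _PSS control */

		printf("irt %llu, rvo %llu, vstable %llu (x20us)\n",
		       (control >> IRT_SHIFT) & IRT_MASK,
		       (control >> RVO_SHIFT) & RVO_MASK,
		       (control >> VST_SHIFT) & VST_MASK);
		return 0;
	}
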
830
831static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
832{
833 struct cpufreq_frequency_table *powernow_table;
834 int ret_val = -ENODEV;
835 u64 control, status;
836
837 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
838 dprintk("register performance failed: bad ACPI data\n");
839 return -EIO;
840 }
841
842 /* verify the data contained in the ACPI structures */
843 if (data->acpi_data.state_count <= 1) {
844 dprintk("No ACPI P-States\n");
845 goto err_out;
846 }
847
848 control = data->acpi_data.control_register.space_id;
849 status = data->acpi_data.status_register.space_id;
850
851 if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
852 (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
853 dprintk("Invalid control/status registers (%x - %x)\n",
854 control, status);
855 goto err_out;
856 }
857
858 /* fill in data->powernow_table */
859 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
860 * (data->acpi_data.state_count + 1)), GFP_KERNEL);
861 if (!powernow_table) {
862 dprintk("powernow_table memory alloc failure\n");
863 goto err_out;
864 }
865
866 /* fill in data */
867 data->numps = data->acpi_data.state_count;
868 powernow_k8_acpi_pst_values(data, 0);
869
870 if (cpu_family == CPU_HW_PSTATE)
871 ret_val = fill_powernow_table_pstate(data, powernow_table);
872 else
873 ret_val = fill_powernow_table_fidvid(data, powernow_table);
874 if (ret_val)
875 goto err_out_mem;
876
877 powernow_table[data->acpi_data.state_count].frequency =
878 CPUFREQ_TABLE_END;
879 powernow_table[data->acpi_data.state_count].index = 0;
880 data->powernow_table = powernow_table;
881
882 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
883 print_basics(data);
884
885 /* notify BIOS that we exist */
886 acpi_processor_notify_smm(THIS_MODULE);
887
888 if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
889 printk(KERN_ERR PFX
890 "unable to alloc powernow_k8_data cpumask\n");
891 ret_val = -ENOMEM;
892 goto err_out_mem;
893 }
894
895 return 0;
896
897err_out_mem:
898 kfree(powernow_table);
899
900err_out:
901 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
902
903 /* data->acpi_data.state_count informs us at ->exit()
904 * whether ACPI was used */
905 data->acpi_data.state_count = 0;
906
907 return ret_val;
908}
909
910static int fill_powernow_table_pstate(struct powernow_k8_data *data,
911 struct cpufreq_frequency_table *powernow_table)
912{
913 int i;
914 u32 hi = 0, lo = 0;
915 rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
916 data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
917
918 for (i = 0; i < data->acpi_data.state_count; i++) {
919 u32 index;
920
921 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
922 if (index > data->max_hw_pstate) {
923 printk(KERN_ERR PFX "invalid pstate %d - "
924 "bad value %d.\n", i, index);
925 printk(KERN_ERR PFX "Please report to BIOS "
926 "manufacturer\n");
927 invalidate_entry(powernow_table, i);
928 continue;
929 }
930 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
931 if (!(hi & HW_PSTATE_VALID_MASK)) {
932 dprintk("invalid pstate %d, ignoring\n", index);
933 invalidate_entry(powernow_table, i);
934 continue;
935 }
936
937 powernow_table[i].index = index;
938
939 /* Frequency may be rounded for these */
940 if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
941 || boot_cpu_data.x86 == 0x11) {
942 powernow_table[i].frequency =
943 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
944 } else
945 powernow_table[i].frequency =
946 data->acpi_data.states[i].core_frequency * 1000;
947 }
948 return 0;
949}
950
951static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
952 struct cpufreq_frequency_table *powernow_table)
953{
954 int i;
955
956 for (i = 0; i < data->acpi_data.state_count; i++) {
957 u32 fid;
958 u32 vid;
959 u32 freq, index;
960 u64 status, control;
961
962 if (data->exttype) {
963 status = data->acpi_data.states[i].status;
964 fid = status & EXT_FID_MASK;
965 vid = (status >> VID_SHIFT) & EXT_VID_MASK;
966 } else {
967 control = data->acpi_data.states[i].control;
968 fid = control & FID_MASK;
969 vid = (control >> VID_SHIFT) & VID_MASK;
970 }
971
972 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
973
974 index = fid | (vid<<8);
975 powernow_table[i].index = index;
976
977 freq = find_khz_freq_from_fid(fid);
978 powernow_table[i].frequency = freq;
979
980 /* verify frequency is OK */
981 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
982 dprintk("invalid freq %u kHz, ignoring\n", freq);
983 invalidate_entry(powernow_table, i);
984 continue;
985 }
986
987 /* verify voltage is OK -
988 * BIOSs are using "off" to indicate invalid */
989 if (vid == VID_OFF) {
990 dprintk("invalid vid %u, ignoring\n", vid);
991 invalidate_entry(powernow_table, i);
992 continue;
993 }
994
995 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
996 printk(KERN_INFO PFX "invalid freq entries "
997 "%u kHz vs. %u kHz\n", freq,
998 (unsigned int)
999 (data->acpi_data.states[i].core_frequency
1000 * 1000));
1001 invalidate_entry(powernow_table, i);
1002 continue;
1003 }
1004 }
1005 return 0;
1006}
1007
1008static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
1009{
1010 if (data->acpi_data.state_count)
1011 acpi_processor_unregister_performance(&data->acpi_data,
1012 data->cpu);
1013 free_cpumask_var(data->acpi_data.shared_cpu_map);
1014}
1015
1016static int get_transition_latency(struct powernow_k8_data *data)
1017{
1018 int max_latency = 0;
1019 int i;
1020 for (i = 0; i < data->acpi_data.state_count; i++) {
1021 int cur_latency = data->acpi_data.states[i].transition_latency
1022 + data->acpi_data.states[i].bus_master_latency;
1023 if (cur_latency > max_latency)
1024 max_latency = cur_latency;
1025 }
1026 if (max_latency == 0) {
1027 /*
1028 * Fam 11h and later may return 0 as transition latency. This
1029 * is intended and means "very fast". While cpufreq core and
1030 * governors currently can handle that gracefully, better set it
1031 * to 1 to avoid problems in the future.
1032 */
1033 if (boot_cpu_data.x86 < 0x11)
1034 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1035 "latency\n");
1036 max_latency = 1;
1037 }
1038 /* value in usecs, needs to be in nanoseconds */
1039 return 1000 * max_latency;
1040}
1041
1042/* Take a frequency, and issue the fid/vid transition command */
1043static int transition_frequency_fidvid(struct powernow_k8_data *data,
1044 unsigned int index)
1045{
1046 u32 fid = 0;
1047 u32 vid = 0;
1048 int res, i;
1049 struct cpufreq_freqs freqs;
1050
1051 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1052
1053 /* fid/vid correctness check for k8 */
1054 /* fid are the lower 8 bits of the index we stored into
1055 * the cpufreq frequency table in find_psb_table, vid
1056 * are the upper 8 bits.
1057 */
1058 fid = data->powernow_table[index].index & 0xFF;
1059 vid = (data->powernow_table[index].index & 0xFF00) >> 8;
1060
1061 dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
1062
1063 if (query_current_values_with_pending_wait(data))
1064 return 1;
1065
1066 if ((data->currvid == vid) && (data->currfid == fid)) {
1067 dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
1068 fid, vid);
1069 return 0;
1070 }
1071
1072 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
1073 smp_processor_id(), fid, vid);
1074 freqs.old = find_khz_freq_from_fid(data->currfid);
1075 freqs.new = find_khz_freq_from_fid(fid);
1076
1077 for_each_cpu(i, data->available_cores) {
1078 freqs.cpu = i;
1079 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1080 }
1081
1082 res = transition_fid_vid(data, fid, vid);
1083 freqs.new = find_khz_freq_from_fid(data->currfid);
1084
1085 for_each_cpu(i, data->available_cores) {
1086 freqs.cpu = i;
1087 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1088 }
1089 return res;
1090}
1091
1092/* Take a frequency, and issue the hardware pstate transition command */
1093static int transition_frequency_pstate(struct powernow_k8_data *data,
1094 unsigned int index)
1095{
1096 u32 pstate = 0;
1097 int res, i;
1098 struct cpufreq_freqs freqs;
1099
1100 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1101
1102 /* get MSR index for hardware pstate transition */
1103 pstate = index & HW_PSTATE_MASK;
1104 if (pstate > data->max_hw_pstate)
1105 return 0;
1106 freqs.old = find_khz_freq_from_pstate(data->powernow_table,
1107 data->currpstate);
1108 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1109
1110 for_each_cpu(i, data->available_cores) {
1111 freqs.cpu = i;
1112 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1113 }
1114
1115 res = transition_pstate(data, pstate);
1116 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1117
1118 for_each_cpu(i, data->available_cores) {
1119 freqs.cpu = i;
1120 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1121 }
1122 return res;
1123}
1124
1125/* Driver entry point to switch to the target frequency */
1126static int powernowk8_target(struct cpufreq_policy *pol,
1127 unsigned targfreq, unsigned relation)
1128{
1129 cpumask_var_t oldmask;
1130 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1131 u32 checkfid;
1132 u32 checkvid;
1133 unsigned int newstate;
1134 int ret = -EIO;
1135
1136 if (!data)
1137 return -EINVAL;
1138
1139 checkfid = data->currfid;
1140 checkvid = data->currvid;
1141
1142 /* only run on specific CPU from here on. */
1143 /* This is poor form: use a workqueue or smp_call_function_single */
1144 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1145 return -ENOMEM;
1146
1147 cpumask_copy(oldmask, tsk_cpus_allowed(current));
1148 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1149
1150 if (smp_processor_id() != pol->cpu) {
1151 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1152 goto err_out;
1153 }
1154
1155 if (pending_bit_stuck()) {
1156 printk(KERN_ERR PFX "failing targ, change pending bit set\n");
1157 goto err_out;
1158 }
1159
1160 dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
1161 pol->cpu, targfreq, pol->min, pol->max, relation);
1162
1163 if (query_current_values_with_pending_wait(data))
1164 goto err_out;
1165
1166 if (cpu_family != CPU_HW_PSTATE) {
1167 dprintk("targ: curr fid 0x%x, vid 0x%x\n",
1168 data->currfid, data->currvid);
1169
1170 if ((checkvid != data->currvid) ||
1171 (checkfid != data->currfid)) {
1172 printk(KERN_INFO PFX
1173				"error - out of sync, fid 0x%x 0x%x, "
1174 "vid 0x%x 0x%x\n",
1175 checkfid, data->currfid,
1176 checkvid, data->currvid);
1177 }
1178 }
1179
1180 if (cpufreq_frequency_table_target(pol, data->powernow_table,
1181 targfreq, relation, &newstate))
1182 goto err_out;
1183
1184 mutex_lock(&fidvid_mutex);
1185
1186 powernow_k8_acpi_pst_values(data, newstate);
1187
1188 if (cpu_family == CPU_HW_PSTATE)
1189 ret = transition_frequency_pstate(data, newstate);
1190 else
1191 ret = transition_frequency_fidvid(data, newstate);
1192 if (ret) {
1193 printk(KERN_ERR PFX "transition frequency failed\n");
1194 ret = 1;
1195 mutex_unlock(&fidvid_mutex);
1196 goto err_out;
1197 }
1198 mutex_unlock(&fidvid_mutex);
1199
1200 if (cpu_family == CPU_HW_PSTATE)
1201 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1202 newstate);
1203 else
1204 pol->cur = find_khz_freq_from_fid(data->currfid);
1205 ret = 0;
1206
1207err_out:
1208 set_cpus_allowed_ptr(current, oldmask);
1209 free_cpumask_var(oldmask);
1210 return ret;
1211}
1212
1213/* Driver entry point to verify the policy and range of frequencies */
1214static int powernowk8_verify(struct cpufreq_policy *pol)
1215{
1216 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1217
1218 if (!data)
1219 return -EINVAL;
1220
1221 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1222}
1223
1224struct init_on_cpu {
1225 struct powernow_k8_data *data;
1226 int rc;
1227};
1228
1229static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
1230{
1231 struct init_on_cpu *init_on_cpu = _init_on_cpu;
1232
1233 if (pending_bit_stuck()) {
1234 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1235 init_on_cpu->rc = -ENODEV;
1236 return;
1237 }
1238
1239 if (query_current_values_with_pending_wait(init_on_cpu->data)) {
1240 init_on_cpu->rc = -ENODEV;
1241 return;
1242 }
1243
1244 if (cpu_family == CPU_OPTERON)
1245 fidvid_msr_init();
1246
1247 init_on_cpu->rc = 0;
1248}
1249
1250/* per CPU init entry point to the driver */
1251static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1252{
1253 static const char ACPI_PSS_BIOS_BUG_MSG[] =
1254 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1255 FW_BUG PFX "Try again with latest BIOS.\n";
1256 struct powernow_k8_data *data;
1257 struct init_on_cpu init_on_cpu;
1258 int rc;
1259 struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
1260
1261 if (!cpu_online(pol->cpu))
1262 return -ENODEV;
1263
1264 smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
1265 if (rc)
1266 return -ENODEV;
1267
1268 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
1269 if (!data) {
1270 printk(KERN_ERR PFX "unable to alloc powernow_k8_data");
1271 return -ENOMEM;
1272 }
1273
1274 data->cpu = pol->cpu;
1275 data->currpstate = HW_PSTATE_INVALID;
1276
1277 if (powernow_k8_cpu_init_acpi(data)) {
1278 /*
1279		 * Use the PSB BIOS structure. This is only available on
1280		 * a uniprocessor (UP) version, and is deprecated by AMD.
1281 */
1282 if (num_online_cpus() != 1) {
1283 printk_once(ACPI_PSS_BIOS_BUG_MSG);
1284 goto err_out;
1285 }
1286 if (pol->cpu != 0) {
1287 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1288 "CPU other than CPU0. Complain to your BIOS "
1289 "vendor.\n");
1290 goto err_out;
1291 }
1292 rc = find_psb_table(data);
1293 if (rc)
1294 goto err_out;
1295
1296		/* Take a crude guess here.
1297		 * The guess is in microseconds, so multiply by 1000 for nanoseconds */
1298 pol->cpuinfo.transition_latency = (
1299 ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
1300 ((1 << data->irt) * 30)) * 1000;
1301 } else /* ACPI _PSS objects available */
1302 pol->cpuinfo.transition_latency = get_transition_latency(data);
1303
1304 /* only run on specific CPU from here on */
1305 init_on_cpu.data = data;
1306 smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
1307 &init_on_cpu, 1);
1308 rc = init_on_cpu.rc;
1309 if (rc != 0)
1310 goto err_out_exit_acpi;
1311
1312 if (cpu_family == CPU_HW_PSTATE)
1313 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
1314 else
1315 cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
1316 data->available_cores = pol->cpus;
1317
1318 if (cpu_family == CPU_HW_PSTATE)
1319 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1320 data->currpstate);
1321 else
1322 pol->cur = find_khz_freq_from_fid(data->currfid);
1323 dprintk("policy current frequency %d kHz\n", pol->cur);
1324
1325 /* min/max the cpu is capable of */
1326 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1327 printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
1328 powernow_k8_cpu_exit_acpi(data);
1329 kfree(data->powernow_table);
1330 kfree(data);
1331 return -EINVAL;
1332 }
1333
1334 /* Check for APERF/MPERF support in hardware */
1335 if (cpu_has(c, X86_FEATURE_APERFMPERF))
1336 cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
1337
1338 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1339
1340 if (cpu_family == CPU_HW_PSTATE)
1341 dprintk("cpu_init done, current pstate 0x%x\n",
1342 data->currpstate);
1343 else
1344 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1345 data->currfid, data->currvid);
1346
1347 per_cpu(powernow_data, pol->cpu) = data;
1348
1349 return 0;
1350
1351err_out_exit_acpi:
1352 powernow_k8_cpu_exit_acpi(data);
1353
1354err_out:
1355 kfree(data);
1356 return -ENODEV;
1357}
1358
1359static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1360{
1361 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1362
1363 if (!data)
1364 return -EINVAL;
1365
1366 powernow_k8_cpu_exit_acpi(data);
1367
1368 cpufreq_frequency_table_put_attr(pol->cpu);
1369
1370 kfree(data->powernow_table);
1371 kfree(data);
1372 per_cpu(powernow_data, pol->cpu) = NULL;
1373
1374 return 0;
1375}
1376
1377static void query_values_on_cpu(void *_err)
1378{
1379 int *err = _err;
1380 struct powernow_k8_data *data = __get_cpu_var(powernow_data);
1381
1382 *err = query_current_values_with_pending_wait(data);
1383}
1384
1385static unsigned int powernowk8_get(unsigned int cpu)
1386{
1387 struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
1388 unsigned int khz = 0;
1389 int err;
1390
1391 if (!data)
1392 return 0;
1393
1394 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1395 if (err)
1396 goto out;
1397
1398 if (cpu_family == CPU_HW_PSTATE)
1399 khz = find_khz_freq_from_pstate(data->powernow_table,
1400 data->currpstate);
1401 else
1402 khz = find_khz_freq_from_fid(data->currfid);
1403
1404
1405out:
1406 return khz;
1407}
1408
1409static void _cpb_toggle_msrs(bool t)
1410{
1411 int cpu;
1412
1413 get_online_cpus();
1414
1415 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1416
1417 for_each_cpu(cpu, cpu_online_mask) {
1418 struct msr *reg = per_cpu_ptr(msrs, cpu);
1419 if (t)
1420 reg->l &= ~BIT(25);
1421 else
1422 reg->l |= BIT(25);
1423 }
1424 wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1425
1426 put_online_cpus();
1427}
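
	/*
	 * Editorial note (hedged): on these parts BIT(25) of MSR_K7_HWCR is
	 * understood to be the boost-disable bit, so clearing it above
	 * (t == true) enables core performance boost on every online CPU and
	 * setting it disables boost; get/put_online_cpus() keeps the set of
	 * online CPUs stable while the rdmsr/wrmsr pair runs.
	 */
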
1428
1429/*
1430 * Switch on/off core performance boosting.
1431 *
1432 * 0=disable
1433 * 1=enable.
1434 */
1435static void cpb_toggle(bool t)
1436{
1437 if (!cpb_capable)
1438 return;
1439
1440 if (t && !cpb_enabled) {
1441 cpb_enabled = true;
1442 _cpb_toggle_msrs(t);
1443 printk(KERN_INFO PFX "Core Boosting enabled.\n");
1444 } else if (!t && cpb_enabled) {
1445 cpb_enabled = false;
1446 _cpb_toggle_msrs(t);
1447 printk(KERN_INFO PFX "Core Boosting disabled.\n");
1448 }
1449}
1450
1451static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
1452 size_t count)
1453{
1454 int ret = -EINVAL;
1455 unsigned long val = 0;
1456
1457 ret = strict_strtoul(buf, 10, &val);
1458 if (!ret && (val == 0 || val == 1) && cpb_capable)
1459 cpb_toggle(val);
1460 else
1461 return -EINVAL;
1462
1463 return count;
1464}
1465
1466static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
1467{
1468 return sprintf(buf, "%u\n", cpb_enabled);
1469}
1470
1471#define define_one_rw(_name) \
1472static struct freq_attr _name = \
1473__ATTR(_name, 0644, show_##_name, store_##_name)
1474
1475define_one_rw(cpb);
1476
1477static struct freq_attr *powernow_k8_attr[] = {
1478 &cpufreq_freq_attr_scaling_available_freqs,
1479 &cpb,
1480 NULL,
1481};
1482
1483static struct cpufreq_driver cpufreq_amd64_driver = {
1484 .verify = powernowk8_verify,
1485 .target = powernowk8_target,
1486 .bios_limit = acpi_processor_get_bios_limit,
1487 .init = powernowk8_cpu_init,
1488 .exit = __devexit_p(powernowk8_cpu_exit),
1489 .get = powernowk8_get,
1490 .name = "powernow-k8",
1491 .owner = THIS_MODULE,
1492 .attr = powernow_k8_attr,
1493};
1494
1495/*
1496 * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
1497 * cannot block the remaining ones from boosting. On the CPU_UP path we
1498 * simply keep the boost-disable flag in sync with the current global
1499 * state.
1500 */
1501static int cpb_notify(struct notifier_block *nb, unsigned long action,
1502 void *hcpu)
1503{
1504 unsigned cpu = (long)hcpu;
1505 u32 lo, hi;
1506
1507 switch (action) {
1508 case CPU_UP_PREPARE:
1509 case CPU_UP_PREPARE_FROZEN:
1510
1511 if (!cpb_enabled) {
1512 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1513 lo |= BIT(25);
1514 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1515 }
1516 break;
1517
1518 case CPU_DOWN_PREPARE:
1519 case CPU_DOWN_PREPARE_FROZEN:
1520 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1521 lo &= ~BIT(25);
1522 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1523 break;
1524
1525 default:
1526 break;
1527 }
1528
1529 return NOTIFY_OK;
1530}
1531
1532static struct notifier_block cpb_nb = {
1533 .notifier_call = cpb_notify,
1534};
1535
1536/* driver entry point for init */
1537static int __cpuinit powernowk8_init(void)
1538{
1539 unsigned int i, supported_cpus = 0, cpu;
1540
1541 for_each_online_cpu(i) {
1542 int rc;
1543 smp_call_function_single(i, check_supported_cpu, &rc, 1);
1544 if (rc == 0)
1545 supported_cpus++;
1546 }
1547
1548 if (supported_cpus != num_online_cpus())
1549 return -ENODEV;
1550
1551 printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
1552 num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
1553
1554 if (boot_cpu_has(X86_FEATURE_CPB)) {
1555
1556 cpb_capable = true;
1557
1558 register_cpu_notifier(&cpb_nb);
1559
1560 msrs = msrs_alloc();
1561 if (!msrs) {
1562 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1563 return -ENOMEM;
1564 }
1565
1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1567
1568 for_each_cpu(cpu, cpu_online_mask) {
1569 struct msr *reg = per_cpu_ptr(msrs, cpu);
1570			cpb_enabled |= !(reg->l & BIT(25));
1571 }
1572
1573 printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
1574 (cpb_enabled ? "on" : "off"));
1575 }
1576
1577 return cpufreq_register_driver(&cpufreq_amd64_driver);
1578}
1579
1580/* driver entry point for term */
1581static void __exit powernowk8_exit(void)
1582{
1583 dprintk("exit\n");
1584
1585 if (boot_cpu_has(X86_FEATURE_CPB)) {
1586 msrs_free(msrs);
1587 msrs = NULL;
1588
1589 unregister_cpu_notifier(&cpb_nb);
1590 }
1591
1592 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1593}
1594
1595MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
1596 "Mark Langsdorf <mark.langsdorf@amd.com>");
1597MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
1598MODULE_LICENSE("GPL");
1599
1600late_initcall(powernowk8_init);
1601module_exit(powernowk8_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
deleted file mode 100644
index df3529b1c02d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ /dev/null
@@ -1,224 +0,0 @@
1/*
2 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8enum pstate {
9 HW_PSTATE_INVALID = 0xff,
10 HW_PSTATE_0 = 0,
11 HW_PSTATE_1 = 1,
12 HW_PSTATE_2 = 2,
13 HW_PSTATE_3 = 3,
14 HW_PSTATE_4 = 4,
15 HW_PSTATE_5 = 5,
16 HW_PSTATE_6 = 6,
17 HW_PSTATE_7 = 7,
18};
19
20struct powernow_k8_data {
21 unsigned int cpu;
22
23 u32 numps; /* number of p-states */
24 u32 batps; /* number of p-states supported on battery */
25 u32 max_hw_pstate; /* maximum legal hardware pstate */
26
27 /* these values are constant when the PSB is used to determine
28 * vid/fid pairings, but are modified during the ->target() call
29 * when ACPI is used */
30 u32 rvo; /* ramp voltage offset */
31 u32 irt; /* isochronous relief time */
32 u32 vidmvs; /* usable value calculated from mvs */
33 u32 vstable; /* voltage stabilization time, units 20 us */
34 u32 plllock; /* pll lock time, units 1 us */
35 u32 exttype; /* extended interface = 1 */
36
37 /* keep track of the current fid / vid or pstate */
38 u32 currvid;
39 u32 currfid;
40 enum pstate currpstate;
41
42 /* the powernow_table includes all frequency and vid/fid pairings:
43 * fid are the lower 8 bits of the index, vid are the upper 8 bits.
44 * frequency is in kHz */
45 struct cpufreq_frequency_table *powernow_table;
46
47 /* the acpi table needs to be kept. it's only available if ACPI was
48 * used to determine valid frequency/vid/fid states */
49 struct acpi_processor_performance acpi_data;
50
51 /* we need to keep track of associated cores, but let cpufreq
52 * handle hotplug events - so just point at cpufreq pol->cpus
53 * structure */
54 struct cpumask *available_cores;
55};
56
57/* processor's cpuid instruction support */
58#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
59#define CPUID_XFAM 0x0ff00000 /* extended family */
60#define CPUID_XFAM_K8 0
61#define CPUID_XMOD 0x000f0000 /* extended model */
62#define CPUID_XMOD_REV_MASK 0x000c0000
63#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */
64#define CPUID_USE_XFAM_XMOD 0x00000f00
65#define CPUID_GET_MAX_CAPABILITIES 0x80000000
66#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
67#define P_STATE_TRANSITION_CAPABLE 6
68
69/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
70/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
71/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
72/* the register number is placed in ecx, and the data is returned in edx:eax. */
73
74#define MSR_FIDVID_CTL 0xc0010041
75#define MSR_FIDVID_STATUS 0xc0010042
76
77/* Field definitions within the FID VID Low Control MSR : */
78#define MSR_C_LO_INIT_FID_VID 0x00010000
79#define MSR_C_LO_NEW_VID 0x00003f00
80#define MSR_C_LO_NEW_FID 0x0000003f
81#define MSR_C_LO_VID_SHIFT 8
82
83/* Field definitions within the FID VID High Control MSR : */
84#define MSR_C_HI_STP_GNT_TO 0x000fffff
85
86/* Field definitions within the FID VID Low Status MSR : */
87#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
88#define MSR_S_LO_MAX_RAMP_VID 0x3f000000
89#define MSR_S_LO_MAX_FID 0x003f0000
90#define MSR_S_LO_START_FID 0x00003f00
91#define MSR_S_LO_CURRENT_FID 0x0000003f
92
93/* Field definitions within the FID VID High Status MSR : */
94#define MSR_S_HI_MIN_WORKING_VID 0x3f000000
95#define MSR_S_HI_MAX_WORKING_VID 0x003f0000
96#define MSR_S_HI_START_VID 0x00003f00
97#define MSR_S_HI_CURRENT_VID 0x0000003f
98#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
99
100
101/* Hardware Pstate _PSS and MSR definitions */
102#define USE_HW_PSTATE 0x00000080
103#define HW_PSTATE_MASK 0x00000007
104#define HW_PSTATE_VALID_MASK 0x80000000
105#define HW_PSTATE_MAX_MASK 0x000000f0
106#define HW_PSTATE_MAX_SHIFT 4
107#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
108#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
109#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
110#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */
111
112/* define the two driver architectures */
113#define CPU_OPTERON 0
114#define CPU_HW_PSTATE 1
115
116
117/*
118 * There are restrictions frequencies have to follow:
119 * - only 1 entry in the low fid table ( <=1.4GHz )
120 * - lowest entry in the high fid table must be >= 2 * the entry in the
121 * low fid table
122 * - lowest entry in the high fid table must be a <= 200MHz + 2 * the entry
123 * in the low fid table
124 * - the parts can only step at <= 200 MHz intervals, odd fid values are
125 * supported in revision G and later revisions.
126 * - lowest frequency must be >= interprocessor hypertransport link speed
127 * (only applies to MP systems obviously)
128 */
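
/*
 * Editorial example (not from AMD documentation): with a low-table fid of 2,
 * i.e. 800 + 2*100 = 1000 MHz, the first two rules above put the lowest
 * high-table entry between 2*1000 = 2000 MHz and 200 + 2*1000 = 2200 MHz,
 * i.e. fid 12 or fid 14 (fid 13 only on revision G and later parts).
 */
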
129
130/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
131#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */
132#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
133
134#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
135#define HI_VCOFREQ_TABLE_BOTTOM 1600
136
137#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
138
139#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
140#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */
141
142#define MIN_FREQ 800 /* Min and max freqs, per spec */
143#define MAX_FREQ 5000
144
145#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */
146#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */
147
148#define VID_OFF 0x3f
149
150#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
151
152#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
153
154#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
155#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */
156
157/*
158 * Most values of interest are encoded in a single field of the _PSS
159 * entries: the "control" value.
160 */
161
162#define IRT_SHIFT 30
163#define RVO_SHIFT 28
164#define EXT_TYPE_SHIFT 27
165#define PLL_L_SHIFT 20
166#define MVS_SHIFT 18
167#define VST_SHIFT 11
168#define VID_SHIFT 6
169#define IRT_MASK 3
170#define RVO_MASK 3
171#define EXT_TYPE_MASK 1
172#define PLL_L_MASK 0x7f
173#define MVS_MASK 3
174#define VST_MASK 0x7f
175#define VID_MASK 0x1f
176#define FID_MASK 0x1f
177#define EXT_VID_MASK 0x3f
178#define EXT_FID_MASK 0x3f
179
180
181/*
182 * Version 1.4 of the PSB table. This table is constructed by BIOS and is
183 * to tell the OS's power management driver which VIDs and FIDs are
184 * supported by this particular processor.
185 * If the data in the PSB / PST is wrong, then this driver will program the
186 * wrong values into hardware, which is very likely to lead to a crash.
187 */
188
189#define PSB_ID_STRING "AMDK7PNOW!"
190#define PSB_ID_STRING_LEN 10
191
192#define PSB_VERSION_1_4 0x14
193
194struct psb_s {
195 u8 signature[10];
196 u8 tableversion;
197 u8 flags1;
198 u16 vstable;
199 u8 flags2;
200 u8 num_tables;
201 u32 cpuid;
202 u8 plllocktime;
203 u8 maxfid;
204 u8 maxvid;
205 u8 numps;
206};
207
208/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
209struct pst_s {
210 u8 fid;
211 u8 vid;
212};
213
214#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
215
216static int core_voltage_pre_transition(struct powernow_k8_data *data,
217	u32 reqvid, u32 reqfid);
218static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
219static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
220
221static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
222
223static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
224static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
deleted file mode 100644
index 435a996a613a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ /dev/null
@@ -1,194 +0,0 @@
1/*
2 * sc520_freq.c: cpufreq driver for the AMD Elan sc520
3 *
4 * Copyright (C) 2005 Sean Young <sean@mess.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Based on elanfreq.c
12 *
13 * 2005-03-30: - initial revision
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19
20#include <linux/delay.h>
21#include <linux/cpufreq.h>
22#include <linux/timex.h>
23#include <linux/io.h>
24
25#include <asm/msr.h>
26
27#define MMCR_BASE 0xfffef000 /* The default base address */
28#define OFFS_CPUCTL 0x2 /* CPU Control Register */
29
30static __u8 __iomem *cpuctl;
31
32#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
33 "sc520_freq", msg)
34#define PFX "sc520_freq: "
35
36static struct cpufreq_frequency_table sc520_freq_table[] = {
37 {0x01, 100000},
38 {0x02, 133000},
39 {0, CPUFREQ_TABLE_END},
40};
41
42static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
43{
44 u8 clockspeed_reg = *cpuctl;
45
46 switch (clockspeed_reg & 0x03) {
47 default:
48 printk(KERN_ERR PFX "error: cpuctl register has unexpected "
49 "value %02x\n", clockspeed_reg);
50 case 0x01:
51 return 100000;
52 case 0x02:
53 return 133000;
54 }
55}
56
57static void sc520_freq_set_cpu_state(unsigned int state)
58{
59
60 struct cpufreq_freqs freqs;
61 u8 clockspeed_reg;
62
63 freqs.old = sc520_freq_get_cpu_frequency(0);
64 freqs.new = sc520_freq_table[state].frequency;
65 freqs.cpu = 0; /* AMD Elan is UP */
66
67 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
68
69 dprintk("attempting to set frequency to %i kHz\n",
70 sc520_freq_table[state].frequency);
71
72 local_irq_disable();
73
74 clockspeed_reg = *cpuctl & ~0x03;
75 *cpuctl = clockspeed_reg | sc520_freq_table[state].index;
76
77 local_irq_enable();
78
79 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
80}
81
82static int sc520_freq_verify(struct cpufreq_policy *policy)
83{
84 return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
85}
86
87static int sc520_freq_target(struct cpufreq_policy *policy,
88 unsigned int target_freq,
89 unsigned int relation)
90{
91 unsigned int newstate = 0;
92
93 if (cpufreq_frequency_table_target(policy, sc520_freq_table,
94 target_freq, relation, &newstate))
95 return -EINVAL;
96
97 sc520_freq_set_cpu_state(newstate);
98
99 return 0;
100}
101
102
103/*
104 * Module init and exit code
105 */
106
107static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
108{
109 struct cpuinfo_x86 *c = &cpu_data(0);
110 int result;
111
112 /* capability check */
113 if (c->x86_vendor != X86_VENDOR_AMD ||
114 c->x86 != 4 || c->x86_model != 9)
115 return -ENODEV;
116
117 /* cpuinfo and default policy values */
118 policy->cpuinfo.transition_latency = 1000000; /* 1ms */
119 policy->cur = sc520_freq_get_cpu_frequency(0);
120
121 result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
122 if (result)
123 return result;
124
125 cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
126
127 return 0;
128}
129
130
131static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
132{
133 cpufreq_frequency_table_put_attr(policy->cpu);
134 return 0;
135}
136
137
138static struct freq_attr *sc520_freq_attr[] = {
139 &cpufreq_freq_attr_scaling_available_freqs,
140 NULL,
141};
142
143
144static struct cpufreq_driver sc520_freq_driver = {
145 .get = sc520_freq_get_cpu_frequency,
146 .verify = sc520_freq_verify,
147 .target = sc520_freq_target,
148 .init = sc520_freq_cpu_init,
149 .exit = sc520_freq_cpu_exit,
150 .name = "sc520_freq",
151 .owner = THIS_MODULE,
152 .attr = sc520_freq_attr,
153};
154
155
156static int __init sc520_freq_init(void)
157{
158 struct cpuinfo_x86 *c = &cpu_data(0);
159 int err;
160
161 /* Test if we have the right hardware */
162 if (c->x86_vendor != X86_VENDOR_AMD ||
163 c->x86 != 4 || c->x86_model != 9) {
164 dprintk("no Elan SC520 processor found!\n");
165 return -ENODEV;
166 }
167 cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
168 if (!cpuctl) {
169 printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
170 return -ENOMEM;
171 }
172
173 err = cpufreq_register_driver(&sc520_freq_driver);
174 if (err)
175 iounmap(cpuctl);
176
177 return err;
178}
179
180
181static void __exit sc520_freq_exit(void)
182{
183 cpufreq_unregister_driver(&sc520_freq_driver);
184 iounmap(cpuctl);
185}
186
187
188MODULE_LICENSE("GPL");
189MODULE_AUTHOR("Sean Young <sean@mess.org>");
190MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
191
192module_init(sc520_freq_init);
193module_exit(sc520_freq_exit);
194
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
deleted file mode 100644
index 9b1ff37de46a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ /dev/null
@@ -1,636 +0,0 @@
1/*
2 * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
3 * M (part of the Centrino chipset).
4 *
5 * Since the original Pentium M, most new Intel CPUs support Enhanced
6 * SpeedStep.
7 *
8 * Despite the "SpeedStep" in the name, this is almost entirely unlike
9 * traditional SpeedStep.
10 *
11 * Modelled on speedstep.c
12 *
13 * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/sched.h> /* current */
21#include <linux/delay.h>
22#include <linux/compiler.h>
23#include <linux/gfp.h>
24
25#include <asm/msr.h>
26#include <asm/processor.h>
27#include <asm/cpufeature.h>
28
29#define PFX "speedstep-centrino: "
30#define MAINTAINER "cpufreq@vger.kernel.org"
31
32#define dprintk(msg...) \
33 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
34
35#define INTEL_MSR_RANGE (0xffff)
36
37struct cpu_id
38{
39 __u8 x86; /* CPU family */
40 __u8 x86_model; /* model */
41 __u8 x86_mask; /* stepping */
42};
43
44enum {
45 CPU_BANIAS,
46 CPU_DOTHAN_A1,
47 CPU_DOTHAN_A2,
48 CPU_DOTHAN_B0,
49 CPU_MP4HT_D0,
50 CPU_MP4HT_E0,
51};
52
53static const struct cpu_id cpu_ids[] = {
54 [CPU_BANIAS] = { 6, 9, 5 },
55 [CPU_DOTHAN_A1] = { 6, 13, 1 },
56 [CPU_DOTHAN_A2] = { 6, 13, 2 },
57 [CPU_DOTHAN_B0] = { 6, 13, 6 },
58 [CPU_MP4HT_D0] = {15, 3, 4 },
59 [CPU_MP4HT_E0] = {15, 4, 1 },
60};
61#define N_IDS ARRAY_SIZE(cpu_ids)
62
63struct cpu_model
64{
65 const struct cpu_id *cpu_id;
66 const char *model_name;
67 unsigned max_freq; /* max clock in kHz */
68
69 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
70};
71static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
72 const struct cpu_id *x);
73
74/* Operating points for current CPU */
75static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
76static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
77
78static struct cpufreq_driver centrino_driver;
79
80#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
81
82/* Computes the correct form for IA32_PERF_CTL MSR for a particular
83 frequency/voltage operating point; frequency in MHz, volts in mV.
84 This is stored as "index" in the structure. */
85#define OP(mhz, mv) \
86 { \
87 .frequency = (mhz) * 1000, \
88 .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
89 }
90
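Editor's note: a worked example of the encoding above, using values from the banias_900 table that follows; the bit-field reading is derived directly from the macro and is meant only to make the packing concrete.

/*
 * OP(600, 844):
 *   .frequency = 600 * 1000                              = 600000 kHz
 *   .index     = ((600 / 100) << 8) | ((844 - 700) / 16)
 *              = (6 << 8) | 9                            = 0x0609
 * So bits 15:8 of the index carry the bus ratio (MHz / 100) and
 * bits 7:0 carry the voltage step above 700 mV in 16 mV units;
 * this is the value later written to IA32_PERF_CTL by centrino_target().
 */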
91/*
92 * These voltage tables were derived from the Intel Pentium M
93 * datasheet, document 25261202.pdf, Table 5. I have verified they
94 * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
95 * M.
96 */
97
98/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
99static struct cpufreq_frequency_table banias_900[] =
100{
101 OP(600, 844),
102 OP(800, 988),
103 OP(900, 1004),
104 { .frequency = CPUFREQ_TABLE_END }
105};
106
107/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
108static struct cpufreq_frequency_table banias_1000[] =
109{
110 OP(600, 844),
111 OP(800, 972),
112 OP(900, 988),
113 OP(1000, 1004),
114 { .frequency = CPUFREQ_TABLE_END }
115};
116
117/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
118static struct cpufreq_frequency_table banias_1100[] =
119{
120 OP( 600, 956),
121 OP( 800, 1020),
122 OP( 900, 1100),
123 OP(1000, 1164),
124 OP(1100, 1180),
125 { .frequency = CPUFREQ_TABLE_END }
126};
127
128
129/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
130static struct cpufreq_frequency_table banias_1200[] =
131{
132 OP( 600, 956),
133 OP( 800, 1004),
134 OP( 900, 1020),
135 OP(1000, 1100),
136 OP(1100, 1164),
137 OP(1200, 1180),
138 { .frequency = CPUFREQ_TABLE_END }
139};
140
141/* Intel Pentium M processor 1.30GHz (Banias) */
142static struct cpufreq_frequency_table banias_1300[] =
143{
144 OP( 600, 956),
145 OP( 800, 1260),
146 OP(1000, 1292),
147 OP(1200, 1356),
148 OP(1300, 1388),
149 { .frequency = CPUFREQ_TABLE_END }
150};
151
152/* Intel Pentium M processor 1.40GHz (Banias) */
153static struct cpufreq_frequency_table banias_1400[] =
154{
155 OP( 600, 956),
156 OP( 800, 1180),
157 OP(1000, 1308),
158 OP(1200, 1436),
159 OP(1400, 1484),
160 { .frequency = CPUFREQ_TABLE_END }
161};
162
163/* Intel Pentium M processor 1.50GHz (Banias) */
164static struct cpufreq_frequency_table banias_1500[] =
165{
166 OP( 600, 956),
167 OP( 800, 1116),
168 OP(1000, 1228),
169 OP(1200, 1356),
170 OP(1400, 1452),
171 OP(1500, 1484),
172 { .frequency = CPUFREQ_TABLE_END }
173};
174
175/* Intel Pentium M processor 1.60GHz (Banias) */
176static struct cpufreq_frequency_table banias_1600[] =
177{
178 OP( 600, 956),
179 OP( 800, 1036),
180 OP(1000, 1164),
181 OP(1200, 1276),
182 OP(1400, 1420),
183 OP(1600, 1484),
184 { .frequency = CPUFREQ_TABLE_END }
185};
186
187/* Intel Pentium M processor 1.70GHz (Banias) */
188static struct cpufreq_frequency_table banias_1700[] =
189{
190 OP( 600, 956),
191 OP( 800, 1004),
192 OP(1000, 1116),
193 OP(1200, 1228),
194 OP(1400, 1308),
195 OP(1700, 1484),
196 { .frequency = CPUFREQ_TABLE_END }
197};
198#undef OP
199
200#define _BANIAS(cpuid, max, name) \
201{ .cpu_id = cpuid, \
202 .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
203 .max_freq = (max)*1000, \
204 .op_points = banias_##max, \
205}
206#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
207
208/* CPU models, their operating frequency range, and freq/voltage
209 operating points */
210static struct cpu_model models[] =
211{
212 _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
213 BANIAS(1000),
214 BANIAS(1100),
215 BANIAS(1200),
216 BANIAS(1300),
217 BANIAS(1400),
218 BANIAS(1500),
219 BANIAS(1600),
220 BANIAS(1700),
221
222 /* NULL model_name is a wildcard */
223 { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
224 { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
225 { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
226 { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
227 { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
228
229 { NULL, }
230};
231#undef _BANIAS
232#undef BANIAS
233
234static int centrino_cpu_init_table(struct cpufreq_policy *policy)
235{
236 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
237 struct cpu_model *model;
238
239 for(model = models; model->cpu_id != NULL; model++)
240 if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
241 (model->model_name == NULL ||
242 strcmp(cpu->x86_model_id, model->model_name) == 0))
243 break;
244
245 if (model->cpu_id == NULL) {
246 /* No match at all */
247 dprintk("no support for CPU model \"%s\": "
248 "send /proc/cpuinfo to " MAINTAINER "\n",
249 cpu->x86_model_id);
250 return -ENOENT;
251 }
252
253 if (model->op_points == NULL) {
254 /* Matched a non-match */
255 dprintk("no table support for CPU model \"%s\"\n",
256 cpu->x86_model_id);
257 dprintk("try using the acpi-cpufreq driver\n");
258 return -ENOENT;
259 }
260
261 per_cpu(centrino_model, policy->cpu) = model;
262
263 dprintk("found \"%s\": max frequency: %dkHz\n",
264 model->model_name, model->max_freq);
265
266 return 0;
267}
268
269#else
270static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
271{
272 return -ENODEV;
273}
274#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
275
276static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
277 const struct cpu_id *x)
278{
279 if ((c->x86 == x->x86) &&
280 (c->x86_model == x->x86_model) &&
281 (c->x86_mask == x->x86_mask))
282 return 1;
283 return 0;
284}
285
286/* To be called only after centrino_model is initialized */
287static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
288{
289 int i;
290
291 /*
292 * Extract clock in kHz from PERF_CTL value
293 * for centrino, as some DSDTs are buggy.
294 * Ideally, this can be done using the acpi_data structure.
295 */
296 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
298 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
299 msr = (msr >> 8) & 0xff;
300 return msr * 100000;
301 }
302
303 if ((!per_cpu(centrino_model, cpu)) ||
304 (!per_cpu(centrino_model, cpu)->op_points))
305 return 0;
306
307 msr &= 0xffff;
308 for (i = 0;
309 per_cpu(centrino_model, cpu)->op_points[i].frequency
310 != CPUFREQ_TABLE_END;
311 i++) {
312 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
313 return per_cpu(centrino_model, cpu)->
314 op_points[i].frequency;
315 }
316 if (failsafe)
317 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
318 else
319 return 0;
320}
321
322/* Return the current CPU frequency in kHz */
323static unsigned int get_cur_freq(unsigned int cpu)
324{
325 unsigned l, h;
326 unsigned clock_freq;
327
328 rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
329 clock_freq = extract_clock(l, cpu, 0);
330
331 if (unlikely(clock_freq == 0)) {
332 /*
333 * On some CPUs, we can see transient MSR values (which are
334 * not present in _PSS), while CPU is doing some automatic
335 * P-state transition (like TM2). Get the last freq set
336 * in PERF_CTL.
337 */
338 rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
339 clock_freq = extract_clock(l, cpu, 1);
340 }
341 return clock_freq;
342}
343
344
345static int centrino_cpu_init(struct cpufreq_policy *policy)
346{
347 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
348 unsigned freq;
349 unsigned l, h;
350 int ret;
351 int i;
352
353 /* Only Intel makes Enhanced Speedstep-capable CPUs */
354 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
355 !cpu_has(cpu, X86_FEATURE_EST))
356 return -ENODEV;
357
358 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
359 centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
360
361 if (policy->cpu != 0)
362 return -ENODEV;
363
364 for (i = 0; i < N_IDS; i++)
365 if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
366 break;
367
368 if (i != N_IDS)
369 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
370
371 if (!per_cpu(centrino_cpu, policy->cpu)) {
372 dprintk("found unsupported CPU with "
373 "Enhanced SpeedStep: send /proc/cpuinfo to "
374 MAINTAINER "\n");
375 return -ENODEV;
376 }
377
378 if (centrino_cpu_init_table(policy)) {
379 return -ENODEV;
380 }
381
382 /* Check to see if Enhanced SpeedStep is enabled, and try to
383 enable it if not. */
384 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
385
386 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
387 l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
388 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
389 wrmsr(MSR_IA32_MISC_ENABLE, l, h);
390
391 /* check to see if it stuck */
392 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
393 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
394 printk(KERN_INFO PFX
395 "couldn't enable Enhanced SpeedStep\n");
396 return -ENODEV;
397 }
398 }
399
400 freq = get_cur_freq(policy->cpu);
401 policy->cpuinfo.transition_latency = 10000;
402 /* 10uS transition latency */
403 policy->cur = freq;
404
405 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
406
407 ret = cpufreq_frequency_table_cpuinfo(policy,
408 per_cpu(centrino_model, policy->cpu)->op_points);
409 if (ret)
410 return (ret);
411
412 cpufreq_frequency_table_get_attr(
413 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
414
415 return 0;
416}
417
418static int centrino_cpu_exit(struct cpufreq_policy *policy)
419{
420 unsigned int cpu = policy->cpu;
421
422 if (!per_cpu(centrino_model, cpu))
423 return -ENODEV;
424
425 cpufreq_frequency_table_put_attr(cpu);
426
427 per_cpu(centrino_model, cpu) = NULL;
428
429 return 0;
430}
431
432/**
433 * centrino_verify - verifies a new CPUFreq policy
434 * @policy: new policy
435 *
436 * Limit must be within this model's frequency range, with at least
437 * one border included.
438 */
439static int centrino_verify (struct cpufreq_policy *policy)
440{
441 return cpufreq_frequency_table_verify(policy,
442 per_cpu(centrino_model, policy->cpu)->op_points);
443}
444
445/**
446 * centrino_target - set a new CPUFreq policy
447 * @policy: new policy
448 * @target_freq: the target frequency
449 * @relation: how that frequency relates to achieved frequency
450 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
451 *
452 * Sets a new CPUFreq policy.
453 */
454static int centrino_target (struct cpufreq_policy *policy,
455 unsigned int target_freq,
456 unsigned int relation)
457{
458 unsigned int newstate = 0;
459 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
460 struct cpufreq_freqs freqs;
461 int retval = 0;
462 unsigned int j, k, first_cpu, tmp;
463 cpumask_var_t covered_cpus;
464
465 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
466 return -ENOMEM;
467
468 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
469 retval = -ENODEV;
470 goto out;
471 }
472
473 if (unlikely(cpufreq_frequency_table_target(policy,
474 per_cpu(centrino_model, cpu)->op_points,
475 target_freq,
476 relation,
477 &newstate))) {
478 retval = -EINVAL;
479 goto out;
480 }
481
482 first_cpu = 1;
483 for_each_cpu(j, policy->cpus) {
484 int good_cpu;
485
486 /* cpufreq holds the hotplug lock, so we are safe here */
487 if (!cpu_online(j))
488 continue;
489
490 /*
491 * Support for SMP systems.
492 * Make sure we are running on CPU that wants to change freq
493 */
494 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
495 good_cpu = cpumask_any_and(policy->cpus,
496 cpu_online_mask);
497 else
498 good_cpu = j;
499
500 if (good_cpu >= nr_cpu_ids) {
501 dprintk("couldn't limit to CPUs in this domain\n");
502 retval = -EAGAIN;
503 if (first_cpu) {
504 /* We haven't started the transition yet. */
505 goto out;
506 }
507 break;
508 }
509
510 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
511
512 if (first_cpu) {
513 rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
514 if (msr == (oldmsr & 0xffff)) {
515 dprintk("no change needed - msr was and needs "
516 "to be %x\n", oldmsr);
517 retval = 0;
518 goto out;
519 }
520
521 freqs.old = extract_clock(oldmsr, cpu, 0);
522 freqs.new = extract_clock(msr, cpu, 0);
523
524 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
525 target_freq, freqs.old, freqs.new, msr);
526
527 for_each_cpu(k, policy->cpus) {
528 if (!cpu_online(k))
529 continue;
530 freqs.cpu = k;
531 cpufreq_notify_transition(&freqs,
532 CPUFREQ_PRECHANGE);
533 }
534
535 first_cpu = 0;
536 /* all but 16 LSB are reserved, treat them with care */
537 oldmsr &= ~0xffff;
538 msr &= 0xffff;
539 oldmsr |= msr;
540 }
541
542 wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
543 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
544 break;
545
546 cpumask_set_cpu(j, covered_cpus);
547 }
548
549 for_each_cpu(k, policy->cpus) {
550 if (!cpu_online(k))
551 continue;
552 freqs.cpu = k;
553 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
554 }
555
556 if (unlikely(retval)) {
557 /*
558 * We have failed halfway through the frequency change.
559 * We have sent callbacks to policy->cpus and
560	 * MSRs have already been written on covered_cpus.
561	 * Best-effort undo.
562 */
563
564 for_each_cpu(j, covered_cpus)
565 wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
566
567 tmp = freqs.new;
568 freqs.new = freqs.old;
569 freqs.old = tmp;
570 for_each_cpu(j, policy->cpus) {
571 if (!cpu_online(j))
572 continue;
573 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
574 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
575 }
576 }
577 retval = 0;
578
579out:
580 free_cpumask_var(covered_cpus);
581 return retval;
582}
583
584static struct freq_attr* centrino_attr[] = {
585 &cpufreq_freq_attr_scaling_available_freqs,
586 NULL,
587};
588
589static struct cpufreq_driver centrino_driver = {
590 .name = "centrino", /* should be speedstep-centrino,
591 but there's a 16 char limit */
592 .init = centrino_cpu_init,
593 .exit = centrino_cpu_exit,
594 .verify = centrino_verify,
595 .target = centrino_target,
596 .get = get_cur_freq,
597 .attr = centrino_attr,
598 .owner = THIS_MODULE,
599};
600
601
602/**
603 * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
604 *
605 * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
606 * unsupported devices, -ENOENT if there's no voltage table for this
607 * particular CPU model, -EINVAL on problems during initialization,
608 * and zero on success.
609 *
610 * This is quite picky. Not only does the CPU have to advertise the
611 * "est" flag in the cpuid capability flags, we look for a specific
612 * CPU model and stepping, and we need to have the exact model name in
613 * our voltage tables. That is, be paranoid about not releasing
614 * someone's valuable magic smoke.
615 */
616static int __init centrino_init(void)
617{
618 struct cpuinfo_x86 *cpu = &cpu_data(0);
619
620 if (!cpu_has(cpu, X86_FEATURE_EST))
621 return -ENODEV;
622
623 return cpufreq_register_driver(&centrino_driver);
624}
625
626static void __exit centrino_exit(void)
627{
628 cpufreq_unregister_driver(&centrino_driver);
629}
630
631MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
632MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
633MODULE_LICENSE ("GPL");
634
635late_initcall(centrino_init);
636module_exit(centrino_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
deleted file mode 100644
index 561758e95180..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ /dev/null
@@ -1,452 +0,0 @@
1/*
2 * (C) 2001 Dave Jones, Arjan van de ven.
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon reverse engineered information, and on Intel documentation
7 * for chipsets ICH2-M and ICH3-M.
8 *
9 * Many thanks to Ducrot Bruno for finding and fixing the last
10 * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
11 * for extensive testing.
12 *
13 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
14 */
15
16
17/*********************************************************************
18 * SPEEDSTEP - DEFINITIONS *
19 *********************************************************************/
20
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/init.h>
24#include <linux/cpufreq.h>
25#include <linux/pci.h>
26#include <linux/sched.h>
27
28#include "speedstep-lib.h"
29
30
31/* speedstep_chipset:
32 * It is necessary to know which chipset is used. As accesses to
33 * this device occur at various places in this module, we need a
34 * static struct pci_dev * pointing to that device.
35 */
36static struct pci_dev *speedstep_chipset_dev;
37
38
39/* speedstep_processor
40 */
41static enum speedstep_processor speedstep_processor;
42
43static u32 pmbase;
44
45/*
46 * There are only two frequency states for each processor. Values
47 * are in kHz for the time being.
48 */
49static struct cpufreq_frequency_table speedstep_freqs[] = {
50 {SPEEDSTEP_HIGH, 0},
51 {SPEEDSTEP_LOW, 0},
52 {0, CPUFREQ_TABLE_END},
53};
54
55
56#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
57 "speedstep-ich", msg)
58
59
60/**
61 * speedstep_find_register - read the PMBASE address
62 *
63 * Returns: -ENODEV if no register could be found
64 */
65static int speedstep_find_register(void)
66{
67 if (!speedstep_chipset_dev)
68 return -ENODEV;
69
70 /* get PMBASE */
71 pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
72 if (!(pmbase & 0x01)) {
73 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
74 return -ENODEV;
75 }
76
77 pmbase &= 0xFFFFFFFE;
78 if (!pmbase) {
79 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
80 return -ENODEV;
81 }
82
83 dprintk("pmbase is 0x%x\n", pmbase);
84 return 0;
85}
86
87/**
88 * speedstep_set_state - set the SpeedStep state
89 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
90 *
91 * Tries to change the SpeedStep state. Can be called from
92 * smp_call_function_single.
93 */
94static void speedstep_set_state(unsigned int state)
95{
96 u8 pm2_blk;
97 u8 value;
98 unsigned long flags;
99
100 if (state > 0x1)
101 return;
102
103 /* Disable IRQs */
104 local_irq_save(flags);
105
106 /* read state */
107 value = inb(pmbase + 0x50);
108
109 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
110
111 /* write new state */
112 value &= 0xFE;
113 value |= state;
114
115 dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
116
117 /* Disable bus master arbitration */
118 pm2_blk = inb(pmbase + 0x20);
119 pm2_blk |= 0x01;
120 outb(pm2_blk, (pmbase + 0x20));
121
122 /* Actual transition */
123 outb(value, (pmbase + 0x50));
124
125 /* Restore bus master arbitration */
126 pm2_blk &= 0xfe;
127 outb(pm2_blk, (pmbase + 0x20));
128
129 /* check if transition was successful */
130 value = inb(pmbase + 0x50);
131
132 /* Enable IRQs */
133 local_irq_restore(flags);
134
135 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
136
137 if (state == (value & 0x1))
138 dprintk("change to %u MHz succeeded\n",
139 speedstep_get_frequency(speedstep_processor) / 1000);
140 else
141 printk(KERN_ERR "cpufreq: change failed - I/O error\n");
142
143 return;
144}
145
146/* Wrapper for smp_call_function_single. */
147static void _speedstep_set_state(void *_state)
148{
149 speedstep_set_state(*(unsigned int *)_state);
150}
151
152/**
153 * speedstep_activate - activate SpeedStep control in the chipset
154 *
155 * Tries to activate the SpeedStep status and control registers.
156 * Returns -EINVAL on an unsupported chipset, and zero on success.
157 */
158static int speedstep_activate(void)
159{
160 u16 value = 0;
161
162 if (!speedstep_chipset_dev)
163 return -EINVAL;
164
165 pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
166 if (!(value & 0x08)) {
167 value |= 0x08;
168 dprintk("activating SpeedStep (TM) registers\n");
169 pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
170 }
171
172 return 0;
173}
174
175
176/**
177 * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
178 *
179 * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
180 * the LPC bridge / PM module which contains all power-management
181 * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
182 * chipset, or zero on failure.
183 */
184static unsigned int speedstep_detect_chipset(void)
185{
186 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
187 PCI_DEVICE_ID_INTEL_82801DB_12,
188 PCI_ANY_ID, PCI_ANY_ID,
189 NULL);
190 if (speedstep_chipset_dev)
191 return 4; /* 4-M */
192
193 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
194 PCI_DEVICE_ID_INTEL_82801CA_12,
195 PCI_ANY_ID, PCI_ANY_ID,
196 NULL);
197 if (speedstep_chipset_dev)
198 return 3; /* 3-M */
199
200
201 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
202 PCI_DEVICE_ID_INTEL_82801BA_10,
203 PCI_ANY_ID, PCI_ANY_ID,
204 NULL);
205 if (speedstep_chipset_dev) {
206 /* speedstep.c causes lockups on Dell Inspirons 8000 and
207 * 8100 which use a pretty old revision of the 82815
208		 * host bridge. Abort on these systems.
209 */
210 static struct pci_dev *hostbridge;
211
212 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
213 PCI_DEVICE_ID_INTEL_82815_MC,
214 PCI_ANY_ID, PCI_ANY_ID,
215 NULL);
216
217 if (!hostbridge)
218 return 2; /* 2-M */
219
220 if (hostbridge->revision < 5) {
221 dprintk("hostbridge does not support speedstep\n");
222 speedstep_chipset_dev = NULL;
223 pci_dev_put(hostbridge);
224 return 0;
225 }
226
227 pci_dev_put(hostbridge);
228 return 2; /* 2-M */
229 }
230
231 return 0;
232}
233
234static void get_freq_data(void *_speed)
235{
236 unsigned int *speed = _speed;
237
238 *speed = speedstep_get_frequency(speedstep_processor);
239}
240
241static unsigned int speedstep_get(unsigned int cpu)
242{
243 unsigned int speed;
244
245 /* You're supposed to ensure CPU is online. */
246 if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
247 BUG();
248
249 dprintk("detected %u kHz as current frequency\n", speed);
250 return speed;
251}
252
253/**
254 * speedstep_target - set a new CPUFreq policy
255 * @policy: new policy
256 * @target_freq: the target frequency
257 * @relation: how that frequency relates to achieved frequency
258 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
259 *
260 * Sets a new CPUFreq policy.
261 */
262static int speedstep_target(struct cpufreq_policy *policy,
263 unsigned int target_freq,
264 unsigned int relation)
265{
266 unsigned int newstate = 0, policy_cpu;
267 struct cpufreq_freqs freqs;
268 int i;
269
270 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
271 target_freq, relation, &newstate))
272 return -EINVAL;
273
274 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
275 freqs.old = speedstep_get(policy_cpu);
276 freqs.new = speedstep_freqs[newstate].frequency;
277 freqs.cpu = policy->cpu;
278
279	dprintk("transitioning from %u to %u kHz\n", freqs.old, freqs.new);
280
281 /* no transition necessary */
282 if (freqs.old == freqs.new)
283 return 0;
284
285 for_each_cpu(i, policy->cpus) {
286 freqs.cpu = i;
287 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
288 }
289
290 smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
291 true);
292
293 for_each_cpu(i, policy->cpus) {
294 freqs.cpu = i;
295 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
296 }
297
298 return 0;
299}
300
301
302/**
303 * speedstep_verify - verifies a new CPUFreq policy
304 * @policy: new policy
305 *
306 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
307 * at least one border included.
308 */
309static int speedstep_verify(struct cpufreq_policy *policy)
310{
311 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
312}
313
314struct get_freqs {
315 struct cpufreq_policy *policy;
316 int ret;
317};
318
319static void get_freqs_on_cpu(void *_get_freqs)
320{
321 struct get_freqs *get_freqs = _get_freqs;
322
323 get_freqs->ret =
324 speedstep_get_freqs(speedstep_processor,
325 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
326 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
327 &get_freqs->policy->cpuinfo.transition_latency,
328 &speedstep_set_state);
329}
330
331static int speedstep_cpu_init(struct cpufreq_policy *policy)
332{
333 int result;
334 unsigned int policy_cpu, speed;
335 struct get_freqs gf;
336
337 /* only run on CPU to be set, or on its sibling */
338#ifdef CONFIG_SMP
339 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
340#endif
341 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
342
343 /* detect low and high frequency and transition latency */
344 gf.policy = policy;
345 smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
346 if (gf.ret)
347 return gf.ret;
348
349 /* get current speed setting */
350 speed = speedstep_get(policy_cpu);
351 if (!speed)
352 return -EIO;
353
354 dprintk("currently at %s speed setting - %i MHz\n",
355 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
356 ? "low" : "high",
357 (speed / 1000));
358
359 /* cpuinfo and default policy values */
360 policy->cur = speed;
361
362 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
363 if (result)
364 return result;
365
366 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
367
368 return 0;
369}
370
371
372static int speedstep_cpu_exit(struct cpufreq_policy *policy)
373{
374 cpufreq_frequency_table_put_attr(policy->cpu);
375 return 0;
376}
377
378static struct freq_attr *speedstep_attr[] = {
379 &cpufreq_freq_attr_scaling_available_freqs,
380 NULL,
381};
382
383
384static struct cpufreq_driver speedstep_driver = {
385 .name = "speedstep-ich",
386 .verify = speedstep_verify,
387 .target = speedstep_target,
388 .init = speedstep_cpu_init,
389 .exit = speedstep_cpu_exit,
390 .get = speedstep_get,
391 .owner = THIS_MODULE,
392 .attr = speedstep_attr,
393};
394
395
396/**
397 * speedstep_init - initializes the SpeedStep CPUFreq driver
398 *
399 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
400 * devices, -EINVAL on problems during initialization, and zero on
401 * success.
402 */
403static int __init speedstep_init(void)
404{
405 /* detect processor */
406 speedstep_processor = speedstep_detect_processor();
407 if (!speedstep_processor) {
408 dprintk("Intel(R) SpeedStep(TM) capable processor "
409 "not found\n");
410 return -ENODEV;
411 }
412
413 /* detect chipset */
414 if (!speedstep_detect_chipset()) {
415 dprintk("Intel(R) SpeedStep(TM) for this chipset not "
416 "(yet) available.\n");
417 return -ENODEV;
418 }
419
420 /* activate speedstep support */
421 if (speedstep_activate()) {
422 pci_dev_put(speedstep_chipset_dev);
423 return -EINVAL;
424 }
425
426 if (speedstep_find_register())
427 return -ENODEV;
428
429 return cpufreq_register_driver(&speedstep_driver);
430}
431
432
433/**
434 * speedstep_exit - unregisters SpeedStep support
435 *
436 * Unregisters SpeedStep support.
437 */
438static void __exit speedstep_exit(void)
439{
440 pci_dev_put(speedstep_chipset_dev);
441 cpufreq_unregister_driver(&speedstep_driver);
442}
443
444
445MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
446 "Dominik Brodowski <linux@brodo.de>");
447MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
448 "with ICH-M southbridges.");
449MODULE_LICENSE("GPL");
450
451module_init(speedstep_init);
452module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
deleted file mode 100644
index a94ec6be69fa..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ /dev/null
@@ -1,481 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/init.h>
15#include <linux/cpufreq.h>
16
17#include <asm/msr.h>
18#include <asm/tsc.h>
19#include "speedstep-lib.h"
20
21#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
22 "speedstep-lib", msg)
23
24#define PFX "speedstep-lib: "
25
26#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
27static int relaxed_check;
28#else
29#define relaxed_check 0
30#endif
31
32/*********************************************************************
33 * GET PROCESSOR CORE SPEED IN KHZ *
34 *********************************************************************/
35
36static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
37{
38 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
39 struct {
40 unsigned int ratio; /* Frequency Multiplier (x10) */
41 u8 bitmap; /* power on configuration bits
42 [27, 25:22] (in MSR 0x2a) */
43 } msr_decode_mult[] = {
44 { 30, 0x01 },
45 { 35, 0x05 },
46 { 40, 0x02 },
47 { 45, 0x06 },
48 { 50, 0x00 },
49 { 55, 0x04 },
50 { 60, 0x0b },
51 { 65, 0x0f },
52 { 70, 0x09 },
53 { 75, 0x0d },
54 { 80, 0x0a },
55 { 85, 0x26 },
56 { 90, 0x20 },
57 { 100, 0x2b },
58 { 0, 0xff } /* error or unknown value */
59 };
60
61 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
62 struct {
63 unsigned int value; /* Front Side Bus speed in MHz */
64 u8 bitmap; /* power on configuration bits [18: 19]
65 (in MSR 0x2a) */
66 } msr_decode_fsb[] = {
67 { 66, 0x0 },
68 { 100, 0x2 },
69 { 133, 0x1 },
70 { 0, 0xff}
71 };
72
73 u32 msr_lo, msr_tmp;
74 int i = 0, j = 0;
75
76 /* read MSR 0x2a - we only need the low 32 bits */
77 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
78 dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
79 msr_tmp = msr_lo;
80
81 /* decode the FSB */
82 msr_tmp &= 0x00c0000;
83 msr_tmp >>= 18;
84 while (msr_tmp != msr_decode_fsb[i].bitmap) {
85 if (msr_decode_fsb[i].bitmap == 0xff)
86 return 0;
87 i++;
88 }
89
90 /* decode the multiplier */
91 if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
92 dprintk("workaround for early PIIIs\n");
93 msr_lo &= 0x03c00000;
94 } else
95 msr_lo &= 0x0bc00000;
96 msr_lo >>= 22;
97 while (msr_lo != msr_decode_mult[j].bitmap) {
98 if (msr_decode_mult[j].bitmap == 0xff)
99 return 0;
100 j++;
101 }
102
103 dprintk("speed is %u\n",
104 (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
105
106 return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
107}
108
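Editor's note: a short worked example of the decode performed by pentium3_get_frequency() above, using the tables as given; the sample MSR value is hypothetical.

/*
 * Example: msr_lo = 0x00040000 read from MSR 0x2a (hypothetical value)
 *   FSB bits:  (0x00040000 & 0x00c0000)  >> 18 = 0x1 -> 133 MHz
 *   mult bits: (0x00040000 & 0x0bc00000) >> 22 = 0x0 -> ratio 50 (x10)
 *   speed = 50 * 133 * 100 = 665000 kHz, i.e. 5.0 x 133 MHz = 665 MHz
 */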
109
110static unsigned int pentiumM_get_frequency(void)
111{
112 u32 msr_lo, msr_tmp;
113
114 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
115 dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
116
117 /* see table B-2 of 24547212.pdf */
118 if (msr_lo & 0x00040000) {
119 printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
120 msr_lo, msr_tmp);
121 return 0;
122 }
123
124 msr_tmp = (msr_lo >> 22) & 0x1f;
125 dprintk("bits 22-26 are 0x%x, speed is %u\n",
126 msr_tmp, (msr_tmp * 100 * 1000));
127
128 return msr_tmp * 100 * 1000;
129}
130
131static unsigned int pentium_core_get_frequency(void)
132{
133 u32 fsb = 0;
134 u32 msr_lo, msr_tmp;
135 int ret;
136
137 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
138 /* see table B-2 of 25366920.pdf */
139 switch (msr_lo & 0x07) {
140 case 5:
141 fsb = 100000;
142 break;
143 case 1:
144 fsb = 133333;
145 break;
146 case 3:
147 fsb = 166667;
148 break;
149 case 2:
150 fsb = 200000;
151 break;
152 case 0:
153 fsb = 266667;
154 break;
155 case 4:
156 fsb = 333333;
157 break;
158 default:
159		printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value\n");
160 }
161
162 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
163 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
164 msr_lo, msr_tmp);
165
166 msr_tmp = (msr_lo >> 22) & 0x1f;
167 dprintk("bits 22-26 are 0x%x, speed is %u\n",
168 msr_tmp, (msr_tmp * fsb));
169
170 ret = (msr_tmp * fsb);
171 return ret;
172}
173
174
175static unsigned int pentium4_get_frequency(void)
176{
177 struct cpuinfo_x86 *c = &boot_cpu_data;
178 u32 msr_lo, msr_hi, mult;
179 unsigned int fsb = 0;
180 unsigned int ret;
181 u8 fsb_code;
182
183 /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
184 * to System Bus Frequency Ratio Field in the Processor Frequency
185 * Configuration Register of the MSR. Therefore the current
186 * frequency cannot be calculated and has to be measured.
187 */
188 if (c->x86_model < 2)
189 return cpu_khz;
190
191 rdmsr(0x2c, msr_lo, msr_hi);
192
193 dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
194
195 /* decode the FSB: see IA-32 Intel (C) Architecture Software
196	 * Developer's Manual, Volume 3: System Programming Guide,
197 * revision #12 in Table B-1: MSRs in the Pentium 4 and
198 * Intel Xeon Processors, on page B-4 and B-5.
199 */
200 fsb_code = (msr_lo >> 16) & 0x7;
201 switch (fsb_code) {
202 case 0:
203 fsb = 100 * 1000;
204 break;
205 case 1:
206 fsb = 13333 * 10;
207 break;
208 case 2:
209 fsb = 200 * 1000;
210 break;
211 }
212
213 if (!fsb)
214 printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
215 "Please send an e-mail to <linux@brodo.de>\n");
216
217 /* Multiplier. */
218 mult = msr_lo >> 24;
219
220 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
221 fsb, mult, (fsb * mult));
222
223 ret = (fsb * mult);
224 return ret;
225}
226
227
228/* Warning: may get called from smp_call_function_single. */
229unsigned int speedstep_get_frequency(enum speedstep_processor processor)
230{
231 switch (processor) {
232 case SPEEDSTEP_CPU_PCORE:
233 return pentium_core_get_frequency();
234 case SPEEDSTEP_CPU_PM:
235 return pentiumM_get_frequency();
236 case SPEEDSTEP_CPU_P4D:
237 case SPEEDSTEP_CPU_P4M:
238 return pentium4_get_frequency();
239 case SPEEDSTEP_CPU_PIII_T:
240 case SPEEDSTEP_CPU_PIII_C:
241 case SPEEDSTEP_CPU_PIII_C_EARLY:
242 return pentium3_get_frequency(processor);
243 default:
244 return 0;
245	}
246 return 0;
247}
248EXPORT_SYMBOL_GPL(speedstep_get_frequency);
249
250
251/*********************************************************************
252 * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
253 *********************************************************************/
254
255unsigned int speedstep_detect_processor(void)
256{
257 struct cpuinfo_x86 *c = &cpu_data(0);
258 u32 ebx, msr_lo, msr_hi;
259
260 dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
261
262 if ((c->x86_vendor != X86_VENDOR_INTEL) ||
263 ((c->x86 != 6) && (c->x86 != 0xF)))
264 return 0;
265
266 if (c->x86 == 0xF) {
267 /* Intel Mobile Pentium 4-M
268 * or Intel Mobile Pentium 4 with 533 MHz FSB */
269 if (c->x86_model != 2)
270 return 0;
271
272 ebx = cpuid_ebx(0x00000001);
273 ebx &= 0x000000FF;
274
275 dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
276
277 switch (c->x86_mask) {
278 case 4:
279 /*
280 * B-stepping [M-P4-M]
281 * sample has ebx = 0x0f, production has 0x0e.
282 */
283 if ((ebx == 0x0e) || (ebx == 0x0f))
284 return SPEEDSTEP_CPU_P4M;
285 break;
286 case 7:
287 /*
288 * C-stepping [M-P4-M]
289 * needs to have ebx=0x0e, else it's a celeron:
290 * cf. 25130917.pdf / page 7, footnote 5 even
291 * though 25072120.pdf / page 7 doesn't say
292 * samples are only of B-stepping...
293 */
294 if (ebx == 0x0e)
295 return SPEEDSTEP_CPU_P4M;
296 break;
297 case 9:
298 /*
299 * D-stepping [M-P4-M or M-P4/533]
300 *
301 * this is totally strange: CPUID 0x0F29 is
302 * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
303 * The latter need to be sorted out as they don't
304 * support speedstep.
305 * Celerons with CPUID 0x0F29 may have either
306 * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
307 * specific.
308 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
309 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
310 * also, M-P4M HTs have ebx=0x8, too
311 * For now, they are distinguished by the model_id
312 * string
313 */
314 if ((ebx == 0x0e) ||
315 (strstr(c->x86_model_id,
316 "Mobile Intel(R) Pentium(R) 4") != NULL))
317 return SPEEDSTEP_CPU_P4M;
318 break;
319 default:
320 break;
321 }
322 return 0;
323 }
324
325 switch (c->x86_model) {
326 case 0x0B: /* Intel PIII [Tualatin] */
327 /* cpuid_ebx(1) is 0x04 for desktop PIII,
328 * 0x06 for mobile PIII-M */
329 ebx = cpuid_ebx(0x00000001);
330 dprintk("ebx is %x\n", ebx);
331
332 ebx &= 0x000000FF;
333
334 if (ebx != 0x06)
335 return 0;
336
337 /* So far all PIII-M processors support SpeedStep. See
338 * Intel's 24540640.pdf of June 2003
339 */
340 return SPEEDSTEP_CPU_PIII_T;
341
342 case 0x08: /* Intel PIII [Coppermine] */
343
344 /* all mobile PIII Coppermines have FSB 100 MHz
345 * ==> sort out a few desktop PIIIs. */
346 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
347 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
348 msr_lo, msr_hi);
349 msr_lo &= 0x00c0000;
350 if (msr_lo != 0x0080000)
351 return 0;
352
353 /*
354 * If the processor is a mobile version,
355 * platform ID has bit 50 set
356 * it has SpeedStep technology if either
357 * bit 56 or 57 is set
358 */
359 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
360 dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
361 msr_lo, msr_hi);
362 if ((msr_hi & (1<<18)) &&
363 (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
364 if (c->x86_mask == 0x01) {
365 dprintk("early PIII version\n");
366 return SPEEDSTEP_CPU_PIII_C_EARLY;
367 } else
368 return SPEEDSTEP_CPU_PIII_C;
369 }
370
371 default:
372 return 0;
373 }
374}
375EXPORT_SYMBOL_GPL(speedstep_detect_processor);
376
377
378/*********************************************************************
379 * DETECT SPEEDSTEP SPEEDS *
380 *********************************************************************/
381
382unsigned int speedstep_get_freqs(enum speedstep_processor processor,
383 unsigned int *low_speed,
384 unsigned int *high_speed,
385 unsigned int *transition_latency,
386 void (*set_state) (unsigned int state))
387{
388 unsigned int prev_speed;
389 unsigned int ret = 0;
390 unsigned long flags;
391 struct timeval tv1, tv2;
392
393 if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
394 return -EINVAL;
395
396 dprintk("trying to determine both speeds\n");
397
398 /* get current speed */
399 prev_speed = speedstep_get_frequency(processor);
400 if (!prev_speed)
401 return -EIO;
402
403 dprintk("previous speed is %u\n", prev_speed);
404
405 local_irq_save(flags);
406
407 /* switch to low state */
408 set_state(SPEEDSTEP_LOW);
409 *low_speed = speedstep_get_frequency(processor);
410 if (!*low_speed) {
411 ret = -EIO;
412 goto out;
413 }
414
415 dprintk("low speed is %u\n", *low_speed);
416
417 /* start latency measurement */
418 if (transition_latency)
419 do_gettimeofday(&tv1);
420
421 /* switch to high state */
422 set_state(SPEEDSTEP_HIGH);
423
424 /* end latency measurement */
425 if (transition_latency)
426 do_gettimeofday(&tv2);
427
428 *high_speed = speedstep_get_frequency(processor);
429 if (!*high_speed) {
430 ret = -EIO;
431 goto out;
432 }
433
434 dprintk("high speed is %u\n", *high_speed);
435
436 if (*low_speed == *high_speed) {
437 ret = -ENODEV;
438 goto out;
439 }
440
441 /* switch to previous state, if necessary */
442 if (*high_speed != prev_speed)
443 set_state(SPEEDSTEP_LOW);
444
445 if (transition_latency) {
446 *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
447 tv2.tv_usec - tv1.tv_usec;
448 dprintk("transition latency is %u uSec\n", *transition_latency);
449
450 /* convert uSec to nSec and add 20% for safety reasons */
451 *transition_latency *= 1200;
452
453 /* check if the latency measurement is too high or too low
454 * and set it to a safe value (500uSec) in that case
455 */
456 if (*transition_latency > 10000000 ||
457 *transition_latency < 50000) {
458 printk(KERN_WARNING PFX "frequency transition "
459				"measurement seems out of range (%u "
460				"nSec), falling back to a safe one of "
461 "%u nSec.\n",
462 *transition_latency, 500000);
463 *transition_latency = 500000;
464 }
465 }
466
467out:
468 local_irq_restore(flags);
469 return ret;
470}
471EXPORT_SYMBOL_GPL(speedstep_get_freqs);
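Editor's note: a brief worked example of the transition-latency fixup inside speedstep_get_freqs() above; the measured value is hypothetical.

/*
 * Example: if the low->high switch is measured at 400 usec,
 *   *transition_latency = 400 * 1200 = 480000 nsec
 * (the factor 1200 converts usec to nsec and adds the 20% margin).
 * A result outside the 50000..10000000 nsec window is replaced by
 * the 500000 nsec fallback.
 */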
472
473#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
474module_param(relaxed_check, int, 0444);
475MODULE_PARM_DESC(relaxed_check,
476 "Don't do all checks for speedstep capability.");
477#endif
478
479MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
480MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
481MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
deleted file mode 100644
index 70d9cea1219d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ /dev/null
@@ -1,49 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11
12
13/* processors */
14enum speedstep_processor {
15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19/* the following processors are not speedstep-capable and are not auto-detected
20 * in speedstep_detect_processor(). However, their speed can be detected using
21 * the speedstep_get_frequency() call. */
22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26
27/* speedstep states -- only two of them */
28
29#define SPEEDSTEP_HIGH 0x00000000
30#define SPEEDSTEP_LOW 0x00000001
31
32
33/* detect a speedstep-capable processor */
34extern enum speedstep_processor speedstep_detect_processor(void);
35
36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38
39
40/* detect the low and high speeds of the processor. The callback
41 * set_state's argument is either SPEEDSTEP_HIGH or
42 * SPEEDSTEP_LOW; it switches the state directly, without issuing
43 * any cpufreq_notify_transition calls.
44 */
45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed,
47 unsigned int *high_speed,
48 unsigned int *transition_latency,
49 void (*set_state) (unsigned int state));
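Editor's note: a minimal usage sketch for the interface declared above, modelled on the speedstep-ich caller shown earlier; the set_state callback and function names here are placeholders, and speedstep-lib.h plus <linux/errno.h> are assumed to be included.

static void my_set_state(unsigned int state)
{
	/* placeholder: a real driver switches the hardware to
	 * SPEEDSTEP_LOW or SPEEDSTEP_HIGH here */
}

static int probe_speeds(unsigned int *latency)
{
	unsigned int low, high;
	enum speedstep_processor proc = speedstep_detect_processor();

	if (!proc)
		return -ENODEV;

	/* measures both speeds by switching states via my_set_state() */
	return speedstep_get_freqs(proc, &low, &high, latency,
				   &my_set_state);
}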
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
deleted file mode 100644
index 8abd869baabf..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ /dev/null
@@ -1,467 +0,0 @@
1/*
2 * Intel SpeedStep SMI driver.
3 *
4 * (C) 2003 Hiroshi Miura <miura@da-cha.org>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 */
9
10
11/*********************************************************************
12 * SPEEDSTEP - DEFINITIONS *
13 *********************************************************************/
14
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/moduleparam.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/delay.h>
21#include <linux/io.h>
22#include <asm/ist.h>
23
24#include "speedstep-lib.h"
25
26/* speedstep system management interface port/command.
27 *
28 * These parameters are obtained from the IST-SMI BIOS call.
29 * If the user supplies them, those values are used instead.
30 *
31 */
32static int smi_port;
33static int smi_cmd;
34static unsigned int smi_sig;
35
36/* info about the processor */
37static enum speedstep_processor speedstep_processor;
38
39/*
40 * There are only two frequency states for each processor. Values
41 * are in kHz for the time being.
42 */
43static struct cpufreq_frequency_table speedstep_freqs[] = {
44 {SPEEDSTEP_HIGH, 0},
45 {SPEEDSTEP_LOW, 0},
46 {0, CPUFREQ_TABLE_END},
47};
48
49#define GET_SPEEDSTEP_OWNER 0
50#define GET_SPEEDSTEP_STATE 1
51#define SET_SPEEDSTEP_STATE 2
52#define GET_SPEEDSTEP_FREQS 4
53
54/* how many times the SMI call should be retried if it fails, e.g.
55 * because of DMA activity going on */
56#define SMI_TRIES 5
57
58#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
59 "speedstep-smi", msg)
60
61/**
62 * speedstep_smi_ownership
63 */
64static int speedstep_smi_ownership(void)
65{
66 u32 command, result, magic, dummy;
67 u32 function = GET_SPEEDSTEP_OWNER;
68 unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
69
70 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
71 magic = virt_to_phys(magic_data);
72
73 dprintk("trying to obtain ownership with command %x at port %x\n",
74 command, smi_port);
75
76 __asm__ __volatile__(
77 "push %%ebp\n"
78 "out %%al, (%%dx)\n"
79 "pop %%ebp\n"
80 : "=D" (result),
81 "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
82 "=S" (dummy)
83 : "a" (command), "b" (function), "c" (0), "d" (smi_port),
84 "D" (0), "S" (magic)
85 : "memory"
86 );
87
88 dprintk("result is %x\n", result);
89
90 return result;
91}
92
93/**
94 * speedstep_smi_get_freqs - get SpeedStep low & high frequencies
95 * @low: the low frequency value is placed here
96 * @high: the high frequency value is placed here
97 *
98 * Only available on later SpeedStep-enabled systems, returns false results or
99 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
100 * shows that the latter occurs if !(ist_info.event & 0xFFFF).
101 */
102static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
103{
104 u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
105 u32 state = 0;
106 u32 function = GET_SPEEDSTEP_FREQS;
107
108 if (!(ist_info.event & 0xFFFF)) {
109 dprintk("bug #1422 -- can't read freqs from BIOS\n");
110 return -ENODEV;
111 }
112
113 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
114
115 dprintk("trying to determine frequencies with command %x at port %x\n",
116 command, smi_port);
117
118 __asm__ __volatile__(
119 "push %%ebp\n"
120 "out %%al, (%%dx)\n"
121 "pop %%ebp"
122 : "=a" (result),
123 "=b" (high_mhz),
124 "=c" (low_mhz),
125 "=d" (state), "=D" (edi), "=S" (dummy)
126 : "a" (command),
127 "b" (function),
128 "c" (state),
129 "d" (smi_port), "S" (0), "D" (0)
130 );
131
132 dprintk("result %x, low_freq %u, high_freq %u\n",
133 result, low_mhz, high_mhz);
134
135 /* abort if results are obviously incorrect... */
136 if ((high_mhz + low_mhz) < 600)
137 return -EINVAL;
138
139 *high = high_mhz * 1000;
140 *low = low_mhz * 1000;
141
142 return result;
143}
144
145/**
146 * speedstep_get_state - read the current SpeedStep state
147 * Returns the current frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH).
148 *
149 */
150static int speedstep_get_state(void)
151{
152 u32 function = GET_SPEEDSTEP_STATE;
153 u32 result, state, edi, command, dummy;
154
155 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
156
157 dprintk("trying to determine current setting with command %x "
158 "at port %x\n", command, smi_port);
159
160 __asm__ __volatile__(
161 "push %%ebp\n"
162 "out %%al, (%%dx)\n"
163 "pop %%ebp\n"
164 : "=a" (result),
165 "=b" (state), "=D" (edi),
166 "=c" (dummy), "=d" (dummy), "=S" (dummy)
167 : "a" (command), "b" (function), "c" (0),
168 "d" (smi_port), "S" (0), "D" (0)
169 );
170
171 dprintk("state is %x, result is %x\n", state, result);
172
173 return state & 1;
174}
175
176
177/**
178 * speedstep_set_state - set the SpeedStep state
179 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
180 *
181 */
182static void speedstep_set_state(unsigned int state)
183{
184 unsigned int result = 0, command, new_state, dummy;
185 unsigned long flags;
186 unsigned int function = SET_SPEEDSTEP_STATE;
187 unsigned int retry = 0;
188
189 if (state > 0x1)
190 return;
191
192 /* Disable IRQs */
193 local_irq_save(flags);
194
195 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
196
197 dprintk("trying to set frequency to state %u "
198 "with command %x at port %x\n",
199 state, command, smi_port);
200
201 do {
202 if (retry) {
203 dprintk("retry %u, previous result %u, waiting...\n",
204 retry, result);
205 mdelay(retry * 50);
206 }
207 retry++;
208 __asm__ __volatile__(
209 "push %%ebp\n"
210 "out %%al, (%%dx)\n"
211 "pop %%ebp"
212 : "=b" (new_state), "=D" (result),
213 "=c" (dummy), "=a" (dummy),
214 "=d" (dummy), "=S" (dummy)
215 : "a" (command), "b" (function), "c" (state),
216 "d" (smi_port), "S" (0), "D" (0)
217 );
218 } while ((new_state != state) && (retry <= SMI_TRIES));
219
220 /* enable IRQs */
221 local_irq_restore(flags);
222
223 if (new_state == state)
224 dprintk("change to %u MHz succeeded after %u tries "
225 "with result %u\n",
226 (speedstep_freqs[new_state].frequency / 1000),
227 retry, result);
228 else
229 printk(KERN_ERR "cpufreq: change to state %u "
230 "failed with new_state %u and result %u\n",
231 state, new_state, result);
232
233 return;
234}
235
236
237/**
238 * speedstep_target - set a new CPUFreq policy
239 * @policy: new policy
240 * @target_freq: new freq
241 * @relation:
242 *
243 * Sets a new CPUFreq policy/freq.
244 */
245static int speedstep_target(struct cpufreq_policy *policy,
246 unsigned int target_freq, unsigned int relation)
247{
248 unsigned int newstate = 0;
249 struct cpufreq_freqs freqs;
250
251 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
252 target_freq, relation, &newstate))
253 return -EINVAL;
254
255 freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
256 freqs.new = speedstep_freqs[newstate].frequency;
257 freqs.cpu = 0; /* speedstep.c is UP only driver */
258
259 if (freqs.old == freqs.new)
260 return 0;
261
262 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
263 speedstep_set_state(newstate);
264 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
265
266 return 0;
267}
268
269
270/**
271 * speedstep_verify - verifies a new CPUFreq policy
272 * @policy: new policy
273 *
274 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
275 * at least one border included.
276 */
277static int speedstep_verify(struct cpufreq_policy *policy)
278{
279 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
280}
281
282
283static int speedstep_cpu_init(struct cpufreq_policy *policy)
284{
285 int result;
286 unsigned int speed, state;
287 unsigned int *low, *high;
288
289 /* capability check */
290 if (policy->cpu != 0)
291 return -ENODEV;
292
293 result = speedstep_smi_ownership();
294 if (result) {
295 dprintk("fails in aquiring ownership of a SMI interface.\n");
296 return -EINVAL;
297 }
298
299 /* detect low and high frequency */
300 low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
301 high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
302
303 result = speedstep_smi_get_freqs(low, high);
304 if (result) {
 305 		/* fall back to speedstep_lib.c detection mechanism:
306 * try both states out */
307 dprintk("could not detect low and high frequencies "
308 "by SMI call.\n");
309 result = speedstep_get_freqs(speedstep_processor,
310 low, high,
311 NULL,
312 &speedstep_set_state);
313
314 if (result) {
315 dprintk("could not detect two different speeds"
316 " -- aborting.\n");
317 return result;
318 } else
319 dprintk("workaround worked.\n");
320 }
321
322 /* get current speed setting */
323 state = speedstep_get_state();
324 speed = speedstep_freqs[state].frequency;
325
326 dprintk("currently at %s speed setting - %i MHz\n",
327 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
328 ? "low" : "high",
329 (speed / 1000));
330
331 /* cpuinfo and default policy values */
332 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
333 policy->cur = speed;
334
335 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
336 if (result)
337 return result;
338
339 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
340
341 return 0;
342}
343
344static int speedstep_cpu_exit(struct cpufreq_policy *policy)
345{
346 cpufreq_frequency_table_put_attr(policy->cpu);
347 return 0;
348}
349
350static unsigned int speedstep_get(unsigned int cpu)
351{
352 if (cpu)
353 return -ENODEV;
354 return speedstep_get_frequency(speedstep_processor);
355}
356
357
358static int speedstep_resume(struct cpufreq_policy *policy)
359{
360 int result = speedstep_smi_ownership();
361
362 if (result)
363 dprintk("fails in re-aquiring ownership of a SMI interface.\n");
364
365 return result;
366}
367
368static struct freq_attr *speedstep_attr[] = {
369 &cpufreq_freq_attr_scaling_available_freqs,
370 NULL,
371};
372
373static struct cpufreq_driver speedstep_driver = {
374 .name = "speedstep-smi",
375 .verify = speedstep_verify,
376 .target = speedstep_target,
377 .init = speedstep_cpu_init,
378 .exit = speedstep_cpu_exit,
379 .get = speedstep_get,
380 .resume = speedstep_resume,
381 .owner = THIS_MODULE,
382 .attr = speedstep_attr,
383};
384
385/**
386 * speedstep_init - initializes the SpeedStep CPUFreq driver
387 *
388 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
 389 * BIOS, -EINVAL on problems during initialization, and zero on
390 * success.
391 */
392static int __init speedstep_init(void)
393{
394 speedstep_processor = speedstep_detect_processor();
395
396 switch (speedstep_processor) {
397 case SPEEDSTEP_CPU_PIII_T:
398 case SPEEDSTEP_CPU_PIII_C:
399 case SPEEDSTEP_CPU_PIII_C_EARLY:
400 break;
401 default:
402 speedstep_processor = 0;
403 }
404
405 if (!speedstep_processor) {
406 dprintk("No supported Intel CPU detected.\n");
407 return -ENODEV;
408 }
409
410 dprintk("signature:0x%.8lx, command:0x%.8lx, "
411 "event:0x%.8lx, perf_level:0x%.8lx.\n",
412 ist_info.signature, ist_info.command,
413 ist_info.event, ist_info.perf_level);
414
415 /* Error if no IST-SMI BIOS or no PARM
416 sig= 'ISGE' aka 'Intel Speedstep Gate E' */
417 if ((ist_info.signature != 0x47534943) && (
418 (smi_port == 0) || (smi_cmd == 0)))
419 return -ENODEV;
420
421 if (smi_sig == 1)
422 smi_sig = 0x47534943;
423 else
424 smi_sig = ist_info.signature;
425
 426	/* setup smi_port from the module parameter or BIOS */
427 if ((smi_port > 0xff) || (smi_port < 0))
428 return -EINVAL;
429 else if (smi_port == 0)
430 smi_port = ist_info.command & 0xff;
431
432 if ((smi_cmd > 0xff) || (smi_cmd < 0))
433 return -EINVAL;
434 else if (smi_cmd == 0)
435 smi_cmd = (ist_info.command >> 16) & 0xff;
436
437 return cpufreq_register_driver(&speedstep_driver);
438}
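
When the smi_port and smi_cmd module parameters are left at 0, speedstep_init() derives them from the BIOS-provided ist_info.command word: the low byte becomes the SMI port and bits 16-23 the SMI command. A small sketch of that unpacking, with a made-up command word chosen so it yields the Intel defaults quoted in the MODULE_PARM_DESC strings below:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical ist_info.command value; the real one comes from the BIOS. */
		unsigned int command = 0x00820cb2;

		unsigned int port = command & 0xff;         /* 0xb2 */
		unsigned int cmd  = (command >> 16) & 0xff; /* 0x82 */

		printf("smi_port=0x%02x smi_cmd=0x%02x\n", port, cmd);
		return 0;
	}
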
439
440
441/**
442 * speedstep_exit - unregisters SpeedStep support
443 *
444 * Unregisters SpeedStep support.
445 */
446static void __exit speedstep_exit(void)
447{
448 cpufreq_unregister_driver(&speedstep_driver);
449}
450
451module_param(smi_port, int, 0444);
452module_param(smi_cmd, int, 0444);
453module_param(smi_sig, uint, 0444);
454
455MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
456 "-- Intel's default setting is 0xb2");
457MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
458 "-- Intel's default setting is 0x82");
459MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
460 "SMI interface.");
461
462MODULE_AUTHOR("Hiroshi Miura");
463MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
464MODULE_LICENSE("GPL");
465
466module_init(speedstep_init);
467module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b4389441efbb..1edf5ba4fb2b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,10 +29,10 @@
29 29
30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
31{ 31{
32 u64 misc_enable;
33
32 /* Unmask CPUID levels if masked: */ 34 /* Unmask CPUID levels if masked: */
33 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { 35 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
34 u64 misc_enable;
35
36 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 36 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
37 37
38 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { 38 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
118 * (model 2) with the same problem. 118 * (model 2) with the same problem.
119 */ 119 */
120 if (c->x86 == 15) { 120 if (c->x86 == 15) {
121 u64 misc_enable;
122
123 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 121 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
124 122
125 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { 123 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
130 } 128 }
131 } 129 }
132#endif 130#endif
131
132 /*
133 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
134 * clear the fast string and enhanced fast string CPU capabilities.
135 */
136 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
137 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
138 if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
139 printk(KERN_INFO "Disabled fast string operations\n");
140 setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
141 setup_clear_cpu_cap(X86_FEATURE_ERMS);
142 }
143 }
133} 144}
134 145
135#ifdef CONFIG_X86_32 146#ifdef CONFIG_X86_32
@@ -170,7 +181,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
170{ 181{
171#ifdef CONFIG_SMP 182#ifdef CONFIG_SMP
172 /* calling is from identify_secondary_cpu() ? */ 183 /* calling is from identify_secondary_cpu() ? */
173 if (c->cpu_index == boot_cpu_id) 184 if (!c->cpu_index)
174 return; 185 return;
175 186
176 /* 187 /*
@@ -276,17 +287,14 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
276 287
277static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 288static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
278{ 289{
279#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 290#ifdef CONFIG_NUMA
280 unsigned node; 291 unsigned node;
281 int cpu = smp_processor_id(); 292 int cpu = smp_processor_id();
282 int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
283 293
284 /* Don't do the funky fallback heuristics the AMD version employs 294 /* Don't do the funky fallback heuristics the AMD version employs
285 for now. */ 295 for now. */
286 node = apicid_to_node[apicid]; 296 node = numa_cpu_node(cpu);
287 if (node == NUMA_NO_NODE) 297 if (node == NUMA_NO_NODE || !node_online(node)) {
288 node = first_node(node_online_map);
289 else if (!node_online(node)) {
290 /* reuse the value from init_cpu_to_node() */ 298 /* reuse the value from init_cpu_to_node() */
291 node = cpu_to_node(cpu); 299 node = cpu_to_node(cpu);
292 } 300 }
@@ -403,12 +411,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
403 411
404 switch (c->x86_model) { 412 switch (c->x86_model) {
405 case 5: 413 case 5:
406 if (c->x86_mask == 0) { 414 if (l2 == 0)
407 if (l2 == 0) 415 p = "Celeron (Covington)";
408 p = "Celeron (Covington)"; 416 else if (l2 == 256)
409 else if (l2 == 256) 417 p = "Mobile Pentium II (Dixon)";
410 p = "Mobile Pentium II (Dixon)";
411 }
412 break; 418 break;
413 419
414 case 6: 420 case 6:
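
The intel.c hunk above adds a second MSR_IA32_MISC_ENABLE check that clears X86_FEATURE_REP_GOOD and X86_FEATURE_ERMS when the fast-string bit is not set, gated on the same family/model test already used for the CPUID-unmask path. A tiny sketch of that gate, with a couple of worked inputs (the function name is mine, not the kernel's):

	#include <stdio.h>
	#include <stdbool.h>

	/* Same predicate as in the patch: family > 6, or family 6 with model >= 0xd. */
	static bool has_misc_enable_check(unsigned int family, unsigned int model)
	{
		return family > 6 || (family == 6 && model >= 0xd);
	}

	int main(void)
	{
		printf("family 6, model 0x0d: %d\n", has_misc_enable_check(6, 0x0d)); /* 1 */
		printf("family 6, model 0x0a: %d\n", has_misc_enable_check(6, 0x0a)); /* 0 */
		return 0;
	}
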
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 3fec7d9bfd62..0bf12644aa73 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,7 +17,7 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/amd_nb.h>
21#include <asm/smp.h> 21#include <asm/smp.h>
22 22
23#define LVL_1_INST 1 23#define LVL_1_INST 1
@@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ 45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ 46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ 47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
48 { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
48 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ 49 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
49 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 50 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
50 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 51 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
66 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ 67 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
67 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ 68 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
68 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ 69 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
70 { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
69 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ 71 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
70 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ 72 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
71 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ 73 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
@@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
87 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 89 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
88 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ 90 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
89 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ 91 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
92 { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
90 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ 93 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
91 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ 94 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
92 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ 95 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
@@ -149,8 +152,7 @@ union _cpuid4_leaf_ecx {
149}; 152};
150 153
151struct amd_l3_cache { 154struct amd_l3_cache {
152 struct pci_dev *dev; 155 struct amd_northbridge *nb;
153 bool can_disable;
154 unsigned indices; 156 unsigned indices;
155 u8 subcaches[4]; 157 u8 subcaches[4];
156}; 158};
@@ -266,7 +268,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
266 line_size = l2.line_size; 268 line_size = l2.line_size;
267 lines_per_tag = l2.lines_per_tag; 269 lines_per_tag = l2.lines_per_tag;
268 /* cpu_data has errata corrections for K7 applied */ 270 /* cpu_data has errata corrections for K7 applied */
269 size_in_kb = current_cpu_data.x86_cache_size; 271 size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
270 break; 272 break;
271 case 3: 273 case 3:
272 if (!l3.val) 274 if (!l3.val)
@@ -288,7 +290,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
288 eax->split.type = types[leaf]; 290 eax->split.type = types[leaf];
289 eax->split.level = levels[leaf]; 291 eax->split.level = levels[leaf];
290 eax->split.num_threads_sharing = 0; 292 eax->split.num_threads_sharing = 0;
291 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 293 eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
292 294
293 295
294 if (assoc == 0xffff) 296 if (assoc == 0xffff)
@@ -302,23 +304,22 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
302 304
303struct _cache_attr { 305struct _cache_attr {
304 struct attribute attr; 306 struct attribute attr;
305 ssize_t (*show)(struct _cpuid4_info *, char *); 307 ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
306 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); 308 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
309 unsigned int);
307}; 310};
308 311
309#ifdef CONFIG_CPU_SUP_AMD 312#ifdef CONFIG_AMD_NB
310 313
311/* 314/*
312 * L3 cache descriptors 315 * L3 cache descriptors
313 */ 316 */
314static struct amd_l3_cache **__cpuinitdata l3_caches;
315
316static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) 317static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
317{ 318{
318 unsigned int sc0, sc1, sc2, sc3; 319 unsigned int sc0, sc1, sc2, sc3;
319 u32 val = 0; 320 u32 val = 0;
320 321
321 pci_read_config_dword(l3->dev, 0x1C4, &val); 322 pci_read_config_dword(l3->nb->misc, 0x1C4, &val);
322 323
323 /* calculate subcache sizes */ 324 /* calculate subcache sizes */
324 l3->subcaches[0] = sc0 = !(val & BIT(0)); 325 l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -326,50 +327,17 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
326 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); 327 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
327 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); 328 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
328 329
329 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; 330 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
330}
331
332static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
333{
334 struct amd_l3_cache *l3;
335 struct pci_dev *dev = node_to_k8_nb_misc(node);
336
337 l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
338 if (!l3) {
339 printk(KERN_WARNING "Error allocating L3 struct\n");
340 return NULL;
341 }
342
343 l3->dev = dev;
344
345 amd_calc_l3_indices(l3);
346
347 return l3;
348} 331}
349 332
350static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, 333static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
351 int index) 334 int index)
352{ 335{
336 static struct amd_l3_cache *__cpuinitdata l3_caches;
353 int node; 337 int node;
354 338
355 if (boot_cpu_data.x86 != 0x10) 339 /* only for L3, and not in virtualized environments */
356 return; 340 if (index < 3 || amd_nb_num() == 0)
357
358 if (index < 3)
359 return;
360
361 /* see errata #382 and #388 */
362 if (boot_cpu_data.x86_model < 0x8)
363 return;
364
365 if ((boot_cpu_data.x86_model == 0x8 ||
366 boot_cpu_data.x86_model == 0x9)
367 &&
368 boot_cpu_data.x86_mask < 0x1)
369 return;
370
371 /* not in virtualized environments */
372 if (num_k8_northbridges == 0)
373 return; 341 return;
374 342
375 /* 343 /*
@@ -377,7 +345,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
377 * never freed but this is done only on shutdown so it doesn't matter. 345 * never freed but this is done only on shutdown so it doesn't matter.
378 */ 346 */
379 if (!l3_caches) { 347 if (!l3_caches) {
380 int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); 348 int size = amd_nb_num() * sizeof(struct amd_l3_cache);
381 349
382 l3_caches = kzalloc(size, GFP_ATOMIC); 350 l3_caches = kzalloc(size, GFP_ATOMIC);
383 if (!l3_caches) 351 if (!l3_caches)
@@ -386,14 +354,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
386 354
387 node = amd_get_nb_id(smp_processor_id()); 355 node = amd_get_nb_id(smp_processor_id());
388 356
389 if (!l3_caches[node]) { 357 if (!l3_caches[node].nb) {
390 l3_caches[node] = amd_init_l3_cache(node); 358 l3_caches[node].nb = node_to_amd_nb(node);
391 l3_caches[node]->can_disable = true; 359 amd_calc_l3_indices(&l3_caches[node]);
392 } 360 }
393 361
394 WARN_ON(!l3_caches[node]); 362 this_leaf->l3 = &l3_caches[node];
395
396 this_leaf->l3 = l3_caches[node];
397} 363}
398 364
399/* 365/*
@@ -407,7 +373,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
407{ 373{
408 unsigned int reg = 0; 374 unsigned int reg = 0;
409 375
410 pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg); 376 pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, &reg);
411 377
412 /* check whether this slot is activated already */ 378 /* check whether this slot is activated already */
413 if (reg & (3UL << 30)) 379 if (reg & (3UL << 30))
@@ -421,7 +387,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
421{ 387{
422 int index; 388 int index;
423 389
424 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 390 if (!this_leaf->l3 ||
391 !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
425 return -EINVAL; 392 return -EINVAL;
426 393
427 index = amd_get_l3_disable_slot(this_leaf->l3, slot); 394 index = amd_get_l3_disable_slot(this_leaf->l3, slot);
@@ -433,7 +400,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
433 400
434#define SHOW_CACHE_DISABLE(slot) \ 401#define SHOW_CACHE_DISABLE(slot) \
435static ssize_t \ 402static ssize_t \
436show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ 403show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
404 unsigned int cpu) \
437{ \ 405{ \
438 return show_cache_disable(this_leaf, buf, slot); \ 406 return show_cache_disable(this_leaf, buf, slot); \
439} 407}
@@ -456,7 +424,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
456 if (!l3->subcaches[i]) 424 if (!l3->subcaches[i])
457 continue; 425 continue;
458 426
459 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); 427 pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
460 428
461 /* 429 /*
462 * We need to WBINVD on a core on the node containing the L3 430 * We need to WBINVD on a core on the node containing the L3
@@ -466,7 +434,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
466 wbinvd_on_cpu(cpu); 434 wbinvd_on_cpu(cpu);
467 435
468 reg |= BIT(31); 436 reg |= BIT(31);
469 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); 437 pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
470 } 438 }
471} 439}
472 440
@@ -485,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
485{ 453{
486 int ret = 0; 454 int ret = 0;
487 455
488#define SUBCACHE_MASK (3UL << 20) 456 /* check if @slot is already used or the index is already disabled */
489#define SUBCACHE_INDEX 0xfff
490
491 /*
492 * check whether this slot is already used or
493 * the index is already disabled
494 */
495 ret = amd_get_l3_disable_slot(l3, slot); 457 ret = amd_get_l3_disable_slot(l3, slot);
496 if (ret >= 0) 458 if (ret >= 0)
497 return -EINVAL; 459 return -EINVAL;
498 460
499 /* 461 if (index > l3->indices)
500 * check whether the other slot has disabled the
501 * same index already
502 */
503 if (index == amd_get_l3_disable_slot(l3, !slot))
504 return -EINVAL; 462 return -EINVAL;
505 463
506 /* do not allow writes outside of allowed bits */ 464 /* check whether the other slot has disabled the same index already */
507 if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || 465 if (index == amd_get_l3_disable_slot(l3, !slot))
508 ((index & SUBCACHE_INDEX) > l3->indices))
509 return -EINVAL; 466 return -EINVAL;
510 467
511 amd_l3_disable_index(l3, cpu, slot, index); 468 amd_l3_disable_index(l3, cpu, slot, index);
@@ -523,7 +480,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
523 if (!capable(CAP_SYS_ADMIN)) 480 if (!capable(CAP_SYS_ADMIN))
524 return -EPERM; 481 return -EPERM;
525 482
526 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 483 if (!this_leaf->l3 ||
484 !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
527 return -EINVAL; 485 return -EINVAL;
528 486
529 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 487 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -544,7 +502,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
544#define STORE_CACHE_DISABLE(slot) \ 502#define STORE_CACHE_DISABLE(slot) \
545static ssize_t \ 503static ssize_t \
546store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ 504store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
547 const char *buf, size_t count) \ 505 const char *buf, size_t count, \
506 unsigned int cpu) \
548{ \ 507{ \
549 return store_cache_disable(this_leaf, buf, count, slot); \ 508 return store_cache_disable(this_leaf, buf, count, slot); \
550} 509}
@@ -556,25 +515,55 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
556static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 515static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
557 show_cache_disable_1, store_cache_disable_1); 516 show_cache_disable_1, store_cache_disable_1);
558 517
559#else /* CONFIG_CPU_SUP_AMD */ 518static ssize_t
560static void __cpuinit 519show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
562{ 520{
563}; 521 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
564#endif /* CONFIG_CPU_SUP_AMD */ 522 return -EINVAL;
523
524 return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
525}
526
527static ssize_t
528store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
529 unsigned int cpu)
530{
531 unsigned long val;
532
533 if (!capable(CAP_SYS_ADMIN))
534 return -EPERM;
535
536 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
537 return -EINVAL;
538
539 if (strict_strtoul(buf, 16, &val) < 0)
540 return -EINVAL;
541
542 if (amd_set_subcaches(cpu, val))
543 return -EINVAL;
544
545 return count;
546}
547
548static struct _cache_attr subcaches =
549 __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
550
551#else /* CONFIG_AMD_NB */
552#define amd_init_l3_cache(x, y)
553#endif /* CONFIG_AMD_NB */
565 554
566static int 555static int
567__cpuinit cpuid4_cache_lookup_regs(int index, 556__cpuinit cpuid4_cache_lookup_regs(int index,
568 struct _cpuid4_info_regs *this_leaf) 557 struct _cpuid4_info_regs *this_leaf)
569{ 558{
570 union _cpuid4_leaf_eax eax; 559 union _cpuid4_leaf_eax eax;
571 union _cpuid4_leaf_ebx ebx; 560 union _cpuid4_leaf_ebx ebx;
572 union _cpuid4_leaf_ecx ecx; 561 union _cpuid4_leaf_ecx ecx;
573 unsigned edx; 562 unsigned edx;
574 563
575 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 564 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
576 amd_cpuid4(index, &eax, &ebx, &ecx); 565 amd_cpuid4(index, &eax, &ebx, &ecx);
577 amd_check_l3_disable(this_leaf, index); 566 amd_init_l3_cache(this_leaf, index);
578 } else { 567 } else {
579 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 568 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
580 } 569 }
@@ -784,11 +773,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
784 struct cpuinfo_x86 *c = &cpu_data(cpu); 773 struct cpuinfo_x86 *c = &cpu_data(cpu);
785 774
786 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 775 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
787 for_each_cpu(i, c->llc_shared_map) { 776 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
788 if (!per_cpu(ici_cpuid4_info, i)) 777 if (!per_cpu(ici_cpuid4_info, i))
789 continue; 778 continue;
790 this_leaf = CPUID4_INFO_IDX(i, index); 779 this_leaf = CPUID4_INFO_IDX(i, index);
791 for_each_cpu(sibling, c->llc_shared_map) { 780 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
792 if (!cpu_online(sibling)) 781 if (!cpu_online(sibling))
793 continue; 782 continue;
794 set_bit(sibling, this_leaf->shared_cpu_map); 783 set_bit(sibling, this_leaf->shared_cpu_map);
@@ -922,8 +911,8 @@ static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
922#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) 911#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
923 912
924#define show_one_plus(file_name, object, val) \ 913#define show_one_plus(file_name, object, val) \
925static ssize_t show_##file_name \ 914static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
926 (struct _cpuid4_info *this_leaf, char *buf) \ 915 unsigned int cpu) \
927{ \ 916{ \
928 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ 917 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
929} 918}
@@ -934,7 +923,8 @@ show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
934show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); 923show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
935show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); 924show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
936 925
937static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) 926static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
927 unsigned int cpu)
938{ 928{
939 return sprintf(buf, "%luK\n", this_leaf->size / 1024); 929 return sprintf(buf, "%luK\n", this_leaf->size / 1024);
940} 930}
@@ -958,17 +948,20 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
958 return n; 948 return n;
959} 949}
960 950
961static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) 951static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
952 unsigned int cpu)
962{ 953{
963 return show_shared_cpu_map_func(leaf, 0, buf); 954 return show_shared_cpu_map_func(leaf, 0, buf);
964} 955}
965 956
966static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) 957static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
958 unsigned int cpu)
967{ 959{
968 return show_shared_cpu_map_func(leaf, 1, buf); 960 return show_shared_cpu_map_func(leaf, 1, buf);
969} 961}
970 962
971static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) 963static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
964 unsigned int cpu)
972{ 965{
973 switch (this_leaf->eax.split.type) { 966 switch (this_leaf->eax.split.type) {
974 case CACHE_TYPE_DATA: 967 case CACHE_TYPE_DATA:
@@ -999,30 +992,54 @@ define_one_ro(size);
999define_one_ro(shared_cpu_map); 992define_one_ro(shared_cpu_map);
1000define_one_ro(shared_cpu_list); 993define_one_ro(shared_cpu_list);
1001 994
1002#define DEFAULT_SYSFS_CACHE_ATTRS \
1003 &type.attr, \
1004 &level.attr, \
1005 &coherency_line_size.attr, \
1006 &physical_line_partition.attr, \
1007 &ways_of_associativity.attr, \
1008 &number_of_sets.attr, \
1009 &size.attr, \
1010 &shared_cpu_map.attr, \
1011 &shared_cpu_list.attr
1012
1013static struct attribute *default_attrs[] = { 995static struct attribute *default_attrs[] = {
1014 DEFAULT_SYSFS_CACHE_ATTRS, 996 &type.attr,
997 &level.attr,
998 &coherency_line_size.attr,
999 &physical_line_partition.attr,
1000 &ways_of_associativity.attr,
1001 &number_of_sets.attr,
1002 &size.attr,
1003 &shared_cpu_map.attr,
1004 &shared_cpu_list.attr,
1015 NULL 1005 NULL
1016}; 1006};
1017 1007
1018static struct attribute *default_l3_attrs[] = { 1008#ifdef CONFIG_AMD_NB
1019 DEFAULT_SYSFS_CACHE_ATTRS, 1009static struct attribute ** __cpuinit amd_l3_attrs(void)
1020#ifdef CONFIG_CPU_SUP_AMD 1010{
1021 &cache_disable_0.attr, 1011 static struct attribute **attrs;
1022 &cache_disable_1.attr, 1012 int n;
1013
1014 if (attrs)
1015 return attrs;
1016
1017 n = sizeof (default_attrs) / sizeof (struct attribute *);
1018
1019 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
1020 n += 2;
1021
1022 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1023 n += 1;
1024
1025 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
1026 if (attrs == NULL)
1027 return attrs = default_attrs;
1028
1029 for (n = 0; default_attrs[n]; n++)
1030 attrs[n] = default_attrs[n];
1031
1032 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
1033 attrs[n++] = &cache_disable_0.attr;
1034 attrs[n++] = &cache_disable_1.attr;
1035 }
1036
1037 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1038 attrs[n++] = &subcaches.attr;
1039
1040 return attrs;
1041}
1023#endif 1042#endif
1024 NULL
1025};
1026 1043
1027static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) 1044static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
1028{ 1045{
@@ -1032,7 +1049,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
1032 1049
1033 ret = fattr->show ? 1050 ret = fattr->show ?
1034 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1051 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1035 buf) : 1052 buf, this_leaf->cpu) :
1036 0; 1053 0;
1037 return ret; 1054 return ret;
1038} 1055}
@@ -1046,7 +1063,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
1046 1063
1047 ret = fattr->store ? 1064 ret = fattr->store ?
1048 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1065 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1049 buf, count) : 1066 buf, count, this_leaf->cpu) :
1050 0; 1067 0;
1051 return ret; 1068 return ret;
1052} 1069}
@@ -1133,11 +1150,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1133 1150
1134 this_leaf = CPUID4_INFO_IDX(cpu, i); 1151 this_leaf = CPUID4_INFO_IDX(cpu, i);
1135 1152
1136 if (this_leaf->l3 && this_leaf->l3->can_disable) 1153 ktype_cache.default_attrs = default_attrs;
1137 ktype_cache.default_attrs = default_l3_attrs; 1154#ifdef CONFIG_AMD_NB
1138 else 1155 if (this_leaf->l3)
1139 ktype_cache.default_attrs = default_attrs; 1156 ktype_cache.default_attrs = amd_l3_attrs();
1140 1157#endif
1141 retval = kobject_init_and_add(&(this_object->kobj), 1158 retval = kobject_init_and_add(&(this_object->kobj),
1142 &ktype_cache, 1159 &ktype_cache,
1143 per_cpu(ici_cache_kobject, cpu), 1160 per_cpu(ici_cache_kobject, cpu),
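
For reference, amd_calc_l3_indices() in the hunk above derives the number of usable L3 cache indices from the subcache-disable bits in config register 0x1C4: each subcache contributes a count of enabled halves, and the largest count, shifted left by 10 and decremented, gives the index limit. A user-space sketch of that arithmetic with a hypothetical register value (the max2 helper is mine; the kernel uses max()/max3()):

	#include <stdio.h>

	#define BIT(n) (1u << (n))

	static unsigned int max2(unsigned int a, unsigned int b) { return a > b ? a : b; }

	int main(void)
	{
		unsigned int val = 0;	/* hypothetical contents of register 0x1C4 */
		unsigned int sc0, sc1, sc2, sc3, indices;

		/* Same bit tests as amd_calc_l3_indices(). */
		sc0 = !(val & BIT(0));
		sc1 = !(val & BIT(4));
		sc2 = !(val & BIT(8))  + !(val & BIT(9));
		sc3 = !(val & BIT(12)) + !(val & BIT(13));

		indices = (max2(max2(max2(sc0, sc1), sc2), sc3) << 10) - 1;
		printf("indices = %u\n", indices);	/* 2047 when no subcache is disabled */
		return 0;
	}
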
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 8209472b27a5..83930deec3c6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m)
106ssize_t apei_read_mce(struct mce *m, u64 *record_id) 106ssize_t apei_read_mce(struct mce *m, u64 *record_id)
107{ 107{
108 struct cper_mce_record rcd; 108 struct cper_mce_record rcd;
109 ssize_t len; 109 int rc, pos;
110 110
111 len = erst_read_next(&rcd.hdr, sizeof(rcd)); 111 rc = erst_get_record_id_begin(&pos);
112 if (len <= 0) 112 if (rc)
113 return len; 113 return rc;
114 /* Can not skip other records in storage via ERST unless clear them */ 114retry:
115 else if (len != sizeof(rcd) || 115 rc = erst_get_record_id_next(&pos, record_id);
116 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { 116 if (rc)
117 if (printk_ratelimit()) 117 goto out;
118 pr_warning( 118 /* no more record */
119 "MCE-APEI: Can not skip the unknown record in ERST"); 119 if (*record_id == APEI_ERST_INVALID_RECORD_ID)
120 return -EIO; 120 goto out;
121 } 121 rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
122 122 /* someone else has cleared the record, try next one */
123 if (rc == -ENOENT)
124 goto retry;
125 else if (rc < 0)
126 goto out;
127 /* try to skip other type records in storage */
128 else if (rc != sizeof(rcd) ||
129 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
130 goto retry;
123 memcpy(m, &rcd.mce, sizeof(*m)); 131 memcpy(m, &rcd.mce, sizeof(*m));
124 *record_id = rcd.hdr.record_id; 132 rc = sizeof(*m);
133out:
134 erst_get_record_id_end();
125 135
126 return sizeof(*m); 136 return rc;
127} 137}
128 138
129/* Check whether there is record in ERST */ 139/* Check whether there is record in ERST */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index e7dbde7bfedb..0ed633c5048b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -25,13 +25,14 @@
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <asm/mce.h> 26#include <asm/mce.h>
27#include <asm/apic.h> 27#include <asm/apic.h>
28#include <asm/nmi.h>
28 29
29/* Update fake mce registers on current CPU. */ 30/* Update fake mce registers on current CPU. */
30static void inject_mce(struct mce *m) 31static void inject_mce(struct mce *m)
31{ 32{
32 struct mce *i = &per_cpu(injectm, m->extcpu); 33 struct mce *i = &per_cpu(injectm, m->extcpu);
33 34
34 /* Make sure noone reads partially written injectm */ 35 /* Make sure no one reads partially written injectm */
35 i->finished = 0; 36 i->finished = 0;
36 mb(); 37 mb();
37 m->finished = 0; 38 m->finished = 0;
@@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self,
83 struct die_args *args = (struct die_args *)data; 84 struct die_args *args = (struct die_args *)data;
84 int cpu = smp_processor_id(); 85 int cpu = smp_processor_id();
85 struct mce *m = &__get_cpu_var(injectm); 86 struct mce *m = &__get_cpu_var(injectm);
86 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) 87 if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
87 return NOTIFY_DONE; 88 return NOTIFY_DONE;
88 cpumask_clear_cpu(cpu, mce_inject_cpumask); 89 cpumask_clear_cpu(cpu, mce_inject_cpumask);
89 if (m->inject_flags & MCJ_EXCEPTION) 90 if (m->inject_flags & MCJ_EXCEPTION)
@@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self,
95 96
96static struct notifier_block mce_raise_nb = { 97static struct notifier_block mce_raise_nb = {
97 .notifier_call = mce_raise_notify, 98 .notifier_call = mce_raise_notify,
98 .priority = 1000, 99 .priority = NMI_LOCAL_NORMAL_PRIOR,
99}; 100};
100 101
101/* Inject mce on current CPU */ 102/* Inject mce on current CPU */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8a85dd1b1aa1..1e8d66c1336a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -192,6 +192,7 @@ static const struct file_operations severities_coverage_fops = {
192 .release = seq_release, 192 .release = seq_release,
193 .read = seq_read, 193 .read = seq_read,
194 .write = severities_coverage_write, 194 .write = severities_coverage_write,
195 .llseek = seq_lseek,
195}; 196};
196 197
197static int __init severities_debugfs_init(void) 198static int __init severities_debugfs_init(void)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ed41562909fe..ff1ae9b6464d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/sysdev.h> 23#include <linux/sysdev.h>
24#include <linux/syscore_ops.h>
24#include <linux/delay.h> 25#include <linux/delay.h>
25#include <linux/ctype.h> 26#include <linux/ctype.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
@@ -104,20 +105,6 @@ static int cpu_missing;
104ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 105ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
105EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); 106EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
106 107
107static int default_decode_mce(struct notifier_block *nb, unsigned long val,
108 void *data)
109{
110 pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
111 pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
112
113 return NOTIFY_STOP;
114}
115
116static struct notifier_block mce_dec_nb = {
117 .notifier_call = default_decode_mce,
118 .priority = -1,
119};
120
121/* MCA banks polled by the period polling timer for corrected events */ 108/* MCA banks polled by the period polling timer for corrected events */
122DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 109DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
123 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 110 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -211,6 +198,8 @@ void mce_log(struct mce *mce)
211 198
212static void print_mce(struct mce *m) 199static void print_mce(struct mce *m)
213{ 200{
201 int ret = 0;
202
214 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 203 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
215 m->extcpu, m->mcgstatus, m->bank, m->status); 204 m->extcpu, m->mcgstatus, m->bank, m->status);
216 205
@@ -238,7 +227,11 @@ static void print_mce(struct mce *m)
238 * Print out human-readable details about the MCE error, 227 * Print out human-readable details about the MCE error,
239 * (if the CPU has an implementation for that) 228 * (if the CPU has an implementation for that)
240 */ 229 */
241 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 230 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
231 if (ret == NOTIFY_STOP)
232 return;
233
234 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
242} 235}
243 236
244#define PANIC_TIMEOUT 5 /* 5 seconds */ 237#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -326,7 +319,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
326 319
327static int msr_to_offset(u32 msr) 320static int msr_to_offset(u32 msr)
328{ 321{
329 unsigned bank = __get_cpu_var(injectm.bank); 322 unsigned bank = __this_cpu_read(injectm.bank);
330 323
331 if (msr == rip_msr) 324 if (msr == rip_msr)
332 return offsetof(struct mce, ip); 325 return offsetof(struct mce, ip);
@@ -346,7 +339,7 @@ static u64 mce_rdmsrl(u32 msr)
346{ 339{
347 u64 v; 340 u64 v;
348 341
349 if (__get_cpu_var(injectm).finished) { 342 if (__this_cpu_read(injectm.finished)) {
350 int offset = msr_to_offset(msr); 343 int offset = msr_to_offset(msr);
351 344
352 if (offset < 0) 345 if (offset < 0)
@@ -369,7 +362,7 @@ static u64 mce_rdmsrl(u32 msr)
369 362
370static void mce_wrmsrl(u32 msr, u64 v) 363static void mce_wrmsrl(u32 msr, u64 v)
371{ 364{
372 if (__get_cpu_var(injectm).finished) { 365 if (__this_cpu_read(injectm.finished)) {
373 int offset = msr_to_offset(msr); 366 int offset = msr_to_offset(msr);
374 367
375 if (offset >= 0) 368 if (offset >= 0)
@@ -589,7 +582,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
589 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 582 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
590 mce_log(&m); 583 mce_log(&m);
591 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); 584 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
592 add_taint(TAINT_MACHINE_CHECK);
593 } 585 }
594 586
595 /* 587 /*
@@ -881,7 +873,7 @@ reset:
881 * Check if the address reported by the CPU is in a format we can parse. 873 * Check if the address reported by the CPU is in a format we can parse.
882 * It would be possible to add code for most other cases, but all would 874 * It would be possible to add code for most other cases, but all would
883 * be somewhat complicated (e.g. segment offset would require an instruction 875 * be somewhat complicated (e.g. segment offset would require an instruction
884 * parser). So only support physical addresses upto page granuality for now. 876 * parser). So only support physical addresses up to page granuality for now.
885 */ 877 */
886static int mce_usable_address(struct mce *m) 878static int mce_usable_address(struct mce *m)
887{ 879{
@@ -1159,7 +1151,7 @@ static void mce_start_timer(unsigned long data)
1159 1151
1160 WARN_ON(smp_processor_id() != data); 1152 WARN_ON(smp_processor_id() != data);
1161 1153
1162 if (mce_available(&current_cpu_data)) { 1154 if (mce_available(__this_cpu_ptr(&cpu_info))) {
1163 machine_check_poll(MCP_TIMESTAMP, 1155 machine_check_poll(MCP_TIMESTAMP,
1164 &__get_cpu_var(mce_poll_banks)); 1156 &__get_cpu_var(mce_poll_banks));
1165 } 1157 }
@@ -1625,7 +1617,7 @@ out:
1625static unsigned int mce_poll(struct file *file, poll_table *wait) 1617static unsigned int mce_poll(struct file *file, poll_table *wait)
1626{ 1618{
1627 poll_wait(file, &mce_wait, wait); 1619 poll_wait(file, &mce_wait, wait);
1628 if (rcu_dereference_check_mce(mcelog.next)) 1620 if (rcu_access_index(mcelog.next))
1629 return POLLIN | POLLRDNORM; 1621 return POLLIN | POLLRDNORM;
1630 if (!mce_apei_read_done && apei_check_mce()) 1622 if (!mce_apei_read_done && apei_check_mce())
1631 return POLLIN | POLLRDNORM; 1623 return POLLIN | POLLRDNORM;
@@ -1665,6 +1657,7 @@ struct file_operations mce_chrdev_ops = {
1665 .read = mce_read, 1657 .read = mce_read,
1666 .poll = mce_poll, 1658 .poll = mce_poll,
1667 .unlocked_ioctl = mce_ioctl, 1659 .unlocked_ioctl = mce_ioctl,
1660 .llseek = no_llseek,
1668}; 1661};
1669EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1662EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1670 1663
@@ -1720,8 +1713,6 @@ __setup("mce", mcheck_enable);
1720 1713
1721int __init mcheck_init(void) 1714int __init mcheck_init(void)
1722{ 1715{
1723 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1724
1725 mcheck_intel_therm_init(); 1716 mcheck_intel_therm_init();
1726 1717
1727 return 0; 1718 return 0;
@@ -1748,14 +1739,14 @@ static int mce_disable_error_reporting(void)
1748 return 0; 1739 return 0;
1749} 1740}
1750 1741
1751static int mce_suspend(struct sys_device *dev, pm_message_t state) 1742static int mce_suspend(void)
1752{ 1743{
1753 return mce_disable_error_reporting(); 1744 return mce_disable_error_reporting();
1754} 1745}
1755 1746
1756static int mce_shutdown(struct sys_device *dev) 1747static void mce_shutdown(void)
1757{ 1748{
1758 return mce_disable_error_reporting(); 1749 mce_disable_error_reporting();
1759} 1750}
1760 1751
1761/* 1752/*
@@ -1763,18 +1754,22 @@ static int mce_shutdown(struct sys_device *dev)
1763 * Only one CPU is active at this time, the others get re-added later using 1754 * Only one CPU is active at this time, the others get re-added later using
1764 * CPU hotplug: 1755 * CPU hotplug:
1765 */ 1756 */
1766static int mce_resume(struct sys_device *dev) 1757static void mce_resume(void)
1767{ 1758{
1768 __mcheck_cpu_init_generic(); 1759 __mcheck_cpu_init_generic();
1769 __mcheck_cpu_init_vendor(&current_cpu_data); 1760 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1770
1771 return 0;
1772} 1761}
1773 1762
1763static struct syscore_ops mce_syscore_ops = {
1764 .suspend = mce_suspend,
1765 .shutdown = mce_shutdown,
1766 .resume = mce_resume,
1767};
1768
1774static void mce_cpu_restart(void *data) 1769static void mce_cpu_restart(void *data)
1775{ 1770{
1776 del_timer_sync(&__get_cpu_var(mce_timer)); 1771 del_timer_sync(&__get_cpu_var(mce_timer));
1777 if (!mce_available(&current_cpu_data)) 1772 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1778 return; 1773 return;
1779 __mcheck_cpu_init_generic(); 1774 __mcheck_cpu_init_generic();
1780 __mcheck_cpu_init_timer(); 1775 __mcheck_cpu_init_timer();
@@ -1789,7 +1784,7 @@ static void mce_restart(void)
1789/* Toggle features for corrected errors */ 1784/* Toggle features for corrected errors */
1790static void mce_disable_ce(void *all) 1785static void mce_disable_ce(void *all)
1791{ 1786{
1792 if (!mce_available(&current_cpu_data)) 1787 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1793 return; 1788 return;
1794 if (all) 1789 if (all)
1795 del_timer_sync(&__get_cpu_var(mce_timer)); 1790 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1798,7 +1793,7 @@ static void mce_disable_ce(void *all)
1798 1793
1799static void mce_enable_ce(void *all) 1794static void mce_enable_ce(void *all)
1800{ 1795{
1801 if (!mce_available(&current_cpu_data)) 1796 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1802 return; 1797 return;
1803 cmci_reenable(); 1798 cmci_reenable();
1804 cmci_recheck(); 1799 cmci_recheck();
@@ -1807,9 +1802,6 @@ static void mce_enable_ce(void *all)
1807} 1802}
1808 1803
1809static struct sysdev_class mce_sysclass = { 1804static struct sysdev_class mce_sysclass = {
1810 .suspend = mce_suspend,
1811 .shutdown = mce_shutdown,
1812 .resume = mce_resume,
1813 .name = "machinecheck", 1805 .name = "machinecheck",
1814}; 1806};
1815 1807
@@ -2021,7 +2013,7 @@ static void __cpuinit mce_disable_cpu(void *h)
2021 unsigned long action = *(unsigned long *)h; 2013 unsigned long action = *(unsigned long *)h;
2022 int i; 2014 int i;
2023 2015
2024 if (!mce_available(&current_cpu_data)) 2016 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2025 return; 2017 return;
2026 2018
2027 if (!(action & CPU_TASKS_FROZEN)) 2019 if (!(action & CPU_TASKS_FROZEN))
@@ -2039,7 +2031,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
2039 unsigned long action = *(unsigned long *)h; 2031 unsigned long action = *(unsigned long *)h;
2040 int i; 2032 int i;
2041 2033
2042 if (!mce_available(&current_cpu_data)) 2034 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2043 return; 2035 return;
2044 2036
2045 if (!(action & CPU_TASKS_FROZEN)) 2037 if (!(action & CPU_TASKS_FROZEN))
@@ -2138,6 +2130,7 @@ static __init int mcheck_init_device(void)
2138 return err; 2130 return err;
2139 } 2131 }
2140 2132
2133 register_syscore_ops(&mce_syscore_ops);
2141 register_hotcpu_notifier(&mce_cpu_notifier); 2134 register_hotcpu_notifier(&mce_cpu_notifier);
2142 misc_register(&mce_log_device); 2135 misc_register(&mce_log_device);
2143 2136
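
The mce.c hunks above move the suspend/shutdown/resume callbacks off the sysdev class and register them as syscore operations instead; note the changed prototypes (shutdown and resume now return void and take no device argument). A minimal sketch of that pattern for a hypothetical driver -- not the mce code itself -- using only the API visible in the diff plus the usual init headers:

	#include <linux/init.h>
	#include <linux/syscore_ops.h>

	static int example_suspend(void)
	{
		/* quiesce the hardware; a non-zero return aborts system suspend */
		return 0;
	}

	static void example_resume(void)
	{
		/* reprogram the hardware; only one CPU is active at this point */
	}

	static void example_shutdown(void)
	{
		/* best-effort quiesce on shutdown/reboot */
	}

	static struct syscore_ops example_syscore_ops = {
		.suspend  = example_suspend,
		.resume   = example_resume,
		.shutdown = example_shutdown,
	};

	static int __init example_init(void)
	{
		register_syscore_ops(&example_syscore_ops);
		return 0;
	}
	device_initcall(example_init);
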
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 39aaee5c1ab2..bb0adad35143 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -31,8 +31,6 @@
31#include <asm/mce.h> 31#include <asm/mce.h>
32#include <asm/msr.h> 32#include <asm/msr.h>
33 33
34#define PFX "mce_threshold: "
35#define VERSION "version 1.1.1"
36#define NR_BANKS 6 34#define NR_BANKS 6
37#define NR_BLOCKS 9 35#define NR_BLOCKS 9
38#define THRESHOLD_MAX 0xFFF 36#define THRESHOLD_MAX 0xFFF
@@ -59,12 +57,6 @@ struct threshold_block {
59 struct list_head miscj; 57 struct list_head miscj;
60}; 58};
61 59
62/* defaults used early on boot */
63static struct threshold_block threshold_defaults = {
64 .interrupt_enable = 0,
65 .threshold_limit = THRESHOLD_MAX,
66};
67
68struct threshold_bank { 60struct threshold_bank {
69 struct kobject *kobj; 61 struct kobject *kobj;
70 struct threshold_block *blocks; 62 struct threshold_block *blocks;
@@ -89,49 +81,101 @@ static void amd_threshold_interrupt(void);
89struct thresh_restart { 81struct thresh_restart {
90 struct threshold_block *b; 82 struct threshold_block *b;
91 int reset; 83 int reset;
84 int set_lvt_off;
85 int lvt_off;
92 u16 old_limit; 86 u16 old_limit;
93}; 87};
94 88
89static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
90{
91 int msr = (hi & MASK_LVTOFF_HI) >> 20;
92
93 if (apic < 0) {
94 pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
95 "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
96 b->bank, b->block, b->address, hi, lo);
97 return 0;
98 }
99
100 if (apic != msr) {
101 pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
102 "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
103 b->cpu, apic, b->bank, b->block, b->address, hi, lo);
104 return 0;
105 }
106
107 return 1;
108};
109
95/* must be called with correct cpu affinity */ 110/* must be called with correct cpu affinity */
96/* Called via smp_call_function_single() */ 111/* Called via smp_call_function_single() */
97static void threshold_restart_bank(void *_tr) 112static void threshold_restart_bank(void *_tr)
98{ 113{
99 struct thresh_restart *tr = _tr; 114 struct thresh_restart *tr = _tr;
100 u32 mci_misc_hi, mci_misc_lo; 115 u32 hi, lo;
101 116
102 rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 117 rdmsr(tr->b->address, lo, hi);
103 118
104 if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) 119 if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
105 tr->reset = 1; /* limit cannot be lower than err count */ 120 tr->reset = 1; /* limit cannot be lower than err count */
106 121
107 if (tr->reset) { /* reset err count and overflow bit */ 122 if (tr->reset) { /* reset err count and overflow bit */
108 mci_misc_hi = 123 hi =
109 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | 124 (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
110 (THRESHOLD_MAX - tr->b->threshold_limit); 125 (THRESHOLD_MAX - tr->b->threshold_limit);
111 } else if (tr->old_limit) { /* change limit w/o reset */ 126 } else if (tr->old_limit) { /* change limit w/o reset */
112 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 127 int new_count = (hi & THRESHOLD_MAX) +
113 (tr->old_limit - tr->b->threshold_limit); 128 (tr->old_limit - tr->b->threshold_limit);
114 129
115 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 130 hi = (hi & ~MASK_ERR_COUNT_HI) |
116 (new_count & THRESHOLD_MAX); 131 (new_count & THRESHOLD_MAX);
117 } 132 }
118 133
134 if (tr->set_lvt_off) {
135 if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
136 /* set new lvt offset */
137 hi &= ~MASK_LVTOFF_HI;
138 hi |= tr->lvt_off << 20;
139 }
140 }
141
119 tr->b->interrupt_enable ? 142 tr->b->interrupt_enable ?
120 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : 143 (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
121 (mci_misc_hi &= ~MASK_INT_TYPE_HI); 144 (hi &= ~MASK_INT_TYPE_HI);
122 145
123 mci_misc_hi |= MASK_COUNT_EN_HI; 146 hi |= MASK_COUNT_EN_HI;
124 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 147 wrmsr(tr->b->address, lo, hi);
148}
149
150static void mce_threshold_block_init(struct threshold_block *b, int offset)
151{
152 struct thresh_restart tr = {
153 .b = b,
154 .set_lvt_off = 1,
155 .lvt_off = offset,
156 };
157
158 b->threshold_limit = THRESHOLD_MAX;
159 threshold_restart_bank(&tr);
160};
161
162static int setup_APIC_mce(int reserved, int new)
163{
164 if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
165 APIC_EILVT_MSG_FIX, 0))
166 return new;
167
168 return reserved;
125} 169}
126 170
127/* cpu init entry point, called from mce.c with preempt off */ 171/* cpu init entry point, called from mce.c with preempt off */
128void mce_amd_feature_init(struct cpuinfo_x86 *c) 172void mce_amd_feature_init(struct cpuinfo_x86 *c)
129{ 173{
174 struct threshold_block b;
130 unsigned int cpu = smp_processor_id(); 175 unsigned int cpu = smp_processor_id();
131 u32 low = 0, high = 0, address = 0; 176 u32 low = 0, high = 0, address = 0;
132 unsigned int bank, block; 177 unsigned int bank, block;
133 struct thresh_restart tr; 178 int offset = -1;
134 u8 lvt_off;
135 179
136 for (bank = 0; bank < NR_BANKS; ++bank) { 180 for (bank = 0; bank < NR_BANKS; ++bank) {
137 for (block = 0; block < NR_BLOCKS; ++block) { 181 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,19 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
162 if (shared_bank[bank] && c->cpu_core_id) 206 if (shared_bank[bank] && c->cpu_core_id)
163 break; 207 break;
164#endif 208#endif
165 lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, 209 offset = setup_APIC_mce(offset,
166 APIC_EILVT_MSG_FIX, 0); 210 (high & MASK_LVTOFF_HI) >> 20);
167 211
168 high &= ~MASK_LVTOFF_HI; 212 memset(&b, 0, sizeof(b));
169 high |= lvt_off << 20; 213 b.cpu = cpu;
170 wrmsr(address, low, high); 214 b.bank = bank;
171 215 b.block = block;
172 threshold_defaults.address = address; 216 b.address = address;
173 tr.b = &threshold_defaults;
174 tr.reset = 0;
175 tr.old_limit = 0;
176 threshold_restart_bank(&tr);
177 217
218 mce_threshold_block_init(&b, offset);
178 mce_threshold_vector = amd_threshold_interrupt; 219 mce_threshold_vector = amd_threshold_interrupt;
179 } 220 }
180 } 221 }
@@ -277,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
277 318
278 b->interrupt_enable = !!new; 319 b->interrupt_enable = !!new;
279 320
321 memset(&tr, 0, sizeof(tr));
280 tr.b = b; 322 tr.b = b;
281 tr.reset = 0;
282 tr.old_limit = 0;
283 323
284 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 324 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
285 325
@@ -300,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
300 if (new < 1) 340 if (new < 1)
301 new = 1; 341 new = 1;
302 342
343 memset(&tr, 0, sizeof(tr));
303 tr.old_limit = b->threshold_limit; 344 tr.old_limit = b->threshold_limit;
304 b->threshold_limit = new; 345 b->threshold_limit = new;
305 tr.b = b; 346 tr.b = b;
306 tr.reset = 0;
307 347
308 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 348 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
309 349
@@ -469,6 +509,7 @@ recurse:
469out_free: 509out_free:
470 if (b) { 510 if (b) {
471 kobject_put(&b->kobj); 511 kobject_put(&b->kobj);
512 list_del(&b->miscj);
472 kfree(b); 513 kfree(b);
473 } 514 }
474 return err; 515 return err;
@@ -487,15 +528,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
487 int i, err = 0; 528 int i, err = 0;
488 struct threshold_bank *b = NULL; 529 struct threshold_bank *b = NULL;
489 char name[32]; 530 char name[32];
490#ifdef CONFIG_SMP
491 struct cpuinfo_x86 *c = &cpu_data(cpu);
492#endif
493 531
494 sprintf(name, "threshold_bank%i", bank); 532 sprintf(name, "threshold_bank%i", bank);
495 533
496#ifdef CONFIG_SMP 534#ifdef CONFIG_SMP
497 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 535 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
498 i = cpumask_first(c->llc_shared_map); 536 i = cpumask_first(cpu_llc_shared_mask(cpu));
499 537
500 /* first core not up yet */ 538 /* first core not up yet */
501 if (cpu_data(i).cpu_core_id) 539 if (cpu_data(i).cpu_core_id)
@@ -515,7 +553,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
515 if (err) 553 if (err)
516 goto out; 554 goto out;
517 555
518 cpumask_copy(b->cpus, c->llc_shared_map); 556 cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
519 per_cpu(threshold_banks, cpu)[bank] = b; 557 per_cpu(threshold_banks, cpu)[bank] = b;
520 558
521 goto out; 559 goto out;
@@ -582,9 +620,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
582 continue; 620 continue;
583 err = threshold_create_bank(cpu, bank); 621 err = threshold_create_bank(cpu, bank);
584 if (err) 622 if (err)
585 goto out; 623 return err;
586 } 624 }
587out: 625
588 return err; 626 return err;
589} 627}
590 628
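
In the threshold_restart_bank() hunk above, changing the limit without a reset re-bases the hardware error count by adding (old_limit - new_limit), so the errors already counted are preserved against the new limit. Worked through with hypothetical values (THRESHOLD_MAX is 0xFFF, as defined at the top of mce_amd.c):

	#include <stdio.h>

	#define THRESHOLD_MAX 0xFFF

	int main(void)
	{
		/* Hypothetical snapshot: the limit was 10 and 3 errors have been counted,
		 * so the hardware count sits at THRESHOLD_MAX - 10 + 3. */
		int old_limit = 10, new_limit = 25;
		int count = THRESHOLD_MAX - old_limit + 3;

		/* Same re-basing as the "change limit w/o reset" branch. */
		int new_count = ((count & THRESHOLD_MAX) + (old_limit - new_limit))
				& THRESHOLD_MAX;

		printf("errors already counted: %d\n",
		       new_count - (THRESHOLD_MAX - new_limit));	/* prints 3 */
		return 0;
	}
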
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 6fcd0936194f..8694ef56459d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -130,7 +130,7 @@ void cmci_recheck(void)
130 unsigned long flags; 130 unsigned long flags;
131 int banks; 131 int banks;
132 132
133 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks)) 133 if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
134 return; 134 return;
135 local_irq_save(flags); 135 local_irq_save(flags);
136 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 136 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 169d8804a9f8..27c625178bf1 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -53,8 +53,14 @@ struct thermal_state {
53 struct _thermal_state core_power_limit; 53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle; 54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit; 55 struct _thermal_state package_power_limit;
56 struct _thermal_state core_thresh0;
57 struct _thermal_state core_thresh1;
56}; 58};
57 59
60/* Callback to handle core threshold interrupts */
61int (*platform_thermal_notify)(__u64 msr_val);
62EXPORT_SYMBOL(platform_thermal_notify);
63
58static DEFINE_PER_CPU(struct thermal_state, thermal_state); 64static DEFINE_PER_CPU(struct thermal_state, thermal_state);
59 65
60static atomic_t therm_throt_en = ATOMIC_INIT(0); 66static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -181,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level)
181 this_cpu, 187 this_cpu,
182 level == CORE_LEVEL ? "Core" : "Package", 188 level == CORE_LEVEL ? "Core" : "Package",
183 state->count); 189 state->count);
184
185 add_taint(TAINT_MACHINE_CHECK);
186 return 1; 190 return 1;
187 } 191 }
188 if (old_event) { 192 if (old_event) {
@@ -200,6 +204,22 @@ static int therm_throt_process(bool new_event, int event, int level)
200 return 0; 204 return 0;
201} 205}
202 206
207static int thresh_event_valid(int event)
208{
209 struct _thermal_state *state;
210 unsigned int this_cpu = smp_processor_id();
211 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
212 u64 now = get_jiffies_64();
213
214 state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1;
215
216 if (time_before64(now, state->next_check))
217 return 0;
218
219 state->next_check = now + CHECK_INTERVAL;
220 return 1;
221}
222
203#ifdef CONFIG_SYSFS 223#ifdef CONFIG_SYSFS
204/* Add/Remove thermal_throttle interface for CPU device: */ 224/* Add/Remove thermal_throttle interface for CPU device: */
205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, 225static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
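thresh_event_valid() is a straightforward jiffies-based throttle: each of the two core thresholds keeps a next_check timestamp, and an event is reported at most once per CHECK_INTERVAL (defined earlier in this file). The idiom on its own, with an assumed five-minute window:

#include <linux/jiffies.h>
#include <linux/types.h>

/* Assumed window; the real CHECK_INTERVAL is defined elsewhere in the file. */
#define DEMO_CHECK_INTERVAL	(300 * HZ)

struct demo_rate_state {
	u64 next_check;
};

static bool demo_event_allowed(struct demo_rate_state *st)
{
	u64 now = get_jiffies_64();

	if (time_before64(now, st->next_check))
		return false;		/* still inside the quiet period */

	st->next_check = now + DEMO_CHECK_INTERVAL;
	return true;
}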
@@ -313,32 +333,50 @@ device_initcall(thermal_throttle_init_device);
313#define PACKAGE_THROTTLED ((__u64)2 << 62) 333#define PACKAGE_THROTTLED ((__u64)2 << 62)
314#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) 334#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
315 335
336static void notify_thresholds(__u64 msr_val)
337{
338 /* check whether the interrupt handler is defined;
339 * otherwise simply return
340 */
341 if (!platform_thermal_notify)
342 return;
343
344 /* lower threshold reached */
345 if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0))
346 platform_thermal_notify(msr_val);
347 /* higher threshold reached */
348 if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1))
349 platform_thermal_notify(msr_val);
350}
351
316/* Thermal transition interrupt handler */ 352/* Thermal transition interrupt handler */
317static void intel_thermal_interrupt(void) 353static void intel_thermal_interrupt(void)
318{ 354{
319 __u64 msr_val; 355 __u64 msr_val;
320 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
321 356
322 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 357 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
323 358
359 /* Check for violation of core thermal thresholds*/
360 notify_thresholds(msr_val);
361
324 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, 362 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
325 THERMAL_THROTTLING_EVENT, 363 THERMAL_THROTTLING_EVENT,
326 CORE_LEVEL) != 0) 364 CORE_LEVEL) != 0)
327 mce_log_therm_throt_event(CORE_THROTTLED | msr_val); 365 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
328 366
329 if (cpu_has(c, X86_FEATURE_PLN)) 367 if (this_cpu_has(X86_FEATURE_PLN))
330 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, 368 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
331 POWER_LIMIT_EVENT, 369 POWER_LIMIT_EVENT,
332 CORE_LEVEL) != 0) 370 CORE_LEVEL) != 0)
333 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); 371 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
334 372
335 if (cpu_has(c, X86_FEATURE_PTS)) { 373 if (this_cpu_has(X86_FEATURE_PTS)) {
336 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 374 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
337 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, 375 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
338 THERMAL_THROTTLING_EVENT, 376 THERMAL_THROTTLING_EVENT,
339 PACKAGE_LEVEL) != 0) 377 PACKAGE_LEVEL) != 0)
340 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); 378 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
341 if (cpu_has(c, X86_FEATURE_PLN)) 379 if (this_cpu_has(X86_FEATURE_PLN))
342 if (therm_throt_process(msr_val & 380 if (therm_throt_process(msr_val &
343 PACKAGE_THERM_STATUS_POWER_LIMIT, 381 PACKAGE_THERM_STATUS_POWER_LIMIT,
344 POWER_LIMIT_EVENT, 382 POWER_LIMIT_EVENT,
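The cpu_has(c, ...) calls become this_cpu_has(...): instead of fetching struct cpuinfo_x86 for smp_processor_id() and testing a bit in it, the feature bit of the executing CPU is read through the per-CPU accessors. A purely illustrative sketch showing the two forms side by side (the WARN only documents that they agree):

#include <linux/bug.h>
#include <linux/smp.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>

static void demo_check_pln(void)
{
	/* Old style: materialise the per-CPU cpuinfo pointer first. */
	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
	bool old_way = cpu_has(c, X86_FEATURE_PLN);

	/* New style: test the current CPU's feature bit directly. */
	bool new_way = this_cpu_has(X86_FEATURE_PLN);

	WARN_ON_ONCE(old_way != new_way);
}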
@@ -350,9 +388,8 @@ static void intel_thermal_interrupt(void)
350 388
351static void unexpected_thermal_interrupt(void) 389static void unexpected_thermal_interrupt(void)
352{ 390{
353 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", 391 printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
354 smp_processor_id()); 392 smp_processor_id());
355 add_taint(TAINT_MACHINE_CHECK);
356} 393}
357 394
358static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; 395static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
@@ -405,18 +442,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
405 */ 442 */
406 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 443 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
407 444
445 h = lvtthmr_init;
408 /* 446 /*
409 * The initial value of thermal LVT entries on all APs always reads 447 * The initial value of thermal LVT entries on all APs always reads
410 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI 448 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
411 * sequence to them and LVT registers are reset to 0s except for 449 * sequence to them and LVT registers are reset to 0s except for
412 * the mask bits which are set to 1s when APs receive INIT IPI. 450 * the mask bits which are set to 1s when APs receive INIT IPI.
413 * Always restore the value that BIOS has programmed on AP based on 451 * If BIOS takes over the thermal interrupt and sets its interrupt
414 * BSP's info we saved since BIOS is always setting the same value 452 * delivery mode to SMI (not fixed), it restores the value that the
415 * for all threads/cores 453 * BIOS has programmed on AP based on BSP's info we saved since BIOS
454 * is always setting the same value for all threads/cores.
416 */ 455 */
417 apic_write(APIC_LVTTHMR, lvtthmr_init); 456 if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
457 apic_write(APIC_LVTTHMR, lvtthmr_init);
418 458
419 h = lvtthmr_init;
420 459
421 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 460 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
422 printk(KERN_DEBUG 461 printk(KERN_DEBUG
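The new platform_thermal_notify hook is a plain exported function pointer: a platform driver that cares about the programmable core-threshold interrupts assigns it and receives the raw IA32_THERM_STATUS value, rate-limited by thresh_event_valid() above. A hedged sketch of a hypothetical consumer module (the real declaration lives in a header; the extern here is only for illustration):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>

extern int (*platform_thermal_notify)(__u64 msr_val);	/* provided by therm_throt.c */

static int demo_thermal_notify(__u64 msr_val)
{
	pr_info("core thermal threshold crossed, IA32_THERM_STATUS=%#llx\n",
		(unsigned long long)msr_val);
	return 0;
}

static int __init demo_thermal_init(void)
{
	platform_thermal_notify = demo_thermal_notify;
	return 0;
}

static void __exit demo_thermal_exit(void)
{
	platform_thermal_notify = NULL;
}

module_init(demo_thermal_init);
module_exit(demo_thermal_exit);
MODULE_LICENSE("GPL");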
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index c5f59d071425..ac140c7be396 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void)
827 827
828 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) 828 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
829 return 0; 829 return 0;
830 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) 830 if (boot_cpu_data.x86 < 0xf)
831 return 0; 831 return 0;
832 /* In case some hypervisor doesn't pass SYSCFG through: */ 832 /* In case some hypervisor doesn't pass SYSCFG through: */
833 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) 833 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d28d7d03885..a71efcdbb092 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong 2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
3 * because MTRRs can span upto 40 bits (36bits on most modern x86) 3 * because MTRRs can span up to 40 bits (36bits on most modern x86)
4 */ 4 */
5#define DEBUG 5#define DEBUG
6 6
@@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void)
64 } 64 }
65} 65}
66 66
67/* Get the size of contiguous MTRR range */
68static u64 get_mtrr_size(u64 mask)
69{
70 u64 size;
71
72 mask >>= PAGE_SHIFT;
73 mask |= size_or_mask;
74 size = -mask;
75 size <<= PAGE_SHIFT;
76 return size;
77}
78
67/* 79/*
68 * Returns the effective MTRR type for the region 80 * Check and return the effective type for MTRR-MTRR type overlap.
69 * Error returns: 81 * Returns 1 if the effective type is UNCACHEABLE, else returns 0
70 * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
71 * - 0xFF - when MTRR is not enabled
72 */ 82 */
73u8 mtrr_type_lookup(u64 start, u64 end) 83static int check_type_overlap(u8 *prev, u8 *curr)
84{
85 if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
86 *prev = MTRR_TYPE_UNCACHABLE;
87 *curr = MTRR_TYPE_UNCACHABLE;
88 return 1;
89 }
90
91 if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
92 (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
93 *prev = MTRR_TYPE_WRTHROUGH;
94 *curr = MTRR_TYPE_WRTHROUGH;
95 }
96
97 if (*prev != *curr) {
98 *prev = MTRR_TYPE_UNCACHABLE;
99 *curr = MTRR_TYPE_UNCACHABLE;
100 return 1;
101 }
102
103 return 0;
104}
105
106/*
107 * Error/Semi-error returns:
108 * 0xFF - when MTRR is not enabled
109 * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
110 * corresponds only to [start:*partial_end].
111 * Caller has to lookup again for [*partial_end:end].
112 */
113static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
74{ 114{
75 int i; 115 int i;
76 u64 base, mask; 116 u64 base, mask;
77 u8 prev_match, curr_match; 117 u8 prev_match, curr_match;
78 118
119 *repeat = 0;
79 if (!mtrr_state_set) 120 if (!mtrr_state_set)
80 return 0xFF; 121 return 0xFF;
81 122
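Two helpers get factored out here. get_mtrr_size() turns a variable-range PHYSMASK into a length: after shifting to page units and OR-ing in size_or_mask (which fills the bits above the CPU's physical address width), the mask has the form ~(size_in_pages - 1), so negating it yields the size. check_type_overlap() encodes the combination rules used further down: UC wins over anything, WB combined with WT degrades to WT, and any other disagreement degrades to UC. A worked example of the size arithmetic as a stand-alone program, assuming a 36-bit physical address space and a 256 MB range:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* size_or_mask for 36 physical address bits */
	uint64_t size_or_mask = ~((1ULL << (36 - PAGE_SHIFT)) - 1);
	uint64_t mask = 0xFF0000000ULL;	/* PHYSMASK of a 256 MB variable range */
	uint64_t size;

	mask >>= PAGE_SHIFT;		/* 0x0000000000FF0000 */
	mask |= size_or_mask;		/* 0xFFFFFFFFFFFF0000 == ~(0x10000 - 1) */
	size = -mask;			/* two's complement: 0x10000 pages */
	size <<= PAGE_SHIFT;		/* 0x10000000 == 256 MB */

	printf("size = %#llx\n", (unsigned long long)size);
	return 0;
}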
@@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end)
126 167
127 start_state = ((start & mask) == (base & mask)); 168 start_state = ((start & mask) == (base & mask));
128 end_state = ((end & mask) == (base & mask)); 169 end_state = ((end & mask) == (base & mask));
129 if (start_state != end_state) 170
130 return 0xFE; 171 if (start_state != end_state) {
172 /*
173 * We have start:end spanning across an MTRR.
174 * We split the region into
175 * either
176 * (start:mtrr_end) (mtrr_end:end)
177 * or
178 * (start:mtrr_start) (mtrr_start:end)
179 * depending on kind of overlap.
180 * Return the type for first region and a pointer to
181 * the start of second region so that caller will
182 * lookup again on the second region.
183 * Note: This way we handle multiple overlaps as well.
184 */
185 if (start_state)
186 *partial_end = base + get_mtrr_size(mask);
187 else
188 *partial_end = base;
189
190 if (unlikely(*partial_end <= start)) {
191 WARN_ON(1);
192 *partial_end = start + PAGE_SIZE;
193 }
194
195 end = *partial_end - 1; /* end is inclusive */
196 *repeat = 1;
197 }
131 198
132 if ((start & mask) != (base & mask)) 199 if ((start & mask) != (base & mask))
133 continue; 200 continue;
@@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
138 continue; 205 continue;
139 } 206 }
140 207
141 if (prev_match == MTRR_TYPE_UNCACHABLE || 208 if (check_type_overlap(&prev_match, &curr_match))
142 curr_match == MTRR_TYPE_UNCACHABLE) { 209 return curr_match;
143 return MTRR_TYPE_UNCACHABLE;
144 }
145
146 if ((prev_match == MTRR_TYPE_WRBACK &&
147 curr_match == MTRR_TYPE_WRTHROUGH) ||
148 (prev_match == MTRR_TYPE_WRTHROUGH &&
149 curr_match == MTRR_TYPE_WRBACK)) {
150 prev_match = MTRR_TYPE_WRTHROUGH;
151 curr_match = MTRR_TYPE_WRTHROUGH;
152 }
153
154 if (prev_match != curr_match)
155 return MTRR_TYPE_UNCACHABLE;
156 } 210 }
157 211
158 if (mtrr_tom2) { 212 if (mtrr_tom2) {
@@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end)
166 return mtrr_state.def_type; 220 return mtrr_state.def_type;
167} 221}
168 222
223/*
224 * Returns the effective MTRR type for the region
225 * Error return:
226 * 0xFF - when MTRR is not enabled
227 */
228u8 mtrr_type_lookup(u64 start, u64 end)
229{
230 u8 type, prev_type;
231 int repeat;
232 u64 partial_end;
233
234 type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
235
236 /*
237 * Common path is with repeat = 0.
238 * However, we can have cases where [start:end] spans across some
239 * MTRR range. Do repeated lookups for that case here.
240 */
241 while (repeat) {
242 prev_type = type;
243 start = partial_end;
244 type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
245
246 if (check_type_overlap(&prev_type, &type))
247 return type;
248 }
249
250 return type;
251}
252
169/* Get the MSR pair relating to a var range */ 253/* Get the MSR pair relating to a var range */
170static void 254static void
171get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 255get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
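The net effect for callers is that the 0xFE "range not entirely covered" error disappears: mtrr_type_lookup() now splits a request that straddles an MTRR boundary internally, repeating __mtrr_type_lookup() until the whole [start:end] range is resolved, and combines the pieces with check_type_overlap(). A hedged sketch of a caller (pat.c does something along these lines), with the only remaining special value spelled out:

#include <linux/types.h>
#include <asm/mtrr.h>

/* Purely illustrative caller. */
static u8 demo_effective_type(u64 start, u64 end)
{
	u8 type = mtrr_type_lookup(start, end);

	/*
	 * 0xFF: MTRRs are not enabled at all, so only PAT governs the
	 * mapping.  A range spanning several MTRRs no longer produces a
	 * special error; it comes back as one already-combined type.
	 */
	return type;
}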
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 01c0f3ee6cc3..929739a653d1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -45,6 +45,7 @@
45#include <linux/cpu.h> 45#include <linux/cpu.h>
46#include <linux/pci.h> 46#include <linux/pci.h>
47#include <linux/smp.h> 47#include <linux/smp.h>
48#include <linux/syscore_ops.h>
48 49
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/e820.h> 51#include <asm/e820.h>
@@ -292,14 +293,24 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
292 293
293 /* 294 /*
294 * HACK! 295 * HACK!
295 * We use this same function to initialize the mtrrs on boot. 296 *
296 * The state of the boot cpu's mtrrs has been saved, and we want 297 * We use this same function to initialize the mtrrs during boot,
297 * to replicate across all the APs. 298 * resume, runtime cpu online and on an explicit request to set a
298 * If we're doing that @reg is set to something special... 299 * specific MTRR.
300 *
301 * During boot or suspend, the state of the boot cpu's mtrrs has been
302 * saved, and we want to replicate that across all the cpus that come
303 * online (either at the end of boot or resume or during a runtime cpu
304 * online). If we're doing that, @reg is set to something special and on
305 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
306 * is unnecessary if at this point we are still on the cpu that started
307 * the boot/resume sequence. But there is no guarantee that we are still
308 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
309 * sure that we are in sync with everyone else.
299 */ 310 */
300 if (reg != ~0U) 311 if (reg != ~0U)
301 mtrr_if->set(reg, base, size, type); 312 mtrr_if->set(reg, base, size, type);
302 else if (!mtrr_aps_delayed_init) 313 else
303 mtrr_if->set_all(); 314 mtrr_if->set_all();
304 315
305 /* Wait for the others */ 316 /* Wait for the others */
@@ -630,7 +641,7 @@ struct mtrr_value {
630 641
631static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; 642static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
632 643
633static int mtrr_save(struct sys_device *sysdev, pm_message_t state) 644static int mtrr_save(void)
634{ 645{
635 int i; 646 int i;
636 647
@@ -642,7 +653,7 @@ static int mtrr_save(struct sys_device *sysdev, pm_message_t state)
642 return 0; 653 return 0;
643} 654}
644 655
645static int mtrr_restore(struct sys_device *sysdev) 656static void mtrr_restore(void)
646{ 657{
647 int i; 658 int i;
648 659
@@ -653,12 +664,11 @@ static int mtrr_restore(struct sys_device *sysdev)
653 mtrr_value[i].ltype); 664 mtrr_value[i].ltype);
654 } 665 }
655 } 666 }
656 return 0;
657} 667}
658 668
659 669
660 670
661static struct sysdev_driver mtrr_sysdev_driver = { 671static struct syscore_ops mtrr_syscore_ops = {
662 .suspend = mtrr_save, 672 .suspend = mtrr_save,
663 .resume = mtrr_restore, 673 .resume = mtrr_restore,
664}; 674};
@@ -793,13 +803,21 @@ void set_mtrr_aps_delayed_init(void)
793} 803}
794 804
795/* 805/*
796 * MTRR initialization for all AP's 806 * Delayed MTRR initialization for all AP's
797 */ 807 */
798void mtrr_aps_init(void) 808void mtrr_aps_init(void)
799{ 809{
800 if (!use_intel()) 810 if (!use_intel())
801 return; 811 return;
802 812
813 /*
814 * Check if someone has requested the delay of AP MTRR initialization,
815 * by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
816 * then we are done.
817 */
818 if (!mtrr_aps_delayed_init)
819 return;
820
803 set_mtrr(~0U, 0, 0, 0); 821 set_mtrr(~0U, 0, 0, 0);
804 mtrr_aps_delayed_init = false; 822 mtrr_aps_delayed_init = false;
805} 823}
@@ -831,7 +849,7 @@ static int __init mtrr_init_finialize(void)
831 * TBD: is there any system with such CPU which supports 849 * TBD: is there any system with such CPU which supports
832 * suspend/resume? If no, we should remove the code. 850 * suspend/resume? If no, we should remove the code.
833 */ 851 */
834 sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); 852 register_syscore_ops(&mtrr_syscore_ops);
835 853
836 return 0; 854 return 0;
837} 855}
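The sysdev suspend/resume pair becomes syscore_ops: no struct sys_device or pm_message_t argument, suspend still returns an int, resume returns void, and the callbacks run late in suspend / early in resume with interrupts disabled on a single CPU. The registration pattern in isolation:

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int demo_state_save(void)
{
	/* Called late in suspend, IRQs off, on one CPU. */
	return 0;
}

static void demo_state_restore(void)
{
	/* Called early in resume, before ordinary devices. */
}

static struct syscore_ops demo_syscore_ops = {
	.suspend = demo_state_save,
	.resume  = demo_state_restore,
};

static int __init demo_syscore_init(void)
{
	register_syscore_ops(&demo_syscore_ops);
	return 0;
}
device_initcall(demo_syscore_init);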
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 03a5b0385ad6..3a0338b4b179 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -30,6 +30,8 @@
30#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33#include <asm/smp.h>
34#include <asm/alternative.h>
33 35
34#if 0 36#if 0
35#undef wrmsrl 37#undef wrmsrl
@@ -49,7 +51,6 @@ static unsigned long
49copy_from_user_nmi(void *to, const void __user *from, unsigned long n) 51copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
50{ 52{
51 unsigned long offset, addr = (unsigned long)from; 53 unsigned long offset, addr = (unsigned long)from;
52 int type = in_nmi() ? KM_NMI : KM_IRQ0;
53 unsigned long size, len = 0; 54 unsigned long size, len = 0;
54 struct page *page; 55 struct page *page;
55 void *map; 56 void *map;
@@ -63,9 +64,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
63 offset = addr & (PAGE_SIZE - 1); 64 offset = addr & (PAGE_SIZE - 1);
64 size = min(PAGE_SIZE - offset, n - len); 65 size = min(PAGE_SIZE - offset, n - len);
65 66
66 map = kmap_atomic(page, type); 67 map = kmap_atomic(page);
67 memcpy(to, map+offset, size); 68 memcpy(to, map+offset, size);
68 kunmap_atomic(map, type); 69 kunmap_atomic(map);
69 put_page(page); 70 put_page(page);
70 71
71 len += size; 72 len += size;
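The kmap_atomic() change is purely an API update: the KM_NMI/KM_IRQ0 slot argument is gone, atomic mappings now stack implicitly, and kunmap_atomic() takes just the mapped address. The copy step reduced to its essentials:

#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/string.h>

/* Copy up to one page of data out of a (possibly highmem) page. */
static void demo_copy_out(struct page *page, void *dst, size_t len)
{
	void *map = kmap_atomic(page);		/* no KM_* slot argument any more */

	memcpy(dst, map, min_t(size_t, len, PAGE_SIZE));
	kunmap_atomic(map);			/* unmap by address */
}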
@@ -94,6 +95,8 @@ struct amd_nb {
94 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 95 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
95}; 96};
96 97
98struct intel_percore;
99
97#define MAX_LBR_ENTRIES 16 100#define MAX_LBR_ENTRIES 16
98 101
99struct cpu_hw_events { 102struct cpu_hw_events {
@@ -129,6 +132,13 @@ struct cpu_hw_events {
129 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 132 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
130 133
131 /* 134 /*
135 * Intel percore register state.
136 * Coordinate shared resources between HT threads.
137 */
138 int percore_used; /* Used by this CPU? */
139 struct intel_percore *per_core;
140
141 /*
132 * AMD specific bits 142 * AMD specific bits
133 */ 143 */
134 struct amd_nb *amd_nb; 144 struct amd_nb *amd_nb;
@@ -167,7 +177,7 @@ struct cpu_hw_events {
167/* 177/*
168 * Constraint on the Event code + UMask 178 * Constraint on the Event code + UMask
169 */ 179 */
170#define PEBS_EVENT_CONSTRAINT(c, n) \ 180#define INTEL_UEVENT_CONSTRAINT(c, n) \
171 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) 181 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
172 182
173#define EVENT_CONSTRAINT_END \ 183#define EVENT_CONSTRAINT_END \
@@ -176,6 +186,28 @@ struct cpu_hw_events {
176#define for_each_event_constraint(e, c) \ 186#define for_each_event_constraint(e, c) \
177 for ((e) = (c); (e)->weight; (e)++) 187 for ((e) = (c); (e)->weight; (e)++)
178 188
189/*
190 * Extra registers for specific events.
191 * Some events need large masks and require external MSRs.
192 * Define a mapping to these extra registers.
193 */
194struct extra_reg {
195 unsigned int event;
196 unsigned int msr;
197 u64 config_mask;
198 u64 valid_mask;
199};
200
201#define EVENT_EXTRA_REG(e, ms, m, vm) { \
202 .event = (e), \
203 .msr = (ms), \
204 .config_mask = (m), \
205 .valid_mask = (vm), \
206 }
207#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
208 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
209#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
210
179union perf_capabilities { 211union perf_capabilities {
180 struct { 212 struct {
181 u64 lbr_format : 6; 213 u64 lbr_format : 6;
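The extra_reg machinery lets an event description name an auxiliary MSR whose contents come from the new attr.config1 field, with valid_mask bounding what user space may program (OFFCORE_RESPONSE-style events are the intended users). A sketch of how a PMU description would populate such a table using the macros above; the event code, MSR number and mask below are placeholders, not asserted hardware values:

/* Lives next to the definitions above, inside perf_event.c. */
static struct extra_reg demo_extra_regs[] = {
	INTEL_EVENT_EXTRA_REG(0xb7, 0x1a6, 0xffff),	/* event, extra MSR, valid bits */
	EVENT_EXTRA_END
};

/* Selected when the matching PMU is detected, e.g.: */
/*	x86_pmu.extra_regs = demo_extra_regs;         */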
@@ -220,6 +252,7 @@ struct x86_pmu {
220 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 252 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
221 struct perf_event *event); 253 struct perf_event *event);
222 struct event_constraint *event_constraints; 254 struct event_constraint *event_constraints;
255 struct event_constraint *percore_constraints;
223 void (*quirks)(void); 256 void (*quirks)(void);
224 int perfctr_second_write; 257 int perfctr_second_write;
225 258
@@ -238,6 +271,7 @@ struct x86_pmu {
238 * Intel DebugStore bits 271 * Intel DebugStore bits
239 */ 272 */
240 int bts, pebs; 273 int bts, pebs;
274 int bts_active, pebs_active;
241 int pebs_record_size; 275 int pebs_record_size;
242 void (*drain_pebs)(struct pt_regs *regs); 276 void (*drain_pebs)(struct pt_regs *regs);
243 struct event_constraint *pebs_constraints; 277 struct event_constraint *pebs_constraints;
@@ -247,6 +281,11 @@ struct x86_pmu {
247 */ 281 */
248 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ 282 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
249 int lbr_nr; /* hardware stack size */ 283 int lbr_nr; /* hardware stack size */
284
285 /*
286 * Extra registers for events
287 */
288 struct extra_reg *extra_regs;
250}; 289};
251 290
252static struct x86_pmu x86_pmu __read_mostly; 291static struct x86_pmu x86_pmu __read_mostly;
@@ -271,6 +310,10 @@ static u64 __read_mostly hw_cache_event_ids
271 [PERF_COUNT_HW_CACHE_MAX] 310 [PERF_COUNT_HW_CACHE_MAX]
272 [PERF_COUNT_HW_CACHE_OP_MAX] 311 [PERF_COUNT_HW_CACHE_OP_MAX]
273 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 312 [PERF_COUNT_HW_CACHE_RESULT_MAX];
313static u64 __read_mostly hw_cache_extra_regs
314 [PERF_COUNT_HW_CACHE_MAX]
315 [PERF_COUNT_HW_CACHE_OP_MAX]
316 [PERF_COUNT_HW_CACHE_RESULT_MAX];
274 317
275/* 318/*
276 * Propagate event elapsed time into the generic event. 319 * Propagate event elapsed time into the generic event.
@@ -298,7 +341,7 @@ x86_perf_event_update(struct perf_event *event)
298 */ 341 */
299again: 342again:
300 prev_raw_count = local64_read(&hwc->prev_count); 343 prev_raw_count = local64_read(&hwc->prev_count);
301 rdmsrl(hwc->event_base + idx, new_raw_count); 344 rdmsrl(hwc->event_base, new_raw_count);
302 345
303 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 346 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
304 new_raw_count) != prev_raw_count) 347 new_raw_count) != prev_raw_count)
@@ -321,6 +364,55 @@ again:
321 return new_raw_count; 364 return new_raw_count;
322} 365}
323 366
367static inline int x86_pmu_addr_offset(int index)
368{
369 int offset;
370
371 /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
372 alternative_io(ASM_NOP2,
373 "shll $1, %%eax",
374 X86_FEATURE_PERFCTR_CORE,
375 "=a" (offset),
376 "a" (index));
377
378 return offset;
379}
380
381static inline unsigned int x86_pmu_config_addr(int index)
382{
383 return x86_pmu.eventsel + x86_pmu_addr_offset(index);
384}
385
386static inline unsigned int x86_pmu_event_addr(int index)
387{
388 return x86_pmu.perfctr + x86_pmu_addr_offset(index);
389}
390
391/*
392 * Find and validate any extra registers to set up.
393 */
394static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
395{
396 struct extra_reg *er;
397
398 event->hw.extra_reg = 0;
399 event->hw.extra_config = 0;
400
401 if (!x86_pmu.extra_regs)
402 return 0;
403
404 for (er = x86_pmu.extra_regs; er->msr; er++) {
405 if (er->event != (config & er->config_mask))
406 continue;
407 if (event->attr.config1 & ~er->valid_mask)
408 return -EINVAL;
409 event->hw.extra_reg = er->msr;
410 event->hw.extra_config = event->attr.config1;
411 break;
412 }
413 return 0;
414}
415
324static atomic_t active_events; 416static atomic_t active_events;
325static DEFINE_MUTEX(pmc_reserve_mutex); 417static DEFINE_MUTEX(pmc_reserve_mutex);
326 418
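x86_pmu_addr_offset() exists because AMD parts with PERFCTR_CORE interleave their event-select and counter MSRs with a stride of two; alternative_io() patches the "shll $1, %eax" in at boot only on such CPUs, so the common path pays no runtime branch. A plain-C equivalent of what the patched code computes, using an ordinary feature test instead of runtime patching:

#include <asm/cpufeature.h>

static inline int demo_pmu_addr_offset(int index)
{
	if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
		return index << 1;	/* MSRs are spaced two apart */

	return index;			/* legacy layout: contiguous MSRs */
}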
@@ -330,16 +422,13 @@ static bool reserve_pmc_hardware(void)
330{ 422{
331 int i; 423 int i;
332 424
333 if (nmi_watchdog == NMI_LOCAL_APIC)
334 disable_lapic_nmi_watchdog();
335
336 for (i = 0; i < x86_pmu.num_counters; i++) { 425 for (i = 0; i < x86_pmu.num_counters; i++) {
337 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 426 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
338 goto perfctr_fail; 427 goto perfctr_fail;
339 } 428 }
340 429
341 for (i = 0; i < x86_pmu.num_counters; i++) { 430 for (i = 0; i < x86_pmu.num_counters; i++) {
342 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 431 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
343 goto eventsel_fail; 432 goto eventsel_fail;
344 } 433 }
345 434
@@ -347,16 +436,13 @@ static bool reserve_pmc_hardware(void)
347 436
348eventsel_fail: 437eventsel_fail:
349 for (i--; i >= 0; i--) 438 for (i--; i >= 0; i--)
350 release_evntsel_nmi(x86_pmu.eventsel + i); 439 release_evntsel_nmi(x86_pmu_config_addr(i));
351 440
352 i = x86_pmu.num_counters; 441 i = x86_pmu.num_counters;
353 442
354perfctr_fail: 443perfctr_fail:
355 for (i--; i >= 0; i--) 444 for (i--; i >= 0; i--)
356 release_perfctr_nmi(x86_pmu.perfctr + i); 445 release_perfctr_nmi(x86_pmu_event_addr(i));
357
358 if (nmi_watchdog == NMI_LOCAL_APIC)
359 enable_lapic_nmi_watchdog();
360 446
361 return false; 447 return false;
362} 448}
@@ -366,12 +452,9 @@ static void release_pmc_hardware(void)
366 int i; 452 int i;
367 453
368 for (i = 0; i < x86_pmu.num_counters; i++) { 454 for (i = 0; i < x86_pmu.num_counters; i++) {
369 release_perfctr_nmi(x86_pmu.perfctr + i); 455 release_perfctr_nmi(x86_pmu_event_addr(i));
370 release_evntsel_nmi(x86_pmu.eventsel + i); 456 release_evntsel_nmi(x86_pmu_config_addr(i));
371 } 457 }
372
373 if (nmi_watchdog == NMI_LOCAL_APIC)
374 enable_lapic_nmi_watchdog();
375} 458}
376 459
377#else 460#else
@@ -381,7 +464,64 @@ static void release_pmc_hardware(void) {}
381 464
382#endif 465#endif
383 466
384static int reserve_ds_buffers(void); 467static bool check_hw_exists(void)
468{
469 u64 val, val_new = 0;
470 int i, reg, ret = 0;
471
472 /*
473 * Check to see if the BIOS enabled any of the counters, if so
474 * complain and bail.
475 */
476 for (i = 0; i < x86_pmu.num_counters; i++) {
477 reg = x86_pmu_config_addr(i);
478 ret = rdmsrl_safe(reg, &val);
479 if (ret)
480 goto msr_fail;
481 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
482 goto bios_fail;
483 }
484
485 if (x86_pmu.num_counters_fixed) {
486 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
487 ret = rdmsrl_safe(reg, &val);
488 if (ret)
489 goto msr_fail;
490 for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
491 if (val & (0x03 << i*4))
492 goto bios_fail;
493 }
494 }
495
496 /*
497 * Now write a value and read it back to see if it matches,
498 * this is needed to detect certain hardware emulators (qemu/kvm)
499 * that don't trap on the MSR access and always return 0s.
500 */
501 val = 0xabcdUL;
502 ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
503 ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
504 if (ret || val != val_new)
505 goto msr_fail;
506
507 return true;
508
509bios_fail:
510 /*
511 * We still allow the PMU driver to operate:
512 */
513 printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
514 printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
515
516 return true;
517
518msr_fail:
519 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
520
521 return false;
522}
523
524static void reserve_ds_buffers(void);
385static void release_ds_buffers(void); 525static void release_ds_buffers(void);
386 526
387static void hw_perf_event_destroy(struct perf_event *event) 527static void hw_perf_event_destroy(struct perf_event *event)
@@ -399,8 +539,9 @@ static inline int x86_pmu_initialized(void)
399} 539}
400 540
401static inline int 541static inline int
402set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) 542set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
403{ 543{
544 struct perf_event_attr *attr = &event->attr;
404 unsigned int cache_type, cache_op, cache_result; 545 unsigned int cache_type, cache_op, cache_result;
405 u64 config, val; 546 u64 config, val;
406 547
@@ -427,8 +568,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
427 return -EINVAL; 568 return -EINVAL;
428 569
429 hwc->config |= val; 570 hwc->config |= val;
430 571 attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
431 return 0; 572 return x86_pmu_extra_regs(val, event);
432} 573}
433 574
434static int x86_setup_perfctr(struct perf_event *event) 575static int x86_setup_perfctr(struct perf_event *event)
@@ -437,7 +578,7 @@ static int x86_setup_perfctr(struct perf_event *event)
437 struct hw_perf_event *hwc = &event->hw; 578 struct hw_perf_event *hwc = &event->hw;
438 u64 config; 579 u64 config;
439 580
440 if (!hwc->sample_period) { 581 if (!is_sampling_event(event)) {
441 hwc->sample_period = x86_pmu.max_period; 582 hwc->sample_period = x86_pmu.max_period;
442 hwc->last_period = hwc->sample_period; 583 hwc->last_period = hwc->sample_period;
443 local64_set(&hwc->period_left, hwc->sample_period); 584 local64_set(&hwc->period_left, hwc->sample_period);
@@ -452,11 +593,15 @@ static int x86_setup_perfctr(struct perf_event *event)
452 return -EOPNOTSUPP; 593 return -EOPNOTSUPP;
453 } 594 }
454 595
596 /*
597 * Do not allow config1 (extended registers) to propagate,
598 * there's no sane user-space generalization yet:
599 */
455 if (attr->type == PERF_TYPE_RAW) 600 if (attr->type == PERF_TYPE_RAW)
456 return 0; 601 return 0;
457 602
458 if (attr->type == PERF_TYPE_HW_CACHE) 603 if (attr->type == PERF_TYPE_HW_CACHE)
459 return set_ext_hw_attr(hwc, attr); 604 return set_ext_hw_attr(hwc, event);
460 605
461 if (attr->config >= x86_pmu.max_events) 606 if (attr->config >= x86_pmu.max_events)
462 return -EINVAL; 607 return -EINVAL;
@@ -475,10 +620,10 @@ static int x86_setup_perfctr(struct perf_event *event)
475 /* 620 /*
476 * Branch tracing: 621 * Branch tracing:
477 */ 622 */
478 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 623 if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
479 (hwc->sample_period == 1)) { 624 !attr->freq && hwc->sample_period == 1) {
480 /* BTS is not supported by this architecture. */ 625 /* BTS is not supported by this architecture. */
481 if (!x86_pmu.bts) 626 if (!x86_pmu.bts_active)
482 return -EOPNOTSUPP; 627 return -EOPNOTSUPP;
483 628
484 /* BTS is currently only allowed for user-mode. */ 629 /* BTS is currently only allowed for user-mode. */
@@ -497,12 +642,13 @@ static int x86_pmu_hw_config(struct perf_event *event)
497 int precise = 0; 642 int precise = 0;
498 643
499 /* Support for constant skid */ 644 /* Support for constant skid */
500 if (x86_pmu.pebs) 645 if (x86_pmu.pebs_active) {
501 precise++; 646 precise++;
502 647
503 /* Support for IP fixup */ 648 /* Support for IP fixup */
504 if (x86_pmu.lbr_nr) 649 if (x86_pmu.lbr_nr)
505 precise++; 650 precise++;
651 }
506 652
507 if (event->attr.precise_ip > precise) 653 if (event->attr.precise_ip > precise)
508 return -EOPNOTSUPP; 654 return -EOPNOTSUPP;
@@ -531,7 +677,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
531/* 677/*
532 * Setup the hardware configuration for a given attr_type 678 * Setup the hardware configuration for a given attr_type
533 */ 679 */
534static int __hw_perf_event_init(struct perf_event *event) 680static int __x86_pmu_event_init(struct perf_event *event)
535{ 681{
536 int err; 682 int err;
537 683
@@ -544,11 +690,8 @@ static int __hw_perf_event_init(struct perf_event *event)
544 if (atomic_read(&active_events) == 0) { 690 if (atomic_read(&active_events) == 0) {
545 if (!reserve_pmc_hardware()) 691 if (!reserve_pmc_hardware())
546 err = -EBUSY; 692 err = -EBUSY;
547 else { 693 else
548 err = reserve_ds_buffers(); 694 reserve_ds_buffers();
549 if (err)
550 release_pmc_hardware();
551 }
552 } 695 }
553 if (!err) 696 if (!err)
554 atomic_inc(&active_events); 697 atomic_inc(&active_events);
@@ -576,15 +719,15 @@ static void x86_pmu_disable_all(void)
576 719
577 if (!test_bit(idx, cpuc->active_mask)) 720 if (!test_bit(idx, cpuc->active_mask))
578 continue; 721 continue;
579 rdmsrl(x86_pmu.eventsel + idx, val); 722 rdmsrl(x86_pmu_config_addr(idx), val);
580 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) 723 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
581 continue; 724 continue;
582 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; 725 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
583 wrmsrl(x86_pmu.eventsel + idx, val); 726 wrmsrl(x86_pmu_config_addr(idx), val);
584 } 727 }
585} 728}
586 729
587void hw_perf_disable(void) 730static void x86_pmu_disable(struct pmu *pmu)
588{ 731{
589 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 732 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
590 733
@@ -601,25 +744,30 @@ void hw_perf_disable(void)
601 x86_pmu.disable_all(); 744 x86_pmu.disable_all();
602} 745}
603 746
747static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
748 u64 enable_mask)
749{
750 if (hwc->extra_reg)
751 wrmsrl(hwc->extra_reg, hwc->extra_config);
752 wrmsrl(hwc->config_base, hwc->config | enable_mask);
753}
754
604static void x86_pmu_enable_all(int added) 755static void x86_pmu_enable_all(int added)
605{ 756{
606 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 757 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
607 int idx; 758 int idx;
608 759
609 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 760 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
610 struct perf_event *event = cpuc->events[idx]; 761 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
611 u64 val;
612 762
613 if (!test_bit(idx, cpuc->active_mask)) 763 if (!test_bit(idx, cpuc->active_mask))
614 continue; 764 continue;
615 765
616 val = event->hw.config; 766 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
617 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
618 wrmsrl(x86_pmu.eventsel + idx, val);
619 } 767 }
620} 768}
621 769
622static const struct pmu pmu; 770static struct pmu pmu;
623 771
624static inline int is_x86_event(struct perf_event *event) 772static inline int is_x86_event(struct perf_event *event)
625{ 773{
@@ -780,15 +928,10 @@ static inline void x86_assign_hw_event(struct perf_event *event,
780 hwc->event_base = 0; 928 hwc->event_base = 0;
781 } else if (hwc->idx >= X86_PMC_IDX_FIXED) { 929 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
782 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 930 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
783 /* 931 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
784 * We set it so that event_base + idx in wrmsr/rdmsr maps to
785 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
786 */
787 hwc->event_base =
788 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
789 } else { 932 } else {
790 hwc->config_base = x86_pmu.eventsel; 933 hwc->config_base = x86_pmu_config_addr(hwc->idx);
791 hwc->event_base = x86_pmu.perfctr; 934 hwc->event_base = x86_pmu_event_addr(hwc->idx);
792 } 935 }
793} 936}
794 937
@@ -801,10 +944,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
801 hwc->last_tag == cpuc->tags[i]; 944 hwc->last_tag == cpuc->tags[i];
802} 945}
803 946
804static int x86_pmu_start(struct perf_event *event); 947static void x86_pmu_start(struct perf_event *event, int flags);
805static void x86_pmu_stop(struct perf_event *event); 948static void x86_pmu_stop(struct perf_event *event, int flags);
806 949
807void hw_perf_enable(void) 950static void x86_pmu_enable(struct pmu *pmu)
808{ 951{
809 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 952 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
810 struct perf_event *event; 953 struct perf_event *event;
@@ -840,7 +983,14 @@ void hw_perf_enable(void)
840 match_prev_assignment(hwc, cpuc, i)) 983 match_prev_assignment(hwc, cpuc, i))
841 continue; 984 continue;
842 985
843 x86_pmu_stop(event); 986 /*
987 * Ensure we don't accidentally enable a stopped
988 * counter simply because we rescheduled.
989 */
990 if (hwc->state & PERF_HES_STOPPED)
991 hwc->state |= PERF_HES_ARCH;
992
993 x86_pmu_stop(event, PERF_EF_UPDATE);
844 } 994 }
845 995
846 for (i = 0; i < cpuc->n_events; i++) { 996 for (i = 0; i < cpuc->n_events; i++) {
@@ -852,7 +1002,10 @@ void hw_perf_enable(void)
852 else if (i < n_running) 1002 else if (i < n_running)
853 continue; 1003 continue;
854 1004
855 x86_pmu_start(event); 1005 if (hwc->state & PERF_HES_ARCH)
1006 continue;
1007
1008 x86_pmu_start(event, PERF_EF_RELOAD);
856 } 1009 }
857 cpuc->n_added = 0; 1010 cpuc->n_added = 0;
858 perf_events_lapic_init(); 1011 perf_events_lapic_init();
@@ -864,17 +1017,11 @@ void hw_perf_enable(void)
864 x86_pmu.enable_all(added); 1017 x86_pmu.enable_all(added);
865} 1018}
866 1019
867static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
868 u64 enable_mask)
869{
870 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
871}
872
873static inline void x86_pmu_disable_event(struct perf_event *event) 1020static inline void x86_pmu_disable_event(struct perf_event *event)
874{ 1021{
875 struct hw_perf_event *hwc = &event->hw; 1022 struct hw_perf_event *hwc = &event->hw;
876 1023
877 wrmsrl(hwc->config_base + hwc->idx, hwc->config); 1024 wrmsrl(hwc->config_base, hwc->config);
878} 1025}
879 1026
880static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1027static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -927,7 +1074,7 @@ x86_perf_event_set_period(struct perf_event *event)
927 */ 1074 */
928 local64_set(&hwc->prev_count, (u64)-left); 1075 local64_set(&hwc->prev_count, (u64)-left);
929 1076
930 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); 1077 wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
931 1078
932 /* 1079 /*
933 * Due to erratum on certain cpu we need 1080
@@ -935,7 +1082,7 @@ x86_perf_event_set_period(struct perf_event *event)
935 * is updated properly 1082 * is updated properly
936 */ 1083 */
937 if (x86_pmu.perfctr_second_write) { 1084 if (x86_pmu.perfctr_second_write) {
938 wrmsrl(hwc->event_base + idx, 1085 wrmsrl(hwc->event_base,
939 (u64)(-left) & x86_pmu.cntval_mask); 1086 (u64)(-left) & x86_pmu.cntval_mask);
940 } 1087 }
941 1088
@@ -946,22 +1093,18 @@ x86_perf_event_set_period(struct perf_event *event)
946 1093
947static void x86_pmu_enable_event(struct perf_event *event) 1094static void x86_pmu_enable_event(struct perf_event *event)
948{ 1095{
949 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1096 if (__this_cpu_read(cpu_hw_events.enabled))
950 if (cpuc->enabled)
951 __x86_pmu_enable_event(&event->hw, 1097 __x86_pmu_enable_event(&event->hw,
952 ARCH_PERFMON_EVENTSEL_ENABLE); 1098 ARCH_PERFMON_EVENTSEL_ENABLE);
953} 1099}
954 1100
955/* 1101/*
956 * activate a single event 1102 * Add a single event to the PMU.
957 * 1103 *
958 * The event is added to the group of enabled events 1104 * The event is added to the group of enabled events
959 * but only if it can be scheduled with existing events. 1105 * but only if it can be scheduled with existing events.
960 *
961 * Called with PMU disabled. If successful and return value 1,
962 * then guaranteed to call perf_enable() and hw_perf_enable()
963 */ 1106 */
964static int x86_pmu_enable(struct perf_event *event) 1107static int x86_pmu_add(struct perf_event *event, int flags)
965{ 1108{
966 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1109 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
967 struct hw_perf_event *hwc; 1110 struct hw_perf_event *hwc;
@@ -970,58 +1113,67 @@ static int x86_pmu_enable(struct perf_event *event)
970 1113
971 hwc = &event->hw; 1114 hwc = &event->hw;
972 1115
1116 perf_pmu_disable(event->pmu);
973 n0 = cpuc->n_events; 1117 n0 = cpuc->n_events;
974 n = collect_events(cpuc, event, false); 1118 ret = n = collect_events(cpuc, event, false);
975 if (n < 0) 1119 if (ret < 0)
976 return n; 1120 goto out;
1121
1122 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1123 if (!(flags & PERF_EF_START))
1124 hwc->state |= PERF_HES_ARCH;
977 1125
978 /* 1126 /*
979 * If group events scheduling transaction was started, 1127 * If group events scheduling transaction was started,
980 * skip the schedulability test here, it will be peformed 1128 * skip the schedulability test here, it will be performed
981 * at commit time(->commit_txn) as a whole 1129 * at commit time (->commit_txn) as a whole
982 */ 1130 */
983 if (cpuc->group_flag & PERF_EVENT_TXN) 1131 if (cpuc->group_flag & PERF_EVENT_TXN)
984 goto out; 1132 goto done_collect;
985 1133
986 ret = x86_pmu.schedule_events(cpuc, n, assign); 1134 ret = x86_pmu.schedule_events(cpuc, n, assign);
987 if (ret) 1135 if (ret)
988 return ret; 1136 goto out;
989 /* 1137 /*
990 * copy new assignment, now we know it is possible 1138 * copy new assignment, now we know it is possible
991 * will be used by hw_perf_enable() 1139 * will be used by hw_perf_enable()
992 */ 1140 */
993 memcpy(cpuc->assign, assign, n*sizeof(int)); 1141 memcpy(cpuc->assign, assign, n*sizeof(int));
994 1142
995out: 1143done_collect:
996 cpuc->n_events = n; 1144 cpuc->n_events = n;
997 cpuc->n_added += n - n0; 1145 cpuc->n_added += n - n0;
998 cpuc->n_txn += n - n0; 1146 cpuc->n_txn += n - n0;
999 1147
1000 return 0; 1148 ret = 0;
1149out:
1150 perf_pmu_enable(event->pmu);
1151 return ret;
1001} 1152}
1002 1153
1003static int x86_pmu_start(struct perf_event *event) 1154static void x86_pmu_start(struct perf_event *event, int flags)
1004{ 1155{
1005 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1156 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1006 int idx = event->hw.idx; 1157 int idx = event->hw.idx;
1007 1158
1008 if (idx == -1) 1159 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1009 return -EAGAIN; 1160 return;
1161
1162 if (WARN_ON_ONCE(idx == -1))
1163 return;
1164
1165 if (flags & PERF_EF_RELOAD) {
1166 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1167 x86_perf_event_set_period(event);
1168 }
1169
1170 event->hw.state = 0;
1010 1171
1011 x86_perf_event_set_period(event);
1012 cpuc->events[idx] = event; 1172 cpuc->events[idx] = event;
1013 __set_bit(idx, cpuc->active_mask); 1173 __set_bit(idx, cpuc->active_mask);
1014 __set_bit(idx, cpuc->running); 1174 __set_bit(idx, cpuc->running);
1015 x86_pmu.enable(event); 1175 x86_pmu.enable(event);
1016 perf_event_update_userpage(event); 1176 perf_event_update_userpage(event);
1017
1018 return 0;
1019}
1020
1021static void x86_pmu_unthrottle(struct perf_event *event)
1022{
1023 int ret = x86_pmu_start(event);
1024 WARN_ON_ONCE(ret);
1025} 1177}
1026 1178
1027void perf_event_print_debug(void) 1179void perf_event_print_debug(void)
@@ -1057,8 +1209,8 @@ void perf_event_print_debug(void)
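Most of the churn from here on is the conversion to the reworked core PMU interface: enable/disable become add/del (put the event on or take it off the PMU) and start/stop (actually count), with per-event state tracked in hw.state. PERF_HES_STOPPED means the counter is not counting, PERF_HES_UPTODATE means its value has already been folded into the event, and PERF_HES_ARCH is the arch's note to keep an event stopped across a reschedule. The shape of a stop() implementation under this discipline, reduced to a sketch:

#include <linux/perf_event.h>

/* Sketch only: the state handling a pmu::stop() follows under the new API. */
static void demo_pmu_stop(struct perf_event *event, int flags)
{
	struct hw_perf_event *hwc = &event->hw;

	if (!(hwc->state & PERF_HES_STOPPED)) {
		/* arch hook: quiesce the hardware counter here */
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/* arch hook: fold the residual count into event->count here */
		hwc->state |= PERF_HES_UPTODATE;
	}
}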
1057 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1209 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1058 1210
1059 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1211 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1060 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1212 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1061 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1213 rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1062 1214
1063 prev_left = per_cpu(pmc_prev_left[idx], cpu); 1215 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1064 1216
@@ -1078,27 +1230,29 @@ void perf_event_print_debug(void)
1078 local_irq_restore(flags); 1230 local_irq_restore(flags);
1079} 1231}
1080 1232
1081static void x86_pmu_stop(struct perf_event *event) 1233static void x86_pmu_stop(struct perf_event *event, int flags)
1082{ 1234{
1083 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1235 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1084 struct hw_perf_event *hwc = &event->hw; 1236 struct hw_perf_event *hwc = &event->hw;
1085 int idx = hwc->idx;
1086
1087 if (!__test_and_clear_bit(idx, cpuc->active_mask))
1088 return;
1089 1237
1090 x86_pmu.disable(event); 1238 if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1091 1239 x86_pmu.disable(event);
1092 /* 1240 cpuc->events[hwc->idx] = NULL;
1093 * Drain the remaining delta count out of an event 1241 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1094 * that we are disabling: 1242 hwc->state |= PERF_HES_STOPPED;
1095 */ 1243 }
1096 x86_perf_event_update(event);
1097 1244
1098 cpuc->events[idx] = NULL; 1245 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1246 /*
1247 * Drain the remaining delta count out of an event
1248 * that we are disabling:
1249 */
1250 x86_perf_event_update(event);
1251 hwc->state |= PERF_HES_UPTODATE;
1252 }
1099} 1253}
1100 1254
1101static void x86_pmu_disable(struct perf_event *event) 1255static void x86_pmu_del(struct perf_event *event, int flags)
1102{ 1256{
1103 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1257 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1104 int i; 1258 int i;
@@ -1111,7 +1265,7 @@ static void x86_pmu_disable(struct perf_event *event)
1111 if (cpuc->group_flag & PERF_EVENT_TXN) 1265 if (cpuc->group_flag & PERF_EVENT_TXN)
1112 return; 1266 return;
1113 1267
1114 x86_pmu_stop(event); 1268 x86_pmu_stop(event, PERF_EF_UPDATE);
1115 1269
1116 for (i = 0; i < cpuc->n_events; i++) { 1270 for (i = 0; i < cpuc->n_events; i++) {
1117 if (event == cpuc->event_list[i]) { 1271 if (event == cpuc->event_list[i]) {
@@ -1134,7 +1288,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1134 struct perf_sample_data data; 1288 struct perf_sample_data data;
1135 struct cpu_hw_events *cpuc; 1289 struct cpu_hw_events *cpuc;
1136 struct perf_event *event; 1290 struct perf_event *event;
1137 struct hw_perf_event *hwc;
1138 int idx, handled = 0; 1291 int idx, handled = 0;
1139 u64 val; 1292 u64 val;
1140 1293
@@ -1142,6 +1295,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1142 1295
1143 cpuc = &__get_cpu_var(cpu_hw_events); 1296 cpuc = &__get_cpu_var(cpu_hw_events);
1144 1297
1298 /*
1299 * Some chipsets need to unmask the LVTPC in a particular spot
1300 * inside the nmi handler. As a result, the unmasking was pushed
1301 * into all the nmi handlers.
1302 *
1303 * This generic handler doesn't seem to have any issues where the
1304 * unmasking occurs so it was left at the top.
1305 */
1306 apic_write(APIC_LVTPC, APIC_DM_NMI);
1307
1145 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1308 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1146 if (!test_bit(idx, cpuc->active_mask)) { 1309 if (!test_bit(idx, cpuc->active_mask)) {
1147 /* 1310 /*
@@ -1155,7 +1318,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1155 } 1318 }
1156 1319
1157 event = cpuc->events[idx]; 1320 event = cpuc->events[idx];
1158 hwc = &event->hw;
1159 1321
1160 val = x86_perf_event_update(event); 1322 val = x86_perf_event_update(event);
1161 if (val & (1ULL << (x86_pmu.cntval_bits - 1))) 1323 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
@@ -1171,7 +1333,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1171 continue; 1333 continue;
1172 1334
1173 if (perf_event_overflow(event, 1, &data, regs)) 1335 if (perf_event_overflow(event, 1, &data, regs))
1174 x86_pmu_stop(event); 1336 x86_pmu_stop(event, 0);
1175 } 1337 }
1176 1338
1177 if (handled) 1339 if (handled)
@@ -1180,25 +1342,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1180 return handled; 1342 return handled;
1181} 1343}
1182 1344
1183void smp_perf_pending_interrupt(struct pt_regs *regs)
1184{
1185 irq_enter();
1186 ack_APIC_irq();
1187 inc_irq_stat(apic_pending_irqs);
1188 perf_event_do_pending();
1189 irq_exit();
1190}
1191
1192void set_perf_event_pending(void)
1193{
1194#ifdef CONFIG_X86_LOCAL_APIC
1195 if (!x86_pmu.apic || !x86_pmu_initialized())
1196 return;
1197
1198 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1199#endif
1200}
1201
1202void perf_events_lapic_init(void) 1345void perf_events_lapic_init(void)
1203{ 1346{
1204 if (!x86_pmu.apic || !x86_pmu_initialized()) 1347 if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1230,11 +1373,10 @@ perf_event_nmi_handler(struct notifier_block *self,
1230 1373
1231 switch (cmd) { 1374 switch (cmd) {
1232 case DIE_NMI: 1375 case DIE_NMI:
1233 case DIE_NMI_IPI:
1234 break; 1376 break;
1235 case DIE_NMIUNKNOWN: 1377 case DIE_NMIUNKNOWN:
1236 this_nmi = percpu_read(irq_stat.__nmi_count); 1378 this_nmi = percpu_read(irq_stat.__nmi_count);
1237 if (this_nmi != __get_cpu_var(pmu_nmi).marked) 1379 if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1238 /* let the kernel handle the unknown nmi */ 1380 /* let the kernel handle the unknown nmi */
1239 return NOTIFY_DONE; 1381 return NOTIFY_DONE;
1240 /* 1382 /*
@@ -1249,8 +1391,6 @@ perf_event_nmi_handler(struct notifier_block *self,
1249 return NOTIFY_DONE; 1391 return NOTIFY_DONE;
1250 } 1392 }
1251 1393
1252 apic_write(APIC_LVTPC, APIC_DM_NMI);
1253
1254 handled = x86_pmu.handle_irq(args->regs); 1394 handled = x86_pmu.handle_irq(args->regs);
1255 if (!handled) 1395 if (!handled)
1256 return NOTIFY_DONE; 1396 return NOTIFY_DONE;
@@ -1258,8 +1398,8 @@ perf_event_nmi_handler(struct notifier_block *self,
1258 this_nmi = percpu_read(irq_stat.__nmi_count); 1398 this_nmi = percpu_read(irq_stat.__nmi_count);
1259 if ((handled > 1) || 1399 if ((handled > 1) ||
1260 /* the next nmi could be a back-to-back nmi */ 1400 /* the next nmi could be a back-to-back nmi */
1261 ((__get_cpu_var(pmu_nmi).marked == this_nmi) && 1401 ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
1262 (__get_cpu_var(pmu_nmi).handled > 1))) { 1402 (__this_cpu_read(pmu_nmi.handled) > 1))) {
1263 /* 1403 /*
1264 * We could have two subsequent back-to-back nmis: The 1404 * We could have two subsequent back-to-back nmis: The
1265 * first handles more than one counter, the 2nd 1405 * first handles more than one counter, the 2nd
@@ -1270,8 +1410,8 @@ perf_event_nmi_handler(struct notifier_block *self,
1270 * handling more than one counter. We will mark the 1410 * handling more than one counter. We will mark the
1271 * next (3rd) and then drop it if unhandled. 1411 * next (3rd) and then drop it if unhandled.
1272 */ 1412 */
1273 __get_cpu_var(pmu_nmi).marked = this_nmi + 1; 1413 __this_cpu_write(pmu_nmi.marked, this_nmi + 1);
1274 __get_cpu_var(pmu_nmi).handled = handled; 1414 __this_cpu_write(pmu_nmi.handled, handled);
1275 } 1415 }
1276 1416
1277 return NOTIFY_STOP; 1417 return NOTIFY_STOP;
@@ -1280,7 +1420,7 @@ perf_event_nmi_handler(struct notifier_block *self,
1280static __read_mostly struct notifier_block perf_event_nmi_notifier = { 1420static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1281 .notifier_call = perf_event_nmi_handler, 1421 .notifier_call = perf_event_nmi_handler,
1282 .next = NULL, 1422 .next = NULL,
1283 .priority = 1 1423 .priority = NMI_LOCAL_LOW_PRIOR,
1284}; 1424};
1285 1425
1286static struct event_constraint unconstrained; 1426static struct event_constraint unconstrained;
@@ -1353,7 +1493,7 @@ static void __init pmu_check_apic(void)
1353 pr_info("no hardware sampling interrupt available.\n"); 1493 pr_info("no hardware sampling interrupt available.\n");
1354} 1494}
1355 1495
1356void __init init_hw_perf_events(void) 1496static int __init init_hw_perf_events(void)
1357{ 1497{
1358 struct event_constraint *c; 1498 struct event_constraint *c;
1359 int err; 1499 int err;
@@ -1368,15 +1508,19 @@ void __init init_hw_perf_events(void)
1368 err = amd_pmu_init(); 1508 err = amd_pmu_init();
1369 break; 1509 break;
1370 default: 1510 default:
1371 return; 1511 return 0;
1372 } 1512 }
1373 if (err != 0) { 1513 if (err != 0) {
1374 pr_cont("no PMU driver, software events only.\n"); 1514 pr_cont("no PMU driver, software events only.\n");
1375 return; 1515 return 0;
1376 } 1516 }
1377 1517
1378 pmu_check_apic(); 1518 pmu_check_apic();
1379 1519
1520 /* sanity check that the hardware exists or is emulated */
1521 if (!check_hw_exists())
1522 return 0;
1523
1380 pr_cont("%s PMU driver.\n", x86_pmu.name); 1524 pr_cont("%s PMU driver.\n", x86_pmu.name);
1381 1525
1382 if (x86_pmu.quirks) 1526 if (x86_pmu.quirks)
@@ -1388,7 +1532,6 @@ void __init init_hw_perf_events(void)
1388 x86_pmu.num_counters = X86_PMC_MAX_GENERIC; 1532 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1389 } 1533 }
1390 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; 1534 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1391 perf_max_events = x86_pmu.num_counters;
1392 1535
1393 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { 1536 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1394 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", 1537 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
@@ -1424,8 +1567,12 @@ void __init init_hw_perf_events(void)
1424 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); 1567 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1425 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); 1568 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1426 1569
1570 perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1427 perf_cpu_notifier(x86_pmu_notifier); 1571 perf_cpu_notifier(x86_pmu_notifier);
1572
1573 return 0;
1428} 1574}
1575early_initcall(init_hw_perf_events);
1429 1576
1430static inline void x86_pmu_read(struct perf_event *event) 1577static inline void x86_pmu_read(struct perf_event *event)
1431{ 1578{
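init_hw_perf_events() is no longer called by hand from the boot code; it registers itself via early_initcall() and, once the hardware checks pass, publishes the PMU with perf_pmu_register() under the "cpu" name and the PERF_TYPE_RAW type. The registration skeleton, with the actual callbacks elided:

#include <linux/init.h>
#include <linux/perf_event.h>

static struct pmu demo_pmu;	/* .event_init, .add, .del, ... omitted in this sketch */

static int __init demo_pmu_setup(void)
{
	/* Returning 0 even when no usable PMU is found keeps the boot going. */
	perf_pmu_register(&demo_pmu, "cpu", PERF_TYPE_RAW);
	return 0;
}
early_initcall(demo_pmu_setup);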
@@ -1437,12 +1584,11 @@ static inline void x86_pmu_read(struct perf_event *event)
1437 * Set the flag to make pmu::enable() not perform the 1584 * Set the flag to make pmu::enable() not perform the
1438 * schedulability test, it will be performed at commit time 1585 * schedulability test, it will be performed at commit time
1439 */ 1586 */
1440static void x86_pmu_start_txn(const struct pmu *pmu) 1587static void x86_pmu_start_txn(struct pmu *pmu)
1441{ 1588{
1442 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1589 perf_pmu_disable(pmu);
1443 1590 __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1444 cpuc->group_flag |= PERF_EVENT_TXN; 1591 __this_cpu_write(cpu_hw_events.n_txn, 0);
1445 cpuc->n_txn = 0;
1446} 1592}
1447 1593
1448/* 1594/*
@@ -1450,16 +1596,15 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
1450 * Clear the flag and pmu::enable() will perform the 1596 * Clear the flag and pmu::enable() will perform the
1451 * schedulability test. 1597 * schedulability test.
1452 */ 1598 */
1453static void x86_pmu_cancel_txn(const struct pmu *pmu) 1599static void x86_pmu_cancel_txn(struct pmu *pmu)
1454{ 1600{
1455 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1601 __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1456
1457 cpuc->group_flag &= ~PERF_EVENT_TXN;
1458 /* 1602 /*
1459 * Truncate the collected events. 1603 * Truncate the collected events.
1460 */ 1604 */
1461 cpuc->n_added -= cpuc->n_txn; 1605 __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1462 cpuc->n_events -= cpuc->n_txn; 1606 __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1607 perf_pmu_enable(pmu);
1463} 1608}
1464 1609
1465/* 1610/*
@@ -1467,7 +1612,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
1467 * Perform the group schedulability test as a whole 1612 * Perform the group schedulability test as a whole
1468 * Return 0 if success 1613 * Return 0 if success
1469 */ 1614 */
1470static int x86_pmu_commit_txn(const struct pmu *pmu) 1615static int x86_pmu_commit_txn(struct pmu *pmu)
1471{ 1616{
1472 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1617 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1473 int assign[X86_PMC_IDX_MAX]; 1618 int assign[X86_PMC_IDX_MAX];
@@ -1489,22 +1634,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
1489 memcpy(cpuc->assign, assign, n*sizeof(int)); 1634 memcpy(cpuc->assign, assign, n*sizeof(int));
1490 1635
1491 cpuc->group_flag &= ~PERF_EVENT_TXN; 1636 cpuc->group_flag &= ~PERF_EVENT_TXN;
1492 1637 perf_pmu_enable(pmu);
1493 return 0; 1638 return 0;
1494} 1639}
1495 1640
1496static const struct pmu pmu = {
1497 .enable = x86_pmu_enable,
1498 .disable = x86_pmu_disable,
1499 .start = x86_pmu_start,
1500 .stop = x86_pmu_stop,
1501 .read = x86_pmu_read,
1502 .unthrottle = x86_pmu_unthrottle,
1503 .start_txn = x86_pmu_start_txn,
1504 .cancel_txn = x86_pmu_cancel_txn,
1505 .commit_txn = x86_pmu_commit_txn,
1506};
1507
1508/* 1641/*
1509 * validate that we can schedule this event 1642 * validate that we can schedule this event
1510 */ 1643 */
@@ -1579,12 +1712,22 @@ out:
1579 return ret; 1712 return ret;
1580} 1713}
1581 1714
1582const struct pmu *hw_perf_event_init(struct perf_event *event) 1715static int x86_pmu_event_init(struct perf_event *event)
1583{ 1716{
1584 const struct pmu *tmp; 1717 struct pmu *tmp;
1585 int err; 1718 int err;
1586 1719
1587 err = __hw_perf_event_init(event); 1720 switch (event->attr.type) {
1721 case PERF_TYPE_RAW:
1722 case PERF_TYPE_HARDWARE:
1723 case PERF_TYPE_HW_CACHE:
1724 break;
1725
1726 default:
1727 return -ENOENT;
1728 }
1729
1730 err = __x86_pmu_event_init(event);
1588 if (!err) { 1731 if (!err) {
1589 /* 1732 /*
1590 * we temporarily connect event to its pmu 1733 * we temporarily connect event to its pmu
@@ -1604,37 +1747,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
1604 if (err) { 1747 if (err) {
1605 if (event->destroy) 1748 if (event->destroy)
1606 event->destroy(event); 1749 event->destroy(event);
1607 return ERR_PTR(err);
1608 } 1750 }
1609 1751
1610 return &pmu; 1752 return err;
1611} 1753}
1612 1754
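x86_pmu_event_init() above returns -ENOENT for attribute types it does not handle, which is the convention that lets the core walk the registered pmus until one claims the event. A minimal userspace sketch of that dispatch convention follows; the TYPE_* values and the pmus[] table are illustrative stand-ins, not the perf API.

#include <errno.h>
#include <stdio.h>

enum { TYPE_HARDWARE, TYPE_SOFTWARE };	/* illustrative event types */

struct event { int type; };

static int cpu_pmu_event_init(struct event *e)
{
	switch (e->type) {
	case TYPE_HARDWARE:
		return 0;		/* this pmu claims the event */
	default:
		return -ENOENT;		/* not ours: let the core keep looking */
	}
}

static int sw_pmu_event_init(struct event *e)
{
	return e->type == TYPE_SOFTWARE ? 0 : -ENOENT;
}

int main(void)
{
	struct event e = { TYPE_SOFTWARE };
	int (*pmus[])(struct event *) = { cpu_pmu_event_init, sw_pmu_event_init };
	unsigned int i;

	for (i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++) {
		int err = pmus[i](&e);

		if (err != -ENOENT) {
			printf("pmu %u takes the event (err=%d)\n", i, err);
			break;
		}
	}
	return 0;
}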
1613/* 1755static struct pmu pmu = {
1614 * callchain support 1756 .pmu_enable = x86_pmu_enable,
1615 */ 1757 .pmu_disable = x86_pmu_disable,
1616
1617static inline
1618void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1619{
1620 if (entry->nr < PERF_MAX_STACK_DEPTH)
1621 entry->ip[entry->nr++] = ip;
1622}
1623 1758
1624static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1759 .event_init = x86_pmu_event_init,
1625static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
1626 1760
1761 .add = x86_pmu_add,
1762 .del = x86_pmu_del,
1763 .start = x86_pmu_start,
1764 .stop = x86_pmu_stop,
1765 .read = x86_pmu_read,
1627 1766
1628static void 1767 .start_txn = x86_pmu_start_txn,
1629backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) 1768 .cancel_txn = x86_pmu_cancel_txn,
1630{ 1769 .commit_txn = x86_pmu_commit_txn,
1631 /* Ignore warnings */ 1770};
1632}
1633 1771
1634static void backtrace_warning(void *data, char *msg) 1772/*
1635{ 1773 * callchain support
1636 /* Ignore warnings */ 1774 */
1637}
1638 1775
1639static int backtrace_stack(void *data, char *name) 1776static int backtrace_stack(void *data, char *name)
1640{ 1777{
@@ -1645,24 +1782,26 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1645{ 1782{
1646 struct perf_callchain_entry *entry = data; 1783 struct perf_callchain_entry *entry = data;
1647 1784
1648 callchain_store(entry, addr); 1785 perf_callchain_store(entry, addr);
1649} 1786}
1650 1787
1651static const struct stacktrace_ops backtrace_ops = { 1788static const struct stacktrace_ops backtrace_ops = {
1652 .warning = backtrace_warning,
1653 .warning_symbol = backtrace_warning_symbol,
1654 .stack = backtrace_stack, 1789 .stack = backtrace_stack,
1655 .address = backtrace_address, 1790 .address = backtrace_address,
1656 .walk_stack = print_context_stack_bp, 1791 .walk_stack = print_context_stack_bp,
1657}; 1792};
1658 1793
1659static void 1794void
1660perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1795perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1661{ 1796{
1662 callchain_store(entry, PERF_CONTEXT_KERNEL); 1797 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1663 callchain_store(entry, regs->ip); 1798 /* TODO: We don't support guest os callchain now */
1799 return;
1800 }
1801
1802 perf_callchain_store(entry, regs->ip);
1664 1803
1665 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1804 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1666} 1805}
1667 1806
1668#ifdef CONFIG_COMPAT 1807#ifdef CONFIG_COMPAT
@@ -1689,7 +1828,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1689 if (fp < compat_ptr(regs->sp)) 1828 if (fp < compat_ptr(regs->sp))
1690 break; 1829 break;
1691 1830
1692 callchain_store(entry, frame.return_address); 1831 perf_callchain_store(entry, frame.return_address);
1693 fp = compat_ptr(frame.next_frame); 1832 fp = compat_ptr(frame.next_frame);
1694 } 1833 }
1695 return 1; 1834 return 1;
@@ -1702,19 +1841,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1702} 1841}
1703#endif 1842#endif
1704 1843
1705static void 1844void
1706perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) 1845perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1707{ 1846{
1708 struct stack_frame frame; 1847 struct stack_frame frame;
1709 const void __user *fp; 1848 const void __user *fp;
1710 1849
1711 if (!user_mode(regs)) 1850 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1712 regs = task_pt_regs(current); 1851 /* TODO: We don't support guest os callchain now */
1852 return;
1853 }
1713 1854
1714 fp = (void __user *)regs->bp; 1855 fp = (void __user *)regs->bp;
1715 1856
1716 callchain_store(entry, PERF_CONTEXT_USER); 1857 perf_callchain_store(entry, regs->ip);
1717 callchain_store(entry, regs->ip);
1718 1858
1719 if (perf_callchain_user32(regs, entry)) 1859 if (perf_callchain_user32(regs, entry))
1720 return; 1860 return;
@@ -1731,52 +1871,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1731 if ((unsigned long)fp < regs->sp) 1871 if ((unsigned long)fp < regs->sp)
1732 break; 1872 break;
1733 1873
1734 callchain_store(entry, frame.return_address); 1874 perf_callchain_store(entry, frame.return_address);
1735 fp = frame.next_frame; 1875 fp = frame.next_frame;
1736 } 1876 }
1737} 1877}
1738 1878
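perf_callchain_user() above walks the user stack by following saved frame pointers: each frame is read as a (next_frame, return_address) pair starting from regs->bp, stopping once the chain no longer moves toward older frames. Here is a userspace sketch of the same walk, assuming the binary keeps frame pointers (compile with -O0 or -fno-omit-frame-pointer); behaviour near the outermost frames varies by toolchain and libc, so the loop is bounded.

#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

static void __attribute__((noinline)) show_callchain(void)
{
	struct stack_frame *fp = __builtin_frame_address(0);
	int depth = 0;

	while (fp && depth++ < 16) {
		printf("ip = %#lx\n", fp->return_address);
		if (fp->next_frame <= fp)	/* stack grows down: stop if not older */
			break;
		fp = fp->next_frame;
	}
}

static void __attribute__((noinline)) leaf(void)   { show_callchain(); }
static void __attribute__((noinline)) middle(void) { leaf(); }

int main(void)
{
	middle();
	return 0;
}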
1739static void
1740perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1741{
1742 int is_user;
1743
1744 if (!regs)
1745 return;
1746
1747 is_user = user_mode(regs);
1748
1749 if (is_user && current->state != TASK_RUNNING)
1750 return;
1751
1752 if (!is_user)
1753 perf_callchain_kernel(regs, entry);
1754
1755 if (current->mm)
1756 perf_callchain_user(regs, entry);
1757}
1758
1759struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1760{
1761 struct perf_callchain_entry *entry;
1762
1763 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1764 /* TODO: We don't support guest os callchain now */
1765 return NULL;
1766 }
1767
1768 if (in_nmi())
1769 entry = &__get_cpu_var(pmc_nmi_entry);
1770 else
1771 entry = &__get_cpu_var(pmc_irq_entry);
1772
1773 entry->nr = 0;
1774
1775 perf_do_callchain(regs, entry);
1776
1777 return entry;
1778}
1779
1780unsigned long perf_instruction_pointer(struct pt_regs *regs) 1879unsigned long perf_instruction_pointer(struct pt_regs *regs)
1781{ 1880{
1782 unsigned long ip; 1881 unsigned long ip;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index c2897b7b4a3b..fe29c1d2219e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,7 +1,5 @@
1#ifdef CONFIG_CPU_SUP_AMD 1#ifdef CONFIG_CPU_SUP_AMD
2 2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst const u64 amd_hw_cache_event_ids 3static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX] 4 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX] 5 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -10,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids
10 [ C(L1D) ] = { 8 [ C(L1D) ] = {
11 [ C(OP_READ) ] = { 9 [ C(OP_READ) ] = {
12 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ 10 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
13 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ 11 [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
14 }, 12 },
15 [ C(OP_WRITE) ] = { 13 [ C(OP_WRITE) ] = {
16 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ 14 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
@@ -52,7 +50,7 @@ static __initconst const u64 amd_hw_cache_event_ids
52 [ C(DTLB) ] = { 50 [ C(DTLB) ] = {
53 [ C(OP_READ) ] = { 51 [ C(OP_READ) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ 52 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
 55 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DTLB Miss */ 53 [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DTLB_MISS.ALL */
56 }, 54 },
57 [ C(OP_WRITE) ] = { 55 [ C(OP_WRITE) ] = {
58 [ C(RESULT_ACCESS) ] = 0, 56 [ C(RESULT_ACCESS) ] = 0,
@@ -66,7 +64,7 @@ static __initconst const u64 amd_hw_cache_event_ids
66 [ C(ITLB) ] = { 64 [ C(ITLB) ] = {
67 [ C(OP_READ) ] = { 65 [ C(OP_READ) ] = {
 68 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */ 66 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
69 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ 67 [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
70 }, 68 },
71 [ C(OP_WRITE) ] = { 69 [ C(OP_WRITE) ] = {
72 [ C(RESULT_ACCESS) ] = -1, 70 [ C(RESULT_ACCESS) ] = -1,
@@ -98,12 +96,14 @@ static __initconst const u64 amd_hw_cache_event_ids
98 */ 96 */
99static const u64 amd_perfmon_event_map[] = 97static const u64 amd_perfmon_event_map[] =
100{ 98{
101 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, 99 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 100 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, 101 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, 102 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, 103 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, 104 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
105 [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
106 [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
107}; 107};
108 108
109static u64 amd_pmu_event_map(int hw_event) 109static u64 amd_pmu_event_map(int hw_event)
@@ -129,6 +129,11 @@ static int amd_pmu_hw_config(struct perf_event *event)
129/* 129/*
130 * AMD64 events are detected based on their event codes. 130 * AMD64 events are detected based on their event codes.
131 */ 131 */
132static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
133{
134 return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
135}
136
132static inline int amd_is_nb_event(struct hw_perf_event *hwc) 137static inline int amd_is_nb_event(struct hw_perf_event *hwc)
133{ 138{
134 return (hwc->config & 0xe0) == 0xe0; 139 return (hwc->config & 0xe0) == 0xe0;
@@ -275,17 +280,17 @@ done:
275 return &emptyconstraint; 280 return &emptyconstraint;
276} 281}
277 282
278static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) 283static struct amd_nb *amd_alloc_nb(int cpu)
279{ 284{
280 struct amd_nb *nb; 285 struct amd_nb *nb;
281 int i; 286 int i;
282 287
283 nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); 288 nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO,
289 cpu_to_node(cpu));
284 if (!nb) 290 if (!nb)
285 return NULL; 291 return NULL;
286 292
287 memset(nb, 0, sizeof(*nb)); 293 nb->nb_id = -1;
288 nb->nb_id = nb_id;
289 294
290 /* 295 /*
291 * initialize all possible NB constraints 296 * initialize all possible NB constraints
@@ -306,7 +311,7 @@ static int amd_pmu_cpu_prepare(int cpu)
306 if (boot_cpu_data.x86_max_cores < 2) 311 if (boot_cpu_data.x86_max_cores < 2)
307 return NOTIFY_OK; 312 return NOTIFY_OK;
308 313
309 cpuc->amd_nb = amd_alloc_nb(cpu, -1); 314 cpuc->amd_nb = amd_alloc_nb(cpu);
310 if (!cpuc->amd_nb) 315 if (!cpuc->amd_nb)
311 return NOTIFY_BAD; 316 return NOTIFY_BAD;
312 317
@@ -325,8 +330,6 @@ static void amd_pmu_cpu_starting(int cpu)
325 nb_id = amd_get_nb_id(cpu); 330 nb_id = amd_get_nb_id(cpu);
326 WARN_ON_ONCE(nb_id == BAD_APICID); 331 WARN_ON_ONCE(nb_id == BAD_APICID);
327 332
328 raw_spin_lock(&amd_nb_lock);
329
330 for_each_online_cpu(i) { 333 for_each_online_cpu(i) {
331 nb = per_cpu(cpu_hw_events, i).amd_nb; 334 nb = per_cpu(cpu_hw_events, i).amd_nb;
332 if (WARN_ON_ONCE(!nb)) 335 if (WARN_ON_ONCE(!nb))
@@ -341,8 +344,6 @@ static void amd_pmu_cpu_starting(int cpu)
341 344
342 cpuc->amd_nb->nb_id = nb_id; 345 cpuc->amd_nb->nb_id = nb_id;
343 cpuc->amd_nb->refcnt++; 346 cpuc->amd_nb->refcnt++;
344
345 raw_spin_unlock(&amd_nb_lock);
346} 347}
347 348
348static void amd_pmu_cpu_dead(int cpu) 349static void amd_pmu_cpu_dead(int cpu)
@@ -354,8 +355,6 @@ static void amd_pmu_cpu_dead(int cpu)
354 355
355 cpuhw = &per_cpu(cpu_hw_events, cpu); 356 cpuhw = &per_cpu(cpu_hw_events, cpu);
356 357
357 raw_spin_lock(&amd_nb_lock);
358
359 if (cpuhw->amd_nb) { 358 if (cpuhw->amd_nb) {
360 struct amd_nb *nb = cpuhw->amd_nb; 359 struct amd_nb *nb = cpuhw->amd_nb;
361 360
@@ -364,8 +363,6 @@ static void amd_pmu_cpu_dead(int cpu)
364 363
365 cpuhw->amd_nb = NULL; 364 cpuhw->amd_nb = NULL;
366 } 365 }
367
368 raw_spin_unlock(&amd_nb_lock);
369} 366}
370 367
371static __initconst const struct x86_pmu amd_pmu = { 368static __initconst const struct x86_pmu amd_pmu = {
@@ -395,13 +392,195 @@ static __initconst const struct x86_pmu amd_pmu = {
395 .cpu_dead = amd_pmu_cpu_dead, 392 .cpu_dead = amd_pmu_cpu_dead,
396}; 393};
397 394
395/* AMD Family 15h */
396
397#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
398
399#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
400#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
401#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
402#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
403#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
404#define AMD_EVENT_EX_LS 0x000000C0ULL
405#define AMD_EVENT_DE 0x000000D0ULL
406#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
407
408/*
409 * AMD family 15h event code/PMC mappings:
410 *
411 * type = event_code & 0x0F0:
412 *
413 * 0x000 FP PERF_CTL[5:3]
414 * 0x010 FP PERF_CTL[5:3]
415 * 0x020 LS PERF_CTL[5:0]
416 * 0x030 LS PERF_CTL[5:0]
417 * 0x040 DC PERF_CTL[5:0]
418 * 0x050 DC PERF_CTL[5:0]
419 * 0x060 CU PERF_CTL[2:0]
420 * 0x070 CU PERF_CTL[2:0]
421 * 0x080 IC/DE PERF_CTL[2:0]
422 * 0x090 IC/DE PERF_CTL[2:0]
423 * 0x0A0 ---
424 * 0x0B0 ---
425 * 0x0C0 EX/LS PERF_CTL[5:0]
426 * 0x0D0 DE PERF_CTL[2:0]
427 * 0x0E0 NB NB_PERF_CTL[3:0]
428 * 0x0F0 NB NB_PERF_CTL[3:0]
429 *
430 * Exceptions:
431 *
432 * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
433 * 0x003 FP PERF_CTL[3]
434 * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
435 * 0x00B FP PERF_CTL[3]
436 * 0x00D FP PERF_CTL[3]
437 * 0x023 DE PERF_CTL[2:0]
438 * 0x02D LS PERF_CTL[3]
439 * 0x02E LS PERF_CTL[3,0]
440 * 0x043 CU PERF_CTL[2:0]
441 * 0x045 CU PERF_CTL[2:0]
442 * 0x046 CU PERF_CTL[2:0]
443 * 0x054 CU PERF_CTL[2:0]
444 * 0x055 CU PERF_CTL[2:0]
445 * 0x08F IC PERF_CTL[0]
446 * 0x187 DE PERF_CTL[0]
447 * 0x188 DE PERF_CTL[0]
448 * 0x0DB EX PERF_CTL[5:0]
449 * 0x0DC LS PERF_CTL[5:0]
450 * 0x0DD LS PERF_CTL[5:0]
451 * 0x0DE LS PERF_CTL[5:0]
452 * 0x0DF LS PERF_CTL[5:0]
453 * 0x1D6 EX PERF_CTL[5:0]
454 * 0x1D8 EX PERF_CTL[5:0]
455 *
456 * (*) depending on the umask all FPU counters may be used
457 */
458
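For family 15h the 12-bit event code is split across the config word: bits [7:0] hold the low byte and bits [35:32] hold the high nibble, which is what amd_get_event_code() reassembles before the type nibble (bits 7:4 of the code) is matched against the table above. A small standalone illustration; the 0x1D8 example is one of the listed exceptions (EX, PERF_CTL[5:0]).

#include <stdio.h>
#include <stdint.h>

/* mirrors amd_get_event_code(): bits [35:32] of the config become
 * bits [11:8] of the event code, bits [7:0] pass through */
static unsigned int event_code(uint64_t config)
{
	return (unsigned int)(((config >> 24) & 0x0f00) | (config & 0x00ff));
}

int main(void)
{
	/* event 0x1D8: low byte 0xd8 in bits [7:0], high nibble 0x1 in [35:32] */
	uint64_t config = (1ULL << 32) | 0xd8;
	unsigned int code = event_code(config);

	printf("code = 0x%03x, type nibble = 0x%02x\n", code, code & 0xf0);
	/* 0x1D8 is one of the listed exceptions: EX, PERF_CTL[5:0] */
	return 0;
}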
459static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
460static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
461static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
462static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
463static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
464static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
465
466static struct event_constraint *
467amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
468{
469 struct hw_perf_event *hwc = &event->hw;
470 unsigned int event_code = amd_get_event_code(hwc);
471
472 switch (event_code & AMD_EVENT_TYPE_MASK) {
473 case AMD_EVENT_FP:
474 switch (event_code) {
475 case 0x000:
476 if (!(hwc->config & 0x0000F000ULL))
477 break;
478 if (!(hwc->config & 0x00000F00ULL))
479 break;
480 return &amd_f15_PMC3;
481 case 0x004:
482 if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
483 break;
484 return &amd_f15_PMC3;
485 case 0x003:
486 case 0x00B:
487 case 0x00D:
488 return &amd_f15_PMC3;
489 }
490 return &amd_f15_PMC53;
491 case AMD_EVENT_LS:
492 case AMD_EVENT_DC:
493 case AMD_EVENT_EX_LS:
494 switch (event_code) {
495 case 0x023:
496 case 0x043:
497 case 0x045:
498 case 0x046:
499 case 0x054:
500 case 0x055:
501 return &amd_f15_PMC20;
502 case 0x02D:
503 return &amd_f15_PMC3;
504 case 0x02E:
505 return &amd_f15_PMC30;
506 default:
507 return &amd_f15_PMC50;
508 }
509 case AMD_EVENT_CU:
510 case AMD_EVENT_IC_DE:
511 case AMD_EVENT_DE:
512 switch (event_code) {
513 case 0x08F:
514 case 0x187:
515 case 0x188:
516 return &amd_f15_PMC0;
517 case 0x0DB ... 0x0DF:
518 case 0x1D6:
519 case 0x1D8:
520 return &amd_f15_PMC50;
521 default:
522 return &amd_f15_PMC20;
523 }
524 case AMD_EVENT_NB:
525 /* not yet implemented */
526 return &emptyconstraint;
527 default:
528 return &emptyconstraint;
529 }
530}
531
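The amd_f15_PMC* constraints above encode which counters an event may use as a plain bitmask of PERF_CTL indices, e.g. 0x38 is PERF_CTL[5:3] and 0x07 is PERF_CTL[2:0]. A tiny decoder for those masks:

#include <stdio.h>

static void show(const char *name, unsigned int mask)
{
	int i;

	printf("%s (0x%02x): PERF_CTL", name, mask);
	for (i = 0; i < 6; i++)
		if (mask & (1u << i))
			printf(" %d", i);
	printf("\n");
}

int main(void)
{
	show("amd_f15_PMC20", 0x07);	/* counters 0-2 */
	show("amd_f15_PMC53", 0x38);	/* counters 3-5 */
	show("amd_f15_PMC50", 0x3f);	/* counters 0-5 */
	return 0;
}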
532static __initconst const struct x86_pmu amd_pmu_f15h = {
533 .name = "AMD Family 15h",
534 .handle_irq = x86_pmu_handle_irq,
535 .disable_all = x86_pmu_disable_all,
536 .enable_all = x86_pmu_enable_all,
537 .enable = x86_pmu_enable_event,
538 .disable = x86_pmu_disable_event,
539 .hw_config = amd_pmu_hw_config,
540 .schedule_events = x86_schedule_events,
541 .eventsel = MSR_F15H_PERF_CTL,
542 .perfctr = MSR_F15H_PERF_CTR,
543 .event_map = amd_pmu_event_map,
544 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
545 .num_counters = 6,
546 .cntval_bits = 48,
547 .cntval_mask = (1ULL << 48) - 1,
548 .apic = 1,
549 /* use highest bit to detect overflow */
550 .max_period = (1ULL << 47) - 1,
551 .get_event_constraints = amd_get_event_constraints_f15h,
 552 /* northbridge counters not yet implemented: */
553#if 0
554 .put_event_constraints = amd_put_event_constraints,
555
556 .cpu_prepare = amd_pmu_cpu_prepare,
557 .cpu_starting = amd_pmu_cpu_starting,
558 .cpu_dead = amd_pmu_cpu_dead,
559#endif
560};
561
398static __init int amd_pmu_init(void) 562static __init int amd_pmu_init(void)
399{ 563{
400 /* Performance-monitoring supported from K7 and later: */ 564 /* Performance-monitoring supported from K7 and later: */
401 if (boot_cpu_data.x86 < 6) 565 if (boot_cpu_data.x86 < 6)
402 return -ENODEV; 566 return -ENODEV;
403 567
404 x86_pmu = amd_pmu; 568 /*
569 * If core performance counter extensions exists, it must be
570 * family 15h, otherwise fail. See x86_pmu_addr_offset().
571 */
572 switch (boot_cpu_data.x86) {
573 case 0x15:
574 if (!cpu_has_perfctr_core)
575 return -ENODEV;
576 x86_pmu = amd_pmu_f15h;
577 break;
578 default:
579 if (cpu_has_perfctr_core)
580 return -ENODEV;
581 x86_pmu = amd_pmu;
582 break;
583 }
405 584
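The init path above keys off the CPU family and the core performance counter extension. As a rough userspace illustration of that detection, here is a CPUID sketch; the family arithmetic is the standard base-plus-extended-family rule, while the feature-bit position used for perfctr_core (ECX bit 23 of leaf 0x80000001) is an assumption here and should be checked against the CPUID documentation.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, family;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	family = (eax >> 8) & 0xf;		/* base family */
	if (family == 0xf)
		family += (eax >> 20) & 0xff;	/* plus extended family */
	printf("CPU family: 0x%x\n", family);

	if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
		/* assumed bit position for the core counter extension */
		printf("perfctr_core: %s\n", ((ecx >> 23) & 1) ? "yes" : "no");

	return 0;
}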
406 /* Events are common for all AMDs */ 585 /* Events are common for all AMDs */
407 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 586 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index ee05c90012d2..41178c826c48 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,9 +1,31 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/*
15 * Per core state
16 * This used to coordinate shared registers for HT threads.
17 */
18struct intel_percore {
19 raw_spinlock_t lock; /* protect structure */
20 struct er_account regs[MAX_EXTRA_REGS];
21 int refcnt; /* number of threads */
22 unsigned core_id;
23};
24
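The er_account/intel_percore pair coordinates an extra MSR (e.g. MSR_OFFCORE_RSP_0) shared by the hyperthreads of a core: a sibling may reuse a claimed register only with the identical config, otherwise it ends up with an empty constraint. Below is a standalone sketch of just that sharing rule; the MSR number is an arbitrary example value.

#include <stdio.h>
#include <stdint.h>

struct er_account {
	int ref;			/* reference count */
	unsigned int extra_reg;		/* extra MSR number */
	uint64_t extra_config;		/* extra MSR config */
};

/* returns 1 if this (reg, config) pair may be used, 0 on conflict */
static int try_share(struct er_account *era, unsigned int reg, uint64_t cfg)
{
	if (era->ref == 0) {		/* free slot: claim it */
		era->extra_reg = reg;
		era->extra_config = cfg;
		era->ref = 1;
		return 1;
	}
	if (era->extra_reg == reg && era->extra_config == cfg) {
		era->ref++;		/* identical config: share it */
		return 1;
	}
	return 0;			/* same MSR, different config */
}

int main(void)
{
	struct er_account era = { 0, 0, 0 };
	unsigned int reg = 0x1a6;	/* arbitrary MSR number for the example */

	printf("%d\n", try_share(&era, reg, 0x00ff));	/* 1: claimed  */
	printf("%d\n", try_share(&era, reg, 0x00ff));	/* 1: shared   */
	printf("%d\n", try_share(&era, reg, 0xff00));	/* 0: conflict */
	return 0;
}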
3/* 25/*
4 * Intel PerfMon, used on Core and later. 26 * Intel PerfMon, used on Core and later.
5 */ 27 */
6static const u64 intel_perfmon_event_map[] = 28static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
7{ 29{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, 30 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 31 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
@@ -14,7 +36,7 @@ static const u64 intel_perfmon_event_map[] =
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 36 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
15}; 37};
16 38
17static struct event_constraint intel_core_event_constraints[] = 39static struct event_constraint intel_core_event_constraints[] __read_mostly =
18{ 40{
19 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ 41 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
20 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ 42 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -25,7 +47,7 @@ static struct event_constraint intel_core_event_constraints[] =
25 EVENT_CONSTRAINT_END 47 EVENT_CONSTRAINT_END
26}; 48};
27 49
28static struct event_constraint intel_core2_event_constraints[] = 50static struct event_constraint intel_core2_event_constraints[] __read_mostly =
29{ 51{
30 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 52 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
31 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 53 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -48,7 +70,7 @@ static struct event_constraint intel_core2_event_constraints[] =
48 EVENT_CONSTRAINT_END 70 EVENT_CONSTRAINT_END
49}; 71};
50 72
51static struct event_constraint intel_nehalem_event_constraints[] = 73static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
52{ 74{
53 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 75 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
54 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 76 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -64,7 +86,19 @@ static struct event_constraint intel_nehalem_event_constraints[] =
64 EVENT_CONSTRAINT_END 86 EVENT_CONSTRAINT_END
65}; 87};
66 88
67static struct event_constraint intel_westmere_event_constraints[] = 89static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
90{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
92 EVENT_EXTRA_END
93};
94
95static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
101static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
68{ 102{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 104 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -76,7 +110,34 @@ static struct event_constraint intel_westmere_event_constraints[] =
76 EVENT_CONSTRAINT_END 110 EVENT_CONSTRAINT_END
77}; 111};
78 112
79static struct event_constraint intel_gen_event_constraints[] = 113static struct event_constraint intel_snb_event_constraints[] __read_mostly =
114{
115 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END
124};
125
126static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
127{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
130 EVENT_EXTRA_END
131};
132
133static struct event_constraint intel_westmere_percore_constraints[] __read_mostly =
134{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END
138};
139
140static struct event_constraint intel_gen_event_constraints[] __read_mostly =
80{ 141{
81 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 142 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
82 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 143 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -89,6 +150,103 @@ static u64 intel_pmu_event_map(int hw_event)
89 return intel_perfmon_event_map[hw_event]; 150 return intel_perfmon_event_map[hw_event];
90} 151}
91 152
153static __initconst const u64 snb_hw_cache_event_ids
154 [PERF_COUNT_HW_CACHE_MAX]
155 [PERF_COUNT_HW_CACHE_OP_MAX]
156 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
157{
158 [ C(L1D) ] = {
159 [ C(OP_READ) ] = {
160 [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
161 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
162 },
163 [ C(OP_WRITE) ] = {
164 [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
165 [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
166 },
167 [ C(OP_PREFETCH) ] = {
168 [ C(RESULT_ACCESS) ] = 0x0,
169 [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
170 },
171 },
172 [ C(L1I ) ] = {
173 [ C(OP_READ) ] = {
174 [ C(RESULT_ACCESS) ] = 0x0,
175 [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
176 },
177 [ C(OP_WRITE) ] = {
178 [ C(RESULT_ACCESS) ] = -1,
179 [ C(RESULT_MISS) ] = -1,
180 },
181 [ C(OP_PREFETCH) ] = {
182 [ C(RESULT_ACCESS) ] = 0x0,
183 [ C(RESULT_MISS) ] = 0x0,
184 },
185 },
186 [ C(LL ) ] = {
187 [ C(OP_READ) ] = {
188 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
189 [ C(RESULT_ACCESS) ] = 0x01b7,
190 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
191 [ C(RESULT_MISS) ] = 0x01b7,
192 },
193 [ C(OP_WRITE) ] = {
194 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
195 [ C(RESULT_ACCESS) ] = 0x01b7,
196 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
197 [ C(RESULT_MISS) ] = 0x01b7,
198 },
199 [ C(OP_PREFETCH) ] = {
200 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
201 [ C(RESULT_ACCESS) ] = 0x01b7,
202 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
203 [ C(RESULT_MISS) ] = 0x01b7,
204 },
205 },
206 [ C(DTLB) ] = {
207 [ C(OP_READ) ] = {
208 [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
209 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
210 },
211 [ C(OP_WRITE) ] = {
212 [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
213 [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
214 },
215 [ C(OP_PREFETCH) ] = {
216 [ C(RESULT_ACCESS) ] = 0x0,
217 [ C(RESULT_MISS) ] = 0x0,
218 },
219 },
220 [ C(ITLB) ] = {
221 [ C(OP_READ) ] = {
222 [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
223 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
224 },
225 [ C(OP_WRITE) ] = {
226 [ C(RESULT_ACCESS) ] = -1,
227 [ C(RESULT_MISS) ] = -1,
228 },
229 [ C(OP_PREFETCH) ] = {
230 [ C(RESULT_ACCESS) ] = -1,
231 [ C(RESULT_MISS) ] = -1,
232 },
233 },
234 [ C(BPU ) ] = {
235 [ C(OP_READ) ] = {
236 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
237 [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
238 },
239 [ C(OP_WRITE) ] = {
240 [ C(RESULT_ACCESS) ] = -1,
241 [ C(RESULT_MISS) ] = -1,
242 },
243 [ C(OP_PREFETCH) ] = {
244 [ C(RESULT_ACCESS) ] = -1,
245 [ C(RESULT_MISS) ] = -1,
246 },
247 },
248};
249
92static __initconst const u64 westmere_hw_cache_event_ids 250static __initconst const u64 westmere_hw_cache_event_ids
93 [PERF_COUNT_HW_CACHE_MAX] 251 [PERF_COUNT_HW_CACHE_MAX]
94 [PERF_COUNT_HW_CACHE_OP_MAX] 252 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -124,16 +282,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
124 }, 282 },
125 [ C(LL ) ] = { 283 [ C(LL ) ] = {
126 [ C(OP_READ) ] = { 284 [ C(OP_READ) ] = {
127 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 285 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
128 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 286 [ C(RESULT_ACCESS) ] = 0x01b7,
287 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
288 [ C(RESULT_MISS) ] = 0x01b7,
129 }, 289 },
290 /*
291 * Use RFO, not WRITEBACK, because a write miss would typically occur
292 * on RFO.
293 */
130 [ C(OP_WRITE) ] = { 294 [ C(OP_WRITE) ] = {
131 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 295 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
132 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 296 [ C(RESULT_ACCESS) ] = 0x01b7,
297 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
298 [ C(RESULT_MISS) ] = 0x01b7,
133 }, 299 },
134 [ C(OP_PREFETCH) ] = { 300 [ C(OP_PREFETCH) ] = {
135 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 301 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
136 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 302 [ C(RESULT_ACCESS) ] = 0x01b7,
303 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
304 [ C(RESULT_MISS) ] = 0x01b7,
137 }, 305 },
138 }, 306 },
139 [ C(DTLB) ] = { 307 [ C(DTLB) ] = {
@@ -180,6 +348,59 @@ static __initconst const u64 westmere_hw_cache_event_ids
180 }, 348 },
181}; 349};
182 350
351/*
352 * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
353 * See IA32 SDM Vol 3B 30.6.1.3
354 */
355
356#define NHM_DMND_DATA_RD (1 << 0)
357#define NHM_DMND_RFO (1 << 1)
358#define NHM_DMND_IFETCH (1 << 2)
359#define NHM_DMND_WB (1 << 3)
360#define NHM_PF_DATA_RD (1 << 4)
361#define NHM_PF_DATA_RFO (1 << 5)
362#define NHM_PF_IFETCH (1 << 6)
363#define NHM_OFFCORE_OTHER (1 << 7)
364#define NHM_UNCORE_HIT (1 << 8)
365#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
366#define NHM_OTHER_CORE_HITM (1 << 10)
367 /* reserved */
368#define NHM_REMOTE_CACHE_FWD (1 << 12)
369#define NHM_REMOTE_DRAM (1 << 13)
370#define NHM_LOCAL_DRAM (1 << 14)
371#define NHM_NON_DRAM (1 << 15)
372
373#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
374
375#define NHM_DMND_READ (NHM_DMND_DATA_RD)
376#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
377#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
378
379#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
380#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
381#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
382
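The request-type and response-type bits above are simply OR-ed together to form the MSR_OFFCORE_RSP_0 value programmed for each cache op/result combination in the table that follows. A quick standalone check of how the LL read access/miss masks are composed from those bits:

#include <stdio.h>

#define NHM_DMND_DATA_RD	(1 << 0)
#define NHM_UNCORE_HIT		(1 << 8)
#define NHM_OTHER_CORE_HIT_SNP	(1 << 9)
#define NHM_OTHER_CORE_HITM	(1 << 10)
#define NHM_REMOTE_CACHE_FWD	(1 << 12)
#define NHM_REMOTE_DRAM		(1 << 13)
#define NHM_LOCAL_DRAM		(1 << 14)
#define NHM_NON_DRAM		(1 << 15)

#define NHM_ALL_DRAM	(NHM_REMOTE_DRAM | NHM_LOCAL_DRAM)
#define NHM_L3_HIT	(NHM_UNCORE_HIT | NHM_OTHER_CORE_HIT_SNP | NHM_OTHER_CORE_HITM)
#define NHM_L3_MISS	(NHM_NON_DRAM | NHM_ALL_DRAM | NHM_REMOTE_CACHE_FWD)
#define NHM_L3_ACCESS	(NHM_L3_HIT | NHM_L3_MISS)

int main(void)
{
	/* the values written to MSR_OFFCORE_RSP_0 for the LL read events */
	printf("LL read access: 0x%04x\n", NHM_DMND_DATA_RD | NHM_L3_ACCESS);
	printf("LL read miss:   0x%04x\n", NHM_DMND_DATA_RD | NHM_L3_MISS);
	return 0;
}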
383static __initconst const u64 nehalem_hw_cache_extra_regs
384 [PERF_COUNT_HW_CACHE_MAX]
385 [PERF_COUNT_HW_CACHE_OP_MAX]
386 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
387{
388 [ C(LL ) ] = {
389 [ C(OP_READ) ] = {
390 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
391 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
392 },
393 [ C(OP_WRITE) ] = {
394 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
395 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
396 },
397 [ C(OP_PREFETCH) ] = {
398 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
399 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
400 },
401 }
402};
403
183static __initconst const u64 nehalem_hw_cache_event_ids 404static __initconst const u64 nehalem_hw_cache_event_ids
184 [PERF_COUNT_HW_CACHE_MAX] 405 [PERF_COUNT_HW_CACHE_MAX]
185 [PERF_COUNT_HW_CACHE_OP_MAX] 406 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -187,12 +408,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids
187{ 408{
188 [ C(L1D) ] = { 409 [ C(L1D) ] = {
189 [ C(OP_READ) ] = { 410 [ C(OP_READ) ] = {
190 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ 411 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
191 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ 412 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
192 }, 413 },
193 [ C(OP_WRITE) ] = { 414 [ C(OP_WRITE) ] = {
 194 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ 415 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
195 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ 416 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
196 }, 417 },
197 [ C(OP_PREFETCH) ] = { 418 [ C(OP_PREFETCH) ] = {
198 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ 419 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
@@ -215,16 +436,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids
215 }, 436 },
216 [ C(LL ) ] = { 437 [ C(LL ) ] = {
217 [ C(OP_READ) ] = { 438 [ C(OP_READ) ] = {
218 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 439 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
219 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 440 [ C(RESULT_ACCESS) ] = 0x01b7,
441 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
442 [ C(RESULT_MISS) ] = 0x01b7,
220 }, 443 },
444 /*
445 * Use RFO, not WRITEBACK, because a write miss would typically occur
446 * on RFO.
447 */
221 [ C(OP_WRITE) ] = { 448 [ C(OP_WRITE) ] = {
222 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 449 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
223 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 450 [ C(RESULT_ACCESS) ] = 0x01b7,
451 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
452 [ C(RESULT_MISS) ] = 0x01b7,
224 }, 453 },
225 [ C(OP_PREFETCH) ] = { 454 [ C(OP_PREFETCH) ] = {
226 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 455 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
227 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 456 [ C(RESULT_ACCESS) ] = 0x01b7,
457 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
458 [ C(RESULT_MISS) ] = 0x01b7,
228 }, 459 },
229 }, 460 },
230 [ C(DTLB) ] = { 461 [ C(DTLB) ] = {
@@ -649,7 +880,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
649 struct hw_perf_event *hwc = &event->hw; 880 struct hw_perf_event *hwc = &event->hw;
650 881
651 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { 882 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
652 if (!__get_cpu_var(cpu_hw_events).enabled) 883 if (!__this_cpu_read(cpu_hw_events.enabled))
653 return; 884 return;
654 885
655 intel_pmu_enable_bts(hwc->config); 886 intel_pmu_enable_bts(hwc->config);
@@ -679,7 +910,7 @@ static int intel_pmu_save_and_restart(struct perf_event *event)
679 910
680static void intel_pmu_reset(void) 911static void intel_pmu_reset(void)
681{ 912{
682 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; 913 struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
683 unsigned long flags; 914 unsigned long flags;
684 int idx; 915 int idx;
685 916
@@ -691,8 +922,8 @@ static void intel_pmu_reset(void)
691 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 922 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
692 923
693 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 924 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
694 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 925 checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
695 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 926 checking_wrmsrl(x86_pmu_event_addr(idx), 0ull);
696 } 927 }
697 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) 928 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
698 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 929 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
@@ -713,18 +944,28 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
713 struct cpu_hw_events *cpuc; 944 struct cpu_hw_events *cpuc;
714 int bit, loops; 945 int bit, loops;
715 u64 status; 946 u64 status;
716 int handled = 0; 947 int handled;
717 948
718 perf_sample_data_init(&data, 0); 949 perf_sample_data_init(&data, 0);
719 950
720 cpuc = &__get_cpu_var(cpu_hw_events); 951 cpuc = &__get_cpu_var(cpu_hw_events);
721 952
953 /*
954 * Some chipsets need to unmask the LVTPC in a particular spot
955 * inside the nmi handler. As a result, the unmasking was pushed
956 * into all the nmi handlers.
957 *
958 * This handler doesn't seem to have any issues with the unmasking
959 * so it was left at the top.
960 */
961 apic_write(APIC_LVTPC, APIC_DM_NMI);
962
722 intel_pmu_disable_all(); 963 intel_pmu_disable_all();
723 intel_pmu_drain_bts_buffer(); 964 handled = intel_pmu_drain_bts_buffer();
724 status = intel_pmu_get_status(); 965 status = intel_pmu_get_status();
725 if (!status) { 966 if (!status) {
726 intel_pmu_enable_all(0); 967 intel_pmu_enable_all(0);
727 return 0; 968 return handled;
728 } 969 }
729 970
730 loops = 0; 971 loops = 0;
@@ -763,7 +1004,7 @@ again:
763 data.period = event->hw.last_period; 1004 data.period = event->hw.last_period;
764 1005
765 if (perf_event_overflow(event, 1, &data, regs)) 1006 if (perf_event_overflow(event, 1, &data, regs))
766 x86_pmu_stop(event); 1007 x86_pmu_stop(event, 0);
767 } 1008 }
768 1009
769 /* 1010 /*
@@ -784,6 +1025,9 @@ intel_bts_constraints(struct perf_event *event)
784 struct hw_perf_event *hwc = &event->hw; 1025 struct hw_perf_event *hwc = &event->hw;
785 unsigned int hw_event, bts_event; 1026 unsigned int hw_event, bts_event;
786 1027
1028 if (event->attr.freq)
1029 return NULL;
1030
787 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; 1031 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
788 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 1032 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
789 1033
@@ -794,6 +1038,67 @@ intel_bts_constraints(struct perf_event *event)
794} 1038}
795 1039
796static struct event_constraint * 1040static struct event_constraint *
1041intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1042{
1043 struct hw_perf_event *hwc = &event->hw;
1044 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
1045 struct event_constraint *c;
1046 struct intel_percore *pc;
1047 struct er_account *era;
1048 int i;
1049 int free_slot;
1050 int found;
1051
1052 if (!x86_pmu.percore_constraints || hwc->extra_alloc)
1053 return NULL;
1054
1055 for (c = x86_pmu.percore_constraints; c->cmask; c++) {
1056 if (e != c->code)
1057 continue;
1058
1059 /*
1060 * Allocate resource per core.
1061 */
1062 pc = cpuc->per_core;
1063 if (!pc)
1064 break;
1065 c = &emptyconstraint;
1066 raw_spin_lock(&pc->lock);
1067 free_slot = -1;
1068 found = 0;
1069 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1070 era = &pc->regs[i];
1071 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1072 /* Allow sharing same config */
1073 if (hwc->extra_config == era->extra_config) {
1074 era->ref++;
1075 cpuc->percore_used = 1;
1076 hwc->extra_alloc = 1;
1077 c = NULL;
1078 }
1079 /* else conflict */
1080 found = 1;
1081 break;
1082 } else if (era->ref == 0 && free_slot == -1)
1083 free_slot = i;
1084 }
1085 if (!found && free_slot != -1) {
1086 era = &pc->regs[free_slot];
1087 era->ref = 1;
1088 era->extra_reg = hwc->extra_reg;
1089 era->extra_config = hwc->extra_config;
1090 cpuc->percore_used = 1;
1091 hwc->extra_alloc = 1;
1092 c = NULL;
1093 }
1094 raw_spin_unlock(&pc->lock);
1095 return c;
1096 }
1097
1098 return NULL;
1099}
1100
1101static struct event_constraint *
797intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1102intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
798{ 1103{
799 struct event_constraint *c; 1104 struct event_constraint *c;
@@ -806,9 +1111,51 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
806 if (c) 1111 if (c)
807 return c; 1112 return c;
808 1113
1114 c = intel_percore_constraints(cpuc, event);
1115 if (c)
1116 return c;
1117
809 return x86_get_event_constraints(cpuc, event); 1118 return x86_get_event_constraints(cpuc, event);
810} 1119}
811 1120
1121static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1122 struct perf_event *event)
1123{
1124 struct extra_reg *er;
1125 struct intel_percore *pc;
1126 struct er_account *era;
1127 struct hw_perf_event *hwc = &event->hw;
1128 int i, allref;
1129
1130 if (!cpuc->percore_used)
1131 return;
1132
1133 for (er = x86_pmu.extra_regs; er->msr; er++) {
1134 if (er->event != (hwc->config & er->config_mask))
1135 continue;
1136
1137 pc = cpuc->per_core;
1138 raw_spin_lock(&pc->lock);
1139 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1140 era = &pc->regs[i];
1141 if (era->ref > 0 &&
1142 era->extra_config == hwc->extra_config &&
1143 era->extra_reg == er->msr) {
1144 era->ref--;
1145 hwc->extra_alloc = 0;
1146 break;
1147 }
1148 }
1149 allref = 0;
1150 for (i = 0; i < MAX_EXTRA_REGS; i++)
1151 allref += pc->regs[i].ref;
1152 if (allref == 0)
1153 cpuc->percore_used = 0;
1154 raw_spin_unlock(&pc->lock);
1155 break;
1156 }
1157}
1158
812static int intel_pmu_hw_config(struct perf_event *event) 1159static int intel_pmu_hw_config(struct perf_event *event)
813{ 1160{
814 int ret = x86_pmu_hw_config(event); 1161 int ret = x86_pmu_hw_config(event);
@@ -816,6 +1163,32 @@ static int intel_pmu_hw_config(struct perf_event *event)
816 if (ret) 1163 if (ret)
817 return ret; 1164 return ret;
818 1165
1166 if (event->attr.precise_ip &&
1167 (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
1168 /*
1169 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
1170 * (0x003c) so that we can use it with PEBS.
1171 *
1172 * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
1173 * PEBS capable. However we can use INST_RETIRED.ANY_P
1174 * (0x00c0), which is a PEBS capable event, to get the same
1175 * count.
1176 *
 1177 * INST_RETIRED.ANY_P counts the number of cycles that retire
1178 * CNTMASK instructions. By setting CNTMASK to a value (16)
1179 * larger than the maximum number of instructions that can be
1180 * retired per cycle (4) and then inverting the condition, we
1181 * count all cycles that retire 16 or less instructions, which
1182 * is every cycle.
1183 *
1184 * Thereby we gain a PEBS capable cycle counter.
1185 */
1186 u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
1187
1188 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
1189 event->hw.config = alt_config;
1190 }
1191
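The 0x108000c0 alternative above decomposes into event 0xc0 (INST_RETIRED.ANY_P) with CMASK=16 and the invert bit set, exactly as the comment describes. A small decoder, assuming the architectural EVENTSEL field layout (event[7:0], umask[15:8], inv bit 23, cmask[31:24]):

#include <stdio.h>

int main(void)
{
	unsigned long long cfg = 0x108000c0ULL;	/* INST_RETIRED.TOTAL_CYCLES */

	printf("event = 0x%02llx\n", cfg & 0xff);		/* 0xc0 */
	printf("umask = 0x%02llx\n", (cfg >> 8) & 0xff);	/* 0x00 */
	printf("inv   = %llu\n", (cfg >> 23) & 1);		/* 1    */
	printf("cmask = %llu\n", (cfg >> 24) & 0xff);		/* 16   */
	return 0;
}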
819 if (event->attr.type != PERF_TYPE_RAW) 1192 if (event->attr.type != PERF_TYPE_RAW)
820 return 0; 1193 return 0;
821 1194
@@ -854,20 +1227,67 @@ static __initconst const struct x86_pmu core_pmu = {
854 */ 1227 */
855 .max_period = (1ULL << 31) - 1, 1228 .max_period = (1ULL << 31) - 1,
856 .get_event_constraints = intel_get_event_constraints, 1229 .get_event_constraints = intel_get_event_constraints,
1230 .put_event_constraints = intel_put_event_constraints,
857 .event_constraints = intel_core_event_constraints, 1231 .event_constraints = intel_core_event_constraints,
858}; 1232};
859 1233
1234static int intel_pmu_cpu_prepare(int cpu)
1235{
1236 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1237
1238 if (!cpu_has_ht_siblings())
1239 return NOTIFY_OK;
1240
1241 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
1242 GFP_KERNEL, cpu_to_node(cpu));
1243 if (!cpuc->per_core)
1244 return NOTIFY_BAD;
1245
1246 raw_spin_lock_init(&cpuc->per_core->lock);
1247 cpuc->per_core->core_id = -1;
1248 return NOTIFY_OK;
1249}
1250
860static void intel_pmu_cpu_starting(int cpu) 1251static void intel_pmu_cpu_starting(int cpu)
861{ 1252{
1253 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1254 int core_id = topology_core_id(cpu);
1255 int i;
1256
862 init_debug_store_on_cpu(cpu); 1257 init_debug_store_on_cpu(cpu);
863 /* 1258 /*
864 * Deal with CPUs that don't clear their LBRs on power-up. 1259 * Deal with CPUs that don't clear their LBRs on power-up.
865 */ 1260 */
866 intel_pmu_lbr_reset(); 1261 intel_pmu_lbr_reset();
1262
1263 if (!cpu_has_ht_siblings())
1264 return;
1265
1266 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1267 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
1268
1269 if (pc && pc->core_id == core_id) {
1270 kfree(cpuc->per_core);
1271 cpuc->per_core = pc;
1272 break;
1273 }
1274 }
1275
1276 cpuc->per_core->core_id = core_id;
1277 cpuc->per_core->refcnt++;
867} 1278}
868 1279
869static void intel_pmu_cpu_dying(int cpu) 1280static void intel_pmu_cpu_dying(int cpu)
870{ 1281{
1282 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1283 struct intel_percore *pc = cpuc->per_core;
1284
1285 if (pc) {
1286 if (pc->core_id == -1 || --pc->refcnt == 0)
1287 kfree(pc);
1288 cpuc->per_core = NULL;
1289 }
1290
871 fini_debug_store_on_cpu(cpu); 1291 fini_debug_store_on_cpu(cpu);
872} 1292}
873 1293
@@ -892,7 +1312,9 @@ static __initconst const struct x86_pmu intel_pmu = {
892 */ 1312 */
893 .max_period = (1ULL << 31) - 1, 1313 .max_period = (1ULL << 31) - 1,
894 .get_event_constraints = intel_get_event_constraints, 1314 .get_event_constraints = intel_get_event_constraints,
1315 .put_event_constraints = intel_put_event_constraints,
895 1316
1317 .cpu_prepare = intel_pmu_cpu_prepare,
896 .cpu_starting = intel_pmu_cpu_starting, 1318 .cpu_starting = intel_pmu_cpu_starting,
897 .cpu_dying = intel_pmu_cpu_dying, 1319 .cpu_dying = intel_pmu_cpu_dying,
898}; 1320};
@@ -913,7 +1335,7 @@ static void intel_clovertown_quirks(void)
913 * AJ106 could possibly be worked around by not allowing LBR 1335 * AJ106 could possibly be worked around by not allowing LBR
914 * usage from PEBS, including the fixup. 1336 * usage from PEBS, including the fixup.
915 * AJ68 could possibly be worked around by always programming 1337 * AJ68 could possibly be worked around by always programming
916 * a pebs_event_reset[0] value and coping with the lost events. 1338 * a pebs_event_reset[0] value and coping with the lost events.
917 * 1339 *
918 * But taken together it might just make sense to not enable PEBS on 1340 * But taken together it might just make sense to not enable PEBS on
919 * these chips. 1341 * these chips.
@@ -998,6 +1420,7 @@ static __init int intel_pmu_init(void)
998 intel_pmu_lbr_init_core(); 1420 intel_pmu_lbr_init_core();
999 1421
1000 x86_pmu.event_constraints = intel_core2_event_constraints; 1422 x86_pmu.event_constraints = intel_core2_event_constraints;
1423 x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
1001 pr_cont("Core2 events, "); 1424 pr_cont("Core2 events, ");
1002 break; 1425 break;
1003 1426
@@ -1006,11 +1429,33 @@ static __init int intel_pmu_init(void)
1006 case 46: /* 45 nm nehalem-ex, "Beckton" */ 1429 case 46: /* 45 nm nehalem-ex, "Beckton" */
1007 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 1430 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1008 sizeof(hw_cache_event_ids)); 1431 sizeof(hw_cache_event_ids));
1432 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1433 sizeof(hw_cache_extra_regs));
1009 1434
1010 intel_pmu_lbr_init_nhm(); 1435 intel_pmu_lbr_init_nhm();
1011 1436
1012 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1437 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1438 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1439 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1013 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1441 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1442
1443 /* UOPS_ISSUED.STALLED_CYCLES */
1444 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1445 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1446 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1447
1448 if (ebx & 0x40) {
1449 /*
1450 * Erratum AAJ80 detected, we work it around by using
1451 * the BR_MISP_EXEC.ANY event. This will over-count
1452 * branch-misses, but it's still much better than the
1453 * architectural event which is often completely bogus:
1454 */
1455 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1456
1457 pr_cont("erratum AAJ80 worked around, ");
1458 }
1014 pr_cont("Nehalem events, "); 1459 pr_cont("Nehalem events, ");
1015 break; 1460 break;
1016 1461
@@ -1021,21 +1466,51 @@ static __init int intel_pmu_init(void)
1021 intel_pmu_lbr_init_atom(); 1466 intel_pmu_lbr_init_atom();
1022 1467
1023 x86_pmu.event_constraints = intel_gen_event_constraints; 1468 x86_pmu.event_constraints = intel_gen_event_constraints;
1469 x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
1024 pr_cont("Atom events, "); 1470 pr_cont("Atom events, ");
1025 break; 1471 break;
1026 1472
1027 case 37: /* 32 nm nehalem, "Clarkdale" */ 1473 case 37: /* 32 nm nehalem, "Clarkdale" */
1028 case 44: /* 32 nm nehalem, "Gulftown" */ 1474 case 44: /* 32 nm nehalem, "Gulftown" */
1475 case 47: /* 32 nm Xeon E7 */
1029 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 1476 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
1030 sizeof(hw_cache_event_ids)); 1477 sizeof(hw_cache_event_ids));
1478 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1479 sizeof(hw_cache_extra_regs));
1031 1480
1032 intel_pmu_lbr_init_nhm(); 1481 intel_pmu_lbr_init_nhm();
1033 1482
1034 x86_pmu.event_constraints = intel_westmere_event_constraints; 1483 x86_pmu.event_constraints = intel_westmere_event_constraints;
1484 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1035 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1485 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1486 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1487 x86_pmu.extra_regs = intel_westmere_extra_regs;
1488
1489 /* UOPS_ISSUED.STALLED_CYCLES */
1490 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1491 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1492 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1493
1036 pr_cont("Westmere events, "); 1494 pr_cont("Westmere events, ");
1037 break; 1495 break;
1038 1496
1497 case 42: /* SandyBridge */
1498 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1499 sizeof(hw_cache_event_ids));
1500
1501 intel_pmu_lbr_init_nhm();
1502
1503 x86_pmu.event_constraints = intel_snb_event_constraints;
1504 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1505
1506 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1507 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1508 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
1509 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
1510
1511 pr_cont("SandyBridge events, ");
1512 break;
1513
1039 default: 1514 default:
1040 /* 1515 /*
1041 * default constraints for v2 and up 1516 * default constraints for v2 and up
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 18018d1311cd..bab491b8ee25 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -74,6 +74,107 @@ static void fini_debug_store_on_cpu(int cpu)
74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); 74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
75} 75}
76 76
77static int alloc_pebs_buffer(int cpu)
78{
79 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
80 int node = cpu_to_node(cpu);
81 int max, thresh = 1; /* always use a single PEBS record */
82 void *buffer;
83
84 if (!x86_pmu.pebs)
85 return 0;
86
87 buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
88 if (unlikely(!buffer))
89 return -ENOMEM;
90
91 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
92
93 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
94 ds->pebs_index = ds->pebs_buffer_base;
95 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
96 max * x86_pmu.pebs_record_size;
97
98 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
99 thresh * x86_pmu.pebs_record_size;
100
101 return 0;
102}
103
104static void release_pebs_buffer(int cpu)
105{
106 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
107
108 if (!ds || !x86_pmu.pebs)
109 return;
110
111 kfree((void *)(unsigned long)ds->pebs_buffer_base);
112 ds->pebs_buffer_base = 0;
113}
114
115static int alloc_bts_buffer(int cpu)
116{
117 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
118 int node = cpu_to_node(cpu);
119 int max, thresh;
120 void *buffer;
121
122 if (!x86_pmu.bts)
123 return 0;
124
125 buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
126 if (unlikely(!buffer))
127 return -ENOMEM;
128
129 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
130 thresh = max / 16;
131
132 ds->bts_buffer_base = (u64)(unsigned long)buffer;
133 ds->bts_index = ds->bts_buffer_base;
134 ds->bts_absolute_maximum = ds->bts_buffer_base +
135 max * BTS_RECORD_SIZE;
136 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
137 thresh * BTS_RECORD_SIZE;
138
139 return 0;
140}
141
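alloc_bts_buffer() above arms the interrupt threshold one sixteenth of the buffer below its end, so the drain happens before the hardware runs out of room. A quick sketch of that arithmetic with assumed example sizes (BTS_BUFFER_SIZE and BTS_RECORD_SIZE are defined elsewhere; the 64 KiB / 24-byte values below are illustrative only):

#include <stdio.h>

int main(void)
{
	unsigned long long buffer_size = 64 * 1024;	/* assumed example */
	unsigned long long record_size = 24;		/* assumed example */
	unsigned long long base = 0x100000;		/* pretend buffer address */

	unsigned long long max = buffer_size / record_size;
	unsigned long long thresh = max / 16;
	unsigned long long abs_max = base + max * record_size;
	unsigned long long irq_thresh = abs_max - thresh * record_size;

	printf("%llu records, absolute max %#llx, interrupt threshold %#llx\n",
	       max, abs_max, irq_thresh);
	return 0;
}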
142static void release_bts_buffer(int cpu)
143{
144 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
145
146 if (!ds || !x86_pmu.bts)
147 return;
148
149 kfree((void *)(unsigned long)ds->bts_buffer_base);
150 ds->bts_buffer_base = 0;
151}
152
153static int alloc_ds_buffer(int cpu)
154{
155 int node = cpu_to_node(cpu);
156 struct debug_store *ds;
157
158 ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
159 if (unlikely(!ds))
160 return -ENOMEM;
161
162 per_cpu(cpu_hw_events, cpu).ds = ds;
163
164 return 0;
165}
166
167static void release_ds_buffer(int cpu)
168{
169 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
170
171 if (!ds)
172 return;
173
174 per_cpu(cpu_hw_events, cpu).ds = NULL;
175 kfree(ds);
176}
177
77static void release_ds_buffers(void) 178static void release_ds_buffers(void)
78{ 179{
79 int cpu; 180 int cpu;
@@ -82,93 +183,77 @@ static void release_ds_buffers(void)
82 return; 183 return;
83 184
84 get_online_cpus(); 185 get_online_cpus();
85
86 for_each_online_cpu(cpu) 186 for_each_online_cpu(cpu)
87 fini_debug_store_on_cpu(cpu); 187 fini_debug_store_on_cpu(cpu);
88 188
89 for_each_possible_cpu(cpu) { 189 for_each_possible_cpu(cpu) {
90 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 190 release_pebs_buffer(cpu);
91 191 release_bts_buffer(cpu);
92 if (!ds) 192 release_ds_buffer(cpu);
93 continue;
94
95 per_cpu(cpu_hw_events, cpu).ds = NULL;
96
97 kfree((void *)(unsigned long)ds->pebs_buffer_base);
98 kfree((void *)(unsigned long)ds->bts_buffer_base);
99 kfree(ds);
100 } 193 }
101
102 put_online_cpus(); 194 put_online_cpus();
103} 195}
104 196
105static int reserve_ds_buffers(void) 197static void reserve_ds_buffers(void)
106{ 198{
107 int cpu, err = 0; 199 int bts_err = 0, pebs_err = 0;
200 int cpu;
201
202 x86_pmu.bts_active = 0;
203 x86_pmu.pebs_active = 0;
108 204
109 if (!x86_pmu.bts && !x86_pmu.pebs) 205 if (!x86_pmu.bts && !x86_pmu.pebs)
110 return 0; 206 return;
207
208 if (!x86_pmu.bts)
209 bts_err = 1;
210
211 if (!x86_pmu.pebs)
212 pebs_err = 1;
111 213
112 get_online_cpus(); 214 get_online_cpus();
113 215
114 for_each_possible_cpu(cpu) { 216 for_each_possible_cpu(cpu) {
115 struct debug_store *ds; 217 if (alloc_ds_buffer(cpu)) {
116 void *buffer; 218 bts_err = 1;
117 int max, thresh; 219 pebs_err = 1;
220 }
118 221
119 err = -ENOMEM; 222 if (!bts_err && alloc_bts_buffer(cpu))
120 ds = kzalloc(sizeof(*ds), GFP_KERNEL); 223 bts_err = 1;
121 if (unlikely(!ds)) 224
225 if (!pebs_err && alloc_pebs_buffer(cpu))
226 pebs_err = 1;
227
228 if (bts_err && pebs_err)
122 break; 229 break;
123 per_cpu(cpu_hw_events, cpu).ds = ds; 230 }
124
125 if (x86_pmu.bts) {
126 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
127 if (unlikely(!buffer))
128 break;
129
130 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
131 thresh = max / 16;
132
133 ds->bts_buffer_base = (u64)(unsigned long)buffer;
134 ds->bts_index = ds->bts_buffer_base;
135 ds->bts_absolute_maximum = ds->bts_buffer_base +
136 max * BTS_RECORD_SIZE;
137 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
138 thresh * BTS_RECORD_SIZE;
139 }
140 231
141 if (x86_pmu.pebs) { 232 if (bts_err) {
142 buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); 233 for_each_possible_cpu(cpu)
143 if (unlikely(!buffer)) 234 release_bts_buffer(cpu);
144 break; 235 }
145
146 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
147
148 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
149 ds->pebs_index = ds->pebs_buffer_base;
150 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
151 max * x86_pmu.pebs_record_size;
152 /*
153 * Always use single record PEBS
154 */
155 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
156 x86_pmu.pebs_record_size;
157 }
158 236
159 err = 0; 237 if (pebs_err) {
238 for_each_possible_cpu(cpu)
239 release_pebs_buffer(cpu);
160 } 240 }
161 241
162 if (err) 242 if (bts_err && pebs_err) {
163 release_ds_buffers(); 243 for_each_possible_cpu(cpu)
164 else { 244 release_ds_buffer(cpu);
245 } else {
246 if (x86_pmu.bts && !bts_err)
247 x86_pmu.bts_active = 1;
248
249 if (x86_pmu.pebs && !pebs_err)
250 x86_pmu.pebs_active = 1;
251
165 for_each_online_cpu(cpu) 252 for_each_online_cpu(cpu)
166 init_debug_store_on_cpu(cpu); 253 init_debug_store_on_cpu(cpu);
167 } 254 }
168 255
169 put_online_cpus(); 256 put_online_cpus();
170
171 return err;
172} 257}
173 258
174/* 259/*
@@ -214,7 +299,7 @@ static void intel_pmu_disable_bts(void)
214 update_debugctlmsr(debugctlmsr); 299 update_debugctlmsr(debugctlmsr);
215} 300}
216 301
217static void intel_pmu_drain_bts_buffer(void) 302static int intel_pmu_drain_bts_buffer(void)
218{ 303{
219 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 304 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
220 struct debug_store *ds = cpuc->ds; 305 struct debug_store *ds = cpuc->ds;
@@ -231,16 +316,16 @@ static void intel_pmu_drain_bts_buffer(void)
231 struct pt_regs regs; 316 struct pt_regs regs;
232 317
233 if (!event) 318 if (!event)
234 return; 319 return 0;
235 320
236 if (!ds) 321 if (!x86_pmu.bts_active)
237 return; 322 return 0;
238 323
239 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; 324 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
240 top = (struct bts_record *)(unsigned long)ds->bts_index; 325 top = (struct bts_record *)(unsigned long)ds->bts_index;
241 326
242 if (top <= at) 327 if (top <= at)
243 return; 328 return 0;
244 329
245 ds->bts_index = ds->bts_buffer_base; 330 ds->bts_index = ds->bts_buffer_base;
246 331
@@ -256,7 +341,7 @@ static void intel_pmu_drain_bts_buffer(void)
256 perf_prepare_sample(&header, &data, event, &regs); 341 perf_prepare_sample(&header, &data, event, &regs);
257 342
258 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) 343 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
259 return; 344 return 1;
260 345
261 for (; at < top; at++) { 346 for (; at < top; at++) {
262 data.ip = at->from; 347 data.ip = at->from;
@@ -270,35 +355,76 @@ static void intel_pmu_drain_bts_buffer(void)
270 /* There's new data available. */ 355 /* There's new data available. */
271 event->hw.interrupts++; 356 event->hw.interrupts++;
272 event->pending_kill = POLL_IN; 357 event->pending_kill = POLL_IN;
358 return 1;
273} 359}
274 360
275/* 361/*
276 * PEBS 362 * PEBS
277 */ 363 */
364static struct event_constraint intel_core2_pebs_event_constraints[] = {
365 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
366 INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
367 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
 368	INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
369 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
370 EVENT_CONSTRAINT_END
371};
372
373static struct event_constraint intel_atom_pebs_event_constraints[] = {
374 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
375 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
376 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
377 EVENT_CONSTRAINT_END
378};
379
380static struct event_constraint intel_nehalem_pebs_event_constraints[] = {
381 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
382 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
383 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
384 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
385 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
386 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
387 INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
388 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
389 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
390 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
391 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
392 EVENT_CONSTRAINT_END
393};
278 394
279static struct event_constraint intel_core_pebs_events[] = { 395static struct event_constraint intel_westmere_pebs_event_constraints[] = {
280 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ 396 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
281 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ 397 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
282 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ 398 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
283 PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ 399 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
284 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ 400 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
285 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 401 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
286 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ 402 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
287 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 403 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
288 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ 404 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
405 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
406 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
289 EVENT_CONSTRAINT_END 407 EVENT_CONSTRAINT_END
290}; 408};
291 409
292static struct event_constraint intel_nehalem_pebs_events[] = { 410static struct event_constraint intel_snb_pebs_events[] = {
293 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ 411 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
294 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ 412 INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
295 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ 413 INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
296 PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */ 414 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
297 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ 415 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
298 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 416 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
299 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ 417 INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
300 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 418 INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
301 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ 419 INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
420 INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
421 INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
422 INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
423 INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
424 INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
425 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
426 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
427 INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
302 EVENT_CONSTRAINT_END 428 EVENT_CONSTRAINT_END
303}; 429};
304 430
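The constraint tables above replace the old PEBS_EVENT_CONSTRAINT lists and, per model, bind each precise event to the counters it may run on. Assuming the conventional encoding of these macros (event select in the low byte, unit mask in bits 8-15 for the UEVENT variant, and the second argument as a bitmap of permitted counters), an entry such as INTEL_UEVENT_CONSTRAINT(0x010c, 0xf) reads as event 0x0c, umask 0x01, schedulable on counters 0-3. A small interpretive sketch, not part of the patch:

/*
 * Decode one constraint entry under the assumed layout: event select
 * in bits 0-7, unit mask in bits 8-15, plus a counter bitmap.
 */
#include <stdio.h>

struct constraint_sketch {
	unsigned int code;	/* event | (umask << 8) */
	unsigned int cmask;	/* bitmap of counters the event may use */
};

int main(void)
{
	struct constraint_sketch c = { 0x010c, 0xf };	/* MEM_STORE_RETIRED.DTLB_MISS */

	printf("event 0x%02x umask 0x%02x counters 0x%x\n",
	       c.code & 0xff, (c.code >> 8) & 0xff, c.cmask);
	return 0;
}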
@@ -491,7 +617,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
491 regs.flags &= ~PERF_EFLAGS_EXACT; 617 regs.flags &= ~PERF_EFLAGS_EXACT;
492 618
493 if (perf_event_overflow(event, 1, &data, &regs)) 619 if (perf_event_overflow(event, 1, &data, &regs))
494 x86_pmu_stop(event); 620 x86_pmu_stop(event, 0);
495} 621}
496 622
497static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) 623static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -502,7 +628,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
502 struct pebs_record_core *at, *top; 628 struct pebs_record_core *at, *top;
503 int n; 629 int n;
504 630
505 if (!ds || !x86_pmu.pebs) 631 if (!x86_pmu.pebs_active)
506 return; 632 return;
507 633
508 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; 634 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
@@ -544,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
544 u64 status = 0; 670 u64 status = 0;
545 int bit, n; 671 int bit, n;
546 672
547 if (!ds || !x86_pmu.pebs) 673 if (!x86_pmu.pebs_active)
548 return; 674 return;
549 675
550 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; 676 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
@@ -609,29 +735,25 @@ static void intel_ds_init(void)
609 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); 735 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
610 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); 736 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
611 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; 737 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
612 x86_pmu.pebs_constraints = intel_core_pebs_events;
613 break; 738 break;
614 739
615 case 1: 740 case 1:
616 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); 741 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
617 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); 742 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
618 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; 743 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
619 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
620 break; 744 break;
621 745
622 default: 746 default:
623 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); 747 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
624 x86_pmu.pebs = 0; 748 x86_pmu.pebs = 0;
625 break;
626 } 749 }
627 } 750 }
628} 751}
629 752
630#else /* CONFIG_CPU_SUP_INTEL */ 753#else /* CONFIG_CPU_SUP_INTEL */
631 754
632static int reserve_ds_buffers(void) 755static void reserve_ds_buffers(void)
633{ 756{
634 return 0;
635} 757}
636 758
637static void release_ds_buffers(void) 759static void release_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 249015173992..ead584fb6a7d 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Netburst Perfomance Events (P4, old Xeon) 2 * Netburst Performance Events (P4, old Xeon)
3 * 3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> 4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> 5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
@@ -18,6 +18,8 @@
18struct p4_event_bind { 18struct p4_event_bind {
19 unsigned int opcode; /* Event code and ESCR selector */ 19 unsigned int opcode; /* Event code and ESCR selector */
20 unsigned int escr_msr[2]; /* ESCR MSR for this event */ 20 unsigned int escr_msr[2]; /* ESCR MSR for this event */
21 unsigned int escr_emask; /* valid ESCR EventMask bits */
22 unsigned int shared; /* event is shared across threads */
21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ 23 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
22}; 24};
23 25
@@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = {
66 [P4_EVENT_TC_DELIVER_MODE] = { 68 [P4_EVENT_TC_DELIVER_MODE] = {
67 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), 69 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
68 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, 70 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
71 .escr_emask =
72 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) |
73 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) |
74 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) |
75 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) |
76 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) |
77 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) |
78 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
79 .shared = 1,
69 .cntr = { {4, 5, -1}, {6, 7, -1} }, 80 .cntr = { {4, 5, -1}, {6, 7, -1} },
70 }, 81 },
71 [P4_EVENT_BPU_FETCH_REQUEST] = { 82 [P4_EVENT_BPU_FETCH_REQUEST] = {
72 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), 83 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
73 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, 84 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
85 .escr_emask =
86 P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
74 .cntr = { {0, -1, -1}, {2, -1, -1} }, 87 .cntr = { {0, -1, -1}, {2, -1, -1} },
75 }, 88 },
76 [P4_EVENT_ITLB_REFERENCE] = { 89 [P4_EVENT_ITLB_REFERENCE] = {
77 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), 90 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
78 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, 91 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
92 .escr_emask =
93 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) |
94 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) |
95 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
79 .cntr = { {0, -1, -1}, {2, -1, -1} }, 96 .cntr = { {0, -1, -1}, {2, -1, -1} },
80 }, 97 },
81 [P4_EVENT_MEMORY_CANCEL] = { 98 [P4_EVENT_MEMORY_CANCEL] = {
82 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), 99 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
83 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, 100 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
101 .escr_emask =
102 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) |
103 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
84 .cntr = { {8, 9, -1}, {10, 11, -1} }, 104 .cntr = { {8, 9, -1}, {10, 11, -1} },
85 }, 105 },
86 [P4_EVENT_MEMORY_COMPLETE] = { 106 [P4_EVENT_MEMORY_COMPLETE] = {
87 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), 107 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
88 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, 108 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
109 .escr_emask =
110 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) |
111 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
89 .cntr = { {8, 9, -1}, {10, 11, -1} }, 112 .cntr = { {8, 9, -1}, {10, 11, -1} },
90 }, 113 },
91 [P4_EVENT_LOAD_PORT_REPLAY] = { 114 [P4_EVENT_LOAD_PORT_REPLAY] = {
92 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), 115 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
93 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, 116 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
117 .escr_emask =
118 P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
94 .cntr = { {8, 9, -1}, {10, 11, -1} }, 119 .cntr = { {8, 9, -1}, {10, 11, -1} },
95 }, 120 },
96 [P4_EVENT_STORE_PORT_REPLAY] = { 121 [P4_EVENT_STORE_PORT_REPLAY] = {
97 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), 122 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
98 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, 123 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
124 .escr_emask =
125 P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
99 .cntr = { {8, 9, -1}, {10, 11, -1} }, 126 .cntr = { {8, 9, -1}, {10, 11, -1} },
100 }, 127 },
101 [P4_EVENT_MOB_LOAD_REPLAY] = { 128 [P4_EVENT_MOB_LOAD_REPLAY] = {
102 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), 129 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
103 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, 130 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
131 .escr_emask =
132 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) |
133 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) |
134 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) |
135 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
104 .cntr = { {0, -1, -1}, {2, -1, -1} }, 136 .cntr = { {0, -1, -1}, {2, -1, -1} },
105 }, 137 },
106 [P4_EVENT_PAGE_WALK_TYPE] = { 138 [P4_EVENT_PAGE_WALK_TYPE] = {
107 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), 139 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
108 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, 140 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
141 .escr_emask =
142 P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) |
143 P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
144 .shared = 1,
109 .cntr = { {0, -1, -1}, {2, -1, -1} }, 145 .cntr = { {0, -1, -1}, {2, -1, -1} },
110 }, 146 },
111 [P4_EVENT_BSQ_CACHE_REFERENCE] = { 147 [P4_EVENT_BSQ_CACHE_REFERENCE] = {
112 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), 148 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
113 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, 149 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
150 .escr_emask =
151 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
152 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
153 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
154 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
155 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
156 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) |
157 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
158 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
159 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
114 .cntr = { {0, -1, -1}, {2, -1, -1} }, 160 .cntr = { {0, -1, -1}, {2, -1, -1} },
115 }, 161 },
116 [P4_EVENT_IOQ_ALLOCATION] = { 162 [P4_EVENT_IOQ_ALLOCATION] = {
117 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), 163 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
118 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 164 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
165 .escr_emask =
166 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) |
167 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) |
168 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) |
169 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) |
170 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) |
171 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) |
172 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) |
173 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) |
174 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) |
175 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) |
176 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
119 .cntr = { {0, -1, -1}, {2, -1, -1} }, 177 .cntr = { {0, -1, -1}, {2, -1, -1} },
120 }, 178 },
121 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ 179 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
122 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), 180 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
123 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, 181 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
182 .escr_emask =
183 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) |
184 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) |
185 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) |
186 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) |
187 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) |
188 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) |
189 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) |
190 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) |
191 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) |
192 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) |
193 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
124 .cntr = { {2, -1, -1}, {3, -1, -1} }, 194 .cntr = { {2, -1, -1}, {3, -1, -1} },
125 }, 195 },
126 [P4_EVENT_FSB_DATA_ACTIVITY] = { 196 [P4_EVENT_FSB_DATA_ACTIVITY] = {
127 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), 197 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
128 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 198 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
199 .escr_emask =
200 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
201 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) |
202 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) |
203 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) |
204 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) |
205 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
206 .shared = 1,
129 .cntr = { {0, -1, -1}, {2, -1, -1} }, 207 .cntr = { {0, -1, -1}, {2, -1, -1} },
130 }, 208 },
131 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ 209 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
132 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), 210 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
133 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, 211 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
212 .escr_emask =
213 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) |
214 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) |
215 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) |
216 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) |
217 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) |
218 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) |
219 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) |
220 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) |
221 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) |
222 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) |
223 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) |
224 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) |
225 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
134 .cntr = { {0, -1, -1}, {1, -1, -1} }, 226 .cntr = { {0, -1, -1}, {1, -1, -1} },
135 }, 227 },
136 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ 228 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
137 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), 229 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
138 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, 230 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
231 .escr_emask =
232 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) |
233 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) |
234 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) |
235 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) |
236 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) |
237 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) |
238 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) |
239 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) |
240 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) |
241 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) |
242 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) |
243 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) |
244 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
139 .cntr = { {2, -1, -1}, {3, -1, -1} }, 245 .cntr = { {2, -1, -1}, {3, -1, -1} },
140 }, 246 },
141 [P4_EVENT_SSE_INPUT_ASSIST] = { 247 [P4_EVENT_SSE_INPUT_ASSIST] = {
142 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), 248 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
143 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 249 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
250 .escr_emask =
251 P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
252 .shared = 1,
144 .cntr = { {8, 9, -1}, {10, 11, -1} }, 253 .cntr = { {8, 9, -1}, {10, 11, -1} },
145 }, 254 },
146 [P4_EVENT_PACKED_SP_UOP] = { 255 [P4_EVENT_PACKED_SP_UOP] = {
147 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), 256 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
148 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 257 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
258 .escr_emask =
259 P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
260 .shared = 1,
149 .cntr = { {8, 9, -1}, {10, 11, -1} }, 261 .cntr = { {8, 9, -1}, {10, 11, -1} },
150 }, 262 },
151 [P4_EVENT_PACKED_DP_UOP] = { 263 [P4_EVENT_PACKED_DP_UOP] = {
152 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), 264 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
153 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 265 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
266 .escr_emask =
267 P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
268 .shared = 1,
154 .cntr = { {8, 9, -1}, {10, 11, -1} }, 269 .cntr = { {8, 9, -1}, {10, 11, -1} },
155 }, 270 },
156 [P4_EVENT_SCALAR_SP_UOP] = { 271 [P4_EVENT_SCALAR_SP_UOP] = {
157 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), 272 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
158 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 273 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
274 .escr_emask =
275 P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
276 .shared = 1,
159 .cntr = { {8, 9, -1}, {10, 11, -1} }, 277 .cntr = { {8, 9, -1}, {10, 11, -1} },
160 }, 278 },
161 [P4_EVENT_SCALAR_DP_UOP] = { 279 [P4_EVENT_SCALAR_DP_UOP] = {
162 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), 280 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
163 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 281 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
282 .escr_emask =
283 P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
284 .shared = 1,
164 .cntr = { {8, 9, -1}, {10, 11, -1} }, 285 .cntr = { {8, 9, -1}, {10, 11, -1} },
165 }, 286 },
166 [P4_EVENT_64BIT_MMX_UOP] = { 287 [P4_EVENT_64BIT_MMX_UOP] = {
167 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), 288 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
168 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 289 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
290 .escr_emask =
291 P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
292 .shared = 1,
169 .cntr = { {8, 9, -1}, {10, 11, -1} }, 293 .cntr = { {8, 9, -1}, {10, 11, -1} },
170 }, 294 },
171 [P4_EVENT_128BIT_MMX_UOP] = { 295 [P4_EVENT_128BIT_MMX_UOP] = {
172 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), 296 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
173 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 297 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
298 .escr_emask =
299 P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
300 .shared = 1,
174 .cntr = { {8, 9, -1}, {10, 11, -1} }, 301 .cntr = { {8, 9, -1}, {10, 11, -1} },
175 }, 302 },
176 [P4_EVENT_X87_FP_UOP] = { 303 [P4_EVENT_X87_FP_UOP] = {
177 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), 304 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
178 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 305 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
306 .escr_emask =
307 P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
308 .shared = 1,
179 .cntr = { {8, 9, -1}, {10, 11, -1} }, 309 .cntr = { {8, 9, -1}, {10, 11, -1} },
180 }, 310 },
181 [P4_EVENT_TC_MISC] = { 311 [P4_EVENT_TC_MISC] = {
182 .opcode = P4_OPCODE(P4_EVENT_TC_MISC), 312 .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
183 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, 313 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
314 .escr_emask =
315 P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
184 .cntr = { {4, 5, -1}, {6, 7, -1} }, 316 .cntr = { {4, 5, -1}, {6, 7, -1} },
185 }, 317 },
186 [P4_EVENT_GLOBAL_POWER_EVENTS] = { 318 [P4_EVENT_GLOBAL_POWER_EVENTS] = {
187 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), 319 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
188 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 320 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
321 .escr_emask =
322 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
189 .cntr = { {0, -1, -1}, {2, -1, -1} }, 323 .cntr = { {0, -1, -1}, {2, -1, -1} },
190 }, 324 },
191 [P4_EVENT_TC_MS_XFER] = { 325 [P4_EVENT_TC_MS_XFER] = {
192 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), 326 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
193 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, 327 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
328 .escr_emask =
329 P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
194 .cntr = { {4, 5, -1}, {6, 7, -1} }, 330 .cntr = { {4, 5, -1}, {6, 7, -1} },
195 }, 331 },
196 [P4_EVENT_UOP_QUEUE_WRITES] = { 332 [P4_EVENT_UOP_QUEUE_WRITES] = {
197 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), 333 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
198 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, 334 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
335 .escr_emask =
336 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) |
337 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) |
338 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
199 .cntr = { {4, 5, -1}, {6, 7, -1} }, 339 .cntr = { {4, 5, -1}, {6, 7, -1} },
200 }, 340 },
201 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { 341 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
202 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), 342 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
203 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, 343 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
344 .escr_emask =
345 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) |
346 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) |
347 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) |
348 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
204 .cntr = { {4, 5, -1}, {6, 7, -1} }, 349 .cntr = { {4, 5, -1}, {6, 7, -1} },
205 }, 350 },
206 [P4_EVENT_RETIRED_BRANCH_TYPE] = { 351 [P4_EVENT_RETIRED_BRANCH_TYPE] = {
207 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), 352 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
208 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, 353 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
354 .escr_emask =
355 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
356 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
357 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
358 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
209 .cntr = { {4, 5, -1}, {6, 7, -1} }, 359 .cntr = { {4, 5, -1}, {6, 7, -1} },
210 }, 360 },
211 [P4_EVENT_RESOURCE_STALL] = { 361 [P4_EVENT_RESOURCE_STALL] = {
212 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), 362 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
213 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, 363 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
364 .escr_emask =
365 P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
214 .cntr = { {12, 13, 16}, {14, 15, 17} }, 366 .cntr = { {12, 13, 16}, {14, 15, 17} },
215 }, 367 },
216 [P4_EVENT_WC_BUFFER] = { 368 [P4_EVENT_WC_BUFFER] = {
217 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), 369 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
218 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, 370 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
371 .escr_emask =
372 P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) |
373 P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
374 .shared = 1,
219 .cntr = { {8, 9, -1}, {10, 11, -1} }, 375 .cntr = { {8, 9, -1}, {10, 11, -1} },
220 }, 376 },
221 [P4_EVENT_B2B_CYCLES] = { 377 [P4_EVENT_B2B_CYCLES] = {
222 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), 378 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
223 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 379 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
380 .escr_emask = 0,
224 .cntr = { {0, -1, -1}, {2, -1, -1} }, 381 .cntr = { {0, -1, -1}, {2, -1, -1} },
225 }, 382 },
226 [P4_EVENT_BNR] = { 383 [P4_EVENT_BNR] = {
227 .opcode = P4_OPCODE(P4_EVENT_BNR), 384 .opcode = P4_OPCODE(P4_EVENT_BNR),
228 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 385 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
386 .escr_emask = 0,
229 .cntr = { {0, -1, -1}, {2, -1, -1} }, 387 .cntr = { {0, -1, -1}, {2, -1, -1} },
230 }, 388 },
231 [P4_EVENT_SNOOP] = { 389 [P4_EVENT_SNOOP] = {
232 .opcode = P4_OPCODE(P4_EVENT_SNOOP), 390 .opcode = P4_OPCODE(P4_EVENT_SNOOP),
233 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 391 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
392 .escr_emask = 0,
234 .cntr = { {0, -1, -1}, {2, -1, -1} }, 393 .cntr = { {0, -1, -1}, {2, -1, -1} },
235 }, 394 },
236 [P4_EVENT_RESPONSE] = { 395 [P4_EVENT_RESPONSE] = {
237 .opcode = P4_OPCODE(P4_EVENT_RESPONSE), 396 .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
238 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 397 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
398 .escr_emask = 0,
239 .cntr = { {0, -1, -1}, {2, -1, -1} }, 399 .cntr = { {0, -1, -1}, {2, -1, -1} },
240 }, 400 },
241 [P4_EVENT_FRONT_END_EVENT] = { 401 [P4_EVENT_FRONT_END_EVENT] = {
242 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), 402 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
243 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 403 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
404 .escr_emask =
405 P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) |
406 P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
244 .cntr = { {12, 13, 16}, {14, 15, 17} }, 407 .cntr = { {12, 13, 16}, {14, 15, 17} },
245 }, 408 },
246 [P4_EVENT_EXECUTION_EVENT] = { 409 [P4_EVENT_EXECUTION_EVENT] = {
247 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), 410 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
248 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 411 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
412 .escr_emask =
413 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
414 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
415 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
416 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
417 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
418 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
419 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
420 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
249 .cntr = { {12, 13, 16}, {14, 15, 17} }, 421 .cntr = { {12, 13, 16}, {14, 15, 17} },
250 }, 422 },
251 [P4_EVENT_REPLAY_EVENT] = { 423 [P4_EVENT_REPLAY_EVENT] = {
252 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), 424 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
253 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 425 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
426 .escr_emask =
427 P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) |
428 P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
254 .cntr = { {12, 13, 16}, {14, 15, 17} }, 429 .cntr = { {12, 13, 16}, {14, 15, 17} },
255 }, 430 },
256 [P4_EVENT_INSTR_RETIRED] = { 431 [P4_EVENT_INSTR_RETIRED] = {
257 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), 432 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
258 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 433 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
434 .escr_emask =
435 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
436 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) |
437 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) |
438 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
259 .cntr = { {12, 13, 16}, {14, 15, 17} }, 439 .cntr = { {12, 13, 16}, {14, 15, 17} },
260 }, 440 },
261 [P4_EVENT_UOPS_RETIRED] = { 441 [P4_EVENT_UOPS_RETIRED] = {
262 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), 442 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
263 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 443 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
444 .escr_emask =
445 P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) |
446 P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
264 .cntr = { {12, 13, 16}, {14, 15, 17} }, 447 .cntr = { {12, 13, 16}, {14, 15, 17} },
265 }, 448 },
266 [P4_EVENT_UOP_TYPE] = { 449 [P4_EVENT_UOP_TYPE] = {
267 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), 450 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
268 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, 451 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
452 .escr_emask =
453 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) |
454 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
269 .cntr = { {12, 13, 16}, {14, 15, 17} }, 455 .cntr = { {12, 13, 16}, {14, 15, 17} },
270 }, 456 },
271 [P4_EVENT_BRANCH_RETIRED] = { 457 [P4_EVENT_BRANCH_RETIRED] = {
272 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), 458 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
273 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 459 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
460 .escr_emask =
461 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) |
462 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) |
463 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) |
464 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
274 .cntr = { {12, 13, 16}, {14, 15, 17} }, 465 .cntr = { {12, 13, 16}, {14, 15, 17} },
275 }, 466 },
276 [P4_EVENT_MISPRED_BRANCH_RETIRED] = { 467 [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
277 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), 468 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
278 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 469 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
470 .escr_emask =
471 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
279 .cntr = { {12, 13, 16}, {14, 15, 17} }, 472 .cntr = { {12, 13, 16}, {14, 15, 17} },
280 }, 473 },
281 [P4_EVENT_X87_ASSIST] = { 474 [P4_EVENT_X87_ASSIST] = {
282 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), 475 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
283 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 476 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
477 .escr_emask =
478 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) |
479 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) |
480 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) |
481 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) |
482 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
284 .cntr = { {12, 13, 16}, {14, 15, 17} }, 483 .cntr = { {12, 13, 16}, {14, 15, 17} },
285 }, 484 },
286 [P4_EVENT_MACHINE_CLEAR] = { 485 [P4_EVENT_MACHINE_CLEAR] = {
287 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), 486 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
288 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 487 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
488 .escr_emask =
489 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) |
490 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) |
491 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
289 .cntr = { {12, 13, 16}, {14, 15, 17} }, 492 .cntr = { {12, 13, 16}, {14, 15, 17} },
290 }, 493 },
291 [P4_EVENT_INSTR_COMPLETED] = { 494 [P4_EVENT_INSTR_COMPLETED] = {
292 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), 495 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
293 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 496 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
497 .escr_emask =
498 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) |
499 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
294 .cntr = { {12, 13, 16}, {14, 15, 17} }, 500 .cntr = { {12, 13, 16}, {14, 15, 17} },
295 }, 501 },
296}; 502};
@@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event)
428 return config; 634 return config;
429} 635}
430 636
637/* check cpu model specifics */
638static bool p4_event_match_cpu_model(unsigned int event_idx)
639{
 640	/* INSTR_COMPLETED event only exists for models 3, 4, 6 (Prescott) */
641 if (event_idx == P4_EVENT_INSTR_COMPLETED) {
642 if (boot_cpu_data.x86_model != 3 &&
643 boot_cpu_data.x86_model != 4 &&
644 boot_cpu_data.x86_model != 6)
645 return false;
646 }
647
648 /*
649 * For info
650 * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
651 */
652
653 return true;
654}
655
431static int p4_validate_raw_event(struct perf_event *event) 656static int p4_validate_raw_event(struct perf_event *event)
432{ 657{
433 unsigned int v; 658 unsigned int v, emask;
434 659
435 /* user data may have out-of-bound event index */ 660 /* User data may have out-of-bound event index */
436 v = p4_config_unpack_event(event->attr.config); 661 v = p4_config_unpack_event(event->attr.config);
437 if (v >= ARRAY_SIZE(p4_event_bind_map)) { 662 if (v >= ARRAY_SIZE(p4_event_bind_map))
438 pr_warning("P4 PMU: Unknown event code: %d\n", v); 663 return -EINVAL;
664
665 /* It may be unsupported: */
666 if (!p4_event_match_cpu_model(v))
439 return -EINVAL; 667 return -EINVAL;
668
669 /*
670 * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
671 * in Architectural Performance Monitoring, it means not
672 * on _which_ logical cpu to count but rather _when_, ie it
673 * depends on logical cpu state -- count event if one cpu active,
674 * none, both or any, so we just allow user to pass any value
675 * desired.
676 *
677 * In turn we always set Tx_OS/Tx_USR bits bound to logical
678 * cpu without their propagation to another cpu
679 */
680
681 /*
682 * if an event is shared across the logical threads
683 * the user needs special permissions to be able to use it
684 */
685 if (p4_ht_active() && p4_event_bind_map[v].shared) {
686 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
687 return -EACCES;
440 } 688 }
441 689
690 /* ESCR EventMask bits may be invalid */
691 emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
692 if (emask & ~p4_event_bind_map[v].escr_emask)
693 return -EINVAL;
694
442 /* 695 /*
443 * it may have some screwed PEBS bits 696 * it may have some invalid PEBS bits
444 */ 697 */
445 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { 698 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
446 pr_warning("P4 PMU: PEBS are not supported yet\n");
447 return -EINVAL; 699 return -EINVAL;
448 } 700
449 v = p4_config_unpack_metric(event->attr.config); 701 v = p4_config_unpack_metric(event->attr.config);
450 if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { 702 if (v >= ARRAY_SIZE(p4_pebs_bind_map))
451 pr_warning("P4 PMU: Unknown metric code: %d\n", v);
452 return -EINVAL; 703 return -EINVAL;
453 }
454 704
455 return 0; 705 return 0;
456} 706}
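After this hunk a PERF_TYPE_RAW config on P4 is checked much more strictly: out-of-range event codes, events the CPU model does not implement, ESCR EventMask bits outside the event's escr_emask template, PEBS-enable bits and out-of-range metrics are all rejected with -EINVAL, and thread-shared events need CAP_SYS_ADMIN (under perf_paranoid_cpu) once hyper-threading is active. A hedged user-space sketch of how such a rejection surfaces; the raw config value below is a placeholder, not a known-good P4 encoding:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

/* thin wrapper; glibc provides no perf_event_open() symbol */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_RAW;
	attr.size = sizeof(attr);
	/*
	 * Placeholder raw value: on a P4 the kernel unpacks ESCR/CCCR
	 * fields from this and, after the hunk above, returns EINVAL if
	 * any EventMask bit is not listed in the event's escr_emask.
	 */
	attr.config = 0x8000000000000023ULL;

	fd = perf_event_open(&attr, 0, -1, -1, 0);
	if (fd < 0)
		printf("rejected: %s\n", strerror(errno));
	else
		printf("accepted, fd=%ld\n", fd);
	return 0;
}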
@@ -477,28 +727,30 @@ static int p4_hw_config(struct perf_event *event)
477 event->hw.config = p4_set_ht_bit(event->hw.config); 727 event->hw.config = p4_set_ht_bit(event->hw.config);
478 728
479 if (event->attr.type == PERF_TYPE_RAW) { 729 if (event->attr.type == PERF_TYPE_RAW) {
730 struct p4_event_bind *bind;
731 unsigned int esel;
732 /*
733 * Clear bits we reserve to be managed by kernel itself
734 * and never allowed from a user space
735 */
736 event->attr.config &= P4_CONFIG_MASK;
480 737
481 rc = p4_validate_raw_event(event); 738 rc = p4_validate_raw_event(event);
482 if (rc) 739 if (rc)
483 goto out; 740 goto out;
484 741
485 /* 742 /*
486 * We don't control raw events so it's up to the caller
487 * to pass sane values (and we don't count the thread number
488 * on HT machine but allow HT-compatible specifics to be
489 * passed on)
490 *
491 * Note that for RAW events we allow user to use P4_CCCR_RESERVED 743 * Note that for RAW events we allow user to use P4_CCCR_RESERVED
492 * bits since we keep additional info here (for cache events and etc) 744 * bits since we keep additional info here (for cache events and etc)
493 *
494 * XXX: HT wide things should check perf_paranoid_cpu() &&
495 * CAP_SYS_ADMIN
496 */ 745 */
497 event->hw.config |= event->attr.config & 746 event->hw.config |= event->attr.config;
498 (p4_config_pack_escr(P4_ESCR_MASK_HT) | 747 bind = p4_config_get_bind(event->attr.config);
499 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); 748 if (!bind) {
500 749 rc = -EINVAL;
501 event->hw.config &= ~P4_CCCR_FORCE_OVF; 750 goto out;
751 }
752 esel = P4_OPCODE_ESEL(bind->opcode);
753 event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
502 } 754 }
503 755
504 rc = x86_setup_perfctr(event); 756 rc = x86_setup_perfctr(event);
@@ -509,19 +761,27 @@ out:
509 761
510static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) 762static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
511{ 763{
512 int overflow = 0; 764 u64 v;
513 u32 low, high;
514
515 rdmsr(hwc->config_base + hwc->idx, low, high);
516 765
517 /* we need to check high bit for unflagged overflows */ 766 /* an official way for overflow indication */
518 if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { 767 rdmsrl(hwc->config_base, v);
519 overflow = 1; 768 if (v & P4_CCCR_OVF) {
520 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 769 wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
521 ((u64)low) & ~P4_CCCR_OVF); 770 return 1;
522 } 771 }
523 772
524 return overflow; 773 /*
774 * In some circumstances the overflow might issue an NMI but did
775 * not set P4_CCCR_OVF bit. Because a counter holds a negative value
776 * we simply check for high bit being set, if it's cleared it means
777 * the counter has reached zero value and continued counting before
778 * real NMI signal was received:
779 */
780 rdmsrl(hwc->event_base, v);
781 if (!(v & ARCH_P4_UNFLAGGED_BIT))
782 return 1;
783
784 return 0;
525} 785}
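The rewritten p4_pmu_clear_cccr_ovf() first checks the architecturally defined P4_CCCR_OVF flag and, if it is not latched, falls back to testing the counter's top bit: counters are programmed with a negative value and count upwards, so a cleared ARCH_P4_UNFLAGGED_BIT means the counter already passed zero before the NMI arrived. A standalone sketch of that fallback test, assuming 40-bit counters:

/*
 * "Unflagged overflow" test: while the 40-bit counter is still below
 * zero its top bit (bit 39) is set; once it wraps through zero the bit
 * clears, which is the fallback indication used above.
 */
#include <stdio.h>
#include <stdint.h>

#define CNT_BITS	40
#define CNT_MASK	((1ULL << CNT_BITS) - 1)
#define HIGH_BIT	(1ULL << (CNT_BITS - 1))

static int counter_overflowed(uint64_t raw_counter)
{
	return !(raw_counter & HIGH_BIT);
}

int main(void)
{
	uint64_t period = 100000;
	uint64_t counter = (0 - period) & CNT_MASK;	/* start at -(period) mod 2^40 */

	printf("before: overflowed=%d\n", counter_overflowed(counter));
	counter = (counter + period + 1) & CNT_MASK;	/* counted past zero */
	printf("after:  overflowed=%d\n", counter_overflowed(counter));
	return 0;
}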
526 786
527static void p4_pmu_disable_pebs(void) 787static void p4_pmu_disable_pebs(void)
@@ -531,13 +791,13 @@ static void p4_pmu_disable_pebs(void)
531 * 791 *
532 * It's still allowed that two threads setup same cache 792 * It's still allowed that two threads setup same cache
533 * events so we can't simply clear metrics until we knew 793 * events so we can't simply clear metrics until we knew
534 * noone is depending on us, so we need kind of counter 794 * no one is depending on us, so we need kind of counter
535 * for "ReplayEvent" users. 795 * for "ReplayEvent" users.
536 * 796 *
537 * What is more complex -- RAW events, if user (for some 797 * What is more complex -- RAW events, if user (for some
538 * reason) will pass some cache event metric with improper 798 * reason) will pass some cache event metric with improper
539 * event opcode -- it's fine from hardware point of view 799 * event opcode -- it's fine from hardware point of view
540 * but completely nonsence from "meaning" of such action. 800 * but completely nonsense from "meaning" of such action.
541 * 801 *
542 * So at moment let leave metrics turned on forever -- it's 802 * So at moment let leave metrics turned on forever -- it's
543 * ok for now but need to be revisited! 803 * ok for now but need to be revisited!
@@ -556,7 +816,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
556 * state we need to clear P4_CCCR_OVF, otherwise interrupt get 816 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
557 * asserted again and again 817 * asserted again and again
558 */ 818 */
559 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 819 (void)checking_wrmsrl(hwc->config_base,
560 (u64)(p4_config_unpack_cccr(hwc->config)) & 820 (u64)(p4_config_unpack_cccr(hwc->config)) &
561 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); 821 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
562} 822}
@@ -626,7 +886,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
626 p4_pmu_enable_pebs(hwc->config); 886 p4_pmu_enable_pebs(hwc->config);
627 887
628 (void)checking_wrmsrl(escr_addr, escr_conf); 888 (void)checking_wrmsrl(escr_addr, escr_conf);
629 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 889 (void)checking_wrmsrl(hwc->config_base,
630 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); 890 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
631} 891}
632 892
@@ -652,8 +912,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
652 int idx, handled = 0; 912 int idx, handled = 0;
653 u64 val; 913 u64 val;
654 914
655 data.addr = 0; 915 perf_sample_data_init(&data, 0);
656 data.raw = NULL;
657 916
658 cpuc = &__get_cpu_var(cpu_hw_events); 917 cpuc = &__get_cpu_var(cpu_hw_events);
659 918
@@ -687,14 +946,23 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
687 if (!x86_perf_event_set_period(event)) 946 if (!x86_perf_event_set_period(event))
688 continue; 947 continue;
689 if (perf_event_overflow(event, 1, &data, regs)) 948 if (perf_event_overflow(event, 1, &data, regs))
690 p4_pmu_disable_event(event); 949 x86_pmu_stop(event, 0);
691 } 950 }
692 951
693 if (handled) { 952 if (handled)
694 /* p4 quirk: unmask it again */
695 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
696 inc_irq_stat(apic_perf_irqs); 953 inc_irq_stat(apic_perf_irqs);
697 } 954
955 /*
956 * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
957 * been observed that the OVF bit flag has to be cleared first _before_
958 * the LVTPC can be unmasked.
959 *
960 * The reason is the NMI line will continue to be asserted while the OVF
961 * bit is set. This causes a second NMI to generate if the LVTPC is
962 * unmasked before the OVF bit is cleared, leading to unknown NMI
963 * messages.
964 */
965 apic_write(APIC_LVTPC, APIC_DM_NMI);
698 966
699 return handled; 967 return handled;
700} 968}
@@ -908,9 +1176,9 @@ static __initconst const struct x86_pmu p4_pmu = {
908 */ 1176 */
909 .num_counters = ARCH_P4_MAX_CCCR, 1177 .num_counters = ARCH_P4_MAX_CCCR,
910 .apic = 1, 1178 .apic = 1,
911 .cntval_bits = 40, 1179 .cntval_bits = ARCH_P4_CNTRVAL_BITS,
912 .cntval_mask = (1ULL << 40) - 1, 1180 .cntval_mask = ARCH_P4_CNTRVAL_MASK,
913 .max_period = (1ULL << 39) - 1, 1181 .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
914 .hw_config = p4_hw_config, 1182 .hw_config = p4_hw_config,
915 .schedule_events = p4_pmu_schedule_events, 1183 .schedule_events = p4_pmu_schedule_events,
916 /* 1184 /*
@@ -928,7 +1196,7 @@ static __init int p4_pmu_init(void)
928{ 1196{
929 unsigned int low, high; 1197 unsigned int low, high;
930 1198
931 /* If we get stripped -- indexig fails */ 1199 /* If we get stripped -- indexing fails */
932 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); 1200 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
933 1201
934 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 1202 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 34ba07be2cda..20c097e33860 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -68,7 +68,7 @@ p6_pmu_disable_event(struct perf_event *event)
68 if (cpuc->enabled) 68 if (cpuc->enabled)
69 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 69 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
70 70
71 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 71 (void)checking_wrmsrl(hwc->config_base, val);
72} 72}
73 73
74static void p6_pmu_enable_event(struct perf_event *event) 74static void p6_pmu_enable_event(struct perf_event *event)
@@ -81,7 +81,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
81 if (cpuc->enabled) 81 if (cpuc->enabled)
82 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 82 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
83 83
84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base, val);
85} 85}
86 86
87static __initconst const struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fb329e9f8494..966512b2cacf 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -16,32 +16,12 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/nmi.h> 19#include <asm/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/perf_event.h> 23#include <asm/perf_event.h>
24 24
25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr;
27 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
28 unsigned int evntsel_msr; /* the MSR to select the events to handle */
29};
30
31/* Interface defining a CPU specific perfctr watchdog */
32struct wd_ops {
33 int (*reserve)(void);
34 void (*unreserve)(void);
35 int (*setup)(unsigned nmi_hz);
36 void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
37 void (*stop)(void);
38 unsigned perfctr;
39 unsigned evntsel;
40 u64 checkbit;
41};
42
43static const struct wd_ops *wd_ops;
44
45/* 25/*
46 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 26 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
47 * offset from MSR_P4_BSU_ESCR0. 27 * offset from MSR_P4_BSU_ESCR0.
@@ -60,14 +40,14 @@ static const struct wd_ops *wd_ops;
60static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); 40static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
61static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); 41static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
62 42
63static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
64
65/* converts an msr to an appropriate reservation bit */ 43/* converts an msr to an appropriate reservation bit */
66static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) 44static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
67{ 45{
68 /* returns the bit offset of the performance counter register */ 46 /* returns the bit offset of the performance counter register */
69 switch (boot_cpu_data.x86_vendor) { 47 switch (boot_cpu_data.x86_vendor) {
70 case X86_VENDOR_AMD: 48 case X86_VENDOR_AMD:
49 if (msr >= MSR_F15H_PERF_CTR)
50 return (msr - MSR_F15H_PERF_CTR) >> 1;
71 return msr - MSR_K7_PERFCTR0; 51 return msr - MSR_K7_PERFCTR0;
72 case X86_VENDOR_INTEL: 52 case X86_VENDOR_INTEL:
73 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 53 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -92,6 +72,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
92 /* returns the bit offset of the event selection register */ 72 /* returns the bit offset of the event selection register */
93 switch (boot_cpu_data.x86_vendor) { 73 switch (boot_cpu_data.x86_vendor) {
94 case X86_VENDOR_AMD: 74 case X86_VENDOR_AMD:
75 if (msr >= MSR_F15H_PERF_CTL)
76 return (msr - MSR_F15H_PERF_CTL) >> 1;
95 return msr - MSR_K7_EVNTSEL0; 77 return msr - MSR_K7_EVNTSEL0;
96 case X86_VENDOR_INTEL: 78 case X86_VENDOR_INTEL:
97 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 79 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
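These two hunks extend the MSR-to-reservation-bit mapping to AMD Family 15h, whose event-select and counter MSRs interleave (CTL0, CTR0, CTL1, CTR1, ...), so the offset from the respective base is halved to get the counter index. A quick sketch of that mapping; the MSR base constants here are assumptions for illustration only:

/*
 * Family-15h mapping sketch: with interleaved CTL/CTR MSRs the
 * reservation-bit index is the offset from the base divided by two.
 */
#include <stdio.h>

#define MSR_F15H_PERF_CTL	0xc0010200u	/* assumed base, demo only */
#define MSR_F15H_PERF_CTR	0xc0010201u	/* assumed base, demo only */

static unsigned int perfctr_msr_to_bit(unsigned int msr)
{
	return (msr - MSR_F15H_PERF_CTR) >> 1;
}

int main(void)
{
	unsigned int msr;

	for (msr = MSR_F15H_PERF_CTR; msr <= MSR_F15H_PERF_CTR + 10; msr += 2)
		printf("MSR 0x%x -> bit %u\n", msr, perfctr_msr_to_bit(msr));
	return 0;
}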
@@ -172,624 +154,3 @@ void release_evntsel_nmi(unsigned int msr)
172 clear_bit(counter, evntsel_nmi_owner); 154 clear_bit(counter, evntsel_nmi_owner);
173} 155}
174EXPORT_SYMBOL(release_evntsel_nmi); 156EXPORT_SYMBOL(release_evntsel_nmi);
175
176void disable_lapic_nmi_watchdog(void)
177{
178 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
179
180 if (atomic_read(&nmi_active) <= 0)
181 return;
182
183 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
184
185 if (wd_ops)
186 wd_ops->unreserve();
187
188 BUG_ON(atomic_read(&nmi_active) != 0);
189}
190
191void enable_lapic_nmi_watchdog(void)
192{
193 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
194
195 /* are we already enabled */
196 if (atomic_read(&nmi_active) != 0)
197 return;
198
199 /* are we lapic aware */
200 if (!wd_ops)
201 return;
202 if (!wd_ops->reserve()) {
203 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
204 return;
205 }
206
207 on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
208 touch_nmi_watchdog();
209}
210
211/*
212 * Activate the NMI watchdog via the local APIC.
213 */
214
215static unsigned int adjust_for_32bit_ctr(unsigned int hz)
216{
217 u64 counter_val;
218 unsigned int retval = hz;
219
220 /*
221 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
222 * are writable, with higher bits sign extending from bit 31.
223 * So, we can only program the counter with 31 bit values and
224 * 32nd bit should be 1, for 33.. to be 1.
225 * Find the appropriate nmi_hz
226 */
227 counter_val = (u64)cpu_khz * 1000;
228 do_div(counter_val, retval);
229 if (counter_val > 0x7fffffffULL) {
230 u64 count = (u64)cpu_khz * 1000;
231 do_div(count, 0x7fffffffUL);
232 retval = count + 1;
233 }
234 return retval;
235}
236
237static void write_watchdog_counter(unsigned int perfctr_msr,
238 const char *descr, unsigned nmi_hz)
239{
240 u64 count = (u64)cpu_khz * 1000;
241
242 do_div(count, nmi_hz);
243 if (descr)
244 pr_debug("setting %s to -0x%08Lx\n", descr, count);
245 wrmsrl(perfctr_msr, 0 - count);
246}
247
248static void write_watchdog_counter32(unsigned int perfctr_msr,
249 const char *descr, unsigned nmi_hz)
250{
251 u64 count = (u64)cpu_khz * 1000;
252
253 do_div(count, nmi_hz);
254 if (descr)
255 pr_debug("setting %s to -0x%08Lx\n", descr, count);
256 wrmsr(perfctr_msr, (u32)(-count), 0);
257}
258
259/*
260 * AMD K7/K8/Family10h/Family11h support.
261 * AMD keeps this interface nicely stable so there is not much variety
262 */
263#define K7_EVNTSEL_ENABLE (1 << 22)
264#define K7_EVNTSEL_INT (1 << 20)
265#define K7_EVNTSEL_OS (1 << 17)
266#define K7_EVNTSEL_USR (1 << 16)
267#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
268#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
269
270static int setup_k7_watchdog(unsigned nmi_hz)
271{
272 unsigned int perfctr_msr, evntsel_msr;
273 unsigned int evntsel;
274 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
275
276 perfctr_msr = wd_ops->perfctr;
277 evntsel_msr = wd_ops->evntsel;
278
279 wrmsrl(perfctr_msr, 0UL);
280
281 evntsel = K7_EVNTSEL_INT
282 | K7_EVNTSEL_OS
283 | K7_EVNTSEL_USR
284 | K7_NMI_EVENT;
285
286 /* setup the timer */
287 wrmsr(evntsel_msr, evntsel, 0);
288 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
289
290 /* initialize the wd struct before enabling */
291 wd->perfctr_msr = perfctr_msr;
292 wd->evntsel_msr = evntsel_msr;
293 wd->cccr_msr = 0; /* unused */
294
295 /* ok, everything is initialized, announce that we're set */
296 cpu_nmi_set_wd_enabled();
297
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301
302 return 1;
303}
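
A small sketch, not part of the kernel code, that composes the same event-select word as setup_k7_watchdog() so the resulting bit pattern can be inspected; the K7_* values are copied from the definitions above:

#include <stdio.h>

#define K7_EVNTSEL_ENABLE	(1 << 22)
#define K7_EVNTSEL_INT		(1 << 20)
#define K7_EVNTSEL_OS		(1 << 17)
#define K7_EVNTSEL_USR		(1 << 16)
#define K7_NMI_EVENT		0x76	/* cycles-processor-is-running event */

int main(void)
{
	/* Same composition as setup_k7_watchdog(), minus the MSR writes. */
	unsigned int evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS |
			       K7_EVNTSEL_USR | K7_NMI_EVENT;

	printf("evntsel before enable: 0x%08x\n", evntsel);
	printf("evntsel after enable:  0x%08x\n", evntsel | K7_EVNTSEL_ENABLE);
	return 0;
}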
304
305static void single_msr_stop_watchdog(void)
306{
307 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
308
309 wrmsr(wd->evntsel_msr, 0, 0);
310}
311
312static int single_msr_reserve(void)
313{
314 if (!reserve_perfctr_nmi(wd_ops->perfctr))
315 return 0;
316
317 if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
318 release_perfctr_nmi(wd_ops->perfctr);
319 return 0;
320 }
321 return 1;
322}
323
324static void single_msr_unreserve(void)
325{
326 release_evntsel_nmi(wd_ops->evntsel);
327 release_perfctr_nmi(wd_ops->perfctr);
328}
329
330static void __kprobes
331single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
332{
333 /* start the cycle over again */
334 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
335}
336
337static const struct wd_ops k7_wd_ops = {
338 .reserve = single_msr_reserve,
339 .unreserve = single_msr_unreserve,
340 .setup = setup_k7_watchdog,
341 .rearm = single_msr_rearm,
342 .stop = single_msr_stop_watchdog,
343 .perfctr = MSR_K7_PERFCTR0,
344 .evntsel = MSR_K7_EVNTSEL0,
345 .checkbit = 1ULL << 47,
346};
347
348/*
349 * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
350 */
351#define P6_EVNTSEL0_ENABLE (1 << 22)
352#define P6_EVNTSEL_INT (1 << 20)
353#define P6_EVNTSEL_OS (1 << 17)
354#define P6_EVNTSEL_USR (1 << 16)
355#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
356#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
357
358static int setup_p6_watchdog(unsigned nmi_hz)
359{
360 unsigned int perfctr_msr, evntsel_msr;
361 unsigned int evntsel;
362 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
363
364 perfctr_msr = wd_ops->perfctr;
365 evntsel_msr = wd_ops->evntsel;
366
367 /* KVM doesn't implement this MSR */
368 if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
369 return 0;
370
371 evntsel = P6_EVNTSEL_INT
372 | P6_EVNTSEL_OS
373 | P6_EVNTSEL_USR
374 | P6_NMI_EVENT;
375
376 /* setup the timer */
377 wrmsr(evntsel_msr, evntsel, 0);
378 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
379 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
380
381 /* initialize the wd struct before enabling */
382 wd->perfctr_msr = perfctr_msr;
383 wd->evntsel_msr = evntsel_msr;
384 wd->cccr_msr = 0; /* unused */
385
386 /* ok, everything is initialized, announce that we're set */
387 cpu_nmi_set_wd_enabled();
388
389 apic_write(APIC_LVTPC, APIC_DM_NMI);
390 evntsel |= P6_EVNTSEL0_ENABLE;
391 wrmsr(evntsel_msr, evntsel, 0);
392
393 return 1;
394}
395
396static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
397{
398 /*
399 * P6-based Pentium M needs to re-unmask
400 * the APIC vector, but doing so doesn't hurt
401 * other P6 variants.
402 * ArchPerfmon/Core Duo also needs this
403 */
404 apic_write(APIC_LVTPC, APIC_DM_NMI);
405
406 /* P6/ARCH_PERFMON has 32 bit counter write */
407 write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
408}
409
410static const struct wd_ops p6_wd_ops = {
411 .reserve = single_msr_reserve,
412 .unreserve = single_msr_unreserve,
413 .setup = setup_p6_watchdog,
414 .rearm = p6_rearm,
415 .stop = single_msr_stop_watchdog,
416 .perfctr = MSR_P6_PERFCTR0,
417 .evntsel = MSR_P6_EVNTSEL0,
418 .checkbit = 1ULL << 39,
419};
420
421/*
422 * Intel P4 performance counters.
423 * By far the most complicated of all.
424 */
425#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
426#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
427#define P4_ESCR_OS (1 << 3)
428#define P4_ESCR_USR (1 << 2)
429#define P4_CCCR_OVF_PMI0 (1 << 26)
430#define P4_CCCR_OVF_PMI1 (1 << 27)
431#define P4_CCCR_THRESHOLD(N) ((N) << 20)
432#define P4_CCCR_COMPLEMENT (1 << 19)
433#define P4_CCCR_COMPARE (1 << 18)
434#define P4_CCCR_REQUIRED (3 << 16)
435#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
436#define P4_CCCR_ENABLE (1 << 12)
437#define P4_CCCR_OVF (1 << 31)
438
439#define P4_CONTROLS 18
440static unsigned int p4_controls[18] = {
441 MSR_P4_BPU_CCCR0,
442 MSR_P4_BPU_CCCR1,
443 MSR_P4_BPU_CCCR2,
444 MSR_P4_BPU_CCCR3,
445 MSR_P4_MS_CCCR0,
446 MSR_P4_MS_CCCR1,
447 MSR_P4_MS_CCCR2,
448 MSR_P4_MS_CCCR3,
449 MSR_P4_FLAME_CCCR0,
450 MSR_P4_FLAME_CCCR1,
451 MSR_P4_FLAME_CCCR2,
452 MSR_P4_FLAME_CCCR3,
453 MSR_P4_IQ_CCCR0,
454 MSR_P4_IQ_CCCR1,
455 MSR_P4_IQ_CCCR2,
456 MSR_P4_IQ_CCCR3,
457 MSR_P4_IQ_CCCR4,
458 MSR_P4_IQ_CCCR5,
459};
460/*
461 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
462 * CRU_ESCR0 (with any non-null event selector) through a complemented
463 * max threshold. [IA32-Vol3, Section 14.9.9]
464 */
465static int setup_p4_watchdog(unsigned nmi_hz)
466{
467 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
468 unsigned int evntsel, cccr_val;
469 unsigned int misc_enable, dummy;
470 unsigned int ht_num;
471 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
472
473 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
474 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
475 return 0;
476
477#ifdef CONFIG_SMP
478 /* detect which hyperthread we are on */
479 if (smp_num_siblings == 2) {
480 unsigned int ebx, apicid;
481
482 ebx = cpuid_ebx(1);
483 apicid = (ebx >> 24) & 0xff;
484 ht_num = apicid & 1;
485 } else
486#endif
487 ht_num = 0;
488
489 /*
490 * performance counters are shared resources;
491 * assign each hyperthread its own set
492 * (re-use the ESCR0 register, seems safe
493 * and keeps the cccr_val the same)
494 */
495 if (!ht_num) {
496 /* logical cpu 0 */
497 perfctr_msr = MSR_P4_IQ_PERFCTR0;
498 evntsel_msr = MSR_P4_CRU_ESCR0;
499 cccr_msr = MSR_P4_IQ_CCCR0;
500 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
501
502 /*
503 * If we're on the kdump kernel (or in a similar situation), we may
504 * still have other performance counter registers set to
505 * interrupt and they'll keep interrupting forever because
506 * of the P4_CCCR_OVF quirk. So we need to ACK all the
507 * pending interrupts and disable all the registers here,
508 * before reenabling the NMI delivery. Refer to p4_rearm()
509 * about the P4_CCCR_OVF quirk.
510 */
511 if (reset_devices) {
512 unsigned int low, high;
513 int i;
514
515 for (i = 0; i < P4_CONTROLS; i++) {
516 rdmsr(p4_controls[i], low, high);
517 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
518 wrmsr(p4_controls[i], low, high);
519 }
520 }
521 } else {
522 /* logical cpu 1 */
523 perfctr_msr = MSR_P4_IQ_PERFCTR1;
524 evntsel_msr = MSR_P4_CRU_ESCR0;
525 cccr_msr = MSR_P4_IQ_CCCR1;
526
527 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
528 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
529 cccr_val = P4_CCCR_OVF_PMI0;
530 else
531 cccr_val = P4_CCCR_OVF_PMI1;
532 cccr_val |= P4_CCCR_ESCR_SELECT(4);
533 }
534
535 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
536 | P4_ESCR_OS
537 | P4_ESCR_USR;
538
539 cccr_val |= P4_CCCR_THRESHOLD(15)
540 | P4_CCCR_COMPLEMENT
541 | P4_CCCR_COMPARE
542 | P4_CCCR_REQUIRED;
543
544 wrmsr(evntsel_msr, evntsel, 0);
545 wrmsr(cccr_msr, cccr_val, 0);
546 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
547
548 wd->perfctr_msr = perfctr_msr;
549 wd->evntsel_msr = evntsel_msr;
550 wd->cccr_msr = cccr_msr;
551
552 /* ok, everything is initialized, announce that we're set */
553 cpu_nmi_set_wd_enabled();
554
555 apic_write(APIC_LVTPC, APIC_DM_NMI);
556 cccr_val |= P4_CCCR_ENABLE;
557 wrmsr(cccr_msr, cccr_val, 0);
558 return 1;
559}
560
561static void stop_p4_watchdog(void)
562{
563 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
564 wrmsr(wd->cccr_msr, 0, 0);
565 wrmsr(wd->evntsel_msr, 0, 0);
566}
567
568static int p4_reserve(void)
569{
570 if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
571 return 0;
572#ifdef CONFIG_SMP
573 if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
574 goto fail1;
575#endif
576 if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
577 goto fail2;
578 /* RED-PEN why is ESCR1 not reserved here? */
579 return 1;
580 fail2:
581#ifdef CONFIG_SMP
582 if (smp_num_siblings > 1)
583 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
584 fail1:
585#endif
586 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
587 return 0;
588}
589
590static void p4_unreserve(void)
591{
592#ifdef CONFIG_SMP
593 if (smp_num_siblings > 1)
594 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
595#endif
596 release_evntsel_nmi(MSR_P4_CRU_ESCR0);
597 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
598}
599
600static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
601{
602 unsigned dummy;
603 /*
604 * P4 quirks:
605 * - An overflown perfctr will assert its interrupt
606 * until the OVF flag in its CCCR is cleared.
607 * - LVTPC is masked on interrupt and must be
608 * unmasked by the LVTPC handler.
609 */
610 rdmsrl(wd->cccr_msr, dummy);
611 dummy &= ~P4_CCCR_OVF;
612 wrmsrl(wd->cccr_msr, dummy);
613 apic_write(APIC_LVTPC, APIC_DM_NMI);
614 /* start the cycle over again */
615 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
616}
617
618static const struct wd_ops p4_wd_ops = {
619 .reserve = p4_reserve,
620 .unreserve = p4_unreserve,
621 .setup = setup_p4_watchdog,
622 .rearm = p4_rearm,
623 .stop = stop_p4_watchdog,
624 /* RED-PEN this is wrong for the other sibling */
625 .perfctr = MSR_P4_BPU_PERFCTR0,
626 .evntsel = MSR_P4_BSU_ESCR0,
627 .checkbit = 1ULL << 39,
628};
629
630/*
631 * Watchdog using the Intel architected PerfMon.
632 * Used for Core2 and hopefully all future Intel CPUs.
633 */
634#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
635#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
636
637static struct wd_ops intel_arch_wd_ops;
638
639static int setup_intel_arch_watchdog(unsigned nmi_hz)
640{
641 unsigned int ebx;
642 union cpuid10_eax eax;
643 unsigned int unused;
644 unsigned int perfctr_msr, evntsel_msr;
645 unsigned int evntsel;
646 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
647
648 /*
649 * Check whether the Architectural PerfMon supports
650 * Unhalted Core Cycles Event or not.
651 * NOTE: Corresponding bit = 0 in ebx indicates event present.
652 */
653 cpuid(10, &(eax.full), &ebx, &unused, &unused);
654 if ((eax.split.mask_length <
655 (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
656 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
657 return 0;
658
659 perfctr_msr = wd_ops->perfctr;
660 evntsel_msr = wd_ops->evntsel;
661
662 wrmsrl(perfctr_msr, 0UL);
663
664 evntsel = ARCH_PERFMON_EVENTSEL_INT
665 | ARCH_PERFMON_EVENTSEL_OS
666 | ARCH_PERFMON_EVENTSEL_USR
667 | ARCH_PERFMON_NMI_EVENT_SEL
668 | ARCH_PERFMON_NMI_EVENT_UMASK;
669
670 /* setup the timer */
671 wrmsr(evntsel_msr, evntsel, 0);
672 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
673 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
674
675 wd->perfctr_msr = perfctr_msr;
676 wd->evntsel_msr = evntsel_msr;
677 wd->cccr_msr = 0; /* unused */
678
679 /* ok, everything is initialized, announce that we're set */
680 cpu_nmi_set_wd_enabled();
681
682 apic_write(APIC_LVTPC, APIC_DM_NMI);
683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
684 wrmsr(evntsel_msr, evntsel, 0);
685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
686 return 1;
687}
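
The CPUID leaf 0xa probe above can be sketched from user space as well; this assumes GCC/Clang's <cpuid.h> helper and only mirrors the availability check (a clear bit 0 in EBX means the unhalted-core-cycles event is present):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx))
		return 1;	/* leaf 0xa not supported */

	unsigned int version     = eax & 0xff;		/* arch perfmon version */
	unsigned int mask_length = (eax >> 24) & 0xff;	/* valid bits in ebx */

	/* A 0 bit in EBX means the corresponding event IS available. */
	int cycles_event = (mask_length >= 1) && !(ebx & 1);

	printf("perfmon v%u, unhalted-core-cycles event %savailable\n",
	       version, cycles_event ? "" : "not ");
	return 0;
}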
688
689static struct wd_ops intel_arch_wd_ops __read_mostly = {
690 .reserve = single_msr_reserve,
691 .unreserve = single_msr_unreserve,
692 .setup = setup_intel_arch_watchdog,
693 .rearm = p6_rearm,
694 .stop = single_msr_stop_watchdog,
695 .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
696 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
697};
698
699static void probe_nmi_watchdog(void)
700{
701 switch (boot_cpu_data.x86_vendor) {
702 case X86_VENDOR_AMD:
703 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
704 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
705 return;
706 wd_ops = &k7_wd_ops;
707 break;
708 case X86_VENDOR_INTEL:
709 /* Work around CPUs where perfctr1 doesn't have a working enable
710 * bit, as described in the following errata:
711 * AE49 Core Duo and Intel Core Solo 65 nm
712 * AN49 Intel Pentium Dual-Core
713 * AF49 Dual-Core Intel Xeon Processor LV
714 */
715 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
716 ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
717 boot_cpu_data.x86_mask == 4))) {
718 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
719 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
720 }
721 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
722 wd_ops = &intel_arch_wd_ops;
723 break;
724 }
725 switch (boot_cpu_data.x86) {
726 case 6:
727 if (boot_cpu_data.x86_model > 13)
728 return;
729
730 wd_ops = &p6_wd_ops;
731 break;
732 case 15:
733 wd_ops = &p4_wd_ops;
734 break;
735 default:
736 return;
737 }
738 break;
739 }
740}
741
742/* Interface to nmi.c */
743
744int lapic_watchdog_init(unsigned nmi_hz)
745{
746 if (!wd_ops) {
747 probe_nmi_watchdog();
748 if (!wd_ops) {
749 printk(KERN_INFO "NMI watchdog: CPU not supported\n");
750 return -1;
751 }
752
753 if (!wd_ops->reserve()) {
754 printk(KERN_ERR
755 "NMI watchdog: cannot reserve perfctrs\n");
756 return -1;
757 }
758 }
759
760 if (!(wd_ops->setup(nmi_hz))) {
761 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
762 raw_smp_processor_id());
763 return -1;
764 }
765
766 return 0;
767}
768
769void lapic_watchdog_stop(void)
770{
771 if (wd_ops)
772 wd_ops->stop();
773}
774
775unsigned lapic_adjust_nmi_hz(unsigned hz)
776{
777 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
778 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
779 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
780 hz = adjust_for_32bit_ctr(hz);
781 return hz;
782}
783
784int __kprobes lapic_wd_event(unsigned nmi_hz)
785{
786 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
787 u64 ctr;
788
789 rdmsrl(wd->perfctr_msr, ctr);
790 if (ctr & wd_ops->checkbit) /* perfctr still running? */
791 return 0;
792
793 wd_ops->rearm(wd, nmi_hz);
794 return 1;
795}
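
A rough sketch, with made-up numbers, of why the checkbit test above works: the counter is armed with -period, so its top implemented bit stays set while it counts up toward zero and clears once it overflows and raises the NMI.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t checkbit = 1ULL << 47;	/* K7 counter width, per k7_wd_ops */
	uint64_t period = 3000000000ULL;	/* e.g. one second at an assumed 3 GHz */

	/* The watchdog writes -period; the counter counts up toward 0. */
	uint64_t ctr = (uint64_t)(-(int64_t)period) & ((1ULL << 48) - 1);

	printf("just armed:   checkbit %s\n",
	       (ctr & checkbit) ? "set (still running)" : "clear (overflowed)");

	ctr = (ctr + period) & ((1ULL << 48) - 1);	/* simulate the overflow */
	printf("after period: checkbit %s\n",
	       (ctr & checkbit) ? "set (still running)" : "clear (overflowed)");
	return 0;
}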
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d49079515122..c7f64e6f537a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -44,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, 44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, 45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, 46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
47 { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
48 { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
49 { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
50 { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
51 { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
52 { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
47 { 0, 0, 0, 0, 0 } 53 { 0, 0, 0, 0, 0 }
48 }; 54 };
49 55
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 227b0448960d..d22d0c4edcfd 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -86,7 +86,7 @@ static void __init vmware_platform_setup(void)
86} 86}
87 87
88/* 88/*
89 * While checking the dmi string infomation, just checking the product 89 * While checking the dmi string information, just checking the product
90 * serial key should be enough, as this will always have a VMware 90 * serial key should be enough, as this will always have a VMware
91 * specific string when running under VMware hypervisor. 91 * specific string when running under VMware hypervisor.
92 */ 92 */
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 1b7b31ab7d86..212a6a42527c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,7 +33,6 @@
33#include <linux/init.h> 33#include <linux/init.h>
34#include <linux/poll.h> 34#include <linux/poll.h>
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/smp_lock.h>
37#include <linux/major.h> 36#include <linux/major.h>
38#include <linux/fs.h> 37#include <linux/fs.h>
39#include <linux/device.h> 38#include <linux/device.h>
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 67414550c3cc..642f75a68cd5 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -14,9 +14,6 @@
14 14
15static void *kdump_buf_page; 15static void *kdump_buf_page;
16 16
17/* Stores the physical address of elf header of crash image. */
18unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
19
20static inline bool is_crashed_pfn_valid(unsigned long pfn) 17static inline bool is_crashed_pfn_valid(unsigned long pfn)
21{ 18{
22#ifndef CONFIG_X86_PAE 19#ifndef CONFIG_X86_PAE
@@ -61,7 +58,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
61 if (!is_crashed_pfn_valid(pfn)) 58 if (!is_crashed_pfn_valid(pfn))
62 return -EFAULT; 59 return -EFAULT;
63 60
64 vaddr = kmap_atomic_pfn(pfn, KM_PTE0); 61 vaddr = kmap_atomic_pfn(pfn);
65 62
66 if (!userbuf) { 63 if (!userbuf) {
67 memcpy(buf, (vaddr + offset), csize); 64 memcpy(buf, (vaddr + offset), csize);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada65..afa64adb75ee 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,9 +10,6 @@
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/io.h> 11#include <linux/io.h>
12 12
13/* Stores the physical address of elf header of crash image. */
14unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
15
16/** 13/**
17 * copy_oldmem_page - copy one page from "oldmem" 14 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied 15 * @pfn: page frame number to be copied
@@ -34,7 +31,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
34 if (!csize) 31 if (!csize)
35 return 0; 32 return 0;
36 33
37 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 34 vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
38 if (!vaddr) 35 if (!vaddr)
39 return -ENOMEM; 36 return -ENOMEM;
40 37
@@ -46,6 +43,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
46 } else 43 } else
47 memcpy(buf, vaddr + offset, csize); 44 memcpy(buf, vaddr + offset, csize);
48 45
46 set_iounmap_nonlazy();
49 iounmap(vaddr); 47 iounmap(vaddr);
50 return csize; 48 return csize;
51} 49}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 000000000000..9aeb78a23de4
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,452 @@
1/*
2 * Architecture specific OF callbacks.
3 */
4#include <linux/bootmem.h>
5#include <linux/io.h>
6#include <linux/interrupt.h>
7#include <linux/list.h>
8#include <linux/of.h>
9#include <linux/of_fdt.h>
10#include <linux/of_address.h>
11#include <linux/of_platform.h>
12#include <linux/of_irq.h>
13#include <linux/slab.h>
14#include <linux/pci.h>
15#include <linux/of_pci.h>
16#include <linux/initrd.h>
17
18#include <asm/hpet.h>
19#include <asm/irq_controller.h>
20#include <asm/apic.h>
21#include <asm/pci_x86.h>
22
23__initdata u64 initial_dtb;
24char __initdata cmd_line[COMMAND_LINE_SIZE];
25static LIST_HEAD(irq_domains);
26static DEFINE_RAW_SPINLOCK(big_irq_lock);
27
28int __initdata of_ioapic;
29
30#ifdef CONFIG_X86_IO_APIC
31static void add_interrupt_host(struct irq_domain *ih)
32{
33 unsigned long flags;
34
35 raw_spin_lock_irqsave(&big_irq_lock, flags);
36 list_add(&ih->l, &irq_domains);
37 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
38}
39#endif
40
41static struct irq_domain *get_ih_from_node(struct device_node *controller)
42{
43 struct irq_domain *ih, *found = NULL;
44 unsigned long flags;
45
46 raw_spin_lock_irqsave(&big_irq_lock, flags);
47 list_for_each_entry(ih, &irq_domains, l) {
48 if (ih->controller == controller) {
49 found = ih;
50 break;
51 }
52 }
53 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
54 return found;
55}
56
57unsigned int irq_create_of_mapping(struct device_node *controller,
58 const u32 *intspec, unsigned int intsize)
59{
60 struct irq_domain *ih;
61 u32 virq, type;
62 int ret;
63
64 ih = get_ih_from_node(controller);
65 if (!ih)
66 return 0;
67 ret = ih->xlate(ih, intspec, intsize, &virq, &type);
68 if (ret)
69 return 0;
70 if (type == IRQ_TYPE_NONE)
71 return virq;
72 irq_set_irq_type(virq, type);
73 return virq;
74}
75EXPORT_SYMBOL_GPL(irq_create_of_mapping);
76
77unsigned long pci_address_to_pio(phys_addr_t address)
78{
79 /*
80 * The ioport address can be directly used by inX / outX
81 */
82 BUG_ON(address >= (1 << 16));
83 return (unsigned long)address;
84}
85EXPORT_SYMBOL_GPL(pci_address_to_pio);
86
87void __init early_init_dt_scan_chosen_arch(unsigned long node)
88{
89 BUG();
90}
91
92void __init early_init_dt_add_memory_arch(u64 base, u64 size)
93{
94 BUG();
95}
96
97void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
98{
99 return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
100}
101
102#ifdef CONFIG_BLK_DEV_INITRD
103void __init early_init_dt_setup_initrd_arch(unsigned long start,
104 unsigned long end)
105{
106 initrd_start = (unsigned long)__va(start);
107 initrd_end = (unsigned long)__va(end);
108 initrd_below_start_ok = 1;
109}
110#endif
111
112void __init add_dtb(u64 data)
113{
114 initial_dtb = data + offsetof(struct setup_data, data);
115}
116
117/*
118 * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
119 */
120static struct of_device_id __initdata ce4100_ids[] = {
121 { .compatible = "intel,ce4100-cp", },
122 { .compatible = "isa", },
123 { .compatible = "pci", },
124 {},
125};
126
127static int __init add_bus_probe(void)
128{
129 if (!of_have_populated_dt())
130 return 0;
131
132 return of_platform_bus_probe(NULL, ce4100_ids, NULL);
133}
134module_init(add_bus_probe);
135
136#ifdef CONFIG_PCI
137static int x86_of_pci_irq_enable(struct pci_dev *dev)
138{
139 struct of_irq oirq;
140 u32 virq;
141 int ret;
142 u8 pin;
143
144 ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
145 if (ret)
146 return ret;
147 if (!pin)
148 return 0;
149
150 ret = of_irq_map_pci(dev, &oirq);
151 if (ret)
152 return ret;
153
154 virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
155 oirq.size);
156 if (virq == 0)
157 return -EINVAL;
158 dev->irq = virq;
159 return 0;
160}
161
162static void x86_of_pci_irq_disable(struct pci_dev *dev)
163{
164}
165
166void __cpuinit x86_of_pci_init(void)
167{
168 struct device_node *np;
169
170 pcibios_enable_irq = x86_of_pci_irq_enable;
171 pcibios_disable_irq = x86_of_pci_irq_disable;
172
173 for_each_node_by_type(np, "pci") {
174 const void *prop;
175 struct pci_bus *bus;
176 unsigned int bus_min;
177 struct device_node *child;
178
179 prop = of_get_property(np, "bus-range", NULL);
180 if (!prop)
181 continue;
182 bus_min = be32_to_cpup(prop);
183
184 bus = pci_find_bus(0, bus_min);
185 if (!bus) {
186 printk(KERN_ERR "Can't find a node for bus %s.\n",
187 np->full_name);
188 continue;
189 }
190
191 if (bus->self)
192 bus->self->dev.of_node = np;
193 else
194 bus->dev.of_node = np;
195
196 for_each_child_of_node(np, child) {
197 struct pci_dev *dev;
198 u32 devfn;
199
200 prop = of_get_property(child, "reg", NULL);
201 if (!prop)
202 continue;
203
204 devfn = (be32_to_cpup(prop) >> 8) & 0xff;
205 dev = pci_get_slot(bus, devfn);
206 if (!dev)
207 continue;
208 dev->dev.of_node = child;
209 pci_dev_put(dev);
210 }
211 }
212}
213#endif
214
215static void __init dtb_setup_hpet(void)
216{
217#ifdef CONFIG_HPET_TIMER
218 struct device_node *dn;
219 struct resource r;
220 int ret;
221
222 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
223 if (!dn)
224 return;
225 ret = of_address_to_resource(dn, 0, &r);
226 if (ret) {
227 WARN_ON(1);
228 return;
229 }
230 hpet_address = r.start;
231#endif
232}
233
234static void __init dtb_lapic_setup(void)
235{
236#ifdef CONFIG_X86_LOCAL_APIC
237 struct device_node *dn;
238 struct resource r;
239 int ret;
240
241 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
242 if (!dn)
243 return;
244
245 ret = of_address_to_resource(dn, 0, &r);
246 if (WARN_ON(ret))
247 return;
248
249 /* Did the boot loader setup the local APIC ? */
250 if (!cpu_has_apic) {
251 if (apic_force_enable(r.start))
252 return;
253 }
254 smp_found_config = 1;
255 pic_mode = 1;
256 register_lapic_address(r.start);
257 generic_processor_info(boot_cpu_physical_apicid,
258 GET_APIC_VERSION(apic_read(APIC_LVR)));
259#endif
260}
261
262#ifdef CONFIG_X86_IO_APIC
263static unsigned int ioapic_id;
264
265static void __init dtb_add_ioapic(struct device_node *dn)
266{
267 struct resource r;
268 int ret;
269
270 ret = of_address_to_resource(dn, 0, &r);
271 if (ret) {
272 printk(KERN_ERR "Can't obtain address from node %s.\n",
273 dn->full_name);
274 return;
275 }
276 mp_register_ioapic(++ioapic_id, r.start, gsi_top);
277}
278
279static void __init dtb_ioapic_setup(void)
280{
281 struct device_node *dn;
282
283 for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
284 dtb_add_ioapic(dn);
285
286 if (nr_ioapics) {
287 of_ioapic = 1;
288 return;
289 }
290 printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
291}
292#else
293static void __init dtb_ioapic_setup(void) {}
294#endif
295
296static void __init dtb_apic_setup(void)
297{
298 dtb_lapic_setup();
299 dtb_ioapic_setup();
300}
301
302#ifdef CONFIG_OF_FLATTREE
303static void __init x86_flattree_get_config(void)
304{
305 u32 size, map_len;
306 void *new_dtb;
307
308 if (!initial_dtb)
309 return;
310
311 map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK),
312 (u64)sizeof(struct boot_param_header));
313
314 initial_boot_params = early_memremap(initial_dtb, map_len);
315 size = be32_to_cpu(initial_boot_params->totalsize);
316 if (map_len < size) {
317 early_iounmap(initial_boot_params, map_len);
318 initial_boot_params = early_memremap(initial_dtb, size);
319 map_len = size;
320 }
321
322 new_dtb = alloc_bootmem(size);
323 memcpy(new_dtb, initial_boot_params, size);
324 early_iounmap(initial_boot_params, map_len);
325
326 initial_boot_params = new_dtb;
327
328 /* root level address cells */
329 of_scan_flat_dt(early_init_dt_scan_root, NULL);
330
331 unflatten_device_tree();
332}
333#else
334static inline void x86_flattree_get_config(void) { }
335#endif
336
337void __init x86_dtb_init(void)
338{
339 x86_flattree_get_config();
340
341 if (!of_have_populated_dt())
342 return;
343
344 dtb_setup_hpet();
345 dtb_apic_setup();
346}
347
348#ifdef CONFIG_X86_IO_APIC
349
350struct of_ioapic_type {
351 u32 out_type;
352 u32 trigger;
353 u32 polarity;
354};
355
356static struct of_ioapic_type of_ioapic_type[] =
357{
358 {
359 .out_type = IRQ_TYPE_EDGE_RISING,
360 .trigger = IOAPIC_EDGE,
361 .polarity = 1,
362 },
363 {
364 .out_type = IRQ_TYPE_LEVEL_LOW,
365 .trigger = IOAPIC_LEVEL,
366 .polarity = 0,
367 },
368 {
369 .out_type = IRQ_TYPE_LEVEL_HIGH,
370 .trigger = IOAPIC_LEVEL,
371 .polarity = 1,
372 },
373 {
374 .out_type = IRQ_TYPE_EDGE_FALLING,
375 .trigger = IOAPIC_EDGE,
376 .polarity = 0,
377 },
378};
379
380static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
381 u32 *out_hwirq, u32 *out_type)
382{
383 struct mp_ioapic_gsi *gsi_cfg;
384 struct io_apic_irq_attr attr;
385 struct of_ioapic_type *it;
386 u32 line, idx, type;
387
388 if (intsize < 2)
389 return -EINVAL;
390
391 line = *intspec;
392 idx = (u32) id->priv;
393 gsi_cfg = mp_ioapic_gsi_routing(idx);
394 *out_hwirq = line + gsi_cfg->gsi_base;
395
396 intspec++;
397 type = *intspec;
398
399 if (type >= ARRAY_SIZE(of_ioapic_type))
400 return -EINVAL;
401
402 it = of_ioapic_type + type;
403 *out_type = it->out_type;
404
405 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
406
407 return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
408}
409
410static void __init ioapic_add_ofnode(struct device_node *np)
411{
412 struct resource r;
413 int i, ret;
414
415 ret = of_address_to_resource(np, 0, &r);
416 if (ret) {
417 printk(KERN_ERR "Failed to obtain address for %s\n",
418 np->full_name);
419 return;
420 }
421
422 for (i = 0; i < nr_ioapics; i++) {
423 if (r.start == mpc_ioapic_addr(i)) {
424 struct irq_domain *id;
425
426 id = kzalloc(sizeof(*id), GFP_KERNEL);
427 BUG_ON(!id);
428 id->controller = np;
429 id->xlate = ioapic_xlate;
430 id->priv = (void *)i;
431 add_interrupt_host(id);
432 return;
433 }
434 }
435 printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
436}
437
438void __init x86_add_irq_domains(void)
439{
440 struct device_node *dp;
441
442 if (!of_have_populated_dt())
443 return;
444
445 for_each_node_with_property(dp, "interrupt-controller") {
446 if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
447 ioapic_add_ofnode(dp);
448 }
449}
450#else
451void __init x86_add_irq_domains(void) { }
452#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6e8752c1bd52..1aae78f775fc 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,7 +27,7 @@ static int die_counter;
27 27
28void printk_address(unsigned long address, int reliable) 28void printk_address(unsigned long address, int reliable)
29{ 29{
30 printk(" [<%p>] %s%pS\n", (void *) address, 30 printk(" [<%p>] %s%pB\n", (void *) address,
31 reliable ? "" : "? ", (void *) address); 31 reliable ? "" : "? ", (void *) address);
32} 32}
33 33
@@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo,
135} 135}
136EXPORT_SYMBOL_GPL(print_context_stack_bp); 136EXPORT_SYMBOL_GPL(print_context_stack_bp);
137 137
138
139static void
140print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
141{
142 printk(data);
143 print_symbol(msg, symbol);
144 printk("\n");
145}
146
147static void print_trace_warning(void *data, char *msg)
148{
149 printk("%s%s\n", (char *)data, msg);
150}
151
152static int print_trace_stack(void *data, char *name) 138static int print_trace_stack(void *data, char *name)
153{ 139{
154 printk("%s <%s> ", (char *)data, name); 140 printk("%s <%s> ", (char *)data, name);
@@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
166} 152}
167 153
168static const struct stacktrace_ops print_trace_ops = { 154static const struct stacktrace_ops print_trace_ops = {
169 .warning = print_trace_warning,
170 .warning_symbol = print_trace_warning_symbol,
171 .stack = print_trace_stack, 155 .stack = print_trace_stack,
172 .address = print_trace_address, 156 .address = print_trace_address,
173 .walk_stack = print_context_stack, 157 .walk_stack = print_context_stack,
@@ -197,14 +181,10 @@ void show_stack(struct task_struct *task, unsigned long *sp)
197 */ 181 */
198void dump_stack(void) 182void dump_stack(void)
199{ 183{
200 unsigned long bp = 0; 184 unsigned long bp;
201 unsigned long stack; 185 unsigned long stack;
202 186
203#ifdef CONFIG_FRAME_POINTER 187 bp = stack_frame(current, NULL);
204 if (!bp)
205 get_bp(bp);
206#endif
207
208 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 188 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
209 current->pid, current->comm, print_tainted(), 189 current->pid, current->comm, print_tainted(),
210 init_utsname()->release, 190 init_utsname()->release,
@@ -240,6 +220,7 @@ unsigned __kprobes long oops_begin(void)
240 bust_spinlocks(1); 220 bust_spinlocks(1);
241 return flags; 221 return flags;
242} 222}
223EXPORT_SYMBOL_GPL(oops_begin);
243 224
244void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 225void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
245{ 226{
@@ -282,7 +263,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
282 printk("DEBUG_PAGEALLOC"); 263 printk("DEBUG_PAGEALLOC");
283#endif 264#endif
284 printk("\n"); 265 printk("\n");
285 sysfs_printk_last_file();
286 if (notify_die(DIE_OOPS, str, regs, err, 266 if (notify_die(DIE_OOPS, str, regs, err,
287 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) 267 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
288 return 1; 268 return 1;
@@ -325,41 +305,6 @@ void die(const char *str, struct pt_regs *regs, long err)
325 oops_end(flags, regs, sig); 305 oops_end(flags, regs, sig);
326} 306}
327 307
328void notrace __kprobes
329die_nmi(char *str, struct pt_regs *regs, int do_panic)
330{
331 unsigned long flags;
332
333 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
334 return;
335
336 /*
337 * We are in trouble anyway, lets at least try
338 * to get a message out.
339 */
340 flags = oops_begin();
341 printk(KERN_EMERG "%s", str);
342 printk(" on CPU%d, ip %08lx, registers:\n",
343 smp_processor_id(), regs->ip);
344 show_registers(regs);
345 oops_end(flags, regs, 0);
346 if (do_panic || panic_on_oops)
347 panic("Non maskable interrupt");
348 nmi_exit();
349 local_irq_enable();
350 do_exit(SIGBUS);
351}
352
353static int __init oops_setup(char *s)
354{
355 if (!s)
356 return -EINVAL;
357 if (!strcmp(s, "panic"))
358 panic_on_oops = 1;
359 return 0;
360}
361early_param("oops", oops_setup);
362
363static int __init kstack_setup(char *s) 308static int __init kstack_setup(char *s)
364{ 309{
365 if (!s) 310 if (!s)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 0f6376ffa2d9..3b97a80ce329 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -34,17 +34,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
34 stack = (unsigned long *)task->thread.sp; 34 stack = (unsigned long *)task->thread.sp;
35 } 35 }
36 36
37#ifdef CONFIG_FRAME_POINTER 37 if (!bp)
38 if (!bp) { 38 bp = stack_frame(task, regs);
39 if (task == current) {
40 /* Grab bp right from our regs */
41 get_bp(bp);
42 } else {
43 /* bp is the last reg pushed by switch_to */
44 bp = *(unsigned long *) task->thread.sp;
45 }
46 }
47#endif
48 39
49 for (;;) { 40 for (;;) {
50 struct thread_info *context; 41 struct thread_info *context;
@@ -82,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
82 if (kstack_end(stack)) 73 if (kstack_end(stack))
83 break; 74 break;
84 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 75 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
85 printk("\n%s", log_lvl); 76 printk(KERN_CONT "\n");
86 printk(" %08lx", *stack++); 77 printk(KERN_CONT " %08lx", *stack++);
87 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
88 } 79 }
89 printk("\n"); 80 printk(KERN_CONT "\n");
90 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 81 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
91} 82}
92 83
@@ -112,8 +103,7 @@ void show_registers(struct pt_regs *regs)
112 u8 *ip; 103 u8 *ip;
113 104
114 printk(KERN_EMERG "Stack:\n"); 105 printk(KERN_EMERG "Stack:\n");
115 show_stack_log_lvl(NULL, regs, &regs->sp, 106 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
116 0, KERN_EMERG);
117 107
118 printk(KERN_EMERG "Code: "); 108 printk(KERN_EMERG "Code: ");
119 109
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 57a21f11c791..e71c98d3c0d2 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -149,29 +149,19 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
149 unsigned used = 0; 149 unsigned used = 0;
150 struct thread_info *tinfo; 150 struct thread_info *tinfo;
151 int graph = 0; 151 int graph = 0;
152 unsigned long dummy;
152 153
153 if (!task) 154 if (!task)
154 task = current; 155 task = current;
155 156
156 if (!stack) { 157 if (!stack) {
157 unsigned long dummy;
158 stack = &dummy; 158 stack = &dummy;
159 if (task && task != current) 159 if (task && task != current)
160 stack = (unsigned long *)task->thread.sp; 160 stack = (unsigned long *)task->thread.sp;
161 } 161 }
162 162
163#ifdef CONFIG_FRAME_POINTER 163 if (!bp)
164 if (!bp) { 164 bp = stack_frame(task, regs);
165 if (task == current) {
166 /* Grab bp right from our regs */
167 get_bp(bp);
168 } else {
169 /* bp is the last reg pushed by switch_to */
170 bp = *(unsigned long *) task->thread.sp;
171 }
172 }
173#endif
174
175 /* 165 /*
176 * Print function call entries in all stacks, starting at the 166 * Print function call entries in all stacks, starting at the
177 * current stack address. If the stacks consist of nested 167 * current stack address. If the stacks consist of nested
@@ -265,20 +255,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
265 if (stack >= irq_stack && stack <= irq_stack_end) { 255 if (stack >= irq_stack && stack <= irq_stack_end) {
266 if (stack == irq_stack_end) { 256 if (stack == irq_stack_end) {
267 stack = (unsigned long *) (irq_stack_end[-1]); 257 stack = (unsigned long *) (irq_stack_end[-1]);
268 printk(" <EOI> "); 258 printk(KERN_CONT " <EOI> ");
269 } 259 }
270 } else { 260 } else {
271 if (((long) stack & (THREAD_SIZE-1)) == 0) 261 if (((long) stack & (THREAD_SIZE-1)) == 0)
272 break; 262 break;
273 } 263 }
274 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 264 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
275 printk("\n%s", log_lvl); 265 printk(KERN_CONT "\n");
276 printk(" %016lx", *stack++); 266 printk(KERN_CONT " %016lx", *stack++);
277 touch_nmi_watchdog(); 267 touch_nmi_watchdog();
278 } 268 }
279 preempt_enable(); 269 preempt_enable();
280 270
281 printk("\n"); 271 printk(KERN_CONT "\n");
282 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 272 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
283} 273}
284 274
@@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs)
308 298
309 printk(KERN_EMERG "Stack:\n"); 299 printk(KERN_EMERG "Stack:\n");
310 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
311 regs->bp, KERN_EMERG); 301 0, KERN_EMERG);
312 302
313 printk(KERN_EMERG "Code: "); 303 printk(KERN_EMERG "Code: ");
314 304
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0d6fc71bedb1..3e2ef8425316 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -11,10 +11,13 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/crash_dump.h>
14#include <linux/bootmem.h> 15#include <linux/bootmem.h>
15#include <linux/pfn.h> 16#include <linux/pfn.h>
16#include <linux/suspend.h> 17#include <linux/suspend.h>
18#include <linux/acpi.h>
17#include <linux/firmware-map.h> 19#include <linux/firmware-map.h>
20#include <linux/memblock.h>
18 21
19#include <asm/e820.h> 22#include <asm/e820.h>
20#include <asm/proto.h> 23#include <asm/proto.h>
@@ -665,21 +668,15 @@ __init void e820_setup_gap(void)
665 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of 668 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
666 * linked list of struct setup_data, which is parsed here. 669 * linked list of struct setup_data, which is parsed here.
667 */ 670 */
668void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) 671void __init parse_e820_ext(struct setup_data *sdata)
669{ 672{
670 u32 map_len;
671 int entries; 673 int entries;
672 struct e820entry *extmap; 674 struct e820entry *extmap;
673 675
674 entries = sdata->len / sizeof(struct e820entry); 676 entries = sdata->len / sizeof(struct e820entry);
675 map_len = sdata->len + sizeof(struct setup_data);
676 if (map_len > PAGE_SIZE)
677 sdata = early_ioremap(pa_data, map_len);
678 extmap = (struct e820entry *)(sdata->data); 677 extmap = (struct e820entry *)(sdata->data);
679 __append_e820_map(extmap, entries); 678 __append_e820_map(extmap, entries);
680 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 679 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
681 if (map_len > PAGE_SIZE)
682 early_iounmap(sdata, map_len);
683 printk(KERN_INFO "extended physical RAM map:\n"); 680 printk(KERN_INFO "extended physical RAM map:\n");
684 e820_print_map("extended"); 681 e820_print_map("extended");
685} 682}
@@ -738,73 +735,7 @@ core_initcall(e820_mark_nvs_memory);
738#endif 735#endif
739 736
740/* 737/*
741 * Find a free area with specified alignment in a specific range. 738 * pre allocated 4k and reserved it in memblock and e820_saved
742 */
743u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
744{
745 int i;
746
747 for (i = 0; i < e820.nr_map; i++) {
748 struct e820entry *ei = &e820.map[i];
749 u64 addr;
750 u64 ei_start, ei_last;
751
752 if (ei->type != E820_RAM)
753 continue;
754
755 ei_last = ei->addr + ei->size;
756 ei_start = ei->addr;
757 addr = find_early_area(ei_start, ei_last, start, end,
758 size, align);
759
760 if (addr != -1ULL)
761 return addr;
762 }
763 return -1ULL;
764}
765
766u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
767{
768 return find_e820_area(start, end, size, align);
769}
770
771u64 __init get_max_mapped(void)
772{
773 u64 end = max_pfn_mapped;
774
775 end <<= PAGE_SHIFT;
776
777 return end;
778}
779/*
780 * Find next free range after *start
781 */
782u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
783{
784 int i;
785
786 for (i = 0; i < e820.nr_map; i++) {
787 struct e820entry *ei = &e820.map[i];
788 u64 addr;
789 u64 ei_start, ei_last;
790
791 if (ei->type != E820_RAM)
792 continue;
793
794 ei_last = ei->addr + ei->size;
795 ei_start = ei->addr;
796 addr = find_early_area_size(ei_start, ei_last, start,
797 sizep, align);
798
799 if (addr != -1ULL)
800 return addr;
801 }
802
803 return -1ULL;
804}
805
806/*
807 * pre allocated 4k and reserved it in e820
808 */ 739 */
809u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) 740u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
810{ 741{
@@ -813,8 +744,8 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
813 u64 start; 744 u64 start;
814 745
815 for (start = startt; ; start += size) { 746 for (start = startt; ; start += size) {
816 start = find_e820_area_size(start, &size, align); 747 start = memblock_x86_find_in_range_size(start, &size, align);
817 if (!(start + 1)) 748 if (start == MEMBLOCK_ERROR)
818 return 0; 749 return 0;
819 if (size >= sizet) 750 if (size >= sizet)
820 break; 751 break;
@@ -830,10 +761,9 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
830 addr = round_down(start + size - sizet, align); 761 addr = round_down(start + size - sizet, align);
831 if (addr < start) 762 if (addr < start)
832 return 0; 763 return 0;
833 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); 764 memblock_x86_reserve_range(addr, addr + sizet, "new next");
834 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); 765 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
835 printk(KERN_INFO "update e820 for early_reserve_e820\n"); 766 printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
836 update_e820();
837 update_e820_saved(); 767 update_e820_saved();
838 768
839 return addr; 769 return addr;
@@ -895,74 +825,6 @@ unsigned long __init e820_end_of_low_ram_pfn(void)
895{ 825{
896 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); 826 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
897} 827}
898/*
899 * Finds an active region in the address range from start_pfn to last_pfn and
900 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
901 */
902int __init e820_find_active_region(const struct e820entry *ei,
903 unsigned long start_pfn,
904 unsigned long last_pfn,
905 unsigned long *ei_startpfn,
906 unsigned long *ei_endpfn)
907{
908 u64 align = PAGE_SIZE;
909
910 *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
911 *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
912
913 /* Skip map entries smaller than a page */
914 if (*ei_startpfn >= *ei_endpfn)
915 return 0;
916
917 /* Skip if map is outside the node */
918 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
919 *ei_startpfn >= last_pfn)
920 return 0;
921
922 /* Check for overlaps */
923 if (*ei_startpfn < start_pfn)
924 *ei_startpfn = start_pfn;
925 if (*ei_endpfn > last_pfn)
926 *ei_endpfn = last_pfn;
927
928 return 1;
929}
930
931/* Walk the e820 map and register active regions within a node */
932void __init e820_register_active_regions(int nid, unsigned long start_pfn,
933 unsigned long last_pfn)
934{
935 unsigned long ei_startpfn;
936 unsigned long ei_endpfn;
937 int i;
938
939 for (i = 0; i < e820.nr_map; i++)
940 if (e820_find_active_region(&e820.map[i],
941 start_pfn, last_pfn,
942 &ei_startpfn, &ei_endpfn))
943 add_active_range(nid, ei_startpfn, ei_endpfn);
944}
945
946/*
947 * Find the hole size (in bytes) in the memory range.
948 * @start: starting address of the memory range to scan
949 * @end: ending address of the memory range to scan
950 */
951u64 __init e820_hole_size(u64 start, u64 end)
952{
953 unsigned long start_pfn = start >> PAGE_SHIFT;
954 unsigned long last_pfn = end >> PAGE_SHIFT;
955 unsigned long ei_startpfn, ei_endpfn, ram = 0;
956 int i;
957
958 for (i = 0; i < e820.nr_map; i++) {
959 if (e820_find_active_region(&e820.map[i],
960 start_pfn, last_pfn,
961 &ei_startpfn, &ei_endpfn))
962 ram += ei_endpfn - ei_startpfn;
963 }
964 return end - start - ((u64)ram << PAGE_SHIFT);
965}
966 828
967static void early_panic(char *msg) 829static void early_panic(char *msg)
968{ 830{
@@ -980,15 +842,21 @@ static int __init parse_memopt(char *p)
980 if (!p) 842 if (!p)
981 return -EINVAL; 843 return -EINVAL;
982 844
983#ifdef CONFIG_X86_32
984 if (!strcmp(p, "nopentium")) { 845 if (!strcmp(p, "nopentium")) {
846#ifdef CONFIG_X86_32
985 setup_clear_cpu_cap(X86_FEATURE_PSE); 847 setup_clear_cpu_cap(X86_FEATURE_PSE);
986 return 0; 848 return 0;
987 } 849#else
850 printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
851 return -EINVAL;
988#endif 852#endif
853 }
989 854
990 userdef = 1; 855 userdef = 1;
991 mem_size = memparse(p, &p); 856 mem_size = memparse(p, &p);
857 /* don't remove all of memory when handling "mem={invalid}" param */
858 if (mem_size == 0)
859 return -EINVAL;
992 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); 860 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
993 861
994 return 0; 862 return 0;
@@ -1210,3 +1078,48 @@ void __init setup_memory_map(void)
1210 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1078 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1211 e820_print_map(who); 1079 e820_print_map(who);
1212} 1080}
1081
1082void __init memblock_x86_fill(void)
1083{
1084 int i;
1085 u64 end;
1086
1087 /*
1088 * EFI may have more than 128 entries.
1089 * We are safe to enable resizing because memblock_x86_fill()
1090 * is called rather late on x86.
1091 */
1092 memblock_can_resize = 1;
1093
1094 for (i = 0; i < e820.nr_map; i++) {
1095 struct e820entry *ei = &e820.map[i];
1096
1097 end = ei->addr + ei->size;
1098 if (end != (resource_size_t)end)
1099 continue;
1100
1101 if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
1102 continue;
1103
1104 memblock_add(ei->addr, ei->size);
1105 }
1106
1107 memblock_analyze();
1108 memblock_dump_all();
1109}
1110
1111void __init memblock_find_dma_reserve(void)
1112{
1113#ifdef CONFIG_X86_64
1114 u64 free_size_pfn;
1115 u64 mem_size_pfn;
1116 /*
1117 * Need to find out the used area below MAX_DMA_PFN.
1118 * Use memblock to get the free size in [0, MAX_DMA_PFN]
1119 * first, and assume boot_mem will not take memory below MAX_DMA_PFN.
1120 */
1121 mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
1122 free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
1123 set_dma_reserve(mem_size_pfn - free_size_pfn);
1124#endif
1125}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf2686..3755ef494390 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,7 +97,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
97} 97}
98 98
99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) 99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
101static u32 __init ati_ixp4x0_rev(int num, int slot, int func) 100static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
102{ 101{
103 u32 d; 102 u32 d;
@@ -115,7 +114,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
115 d &= 0xff; 114 d &= 0xff;
116 return d; 115 return d;
117} 116}
118#endif
119 117
120static void __init ati_bugs(int num, int slot, int func) 118static void __init ati_bugs(int num, int slot, int func)
121{ 119{
@@ -145,15 +143,10 @@ static void __init ati_bugs(int num, int slot, int func)
145 143
146static u32 __init ati_sbx00_rev(int num, int slot, int func) 144static u32 __init ati_sbx00_rev(int num, int slot, int func)
147{ 145{
148 u32 old, d; 146 u32 d;
149 147
150 d = read_pci_config(num, slot, func, 0x70);
151 old = d;
152 d &= ~(1<<8);
153 write_pci_config(num, slot, func, 0x70, d);
154 d = read_pci_config(num, slot, func, 0x8); 148 d = read_pci_config(num, slot, func, 0x8);
155 d &= 0xff; 149 d &= 0xff;
156 write_pci_config(num, slot, func, 0x70, old);
157 150
158 return d; 151 return d;
159} 152}
@@ -162,11 +155,19 @@ static void __init ati_bugs_contd(int num, int slot, int func)
162{ 155{
163 u32 d, rev; 156 u32 d, rev;
164 157
165 if (acpi_use_timer_override) 158 rev = ati_sbx00_rev(num, slot, func);
159 if (rev >= 0x40)
160 acpi_fix_pin2_polarity = 1;
161
162 /*
163 * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
164 * SB700: revisions 0x39, 0x3a, ...
165 * SB800: revisions 0x40, 0x41, ...
166 */
167 if (rev >= 0x39)
166 return; 168 return;
167 169
168 rev = ati_sbx00_rev(num, slot, func); 170 if (acpi_use_timer_override)
169 if (rev > 0x13)
170 return; 171 return;
171 172
172 /* check for IRQ0 interrupt swap */ 173 /* check for IRQ0 interrupt swap */
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index fa99bae75ace..cd28a350f7f9 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,6 +14,7 @@
14#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h> 15#include <asm/pci-direct.h>
16#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/mrst.h>
17#include <asm/pgtable.h> 18#include <asm/pgtable.h>
18#include <linux/usb/ehci_def.h> 19#include <linux/usb/ehci_def.h>
19 20
@@ -239,6 +240,17 @@ static int __init setup_early_printk(char *buf)
239 if (!strncmp(buf, "xen", 3)) 240 if (!strncmp(buf, "xen", 3))
240 early_console_register(&xenboot_console, keep); 241 early_console_register(&xenboot_console, keep);
241#endif 242#endif
243#ifdef CONFIG_EARLY_PRINTK_MRST
244 if (!strncmp(buf, "mrst", 4)) {
245 mrst_early_console_init();
246 early_console_register(&early_mrst_console, keep);
247 }
248
249 if (!strncmp(buf, "hsu", 3)) {
250 hsu_early_console_init();
251 early_console_register(&early_hsu_console, keep);
252 }
253#endif
242 buf++; 254 buf++;
243 } 255 }
244 return 0; 256 return 0;
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
deleted file mode 100644
index c2fa9b8b497e..000000000000
--- a/arch/x86/kernel/efi.c
+++ /dev/null
@@ -1,612 +0,0 @@
1/*
2 * Common EFI (Extensible Firmware Interface) support functions
3 * Based on Extensible Firmware Interface Specification version 1.0
4 *
5 * Copyright (C) 1999 VA Linux Systems
6 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
7 * Copyright (C) 1999-2002 Hewlett-Packard Co.
8 * David Mosberger-Tang <davidm@hpl.hp.com>
9 * Stephane Eranian <eranian@hpl.hp.com>
10 * Copyright (C) 2005-2008 Intel Co.
11 * Fenghua Yu <fenghua.yu@intel.com>
12 * Bibo Mao <bibo.mao@intel.com>
13 * Chandramouli Narayanan <mouli@linux.intel.com>
14 * Huang Ying <ying.huang@intel.com>
15 *
16 * Copied from efi_32.c to eliminate the duplicated code between EFI
17 * 32/64 support code. --ying 2007-10-26
18 *
19 * All EFI Runtime Services are not implemented yet as EFI only
20 * supports physical mode addressing on SoftSDV. This is to be fixed
21 * in a future version. --drummond 1999-07-20
22 *
23 * Implemented EFI runtime services and virtual mode calls. --davidm
24 *
25 * Goutham Rao: <goutham.rao@intel.com>
26 * Skip non-WB memory and ignore empty memory ranges.
27 */
28
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/efi.h>
32#include <linux/bootmem.h>
33#include <linux/spinlock.h>
34#include <linux/uaccess.h>
35#include <linux/time.h>
36#include <linux/io.h>
37#include <linux/reboot.h>
38#include <linux/bcd.h>
39
40#include <asm/setup.h>
41#include <asm/efi.h>
42#include <asm/time.h>
43#include <asm/cacheflush.h>
44#include <asm/tlbflush.h>
45#include <asm/x86_init.h>
46
47#define EFI_DEBUG 1
48#define PFX "EFI: "
49
50int efi_enabled;
51EXPORT_SYMBOL(efi_enabled);
52
53struct efi efi;
54EXPORT_SYMBOL(efi);
55
56struct efi_memory_map memmap;
57
58static struct efi efi_phys __initdata;
59static efi_system_table_t efi_systab __initdata;
60
61static int __init setup_noefi(char *arg)
62{
63 efi_enabled = 0;
64 return 0;
65}
66early_param("noefi", setup_noefi);
67
68int add_efi_memmap;
69EXPORT_SYMBOL(add_efi_memmap);
70
71static int __init setup_add_efi_memmap(char *arg)
72{
73 add_efi_memmap = 1;
74 return 0;
75}
76early_param("add_efi_memmap", setup_add_efi_memmap);
77
78
79static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
80{
81 return efi_call_virt2(get_time, tm, tc);
82}
83
84static efi_status_t virt_efi_set_time(efi_time_t *tm)
85{
86 return efi_call_virt1(set_time, tm);
87}
88
89static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
90 efi_bool_t *pending,
91 efi_time_t *tm)
92{
93 return efi_call_virt3(get_wakeup_time,
94 enabled, pending, tm);
95}
96
97static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
98{
99 return efi_call_virt2(set_wakeup_time,
100 enabled, tm);
101}
102
103static efi_status_t virt_efi_get_variable(efi_char16_t *name,
104 efi_guid_t *vendor,
105 u32 *attr,
106 unsigned long *data_size,
107 void *data)
108{
109 return efi_call_virt5(get_variable,
110 name, vendor, attr,
111 data_size, data);
112}
113
114static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
115 efi_char16_t *name,
116 efi_guid_t *vendor)
117{
118 return efi_call_virt3(get_next_variable,
119 name_size, name, vendor);
120}
121
122static efi_status_t virt_efi_set_variable(efi_char16_t *name,
123 efi_guid_t *vendor,
124 unsigned long attr,
125 unsigned long data_size,
126 void *data)
127{
128 return efi_call_virt5(set_variable,
129 name, vendor, attr,
130 data_size, data);
131}
132
133static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
134{
135 return efi_call_virt1(get_next_high_mono_count, count);
136}
137
138static void virt_efi_reset_system(int reset_type,
139 efi_status_t status,
140 unsigned long data_size,
141 efi_char16_t *data)
142{
143 efi_call_virt4(reset_system, reset_type, status,
144 data_size, data);
145}
146
147static efi_status_t virt_efi_set_virtual_address_map(
148 unsigned long memory_map_size,
149 unsigned long descriptor_size,
150 u32 descriptor_version,
151 efi_memory_desc_t *virtual_map)
152{
153 return efi_call_virt4(set_virtual_address_map,
154 memory_map_size, descriptor_size,
155 descriptor_version, virtual_map);
156}
157
158static efi_status_t __init phys_efi_set_virtual_address_map(
159 unsigned long memory_map_size,
160 unsigned long descriptor_size,
161 u32 descriptor_version,
162 efi_memory_desc_t *virtual_map)
163{
164 efi_status_t status;
165
166 efi_call_phys_prelog();
167 status = efi_call_phys4(efi_phys.set_virtual_address_map,
168 memory_map_size, descriptor_size,
169 descriptor_version, virtual_map);
170 efi_call_phys_epilog();
171 return status;
172}
173
174static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
175 efi_time_cap_t *tc)
176{
177 efi_status_t status;
178
179 efi_call_phys_prelog();
180 status = efi_call_phys2(efi_phys.get_time, tm, tc);
181 efi_call_phys_epilog();
182 return status;
183}
184
185int efi_set_rtc_mmss(unsigned long nowtime)
186{
187 int real_seconds, real_minutes;
188 efi_status_t status;
189 efi_time_t eft;
190 efi_time_cap_t cap;
191
192 status = efi.get_time(&eft, &cap);
193 if (status != EFI_SUCCESS) {
194 printk(KERN_ERR "Oops: efitime: can't read time!\n");
195 return -1;
196 }
197
198 real_seconds = nowtime % 60;
199 real_minutes = nowtime / 60;
200 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
201 real_minutes += 30;
202 real_minutes %= 60;
203 eft.minute = real_minutes;
204 eft.second = real_seconds;
205
206 status = efi.set_time(&eft);
207 if (status != EFI_SUCCESS) {
208 printk(KERN_ERR "Oops: efitime: can't write time!\n");
209 return -1;
210 }
211 return 0;
212}
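
A stand-alone sketch of the half-hour rounding used above, with hypothetical times; only the minute arithmetic is reproduced, not the EFI calls:

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the rounding in efi_set_rtc_mmss() above: only minutes and
 * seconds are written, so if the RTC minute sits roughly half an hour
 * away from system time (e.g. a half-hour timezone offset), nudge the
 * new minute value by 30 before wrapping it into 0..59.
 */
static int rtc_minutes(unsigned long nowtime, int rtc_minute)
{
	int real_minutes = nowtime / 60;

	if (((abs(real_minutes - rtc_minute) + 15) / 30) & 1)
		real_minutes += 30;
	return real_minutes % 60;
}

int main(void)
{
	/* hypothetical: system time 12:05:30, RTC currently shows :35 */
	printf("%d\n", rtc_minutes(12 * 3600 + 5 * 60 + 30, 35));	/* -> 35 */
	/* RTC shows :06 -> keep :05 */
	printf("%d\n", rtc_minutes(12 * 3600 + 5 * 60 + 30, 6));	/* -> 5 */
	return 0;
}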
213
214unsigned long efi_get_time(void)
215{
216 efi_status_t status;
217 efi_time_t eft;
218 efi_time_cap_t cap;
219
220 status = efi.get_time(&eft, &cap);
221 if (status != EFI_SUCCESS)
222 printk(KERN_ERR "Oops: efitime: can't read time!\n");
223
224 return mktime(eft.year, eft.month, eft.day, eft.hour,
225 eft.minute, eft.second);
226}
227
228/*
229 * Tell the kernel about the EFI memory map. This might include
230 * more than the max 128 entries that can fit in the e820 legacy
231 * (zeropage) memory map.
232 */
233
234static void __init do_add_efi_memmap(void)
235{
236 void *p;
237
238 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
239 efi_memory_desc_t *md = p;
240 unsigned long long start = md->phys_addr;
241 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
242 int e820_type;
243
244 switch (md->type) {
245 case EFI_LOADER_CODE:
246 case EFI_LOADER_DATA:
247 case EFI_BOOT_SERVICES_CODE:
248 case EFI_BOOT_SERVICES_DATA:
249 case EFI_CONVENTIONAL_MEMORY:
250 if (md->attribute & EFI_MEMORY_WB)
251 e820_type = E820_RAM;
252 else
253 e820_type = E820_RESERVED;
254 break;
255 case EFI_ACPI_RECLAIM_MEMORY:
256 e820_type = E820_ACPI;
257 break;
258 case EFI_ACPI_MEMORY_NVS:
259 e820_type = E820_NVS;
260 break;
261 case EFI_UNUSABLE_MEMORY:
262 e820_type = E820_UNUSABLE;
263 break;
264 default:
265 /*
266 * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
267 * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO
268 * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE
269 */
270 e820_type = E820_RESERVED;
271 break;
272 }
273 e820_add_region(start, size, e820_type);
274 }
275 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
276}
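
Note that this loop, like every other walk over the map in this file, advances by memmap.desc_size rather than by sizeof(efi_memory_desc_t): firmware may hand out descriptors larger than the kernel's struct, which is exactly what the warning in efi_init() below checks for. A hedged sketch of the same idiom, with a made-up helper name:

	/*
	 * Sketch only: count write-back pages in the EFI map.  The stride must
	 * come from the firmware-reported desc_size, never from sizeof().
	 */
	static u64 __init count_efi_wb_pages(void)
	{
		efi_memory_desc_t *md;
		void *p;
		u64 pages = 0;

		for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
			md = p;
			if (md->attribute & EFI_MEMORY_WB)
				pages += md->num_pages;
		}
		return pages;
	}
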
277
278void __init efi_reserve_early(void)
279{
280 unsigned long pmap;
281
282#ifdef CONFIG_X86_32
283 pmap = boot_params.efi_info.efi_memmap;
284#else
285 pmap = (boot_params.efi_info.efi_memmap |
286 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
287#endif
288 memmap.phys_map = (void *)pmap;
289 memmap.nr_map = boot_params.efi_info.efi_memmap_size /
290 boot_params.efi_info.efi_memdesc_size;
291 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
292 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
293 reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
294 "EFI memmap");
295}
296
297#if EFI_DEBUG
298static void __init print_efi_memmap(void)
299{
300 efi_memory_desc_t *md;
301 void *p;
302 int i;
303
304 for (p = memmap.map, i = 0;
305 p < memmap.map_end;
306 p += memmap.desc_size, i++) {
307 md = p;
308 printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, "
309 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
310 i, md->type, md->attribute, md->phys_addr,
311 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
312 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
313 }
314}
315#endif /* EFI_DEBUG */
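
The size printed in the last column follows from num_pages counting fixed 4 KiB EFI pages: shifting right by (20 - EFI_PAGE_SHIFT) = 8 divides by 256, and 256 * 4 KiB = 1 MiB. For example, a descriptor with num_pages = 0x10000 prints 0x10000 >> 8 = 0x100 = 256 MB, matching 0x10000 * 4 KiB.
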
316
317void __init efi_init(void)
318{
319 efi_config_table_t *config_tables;
320 efi_runtime_services_t *runtime;
321 efi_char16_t *c16;
322 char vendor[100] = "unknown";
323 int i = 0;
324 void *tmp;
325
326#ifdef CONFIG_X86_32
327 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
328#else
329 efi_phys.systab = (efi_system_table_t *)
330 (boot_params.efi_info.efi_systab |
331 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
332#endif
333
334 efi.systab = early_ioremap((unsigned long)efi_phys.systab,
335 sizeof(efi_system_table_t));
336 if (efi.systab == NULL)
337 printk(KERN_ERR "Couldn't map the EFI system table!\n");
338 memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t));
339 early_iounmap(efi.systab, sizeof(efi_system_table_t));
340 efi.systab = &efi_systab;
341
342 /*
343 * Verify the EFI Table
344 */
345 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
346 printk(KERN_ERR "EFI system table signature incorrect!\n");
347 if ((efi.systab->hdr.revision >> 16) == 0)
348 printk(KERN_ERR "Warning: EFI system table version "
349 "%d.%02d, expected 1.00 or greater!\n",
350 efi.systab->hdr.revision >> 16,
351 efi.systab->hdr.revision & 0xffff);
352
353 /*
354 * Show what we know for posterity
355 */
356 c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
357 if (c16) {
358 for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
359 vendor[i] = *c16++;
360 vendor[i] = '\0';
361 } else
362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
363 early_iounmap(tmp, 2);
364
365 printk(KERN_INFO "EFI v%u.%.02u by %s\n",
366 efi.systab->hdr.revision >> 16,
367 efi.systab->hdr.revision & 0xffff, vendor);
368
369 /*
370 * Let's see what config tables the firmware passed to us.
371 */
372 config_tables = early_ioremap(
373 efi.systab->tables,
374 efi.systab->nr_tables * sizeof(efi_config_table_t));
375 if (config_tables == NULL)
376 printk(KERN_ERR "Could not map EFI Configuration Table!\n");
377
378 printk(KERN_INFO);
379 for (i = 0; i < efi.systab->nr_tables; i++) {
380 if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) {
381 efi.mps = config_tables[i].table;
382 printk(" MPS=0x%lx ", config_tables[i].table);
383 } else if (!efi_guidcmp(config_tables[i].guid,
384 ACPI_20_TABLE_GUID)) {
385 efi.acpi20 = config_tables[i].table;
386 printk(" ACPI 2.0=0x%lx ", config_tables[i].table);
387 } else if (!efi_guidcmp(config_tables[i].guid,
388 ACPI_TABLE_GUID)) {
389 efi.acpi = config_tables[i].table;
390 printk(" ACPI=0x%lx ", config_tables[i].table);
391 } else if (!efi_guidcmp(config_tables[i].guid,
392 SMBIOS_TABLE_GUID)) {
393 efi.smbios = config_tables[i].table;
394 printk(" SMBIOS=0x%lx ", config_tables[i].table);
395#ifdef CONFIG_X86_UV
396 } else if (!efi_guidcmp(config_tables[i].guid,
397 UV_SYSTEM_TABLE_GUID)) {
398 efi.uv_systab = config_tables[i].table;
399 printk(" UVsystab=0x%lx ", config_tables[i].table);
400#endif
401 } else if (!efi_guidcmp(config_tables[i].guid,
402 HCDP_TABLE_GUID)) {
403 efi.hcdp = config_tables[i].table;
404 printk(" HCDP=0x%lx ", config_tables[i].table);
405 } else if (!efi_guidcmp(config_tables[i].guid,
406 UGA_IO_PROTOCOL_GUID)) {
407 efi.uga = config_tables[i].table;
408 printk(" UGA=0x%lx ", config_tables[i].table);
409 }
410 }
411 printk("\n");
412 early_iounmap(config_tables,
413 efi.systab->nr_tables * sizeof(efi_config_table_t));
414
415 /*
416 * Check out the runtime services table. We need to map
417 * the runtime services table so that we can grab the physical
418 * address of several of the EFI runtime functions, needed to
419 * set the firmware into virtual mode.
420 */
421 runtime = early_ioremap((unsigned long)efi.systab->runtime,
422 sizeof(efi_runtime_services_t));
423 if (runtime != NULL) {
424 /*
425 * We will only need *early* access to the following
426 * two EFI runtime services before set_virtual_address_map
427 * is invoked.
428 */
429 efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
430 efi_phys.set_virtual_address_map =
431 (efi_set_virtual_address_map_t *)
432 runtime->set_virtual_address_map;
433 /*
434 * Allow efi_get_time() to be called before entering
435 * virtual mode.
436 */
437 efi.get_time = phys_efi_get_time;
438 } else
439 printk(KERN_ERR "Could not map the EFI runtime service "
440 "table!\n");
441 early_iounmap(runtime, sizeof(efi_runtime_services_t));
442
443 /* Map the EFI memory map */
444 memmap.map = early_ioremap((unsigned long)memmap.phys_map,
445 memmap.nr_map * memmap.desc_size);
446 if (memmap.map == NULL)
447 printk(KERN_ERR "Could not map the EFI memory map!\n");
448 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
449
450 if (memmap.desc_size != sizeof(efi_memory_desc_t))
451 printk(KERN_WARNING
452 "Kernel-defined memdesc doesn't match the one from EFI!\n");
453
454 if (add_efi_memmap)
455 do_add_efi_memmap();
456
457#ifdef CONFIG_X86_32
458 x86_platform.get_wallclock = efi_get_time;
459 x86_platform.set_wallclock = efi_set_rtc_mmss;
460#endif
461
462 /* Setup for EFI runtime service */
463 reboot_type = BOOT_EFI;
464
465#if EFI_DEBUG
466 print_efi_memmap();
467#endif
468}
469
470static void __init runtime_code_page_mkexec(void)
471{
472 efi_memory_desc_t *md;
473 void *p;
474 u64 addr, npages;
475
476 /* Make EFI runtime service code area executable */
477 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
478 md = p;
479
480 if (md->type != EFI_RUNTIME_SERVICES_CODE)
481 continue;
482
483 addr = md->virt_addr;
484 npages = md->num_pages;
485 memrange_efi_to_native(&addr, &npages);
486 set_memory_x(addr, npages);
487 }
488}
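
set_memory_x() above works on native PAGE_SIZE pages, while md->num_pages counts fixed 4 KiB EFI pages; reconciling the two is the job of memrange_efi_to_native(). A sketch of the conversion such a helper needs to perform (hypothetical name and body, the real one lives in asm/efi.h):

	/*
	 * Illustration only: widen a range given in 4 KiB EFI pages to whole
	 * native pages so it can be fed to set_memory_x()/set_memory_uc().
	 */
	static void efi_range_to_native_pages(u64 *addr, u64 *npages)
	{
		u64 end = *addr + (*npages << EFI_PAGE_SHIFT);

		*npages = PFN_UP(end) - PFN_DOWN(*addr);
		*addr = (u64)PFN_DOWN(*addr) << PAGE_SHIFT;
	}
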
489
490/*
491 * This function will switch the EFI runtime services to virtual mode.
492 * Essentially, look through the EFI memmap and map every region that
493 * has the runtime attribute bit set in its memory descriptor and update
494 * that memory descriptor with the virtual address obtained from ioremap().
495 * This enables the runtime services to be called without having to
496 * thunk back into physical mode for every invocation.
497 */
498void __init efi_enter_virtual_mode(void)
499{
500 efi_memory_desc_t *md;
501 efi_status_t status;
502 unsigned long size;
503 u64 end, systab, addr, npages, end_pfn;
504 void *p, *va;
505
506 efi.systab = NULL;
507 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
508 md = p;
509 if (!(md->attribute & EFI_MEMORY_RUNTIME))
510 continue;
511
512 size = md->num_pages << EFI_PAGE_SHIFT;
513 end = md->phys_addr + size;
514
515 end_pfn = PFN_UP(end);
516 if (end_pfn <= max_low_pfn_mapped
517 || (end_pfn > (1UL << (32 - PAGE_SHIFT))
518 && end_pfn <= max_pfn_mapped))
519 va = __va(md->phys_addr);
520 else
521 va = efi_ioremap(md->phys_addr, size, md->type);
522
523 md->virt_addr = (u64) (unsigned long) va;
524
525 if (!va) {
526 printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n",
527 (unsigned long long)md->phys_addr);
528 continue;
529 }
530
531 if (!(md->attribute & EFI_MEMORY_WB)) {
532 addr = md->virt_addr;
533 npages = md->num_pages;
534 memrange_efi_to_native(&addr, &npages);
535 set_memory_uc(addr, npages);
536 }
537
538 systab = (u64) (unsigned long) efi_phys.systab;
539 if (md->phys_addr <= systab && systab < end) {
540 systab += md->virt_addr - md->phys_addr;
541 efi.systab = (efi_system_table_t *) (unsigned long) systab;
542 }
543 }
544
545 BUG_ON(!efi.systab);
546
547 status = phys_efi_set_virtual_address_map(
548 memmap.desc_size * memmap.nr_map,
549 memmap.desc_size,
550 memmap.desc_version,
551 memmap.phys_map);
552
553 if (status != EFI_SUCCESS) {
554 printk(KERN_ALERT "Unable to switch EFI into virtual mode "
555 "(status=%lx)!\n", status);
556 panic("EFI call to SetVirtualAddressMap() failed!");
557 }
558
559 /*
560 * Now that EFI is in virtual mode, update the function
561 * pointers in the runtime service table to the new virtual addresses.
562 *
563 * Call EFI services through wrapper functions.
564 */
565 efi.get_time = virt_efi_get_time;
566 efi.set_time = virt_efi_set_time;
567 efi.get_wakeup_time = virt_efi_get_wakeup_time;
568 efi.set_wakeup_time = virt_efi_set_wakeup_time;
569 efi.get_variable = virt_efi_get_variable;
570 efi.get_next_variable = virt_efi_get_next_variable;
571 efi.set_variable = virt_efi_set_variable;
572 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
573 efi.reset_system = virt_efi_reset_system;
574 efi.set_virtual_address_map = virt_efi_set_virtual_address_map;
575 if (__supported_pte_mask & _PAGE_NX)
576 runtime_code_page_mkexec();
577 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
578 memmap.map = NULL;
579}
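
Once these wrappers are installed, the rest of the kernel calls runtime services through the efi table without knowing whether the firmware still runs in physical mode. A hedged example of such a consumer (the variable name, GUID spelling and buffer handling are illustrative only, not taken from this tree):

	/* Illustration: read the firmware's "BootOrder" variable, if present. */
	static efi_status_t example_read_boot_order(void *buf, unsigned long *size)
	{
		/* L"BootOrder" spelled out as UCS-2, as get_variable expects. */
		static efi_char16_t name[] = {
			'B', 'o', 'o', 't', 'O', 'r', 'd', 'e', 'r', 0
		};
		/* EFI global variable vendor GUID; double-check against the spec. */
		efi_guid_t guid = EFI_GUID(0x8be4df61, 0x93ca, 0x11d2, 0xaa, 0x0d,
					   0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c);
		u32 attr;

		return efi.get_variable(name, &guid, &attr, size, buf);
	}
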
580
581/*
582 * Convenience functions to obtain memory types and attributes
583 */
584u32 efi_mem_type(unsigned long phys_addr)
585{
586 efi_memory_desc_t *md;
587 void *p;
588
589 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
590 md = p;
591 if ((md->phys_addr <= phys_addr) &&
592 (phys_addr < (md->phys_addr +
593 (md->num_pages << EFI_PAGE_SHIFT))))
594 return md->type;
595 }
596 return 0;
597}
598
599u64 efi_mem_attributes(unsigned long phys_addr)
600{
601 efi_memory_desc_t *md;
602 void *p;
603
604 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
605 md = p;
606 if ((md->phys_addr <= phys_addr) &&
607 (phys_addr < (md->phys_addr +
608 (md->num_pages << EFI_PAGE_SHIFT))))
609 return md->attribute;
610 }
611 return 0;
612}
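
A typical use of these two helpers is deciding whether a physical address may be treated as ordinary cached RAM (sketch; the caller name is made up):

	/* Sketch: is this address plain write-back RAM according to the EFI map? */
	static int phys_addr_is_wb_ram(unsigned long phys_addr)
	{
		return efi_mem_type(phys_addr) == EFI_CONVENTIONAL_MEMORY &&
		       (efi_mem_attributes(phys_addr) & EFI_MEMORY_WB);
	}
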
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
deleted file mode 100644
index 5cab48ee61a4..000000000000
--- a/arch/x86/kernel/efi_32.c
+++ /dev/null
@@ -1,112 +0,0 @@
1/*
2 * Extensible Firmware Interface
3 *
4 * Based on Extensible Firmware Interface Specification version 1.0
5 *
6 * Copyright (C) 1999 VA Linux Systems
7 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
8 * Copyright (C) 1999-2002 Hewlett-Packard Co.
9 * David Mosberger-Tang <davidm@hpl.hp.com>
10 * Stephane Eranian <eranian@hpl.hp.com>
11 *
12 * All EFI Runtime Services are not implemented yet as EFI only
13 * supports physical mode addressing on SoftSDV. This is to be fixed
14 * in a future version. --drummond 1999-07-20
15 *
16 * Implemented EFI runtime services and virtual mode calls. --davidm
17 *
18 * Goutham Rao: <goutham.rao@intel.com>
19 * Skip non-WB memory and ignore empty memory ranges.
20 */
21
22#include <linux/kernel.h>
23#include <linux/types.h>
24#include <linux/ioport.h>
25#include <linux/efi.h>
26
27#include <asm/io.h>
28#include <asm/page.h>
29#include <asm/pgtable.h>
30#include <asm/tlbflush.h>
31#include <asm/efi.h>
32
33/*
34 * To call an EFI runtime service in physical addressing mode we need a
35 * prelog/epilog around the invocation: it disables interrupts, claims the
36 * EFI runtime service handler exclusively and duplicates the low memory
37 * mapping (0 - 3G).
38 */
39
40static unsigned long efi_rt_eflags;
41static pgd_t efi_bak_pg_dir_pointer[2];
42
43void efi_call_phys_prelog(void)
44{
45 unsigned long cr4;
46 unsigned long temp;
47 struct desc_ptr gdt_descr;
48
49 local_irq_save(efi_rt_eflags);
50
51 /*
52 * If I don't have PAE, I should just duplicate two entries in page
53 * directory. If I have PAE, I just need to duplicate one entry in
54 * page directory.
55 */
56 cr4 = read_cr4_safe();
57
58 if (cr4 & X86_CR4_PAE) {
59 efi_bak_pg_dir_pointer[0].pgd =
60 swapper_pg_dir[pgd_index(0)].pgd;
61 swapper_pg_dir[0].pgd =
62 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
63 } else {
64 efi_bak_pg_dir_pointer[0].pgd =
65 swapper_pg_dir[pgd_index(0)].pgd;
66 efi_bak_pg_dir_pointer[1].pgd =
67 swapper_pg_dir[pgd_index(0x400000)].pgd;
68 swapper_pg_dir[pgd_index(0)].pgd =
69 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
70 temp = PAGE_OFFSET + 0x400000;
71 swapper_pg_dir[pgd_index(0x400000)].pgd =
72 swapper_pg_dir[pgd_index(temp)].pgd;
73 }
74
75 /*
76 * Flush the TLB so the temporary low mapping set up above takes effect.
77 */
78 __flush_tlb_all();
79
80 gdt_descr.address = __pa(get_cpu_gdt_table(0));
81 gdt_descr.size = GDT_SIZE - 1;
82 load_gdt(&gdt_descr);
83}
84
85void efi_call_phys_epilog(void)
86{
87 unsigned long cr4;
88 struct desc_ptr gdt_descr;
89
90 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
91 gdt_descr.size = GDT_SIZE - 1;
92 load_gdt(&gdt_descr);
93
94 cr4 = read_cr4_safe();
95
96 if (cr4 & X86_CR4_PAE) {
97 swapper_pg_dir[pgd_index(0)].pgd =
98 efi_bak_pg_dir_pointer[0].pgd;
99 } else {
100 swapper_pg_dir[pgd_index(0)].pgd =
101 efi_bak_pg_dir_pointer[0].pgd;
102 swapper_pg_dir[pgd_index(0x400000)].pgd =
103 efi_bak_pg_dir_pointer[1].pgd;
104 }
105
106 /*
107 * After the lock is released, the original page table is restored.
108 */
109 __flush_tlb_all();
110
111 local_irq_restore(efi_rt_eflags);
112}
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
deleted file mode 100644
index ac0621a7ac3d..000000000000
--- a/arch/x86/kernel/efi_64.c
+++ /dev/null
@@ -1,114 +0,0 @@
1/*
2 * x86_64 specific EFI support functions
3 * Based on Extensible Firmware Interface Specification version 1.0
4 *
5 * Copyright (C) 2005-2008 Intel Co.
6 * Fenghua Yu <fenghua.yu@intel.com>
7 * Bibo Mao <bibo.mao@intel.com>
8 * Chandramouli Narayanan <mouli@linux.intel.com>
9 * Huang Ying <ying.huang@intel.com>
10 *
11 * Code to convert the EFI memory map to an E820 map has been implemented in
12 * the elilo bootloader, based on an EFI patch by Edgar Hucek. Based on the
13 * E820 map, the page table is set up appropriately for EFI runtime code.
14 * - mouli 06/14/2007.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/mm.h>
21#include <linux/types.h>
22#include <linux/spinlock.h>
23#include <linux/bootmem.h>
24#include <linux/ioport.h>
25#include <linux/module.h>
26#include <linux/efi.h>
27#include <linux/uaccess.h>
28#include <linux/io.h>
29#include <linux/reboot.h>
30
31#include <asm/setup.h>
32#include <asm/page.h>
33#include <asm/e820.h>
34#include <asm/pgtable.h>
35#include <asm/tlbflush.h>
36#include <asm/proto.h>
37#include <asm/efi.h>
38#include <asm/cacheflush.h>
39#include <asm/fixmap.h>
40
41static pgd_t save_pgd __initdata;
42static unsigned long efi_flags __initdata;
43
44static void __init early_mapping_set_exec(unsigned long start,
45 unsigned long end,
46 int executable)
47{
48 unsigned long num_pages;
49
50 start &= PMD_MASK;
51 end = (end + PMD_SIZE - 1) & PMD_MASK;
52 num_pages = (end - start) >> PAGE_SHIFT;
53 if (executable)
54 set_memory_x((unsigned long)__va(start), num_pages);
55 else
56 set_memory_nx((unsigned long)__va(start), num_pages);
57}
58
59static void __init early_runtime_code_mapping_set_exec(int executable)
60{
61 efi_memory_desc_t *md;
62 void *p;
63
64 if (!(__supported_pte_mask & _PAGE_NX))
65 return;
66
67 /* Make EFI runtime service code area executable */
68 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
69 md = p;
70 if (md->type == EFI_RUNTIME_SERVICES_CODE) {
71 unsigned long end;
72 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
73 early_mapping_set_exec(md->phys_addr, end, executable);
74 }
75 }
76}
77
78void __init efi_call_phys_prelog(void)
79{
80 unsigned long vaddress;
81
82 early_runtime_code_mapping_set_exec(1);
83 local_irq_save(efi_flags);
84 vaddress = (unsigned long)__va(0x0UL);
85 save_pgd = *pgd_offset_k(0x0UL);
86 set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress));
87 __flush_tlb_all();
88}
89
90void __init efi_call_phys_epilog(void)
91{
92 /*
93 * After the lock is released, the original page table is restored.
94 */
95 set_pgd(pgd_offset_k(0x0UL), save_pgd);
96 __flush_tlb_all();
97 local_irq_restore(efi_flags);
98 early_runtime_code_mapping_set_exec(0);
99}
100
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
102 u32 type)
103{
104 unsigned long last_map_pfn;
105
106 if (type == EFI_MEMORY_MAPPED_IO)
107 return ioremap(phys_addr, size);
108
109 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
110 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
111 return NULL;
112
113 return (void __iomem *)__va(phys_addr);
114}
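
A quick sanity check of the fallback above: init_memory_mapping() returns the last PFN it managed to map, so for, say, phys_addr = 0x7f000000 and size = 0x200000 the mapping must reach 0x7f200000, i.e. last_map_pfn must be at least 0x7f200; anything smaller makes (last_map_pfn << PAGE_SHIFT) < phys_addr + size true and NULL is returned, which the caller in efi_enter_virtual_mode() reports as an ioremap failure. (The numbers are made up for illustration.)
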
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
deleted file mode 100644
index fbe66e626c09..000000000000
--- a/arch/x86/kernel/efi_stub_32.S
+++ /dev/null
@@ -1,123 +0,0 @@
1/*
2 * EFI call stub for IA32.
3 *
4 * This stub allows us to make EFI calls in physical mode with interrupts
5 * turned off.
6 */
7
8#include <linux/linkage.h>
9#include <asm/page_types.h>
10
11/*
12 * efi_call_phys(void *, ...) is a function with variable parameters.
13 * All the callers of this function assure that all the parameters are 4-bytes.
14 */
15
16/*
17 * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
18 * So we'd better save all of them at the beginning of this function and restore
19 * them at the end no matter how many we use, because we cannot assume the EFI
20 * runtime service functions comply with the gcc calling convention.
21 */
22
23.text
24ENTRY(efi_call_phys)
25 /*
26 * 0. This function can only be called from the Linux kernel, so CS has
27 * been set to 0x0010 and DS and SS to 0x0018. In EFI the values of these
28 * registers are the same and the corresponding GDT entries are identical,
29 * so nothing needs to be done about the segment registers or the GDT
30 * itself; only the GDT base register is changed, in the prelog and epilog.
31 */
32
33 /*
34 * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET.
35 * To switch smoothly from virtual mode to flat mode, the mapping of
36 * lower virtual memory has already been set up by the prelog (and is
37 * torn down again by the epilog).
38 */
39 movl $1f, %edx
40 subl $__PAGE_OFFSET, %edx
41 jmp *%edx
421:
43
44 /*
45 * 2. Now on the top of stack is the return
46 * address in the caller of efi_call_phys(), then parameter 1,
47 * parameter 2, ..., param n. To make things easy, we save the return
48 * address of efi_call_phys in a global variable.
49 */
50 popl %edx
51 movl %edx, saved_return_addr
52 /* get the function pointer into ECX*/
53 popl %ecx
54 movl %ecx, efi_rt_function_ptr
55 movl $2f, %edx
56 subl $__PAGE_OFFSET, %edx
57 pushl %edx
58
59 /*
60 * 3. Clear PG bit in %CR0.
61 */
62 movl %cr0, %edx
63 andl $0x7fffffff, %edx
64 movl %edx, %cr0
65 jmp 1f
661:
67
68 /*
69 * 4. Adjust stack pointer.
70 */
71 subl $__PAGE_OFFSET, %esp
72
73 /*
74 * 5. Call the physical function.
75 */
76 jmp *%ecx
77
782:
79 /*
80 * 6. After the EFI runtime service returns, control comes back to the
81 * following instruction. We'd better readjust the stack pointer first.
82 */
83 addl $__PAGE_OFFSET, %esp
84
85 /*
86 * 7. Restore PG bit
87 */
88 movl %cr0, %edx
89 orl $0x80000000, %edx
90 movl %edx, %cr0
91 jmp 1f
921:
93 /*
94 * 8. Now return to virtual mode from flat mode by
95 * adding PAGE_OFFSET to EIP.
96 */
97 movl $1f, %edx
98 jmp *%edx
991:
100
101 /*
102 * 9. Balance the stack. And because EAX contains the return value,
103 * we'd better not clobber it.
104 */
105 leal efi_rt_function_ptr, %edx
106 movl (%edx), %ecx
107 pushl %ecx
108
109 /*
110 * 10. Push the saved return address onto the stack and return.
111 */
112 leal saved_return_addr, %edx
113 movl (%edx), %ecx
114 pushl %ecx
115 ret
116ENDPROC(efi_call_phys)
117.previous
118
119.data
120saved_return_addr:
121 .long 0
122efi_rt_function_ptr:
123 .long 0
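
From C this stub is just a cdecl variadic function whose first argument is the EFI function pointer; the efi_call_physN() macros used earlier (for instance in phys_efi_get_time()) forward straight to it. A hedged sketch of the caller side, assuming a declaration roughly like the one in asm/efi.h (check the real header, this is an approximation):

	/* Sketch of the 32-bit C-side glue; all arguments are 4 bytes wide. */
	extern unsigned long efi_call_phys(void *fp, ...);

	static unsigned long example_phys_get_time(efi_time_t *tm, efi_time_cap_t *tc)
	{
		/* what efi_call_phys2(efi_phys.get_time, tm, tc) boils down to */
		return efi_call_phys(efi_phys.get_time, tm, tc);
	}
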
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S
deleted file mode 100644
index 4c07ccab8146..000000000000
--- a/arch/x86/kernel/efi_stub_64.S
+++ /dev/null
@@ -1,116 +0,0 @@
1/*
2 * Function calling ABI conversion from Linux to EFI for x86_64
3 *
4 * Copyright (C) 2007 Intel Corp
5 * Bibo Mao <bibo.mao@intel.com>
6 * Huang Ying <ying.huang@intel.com>
7 */
8
9#include <linux/linkage.h>
10
11#define SAVE_XMM \
12 mov %rsp, %rax; \
13 subq $0x70, %rsp; \
14 and $~0xf, %rsp; \
15 mov %rax, (%rsp); \
16 mov %cr0, %rax; \
17 clts; \
18 mov %rax, 0x8(%rsp); \
19 movaps %xmm0, 0x60(%rsp); \
20 movaps %xmm1, 0x50(%rsp); \
21 movaps %xmm2, 0x40(%rsp); \
22 movaps %xmm3, 0x30(%rsp); \
23 movaps %xmm4, 0x20(%rsp); \
24 movaps %xmm5, 0x10(%rsp)
25
26#define RESTORE_XMM \
27 movaps 0x60(%rsp), %xmm0; \
28 movaps 0x50(%rsp), %xmm1; \
29 movaps 0x40(%rsp), %xmm2; \
30 movaps 0x30(%rsp), %xmm3; \
31 movaps 0x20(%rsp), %xmm4; \
32 movaps 0x10(%rsp), %xmm5; \
33 mov 0x8(%rsp), %rsi; \
34 mov %rsi, %cr0; \
35 mov (%rsp), %rsp
36
37ENTRY(efi_call0)
38 SAVE_XMM
39 subq $32, %rsp
40 call *%rdi
41 addq $32, %rsp
42 RESTORE_XMM
43 ret
44ENDPROC(efi_call0)
45
46ENTRY(efi_call1)
47 SAVE_XMM
48 subq $32, %rsp
49 mov %rsi, %rcx
50 call *%rdi
51 addq $32, %rsp
52 RESTORE_XMM
53 ret
54ENDPROC(efi_call1)
55
56ENTRY(efi_call2)
57 SAVE_XMM
58 subq $32, %rsp
59 mov %rsi, %rcx
60 call *%rdi
61 addq $32, %rsp
62 RESTORE_XMM
63 ret
64ENDPROC(efi_call2)
65
66ENTRY(efi_call3)
67 SAVE_XMM
68 subq $32, %rsp
69 mov %rcx, %r8
70 mov %rsi, %rcx
71 call *%rdi
72 addq $32, %rsp
73 RESTORE_XMM
74 ret
75ENDPROC(efi_call3)
76
77ENTRY(efi_call4)
78 SAVE_XMM
79 subq $32, %rsp
80 mov %r8, %r9
81 mov %rcx, %r8
82 mov %rsi, %rcx
83 call *%rdi
84 addq $32, %rsp
85 RESTORE_XMM
86 ret
87ENDPROC(efi_call4)
88
89ENTRY(efi_call5)
90 SAVE_XMM
91 subq $48, %rsp
92 mov %r9, 32(%rsp)
93 mov %r8, %r9
94 mov %rcx, %r8
95 mov %rsi, %rcx
96 call *%rdi
97 addq $48, %rsp
98 RESTORE_XMM
99 ret
100ENDPROC(efi_call5)
101
102ENTRY(efi_call6)
103 SAVE_XMM
104 mov (%rsp), %rax
105 mov 8(%rax), %rax
106 subq $48, %rsp
107 mov %r9, 32(%rsp)
108 mov %rax, 40(%rsp)
109 mov %r8, %r9
110 mov %rcx, %r8
111 mov %rsi, %rcx
112 call *%rdi
113 addq $48, %rsp
114 RESTORE_XMM
115 ret
116ENDPROC(efi_call6)
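
The register shuffling in efi_call1..6 above is the SysV-to-Microsoft-x64 translation: arguments move from rdi/rsi/rdx/rcx/r8/r9 into rcx/rdx/r8/r9 plus stack slots, with 32 bytes of shadow space reserved. That is the same transformation a compiler performs by itself for an ms_abi call; a hedged illustration of the equivalence using gcc's ms_abi attribute (not how this tree calls EFI, just the same ABI expressed in C):

	/*
	 * Illustration only: an ms_abi-qualified call makes gcc emit the same
	 * Microsoft x64 argument placement and 32-byte shadow space that
	 * efi_call2 sets up by hand (the CR0.TS/XMM handling aside).
	 */
	typedef unsigned long (__attribute__((ms_abi)) *ms_call2_t)(void *, void *);

	static unsigned long call_ms_abi2(void *fn, void *arg1, void *arg2)
	{
		ms_call2_t f = fn;

		return f(arg1, arg2);	/* compiler handles the ABI switch */
	}
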
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 227d00920d2f..5c1a91974918 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
65#define sysexit_audit syscall_exit_work 65#define sysexit_audit syscall_exit_work
66#endif 66#endif
67 67
68 .section .entry.text, "ax"
69
68/* 70/*
69 * We use macros for low-level operations which need to be overridden 71 * We use macros for low-level operations which need to be overridden
70 * for paravirtualization. The following will never clobber any registers: 72 * for paravirtualization. The following will never clobber any registers:
@@ -115,8 +117,7 @@
115 117
116 /* unfortunately push/pop can't be no-op */ 118 /* unfortunately push/pop can't be no-op */
117.macro PUSH_GS 119.macro PUSH_GS
118 pushl $0 120 pushl_cfi $0
119 CFI_ADJUST_CFA_OFFSET 4
120.endm 121.endm
121.macro POP_GS pop=0 122.macro POP_GS pop=0
122 addl $(4 + \pop), %esp 123 addl $(4 + \pop), %esp
@@ -140,14 +141,12 @@
140#else /* CONFIG_X86_32_LAZY_GS */ 141#else /* CONFIG_X86_32_LAZY_GS */
141 142
142.macro PUSH_GS 143.macro PUSH_GS
143 pushl %gs 144 pushl_cfi %gs
144 CFI_ADJUST_CFA_OFFSET 4
145 /*CFI_REL_OFFSET gs, 0*/ 145 /*CFI_REL_OFFSET gs, 0*/
146.endm 146.endm
147 147
148.macro POP_GS pop=0 148.macro POP_GS pop=0
14998: popl %gs 14998: popl_cfi %gs
150 CFI_ADJUST_CFA_OFFSET -4
151 /*CFI_RESTORE gs*/ 150 /*CFI_RESTORE gs*/
152 .if \pop <> 0 151 .if \pop <> 0
153 add $\pop, %esp 152 add $\pop, %esp
@@ -195,35 +194,25 @@
195.macro SAVE_ALL 194.macro SAVE_ALL
196 cld 195 cld
197 PUSH_GS 196 PUSH_GS
198 pushl %fs 197 pushl_cfi %fs
199 CFI_ADJUST_CFA_OFFSET 4
200 /*CFI_REL_OFFSET fs, 0;*/ 198 /*CFI_REL_OFFSET fs, 0;*/
201 pushl %es 199 pushl_cfi %es
202 CFI_ADJUST_CFA_OFFSET 4
203 /*CFI_REL_OFFSET es, 0;*/ 200 /*CFI_REL_OFFSET es, 0;*/
204 pushl %ds 201 pushl_cfi %ds
205 CFI_ADJUST_CFA_OFFSET 4
206 /*CFI_REL_OFFSET ds, 0;*/ 202 /*CFI_REL_OFFSET ds, 0;*/
207 pushl %eax 203 pushl_cfi %eax
208 CFI_ADJUST_CFA_OFFSET 4
209 CFI_REL_OFFSET eax, 0 204 CFI_REL_OFFSET eax, 0
210 pushl %ebp 205 pushl_cfi %ebp
211 CFI_ADJUST_CFA_OFFSET 4
212 CFI_REL_OFFSET ebp, 0 206 CFI_REL_OFFSET ebp, 0
213 pushl %edi 207 pushl_cfi %edi
214 CFI_ADJUST_CFA_OFFSET 4
215 CFI_REL_OFFSET edi, 0 208 CFI_REL_OFFSET edi, 0
216 pushl %esi 209 pushl_cfi %esi
217 CFI_ADJUST_CFA_OFFSET 4
218 CFI_REL_OFFSET esi, 0 210 CFI_REL_OFFSET esi, 0
219 pushl %edx 211 pushl_cfi %edx
220 CFI_ADJUST_CFA_OFFSET 4
221 CFI_REL_OFFSET edx, 0 212 CFI_REL_OFFSET edx, 0
222 pushl %ecx 213 pushl_cfi %ecx
223 CFI_ADJUST_CFA_OFFSET 4
224 CFI_REL_OFFSET ecx, 0 214 CFI_REL_OFFSET ecx, 0
225 pushl %ebx 215 pushl_cfi %ebx
226 CFI_ADJUST_CFA_OFFSET 4
227 CFI_REL_OFFSET ebx, 0 216 CFI_REL_OFFSET ebx, 0
228 movl $(__USER_DS), %edx 217 movl $(__USER_DS), %edx
229 movl %edx, %ds 218 movl %edx, %ds
@@ -234,39 +223,29 @@
234.endm 223.endm
235 224
236.macro RESTORE_INT_REGS 225.macro RESTORE_INT_REGS
237 popl %ebx 226 popl_cfi %ebx
238 CFI_ADJUST_CFA_OFFSET -4
239 CFI_RESTORE ebx 227 CFI_RESTORE ebx
240 popl %ecx 228 popl_cfi %ecx
241 CFI_ADJUST_CFA_OFFSET -4
242 CFI_RESTORE ecx 229 CFI_RESTORE ecx
243 popl %edx 230 popl_cfi %edx
244 CFI_ADJUST_CFA_OFFSET -4
245 CFI_RESTORE edx 231 CFI_RESTORE edx
246 popl %esi 232 popl_cfi %esi
247 CFI_ADJUST_CFA_OFFSET -4
248 CFI_RESTORE esi 233 CFI_RESTORE esi
249 popl %edi 234 popl_cfi %edi
250 CFI_ADJUST_CFA_OFFSET -4
251 CFI_RESTORE edi 235 CFI_RESTORE edi
252 popl %ebp 236 popl_cfi %ebp
253 CFI_ADJUST_CFA_OFFSET -4
254 CFI_RESTORE ebp 237 CFI_RESTORE ebp
255 popl %eax 238 popl_cfi %eax
256 CFI_ADJUST_CFA_OFFSET -4
257 CFI_RESTORE eax 239 CFI_RESTORE eax
258.endm 240.endm
259 241
260.macro RESTORE_REGS pop=0 242.macro RESTORE_REGS pop=0
261 RESTORE_INT_REGS 243 RESTORE_INT_REGS
2621: popl %ds 2441: popl_cfi %ds
263 CFI_ADJUST_CFA_OFFSET -4
264 /*CFI_RESTORE ds;*/ 245 /*CFI_RESTORE ds;*/
2652: popl %es 2462: popl_cfi %es
266 CFI_ADJUST_CFA_OFFSET -4
267 /*CFI_RESTORE es;*/ 247 /*CFI_RESTORE es;*/
2683: popl %fs 2483: popl_cfi %fs
269 CFI_ADJUST_CFA_OFFSET -4
270 /*CFI_RESTORE fs;*/ 249 /*CFI_RESTORE fs;*/
271 POP_GS \pop 250 POP_GS \pop
272.pushsection .fixup, "ax" 251.pushsection .fixup, "ax"
@@ -320,16 +299,12 @@
320 299
321ENTRY(ret_from_fork) 300ENTRY(ret_from_fork)
322 CFI_STARTPROC 301 CFI_STARTPROC
323 pushl %eax 302 pushl_cfi %eax
324 CFI_ADJUST_CFA_OFFSET 4
325 call schedule_tail 303 call schedule_tail
326 GET_THREAD_INFO(%ebp) 304 GET_THREAD_INFO(%ebp)
327 popl %eax 305 popl_cfi %eax
328 CFI_ADJUST_CFA_OFFSET -4 306 pushl_cfi $0x0202 # Reset kernel eflags
329 pushl $0x0202 # Reset kernel eflags 307 popfl_cfi
330 CFI_ADJUST_CFA_OFFSET 4
331 popfl
332 CFI_ADJUST_CFA_OFFSET -4
333 jmp syscall_exit 308 jmp syscall_exit
334 CFI_ENDPROC 309 CFI_ENDPROC
335END(ret_from_fork) 310END(ret_from_fork)
@@ -409,29 +384,23 @@ sysenter_past_esp:
409 * enough kernel state to call TRACE_IRQS_OFF can be called - but 384 * enough kernel state to call TRACE_IRQS_OFF can be called - but
410 * we immediately enable interrupts at that point anyway. 385 * we immediately enable interrupts at that point anyway.
411 */ 386 */
412 pushl $(__USER_DS) 387 pushl_cfi $__USER_DS
413 CFI_ADJUST_CFA_OFFSET 4
414 /*CFI_REL_OFFSET ss, 0*/ 388 /*CFI_REL_OFFSET ss, 0*/
415 pushl %ebp 389 pushl_cfi %ebp
416 CFI_ADJUST_CFA_OFFSET 4
417 CFI_REL_OFFSET esp, 0 390 CFI_REL_OFFSET esp, 0
418 pushfl 391 pushfl_cfi
419 orl $X86_EFLAGS_IF, (%esp) 392 orl $X86_EFLAGS_IF, (%esp)
420 CFI_ADJUST_CFA_OFFSET 4 393 pushl_cfi $__USER_CS
421 pushl $(__USER_CS)
422 CFI_ADJUST_CFA_OFFSET 4
423 /*CFI_REL_OFFSET cs, 0*/ 394 /*CFI_REL_OFFSET cs, 0*/
424 /* 395 /*
425 * Push current_thread_info()->sysenter_return to the stack. 396 * Push current_thread_info()->sysenter_return to the stack.
426 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 397 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
427 * pushed above; +8 corresponds to copy_thread's esp0 setting. 398 * pushed above; +8 corresponds to copy_thread's esp0 setting.
428 */ 399 */
429 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) 400 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
430 CFI_ADJUST_CFA_OFFSET 4
431 CFI_REL_OFFSET eip, 0 401 CFI_REL_OFFSET eip, 0
432 402
433 pushl %eax 403 pushl_cfi %eax
434 CFI_ADJUST_CFA_OFFSET 4
435 SAVE_ALL 404 SAVE_ALL
436 ENABLE_INTERRUPTS(CLBR_NONE) 405 ENABLE_INTERRUPTS(CLBR_NONE)
437 406
@@ -486,8 +455,7 @@ sysenter_audit:
486 movl %eax,%edx /* 2nd arg: syscall number */ 455 movl %eax,%edx /* 2nd arg: syscall number */
487 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ 456 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
488 call audit_syscall_entry 457 call audit_syscall_entry
489 pushl %ebx 458 pushl_cfi %ebx
490 CFI_ADJUST_CFA_OFFSET 4
491 movl PT_EAX(%esp),%eax /* reload syscall number */ 459 movl PT_EAX(%esp),%eax /* reload syscall number */
492 jmp sysenter_do_call 460 jmp sysenter_do_call
493 461
@@ -529,8 +497,7 @@ ENDPROC(ia32_sysenter_target)
529 # system call handler stub 497 # system call handler stub
530ENTRY(system_call) 498ENTRY(system_call)
531 RING0_INT_FRAME # can't unwind into user space anyway 499 RING0_INT_FRAME # can't unwind into user space anyway
532 pushl %eax # save orig_eax 500 pushl_cfi %eax # save orig_eax
533 CFI_ADJUST_CFA_OFFSET 4
534 SAVE_ALL 501 SAVE_ALL
535 GET_THREAD_INFO(%ebp) 502 GET_THREAD_INFO(%ebp)
536 # system call tracing in operation / emulation 503 # system call tracing in operation / emulation
@@ -566,7 +533,6 @@ restore_all_notrace:
566 je ldt_ss # returning to user-space with LDT SS 533 je ldt_ss # returning to user-space with LDT SS
567restore_nocheck: 534restore_nocheck:
568 RESTORE_REGS 4 # skip orig_eax/error_code 535 RESTORE_REGS 4 # skip orig_eax/error_code
569 CFI_ADJUST_CFA_OFFSET -4
570irq_return: 536irq_return:
571 INTERRUPT_RETURN 537 INTERRUPT_RETURN
572.section .fixup,"ax" 538.section .fixup,"ax"
@@ -619,10 +585,8 @@ ldt_ss:
619 shr $16, %edx 585 shr $16, %edx
620 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ 586 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
621 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ 587 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
622 pushl $__ESPFIX_SS 588 pushl_cfi $__ESPFIX_SS
623 CFI_ADJUST_CFA_OFFSET 4 589 pushl_cfi %eax /* new kernel esp */
624 push %eax /* new kernel esp */
625 CFI_ADJUST_CFA_OFFSET 4
626 /* Disable interrupts, but do not irqtrace this section: we 590 /* Disable interrupts, but do not irqtrace this section: we
627 * will soon execute iret and the tracer was already set to 591 * will soon execute iret and the tracer was already set to
628 * the irqstate after the iret */ 592 * the irqstate after the iret */
@@ -666,11 +630,9 @@ work_notifysig: # deal with pending signals and
666 630
667 ALIGN 631 ALIGN
668work_notifysig_v86: 632work_notifysig_v86:
669 pushl %ecx # save ti_flags for do_notify_resume 633 pushl_cfi %ecx # save ti_flags for do_notify_resume
670 CFI_ADJUST_CFA_OFFSET 4
671 call save_v86_state # %eax contains pt_regs pointer 634 call save_v86_state # %eax contains pt_regs pointer
672 popl %ecx 635 popl_cfi %ecx
673 CFI_ADJUST_CFA_OFFSET -4
674 movl %eax, %esp 636 movl %eax, %esp
675#else 637#else
676 movl %esp, %eax 638 movl %esp, %eax
@@ -750,14 +712,18 @@ ptregs_##name: \
750#define PTREGSCALL3(name) \ 712#define PTREGSCALL3(name) \
751 ALIGN; \ 713 ALIGN; \
752ptregs_##name: \ 714ptregs_##name: \
715 CFI_STARTPROC; \
753 leal 4(%esp),%eax; \ 716 leal 4(%esp),%eax; \
754 pushl %eax; \ 717 pushl_cfi %eax; \
755 movl PT_EDX(%eax),%ecx; \ 718 movl PT_EDX(%eax),%ecx; \
756 movl PT_ECX(%eax),%edx; \ 719 movl PT_ECX(%eax),%edx; \
757 movl PT_EBX(%eax),%eax; \ 720 movl PT_EBX(%eax),%eax; \
758 call sys_##name; \ 721 call sys_##name; \
759 addl $4,%esp; \ 722 addl $4,%esp; \
760 ret 723 CFI_ADJUST_CFA_OFFSET -4; \
724 ret; \
725 CFI_ENDPROC; \
726ENDPROC(ptregs_##name)
761 727
762PTREGSCALL1(iopl) 728PTREGSCALL1(iopl)
763PTREGSCALL0(fork) 729PTREGSCALL0(fork)
@@ -772,15 +738,19 @@ PTREGSCALL1(vm86old)
772/* Clone is an oddball. The 4th arg is in %edi */ 738/* Clone is an oddball. The 4th arg is in %edi */
773 ALIGN; 739 ALIGN;
774ptregs_clone: 740ptregs_clone:
741 CFI_STARTPROC
775 leal 4(%esp),%eax 742 leal 4(%esp),%eax
776 pushl %eax 743 pushl_cfi %eax
777 pushl PT_EDI(%eax) 744 pushl_cfi PT_EDI(%eax)
778 movl PT_EDX(%eax),%ecx 745 movl PT_EDX(%eax),%ecx
779 movl PT_ECX(%eax),%edx 746 movl PT_ECX(%eax),%edx
780 movl PT_EBX(%eax),%eax 747 movl PT_EBX(%eax),%eax
781 call sys_clone 748 call sys_clone
782 addl $8,%esp 749 addl $8,%esp
750 CFI_ADJUST_CFA_OFFSET -8
783 ret 751 ret
752 CFI_ENDPROC
753ENDPROC(ptregs_clone)
784 754
785.macro FIXUP_ESPFIX_STACK 755.macro FIXUP_ESPFIX_STACK
786/* 756/*
@@ -795,10 +765,8 @@ ptregs_clone:
795 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ 765 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
796 shl $16, %eax 766 shl $16, %eax
797 addl %esp, %eax /* the adjusted stack pointer */ 767 addl %esp, %eax /* the adjusted stack pointer */
798 pushl $__KERNEL_DS 768 pushl_cfi $__KERNEL_DS
799 CFI_ADJUST_CFA_OFFSET 4 769 pushl_cfi %eax
800 pushl %eax
801 CFI_ADJUST_CFA_OFFSET 4
802 lss (%esp), %esp /* switch to the normal stack segment */ 770 lss (%esp), %esp /* switch to the normal stack segment */
803 CFI_ADJUST_CFA_OFFSET -8 771 CFI_ADJUST_CFA_OFFSET -8
804.endm 772.endm
@@ -822,7 +790,7 @@ ptregs_clone:
822 */ 790 */
823.section .init.rodata,"a" 791.section .init.rodata,"a"
824ENTRY(interrupt) 792ENTRY(interrupt)
825.text 793.section .entry.text, "ax"
826 .p2align 5 794 .p2align 5
827 .p2align CONFIG_X86_L1_CACHE_SHIFT 795 .p2align CONFIG_X86_L1_CACHE_SHIFT
828ENTRY(irq_entries_start) 796ENTRY(irq_entries_start)
@@ -835,14 +803,13 @@ vector=FIRST_EXTERNAL_VECTOR
835 .if vector <> FIRST_EXTERNAL_VECTOR 803 .if vector <> FIRST_EXTERNAL_VECTOR
836 CFI_ADJUST_CFA_OFFSET -4 804 CFI_ADJUST_CFA_OFFSET -4
837 .endif 805 .endif
8381: pushl $(~vector+0x80) /* Note: always in signed byte range */ 8061: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
839 CFI_ADJUST_CFA_OFFSET 4
840 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 807 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
841 jmp 2f 808 jmp 2f
842 .endif 809 .endif
843 .previous 810 .previous
844 .long 1b 811 .long 1b
845 .text 812 .section .entry.text, "ax"
846vector=vector+1 813vector=vector+1
847 .endif 814 .endif
848 .endr 815 .endr
@@ -876,8 +843,7 @@ ENDPROC(common_interrupt)
876#define BUILD_INTERRUPT3(name, nr, fn) \ 843#define BUILD_INTERRUPT3(name, nr, fn) \
877ENTRY(name) \ 844ENTRY(name) \
878 RING0_INT_FRAME; \ 845 RING0_INT_FRAME; \
879 pushl $~(nr); \ 846 pushl_cfi $~(nr); \
880 CFI_ADJUST_CFA_OFFSET 4; \
881 SAVE_ALL; \ 847 SAVE_ALL; \
882 TRACE_IRQS_OFF \ 848 TRACE_IRQS_OFF \
883 movl %esp,%eax; \ 849 movl %esp,%eax; \
@@ -893,21 +859,18 @@ ENDPROC(name)
893 859
894ENTRY(coprocessor_error) 860ENTRY(coprocessor_error)
895 RING0_INT_FRAME 861 RING0_INT_FRAME
896 pushl $0 862 pushl_cfi $0
897 CFI_ADJUST_CFA_OFFSET 4 863 pushl_cfi $do_coprocessor_error
898 pushl $do_coprocessor_error
899 CFI_ADJUST_CFA_OFFSET 4
900 jmp error_code 864 jmp error_code
901 CFI_ENDPROC 865 CFI_ENDPROC
902END(coprocessor_error) 866END(coprocessor_error)
903 867
904ENTRY(simd_coprocessor_error) 868ENTRY(simd_coprocessor_error)
905 RING0_INT_FRAME 869 RING0_INT_FRAME
906 pushl $0 870 pushl_cfi $0
907 CFI_ADJUST_CFA_OFFSET 4
908#ifdef CONFIG_X86_INVD_BUG 871#ifdef CONFIG_X86_INVD_BUG
909 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 872 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
910661: pushl $do_general_protection 873661: pushl_cfi $do_general_protection
911662: 874662:
912.section .altinstructions,"a" 875.section .altinstructions,"a"
913 .balign 4 876 .balign 4
@@ -922,19 +885,16 @@ ENTRY(simd_coprocessor_error)
922664: 885664:
923.previous 886.previous
924#else 887#else
925 pushl $do_simd_coprocessor_error 888 pushl_cfi $do_simd_coprocessor_error
926#endif 889#endif
927 CFI_ADJUST_CFA_OFFSET 4
928 jmp error_code 890 jmp error_code
929 CFI_ENDPROC 891 CFI_ENDPROC
930END(simd_coprocessor_error) 892END(simd_coprocessor_error)
931 893
932ENTRY(device_not_available) 894ENTRY(device_not_available)
933 RING0_INT_FRAME 895 RING0_INT_FRAME
934 pushl $-1 # mark this as an int 896 pushl_cfi $-1 # mark this as an int
935 CFI_ADJUST_CFA_OFFSET 4 897 pushl_cfi $do_device_not_available
936 pushl $do_device_not_available
937 CFI_ADJUST_CFA_OFFSET 4
938 jmp error_code 898 jmp error_code
939 CFI_ENDPROC 899 CFI_ENDPROC
940END(device_not_available) 900END(device_not_available)
@@ -956,82 +916,68 @@ END(native_irq_enable_sysexit)
956 916
957ENTRY(overflow) 917ENTRY(overflow)
958 RING0_INT_FRAME 918 RING0_INT_FRAME
959 pushl $0 919 pushl_cfi $0
960 CFI_ADJUST_CFA_OFFSET 4 920 pushl_cfi $do_overflow
961 pushl $do_overflow
962 CFI_ADJUST_CFA_OFFSET 4
963 jmp error_code 921 jmp error_code
964 CFI_ENDPROC 922 CFI_ENDPROC
965END(overflow) 923END(overflow)
966 924
967ENTRY(bounds) 925ENTRY(bounds)
968 RING0_INT_FRAME 926 RING0_INT_FRAME
969 pushl $0 927 pushl_cfi $0
970 CFI_ADJUST_CFA_OFFSET 4 928 pushl_cfi $do_bounds
971 pushl $do_bounds
972 CFI_ADJUST_CFA_OFFSET 4
973 jmp error_code 929 jmp error_code
974 CFI_ENDPROC 930 CFI_ENDPROC
975END(bounds) 931END(bounds)
976 932
977ENTRY(invalid_op) 933ENTRY(invalid_op)
978 RING0_INT_FRAME 934 RING0_INT_FRAME
979 pushl $0 935 pushl_cfi $0
980 CFI_ADJUST_CFA_OFFSET 4 936 pushl_cfi $do_invalid_op
981 pushl $do_invalid_op
982 CFI_ADJUST_CFA_OFFSET 4
983 jmp error_code 937 jmp error_code
984 CFI_ENDPROC 938 CFI_ENDPROC
985END(invalid_op) 939END(invalid_op)
986 940
987ENTRY(coprocessor_segment_overrun) 941ENTRY(coprocessor_segment_overrun)
988 RING0_INT_FRAME 942 RING0_INT_FRAME
989 pushl $0 943 pushl_cfi $0
990 CFI_ADJUST_CFA_OFFSET 4 944 pushl_cfi $do_coprocessor_segment_overrun
991 pushl $do_coprocessor_segment_overrun
992 CFI_ADJUST_CFA_OFFSET 4
993 jmp error_code 945 jmp error_code
994 CFI_ENDPROC 946 CFI_ENDPROC
995END(coprocessor_segment_overrun) 947END(coprocessor_segment_overrun)
996 948
997ENTRY(invalid_TSS) 949ENTRY(invalid_TSS)
998 RING0_EC_FRAME 950 RING0_EC_FRAME
999 pushl $do_invalid_TSS 951 pushl_cfi $do_invalid_TSS
1000 CFI_ADJUST_CFA_OFFSET 4
1001 jmp error_code 952 jmp error_code
1002 CFI_ENDPROC 953 CFI_ENDPROC
1003END(invalid_TSS) 954END(invalid_TSS)
1004 955
1005ENTRY(segment_not_present) 956ENTRY(segment_not_present)
1006 RING0_EC_FRAME 957 RING0_EC_FRAME
1007 pushl $do_segment_not_present 958 pushl_cfi $do_segment_not_present
1008 CFI_ADJUST_CFA_OFFSET 4
1009 jmp error_code 959 jmp error_code
1010 CFI_ENDPROC 960 CFI_ENDPROC
1011END(segment_not_present) 961END(segment_not_present)
1012 962
1013ENTRY(stack_segment) 963ENTRY(stack_segment)
1014 RING0_EC_FRAME 964 RING0_EC_FRAME
1015 pushl $do_stack_segment 965 pushl_cfi $do_stack_segment
1016 CFI_ADJUST_CFA_OFFSET 4
1017 jmp error_code 966 jmp error_code
1018 CFI_ENDPROC 967 CFI_ENDPROC
1019END(stack_segment) 968END(stack_segment)
1020 969
1021ENTRY(alignment_check) 970ENTRY(alignment_check)
1022 RING0_EC_FRAME 971 RING0_EC_FRAME
1023 pushl $do_alignment_check 972 pushl_cfi $do_alignment_check
1024 CFI_ADJUST_CFA_OFFSET 4
1025 jmp error_code 973 jmp error_code
1026 CFI_ENDPROC 974 CFI_ENDPROC
1027END(alignment_check) 975END(alignment_check)
1028 976
1029ENTRY(divide_error) 977ENTRY(divide_error)
1030 RING0_INT_FRAME 978 RING0_INT_FRAME
1031 pushl $0 # no error code 979 pushl_cfi $0 # no error code
1032 CFI_ADJUST_CFA_OFFSET 4 980 pushl_cfi $do_divide_error
1033 pushl $do_divide_error
1034 CFI_ADJUST_CFA_OFFSET 4
1035 jmp error_code 981 jmp error_code
1036 CFI_ENDPROC 982 CFI_ENDPROC
1037END(divide_error) 983END(divide_error)
@@ -1039,10 +985,8 @@ END(divide_error)
1039#ifdef CONFIG_X86_MCE 985#ifdef CONFIG_X86_MCE
1040ENTRY(machine_check) 986ENTRY(machine_check)
1041 RING0_INT_FRAME 987 RING0_INT_FRAME
1042 pushl $0 988 pushl_cfi $0
1043 CFI_ADJUST_CFA_OFFSET 4 989 pushl_cfi machine_check_vector
1044 pushl machine_check_vector
1045 CFI_ADJUST_CFA_OFFSET 4
1046 jmp error_code 990 jmp error_code
1047 CFI_ENDPROC 991 CFI_ENDPROC
1048END(machine_check) 992END(machine_check)
@@ -1050,10 +994,8 @@ END(machine_check)
1050 994
1051ENTRY(spurious_interrupt_bug) 995ENTRY(spurious_interrupt_bug)
1052 RING0_INT_FRAME 996 RING0_INT_FRAME
1053 pushl $0 997 pushl_cfi $0
1054 CFI_ADJUST_CFA_OFFSET 4 998 pushl_cfi $do_spurious_interrupt_bug
1055 pushl $do_spurious_interrupt_bug
1056 CFI_ADJUST_CFA_OFFSET 4
1057 jmp error_code 999 jmp error_code
1058 CFI_ENDPROC 1000 CFI_ENDPROC
1059END(spurious_interrupt_bug) 1001END(spurious_interrupt_bug)
@@ -1084,8 +1026,7 @@ ENTRY(xen_sysenter_target)
1084 1026
1085ENTRY(xen_hypervisor_callback) 1027ENTRY(xen_hypervisor_callback)
1086 CFI_STARTPROC 1028 CFI_STARTPROC
1087 pushl $0 1029 pushl_cfi $0
1088 CFI_ADJUST_CFA_OFFSET 4
1089 SAVE_ALL 1030 SAVE_ALL
1090 TRACE_IRQS_OFF 1031 TRACE_IRQS_OFF
1091 1032
@@ -1121,23 +1062,20 @@ ENDPROC(xen_hypervisor_callback)
1121# We distinguish between categories by maintaining a status value in EAX. 1062# We distinguish between categories by maintaining a status value in EAX.
1122ENTRY(xen_failsafe_callback) 1063ENTRY(xen_failsafe_callback)
1123 CFI_STARTPROC 1064 CFI_STARTPROC
1124 pushl %eax 1065 pushl_cfi %eax
1125 CFI_ADJUST_CFA_OFFSET 4
1126 movl $1,%eax 1066 movl $1,%eax
11271: mov 4(%esp),%ds 10671: mov 4(%esp),%ds
11282: mov 8(%esp),%es 10682: mov 8(%esp),%es
11293: mov 12(%esp),%fs 10693: mov 12(%esp),%fs
11304: mov 16(%esp),%gs 10704: mov 16(%esp),%gs
1131 testl %eax,%eax 1071 testl %eax,%eax
1132 popl %eax 1072 popl_cfi %eax
1133 CFI_ADJUST_CFA_OFFSET -4
1134 lea 16(%esp),%esp 1073 lea 16(%esp),%esp
1135 CFI_ADJUST_CFA_OFFSET -16 1074 CFI_ADJUST_CFA_OFFSET -16
1136 jz 5f 1075 jz 5f
1137 addl $16,%esp 1076 addl $16,%esp
1138 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) 1077 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
11395: pushl $0 # EAX == 0 => Category 1 (Bad segment) 10785: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
1140 CFI_ADJUST_CFA_OFFSET 4
1141 SAVE_ALL 1079 SAVE_ALL
1142 jmp ret_from_exception 1080 jmp ret_from_exception
1143 CFI_ENDPROC 1081 CFI_ENDPROC
@@ -1287,40 +1225,29 @@ syscall_table_size=(.-sys_call_table)
1287 1225
1288ENTRY(page_fault) 1226ENTRY(page_fault)
1289 RING0_EC_FRAME 1227 RING0_EC_FRAME
1290 pushl $do_page_fault 1228 pushl_cfi $do_page_fault
1291 CFI_ADJUST_CFA_OFFSET 4
1292 ALIGN 1229 ALIGN
1293error_code: 1230error_code:
1294 /* the function address is in %gs's slot on the stack */ 1231 /* the function address is in %gs's slot on the stack */
1295 pushl %fs 1232 pushl_cfi %fs
1296 CFI_ADJUST_CFA_OFFSET 4
1297 /*CFI_REL_OFFSET fs, 0*/ 1233 /*CFI_REL_OFFSET fs, 0*/
1298 pushl %es 1234 pushl_cfi %es
1299 CFI_ADJUST_CFA_OFFSET 4
1300 /*CFI_REL_OFFSET es, 0*/ 1235 /*CFI_REL_OFFSET es, 0*/
1301 pushl %ds 1236 pushl_cfi %ds
1302 CFI_ADJUST_CFA_OFFSET 4
1303 /*CFI_REL_OFFSET ds, 0*/ 1237 /*CFI_REL_OFFSET ds, 0*/
1304 pushl %eax 1238 pushl_cfi %eax
1305 CFI_ADJUST_CFA_OFFSET 4
1306 CFI_REL_OFFSET eax, 0 1239 CFI_REL_OFFSET eax, 0
1307 pushl %ebp 1240 pushl_cfi %ebp
1308 CFI_ADJUST_CFA_OFFSET 4
1309 CFI_REL_OFFSET ebp, 0 1241 CFI_REL_OFFSET ebp, 0
1310 pushl %edi 1242 pushl_cfi %edi
1311 CFI_ADJUST_CFA_OFFSET 4
1312 CFI_REL_OFFSET edi, 0 1243 CFI_REL_OFFSET edi, 0
1313 pushl %esi 1244 pushl_cfi %esi
1314 CFI_ADJUST_CFA_OFFSET 4
1315 CFI_REL_OFFSET esi, 0 1245 CFI_REL_OFFSET esi, 0
1316 pushl %edx 1246 pushl_cfi %edx
1317 CFI_ADJUST_CFA_OFFSET 4
1318 CFI_REL_OFFSET edx, 0 1247 CFI_REL_OFFSET edx, 0
1319 pushl %ecx 1248 pushl_cfi %ecx
1320 CFI_ADJUST_CFA_OFFSET 4
1321 CFI_REL_OFFSET ecx, 0 1249 CFI_REL_OFFSET ecx, 0
1322 pushl %ebx 1250 pushl_cfi %ebx
1323 CFI_ADJUST_CFA_OFFSET 4
1324 CFI_REL_OFFSET ebx, 0 1251 CFI_REL_OFFSET ebx, 0
1325 cld 1252 cld
1326 movl $(__KERNEL_PERCPU), %ecx 1253 movl $(__KERNEL_PERCPU), %ecx
@@ -1362,12 +1289,9 @@ END(page_fault)
1362 movl TSS_sysenter_sp0 + \offset(%esp), %esp 1289 movl TSS_sysenter_sp0 + \offset(%esp), %esp
1363 CFI_DEF_CFA esp, 0 1290 CFI_DEF_CFA esp, 0
1364 CFI_UNDEFINED eip 1291 CFI_UNDEFINED eip
1365 pushfl 1292 pushfl_cfi
1366 CFI_ADJUST_CFA_OFFSET 4 1293 pushl_cfi $__KERNEL_CS
1367 pushl $__KERNEL_CS 1294 pushl_cfi $sysenter_past_esp
1368 CFI_ADJUST_CFA_OFFSET 4
1369 pushl $sysenter_past_esp
1370 CFI_ADJUST_CFA_OFFSET 4
1371 CFI_REL_OFFSET eip, 0 1295 CFI_REL_OFFSET eip, 0
1372.endm 1296.endm
1373 1297
@@ -1377,8 +1301,7 @@ ENTRY(debug)
1377 jne debug_stack_correct 1301 jne debug_stack_correct
1378 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn 1302 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
1379debug_stack_correct: 1303debug_stack_correct:
1380 pushl $-1 # mark this as an int 1304 pushl_cfi $-1 # mark this as an int
1381 CFI_ADJUST_CFA_OFFSET 4
1382 SAVE_ALL 1305 SAVE_ALL
1383 TRACE_IRQS_OFF 1306 TRACE_IRQS_OFF
1384 xorl %edx,%edx # error code 0 1307 xorl %edx,%edx # error code 0
@@ -1398,32 +1321,27 @@ END(debug)
1398 */ 1321 */
1399ENTRY(nmi) 1322ENTRY(nmi)
1400 RING0_INT_FRAME 1323 RING0_INT_FRAME
1401 pushl %eax 1324 pushl_cfi %eax
1402 CFI_ADJUST_CFA_OFFSET 4
1403 movl %ss, %eax 1325 movl %ss, %eax
1404 cmpw $__ESPFIX_SS, %ax 1326 cmpw $__ESPFIX_SS, %ax
1405 popl %eax 1327 popl_cfi %eax
1406 CFI_ADJUST_CFA_OFFSET -4
1407 je nmi_espfix_stack 1328 je nmi_espfix_stack
1408 cmpl $ia32_sysenter_target,(%esp) 1329 cmpl $ia32_sysenter_target,(%esp)
1409 je nmi_stack_fixup 1330 je nmi_stack_fixup
1410 pushl %eax 1331 pushl_cfi %eax
1411 CFI_ADJUST_CFA_OFFSET 4
1412 movl %esp,%eax 1332 movl %esp,%eax
1413 /* Do not access memory above the end of our stack page, 1333 /* Do not access memory above the end of our stack page,
1414 * it might not exist. 1334 * it might not exist.
1415 */ 1335 */
1416 andl $(THREAD_SIZE-1),%eax 1336 andl $(THREAD_SIZE-1),%eax
1417 cmpl $(THREAD_SIZE-20),%eax 1337 cmpl $(THREAD_SIZE-20),%eax
1418 popl %eax 1338 popl_cfi %eax
1419 CFI_ADJUST_CFA_OFFSET -4
1420 jae nmi_stack_correct 1339 jae nmi_stack_correct
1421 cmpl $ia32_sysenter_target,12(%esp) 1340 cmpl $ia32_sysenter_target,12(%esp)
1422 je nmi_debug_stack_check 1341 je nmi_debug_stack_check
1423nmi_stack_correct: 1342nmi_stack_correct:
1424 /* We have a RING0_INT_FRAME here */ 1343 /* We have a RING0_INT_FRAME here */
1425 pushl %eax 1344 pushl_cfi %eax
1426 CFI_ADJUST_CFA_OFFSET 4
1427 SAVE_ALL 1345 SAVE_ALL
1428 xorl %edx,%edx # zero error code 1346 xorl %edx,%edx # zero error code
1429 movl %esp,%eax # pt_regs pointer 1347 movl %esp,%eax # pt_regs pointer
@@ -1452,18 +1370,14 @@ nmi_espfix_stack:
1452 * 1370 *
1453 * create the pointer to lss back 1371 * create the pointer to lss back
1454 */ 1372 */
1455 pushl %ss 1373 pushl_cfi %ss
1456 CFI_ADJUST_CFA_OFFSET 4 1374 pushl_cfi %esp
1457 pushl %esp
1458 CFI_ADJUST_CFA_OFFSET 4
1459 addl $4, (%esp) 1375 addl $4, (%esp)
1460 /* copy the iret frame of 12 bytes */ 1376 /* copy the iret frame of 12 bytes */
1461 .rept 3 1377 .rept 3
1462 pushl 16(%esp) 1378 pushl_cfi 16(%esp)
1463 CFI_ADJUST_CFA_OFFSET 4
1464 .endr 1379 .endr
1465 pushl %eax 1380 pushl_cfi %eax
1466 CFI_ADJUST_CFA_OFFSET 4
1467 SAVE_ALL 1381 SAVE_ALL
1468 FIXUP_ESPFIX_STACK # %eax == %esp 1382 FIXUP_ESPFIX_STACK # %eax == %esp
1469 xorl %edx,%edx # zero error code 1383 xorl %edx,%edx # zero error code
@@ -1477,8 +1391,7 @@ END(nmi)
1477 1391
1478ENTRY(int3) 1392ENTRY(int3)
1479 RING0_INT_FRAME 1393 RING0_INT_FRAME
1480 pushl $-1 # mark this as an int 1394 pushl_cfi $-1 # mark this as an int
1481 CFI_ADJUST_CFA_OFFSET 4
1482 SAVE_ALL 1395 SAVE_ALL
1483 TRACE_IRQS_OFF 1396 TRACE_IRQS_OFF
1484 xorl %edx,%edx # zero error code 1397 xorl %edx,%edx # zero error code
@@ -1490,12 +1403,20 @@ END(int3)
1490 1403
1491ENTRY(general_protection) 1404ENTRY(general_protection)
1492 RING0_EC_FRAME 1405 RING0_EC_FRAME
1493 pushl $do_general_protection 1406 pushl_cfi $do_general_protection
1494 CFI_ADJUST_CFA_OFFSET 4
1495 jmp error_code 1407 jmp error_code
1496 CFI_ENDPROC 1408 CFI_ENDPROC
1497END(general_protection) 1409END(general_protection)
1498 1410
1411#ifdef CONFIG_KVM_GUEST
1412ENTRY(async_page_fault)
1413 RING0_EC_FRAME
1414 pushl_cfi $do_async_page_fault
1415 jmp error_code
1416 CFI_ENDPROC
1417END(async_page_fault)
1418#endif
1419
1499/* 1420/*
1500 * End of kprobes section 1421 * End of kprobes section
1501 */ 1422 */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 115e8951e8c8..47a4bcd2e503 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -18,7 +18,7 @@
18 * A note on terminology: 18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP 19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers upto R11. 21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
23 * 23 *
24 * Some macro usage: 24 * Some macro usage:
@@ -61,6 +61,8 @@
61#define __AUDIT_ARCH_LE 0x40000000 61#define __AUDIT_ARCH_LE 0x40000000
62 62
63 .code64 63 .code64
64 .section .entry.text, "ax"
65
64#ifdef CONFIG_FUNCTION_TRACER 66#ifdef CONFIG_FUNCTION_TRACER
65#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 68ENTRY(mcount)
@@ -213,23 +215,17 @@ ENDPROC(native_usergs_sysret64)
213 .macro FAKE_STACK_FRAME child_rip 215 .macro FAKE_STACK_FRAME child_rip
214 /* push in order ss, rsp, eflags, cs, rip */ 216 /* push in order ss, rsp, eflags, cs, rip */
215 xorl %eax, %eax 217 xorl %eax, %eax
216 pushq $__KERNEL_DS /* ss */ 218 pushq_cfi $__KERNEL_DS /* ss */
217 CFI_ADJUST_CFA_OFFSET 8
218 /*CFI_REL_OFFSET ss,0*/ 219 /*CFI_REL_OFFSET ss,0*/
219 pushq %rax /* rsp */ 220 pushq_cfi %rax /* rsp */
220 CFI_ADJUST_CFA_OFFSET 8
221 CFI_REL_OFFSET rsp,0 221 CFI_REL_OFFSET rsp,0
222 pushq $X86_EFLAGS_IF /* eflags - interrupts on */ 222 pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
223 CFI_ADJUST_CFA_OFFSET 8
224 /*CFI_REL_OFFSET rflags,0*/ 223 /*CFI_REL_OFFSET rflags,0*/
225 pushq $__KERNEL_CS /* cs */ 224 pushq_cfi $__KERNEL_CS /* cs */
226 CFI_ADJUST_CFA_OFFSET 8
227 /*CFI_REL_OFFSET cs,0*/ 225 /*CFI_REL_OFFSET cs,0*/
228 pushq \child_rip /* rip */ 226 pushq_cfi \child_rip /* rip */
229 CFI_ADJUST_CFA_OFFSET 8
230 CFI_REL_OFFSET rip,0 227 CFI_REL_OFFSET rip,0
231 pushq %rax /* orig rax */ 228 pushq_cfi %rax /* orig rax */
232 CFI_ADJUST_CFA_OFFSET 8
233 .endm 229 .endm
234 230
235 .macro UNFAKE_STACK_FRAME 231 .macro UNFAKE_STACK_FRAME
@@ -301,20 +297,25 @@ ENDPROC(native_usergs_sysret64)
301 .endm 297 .endm
302 298
303/* save partial stack frame */ 299/* save partial stack frame */
300 .pushsection .kprobes.text, "ax"
304ENTRY(save_args) 301ENTRY(save_args)
305 XCPT_FRAME 302 XCPT_FRAME
306 cld 303 cld
307 movq_cfi rdi, RDI+16-ARGOFFSET 304 /*
308 movq_cfi rsi, RSI+16-ARGOFFSET 305 * start from rbp in pt_regs and jump over
309 movq_cfi rdx, RDX+16-ARGOFFSET 306 * return address.
310 movq_cfi rcx, RCX+16-ARGOFFSET 307 */
311 movq_cfi rax, RAX+16-ARGOFFSET 308 movq_cfi rdi, RDI+8-RBP
312 movq_cfi r8, R8+16-ARGOFFSET 309 movq_cfi rsi, RSI+8-RBP
313 movq_cfi r9, R9+16-ARGOFFSET 310 movq_cfi rdx, RDX+8-RBP
314 movq_cfi r10, R10+16-ARGOFFSET 311 movq_cfi rcx, RCX+8-RBP
315 movq_cfi r11, R11+16-ARGOFFSET 312 movq_cfi rax, RAX+8-RBP
316 313 movq_cfi r8, R8+8-RBP
317 leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ 314 movq_cfi r9, R9+8-RBP
315 movq_cfi r10, R10+8-RBP
316 movq_cfi r11, R11+8-RBP
317
318 leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
318 movq_cfi rbp, 8 /* push %rbp */ 319 movq_cfi rbp, 8 /* push %rbp */
319 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ 320 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
320 testl $3, CS(%rdi) 321 testl $3, CS(%rdi)
@@ -340,6 +341,7 @@ ENTRY(save_args)
340 ret 341 ret
341 CFI_ENDPROC 342 CFI_ENDPROC
342END(save_args) 343END(save_args)
344 .popsection
343 345
344ENTRY(save_rest) 346ENTRY(save_rest)
345 PARTIAL_FRAME 1 REST_SKIP+8 347 PARTIAL_FRAME 1 REST_SKIP+8
@@ -398,10 +400,8 @@ ENTRY(ret_from_fork)
398 400
399 LOCK ; btr $TIF_FORK,TI_flags(%r8) 401 LOCK ; btr $TIF_FORK,TI_flags(%r8)
400 402
401 push kernel_eflags(%rip) 403 pushq_cfi kernel_eflags(%rip)
402 CFI_ADJUST_CFA_OFFSET 8 404 popfq_cfi # reset kernel eflags
403 popf # reset kernel eflags
404 CFI_ADJUST_CFA_OFFSET -8
405 405
406 call schedule_tail # rdi: 'prev' task parameter 406 call schedule_tail # rdi: 'prev' task parameter
407 407
@@ -422,7 +422,7 @@ ENTRY(ret_from_fork)
422END(ret_from_fork) 422END(ret_from_fork)
423 423
424/* 424/*
425 * System call entry. Upto 6 arguments in registers are supported. 425 * System call entry. Up to 6 arguments in registers are supported.
426 * 426 *
427 * SYSCALL does not save anything on the stack and does not change the 427 * SYSCALL does not save anything on the stack and does not change the
428 * stack pointer. 428 * stack pointer.
@@ -521,11 +521,9 @@ sysret_careful:
521 jnc sysret_signal 521 jnc sysret_signal
522 TRACE_IRQS_ON 522 TRACE_IRQS_ON
523 ENABLE_INTERRUPTS(CLBR_NONE) 523 ENABLE_INTERRUPTS(CLBR_NONE)
524 pushq %rdi 524 pushq_cfi %rdi
525 CFI_ADJUST_CFA_OFFSET 8
526 call schedule 525 call schedule
527 popq %rdi 526 popq_cfi %rdi
528 CFI_ADJUST_CFA_OFFSET -8
529 jmp sysret_check 527 jmp sysret_check
530 528
531 /* Handle a signal */ 529 /* Handle a signal */
@@ -634,11 +632,9 @@ int_careful:
634 jnc int_very_careful 632 jnc int_very_careful
635 TRACE_IRQS_ON 633 TRACE_IRQS_ON
636 ENABLE_INTERRUPTS(CLBR_NONE) 634 ENABLE_INTERRUPTS(CLBR_NONE)
637 pushq %rdi 635 pushq_cfi %rdi
638 CFI_ADJUST_CFA_OFFSET 8
639 call schedule 636 call schedule
640 popq %rdi 637 popq_cfi %rdi
641 CFI_ADJUST_CFA_OFFSET -8
642 DISABLE_INTERRUPTS(CLBR_NONE) 638 DISABLE_INTERRUPTS(CLBR_NONE)
643 TRACE_IRQS_OFF 639 TRACE_IRQS_OFF
644 jmp int_with_check 640 jmp int_with_check
@@ -652,12 +648,10 @@ int_check_syscall_exit_work:
652 /* Check for syscall exit trace */ 648 /* Check for syscall exit trace */
653 testl $_TIF_WORK_SYSCALL_EXIT,%edx 649 testl $_TIF_WORK_SYSCALL_EXIT,%edx
654 jz int_signal 650 jz int_signal
655 pushq %rdi 651 pushq_cfi %rdi
656 CFI_ADJUST_CFA_OFFSET 8
657 leaq 8(%rsp),%rdi # &ptregs -> arg1 652 leaq 8(%rsp),%rdi # &ptregs -> arg1
658 call syscall_trace_leave 653 call syscall_trace_leave
659 popq %rdi 654 popq_cfi %rdi
660 CFI_ADJUST_CFA_OFFSET -8
661 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi 655 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
662 jmp int_restore_rest 656 jmp int_restore_rest
663 657
@@ -714,9 +708,8 @@ END(ptregscall_common)
714 708
715ENTRY(stub_execve) 709ENTRY(stub_execve)
716 CFI_STARTPROC 710 CFI_STARTPROC
717 popq %r11 711 addq $8, %rsp
718 CFI_ADJUST_CFA_OFFSET -8 712 PARTIAL_FRAME 0
719 CFI_REGISTER rip, r11
720 SAVE_REST 713 SAVE_REST
721 FIXUP_TOP_OF_STACK %r11 714 FIXUP_TOP_OF_STACK %r11
722 movq %rsp, %rcx 715 movq %rsp, %rcx
@@ -735,7 +728,7 @@ END(stub_execve)
735ENTRY(stub_rt_sigreturn) 728ENTRY(stub_rt_sigreturn)
736 CFI_STARTPROC 729 CFI_STARTPROC
737 addq $8, %rsp 730 addq $8, %rsp
738 CFI_ADJUST_CFA_OFFSET -8 731 PARTIAL_FRAME 0
739 SAVE_REST 732 SAVE_REST
740 movq %rsp,%rdi 733 movq %rsp,%rdi
741 FIXUP_TOP_OF_STACK %r11 734 FIXUP_TOP_OF_STACK %r11
@@ -753,7 +746,7 @@ END(stub_rt_sigreturn)
753 */ 746 */
754 .section .init.rodata,"a" 747 .section .init.rodata,"a"
755ENTRY(interrupt) 748ENTRY(interrupt)
756 .text 749 .section .entry.text
757 .p2align 5 750 .p2align 5
758 .p2align CONFIG_X86_L1_CACHE_SHIFT 751 .p2align CONFIG_X86_L1_CACHE_SHIFT
759ENTRY(irq_entries_start) 752ENTRY(irq_entries_start)
@@ -766,14 +759,13 @@ vector=FIRST_EXTERNAL_VECTOR
766 .if vector <> FIRST_EXTERNAL_VECTOR 759 .if vector <> FIRST_EXTERNAL_VECTOR
767 CFI_ADJUST_CFA_OFFSET -8 760 CFI_ADJUST_CFA_OFFSET -8
768 .endif 761 .endif
7691: pushq $(~vector+0x80) /* Note: always in signed byte range */ 7621: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
770 CFI_ADJUST_CFA_OFFSET 8
771 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 763 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
772 jmp 2f 764 jmp 2f
773 .endif 765 .endif
774 .previous 766 .previous
775 .quad 1b 767 .quad 1b
776 .text 768 .section .entry.text
777vector=vector+1 769vector=vector+1
778 .endif 770 .endif
779 .endr 771 .endr
@@ -796,8 +788,9 @@ END(interrupt)
796 788
797/* 0(%rsp): ~(interrupt number) */ 789/* 0(%rsp): ~(interrupt number) */
798 .macro interrupt func 790 .macro interrupt func
799 subq $10*8, %rsp 791 /* reserve pt_regs for scratch regs and rbp */
800 CFI_ADJUST_CFA_OFFSET 10*8 792 subq $ORIG_RAX-RBP, %rsp
793 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
801 call save_args 794 call save_args
802 PARTIAL_FRAME 0 795 PARTIAL_FRAME 0
803 call \func 796 call \func
@@ -822,8 +815,14 @@ ret_from_intr:
822 TRACE_IRQS_OFF 815 TRACE_IRQS_OFF
823 decl PER_CPU_VAR(irq_count) 816 decl PER_CPU_VAR(irq_count)
824 leaveq 817 leaveq
818
819 CFI_RESTORE rbp
825 CFI_DEF_CFA_REGISTER rsp 820 CFI_DEF_CFA_REGISTER rsp
826 CFI_ADJUST_CFA_OFFSET -8 821 CFI_ADJUST_CFA_OFFSET -8
822
823 /* we did not save rbx, restore only from ARGOFFSET */
824 addq $8, %rsp
825 CFI_ADJUST_CFA_OFFSET -8
827exit_intr: 826exit_intr:
828 GET_THREAD_INFO(%rcx) 827 GET_THREAD_INFO(%rcx)
829 testl $3,CS-ARGOFFSET(%rsp) 828 testl $3,CS-ARGOFFSET(%rsp)
@@ -903,11 +902,9 @@ retint_careful:
903 jnc retint_signal 902 jnc retint_signal
904 TRACE_IRQS_ON 903 TRACE_IRQS_ON
905 ENABLE_INTERRUPTS(CLBR_NONE) 904 ENABLE_INTERRUPTS(CLBR_NONE)
906 pushq %rdi 905 pushq_cfi %rdi
907 CFI_ADJUST_CFA_OFFSET 8
908 call schedule 906 call schedule
909 popq %rdi 907 popq_cfi %rdi
910 CFI_ADJUST_CFA_OFFSET -8
911 GET_THREAD_INFO(%rcx) 908 GET_THREAD_INFO(%rcx)
912 DISABLE_INTERRUPTS(CLBR_NONE) 909 DISABLE_INTERRUPTS(CLBR_NONE)
913 TRACE_IRQS_OFF 910 TRACE_IRQS_OFF
@@ -956,8 +953,7 @@ END(common_interrupt)
956.macro apicinterrupt num sym do_sym 953.macro apicinterrupt num sym do_sym
957ENTRY(\sym) 954ENTRY(\sym)
958 INTR_FRAME 955 INTR_FRAME
959 pushq $~(\num) 956 pushq_cfi $~(\num)
960 CFI_ADJUST_CFA_OFFSET 8
961 interrupt \do_sym 957 interrupt \do_sym
962 jmp ret_from_intr 958 jmp ret_from_intr
963 CFI_ENDPROC 959 CFI_ENDPROC
@@ -981,22 +977,13 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
981 x86_platform_ipi smp_x86_platform_ipi 977 x86_platform_ipi smp_x86_platform_ipi
982 978
983#ifdef CONFIG_SMP 979#ifdef CONFIG_SMP
984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 980.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
985 invalidate_interrupt0 smp_invalidate_interrupt 981 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
986apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ 982.if NUM_INVALIDATE_TLB_VECTORS > \idx
987 invalidate_interrupt1 smp_invalidate_interrupt 983apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
988apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ 984 invalidate_interrupt\idx smp_invalidate_interrupt
989 invalidate_interrupt2 smp_invalidate_interrupt 985.endif
990apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ 986.endr
991 invalidate_interrupt3 smp_invalidate_interrupt
992apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
993 invalidate_interrupt4 smp_invalidate_interrupt
994apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
995 invalidate_interrupt5 smp_invalidate_interrupt
996apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
997 invalidate_interrupt6 smp_invalidate_interrupt
998apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
999 invalidate_interrupt7 smp_invalidate_interrupt
1000#endif 987#endif
1001 988
1002apicinterrupt THRESHOLD_APIC_VECTOR \ 989apicinterrupt THRESHOLD_APIC_VECTOR \
@@ -1025,9 +1012,9 @@ apicinterrupt ERROR_APIC_VECTOR \
1025apicinterrupt SPURIOUS_APIC_VECTOR \ 1012apicinterrupt SPURIOUS_APIC_VECTOR \
1026 spurious_interrupt smp_spurious_interrupt 1013 spurious_interrupt smp_spurious_interrupt
1027 1014
1028#ifdef CONFIG_PERF_EVENTS 1015#ifdef CONFIG_IRQ_WORK
1029apicinterrupt LOCAL_PENDING_VECTOR \ 1016apicinterrupt IRQ_WORK_VECTOR \
1030 perf_pending_interrupt smp_perf_pending_interrupt 1017 irq_work_interrupt smp_irq_work_interrupt
1031#endif 1018#endif
1032 1019
1033/* 1020/*
@@ -1038,8 +1025,8 @@ ENTRY(\sym)
1038 INTR_FRAME 1025 INTR_FRAME
1039 PARAVIRT_ADJUST_EXCEPTION_FRAME 1026 PARAVIRT_ADJUST_EXCEPTION_FRAME
1040 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1027 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1041 subq $15*8,%rsp 1028 subq $ORIG_RAX-R15, %rsp
1042 CFI_ADJUST_CFA_OFFSET 15*8 1029 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1043 call error_entry 1030 call error_entry
1044 DEFAULT_FRAME 0 1031 DEFAULT_FRAME 0
1045 movq %rsp,%rdi /* pt_regs pointer */ 1032 movq %rsp,%rdi /* pt_regs pointer */
@@ -1054,9 +1041,9 @@ END(\sym)
1054ENTRY(\sym) 1041ENTRY(\sym)
1055 INTR_FRAME 1042 INTR_FRAME
1056 PARAVIRT_ADJUST_EXCEPTION_FRAME 1043 PARAVIRT_ADJUST_EXCEPTION_FRAME
1057 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1044 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1058 CFI_ADJUST_CFA_OFFSET 8 1045 subq $ORIG_RAX-R15, %rsp
1059 subq $15*8, %rsp 1046 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1060 call save_paranoid 1047 call save_paranoid
1061 TRACE_IRQS_OFF 1048 TRACE_IRQS_OFF
1062 movq %rsp,%rdi /* pt_regs pointer */ 1049 movq %rsp,%rdi /* pt_regs pointer */
@@ -1072,9 +1059,9 @@ END(\sym)
1072ENTRY(\sym) 1059ENTRY(\sym)
1073 INTR_FRAME 1060 INTR_FRAME
1074 PARAVIRT_ADJUST_EXCEPTION_FRAME 1061 PARAVIRT_ADJUST_EXCEPTION_FRAME
1075 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1062 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1076 CFI_ADJUST_CFA_OFFSET 8 1063 subq $ORIG_RAX-R15, %rsp
1077 subq $15*8, %rsp 1064 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1078 call save_paranoid 1065 call save_paranoid
1079 TRACE_IRQS_OFF 1066 TRACE_IRQS_OFF
1080 movq %rsp,%rdi /* pt_regs pointer */ 1067 movq %rsp,%rdi /* pt_regs pointer */
@@ -1091,8 +1078,8 @@ END(\sym)
1091ENTRY(\sym) 1078ENTRY(\sym)
1092 XCPT_FRAME 1079 XCPT_FRAME
1093 PARAVIRT_ADJUST_EXCEPTION_FRAME 1080 PARAVIRT_ADJUST_EXCEPTION_FRAME
1094 subq $15*8,%rsp 1081 subq $ORIG_RAX-R15, %rsp
1095 CFI_ADJUST_CFA_OFFSET 15*8 1082 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1096 call error_entry 1083 call error_entry
1097 DEFAULT_FRAME 0 1084 DEFAULT_FRAME 0
1098 movq %rsp,%rdi /* pt_regs pointer */ 1085 movq %rsp,%rdi /* pt_regs pointer */
@@ -1109,8 +1096,8 @@ END(\sym)
1109ENTRY(\sym) 1096ENTRY(\sym)
1110 XCPT_FRAME 1097 XCPT_FRAME
1111 PARAVIRT_ADJUST_EXCEPTION_FRAME 1098 PARAVIRT_ADJUST_EXCEPTION_FRAME
1112 subq $15*8,%rsp 1099 subq $ORIG_RAX-R15, %rsp
1113 CFI_ADJUST_CFA_OFFSET 15*8 1100 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1114 call save_paranoid 1101 call save_paranoid
1115 DEFAULT_FRAME 0 1102 DEFAULT_FRAME 0
1116 TRACE_IRQS_OFF 1103 TRACE_IRQS_OFF
@@ -1141,16 +1128,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error
1141 /* edi: new selector */ 1128 /* edi: new selector */
1142ENTRY(native_load_gs_index) 1129ENTRY(native_load_gs_index)
1143 CFI_STARTPROC 1130 CFI_STARTPROC
1144 pushf 1131 pushfq_cfi
1145 CFI_ADJUST_CFA_OFFSET 8
1146 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) 1132 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1147 SWAPGS 1133 SWAPGS
1148gs_change: 1134gs_change:
1149 movl %edi,%gs 1135 movl %edi,%gs
11502: mfence /* workaround */ 11362: mfence /* workaround */
1151 SWAPGS 1137 SWAPGS
1152 popf 1138 popfq_cfi
1153 CFI_ADJUST_CFA_OFFSET -8
1154 ret 1139 ret
1155 CFI_ENDPROC 1140 CFI_ENDPROC
1156END(native_load_gs_index) 1141END(native_load_gs_index)
@@ -1217,8 +1202,7 @@ END(kernel_execve)
1217/* Call softirq on interrupt stack. Interrupts are off. */ 1202/* Call softirq on interrupt stack. Interrupts are off. */
1218ENTRY(call_softirq) 1203ENTRY(call_softirq)
1219 CFI_STARTPROC 1204 CFI_STARTPROC
1220 push %rbp 1205 pushq_cfi %rbp
1221 CFI_ADJUST_CFA_OFFSET 8
1222 CFI_REL_OFFSET rbp,0 1206 CFI_REL_OFFSET rbp,0
1223 mov %rsp,%rbp 1207 mov %rsp,%rbp
1224 CFI_DEF_CFA_REGISTER rbp 1208 CFI_DEF_CFA_REGISTER rbp
@@ -1227,6 +1211,7 @@ ENTRY(call_softirq)
1227 push %rbp # backlink for old unwinder 1211 push %rbp # backlink for old unwinder
1228 call __do_softirq 1212 call __do_softirq
1229 leaveq 1213 leaveq
1214 CFI_RESTORE rbp
1230 CFI_DEF_CFA_REGISTER rsp 1215 CFI_DEF_CFA_REGISTER rsp
1231 CFI_ADJUST_CFA_OFFSET -8 1216 CFI_ADJUST_CFA_OFFSET -8
1232 decl PER_CPU_VAR(irq_count) 1217 decl PER_CPU_VAR(irq_count)
@@ -1270,7 +1255,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1270 decl PER_CPU_VAR(irq_count) 1255 decl PER_CPU_VAR(irq_count)
1271 jmp error_exit 1256 jmp error_exit
1272 CFI_ENDPROC 1257 CFI_ENDPROC
1273END(do_hypervisor_callback) 1258END(xen_do_hypervisor_callback)
1274 1259
1275/* 1260/*
1276 * Hypervisor uses this for application faults while it executes. 1261 * Hypervisor uses this for application faults while it executes.
@@ -1351,6 +1336,9 @@ errorentry xen_stack_segment do_stack_segment
1351#endif 1336#endif
1352errorentry general_protection do_general_protection 1337errorentry general_protection do_general_protection
1353errorentry page_fault do_page_fault 1338errorentry page_fault do_page_fault
1339#ifdef CONFIG_KVM_GUEST
1340errorentry async_page_fault do_async_page_fault
1341#endif
1354#ifdef CONFIG_X86_MCE 1342#ifdef CONFIG_X86_MCE
1355paranoidzeroentry machine_check *machine_check_vector(%rip) 1343paranoidzeroentry machine_check *machine_check_vector(%rip)
1356#endif 1344#endif
@@ -1370,7 +1358,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
1370 1358
1371 /* ebx: no swapgs flag */ 1359 /* ebx: no swapgs flag */
1372ENTRY(paranoid_exit) 1360ENTRY(paranoid_exit)
1373 INTR_FRAME 1361 DEFAULT_FRAME
1374 DISABLE_INTERRUPTS(CLBR_NONE) 1362 DISABLE_INTERRUPTS(CLBR_NONE)
1375 TRACE_IRQS_OFF 1363 TRACE_IRQS_OFF
1376 testl %ebx,%ebx /* swapgs needed? */ 1364 testl %ebx,%ebx /* swapgs needed? */
@@ -1447,7 +1435,6 @@ error_swapgs:
1447error_sti: 1435error_sti:
1448 TRACE_IRQS_OFF 1436 TRACE_IRQS_OFF
1449 ret 1437 ret
1450 CFI_ENDPROC
1451 1438
1452/* 1439/*
1453 * There are two places in the kernel that can potentially fault with 1440 * There are two places in the kernel that can potentially fault with
@@ -1472,6 +1459,7 @@ bstep_iret:
1472 /* Fix truncated RIP */ 1459 /* Fix truncated RIP */
1473 movq %rcx,RIP+8(%rsp) 1460 movq %rcx,RIP+8(%rsp)
1474 jmp error_swapgs 1461 jmp error_swapgs
1462 CFI_ENDPROC
1475END(error_entry) 1463END(error_entry)
1476 1464
1477 1465
@@ -1500,8 +1488,8 @@ ENTRY(nmi)
1500 INTR_FRAME 1488 INTR_FRAME
1501 PARAVIRT_ADJUST_EXCEPTION_FRAME 1489 PARAVIRT_ADJUST_EXCEPTION_FRAME
1502 pushq_cfi $-1 1490 pushq_cfi $-1
1503 subq $15*8, %rsp 1491 subq $ORIG_RAX-R15, %rsp
1504 CFI_ADJUST_CFA_OFFSET 15*8 1492 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1505 call save_paranoid 1493 call save_paranoid
1506 DEFAULT_FRAME 0 1494 DEFAULT_FRAME 0
1507 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1495 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
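Most of the entry_64.S churn above is mechanical: push/CFI_ADJUST_CFA_OFFSET pairs collapse into the pushq_cfi/popq_cfi/pushfq_cfi helpers, and hard-coded frame sizes such as 15*8 become differences of asm-offsets constants like ORIG_RAX-R15 or ORIG_RAX-RBP. The sketch below is an illustration only of where those constants come from: fake_pt_regs is a local stand-in for the x86-64 pt_regs register order rather than the kernel's <asm/ptrace.h>, and the printed values show why ORIG_RAX-R15 is exactly the 15 saved general-purpose registers and ORIG_RAX-RBP the slots above rbp that the reworked interrupt macro reserves.

#include <stdio.h>
#include <stddef.h>

/*
 * Illustration only: a local mirror of the x86-64 pt_regs register
 * order, not the kernel header.  The asm-offsets constants used above
 * (R15, RBP, ORIG_RAX, ...) are byte offsets into the real structure,
 * so the differences printed here are the frame sizes the entry code
 * now spells out symbolically.
 */
struct fake_pt_regs {
	unsigned long r15, r14, r13, r12, bp, bx;
	unsigned long r11, r10, r9, r8, ax, cx, dx, si, di;
	unsigned long orig_ax;	/* filled by the entry stubs: syscall nr, error code, or -1 */
	unsigned long ip, cs, flags, sp, ss;
};

int main(void)
{
	size_t r15      = offsetof(struct fake_pt_regs, r15);
	size_t rbp      = offsetof(struct fake_pt_regs, bp);
	size_t orig_rax = offsetof(struct fake_pt_regs, orig_ax);

	/* 120 bytes, i.e. the old hard-coded "subq $15*8, %rsp" */
	printf("ORIG_RAX-R15 = %zu\n", orig_rax - r15);
	/* 88 bytes: rbp, rbx and the nine scratch registers */
	printf("ORIG_RAX-RBP = %zu\n", orig_rax - rbp);
	return 0;
}

Writing the subtractions symbolically keeps the entry code correct if pt_regs ever grows or is reordered, which is the point of replacing the literal 15*8 and 10*8 adjustments.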
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cd37469b54ee..c9a281f272fd 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -19,6 +19,7 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/list.h> 21#include <linux/list.h>
22#include <linux/module.h>
22 23
23#include <trace/syscall.h> 24#include <trace/syscall.h>
24 25
@@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code);
49int ftrace_arch_code_modify_prepare(void) 50int ftrace_arch_code_modify_prepare(void)
50{ 51{
51 set_kernel_text_rw(); 52 set_kernel_text_rw();
53 set_all_modules_text_rw();
52 modifying_code = 1; 54 modifying_code = 1;
53 return 0; 55 return 0;
54} 56}
@@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void)
56int ftrace_arch_code_modify_post_process(void) 58int ftrace_arch_code_modify_post_process(void)
57{ 59{
58 modifying_code = 0; 60 modifying_code = 0;
61 set_all_modules_text_ro();
59 set_kernel_text_ro(); 62 set_kernel_text_ro();
60 return 0; 63 return 0;
61} 64}
@@ -120,7 +123,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
120static atomic_t nmi_running = ATOMIC_INIT(0); 123static atomic_t nmi_running = ATOMIC_INIT(0);
121static int mod_code_status; /* holds return value of text write */ 124static int mod_code_status; /* holds return value of text write */
122static void *mod_code_ip; /* holds the IP to write to */ 125static void *mod_code_ip; /* holds the IP to write to */
123static void *mod_code_newcode; /* holds the text to write to the IP */ 126static const void *mod_code_newcode; /* holds the text to write to the IP */
124 127
125static unsigned nmi_wait_count; 128static unsigned nmi_wait_count;
126static atomic_t nmi_update_count = ATOMIC_INIT(0); 129static atomic_t nmi_update_count = ATOMIC_INIT(0);
@@ -167,9 +170,9 @@ static void ftrace_mod_code(void)
167 170
168void ftrace_nmi_enter(void) 171void ftrace_nmi_enter(void)
169{ 172{
170 __get_cpu_var(save_modifying_code) = modifying_code; 173 __this_cpu_write(save_modifying_code, modifying_code);
171 174
172 if (!__get_cpu_var(save_modifying_code)) 175 if (!__this_cpu_read(save_modifying_code))
173 return; 176 return;
174 177
175 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { 178 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
@@ -183,7 +186,7 @@ void ftrace_nmi_enter(void)
183 186
184void ftrace_nmi_exit(void) 187void ftrace_nmi_exit(void)
185{ 188{
186 if (!__get_cpu_var(save_modifying_code)) 189 if (!__this_cpu_read(save_modifying_code))
187 return; 190 return;
188 191
189 /* Finish all executions before clearing nmi_running */ 192 /* Finish all executions before clearing nmi_running */
@@ -222,7 +225,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
222} 225}
223 226
224static int 227static int
225do_ftrace_mod_code(unsigned long ip, void *new_code) 228do_ftrace_mod_code(unsigned long ip, const void *new_code)
226{ 229{
227 /* 230 /*
228 * On x86_64, kernel text mappings are mapped read-only with 231 * On x86_64, kernel text mappings are mapped read-only with
@@ -257,19 +260,14 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
257 return mod_code_status; 260 return mod_code_status;
258} 261}
259 262
260 263static const unsigned char *ftrace_nop_replace(void)
261
262
263static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
264
265static unsigned char *ftrace_nop_replace(void)
266{ 264{
267 return ftrace_nop; 265 return ideal_nops[NOP_ATOMIC5];
268} 266}
269 267
270static int 268static int
271ftrace_modify_code(unsigned long ip, unsigned char *old_code, 269ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
272 unsigned char *new_code) 270 unsigned const char *new_code)
273{ 271{
274 unsigned char replaced[MCOUNT_INSN_SIZE]; 272 unsigned char replaced[MCOUNT_INSN_SIZE];
275 273
@@ -303,7 +301,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
303int ftrace_make_nop(struct module *mod, 301int ftrace_make_nop(struct module *mod,
304 struct dyn_ftrace *rec, unsigned long addr) 302 struct dyn_ftrace *rec, unsigned long addr)
305{ 303{
306 unsigned char *new, *old; 304 unsigned const char *new, *old;
307 unsigned long ip = rec->ip; 305 unsigned long ip = rec->ip;
308 306
309 old = ftrace_call_replace(ip, addr); 307 old = ftrace_call_replace(ip, addr);
@@ -314,7 +312,7 @@ int ftrace_make_nop(struct module *mod,
314 312
315int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) 313int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
316{ 314{
317 unsigned char *new, *old; 315 unsigned const char *new, *old;
318 unsigned long ip = rec->ip; 316 unsigned long ip = rec->ip;
319 317
320 old = ftrace_nop_replace(); 318 old = ftrace_nop_replace();
@@ -338,62 +336,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
338 336
339int __init ftrace_dyn_arch_init(void *data) 337int __init ftrace_dyn_arch_init(void *data)
340{ 338{
341 extern const unsigned char ftrace_test_p6nop[];
342 extern const unsigned char ftrace_test_nop5[];
343 extern const unsigned char ftrace_test_jmp[];
344 int faulted = 0;
345
346 /*
347 * There is no good nop for all x86 archs.
348 * We will default to using the P6_NOP5, but first we
349 * will test to make sure that the nop will actually
350 * work on this CPU. If it faults, we will then
351 * go to a lesser efficient 5 byte nop. If that fails
352 * we then just use a jmp as our nop. This isn't the most
353 * efficient nop, but we can not use a multi part nop
354 * since we would then risk being preempted in the middle
355 * of that nop, and if we enabled tracing then, it might
356 * cause a system crash.
357 *
358 * TODO: check the cpuid to determine the best nop.
359 */
360 asm volatile (
361 "ftrace_test_jmp:"
362 "jmp ftrace_test_p6nop\n"
363 "nop\n"
364 "nop\n"
365 "nop\n" /* 2 byte jmp + 3 bytes */
366 "ftrace_test_p6nop:"
367 P6_NOP5
368 "jmp 1f\n"
369 "ftrace_test_nop5:"
370 ".byte 0x66,0x66,0x66,0x66,0x90\n"
371 "1:"
372 ".section .fixup, \"ax\"\n"
373 "2: movl $1, %0\n"
374 " jmp ftrace_test_nop5\n"
375 "3: movl $2, %0\n"
376 " jmp 1b\n"
377 ".previous\n"
378 _ASM_EXTABLE(ftrace_test_p6nop, 2b)
379 _ASM_EXTABLE(ftrace_test_nop5, 3b)
380 : "=r"(faulted) : "0" (faulted));
381
382 switch (faulted) {
383 case 0:
384 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
385 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
386 break;
387 case 1:
388 pr_info("converting mcount calls to 66 66 66 66 90\n");
389 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
390 break;
391 case 2:
392 pr_info("converting mcount calls to jmp . + 5\n");
393 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
394 break;
395 }
396
397 /* The return code is retured via data */ 339 /* The return code is retured via data */
398 *(unsigned long *)data = 0; 340 *(unsigned long *)data = 0;
399 341
@@ -495,18 +437,19 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
495 return; 437 return;
496 } 438 }
497 439
498 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
499 frame_pointer) == -EBUSY) {
500 *parent = old;
501 return;
502 }
503
504 trace.func = self_addr; 440 trace.func = self_addr;
441 trace.depth = current->curr_ret_stack + 1;
505 442
506 /* Only trace if the calling function expects to */ 443 /* Only trace if the calling function expects to */
507 if (!ftrace_graph_entry(&trace)) { 444 if (!ftrace_graph_entry(&trace)) {
508 current->curr_ret_stack--;
509 *parent = old; 445 *parent = old;
446 return;
447 }
448
449 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
450 frame_pointer) == -EBUSY) {
451 *parent = old;
452 return;
510 } 453 }
511} 454}
512#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 455#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
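The ftrace.c hunks drop the boot-time NOP probing in favour of ideal_nops[NOP_ATOMIC5], constify the instruction buffers, and move the ftrace_graph_entry() filter check ahead of ftrace_push_return_trace(). The patching discipline in ftrace_modify_code() is unchanged: read the bytes currently at the call site, refuse to touch them unless they match what is expected, then write the replacement. The fragment below restates only that check against ordinary memory; patch_site() and INSN_SIZE are invented for the example, and the kernel reads live text with probe_kernel_read() and writes it through the NMI-safe do_ftrace_mod_code() path rather than plain memcpy().

#include <string.h>
#include <errno.h>

#define INSN_SIZE 5	/* stands in for MCOUNT_INSN_SIZE */

/* verify-then-patch: only rewrite a site whose current bytes are known */
static int patch_site(unsigned char *ip,
		      const unsigned char *old_code,
		      const unsigned char *new_code)
{
	unsigned char cur[INSN_SIZE];

	memcpy(cur, ip, INSN_SIZE);		/* snapshot what is there now */
	if (memcmp(cur, old_code, INSN_SIZE))	/* refuse to patch unexpected bytes */
		return -EINVAL;

	memcpy(ip, new_code, INSN_SIZE);	/* install the nop or the call */
	return 0;
}

int main(void)
{
	unsigned char text[INSN_SIZE]       = { 0xe8, 0x00, 0x00, 0x00, 0x00 }; /* stand-in call rel32 */
	const unsigned char call[INSN_SIZE] = { 0xe8, 0x00, 0x00, 0x00, 0x00 };
	const unsigned char nop5[INSN_SIZE] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; /* the 5-byte P6 nop */

	return patch_site(text, call, nop5);	/* 0 on success */
}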
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9d..af0699ba48cf 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -1,5 +1,6 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/init.h> 2#include <linux/init.h>
3#include <linux/memblock.h>
3 4
4#include <asm/setup.h> 5#include <asm/setup.h>
5#include <asm/bios_ebda.h> 6#include <asm/bios_ebda.h>
@@ -51,5 +52,5 @@ void __init reserve_ebda_region(void)
51 lowmem = 0x9f000; 52 lowmem = 0x9f000;
52 53
53 /* reserve all memory between lowmem and the 1MB mark */ 54 /* reserve all memory between lowmem and the 1MB mark */
54 reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved"); 55 memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
55} 56}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 784360c0625c..3bb08509a7a1 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/start_kernel.h> 9#include <linux/start_kernel.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/memblock.h>
11 12
12#include <asm/setup.h> 13#include <asm/setup.h>
13#include <asm/sections.h> 14#include <asm/sections.h>
@@ -17,11 +18,11 @@
17#include <asm/apic.h> 18#include <asm/apic.h>
18#include <asm/io_apic.h> 19#include <asm/io_apic.h>
19#include <asm/bios_ebda.h> 20#include <asm/bios_ebda.h>
21#include <asm/tlbflush.h>
20 22
21static void __init i386_default_early_setup(void) 23static void __init i386_default_early_setup(void)
22{ 24{
23 /* Initialize 32bit specific setup functions */ 25 /* Initialize 32bit specific setup functions */
24 x86_init.resources.probe_roms = probe_roms;
25 x86_init.resources.reserve_resources = i386_reserve_resources; 26 x86_init.resources.reserve_resources = i386_reserve_resources;
26 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; 27 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
27 28
@@ -30,17 +31,9 @@ static void __init i386_default_early_setup(void)
30 31
31void __init i386_start_kernel(void) 32void __init i386_start_kernel(void)
32{ 33{
33#ifdef CONFIG_X86_TRAMPOLINE 34 memblock_init();
34 /*
35 * But first pinch a few for the stack/trampoline stuff
36 * FIXME: Don't need the extra page at 4K, but need to fix
37 * trampoline before removing it. (see the GDT stuff)
38 */
39 reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
40 "EX TRAMPOLINE");
41#endif
42 35
43 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 36 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
44 37
45#ifdef CONFIG_BLK_DEV_INITRD 38#ifdef CONFIG_BLK_DEV_INITRD
46 /* Reserve INITRD */ 39 /* Reserve INITRD */
@@ -49,7 +42,7 @@ void __init i386_start_kernel(void)
49 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 42 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
50 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 43 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
51 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 44 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
52 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 45 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
53 } 46 }
54#endif 47#endif
55 48
@@ -58,6 +51,9 @@ void __init i386_start_kernel(void)
58 case X86_SUBARCH_MRST: 51 case X86_SUBARCH_MRST:
59 x86_mrst_early_setup(); 52 x86_mrst_early_setup();
60 break; 53 break;
54 case X86_SUBARCH_CE4100:
55 x86_ce4100_early_setup();
56 break;
61 default: 57 default:
62 i386_default_early_setup(); 58 i386_default_early_setup();
63 break; 59 break;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7147143fd614..5655c2272adb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -12,6 +12,7 @@
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/start_kernel.h> 13#include <linux/start_kernel.h>
14#include <linux/io.h> 14#include <linux/io.h>
15#include <linux/memblock.h>
15 16
16#include <asm/processor.h> 17#include <asm/processor.h>
17#include <asm/proto.h> 18#include <asm/proto.h>
@@ -76,8 +77,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
76 /* Make NULL pointers segfault */ 77 /* Make NULL pointers segfault */
77 zap_identity_mappings(); 78 zap_identity_mappings();
78 79
79 /* Cleanup the over mapped high alias */ 80 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
80 cleanup_highmap();
81 81
82 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { 82 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
83#ifdef CONFIG_EARLY_PRINTK 83#ifdef CONFIG_EARLY_PRINTK
@@ -98,7 +98,9 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 101 memblock_init();
102
103 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
102 104
103#ifdef CONFIG_BLK_DEV_INITRD 105#ifdef CONFIG_BLK_DEV_INITRD
104 /* Reserve INITRD */ 106 /* Reserve INITRD */
@@ -107,7 +109,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
107 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 109 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
108 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 110 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
109 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 111 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
110 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 112 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
111 } 113 }
112#endif 114#endif
113 115
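head.c, head32.c and head64.c all make the same conversion: the ad-hoc reserve_early()/reserve_early_overlap_ok() calls become memblock reservations, with memblock_init() run before the first memblock_x86_reserve_range(). The fragment below is a kernel-context sketch of that ordering only, condensed from the hunks above; reserve_boot_ranges() is a made-up name, the initrd guard is simplified, and it is not meant to build outside the kernel tree.

#include <linux/init.h>
#include <linux/memblock.h>
#include <asm/sections.h>
#include <asm/setup.h>

static void __init reserve_boot_ranges(void)
{
	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;

	memblock_init();	/* bring up the memblock allocator first */

	/* kernel text/data/bss, as in i386_start_kernel() above */
	memblock_x86_reserve_range(__pa_symbol(&_text),
				   __pa_symbol(&__bss_stop), "TEXT DATA BSS");

	if (ramdisk_image && ramdisk_size)	/* simplified initrd check */
		memblock_x86_reserve_range(ramdisk_image,
				PAGE_ALIGN(ramdisk_image + ramdisk_size),
				"RAMDISK");
}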
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fa8c1b8e09fb..ce0be7cd085e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -60,18 +60,20 @@
60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) 60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
61#endif 61#endif
62 62
63/* Number of possible pages in the lowmem region */
64LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT)
65
63/* Enough space to fit pagetables for the low memory linear map */ 66/* Enough space to fit pagetables for the low memory linear map */
64MAPPING_BEYOND_END = \ 67MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
65 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
66 68
67/* 69/*
68 * Worst-case size of the kernel mapping we need to make: 70 * Worst-case size of the kernel mapping we need to make:
69 * the worst-case size of the kernel itself, plus the extra we need 71 * a relocatable kernel can live anywhere in lowmem, so we need to be able
70 * to map for the linear map. 72 * to map all of lowmem.
71 */ 73 */
72KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT 74KERNEL_PAGES = LOWMEM_PAGES
73 75
74INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm 76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
75RESERVE_BRK(pagetables, INIT_MAP_SIZE) 77RESERVE_BRK(pagetables, INIT_MAP_SIZE)
76 78
77/* 79/*
@@ -83,6 +85,8 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
83 */ 85 */
84__HEAD 86__HEAD
85ENTRY(startup_32) 87ENTRY(startup_32)
88 movl pa(stack_start),%ecx
89
86 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 90 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
87 us to not reload segments */ 91 us to not reload segments */
88 testb $(1<<6), BP_loadflags(%esi) 92 testb $(1<<6), BP_loadflags(%esi)
@@ -97,7 +101,9 @@ ENTRY(startup_32)
97 movl %eax,%es 101 movl %eax,%es
98 movl %eax,%fs 102 movl %eax,%fs
99 movl %eax,%gs 103 movl %eax,%gs
104 movl %eax,%ss
1002: 1052:
106 leal -__PAGE_OFFSET(%ecx),%esp
101 107
102/* 108/*
103 * Clear BSS first so that there are no surprises... 109 * Clear BSS first so that there are no surprises...
@@ -124,72 +130,35 @@ ENTRY(startup_32)
124 movsl 130 movsl
125 movl pa(boot_params) + NEW_CL_POINTER,%esi 131 movl pa(boot_params) + NEW_CL_POINTER,%esi
126 andl %esi,%esi 132 andl %esi,%esi
127 jz 1f # No comand line 133 jz 1f # No command line
128 movl $pa(boot_command_line),%edi 134 movl $pa(boot_command_line),%edi
129 movl $(COMMAND_LINE_SIZE/4),%ecx 135 movl $(COMMAND_LINE_SIZE/4),%ecx
130 rep 136 rep
131 movsl 137 movsl
1321: 1381:
133 139
134#ifdef CONFIG_OLPC_OPENFIRMWARE 140#ifdef CONFIG_OLPC
135 /* save OFW's pgdir table for later use when calling into OFW */ 141 /* save OFW's pgdir table for later use when calling into OFW */
136 movl %cr3, %eax 142 movl %cr3, %eax
137 movl %eax, pa(olpc_ofw_pgd) 143 movl %eax, pa(olpc_ofw_pgd)
138#endif 144#endif
139 145
140#ifdef CONFIG_PARAVIRT
141 /* This is can only trip for a broken bootloader... */
142 cmpw $0x207, pa(boot_params + BP_version)
143 jb default_entry
144
145 /* Paravirt-compatible boot parameters. Look to see what architecture
146 we're booting under. */
147 movl pa(boot_params + BP_hardware_subarch), %eax
148 cmpl $num_subarch_entries, %eax
149 jae bad_subarch
150
151 movl pa(subarch_entries)(,%eax,4), %eax
152 subl $__PAGE_OFFSET, %eax
153 jmp *%eax
154
155bad_subarch:
156WEAK(lguest_entry)
157WEAK(xen_entry)
158 /* Unknown implementation; there's really
159 nothing we can do at this point. */
160 ud2a
161
162 __INITDATA
163
164subarch_entries:
165 .long default_entry /* normal x86/PC */
166 .long lguest_entry /* lguest hypervisor */
167 .long xen_entry /* Xen hypervisor */
168 .long default_entry /* Moorestown MID */
169num_subarch_entries = (. - subarch_entries) / 4
170.previous
171#endif /* CONFIG_PARAVIRT */
172
173/* 146/*
174 * Initialize page tables. This creates a PDE and a set of page 147 * Initialize page tables. This creates a PDE and a set of page
175 * tables, which are located immediately beyond __brk_base. The variable 148 * tables, which are located immediately beyond __brk_base. The variable
176 * _brk_end is set up to point to the first "safe" location. 149 * _brk_end is set up to point to the first "safe" location.
177 * Mappings are created both at virtual address 0 (identity mapping) 150 * Mappings are created both at virtual address 0 (identity mapping)
178 * and PAGE_OFFSET for up to _end. 151 * and PAGE_OFFSET for up to _end.
179 *
180 * Note that the stack is not yet set up!
181 */ 152 */
182default_entry:
183#ifdef CONFIG_X86_PAE 153#ifdef CONFIG_X86_PAE
184 154
185 /* 155 /*
186 * In PAE mode swapper_pg_dir is statically defined to contain enough 156 * In PAE mode initial_page_table is statically defined to contain
187 * entries to cover the VMSPLIT option (that is the top 1, 2 or 3 157 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
188 * entries). The identity mapping is handled by pointing two PGD 158 * entries). The identity mapping is handled by pointing two PGD entries
189 * entries to the first kernel PMD. 159 * to the first kernel PMD.
190 * 160 *
191 * Note the upper half of each PMD or PTE are always zero at 161 * Note the upper half of each PMD or PTE are always zero at this stage.
192 * this stage.
193 */ 162 */
194 163
195#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ 164#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
@@ -197,7 +166,7 @@ default_entry:
197 xorl %ebx,%ebx /* %ebx is kept at zero */ 166 xorl %ebx,%ebx /* %ebx is kept at zero */
198 167
199 movl $pa(__brk_base), %edi 168 movl $pa(__brk_base), %edi
200 movl $pa(swapper_pg_pmd), %edx 169 movl $pa(initial_pg_pmd), %edx
201 movl $PTE_IDENT_ATTR, %eax 170 movl $PTE_IDENT_ATTR, %eax
20210: 17110:
203 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ 172 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
@@ -226,14 +195,14 @@ default_entry:
226 movl %eax, pa(max_pfn_mapped) 195 movl %eax, pa(max_pfn_mapped)
227 196
228 /* Do early initialization of the fixmap area */ 197 /* Do early initialization of the fixmap area */
229 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax 198 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
230 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 199 movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
231#else /* Not PAE */ 200#else /* Not PAE */
232 201
233page_pde_offset = (__PAGE_OFFSET >> 20); 202page_pde_offset = (__PAGE_OFFSET >> 20);
234 203
235 movl $pa(__brk_base), %edi 204 movl $pa(__brk_base), %edi
236 movl $pa(swapper_pg_dir), %edx 205 movl $pa(initial_page_table), %edx
237 movl $PTE_IDENT_ATTR, %eax 206 movl $PTE_IDENT_ATTR, %eax
23810: 20710:
239 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ 208 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
@@ -257,10 +226,45 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 226 movl %eax, pa(max_pfn_mapped)
258 227
259 /* Do early initialization of the fixmap area */ 228 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax 229 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 230 movl %eax,pa(initial_page_table+0xffc)
262#endif 231#endif
263 jmp 3f 232
233#ifdef CONFIG_PARAVIRT
234 /* This is can only trip for a broken bootloader... */
235 cmpw $0x207, pa(boot_params + BP_version)
236 jb default_entry
237
238 /* Paravirt-compatible boot parameters. Look to see what architecture
239 we're booting under. */
240 movl pa(boot_params + BP_hardware_subarch), %eax
241 cmpl $num_subarch_entries, %eax
242 jae bad_subarch
243
244 movl pa(subarch_entries)(,%eax,4), %eax
245 subl $__PAGE_OFFSET, %eax
246 jmp *%eax
247
248bad_subarch:
249WEAK(lguest_entry)
250WEAK(xen_entry)
251 /* Unknown implementation; there's really
252 nothing we can do at this point. */
253 ud2a
254
255 __INITDATA
256
257subarch_entries:
258 .long default_entry /* normal x86/PC */
259 .long lguest_entry /* lguest hypervisor */
260 .long xen_entry /* Xen hypervisor */
261 .long default_entry /* Moorestown MID */
262num_subarch_entries = (. - subarch_entries) / 4
263.previous
264#else
265 jmp default_entry
266#endif /* CONFIG_PARAVIRT */
267
264/* 268/*
265 * Non-boot CPU entry point; entered from trampoline.S 269 * Non-boot CPU entry point; entered from trampoline.S
266 * We can't lgdt here, because lgdt itself uses a data segment, but 270 * We can't lgdt here, because lgdt itself uses a data segment, but
@@ -280,8 +284,11 @@ ENTRY(startup_32_smp)
280 movl %eax,%es 284 movl %eax,%es
281 movl %eax,%fs 285 movl %eax,%fs
282 movl %eax,%gs 286 movl %eax,%gs
287 movl pa(stack_start),%ecx
288 movl %eax,%ss
289 leal -__PAGE_OFFSET(%ecx),%esp
283#endif /* CONFIG_SMP */ 290#endif /* CONFIG_SMP */
2843: 291default_entry:
285 292
286/* 293/*
287 * New page tables may be in 4Mbyte page mode and may 294 * New page tables may be in 4Mbyte page mode and may
@@ -315,6 +322,10 @@ ENTRY(startup_32_smp)
315 subl $0x80000001, %eax 322 subl $0x80000001, %eax
316 cmpl $(0x8000ffff-0x80000001), %eax 323 cmpl $(0x8000ffff-0x80000001), %eax
317 ja 6f 324 ja 6f
325
326 /* Clear bogus XD_DISABLE bits */
327 call verify_cpu
328
318 mov $0x80000001, %eax 329 mov $0x80000001, %eax
319 cpuid 330 cpuid
320 /* Execute Disable bit supported? */ 331 /* Execute Disable bit supported? */
@@ -334,15 +345,15 @@ ENTRY(startup_32_smp)
334/* 345/*
335 * Enable paging 346 * Enable paging
336 */ 347 */
337 movl pa(initial_page_table), %eax 348 movl $pa(initial_page_table), %eax
338 movl %eax,%cr3 /* set the page table pointer.. */ 349 movl %eax,%cr3 /* set the page table pointer.. */
339 movl %cr0,%eax 350 movl %cr0,%eax
340 orl $X86_CR0_PG,%eax 351 orl $X86_CR0_PG,%eax
341 movl %eax,%cr0 /* ..and set paging (PG) bit */ 352 movl %eax,%cr0 /* ..and set paging (PG) bit */
342 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 353 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
3431: 3541:
344 /* Set up the stack pointer */ 355 /* Shift the stack pointer to a virtual address */
345 lss stack_start,%esp 356 addl $__PAGE_OFFSET, %esp
346 357
347/* 358/*
348 * Initialize eflags. Some BIOS's leave bits like NT set. This would 359 * Initialize eflags. Some BIOS's leave bits like NT set. This would
@@ -354,9 +365,7 @@ ENTRY(startup_32_smp)
354 365
355#ifdef CONFIG_SMP 366#ifdef CONFIG_SMP
356 cmpb $0, ready 367 cmpb $0, ready
357 jz 1f /* Initial CPU cleans BSS */ 368 jnz checkCPUtype
358 jmp checkCPUtype
3591:
360#endif /* CONFIG_SMP */ 369#endif /* CONFIG_SMP */
361 370
362/* 371/*
@@ -464,14 +473,7 @@ is386: movl $2,%ecx # set MP
464 473
465 cld # gcc2 wants the direction flag cleared at all times 474 cld # gcc2 wants the direction flag cleared at all times
466 pushl $0 # fake return address for unwinder 475 pushl $0 # fake return address for unwinder
467#ifdef CONFIG_SMP
468 movb ready, %cl
469 movb $1, ready 476 movb $1, ready
470 cmpb $0,%cl # the first CPU calls start_kernel
471 je 1f
472 movl (stack_start), %esp
4731:
474#endif /* CONFIG_SMP */
475 jmp *(initial_code) 477 jmp *(initial_code)
476 478
477/* 479/*
@@ -610,33 +612,31 @@ ignore_int:
610#endif 612#endif
611 iret 613 iret
612 614
615#include "verify_cpu.S"
616
613 __REFDATA 617 __REFDATA
614.align 4 618.align 4
615ENTRY(initial_code) 619ENTRY(initial_code)
616 .long i386_start_kernel 620 .long i386_start_kernel
617ENTRY(initial_page_table)
618 .long pa(swapper_pg_dir)
619 621
620/* 622/*
621 * BSS section 623 * BSS section
622 */ 624 */
623__PAGE_ALIGNED_BSS 625__PAGE_ALIGNED_BSS
624 .align PAGE_SIZE_asm 626 .align PAGE_SIZE
625#ifdef CONFIG_X86_PAE 627#ifdef CONFIG_X86_PAE
626swapper_pg_pmd: 628initial_pg_pmd:
627 .fill 1024*KPMDS,4,0 629 .fill 1024*KPMDS,4,0
628#else 630#else
629ENTRY(swapper_pg_dir) 631ENTRY(initial_page_table)
630 .fill 1024,4,0 632 .fill 1024,4,0
631#endif 633#endif
632swapper_pg_fixmap: 634initial_pg_fixmap:
633 .fill 1024,4,0
634#ifdef CONFIG_X86_TRAMPOLINE
635ENTRY(trampoline_pg_dir)
636 .fill 1024,4,0 635 .fill 1024,4,0
637#endif
638ENTRY(empty_zero_page) 636ENTRY(empty_zero_page)
639 .fill 4096,1,0 637 .fill 4096,1,0
638ENTRY(swapper_pg_dir)
639 .fill 1024,4,0
640 640
641/* 641/*
642 * This starts the data section. 642 * This starts the data section.
@@ -644,37 +644,37 @@ ENTRY(empty_zero_page)
644#ifdef CONFIG_X86_PAE 644#ifdef CONFIG_X86_PAE
645__PAGE_ALIGNED_DATA 645__PAGE_ALIGNED_DATA
646 /* Page-aligned for the benefit of paravirt? */ 646 /* Page-aligned for the benefit of paravirt? */
647 .align PAGE_SIZE_asm 647 .align PAGE_SIZE
648ENTRY(swapper_pg_dir) 648ENTRY(initial_page_table)
649 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ 649 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
650# if KPMDS == 3 650# if KPMDS == 3
651 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 651 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
652 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 652 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
653 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 653 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
654# elif KPMDS == 2 654# elif KPMDS == 2
655 .long 0,0 655 .long 0,0
656 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 656 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
657 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 657 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
658# elif KPMDS == 1 658# elif KPMDS == 1
659 .long 0,0 659 .long 0,0
660 .long 0,0 660 .long 0,0
661 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 661 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
662# else 662# else
663# error "Kernel PMDs should be 1, 2 or 3" 663# error "Kernel PMDs should be 1, 2 or 3"
664# endif 664# endif
665 .align PAGE_SIZE_asm /* needs to be page-sized too */ 665 .align PAGE_SIZE /* needs to be page-sized too */
666#endif 666#endif
667 667
668.data 668.data
669.balign 4
669ENTRY(stack_start) 670ENTRY(stack_start)
670 .long init_thread_union+THREAD_SIZE 671 .long init_thread_union+THREAD_SIZE
671 .long __BOOT_DS
672
673ready: .byte 0
674 672
675early_recursion_flag: 673early_recursion_flag:
676 .long 0 674 .long 0
677 675
676ready: .byte 0
677
678int_msg: 678int_msg:
679 .asciz "Unknown interrupt or fault at: %p %p %p\n" 679 .asciz "Unknown interrupt or fault at: %p %p %p\n"
680 680
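Beyond the swapper_pg_dir/initial_page_table split and the earlier stack setup, head_32.S now sizes its brk reservation from LOWMEM_PAGES, that is from the whole lowmem linear map rather than KERNEL_IMAGE_SIZE plus slack, because a relocatable kernel may sit anywhere in lowmem. A quick worked example for the usual non-PAE 3G/1G split (__PAGE_OFFSET = 0xC0000000) follows; the constants are spelled out locally instead of coming from the real page table headers.

#include <stdio.h>

#define PAGE_OFFSET   0xC0000000UL
#define PAGE_SHIFT    12
#define PAGE_SIZE     (1UL << PAGE_SHIFT)
#define PTRS_PER_PGD  1024	/* non-PAE */

#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)

int main(void)
{
	unsigned long lowmem_pages = ((1ULL << 32) - PAGE_OFFSET) >> PAGE_SHIFT;
	unsigned long init_map     = PAGE_TABLE_SIZE(lowmem_pages) * PAGE_SIZE;

	/* 262144 pages of lowmem -> 256 page tables -> 1 MiB of brk */
	printf("LOWMEM_PAGES  = %lu\n", lowmem_pages);
	printf("INIT_MAP_SIZE = %lu bytes\n", init_map);
	return 0;
}

Under PAE the PAGE_TABLE_SIZE() macro differs, so the reserved size changes, but the shape of the calculation is the same.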
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 239046bd447f..e11e39478a49 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -136,10 +136,9 @@ ident_complete:
136 /* Fixup phys_base */ 136 /* Fixup phys_base */
137 addq %rbp, phys_base(%rip) 137 addq %rbp, phys_base(%rip)
138 138
139#ifdef CONFIG_X86_TRAMPOLINE 139 /* Fixup trampoline */
140 addq %rbp, trampoline_level4_pgt + 0(%rip) 140 addq %rbp, trampoline_level4_pgt + 0(%rip)
141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip) 141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
142#endif
143 142
144 /* Due to ENTRY(), sometimes the empty space gets filled with 143 /* Due to ENTRY(), sometimes the empty space gets filled with
145 * zeros. Better take a jmp than relying on empty space being 144 * zeros. Better take a jmp than relying on empty space being
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 7494999141b3..6781765b3a0d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -27,6 +27,9 @@
27#define HPET_DEV_FSB_CAP 0x1000 27#define HPET_DEV_FSB_CAP 0x1000
28#define HPET_DEV_PERI_CAP 0x2000 28#define HPET_DEV_PERI_CAP 0x2000
29 29
30#define HPET_MIN_CYCLES 128
31#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
32
30#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) 33#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
31 34
32/* 35/*
@@ -214,7 +217,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
214/* 217/*
215 * Common hpet info 218 * Common hpet info
216 */ 219 */
217static unsigned long hpet_period; 220static unsigned long hpet_freq;
218 221
219static void hpet_legacy_set_mode(enum clock_event_mode mode, 222static void hpet_legacy_set_mode(enum clock_event_mode mode,
220 struct clock_event_device *evt); 223 struct clock_event_device *evt);
@@ -229,7 +232,6 @@ static struct clock_event_device hpet_clockevent = {
229 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 232 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
230 .set_mode = hpet_legacy_set_mode, 233 .set_mode = hpet_legacy_set_mode,
231 .set_next_event = hpet_legacy_next_event, 234 .set_next_event = hpet_legacy_next_event,
232 .shift = 32,
233 .irq = 0, 235 .irq = 0,
234 .rating = 50, 236 .rating = 50,
235}; 237};
@@ -287,27 +289,12 @@ static void hpet_legacy_clockevent_register(void)
287 hpet_enable_legacy_int(); 289 hpet_enable_legacy_int();
288 290
289 /* 291 /*
290 * The mult factor is defined as (include/linux/clockchips.h)
291 * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
292 * hpet_period is in units of femtoseconds (per cycle), so
293 * mult/2^shift = cyc/ns = 10^6/hpet_period
294 * mult = (10^6 * 2^shift)/hpet_period
295 * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
296 */
297 hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
298 hpet_period, hpet_clockevent.shift);
299 /* Calculate the min / max delta */
300 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
301 &hpet_clockevent);
302 /* 5 usec minimum reprogramming delta. */
303 hpet_clockevent.min_delta_ns = 5000;
304
305 /*
306 * Start hpet with the boot cpu mask and make it 292 * Start hpet with the boot cpu mask and make it
307 * global after the IO_APIC has been initialized. 293 * global after the IO_APIC has been initialized.
308 */ 294 */
309 hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); 295 hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
310 clockevents_register_device(&hpet_clockevent); 296 clockevents_config_and_register(&hpet_clockevent, hpet_freq,
297 HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
311 global_clock_event = &hpet_clockevent; 298 global_clock_event = &hpet_clockevent;
312 printk(KERN_DEBUG "hpet clockevent registered\n"); 299 printk(KERN_DEBUG "hpet clockevent registered\n");
313} 300}
@@ -380,44 +367,37 @@ static int hpet_next_event(unsigned long delta,
380 struct clock_event_device *evt, int timer) 367 struct clock_event_device *evt, int timer)
381{ 368{
382 u32 cnt; 369 u32 cnt;
370 s32 res;
383 371
384 cnt = hpet_readl(HPET_COUNTER); 372 cnt = hpet_readl(HPET_COUNTER);
385 cnt += (u32) delta; 373 cnt += (u32) delta;
386 hpet_writel(cnt, HPET_Tn_CMP(timer)); 374 hpet_writel(cnt, HPET_Tn_CMP(timer));
387 375
388 /* 376 /*
389 * We need to read back the CMP register on certain HPET 377 * HPETs are a complete disaster. The compare register is
390 * implementations (ATI chipsets) which seem to delay the 378 * based on a equal comparison and neither provides a less
391 * transfer of the compare register into the internal compare 379 * than or equal functionality (which would require to take
392 * logic. With small deltas this might actually be too late as 380 * the wraparound into account) nor a simple count down event
393 * the counter could already be higher than the compare value 381 * mode. Further the write to the comparator register is
394 * at that point and we would wait for the next hpet interrupt 382 * delayed internally up to two HPET clock cycles in certain
395 * forever. We found out that reading the CMP register back 383 * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even
396 * forces the transfer so we can rely on the comparison with 384 * longer delays. We worked around that by reading back the
397 * the counter register below. If the read back from the 385 * compare register, but that required another workaround for
398 * compare register does not match the value we programmed 386 * ICH9,10 chips where the first readout after write can
399 * then we might have a real hardware problem. We can not do 387 * return the old stale value. We already had a minimum
400 * much about it here, but at least alert the user/admin with 388 * programming delta of 5us enforced, but a NMI or SMI hitting
401 * a prominent warning. 389 * between the counter readout and the comparator write can
402 * 390 * move us behind that point easily. Now instead of reading
403 * An erratum on some chipsets (ICH9,..), results in 391 * the compare register back several times, we make the ETIME
404 * comparator read immediately following a write returning old 392 * decision based on the following: Return ETIME if the
405 * value. Workaround for this is to read this value second 393 * counter value after the write is less than HPET_MIN_CYCLES
406 * time, when first read returns old value. 394 * away from the event or if the counter is already ahead of
407 * 395 * the event. The minimum programming delta for the generic
408 * In fact the write to the comparator register is delayed up 396 * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
409 * to two HPET cycles so the workaround we tried to restrict
410 * the readback to those known to be borked ATI chipsets
411 * failed miserably. So we give up on optimizations forever
412 * and penalize all HPET incarnations unconditionally.
413 */ 397 */
414 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { 398 res = (s32)(cnt - hpet_readl(HPET_COUNTER));
415 if (hpet_readl(HPET_Tn_CMP(timer)) != cnt)
416 printk_once(KERN_WARNING
417 "hpet: compare register read back failed.\n");
418 }
419 399
420 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 400 return res < HPET_MIN_CYCLES ? -ETIME : 0;
421} 401}
422 402
423static void hpet_legacy_set_mode(enum clock_event_mode mode, 403static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -440,9 +420,9 @@ static int hpet_legacy_next_event(unsigned long delta,
440static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); 420static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
441static struct hpet_dev *hpet_devs; 421static struct hpet_dev *hpet_devs;
442 422
443void hpet_msi_unmask(unsigned int irq) 423void hpet_msi_unmask(struct irq_data *data)
444{ 424{
445 struct hpet_dev *hdev = get_irq_data(irq); 425 struct hpet_dev *hdev = data->handler_data;
446 unsigned int cfg; 426 unsigned int cfg;
447 427
448 /* unmask it */ 428 /* unmask it */
@@ -451,10 +431,10 @@ void hpet_msi_unmask(unsigned int irq)
451 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 431 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
452} 432}
453 433
454void hpet_msi_mask(unsigned int irq) 434void hpet_msi_mask(struct irq_data *data)
455{ 435{
436 struct hpet_dev *hdev = data->handler_data;
456 unsigned int cfg; 437 unsigned int cfg;
457 struct hpet_dev *hdev = get_irq_data(irq);
458 438
459 /* mask it */ 439 /* mask it */
460 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 440 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -462,18 +442,14 @@ void hpet_msi_mask(unsigned int irq)
462 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 442 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
463} 443}
464 444
465void hpet_msi_write(unsigned int irq, struct msi_msg *msg) 445void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg)
466{ 446{
467 struct hpet_dev *hdev = get_irq_data(irq);
468
469 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); 447 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
470 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); 448 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
471} 449}
472 450
473void hpet_msi_read(unsigned int irq, struct msi_msg *msg) 451void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
474{ 452{
475 struct hpet_dev *hdev = get_irq_data(irq);
476
477 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); 453 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
478 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); 454 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
479 msg->address_hi = 0; 455 msg->address_hi = 0;
@@ -510,7 +486,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
510 if (!irq) 486 if (!irq)
511 return -EINVAL; 487 return -EINVAL;
512 488
513 set_irq_data(irq, dev); 489 irq_set_handler_data(irq, dev);
514 490
515 if (hpet_setup_msi_irq(irq)) 491 if (hpet_setup_msi_irq(irq))
516 return -EINVAL; 492 return -EINVAL;
@@ -556,7 +532,6 @@ static int hpet_setup_irq(struct hpet_dev *dev)
556static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) 532static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
557{ 533{
558 struct clock_event_device *evt = &hdev->evt; 534 struct clock_event_device *evt = &hdev->evt;
559 uint64_t hpet_freq;
560 535
561 WARN_ON(cpu != smp_processor_id()); 536 WARN_ON(cpu != smp_processor_id());
562 if (!(hdev->flags & HPET_DEV_VALID)) 537 if (!(hdev->flags & HPET_DEV_VALID))
@@ -578,24 +553,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
578 553
579 evt->set_mode = hpet_msi_set_mode; 554 evt->set_mode = hpet_msi_set_mode;
580 evt->set_next_event = hpet_msi_next_event; 555 evt->set_next_event = hpet_msi_next_event;
581 evt->shift = 32;
582
583 /*
584 * The period is a femto seconds value. We need to calculate the
585 * scaled math multiplication factor for nanosecond to hpet tick
586 * conversion.
587 */
588 hpet_freq = FSEC_PER_SEC;
589 do_div(hpet_freq, hpet_period);
590 evt->mult = div_sc((unsigned long) hpet_freq,
591 NSEC_PER_SEC, evt->shift);
592 /* Calculate the max delta */
593 evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
594 /* 5 usec minimum reprogramming delta. */
595 evt->min_delta_ns = 5000;
596
597 evt->cpumask = cpumask_of(hdev->cpu); 556 evt->cpumask = cpumask_of(hdev->cpu);
598 clockevents_register_device(evt); 557
558 clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
559 0x7FFFFFFF);
599} 560}
600 561
601#ifdef CONFIG_HPET 562#ifdef CONFIG_HPET
@@ -726,7 +687,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
726 687
727 switch (action & 0xf) { 688 switch (action & 0xf) {
728 case CPU_ONLINE: 689 case CPU_ONLINE:
729 INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); 690 INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
730 init_completion(&work.complete); 691 init_completion(&work.complete);
731 /* FIXME: add schedule_work_on() */ 692 /* FIXME: add schedule_work_on() */
732 schedule_delayed_work_on(cpu, &work.work, 0); 693 schedule_delayed_work_on(cpu, &work.work, 0);
@@ -799,7 +760,6 @@ static struct clocksource clocksource_hpet = {
799static int hpet_clocksource_register(void) 760static int hpet_clocksource_register(void)
800{ 761{
801 u64 start, now; 762 u64 start, now;
802 u64 hpet_freq;
803 cycle_t t1; 763 cycle_t t1;
804 764
805 /* Start the counter */ 765 /* Start the counter */
@@ -826,24 +786,7 @@ static int hpet_clocksource_register(void)
826 return -ENODEV; 786 return -ENODEV;
827 } 787 }
828 788
829 /*
830 * The definition of mult is (include/linux/clocksource.h)
831 * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
832 * so we first need to convert hpet_period to ns/cyc units:
833 * mult/2^shift = ns/cyc = hpet_period/10^6
834 * mult = (hpet_period * 2^shift)/10^6
835 * mult = (hpet_period << shift)/FSEC_PER_NSEC
836 */
837
838 /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
839 *
840 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
841 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
842 */
843 hpet_freq = FSEC_PER_SEC;
844 do_div(hpet_freq, hpet_period);
845 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); 789 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
846
847 return 0; 790 return 0;
848} 791}
849 792
@@ -852,7 +795,9 @@ static int hpet_clocksource_register(void)
852 */ 795 */
853int __init hpet_enable(void) 796int __init hpet_enable(void)
854{ 797{
798 unsigned long hpet_period;
855 unsigned int id; 799 unsigned int id;
800 u64 freq;
856 int i; 801 int i;
857 802
858 if (!is_hpet_capable()) 803 if (!is_hpet_capable())
@@ -891,6 +836,14 @@ int __init hpet_enable(void)
891 goto out_nohpet; 836 goto out_nohpet;
892 837
893 /* 838 /*
839 * The period is a femto seconds value. Convert it to a
840 * frequency.
841 */
842 freq = FSEC_PER_SEC;
843 do_div(freq, hpet_period);
844 hpet_freq = freq;
845
846 /*
894 * Read the HPET ID register to retrieve the IRQ routing 847 * Read the HPET ID register to retrieve the IRQ routing
895 * information and the number of channels 848 * information and the number of channels
896 */ 849 */
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index ff15c9dcc25d..02f07634d265 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -122,7 +122,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
122 return -EBUSY; 122 return -EBUSY;
123 123
124 set_debugreg(info->address, i); 124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address; 125 __this_cpu_write(cpu_debugreg[i], info->address);
126 126
127 dr7 = &__get_cpu_var(cpu_dr7); 127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type); 128 *dr7 |= encode_dr7(i, info->len, info->type);
@@ -397,12 +397,12 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
397 397
398void hw_breakpoint_restore(void) 398void hw_breakpoint_restore(void)
399{ 399{
400 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); 400 set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
401 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); 401 set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
402 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); 402 set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
403 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); 403 set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
404 set_debugreg(current->thread.debugreg6, 6); 404 set_debugreg(current->thread.debugreg6, 6);
405 set_debugreg(__get_cpu_var(cpu_dr7), 7); 405 set_debugreg(__this_cpu_read(cpu_dr7), 7);
406} 406}
407EXPORT_SYMBOL_GPL(hw_breakpoint_restore); 407EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
408 408
@@ -433,6 +433,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
433 dr6_p = (unsigned long *)ERR_PTR(args->err); 433 dr6_p = (unsigned long *)ERR_PTR(args->err);
434 dr6 = *dr6_p; 434 dr6 = *dr6_p;
435 435
436 /* If it's a single step, TRAP bits are random */
437 if (dr6 & DR_STEP)
438 return NOTIFY_DONE;
439
436 /* Do an early return if no trap bits are set in DR6 */ 440 /* Do an early return if no trap bits are set in DR6 */
437 if ((dr6 & DR_TRAP_BITS) == 0) 441 if ((dr6 & DR_TRAP_BITS) == 0)
438 return NOTIFY_DONE; 442 return NOTIFY_DONE;
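Both ftrace.c and hw_breakpoint.c convert __get_cpu_var() accesses into __this_cpu_read()/__this_cpu_write(). The kernel-context sketch below shows the shape of that conversion with a made-up per-cpu variable and helper; the accessors are the real interfaces, and on x86 they typically compile to a single %gs-relative memory access instead of first computing the per-cpu address the way __get_cpu_var() does.

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_saved_flag);	/* illustrative variable */

static void example_enter(int flag)
{
	/* old style:  __get_cpu_var(example_saved_flag) = flag; */
	__this_cpu_write(example_saved_flag, flag);

	/* old style:  if (!__get_cpu_var(example_saved_flag)) return; */
	if (!__this_cpu_read(example_saved_flag))
		return;

	/* ... per-cpu work, as in ftrace_nmi_enter() above ... */
}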
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a46cb3522c0c..12aff2537682 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void)
68 */ 68 */
69 69
70 if (!HAVE_HWFP) { 70 if (!HAVE_HWFP) {
71 /*
72 * Disable xsave as we do not support it if i387
73 * emulation is enabled.
74 */
75 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
76 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
71 xstate_size = sizeof(struct i387_soft_struct); 77 xstate_size = sizeof(struct i387_soft_struct);
72 return; 78 return;
73 } 79 }
74 80
75 if (cpu_has_fxsr) 81 if (cpu_has_fxsr)
76 xstate_size = sizeof(struct i387_fxsave_struct); 82 xstate_size = sizeof(struct i387_fxsave_struct);
77#ifdef CONFIG_X86_32
78 else 83 else
79 xstate_size = sizeof(struct i387_fsave_struct); 84 xstate_size = sizeof(struct i387_fsave_struct);
80#endif
81} 85}
82 86
83#ifdef CONFIG_X86_64
84/* 87/*
85 * Called at bootup to set up the initial FPU state that is later cloned 88 * Called at bootup to set up the initial FPU state that is later cloned
86 * into all processes. 89 * into all processes.
@@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void)
88 91
89void __cpuinit fpu_init(void) 92void __cpuinit fpu_init(void)
90{ 93{
91 unsigned long oldcr0 = read_cr0(); 94 unsigned long cr0;
92 95 unsigned long cr4_mask = 0;
93 set_in_cr4(X86_CR4_OSFXSR);
94 set_in_cr4(X86_CR4_OSXMMEXCPT);
95 96
96 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 97 if (cpu_has_fxsr)
98 cr4_mask |= X86_CR4_OSFXSR;
99 if (cpu_has_xmm)
100 cr4_mask |= X86_CR4_OSXMMEXCPT;
101 if (cr4_mask)
102 set_in_cr4(cr4_mask);
103
104 cr0 = read_cr0();
105 cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
106 if (!HAVE_HWFP)
107 cr0 |= X86_CR0_EM;
108 write_cr0(cr0);
97 109
98 if (!smp_processor_id()) 110 if (!smp_processor_id())
99 init_thread_xstate(); 111 init_thread_xstate();
@@ -104,24 +116,12 @@ void __cpuinit fpu_init(void)
104 clear_used_math(); 116 clear_used_math();
105} 117}
106 118
107#else /* CONFIG_X86_64 */
108
109void __cpuinit fpu_init(void)
110{
111 if (!smp_processor_id())
112 init_thread_xstate();
113}
114
115#endif /* CONFIG_X86_32 */
116
117void fpu_finit(struct fpu *fpu) 119void fpu_finit(struct fpu *fpu)
118{ 120{
119#ifdef CONFIG_X86_32
120 if (!HAVE_HWFP) { 121 if (!HAVE_HWFP) {
121 finit_soft_fpu(&fpu->state->soft); 122 finit_soft_fpu(&fpu->state->soft);
122 return; 123 return;
123 } 124 }
124#endif
125 125
126 if (cpu_has_fxsr) { 126 if (cpu_has_fxsr) {
127 struct i387_fxsave_struct *fx = &fpu->state->fxsave; 127 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
@@ -145,7 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_finit);
145 * The _current_ task is using the FPU for the first time 145 * The _current_ task is using the FPU for the first time
146 * so initialize it and set the mxcsr to its default 146 * so initialize it and set the mxcsr to its default
147 * value at reset if we support XMM instructions and then 147 * value at reset if we support XMM instructions and then
148 * remeber the current task has used the FPU. 148 * remember the current task has used the FPU.
149 */ 149 */
150int init_fpu(struct task_struct *tsk) 150int init_fpu(struct task_struct *tsk)
151{ 151{
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
169 set_stopped_child_used_math(tsk); 169 set_stopped_child_used_math(tsk);
170 return 0; 170 return 0;
171} 171}
172EXPORT_SYMBOL_GPL(init_fpu);
172 173
173/* 174/*
174 * The xstateregs_active() routine is the same as the fpregs_active() routine, 175 * The xstateregs_active() routine is the same as the fpregs_active() routine,
@@ -386,19 +387,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
386#ifdef CONFIG_X86_64 387#ifdef CONFIG_X86_64
387 env->fip = fxsave->rip; 388 env->fip = fxsave->rip;
388 env->foo = fxsave->rdp; 389 env->foo = fxsave->rdp;
390 /*
391 * should be actually ds/cs at fpu exception time, but
392 * that information is not available in 64bit mode.
393 */
394 env->fcs = task_pt_regs(tsk)->cs;
389 if (tsk == current) { 395 if (tsk == current) {
390 /* 396 savesegment(ds, env->fos);
391 * should be actually ds/cs at fpu exception time, but
392 * that information is not available in 64bit mode.
393 */
394 asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
395 asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
396 } else { 397 } else {
397 struct pt_regs *regs = task_pt_regs(tsk); 398 env->fos = tsk->thread.ds;
398
399 env->fos = 0xffff0000 | tsk->thread.ds;
400 env->fcs = regs->cs;
401 } 399 }
400 env->fos |= 0xffff0000;
402#else 401#else
403 env->fip = fxsave->fip; 402 env->fip = fxsave->fip;
404 env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); 403 env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
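
Note: the reworked fpu_init() above collects the CR4 feature bits into a mask and applies them in one shot, and forces CR0.EM when there is no hardware FPU. A toy user-space sketch of the "collect bits, apply once" part follows; the *_supported flags are pretend CPUID results, though the CR4 bit positions shown are the real ones.

/* Illustrative only: conditional mask building as in the new fpu_init(). */
#include <stdio.h>

#define X86_CR4_OSFXSR     (1u << 9)    /* enable FXSAVE/FXRSTOR */
#define X86_CR4_OSXMMEXCPT (1u << 10)   /* enable unmasked SSE exceptions */

int main(void)
{
	int fxsr_supported = 1, xmm_supported = 1;   /* pretend CPUID results */
	unsigned int cr4_mask = 0;

	if (fxsr_supported)
		cr4_mask |= X86_CR4_OSFXSR;
	if (xmm_supported)
		cr4_mask |= X86_CR4_OSXMMEXCPT;

	if (cr4_mask)
		printf("would set CR4 bits: %#x\n", cr4_mask);
	return 0;
}
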
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index b42ca694dc68..8eeaa81de066 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -10,7 +10,7 @@
10 */ 10 */
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/sysdev.h> 13#include <linux/syscore_ops.h>
14 14
15#include <asm/dma.h> 15#include <asm/dma.h>
16 16
@@ -21,7 +21,7 @@
21 * in asm/dma.h. 21 * in asm/dma.h.
22 */ 22 */
23 23
24static int i8237A_resume(struct sys_device *dev) 24static void i8237A_resume(void)
25{ 25{
26 unsigned long flags; 26 unsigned long flags;
27 int i; 27 int i;
@@ -41,31 +41,15 @@ static int i8237A_resume(struct sys_device *dev)
41 enable_dma(4); 41 enable_dma(4);
42 42
43 release_dma_lock(flags); 43 release_dma_lock(flags);
44
45 return 0;
46} 44}
47 45
48static int i8237A_suspend(struct sys_device *dev, pm_message_t state) 46static struct syscore_ops i8237_syscore_ops = {
49{
50 return 0;
51}
52
53static struct sysdev_class i8237_sysdev_class = {
54 .name = "i8237",
55 .suspend = i8237A_suspend,
56 .resume = i8237A_resume, 47 .resume = i8237A_resume,
57}; 48};
58 49
59static struct sys_device device_i8237A = { 50static int __init i8237A_init_ops(void)
60 .id = 0,
61 .cls = &i8237_sysdev_class,
62};
63
64static int __init i8237A_init_sysfs(void)
65{ 51{
66 int error = sysdev_class_register(&i8237_sysdev_class); 52 register_syscore_ops(&i8237_syscore_ops);
67 if (!error) 53 return 0;
68 error = sysdev_register(&device_i8237A);
69 return error;
70} 54}
71device_initcall(i8237A_init_sysfs); 55device_initcall(i8237A_init_ops);
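
Note: the i8237 patch above is a straight sysdev-to-syscore conversion. The pattern it adopts is a single global ops structure registered once at boot, sketched below with invented "demo" names; this is the shape of the API, not the i8237 driver itself.

/* Minimal sketch of the sysdev -> syscore_ops conversion pattern. */
#include <linux/init.h>
#include <linux/syscore_ops.h>

static int demo_suspend(void)
{
	/* save device state; return 0 on success */
	return 0;
}

static void demo_resume(void)
{
	/* reprogram the device after resume */
}

static struct syscore_ops demo_syscore_ops = {
	.suspend = demo_suspend,
	.resume  = demo_resume,
};

static int __init demo_init(void)
{
	register_syscore_ops(&demo_syscore_ops);
	return 0;
}
device_initcall(demo_init);
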
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2dfd31597443..fb66dc9e36cb 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -93,7 +93,6 @@ static struct clock_event_device pit_ce = {
93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
94 .set_mode = init_pit_timer, 94 .set_mode = init_pit_timer,
95 .set_next_event = pit_next_event, 95 .set_next_event = pit_next_event,
96 .shift = 32,
97 .irq = 0, 96 .irq = 0,
98}; 97};
99 98
@@ -108,90 +107,12 @@ void __init setup_pit_timer(void)
108 * IO_APIC has been initialized. 107 * IO_APIC has been initialized.
109 */ 108 */
110 pit_ce.cpumask = cpumask_of(smp_processor_id()); 109 pit_ce.cpumask = cpumask_of(smp_processor_id());
111 pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
112 pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
113 pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
114 110
115 clockevents_register_device(&pit_ce); 111 clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
116 global_clock_event = &pit_ce; 112 global_clock_event = &pit_ce;
117} 113}
118 114
119#ifndef CONFIG_X86_64 115#ifndef CONFIG_X86_64
120/*
121 * Since the PIT overflows every tick, its not very useful
122 * to just read by itself. So use jiffies to emulate a free
123 * running counter:
124 */
125static cycle_t pit_read(struct clocksource *cs)
126{
127 static int old_count;
128 static u32 old_jifs;
129 unsigned long flags;
130 int count;
131 u32 jifs;
132
133 raw_spin_lock_irqsave(&i8253_lock, flags);
134 /*
135 * Although our caller may have the read side of xtime_lock,
136 * this is now a seqlock, and we are cheating in this routine
137 * by having side effects on state that we cannot undo if
138 * there is a collision on the seqlock and our caller has to
139 * retry. (Namely, old_jifs and old_count.) So we must treat
140 * jiffies as volatile despite the lock. We read jiffies
141 * before latching the timer count to guarantee that although
142 * the jiffies value might be older than the count (that is,
143 * the counter may underflow between the last point where
144 * jiffies was incremented and the point where we latch the
145 * count), it cannot be newer.
146 */
147 jifs = jiffies;
148 outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
149 count = inb_pit(PIT_CH0); /* read the latched count */
150 count |= inb_pit(PIT_CH0) << 8;
151
152 /* VIA686a test code... reset the latch if count > max + 1 */
153 if (count > LATCH) {
154 outb_pit(0x34, PIT_MODE);
155 outb_pit(LATCH & 0xff, PIT_CH0);
156 outb_pit(LATCH >> 8, PIT_CH0);
157 count = LATCH - 1;
158 }
159
160 /*
161 * It's possible for count to appear to go the wrong way for a
162 * couple of reasons:
163 *
164 * 1. The timer counter underflows, but we haven't handled the
165 * resulting interrupt and incremented jiffies yet.
166 * 2. Hardware problem with the timer, not giving us continuous time,
167 * the counter does small "jumps" upwards on some Pentium systems,
168 * (see c't 95/10 page 335 for Neptun bug.)
169 *
170 * Previous attempts to handle these cases intelligently were
171 * buggy, so we just do the simple thing now.
172 */
173 if (count > old_count && jifs == old_jifs)
174 count = old_count;
175
176 old_count = count;
177 old_jifs = jifs;
178
179 raw_spin_unlock_irqrestore(&i8253_lock, flags);
180
181 count = (LATCH - 1) - count;
182
183 return (cycle_t)(jifs * LATCH) + count;
184}
185
186static struct clocksource pit_cs = {
187 .name = "pit",
188 .rating = 110,
189 .read = pit_read,
190 .mask = CLOCKSOURCE_MASK(32),
191 .mult = 0,
192 .shift = 20,
193};
194
195static int __init init_pit_clocksource(void) 116static int __init init_pit_clocksource(void)
196{ 117{
197 /* 118 /*
@@ -205,10 +126,7 @@ static int __init init_pit_clocksource(void)
205 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) 126 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
206 return 0; 127 return 0;
207 128
208 pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); 129 return clocksource_i8253_init();
209
210 return clocksource_register(&pit_cs);
211} 130}
212arch_initcall(init_pit_clocksource); 131arch_initcall(init_pit_clocksource);
213
214#endif /* !CONFIG_X86_64 */ 132#endif /* !CONFIG_X86_64 */
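
Note: the i8253 changes above drop the hand-computed mult/shift and delta limits in favour of clockevents_config_and_register(), which derives them from the raw frequency and counter range. A hedged sketch of that registration pattern follows; "demo_ce", the 1 MHz rate and the 16-bit counter limits are invented.

/* Sketch of clockevents_config_and_register() usage, not a real driver. */
#include <linux/clockchips.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

static int demo_next_event(unsigned long delta, struct clock_event_device *ce)
{
	/* program the hardware to fire after "delta" counter ticks */
	return 0;
}

static struct clock_event_device demo_ce = {
	.name           = "demo",
	.features       = CLOCK_EVT_FEAT_ONESHOT,
	.set_next_event = demo_next_event,
};

static void demo_timer_setup(void)
{
	demo_ce.cpumask = cpumask_of(smp_processor_id());
	/* freq = 1 MHz, programmable range 2..0xffff counter ticks */
	clockevents_config_and_register(&demo_ce, 1000000, 2, 0xffff);
}
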
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index cafa7c80ac95..65b8f5c2eebf 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -8,7 +8,7 @@
8#include <linux/random.h> 8#include <linux/random.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <linux/sysdev.h> 11#include <linux/syscore_ops.h>
12#include <linux/bitops.h> 12#include <linux/bitops.h>
13#include <linux/acpi.h> 13#include <linux/acpi.h>
14#include <linux/io.h> 14#include <linux/io.h>
@@ -29,24 +29,10 @@
29 * plus some generic x86 specific things if generic specifics makes 29 * plus some generic x86 specific things if generic specifics makes
30 * any sense at all. 30 * any sense at all.
31 */ 31 */
32static void init_8259A(int auto_eoi);
32 33
33static int i8259A_auto_eoi; 34static int i8259A_auto_eoi;
34DEFINE_RAW_SPINLOCK(i8259A_lock); 35DEFINE_RAW_SPINLOCK(i8259A_lock);
35static void mask_and_ack_8259A(unsigned int);
36static void mask_8259A(void);
37static void unmask_8259A(void);
38static void disable_8259A_irq(unsigned int irq);
39static void enable_8259A_irq(unsigned int irq);
40static void init_8259A(int auto_eoi);
41static int i8259A_irq_pending(unsigned int irq);
42
43struct irq_chip i8259A_chip = {
44 .name = "XT-PIC",
45 .mask = disable_8259A_irq,
46 .disable = disable_8259A_irq,
47 .unmask = enable_8259A_irq,
48 .mask_ack = mask_and_ack_8259A,
49};
50 36
51/* 37/*
52 * 8259A PIC functions to handle ISA devices: 38 * 8259A PIC functions to handle ISA devices:
@@ -68,7 +54,7 @@ unsigned int cached_irq_mask = 0xffff;
68 */ 54 */
69unsigned long io_apic_irqs; 55unsigned long io_apic_irqs;
70 56
71static void disable_8259A_irq(unsigned int irq) 57static void mask_8259A_irq(unsigned int irq)
72{ 58{
73 unsigned int mask = 1 << irq; 59 unsigned int mask = 1 << irq;
74 unsigned long flags; 60 unsigned long flags;
@@ -82,7 +68,12 @@ static void disable_8259A_irq(unsigned int irq)
82 raw_spin_unlock_irqrestore(&i8259A_lock, flags); 68 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
83} 69}
84 70
85static void enable_8259A_irq(unsigned int irq) 71static void disable_8259A_irq(struct irq_data *data)
72{
73 mask_8259A_irq(data->irq);
74}
75
76static void unmask_8259A_irq(unsigned int irq)
86{ 77{
87 unsigned int mask = ~(1 << irq); 78 unsigned int mask = ~(1 << irq);
88 unsigned long flags; 79 unsigned long flags;
@@ -96,6 +87,11 @@ static void enable_8259A_irq(unsigned int irq)
96 raw_spin_unlock_irqrestore(&i8259A_lock, flags); 87 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
97} 88}
98 89
90static void enable_8259A_irq(struct irq_data *data)
91{
92 unmask_8259A_irq(data->irq);
93}
94
99static int i8259A_irq_pending(unsigned int irq) 95static int i8259A_irq_pending(unsigned int irq)
100{ 96{
101 unsigned int mask = 1<<irq; 97 unsigned int mask = 1<<irq;
@@ -116,8 +112,8 @@ static void make_8259A_irq(unsigned int irq)
116{ 112{
117 disable_irq_nosync(irq); 113 disable_irq_nosync(irq);
118 io_apic_irqs &= ~(1<<irq); 114 io_apic_irqs &= ~(1<<irq);
119 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, 115 irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
120 "XT"); 116 i8259A_chip.name);
121 enable_irq(irq); 117 enable_irq(irq);
122} 118}
123 119
@@ -150,8 +146,9 @@ static inline int i8259A_irq_real(unsigned int irq)
150 * first, _then_ send the EOI, and the order of EOI 146 * first, _then_ send the EOI, and the order of EOI
151 * to the two 8259s is important! 147 * to the two 8259s is important!
152 */ 148 */
153static void mask_and_ack_8259A(unsigned int irq) 149static void mask_and_ack_8259A(struct irq_data *data)
154{ 150{
151 unsigned int irq = data->irq;
155 unsigned int irqmask = 1 << irq; 152 unsigned int irqmask = 1 << irq;
156 unsigned long flags; 153 unsigned long flags;
157 154
@@ -223,6 +220,14 @@ spurious_8259A_irq:
223 } 220 }
224} 221}
225 222
223struct irq_chip i8259A_chip = {
224 .name = "XT-PIC",
225 .irq_mask = disable_8259A_irq,
226 .irq_disable = disable_8259A_irq,
227 .irq_unmask = enable_8259A_irq,
228 .irq_mask_ack = mask_and_ack_8259A,
229};
230
226static char irq_trigger[2]; 231static char irq_trigger[2];
227/** 232/**
228 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ 233 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -240,20 +245,19 @@ static void save_ELCR(char *trigger)
240 trigger[1] = inb(0x4d1) & 0xDE; 245 trigger[1] = inb(0x4d1) & 0xDE;
241} 246}
242 247
243static int i8259A_resume(struct sys_device *dev) 248static void i8259A_resume(void)
244{ 249{
245 init_8259A(i8259A_auto_eoi); 250 init_8259A(i8259A_auto_eoi);
246 restore_ELCR(irq_trigger); 251 restore_ELCR(irq_trigger);
247 return 0;
248} 252}
249 253
250static int i8259A_suspend(struct sys_device *dev, pm_message_t state) 254static int i8259A_suspend(void)
251{ 255{
252 save_ELCR(irq_trigger); 256 save_ELCR(irq_trigger);
253 return 0; 257 return 0;
254} 258}
255 259
256static int i8259A_shutdown(struct sys_device *dev) 260static void i8259A_shutdown(void)
257{ 261{
258 /* Put the i8259A into a quiescent state that 262 /* Put the i8259A into a quiescent state that
259 * the kernel initialization code can get it 263 * the kernel initialization code can get it
@@ -261,21 +265,14 @@ static int i8259A_shutdown(struct sys_device *dev)
261 */ 265 */
262 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 266 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
263 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ 267 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
264 return 0;
265} 268}
266 269
267static struct sysdev_class i8259_sysdev_class = { 270static struct syscore_ops i8259_syscore_ops = {
268 .name = "i8259",
269 .suspend = i8259A_suspend, 271 .suspend = i8259A_suspend,
270 .resume = i8259A_resume, 272 .resume = i8259A_resume,
271 .shutdown = i8259A_shutdown, 273 .shutdown = i8259A_shutdown,
272}; 274};
273 275
274static struct sys_device device_i8259A = {
275 .id = 0,
276 .cls = &i8259_sysdev_class,
277};
278
279static void mask_8259A(void) 276static void mask_8259A(void)
280{ 277{
281 unsigned long flags; 278 unsigned long flags;
@@ -342,9 +339,9 @@ static void init_8259A(int auto_eoi)
342 * In AEOI mode we just have to mask the interrupt 339 * In AEOI mode we just have to mask the interrupt
343 * when acking. 340 * when acking.
344 */ 341 */
345 i8259A_chip.mask_ack = disable_8259A_irq; 342 i8259A_chip.irq_mask_ack = disable_8259A_irq;
346 else 343 else
347 i8259A_chip.mask_ack = mask_and_ack_8259A; 344 i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
348 345
349 udelay(100); /* wait for 8259A to initialize */ 346 udelay(100); /* wait for 8259A to initialize */
350 347
@@ -363,14 +360,6 @@ static void init_8259A(int auto_eoi)
363static void legacy_pic_noop(void) { }; 360static void legacy_pic_noop(void) { };
364static void legacy_pic_uint_noop(unsigned int unused) { }; 361static void legacy_pic_uint_noop(unsigned int unused) { };
365static void legacy_pic_int_noop(int unused) { }; 362static void legacy_pic_int_noop(int unused) { };
366
367static struct irq_chip dummy_pic_chip = {
368 .name = "dummy pic",
369 .mask = legacy_pic_uint_noop,
370 .unmask = legacy_pic_uint_noop,
371 .disable = legacy_pic_uint_noop,
372 .mask_ack = legacy_pic_uint_noop,
373};
374static int legacy_pic_irq_pending_noop(unsigned int irq) 363static int legacy_pic_irq_pending_noop(unsigned int irq)
375{ 364{
376 return 0; 365 return 0;
@@ -378,7 +367,9 @@ static int legacy_pic_irq_pending_noop(unsigned int irq)
378 367
379struct legacy_pic null_legacy_pic = { 368struct legacy_pic null_legacy_pic = {
380 .nr_legacy_irqs = 0, 369 .nr_legacy_irqs = 0,
381 .chip = &dummy_pic_chip, 370 .chip = &dummy_irq_chip,
371 .mask = legacy_pic_uint_noop,
372 .unmask = legacy_pic_uint_noop,
382 .mask_all = legacy_pic_noop, 373 .mask_all = legacy_pic_noop,
383 .restore_mask = legacy_pic_noop, 374 .restore_mask = legacy_pic_noop,
384 .init = legacy_pic_int_noop, 375 .init = legacy_pic_int_noop,
@@ -389,7 +380,9 @@ struct legacy_pic null_legacy_pic = {
389struct legacy_pic default_legacy_pic = { 380struct legacy_pic default_legacy_pic = {
390 .nr_legacy_irqs = NR_IRQS_LEGACY, 381 .nr_legacy_irqs = NR_IRQS_LEGACY,
391 .chip = &i8259A_chip, 382 .chip = &i8259A_chip,
392 .mask_all = mask_8259A, 383 .mask = mask_8259A_irq,
384 .unmask = unmask_8259A_irq,
385 .mask_all = mask_8259A,
393 .restore_mask = unmask_8259A, 386 .restore_mask = unmask_8259A,
394 .init = init_8259A, 387 .init = init_8259A,
395 .irq_pending = i8259A_irq_pending, 388 .irq_pending = i8259A_irq_pending,
@@ -398,17 +391,12 @@ struct legacy_pic default_legacy_pic = {
398 391
399struct legacy_pic *legacy_pic = &default_legacy_pic; 392struct legacy_pic *legacy_pic = &default_legacy_pic;
400 393
401static int __init i8259A_init_sysfs(void) 394static int __init i8259A_init_ops(void)
402{ 395{
403 int error; 396 if (legacy_pic == &default_legacy_pic)
404 397 register_syscore_ops(&i8259_syscore_ops);
405 if (legacy_pic != &default_legacy_pic)
406 return 0;
407 398
408 error = sysdev_class_register(&i8259_sysdev_class); 399 return 0;
409 if (!error)
410 error = sysdev_register(&device_i8259A);
411 return error;
412} 400}
413 401
414device_initcall(i8259A_init_sysfs); 402device_initcall(i8259A_init_ops);
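
Note: the i8259 rework above moves the irq_chip callbacks from the old unsigned-int-irq signatures (.mask/.unmask/.mask_ack) to the irq_data-based ones (.irq_mask/.irq_unmask/.irq_mask_ack). A minimal sketch of a chip written against the new callbacks follows; the "demo" names are invented and the bodies only log.

/* Sketch of an irq_chip using the irq_data-based methods. */
#include <linux/kernel.h>
#include <linux/irq.h>

static void demo_mask_irq(struct irq_data *data)
{
	pr_debug("masking irq %u\n", data->irq);
	/* ... write the controller's mask register here ... */
}

static void demo_unmask_irq(struct irq_data *data)
{
	pr_debug("unmasking irq %u\n", data->irq);
	/* ... clear the controller's mask bit here ... */
}

static struct irq_chip demo_chip = {
	.name       = "demo-pic",
	.irq_mask   = demo_mask_irq,
	.irq_unmask = demo_unmask_irq,
};

static void demo_setup_irq(unsigned int irq)
{
	irq_set_chip_and_handler_name(irq, &demo_chip, handle_level_irq,
				      demo_chip.name);
}
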
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8eec0ec59af2..8c968974253d 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,22 +14,9 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/bitmap.h>
17#include <asm/syscalls.h> 18#include <asm/syscalls.h>
18 19
19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
20static void set_bitmap(unsigned long *bitmap, unsigned int base,
21 unsigned int extent, int new_value)
22{
23 unsigned int i;
24
25 for (i = base; i < base + extent; i++) {
26 if (new_value)
27 __set_bit(i, bitmap);
28 else
29 __clear_bit(i, bitmap);
30 }
31}
32
33/* 20/*
34 * this changes the io permissions bitmap in the current task. 21 * this changes the io permissions bitmap in the current task.
35 */ 22 */
@@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
69 */ 56 */
70 tss = &per_cpu(init_tss, get_cpu()); 57 tss = &per_cpu(init_tss, get_cpu());
71 58
72 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); 59 if (turn_on)
60 bitmap_clear(t->io_bitmap_ptr, from, num);
61 else
62 bitmap_set(t->io_bitmap_ptr, from, num);
73 63
74 /* 64 /*
75 * Search for a (possibly new) maximum. This is simple and stupid, 65 * Search for a (possibly new) maximum. This is simple and stupid,
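
Note: the ioport.c change above replaces a hand-rolled loop over __set_bit()/__clear_bit() with the bitmap_set()/bitmap_clear() range helpers. Sketch of that usage below, with an invented bitmap; as in the I/O permission bitmap, a cleared bit means "access allowed".

/* Sketch of the bitmap range helpers, not the real sys_ioperm() code. */
#include <linux/bitmap.h>

#define DEMO_BITS 1024

static DECLARE_BITMAP(demo_map, DEMO_BITS);

static void demo_allow_ports(unsigned int from, unsigned int num, int turn_on)
{
	if (turn_on)
		bitmap_clear(demo_map, from, num);	/* grant access */
	else
		bitmap_set(demo_map, from, num);	/* revoke access */
}
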
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 91fd0c70a18a..6c0802eb2f7f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -4,9 +4,11 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 6#include <linux/kernel_stat.h>
7#include <linux/of.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
8#include <linux/smp.h> 9#include <linux/smp.h>
9#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/delay.h>
10 12
11#include <asm/apic.h> 13#include <asm/apic.h>
12#include <asm/io_apic.h> 14#include <asm/io_apic.h>
@@ -43,9 +45,9 @@ void ack_bad_irq(unsigned int irq)
43 45
44#define irq_stats(x) (&per_cpu(irq_stat, x)) 46#define irq_stats(x) (&per_cpu(irq_stat, x))
45/* 47/*
46 * /proc/interrupts printing: 48 * /proc/interrupts printing for arch specific interrupts
47 */ 49 */
48static int show_other_interrupts(struct seq_file *p, int prec) 50int arch_show_interrupts(struct seq_file *p, int prec)
49{ 51{
50 int j; 52 int j;
51 53
@@ -67,10 +69,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
67 for_each_online_cpu(j) 69 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); 70 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance monitoring interrupts\n"); 71 seq_printf(p, " Performance monitoring interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND"); 72 seq_printf(p, "%*s: ", prec, "IWI");
71 for_each_online_cpu(j) 73 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 74 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
73 seq_printf(p, " Performance pending work\n"); 75 seq_printf(p, " IRQ work interrupts\n");
74#endif 76#endif
75 if (x86_platform_ipi_callback) { 77 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 78 seq_printf(p, "%*s: ", prec, "PLT");
@@ -121,59 +123,6 @@ static int show_other_interrupts(struct seq_file *p, int prec)
121 return 0; 123 return 0;
122} 124}
123 125
124int show_interrupts(struct seq_file *p, void *v)
125{
126 unsigned long flags, any_count = 0;
127 int i = *(loff_t *) v, j, prec;
128 struct irqaction *action;
129 struct irq_desc *desc;
130
131 if (i > nr_irqs)
132 return 0;
133
134 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
135 j *= 10;
136
137 if (i == nr_irqs)
138 return show_other_interrupts(p, prec);
139
140 /* print header */
141 if (i == 0) {
142 seq_printf(p, "%*s", prec + 8, "");
143 for_each_online_cpu(j)
144 seq_printf(p, "CPU%-8d", j);
145 seq_putc(p, '\n');
146 }
147
148 desc = irq_to_desc(i);
149 if (!desc)
150 return 0;
151
152 raw_spin_lock_irqsave(&desc->lock, flags);
153 for_each_online_cpu(j)
154 any_count |= kstat_irqs_cpu(i, j);
155 action = desc->action;
156 if (!action && !any_count)
157 goto out;
158
159 seq_printf(p, "%*d: ", prec, i);
160 for_each_online_cpu(j)
161 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
162 seq_printf(p, " %8s", desc->chip->name);
163 seq_printf(p, "-%-8s", desc->name);
164
165 if (action) {
166 seq_printf(p, " %s", action->name);
167 while ((action = action->next) != NULL)
168 seq_printf(p, ", %s", action->name);
169 }
170
171 seq_putc(p, '\n');
172out:
173 raw_spin_unlock_irqrestore(&desc->lock, flags);
174 return 0;
175}
176
177/* 126/*
178 * /proc/stat helpers 127 * /proc/stat helpers
179 */ 128 */
@@ -185,7 +134,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
185 sum += irq_stats(cpu)->apic_timer_irqs; 134 sum += irq_stats(cpu)->apic_timer_irqs;
186 sum += irq_stats(cpu)->irq_spurious_count; 135 sum += irq_stats(cpu)->irq_spurious_count;
187 sum += irq_stats(cpu)->apic_perf_irqs; 136 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 137 sum += irq_stats(cpu)->apic_irq_work_irqs;
189#endif 138#endif
190 if (x86_platform_ipi_callback) 139 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->x86_platform_ipis; 140 sum += irq_stats(cpu)->x86_platform_ipis;
@@ -234,7 +183,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
234 exit_idle(); 183 exit_idle();
235 irq_enter(); 184 irq_enter();
236 185
237 irq = __get_cpu_var(vector_irq)[vector]; 186 irq = __this_cpu_read(vector_irq[vector]);
238 187
239 if (!handle_irq(irq, regs)) { 188 if (!handle_irq(irq, regs)) {
240 ack_APIC_irq(); 189 ack_APIC_irq();
@@ -282,6 +231,8 @@ void fixup_irqs(void)
282 unsigned int irq, vector; 231 unsigned int irq, vector;
283 static int warned; 232 static int warned;
284 struct irq_desc *desc; 233 struct irq_desc *desc;
234 struct irq_data *data;
235 struct irq_chip *chip;
285 236
286 for_each_irq_desc(irq, desc) { 237 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0; 238 int break_affinity = 0;
@@ -296,9 +247,10 @@ void fixup_irqs(void)
296 /* interrupt's are disabled at this point */ 247 /* interrupt's are disabled at this point */
297 raw_spin_lock(&desc->lock); 248 raw_spin_lock(&desc->lock);
298 249
299 affinity = desc->affinity; 250 data = irq_desc_get_irq_data(desc);
300 if (!irq_has_action(irq) || 251 affinity = data->affinity;
301 cpumask_equal(affinity, cpu_online_mask)) { 252 if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
253 cpumask_subset(affinity, cpu_online_mask)) {
302 raw_spin_unlock(&desc->lock); 254 raw_spin_unlock(&desc->lock);
303 continue; 255 continue;
304 } 256 }
@@ -315,16 +267,18 @@ void fixup_irqs(void)
315 affinity = cpu_all_mask; 267 affinity = cpu_all_mask;
316 } 268 }
317 269
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) 270 chip = irq_data_get_irq_chip(data);
319 desc->chip->mask(irq); 271 if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
272 chip->irq_mask(data);
320 273
321 if (desc->chip->set_affinity) 274 if (chip->irq_set_affinity)
322 desc->chip->set_affinity(irq, affinity); 275 chip->irq_set_affinity(data, affinity, true);
323 else if (!(warned++)) 276 else if (!(warned++))
324 set_affinity = 0; 277 set_affinity = 0;
325 278
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) 279 if (!irqd_can_move_in_process_context(data) &&
327 desc->chip->unmask(irq); 280 !irqd_irq_disabled(data) && chip->irq_unmask)
281 chip->irq_unmask(data);
328 282
329 raw_spin_unlock(&desc->lock); 283 raw_spin_unlock(&desc->lock);
330 284
@@ -348,17 +302,19 @@ void fixup_irqs(void)
348 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 302 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
349 unsigned int irr; 303 unsigned int irr;
350 304
351 if (__get_cpu_var(vector_irq)[vector] < 0) 305 if (__this_cpu_read(vector_irq[vector]) < 0)
352 continue; 306 continue;
353 307
354 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); 308 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
355 if (irr & (1 << (vector % 32))) { 309 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector]; 310 irq = __this_cpu_read(vector_irq[vector]);
357 311
358 desc = irq_to_desc(irq); 312 desc = irq_to_desc(irq);
313 data = irq_desc_get_irq_data(desc);
314 chip = irq_data_get_irq_chip(data);
359 raw_spin_lock(&desc->lock); 315 raw_spin_lock(&desc->lock);
360 if (desc->chip->retrigger) 316 if (chip->irq_retrigger)
361 desc->chip->retrigger(irq); 317 chip->irq_retrigger(data);
362 raw_spin_unlock(&desc->lock); 318 raw_spin_unlock(&desc->lock);
363 } 319 }
364 } 320 }
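
Note: fixup_irqs() above stops poking desc->chip directly and instead reaches the chip and its per-IRQ state through irq_data accessors. The sketch below shows just that accessor chain; it is purely illustrative and "demo_mask_one" is an invented helper.

/* Sketch of the irq_data/irq_chip accessor pattern. */
#include <linux/irq.h>
#include <linux/irqdesc.h>

static void demo_mask_one(struct irq_desc *desc)
{
	struct irq_data *data = irq_desc_get_irq_data(desc);
	struct irq_chip *chip = irq_data_get_irq_chip(data);

	if (chip->irq_mask)
		chip->irq_mask(data);
}
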
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 10709f29d166..72090705a656 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -17,6 +17,7 @@
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/mm.h>
20 21
21#include <asm/apic.h> 22#include <asm/apic.h>
22 23
@@ -49,21 +50,17 @@ static inline int check_stack_overflow(void) { return 0; }
49static inline void print_stack_overflow(void) { } 50static inline void print_stack_overflow(void) { }
50#endif 51#endif
51 52
52#ifdef CONFIG_4KSTACKS
53/* 53/*
54 * per-CPU IRQ handling contexts (thread information and stack) 54 * per-CPU IRQ handling contexts (thread information and stack)
55 */ 55 */
56union irq_ctx { 56union irq_ctx {
57 struct thread_info tinfo; 57 struct thread_info tinfo;
58 u32 stack[THREAD_SIZE/sizeof(u32)]; 58 u32 stack[THREAD_SIZE/sizeof(u32)];
59} __attribute__((aligned(PAGE_SIZE))); 59} __attribute__((aligned(THREAD_SIZE)));
60 60
61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); 61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); 62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
63 63
64static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
65static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
66
67static void call_on_stack(void *func, void *stack) 64static void call_on_stack(void *func, void *stack)
68{ 65{
69 asm volatile("xchgl %%ebx,%%esp \n" 66 asm volatile("xchgl %%ebx,%%esp \n"
@@ -82,7 +79,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
82 u32 *isp, arg1, arg2; 79 u32 *isp, arg1, arg2;
83 80
84 curctx = (union irq_ctx *) current_thread_info(); 81 curctx = (union irq_ctx *) current_thread_info();
85 irqctx = __get_cpu_var(hardirq_ctx); 82 irqctx = __this_cpu_read(hardirq_ctx);
86 83
87 /* 84 /*
88 * this is where we switch to the IRQ stack. However, if we are 85 * this is where we switch to the IRQ stack. However, if we are
@@ -129,20 +126,21 @@ void __cpuinit irq_ctx_init(int cpu)
129 if (per_cpu(hardirq_ctx, cpu)) 126 if (per_cpu(hardirq_ctx, cpu))
130 return; 127 return;
131 128
132 irqctx = &per_cpu(hardirq_stack, cpu); 129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
133 irqctx->tinfo.task = NULL; 130 THREAD_FLAGS,
134 irqctx->tinfo.exec_domain = NULL; 131 THREAD_ORDER));
132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
135 irqctx->tinfo.cpu = cpu; 133 irqctx->tinfo.cpu = cpu;
136 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
137 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 135 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
138 136
139 per_cpu(hardirq_ctx, cpu) = irqctx; 137 per_cpu(hardirq_ctx, cpu) = irqctx;
140 138
141 irqctx = &per_cpu(softirq_stack, cpu); 139 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
142 irqctx->tinfo.task = NULL; 140 THREAD_FLAGS,
143 irqctx->tinfo.exec_domain = NULL; 141 THREAD_ORDER));
142 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
144 irqctx->tinfo.cpu = cpu; 143 irqctx->tinfo.cpu = cpu;
145 irqctx->tinfo.preempt_count = 0;
146 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 144 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
147 145
148 per_cpu(softirq_ctx, cpu) = irqctx; 146 per_cpu(softirq_ctx, cpu) = irqctx;
@@ -151,11 +149,6 @@ void __cpuinit irq_ctx_init(int cpu)
151 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); 149 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
152} 150}
153 151
154void irq_ctx_exit(int cpu)
155{
156 per_cpu(hardirq_ctx, cpu) = NULL;
157}
158
159asmlinkage void do_softirq(void) 152asmlinkage void do_softirq(void)
160{ 153{
161 unsigned long flags; 154 unsigned long flags;
@@ -170,7 +163,7 @@ asmlinkage void do_softirq(void)
170 163
171 if (local_softirq_pending()) { 164 if (local_softirq_pending()) {
172 curctx = current_thread_info(); 165 curctx = current_thread_info();
173 irqctx = __get_cpu_var(softirq_ctx); 166 irqctx = __this_cpu_read(softirq_ctx);
174 irqctx->tinfo.task = curctx->task; 167 irqctx->tinfo.task = curctx->task;
175 irqctx->tinfo.previous_esp = current_stack_pointer; 168 irqctx->tinfo.previous_esp = current_stack_pointer;
176 169
@@ -179,7 +172,7 @@ asmlinkage void do_softirq(void)
179 172
180 call_on_stack(__do_softirq, isp); 173 call_on_stack(__do_softirq, isp);
181 /* 174 /*
182 * Shouldnt happen, we returned above if in_interrupt(): 175 * Shouldn't happen, we returned above if in_interrupt():
183 */ 176 */
184 WARN_ON_ONCE(softirq_count()); 177 WARN_ON_ONCE(softirq_count());
185 } 178 }
@@ -187,11 +180,6 @@ asmlinkage void do_softirq(void)
187 local_irq_restore(flags); 180 local_irq_restore(flags);
188} 181}
189 182
190#else
191static inline int
192execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
193#endif
194
195bool handle_irq(unsigned irq, struct pt_regs *regs) 183bool handle_irq(unsigned irq, struct pt_regs *regs)
196{ 184{
197 struct irq_desc *desc; 185 struct irq_desc *desc;
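
Note: irq_ctx_init() above now takes the per-CPU IRQ stacks from the page allocator on the CPU's own node instead of static per-CPU arrays. A hedged sketch of that allocation step follows; DEMO_ORDER/DEMO_FLAGS stand in for the x86 THREAD_ORDER/THREAD_FLAGS and the helper name is invented.

/* Sketch of a node-local, page-allocator-backed IRQ stack allocation. */
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/topology.h>

#define DEMO_ORDER 1				/* 2 pages, i.e. an 8 KB stack */
#define DEMO_FLAGS (GFP_KERNEL | __GFP_ZERO)

static void *demo_alloc_irq_stack(int cpu)
{
	struct page *page;

	page = alloc_pages_node(cpu_to_node(cpu), DEMO_FLAGS, DEMO_ORDER);
	return page ? page_address(page) : NULL;
}
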
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 000000000000..ca8f703a1e70
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,30 @@
1/*
2 * x86 specific code for irq_work
3 *
4 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 */
6
7#include <linux/kernel.h>
8#include <linux/irq_work.h>
9#include <linux/hardirq.h>
10#include <asm/apic.h>
11
12void smp_irq_work_interrupt(struct pt_regs *regs)
13{
14 irq_enter();
15 ack_APIC_irq();
16 inc_irq_stat(apic_irq_work_irqs);
17 irq_work_run();
18 irq_exit();
19}
20
21void arch_irq_work_raise(void)
22{
23#ifdef CONFIG_X86_LOCAL_APIC
24 if (!cpu_has_apic)
25 return;
26
27 apic->send_IPI_self(IRQ_WORK_VECTOR);
28 apic_wait_icr_idle();
29#endif
30}
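
Note: the new file above only wires the x86 side of irq_work: a self-IPI vector and its handler. A usage sketch follows, assuming the generic irq_work API of this kernel generation (init_irq_work() and irq_work_queue() from <linux/irq_work.h>); the "demo" names are invented.

/* Sketch: defer work from NMI-like context via a self-IPI. */
#include <linux/irq_work.h>
#include <linux/kernel.h>

static void demo_irq_work_func(struct irq_work *work)
{
	pr_info("ran in IRQ context after the self-IPI\n");
}

static struct irq_work demo_work;

static void demo_poke_from_nmi(void)
{
	init_irq_work(&demo_work, demo_irq_work_func);
	irq_work_queue(&demo_work);	/* arch_irq_work_raise() sends the IPI */
}
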
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 9772b1a0f9a4..48acf71c6534 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -25,6 +25,7 @@
25#include <asm/setup.h> 25#include <asm/setup.h>
26#include <asm/i8259.h> 26#include <asm/i8259.h>
27#include <asm/traps.h> 27#include <asm/traps.h>
28#include <asm/prom.h>
28 29
29/* 30/*
30 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: 31 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -71,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
71static struct irqaction fpu_irq = { 72static struct irqaction fpu_irq = {
72 .handler = math_error_irq, 73 .handler = math_error_irq,
73 .name = "fpu", 74 .name = "fpu",
75 .flags = IRQF_NO_THREAD,
74}; 76};
75#endif 77#endif
76 78
@@ -80,6 +82,7 @@ static struct irqaction fpu_irq = {
80static struct irqaction irq2 = { 82static struct irqaction irq2 = {
81 .handler = no_action, 83 .handler = no_action,
82 .name = "cascade", 84 .name = "cascade",
85 .flags = IRQF_NO_THREAD,
83}; 86};
84 87
85DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 88DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@ -100,6 +103,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
100 103
101void __init init_ISA_irqs(void) 104void __init init_ISA_irqs(void)
102{ 105{
106 struct irq_chip *chip = legacy_pic->chip;
107 const char *name = chip->name;
103 int i; 108 int i;
104 109
105#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 110#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
@@ -107,19 +112,8 @@ void __init init_ISA_irqs(void)
107#endif 112#endif
108 legacy_pic->init(0); 113 legacy_pic->init(0);
109 114
110 /* 115 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
111 * 16 old-style INTA-cycle interrupts: 116 irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
112 */
113 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
114 struct irq_desc *desc = irq_to_desc(i);
115
116 desc->status = IRQ_DISABLED;
117 desc->action = NULL;
118 desc->depth = 1;
119
120 set_irq_chip_and_handler_name(i, &i8259A_chip,
121 handle_level_irq, "XT");
122 }
123} 117}
124 118
125void __init init_IRQ(void) 119void __init init_IRQ(void)
@@ -127,6 +121,12 @@ void __init init_IRQ(void)
127 int i; 121 int i;
128 122
129 /* 123 /*
124 * We probably need a better place for this, but it works for
125 * now ...
126 */
127 x86_add_irq_domains();
128
129 /*
130 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. 130 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
131 * If these IRQ's are handled by legacy interrupt-controllers like PIC, 131 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
132 * then this configuration will likely be static after the boot. If 132 * then this configuration will likely be static after the boot. If
@@ -173,14 +173,77 @@ static void __init smp_intr_init(void)
173 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 173 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
174 174
175 /* IPIs for invalidation */ 175 /* IPIs for invalidation */
176 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); 176#define ALLOC_INVTLB_VEC(NR) \
177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); 177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
178 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); 178 invalidate_interrupt##NR)
179 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); 179
180 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); 180 switch (NUM_INVALIDATE_TLB_VECTORS) {
181 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); 181 default:
182 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); 182 ALLOC_INVTLB_VEC(31);
183 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); 183 case 31:
184 ALLOC_INVTLB_VEC(30);
185 case 30:
186 ALLOC_INVTLB_VEC(29);
187 case 29:
188 ALLOC_INVTLB_VEC(28);
189 case 28:
190 ALLOC_INVTLB_VEC(27);
191 case 27:
192 ALLOC_INVTLB_VEC(26);
193 case 26:
194 ALLOC_INVTLB_VEC(25);
195 case 25:
196 ALLOC_INVTLB_VEC(24);
197 case 24:
198 ALLOC_INVTLB_VEC(23);
199 case 23:
200 ALLOC_INVTLB_VEC(22);
201 case 22:
202 ALLOC_INVTLB_VEC(21);
203 case 21:
204 ALLOC_INVTLB_VEC(20);
205 case 20:
206 ALLOC_INVTLB_VEC(19);
207 case 19:
208 ALLOC_INVTLB_VEC(18);
209 case 18:
210 ALLOC_INVTLB_VEC(17);
211 case 17:
212 ALLOC_INVTLB_VEC(16);
213 case 16:
214 ALLOC_INVTLB_VEC(15);
215 case 15:
216 ALLOC_INVTLB_VEC(14);
217 case 14:
218 ALLOC_INVTLB_VEC(13);
219 case 13:
220 ALLOC_INVTLB_VEC(12);
221 case 12:
222 ALLOC_INVTLB_VEC(11);
223 case 11:
224 ALLOC_INVTLB_VEC(10);
225 case 10:
226 ALLOC_INVTLB_VEC(9);
227 case 9:
228 ALLOC_INVTLB_VEC(8);
229 case 8:
230 ALLOC_INVTLB_VEC(7);
231 case 7:
232 ALLOC_INVTLB_VEC(6);
233 case 6:
234 ALLOC_INVTLB_VEC(5);
235 case 5:
236 ALLOC_INVTLB_VEC(4);
237 case 4:
238 ALLOC_INVTLB_VEC(3);
239 case 3:
240 ALLOC_INVTLB_VEC(2);
241 case 2:
242 ALLOC_INVTLB_VEC(1);
243 case 1:
244 ALLOC_INVTLB_VEC(0);
245 break;
246 }
184 247
185 /* IPI for generic function call */ 248 /* IPI for generic function call */
186 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 249 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -227,9 +290,9 @@ static void __init apic_intr_init(void)
227 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 290 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
228 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 291 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
229 292
230 /* Performance monitoring interrupts: */ 293 /* IRQ work interrupts: */
231# ifdef CONFIG_PERF_EVENTS 294# ifdef CONFIG_IRQ_WORK
232 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); 295 alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
233# endif 296# endif
234 297
235#endif 298#endif
@@ -255,7 +318,7 @@ void __init native_init_IRQ(void)
255 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); 318 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
256 } 319 }
257 320
258 if (!acpi_ioapic) 321 if (!acpi_ioapic && !of_ioapic)
259 setup_irq(2, &irq2); 322 setup_irq(2, &irq2);
260 323
261#ifdef CONFIG_X86_32 324#ifdef CONFIG_X86_32
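
Note: the ALLOC_INVTLB_VEC block above uses token pasting plus an intentional switch fall-through so that NUM_INVALIDATE_TLB_VECTORS gates handlers N-1 down to 0 are all registered. The toy below reduces that construct to user space; handler0..handler3 and ALLOC_DEMO_VEC are invented stand-ins.

/* Illustrative only: token pasting + deliberate fall-through. */
#include <stdio.h>

static void handler0(void) { puts("vector 0 wired up"); }
static void handler1(void) { puts("vector 1 wired up"); }
static void handler2(void) { puts("vector 2 wired up"); }
static void handler3(void) { puts("vector 3 wired up"); }

/* token pasting picks handler<NR>, like invalidate_interrupt##NR above */
#define ALLOC_DEMO_VEC(NR) handler##NR()

int main(void)
{
	int nr_vectors = 3;	/* stands in for NUM_INVALIDATE_TLB_VECTORS */

	switch (nr_vectors) {
	default:
		ALLOC_DEMO_VEC(3);	/* fall through */
	case 3:
		ALLOC_DEMO_VEC(2);	/* fall through */
	case 2:
		ALLOC_DEMO_VEC(1);	/* fall through */
	case 1:
		ALLOC_DEMO_VEC(0);
		break;
	}
	return 0;
}
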
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
new file mode 100644
index 000000000000..3fee346ef545
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,51 @@
1/*
2 * jump label x86 support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 *
6 */
7#include <linux/jump_label.h>
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/cpu.h>
14#include <asm/kprobes.h>
15#include <asm/alternative.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19union jump_code_union {
20 char code[JUMP_LABEL_NOP_SIZE];
21 struct {
22 char jump;
23 int offset;
24 } __attribute__((packed));
25};
26
27void arch_jump_label_transform(struct jump_entry *entry,
28 enum jump_label_type type)
29{
30 union jump_code_union code;
31
32 if (type == JUMP_LABEL_ENABLE) {
33 code.jump = 0xe9;
34 code.offset = entry->target -
35 (entry->code + JUMP_LABEL_NOP_SIZE);
36 } else
37 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
38 get_online_cpus();
39 mutex_lock(&text_mutex);
40 text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
41 mutex_unlock(&text_mutex);
42 put_online_cpus();
43}
44
45void arch_jump_label_text_poke_early(jump_label_t addr)
46{
47 text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5],
48 JUMP_LABEL_NOP_SIZE);
49}
50
51#endif
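
Note: arch_jump_label_transform() above patches the site with either a NOP or a 5-byte "jmp rel32". The stand-alone sketch below shows how that jump is encoded and why the displacement is computed as target - (code + JUMP_LABEL_NOP_SIZE); the example addresses are arbitrary.

/* Illustrative only: assembling a 5-byte jmp rel32. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define JUMP_LABEL_NOP_SIZE 5

int main(void)
{
	uint8_t insn[JUMP_LABEL_NOP_SIZE];
	uint64_t code   = 0x1000;	/* address being patched (example) */
	uint64_t target = 0x1234;	/* branch destination (example)   */
	/* displacement is relative to the end of the instruction */
	int32_t rel = (int32_t)(target - (code + JUMP_LABEL_NOP_SIZE));

	insn[0] = 0xe9;				/* jmp rel32 opcode */
	memcpy(&insn[1], &rel, sizeof(rel));	/* little-endian displacement */

	printf("jmp encoding: %02x %02x %02x %02x %02x (rel32=%d)\n",
	       insn[0], insn[1], insn[2], insn[3], insn[4], rel);
	return 0;
}
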
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
deleted file mode 100644
index 0f7bc20cfcde..000000000000
--- a/arch/x86/kernel/k8.c
+++ /dev/null
@@ -1,137 +0,0 @@
1/*
2 * Shared support code for AMD K8 northbridges and derivates.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */
5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <asm/k8.h>
12
13int num_k8_northbridges;
14EXPORT_SYMBOL(num_k8_northbridges);
15
16static u32 *flush_words;
17
18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 {}
22};
23EXPORT_SYMBOL(k8_nb_ids);
24
25struct pci_dev **k8_northbridges;
26EXPORT_SYMBOL(k8_northbridges);
27
28static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
29{
30 do {
31 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
32 if (!dev)
33 break;
34 } while (!pci_match_id(&k8_nb_ids[0], dev));
35 return dev;
36}
37
38int cache_k8_northbridges(void)
39{
40 int i;
41 struct pci_dev *dev;
42
43 if (num_k8_northbridges)
44 return 0;
45
46 dev = NULL;
47 while ((dev = next_k8_northbridge(dev)) != NULL)
48 num_k8_northbridges++;
49
50 k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
51 GFP_KERNEL);
52 if (!k8_northbridges)
53 return -ENOMEM;
54
55 if (!num_k8_northbridges) {
56 k8_northbridges[0] = NULL;
57 return 0;
58 }
59
60 flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
61 if (!flush_words) {
62 kfree(k8_northbridges);
63 return -ENOMEM;
64 }
65
66 dev = NULL;
67 i = 0;
68 while ((dev = next_k8_northbridge(dev)) != NULL) {
69 k8_northbridges[i] = dev;
70 pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
71 }
72 k8_northbridges[i] = NULL;
73 return 0;
74}
75EXPORT_SYMBOL_GPL(cache_k8_northbridges);
76
77/* Ignores subdevice/subvendor but as far as I can figure out
78 they're useless anyways */
79int __init early_is_k8_nb(u32 device)
80{
81 struct pci_device_id *id;
82 u32 vendor = device & 0xffff;
83 device >>= 16;
84 for (id = k8_nb_ids; id->vendor; id++)
85 if (vendor == id->vendor && device == id->device)
86 return 1;
87 return 0;
88}
89
90void k8_flush_garts(void)
91{
92 int flushed, i;
93 unsigned long flags;
94 static DEFINE_SPINLOCK(gart_lock);
95
96 /* Avoid races between AGP and IOMMU. In theory it's not needed
97 but I'm not sure if the hardware won't lose flush requests
98 when another is pending. This whole thing is so expensive anyways
99 that it doesn't matter to serialize more. -AK */
100 spin_lock_irqsave(&gart_lock, flags);
101 flushed = 0;
102 for (i = 0; i < num_k8_northbridges; i++) {
103 pci_write_config_dword(k8_northbridges[i], 0x9c,
104 flush_words[i]|1);
105 flushed++;
106 }
107 for (i = 0; i < num_k8_northbridges; i++) {
108 u32 w;
109 /* Make sure the hardware actually executed the flush*/
110 for (;;) {
111 pci_read_config_dword(k8_northbridges[i],
112 0x9c, &w);
113 if (!(w & 1))
114 break;
115 cpu_relax();
116 }
117 }
118 spin_unlock_irqrestore(&gart_lock, flags);
119 if (!flushed)
120 printk("nothing to flush?\n");
121}
122EXPORT_SYMBOL_GPL(k8_flush_garts);
123
124static __init int init_k8_nbs(void)
125{
126 int err = 0;
127
128 err = cache_k8_northbridges();
129
130 if (err < 0)
131 printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
132
133 return err;
134}
135
136/* This has to go after the PCI subsystem */
137fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 8afd9f321f10..90fcf62854bb 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -78,6 +78,7 @@ static int setup_data_open(struct inode *inode, struct file *file)
78static const struct file_operations fops_setup_data = { 78static const struct file_operations fops_setup_data = {
79 .read = setup_data_read, 79 .read = setup_data_read,
80 .open = setup_data_open, 80 .open = setup_data_open,
81 .llseek = default_llseek,
81}; 82};
82 83
83static int __init 84static int __init
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 852b81967a37..5f9ecff328b5 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -48,6 +48,7 @@
48#include <asm/apicdef.h> 48#include <asm/apicdef.h>
49#include <asm/system.h> 49#include <asm/system.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51#include <asm/nmi.h>
51 52
52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = 53struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
53{ 54{
@@ -120,8 +121,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
120 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, 121 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
121 dbg_reg_def[regno].size); 122 dbg_reg_def[regno].size);
122 123
123 switch (regno) {
124#ifdef CONFIG_X86_32 124#ifdef CONFIG_X86_32
125 switch (regno) {
125 case GDB_SS: 126 case GDB_SS:
126 if (!user_mode_vm(regs)) 127 if (!user_mode_vm(regs))
127 *(unsigned long *)mem = __KERNEL_DS; 128 *(unsigned long *)mem = __KERNEL_DS;
@@ -134,8 +135,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
134 case GDB_FS: 135 case GDB_FS:
135 *(unsigned long *)mem = 0xFFFF; 136 *(unsigned long *)mem = 0xFFFF;
136 break; 137 break;
137#endif
138 } 138 }
139#endif
139 return dbg_reg_def[regno].name; 140 return dbg_reg_def[regno].name;
140} 141}
141 142
@@ -277,7 +278,7 @@ static int hw_break_release_slot(int breakno)
277 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); 278 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
278 if (dbg_release_bp_slot(*pevent)) 279 if (dbg_release_bp_slot(*pevent))
279 /* 280 /*
280 * The debugger is responisble for handing the retry on 281 * The debugger is responsible for handing the retry on
281 * remove failure. 282 * remove failure.
282 */ 283 */
283 return -1; 284 return -1;
@@ -315,14 +316,18 @@ static void kgdb_remove_all_hw_break(void)
315 if (!breakinfo[i].enabled) 316 if (!breakinfo[i].enabled)
316 continue; 317 continue;
317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 318 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
318 if (bp->attr.disabled == 1) 319 if (!bp->attr.disabled) {
320 arch_uninstall_hw_breakpoint(bp);
321 bp->attr.disabled = 1;
319 continue; 322 continue;
323 }
320 if (dbg_is_early) 324 if (dbg_is_early)
321 early_dr7 &= ~encode_dr7(i, breakinfo[i].len, 325 early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
322 breakinfo[i].type); 326 breakinfo[i].type);
323 else 327 else if (hw_break_release_slot(i))
324 arch_uninstall_hw_breakpoint(bp); 328 printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n",
325 bp->attr.disabled = 1; 329 breakinfo[i].addr);
330 breakinfo[i].enabled = 0;
326 } 331 }
327} 332}
328 333
@@ -387,7 +392,7 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
387 * disable hardware debugging while it is processing gdb packets or 392 * disable hardware debugging while it is processing gdb packets or
388 * handling exception. 393 * handling exception.
389 */ 394 */
390void kgdb_disable_hw_debug(struct pt_regs *regs) 395static void kgdb_disable_hw_debug(struct pt_regs *regs)
391{ 396{
392 int i; 397 int i;
393 int cpu = raw_smp_processor_id(); 398 int cpu = raw_smp_processor_id();
@@ -477,8 +482,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
477 raw_smp_processor_id()); 482 raw_smp_processor_id());
478 } 483 }
479 484
480 kgdb_correct_hw_break();
481
482 return 0; 485 return 0;
483 } 486 }
484 487
@@ -523,10 +526,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
523 } 526 }
524 return NOTIFY_DONE; 527 return NOTIFY_DONE;
525 528
526 case DIE_NMI_IPI:
527 /* Just ignore, we will handle the roundup on DIE_NMI. */
528 return NOTIFY_DONE;
529
530 case DIE_NMIUNKNOWN: 529 case DIE_NMIUNKNOWN:
531 if (was_in_debug_nmi[raw_smp_processor_id()]) { 530 if (was_in_debug_nmi[raw_smp_processor_id()]) {
532 was_in_debug_nmi[raw_smp_processor_id()] = 0; 531 was_in_debug_nmi[raw_smp_processor_id()] = 0;
@@ -534,15 +533,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
534 } 533 }
535 return NOTIFY_DONE; 534 return NOTIFY_DONE;
536 535
537 case DIE_NMIWATCHDOG:
538 if (atomic_read(&kgdb_active) != -1) {
539 /* KGDB CPU roundup: */
540 kgdb_nmicallback(raw_smp_processor_id(), regs);
541 return NOTIFY_STOP;
542 }
543 /* Enter debugger: */
544 break;
545
546 case DIE_DEBUG: 536 case DIE_DEBUG:
547 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { 537 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
548 if (user_mode(regs)) 538 if (user_mode(regs))
@@ -604,7 +594,7 @@ static struct notifier_block kgdb_notifier = {
604 /* 594 /*
605 * Lowest-prio notifier priority, we want to be notified last: 595 * Lowest-prio notifier priority, we want to be notified last:
606 */ 596 */
607 .priority = -INT_MAX, 597 .priority = NMI_LOCAL_LOW_PRIOR,
608}; 598};
609 599
610/** 600/**
@@ -621,7 +611,12 @@ int kgdb_arch_init(void)
621static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, 611static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
622 struct perf_sample_data *data, struct pt_regs *regs) 612 struct perf_sample_data *data, struct pt_regs *regs)
623{ 613{
624 kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP); 614 struct task_struct *tsk = current;
615 int i;
616
617 for (i = 0; i < 4; i++)
618 if (breakinfo[i].enabled)
619 tsk->thread.debugreg6 |= (DR_TRAP0 << i);
625} 620}
626 621
627void kgdb_arch_late(void) 622void kgdb_arch_late(void)
@@ -644,7 +639,7 @@ void kgdb_arch_late(void)
644 if (breakinfo[i].pev) 639 if (breakinfo[i].pev)
645 continue; 640 continue;
646 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 641 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
647 if (IS_ERR(breakinfo[i].pev)) { 642 if (IS_ERR((void * __force)breakinfo[i].pev)) {
648 printk(KERN_ERR "kgdb: Could not allocate hw" 643 printk(KERN_ERR "kgdb: Could not allocate hw"
649 "breakpoints\nDisabling the kernel debugger\n"); 644 "breakpoints\nDisabling the kernel debugger\n");
650 breakinfo[i].pev = NULL; 645 breakinfo[i].pev = NULL;
@@ -721,6 +716,7 @@ struct kgdb_arch arch_kgdb_ops = {
721 .flags = KGDB_HW_BREAKPOINT, 716 .flags = KGDB_HW_BREAKPOINT,
722 .set_hw_breakpoint = kgdb_set_hw_break, 717 .set_hw_breakpoint = kgdb_set_hw_break,
723 .remove_hw_breakpoint = kgdb_remove_hw_break, 718 .remove_hw_breakpoint = kgdb_remove_hw_break,
719 .disable_hw_break = kgdb_disable_hw_debug,
724 .remove_all_hw_break = kgdb_remove_all_hw_break, 720 .remove_all_hw_break = kgdb_remove_all_hw_break,
725 .correct_hw_break = kgdb_correct_hw_break, 721 .correct_hw_break = kgdb_correct_hw_break,
726}; 722};
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 770ebfb349e9..f1a6244d7d93 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
230 return 0; 230 return 0;
231} 231}
232 232
233/* Dummy buffers for kallsyms_lookup */
234static char __dummy_buf[KSYM_NAME_LEN];
235
236/* Check if paddr is at an instruction boundary */ 233/* Check if paddr is at an instruction boundary */
237static int __kprobes can_probe(unsigned long paddr) 234static int __kprobes can_probe(unsigned long paddr)
238{ 235{
@@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr)
241 struct insn insn; 238 struct insn insn;
242 kprobe_opcode_t buf[MAX_INSN_SIZE]; 239 kprobe_opcode_t buf[MAX_INSN_SIZE];
243 240
244 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) 241 if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
245 return 0; 242 return 0;
246 243
247 /* Decode instructions */ 244 /* Decode instructions */
@@ -406,7 +403,7 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
406 403
407static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) 404static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
408{ 405{
409 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; 406 __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
410 kcb->kprobe_status = kcb->prev_kprobe.status; 407 kcb->kprobe_status = kcb->prev_kprobe.status;
411 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; 408 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
412 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; 409 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
@@ -415,7 +412,7 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
415static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, 412static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
416 struct kprobe_ctlblk *kcb) 413 struct kprobe_ctlblk *kcb)
417{ 414{
418 __get_cpu_var(current_kprobe) = p; 415 __this_cpu_write(current_kprobe, p);
419 kcb->kprobe_saved_flags = kcb->kprobe_old_flags 416 kcb->kprobe_saved_flags = kcb->kprobe_old_flags
420 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); 417 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
421 if (is_IF_modifier(p->ainsn.insn)) 418 if (is_IF_modifier(p->ainsn.insn))
@@ -589,7 +586,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
589 preempt_enable_no_resched(); 586 preempt_enable_no_resched();
590 return 1; 587 return 1;
591 } else if (kprobe_running()) { 588 } else if (kprobe_running()) {
592 p = __get_cpu_var(current_kprobe); 589 p = __this_cpu_read(current_kprobe);
593 if (p->break_handler && p->break_handler(p, regs)) { 590 if (p->break_handler && p->break_handler(p, regs)) {
594 setup_singlestep(p, regs, kcb, 0); 591 setup_singlestep(p, regs, kcb, 0);
595 return 1; 592 return 1;
@@ -762,11 +759,11 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
762 759
763 orig_ret_address = (unsigned long)ri->ret_addr; 760 orig_ret_address = (unsigned long)ri->ret_addr;
764 if (ri->rp && ri->rp->handler) { 761 if (ri->rp && ri->rp->handler) {
765 __get_cpu_var(current_kprobe) = &ri->rp->kp; 762 __this_cpu_write(current_kprobe, &ri->rp->kp);
766 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; 763 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
767 ri->ret_addr = correct_ret_addr; 764 ri->ret_addr = correct_ret_addr;
768 ri->rp->handler(ri, regs); 765 ri->rp->handler(ri, regs);
769 __get_cpu_var(current_kprobe) = NULL; 766 __this_cpu_write(current_kprobe, NULL);
770 } 767 }
771 768
772 recycle_rp_inst(ri, &empty_rp); 769 recycle_rp_inst(ri, &empty_rp);
@@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1129 *(unsigned long *)addr = val; 1126 *(unsigned long *)addr = val;
1130} 1127}
1131 1128
1132void __kprobes kprobes_optinsn_template_holder(void) 1129static void __used __kprobes kprobes_optinsn_template_holder(void)
1133{ 1130{
1134 asm volatile ( 1131 asm volatile (
1135 ".global optprobe_template_entry\n" 1132 ".global optprobe_template_entry\n"
@@ -1186,8 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1186 struct pt_regs *regs) 1183 struct pt_regs *regs)
1187{ 1184{
1188 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 1185 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1186 unsigned long flags;
1189 1187
1190 preempt_disable(); 1188 /* This can happen if op is under delayed unoptimization */
1189 if (kprobe_disabled(&op->kp))
1190 return;
1191
1192 local_irq_save(flags);
1191 if (kprobe_running()) { 1193 if (kprobe_running()) {
1192 kprobes_inc_nmissed_count(&op->kp); 1194 kprobes_inc_nmissed_count(&op->kp);
1193 } else { 1195 } else {
@@ -1201,12 +1203,12 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1201 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; 1203 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1202 regs->orig_ax = ~0UL; 1204 regs->orig_ax = ~0UL;
1203 1205
1204 __get_cpu_var(current_kprobe) = &op->kp; 1206 __this_cpu_write(current_kprobe, &op->kp);
1205 kcb->kprobe_status = KPROBE_HIT_ACTIVE; 1207 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1206 opt_pre_handler(&op->kp, regs); 1208 opt_pre_handler(&op->kp, regs);
1207 __get_cpu_var(current_kprobe) = NULL; 1209 __this_cpu_write(current_kprobe, NULL);
1208 } 1210 }
1209 preempt_enable_no_resched(); 1211 local_irq_restore(flags);
1210} 1212}
1211 1213
1212static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) 1214static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
@@ -1221,7 +1223,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1221 } 1223 }
1222 /* Check whether the address range is reserved */ 1224 /* Check whether the address range is reserved */
1223 if (ftrace_text_reserved(src, src + len - 1) || 1225 if (ftrace_text_reserved(src, src + len - 1) ||
1224 alternatives_text_reserved(src, src + len - 1)) 1226 alternatives_text_reserved(src, src + len - 1) ||
1227 jump_label_text_reserved(src, src + len - 1))
1225 return -EBUSY; 1228 return -EBUSY;
1226 1229
1227 return len; 1230 return len;
@@ -1269,11 +1272,17 @@ static int __kprobes can_optimize(unsigned long paddr)
1269 unsigned long addr, size = 0, offset = 0; 1272 unsigned long addr, size = 0, offset = 0;
1270 struct insn insn; 1273 struct insn insn;
1271 kprobe_opcode_t buf[MAX_INSN_SIZE]; 1274 kprobe_opcode_t buf[MAX_INSN_SIZE];
1272 /* Dummy buffers for lookup_symbol_attrs */
1273 static char __dummy_buf[KSYM_NAME_LEN];
1274 1275
1275 /* Lookup symbol including addr */ 1276 /* Lookup symbol including addr */
1276 if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) 1277 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1278 return 0;
1279
1280 /*
1281 * Do not optimize in the entry code due to the unstable
1282 * stack handling.
1283 */
1284 if ((paddr >= (unsigned long )__entry_text_start) &&
1285 (paddr < (unsigned long )__entry_text_end))
1277 return 0; 1286 return 0;
1278 1287
1279 /* Check there is enough space for a relative jump. */ 1288 /* Check there is enough space for a relative jump. */
@@ -1405,10 +1414,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1405 return 0; 1414 return 0;
1406} 1415}
1407 1416
1408/* Replace a breakpoint (int3) with a relative jump. */ 1417#define MAX_OPTIMIZE_PROBES 256
1409int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) 1418static struct text_poke_param *jump_poke_params;
1419static struct jump_poke_buffer {
1420 u8 buf[RELATIVEJUMP_SIZE];
1421} *jump_poke_bufs;
1422
1423static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
1424 u8 *insn_buf,
1425 struct optimized_kprobe *op)
1410{ 1426{
1411 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1412 s32 rel = (s32)((long)op->optinsn.insn - 1427 s32 rel = (s32)((long)op->optinsn.insn -
1413 ((long)op->kp.addr + RELATIVEJUMP_SIZE)); 1428 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1414 1429
@@ -1416,16 +1431,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1416 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, 1431 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1417 RELATIVE_ADDR_SIZE); 1432 RELATIVE_ADDR_SIZE);
1418 1433
1419 jmp_code[0] = RELATIVEJUMP_OPCODE; 1434 insn_buf[0] = RELATIVEJUMP_OPCODE;
1420 *(s32 *)(&jmp_code[1]) = rel; 1435 *(s32 *)(&insn_buf[1]) = rel;
1436
1437 tprm->addr = op->kp.addr;
1438 tprm->opcode = insn_buf;
1439 tprm->len = RELATIVEJUMP_SIZE;
1440}
1441
1442/*
1443 * Replace breakpoints (int3) with relative jumps.
1444 * Caller must hold kprobe_mutex and text_mutex.
1445 */
1446void __kprobes arch_optimize_kprobes(struct list_head *oplist)
1447{
1448 struct optimized_kprobe *op, *tmp;
1449 int c = 0;
1450
1451 list_for_each_entry_safe(op, tmp, oplist, list) {
1452 WARN_ON(kprobe_disabled(&op->kp));
1453 /* Setup param */
1454 setup_optimize_kprobe(&jump_poke_params[c],
1455 jump_poke_bufs[c].buf, op);
1456 list_del_init(&op->list);
1457 if (++c >= MAX_OPTIMIZE_PROBES)
1458 break;
1459 }
1421 1460
1422 /* 1461 /*
1423 * text_poke_smp doesn't support NMI/MCE code modifying. 1462 * text_poke_smp doesn't support NMI/MCE code modifying.
1424 * However, since kprobes itself also doesn't support NMI/MCE 1463 * However, since kprobes itself also doesn't support NMI/MCE
1425 * code probing, it's not a problem. 1464 * code probing, it's not a problem.
1426 */ 1465 */
1427 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); 1466 text_poke_smp_batch(jump_poke_params, c);
1428 return 0; 1467}
1468
1469static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
1470 u8 *insn_buf,
1471 struct optimized_kprobe *op)
1472{
1473 /* Set int3 to first byte for kprobes */
1474 insn_buf[0] = BREAKPOINT_INSTRUCTION;
1475 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1476
1477 tprm->addr = op->kp.addr;
1478 tprm->opcode = insn_buf;
1479 tprm->len = RELATIVEJUMP_SIZE;
1480}
1481
1482/*
1483 * Recover original instructions and breakpoints from relative jumps.
1484 * Caller must hold kprobe_mutex.
1485 */
1486extern void arch_unoptimize_kprobes(struct list_head *oplist,
1487 struct list_head *done_list)
1488{
1489 struct optimized_kprobe *op, *tmp;
1490 int c = 0;
1491
1492 list_for_each_entry_safe(op, tmp, oplist, list) {
1493 /* Setup param */
1494 setup_unoptimize_kprobe(&jump_poke_params[c],
1495 jump_poke_bufs[c].buf, op);
1496 list_move(&op->list, done_list);
1497 if (++c >= MAX_OPTIMIZE_PROBES)
1498 break;
1499 }
1500
1501 /*
1502 * text_poke_smp doesn't support NMI/MCE code modifying.
1503 * However, since kprobes itself also doesn't support NMI/MCE
1504 * code probing, it's not a problem.
1505 */
1506 text_poke_smp_batch(jump_poke_params, c);
1429} 1507}
1430 1508
1431/* Replace a relative jump with a breakpoint (int3). */ 1509/* Replace a relative jump with a breakpoint (int3). */
@@ -1457,11 +1535,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p,
1457 } 1535 }
1458 return 0; 1536 return 0;
1459} 1537}
1538
1539static int __kprobes init_poke_params(void)
1540{
1541 /* Allocate code buffer and parameter array */
1542 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
1543 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1544 if (!jump_poke_bufs)
1545 return -ENOMEM;
1546
1547 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
1548 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1549 if (!jump_poke_params) {
1550 kfree(jump_poke_bufs);
1551 jump_poke_bufs = NULL;
1552 return -ENOMEM;
1553 }
1554
1555 return 0;
1556}
1557#else /* !CONFIG_OPTPROBES */
1558static int __kprobes init_poke_params(void)
1559{
1560 return 0;
1561}
1460#endif 1562#endif
1461 1563
1462int __init arch_init_kprobes(void) 1564int __init arch_init_kprobes(void)
1463{ 1565{
1464 return 0; 1566 return init_poke_params();
1465} 1567}
1466 1568
1467int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1569int __kprobes arch_trampoline_kprobe(struct kprobe *p)
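
The hunk above batches probe optimization: each probe fills a text_poke_param (addr, opcode, len) and a single text_poke_smp_batch() call flushes up to MAX_OPTIMIZE_PROBES of them, instead of one text_poke_smp() per probe. The opcode buffer is a plain x86 near jump whose rel32 displacement is measured from the end of the 5-byte instruction. A minimal user-space sketch of that displacement arithmetic, assuming little-endian byte order as on x86 (the addresses are made up; this is an illustration, not the kernel code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RELATIVEJUMP_OPCODE 0xe9   /* x86 "jmp rel32" */
#define RELATIVEJUMP_SIZE   5      /* opcode byte + 4-byte displacement */

/* Encode "jmp rel32" from probe_addr to detour_addr into a 5-byte buffer. */
static void encode_reljump(uint8_t buf[RELATIVEJUMP_SIZE],
                           unsigned long probe_addr, unsigned long detour_addr)
{
        /* The displacement is relative to the end of the jump instruction. */
        int32_t rel = (int32_t)((long)detour_addr -
                                ((long)probe_addr + RELATIVEJUMP_SIZE));

        buf[0] = RELATIVEJUMP_OPCODE;
        memcpy(&buf[1], &rel, sizeof(rel));     /* little-endian, as x86 encodes it */
}

int main(void)
{
        uint8_t buf[RELATIVEJUMP_SIZE];

        encode_reljump(buf, 0x1000, 0x1200);    /* rel = 0x1200 - 0x1005 = 0x1fb */
        printf("%02x %02x %02x %02x %02x\n",    /* prints: e9 fb 01 00 00 */
               buf[0], buf[1], buf[2], buf[3], buf[4]);
        return 0;
}
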
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8d3d4a..33c07b0b122e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,16 +27,37 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <linux/notifier.h>
31#include <linux/reboot.h>
32#include <linux/hash.h>
33#include <linux/sched.h>
34#include <linux/slab.h>
35#include <linux/kprobes.h>
30#include <asm/timer.h> 36#include <asm/timer.h>
37#include <asm/cpu.h>
38#include <asm/traps.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h>
31 41
32#define MMU_QUEUE_SIZE 1024 42#define MMU_QUEUE_SIZE 1024
33 43
44static int kvmapf = 1;
45
46static int parse_no_kvmapf(char *arg)
47{
48 kvmapf = 0;
49 return 0;
50}
51
52early_param("no-kvmapf", parse_no_kvmapf);
53
34struct kvm_para_state { 54struct kvm_para_state {
35 u8 mmu_queue[MMU_QUEUE_SIZE]; 55 u8 mmu_queue[MMU_QUEUE_SIZE];
36 int mmu_queue_len; 56 int mmu_queue_len;
37}; 57};
38 58
39static DEFINE_PER_CPU(struct kvm_para_state, para_state); 59static DEFINE_PER_CPU(struct kvm_para_state, para_state);
60static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
40 61
41static struct kvm_para_state *kvm_para_state(void) 62static struct kvm_para_state *kvm_para_state(void)
42{ 63{
@@ -50,6 +71,195 @@ static void kvm_io_delay(void)
50{ 71{
51} 72}
52 73
74#define KVM_TASK_SLEEP_HASHBITS 8
75#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
76
77struct kvm_task_sleep_node {
78 struct hlist_node link;
79 wait_queue_head_t wq;
80 u32 token;
81 int cpu;
82 bool halted;
83 struct mm_struct *mm;
84};
85
86static struct kvm_task_sleep_head {
87 spinlock_t lock;
88 struct hlist_head list;
89} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
90
91static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
92 u32 token)
93{
94 struct hlist_node *p;
95
96 hlist_for_each(p, &b->list) {
97 struct kvm_task_sleep_node *n =
98 hlist_entry(p, typeof(*n), link);
99 if (n->token == token)
100 return n;
101 }
102
103 return NULL;
104}
105
106void kvm_async_pf_task_wait(u32 token)
107{
108 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
109 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
110 struct kvm_task_sleep_node n, *e;
111 DEFINE_WAIT(wait);
112 int cpu, idle;
113
114 cpu = get_cpu();
115 idle = idle_cpu(cpu);
116 put_cpu();
117
118 spin_lock(&b->lock);
119 e = _find_apf_task(b, token);
120 if (e) {
121 /* dummy entry exists -> wake-up was delivered ahead of PF */
122 hlist_del(&e->link);
123 kfree(e);
124 spin_unlock(&b->lock);
125 return;
126 }
127
128 n.token = token;
129 n.cpu = smp_processor_id();
130 n.mm = current->active_mm;
131 n.halted = idle || preempt_count() > 1;
132 atomic_inc(&n.mm->mm_count);
133 init_waitqueue_head(&n.wq);
134 hlist_add_head(&n.link, &b->list);
135 spin_unlock(&b->lock);
136
137 for (;;) {
138 if (!n.halted)
139 prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
140 if (hlist_unhashed(&n.link))
141 break;
142
143 if (!n.halted) {
144 local_irq_enable();
145 schedule();
146 local_irq_disable();
147 } else {
148 /*
149 * We cannot reschedule. So halt.
150 */
151 native_safe_halt();
152 local_irq_disable();
153 }
154 }
155 if (!n.halted)
156 finish_wait(&n.wq, &wait);
157
158 return;
159}
160EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
161
162static void apf_task_wake_one(struct kvm_task_sleep_node *n)
163{
164 hlist_del_init(&n->link);
165 if (!n->mm)
166 return;
167 mmdrop(n->mm);
168 if (n->halted)
169 smp_send_reschedule(n->cpu);
170 else if (waitqueue_active(&n->wq))
171 wake_up(&n->wq);
172}
173
174static void apf_task_wake_all(void)
175{
176 int i;
177
178 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
179 struct hlist_node *p, *next;
180 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
181 spin_lock(&b->lock);
182 hlist_for_each_safe(p, next, &b->list) {
183 struct kvm_task_sleep_node *n =
184 hlist_entry(p, typeof(*n), link);
185 if (n->cpu == smp_processor_id())
186 apf_task_wake_one(n);
187 }
188 spin_unlock(&b->lock);
189 }
190}
191
192void kvm_async_pf_task_wake(u32 token)
193{
194 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
195 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
196 struct kvm_task_sleep_node *n;
197
198 if (token == ~0) {
199 apf_task_wake_all();
200 return;
201 }
202
203again:
204 spin_lock(&b->lock);
205 n = _find_apf_task(b, token);
206 if (!n) {
207 /*
208 * The async PF has not been handled yet.
209 * Add a dummy entry for the token.
210 */
211 n = kmalloc(sizeof(*n), GFP_ATOMIC);
212 if (!n) {
213 /*
214 * Allocation failed! Busy-wait while another cpu
215 * handles the async PF.
216 */
217 spin_unlock(&b->lock);
218 cpu_relax();
219 goto again;
220 }
221 n->token = token;
222 n->cpu = smp_processor_id();
223 n->mm = NULL;
224 init_waitqueue_head(&n->wq);
225 hlist_add_head(&n->link, &b->list);
226 } else
227 apf_task_wake_one(n);
228 spin_unlock(&b->lock);
229 return;
230}
231EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
232
233u32 kvm_read_and_reset_pf_reason(void)
234{
235 u32 reason = 0;
236
237 if (__get_cpu_var(apf_reason).enabled) {
238 reason = __get_cpu_var(apf_reason).reason;
239 __get_cpu_var(apf_reason).reason = 0;
240 }
241
242 return reason;
243}
244EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
245
246dotraplinkage void __kprobes
247do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
248{
249 switch (kvm_read_and_reset_pf_reason()) {
250 default:
251 do_page_fault(regs, error_code);
252 break;
253 case KVM_PV_REASON_PAGE_NOT_PRESENT:
254 /* page is swapped out by the host. */
255 kvm_async_pf_task_wait((u32)read_cr2());
256 break;
257 case KVM_PV_REASON_PAGE_READY:
258 kvm_async_pf_task_wake((u32)read_cr2());
259 break;
260 }
261}
262
53static void kvm_mmu_op(void *buffer, unsigned len) 263static void kvm_mmu_op(void *buffer, unsigned len)
54{ 264{
55 int r; 265 int r;
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
231#endif 441#endif
232} 442}
233 443
444void __cpuinit kvm_guest_cpu_init(void)
445{
446 if (!kvm_para_available())
447 return;
448
449 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
450 u64 pa = __pa(&__get_cpu_var(apf_reason));
451
452#ifdef CONFIG_PREEMPT
453 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
454#endif
455 wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
456 __get_cpu_var(apf_reason).enabled = 1;
457 printk(KERN_INFO"KVM setup async PF for cpu %d\n",
458 smp_processor_id());
459 }
460}
461
462static void kvm_pv_disable_apf(void *unused)
463{
464 if (!__get_cpu_var(apf_reason).enabled)
465 return;
466
467 wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
468 __get_cpu_var(apf_reason).enabled = 0;
469
470 printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
471 smp_processor_id());
472}
473
474static int kvm_pv_reboot_notify(struct notifier_block *nb,
475 unsigned long code, void *unused)
476{
477 if (code == SYS_RESTART)
478 on_each_cpu(kvm_pv_disable_apf, NULL, 1);
479 return NOTIFY_DONE;
480}
481
482static struct notifier_block kvm_pv_reboot_nb = {
483 .notifier_call = kvm_pv_reboot_notify,
484};
485
486#ifdef CONFIG_SMP
487static void __init kvm_smp_prepare_boot_cpu(void)
488{
489#ifdef CONFIG_KVM_CLOCK
490 WARN_ON(kvm_register_clock("primary cpu clock"));
491#endif
492 kvm_guest_cpu_init();
493 native_smp_prepare_boot_cpu();
494}
495
496static void __cpuinit kvm_guest_cpu_online(void *dummy)
497{
498 kvm_guest_cpu_init();
499}
500
501static void kvm_guest_cpu_offline(void *dummy)
502{
503 kvm_pv_disable_apf(NULL);
504 apf_task_wake_all();
505}
506
507static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
508 unsigned long action, void *hcpu)
509{
510 int cpu = (unsigned long)hcpu;
511 switch (action) {
512 case CPU_ONLINE:
513 case CPU_DOWN_FAILED:
514 case CPU_ONLINE_FROZEN:
515 smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
516 break;
517 case CPU_DOWN_PREPARE:
518 case CPU_DOWN_PREPARE_FROZEN:
519 smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
520 break;
521 default:
522 break;
523 }
524 return NOTIFY_OK;
525}
526
527static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
528 .notifier_call = kvm_cpu_notify,
529};
530#endif
531
532static void __init kvm_apf_trap_init(void)
533{
534 set_intr_gate(14, &async_page_fault);
535}
536
234void __init kvm_guest_init(void) 537void __init kvm_guest_init(void)
235{ 538{
539 int i;
540
236 if (!kvm_para_available()) 541 if (!kvm_para_available())
237 return; 542 return;
238 543
239 paravirt_ops_setup(); 544 paravirt_ops_setup();
545 register_reboot_notifier(&kvm_pv_reboot_nb);
546 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
547 spin_lock_init(&async_pf_sleepers[i].lock);
548 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
549 x86_init.irqs.trap_init = kvm_apf_trap_init;
550
551#ifdef CONFIG_SMP
552 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
553 register_cpu_notifier(&kvm_cpu_notifier);
554#else
555 kvm_guest_cpu_init();
556#endif
240} 557}
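
The async PF plumbing above keys an array of spinlocked hlist buckets by hash_32(token, KVM_TASK_SLEEP_HASHBITS); a wake-up that arrives before the matching fault leaves a dummy node behind so the fault path knows not to sleep. A stripped-down, single-threaded user-space model of that handshake (no locking or waitqueues; the multiplicative hash merely stands in for hash_32(), and all names below are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TASK_SLEEP_HASHBITS 8
#define TASK_SLEEP_HASHSIZE (1u << TASK_SLEEP_HASHBITS)

struct sleep_node {
        struct sleep_node *next;
        uint32_t token;
        bool dummy;             /* wake-up arrived before the fault did */
};

static struct sleep_node *buckets[TASK_SLEEP_HASHSIZE];

/* Simple multiplicative hash standing in for the kernel's hash_32(). */
static unsigned int bucket_of(uint32_t token)
{
        return (token * 0x9e370001u) >> (32 - TASK_SLEEP_HASHBITS);
}

static struct sleep_node *find_node(uint32_t token)
{
        struct sleep_node *n;

        for (n = buckets[bucket_of(token)]; n; n = n->next)
                if (n->token == token)
                        return n;
        return NULL;
}

/* "Page ready" side: wake the waiter, or leave a dummy if it has not slept yet. */
static void task_wake(uint32_t token)
{
        struct sleep_node *n = find_node(token);

        if (!n) {
                n = calloc(1, sizeof(*n));
                if (!n)
                        return;         /* the kernel busy-waits here instead */
                n->token = token;
                n->dummy = true;
                n->next = buckets[bucket_of(token)];
                buckets[bucket_of(token)] = n;
                return;
        }
        printf("wake waiter for token %u\n", token);
}

/* "Page not present" side: a dummy means the wake-up already happened. */
static void task_wait(uint32_t token)
{
        struct sleep_node *n = find_node(token);

        if (n && n->dummy) {
                printf("token %u already completed, no sleep needed\n", token);
                return;         /* the kernel unlinks and frees the dummy here */
        }
        printf("token %u: would enqueue and sleep here\n", token);
}

int main(void)
{
        task_wake(42);          /* completion delivered first... */
        task_wait(42);          /* ...so the fault path does not sleep */
        return 0;
}
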
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index eb9b76c716c2..6389a6bca11b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -26,8 +26,6 @@
26#include <asm/x86_init.h> 26#include <asm/x86_init.h>
27#include <asm/reboot.h> 27#include <asm/reboot.h>
28 28
29#define KVM_SCALE 22
30
31static int kvmclock = 1; 29static int kvmclock = 1;
32static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; 30static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
33static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; 31static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
@@ -120,21 +118,21 @@ static struct clocksource kvm_clock = {
120 .read = kvm_clock_get_cycles, 118 .read = kvm_clock_get_cycles,
121 .rating = 400, 119 .rating = 400,
122 .mask = CLOCKSOURCE_MASK(64), 120 .mask = CLOCKSOURCE_MASK(64),
123 .mult = 1 << KVM_SCALE,
124 .shift = KVM_SCALE,
125 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 121 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
126}; 122};
127 123
128static int kvm_register_clock(char *txt) 124int kvm_register_clock(char *txt)
129{ 125{
130 int cpu = smp_processor_id(); 126 int cpu = smp_processor_id();
131 int low, high; 127 int low, high, ret;
128
132 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 129 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
133 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 130 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
131 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
134 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 132 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
135 cpu, high, low, txt); 133 cpu, high, low, txt);
136 134
137 return native_write_msr_safe(msr_kvm_system_time, low, high); 135 return ret;
138} 136}
139 137
140#ifdef CONFIG_X86_LOCAL_APIC 138#ifdef CONFIG_X86_LOCAL_APIC
@@ -150,14 +148,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
150} 148}
151#endif 149#endif
152 150
153#ifdef CONFIG_SMP
154static void __init kvm_smp_prepare_boot_cpu(void)
155{
156 WARN_ON(kvm_register_clock("primary cpu clock"));
157 native_smp_prepare_boot_cpu();
158}
159#endif
160
161/* 151/*
162 * After the clock is registered, the host will keep writing to the 152 * After the clock is registered, the host will keep writing to the
163 * registered memory location. If the guest happens to shutdown, this memory 153 * registered memory location. If the guest happens to shutdown, this memory
@@ -204,15 +194,12 @@ void __init kvmclock_init(void)
204 x86_cpuinit.setup_percpu_clockev = 194 x86_cpuinit.setup_percpu_clockev =
205 kvm_setup_secondary_clock; 195 kvm_setup_secondary_clock;
206#endif 196#endif
207#ifdef CONFIG_SMP
208 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
209#endif
210 machine_ops.shutdown = kvm_shutdown; 197 machine_ops.shutdown = kvm_shutdown;
211#ifdef CONFIG_KEXEC 198#ifdef CONFIG_KEXEC
212 machine_ops.crash_shutdown = kvm_crash_shutdown; 199 machine_ops.crash_shutdown = kvm_crash_shutdown;
213#endif 200#endif
214 kvm_get_preset_lpj(); 201 kvm_get_preset_lpj();
215 clocksource_register(&kvm_clock); 202 clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
216 pv_info.paravirt_enabled = 1; 203 pv_info.paravirt_enabled = 1;
217 pv_info.name = "KVM"; 204 pv_info.name = "KVM";
218 205
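
kvm_register_clock() above splits the per-cpu hv_clock physical address into the two 32-bit MSR halves, ORing 1 into the low half before writing MSR_KVM_SYSTEM_TIME, and the driver now lets clocksource_register_hz() derive mult/shift instead of hard-coding KVM_SCALE. A small sketch of the address split (the address is made up; calling bit 0 the enable flag is the conventional reading of the ORed-in 1, not something stated in the hunk):

#include <stdint.h>
#include <stdio.h>

/* Split a 64-bit physical address into the MSR's low/high halves, setting bit 0. */
static void split_system_time_pa(uint64_t pa, uint32_t *low, uint32_t *high)
{
        *low  = (uint32_t)pa | 1;       /* the patch ORs in 1 (enable bit) */
        *high = (uint32_t)(pa >> 32);
}

int main(void)
{
        uint32_t low, high;

        split_system_time_pa(0x123456789000ULL, &low, &high);
        printf("msr %x:%x\n", (unsigned)high, (unsigned)low);  /* high:low, as the kernel prints */
        return 0;
}
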
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 035c8c529181..b3ea9db39db6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
36 if (!page) 36 if (!page)
37 goto out; 37 goto out;
38 pud = (pud_t *)page_address(page); 38 pud = (pud_t *)page_address(page);
39 memset(pud, 0, PAGE_SIZE); 39 clear_page(pud);
40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); 40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
41 } 41 }
42 pud = pud_offset(pgd, addr); 42 pud = pud_offset(pgd, addr);
@@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
45 if (!page) 45 if (!page)
46 goto out; 46 goto out;
47 pmd = (pmd_t *)page_address(page); 47 pmd = (pmd_t *)page_address(page);
48 memset(pmd, 0, PAGE_SIZE); 48 clear_page(pmd);
49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
50 } 50 }
51 pmd = pmd_offset(pud, addr); 51 pmd = pmd_offset(pud, addr);
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 63eaf6596233..177183cbb6ae 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -259,7 +259,7 @@ static int __init mca_init(void)
259 /* 259 /*
260 * WARNING: Be careful when making changes here. Putting an adapter 260 * WARNING: Be careful when making changes here. Putting an adapter
261 * and the motherboard simultaneously into setup mode may result in 261 * and the motherboard simultaneously into setup mode may result in
262 * damage to chips (according to The Indispensible PC Hardware Book 262 * damage to chips (according to The Indispensable PC Hardware Book
263 * by Hans-Peter Messmer). Also, we disable system interrupts (so 263 * by Hans-Peter Messmer). Also, we disable system interrupts (so
264 * that we are not disturbed in the middle of this). 264 * that we are not disturbed in the middle of this).
265 */ 265 */
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index e1af7c055c7d..c5610384ab16 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,7 +66,6 @@ struct microcode_amd {
66 unsigned int mpb[0]; 66 unsigned int mpb[0];
67}; 67};
68 68
69#define UCODE_MAX_SIZE 2048
70#define UCODE_CONTAINER_SECTION_HDR 8 69#define UCODE_CONTAINER_SECTION_HDR 8
71#define UCODE_CONTAINER_HEADER_SIZE 12 70#define UCODE_CONTAINER_HEADER_SIZE 12
72 71
@@ -77,20 +76,20 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
77 struct cpuinfo_x86 *c = &cpu_data(cpu); 76 struct cpuinfo_x86 *c = &cpu_data(cpu);
78 u32 dummy; 77 u32 dummy;
79 78
80 memset(csig, 0, sizeof(*csig));
81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 79 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " 80 pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
83 "supported\n", cpu, c->x86);
84 return -1; 81 return -1;
85 } 82 }
83
86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 84 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); 85 pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
86
88 return 0; 87 return 0;
89} 88}
90 89
91static int get_matching_microcode(int cpu, void *mc, int rev) 90static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
91 int rev)
92{ 92{
93 struct microcode_header_amd *mc_header = mc;
94 unsigned int current_cpu_id; 93 unsigned int current_cpu_id;
95 u16 equiv_cpu_id = 0; 94 u16 equiv_cpu_id = 0;
96 unsigned int i = 0; 95 unsigned int i = 0;
@@ -109,17 +108,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
109 if (!equiv_cpu_id) 108 if (!equiv_cpu_id)
110 return 0; 109 return 0;
111 110
112 if (mc_header->processor_rev_id != equiv_cpu_id) 111 if (mc_hdr->processor_rev_id != equiv_cpu_id)
113 return 0; 112 return 0;
114 113
115 /* ucode might be chipset specific -- currently we don't support this */ 114 /* ucode might be chipset specific -- currently we don't support this */
116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 115 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
117 pr_err("CPU%d: loading of chipset specific code not yet supported\n", 116 pr_err("CPU%d: chipset specific code not yet supported\n",
118 cpu); 117 cpu);
119 return 0; 118 return 0;
120 } 119 }
121 120
122 if (mc_header->patch_id <= rev) 121 if (mc_hdr->patch_id <= rev)
123 return 0; 122 return 0;
124 123
125 return 1; 124 return 1;
@@ -144,85 +143,93 @@ static int apply_microcode_amd(int cpu)
144 143
145 /* check current patch id and patch's id for match */ 144 /* check current patch id and patch's id for match */
146 if (rev != mc_amd->hdr.patch_id) { 145 if (rev != mc_amd->hdr.patch_id) {
147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n", 146 pr_err("CPU%d: update failed for patch_level=0x%08x\n",
148 cpu, mc_amd->hdr.patch_id); 147 cpu, mc_amd->hdr.patch_id);
149 return -1; 148 return -1;
150 } 149 }
151 150
152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); 151 pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
153 uci->cpu_sig.rev = rev; 152 uci->cpu_sig.rev = rev;
154 153
155 return 0; 154 return 0;
156} 155}
157 156
158static int get_ucode_data(void *to, const u8 *from, size_t n) 157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
159{ 158{
160 memcpy(to, from, n); 159 struct cpuinfo_x86 *c = &cpu_data(cpu);
161 return 0; 160 unsigned int max_size, actual_size;
161
162#define F1XH_MPB_MAX_SIZE 2048
163#define F14H_MPB_MAX_SIZE 1824
164#define F15H_MPB_MAX_SIZE 4096
165
166 switch (c->x86) {
167 case 0x14:
168 max_size = F14H_MPB_MAX_SIZE;
169 break;
170 case 0x15:
171 max_size = F15H_MPB_MAX_SIZE;
172 break;
173 default:
174 max_size = F1XH_MPB_MAX_SIZE;
175 break;
176 }
177
178 actual_size = buf[4] + (buf[5] << 8);
179
180 if (actual_size > size || actual_size > max_size) {
181 pr_err("section size mismatch\n");
182 return 0;
183 }
184
185 return actual_size;
162} 186}
163 187
164static void * 188static struct microcode_header_amd *
165get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) 189get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
166{ 190{
167 unsigned int total_size; 191 struct microcode_header_amd *mc = NULL;
168 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 192 unsigned int actual_size = 0;
169 void *mc;
170 193
171 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) 194 if (buf[0] != UCODE_UCODE_TYPE) {
172 return NULL; 195 pr_err("invalid type field in container file section header\n");
173 196 goto out;
174 if (section_hdr[0] != UCODE_UCODE_TYPE) {
175 pr_err("error: invalid type field in container file section header\n");
176 return NULL;
177 } 197 }
178 198
179 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 199 actual_size = verify_ucode_size(cpu, buf, size);
200 if (!actual_size)
201 goto out;
180 202
181 if (total_size > size || total_size > UCODE_MAX_SIZE) { 203 mc = vzalloc(actual_size);
182 pr_err("error: size mismatch\n"); 204 if (!mc)
183 return NULL; 205 goto out;
184 }
185 206
186 mc = vmalloc(UCODE_MAX_SIZE); 207 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
187 if (mc) { 208 *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
188 memset(mc, 0, UCODE_MAX_SIZE); 209
189 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, 210out:
190 total_size)) {
191 vfree(mc);
192 mc = NULL;
193 } else
194 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
195 }
196 return mc; 211 return mc;
197} 212}
198 213
199static int install_equiv_cpu_table(const u8 *buf) 214static int install_equiv_cpu_table(const u8 *buf)
200{ 215{
201 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 216 unsigned int *ibuf = (unsigned int *)buf;
202 unsigned int *buf_pos = (unsigned int *)container_hdr; 217 unsigned int type = ibuf[1];
203 unsigned long size; 218 unsigned int size = ibuf[2];
204 219
205 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) 220 if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
206 return 0; 221 pr_err("empty section/"
207 222 "invalid type field in container file section header\n");
208 size = buf_pos[2]; 223 return -EINVAL;
209
210 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
211 pr_err("error: invalid type field in container file section header\n");
212 return 0;
213 } 224 }
214 225
215 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 226 equiv_cpu_table = vmalloc(size);
216 if (!equiv_cpu_table) { 227 if (!equiv_cpu_table) {
217 pr_err("failed to allocate equivalent CPU table\n"); 228 pr_err("failed to allocate equivalent CPU table\n");
218 return 0; 229 return -ENOMEM;
219 } 230 }
220 231
221 buf += UCODE_CONTAINER_HEADER_SIZE; 232 get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
222 if (get_ucode_data(equiv_cpu_table, buf, size)) {
223 vfree(equiv_cpu_table);
224 return 0;
225 }
226 233
227 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 234 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
228} 235}
@@ -237,16 +244,16 @@ static enum ucode_state
237generic_load_microcode(int cpu, const u8 *data, size_t size) 244generic_load_microcode(int cpu, const u8 *data, size_t size)
238{ 245{
239 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 246 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
247 struct microcode_header_amd *mc_hdr = NULL;
248 unsigned int mc_size, leftover;
249 int offset;
240 const u8 *ucode_ptr = data; 250 const u8 *ucode_ptr = data;
241 void *new_mc = NULL; 251 void *new_mc = NULL;
242 void *mc; 252 unsigned int new_rev = uci->cpu_sig.rev;
243 int new_rev = uci->cpu_sig.rev;
244 unsigned int leftover;
245 unsigned long offset;
246 enum ucode_state state = UCODE_OK; 253 enum ucode_state state = UCODE_OK;
247 254
248 offset = install_equiv_cpu_table(ucode_ptr); 255 offset = install_equiv_cpu_table(ucode_ptr);
249 if (!offset) { 256 if (offset < 0) {
250 pr_err("failed to create equivalent cpu table\n"); 257 pr_err("failed to create equivalent cpu table\n");
251 return UCODE_ERROR; 258 return UCODE_ERROR;
252 } 259 }
@@ -255,64 +262,65 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
255 leftover = size - offset; 262 leftover = size - offset;
256 263
257 while (leftover) { 264 while (leftover) {
258 unsigned int uninitialized_var(mc_size); 265 mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
259 struct microcode_header_amd *mc_header; 266 if (!mc_hdr)
260
261 mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
262 if (!mc)
263 break; 267 break;
264 268
265 mc_header = (struct microcode_header_amd *)mc; 269 if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
266 if (get_matching_microcode(cpu, mc, new_rev)) {
267 vfree(new_mc); 270 vfree(new_mc);
268 new_rev = mc_header->patch_id; 271 new_rev = mc_hdr->patch_id;
269 new_mc = mc; 272 new_mc = mc_hdr;
270 } else 273 } else
271 vfree(mc); 274 vfree(mc_hdr);
272 275
273 ucode_ptr += mc_size; 276 ucode_ptr += mc_size;
274 leftover -= mc_size; 277 leftover -= mc_size;
275 } 278 }
276 279
277 if (new_mc) { 280 if (!new_mc) {
278 if (!leftover) {
279 vfree(uci->mc);
280 uci->mc = new_mc;
281 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
282 cpu, new_rev, uci->cpu_sig.rev);
283 } else {
284 vfree(new_mc);
285 state = UCODE_ERROR;
286 }
287 } else
288 state = UCODE_NFOUND; 281 state = UCODE_NFOUND;
282 goto free_table;
283 }
289 284
285 if (!leftover) {
286 vfree(uci->mc);
287 uci->mc = new_mc;
288 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
289 cpu, uci->cpu_sig.rev, new_rev);
290 } else {
291 vfree(new_mc);
292 state = UCODE_ERROR;
293 }
294
295free_table:
290 free_equiv_cpu_table(); 296 free_equiv_cpu_table();
291 297
292 return state; 298 return state;
293} 299}
294 300
295static enum ucode_state request_microcode_fw(int cpu, struct device *device) 301static enum ucode_state request_microcode_amd(int cpu, struct device *device)
296{ 302{
297 const char *fw_name = "amd-ucode/microcode_amd.bin"; 303 const char *fw_name = "amd-ucode/microcode_amd.bin";
298 const struct firmware *firmware; 304 const struct firmware *fw;
299 enum ucode_state ret; 305 enum ucode_state ret = UCODE_NFOUND;
300 306
301 if (request_firmware(&firmware, fw_name, device)) { 307 if (request_firmware(&fw, fw_name, device)) {
302 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 308 pr_err("failed to load file %s\n", fw_name);
303 return UCODE_NFOUND; 309 goto out;
304 } 310 }
305 311
306 if (*(u32 *)firmware->data != UCODE_MAGIC) { 312 ret = UCODE_ERROR;
307 pr_err("invalid UCODE_MAGIC (0x%08x)\n", 313 if (*(u32 *)fw->data != UCODE_MAGIC) {
308 *(u32 *)firmware->data); 314 pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
309 return UCODE_ERROR; 315 goto fw_release;
310 } 316 }
311 317
312 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 318 ret = generic_load_microcode(cpu, fw->data, fw->size);
313 319
314 release_firmware(firmware); 320fw_release:
321 release_firmware(fw);
315 322
323out:
316 return ret; 324 return ret;
317} 325}
318 326
@@ -333,7 +341,7 @@ static void microcode_fini_cpu_amd(int cpu)
333 341
334static struct microcode_ops microcode_amd_ops = { 342static struct microcode_ops microcode_amd_ops = {
335 .request_microcode_user = request_microcode_user, 343 .request_microcode_user = request_microcode_user,
336 .request_microcode_fw = request_microcode_fw, 344 .request_microcode_fw = request_microcode_amd,
337 .collect_cpu_info = collect_cpu_info_amd, 345 .collect_cpu_info = collect_cpu_info_amd,
338 .apply_microcode = apply_microcode_amd, 346 .apply_microcode = apply_microcode_amd,
339 .microcode_fini_cpu = microcode_fini_cpu_amd, 347 .microcode_fini_cpu = microcode_fini_cpu_amd,
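
verify_ucode_size() above caps each patch at a per-family maximum (1824 bytes for family 0x14, 4096 for 0x15, 2048 otherwise) and takes the actual size from bytes 4 and 5 of the section header, little-endian. A user-space sketch of that check, with the constants copied from the hunk and nothing else of the header format modeled:

#include <stdint.h>
#include <stdio.h>

#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
#define F15H_MPB_MAX_SIZE 4096

/* Return the patch size, or 0 if it exceeds the family limit or the remaining buffer. */
static unsigned int verify_size(unsigned int family, const uint8_t *buf,
                                unsigned int buf_size)
{
        unsigned int max_size, actual_size;

        switch (family) {
        case 0x14: max_size = F14H_MPB_MAX_SIZE; break;
        case 0x15: max_size = F15H_MPB_MAX_SIZE; break;
        default:   max_size = F1XH_MPB_MAX_SIZE; break;
        }

        /* bytes 4 and 5 of the section header hold the size, little-endian */
        actual_size = buf[4] + (buf[5] << 8);

        if (actual_size > buf_size || actual_size > max_size)
                return 0;
        return actual_size;
}

int main(void)
{
        /* buf_size models bytes left in the container, not this array's length */
        uint8_t hdr[8] = { 0, 0, 0, 0, 0x00, 0x08, 0, 0 };      /* size = 0x0800 */

        printf("family 0x15: %u\n", verify_size(0x15, hdr, 4096)); /* 2048 */
        printf("family 0x14: %u\n", verify_size(0x14, hdr, 4096)); /* 0: over 1824 */
        return 0;
}
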
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fa6551d36c10..f9242800bc84 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -12,7 +12,7 @@
12 * Software Developer's Manual 12 * Software Developer's Manual
13 * Order Number 253668 or free download from: 13 * Order Number 253668 or free download from:
14 * 14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm 15 * http://developer.intel.com/Assets/PDF/manual/253668.pdf
16 * 16 *
17 * For more information, go to http://www.urbanmyth.org/microcode 17 * For more information, go to http://www.urbanmyth.org/microcode
18 * 18 *
@@ -82,6 +82,7 @@
82#include <linux/cpu.h> 82#include <linux/cpu.h>
83#include <linux/fs.h> 83#include <linux/fs.h>
84#include <linux/mm.h> 84#include <linux/mm.h>
85#include <linux/syscore_ops.h>
85 86
86#include <asm/microcode.h> 87#include <asm/microcode.h>
87#include <asm/processor.h> 88#include <asm/processor.h>
@@ -232,6 +233,7 @@ static const struct file_operations microcode_fops = {
232 .owner = THIS_MODULE, 233 .owner = THIS_MODULE,
233 .write = microcode_write, 234 .write = microcode_write,
234 .open = microcode_open, 235 .open = microcode_open,
236 .llseek = no_llseek,
235}; 237};
236 238
237static struct miscdevice microcode_dev = { 239static struct miscdevice microcode_dev = {
@@ -416,8 +418,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
416 if (err) 418 if (err)
417 return err; 419 return err;
418 420
419 if (microcode_init_cpu(cpu) == UCODE_ERROR) 421 if (microcode_init_cpu(cpu) == UCODE_ERROR) {
420 err = -EINVAL; 422 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
423 return -EINVAL;
424 }
421 425
422 return err; 426 return err;
423} 427}
@@ -435,33 +439,25 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
435 return 0; 439 return 0;
436} 440}
437 441
438static int mc_sysdev_resume(struct sys_device *dev) 442static struct sysdev_driver mc_sysdev_driver = {
443 .add = mc_sysdev_add,
444 .remove = mc_sysdev_remove,
445};
446
447/**
448 * mc_bp_resume - Update boot CPU microcode during resume.
449 */
450static void mc_bp_resume(void)
439{ 451{
440 int cpu = dev->id; 452 int cpu = smp_processor_id();
441 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 453 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
442 454
443 if (!cpu_online(cpu))
444 return 0;
445
446 /*
447 * All non-bootup cpus are still disabled,
448 * so only CPU 0 will apply ucode here.
449 *
450 * Moreover, there can be no concurrent
451 * updates from any other places at this point.
452 */
453 WARN_ON(cpu != 0);
454
455 if (uci->valid && uci->mc) 455 if (uci->valid && uci->mc)
456 microcode_ops->apply_microcode(cpu); 456 microcode_ops->apply_microcode(cpu);
457
458 return 0;
459} 457}
460 458
461static struct sysdev_driver mc_sysdev_driver = { 459static struct syscore_ops mc_syscore_ops = {
462 .add = mc_sysdev_add, 460 .resume = mc_bp_resume,
463 .remove = mc_sysdev_remove,
464 .resume = mc_sysdev_resume,
465}; 461};
466 462
467static __cpuinit int 463static __cpuinit int
@@ -539,6 +535,7 @@ static int __init microcode_init(void)
539 if (error) 535 if (error)
540 return error; 536 return error;
541 537
538 register_syscore_ops(&mc_syscore_ops);
542 register_hotcpu_notifier(&mc_cpu_notifier); 539 register_hotcpu_notifier(&mc_cpu_notifier);
543 540
544 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 541 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
@@ -553,6 +550,7 @@ static void __exit microcode_exit(void)
553 microcode_dev_exit(); 550 microcode_dev_exit();
554 551
555 unregister_hotcpu_notifier(&mc_cpu_notifier); 552 unregister_hotcpu_notifier(&mc_cpu_notifier);
553 unregister_syscore_ops(&mc_syscore_ops);
556 554
557 get_online_cpus(); 555 get_online_cpus();
558 mutex_lock(&microcode_mutex); 556 mutex_lock(&microcode_mutex);
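
The core driver above moves the resume-time reapplication of cached microcode from a per-device sysdev callback to a syscore resume hook, which runs with only the boot CPU online. A rough user-space analog of the pattern (struct syscore_ops and register_syscore_ops() are the real interfaces named in the hunk; everything below just illustrates an ops table with a resume function pointer, not kernel code):

#include <stdio.h>

/* User-space analog of a syscore-style ops table with only a resume hook. */
struct syscore_like_ops {
        void (*resume)(void);
};

static void mc_bp_resume_demo(void)
{
        /* stands in for "reapply the cached update on the boot CPU" */
        printf("resume: reapplying cached update\n");
}

static struct syscore_like_ops mc_ops = { .resume = mc_bp_resume_demo };

int main(void)
{
        /* the registration/invocation that register_syscore_ops() would hide */
        if (mc_ops.resume)
                mc_ops.resume();
        return 0;
}
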
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 356170262a93..1a1b606d3e92 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -12,7 +12,7 @@
12 * Software Developer's Manual 12 * Software Developer's Manual
13 * Order Number 253668 or free download from: 13 * Order Number 253668 or free download from:
14 * 14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm 15 * http://developer.intel.com/Assets/PDF/manual/253668.pdf
16 * 16 *
17 * For more information, go to http://www.urbanmyth.org/microcode 17 * For more information, go to http://www.urbanmyth.org/microcode
18 * 18 *
@@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
364 364
365 /* For performance reasons, reuse mc area when possible */ 365 /* For performance reasons, reuse mc area when possible */
366 if (!mc || mc_size > curr_mc_size) { 366 if (!mc || mc_size > curr_mc_size) {
367 if (mc) 367 vfree(mc);
368 vfree(mc);
369 mc = vmalloc(mc_size); 368 mc = vmalloc(mc_size);
370 if (!mc) 369 if (!mc)
371 break; 370 break;
@@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
374 373
375 if (get_ucode_data(mc, ucode_ptr, mc_size) || 374 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
376 microcode_sanity_check(mc) < 0) { 375 microcode_sanity_check(mc) < 0) {
377 vfree(mc);
378 break; 376 break;
379 } 377 }
380 378
381 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { 379 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
382 if (new_mc) 380 vfree(new_mc);
383 vfree(new_mc);
384 new_rev = mc_header.rev; 381 new_rev = mc_header.rev;
385 new_mc = mc; 382 new_mc = mc;
386 mc = NULL; /* trigger new vmalloc */ 383 mc = NULL; /* trigger new vmalloc */
@@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
390 leftover -= mc_size; 387 leftover -= mc_size;
391 } 388 }
392 389
393 if (mc) 390 vfree(mc);
394 vfree(mc);
395 391
396 if (leftover) { 392 if (leftover) {
397 if (new_mc) 393 vfree(new_mc);
398 vfree(new_mc);
399 state = UCODE_ERROR; 394 state = UCODE_ERROR;
400 goto out; 395 goto out;
401 } 396 }
@@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 goto out; 400 goto out;
406 } 401 }
407 402
408 if (uci->mc) 403 vfree(uci->mc);
409 vfree(uci->mc);
410 uci->mc = (struct microcode_intel *)new_mc; 404 uci->mc = (struct microcode_intel *)new_mc;
411 405
412 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", 406 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
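
The cleanups above drop the if (x) checks before vfree() because vfree(NULL) is a no-op, which is what makes the unconditional calls safe; standard C free() follows the same convention, as this one-liner shows:

#include <stdlib.h>

int main(void)
{
        void *p = NULL;

        free(p);        /* free(NULL) is defined to do nothing, like vfree(NULL) */
        return 0;
}
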
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 71825806cd44..ac861b8348e2 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -25,7 +25,6 @@ struct pci_hostbridge_probe {
25}; 25};
26 26
27static u64 __cpuinitdata fam10h_pci_mmconf_base; 27static u64 __cpuinitdata fam10h_pci_mmconf_base;
28static int __cpuinitdata fam10h_pci_mmconf_base_status;
29 28
30static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { 29static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
31 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, 30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
@@ -44,10 +43,12 @@ static int __cpuinit cmp_range(const void *x1, const void *x2)
44 return start1 - start2; 43 return start1 - start2;
45} 44}
46 45
47/*[47:0] */ 46#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
48/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ 47#define MMCONF_MASK (~(MMCONF_UNIT - 1))
48#define MMCONF_SIZE (MMCONF_UNIT << 8)
49/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
49#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) 50#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
50#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) 51#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
51static void __cpuinit get_fam10h_pci_mmconf_base(void) 52static void __cpuinit get_fam10h_pci_mmconf_base(void)
52{ 53{
53 int i; 54 int i;
@@ -64,12 +65,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
64 struct range range[8]; 65 struct range range[8];
65 66
66 /* only try to get setting from BSP */ 67 /* only try to get setting from BSP */
67 /* -1 or 1 */ 68 if (fam10h_pci_mmconf_base)
68 if (fam10h_pci_mmconf_base_status)
69 return; 69 return;
70 70
71 if (!early_pci_allowed()) 71 if (!early_pci_allowed())
72 goto fail; 72 return;
73 73
74 found = 0; 74 found = 0;
75 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 75 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
@@ -91,7 +91,7 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
91 } 91 }
92 92
93 if (!found) 93 if (!found)
94 goto fail; 94 return;
95 95
96 /* SYS_CFG */ 96 /* SYS_CFG */
97 address = MSR_K8_SYSCFG; 97 address = MSR_K8_SYSCFG;
@@ -99,16 +99,16 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
99 99
100 /* TOP_MEM2 is not enabled? */ 100 /* TOP_MEM2 is not enabled? */
101 if (!(val & (1<<21))) { 101 if (!(val & (1<<21))) {
102 tom2 = 0; 102 tom2 = 1ULL << 32;
103 } else { 103 } else {
104 /* TOP_MEM2 */ 104 /* TOP_MEM2 */
105 address = MSR_K8_TOP_MEM2; 105 address = MSR_K8_TOP_MEM2;
106 rdmsrl(address, val); 106 rdmsrl(address, val);
107 tom2 = val & (0xffffULL<<32); 107 tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
108 } 108 }
109 109
110 if (base <= tom2) 110 if (base <= tom2)
111 base = tom2 + (1ULL<<32); 111 base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
112 112
113 /* 113 /*
114 * need to check if the range is in the high mmio range that is 114 * need to check if the range is in the high mmio range that is
@@ -123,11 +123,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
123 if (!(reg & 3)) 123 if (!(reg & 3))
124 continue; 124 continue;
125 125
126 start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ 126 start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/
127 reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); 127 reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
128 end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ 128 end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/
129 129
130 if (!end) 130 if (end < tom2)
131 continue; 131 continue;
132 132
133 range[hi_mmio_num].start = start; 133 range[hi_mmio_num].start = start;
@@ -143,32 +143,27 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
143 143
144 if (range[hi_mmio_num - 1].end < base) 144 if (range[hi_mmio_num - 1].end < base)
145 goto out; 145 goto out;
146 if (range[0].start > base) 146 if (range[0].start > base + MMCONF_SIZE)
147 goto out; 147 goto out;
148 148
149 /* need to find one window */ 149 /* need to find one window */
150 base = range[0].start - (1ULL << 32); 150 base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT;
151 if ((base > tom2) && BASE_VALID(base)) 151 if ((base > tom2) && BASE_VALID(base))
152 goto out; 152 goto out;
153 base = range[hi_mmio_num - 1].end + (1ULL << 32); 153 base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK;
154 if ((base > tom2) && BASE_VALID(base)) 154 if (BASE_VALID(base))
155 goto out; 155 goto out;
156 /* need to find window between ranges */ 156 /* need to find window between ranges */
157 if (hi_mmio_num > 1) 157 for (i = 1; i < hi_mmio_num; i++) {
158 for (i = 0; i < hi_mmio_num - 1; i++) { 158 base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK;
159 if (range[i + 1].start > (range[i].end + (1ULL << 32))) { 159 val = range[i].start & MMCONF_MASK;
160 base = range[i].end + (1ULL << 32); 160 if (val >= base + MMCONF_SIZE && BASE_VALID(base))
161 if ((base > tom2) && BASE_VALID(base)) 161 goto out;
162 goto out;
163 }
164 } 162 }
165
166fail:
167 fam10h_pci_mmconf_base_status = -1;
168 return; 163 return;
164
169out: 165out:
170 fam10h_pci_mmconf_base = base; 166 fam10h_pci_mmconf_base = base;
171 fam10h_pci_mmconf_base_status = 1;
172} 167}
173 168
174void __cpuinit fam10h_check_enable_mmcfg(void) 169void __cpuinit fam10h_check_enable_mmcfg(void)
@@ -190,11 +185,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
190 185
191 /* only trust the one handle 256 buses, if acpi=off */ 186 /* only trust the one handle 256 buses, if acpi=off */
192 if (!acpi_pci_disabled || busnbits >= 8) { 187 if (!acpi_pci_disabled || busnbits >= 8) {
193 u64 base; 188 u64 base = val & MMCONF_MASK;
194 base = val & (0xffffULL << 32); 189
195 if (fam10h_pci_mmconf_base_status <= 0) { 190 if (!fam10h_pci_mmconf_base) {
196 fam10h_pci_mmconf_base = base; 191 fam10h_pci_mmconf_base = base;
197 fam10h_pci_mmconf_base_status = 1;
198 return; 192 return;
199 } else if (fam10h_pci_mmconf_base == base) 193 } else if (fam10h_pci_mmconf_base == base)
200 return; 194 return;
@@ -206,8 +200,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
206 * with 256 buses 200 * with 256 buses
207 */ 201 */
208 get_fam10h_pci_mmconf_base(); 202 get_fam10h_pci_mmconf_base();
209 if (fam10h_pci_mmconf_base_status <= 0) 203 if (!fam10h_pci_mmconf_base) {
204 pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
210 return; 205 return;
206 }
211 207
212 printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); 208 printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
213 val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | 209 val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
@@ -217,13 +213,13 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
217 wrmsrl(address, val); 213 wrmsrl(address, val);
218} 214}
219 215
220static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) 216static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
221{ 217{
222 pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; 218 pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
223 return 0; 219 return 0;
224} 220}
225 221
226static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { 222static const struct dmi_system_id __initconst mmconf_dmi_table[] = {
227 { 223 {
228 .callback = set_check_enable_amd_mmconf, 224 .callback = set_check_enable_amd_mmconf,
229 .ident = "Sun Microsystems Machine", 225 .ident = "Sun Microsystems Machine",
@@ -234,7 +230,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
234 {} 230 {}
235}; 231};
236 232
237void __cpuinit check_enable_amd_mmconf_dmi(void) 233/* Called from a __cpuinit function, but only on the BSP. */
234void __ref check_enable_amd_mmconf_dmi(void)
238{ 235{
239 dmi_check_system(mmconf_dmi_table); 236 dmi_check_system(mmconf_dmi_table);
240} 237}
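
The rewritten base search above works in units of MMCONF_UNIT (1 << FAM10H_MMIO_CONF_BASE_SHIFT), masks candidates with MMCONF_MASK, and rounds a candidate past an occupied range with (end + 2 * MMCONF_UNIT - 1) & MMCONF_MASK so the 256-bus window clears it by at least one unit. A sketch of that rounding, assuming a shift of 20 (1 MiB per bus, hence the 256 MiB MMCONF_SIZE); the real shift comes from FAM10H_MMIO_CONF_BASE_SHIFT, which is not shown in this hunk:

#include <stdint.h>
#include <stdio.h>

/* Assumed shift of 20: 1 MiB per bus, so a 256 MiB window for 256 buses. */
#define MMIO_CONF_BASE_SHIFT 20
#define MMCONF_UNIT (1ULL << MMIO_CONF_BASE_SHIFT)
#define MMCONF_MASK (~(MMCONF_UNIT - 1))
#define MMCONF_SIZE (MMCONF_UNIT << 8)

/* Round a candidate base up past 'limit' to the next MMCONF_UNIT boundary. */
static uint64_t round_base_above(uint64_t limit)
{
        return (limit + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
}

int main(void)
{
        uint64_t tom2 = (4ULL << 30) + 0x1234;          /* some top-of-memory-2 */
        uint64_t base = round_base_above(tom2);

        printf("base = %#llx, window = [%#llx, %#llx)\n",
               (unsigned long long)base,
               (unsigned long long)base,
               (unsigned long long)(base + MMCONF_SIZE));
        return 0;
}
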
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 1c355c550960..52f256f2cc81 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
24#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/gfp.h> 26#include <linux/gfp.h>
27#include <linux/jump_label.h>
27 28
28#include <asm/system.h> 29#include <asm/system.h>
29#include <asm/page.h> 30#include <asm/page.h>
@@ -37,20 +38,11 @@
37 38
38void *module_alloc(unsigned long size) 39void *module_alloc(unsigned long size)
39{ 40{
40 struct vm_struct *area; 41 if (PAGE_ALIGN(size) > MODULES_LEN)
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL;
47
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
49 if (!area)
50 return NULL; 42 return NULL;
51 43 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
52 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, 44 GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
53 PAGE_KERNEL_EXEC); 45 -1, __builtin_return_address(0));
54} 46}
55 47
56/* Free memory returned from module_alloc */ 48/* Free memory returned from module_alloc */
@@ -239,6 +231,9 @@ int module_finalize(const Elf_Ehdr *hdr,
239 apply_paravirt(pseg, pseg + para->sh_size); 231 apply_paravirt(pseg, pseg + para->sh_size);
240 } 232 }
241 233
234 /* make jump label nops */
235 jump_label_apply_nops(me);
236
242 return 0; 237 return 0;
243} 238}
244 239
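
module_alloc() above now rejects any request whose page-aligned size exceeds MODULES_LEN and delegates the rest to __vmalloc_node_range() over the module VA window. The rejection itself is a plain round-up-and-compare; a sketch with placeholder constants (the real PAGE_SIZE and MODULES_LEN come from the kernel headers, not from here):

#include <stddef.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL
#define DEMO_PAGE_ALIGN(x) (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))
#define DEMO_MODULES_LEN (1536UL * 1024 * 1024)         /* placeholder window size */

/* Mirror of the new bounds check: 0 means the allocation would be refused. */
static int size_ok(size_t size)
{
        return DEMO_PAGE_ALIGN(size) <= DEMO_MODULES_LEN;
}

int main(void)
{
        printf("%d %d\n", size_ok(10), size_ok(DEMO_MODULES_LEN + 1));  /* 1 0 */
        return 0;
}
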
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d7b6f7fb4fec..9103b89c145a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -11,6 +11,7 @@
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/memblock.h>
14#include <linux/kernel_stat.h> 15#include <linux/kernel_stat.h>
15#include <linux/mc146818rtc.h> 16#include <linux/mc146818rtc.h>
16#include <linux/bitops.h> 17#include <linux/bitops.h>
@@ -117,21 +118,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
117 118
118static void __init MP_ioapic_info(struct mpc_ioapic *m) 119static void __init MP_ioapic_info(struct mpc_ioapic *m)
119{ 120{
120 if (!(m->flags & MPC_APIC_USABLE)) 121 if (m->flags & MPC_APIC_USABLE)
121 return; 122 mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
122
123 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
124 m->apicid, m->apicver, m->apicaddr);
125
126 mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
127}
128
129static void print_MP_intsrc_info(struct mpc_intsrc *m)
130{
131 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
132 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
133 m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
134 m->srcbusirq, m->dstapic, m->dstirq);
135} 123}
136 124
137static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) 125static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
@@ -143,73 +131,11 @@ static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
143 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); 131 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
144} 132}
145 133
146static void __init assign_to_mp_irq(struct mpc_intsrc *m,
147 struct mpc_intsrc *mp_irq)
148{
149 mp_irq->dstapic = m->dstapic;
150 mp_irq->type = m->type;
151 mp_irq->irqtype = m->irqtype;
152 mp_irq->irqflag = m->irqflag;
153 mp_irq->srcbus = m->srcbus;
154 mp_irq->srcbusirq = m->srcbusirq;
155 mp_irq->dstirq = m->dstirq;
156}
157
158static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
159 struct mpc_intsrc *m)
160{
161 m->dstapic = mp_irq->dstapic;
162 m->type = mp_irq->type;
163 m->irqtype = mp_irq->irqtype;
164 m->irqflag = mp_irq->irqflag;
165 m->srcbus = mp_irq->srcbus;
166 m->srcbusirq = mp_irq->srcbusirq;
167 m->dstirq = mp_irq->dstirq;
168}
169
170static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
171 struct mpc_intsrc *m)
172{
173 if (mp_irq->dstapic != m->dstapic)
174 return 1;
175 if (mp_irq->type != m->type)
176 return 2;
177 if (mp_irq->irqtype != m->irqtype)
178 return 3;
179 if (mp_irq->irqflag != m->irqflag)
180 return 4;
181 if (mp_irq->srcbus != m->srcbus)
182 return 5;
183 if (mp_irq->srcbusirq != m->srcbusirq)
184 return 6;
185 if (mp_irq->dstirq != m->dstirq)
186 return 7;
187
188 return 0;
189}
190
191static void __init MP_intsrc_info(struct mpc_intsrc *m)
192{
193 int i;
194
195 print_MP_intsrc_info(m);
196
197 for (i = 0; i < mp_irq_entries; i++) {
198 if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
199 return;
200 }
201
202 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
203 if (++mp_irq_entries == MAX_IRQ_SOURCES)
204 panic("Max # of irq sources exceeded!!\n");
205}
206#else /* CONFIG_X86_IO_APIC */ 134#else /* CONFIG_X86_IO_APIC */
207static inline void __init MP_bus_info(struct mpc_bus *m) {} 135static inline void __init MP_bus_info(struct mpc_bus *m) {}
208static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} 136static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
209static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
210#endif /* CONFIG_X86_IO_APIC */ 137#endif /* CONFIG_X86_IO_APIC */
211 138
212
213static void __init MP_lintsrc_info(struct mpc_lintsrc *m) 139static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
214{ 140{
215 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," 141 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
@@ -221,7 +147,6 @@ static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
221/* 147/*
222 * Read/parse the MPC 148 * Read/parse the MPC
223 */ 149 */
224
225static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) 150static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
226{ 151{
227 152
@@ -274,18 +199,6 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
274 199
275void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } 200void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
276 201
277static void __init smp_register_lapic_address(unsigned long address)
278{
279 mp_lapic_addr = address;
280
281 set_fixmap_nocache(FIX_APIC_BASE, address);
282 if (boot_cpu_physical_apicid == -1U) {
283 boot_cpu_physical_apicid = read_apic_id();
284 apic_version[boot_cpu_physical_apicid] =
285 GET_APIC_VERSION(apic_read(APIC_LVR));
286 }
287}
288
289static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 202static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
290{ 203{
291 char str[16]; 204 char str[16];
@@ -300,17 +213,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
300#ifdef CONFIG_X86_32 213#ifdef CONFIG_X86_32
301 generic_mps_oem_check(mpc, oem, str); 214 generic_mps_oem_check(mpc, oem, str);
302#endif 215#endif
303 /* save the local APIC address, it might be non-default */ 216 /* Initialize the lapic mapping */
304 if (!acpi_lapic) 217 if (!acpi_lapic)
305 mp_lapic_addr = mpc->lapic; 218 register_lapic_address(mpc->lapic);
306 219
307 if (early) 220 if (early)
308 return 1; 221 return 1;
309 222
310 /* Initialize the lapic mapping */
311 if (!acpi_lapic)
312 smp_register_lapic_address(mpc->lapic);
313
314 if (mpc->oemptr) 223 if (mpc->oemptr)
315 x86_init.mpparse.smp_read_mpc_oem(mpc); 224 x86_init.mpparse.smp_read_mpc_oem(mpc);
316 225
@@ -336,7 +245,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
336 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); 245 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
337 break; 246 break;
338 case MP_INTSRC: 247 case MP_INTSRC:
339 MP_intsrc_info((struct mpc_intsrc *)mpt); 248 mp_save_irq((struct mpc_intsrc *)mpt);
340 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); 249 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
341 break; 250 break;
342 case MP_LINTSRC: 251 case MP_LINTSRC:
@@ -376,7 +285,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
376 intsrc.type = MP_INTSRC; 285 intsrc.type = MP_INTSRC;
377 intsrc.irqflag = 0; /* conforming */ 286 intsrc.irqflag = 0; /* conforming */
378 intsrc.srcbus = 0; 287 intsrc.srcbus = 0;
379 intsrc.dstapic = mp_ioapics[0].apicid; 288 intsrc.dstapic = mpc_ioapic_id(0);
380 289
381 intsrc.irqtype = mp_INT; 290 intsrc.irqtype = mp_INT;
382 291
@@ -428,13 +337,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
428 337
429 intsrc.srcbusirq = i; 338 intsrc.srcbusirq = i;
430 intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ 339 intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
431 MP_intsrc_info(&intsrc); 340 mp_save_irq(&intsrc);
432 } 341 }
433 342
434 intsrc.irqtype = mp_ExtINT; 343 intsrc.irqtype = mp_ExtINT;
435 intsrc.srcbusirq = 0; 344 intsrc.srcbusirq = 0;
436 intsrc.dstirq = 0; /* 8259A to INTIN0 */ 345 intsrc.dstirq = 0; /* 8259A to INTIN0 */
437 MP_intsrc_info(&intsrc); 346 mp_save_irq(&intsrc);
438} 347}
439 348
440 349
@@ -657,7 +566,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
657{ 566{
658 unsigned long size = get_mpc_size(mpf->physptr); 567 unsigned long size = get_mpc_size(mpf->physptr);
659 568
660 reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc"); 569 memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
661} 570}
662 571
663static int __init smp_scan_config(unsigned long base, unsigned long length) 572static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -686,7 +595,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
686 mpf, (u64)virt_to_phys(mpf)); 595 mpf, (u64)virt_to_phys(mpf));
687 596
688 mem = virt_to_phys(mpf); 597 mem = virt_to_phys(mpf);
689 reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf"); 598 memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
690 if (mpf->physptr) 599 if (mpf->physptr)
691 smp_reserve_memory(mpf); 600 smp_reserve_memory(mpf);
692 601
@@ -783,11 +692,11 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
783 int i; 692 int i;
784 693
785 apic_printk(APIC_VERBOSE, "OLD "); 694 apic_printk(APIC_VERBOSE, "OLD ");
786 print_MP_intsrc_info(m); 695 print_mp_irq_info(m);
787 696
788 i = get_MP_intsrc_index(m); 697 i = get_MP_intsrc_index(m);
789 if (i > 0) { 698 if (i > 0) {
790 assign_to_mpc_intsrc(&mp_irqs[i], m); 699 memcpy(m, &mp_irqs[i], sizeof(*m));
791 apic_printk(APIC_VERBOSE, "NEW "); 700 apic_printk(APIC_VERBOSE, "NEW ");
792 print_mp_irq_info(&mp_irqs[i]); 701 print_mp_irq_info(&mp_irqs[i]);
793 return; 702 return;
@@ -805,23 +714,21 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
805 *nr_m_spare += 1; 714 *nr_m_spare += 1;
806 } 715 }
807} 716}
808#else /* CONFIG_X86_IO_APIC */
809static
810inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
811#endif /* CONFIG_X86_IO_APIC */
812 717
813static int 718static int __init
814check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) 719check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
815{ 720{
816 int ret = 0;
817
818 if (!mpc_new_phys || count <= mpc_new_length) { 721 if (!mpc_new_phys || count <= mpc_new_length) {
819 WARN(1, "update_mptable: No spare slots (length: %x)\n", count); 722 WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
820 return -1; 723 return -1;
821 } 724 }
822 725
823 return ret; 726 return 0;
824} 727}
728#else /* CONFIG_X86_IO_APIC */
729static
730inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
731#endif /* CONFIG_X86_IO_APIC */
825 732
826static int __init replace_intsrc_all(struct mpc_table *mpc, 733static int __init replace_intsrc_all(struct mpc_table *mpc,
827 unsigned long mpc_new_phys, 734 unsigned long mpc_new_phys,
@@ -874,14 +781,14 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
874 if (nr_m_spare > 0) { 781 if (nr_m_spare > 0) {
875 apic_printk(APIC_VERBOSE, "*NEW* found\n"); 782 apic_printk(APIC_VERBOSE, "*NEW* found\n");
876 nr_m_spare--; 783 nr_m_spare--;
877 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); 784 memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
878 m_spare[nr_m_spare] = NULL; 785 m_spare[nr_m_spare] = NULL;
879 } else { 786 } else {
880 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 787 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
881 count += sizeof(struct mpc_intsrc); 788 count += sizeof(struct mpc_intsrc);
882 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) 789 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
883 goto out; 790 goto out;
884 assign_to_mpc_intsrc(&mp_irqs[i], m); 791 memcpy(m, &mp_irqs[i], sizeof(*m));
885 mpc->length = count; 792 mpc->length = count;
886 mpt += sizeof(struct mpc_intsrc); 793 mpt += sizeof(struct mpc_intsrc);
887 } 794 }
@@ -974,7 +881,7 @@ static int __init update_mp_table(void)
974 881
975 if (!mpc_new_phys) { 882 if (!mpc_new_phys) {
976 unsigned char old, new; 883 unsigned char old, new;
977 /* check if we can change the postion */ 884 /* check if we can change the position */
978 mpc->checksum = 0; 885 mpc->checksum = 0;
979 old = mpf_checksum((unsigned char *)mpc, mpc->length); 886 old = mpf_checksum((unsigned char *)mpc, mpc->length);
980 mpc->checksum = 0xff; 887 mpc->checksum = 0xff;
@@ -983,7 +890,7 @@ static int __init update_mp_table(void)
983 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); 890 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
984 return 0; 891 return 0;
985 } 892 }
986 printk(KERN_INFO "use in-positon replacing\n"); 893 printk(KERN_INFO "use in-position replacing\n");
987 } else { 894 } else {
988 mpf->physptr = mpc_new_phys; 895 mpf->physptr = mpc_new_phys;
989 mpc_new = phys_to_virt(mpc_new_phys); 896 mpc_new = phys_to_virt(mpc_new_phys);
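
The in-position update path in the hunk above decides whether the MP config table is writable by storing two different values into its checksum byte and recomputing the sum; if the sums come out equal, the writes did not stick. A minimal userspace sketch of that probe, assuming mpf_checksum() is a plain byte sum — byte_checksum() and table_is_writable() below are illustrative names, not kernel APIs:

#include <stdio.h>
#include <string.h>

/* Plain byte-sum checksum, as assumed for mpf_checksum(). */
static unsigned char byte_checksum(const unsigned char *p, int len)
{
	unsigned int sum = 0;

	while (len--)
		sum += *p++;
	return sum & 0xff;
}

/*
 * Probe a table the way update_mp_table() does: write two different
 * values into the checksum slot and recompute.  If the two sums are
 * equal, the writes did not stick, i.e. the table is read-only and
 * only in-position replacement of existing entries is possible.
 */
static int table_is_writable(unsigned char *tbl, int len, int csum_off)
{
	unsigned char old, new;

	tbl[csum_off] = 0x00;
	old = byte_checksum(tbl, len);
	tbl[csum_off] = 0xff;
	new = byte_checksum(tbl, len);

	return old != new;
}

int main(void)
{
	unsigned char table[16];

	memset(table, 0x5a, sizeof(table));
	printf("writable: %d\n", table_is_writable(table, sizeof(table), 7));
	return 0;
}
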
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
deleted file mode 100644
index 79ae68154e87..000000000000
--- a/arch/x86/kernel/mrst.c
+++ /dev/null
@@ -1,311 +0,0 @@
1/*
2 * mrst.c: Intel Moorestown platform specific setup code
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sfi.h>
15#include <linux/irq.h>
16#include <linux/module.h>
17
18#include <asm/setup.h>
19#include <asm/mpspec_def.h>
20#include <asm/hw_irq.h>
21#include <asm/apic.h>
22#include <asm/io_apic.h>
23#include <asm/mrst.h>
24#include <asm/io.h>
25#include <asm/i8259.h>
26#include <asm/apb_timer.h>
27
28/*
29 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
30 * cmdline option x86_mrst_timer can be used to override the configuration
31 * to prefer one or the other.
32 * at runtime, there are basically three timer configurations:
33 * 1. per cpu apbt clock only
34 * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
35 * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
36 *
37 * by default (without cmdline option), platform code first detects cpu type
38 * to see if we are on lincroft or penwell, then set up both lapic or apbt
39 * clocks accordingly.
40 * i.e. by default, medfield uses configuration #2, moorestown uses #1.
41 * config #3 is supported but not recommended on medfield.
42 *
43 * rating and feature summary:
44 * lapic (with C3STOP) --------- 100
45 * apbt (always-on) ------------ 110
46 * lapic (always-on,ARAT) ------ 150
47 */
48
49__cpuinitdata enum mrst_timer_options mrst_timer_options;
50
51static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
52static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
53enum mrst_cpu_type __mrst_cpu_chip;
54EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
55
56int sfi_mtimer_num;
57
58struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
59EXPORT_SYMBOL_GPL(sfi_mrtc_array);
60int sfi_mrtc_num;
61
62static inline void assign_to_mp_irq(struct mpc_intsrc *m,
63 struct mpc_intsrc *mp_irq)
64{
65 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
66}
67
68static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
69 struct mpc_intsrc *m)
70{
71 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
72}
73
74static void save_mp_irq(struct mpc_intsrc *m)
75{
76 int i;
77
78 for (i = 0; i < mp_irq_entries; i++) {
79 if (!mp_irq_cmp(&mp_irqs[i], m))
80 return;
81 }
82
83 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
84 if (++mp_irq_entries == MAX_IRQ_SOURCES)
85 panic("Max # of irq sources exceeded!!\n");
86}
87
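
save_mp_irq() above (like mp_save_irq()/MP_intsrc_info() in mpparse.c) only appends an interrupt-source entry after scanning the existing array for a byte-identical duplicate, and treats overflow of the fixed-size table as fatal. A small self-contained sketch of that save pattern, using made-up names (struct irq_entry, save_irq_entry) and abort() in place of panic():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_ENTRIES 8

struct irq_entry {
	int srcbus;
	int srcbusirq;
	int dstapic;
	int dstirq;
};

static struct irq_entry entries[MAX_ENTRIES];
static int nr_entries;

/* Append the entry unless an identical one is already recorded. */
static void save_irq_entry(const struct irq_entry *e)
{
	int i;

	for (i = 0; i < nr_entries; i++) {
		if (!memcmp(&entries[i], e, sizeof(*e)))
			return;		/* duplicate, nothing to do */
	}

	entries[nr_entries] = *e;
	if (++nr_entries == MAX_ENTRIES) {
		fprintf(stderr, "Max # of irq sources exceeded!\n");
		abort();		/* the kernel panics at this point */
	}
}

int main(void)
{
	struct irq_entry e = { .srcbus = 0, .srcbusirq = 3, .dstapic = 0xff, .dstirq = 3 };

	save_irq_entry(&e);
	save_irq_entry(&e);	/* ignored: already present */
	printf("entries recorded: %d\n", nr_entries);
	return 0;
}
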
88/* parse all the mtimer info to a static mtimer array */
89static int __init sfi_parse_mtmr(struct sfi_table_header *table)
90{
91 struct sfi_table_simple *sb;
92 struct sfi_timer_table_entry *pentry;
93 struct mpc_intsrc mp_irq;
94 int totallen;
95
96 sb = (struct sfi_table_simple *)table;
97 if (!sfi_mtimer_num) {
98 sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
99 struct sfi_timer_table_entry);
100 pentry = (struct sfi_timer_table_entry *) sb->pentry;
101 totallen = sfi_mtimer_num * sizeof(*pentry);
102 memcpy(sfi_mtimer_array, pentry, totallen);
103 }
104
105 printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
106 pentry = sfi_mtimer_array;
107 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
108 printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
109 " irq = %d\n", totallen, (u32)pentry->phys_addr,
110 pentry->freq_hz, pentry->irq);
111 if (!pentry->irq)
112 continue;
113 mp_irq.type = MP_IOAPIC;
114 mp_irq.irqtype = mp_INT;
115/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
116 mp_irq.irqflag = 5;
117 mp_irq.srcbus = 0;
118 mp_irq.srcbusirq = pentry->irq; /* IRQ */
119 mp_irq.dstapic = MP_APIC_ALL;
120 mp_irq.dstirq = pentry->irq;
121 save_mp_irq(&mp_irq);
122 }
123
124 return 0;
125}
126
127struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
128{
129 int i;
130 if (hint < sfi_mtimer_num) {
131 if (!sfi_mtimer_usage[hint]) {
 132			pr_debug("hint taken for timer %d irq %d\n",
133 hint, sfi_mtimer_array[hint].irq);
134 sfi_mtimer_usage[hint] = 1;
135 return &sfi_mtimer_array[hint];
136 }
137 }
138 /* take the first timer available */
139 for (i = 0; i < sfi_mtimer_num;) {
140 if (!sfi_mtimer_usage[i]) {
141 sfi_mtimer_usage[i] = 1;
142 return &sfi_mtimer_array[i];
143 }
144 i++;
145 }
146 return NULL;
147}
148
149void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
150{
151 int i;
152 for (i = 0; i < sfi_mtimer_num;) {
153 if (mtmr->irq == sfi_mtimer_array[i].irq) {
154 sfi_mtimer_usage[i] = 0;
155 return;
156 }
157 i++;
158 }
159}
160
161/* parse all the mrtc info to a global mrtc array */
162int __init sfi_parse_mrtc(struct sfi_table_header *table)
163{
164 struct sfi_table_simple *sb;
165 struct sfi_rtc_table_entry *pentry;
166 struct mpc_intsrc mp_irq;
167
168 int totallen;
169
170 sb = (struct sfi_table_simple *)table;
171 if (!sfi_mrtc_num) {
172 sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
173 struct sfi_rtc_table_entry);
174 pentry = (struct sfi_rtc_table_entry *)sb->pentry;
175 totallen = sfi_mrtc_num * sizeof(*pentry);
176 memcpy(sfi_mrtc_array, pentry, totallen);
177 }
178
179 printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
180 pentry = sfi_mrtc_array;
181 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
182 printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
183 totallen, (u32)pentry->phys_addr, pentry->irq);
184 mp_irq.type = MP_IOAPIC;
185 mp_irq.irqtype = mp_INT;
186 mp_irq.irqflag = 0;
187 mp_irq.srcbus = 0;
188 mp_irq.srcbusirq = pentry->irq; /* IRQ */
189 mp_irq.dstapic = MP_APIC_ALL;
190 mp_irq.dstirq = pentry->irq;
191 save_mp_irq(&mp_irq);
192 }
193 return 0;
194}
195
196static unsigned long __init mrst_calibrate_tsc(void)
197{
198 unsigned long flags, fast_calibrate;
199
200 local_irq_save(flags);
201 fast_calibrate = apbt_quick_calibrate();
202 local_irq_restore(flags);
203
204 if (fast_calibrate)
205 return fast_calibrate;
206
207 return 0;
208}
209
210void __init mrst_time_init(void)
211{
212 switch (mrst_timer_options) {
213 case MRST_TIMER_APBT_ONLY:
214 break;
215 case MRST_TIMER_LAPIC_APBT:
216 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
217 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
218 break;
219 default:
220 if (!boot_cpu_has(X86_FEATURE_ARAT))
221 break;
222 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
223 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
224 return;
225 }
226 /* we need at least one APB timer */
227 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
228 pre_init_apic_IRQ0();
229 apbt_time_init();
230}
231
232void __init mrst_rtc_init(void)
233{
234 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
235}
236
237void __cpuinit mrst_arch_setup(void)
238{
239 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
240 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
241 else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
242 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
243 else {
244 pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
245 boot_cpu_data.x86, boot_cpu_data.x86_model);
246 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
247 }
248 pr_debug("Moorestown CPU %s identified\n",
249 (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
250 "Lincroft" : "Penwell");
251}
252
253/* MID systems don't have i8042 controller */
254static int mrst_i8042_detect(void)
255{
256 return 0;
257}
258
259/*
260 * Moorestown specific x86_init function overrides and early setup
261 * calls.
262 */
263void __init x86_mrst_early_setup(void)
264{
265 x86_init.resources.probe_roms = x86_init_noop;
266 x86_init.resources.reserve_resources = x86_init_noop;
267
268 x86_init.timers.timer_init = mrst_time_init;
269 x86_init.timers.setup_percpu_clockev = x86_init_noop;
270
271 x86_init.irqs.pre_vector_init = x86_init_noop;
272
273 x86_init.oem.arch_setup = mrst_arch_setup;
274
275 x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
276
277 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
278 x86_platform.i8042_detect = mrst_i8042_detect;
279 x86_init.pci.init = pci_mrst_init;
280 x86_init.pci.fixup_irqs = x86_init_noop;
281
282 legacy_pic = &null_legacy_pic;
283
284 /* Avoid searching for BIOS MP tables */
285 x86_init.mpparse.find_smp_config = x86_init_noop;
286 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
287
288}
289
290/*
 291 * if the user does not want to use the per-CPU APB timer, just give it a lower rating
 292 * than the local APIC timer and skip the late per-CPU timer init.
293 */
294static inline int __init setup_x86_mrst_timer(char *arg)
295{
296 if (!arg)
297 return -EINVAL;
298
299 if (strcmp("apbt_only", arg) == 0)
300 mrst_timer_options = MRST_TIMER_APBT_ONLY;
301 else if (strcmp("lapic_and_apbt", arg) == 0)
302 mrst_timer_options = MRST_TIMER_LAPIC_APBT;
303 else {
304 pr_warning("X86 MRST timer option %s not recognised"
305 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
306 arg);
307 return -EINVAL;
308 }
309 return 0;
310}
311__setup("x86_mrst_timer=", setup_x86_mrst_timer);
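
setup_x86_mrst_timer() above is a typical __setup() handler: it maps a kernel command-line argument onto an enum and rejects anything else. A hedged userspace equivalent of that dispatch — parse_timer_option() and enum timer_option are invented names standing in for the __setup() machinery:

#include <stdio.h>
#include <string.h>

enum timer_option {
	TIMER_DEFAULT,		/* pick based on CPU features */
	TIMER_APBT_ONLY,
	TIMER_LAPIC_APBT,
};

/* Returns 0 on success, -1 for a missing or unrecognised argument. */
static int parse_timer_option(const char *arg, enum timer_option *opt)
{
	if (!arg)
		return -1;

	if (strcmp(arg, "apbt_only") == 0)
		*opt = TIMER_APBT_ONLY;
	else if (strcmp(arg, "lapic_and_apbt") == 0)
		*opt = TIMER_LAPIC_APBT;
	else {
		fprintf(stderr, "timer option '%s' not recognised\n", arg);
		return -1;
	}
	return 0;
}

int main(void)
{
	enum timer_option opt = TIMER_DEFAULT;

	if (parse_timer_option("lapic_and_apbt", &opt) == 0)
		printf("selected option %d\n", opt);
	return 0;
}
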
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bf2dc4c8f70..12fcbe2c143e 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -30,7 +30,6 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <linux/poll.h> 31#include <linux/poll.h>
32#include <linux/smp.h> 32#include <linux/smp.h>
33#include <linux/smp_lock.h>
34#include <linux/major.h> 33#include <linux/major.h>
35#include <linux/fs.h> 34#include <linux/fs.h>
36#include <linux/device.h> 35#include <linux/device.h>
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
deleted file mode 100644
index 0e0cdde519be..000000000000
--- a/arch/x86/kernel/olpc.c
+++ /dev/null
@@ -1,260 +0,0 @@
1/*
2 * Support for the OLPC DCON and OLPC EC access
3 *
4 * Copyright © 2006 Advanced Micro Devices, Inc.
5 * Copyright © 2007-2008 Andres Salomon <dilinger@debian.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 */
12
13#include <linux/kernel.h>
14#include <linux/init.h>
15#include <linux/module.h>
16#include <linux/delay.h>
17#include <linux/spinlock.h>
18#include <linux/io.h>
19#include <linux/string.h>
20
21#include <asm/geode.h>
22#include <asm/setup.h>
23#include <asm/olpc.h>
24#include <asm/olpc_ofw.h>
25
26struct olpc_platform_t olpc_platform_info;
27EXPORT_SYMBOL_GPL(olpc_platform_info);
28
29static DEFINE_SPINLOCK(ec_lock);
30
31/* what the timeout *should* be (in ms) */
32#define EC_BASE_TIMEOUT 20
33
34/* the timeout that bugs in the EC might force us to actually use */
35static int ec_timeout = EC_BASE_TIMEOUT;
36
37static int __init olpc_ec_timeout_set(char *str)
38{
39 if (get_option(&str, &ec_timeout) != 1) {
40 ec_timeout = EC_BASE_TIMEOUT;
41 printk(KERN_ERR "olpc-ec: invalid argument to "
42 "'olpc_ec_timeout=', ignoring!\n");
43 }
44 printk(KERN_DEBUG "olpc-ec: using %d ms delay for EC commands.\n",
45 ec_timeout);
46 return 1;
47}
48__setup("olpc_ec_timeout=", olpc_ec_timeout_set);
49
50/*
51 * These {i,o}bf_status functions return whether the buffers are full or not.
52 */
53
54static inline unsigned int ibf_status(unsigned int port)
55{
56 return !!(inb(port) & 0x02);
57}
58
59static inline unsigned int obf_status(unsigned int port)
60{
61 return inb(port) & 0x01;
62}
63
64#define wait_on_ibf(p, d) __wait_on_ibf(__LINE__, (p), (d))
65static int __wait_on_ibf(unsigned int line, unsigned int port, int desired)
66{
67 unsigned int timeo;
68 int state = ibf_status(port);
69
70 for (timeo = ec_timeout; state != desired && timeo; timeo--) {
71 mdelay(1);
72 state = ibf_status(port);
73 }
74
75 if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) &&
76 timeo < (ec_timeout - EC_BASE_TIMEOUT)) {
77 printk(KERN_WARNING "olpc-ec: %d: waited %u ms for IBF!\n",
78 line, ec_timeout - timeo);
79 }
80
81 return !(state == desired);
82}
83
84#define wait_on_obf(p, d) __wait_on_obf(__LINE__, (p), (d))
85static int __wait_on_obf(unsigned int line, unsigned int port, int desired)
86{
87 unsigned int timeo;
88 int state = obf_status(port);
89
90 for (timeo = ec_timeout; state != desired && timeo; timeo--) {
91 mdelay(1);
92 state = obf_status(port);
93 }
94
95 if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) &&
96 timeo < (ec_timeout - EC_BASE_TIMEOUT)) {
97 printk(KERN_WARNING "olpc-ec: %d: waited %u ms for OBF!\n",
98 line, ec_timeout - timeo);
99 }
100
101 return !(state == desired);
102}
103
104/*
105 * This allows the kernel to run Embedded Controller commands. The EC is
106 * documented at <http://wiki.laptop.org/go/Embedded_controller>, and the
107 * available EC commands are here:
108 * <http://wiki.laptop.org/go/Ec_specification>. Unfortunately, while
109 * OpenFirmware's source is available, the EC's is not.
110 */
111int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
112 unsigned char *outbuf, size_t outlen)
113{
114 unsigned long flags;
115 int ret = -EIO;
116 int i;
117
118 spin_lock_irqsave(&ec_lock, flags);
119
120 /* Clear OBF */
121 for (i = 0; i < 10 && (obf_status(0x6c) == 1); i++)
122 inb(0x68);
123 if (i == 10) {
124 printk(KERN_ERR "olpc-ec: timeout while attempting to "
125 "clear OBF flag!\n");
126 goto err;
127 }
128
129 if (wait_on_ibf(0x6c, 0)) {
130 printk(KERN_ERR "olpc-ec: timeout waiting for EC to "
131 "quiesce!\n");
132 goto err;
133 }
134
135restart:
136 /*
137 * Note that if we time out during any IBF checks, that's a failure;
138 * we have to return. There's no way for the kernel to clear that.
139 *
140 * If we time out during an OBF check, we can restart the command;
141 * reissuing it will clear the OBF flag, and we should be alright.
142 * The OBF flag will sometimes misbehave due to what we believe
143 * is a hardware quirk..
144 */
145 pr_devel("olpc-ec: running cmd 0x%x\n", cmd);
146 outb(cmd, 0x6c);
147
148 if (wait_on_ibf(0x6c, 0)) {
149 printk(KERN_ERR "olpc-ec: timeout waiting for EC to read "
150 "command!\n");
151 goto err;
152 }
153
154 if (inbuf && inlen) {
155 /* write data to EC */
156 for (i = 0; i < inlen; i++) {
157 if (wait_on_ibf(0x6c, 0)) {
158 printk(KERN_ERR "olpc-ec: timeout waiting for"
159 " EC accept data!\n");
160 goto err;
161 }
162 pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
163 outb(inbuf[i], 0x68);
164 }
165 }
166 if (outbuf && outlen) {
167 /* read data from EC */
168 for (i = 0; i < outlen; i++) {
169 if (wait_on_obf(0x6c, 1)) {
170 printk(KERN_ERR "olpc-ec: timeout waiting for"
171 " EC to provide data!\n");
172 goto restart;
173 }
174 outbuf[i] = inb(0x68);
175 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
176 }
177 }
178
179 ret = 0;
180err:
181 spin_unlock_irqrestore(&ec_lock, flags);
182 return ret;
183}
184EXPORT_SYMBOL_GPL(olpc_ec_cmd);
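
The wait_on_ibf()/wait_on_obf() helpers above poll an EC status bit once per millisecond until it reaches the desired state or the timeout expires, and warn when a wait ran noticeably longer than the nominal timeout. A userspace sketch of that poll-with-deadline pattern, assuming a caller-supplied status callback; poll_status, wait_for_state() and counting_status() are illustrative names only:

#include <stdio.h>
#include <unistd.h>

#define BASE_TIMEOUT_MS 20

/* Stand-in for ibf_status()/obf_status(): returns the current state bit. */
typedef int (*poll_status)(void *ctx);

/*
 * Poll once per millisecond until the status matches 'desired' or the
 * timeout expires.  Returns 0 on success, non-zero on timeout, and warns
 * if the wait took noticeably longer than the base timeout.
 */
static int wait_for_state(poll_status status, void *ctx, int desired,
			  unsigned int timeout_ms)
{
	unsigned int timeo = timeout_ms;
	int state = status(ctx);

	while (state != desired && timeo) {
		usleep(1000);		/* ~mdelay(1) in the kernel code */
		state = status(ctx);
		timeo--;
	}

	if (state == desired && timeout_ms > BASE_TIMEOUT_MS &&
	    timeo < timeout_ms - BASE_TIMEOUT_MS)
		fprintf(stderr, "waited %u ms for status\n", timeout_ms - timeo);

	return state != desired;
}

static int counting_status(void *ctx)
{
	int *calls = ctx;

	return ++(*calls) >= 5;		/* becomes "ready" on the 5th poll */
}

int main(void)
{
	int calls = 0;

	printf("timed out: %d\n", wait_for_state(counting_status, &calls, 1, 50));
	return 0;
}
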
185
186#ifdef CONFIG_OLPC_OPENFIRMWARE
187static void __init platform_detect(void)
188{
189 size_t propsize;
190 __be32 rev;
191 const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
192 void *res[] = { &propsize };
193
194 if (olpc_ofw("getprop", args, res) || propsize != 4) {
195 printk(KERN_ERR "ofw: getprop call failed!\n");
196 rev = cpu_to_be32(0);
197 }
198 olpc_platform_info.boardrev = be32_to_cpu(rev);
199}
200#else
201static void __init platform_detect(void)
202{
203 /* stopgap until OFW support is added to the kernel */
204 olpc_platform_info.boardrev = olpc_board(0xc2);
205}
206#endif
207
208static int __init olpc_init(void)
209{
210 unsigned char *romsig;
211
212 /* The ioremap check is dangerous; limit what we run it on */
213 if (!is_geode() || cs5535_has_vsa2())
214 return 0;
215
216 spin_lock_init(&ec_lock);
217
218 romsig = ioremap(0xffffffc0, 16);
219 if (!romsig)
220 return 0;
221
222 if (strncmp(romsig, "CL1 Q", 7))
223 goto unmap;
224 if (strncmp(romsig+6, romsig+13, 3)) {
225 printk(KERN_INFO "OLPC BIOS signature looks invalid. "
226 "Assuming not OLPC\n");
227 goto unmap;
228 }
229
230 printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig);
231 olpc_platform_info.flags |= OLPC_F_PRESENT;
232
233 /* get the platform revision */
234 platform_detect();
235
236 /* assume B1 and above models always have a DCON */
237 if (olpc_board_at_least(olpc_board(0xb1)))
238 olpc_platform_info.flags |= OLPC_F_DCON;
239
240 /* get the EC revision */
241 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
242 (unsigned char *) &olpc_platform_info.ecver, 1);
243
244#ifdef CONFIG_PCI_OLPC
245 /* If the VSA exists let it emulate PCI, if not emulate in kernel */
246 if (!cs5535_has_vsa2())
247 x86_init.pci.arch_init = pci_olpc_init;
248#endif
249
250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
251 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
252 olpc_platform_info.boardrev >> 4,
253 olpc_platform_info.ecver);
254
255unmap:
256 iounmap(romsig);
257 return 0;
258}
259
260postcore_initcall(olpc_init);
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
deleted file mode 100644
index 3218aa71ab5e..000000000000
--- a/arch/x86/kernel/olpc_ofw.c
+++ /dev/null
@@ -1,106 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <asm/page.h>
5#include <asm/setup.h>
6#include <asm/io.h>
7#include <asm/pgtable.h>
8#include <asm/olpc_ofw.h>
9
10/* address of OFW callback interface; will be NULL if OFW isn't found */
11static int (*olpc_ofw_cif)(int *);
12
13/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
14u32 olpc_ofw_pgd __initdata;
15
16static DEFINE_SPINLOCK(ofw_lock);
17
18#define MAXARGS 10
19
20void __init setup_olpc_ofw_pgd(void)
21{
22 pgd_t *base, *ofw_pde;
23
24 if (!olpc_ofw_cif)
25 return;
26
27 /* fetch OFW's PDE */
28 base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
29 if (!base) {
30 printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
31 olpc_ofw_cif = NULL;
32 return;
33 }
34 ofw_pde = &base[OLPC_OFW_PDE_NR];
35
36 /* install OFW's PDE permanently into the kernel's pgtable */
37 set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
38 /* implicit optimization barrier here due to uninline function return */
39
40 early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
41}
42
43int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
44 void **res)
45{
46 int ofw_args[MAXARGS + 3];
47 unsigned long flags;
48 int ret, i, *p;
49
50 BUG_ON(nr_args + nr_res > MAXARGS);
51
52 if (!olpc_ofw_cif)
53 return -EIO;
54
55 ofw_args[0] = (int)name;
56 ofw_args[1] = nr_args;
57 ofw_args[2] = nr_res;
58
59 p = &ofw_args[3];
60 for (i = 0; i < nr_args; i++, p++)
61 *p = (int)args[i];
62
63 /* call into ofw */
64 spin_lock_irqsave(&ofw_lock, flags);
65 ret = olpc_ofw_cif(ofw_args);
66 spin_unlock_irqrestore(&ofw_lock, flags);
67
68 if (!ret) {
69 for (i = 0; i < nr_res; i++, p++)
70 *((int *)res[i]) = *p;
71 }
72
73 return ret;
74}
75EXPORT_SYMBOL_GPL(__olpc_ofw);
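
__olpc_ofw() above marshals the service name, the argument and result counts, and then the arguments themselves into one flat array of cells, hands that array to the firmware's client-interface entry point under a lock, and copies the results (which follow the arguments in the array) back out. A simplified userspace sketch of the same marshalling, using intptr_t cells so it stays portable (the kernel version uses int because OFW runs 32-bit); fake_cif() and ofw_call() are stand-ins invented for the sketch:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MAXARGS 10

/*
 * Fake client interface: it "implements" one service, "add", which sums
 * its two arguments and stores the result in the cell after the arguments.
 */
static int fake_cif(intptr_t *cells)
{
	const char *name = (const char *)cells[0];
	intptr_t nr_args = cells[1];

	if (strcmp(name, "add") != 0 || nr_args != 2)
		return -1;
	cells[3 + nr_args] = cells[3] + cells[4];	/* result after the args */
	return 0;
}

/* Marshal name/args into one flat cell array, call the cif, unpack results. */
static int ofw_call(const char *name, int nr_args, const intptr_t *args,
		    int nr_res, intptr_t **res)
{
	intptr_t cells[MAXARGS + 3];
	intptr_t *p;
	int ret, i;

	if (nr_args + nr_res > MAXARGS)
		return -1;

	cells[0] = (intptr_t)name;
	cells[1] = nr_args;
	cells[2] = nr_res;

	p = &cells[3];
	for (i = 0; i < nr_args; i++)
		*p++ = args[i];

	ret = fake_cif(cells);
	if (!ret) {
		for (i = 0; i < nr_res; i++)
			*res[i] = *p++;		/* results follow the arguments */
	}
	return ret;
}

int main(void)
{
	intptr_t args[] = { 2, 40 };
	intptr_t sum = 0;
	intptr_t *res[] = { &sum };

	if (!ofw_call("add", 2, args, 1, res))
		printf("add -> %ld\n", (long)sum);
	return 0;
}
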
76
77/* OFW cif _should_ be above this address */
78#define OFW_MIN 0xff000000
79
80/* OFW starts on a 1MB boundary */
81#define OFW_BOUND (1<<20)
82
83void __init olpc_ofw_detect(void)
84{
85 struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
86 unsigned long start;
87
88 /* ensure OFW booted us by checking for "OFW " string */
89 if (hdr->ofw_magic != OLPC_OFW_SIG)
90 return;
91
92 olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
93
94 if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
95 printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
96 (unsigned long)olpc_ofw_cif);
97 olpc_ofw_cif = NULL;
98 return;
99 }
100
101 /* determine where OFW starts in memory */
102 start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
103 printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
104 (unsigned long)olpc_ofw_cif, (-start) >> 20);
105 reserve_top_address(-start);
106}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c01..869e1aeeb71b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = {
413 413
414 .alloc_pte = paravirt_nop, 414 .alloc_pte = paravirt_nop,
415 .alloc_pmd = paravirt_nop, 415 .alloc_pmd = paravirt_nop,
416 .alloc_pmd_clone = paravirt_nop,
417 .alloc_pud = paravirt_nop, 416 .alloc_pud = paravirt_nop,
418 .release_pte = paravirt_nop, 417 .release_pte = paravirt_nop,
419 .release_pmd = paravirt_nop, 418 .release_pmd = paravirt_nop,
@@ -422,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
422 .set_pte = native_set_pte, 421 .set_pte = native_set_pte,
423 .set_pte_at = native_set_pte_at, 422 .set_pte_at = native_set_pte_at,
424 .set_pmd = native_set_pmd, 423 .set_pmd = native_set_pmd,
424 .set_pmd_at = native_set_pmd_at,
425 .pte_update = paravirt_nop, 425 .pte_update = paravirt_nop,
426 .pte_update_defer = paravirt_nop, 426 .pte_update_defer = paravirt_nop,
427 .pmd_update = paravirt_nop,
428 .pmd_update_defer = paravirt_nop,
427 429
428 .ptep_modify_prot_start = __ptep_modify_prot_start, 430 .ptep_modify_prot_start = __ptep_modify_prot_start,
429 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 431 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 078d4ec1a9d9..e8c33a302006 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -47,6 +47,7 @@
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h> 49#include <asm/x86_init.h>
50#include <asm/iommu_table.h>
50 51
51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 52#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
52int use_calgary __read_mostly = 1; 53int use_calgary __read_mostly = 1;
@@ -1278,7 +1279,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1278 1279
1279 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { 1280 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
1280 /* 1281 /*
1281 * FIXME: properly scan for devices accross the 1282 * FIXME: properly scan for devices across the
1282 * PCI-to-PCI bridge on every CalIOC2 port. 1283 * PCI-to-PCI bridge on every CalIOC2 port.
1283 */ 1284 */
1284 return 1; 1285 return 1;
@@ -1294,7 +1295,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1294 1295
1295/* 1296/*
1296 * calgary_init_bitmap_from_tce_table(): 1297 * calgary_init_bitmap_from_tce_table():
1297 * Funtion for kdump case. In the second/kdump kernel initialize 1298 * Function for kdump case. In the second/kdump kernel initialize
1298 * the bitmap based on the tce table entries obtained from first kernel 1299 * the bitmap based on the tce table entries obtained from first kernel
1299 */ 1300 */
1300static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) 1301static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
@@ -1364,7 +1365,7 @@ static int __init calgary_iommu_init(void)
1364 return 0; 1365 return 0;
1365} 1366}
1366 1367
1367void __init detect_calgary(void) 1368int __init detect_calgary(void)
1368{ 1369{
1369 int bus; 1370 int bus;
1370 void *tbl; 1371 void *tbl;
@@ -1378,13 +1379,13 @@ void __init detect_calgary(void)
1378 * another HW IOMMU already, bail out. 1379 * another HW IOMMU already, bail out.
1379 */ 1380 */
1380 if (no_iommu || iommu_detected) 1381 if (no_iommu || iommu_detected)
1381 return; 1382 return -ENODEV;
1382 1383
1383 if (!use_calgary) 1384 if (!use_calgary)
1384 return; 1385 return -ENODEV;
1385 1386
1386 if (!early_pci_allowed()) 1387 if (!early_pci_allowed())
1387 return; 1388 return -ENODEV;
1388 1389
1389 printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); 1390 printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
1390 1391
@@ -1410,13 +1411,13 @@ void __init detect_calgary(void)
1410 if (!rio_table_hdr) { 1411 if (!rio_table_hdr) {
1411 printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " 1412 printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
1412 "in EBDA - bailing!\n"); 1413 "in EBDA - bailing!\n");
1413 return; 1414 return -ENODEV;
1414 } 1415 }
1415 1416
1416 ret = build_detail_arrays(); 1417 ret = build_detail_arrays();
1417 if (ret) { 1418 if (ret) {
1418 printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); 1419 printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
1419 return; 1420 return -ENOMEM;
1420 } 1421 }
1421 1422
1422 specified_table_size = determine_tce_table_size((is_kdump_kernel() ? 1423 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
@@ -1464,7 +1465,7 @@ void __init detect_calgary(void)
1464 1465
1465 x86_init.iommu.iommu_init = calgary_iommu_init; 1466 x86_init.iommu.iommu_init = calgary_iommu_init;
1466 } 1467 }
1467 return; 1468 return calgary_found;
1468 1469
1469cleanup: 1470cleanup:
1470 for (--bus; bus >= 0; --bus) { 1471 for (--bus; bus >= 0; --bus) {
@@ -1473,6 +1474,7 @@ cleanup:
1473 if (info->tce_space) 1474 if (info->tce_space)
1474 free_tce_table(info->tce_space); 1475 free_tce_table(info->tce_space);
1475 } 1476 }
1477 return -ENOMEM;
1476} 1478}
1477 1479
1478static int __init calgary_parse_options(char *p) 1480static int __init calgary_parse_options(char *p)
@@ -1594,3 +1596,5 @@ static int __init calgary_fixup_tce_spaces(void)
1594 * and before device_initcall. 1596 * and before device_initcall.
1595 */ 1597 */
1596rootfs_initcall(calgary_fixup_tce_spaces); 1598rootfs_initcall(calgary_fixup_tce_spaces);
1599
1600IOMMU_INIT_POST(detect_calgary);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9f07cfcbd3a5..b49d00da2aed 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,9 +11,8 @@
11#include <asm/iommu.h> 11#include <asm/iommu.h>
12#include <asm/gart.h> 12#include <asm/gart.h>
13#include <asm/calgary.h> 13#include <asm/calgary.h>
14#include <asm/amd_iommu.h>
15#include <asm/x86_init.h> 14#include <asm/x86_init.h>
16#include <asm/xen/swiotlb-xen.h> 15#include <asm/iommu_table.h>
17 16
18static int forbid_dac __read_mostly; 17static int forbid_dac __read_mostly;
19 18
@@ -45,6 +44,8 @@ int iommu_detected __read_mostly = 0;
45 */ 44 */
46int iommu_pass_through __read_mostly; 45int iommu_pass_through __read_mostly;
47 46
47extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
48
48/* Dummy device used for NULL arguments (normally ISA). */ 49/* Dummy device used for NULL arguments (normally ISA). */
49struct device x86_dma_fallback_dev = { 50struct device x86_dma_fallback_dev = {
50 .init_name = "fallback device", 51 .init_name = "fallback device",
@@ -67,89 +68,23 @@ int dma_set_mask(struct device *dev, u64 mask)
67} 68}
68EXPORT_SYMBOL(dma_set_mask); 69EXPORT_SYMBOL(dma_set_mask);
69 70
70#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
71static __initdata void *dma32_bootmem_ptr;
72static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
73
74static int __init parse_dma32_size_opt(char *p)
75{
76 if (!p)
77 return -EINVAL;
78 dma32_bootmem_size = memparse(p, &p);
79 return 0;
80}
81early_param("dma32_size", parse_dma32_size_opt);
82
83void __init dma32_reserve_bootmem(void)
84{
85 unsigned long size, align;
86 if (max_pfn <= MAX_DMA32_PFN)
87 return;
88
89 /*
90 * check aperture_64.c allocate_aperture() for reason about
91 * using 512M as goal
92 */
93 align = 64ULL<<20;
94 size = roundup(dma32_bootmem_size, align);
95 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
96 512ULL<<20);
97 /*
98 * Kmemleak should not scan this block as it may not be mapped via the
99 * kernel direct mapping.
100 */
101 kmemleak_ignore(dma32_bootmem_ptr);
102 if (dma32_bootmem_ptr)
103 dma32_bootmem_size = size;
104 else
105 dma32_bootmem_size = 0;
106}
107static void __init dma32_free_bootmem(void)
108{
109
110 if (max_pfn <= MAX_DMA32_PFN)
111 return;
112
113 if (!dma32_bootmem_ptr)
114 return;
115
116 free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
117
118 dma32_bootmem_ptr = NULL;
119 dma32_bootmem_size = 0;
120}
121#else
122void __init dma32_reserve_bootmem(void)
123{
124}
125static void __init dma32_free_bootmem(void)
126{
127}
128
129#endif
130
131void __init pci_iommu_alloc(void) 71void __init pci_iommu_alloc(void)
132{ 72{
133 /* free the range so iommu could get some range less than 4G */ 73 struct iommu_table_entry *p;
134 dma32_free_bootmem(); 74
135 75 sort_iommu_table(__iommu_table, __iommu_table_end);
136 if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) 76 check_iommu_entries(__iommu_table, __iommu_table_end);
137 goto out; 77
138 78 for (p = __iommu_table; p < __iommu_table_end; p++) {
139 gart_iommu_hole_init(); 79 if (p && p->detect && p->detect() > 0) {
140 80 p->flags |= IOMMU_DETECTED;
141 detect_calgary(); 81 if (p->early_init)
142 82 p->early_init();
143 detect_intel_iommu(); 83 if (p->flags & IOMMU_FINISH_IF_DETECTED)
144 84 break;
145 /* needs to be called after gart_iommu_hole_init */ 85 }
146 amd_iommu_detect(); 86 }
147out:
148 pci_xen_swiotlb_init();
149
150 pci_swiotlb_init();
151} 87}
152
153void *dma_generic_alloc_coherent(struct device *dev, size_t size, 88void *dma_generic_alloc_coherent(struct device *dev, size_t size,
154 dma_addr_t *dma_addr, gfp_t flag) 89 dma_addr_t *dma_addr, gfp_t flag)
155{ 90{
@@ -292,6 +227,7 @@ EXPORT_SYMBOL(dma_supported);
292 227
293static int __init pci_iommu_init(void) 228static int __init pci_iommu_init(void)
294{ 229{
230 struct iommu_table_entry *p;
295 dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); 231 dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
296 232
297#ifdef CONFIG_PCI 233#ifdef CONFIG_PCI
@@ -299,12 +235,10 @@ static int __init pci_iommu_init(void)
299#endif 235#endif
300 x86_init.iommu.iommu_init(); 236 x86_init.iommu.iommu_init();
301 237
302 if (swiotlb || xen_swiotlb) { 238 for (p = __iommu_table; p < __iommu_table_end; p++) {
303 printk(KERN_INFO "PCI-DMA: " 239 if (p && (p->flags & IOMMU_DETECTED) && p->late_init)
304 "Using software bounce buffering for IO (SWIOTLB)\n"); 240 p->late_init();
305 swiotlb_print_info(); 241 }
306 } else
307 swiotlb_free();
308 242
309 return 0; 243 return 0;
310} 244}
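
The rewritten pci_iommu_alloc()/pci_iommu_init() above replace the hard-coded chain of detect calls with a walk over a table of iommu_table_entry descriptors: each entry's detect() hook is called, a positive return marks the entry detected and runs its early_init(), a FINISH_IF_DETECTED flag stops the scan, and the late_init() hooks run later only for entries whose hardware was found. A toy self-contained version of that walk — struct detect_entry, the flag values and the two "drivers" below are invented for illustration:

#include <stdio.h>

#define DETECTED		0x1
#define FINISH_IF_DETECTED	0x2

struct detect_entry {
	const char *name;
	int (*detect)(void);		/* > 0 means "hardware found" */
	void (*early_init)(void);
	void (*late_init)(void);
	int flags;
};

static int detect_a(void) { return 0; }			/* not present */
static int detect_b(void) { return 1; }			/* present */
static void early_b(void) { printf("early init B\n"); }
static void late_b(void)  { printf("late init B\n"); }

static struct detect_entry table[] = {
	{ "A", detect_a, NULL, NULL, 0 },
	{ "B", detect_b, early_b, late_b, FINISH_IF_DETECTED },
};
#define TABLE_SIZE (int)(sizeof(table) / sizeof(table[0]))

static void run_detection(void)
{
	int i;

	for (i = 0; i < TABLE_SIZE; i++) {
		struct detect_entry *p = &table[i];

		if (p->detect && p->detect() > 0) {
			p->flags |= DETECTED;
			if (p->early_init)
				p->early_init();
			if (p->flags & FINISH_IF_DETECTED)
				break;		/* stop scanning the table */
		}
	}
}

static void run_late_init(void)
{
	int i;

	for (i = 0; i < TABLE_SIZE; i++)
		if ((table[i].flags & DETECTED) && table[i].late_init)
			table[i].late_init();
}

int main(void)
{
	run_detection();
	run_late_init();
	return 0;
}
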
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
new file mode 100644
index 000000000000..35ccf75696eb
--- /dev/null
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -0,0 +1,79 @@
1#include <linux/dma-mapping.h>
2#include <asm/iommu_table.h>
3#include <linux/string.h>
4#include <linux/kallsyms.h>
5
6
7#define DEBUG 1
8
9static struct iommu_table_entry * __init
10find_dependents_of(struct iommu_table_entry *start,
11 struct iommu_table_entry *finish,
12 struct iommu_table_entry *q)
13{
14 struct iommu_table_entry *p;
15
16 if (!q)
17 return NULL;
18
19 for (p = start; p < finish; p++)
20 if (p->detect == q->depend)
21 return p;
22
23 return NULL;
24}
25
26
27void __init sort_iommu_table(struct iommu_table_entry *start,
28 struct iommu_table_entry *finish) {
29
30 struct iommu_table_entry *p, *q, tmp;
31
32 for (p = start; p < finish; p++) {
33again:
34 q = find_dependents_of(start, finish, p);
 35	/* We are a bit sneaky here: we use the memory address to figure
 36	 * out whether the node we depend on is past our point; if so, swap.
37 */
38 if (q > p) {
39 tmp = *p;
40 memmove(p, q, sizeof(*p));
41 *q = tmp;
42 goto again;
43 }
44 }
45
46}
47
48#ifdef DEBUG
49void __init check_iommu_entries(struct iommu_table_entry *start,
50 struct iommu_table_entry *finish)
51{
52 struct iommu_table_entry *p, *q, *x;
53
54 /* Simple cyclic dependency checker. */
55 for (p = start; p < finish; p++) {
56 q = find_dependents_of(start, finish, p);
57 x = find_dependents_of(start, finish, q);
58 if (p == x) {
59 printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
60 p->detect, q->detect);
61 /* Heavy handed way..*/
62 x->depend = 0;
63 }
64 }
65
66 for (p = start; p < finish; p++) {
67 q = find_dependents_of(p, finish, p);
68 if (q && q > p) {
69 printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
70 p->detect, q->detect);
71 }
72 }
73}
74#else
75inline void check_iommu_entries(struct iommu_table_entry *start,
76 struct iommu_table_entry *finish)
77{
78}
79#endif
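
sort_iommu_table() in the new file above orders the table so that every entry runs after the entry its ->depend pointer names: for each element it looks up the entry it depends on, and if that entry currently sits later in the array the two are swapped and the check is repeated. A standalone sketch of that address-comparison sort over a toy table — struct node, find_dependency(), sort_by_dependency() and the sample dependencies are all made up:

#include <stdio.h>

struct node {
	const char *name;
	int (*detect)(void);
	int (*depend)(void);	/* detect routine this node must run after */
};

static int det_swiotlb(void) { return 0; }
static int det_gart(void)    { return 0; }
static int det_amd(void)     { return 0; }

/* Return the entry in [start, finish) that 'q' depends on, if any. */
static struct node *find_dependency(struct node *start, struct node *finish,
				    struct node *q)
{
	struct node *p;

	if (!q || !q->depend)
		return NULL;
	for (p = start; p < finish; p++)
		if (p->detect == q->depend)
			return p;
	return NULL;
}

/* Swap an entry with its dependency whenever the dependency sits later. */
static void sort_by_dependency(struct node *start, struct node *finish)
{
	struct node *p, *q, tmp;

	for (p = start; p < finish; p++) {
again:
		q = find_dependency(start, finish, p);
		if (q && q > p) {
			tmp = *p;
			*p = *q;
			*q = tmp;
			goto again;
		}
	}
}

int main(void)
{
	/* amd depends on gart, gart depends on swiotlb; listed backwards. */
	struct node table[] = {
		{ "amd",     det_amd,     det_gart },
		{ "gart",    det_gart,    det_swiotlb },
		{ "swiotlb", det_swiotlb, NULL },
	};
	int i;

	sort_by_dependency(table, table + 3);
	for (i = 0; i < 3; i++)
		printf("%d: %s\n", i, table[i].name);
	return 0;
}
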
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index a5bc528d4328..8f972cbddef0 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -10,7 +10,8 @@
10#include <asm/iommu.h> 10#include <asm/iommu.h>
11#include <asm/swiotlb.h> 11#include <asm/swiotlb.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13#include <asm/xen/swiotlb-xen.h>
14#include <asm/iommu_table.h>
14int swiotlb __read_mostly; 15int swiotlb __read_mostly;
15 16
16static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 17static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -41,25 +42,42 @@ static struct dma_map_ops swiotlb_dma_ops = {
41}; 42};
42 43
43/* 44/*
44 * pci_swiotlb_detect - set swiotlb to 1 if necessary 45 * pci_swiotlb_detect_override - set swiotlb to 1 if necessary
45 * 46 *
46 * This returns non-zero if we are forced to use swiotlb (by the boot 47 * This returns non-zero if we are forced to use swiotlb (by the boot
47 * option). 48 * option).
48 */ 49 */
49int __init pci_swiotlb_detect(void) 50int __init pci_swiotlb_detect_override(void)
50{ 51{
51 int use_swiotlb = swiotlb | swiotlb_force; 52 int use_swiotlb = swiotlb | swiotlb_force;
52 53
54 if (swiotlb_force)
55 swiotlb = 1;
56
57 return use_swiotlb;
58}
59IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
60 pci_xen_swiotlb_detect,
61 pci_swiotlb_init,
62 pci_swiotlb_late_init);
63
64/*
65 * if 4GB or more detected (and iommu=off not set) return 1
66 * and set swiotlb to 1.
67 */
68int __init pci_swiotlb_detect_4gb(void)
69{
53 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 70 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
54#ifdef CONFIG_X86_64 71#ifdef CONFIG_X86_64
55 if (!no_iommu && max_pfn > MAX_DMA32_PFN) 72 if (!no_iommu && max_pfn > MAX_DMA32_PFN)
56 swiotlb = 1; 73 swiotlb = 1;
57#endif 74#endif
58 if (swiotlb_force) 75 return swiotlb;
59 swiotlb = 1;
60
61 return use_swiotlb;
62} 76}
77IOMMU_INIT(pci_swiotlb_detect_4gb,
78 pci_swiotlb_detect_override,
79 pci_swiotlb_init,
80 pci_swiotlb_late_init);
63 81
64void __init pci_swiotlb_init(void) 82void __init pci_swiotlb_init(void)
65{ 83{
@@ -68,3 +86,15 @@ void __init pci_swiotlb_init(void)
68 dma_ops = &swiotlb_dma_ops; 86 dma_ops = &swiotlb_dma_ops;
69 } 87 }
70} 88}
89
90void __init pci_swiotlb_late_init(void)
91{
92 /* An IOMMU turned us off. */
93 if (!swiotlb)
94 swiotlb_free();
95 else {
96 printk(KERN_INFO "PCI-DMA: "
97 "Using software bounce buffering for IO (SWIOTLB)\n");
98 swiotlb_print_info();
99 }
100}
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
deleted file mode 100644
index b112406f1996..000000000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/* Ported over from i386 by AK, original copyright was:
2 *
3 * (C) Dominik Brodowski <linux@brodo.de> 2003
4 *
5 * Driver to use the Power Management Timer (PMTMR) available in some
6 * southbridges as primary timing source for the Linux kernel.
7 *
8 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
9 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
10 *
11 * This file is licensed under the GPL v2.
12 *
13 * Dropped all the hardware bug workarounds for now. Hopefully they
14 * are not needed on 64bit chipsets.
15 */
16
17#include <linux/jiffies.h>
18#include <linux/kernel.h>
19#include <linux/time.h>
20#include <linux/init.h>
21#include <linux/cpumask.h>
22#include <linux/acpi_pmtmr.h>
23
24#include <asm/io.h>
25#include <asm/proto.h>
26#include <asm/msr.h>
27#include <asm/vsyscall.h>
28
29static inline u32 cyc2us(u32 cycles)
30{
31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
32 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
33 *
34 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
35 * easily be multiplied with 286 (=0x11E) without having to fear
36 * u32 overflows.
37 */
38 cycles *= 286;
39 return (cycles >> 10);
40}
41
42static unsigned pmtimer_wait_tick(void)
43{
44 u32 a, b;
45 for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
46 a == b;
47 b = inl(pmtmr_ioport) & ACPI_PM_MASK)
48 cpu_relax();
49 return b;
50}
51
52/* note: wait time is rounded up to one tick */
53void pmtimer_wait(unsigned us)
54{
55 u32 a, b;
56 a = pmtimer_wait_tick();
57 do {
58 b = inl(pmtmr_ioport);
59 cpu_relax();
60 } while (cyc2us(b - a) < us);
61}
62
63static int __init nopmtimer_setup(char *s)
64{
65 pmtmr_ioport = 0;
66 return 1;
67}
68
69__setup("nopmtimer", nopmtimer_setup);
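
cyc2us() in the deleted pmtimer_64.c converts ACPI PM-timer ticks (3.579545 MHz) to microseconds with the fixed-point approximation 1/3.579545 ≈ 286/1024, which keeps the multiply inside 32 bits for the tick deltas involved. A quick sketch comparing the approximation with the exact division; the sample values are chosen only for illustration:

#include <stdio.h>
#include <stdint.h>

#define PMTMR_TICKS_PER_US 3.579545	/* ACPI PM timer frequency in MHz */

/* Fixed-point version used by the deleted cyc2us(): us ~= cycles * 286 / 1024 */
static uint32_t cyc2us_approx(uint32_t cycles)
{
	return (cycles * 286u) >> 10;
}

int main(void)
{
	uint32_t samples[] = { 100, 3580, 35796, 1000000 };
	int i;

	for (i = 0; i < 4; i++) {
		uint32_t c = samples[i];
		double exact = c / PMTMR_TICKS_PER_US;

		printf("%8u ticks: approx %8u us, exact %10.2f us\n",
		       c, cyc2us_approx(c), exact);
	}
	return 0;
}
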
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e5..ba0a4cce53be 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -73,6 +73,107 @@ static struct resource video_rom_resource = {
73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM 73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
74}; 74};
75 75
76/* does this oprom support the given pci device, or any of the devices
77 * that the driver supports?
78 */
79static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
80{
81 struct pci_driver *drv = pdev->driver;
82 const struct pci_device_id *id;
83
84 if (pdev->vendor == vendor && pdev->device == device)
85 return true;
86
87 for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
88 if (id->vendor == vendor && id->device == device)
89 break;
90
91 return id && id->vendor;
92}
93
94static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
95 const unsigned char *rom_list)
96{
97 unsigned short device;
98
99 do {
100 if (probe_kernel_address(rom_list, device) != 0)
101 device = 0;
102
103 if (device && match_id(pdev, vendor, device))
104 break;
105
106 rom_list += 2;
107 } while (device);
108
109 return !!device;
110}
111
112static struct resource *find_oprom(struct pci_dev *pdev)
113{
114 struct resource *oprom = NULL;
115 int i;
116
117 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
118 struct resource *res = &adapter_rom_resources[i];
119 unsigned short offset, vendor, device, list, rev;
120 const unsigned char *rom;
121
122 if (res->end == 0)
123 break;
124
125 rom = isa_bus_to_virt(res->start);
126 if (probe_kernel_address(rom + 0x18, offset) != 0)
127 continue;
128
129 if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
130 continue;
131
132 if (probe_kernel_address(rom + offset + 0x6, device) != 0)
133 continue;
134
135 if (match_id(pdev, vendor, device)) {
136 oprom = res;
137 break;
138 }
139
140 if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
141 probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
142 rev >= 3 && list &&
143 probe_list(pdev, vendor, rom + offset + list)) {
144 oprom = res;
145 break;
146 }
147 }
148
149 return oprom;
150}
151
152void *pci_map_biosrom(struct pci_dev *pdev)
153{
154 struct resource *oprom = find_oprom(pdev);
155
156 if (!oprom)
157 return NULL;
158
159 return ioremap(oprom->start, resource_size(oprom));
160}
161EXPORT_SYMBOL(pci_map_biosrom);
162
163void pci_unmap_biosrom(void __iomem *image)
164{
165 iounmap(image);
166}
167EXPORT_SYMBOL(pci_unmap_biosrom);
168
169size_t pci_biosrom_size(struct pci_dev *pdev)
170{
171 struct resource *oprom = find_oprom(pdev);
172
173 return oprom ? resource_size(oprom) : 0;
174}
175EXPORT_SYMBOL(pci_biosrom_size);
176
76#define ROMSIGNATURE 0xaa55 177#define ROMSIGNATURE 0xaa55
77 178
78static int __init romsignature(const unsigned char *rom) 179static int __init romsignature(const unsigned char *rom)
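
find_oprom() in the new probe_roms.c code above matches an option ROM to a PCI device by following the 16-bit word at offset 0x18 to the ROM's PCI data structure and comparing the vendor and device IDs at +0x4 and +0x6 (falling back to the device list when present). A userspace sketch of that field walk over a synthetic ROM image; read_le16() stands in for the probe_kernel_address() reads, and the buffer contents are invented:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Unaligned little-endian 16-bit read (probe_kernel_address() stand-in). */
static uint16_t read_le16(const uint8_t *p)
{
	return (uint16_t)(p[0] | (p[1] << 8));
}

/*
 * Walk the same fields find_oprom() reads: the word at 0x18 locates the
 * PCI data structure, whose vendor and device IDs sit at +0x4 and +0x6.
 */
static int rom_matches(const uint8_t *rom, uint16_t vendor, uint16_t device)
{
	uint16_t pcir = read_le16(rom + 0x18);

	return read_le16(rom + pcir + 0x4) == vendor &&
	       read_le16(rom + pcir + 0x6) == device;
}

int main(void)
{
	uint8_t rom[64];

	memset(rom, 0, sizeof(rom));
	rom[0] = 0x55; rom[1] = 0xaa;		/* ROM signature */
	rom[0x18] = 0x20;			/* PCI data structure at 0x20 */
	memcpy(&rom[0x20], "PCIR", 4);
	rom[0x24] = 0x86; rom[0x25] = 0x80;	/* vendor 0x8086 */
	rom[0x26] = 0x34; rom[0x27] = 0x12;	/* device 0x1234 */

	printf("match: %d\n", rom_matches(rom, 0x8086, 0x1234));
	return 0;
}
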
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868a86aa..e1ba8cb24e4e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h> 16#include <linux/hw_breakpoint.h>
17#include <asm/cpu.h>
17#include <asm/system.h> 18#include <asm/system.h>
18#include <asm/apic.h> 19#include <asm/apic.h>
19#include <asm/syscalls.h> 20#include <asm/syscalls.h>
@@ -22,11 +23,6 @@
22#include <asm/i387.h> 23#include <asm/i387.h>
23#include <asm/debugreg.h> 24#include <asm/debugreg.h>
24 25
25unsigned long idle_halt;
26EXPORT_SYMBOL(idle_halt);
27unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait);
29
30struct kmem_cache *task_xstate_cachep; 26struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep); 27EXPORT_SYMBOL_GPL(task_xstate_cachep);
32 28
@@ -91,27 +87,33 @@ void exit_thread(void)
91void show_regs(struct pt_regs *regs) 87void show_regs(struct pt_regs *regs)
92{ 88{
93 show_registers(regs); 89 show_registers(regs);
94 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 90 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
95 regs->bp);
96} 91}
97 92
98void show_regs_common(void) 93void show_regs_common(void)
99{ 94{
100 const char *board, *product; 95 const char *vendor, *product, *board;
101 96
102 board = dmi_get_system_info(DMI_BOARD_NAME); 97 vendor = dmi_get_system_info(DMI_SYS_VENDOR);
103 if (!board) 98 if (!vendor)
104 board = ""; 99 vendor = "";
105 product = dmi_get_system_info(DMI_PRODUCT_NAME); 100 product = dmi_get_system_info(DMI_PRODUCT_NAME);
106 if (!product) 101 if (!product)
107 product = ""; 102 product = "";
108 103
104 /* Board Name is optional */
105 board = dmi_get_system_info(DMI_BOARD_NAME);
106
109 printk(KERN_CONT "\n"); 107 printk(KERN_CONT "\n");
110 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", 108 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s",
111 current->pid, current->comm, print_tainted(), 109 current->pid, current->comm, print_tainted(),
112 init_utsname()->release, 110 init_utsname()->release,
113 (int)strcspn(init_utsname()->version, " "), 111 (int)strcspn(init_utsname()->version, " "),
114 init_utsname()->version, board, product); 112 init_utsname()->version);
113 printk(KERN_CONT " %s %s", vendor, product);
114 if (board)
115 printk(KERN_CONT "/%s", board);
116 printk(KERN_CONT "\n");
115} 117}
116 118
117void flush_thread(void) 119void flush_thread(void)
@@ -328,14 +330,16 @@ long sys_execve(const char __user *name,
328/* 330/*
329 * Idle related variables and functions 331 * Idle related variables and functions
330 */ 332 */
331unsigned long boot_option_idle_override = 0; 333unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
332EXPORT_SYMBOL(boot_option_idle_override); 334EXPORT_SYMBOL(boot_option_idle_override);
333 335
334/* 336/*
335 * Powermanagement idle function, if any.. 337 * Powermanagement idle function, if any..
336 */ 338 */
337void (*pm_idle)(void); 339void (*pm_idle)(void);
340#ifdef CONFIG_APM_MODULE
338EXPORT_SYMBOL(pm_idle); 341EXPORT_SYMBOL(pm_idle);
342#endif
339 343
340#ifdef CONFIG_X86_32 344#ifdef CONFIG_X86_32
341/* 345/*
@@ -374,6 +378,7 @@ void default_idle(void)
374{ 378{
375 if (hlt_use_halt()) { 379 if (hlt_use_halt()) {
376 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 380 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
381 trace_cpu_idle(1, smp_processor_id());
377 current_thread_info()->status &= ~TS_POLLING; 382 current_thread_info()->status &= ~TS_POLLING;
378 /* 383 /*
379 * TS_POLLING-cleared state must be visible before we 384 * TS_POLLING-cleared state must be visible before we
@@ -386,6 +391,8 @@ void default_idle(void)
386 else 391 else
387 local_irq_enable(); 392 local_irq_enable();
388 current_thread_info()->status |= TS_POLLING; 393 current_thread_info()->status |= TS_POLLING;
394 trace_power_end(smp_processor_id());
395 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
389 } else { 396 } else {
390 local_irq_enable(); 397 local_irq_enable();
391 /* loop is done by the caller */ 398 /* loop is done by the caller */
@@ -443,9 +450,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
443 */ 450 */
444void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 451void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
445{ 452{
446 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
447 if (!need_resched()) { 453 if (!need_resched()) {
448 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 454 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
449 clflush((void *)&current_thread_info()->flags); 455 clflush((void *)&current_thread_info()->flags);
450 456
451 __monitor((void *)&current_thread_info()->flags, 0, 0); 457 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -460,7 +466,8 @@ static void mwait_idle(void)
460{ 466{
461 if (!need_resched()) { 467 if (!need_resched()) {
462 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 468 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
463 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 469 trace_cpu_idle(1, smp_processor_id());
470 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
464 clflush((void *)&current_thread_info()->flags); 471 clflush((void *)&current_thread_info()->flags);
465 472
466 __monitor((void *)&current_thread_info()->flags, 0, 0); 473 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -469,6 +476,8 @@ static void mwait_idle(void)
469 __sti_mwait(0, 0); 476 __sti_mwait(0, 0);
470 else 477 else
471 local_irq_enable(); 478 local_irq_enable();
479 trace_power_end(smp_processor_id());
480 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
472 } else 481 } else
473 local_irq_enable(); 482 local_irq_enable();
474} 483}
@@ -481,10 +490,12 @@ static void mwait_idle(void)
481static void poll_idle(void) 490static void poll_idle(void)
482{ 491{
483 trace_power_start(POWER_CSTATE, 0, smp_processor_id()); 492 trace_power_start(POWER_CSTATE, 0, smp_processor_id());
493 trace_cpu_idle(0, smp_processor_id());
484 local_irq_enable(); 494 local_irq_enable();
485 while (!need_resched()) 495 while (!need_resched())
486 cpu_relax(); 496 cpu_relax();
487 trace_power_end(0); 497 trace_power_end(smp_processor_id());
498 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
488} 499}
489 500
490/* 501/*
@@ -499,17 +510,16 @@ static void poll_idle(void)
499 * 510 *
500 * idle=mwait overrides this decision and forces the usage of mwait. 511 * idle=mwait overrides this decision and forces the usage of mwait.
501 */ 512 */
502static int __cpuinitdata force_mwait;
503 513
504#define MWAIT_INFO 0x05 514#define MWAIT_INFO 0x05
505#define MWAIT_ECX_EXTENDED_INFO 0x01 515#define MWAIT_ECX_EXTENDED_INFO 0x01
506#define MWAIT_EDX_C1 0xf0 516#define MWAIT_EDX_C1 0xf0
507 517
508static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) 518int mwait_usable(const struct cpuinfo_x86 *c)
509{ 519{
510 u32 eax, ebx, ecx, edx; 520 u32 eax, ebx, ecx, edx;
511 521
512 if (force_mwait) 522 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
513 return 1; 523 return 1;
514 524
515 if (c->cpuid_level < MWAIT_INFO) 525 if (c->cpuid_level < MWAIT_INFO)
@@ -527,45 +537,45 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
527 return (edx & MWAIT_EDX_C1); 537 return (edx & MWAIT_EDX_C1);
528} 538}
529 539
530bool c1e_detected; 540bool amd_e400_c1e_detected;
531EXPORT_SYMBOL(c1e_detected); 541EXPORT_SYMBOL(amd_e400_c1e_detected);
532 542
533static cpumask_var_t c1e_mask; 543static cpumask_var_t amd_e400_c1e_mask;
534 544
535void c1e_remove_cpu(int cpu) 545void amd_e400_remove_cpu(int cpu)
536{ 546{
537 if (c1e_mask != NULL) 547 if (amd_e400_c1e_mask != NULL)
538 cpumask_clear_cpu(cpu, c1e_mask); 548 cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
539} 549}
540 550
541/* 551/*
542 * C1E aware idle routine. We check for C1E active in the interrupt 552 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
543 * pending message MSR. If we detect C1E, then we handle it the same 553 * pending message MSR. If we detect C1E, then we handle it the same
544 * way as C3 power states (local apic timer and TSC stop) 554 * way as C3 power states (local apic timer and TSC stop)
545 */ 555 */
546static void c1e_idle(void) 556static void amd_e400_idle(void)
547{ 557{
548 if (need_resched()) 558 if (need_resched())
549 return; 559 return;
550 560
551 if (!c1e_detected) { 561 if (!amd_e400_c1e_detected) {
552 u32 lo, hi; 562 u32 lo, hi;
553 563
554 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 564 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
555 565
556 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 566 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
557 c1e_detected = true; 567 amd_e400_c1e_detected = true;
558 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 568 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
559 mark_tsc_unstable("TSC halt in AMD C1E"); 569 mark_tsc_unstable("TSC halt in AMD C1E");
560 printk(KERN_INFO "System has AMD C1E enabled\n"); 570 printk(KERN_INFO "System has AMD C1E enabled\n");
561 } 571 }
562 } 572 }
563 573
564 if (c1e_detected) { 574 if (amd_e400_c1e_detected) {
565 int cpu = smp_processor_id(); 575 int cpu = smp_processor_id();
566 576
567 if (!cpumask_test_cpu(cpu, c1e_mask)) { 577 if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
568 cpumask_set_cpu(cpu, c1e_mask); 578 cpumask_set_cpu(cpu, amd_e400_c1e_mask);
569 /* 579 /*
570 * Force broadcast so ACPI can not interfere. 580 * Force broadcast so ACPI can not interfere.
571 */ 581 */
@@ -608,17 +618,17 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
608 pm_idle = mwait_idle; 618 pm_idle = mwait_idle;
609 } else if (cpu_has_amd_erratum(amd_erratum_400)) { 619 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
610 /* E400: APIC timer interrupt does not wake up CPU from C1e */ 620 /* E400: APIC timer interrupt does not wake up CPU from C1e */
611 printk(KERN_INFO "using C1E aware idle routine\n"); 621 printk(KERN_INFO "using AMD E400 aware idle routine\n");
612 pm_idle = c1e_idle; 622 pm_idle = amd_e400_idle;
613 } else 623 } else
614 pm_idle = default_idle; 624 pm_idle = default_idle;
615} 625}
616 626
617void __init init_c1e_mask(void) 627void __init init_amd_e400_c1e_mask(void)
618{ 628{
619 /* If we're using c1e_idle, we need to allocate c1e_mask. */ 629 /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
620 if (pm_idle == c1e_idle) 630 if (pm_idle == amd_e400_idle)
621 zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); 631 zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
622} 632}
623 633
624static int __init idle_setup(char *str) 634static int __init idle_setup(char *str)
@@ -629,9 +639,11 @@ static int __init idle_setup(char *str)
629 if (!strcmp(str, "poll")) { 639 if (!strcmp(str, "poll")) {
630 printk("using polling idle threads.\n"); 640 printk("using polling idle threads.\n");
631 pm_idle = poll_idle; 641 pm_idle = poll_idle;
632 } else if (!strcmp(str, "mwait")) 642 boot_option_idle_override = IDLE_POLL;
633 force_mwait = 1; 643 } else if (!strcmp(str, "mwait")) {
634 else if (!strcmp(str, "halt")) { 644 boot_option_idle_override = IDLE_FORCE_MWAIT;
645 WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
646 } else if (!strcmp(str, "halt")) {
635 /* 647 /*
636 * When the boot option of idle=halt is added, halt is 648 * When the boot option of idle=halt is added, halt is
637 * forced to be used for CPU idle. In such case CPU C2/C3 649 * forced to be used for CPU idle. In such case CPU C2/C3
@@ -640,8 +652,7 @@ static int __init idle_setup(char *str)
640 * the boot_option_idle_override. 652 * the boot_option_idle_override.
641 */ 653 */
642 pm_idle = default_idle; 654 pm_idle = default_idle;
643 idle_halt = 1; 655 boot_option_idle_override = IDLE_HALT;
644 return 0;
645 } else if (!strcmp(str, "nomwait")) { 656 } else if (!strcmp(str, "nomwait")) {
646 /* 657 /*
647 * If the boot option of "idle=nomwait" is added, 658 * If the boot option of "idle=nomwait" is added,
@@ -649,12 +660,10 @@ static int __init idle_setup(char *str)
649 * states. In such case it won't touch the variable 660 * states. In such case it won't touch the variable
650 * of boot_option_idle_override. 661 * of boot_option_idle_override.
651 */ 662 */
652 idle_nomwait = 1; 663 boot_option_idle_override = IDLE_NOMWAIT;
653 return 0;
654 } else 664 } else
655 return -1; 665 return -1;
656 666
657 boot_option_idle_override = 1;
658 return 0; 667 return 0;
659} 668}
660early_param("idle", idle_setup); 669early_param("idle", idle_setup);
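A small userspace sketch of the consolidation this hunk performs: the separate idle_halt/idle_nomwait/force_mwait flags collapse into a single boot_option_idle_override enum set by the idle= parser. The enum constant names are taken from the diff; the surrounding scaffolding is illustrative, not the kernel code.

#include <stdio.h>
#include <string.h>

enum idle_boot_override { IDLE_NO_OVERRIDE = 0, IDLE_HALT, IDLE_NOMWAIT,
                          IDLE_POLL, IDLE_FORCE_MWAIT };

static enum idle_boot_override boot_option_idle_override = IDLE_NO_OVERRIDE;

static int idle_setup(const char *str)
{
        if (!str)
                return -1;
        if (!strcmp(str, "poll"))
                boot_option_idle_override = IDLE_POLL;
        else if (!strcmp(str, "mwait"))
                boot_option_idle_override = IDLE_FORCE_MWAIT;   /* deprecated */
        else if (!strcmp(str, "halt"))
                boot_option_idle_override = IDLE_HALT;
        else if (!strcmp(str, "nomwait"))
                boot_option_idle_override = IDLE_NOMWAIT;
        else
                return -1;
        return 0;
}

int main(void)
{
        const char *opts[] = { "poll", "mwait", "halt", "nomwait", "bogus" };

        for (unsigned int i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
                int rc = idle_setup(opts[i]);
                printf("idle=%-7s -> rc=%d override=%d\n",
                       opts[i], rc, (int)boot_option_idle_override);
        }
        return 0;
}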
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 96586c3cbbbf..a3d0dc59067b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,8 +57,6 @@
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/debugreg.h> 58#include <asm/debugreg.h>
59 59
60#include <trace/events/power.h>
61
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 61
64/* 62/*
@@ -113,8 +111,6 @@ void cpu_idle(void)
113 stop_critical_timings(); 111 stop_critical_timings();
114 pm_idle(); 112 pm_idle();
115 start_critical_timings(); 113 start_critical_timings();
116
117 trace_power_end(smp_processor_id());
118 } 114 }
119 tick_nohz_restart_sched_tick(); 115 tick_nohz_restart_sched_tick();
120 preempt_enable_no_resched(); 116 preempt_enable_no_resched();
@@ -249,7 +245,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
249{ 245{
250 set_user_gs(regs, 0); 246 set_user_gs(regs, 0);
251 regs->fs = 0; 247 regs->fs = 0;
252 set_fs(USER_DS);
253 regs->ds = __USER_DS; 248 regs->ds = __USER_DS;
254 regs->es = __USER_DS; 249 regs->es = __USER_DS;
255 regs->ss = __USER_DS; 250 regs->ss = __USER_DS;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d9ea531ddd1..ca6f7ab8df33 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,8 +51,6 @@
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/debugreg.h> 52#include <asm/debugreg.h>
53 53
54#include <trace/events/power.h>
55
56asmlinkage extern void ret_from_fork(void); 54asmlinkage extern void ret_from_fork(void);
57 55
58DEFINE_PER_CPU(unsigned long, old_rsp); 56DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -141,8 +139,6 @@ void cpu_idle(void)
141 pm_idle(); 139 pm_idle();
142 start_critical_timings(); 140 start_critical_timings();
143 141
144 trace_power_end(smp_processor_id());
145
146 /* In many cases the interrupt that ended idle 142 /* In many cases the interrupt that ended idle
147 has already called exit_idle. But some idle 143 has already called exit_idle. But some idle
148 loops can be woken up without interrupt. */ 144 loops can be woken up without interrupt. */
@@ -342,7 +338,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
342 regs->cs = _cs; 338 regs->cs = _cs;
343 regs->ss = _ss; 339 regs->ss = _ss;
344 regs->flags = X86_EFLAGS_IF; 340 regs->flags = X86_EFLAGS_IF;
345 set_fs(USER_DS);
346 /* 341 /*
347 * Free the old FP and other extended state 342 * Free the old FP and other extended state
348 */ 343 */
@@ -424,7 +419,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
424 load_TLS(next, cpu); 419 load_TLS(next, cpu);
425 420
426 /* Must be after DS reload */ 421 /* Must be after DS reload */
427 unlazy_fpu(prev_p); 422 __unlazy_fpu(prev_p);
428 423
429 /* Make sure cpu is ready for new context */ 424 /* Make sure cpu is ready for new context */
430 if (preload_fpu) 425 if (preload_fpu)
@@ -505,6 +500,10 @@ void set_personality_64bit(void)
505 /* Make sure to be in 64bit mode */ 500 /* Make sure to be in 64bit mode */
506 clear_thread_flag(TIF_IA32); 501 clear_thread_flag(TIF_IA32);
507 502
503 /* Ensure the corresponding mm is not marked. */
504 if (current->mm)
505 current->mm->context.ia32_compat = 0;
506
508 /* TBD: overwrites user setup. Should have two bits. 507 /* TBD: overwrites user setup. Should have two bits.
509 But 64bit processes have always behaved this way, 508 But 64bit processes have always behaved this way,
510 so it's not too bad. The main problem is just that 509 so it's not too bad. The main problem is just that
@@ -520,6 +519,10 @@ void set_personality_ia32(void)
520 set_thread_flag(TIF_IA32); 519 set_thread_flag(TIF_IA32);
521 current->personality |= force_personality32; 520 current->personality |= force_personality32;
522 521
522 /* Mark the associated mm as containing 32-bit tasks. */
523 if (current->mm)
524 current->mm->context.ia32_compat = 1;
525
523 /* Prepare the first "return" to user space */ 526 /* Prepare the first "return" to user space */
524 current_thread_info()->status |= TS_COMPAT; 527 current_thread_info()->status |= TS_COMPAT;
525} 528}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 70c4872cd8aa..807c2a2b80f1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
608 unsigned len, type; 608 unsigned len, type;
609 struct perf_event *bp; 609 struct perf_event *bp;
610 610
611 if (ptrace_get_breakpoints(tsk) < 0)
612 return -ESRCH;
613
611 data &= ~DR_CONTROL_RESERVED; 614 data &= ~DR_CONTROL_RESERVED;
612 old_dr7 = ptrace_get_dr7(thread->ptrace_bps); 615 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
613restore: 616restore:
@@ -655,6 +658,9 @@ restore:
655 } 658 }
656 goto restore; 659 goto restore;
657 } 660 }
661
662 ptrace_put_breakpoints(tsk);
663
658 return ((orig_ret < 0) ? orig_ret : rc); 664 return ((orig_ret < 0) ? orig_ret : rc);
659} 665}
660 666
@@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
668 674
669 if (n < HBP_NUM) { 675 if (n < HBP_NUM) {
670 struct perf_event *bp; 676 struct perf_event *bp;
677
678 if (ptrace_get_breakpoints(tsk) < 0)
679 return -ESRCH;
680
671 bp = thread->ptrace_bps[n]; 681 bp = thread->ptrace_bps[n];
672 if (!bp) 682 if (!bp)
673 return 0; 683 val = 0;
674 val = bp->hw.info.address; 684 else
685 val = bp->hw.info.address;
686
687 ptrace_put_breakpoints(tsk);
675 } else if (n == 6) { 688 } else if (n == 6) {
676 val = thread->debugreg6; 689 val = thread->debugreg6;
677 } else if (n == 7) { 690 } else if (n == 7) {
@@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
686 struct perf_event *bp; 699 struct perf_event *bp;
687 struct thread_struct *t = &tsk->thread; 700 struct thread_struct *t = &tsk->thread;
688 struct perf_event_attr attr; 701 struct perf_event_attr attr;
702 int err = 0;
703
704 if (ptrace_get_breakpoints(tsk) < 0)
705 return -ESRCH;
689 706
690 if (!t->ptrace_bps[nr]) { 707 if (!t->ptrace_bps[nr]) {
691 ptrace_breakpoint_init(&attr); 708 ptrace_breakpoint_init(&attr);
@@ -709,24 +726,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
709 * writing for the user. And anyway this is the previous 726 * writing for the user. And anyway this is the previous
710 * behaviour. 727 * behaviour.
711 */ 728 */
712 if (IS_ERR(bp)) 729 if (IS_ERR(bp)) {
713 return PTR_ERR(bp); 730 err = PTR_ERR(bp);
731 goto put;
732 }
714 733
715 t->ptrace_bps[nr] = bp; 734 t->ptrace_bps[nr] = bp;
716 } else { 735 } else {
717 int err;
718
719 bp = t->ptrace_bps[nr]; 736 bp = t->ptrace_bps[nr];
720 737
721 attr = bp->attr; 738 attr = bp->attr;
722 attr.bp_addr = addr; 739 attr.bp_addr = addr;
723 err = modify_user_hw_breakpoint(bp, &attr); 740 err = modify_user_hw_breakpoint(bp, &attr);
724 if (err)
725 return err;
726 } 741 }
727 742
728 743put:
729 return 0; 744 ptrace_put_breakpoints(tsk);
745 return err;
730} 746}
731 747
732/* 748/*
@@ -801,7 +817,8 @@ void ptrace_disable(struct task_struct *child)
801static const struct user_regset_view user_x86_32_view; /* Initialized below. */ 817static const struct user_regset_view user_x86_32_view; /* Initialized below. */
802#endif 818#endif
803 819
804long arch_ptrace(struct task_struct *child, long request, long addr, long data) 820long arch_ptrace(struct task_struct *child, long request,
821 unsigned long addr, unsigned long data)
805{ 822{
806 int ret; 823 int ret;
807 unsigned long __user *datap = (unsigned long __user *)data; 824 unsigned long __user *datap = (unsigned long __user *)data;
@@ -812,8 +829,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
812 unsigned long tmp; 829 unsigned long tmp;
813 830
814 ret = -EIO; 831 ret = -EIO;
815 if ((addr & (sizeof(data) - 1)) || addr < 0 || 832 if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
816 addr >= sizeof(struct user))
817 break; 833 break;
818 834
819 tmp = 0; /* Default return condition */ 835 tmp = 0; /* Default return condition */
@@ -830,8 +846,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
830 846
831 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ 847 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
832 ret = -EIO; 848 ret = -EIO;
833 if ((addr & (sizeof(data) - 1)) || addr < 0 || 849 if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
834 addr >= sizeof(struct user))
835 break; 850 break;
836 851
837 if (addr < sizeof(struct user_regs_struct)) 852 if (addr < sizeof(struct user_regs_struct))
@@ -888,17 +903,17 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
888 903
889#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 904#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
890 case PTRACE_GET_THREAD_AREA: 905 case PTRACE_GET_THREAD_AREA:
891 if (addr < 0) 906 if ((int) addr < 0)
892 return -EIO; 907 return -EIO;
893 ret = do_get_thread_area(child, addr, 908 ret = do_get_thread_area(child, addr,
894 (struct user_desc __user *) data); 909 (struct user_desc __user *)data);
895 break; 910 break;
896 911
897 case PTRACE_SET_THREAD_AREA: 912 case PTRACE_SET_THREAD_AREA:
898 if (addr < 0) 913 if ((int) addr < 0)
899 return -EIO; 914 return -EIO;
900 ret = do_set_thread_area(child, addr, 915 ret = do_set_thread_area(child, addr,
901 (struct user_desc __user *) data, 0); 916 (struct user_desc __user *)data, 0);
902 break; 917 break;
903#endif 918#endif
904 919
@@ -1348,7 +1363,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1348 * We must return the syscall number to actually look up in the table. 1363 * We must return the syscall number to actually look up in the table.
1349 * This can be -1L to skip running any syscall at all. 1364 * This can be -1L to skip running any syscall at all.
1350 */ 1365 */
1351asmregparm long syscall_trace_enter(struct pt_regs *regs) 1366long syscall_trace_enter(struct pt_regs *regs)
1352{ 1367{
1353 long ret = 0; 1368 long ret = 0;
1354 1369
@@ -1393,7 +1408,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1393 return ret ?: regs->orig_ax; 1408 return ret ?: regs->orig_ax;
1394} 1409}
1395 1410
1396asmregparm void syscall_trace_leave(struct pt_regs *regs) 1411void syscall_trace_leave(struct pt_regs *regs)
1397{ 1412{
1398 bool step; 1413 bool step;
1399 1414
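The ptrace hunks above bracket every access to thread->ptrace_bps with ptrace_get_breakpoints()/ptrace_put_breakpoints() and funnel all exits through one release point. The sketch below only models that acquire/release-with-single-exit pattern; the function names and error values are stand-ins, not the kernel API.

#include <stdio.h>

static int refcount;

static int get_breakpoints(void)  { refcount++; return 0; }
static void put_breakpoints(void) { refcount--; }

static int set_breakpoint_addr(int nr, unsigned long addr)
{
        int err = 0;

        if (get_breakpoints() < 0)
                return -3;                      /* -ESRCH in the kernel */

        if (nr < 0 || nr > 3) {
                err = -22;                      /* -EINVAL */
                goto put;
        }
        printf("bp%d -> %#lx\n", nr, addr);
put:
        put_breakpoints();                      /* single release point */
        return err;
}

int main(void)
{
        set_breakpoint_addr(1, 0x1000);
        set_breakpoint_addr(9, 0x2000);
        printf("refcount balanced: %d\n", refcount);
        return 0;
}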
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 239427ca02af..42eb3300dfc6 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -41,48 +41,11 @@ void pvclock_set_flags(u8 flags)
41 valid_flags = flags; 41 valid_flags = flags;
42} 42}
43 43
44/*
45 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
46 * yielding a 64-bit result.
47 */
48static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
49{
50 u64 product;
51#ifdef __i386__
52 u32 tmp1, tmp2;
53#endif
54
55 if (shift < 0)
56 delta >>= -shift;
57 else
58 delta <<= shift;
59
60#ifdef __i386__
61 __asm__ (
62 "mul %5 ; "
63 "mov %4,%%eax ; "
64 "mov %%edx,%4 ; "
65 "mul %5 ; "
66 "xor %5,%5 ; "
67 "add %4,%%eax ; "
68 "adc %5,%%edx ; "
69 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
70 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
71#elif defined(__x86_64__)
72 __asm__ (
73 "mul %%rdx ; shrd $32,%%rdx,%%rax"
74 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
75#else
76#error implement me!
77#endif
78
79 return product;
80}
81
82static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) 44static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
83{ 45{
84 u64 delta = native_read_tsc() - shadow->tsc_timestamp; 46 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
85 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); 47 return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
48 shadow->tsc_shift);
86} 49}
87 50
88/* 51/*
@@ -120,6 +83,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
120 83
121static atomic64_t last_value = ATOMIC64_INIT(0); 84static atomic64_t last_value = ATOMIC64_INIT(0);
122 85
86void pvclock_resume(void)
87{
88 atomic64_set(&last_value, 0);
89}
90
123cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 91cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
124{ 92{
125 struct pvclock_shadow_time shadow; 93 struct pvclock_shadow_time shadow;
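The removed scale_delta() (now the shared pvclock_scale_delta() helper) shifts the TSC delta, multiplies by a 32.32 fixed-point fraction, and keeps the upper 64 bits of the 96-bit product. The portable sketch below reproduces that arithmetic with unsigned __int128 (a GCC/Clang extension) instead of the inline assembly shown above.

#include <stdio.h>
#include <stdint.h>

static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;
        /* 64x32 -> 96-bit multiply, keep bits 32..95 */
        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
        /* e.g. a 3 GHz TSC: ~1/3 ns per cycle as a 32.32 fraction */
        uint32_t mul_frac = (uint32_t)((1ULL << 32) / 3);

        printf("%llu ns\n",
               (unsigned long long)scale_delta(3000000000ULL, mul_frac, 0));
        return 0;
}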
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 939b9e98245f..8bbe8c56916d 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -344,6 +344,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
344 vt8237_force_enable_hpet); 344 vt8237_force_enable_hpet);
345DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, 345DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
346 vt8237_force_enable_hpet); 346 vt8237_force_enable_hpet);
347DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700,
348 vt8237_force_enable_hpet);
347 349
348static void ati_force_hpet_resume(void) 350static void ati_force_hpet_resume(void)
349{ 351{
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e3af342fe83a..9242436e9937 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -6,6 +6,7 @@
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/tboot.h> 8#include <linux/tboot.h>
9#include <linux/delay.h>
9#include <acpi/reboot.h> 10#include <acpi/reboot.h>
10#include <asm/io.h> 11#include <asm/io.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
@@ -18,6 +19,7 @@
18#include <asm/pci_x86.h> 19#include <asm/pci_x86.h>
19#include <asm/virtext.h> 20#include <asm/virtext.h>
20#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <asm/nmi.h>
21 23
22#ifdef CONFIG_X86_32 24#ifdef CONFIG_X86_32
23# include <linux/ctype.h> 25# include <linux/ctype.h>
@@ -34,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off);
34 36
35static const struct desc_ptr no_idt = {}; 37static const struct desc_ptr no_idt = {};
36static int reboot_mode; 38static int reboot_mode;
37enum reboot_type reboot_type = BOOT_KBD; 39enum reboot_type reboot_type = BOOT_ACPI;
38int reboot_force; 40int reboot_force;
39 41
40#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 42#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -84,7 +86,7 @@ static int __init reboot_setup(char *str)
84 } 86 }
85 /* we will leave sorting out the final value 87 /* we will leave sorting out the final value
86 when we are ready to reboot, since we might not 88 when we are ready to reboot, since we might not
87 have set up boot_cpu_id or smp_num_cpu */ 89 have detected BSP APIC ID or smp_num_cpu */
88 break; 90 break;
89#endif /* CONFIG_SMP */ 91#endif /* CONFIG_SMP */
90 92
@@ -284,6 +286,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
284 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 286 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
285 }, 287 },
286 }, 288 },
289 { /* Handle problems with rebooting on VersaLogic Menlow boards */
290 .callback = set_bios_reboot,
291 .ident = "VersaLogic Menlow based board",
292 .matches = {
293 DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"),
294 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
295 },
296 },
297 { /* Handle reboot issue on Acer Aspire one */
298 .callback = set_bios_reboot,
299 .ident = "Acer Aspire One A110",
300 .matches = {
301 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
302 DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
303 },
304 },
287 { } 305 { }
288}; 306};
289 307
@@ -294,68 +312,16 @@ static int __init reboot_init(void)
294} 312}
295core_initcall(reboot_init); 313core_initcall(reboot_init);
296 314
297/* The following code and data reboots the machine by switching to real 315extern const unsigned char machine_real_restart_asm[];
298 mode and jumping to the BIOS reset entry point, as if the CPU has 316extern const u64 machine_real_restart_gdt[3];
299 really been reset. The previous version asked the keyboard
300 controller to pulse the CPU reset line, which is more thorough, but
301 doesn't work with at least one type of 486 motherboard. It is easy
302 to stop this code working; hence the copious comments. */
303static const unsigned long long
304real_mode_gdt_entries [3] =
305{
306 0x0000000000000000ULL, /* Null descriptor */
307 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
308 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
309};
310 317
311static const struct desc_ptr 318void machine_real_restart(unsigned int type)
312real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
313real_mode_idt = { 0x3ff, 0 };
314
315/* This is 16-bit protected mode code to disable paging and the cache,
316 switch to real mode and jump to the BIOS reset code.
317
318 The instruction that switches to real mode by writing to CR0 must be
 319 followed immediately by a far jump instruction, which sets CS to a
320 valid value for real mode, and flushes the prefetch queue to avoid
321 running instructions that have already been decoded in protected
322 mode.
323
324 Clears all the flags except ET, especially PG (paging), PE
325 (protected-mode enable) and TS (task switch for coprocessor state
326 save). Flushes the TLB after paging has been disabled. Sets CD and
327 NW, to disable the cache on a 486, and invalidates the cache. This
328 is more like the state of a 486 after reset. I don't know if
329 something else should be done for other chips.
330
331 More could be done here to set up the registers as if a CPU reset had
332 occurred; hopefully real BIOSs don't assume much. */
333static const unsigned char real_mode_switch [] =
334{ 319{
335 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ 320 void *restart_va;
336 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ 321 unsigned long restart_pa;
337 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */ 322 void (*restart_lowmem)(unsigned int);
338 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */ 323 u64 *lowmem_gdt;
339 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
340 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
341 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
342 0x74, 0x02, /* jz f */
343 0x0f, 0x09, /* wbinvd */
344 0x24, 0x10, /* f: andb $0x10,al */
345 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
346};
347static const unsigned char jump_to_bios [] =
348{
349 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
350};
351 324
352/*
353 * Switch to real mode and then execute the code
354 * specified by the code and length parameters.
 355 * We assume that length will always be less than 100!
356 */
357void machine_real_restart(const unsigned char *code, int length)
358{
359 local_irq_disable(); 325 local_irq_disable();
360 326
361 /* Write zero to CMOS register number 0x0f, which the BIOS POST 327 /* Write zero to CMOS register number 0x0f, which the BIOS POST
@@ -371,16 +337,10 @@ void machine_real_restart(const unsigned char *code, int length)
371 CMOS_WRITE(0x00, 0x8f); 337 CMOS_WRITE(0x00, 0x8f);
372 spin_unlock(&rtc_lock); 338 spin_unlock(&rtc_lock);
373 339
374 /* Remap the kernel at virtual address zero, as well as offset zero
375 from the kernel segment. This assumes the kernel segment starts at
376 virtual address PAGE_OFFSET. */
377 memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
378 sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
379
380 /* 340 /*
381 * Use `swapper_pg_dir' as our page directory. 341 * Switch back to the initial page table.
382 */ 342 */
383 load_cr3(swapper_pg_dir); 343 load_cr3(initial_page_table);
384 344
385 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads 345 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
386 this on booting to tell it to "Bypass memory test (also warm 346 this on booting to tell it to "Bypass memory test (also warm
@@ -389,41 +349,23 @@ void machine_real_restart(const unsigned char *code, int length)
389 too. */ 349 too. */
390 *((unsigned short *)0x472) = reboot_mode; 350 *((unsigned short *)0x472) = reboot_mode;
391 351
392 /* For the switch to real mode, copy some code to low memory. It has 352 /* Patch the GDT in the low memory trampoline */
393 to be in the first 64k because it is running in 16-bit mode, and it 353 lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
394 has to have the same physical and virtual address, because it turns 354
395 off paging. Copy it near the end of the first page, out of the way 355 restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
396 of BIOS variables. */ 356 restart_pa = virt_to_phys(restart_va);
397 memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), 357 restart_lowmem = (void (*)(unsigned int))restart_pa;
398 real_mode_switch, sizeof (real_mode_switch)); 358
399 memcpy((void *)(0x1000 - 100), code, length); 359 /* GDT[0]: GDT self-pointer */
400 360 lowmem_gdt[0] =
401 /* Set up the IDT for real mode. */ 361 (u64)(sizeof(machine_real_restart_gdt) - 1) +
402 load_idt(&real_mode_idt); 362 ((u64)virt_to_phys(lowmem_gdt) << 16);
403 363 /* GDT[1]: 64K real mode code segment */
404 /* Set up a GDT from which we can load segment descriptors for real 364 lowmem_gdt[1] =
405 mode. The GDT is not used in real mode; it is just needed here to 365 GDT_ENTRY(0x009b, restart_pa, 0xffff);
406 prepare the descriptors. */ 366
407 load_gdt(&real_mode_gdt); 367 /* Jump to the identity-mapped low memory code */
408 368 restart_lowmem(type);
409 /* Load the data segment registers, and thus the descriptors ready for
410 real mode. The base address of each segment is 0x100, 16 times the
411 selector value being loaded here. This is so that the segment
412 registers don't have to be reloaded after switching to real mode:
413 the values are consistent for real mode operation already. */
414 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
415 "\tmovl %%eax,%%ds\n"
416 "\tmovl %%eax,%%es\n"
417 "\tmovl %%eax,%%fs\n"
418 "\tmovl %%eax,%%gs\n"
419 "\tmovl %%eax,%%ss" : : : "eax");
420
421 /* Jump to the 16-bit code that we copied earlier. It disables paging
422 and the cache, switches to real mode, and jumps to the BIOS reset
423 entry point. */
424 __asm__ __volatile__ ("ljmp $0x0008,%0"
425 :
426 : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
427} 369}
428#ifdef CONFIG_APM_MODULE 370#ifdef CONFIG_APM_MODULE
429EXPORT_SYMBOL(machine_real_restart); 371EXPORT_SYMBOL(machine_real_restart);
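The rewritten machine_real_restart() above patches two descriptors in the low-memory trampoline GDT, building the 16-bit code segment with GDT_ENTRY(0x009b, restart_pa, 0xffff). The sketch below shows how such a macro scatters flags, base and limit into the descriptor fields; it is meant to illustrate the bit layout under that assumption, and the 0x98000 base is a hypothetical trampoline address.

#include <stdio.h>
#include <stdint.h>

#define GDT_ENTRY(flags, base, limit)                          \
        ((((uint64_t)(base)  & 0xff000000ULL) << 32) |         \
         (((uint64_t)(flags) & 0x0000f0ffULL) << 40) |         \
         (((uint64_t)(limit) & 0x000f0000ULL) << 32) |         \
         (((uint64_t)(base)  & 0x00ffffffULL) << 16) |         \
         (((uint64_t)(limit) & 0x0000ffffULL)))

int main(void)
{
        /* 64K 16-bit code segment based at a hypothetical trampoline address */
        uint64_t code16 = GDT_ENTRY(0x009b, 0x98000, 0xffff);

        printf("descriptor = %#018llx\n", (unsigned long long)code16);
        return 0;
}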
@@ -477,6 +419,30 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
477 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), 419 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
478 }, 420 },
479 }, 421 },
422 { /* Handle problems with rebooting on the Latitude E6320. */
423 .callback = set_pci_reboot,
424 .ident = "Dell Latitude E6320",
425 .matches = {
426 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
427 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
428 },
429 },
430 { /* Handle problems with rebooting on the Latitude E5420. */
431 .callback = set_pci_reboot,
432 .ident = "Dell Latitude E5420",
433 .matches = {
434 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
435 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
436 },
437 },
438 { /* Handle problems with rebooting on the Latitude E6420. */
439 .callback = set_pci_reboot,
440 .ident = "Dell Latitude E6420",
441 .matches = {
442 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
443 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
444 },
445 },
480 { } 446 { }
481}; 447};
482 448
@@ -544,9 +510,24 @@ void __attribute__((weak)) mach_reboot_fixups(void)
544{ 510{
545} 511}
546 512
513/*
514 * Windows compatible x86 hardware expects the following on reboot:
515 *
516 * 1) If the FADT has the ACPI reboot register flag set, try it
517 * 2) If still alive, write to the keyboard controller
518 * 3) If still alive, write to the ACPI reboot register again
519 * 4) If still alive, write to the keyboard controller again
520 *
521 * If the machine is still alive at this stage, it gives up. We default to
522 * following the same pattern, except that if we're still alive after (4) we'll
523 * try to force a triple fault and then cycle between hitting the keyboard
 524 * controller and forcing the triple fault.
525 */
547static void native_machine_emergency_restart(void) 526static void native_machine_emergency_restart(void)
548{ 527{
549 int i; 528 int i;
529 int attempt = 0;
530 int orig_reboot_type = reboot_type;
550 531
551 if (reboot_emergency) 532 if (reboot_emergency)
552 emergency_vmx_disable_all(); 533 emergency_vmx_disable_all();
@@ -568,6 +549,13 @@ static void native_machine_emergency_restart(void)
568 outb(0xfe, 0x64); /* pulse reset low */ 549 outb(0xfe, 0x64); /* pulse reset low */
569 udelay(50); 550 udelay(50);
570 } 551 }
552 if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
553 attempt = 1;
554 reboot_type = BOOT_ACPI;
555 } else {
556 reboot_type = BOOT_TRIPLE;
557 }
558 break;
571 559
572 case BOOT_TRIPLE: 560 case BOOT_TRIPLE:
573 load_idt(&no_idt); 561 load_idt(&no_idt);
@@ -578,7 +566,7 @@ static void native_machine_emergency_restart(void)
578 566
579#ifdef CONFIG_X86_32 567#ifdef CONFIG_X86_32
580 case BOOT_BIOS: 568 case BOOT_BIOS:
581 machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); 569 machine_real_restart(MRR_BIOS);
582 570
583 reboot_type = BOOT_KBD; 571 reboot_type = BOOT_KBD;
584 break; 572 break;
@@ -641,7 +629,7 @@ void native_machine_shutdown(void)
641 /* O.K Now that I'm on the appropriate processor, 629 /* O.K Now that I'm on the appropriate processor,
642 * stop all of the others. 630 * stop all of the others.
643 */ 631 */
644 smp_send_stop(); 632 stop_other_cpus();
645#endif 633#endif
646 634
647 lapic_shutdown(); 635 lapic_shutdown();
@@ -753,7 +741,7 @@ static int crash_nmi_callback(struct notifier_block *self,
753{ 741{
754 int cpu; 742 int cpu;
755 743
756 if (val != DIE_NMI_IPI) 744 if (val != DIE_NMI)
757 return NOTIFY_OK; 745 return NOTIFY_OK;
758 746
759 cpu = raw_smp_processor_id(); 747 cpu = raw_smp_processor_id();
@@ -784,6 +772,8 @@ static void smp_send_nmi_allbutself(void)
784 772
785static struct notifier_block crash_nmi_nb = { 773static struct notifier_block crash_nmi_nb = {
786 .notifier_call = crash_nmi_callback, 774 .notifier_call = crash_nmi_callback,
775 /* we want to be the first one called */
776 .priority = NMI_LOCAL_HIGH_PRIOR+1,
787}; 777};
788 778
789/* Halt all other CPUs, calling the specified function on each of them 779/* Halt all other CPUs, calling the specified function on each of them
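The new comment in native_machine_emergency_restart() describes the Windows-compatible ordering: ACPI reset register, keyboard controller, ACPI again, keyboard controller again, then a triple fault. The loop below only models that method-selection sequence; the real code performs the hardware writes and the retry is conditional on the original reboot_type being BOOT_ACPI.

#include <stdio.h>

enum boot_type { BOOT_ACPI, BOOT_KBD, BOOT_TRIPLE };

int main(void)
{
        enum boot_type type = BOOT_ACPI;
        int attempt = 0;

        for (int step = 1; step <= 5; step++) {
                switch (type) {
                case BOOT_ACPI:
                        printf("step %d: ACPI reset register\n", step);
                        type = BOOT_KBD;
                        break;
                case BOOT_KBD:
                        printf("step %d: keyboard controller pulse\n", step);
                        if (attempt++ == 0)
                                type = BOOT_ACPI;       /* one retry via ACPI */
                        else
                                type = BOOT_TRIPLE;
                        break;
                case BOOT_TRIPLE:
                        printf("step %d: force triple fault\n", step);
                        type = BOOT_KBD;                /* keep cycling */
                        break;
                }
        }
        return 0;
}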
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
new file mode 100644
index 000000000000..1d5c46df0d78
--- /dev/null
+++ b/arch/x86/kernel/reboot_32.S
@@ -0,0 +1,135 @@
1#include <linux/linkage.h>
2#include <linux/init.h>
3#include <asm/segment.h>
4#include <asm/page_types.h>
5
6/*
7 * The following code and data reboots the machine by switching to real
8 * mode and jumping to the BIOS reset entry point, as if the CPU has
9 * really been reset. The previous version asked the keyboard
10 * controller to pulse the CPU reset line, which is more thorough, but
11 * doesn't work with at least one type of 486 motherboard. It is easy
12 * to stop this code working; hence the copious comments.
13 *
14 * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
15 */
16 .section ".x86_trampoline","a"
17 .balign 16
18 .code32
19ENTRY(machine_real_restart_asm)
20r_base = .
21 /* Get our own relocated address */
22 call 1f
231: popl %ebx
24 subl $(1b - r_base), %ebx
25
26 /* Compute the equivalent real-mode segment */
27 movl %ebx, %ecx
28 shrl $4, %ecx
29
30 /* Patch post-real-mode segment jump */
31 movw (dispatch_table - r_base)(%ebx,%eax,2),%ax
32 movw %ax, (101f - r_base)(%ebx)
33 movw %cx, (102f - r_base)(%ebx)
34
35 /* Set up the IDT for real mode. */
36 lidtl (machine_real_restart_idt - r_base)(%ebx)
37
38 /*
39 * Set up a GDT from which we can load segment descriptors for real
40 * mode. The GDT is not used in real mode; it is just needed here to
41 * prepare the descriptors.
42 */
43 lgdtl (machine_real_restart_gdt - r_base)(%ebx)
44
45 /*
46 * Load the data segment registers with 16-bit compatible values
47 */
48 movl $16, %ecx
49 movl %ecx, %ds
50 movl %ecx, %es
51 movl %ecx, %fs
52 movl %ecx, %gs
53 movl %ecx, %ss
54 ljmpl $8, $1f - r_base
55
56/*
57 * This is 16-bit protected mode code to disable paging and the cache,
58 * switch to real mode and jump to the BIOS reset code.
59 *
60 * The instruction that switches to real mode by writing to CR0 must be
 61 * followed immediately by a far jump instruction, which sets CS to a
62 * valid value for real mode, and flushes the prefetch queue to avoid
63 * running instructions that have already been decoded in protected
64 * mode.
65 *
66 * Clears all the flags except ET, especially PG (paging), PE
67 * (protected-mode enable) and TS (task switch for coprocessor state
68 * save). Flushes the TLB after paging has been disabled. Sets CD and
69 * NW, to disable the cache on a 486, and invalidates the cache. This
70 * is more like the state of a 486 after reset. I don't know if
71 * something else should be done for other chips.
72 *
73 * More could be done here to set up the registers as if a CPU reset had
74 * occurred; hopefully real BIOSs don't assume much. This is not the
75 * actual BIOS entry point, anyway (that is at 0xfffffff0).
76 *
77 * Most of this work is probably excessive, but it is what is tested.
78 */
79 .code16
801:
81 xorl %ecx, %ecx
82 movl %cr0, %eax
83 andl $0x00000011, %eax
84 orl $0x60000000, %eax
85 movl %eax, %cr0
86 movl %ecx, %cr3
87 movl %cr0, %edx
88 andl $0x60000000, %edx /* If no cache bits -> no wbinvd */
89 jz 2f
90 wbinvd
912:
92 andb $0x10, %al
93 movl %eax, %cr0
94 .byte 0xea /* ljmpw */
95101: .word 0 /* Offset */
96102: .word 0 /* Segment */
97
98bios:
99 ljmpw $0xf000, $0xfff0
100
101apm:
102 movw $0x1000, %ax
103 movw %ax, %ss
104 movw $0xf000, %sp
105 movw $0x5307, %ax
106 movw $0x0001, %bx
107 movw $0x0003, %cx
108 int $0x15
109
110END(machine_real_restart_asm)
111
112 .balign 16
 113 /* These must match <asm/reboot.h> */
114dispatch_table:
115 .word bios - r_base
116 .word apm - r_base
117END(dispatch_table)
118
119 .balign 16
120machine_real_restart_idt:
121 .word 0xffff /* Length - real mode default value */
122 .long 0 /* Base - real mode default value */
123END(machine_real_restart_idt)
124
125 .balign 16
126ENTRY(machine_real_restart_gdt)
127 .quad 0 /* Self-pointer, filled in by PM code */
128 .quad 0 /* 16-bit code segment, filled in by PM code */
129 /*
130 * 16-bit data segment with the selector value 16 = 0x10 and
131 * base value 0x100; since this is consistent with real mode
132 * semantics we don't have to reload the segments once CR0.PE = 0.
133 */
134 .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
135END(machine_real_restart_gdt)
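In machine_real_restart_asm above, the restart type passed in %eax indexes a table of 16-bit offsets (hence the ",2" scale in the movw) that gets patched into the far-jump target. Modeled here as an ordinary C array lookup; the MRR_BIOS/MRR_APM values mirror the "0 = BIOS, 1 = APM" convention stated in the file's comment.

#include <stdio.h>

enum { MRR_BIOS = 0, MRR_APM = 1 };

static void bios_reset(void) { puts("ljmpw $0xf000,$0xfff0 (BIOS reset vector)"); }
static void apm_reset(void)  { puts("int $0x15 APM power-off/restart call"); }

/* stands in for dispatch_table: type selects the real-mode code to run */
static void (*const dispatch_table[])(void) = { bios_reset, apm_reset };

int main(void)
{
        unsigned int type = MRR_BIOS;   /* as passed by machine_real_restart() */

        dispatch_table[type]();
        return 0;
}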
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index fda313ebbb03..c8e41e90f59c 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -43,17 +43,33 @@ static void rdc321x_reset(struct pci_dev *dev)
43 outb(1, 0x92); 43 outb(1, 0x92);
44} 44}
45 45
46static void ce4100_reset(struct pci_dev *dev)
47{
48 int i;
49
50 for (i = 0; i < 10; i++) {
51 outb(0x2, 0xcf9);
52 udelay(50);
53 }
54}
55
46struct device_fixup { 56struct device_fixup {
47 unsigned int vendor; 57 unsigned int vendor;
48 unsigned int device; 58 unsigned int device;
49 void (*reboot_fixup)(struct pci_dev *); 59 void (*reboot_fixup)(struct pci_dev *);
50}; 60};
51 61
62/*
63 * PCI ids solely used for fixups_table go here
64 */
65#define PCI_DEVICE_ID_INTEL_CE4100 0x0708
66
52static const struct device_fixup fixups_table[] = { 67static const struct device_fixup fixups_table[] = {
53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, 68{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, 69{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, 70{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
56{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, 71{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
72{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
57}; 73};
58 74
59/* 75/*
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
new file mode 100644
index 000000000000..2a26819bb6a8
--- /dev/null
+++ b/arch/x86/kernel/resource.c
@@ -0,0 +1,48 @@
1#include <linux/ioport.h>
2#include <asm/e820.h>
3
4static void resource_clip(struct resource *res, resource_size_t start,
5 resource_size_t end)
6{
7 resource_size_t low = 0, high = 0;
8
9 if (res->end < start || res->start > end)
10 return; /* no conflict */
11
12 if (res->start < start)
13 low = start - res->start;
14
15 if (res->end > end)
16 high = res->end - end;
17
18 /* Keep the area above or below the conflict, whichever is larger */
19 if (low > high)
20 res->end = start - 1;
21 else
22 res->start = end + 1;
23}
24
25static void remove_e820_regions(struct resource *avail)
26{
27 int i;
28 struct e820entry *entry;
29
30 for (i = 0; i < e820.nr_map; i++) {
31 entry = &e820.map[i];
32
33 resource_clip(avail, entry->addr,
34 entry->addr + entry->size - 1);
35 }
36}
37
38void arch_remove_reservations(struct resource *avail)
39{
40 /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
41 if (avail->flags & IORESOURCE_MEM) {
42 if (avail->start < BIOS_END)
43 avail->start = BIOS_END;
44 resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
45
46 remove_e820_regions(avail);
47 }
48}
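A standalone rendering of resource_clip() from the new resource.c above: when an available window overlaps a reserved region, keep whichever leftover piece (below or above the conflict) is larger. Types are simplified to plain integers for the sketch.

#include <stdio.h>
#include <stdint.h>

struct res { uint64_t start, end; };

static void resource_clip(struct res *r, uint64_t start, uint64_t end)
{
        uint64_t low = 0, high = 0;

        if (r->end < start || r->start > end)
                return;                         /* no conflict */
        if (r->start < start)
                low = start - r->start;         /* room left below */
        if (r->end > end)
                high = r->end - end;            /* room left above */
        if (low > high)
                r->end = start - 1;             /* keep the lower piece */
        else
                r->start = end + 1;             /* keep the upper piece */
}

int main(void)
{
        struct res avail = { 0x100000, 0x400000 };

        resource_clip(&avail, 0x300000, 0x500000);      /* clip a reserved region */
        printf("available: %#llx-%#llx\n",
               (unsigned long long)avail.start, (unsigned long long)avail.end);
        return 0;
}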
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1cfbbfc3ae26..3f2ad2640d85 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -6,6 +6,7 @@
6#include <linux/acpi.h> 6#include <linux/acpi.h>
7#include <linux/bcd.h> 7#include <linux/bcd.h>
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9#include <linux/of.h>
9 10
10#include <asm/vsyscall.h> 11#include <asm/vsyscall.h>
11#include <asm/x86_init.h> 12#include <asm/x86_init.h>
@@ -76,7 +77,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
76 CMOS_WRITE(real_seconds, RTC_SECONDS); 77 CMOS_WRITE(real_seconds, RTC_SECONDS);
77 CMOS_WRITE(real_minutes, RTC_MINUTES); 78 CMOS_WRITE(real_minutes, RTC_MINUTES);
78 } else { 79 } else {
79 printk(KERN_WARNING 80 printk_once(KERN_NOTICE
80 "set_rtc_mmss: can't update from %d to %d\n", 81 "set_rtc_mmss: can't update from %d to %d\n",
81 cmos_minutes, real_minutes); 82 cmos_minutes, real_minutes);
82 retval = -1; 83 retval = -1;
@@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void)
236 } 237 }
237 } 238 }
238#endif 239#endif
240 if (of_have_populated_dt())
241 return 0;
239 242
240 platform_device_register(&rtc_device); 243 platform_device_register(&rtc_device);
241 dev_info(&rtc_device.dev, 244 dev_info(&rtc_device.dev,
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c
deleted file mode 100644
index 7e004acbe526..000000000000
--- a/arch/x86/kernel/scx200_32.c
+++ /dev/null
@@ -1,131 +0,0 @@
1/*
2 * Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
3 *
4 * National Semiconductor SCx200 support.
5 */
6
7#include <linux/module.h>
8#include <linux/errno.h>
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/mutex.h>
12#include <linux/pci.h>
13
14#include <linux/scx200.h>
15#include <linux/scx200_gpio.h>
16
17/* Verify that the configuration block really is there */
18#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
19
20#define NAME "scx200"
21
22MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
23MODULE_DESCRIPTION("NatSemi SCx200 Driver");
24MODULE_LICENSE("GPL");
25
26unsigned scx200_gpio_base = 0;
27unsigned long scx200_gpio_shadow[2];
28
29unsigned scx200_cb_base = 0;
30
31static struct pci_device_id scx200_tbl[] = {
32 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
33 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
34 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) },
35 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) },
36 { },
37};
38MODULE_DEVICE_TABLE(pci,scx200_tbl);
39
40static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *);
41
42static struct pci_driver scx200_pci_driver = {
43 .name = "scx200",
44 .id_table = scx200_tbl,
45 .probe = scx200_probe,
46};
47
48static DEFINE_MUTEX(scx200_gpio_config_lock);
49
50static void __devinit scx200_init_shadow(void)
51{
52 int bank;
53
54 /* read the current values driven on the GPIO signals */
55 for (bank = 0; bank < 2; ++bank)
56 scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
57}
58
59static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
60{
61 unsigned base;
62
63 if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
64 pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) {
65 base = pci_resource_start(pdev, 0);
66 printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
67
68 if (!request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO")) {
69 printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
70 return -EBUSY;
71 }
72
73 scx200_gpio_base = base;
74 scx200_init_shadow();
75
76 } else {
77 /* find the base of the Configuration Block */
78 if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) {
79 scx200_cb_base = SCx200_CB_BASE_FIXED;
80 } else {
81 pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base);
82 if (scx200_cb_probe(base)) {
83 scx200_cb_base = base;
84 } else {
85 printk(KERN_WARNING NAME ": Configuration Block not found\n");
86 return -ENODEV;
87 }
88 }
89 printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base);
90 }
91
92 return 0;
93}
94
95u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
96{
97 u32 config, new_config;
98
99 mutex_lock(&scx200_gpio_config_lock);
100
101 outl(index, scx200_gpio_base + 0x20);
102 config = inl(scx200_gpio_base + 0x24);
103
104 new_config = (config & mask) | bits;
105 outl(new_config, scx200_gpio_base + 0x24);
106
107 mutex_unlock(&scx200_gpio_config_lock);
108
109 return config;
110}
111
112static int __init scx200_init(void)
113{
114 printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
115
116 return pci_register_driver(&scx200_pci_driver);
117}
118
119static void __exit scx200_cleanup(void)
120{
121 pci_unregister_driver(&scx200_pci_driver);
122 release_region(scx200_gpio_base, SCx200_GPIO_SIZE);
123}
124
125module_init(scx200_init);
126module_exit(scx200_cleanup);
127
128EXPORT_SYMBOL(scx200_gpio_base);
129EXPORT_SYMBOL(scx200_gpio_shadow);
130EXPORT_SYMBOL(scx200_gpio_configure);
131EXPORT_SYMBOL(scx200_cb_base);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c3a4fbb2b996..afaf38447ef5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -31,6 +31,7 @@
31#include <linux/apm_bios.h> 31#include <linux/apm_bios.h>
32#include <linux/initrd.h> 32#include <linux/initrd.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/console.h> 36#include <linux/console.h>
36#include <linux/mca.h> 37#include <linux/mca.h>
@@ -83,7 +84,6 @@
83#include <asm/dmi.h> 84#include <asm/dmi.h>
84#include <asm/io_apic.h> 85#include <asm/io_apic.h>
85#include <asm/ist.h> 86#include <asm/ist.h>
86#include <asm/vmi.h>
87#include <asm/setup_arch.h> 87#include <asm/setup_arch.h>
88#include <asm/bios_ebda.h> 88#include <asm/bios_ebda.h>
89#include <asm/cacheflush.h> 89#include <asm/cacheflush.h>
@@ -107,11 +107,13 @@
107#include <asm/percpu.h> 107#include <asm/percpu.h>
108#include <asm/topology.h> 108#include <asm/topology.h>
109#include <asm/apicdef.h> 109#include <asm/apicdef.h>
110#include <asm/k8.h> 110#include <asm/amd_nb.h>
111#ifdef CONFIG_X86_64 111#ifdef CONFIG_X86_64
112#include <asm/numa_64.h> 112#include <asm/numa_64.h>
113#endif 113#endif
114#include <asm/mce.h> 114#include <asm/mce.h>
115#include <asm/alternative.h>
116#include <asm/prom.h>
115 117
116/* 118/*
117 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -125,7 +127,6 @@ unsigned long max_pfn_mapped;
125RESERVE_BRK(dmi_alloc, 65536); 127RESERVE_BRK(dmi_alloc, 65536);
126#endif 128#endif
127 129
128unsigned int boot_cpu_id __read_mostly;
129 130
130static __initdata unsigned long _brk_start = (unsigned long)__brk_base; 131static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
131unsigned long _brk_end = (unsigned long)__brk_base; 132unsigned long _brk_end = (unsigned long)__brk_base;
@@ -297,12 +298,15 @@ static void __init init_gbpages(void)
297static inline void init_gbpages(void) 298static inline void init_gbpages(void)
298{ 299{
299} 300}
301static void __init cleanup_highmap(void)
302{
303}
300#endif 304#endif
301 305
302static void __init reserve_brk(void) 306static void __init reserve_brk(void)
303{ 307{
304 if (_brk_end > _brk_start) 308 if (_brk_end > _brk_start)
305 reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); 309 memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
306 310
307 /* Mark brk area as locked down and no longer taking any 311 /* Mark brk area as locked down and no longer taking any
308 new allocations */ 312 new allocations */
@@ -324,17 +328,16 @@ static void __init relocate_initrd(void)
324 char *p, *q; 328 char *p, *q;
325 329
326 /* We need to move the initrd down into lowmem */ 330 /* We need to move the initrd down into lowmem */
327 ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, 331 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
328 PAGE_SIZE); 332 PAGE_SIZE);
329 333
330 if (ramdisk_here == -1ULL) 334 if (ramdisk_here == MEMBLOCK_ERROR)
331 panic("Cannot find place for new RAMDISK of size %lld\n", 335 panic("Cannot find place for new RAMDISK of size %lld\n",
332 ramdisk_size); 336 ramdisk_size);
333 337
334 /* Note: this includes all the lowmem currently occupied by 338 /* Note: this includes all the lowmem currently occupied by
335 the initrd, we rely on that fact to keep the data intact. */ 339 the initrd, we rely on that fact to keep the data intact. */
336 reserve_early(ramdisk_here, ramdisk_here + area_size, 340 memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
337 "NEW RAMDISK");
338 initrd_start = ramdisk_here + PAGE_OFFSET; 341 initrd_start = ramdisk_here + PAGE_OFFSET;
339 initrd_end = initrd_start + ramdisk_size; 342 initrd_end = initrd_start + ramdisk_size;
340 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", 343 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -390,7 +393,7 @@ static void __init reserve_initrd(void)
390 initrd_start = 0; 393 initrd_start = 0;
391 394
392 if (ramdisk_size >= (end_of_lowmem>>1)) { 395 if (ramdisk_size >= (end_of_lowmem>>1)) {
393 free_early(ramdisk_image, ramdisk_end); 396 memblock_x86_free_range(ramdisk_image, ramdisk_end);
394 printk(KERN_ERR "initrd too large to handle, " 397 printk(KERN_ERR "initrd too large to handle, "
395 "disabling initrd\n"); 398 "disabling initrd\n");
396 return; 399 return;
@@ -413,7 +416,7 @@ static void __init reserve_initrd(void)
413 416
414 relocate_initrd(); 417 relocate_initrd();
415 418
416 free_early(ramdisk_image, ramdisk_end); 419 memblock_x86_free_range(ramdisk_image, ramdisk_end);
417} 420}
418#else 421#else
419static void __init reserve_initrd(void) 422static void __init reserve_initrd(void)
@@ -430,16 +433,30 @@ static void __init parse_setup_data(void)
430 return; 433 return;
431 pa_data = boot_params.hdr.setup_data; 434 pa_data = boot_params.hdr.setup_data;
432 while (pa_data) { 435 while (pa_data) {
433 data = early_memremap(pa_data, PAGE_SIZE); 436 u32 data_len, map_len;
437
438 map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
439 (u64)sizeof(struct setup_data));
440 data = early_memremap(pa_data, map_len);
441 data_len = data->len + sizeof(struct setup_data);
442 if (data_len > map_len) {
443 early_iounmap(data, map_len);
444 data = early_memremap(pa_data, data_len);
445 map_len = data_len;
446 }
447
434 switch (data->type) { 448 switch (data->type) {
435 case SETUP_E820_EXT: 449 case SETUP_E820_EXT:
436 parse_e820_ext(data, pa_data); 450 parse_e820_ext(data);
451 break;
452 case SETUP_DTB:
453 add_dtb(pa_data);
437 break; 454 break;
438 default: 455 default:
439 break; 456 break;
440 } 457 }
441 pa_data = data->next; 458 pa_data = data->next;
442 early_iounmap(data, PAGE_SIZE); 459 early_iounmap(data, map_len);
443 } 460 }
444} 461}
445 462
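The parse_setup_data() hunk above switches to a two-pass mapping: map just enough for the setup_data header, read the real length, and remap with the full size if the payload extends past the first window. The sketch below models only that pattern; map_window()/unmap_window() are stand-ins for early_memremap()/early_iounmap(), and the backing array fakes physical memory.

#include <stdio.h>
#include <stddef.h>

struct setup_data { unsigned int type; unsigned int len; /* payload follows */ };

static unsigned char backing[8192];             /* pretend physical memory */

static void *map_window(size_t off, size_t len)
{
        printf("map   %zu bytes at offset %zu\n", len, off);
        return backing + off;
}

static void unmap_window(void *p, size_t len)
{
        (void)p;
        printf("unmap %zu bytes\n", len);
}

int main(void)
{
        struct setup_data *hdr = (struct setup_data *)backing;
        hdr->len = 4096;                        /* payload larger than the header window */

        size_t map_len = sizeof(struct setup_data);
        struct setup_data *data = map_window(0, map_len);
        size_t data_len = data->len + sizeof(struct setup_data);

        if (data_len > map_len) {               /* header window too small: remap */
                unmap_window(data, map_len);
                data = map_window(0, data_len);
                map_len = data_len;
        }
        printf("processing type=%u len=%u\n", data->type, data->len);
        unmap_window(data, map_len);
        return 0;
}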
@@ -469,7 +486,7 @@ static void __init e820_reserve_setup_data(void)
469 e820_print_map("reserve setup_data"); 486 e820_print_map("reserve setup_data");
470} 487}
471 488
472static void __init reserve_early_setup_data(void) 489static void __init memblock_x86_reserve_range_setup_data(void)
473{ 490{
474 struct setup_data *data; 491 struct setup_data *data;
475 u64 pa_data; 492 u64 pa_data;
@@ -481,7 +498,7 @@ static void __init reserve_early_setup_data(void)
481 while (pa_data) { 498 while (pa_data) {
482 data = early_memremap(pa_data, sizeof(*data)); 499 data = early_memremap(pa_data, sizeof(*data));
483 sprintf(buf, "setup data %x", data->type); 500 sprintf(buf, "setup data %x", data->type);
484 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); 501 memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
485 pa_data = data->next; 502 pa_data = data->next;
486 early_iounmap(data, sizeof(*data)); 503 early_iounmap(data, sizeof(*data));
487 } 504 }
@@ -502,6 +519,18 @@ static inline unsigned long long get_total_mem(void)
502 return total << PAGE_SHIFT; 519 return total << PAGE_SHIFT;
503} 520}
504 521
522/*
523 * Keep the crash kernel below this limit. On 32 bits earlier kernels
524 * would limit the kernel to the low 512 MiB due to mapping restrictions.
525 * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
526 * limit once kexec-tools are fixed.
527 */
528#ifdef CONFIG_X86_32
529# define CRASH_KERNEL_ADDR_MAX (512 << 20)
530#else
531# define CRASH_KERNEL_ADDR_MAX (896 << 20)
532#endif
533
505static void __init reserve_crashkernel(void) 534static void __init reserve_crashkernel(void)
506{ 535{
507 unsigned long long total_mem; 536 unsigned long long total_mem;
@@ -519,23 +548,27 @@ static void __init reserve_crashkernel(void)
519 if (crash_base <= 0) { 548 if (crash_base <= 0) {
520 const unsigned long long alignment = 16<<20; /* 16M */ 549 const unsigned long long alignment = 16<<20; /* 16M */
521 550
522 crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, 551 /*
 523 alignment); 552 * kexec wants the bzImage to be below CRASH_KERNEL_ADDR_MAX
524 if (crash_base == -1ULL) { 553 */
554 crash_base = memblock_find_in_range(alignment,
555 CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
556
557 if (crash_base == MEMBLOCK_ERROR) {
525 pr_info("crashkernel reservation failed - No suitable area found.\n"); 558 pr_info("crashkernel reservation failed - No suitable area found.\n");
526 return; 559 return;
527 } 560 }
528 } else { 561 } else {
529 unsigned long long start; 562 unsigned long long start;
530 563
531 start = find_e820_area(crash_base, ULONG_MAX, crash_size, 564 start = memblock_find_in_range(crash_base,
532 1<<20); 565 crash_base + crash_size, crash_size, 1<<20);
533 if (start != crash_base) { 566 if (start != crash_base) {
534 pr_info("crashkernel reservation failed - memory is in use.\n"); 567 pr_info("crashkernel reservation failed - memory is in use.\n");
535 return; 568 return;
536 } 569 }
537 } 570 }
538 reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); 571 memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
539 572
540 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 573 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
541 "for crashkernel (System RAM: %ldMB)\n", 574 "for crashkernel (System RAM: %ldMB)\n",
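The crashkernel hunk above constrains an automatically chosen region to be 16 MiB aligned and to sit entirely below CRASH_KERNEL_ADDR_MAX (512 MiB on 32-bit, 896 MiB on 64-bit, per the new #define). The sketch below only illustrates that placement constraint; find_in_range() is a toy stand-in for memblock_find_in_range(), not its real policy.

#include <stdio.h>
#include <stdint.h>

#define ALIGNMENT       (16ULL << 20)
#define ADDR_MAX        (896ULL << 20)

static uint64_t find_in_range(uint64_t start, uint64_t end, uint64_t size,
                              uint64_t align)
{
        /* toy policy: place the region at the lowest aligned address that fits */
        uint64_t base = (start + align - 1) & ~(align - 1);

        return (base + size <= end) ? base : UINT64_MAX;
}

int main(void)
{
        uint64_t crash_size = 128ULL << 20;     /* e.g. crashkernel=128M */
        uint64_t base = find_in_range(ALIGNMENT, ADDR_MAX, crash_size, ALIGNMENT);

        if (base == UINT64_MAX)
                puts("crashkernel reservation failed - no suitable area");
        else
                printf("crashkernel: %lluMB at %lluMB\n",
                       (unsigned long long)(crash_size >> 20),
                       (unsigned long long)(base >> 20));
        return 0;
}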
@@ -586,28 +619,6 @@ void __init reserve_standard_io_resources(void)
586 619
587} 620}
588 621
589/*
590 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
591 * is_kdump_kernel() to determine if we are booting after a panic. Hence
592 * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
593 */
594
595#ifdef CONFIG_CRASH_DUMP
596/* elfcorehdr= specifies the location of elf core header
597 * stored by the crashed kernel. This option will be passed
598 * by kexec loader to the capture kernel.
599 */
600static int __init setup_elfcorehdr(char *arg)
601{
602 char *end;
603 if (!arg)
604 return -EINVAL;
605 elfcorehdr_addr = memparse(arg, &end);
606 return end > arg ? 0 : -EINVAL;
607}
608early_param("elfcorehdr", setup_elfcorehdr);
609#endif
610
611static __init void reserve_ibft_region(void) 622static __init void reserve_ibft_region(void)
612{ 623{
613 unsigned long addr, size = 0; 624 unsigned long addr, size = 0;
@@ -615,82 +626,10 @@ static __init void reserve_ibft_region(void)
615 addr = find_ibft_region(&size); 626 addr = find_ibft_region(&size);
616 627
617 if (size) 628 if (size)
618 reserve_early_overlap_ok(addr, addr + size, "ibft"); 629 memblock_x86_reserve_range(addr, addr + size, "* ibft");
619} 630}
620 631
621#ifdef CONFIG_X86_RESERVE_LOW_64K 632static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
622static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
623{
624 printk(KERN_NOTICE
625 "%s detected: BIOS may corrupt low RAM, working around it.\n",
626 d->ident);
627
628 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
629 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
630
631 return 0;
632}
633#endif
634
635/* List of systems that have known low memory corruption BIOS problems */
636static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
637#ifdef CONFIG_X86_RESERVE_LOW_64K
638 {
639 .callback = dmi_low_memory_corruption,
640 .ident = "AMI BIOS",
641 .matches = {
642 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
643 },
644 },
645 {
646 .callback = dmi_low_memory_corruption,
647 .ident = "Phoenix BIOS",
648 .matches = {
649 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
650 },
651 },
652 {
653 .callback = dmi_low_memory_corruption,
654 .ident = "Phoenix/MSC BIOS",
655 .matches = {
656 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
657 },
658 },
659 /*
660 * AMI BIOS with low memory corruption was found on Intel DG45ID and
661 * DG45FC boards.
662 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
663 * match only DMI_BOARD_NAME and see if there is more bad products
664 * with this vendor.
665 */
666 {
667 .callback = dmi_low_memory_corruption,
668 .ident = "AMI BIOS",
669 .matches = {
670 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
671 },
672 },
673 {
674 .callback = dmi_low_memory_corruption,
675 .ident = "AMI BIOS",
676 .matches = {
677 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
678 },
679 },
680 /*
681 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
682 * match on the product name.
683 */
684 {
685 .callback = dmi_low_memory_corruption,
686 .ident = "Phoenix BIOS",
687 .matches = {
688 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
689 },
690 },
691#endif
692 {}
693};
694 633
695static void __init trim_bios_range(void) 634static void __init trim_bios_range(void)
696{ 635{
@@ -698,8 +637,14 @@ static void __init trim_bios_range(void)
698 * A special case is the first 4Kb of memory; 637 * A special case is the first 4Kb of memory;
699 * This is a BIOS owned area, not kernel ram, but generally 638 * This is a BIOS owned area, not kernel ram, but generally
700 * not listed as such in the E820 table. 639 * not listed as such in the E820 table.
640 *
641 * This typically reserves additional memory (64KiB by default)
642 * since some BIOSes are known to corrupt low memory. See the
643 * Kconfig help text for X86_RESERVE_LOW.
701 */ 644 */
702 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); 645 e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
646 E820_RAM, E820_RESERVED);
647
703 /* 648 /*
704 * special case: Some BIOSen report the PC BIOS 649 * special case: Some BIOSen report the PC BIOS
705 * area (640->1Mb) as ram even though it is not. 650 * area (640->1Mb) as ram even though it is not.
@@ -709,6 +654,28 @@ static void __init trim_bios_range(void)
709 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 654 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
710} 655}
711 656
657static int __init parse_reservelow(char *p)
658{
659 unsigned long long size;
660
661 if (!p)
662 return -EINVAL;
663
664 size = memparse(p, &p);
665
666 if (size < 4096)
667 size = 4096;
668
669 if (size > 640*1024)
670 size = 640*1024;
671
672 reserve_low = size;
673
674 return 0;
675}
676
677early_param("reservelow", parse_reservelow);
678
712/* 679/*
713 * Determine if we were loaded by an EFI loader. If so, then we have also been 680 * Determine if we were loaded by an EFI loader. If so, then we have also been
714 * passed the efi memmap, systab, etc., so we should use these data structures 681 * passed the efi memmap, systab, etc., so we should use these data structures
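The parse_reservelow() addition above parses the reservelow= amount with memparse()-style suffixes and clamps it to the 4 KiB .. 640 KiB range before trim_bios_range() reserves it. The sketch below mirrors that clamping; parse_size() is a simplified stand-in for memparse().

#include <stdio.h>
#include <stdlib.h>

static unsigned long parse_size(const char *p)
{
        char *end;
        unsigned long v = strtoul(p, &end, 0);

        if (*end == 'K' || *end == 'k')
                v <<= 10;
        else if (*end == 'M' || *end == 'm')
                v <<= 20;
        return v;
}

static unsigned long clamp_reservelow(const char *arg)
{
        unsigned long size = parse_size(arg);

        if (size < 4096)
                size = 4096;                    /* never less than one page */
        if (size > 640 * 1024)
                size = 640 * 1024;              /* never more than low memory */
        return size;
}

int main(void)
{
        const char *args[] = { "64K", "1M", "512" };

        for (unsigned int i = 0; i < 3; i++)
                printf("reservelow=%-4s -> %lu bytes\n",
                       args[i], clamp_reservelow(args[i]));
        return 0;
}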
@@ -724,20 +691,28 @@ static void __init trim_bios_range(void)
724 691
725void __init setup_arch(char **cmdline_p) 692void __init setup_arch(char **cmdline_p)
726{ 693{
727 int acpi = 0;
728 int k8 = 0;
729
730#ifdef CONFIG_X86_32 694#ifdef CONFIG_X86_32
731 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 695 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
732 visws_early_detect(); 696 visws_early_detect();
697
698 /*
699 * copy kernel address range established so far and switch
700 * to the proper swapper page table
701 */
702 clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
703 initial_page_table + KERNEL_PGD_BOUNDARY,
704 KERNEL_PGD_PTRS);
705
706 load_cr3(swapper_pg_dir);
707 __flush_tlb_all();
733#else 708#else
734 printk(KERN_INFO "Command line: %s\n", boot_command_line); 709 printk(KERN_INFO "Command line: %s\n", boot_command_line);
735#endif 710#endif
736 711
737 /* VMI may relocate the fixmap; do this before touching ioremap area */ 712 /*
738 vmi_init(); 713 * If we have OLPC OFW, we might end up relocating the fixmap due to
739 714 * reserve_top(), so do this before touching the ioremap area.
740 /* OFW also may relocate the fixmap */ 715 */
741 olpc_ofw_detect(); 716 olpc_ofw_detect();
742 717
743 early_trap_init(); 718 early_trap_init();
@@ -782,12 +757,13 @@ void __init setup_arch(char **cmdline_p)
782#endif 757#endif
783 4)) { 758 4)) {
784 efi_enabled = 1; 759 efi_enabled = 1;
785 efi_reserve_early(); 760 efi_memblock_x86_reserve_range();
786 } 761 }
787#endif 762#endif
788 763
789 x86_init.oem.arch_setup(); 764 x86_init.oem.arch_setup();
790 765
766 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
791 setup_memory_map(); 767 setup_memory_map();
792 parse_setup_data(); 768 parse_setup_data();
793 /* update the e820_saved too */ 769 /* update the e820_saved too */
@@ -838,11 +814,8 @@ void __init setup_arch(char **cmdline_p)
838 814
839 x86_report_nx(); 815 x86_report_nx();
840 816
841 /* Must be before kernel pagetables are setup */
842 vmi_activate();
843
844 /* after early param, so could get panic from serial */ 817 /* after early param, so could get panic from serial */
845 reserve_early_setup_data(); 818 memblock_x86_reserve_range_setup_data();
846 819
847 if (acpi_mps_check()) { 820 if (acpi_mps_check()) {
848#ifdef CONFIG_X86_LOCAL_APIC 821#ifdef CONFIG_X86_LOCAL_APIC
@@ -863,8 +836,6 @@ void __init setup_arch(char **cmdline_p)
863 836
864 dmi_scan_machine(); 837 dmi_scan_machine();
865 838
866 dmi_check_system(bad_bios_dmi_table);
867
868 /* 839 /*
869 * VMware detection requires dmi to be available, so this 840 * VMware detection requires dmi to be available, so this
870 * needs to be done after dmi_scan_machine, for the BP. 841 * needs to be done after dmi_scan_machine, for the BP.
@@ -897,8 +868,6 @@ void __init setup_arch(char **cmdline_p)
897 */ 868 */
898 max_pfn = e820_end_of_ram_pfn(); 869 max_pfn = e820_end_of_ram_pfn();
899 870
900 /* preallocate 4k for mptable mpc */
901 early_reserve_e820_mpc_new();
902 /* update e820 for memory not covered by WB MTRRs */ 871 /* update e820 for memory not covered by WB MTRRs */
903 mtrr_bp_init(); 872 mtrr_bp_init();
904 if (mtrr_trim_uncached_memory(max_pfn)) 873 if (mtrr_trim_uncached_memory(max_pfn))
@@ -920,18 +889,8 @@ void __init setup_arch(char **cmdline_p)
920 max_low_pfn = max_pfn; 889 max_low_pfn = max_pfn;
921 890
922 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 891 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
923 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
924#endif 892#endif
925 893
926#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
927 setup_bios_corruption_check();
928#endif
929
930 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
931 max_pfn_mapped<<PAGE_SHIFT);
932
933 reserve_brk();
934
935 /* 894 /*
936 * Find and reserve possible boot-time SMP configuration: 895 * Find and reserve possible boot-time SMP configuration:
937 */ 896 */
@@ -939,15 +898,37 @@ void __init setup_arch(char **cmdline_p)
939 898
940 reserve_ibft_region(); 899 reserve_ibft_region();
941 900
942 reserve_trampoline_memory(); 901 /*
 902	 * Need to conclude brk before memblock_x86_fill();
 903	 * it could use memblock_find_in_range(), which could
 904	 * overlap with the brk area.
905 */
906 reserve_brk();
907
908 cleanup_highmap();
909
910 memblock.current_limit = get_max_mapped();
911 memblock_x86_fill();
943 912
944#ifdef CONFIG_ACPI_SLEEP
945 /* 913 /*
946 * Reserve low memory region for sleep support. 914 * The EFI specification says that boot service code won't be called
947 * even before init_memory_mapping 915 * after ExitBootServices(). This is, in fact, a lie.
948 */ 916 */
949 acpi_reserve_wakeup_memory(); 917 if (efi_enabled)
918 efi_reserve_boot_services();
919
920 /* preallocate 4k for mptable mpc */
921 early_reserve_e820_mpc_new();
922
923#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
924 setup_bios_corruption_check();
950#endif 925#endif
926
927 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
928 max_pfn_mapped<<PAGE_SHIFT);
929
930 setup_trampolines();
931
951 init_gbpages(); 932 init_gbpages();
952 933
953 /* max_pfn_mapped is updated here */ 934 /* max_pfn_mapped is updated here */
@@ -962,6 +943,7 @@ void __init setup_arch(char **cmdline_p)
962 max_low_pfn = max_pfn; 943 max_low_pfn = max_pfn;
963 } 944 }
964#endif 945#endif
946 memblock.current_limit = get_max_mapped();
965 947
966 /* 948 /*
967 * NOTE: On x86-32, only from this point on, fixmaps are ready for use. 949 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -971,6 +953,8 @@ void __init setup_arch(char **cmdline_p)
971 if (init_ohci1394_dma_early) 953 if (init_ohci1394_dma_early)
972 init_ohci1394_dma_on_all_controllers(); 954 init_ohci1394_dma_on_all_controllers();
973#endif 955#endif
956 /* Allocate bigger log buffer */
957 setup_log_buf(1);
974 958
975 reserve_initrd(); 959 reserve_initrd();
976 960
@@ -987,24 +971,8 @@ void __init setup_arch(char **cmdline_p)
987 971
988 early_acpi_boot_init(); 972 early_acpi_boot_init();
989 973
990#ifdef CONFIG_ACPI_NUMA 974 initmem_init();
991 /* 975 memblock_find_dma_reserve();
992 * Parse SRAT to discover nodes.
993 */
994 acpi = acpi_numa_init();
995#endif
996
997#ifdef CONFIG_K8_NUMA
998 if (!acpi)
999 k8 = !k8_numa_init(0, max_pfn);
1000#endif
1001
1002 initmem_init(0, max_pfn, acpi, k8);
1003#ifndef CONFIG_NO_BOOTMEM
1004 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
1005#endif
1006
1007 dma32_reserve_bootmem();
1008 976
1009#ifdef CONFIG_KVM_CLOCK 977#ifdef CONFIG_KVM_CLOCK
1010 kvmclock_init(); 978 kvmclock_init();
@@ -1014,7 +982,17 @@ void __init setup_arch(char **cmdline_p)
1014 paging_init(); 982 paging_init();
1015 x86_init.paging.pagetable_setup_done(swapper_pg_dir); 983 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
1016 984
1017 setup_trampoline_page_table(); 985 if (boot_cpu_data.cpuid_level >= 0) {
986 /* A CPU has %cr4 if and only if it has CPUID */
987 mmu_cr4_features = read_cr4();
988 }
989
990#ifdef CONFIG_X86_32
991 /* sync back kernel address range */
992 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
993 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
994 KERNEL_PGD_PTRS);
995#endif
1018 996
1019 tboot_probe(); 997 tboot_probe();
1020 998
@@ -1030,8 +1008,8 @@ void __init setup_arch(char **cmdline_p)
1030 * Read APIC and some other early information from ACPI tables. 1008 * Read APIC and some other early information from ACPI tables.
1031 */ 1009 */
1032 acpi_boot_init(); 1010 acpi_boot_init();
1033
1034 sfi_init(); 1011 sfi_init();
1012 x86_dtb_init();
1035 1013
1036 /* 1014 /*
1037 * get boot-time SMP configuration: 1015 * get boot-time SMP configuration:
@@ -1041,15 +1019,10 @@ void __init setup_arch(char **cmdline_p)
1041 1019
1042 prefill_possible_map(); 1020 prefill_possible_map();
1043 1021
1044#ifdef CONFIG_X86_64
1045 init_cpu_to_node(); 1022 init_cpu_to_node();
1046#endif
1047 1023
1048 init_apic_mappings(); 1024 init_apic_mappings();
1049 ioapic_init_mappings(); 1025 ioapic_and_gsi_init();
1050
1051 /* need to wait for io_apic is mapped */
1052 probe_nr_irqs_gsi();
1053 1026
1054 kvm_guest_init(); 1027 kvm_guest_init();
1055 1028
@@ -1070,7 +1043,11 @@ void __init setup_arch(char **cmdline_p)
1070#endif 1043#endif
1071 x86_init.oem.banner(); 1044 x86_init.oem.banner();
1072 1045
1046 x86_init.timers.wallclock_init();
1047
1073 mcheck_init(); 1048 mcheck_init();
1049
1050 arch_init_ideal_nops();
1074} 1051}
1075 1052
1076#ifdef CONFIG_X86_32 1053#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a60df9ae6454..71f4727da373 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -131,13 +131,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
131 131
132static void __init pcpu_fc_free(void *ptr, size_t size) 132static void __init pcpu_fc_free(void *ptr, size_t size)
133{ 133{
134#ifdef CONFIG_NO_BOOTMEM
135 u64 start = __pa(ptr);
136 u64 end = start + size;
137 free_early_partial(start, end);
138#else
139 free_bootmem(__pa(ptr), size); 134 free_bootmem(__pa(ptr), size);
140#endif
141} 135}
142 136
143static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 137static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
@@ -231,10 +225,15 @@ void __init setup_per_cpu_areas(void)
231 per_cpu(x86_bios_cpu_apicid, cpu) = 225 per_cpu(x86_bios_cpu_apicid, cpu) =
232 early_per_cpu_map(x86_bios_cpu_apicid, cpu); 226 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
233#endif 227#endif
228#ifdef CONFIG_X86_32
229 per_cpu(x86_cpu_to_logical_apicid, cpu) =
230 early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
231#endif
234#ifdef CONFIG_X86_64 232#ifdef CONFIG_X86_64
235 per_cpu(irq_stack_ptr, cpu) = 233 per_cpu(irq_stack_ptr, cpu) =
236 per_cpu(irq_stack_union.irq_stack, cpu) + 234 per_cpu(irq_stack_union.irq_stack, cpu) +
237 IRQ_STACK_SIZE - 64; 235 IRQ_STACK_SIZE - 64;
236#endif
238#ifdef CONFIG_NUMA 237#ifdef CONFIG_NUMA
239 per_cpu(x86_cpu_to_node_map, cpu) = 238 per_cpu(x86_cpu_to_node_map, cpu) =
240 early_per_cpu_map(x86_cpu_to_node_map, cpu); 239 early_per_cpu_map(x86_cpu_to_node_map, cpu);
@@ -248,12 +247,11 @@ void __init setup_per_cpu_areas(void)
248 */ 247 */
249 set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); 248 set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
250#endif 249#endif
251#endif
252 /* 250 /*
253 * Up to this point, the boot CPU has been using .init.data 251 * Up to this point, the boot CPU has been using .init.data
254 * area. Reload any changed state for the boot CPU. 252 * area. Reload any changed state for the boot CPU.
255 */ 253 */
256 if (cpu == boot_cpu_id) 254 if (!cpu)
257 switch_to_new_gdt(cpu); 255 switch_to_new_gdt(cpu);
258 } 256 }
259 257
@@ -262,7 +260,10 @@ void __init setup_per_cpu_areas(void)
262 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 260 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
263 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; 261 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
264#endif 262#endif
265#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) 263#ifdef CONFIG_X86_32
264 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
265#endif
266#ifdef CONFIG_NUMA
266 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 267 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
267#endif 268#endif
268 269
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
deleted file mode 100644
index cb22acf3ed09..000000000000
--- a/arch/x86/kernel/sfi.c
+++ /dev/null
@@ -1,120 +0,0 @@
1/*
2 * sfi.c - x86 architecture SFI support.
3 *
4 * Copyright (c) 2009, Intel Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20
21#define KMSG_COMPONENT "SFI"
22#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24#include <linux/acpi.h>
25#include <linux/init.h>
26#include <linux/sfi.h>
27#include <linux/io.h>
28
29#include <asm/io_apic.h>
30#include <asm/mpspec.h>
31#include <asm/setup.h>
32#include <asm/apic.h>
33
34#ifdef CONFIG_X86_LOCAL_APIC
35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
36
37void __init mp_sfi_register_lapic_address(unsigned long address)
38{
39 mp_lapic_addr = address;
40
41 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
42 if (boot_cpu_physical_apicid == -1U)
43 boot_cpu_physical_apicid = read_apic_id();
44
45 pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid);
46}
47
48/* All CPUs enumerated by SFI must be present and enabled */
49void __cpuinit mp_sfi_register_lapic(u8 id)
50{
51 if (MAX_APICS - id <= 0) {
52 pr_warning("Processor #%d invalid (max %d)\n",
53 id, MAX_APICS);
54 return;
55 }
56
57 pr_info("registering lapic[%d]\n", id);
58
59 generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
60}
61
62static int __init sfi_parse_cpus(struct sfi_table_header *table)
63{
64 struct sfi_table_simple *sb;
65 struct sfi_cpu_table_entry *pentry;
66 int i;
67 int cpu_num;
68
69 sb = (struct sfi_table_simple *)table;
70 cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
71 pentry = (struct sfi_cpu_table_entry *)sb->pentry;
72
73 for (i = 0; i < cpu_num; i++) {
74 mp_sfi_register_lapic(pentry->apic_id);
75 pentry++;
76 }
77
78 smp_found_config = 1;
79 return 0;
80}
81#endif /* CONFIG_X86_LOCAL_APIC */
82
83#ifdef CONFIG_X86_IO_APIC
84
85static int __init sfi_parse_ioapic(struct sfi_table_header *table)
86{
87 struct sfi_table_simple *sb;
88 struct sfi_apic_table_entry *pentry;
89 int i, num;
90
91 sb = (struct sfi_table_simple *)table;
92 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
93 pentry = (struct sfi_apic_table_entry *)sb->pentry;
94
95 for (i = 0; i < num; i++) {
96 mp_register_ioapic(i, pentry->phys_addr, gsi_top);
97 pentry++;
98 }
99
100 WARN(pic_mode, KERN_WARNING
101 "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
102 pic_mode = 0;
103 return 0;
104}
105#endif /* CONFIG_X86_IO_APIC */
106
107/*
108 * sfi_platform_init(): register lapics & io-apics
109 */
110int __init sfi_platform_init(void)
111{
112#ifdef CONFIG_X86_LOCAL_APIC
113 mp_sfi_register_lapic_address(sfi_lapic_addr);
114 sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
115#endif
116#ifdef CONFIG_X86_IO_APIC
117 sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
118#endif
119 return 0;
120}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4fd173cd8e57..40a24932a8a1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -601,10 +601,7 @@ long sys_rt_sigreturn(struct pt_regs *regs)
601 goto badframe; 601 goto badframe;
602 602
603 sigdelsetmask(&set, ~_BLOCKABLE); 603 sigdelsetmask(&set, ~_BLOCKABLE);
604 spin_lock_irq(&current->sighand->siglock); 604 set_current_blocked(&set);
605 current->blocked = set;
606 recalc_sigpending();
607 spin_unlock_irq(&current->sighand->siglock);
608 605
609 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 606 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
610 goto badframe; 607 goto badframe;
@@ -682,6 +679,7 @@ static int
682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 679handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
683 sigset_t *oldset, struct pt_regs *regs) 680 sigset_t *oldset, struct pt_regs *regs)
684{ 681{
682 sigset_t blocked;
685 int ret; 683 int ret;
686 684
687 /* Are we from a system call? */ 685 /* Are we from a system call? */
@@ -741,12 +739,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
741 */ 739 */
742 regs->flags &= ~X86_EFLAGS_TF; 740 regs->flags &= ~X86_EFLAGS_TF;
743 741
744 spin_lock_irq(&current->sighand->siglock); 742 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
745 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
746 if (!(ka->sa.sa_flags & SA_NODEFER)) 743 if (!(ka->sa.sa_flags & SA_NODEFER))
747 sigaddset(&current->blocked, sig); 744 sigaddset(&blocked, sig);
748 recalc_sigpending(); 745 set_current_blocked(&blocked);
749 spin_unlock_irq(&current->sighand->siglock);
750 746
751 tracehook_signal_handler(sig, info, ka, regs, 747 tracehook_signal_handler(sig, info, ka, regs,
752 test_thread_flag(TIF_SINGLESTEP)); 748 test_thread_flag(TIF_SINGLESTEP));
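Both hunks above replace the open-coded siglock/recalc_sigpending() sequence with building the new mask in a local sigset_t and installing it atomically via set_current_blocked(). A rough user-space analogue of the same pattern, with sigprocmask() playing the role of set_current_blocked(); SIGUSR1 stands for the delivered signal and SIGUSR2 for the handler's sa_mask (purely illustrative):

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t cur, blocked;

	sigprocmask(SIG_SETMASK, NULL, &cur);	/* current->blocked */
	blocked = cur;

	/* OR in the handler's sa_mask (here just SIGUSR2) ... */
	sigaddset(&blocked, SIGUSR2);
	/* ... plus the signal being delivered, since SA_NODEFER is not set */
	sigaddset(&blocked, SIGUSR1);

	/* one atomic update, analogous to set_current_blocked(&blocked) */
	sigprocmask(SIG_SETMASK, &blocked, NULL);

	printf("SIGUSR1 blocked: %d\n", sigismember(&blocked, SIGUSR1));
	printf("SIGUSR2 blocked: %d\n", sigismember(&blocked, SIGUSR2));
	return 0;
}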
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 74cca6014c0e..ed4c4f54e2ae 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -174,10 +174,10 @@ asmlinkage void smp_reboot_interrupt(void)
174 irq_exit(); 174 irq_exit();
175} 175}
176 176
177static void native_smp_send_stop(void) 177static void native_stop_other_cpus(int wait)
178{ 178{
179 unsigned long flags; 179 unsigned long flags;
180 unsigned long wait; 180 unsigned long timeout;
181 181
182 if (reboot_force) 182 if (reboot_force)
183 return; 183 return;
@@ -194,9 +194,12 @@ static void native_smp_send_stop(void)
194 if (num_online_cpus() > 1) { 194 if (num_online_cpus() > 1) {
195 apic->send_IPI_allbutself(REBOOT_VECTOR); 195 apic->send_IPI_allbutself(REBOOT_VECTOR);
196 196
197 /* Don't wait longer than a second */ 197 /*
198 wait = USEC_PER_SEC; 198 * Don't wait longer than a second if the caller
199 while (num_online_cpus() > 1 && wait--) 199 * didn't ask us to wait.
200 */
201 timeout = USEC_PER_SEC;
202 while (num_online_cpus() > 1 && (wait || timeout--))
200 udelay(1); 203 udelay(1);
201 } 204 }
202 205
@@ -206,9 +209,7 @@ static void native_smp_send_stop(void)
206} 209}
207 210
208/* 211/*
209 * Reschedule call back. Nothing to do, 212 * Reschedule call back.
210 * all the work is done automatically when
211 * we return from the interrupt.
212 */ 213 */
213void smp_reschedule_interrupt(struct pt_regs *regs) 214void smp_reschedule_interrupt(struct pt_regs *regs)
214{ 215{
@@ -216,6 +217,11 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
216 /* LITMUS^RT: this IPI might need to trigger the sched state machine. */ 217 /* LITMUS^RT: this IPI might need to trigger the sched state machine. */
217 sched_state_ipi(); 218 sched_state_ipi();
218 inc_irq_stat(irq_resched_count); 219 inc_irq_stat(irq_resched_count);
220 /*
 221	 * LITMUS^RT: starting from 3.0, scheduler_ipi() actually does something.
222 * This may increase IPI latencies compared with previous versions.
223 */
224 scheduler_ipi();
219 TS_SEND_RESCHED_END; 225 TS_SEND_RESCHED_END;
220 /* 226 /*
221 * KVM uses this interrupt to force a cpu out of guest mode 227 * KVM uses this interrupt to force a cpu out of guest mode
@@ -254,7 +260,7 @@ struct smp_ops smp_ops = {
254 .smp_prepare_cpus = native_smp_prepare_cpus, 260 .smp_prepare_cpus = native_smp_prepare_cpus,
255 .smp_cpus_done = native_smp_cpus_done, 261 .smp_cpus_done = native_smp_cpus_done,
256 262
257 .smp_send_stop = native_smp_send_stop, 263 .stop_other_cpus = native_stop_other_cpus,
258 .smp_send_reschedule = native_smp_send_reschedule, 264 .smp_send_reschedule = native_smp_send_reschedule,
259 265
260 .cpu_up = native_cpu_up, 266 .cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8b3bfc4dd708..9fd3137230d4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,8 +62,9 @@
62#include <asm/pgtable.h> 62#include <asm/pgtable.h>
63#include <asm/tlbflush.h> 63#include <asm/tlbflush.h>
64#include <asm/mtrr.h> 64#include <asm/mtrr.h>
65#include <asm/vmi.h> 65#include <asm/mwait.h>
66#include <asm/apic.h> 66#include <asm/apic.h>
67#include <asm/io_apic.h>
67#include <asm/setup.h> 68#include <asm/setup.h>
68#include <asm/uv/uv.h> 69#include <asm/uv/uv.h>
69#include <linux/mc146818rtc.h> 70#include <linux/mc146818rtc.h>
@@ -71,10 +72,6 @@
71#include <asm/smpboot_hooks.h> 72#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h> 73#include <asm/i8259.h>
73 74
74#ifdef CONFIG_X86_32
75u8 apicid_2_node[MAX_APICID];
76#endif
77
78/* State of each CPU */ 75/* State of each CPU */
79DEFINE_PER_CPU(int, cpu_state) = { 0 }; 76DEFINE_PER_CPU(int, cpu_state) = { 0 };
80 77
@@ -97,12 +94,12 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
97 */ 94 */
98static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); 95static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
99 96
100void cpu_hotplug_driver_lock() 97void cpu_hotplug_driver_lock(void)
101{ 98{
102 mutex_lock(&x86_cpu_hotplug_driver_mutex); 99 mutex_lock(&x86_cpu_hotplug_driver_mutex);
103} 100}
104 101
105void cpu_hotplug_driver_unlock() 102void cpu_hotplug_driver_unlock(void)
106{ 103{
107 mutex_unlock(&x86_cpu_hotplug_driver_mutex); 104 mutex_unlock(&x86_cpu_hotplug_driver_mutex);
108} 105}
@@ -130,68 +127,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
130DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); 127DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
131EXPORT_PER_CPU_SYMBOL(cpu_core_map); 128EXPORT_PER_CPU_SYMBOL(cpu_core_map);
132 129
130DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
131
133/* Per CPU bogomips and other parameters */ 132/* Per CPU bogomips and other parameters */
134DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 133DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
135EXPORT_PER_CPU_SYMBOL(cpu_info); 134EXPORT_PER_CPU_SYMBOL(cpu_info);
136 135
137atomic_t init_deasserted; 136atomic_t init_deasserted;
138 137
139#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
140/* which node each logical CPU is on */
141int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
142EXPORT_SYMBOL(cpu_to_node_map);
143
144/* set up a mapping between cpu and node. */
145static void map_cpu_to_node(int cpu, int node)
146{
147 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
148 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
149 cpu_to_node_map[cpu] = node;
150}
151
152/* undo a mapping between cpu and node. */
153static void unmap_cpu_to_node(int cpu)
154{
155 int node;
156
157 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
158 for (node = 0; node < MAX_NUMNODES; node++)
159 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
160 cpu_to_node_map[cpu] = 0;
161}
162#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
163#define map_cpu_to_node(cpu, node) ({})
164#define unmap_cpu_to_node(cpu) ({})
165#endif
166
167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
171 { [0 ... NR_CPUS-1] = BAD_APICID };
172
173static void map_cpu_to_logical_apicid(void)
174{
175 int cpu = smp_processor_id();
176 int apicid = logical_smp_processor_id();
177 int node = apic->apicid_to_node(apicid);
178
179 if (!node_online(node))
180 node = first_online_node;
181
182 cpu_2_logical_apicid[cpu] = apicid;
183 map_cpu_to_node(cpu, node);
184}
185
186void numa_remove_cpu(int cpu)
187{
188 cpu_2_logical_apicid[cpu] = BAD_APICID;
189 unmap_cpu_to_node(cpu);
190}
191#else
192#define map_cpu_to_logical_apicid() do {} while (0)
193#endif
194
195/* 138/*
196 * Report back to the Boot Processor. 139 * Report back to the Boot Processor.
197 * Running on AP. 140 * Running on AP.
@@ -259,7 +202,6 @@ static void __cpuinit smp_callin(void)
259 apic->smp_callin_clear_local_apic(); 202 apic->smp_callin_clear_local_apic();
260 setup_local_APIC(); 203 setup_local_APIC();
261 end_local_APIC_setup(); 204 end_local_APIC_setup();
262 map_cpu_to_logical_apicid();
263 205
264 /* 206 /*
265 * Need to setup vector mappings before we enable interrupts. 207 * Need to setup vector mappings before we enable interrupts.
@@ -281,6 +223,13 @@ static void __cpuinit smp_callin(void)
281 */ 223 */
282 smp_store_cpu_info(cpuid); 224 smp_store_cpu_info(cpuid);
283 225
226 /*
227 * This must be done before setting cpu_online_mask
228 * or calling notify_cpu_starting.
229 */
230 set_cpu_sibling_map(raw_smp_processor_id());
231 wmb();
232
284 notify_cpu_starting(cpuid); 233 notify_cpu_starting(cpuid);
285 234
286 /* 235 /*
@@ -299,23 +248,16 @@ notrace static void __cpuinit start_secondary(void *unused)
299 * fragile that we want to limit the things done here to the 248 * fragile that we want to limit the things done here to the
300 * most necessary things. 249 * most necessary things.
301 */ 250 */
251 cpu_init();
252 preempt_disable();
253 smp_callin();
302 254
303#ifdef CONFIG_X86_32 255#ifdef CONFIG_X86_32
304 /* 256 /* switch away from the initial page table */
305 * Switch away from the trampoline page-table
306 *
307 * Do this before cpu_init() because it needs to access per-cpu
308 * data which may not be mapped in the trampoline page-table.
309 */
310 load_cr3(swapper_pg_dir); 257 load_cr3(swapper_pg_dir);
311 __flush_tlb_all(); 258 __flush_tlb_all();
312#endif 259#endif
313 260
314 vmi_bringup();
315 cpu_init();
316 preempt_disable();
317 smp_callin();
318
319 /* otherwise gcc will move up smp_processor_id before the cpu_init */ 261 /* otherwise gcc will move up smp_processor_id before the cpu_init */
320 barrier(); 262 barrier();
321 /* 263 /*
@@ -323,16 +265,6 @@ notrace static void __cpuinit start_secondary(void *unused)
323 */ 265 */
324 check_tsc_sync_target(); 266 check_tsc_sync_target();
325 267
326 if (nmi_watchdog == NMI_IO_APIC) {
327 legacy_pic->chip->mask(0);
328 enable_NMI_through_LVT0();
329 legacy_pic->chip->unmask(0);
330 }
331
332 /* This must be done before setting cpu_online_mask */
333 set_cpu_sibling_map(raw_smp_processor_id());
334 wmb();
335
336 /* 268 /*
337 * We need to hold call_lock, so there is no inconsistency 269 * We need to hold call_lock, so there is no inconsistency
338 * between the time smp_call_function() determines number of 270 * between the time smp_call_function() determines number of
@@ -353,6 +285,19 @@ notrace static void __cpuinit start_secondary(void *unused)
353 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 285 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
354 x86_platform.nmi_init(); 286 x86_platform.nmi_init();
355 287
288 /*
289 * Wait until the cpu which brought this one up marked it
290 * online before enabling interrupts. If we don't do that then
291 * we can end up waking up the softirq thread before this cpu
292 * reached the active state, which makes the scheduler unhappy
293 * and schedule the softirq thread on the wrong cpu. This is
294 * only observable with forced threaded interrupts, but in
295 * theory it could also happen w/o them. It's just way harder
296 * to achieve.
297 */
298 while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
299 cpu_relax();
300
356 /* enable local interrupts */ 301 /* enable local interrupts */
357 local_irq_enable(); 302 local_irq_enable();
358 303
@@ -365,23 +310,6 @@ notrace static void __cpuinit start_secondary(void *unused)
365 cpu_idle(); 310 cpu_idle();
366} 311}
367 312
368#ifdef CONFIG_CPUMASK_OFFSTACK
369/* In this case, llc_shared_map is a pointer to a cpumask. */
370static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
371 const struct cpuinfo_x86 *src)
372{
373 struct cpumask *llc = dst->llc_shared_map;
374 *dst = *src;
375 dst->llc_shared_map = llc;
376}
377#else
378static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
379 const struct cpuinfo_x86 *src)
380{
381 *dst = *src;
382}
383#endif /* CONFIG_CPUMASK_OFFSTACK */
384
385/* 313/*
386 * The bootstrap kernel entry code has set these up. Save them for 314 * The bootstrap kernel entry code has set these up. Save them for
387 * a given CPU 315 * a given CPU
@@ -391,12 +319,22 @@ void __cpuinit smp_store_cpu_info(int id)
391{ 319{
392 struct cpuinfo_x86 *c = &cpu_data(id); 320 struct cpuinfo_x86 *c = &cpu_data(id);
393 321
394 copy_cpuinfo_x86(c, &boot_cpu_data); 322 *c = boot_cpu_data;
395 c->cpu_index = id; 323 c->cpu_index = id;
396 if (id != 0) 324 if (id != 0)
397 identify_secondary_cpu(c); 325 identify_secondary_cpu(c);
398} 326}
399 327
328static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
329{
330 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
331 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
332 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
333 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
334 cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
335 cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
336}
337
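link_thread_siblings() above records two CPUs as SMT siblings, core siblings and last-level-cache sharers in one place; set_cpu_sibling_map() below decides when to call it from the package, core and (with X86_FEATURE_TOPOEXT) compute-unit ids. A toy user-space model of that bookkeeping, with each cpumask shrunk to a single 64-bit word and CPUs described only by a (package, core) pair; every name here is local to the sketch:

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS	4

struct cpu_topo { int pkg, core; };

static uint64_t sibling_mask[NR_CPUS], core_mask[NR_CPUS];

static void link_siblings(int c1, int c2)
{
	sibling_mask[c1] |= 1ULL << c2;
	sibling_mask[c2] |= 1ULL << c1;
	core_mask[c1]    |= 1ULL << c2;
	core_mask[c2]    |= 1ULL << c1;
}

int main(void)
{
	/* one package, two cores, two hyperthreads per core */
	struct cpu_topo topo[NR_CPUS] = { {0, 0}, {0, 0}, {0, 1}, {0, 1} };
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		sibling_mask[cpu] |= 1ULL << cpu;	/* a CPU is its own sibling */
		core_mask[cpu]    |= 1ULL << cpu;
		for (i = 0; i < cpu; i++) {
			if (topo[cpu].pkg == topo[i].pkg &&
			    topo[cpu].core == topo[i].core)
				link_siblings(cpu, i);	/* same core: full SMT link */
			if (topo[cpu].pkg == topo[i].pkg) {
				core_mask[cpu] |= 1ULL << i;	/* same package */
				core_mask[i]   |= 1ULL << cpu;
			}
		}
	}

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d: siblings=%#llx cores=%#llx\n", cpu,
		       (unsigned long long)sibling_mask[cpu],
		       (unsigned long long)core_mask[cpu]);
	return 0;
}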
400 338
401void __cpuinit set_cpu_sibling_map(int cpu) 339void __cpuinit set_cpu_sibling_map(int cpu)
402{ 340{
@@ -409,23 +347,23 @@ void __cpuinit set_cpu_sibling_map(int cpu)
409 for_each_cpu(i, cpu_sibling_setup_mask) { 347 for_each_cpu(i, cpu_sibling_setup_mask) {
410 struct cpuinfo_x86 *o = &cpu_data(i); 348 struct cpuinfo_x86 *o = &cpu_data(i);
411 349
412 if (c->phys_proc_id == o->phys_proc_id && 350 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
413 c->cpu_core_id == o->cpu_core_id) { 351 if (c->phys_proc_id == o->phys_proc_id &&
414 cpumask_set_cpu(i, cpu_sibling_mask(cpu)); 352 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
415 cpumask_set_cpu(cpu, cpu_sibling_mask(i)); 353 c->compute_unit_id == o->compute_unit_id)
416 cpumask_set_cpu(i, cpu_core_mask(cpu)); 354 link_thread_siblings(cpu, i);
417 cpumask_set_cpu(cpu, cpu_core_mask(i)); 355 } else if (c->phys_proc_id == o->phys_proc_id &&
418 cpumask_set_cpu(i, c->llc_shared_map); 356 c->cpu_core_id == o->cpu_core_id) {
419 cpumask_set_cpu(cpu, o->llc_shared_map); 357 link_thread_siblings(cpu, i);
420 } 358 }
421 } 359 }
422 } else { 360 } else {
423 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 361 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
424 } 362 }
425 363
426 cpumask_set_cpu(cpu, c->llc_shared_map); 364 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
427 365
428 if (current_cpu_data.x86_max_cores == 1) { 366 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
429 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); 367 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
430 c->booted_cores = 1; 368 c->booted_cores = 1;
431 return; 369 return;
@@ -434,8 +372,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
434 for_each_cpu(i, cpu_sibling_setup_mask) { 372 for_each_cpu(i, cpu_sibling_setup_mask) {
435 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 373 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
436 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 374 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
437 cpumask_set_cpu(i, c->llc_shared_map); 375 cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
438 cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); 376 cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
439 } 377 }
440 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 378 if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
441 cpumask_set_cpu(i, cpu_core_mask(cpu)); 379 cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -474,7 +412,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
474 !(cpu_has(c, X86_FEATURE_AMD_DCM))) 412 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
475 return cpu_core_mask(cpu); 413 return cpu_core_mask(cpu);
476 else 414 else
477 return c->llc_shared_map; 415 return cpu_llc_shared_mask(cpu);
478} 416}
479 417
480static void impress_friends(void) 418static void impress_friends(void)
@@ -636,7 +574,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
636 * target processor state. 574 * target processor state.
637 */ 575 */
638 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, 576 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
639 (unsigned long)stack_start.sp); 577 stack_start);
640 578
641 /* 579 /*
642 * Run STARTUP IPI loop. 580 * Run STARTUP IPI loop.
@@ -742,7 +680,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
742 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 680 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
743 }; 681 };
744 682
745 INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); 683 INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
746 684
747 alternatives_smp_switch(1); 685 alternatives_smp_switch(1);
748 686
@@ -774,7 +712,6 @@ do_rest:
774#ifdef CONFIG_X86_32 712#ifdef CONFIG_X86_32
775 /* Stack for startup_32 can be just as for start_secondary onwards */ 713 /* Stack for startup_32 can be just as for start_secondary onwards */
776 irq_ctx_init(cpu); 714 irq_ctx_init(cpu);
777 initial_page_table = __pa(&trampoline_pg_dir);
778#else 715#else
779 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 716 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
780 initial_gs = per_cpu_offset(cpu); 717 initial_gs = per_cpu_offset(cpu);
@@ -784,10 +721,10 @@ do_rest:
784#endif 721#endif
785 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 722 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
786 initial_code = (unsigned long)start_secondary; 723 initial_code = (unsigned long)start_secondary;
787 stack_start.sp = (void *) c_idle.idle->thread.sp; 724 stack_start = c_idle.idle->thread.sp;
788 725
789 /* start_ip had better be page-aligned! */ 726 /* start_ip had better be page-aligned! */
790 start_ip = setup_trampoline(); 727 start_ip = trampoline_address();
791 728
792 /* So we see what's up */ 729 /* So we see what's up */
793 announce_cpu(cpu, apicid); 730 announce_cpu(cpu, apicid);
@@ -797,6 +734,8 @@ do_rest:
797 * the targeted processor. 734 * the targeted processor.
798 */ 735 */
799 736
737 printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
738
800 atomic_set(&init_deasserted, 0); 739 atomic_set(&init_deasserted, 0);
801 740
802 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 741 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -850,8 +789,8 @@ do_rest:
850 pr_debug("CPU%d: has booted.\n", cpu); 789 pr_debug("CPU%d: has booted.\n", cpu);
851 else { 790 else {
852 boot_error = 1; 791 boot_error = 1;
853 if (*((volatile unsigned char *)trampoline_base) 792 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
854 == 0xA5) 793 == 0xA5A5A5A5)
855 /* trampoline started but...? */ 794 /* trampoline started but...? */
856 pr_err("CPU%d: Stuck ??\n", cpu); 795 pr_err("CPU%d: Stuck ??\n", cpu);
857 else 796 else
@@ -877,7 +816,7 @@ do_rest:
877 } 816 }
878 817
879 /* mark "stuck" area as not stuck */ 818 /* mark "stuck" area as not stuck */
880 *((volatile unsigned long *)trampoline_base) = 0; 819 *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
881 820
882 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 821 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
883 /* 822 /*
@@ -923,7 +862,6 @@ int __cpuinit native_cpu_up(unsigned int cpu)
923 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 862 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
924 863
925 err = do_boot_cpu(apicid, cpu); 864 err = do_boot_cpu(apicid, cpu);
926
927 if (err) { 865 if (err) {
928 pr_debug("do_boot_cpu failed %d\n", err); 866 pr_debug("do_boot_cpu failed %d\n", err);
929 return -EIO; 867 return -EIO;
@@ -945,6 +883,14 @@ int __cpuinit native_cpu_up(unsigned int cpu)
945 return 0; 883 return 0;
946} 884}
947 885
886/**
887 * arch_disable_smp_support() - disables SMP support for x86 at runtime
888 */
889void arch_disable_smp_support(void)
890{
891 disable_ioapic_support();
892}
893
948/* 894/*
949 * Fall back to non SMP mode after errors. 895 * Fall back to non SMP mode after errors.
950 * 896 *
@@ -960,7 +906,6 @@ static __init void disable_smp(void)
960 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 906 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
961 else 907 else
962 physid_set_mask_of_physid(0, &phys_cpu_present_map); 908 physid_set_mask_of_physid(0, &phys_cpu_present_map);
963 map_cpu_to_logical_apicid();
964 cpumask_set_cpu(0, cpu_sibling_mask(0)); 909 cpumask_set_cpu(0, cpu_sibling_mask(0));
965 cpumask_set_cpu(0, cpu_core_mask(0)); 910 cpumask_set_cpu(0, cpu_core_mask(0));
966} 911}
@@ -1045,7 +990,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1045 "(tell your hw vendor)\n"); 990 "(tell your hw vendor)\n");
1046 } 991 }
1047 smpboot_clear_io_apic(); 992 smpboot_clear_io_apic();
1048 arch_disable_smp_support(); 993 disable_ioapic_support();
1049 return -1; 994 return -1;
1050 } 995 }
1051 996
@@ -1058,11 +1003,9 @@ static int __init smp_sanity_check(unsigned max_cpus)
1058 printk(KERN_INFO "SMP mode deactivated.\n"); 1003 printk(KERN_INFO "SMP mode deactivated.\n");
1059 smpboot_clear_io_apic(); 1004 smpboot_clear_io_apic();
1060 1005
1061 localise_nmi_watchdog();
1062
1063 connect_bsp_APIC(); 1006 connect_bsp_APIC();
1064 setup_local_APIC(); 1007 setup_local_APIC();
1065 end_local_APIC_setup(); 1008 bsp_end_local_APIC_setup();
1066 return -1; 1009 return -1;
1067 } 1010 }
1068 1011
@@ -1091,26 +1034,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1091 1034
1092 preempt_disable(); 1035 preempt_disable();
1093 smp_cpu_index_default(); 1036 smp_cpu_index_default();
1094 current_cpu_data = boot_cpu_data; 1037
1095 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1096 mb();
1097 /* 1038 /*
1098 * Setup boot CPU information 1039 * Setup boot CPU information
1099 */ 1040 */
1100 smp_store_cpu_info(0); /* Final full version of the data */ 1041 smp_store_cpu_info(0); /* Final full version of the data */
1101#ifdef CONFIG_X86_32 1042 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1102 boot_cpu_logical_apicid = logical_smp_processor_id(); 1043 mb();
1103#endif 1044
1104 current_thread_info()->cpu = 0; /* needed? */ 1045 current_thread_info()->cpu = 0; /* needed? */
1105 for_each_possible_cpu(i) { 1046 for_each_possible_cpu(i) {
1106 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 1047 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1107 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 1048 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1108 zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); 1049 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1109 } 1050 }
1110 set_cpu_sibling_map(0); 1051 set_cpu_sibling_map(0);
1111 1052
1112 enable_IR_x2apic();
1113 default_setup_apic_routing();
1114 1053
1115 if (smp_sanity_check(max_cpus) < 0) { 1054 if (smp_sanity_check(max_cpus) < 0) {
1116 printk(KERN_INFO "SMP disabled\n"); 1055 printk(KERN_INFO "SMP disabled\n");
@@ -1118,6 +1057,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1118 goto out; 1057 goto out;
1119 } 1058 }
1120 1059
1060 default_setup_apic_routing();
1061
1121 preempt_disable(); 1062 preempt_disable();
1122 if (read_apic_id() != boot_cpu_physical_apicid) { 1063 if (read_apic_id() != boot_cpu_physical_apicid) {
1123 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 1064 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
@@ -1139,9 +1080,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1139 if (!skip_ioapic_setup && nr_ioapics) 1080 if (!skip_ioapic_setup && nr_ioapics)
1140 enable_IO_APIC(); 1081 enable_IO_APIC();
1141 1082
1142 end_local_APIC_setup(); 1083 bsp_end_local_APIC_setup();
1143
1144 map_cpu_to_logical_apicid();
1145 1084
1146 if (apic->setup_portio_remap) 1085 if (apic->setup_portio_remap)
1147 apic->setup_portio_remap(); 1086 apic->setup_portio_remap();
@@ -1163,6 +1102,20 @@ out:
1163 preempt_enable(); 1102 preempt_enable();
1164} 1103}
1165 1104
1105void arch_disable_nonboot_cpus_begin(void)
1106{
1107 /*
1108 * Avoid the smp alternatives switch during the disable_nonboot_cpus().
1109 * In the suspend path, we will be back in the SMP mode shortly anyways.
1110 */
1111 skip_smp_alternatives = true;
1112}
1113
1114void arch_disable_nonboot_cpus_end(void)
1115{
1116 skip_smp_alternatives = false;
1117}
1118
1166void arch_enable_nonboot_cpus_begin(void) 1119void arch_enable_nonboot_cpus_begin(void)
1167{ 1120{
1168 set_mtrr_aps_delayed_init(); 1121 set_mtrr_aps_delayed_init();
@@ -1193,7 +1146,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1193#ifdef CONFIG_X86_IO_APIC 1146#ifdef CONFIG_X86_IO_APIC
1194 setup_ioapic_dest(); 1147 setup_ioapic_dest();
1195#endif 1148#endif
1196 check_nmi_watchdog();
1197 mtrr_aps_init(); 1149 mtrr_aps_init();
1198} 1150}
1199 1151
@@ -1338,8 +1290,6 @@ int native_cpu_disable(void)
1338 if (cpu == 0) 1290 if (cpu == 0)
1339 return -EBUSY; 1291 return -EBUSY;
1340 1292
1341 if (nmi_watchdog == NMI_LOCAL_APIC)
1342 stop_apic_nmi_watchdog(NULL);
1343 clear_local_APIC(); 1293 clear_local_APIC();
1344 1294
1345 cpu_disable_common(); 1295 cpu_disable_common();
@@ -1370,12 +1320,11 @@ void play_dead_common(void)
1370{ 1320{
1371 idle_task_exit(); 1321 idle_task_exit();
1372 reset_lazy_tlbstate(); 1322 reset_lazy_tlbstate();
1373 irq_ctx_exit(raw_smp_processor_id()); 1323 amd_e400_remove_cpu(raw_smp_processor_id());
1374 c1e_remove_cpu(raw_smp_processor_id());
1375 1324
1376 mb(); 1325 mb();
1377 /* Ack it */ 1326 /* Ack it */
1378 __get_cpu_var(cpu_state) = CPU_DEAD; 1327 __this_cpu_write(cpu_state, CPU_DEAD);
1379 1328
1380 /* 1329 /*
1381 * With physical CPU hotplug, we should halt the cpu 1330 * With physical CPU hotplug, we should halt the cpu
@@ -1383,11 +1332,89 @@ void play_dead_common(void)
1383 local_irq_disable(); 1332 local_irq_disable();
1384} 1333}
1385 1334
1335/*
1336 * We need to flush the caches before going to sleep, lest we have
1337 * dirty data in our caches when we come back up.
1338 */
1339static inline void mwait_play_dead(void)
1340{
1341 unsigned int eax, ebx, ecx, edx;
1342 unsigned int highest_cstate = 0;
1343 unsigned int highest_subcstate = 0;
1344 int i;
1345 void *mwait_ptr;
1346 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
1347
1348 if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
1349 return;
1350 if (!this_cpu_has(X86_FEATURE_CLFLSH))
1351 return;
1352 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1353 return;
1354
1355 eax = CPUID_MWAIT_LEAF;
1356 ecx = 0;
1357 native_cpuid(&eax, &ebx, &ecx, &edx);
1358
1359 /*
1360 * eax will be 0 if EDX enumeration is not valid.
1361 * Initialized below to cstate, sub_cstate value when EDX is valid.
1362 */
1363 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
1364 eax = 0;
1365 } else {
1366 edx >>= MWAIT_SUBSTATE_SIZE;
1367 for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
1368 if (edx & MWAIT_SUBSTATE_MASK) {
1369 highest_cstate = i;
1370 highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
1371 }
1372 }
1373 eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
1374 (highest_subcstate - 1);
1375 }
1376
1377 /*
1378 * This should be a memory location in a cache line which is
1379 * unlikely to be touched by other processors. The actual
1380 * content is immaterial as it is not actually modified in any way.
1381 */
1382 mwait_ptr = &current_thread_info()->flags;
1383
1384 wbinvd();
1385
1386 while (1) {
1387 /*
1388 * The CLFLUSH is a workaround for erratum AAI65 for
1389 * the Xeon 7400 series. It's not clear it is actually
1390 * needed, but it should be harmless in either case.
1391 * The WBINVD is insufficient due to the spurious-wakeup
1392 * case where we return around the loop.
1393 */
1394 clflush(mwait_ptr);
1395 __monitor(mwait_ptr, 0, 0);
1396 mb();
1397 __mwait(eax, 0);
1398 }
1399}
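mwait_play_dead() above walks CPUID leaf 0x5 to find the deepest C-state the hardware advertises and encodes it as the MWAIT hint (cstate << 4) | (sub-state count - 1). The same enumeration can be reproduced from user space; the sketch below relies on GCC's <cpuid.h> helper and the conventional 4-bit sub-state fields, with the kernel's constants redefined locally since its headers are not usable here:

#include <stdio.h>
#include <cpuid.h>

#define MWAIT_SUBSTATE_SIZE	4
#define MWAIT_SUBSTATE_MASK	0xf
#define CPUID5_ECX_EXTENSIONS_SUPPORTED	0x1

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0, highest_subcstate = 0, hint = 0;
	int i;

	if (!__get_cpuid(5, &eax, &ebx, &ecx, &edx))
		return 1;	/* MONITOR/MWAIT leaf not reported */

	if (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) {
		edx >>= MWAIT_SUBSTATE_SIZE;	/* skip the C0 field */
		for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
			if (edx & MWAIT_SUBSTATE_MASK) {
				highest_cstate = i;
				highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
			}
		}
		hint = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		       (highest_subcstate - 1);
	}

	printf("deepest advertised MWAIT hint: 0x%x\n", hint);
	return 0;
}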
1400
1401static inline void hlt_play_dead(void)
1402{
1403 if (__this_cpu_read(cpu_info.x86) >= 4)
1404 wbinvd();
1405
1406 while (1) {
1407 native_halt();
1408 }
1409}
1410
1386void native_play_dead(void) 1411void native_play_dead(void)
1387{ 1412{
1388 play_dead_common(); 1413 play_dead_common();
1389 tboot_shutdown(TB_SHUTDOWN_WFS); 1414 tboot_shutdown(TB_SHUTDOWN_WFS);
1390 wbinvd_halt(); 1415
1416 mwait_play_dead(); /* Only returns on failure */
1417 hlt_play_dead();
1391} 1418}
1392 1419
1393#else /* ... !CONFIG_HOTPLUG_CPU */ 1420#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index b53c525368a7..55d9bc03f696 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -9,15 +9,6 @@
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
11 11
12static void save_stack_warning(void *data, char *msg)
13{
14}
15
16static void
17save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
18{
19}
20
21static int save_stack_stack(void *data, char *name) 12static int save_stack_stack(void *data, char *name)
22{ 13{
23 return 0; 14 return 0;
@@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
53} 44}
54 45
55static const struct stacktrace_ops save_stack_ops = { 46static const struct stacktrace_ops save_stack_ops = {
56 .warning = save_stack_warning,
57 .warning_symbol = save_stack_warning_symbol,
58 .stack = save_stack_stack, 47 .stack = save_stack_stack,
59 .address = save_stack_address, 48 .address = save_stack_address,
60 .walk_stack = print_context_stack, 49 .walk_stack = print_context_stack,
61}; 50};
62 51
63static const struct stacktrace_ops save_stack_ops_nosched = { 52static const struct stacktrace_ops save_stack_ops_nosched = {
64 .warning = save_stack_warning,
65 .warning_symbol = save_stack_warning_symbol,
66 .stack = save_stack_stack, 53 .stack = save_stack_stack,
67 .address = save_stack_address_nosched, 54 .address = save_stack_address_nosched,
68 .walk_stack = print_context_stack, 55 .walk_stack = print_context_stack,
@@ -79,9 +66,9 @@ void save_stack_trace(struct stack_trace *trace)
79} 66}
80EXPORT_SYMBOL_GPL(save_stack_trace); 67EXPORT_SYMBOL_GPL(save_stack_trace);
81 68
82void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) 69void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
83{ 70{
84 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); 71 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
85 if (trace->nr_entries < trace->max_entries) 72 if (trace->nr_entries < trace->max_entries)
86 trace->entries[trace->nr_entries++] = ULONG_MAX; 73 trace->entries[trace->nr_entries++] = ULONG_MAX;
87} 74}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 58de45ee08b6..7977f0cfe339 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block)
166 * Make sure block stepping (BTF) is not enabled unless it should be. 166 * Make sure block stepping (BTF) is not enabled unless it should be.
167 * Note that we don't try to worry about any is_setting_trap_flag() 167 * Note that we don't try to worry about any is_setting_trap_flag()
168 * instructions after the first when using block stepping. 168 * instructions after the first when using block stepping.
169 * So noone should try to use debugger block stepping in a program 169 * So no one should try to use debugger block stepping in a program
170 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
171 */ 171 */
172 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d5e06624e34a..0b0cb5fede19 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -33,8 +33,8 @@ int kernel_execve(const char *filename,
33 const char *const envp[]) 33 const char *const envp[])
34{ 34{
35 long __res; 35 long __res;
36 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" 36 asm volatile ("int $0x80"
37 : "=a" (__res) 37 : "=a" (__res)
38 : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); 38 : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
39 return __res; 39 return __res;
40} 40}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 37702905f658..d0126222b394 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -340,15 +340,21 @@ ENTRY(sys_call_table)
340 .long sys_fanotify_init 340 .long sys_fanotify_init
341 .long sys_fanotify_mark 341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */ 342 .long sys_prlimit64 /* 340 */
343 .long sys_set_rt_task_param /* LITMUS^RT 341 */ 343 .long sys_name_to_handle_at
344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime
346 .long sys_syncfs
347 .long sys_sendmmsg /* 345 */
348 .long sys_setns
349 .long sys_set_rt_task_param /* LITMUS^RT 347 */
344 .long sys_get_rt_task_param 350 .long sys_get_rt_task_param
345 .long sys_complete_job 351 .long sys_complete_job
346 .long sys_od_open 352 .long sys_od_open
347 .long sys_od_close 353 .long sys_od_close
348 .long sys_litmus_lock 354 .long sys_litmus_lock /* +5 */
349 .long sys_litmus_unlock 355 .long sys_litmus_unlock
350 .long sys_query_job_no 356 .long sys_query_job_no
351 .long sys_wait_for_job_release 357 .long sys_wait_for_job_release
352 .long sys_wait_for_ts_release 358 .long sys_wait_for_ts_release
353 .long sys_release_ts 359 .long sys_release_ts /* +10 */
354 .long sys_null_call 360 .long sys_null_call
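Because six new upstream entries (sys_name_to_handle_at through sys_setns) are inserted ahead of them, the LITMUS^RT system calls on 32-bit x86 move from a base of 341 to 347; the /* +5 */ and /* +10 */ markers give the new offsets, which puts sys_null_call at 358. User space that issues these calls by raw number must be rebuilt against matching headers. A hypothetical sketch (real programs would normally go through liblitmus; the single pointer argument of sys_null_call, used to return a timestamp, is an assumption here):

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>

/* 347 = sys_set_rt_task_param per the table above; sys_null_call is
 * eleven entries later */
#define __NR_null_call	358

int main(void)
{
	unsigned long long ts = 0;	/* assumed out-parameter */
	long ret = syscall(__NR_null_call, &ts);

	/* returns -1/ENOSYS on a kernel without LITMUS^RT */
	printf("sys_null_call: ret=%ld ts=%llu\n", ret, ts);
	return 0;
}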
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..30ac65df7d4e 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -110,7 +110,6 @@ static struct mm_struct tboot_mm = {
110 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), 110 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
111 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 111 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
112 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 112 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
113 .cpu_vm_mask = CPU_MASK_ALL,
114}; 113};
115 114
116static inline void switch_to_tboot_pt(void) 115static inline void switch_to_tboot_pt(void)
@@ -133,7 +132,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
133 pmd = pmd_alloc(&tboot_mm, pud, vaddr); 132 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
134 if (!pmd) 133 if (!pmd)
135 return -1; 134 return -1;
136 pte = pte_alloc_map(&tboot_mm, pmd, vaddr); 135 pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
137 if (!pte) 136 if (!pte)
138 return -1; 137 return -1;
139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); 138 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 787a5e499dd1..3f92ce07e525 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -161,7 +161,7 @@ static int test_NX(void)
161 } 161 }
162 162
163#endif 163#endif
164 return 0; 164 return ret;
165} 165}
166 166
167static void test_exit(void) 167static void test_exit(void)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index fb5cc5e14cfa..00cbb272627f 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -22,12 +22,8 @@
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/time.h> 23#include <asm/time.h>
24 24
25#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
26int timer_ack;
27#endif
28
29#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
30volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; 26DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
31#endif 27#endif
32 28
33unsigned long profile_pc(struct pt_regs *regs) 29unsigned long profile_pc(struct pt_regs *regs)
@@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
63 /* Keep nmi watchdog up to date */ 59 /* Keep nmi watchdog up to date */
64 inc_irq_stat(irq0_irqs); 60 inc_irq_stat(irq0_irqs);
65 61
66 /* Optimized out for !IO_APIC and x86_64 */
67 if (timer_ack) {
68 /*
69 * Subtle, when I/O APICs are used we have to ack timer IRQ
70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system.
72 */
73 raw_spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL);
77 raw_spin_unlock(&i8259A_lock);
78 }
79
80 global_clock_event->event_handler(global_clock_event); 62 global_clock_event->event_handler(global_clock_event);
81 63
82 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ 64 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
deleted file mode 100644
index 312ef0292815..000000000000
--- a/arch/x86/kernel/tlb_uv.c
+++ /dev/null
@@ -1,1655 +0,0 @@
1/*
2 * SGI UltraViolet TLB flush routines.
3 *
4 * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
5 *
6 * This code is released under the GNU General Public License version 2 or
7 * later.
8 */
9#include <linux/seq_file.h>
10#include <linux/proc_fs.h>
11#include <linux/debugfs.h>
12#include <linux/kernel.h>
13#include <linux/slab.h>
14
15#include <asm/mmu_context.h>
16#include <asm/uv/uv.h>
17#include <asm/uv/uv_mmrs.h>
18#include <asm/uv/uv_hub.h>
19#include <asm/uv/uv_bau.h>
20#include <asm/apic.h>
21#include <asm/idle.h>
22#include <asm/tsc.h>
23#include <asm/irq_vectors.h>
24#include <asm/timer.h>
25
26/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
27static int timeout_base_ns[] = {
28 20,
29 160,
30 1280,
31 10240,
32 81920,
33 655360,
34 5242880,
35 167772160
36};
37static int timeout_us;
38static int nobau;
39static int baudisabled;
40static spinlock_t disable_lock;
41static cycles_t congested_cycles;
42
43/* tunables: */
44static int max_bau_concurrent = MAX_BAU_CONCURRENT;
45static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
46static int plugged_delay = PLUGGED_DELAY;
47static int plugsb4reset = PLUGSB4RESET;
48static int timeoutsb4reset = TIMEOUTSB4RESET;
49static int ipi_reset_limit = IPI_RESET_LIMIT;
50static int complete_threshold = COMPLETE_THRESHOLD;
51static int congested_response_us = CONGESTED_RESPONSE_US;
52static int congested_reps = CONGESTED_REPS;
53static int congested_period = CONGESTED_PERIOD;
54static struct dentry *tunables_dir;
55static struct dentry *tunables_file;
56
57static int __init setup_nobau(char *arg)
58{
59 nobau = 1;
60 return 0;
61}
62early_param("nobau", setup_nobau);
63
64/* base pnode in this partition */
65static int uv_partition_base_pnode __read_mostly;
66/* position of pnode (which is nasid>>1): */
67static int uv_nshift __read_mostly;
68static unsigned long uv_mmask __read_mostly;
69
70static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
71static DEFINE_PER_CPU(struct bau_control, bau_control);
72static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
73
74/*
75 * Determine the first node on a uvhub. 'Nodes' are used for kernel
76 * memory allocation.
77 */
78static int __init uvhub_to_first_node(int uvhub)
79{
80 int node, b;
81
82 for_each_online_node(node) {
83 b = uv_node_to_blade_id(node);
84 if (uvhub == b)
85 return node;
86 }
87 return -1;
88}
89
90/*
91 * Determine the apicid of the first cpu on a uvhub.
92 */
93static int __init uvhub_to_first_apicid(int uvhub)
94{
95 int cpu;
96
97 for_each_present_cpu(cpu)
98 if (uvhub == uv_cpu_to_blade_id(cpu))
99 return per_cpu(x86_cpu_to_apicid, cpu);
100 return -1;
101}
102
103/*
104 * Free a software acknowledge hardware resource by clearing its Pending
105 * bit. This will return a reply to the sender.
106 * If the message has timed out, a reply has already been sent by the
107 * hardware but the resource has not been released. In that case our
108 * clear of the Timeout bit (as well) will free the resource. No reply will
109 * be sent (the hardware will only do one reply per message).
110 */
111static inline void uv_reply_to_message(struct msg_desc *mdp,
112 struct bau_control *bcp)
113{
114 unsigned long dw;
115 struct bau_payload_queue_entry *msg;
116
117 msg = mdp->msg;
118 if (!msg->canceled) {
119 dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
120 msg->sw_ack_vector;
121 uv_write_local_mmr(
122 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
123 }
124 msg->replied_to = 1;
125 msg->sw_ack_vector = 0;
126}
127
128/*
129 * Process the receipt of a RETRY message
130 */
131static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
132 struct bau_control *bcp)
133{
134 int i;
135 int cancel_count = 0;
136 int slot2;
137 unsigned long msg_res;
138 unsigned long mmr = 0;
139 struct bau_payload_queue_entry *msg;
140 struct bau_payload_queue_entry *msg2;
141 struct ptc_stats *stat;
142
143 msg = mdp->msg;
144 stat = bcp->statp;
145 stat->d_retries++;
146 /*
147 * cancel any message from msg+1 to the retry itself
148 */
149 for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
150 if (msg2 > mdp->va_queue_last)
151 msg2 = mdp->va_queue_first;
152 if (msg2 == msg)
153 break;
154
155 /* same conditions for cancellation as uv_do_reset */
156 if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
157 (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
158 msg->sw_ack_vector) == 0) &&
159 (msg2->sending_cpu == msg->sending_cpu) &&
160 (msg2->msg_type != MSG_NOOP)) {
161 slot2 = msg2 - mdp->va_queue_first;
162 mmr = uv_read_local_mmr
163 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
164 msg_res = msg2->sw_ack_vector;
165 /*
166 * This is a message retry; clear the resources held
167 * by the previous message only if they timed out.
168 * If it has not timed out we have an unexpected
169 * situation to report.
170 */
171 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
172 /*
173 * is the resource timed out?
174 * make everyone ignore the cancelled message.
175 */
176 msg2->canceled = 1;
177 stat->d_canceled++;
178 cancel_count++;
179 uv_write_local_mmr(
180 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
181 (msg_res << UV_SW_ACK_NPENDING) |
182 msg_res);
183 }
184 }
185 }
186 if (!cancel_count)
187 stat->d_nocanceled++;
188}
189
190/*
191 * Do all the things a cpu should do for a TLB shootdown message.
192 * Other cpu's may come here at the same time for this message.
193 */
194static void uv_bau_process_message(struct msg_desc *mdp,
195 struct bau_control *bcp)
196{
197 int msg_ack_count;
198 short socket_ack_count = 0;
199 struct ptc_stats *stat;
200 struct bau_payload_queue_entry *msg;
201 struct bau_control *smaster = bcp->socket_master;
202
203 /*
204 * This must be a normal message, or retry of a normal message
205 */
206 msg = mdp->msg;
207 stat = bcp->statp;
208 if (msg->address == TLB_FLUSH_ALL) {
209 local_flush_tlb();
210 stat->d_alltlb++;
211 } else {
212 __flush_tlb_one(msg->address);
213 stat->d_onetlb++;
214 }
215 stat->d_requestee++;
216
217 /*
218 * One cpu on each uvhub has the additional job on a RETRY
219 * of releasing the resource held by the message that is
220 * being retried. That message is identified by sending
221 * cpu number.
222 */
223 if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
224 uv_bau_process_retry_msg(mdp, bcp);
225
226 /*
227 * This is a sw_ack message, so we have to reply to it.
228 * Count each responding cpu on the socket. This avoids
229 * pinging the count's cache line back and forth between
230 * the sockets.
231 */
232 socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
233 &smaster->socket_acknowledge_count[mdp->msg_slot]);
234 if (socket_ack_count == bcp->cpus_in_socket) {
235 /*
236 * Both sockets dump their completed count total into
237 * the message's count.
238 */
239 smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
240 msg_ack_count = atomic_add_short_return(socket_ack_count,
241 (struct atomic_short *)&msg->acknowledge_count);
242
243 if (msg_ack_count == bcp->cpus_in_uvhub) {
244 /*
245 * All cpus in uvhub saw it; reply
246 */
247 uv_reply_to_message(mdp, bcp);
248 }
249 }
250
251 return;
252}
253
254/*
255 * Determine the first cpu on a uvhub.
256 */
257static int uvhub_to_first_cpu(int uvhub)
258{
259 int cpu;
260 for_each_present_cpu(cpu)
261 if (uvhub == uv_cpu_to_blade_id(cpu))
262 return cpu;
263 return -1;
264}
265
266/*
267 * Last resort when we get a large number of destination timeouts is
268 * to clear resources held by a given cpu.
269 * Do this with IPI so that all messages in the BAU message queue
270 * can be identified by their nonzero sw_ack_vector field.
271 *
272 * This is entered for a single cpu on the uvhub.
273 * The sender wants this uvhub to free a specific message's
274 * sw_ack resources.
275 */
276static void
277uv_do_reset(void *ptr)
278{
279 int i;
280 int slot;
281 int count = 0;
282 unsigned long mmr;
283 unsigned long msg_res;
284 struct bau_control *bcp;
285 struct reset_args *rap;
286 struct bau_payload_queue_entry *msg;
287 struct ptc_stats *stat;
288
289 bcp = &per_cpu(bau_control, smp_processor_id());
290 rap = (struct reset_args *)ptr;
291 stat = bcp->statp;
292 stat->d_resets++;
293
294 /*
295 * We're looking for the given sender, and
296 * will free its sw_ack resource.
297 * If all cpu's finally responded after the timeout, its
298 * message 'replied_to' was set.
299 */
300 for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
301 /* uv_do_reset: same conditions for cancellation as
302 uv_bau_process_retry_msg() */
303 if ((msg->replied_to == 0) &&
304 (msg->canceled == 0) &&
305 (msg->sending_cpu == rap->sender) &&
306 (msg->sw_ack_vector) &&
307 (msg->msg_type != MSG_NOOP)) {
308 /*
309 * make everyone else ignore this message
310 */
311 msg->canceled = 1;
312 slot = msg - bcp->va_queue_first;
313 count++;
314 /*
315 * only reset the resource if it is still pending
316 */
317 mmr = uv_read_local_mmr
318 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
319 msg_res = msg->sw_ack_vector;
320 if (mmr & msg_res) {
321 stat->d_rcanceled++;
322 uv_write_local_mmr(
323 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
324 (msg_res << UV_SW_ACK_NPENDING) |
325 msg_res);
326 }
327 }
328 }
329 return;
330}
331
332/*
333 * Use IPI to get all target uvhubs to release resources held by
334 * a given sending cpu number.
335 */
336static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
337 int sender)
338{
339 int uvhub;
340 int cpu;
341 cpumask_t mask;
342 struct reset_args reset_args;
343
344 reset_args.sender = sender;
345
346 cpus_clear(mask);
347 /* find a single cpu for each uvhub in this distribution mask */
348 for (uvhub = 0;
349 uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
350 uvhub++) {
351 if (!bau_uvhub_isset(uvhub, distribution))
352 continue;
353 /* find a cpu for this uvhub */
354 cpu = uvhub_to_first_cpu(uvhub);
355 cpu_set(cpu, mask);
356 }
357 /* IPI all cpus; Preemption is already disabled */
358 smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
359 return;
360}
361
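/*
 * Convert tsc cycles to microseconds using this cpu's cached cyc2ns
 * scaling factor: ns = (cyc * cyc2ns) >> CYC2NS_SCALE_FACTOR, then
 * us = ns / 1000.  As a rough example, on a 2GHz tsc (0.5 ns/cycle)
 * 2,000,000 cycles is about 1000 us.
 */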
362static inline unsigned long
363cycles_2_us(unsigned long long cyc)
364{
365 unsigned long long ns;
366 unsigned long us;
367 ns = (cyc * per_cpu(cyc2ns, smp_processor_id()))
368 >> CYC2NS_SCALE_FACTOR;
369 us = ns / 1000;
370 return us;
371}
372
373/*
374 * wait for all cpus on this hub to finish their sends and go quiet
375 * leaves uvhub_quiesce set so that no new broadcasts are started by
376 * bau_flush_send_and_wait()
377 */
378static inline void
379quiesce_local_uvhub(struct bau_control *hmaster)
380{
381 atomic_add_short_return(1, (struct atomic_short *)
382 &hmaster->uvhub_quiesce);
383}
384
385/*
386 * mark this quiet-requestor as done
387 */
388static inline void
389end_uvhub_quiesce(struct bau_control *hmaster)
390{
391 atomic_add_short_return(-1, (struct atomic_short *)
392 &hmaster->uvhub_quiesce);
393}
394
395/*
396 * Wait for completion of a broadcast software ack message
397 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
398 */
399static int uv_wait_completion(struct bau_desc *bau_desc,
400 unsigned long mmr_offset, int right_shift, int this_cpu,
401 struct bau_control *bcp, struct bau_control *smaster, long try)
402{
403 unsigned long descriptor_status;
404 cycles_t ttime;
405 struct ptc_stats *stat = bcp->statp;
406 struct bau_control *hmaster;
407
408 hmaster = bcp->uvhub_master;
409
410 /* spin on the status MMR, waiting for it to go idle */
411 while ((descriptor_status = (((unsigned long)
412 uv_read_local_mmr(mmr_offset) >>
413 right_shift) & UV_ACT_STATUS_MASK)) !=
414 DESC_STATUS_IDLE) {
415 /*
416 * Our software ack messages may be blocked because there are
417 * no swack resources available. As long as none of them
418 * has timed out hardware will NACK our message and its
419 * state will stay IDLE.
420 */
421 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
422 stat->s_stimeout++;
423 return FLUSH_GIVEUP;
424 } else if (descriptor_status ==
425 DESC_STATUS_DESTINATION_TIMEOUT) {
426 stat->s_dtimeout++;
427 ttime = get_cycles();
428
429 /*
430 * Our retries may be blocked by all destination
431 * swack resources being consumed, and a timeout
432 * pending. In that case hardware returns the
433 * ERROR that looks like a destination timeout.
434 */
435 if (cycles_2_us(ttime - bcp->send_message) <
436 timeout_us) {
437 bcp->conseccompletes = 0;
438 return FLUSH_RETRY_PLUGGED;
439 }
440
441 bcp->conseccompletes = 0;
442 return FLUSH_RETRY_TIMEOUT;
443 } else {
444 /*
445 * descriptor_status is still BUSY
446 */
447 cpu_relax();
448 }
449 }
450 bcp->conseccompletes++;
451 return FLUSH_COMPLETE;
452}
453
454static inline cycles_t
455sec_2_cycles(unsigned long sec)
456{
457 unsigned long ns;
458 cycles_t cyc;
459
460 ns = sec * 1000000000;
461 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
462 return cyc;
463}
464
465/*
466 * conditionally add 1 to *v, unless *v is >= u
467 * return 0 if we cannot add 1 to *v because it is >= u
468 * return 1 if we can add 1 to *v because it is < u
469 * the add is atomic
470 *
471 * This is close to atomic_add_unless(), but this allows the 'u' value
472 * to be lowered below the current 'v'. atomic_add_unless can only stop
473 * on equal.
474 */
475static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
476{
477 spin_lock(lock);
478 if (atomic_read(v) >= u) {
479 spin_unlock(lock);
480 return 0;
481 }
482 atomic_inc(v);
483 spin_unlock(lock);
484 return 1;
485}
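/*
 * Typical use, from uv_flush_send_and_wait(): throttle the number of
 * outstanding descriptors on this uvhub, e.g.
 *
 *	while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
 *			&hmaster->active_descriptor_count,
 *			hmaster->max_bau_concurrent))
 *		cpu_relax();
 */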
486
487/*
488 * Our retries are blocked by all destination swack resources being
489 * in use, and a timeout is pending. In that case hardware immediately
490 * returns the ERROR that looks like a destination timeout.
491 */
492static void
493destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
494 struct bau_control *hmaster, struct ptc_stats *stat)
495{
496 udelay(bcp->plugged_delay);
497 bcp->plugged_tries++;
498 if (bcp->plugged_tries >= bcp->plugsb4reset) {
499 bcp->plugged_tries = 0;
500 quiesce_local_uvhub(hmaster);
501 spin_lock(&hmaster->queue_lock);
502 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
503 spin_unlock(&hmaster->queue_lock);
504 end_uvhub_quiesce(hmaster);
505 bcp->ipi_attempts++;
506 stat->s_resets_plug++;
507 }
508}
509
510static void
511destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
512 struct bau_control *hmaster, struct ptc_stats *stat)
513{
514 hmaster->max_bau_concurrent = 1;
515 bcp->timeout_tries++;
516 if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
517 bcp->timeout_tries = 0;
518 quiesce_local_uvhub(hmaster);
519 spin_lock(&hmaster->queue_lock);
520 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
521 spin_unlock(&hmaster->queue_lock);
522 end_uvhub_quiesce(hmaster);
523 bcp->ipi_attempts++;
524 stat->s_resets_timeout++;
525 }
526}
527
528/*
529 * Completions are taking a very long time due to a congested numalink
530 * network.
531 */
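/*
 * If the average cycles per broadcast over the sampling window exceed
 * congested_cycles, every cpu's baudisabled flag is set and this cpu
 * takes on the job of turning the BAU back on after congested_period
 * seconds (the re-enable check is made in uv_flush_tlb_others()).
 */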
532static void
533disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
534{
535 int tcpu;
536 struct bau_control *tbcp;
537
538 /* let only one cpu do this disabling */
539 spin_lock(&disable_lock);
540 if (!baudisabled && bcp->period_requests &&
541 ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
542 /* it becomes this cpu's job to turn on the use of the
543 BAU again */
544 baudisabled = 1;
545 bcp->set_bau_off = 1;
546 bcp->set_bau_on_time = get_cycles() +
547 sec_2_cycles(bcp->congested_period);
548 stat->s_bau_disabled++;
549 for_each_present_cpu(tcpu) {
550 tbcp = &per_cpu(bau_control, tcpu);
551 tbcp->baudisabled = 1;
552 }
553 }
554 spin_unlock(&disable_lock);
555}
556
557/**
558 * uv_flush_send_and_wait
559 *
560 * Send a broadcast and wait for it to complete.
561 *
562 * The flush_mask contains the cpus the broadcast is to be sent to including
563 * cpus that are on the local uvhub.
564 *
565 * Returns 0 if all flushing represented in the mask was done.
566 * Returns 1 if it gives up entirely and the original cpu mask is to be
567 * returned to the kernel.
568 */
569int uv_flush_send_and_wait(struct bau_desc *bau_desc,
570 struct cpumask *flush_mask, struct bau_control *bcp)
571{
572 int right_shift;
573 int completion_status = 0;
574 int seq_number = 0;
575 long try = 0;
576 int cpu = bcp->uvhub_cpu;
577 int this_cpu = bcp->cpu;
578 unsigned long mmr_offset;
579 unsigned long index;
580 cycles_t time1;
581 cycles_t time2;
582 cycles_t elapsed;
583 struct ptc_stats *stat = bcp->statp;
584 struct bau_control *smaster = bcp->socket_master;
585 struct bau_control *hmaster = bcp->uvhub_master;
586
587 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
588 &hmaster->active_descriptor_count,
589 hmaster->max_bau_concurrent)) {
590 stat->s_throttles++;
591 do {
592 cpu_relax();
593 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
594 &hmaster->active_descriptor_count,
595 hmaster->max_bau_concurrent));
596 }
597 while (hmaster->uvhub_quiesce)
598 cpu_relax();
599
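	/*
	 * Each hub cpu's descriptor status is a UV_ACT_STATUS_SIZE-bit
	 * field in one of the two ACTIVATION_STATUS MMRs;
	 * UV_CPUS_PER_ACT_STATUS fields fit per MMR, so select the MMR
	 * and the bit offset (right_shift) for this cpu.
	 */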
600 if (cpu < UV_CPUS_PER_ACT_STATUS) {
601 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
602 right_shift = cpu * UV_ACT_STATUS_SIZE;
603 } else {
604 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
605 right_shift =
606 ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
607 }
608 time1 = get_cycles();
609 do {
610 if (try == 0) {
611 bau_desc->header.msg_type = MSG_REGULAR;
612 seq_number = bcp->message_number++;
613 } else {
614 bau_desc->header.msg_type = MSG_RETRY;
615 stat->s_retry_messages++;
616 }
617 bau_desc->header.sequence = seq_number;
618 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
619 bcp->uvhub_cpu;
620 bcp->send_message = get_cycles();
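		/*
		 * Writing the push bit plus this cpu's descriptor index
		 * to ACTIVATION_CONTROL is what launches the broadcast
		 * of the descriptor prepared above.
		 */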
621 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
622 try++;
623 completion_status = uv_wait_completion(bau_desc, mmr_offset,
624 right_shift, this_cpu, bcp, smaster, try);
625
626 if (completion_status == FLUSH_RETRY_PLUGGED) {
627 destination_plugged(bau_desc, bcp, hmaster, stat);
628 } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
629 destination_timeout(bau_desc, bcp, hmaster, stat);
630 }
631 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
632 bcp->ipi_attempts = 0;
633 completion_status = FLUSH_GIVEUP;
634 break;
635 }
636 cpu_relax();
637 } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
638 (completion_status == FLUSH_RETRY_TIMEOUT));
639 time2 = get_cycles();
640 bcp->plugged_tries = 0;
641 bcp->timeout_tries = 0;
642 if ((completion_status == FLUSH_COMPLETE) &&
643 (bcp->conseccompletes > bcp->complete_threshold) &&
644 (hmaster->max_bau_concurrent <
645 hmaster->max_bau_concurrent_constant))
646 hmaster->max_bau_concurrent++;
647 while (hmaster->uvhub_quiesce)
648 cpu_relax();
649 atomic_dec(&hmaster->active_descriptor_count);
650 if (time2 > time1) {
651 elapsed = time2 - time1;
652 stat->s_time += elapsed;
653 if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
654 bcp->period_requests++;
655 bcp->period_time += elapsed;
656 if ((elapsed > congested_cycles) &&
657 (bcp->period_requests > bcp->congested_reps)) {
658 disable_for_congestion(bcp, stat);
659 }
660 }
661 } else
662 stat->s_requestor--;
663 if (completion_status == FLUSH_COMPLETE && try > 1)
664 stat->s_retriesok++;
665 else if (completion_status == FLUSH_GIVEUP) {
666 stat->s_giveup++;
667 return 1;
668 }
669 return 0;
670}
671
672/**
673 * uv_flush_tlb_others - globally purge translation cache of a virtual
674 * address or all TLB's
675 * @cpumask: mask of all cpu's in which the address is to be removed
676 * @mm: mm_struct containing virtual address range
677 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
678 * @cpu: the current cpu
679 *
680 * This is the entry point for initiating any UV global TLB shootdown.
681 *
682 * Purges the translation caches of all specified processors of the given
683 * virtual address, or purges all TLB's on specified processors.
684 *
685 * The caller has derived the cpumask from the mm_struct. This function
686 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
687 *
688 * The cpumask is converted into a uvhubmask of the uvhubs containing
689 * those cpus.
690 *
691 * Note that this function should be called with preemption disabled.
692 *
693 * Returns NULL if all remote flushing was done.
694 * Returns pointer to cpumask if some remote flushing remains to be
695 * done. The returned pointer is valid till preemption is re-enabled.
696 */
697const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
698 struct mm_struct *mm,
699 unsigned long va, unsigned int cpu)
700{
701 int tcpu;
702 int uvhub;
703 int locals = 0;
704 int remotes = 0;
705 int hubs = 0;
706 struct bau_desc *bau_desc;
707 struct cpumask *flush_mask;
708 struct ptc_stats *stat;
709 struct bau_control *bcp;
710 struct bau_control *tbcp;
711
712 /* kernel was booted 'nobau' */
713 if (nobau)
714 return cpumask;
715
716 bcp = &per_cpu(bau_control, cpu);
717 stat = bcp->statp;
718
719 /* bau was disabled due to slow response */
720 if (bcp->baudisabled) {
721 /* the cpu that disabled it must re-enable it */
722 if (bcp->set_bau_off) {
723 if (get_cycles() >= bcp->set_bau_on_time) {
724 stat->s_bau_reenabled++;
725 baudisabled = 0;
726 for_each_present_cpu(tcpu) {
727 tbcp = &per_cpu(bau_control, tcpu);
728 tbcp->baudisabled = 0;
729 tbcp->period_requests = 0;
730 tbcp->period_time = 0;
731 }
732 }
733 }
734 return cpumask;
735 }
736
737 /*
738 * Each sending cpu has a per-cpu mask which it fills from the caller's
739 * cpu mask. All cpus are converted to uvhubs and copied to the
740 * activation descriptor.
741 */
742 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
743 /* don't actually do a shootdown of the local cpu */
744 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
745 if (cpu_isset(cpu, *cpumask))
746 stat->s_ntargself++;
747
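	/*
	 * Each cpu owns a block of UV_ITEMS_PER_DESCRIPTOR descriptors
	 * in the hub's descriptor page, and only the first one of the
	 * block is ever used (see uv_activation_descriptor_init()).
	 */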
748 bau_desc = bcp->descriptor_base;
749 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
750 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
751
752 /* cpu statistics */
753 for_each_cpu(tcpu, flush_mask) {
754 uvhub = uv_cpu_to_blade_id(tcpu);
755 bau_uvhub_set(uvhub, &bau_desc->distribution);
756 if (uvhub == bcp->uvhub)
757 locals++;
758 else
759 remotes++;
760 }
761 if ((locals + remotes) == 0)
762 return NULL;
763 stat->s_requestor++;
764 stat->s_ntargcpu += remotes + locals;
765 stat->s_ntargremotes += remotes;
766 stat->s_ntarglocals += locals;
767 remotes = bau_uvhub_weight(&bau_desc->distribution);
768
769 /* uvhub statistics */
770 hubs = bau_uvhub_weight(&bau_desc->distribution);
771 if (locals) {
772 stat->s_ntarglocaluvhub++;
773 stat->s_ntargremoteuvhub += (hubs - 1);
774 } else
775 stat->s_ntargremoteuvhub += hubs;
776 stat->s_ntarguvhub += hubs;
777 if (hubs >= 16)
778 stat->s_ntarguvhub16++;
779 else if (hubs >= 8)
780 stat->s_ntarguvhub8++;
781 else if (hubs >= 4)
782 stat->s_ntarguvhub4++;
783 else if (hubs >= 2)
784 stat->s_ntarguvhub2++;
785 else
786 stat->s_ntarguvhub1++;
787
788 bau_desc->payload.address = va;
789 bau_desc->payload.sending_cpu = cpu;
790
791 /*
792 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
793 * or 1 if it gave up and the original cpumask should be returned.
794 */
795 if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
796 return NULL;
797 else
798 return cpumask;
799}
800
801/*
802 * The BAU message interrupt comes here. (registered by set_intr_gate)
803 * See entry_64.S
804 *
805 * We received a broadcast assist message.
806 *
807 * Interrupts are disabled; this interrupt could represent
808 * the receipt of several messages.
809 *
810 * All cores/threads on this hub get this interrupt.
811 * The last one to see it does the software ack.
812 * (the resource will not be freed until noninterruptible cpus see this
813 * interrupt; hardware may timeout the s/w ack and reply ERROR)
814 */
815void uv_bau_message_interrupt(struct pt_regs *regs)
816{
817 int count = 0;
818 cycles_t time_start;
819 struct bau_payload_queue_entry *msg;
820 struct bau_control *bcp;
821 struct ptc_stats *stat;
822 struct msg_desc msgdesc;
823
824 time_start = get_cycles();
825 bcp = &per_cpu(bau_control, smp_processor_id());
826 stat = bcp->statp;
827 msgdesc.va_queue_first = bcp->va_queue_first;
828 msgdesc.va_queue_last = bcp->va_queue_last;
829 msg = bcp->bau_msg_head;
830 while (msg->sw_ack_vector) {
831 count++;
832 msgdesc.msg_slot = msg - msgdesc.va_queue_first;
833 msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
834 msgdesc.msg = msg;
835 uv_bau_process_message(&msgdesc, bcp);
836 msg++;
837 if (msg > msgdesc.va_queue_last)
838 msg = msgdesc.va_queue_first;
839 bcp->bau_msg_head = msg;
840 }
841 stat->d_time += (get_cycles() - time_start);
842 if (!count)
843 stat->d_nomsg++;
844 else if (count > 1)
845 stat->d_multmsg++;
846 ack_APIC_irq();
847}
848
849/*
850 * uv_enable_timeouts
851 *
852 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
853 * shootdown message timeouts enabled. The timeout does not cause
854 * an interrupt, but causes an error message to be returned to
855 * the sender.
856 */
857static void uv_enable_timeouts(void)
858{
859 int uvhub;
860 int nuvhubs;
861 int pnode;
862 unsigned long mmr_image;
863
864 nuvhubs = uv_num_possible_blades();
865
866 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
867 if (!uv_blade_nr_possible_cpus(uvhub))
868 continue;
869
870 pnode = uv_blade_to_pnode(uvhub);
871 mmr_image =
872 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
873 /*
874 * Set the timeout period and then lock it in; this takes three
875 * MMR writes, the last of which captures and locks in the period.
876 *
877 * To program the period, the SOFT_ACK_MODE must be off.
878 */
879 mmr_image &= ~((unsigned long)1 <<
880 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
881 uv_write_global_mmr64
882 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
883 /*
884 * Set the 4-bit period.
885 */
886 mmr_image &= ~((unsigned long)0xf <<
887 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
888 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
889 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
890 uv_write_global_mmr64
891 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
892 /*
893 * Subsequent reversals of the timebase bit (3) cause an
894 * immediate timeout of one or all INTD resources as
895 * indicated in bits 2:0 (7 causes all of them to timeout).
896 */
897 mmr_image |= ((unsigned long)1 <<
898 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
899 uv_write_global_mmr64
900 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
901 }
902}
903
904static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
905{
906 if (*offset < num_possible_cpus())
907 return offset;
908 return NULL;
909}
910
911static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
912{
913 (*offset)++;
914 if (*offset < num_possible_cpus())
915 return offset;
916 return NULL;
917}
918
919static void uv_ptc_seq_stop(struct seq_file *file, void *data)
920{
921}
922
923static inline unsigned long long
924microsec_2_cycles(unsigned long microsec)
925{
926 unsigned long ns;
927 unsigned long long cyc;
928
929 ns = microsec * 1000;
930 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
931 return cyc;
932}
933
934/*
935 * Display the statistics thru /proc.
936 * 'data' points to the cpu number
937 */
938static int uv_ptc_seq_show(struct seq_file *file, void *data)
939{
940 struct ptc_stats *stat;
941 int cpu;
942
943 cpu = *(loff_t *)data;
944
945 if (!cpu) {
946 seq_printf(file,
947 "# cpu sent stime self locals remotes ncpus localhub ");
948 seq_printf(file,
949 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
950 seq_printf(file,
951 "numuvhubs4 numuvhubs2 numuvhubs1 dto ");
952 seq_printf(file,
953 "retries rok resetp resett giveup sto bz throt ");
954 seq_printf(file,
955 "sw_ack recv rtime all ");
956 seq_printf(file,
957 "one mult none retry canc nocan reset rcan ");
958 seq_printf(file,
959 "disable enable\n");
960 }
961 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
962 stat = &per_cpu(ptcstats, cpu);
963 /* source side statistics */
964 seq_printf(file,
965 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
966 cpu, stat->s_requestor, cycles_2_us(stat->s_time),
967 stat->s_ntargself, stat->s_ntarglocals,
968 stat->s_ntargremotes, stat->s_ntargcpu,
969 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
970 stat->s_ntarguvhub, stat->s_ntarguvhub16);
971 seq_printf(file, "%ld %ld %ld %ld %ld ",
972 stat->s_ntarguvhub8, stat->s_ntarguvhub4,
973 stat->s_ntarguvhub2, stat->s_ntarguvhub1,
974 stat->s_dtimeout);
975 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
976 stat->s_retry_messages, stat->s_retriesok,
977 stat->s_resets_plug, stat->s_resets_timeout,
978 stat->s_giveup, stat->s_stimeout,
979 stat->s_busy, stat->s_throttles);
980
981 /* destination side statistics */
982 seq_printf(file,
983 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
984 uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
985 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
986 stat->d_requestee, cycles_2_us(stat->d_time),
987 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
988 stat->d_nomsg, stat->d_retries, stat->d_canceled,
989 stat->d_nocanceled, stat->d_resets,
990 stat->d_rcanceled);
991 seq_printf(file, "%ld %ld\n",
992 stat->s_bau_disabled, stat->s_bau_reenabled);
993 }
994
995 return 0;
996}
997
998/*
999 * Display the tunables thru debugfs
1000 */
1001static ssize_t tunables_read(struct file *file, char __user *userbuf,
1002 size_t count, loff_t *ppos)
1003{
1004 char buf[300];
1005 int ret;
1006
1007 ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1008 "max_bau_concurrent plugged_delay plugsb4reset",
1009 "timeoutsb4reset ipi_reset_limit complete_threshold",
1010 "congested_response_us congested_reps congested_period",
1011 max_bau_concurrent, plugged_delay, plugsb4reset,
1012 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1013 congested_response_us, congested_reps, congested_period);
1014
1015 return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
1016}
1017
1018/*
1019 * -1: reset the statistics
1020 * 0: display meaning of the statistics
1021 */
1022static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
1023 size_t count, loff_t *data)
1024{
1025 int cpu;
1026 long input_arg;
1027 char optstr[64];
1028 struct ptc_stats *stat;
1029
1030 if (count == 0 || count > sizeof(optstr))
1031 return -EINVAL;
1032 if (copy_from_user(optstr, user, count))
1033 return -EFAULT;
1034 optstr[count - 1] = '\0';
1035 if (strict_strtol(optstr, 10, &input_arg) < 0) {
1036 printk(KERN_DEBUG "%s is invalid\n", optstr);
1037 return -EINVAL;
1038 }
1039
1040 if (input_arg == 0) {
1041 printk(KERN_DEBUG "# cpu: cpu number\n");
1042 printk(KERN_DEBUG "Sender statistics:\n");
1043 printk(KERN_DEBUG
1044 "sent: number of shootdown messages sent\n");
1045 printk(KERN_DEBUG
1046 "stime: time spent sending messages\n");
1047 printk(KERN_DEBUG
1048 "numuvhubs: number of hubs targeted with shootdown\n");
1049 printk(KERN_DEBUG
1050 "numuvhubs16: number times 16 or more hubs targeted\n");
1051 printk(KERN_DEBUG
1052 "numuvhubs8: number times 8 or more hubs targeted\n");
1053 printk(KERN_DEBUG
1054 "numuvhubs4: number times 4 or more hubs targeted\n");
1055 printk(KERN_DEBUG
1056 "numuvhubs2: number times 2 or more hubs targeted\n");
1057 printk(KERN_DEBUG
1058 "numuvhubs1: number times 1 hub targeted\n");
1059 printk(KERN_DEBUG
1060 "numcpus: number of cpus targeted with shootdown\n");
1061 printk(KERN_DEBUG
1062 "dto: number of destination timeouts\n");
1063 printk(KERN_DEBUG
1064 "retries: destination timeout retries sent\n");
1065 printk(KERN_DEBUG
1066 "rok: destination timeouts successfully retried\n");
1067 printk(KERN_DEBUG
1068 "resetp: ipi-style resource resets for plugs\n");
1069 printk(KERN_DEBUG
1070 "resett: ipi-style resource resets for timeouts\n");
1071 printk(KERN_DEBUG
1072 "giveup: fall-backs to ipi-style shootdowns\n");
1073 printk(KERN_DEBUG
1074 "sto: number of source timeouts\n");
1075 printk(KERN_DEBUG
1076 "bz: number of stay-busy's\n");
1077 printk(KERN_DEBUG
1078 "throt: number times spun in throttle\n");
1079 printk(KERN_DEBUG "Destination side statistics:\n");
1080 printk(KERN_DEBUG
1081 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
1082 printk(KERN_DEBUG
1083 "recv: shootdown messages received\n");
1084 printk(KERN_DEBUG
1085 "rtime: time spent processing messages\n");
1086 printk(KERN_DEBUG
1087 "all: shootdown all-tlb messages\n");
1088 printk(KERN_DEBUG
1089 "one: shootdown one-tlb messages\n");
1090 printk(KERN_DEBUG
1091 "mult: interrupts that found multiple messages\n");
1092 printk(KERN_DEBUG
1093 "none: interrupts that found no messages\n");
1094 printk(KERN_DEBUG
1095 "retry: number of retry messages processed\n");
1096 printk(KERN_DEBUG
1097 "canc: number messages canceled by retries\n");
1098 printk(KERN_DEBUG
1099 "nocan: number retries that found nothing to cancel\n");
1100 printk(KERN_DEBUG
1101 "reset: number of ipi-style reset requests processed\n");
1102 printk(KERN_DEBUG
1103 "rcan: number messages canceled by reset requests\n");
1104 printk(KERN_DEBUG
1105 "disable: number times use of the BAU was disabled\n");
1106 printk(KERN_DEBUG
1107 "enable: number times use of the BAU was re-enabled\n");
1108 } else if (input_arg == -1) {
1109 for_each_present_cpu(cpu) {
1110 stat = &per_cpu(ptcstats, cpu);
1111 memset(stat, 0, sizeof(struct ptc_stats));
1112 }
1113 }
1114
1115 return count;
1116}
1117
1118static int local_atoi(const char *name)
1119{
1120 int val = 0;
1121
1122 for (;; name++) {
1123 switch (*name) {
1124 case '0' ... '9':
1125 val = 10*val+(*name-'0');
1126 break;
1127 default:
1128 return val;
1129 }
1130 }
1131}
1132
1133/*
1134 * set the tunables
1135 * 0 values reset them to defaults
1136 */
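/*
 * The nine whitespace-separated values are expected in the same order
 * that tunables_read() reports them:
 *	max_bau_concurrent plugged_delay plugsb4reset timeoutsb4reset
 *	ipi_reset_limit complete_threshold congested_response_us
 *	congested_reps congested_period
 * For example, writing "0 0 0 0 0 0 0 0 0" to the debugfs file
 * (UV_BAU_TUNABLES_DIR/UV_BAU_TUNABLES_FILE) restores every tunable
 * to its compile-time default.
 */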
1137static ssize_t tunables_write(struct file *file, const char __user *user,
1138 size_t count, loff_t *data)
1139{
1140 int cpu;
1141 int cnt = 0;
1142 int val;
1143 char *p;
1144 char *q;
1145 char instr[64];
1146 struct bau_control *bcp;
1147
1148 if (count == 0 || count > sizeof(instr)-1)
1149 return -EINVAL;
1150 if (copy_from_user(instr, user, count))
1151 return -EFAULT;
1152
1153 instr[count] = '\0';
1154 /* count the fields */
1155 p = instr + strspn(instr, WHITESPACE);
1156 q = p;
1157 for (; *p; p = q + strspn(q, WHITESPACE)) {
1158 q = p + strcspn(p, WHITESPACE);
1159 cnt++;
1160 if (q == p)
1161 break;
1162 }
1163 if (cnt != 9) {
1164 printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
1165 return -EINVAL;
1166 }
1167
1168 p = instr + strspn(instr, WHITESPACE);
1169 q = p;
1170 for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
1171 q = p + strcspn(p, WHITESPACE);
1172 val = local_atoi(p);
1173 switch (cnt) {
1174 case 0:
1175 if (val == 0) {
1176 max_bau_concurrent = MAX_BAU_CONCURRENT;
1177 max_bau_concurrent_constant =
1178 MAX_BAU_CONCURRENT;
1179 continue;
1180 }
1181 bcp = &per_cpu(bau_control, smp_processor_id());
1182 if (val < 1 || val > bcp->cpus_in_uvhub) {
1183 printk(KERN_DEBUG
1184 "Error: BAU max concurrent %d is invalid\n",
1185 val);
1186 return -EINVAL;
1187 }
1188 max_bau_concurrent = val;
1189 max_bau_concurrent_constant = val;
1190 continue;
1191 case 1:
1192 if (val == 0)
1193 plugged_delay = PLUGGED_DELAY;
1194 else
1195 plugged_delay = val;
1196 continue;
1197 case 2:
1198 if (val == 0)
1199 plugsb4reset = PLUGSB4RESET;
1200 else
1201 plugsb4reset = val;
1202 continue;
1203 case 3:
1204 if (val == 0)
1205 timeoutsb4reset = TIMEOUTSB4RESET;
1206 else
1207 timeoutsb4reset = val;
1208 continue;
1209 case 4:
1210 if (val == 0)
1211 ipi_reset_limit = IPI_RESET_LIMIT;
1212 else
1213 ipi_reset_limit = val;
1214 continue;
1215 case 5:
1216 if (val == 0)
1217 complete_threshold = COMPLETE_THRESHOLD;
1218 else
1219 complete_threshold = val;
1220 continue;
1221 case 6:
1222 if (val == 0)
1223 congested_response_us = CONGESTED_RESPONSE_US;
1224 else
1225 congested_response_us = val;
1226 continue;
1227 case 7:
1228 if (val == 0)
1229 congested_reps = CONGESTED_REPS;
1230 else
1231 congested_reps = val;
1232 continue;
1233 case 8:
1234 if (val == 0)
1235 congested_period = CONGESTED_PERIOD;
1236 else
1237 congested_period = val;
1238 continue;
1239 }
1240 if (q == p)
1241 break;
1242 }
1243 for_each_present_cpu(cpu) {
1244 bcp = &per_cpu(bau_control, cpu);
1245 bcp->max_bau_concurrent = max_bau_concurrent;
1246 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1247 bcp->plugged_delay = plugged_delay;
1248 bcp->plugsb4reset = plugsb4reset;
1249 bcp->timeoutsb4reset = timeoutsb4reset;
1250 bcp->ipi_reset_limit = ipi_reset_limit;
1251 bcp->complete_threshold = complete_threshold;
1252 bcp->congested_response_us = congested_response_us;
1253 bcp->congested_reps = congested_reps;
1254 bcp->congested_period = congested_period;
1255 }
1256 return count;
1257}
1258
1259static const struct seq_operations uv_ptc_seq_ops = {
1260 .start = uv_ptc_seq_start,
1261 .next = uv_ptc_seq_next,
1262 .stop = uv_ptc_seq_stop,
1263 .show = uv_ptc_seq_show
1264};
1265
1266static int uv_ptc_proc_open(struct inode *inode, struct file *file)
1267{
1268 return seq_open(file, &uv_ptc_seq_ops);
1269}
1270
1271static int tunables_open(struct inode *inode, struct file *file)
1272{
1273 return 0;
1274}
1275
1276static const struct file_operations proc_uv_ptc_operations = {
1277 .open = uv_ptc_proc_open,
1278 .read = seq_read,
1279 .write = uv_ptc_proc_write,
1280 .llseek = seq_lseek,
1281 .release = seq_release,
1282};
1283
1284static const struct file_operations tunables_fops = {
1285 .open = tunables_open,
1286 .read = tunables_read,
1287 .write = tunables_write,
1288};
1289
1290static int __init uv_ptc_init(void)
1291{
1292 struct proc_dir_entry *proc_uv_ptc;
1293
1294 if (!is_uv_system())
1295 return 0;
1296
1297 proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
1298 &proc_uv_ptc_operations);
1299 if (!proc_uv_ptc) {
1300 printk(KERN_ERR "unable to create %s proc entry\n",
1301 UV_PTC_BASENAME);
1302 return -EINVAL;
1303 }
1304
1305 tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
1306 if (!tunables_dir) {
1307 printk(KERN_ERR "unable to create debugfs directory %s\n",
1308 UV_BAU_TUNABLES_DIR);
1309 return -EINVAL;
1310 }
1311 tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
1312 tunables_dir, NULL, &tunables_fops);
1313 if (!tunables_file) {
1314 printk(KERN_ERR "unable to create debugfs file %s\n",
1315 UV_BAU_TUNABLES_FILE);
1316 return -EINVAL;
1317 }
1318 return 0;
1319}
1320
1321/*
1322 * initialize the sending side's sending buffers
1323 */
1324static void
1325uv_activation_descriptor_init(int node, int pnode)
1326{
1327 int i;
1328 int cpu;
1329 unsigned long pa;
1330 unsigned long m;
1331 unsigned long n;
1332 struct bau_desc *bau_desc;
1333 struct bau_desc *bd2;
1334 struct bau_control *bcp;
1335
1336 /*
1337 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
1338 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
1339 */
1340 bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
1341 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
1342 BUG_ON(!bau_desc);
1343
1344 pa = uv_gpa(bau_desc); /* need the real nasid*/
1345 n = pa >> uv_nshift;
1346 m = pa & uv_mmask;
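	/*
	 * Split the global address into node number (n, the bits above
	 * uv_nshift) and node offset (m); the DESCRIPTOR_BASE MMR below
	 * is programmed with (n << UV_DESC_BASE_PNODE_SHIFT) | m.
	 */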
1347
1348 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
1349 (n << UV_DESC_BASE_PNODE_SHIFT | m));
1350
1351 /*
1352 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
1353 * cpu even though we only use the first one; one descriptor can
1354 * describe a broadcast to 256 uv hubs.
1355 */
1356 for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
1357 i++, bd2++) {
1358 memset(bd2, 0, sizeof(struct bau_desc));
1359 bd2->header.sw_ack_flag = 1;
1360 /*
1361 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
1362 * in the partition. The bit map will indicate uvhub numbers,
1363 * which are 0-N in a partition. Pnodes are unique system-wide.
1364 */
1365 bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
1366 bd2->header.dest_subnodeid = 0x10; /* the LB */
1367 bd2->header.command = UV_NET_ENDPOINT_INTD;
1368 bd2->header.int_both = 1;
1369 /*
1370 * all others need to be set to zero:
1371 * fairness chaining multilevel count replied_to
1372 */
1373 }
1374 for_each_present_cpu(cpu) {
1375 if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
1376 continue;
1377 bcp = &per_cpu(bau_control, cpu);
1378 bcp->descriptor_base = bau_desc;
1379 }
1380}
1381
1382/*
1383 * initialize the destination side's receiving buffers
1384 * entered for each uvhub in the partition
1385 * - node is first node (kernel memory notion) on the uvhub
1386 * - pnode is the uvhub's physical identifier
1387 */
1388static void
1389uv_payload_queue_init(int node, int pnode)
1390{
1391 int pn;
1392 int cpu;
1393 char *cp;
1394 unsigned long pa;
1395 struct bau_payload_queue_entry *pqp;
1396 struct bau_payload_queue_entry *pqp_malloc;
1397 struct bau_control *bcp;
1398
1399 pqp = (struct bau_payload_queue_entry *) kmalloc_node(
1400 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
1401 GFP_KERNEL, node);
1402 BUG_ON(!pqp);
1403 pqp_malloc = pqp;
1404
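	/*
	 * Round the queue start up to a 32-byte boundary (add 31, then
	 * clear the low five address bits); the extra entry allocated
	 * above leaves room for the adjustment.
	 */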
1405 cp = (char *)pqp + 31;
1406 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
1407
1408 for_each_present_cpu(cpu) {
1409 if (pnode != uv_cpu_to_pnode(cpu))
1410 continue;
1411 /* for every cpu on this pnode: */
1412 bcp = &per_cpu(bau_control, cpu);
1413 bcp->va_queue_first = pqp;
1414 bcp->bau_msg_head = pqp;
1415 bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
1416 }
1417 /*
1418 * need the pnode of where the memory was really allocated
1419 */
1420 pa = uv_gpa(pqp);
1421 pn = pa >> uv_nshift;
1422 uv_write_global_mmr64(pnode,
1423 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
1424 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
1425 uv_physnodeaddr(pqp));
1426 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
1427 uv_physnodeaddr(pqp));
1428 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
1429 (unsigned long)
1430 uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
1431 /* in effect, all msg_type's are set to MSG_NOOP */
1432 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
1433}
1434
1435/*
1436 * Initialization of each UV hub's structures
1437 */
1438static void __init uv_init_uvhub(int uvhub, int vector)
1439{
1440 int node;
1441 int pnode;
1442 unsigned long apicid;
1443
1444 node = uvhub_to_first_node(uvhub);
1445 pnode = uv_blade_to_pnode(uvhub);
1446 uv_activation_descriptor_init(node, pnode);
1447 uv_payload_queue_init(node, pnode);
1448 /*
1449 * the below initialization can't be in firmware because the
1450 * messaging IRQ will be determined by the OS
1451 */
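	/*
	 * Aim this hub's BAU interrupt at its first cpu: the
	 * BAU_DATA_CONFIG MMR takes the destination apicid in its upper
	 * 32 bits and the interrupt vector in its low bits.
	 */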
1452 apicid = uvhub_to_first_apicid(uvhub);
1453 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
1454 ((apicid << 32) | vector));
1455}
1456
1457/*
1458 * We will set BAU_MISC_CONTROL with a timeout period.
1459 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
1460 * So the destination timeout period has to be calculated from them.
1461 */
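/*
 * Rough sketch of the calculation: the soft-ack period field (mult1)
 * is scaled by the BIOS-selected base period (timeout_base_ns[],
 * indexed from UVH_AGING_PRESCALE_SEL) and by the multiplier taken
 * from UVH_TRANSACTION_TIMEOUT (mult2), giving nanoseconds; the
 * result is returned in microseconds.
 */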
1462static int
1463calculate_destination_timeout(void)
1464{
1465 unsigned long mmr_image;
1466 int mult1;
1467 int mult2;
1468 int index;
1469 int base;
1470 int ret;
1471 unsigned long ts_ns;
1472
1473 mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1474 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1475 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1476 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1477 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1478 base = timeout_base_ns[index];
1479 ts_ns = base * mult1 * mult2;
1480 ret = ts_ns / 1000;
1481 return ret;
1482}
1483
1484/*
1485 * initialize the bau_control structure for each cpu
1486 */
1487static void __init uv_init_per_cpu(int nuvhubs)
1488{
1489 int i;
1490 int cpu;
1491 int pnode;
1492 int uvhub;
1493 int have_hmaster;
1494 short socket = 0;
1495 unsigned short socket_mask;
1496 unsigned char *uvhub_mask;
1497 struct bau_control *bcp;
1498 struct uvhub_desc *bdp;
1499 struct socket_desc *sdp;
1500 struct bau_control *hmaster = NULL;
1501 struct bau_control *smaster = NULL;
1502 struct socket_desc {
1503 short num_cpus;
1504 short cpu_number[16];
1505 };
1506 struct uvhub_desc {
1507 unsigned short socket_mask;
1508 short num_cpus;
1509 short uvhub;
1510 short pnode;
1511 struct socket_desc socket[2];
1512 };
1513 struct uvhub_desc *uvhub_descs;
1514
1515 timeout_us = calculate_destination_timeout();
1516
1517 uvhub_descs = (struct uvhub_desc *)
1518 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1519 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1520 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1521 for_each_present_cpu(cpu) {
1522 bcp = &per_cpu(bau_control, cpu);
1523 memset(bcp, 0, sizeof(struct bau_control));
1524 pnode = uv_cpu_hub_info(cpu)->pnode;
1525 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1526 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1527 bdp = &uvhub_descs[uvhub];
1528 bdp->num_cpus++;
1529 bdp->uvhub = uvhub;
1530 bdp->pnode = pnode;
1531 /* kludge: 'assuming' one node per socket, and assuming that
1532 disabling a socket just leaves a gap in node numbers */
1533 socket = (cpu_to_node(cpu) & 1);
1534 bdp->socket_mask |= (1 << socket);
1535 sdp = &bdp->socket[socket];
1536 sdp->cpu_number[sdp->num_cpus] = cpu;
1537 sdp->num_cpus++;
1538 }
1539 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1540 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
1541 continue;
1542 have_hmaster = 0;
1543 bdp = &uvhub_descs[uvhub];
1544 socket_mask = bdp->socket_mask;
1545 socket = 0;
1546 while (socket_mask) {
1547 if (!(socket_mask & 1))
1548 goto nextsocket;
1549 sdp = &bdp->socket[socket];
1550 for (i = 0; i < sdp->num_cpus; i++) {
1551 cpu = sdp->cpu_number[i];
1552 bcp = &per_cpu(bau_control, cpu);
1553 bcp->cpu = cpu;
1554 if (i == 0) {
1555 smaster = bcp;
1556 if (!have_hmaster) {
1557 have_hmaster++;
1558 hmaster = bcp;
1559 }
1560 }
1561 bcp->cpus_in_uvhub = bdp->num_cpus;
1562 bcp->cpus_in_socket = sdp->num_cpus;
1563 bcp->socket_master = smaster;
1564 bcp->uvhub = bdp->uvhub;
1565 bcp->uvhub_master = hmaster;
1566 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1567 blade_processor_id;
1568 }
1569nextsocket:
1570 socket++;
1571 socket_mask = (socket_mask >> 1);
1572 }
1573 }
1574 kfree(uvhub_descs);
1575 kfree(uvhub_mask);
1576 for_each_present_cpu(cpu) {
1577 bcp = &per_cpu(bau_control, cpu);
1578 bcp->baudisabled = 0;
1579 bcp->statp = &per_cpu(ptcstats, cpu);
1580 /* time interval to catch a hardware stay-busy bug */
1581 bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
1582 bcp->max_bau_concurrent = max_bau_concurrent;
1583 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1584 bcp->plugged_delay = plugged_delay;
1585 bcp->plugsb4reset = plugsb4reset;
1586 bcp->timeoutsb4reset = timeoutsb4reset;
1587 bcp->ipi_reset_limit = ipi_reset_limit;
1588 bcp->complete_threshold = complete_threshold;
1589 bcp->congested_response_us = congested_response_us;
1590 bcp->congested_reps = congested_reps;
1591 bcp->congested_period = congested_period;
1592 }
1593}
1594
1595/*
1596 * Initialization of BAU-related structures
1597 */
1598static int __init uv_bau_init(void)
1599{
1600 int uvhub;
1601 int pnode;
1602 int nuvhubs;
1603 int cur_cpu;
1604 int vector;
1605 unsigned long mmr;
1606
1607 if (!is_uv_system())
1608 return 0;
1609
1610 if (nobau)
1611 return 0;
1612
1613 for_each_possible_cpu(cur_cpu)
1614 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
1615 GFP_KERNEL, cpu_to_node(cur_cpu));
1616
1617 uv_nshift = uv_hub_info->m_val;
1618 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1619 nuvhubs = uv_num_possible_blades();
1620 spin_lock_init(&disable_lock);
1621 congested_cycles = microsec_2_cycles(congested_response_us);
1622
1623 uv_init_per_cpu(nuvhubs);
1624
1625 uv_partition_base_pnode = 0x7fffffff;
1626 for (uvhub = 0; uvhub < nuvhubs; uvhub++)
1627 if (uv_blade_nr_possible_cpus(uvhub) &&
1628 (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
1629 uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
1630
1631 vector = UV_BAU_MESSAGE;
1632 for_each_possible_blade(uvhub)
1633 if (uv_blade_nr_possible_cpus(uvhub))
1634 uv_init_uvhub(uvhub, vector);
1635
1636 uv_enable_timeouts();
1637 alloc_intr_gate(vector, uv_bau_message_intr1);
1638
1639 for_each_possible_blade(uvhub) {
1640 if (uv_blade_nr_possible_cpus(uvhub)) {
1641 pnode = uv_blade_to_pnode(uvhub);
1642 /* INIT the bau */
1643 uv_write_global_mmr64(pnode,
1644 UVH_LB_BAU_SB_ACTIVATION_CONTROL,
1645 ((unsigned long)1 << 63));
1646 mmr = 1; /* should be 1 to broadcast to both sockets */
1647 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
1648 mmr);
1649 }
1650 }
1651
1652 return 0;
1653}
1654core_initcall(uv_bau_init);
1655fs_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 7e4515957a1c..8927486a4649 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -39,7 +39,7 @@ int __ref arch_register_cpu(int num)
39 /* 39 /*
40 * CPU0 cannot be offlined due to several 40 * CPU0 cannot be offlined due to several
41 * restrictions and assumptions in kernel. This basically 41 * restrictions and assumptions in kernel. This basically
42 * doesnt add a control file, one cannot attempt to offline 42 * doesn't add a control file, one cannot attempt to offline
43 * BSP. 43 * BSP.
44 * 44 *
45 * Also certain PCI quirks require not to enable hotplug control 45 * Also certain PCI quirks require not to enable hotplug control
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index e2a595257390..a91ae7709b49 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,56 +1,42 @@
1#include <linux/io.h> 1#include <linux/io.h>
2#include <linux/memblock.h>
2 3
3#include <asm/trampoline.h> 4#include <asm/trampoline.h>
5#include <asm/cacheflush.h>
4#include <asm/pgtable.h> 6#include <asm/pgtable.h>
5#include <asm/e820.h>
6 7
7#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) 8unsigned char *x86_trampoline_base;
8#define __trampinit
9#define __trampinitdata
10#else
11#define __trampinit __cpuinit
12#define __trampinitdata __cpuinitdata
13#endif
14 9
15/* ready for x86_64 and x86 */ 10void __init setup_trampolines(void)
16unsigned char *__trampinitdata trampoline_base;
17
18void __init reserve_trampoline_memory(void)
19{ 11{
20 unsigned long mem; 12 phys_addr_t mem;
13 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
21 14
22 /* Has to be in very low memory so we can execute real-mode AP code. */ 15 /* Has to be in very low memory so we can execute real-mode AP code. */
23 mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); 16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
24 if (mem == -1L) 17 if (mem == MEMBLOCK_ERROR)
25 panic("Cannot allocate trampoline\n"); 18 panic("Cannot allocate trampoline\n");
26 19
27 trampoline_base = __va(mem); 20 x86_trampoline_base = __va(mem);
28 reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); 21 memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE");
22
23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
24 x86_trampoline_base, (unsigned long long)mem, size);
25
26 memcpy(x86_trampoline_base, x86_trampoline_start, size);
29} 27}
30 28
31/* 29/*
32 * Currently trivial. Write the real->protected mode 30 * setup_trampolines() gets called very early, to guarantee the
33 * bootstrap into the page concerned. The caller 31 * availability of low memory. This is before the proper kernel page
34 * has made sure it's suitably aligned. 32 * tables are set up, so we cannot set page permissions in that
33 * function. Thus, we use an arch_initcall instead.
35 */ 34 */
36unsigned long __trampinit setup_trampoline(void) 35static int __init configure_trampolines(void)
37{ 36{
38 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 37 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
39 return virt_to_phys(trampoline_base);
40}
41 38
42void __init setup_trampoline_page_table(void) 39 set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
43{ 40 return 0;
44#ifdef CONFIG_X86_32
45 /* Copy kernel address range */
46 clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
47 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
48 KERNEL_PGD_PTRS);
49
50 /* Initialize low mappings */
51 clone_pgd_range(trampoline_pg_dir,
52 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
53 min_t(unsigned long, KERNEL_PGD_PTRS,
54 KERNEL_PGD_BOUNDARY));
55#endif
56} 41}
42arch_initcall(configure_trampolines);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 8508237e8e43..451c0a7ef7fd 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -32,9 +32,11 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/page_types.h> 33#include <asm/page_types.h>
34 34
35/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 35#ifdef CONFIG_SMP
36__CPUINITRODATA 36
37.code16 37 .section ".x86_trampoline","a"
38 .balign PAGE_SIZE
39 .code16
38 40
39ENTRY(trampoline_data) 41ENTRY(trampoline_data)
40r_base = . 42r_base = .
@@ -44,7 +46,7 @@ r_base = .
44 46
45 cli # We should be safe anyway 47 cli # We should be safe anyway
46 48
47 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
48 # write marker for master knows we're running 50 # write marker for master knows we're running
49 51
50 /* GDT tables in non default location kernel can be beyond 16MB and 52 /* GDT tables in non default location kernel can be beyond 16MB and
@@ -72,5 +74,10 @@ boot_idt_descr:
72 .word 0 # idt limit = 0 74 .word 0 # idt limit = 0
73 .long 0 # idt base = 0L 75 .long 0 # idt base = 0L
74 76
77ENTRY(trampoline_status)
78 .long 0
79
75.globl trampoline_end 80.globl trampoline_end
76trampoline_end: 81trampoline_end:
82
83#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 3af2dff58b21..09ff51799e96 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,13 +32,9 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34 34
35#ifdef CONFIG_ACPI_SLEEP 35 .section ".x86_trampoline","a"
36.section .rodata, "a", @progbits 36 .balign PAGE_SIZE
37#else 37 .code16
38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
39__CPUINITRODATA
40#endif
41.code16
42 38
43ENTRY(trampoline_data) 39ENTRY(trampoline_data)
44r_base = . 40r_base = .
@@ -50,7 +46,7 @@ r_base = .
50 mov %ax, %ss 46 mov %ax, %ss
51 47
52 48
53 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
54 # write marker for master knows we're running 50 # write marker for master knows we're running
55 51
56 # Setup stack 52 # Setup stack
@@ -64,10 +60,13 @@ r_base = .
64 movzx %ax, %esi # Find the 32bit trampoline location 60 movzx %ax, %esi # Find the 32bit trampoline location
65 shll $4, %esi 61 shll $4, %esi
66 62
67 # Fixup the vectors 63 # Fixup the absolute vectors
68 addl %esi, startup_32_vector - r_base 64 leal (startup_32 - r_base)(%esi), %eax
69 addl %esi, startup_64_vector - r_base 65 movl %eax, startup_32_vector - r_base
70 addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer 66 leal (startup_64 - r_base)(%esi), %eax
67 movl %eax, startup_64_vector - r_base
68 leal (tgdt - r_base)(%esi), %eax
69 movl %eax, (tgdt + 2 - r_base)
71 70
72 /* 71 /*
73 * GDT tables in non default location kernel can be beyond 16MB and 72 * GDT tables in non default location kernel can be beyond 16MB and
@@ -127,8 +126,9 @@ startup_64:
127no_longmode: 126no_longmode:
128 hlt 127 hlt
129 jmp no_longmode 128 jmp no_longmode
130#include "verify_cpu_64.S" 129#include "verify_cpu.S"
131 130
131 .balign 4
132 # Careful these need to be in the same 64K segment as the above; 132 # Careful these need to be in the same 64K segment as the above;
133tidt: 133tidt:
134 .word 0 # idt limit = 0 134 .word 0 # idt limit = 0
@@ -156,6 +156,10 @@ startup_64_vector:
156 .long startup_64 - r_base 156 .long startup_64 - r_base
157 .word __KERNEL_CS, 0 157 .word __KERNEL_CS, 0
158 158
159 .balign 4
160ENTRY(trampoline_status)
161 .long 0
162
159trampoline_stack: 163trampoline_stack:
160 .org 0x1000 164 .org 0x1000
161trampoline_stack_end: 165trampoline_stack_end:
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 60788dee0f8a..b9b67166f9de 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,6 +83,13 @@ EXPORT_SYMBOL_GPL(used_vectors);
83 83
84static int ignore_nmis; 84static int ignore_nmis;
85 85
86int unknown_nmi_panic;
87/*
88 * Prevent NMI reason port (0x61) being accessed simultaneously, can
89 * only be used in NMI handler.
90 */
91static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
92
86static inline void conditional_sti(struct pt_regs *regs) 93static inline void conditional_sti(struct pt_regs *regs)
87{ 94{
88 if (regs->flags & X86_EFLAGS_IF) 95 if (regs->flags & X86_EFLAGS_IF)
@@ -300,16 +307,23 @@ gp_in_kernel:
300 die("general protection fault", regs, error_code); 307 die("general protection fault", regs, error_code);
301} 308}
302 309
303static notrace __kprobes void 310static int __init setup_unknown_nmi_panic(char *str)
304mem_parity_error(unsigned char reason, struct pt_regs *regs)
305{ 311{
306 printk(KERN_EMERG 312 unknown_nmi_panic = 1;
307 "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 313 return 1;
308 reason, smp_processor_id()); 314}
315__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
309 316
310 printk(KERN_EMERG 317static notrace __kprobes void
311 "You have some hardware problem, likely on the PCI bus.\n"); 318pci_serr_error(unsigned char reason, struct pt_regs *regs)
319{
320 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
321 reason, smp_processor_id());
312 322
323 /*
324 * On some machines, PCI SERR line is used to report memory
325 * errors. EDAC makes use of it.
326 */
313#if defined(CONFIG_EDAC) 327#if defined(CONFIG_EDAC)
314 if (edac_handler_set()) { 328 if (edac_handler_set()) {
315 edac_atomic_assert_error(); 329 edac_atomic_assert_error();
@@ -320,11 +334,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
320 if (panic_on_unrecovered_nmi) 334 if (panic_on_unrecovered_nmi)
321 panic("NMI: Not continuing"); 335 panic("NMI: Not continuing");
322 336
323 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 337 pr_emerg("Dazed and confused, but trying to continue\n");
324 338
325 /* Clear and disable the memory parity error line. */ 339 /* Clear and disable the PCI SERR error line. */
326 reason = (reason & 0xf) | 4; 340 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
327 outb(reason, 0x61); 341 outb(reason, NMI_REASON_PORT);
328} 342}
329 343
330static notrace __kprobes void 344static notrace __kprobes void
@@ -332,22 +346,26 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
332{ 346{
333 unsigned long i; 347 unsigned long i;
334 348
335 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); 349 pr_emerg(
350 "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
351 reason, smp_processor_id());
336 show_registers(regs); 352 show_registers(regs);
337 353
338 if (panic_on_io_nmi) 354 if (panic_on_io_nmi)
339 panic("NMI IOCK error: Not continuing"); 355 panic("NMI IOCK error: Not continuing");
340 356
341 /* Re-enable the IOCK line, wait for a few seconds */ 357 /* Re-enable the IOCK line, wait for a few seconds */
342 reason = (reason & 0xf) | 8; 358 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
343 outb(reason, 0x61); 359 outb(reason, NMI_REASON_PORT);
344 360
345 i = 2000; 361 i = 20000;
346 while (--i) 362 while (--i) {
347 udelay(1000); 363 touch_nmi_watchdog();
364 udelay(100);
365 }
348 366
349 reason &= ~8; 367 reason &= ~NMI_REASON_CLEAR_IOCHK;
350 outb(reason, 0x61); 368 outb(reason, NMI_REASON_PORT);
351} 369}
352 370
353static notrace __kprobes void 371static notrace __kprobes void
@@ -366,69 +384,50 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
366 return; 384 return;
367 } 385 }
368#endif 386#endif
369 printk(KERN_EMERG 387 pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
370 "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 388 reason, smp_processor_id());
371 reason, smp_processor_id());
372 389
373 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); 390 pr_emerg("Do you have a strange power saving mode enabled?\n");
374 if (panic_on_unrecovered_nmi) 391 if (unknown_nmi_panic || panic_on_unrecovered_nmi)
375 panic("NMI: Not continuing"); 392 panic("NMI: Not continuing");
376 393
377 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 394 pr_emerg("Dazed and confused, but trying to continue\n");
378} 395}
379 396
380static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 397static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
381{ 398{
382 unsigned char reason = 0; 399 unsigned char reason = 0;
383 int cpu;
384 400
385 cpu = smp_processor_id(); 401 /*
386 402 * CPU-specific NMI must be processed before non-CPU-specific
387 /* Only the BSP gets external NMIs from the system. */ 403 * NMI, otherwise we may lose it, because the CPU-specific
388 if (!cpu) 404 * NMI can not be detected/processed on other CPUs.
389 reason = get_nmi_reason(); 405 */
390 406 if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
391 if (!(reason & 0xc0)) { 407 return;
392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
393 == NOTIFY_STOP)
394 return;
395 408
396#ifdef CONFIG_X86_LOCAL_APIC 409 /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
397 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 410 raw_spin_lock(&nmi_reason_lock);
398 == NOTIFY_STOP) 411 reason = get_nmi_reason();
399 return;
400 412
401#ifndef CONFIG_LOCKUP_DETECTOR 413 if (reason & NMI_REASON_MASK) {
414 if (reason & NMI_REASON_SERR)
415 pci_serr_error(reason, regs);
416 else if (reason & NMI_REASON_IOCHK)
417 io_check_error(reason, regs);
418#ifdef CONFIG_X86_32
402 /* 419 /*
403 * Ok, so this is none of the documented NMI sources, 420 * Reassert NMI in case it became active
404 * so it must be the NMI watchdog. 421 * meanwhile as it's edge-triggered:
405 */ 422 */
406 if (nmi_watchdog_tick(regs, reason)) 423 reassert_nmi();
407 return;
408 if (!do_nmi_callback(regs, cpu))
409#endif /* !CONFIG_LOCKUP_DETECTOR */
410 unknown_nmi_error(reason, regs);
411#else
412 unknown_nmi_error(reason, regs);
413#endif 424#endif
414 425 raw_spin_unlock(&nmi_reason_lock);
415 return; 426 return;
416 } 427 }
417 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 428 raw_spin_unlock(&nmi_reason_lock);
418 return;
419 429
420 /* AK: following checks seem to be broken on modern chipsets. FIXME */ 430 unknown_nmi_error(reason, regs);
421 if (reason & 0x80)
422 mem_parity_error(reason, regs);
423 if (reason & 0x40)
424 io_check_error(reason, regs);
425#ifdef CONFIG_X86_32
426 /*
427 * Reassert NMI in case it became active meanwhile
428 * as it's edge-triggered:
429 */
430 reassert_nmi();
431#endif
432} 431}
433 432
434dotraplinkage notrace __kprobes void 433dotraplinkage notrace __kprobes void
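
The hunks above replace the open-coded NMI status port and bit values with symbolic names. For reference, a hedged reconstruction of those constants, inferred only from the literal values they replace (the real definitions live in the arch headers and are not part of this diff):

/* Inferred from the magic numbers replaced above; illustrative only. */
#define NMI_REASON_PORT		0x61	/* legacy NMI status/control port */
#define NMI_REASON_SERR		0x80	/* PCI SERR# asserted (was "reason & 0x80") */
#define NMI_REASON_IOCHK	0x40	/* IOCHK# asserted    (was "reason & 0x40") */
#define NMI_REASON_MASK		(NMI_REASON_SERR | NMI_REASON_IOCHK)	/* 0xc0 */
#define NMI_REASON_CLEAR_SERR	0x04	/* was "(reason & 0xf) | 4" */
#define NMI_REASON_CLEAR_IOCHK	0x08	/* was "(reason & 0xf) | 8" */
#define NMI_REASON_CLEAR_MASK	0x0f	/* only the low nibble is written back */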
@@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code)
446 445
447void stop_nmi(void) 446void stop_nmi(void)
448{ 447{
449 acpi_nmi_disable();
450 ignore_nmis++; 448 ignore_nmis++;
451} 449}
452 450
453void restart_nmi(void) 451void restart_nmi(void)
454{ 452{
455 ignore_nmis--; 453 ignore_nmis--;
456 acpi_nmi_enable();
457} 454}
458 455
459/* May run on IST stack. */ 456/* May run on IST stack. */
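
default_do_nmi() above now hands every NMI to the die-notifier chain (DIE_NMI) before touching the shared reason port, and only unclaimed NMIs fall through to unknown_nmi_error(). A minimal, hypothetical sketch of how a consumer would hook that notification; the names are illustrative, only register_die_notifier()/DIE_NMI/NOTIFY_* come from the kernel API assumed here:

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/kdebug.h>

static int example_nmi_notify(struct notifier_block *nb,
			      unsigned long cmd, void *data)
{
	if (cmd != DIE_NMI)
		return NOTIFY_DONE;
	/*
	 * A real consumer would check its own hardware status here and
	 * return NOTIFY_STOP only for NMIs it actually generated, which
	 * is what keeps the NMI out of unknown_nmi_error().
	 */
	return NOTIFY_DONE;
}

static struct notifier_block example_nmi_nb = {
	.notifier_call	= example_nmi_notify,
};

static int __init example_init(void)
{
	return register_die_notifier(&example_nmi_nb);
}

static void __exit example_exit(void)
{
	unregister_die_notifier(&example_nmi_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");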
@@ -575,6 +572,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
575 if (regs->flags & X86_VM_MASK) { 572 if (regs->flags & X86_VM_MASK) {
576 handle_vm86_trap((struct kernel_vm86_regs *) regs, 573 handle_vm86_trap((struct kernel_vm86_regs *) regs,
577 error_code, 1); 574 error_code, 1);
575 preempt_conditional_cli(regs);
578 return; 576 return;
579 } 577 }
580 578
@@ -776,21 +774,10 @@ asmlinkage void math_state_restore(void)
776} 774}
777EXPORT_SYMBOL_GPL(math_state_restore); 775EXPORT_SYMBOL_GPL(math_state_restore);
778 776
779#ifndef CONFIG_MATH_EMULATION
780void math_emulate(struct math_emu_info *info)
781{
782 printk(KERN_EMERG
783 "math-emulation not enabled and no coprocessor found.\n");
784 printk(KERN_EMERG "killing %s.\n", current->comm);
785 force_sig(SIGFPE, current);
786 schedule();
787}
788#endif /* CONFIG_MATH_EMULATION */
789
790dotraplinkage void __kprobes 777dotraplinkage void __kprobes
791do_device_not_available(struct pt_regs *regs, long error_code) 778do_device_not_available(struct pt_regs *regs, long error_code)
792{ 779{
793#ifdef CONFIG_X86_32 780#ifdef CONFIG_MATH_EMULATION
794 if (read_cr0() & X86_CR0_EM) { 781 if (read_cr0() & X86_CR0_EM) {
795 struct math_emu_info info = { }; 782 struct math_emu_info info = { };
796 783
@@ -798,12 +785,12 @@ do_device_not_available(struct pt_regs *regs, long error_code)
798 785
799 info.regs = regs; 786 info.regs = regs;
800 math_emulate(&info); 787 math_emulate(&info);
801 } else { 788 return;
802 math_state_restore(); /* interrupts still off */
803 conditional_sti(regs);
804 } 789 }
805#else 790#endif
806 math_state_restore(); 791 math_state_restore(); /* interrupts still off */
792#ifdef CONFIG_X86_32
793 conditional_sti(regs);
807#endif 794#endif
808} 795}
809 796
@@ -881,18 +868,6 @@ void __init trap_init(void)
881#endif 868#endif
882 869
883#ifdef CONFIG_X86_32 870#ifdef CONFIG_X86_32
884 if (cpu_has_fxsr) {
885 printk(KERN_INFO "Enabling fast FPU save and restore... ");
886 set_in_cr4(X86_CR4_OSFXSR);
887 printk("done.\n");
888 }
889 if (cpu_has_xmm) {
890 printk(KERN_INFO
891 "Enabling unmasked SIMD FPU exception support... ");
892 set_in_cr4(X86_CR4_OSXMMEXCPT);
893 printk("done.\n");
894 }
895
896 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 871 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
897 set_bit(SYSCALL_VECTOR, used_vectors); 872 set_bit(SYSCALL_VECTOR, used_vectors);
898#endif 873#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 26a863a9c2a8..6cc6922262af 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
104 104
105__setup("notsc", notsc_setup); 105__setup("notsc", notsc_setup);
106 106
107static int no_sched_irq_time;
108
107static int __init tsc_setup(char *str) 109static int __init tsc_setup(char *str)
108{ 110{
109 if (!strcmp(str, "reliable")) 111 if (!strcmp(str, "reliable"))
110 tsc_clocksource_reliable = 1; 112 tsc_clocksource_reliable = 1;
113 if (!strncmp(str, "noirqtime", 9))
114 no_sched_irq_time = 1;
111 return 1; 115 return 1;
112} 116}
113 117
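
The hunk above adds a second keyword to the existing "tsc=" boot option: booting with tsc=noirqtime sets no_sched_irq_time, which later in this diff suppresses the enable_sched_clock_irqtime() call in tsc_init(). A small stand-alone illustration of the matching semantics assumed here (strncmp() with length 9 is a prefix match, while "reliable" keeps its exact strcmp() match):

#include <assert.h>
#include <string.h>

int main(void)
{
	/* "tsc=noirqtime" reaches tsc_setup() with str == "noirqtime". */
	assert(strncmp("noirqtime", "noirqtime", 9) == 0);
	/* Being a prefix match, longer strings starting with it match too. */
	assert(strncmp("noirqtimer", "noirqtime", 9) == 0);
	/* "reliable" is still compared exactly. */
	assert(strcmp("reliable", "reliable") == 0);
	return 0;
}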
@@ -423,7 +427,7 @@ unsigned long native_calibrate_tsc(void)
423 * the delta to the previous read. We keep track of the min 427 * the delta to the previous read. We keep track of the min
424 * and max values of that delta. The delta is mostly defined 428 * and max values of that delta. The delta is mostly defined
425 * by the IO time of the PIT access, so we can detect when a 429 * by the IO time of the PIT access, so we can detect when a
426 * SMI/SMM disturbance happend between the two reads. If the 430 * SMI/SMM disturbance happened between the two reads. If the
427 * maximum time is significantly larger than the minimum time, 431 * maximum time is significantly larger than the minimum time,
428 * then we discard the result and have another try. 432 * then we discard the result and have another try.
429 * 433 *
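
The comment above describes the SMI/SMM detection idea used by the calibration loop: time each PIT access, track the minimum and maximum per-read delta, and throw the run away if the maximum is much larger than the minimum. A simplified, self-contained illustration of that idea (not the kernel's code; tsc_read(), pit_read() and the factor of 10 are hypothetical stand-ins):

#include <stdint.h>

extern uint64_t tsc_read(void);	/* hypothetical timestamp source */
extern uint8_t  pit_read(void);	/* hypothetical timed I/O access */

static int sample_undisturbed(int loops)
{
	uint64_t prev = tsc_read(), now, delta;
	uint64_t dmin = UINT64_MAX, dmax = 0;
	int i;

	for (i = 0; i < loops; i++) {
		(void)pit_read();	/* the access whose cost we measure */
		now = tsc_read();
		delta = now - prev;
		prev = now;
		if (delta < dmin)
			dmin = delta;
		if (delta > dmax)
			dmax = delta;
	}
	/* An SMI stretches one iteration far beyond the normal I/O cost. */
	return dmax <= 10 * dmin;
}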
@@ -460,7 +464,7 @@ unsigned long native_calibrate_tsc(void)
460 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); 464 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
461 465
462 /* hpet or pmtimer available ? */ 466 /* hpet or pmtimer available ? */
463 if (!hpet && !ref1 && !ref2) 467 if (ref1 == ref2)
464 continue; 468 continue;
465 469
466 /* Check, whether the sampling was disturbed by an SMI */ 470 /* Check, whether the sampling was disturbed by an SMI */
@@ -655,7 +659,7 @@ void restore_sched_clock_state(void)
655 659
656 local_irq_save(flags); 660 local_irq_save(flags);
657 661
658 __get_cpu_var(cyc2ns_offset) = 0; 662 __this_cpu_write(cyc2ns_offset, 0);
659 offset = cyc2ns_suspend - sched_clock(); 663 offset = cyc2ns_suspend - sched_clock();
660 664
661 for_each_possible_cpu(cpu) 665 for_each_possible_cpu(cpu)
@@ -759,25 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
759 ret : clocksource_tsc.cycle_last; 763 ret : clocksource_tsc.cycle_last;
760} 764}
761 765
762#ifdef CONFIG_X86_64
763static cycle_t __vsyscall_fn vread_tsc(void)
764{
765 cycle_t ret;
766
767 /*
768 * Surround the RDTSC by barriers, to make sure it's not
769 * speculated to outside the seqlock critical section and
770 * does not cause time warps:
771 */
772 rdtsc_barrier();
773 ret = (cycle_t)vget_cycles();
774 rdtsc_barrier();
775
776 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
777 ret : __vsyscall_gtod_data.clock.cycle_last;
778}
779#endif
780
781static void resume_tsc(struct clocksource *cs) 766static void resume_tsc(struct clocksource *cs)
782{ 767{
783 clocksource_tsc.cycle_last = 0; 768 clocksource_tsc.cycle_last = 0;
@@ -801,6 +786,7 @@ void mark_tsc_unstable(char *reason)
801 if (!tsc_unstable) { 786 if (!tsc_unstable) {
802 tsc_unstable = 1; 787 tsc_unstable = 1;
803 sched_clock_stable = 0; 788 sched_clock_stable = 0;
789 disable_sched_clock_irqtime();
804 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 790 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
805 /* Change only the rating, when not registered */ 791 /* Change only the rating, when not registered */
806 if (clocksource_tsc.mult) 792 if (clocksource_tsc.mult)
@@ -867,6 +853,9 @@ __cpuinit int unsynchronized_tsc(void)
867 853
868 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 854 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
869 return 0; 855 return 0;
856
857 if (tsc_clocksource_reliable)
858 return 0;
870 /* 859 /*
871 * Intel systems are normally all synchronized. 860 * Intel systems are normally all synchronized.
872 * Exceptions must mark TSC as unstable: 861 * Exceptions must mark TSC as unstable:
@@ -874,14 +863,92 @@ __cpuinit int unsynchronized_tsc(void)
874 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { 863 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
875 /* assume multi socket systems are not synchronized: */ 864 /* assume multi socket systems are not synchronized: */
876 if (num_possible_cpus() > 1) 865 if (num_possible_cpus() > 1)
877 tsc_unstable = 1; 866 return 1;
878 } 867 }
879 868
880 return tsc_unstable; 869 return 0;
881} 870}
882 871
883static void __init init_tsc_clocksource(void) 872
873static void tsc_refine_calibration_work(struct work_struct *work);
874static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
875/**
876 * tsc_refine_calibration_work - Further refine tsc freq calibration
877 * @work - ignored.
878 *
879 * This function uses delayed work over a period of a
880 * second to further refine the TSC freq value. Since this is
881 * timer based, instead of loop based, we don't block the boot
882 * process while this longer calibration is done.
883 *
884 * If there are any calibration anomalies (too many SMIs, etc),
885 * or the refined calibration is off by more than 1% from the fast early
886 * calibration, we throw out the new calibration and use the
887 * early calibration.
888 */
889static void tsc_refine_calibration_work(struct work_struct *work)
884{ 890{
891 static u64 tsc_start = -1, ref_start;
892 static int hpet;
893 u64 tsc_stop, ref_stop, delta;
894 unsigned long freq;
895
896 /* Don't bother refining TSC on unstable systems */
897 if (check_tsc_unstable())
898 goto out;
899
900 /*
901 * Since the work is started early in boot, we may be
902 * delayed the first time we expire. So set the workqueue
903 * again once we know timers are working.
904 */
905 if (tsc_start == -1) {
906 /*
907 * Only set hpet once, to avoid mixing hardware
908 * if the hpet becomes enabled later.
909 */
910 hpet = is_hpet_enabled();
911 schedule_delayed_work(&tsc_irqwork, HZ);
912 tsc_start = tsc_read_refs(&ref_start, hpet);
913 return;
914 }
915
916 tsc_stop = tsc_read_refs(&ref_stop, hpet);
917
918 /* hpet or pmtimer available ? */
919 if (ref_start == ref_stop)
920 goto out;
921
922 /* Check, whether the sampling was disturbed by an SMI */
923 if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
924 goto out;
925
926 delta = tsc_stop - tsc_start;
927 delta *= 1000000LL;
928 if (hpet)
929 freq = calc_hpet_ref(delta, ref_start, ref_stop);
930 else
931 freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
932
933 /* Make sure we're within 1% */
934 if (abs(tsc_khz - freq) > tsc_khz/100)
935 goto out;
936
937 tsc_khz = freq;
938 printk(KERN_INFO "Refined TSC clocksource calibration: "
939 "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
940 (unsigned long)tsc_khz % 1000);
941
942out:
943 clocksource_register_khz(&clocksource_tsc, tsc_khz);
944}
945
946
947static int __init init_tsc_clocksource(void)
948{
949 if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz)
950 return 0;
951
885 if (tsc_clocksource_reliable) 952 if (tsc_clocksource_reliable)
886 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 953 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
887 /* lower the rating if we already know its unstable: */ 954 /* lower the rating if we already know its unstable: */
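
tsc_refine_calibration_work(), added above, keeps the refined frequency only if it agrees with the fast early calibration to within 1%. A quick worked example of that acceptance test and of the MHz formatting used in its printk (the values are hypothetical):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long tsc_khz = 2400000;	/* early calibration: 2.4 GHz  */
	unsigned long freq    = 2399870;	/* hypothetical refined value  */

	/* Reject only if off by more than 1% (here: more than 24,000 kHz). */
	if (labs((long)(tsc_khz - freq)) > (long)(tsc_khz / 100))
		printf("refined value rejected, keeping %lu kHz\n", tsc_khz);
	else
		printf("Refined TSC clocksource calibration: %lu.%03lu MHz.\n",
		       freq / 1000, freq % 1000);	/* -> 2399.870 MHz */
	return 0;
}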
@@ -889,62 +956,14 @@ static void __init init_tsc_clocksource(void)
889 clocksource_tsc.rating = 0; 956 clocksource_tsc.rating = 0;
890 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 957 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
891 } 958 }
892 clocksource_register_khz(&clocksource_tsc, tsc_khz); 959 schedule_delayed_work(&tsc_irqwork, 0);
960 return 0;
893} 961}
894
895#ifdef CONFIG_X86_64
896/* 962/*
897 * calibrate_cpu is used on systems with fixed rate TSCs to determine 963 * We use device_initcall here, to ensure we run after the hpet
898 * processor frequency 964 * is fully initialized, which may occur at fs_initcall time.
899 */ 965 */
900#define TICK_COUNT 100000000 966device_initcall(init_tsc_clocksource);
901static unsigned long __init calibrate_cpu(void)
902{
903 int tsc_start, tsc_now;
904 int i, no_ctr_free;
905 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
906 unsigned long flags;
907
908 for (i = 0; i < 4; i++)
909 if (avail_to_resrv_perfctr_nmi_bit(i))
910 break;
911 no_ctr_free = (i == 4);
912 if (no_ctr_free) {
913 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
914 "cpu_khz value may be incorrect.\n");
915 i = 3;
916 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
917 wrmsrl(MSR_K7_EVNTSEL3, 0);
918 rdmsrl(MSR_K7_PERFCTR3, pmc3);
919 } else {
920 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
921 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
922 }
923 local_irq_save(flags);
924 /* start measuring cycles, incrementing from 0 */
925 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
926 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
927 rdtscl(tsc_start);
928 do {
929 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
930 tsc_now = get_cycles();
931 } while ((tsc_now - tsc_start) < TICK_COUNT);
932
933 local_irq_restore(flags);
934 if (no_ctr_free) {
935 wrmsrl(MSR_K7_EVNTSEL3, 0);
936 wrmsrl(MSR_K7_PERFCTR3, pmc3);
937 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
938 } else {
939 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
940 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
941 }
942
943 return pmc_now * tsc_khz / (tsc_now - tsc_start);
944}
945#else
946static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
947#endif
948 967
949void __init tsc_init(void) 968void __init tsc_init(void)
950{ 969{
@@ -964,10 +983,6 @@ void __init tsc_init(void)
964 return; 983 return;
965 } 984 }
966 985
967 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
968 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
969 cpu_khz = calibrate_cpu();
970
971 printk("Detected %lu.%03lu MHz processor.\n", 986 printk("Detected %lu.%03lu MHz processor.\n",
972 (unsigned long)cpu_khz / 1000, 987 (unsigned long)cpu_khz / 1000,
973 (unsigned long)cpu_khz % 1000); 988 (unsigned long)cpu_khz % 1000);
@@ -987,6 +1002,9 @@ void __init tsc_init(void)
987 /* now allow native_sched_clock() to use rdtsc */ 1002 /* now allow native_sched_clock() to use rdtsc */
988 tsc_disabled = 0; 1003 tsc_disabled = 0;
989 1004
1005 if (!no_sched_irq_time)
1006 enable_sched_clock_irqtime();
1007
990 lpj = ((u64)tsc_khz * 1000); 1008 lpj = ((u64)tsc_khz * 1000);
991 do_div(lpj, HZ); 1009 do_div(lpj, HZ);
992 lpj_fine = lpj; 1010 lpj_fine = lpj;
@@ -999,6 +1017,5 @@ void __init tsc_init(void)
999 mark_tsc_unstable("TSCs unsynchronized"); 1017 mark_tsc_unstable("TSCs unsynchronized");
1000 1018
1001 check_system_tsc_reliable(); 1019 check_system_tsc_reliable();
1002 init_tsc_clocksource();
1003} 1020}
1004 1021
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
deleted file mode 100644
index 1132129db792..000000000000
--- a/arch/x86/kernel/uv_irq.c
+++ /dev/null
@@ -1,302 +0,0 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV IRQ functions
7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#include <linux/module.h>
12#include <linux/rbtree.h>
13#include <linux/slab.h>
14#include <linux/irq.h>
15
16#include <asm/apic.h>
17#include <asm/uv/uv_irq.h>
18#include <asm/uv/uv_hub.h>
19
20/* MMR offset and pnode of hub sourcing interrupts for a given irq */
21struct uv_irq_2_mmr_pnode {
22 struct rb_node list;
23 unsigned long offset;
24 int pnode;
25 int irq;
26};
27
28static spinlock_t uv_irq_lock;
29static struct rb_root uv_irq_root;
30
31static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
32
33static void uv_noop(unsigned int irq)
34{
35}
36
37static unsigned int uv_noop_ret(unsigned int irq)
38{
39 return 0;
40}
41
42static void uv_ack_apic(unsigned int irq)
43{
44 ack_APIC_irq();
45}
46
47static struct irq_chip uv_irq_chip = {
48 .name = "UV-CORE",
49 .startup = uv_noop_ret,
50 .shutdown = uv_noop,
51 .enable = uv_noop,
52 .disable = uv_noop,
53 .ack = uv_noop,
54 .mask = uv_noop,
55 .unmask = uv_noop,
56 .eoi = uv_ack_apic,
57 .end = uv_noop,
58 .set_affinity = uv_set_irq_affinity,
59};
60
61/*
62 * Add offset and pnode information of the hub sourcing interrupts to the
63 * rb tree for a specific irq.
64 */
65static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
66{
67 struct rb_node **link = &uv_irq_root.rb_node;
68 struct rb_node *parent = NULL;
69 struct uv_irq_2_mmr_pnode *n;
70 struct uv_irq_2_mmr_pnode *e;
71 unsigned long irqflags;
72
73 n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
74 uv_blade_to_memory_nid(blade));
75 if (!n)
76 return -ENOMEM;
77
78 n->irq = irq;
79 n->offset = offset;
80 n->pnode = uv_blade_to_pnode(blade);
81 spin_lock_irqsave(&uv_irq_lock, irqflags);
82 /* Find the right place in the rbtree: */
83 while (*link) {
84 parent = *link;
85 e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
86
87 if (unlikely(irq == e->irq)) {
88 /* irq entry exists */
89 e->pnode = uv_blade_to_pnode(blade);
90 e->offset = offset;
91 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
92 kfree(n);
93 return 0;
94 }
95
96 if (irq < e->irq)
97 link = &(*link)->rb_left;
98 else
99 link = &(*link)->rb_right;
100 }
101
102 /* Insert the node into the rbtree. */
103 rb_link_node(&n->list, parent, link);
104 rb_insert_color(&n->list, &uv_irq_root);
105
106 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
107 return 0;
108}
109
110/* Retrieve offset and pnode information from the rb tree for a specific irq */
111int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
112{
113 struct uv_irq_2_mmr_pnode *e;
114 struct rb_node *n;
115 unsigned long irqflags;
116
117 spin_lock_irqsave(&uv_irq_lock, irqflags);
118 n = uv_irq_root.rb_node;
119 while (n) {
120 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
121
122 if (e->irq == irq) {
123 *offset = e->offset;
124 *pnode = e->pnode;
125 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
126 return 0;
127 }
128
129 if (irq < e->irq)
130 n = n->rb_left;
131 else
132 n = n->rb_right;
133 }
134 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
135 return -1;
136}
137
138/*
139 * Re-target the irq to the specified CPU and enable the specified MMR located
140 * on the specified blade to allow the sending of MSIs to the specified CPU.
141 */
142static int
143arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
144 unsigned long mmr_offset, int limit)
145{
146 const struct cpumask *eligible_cpu = cpumask_of(cpu);
147 struct irq_desc *desc = irq_to_desc(irq);
148 struct irq_cfg *cfg;
149 int mmr_pnode;
150 unsigned long mmr_value;
151 struct uv_IO_APIC_route_entry *entry;
152 int err;
153
154 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
155 sizeof(unsigned long));
156
157 cfg = irq_cfg(irq);
158
159 err = assign_irq_vector(irq, cfg, eligible_cpu);
160 if (err != 0)
161 return err;
162
163 if (limit == UV_AFFINITY_CPU)
164 desc->status |= IRQ_NO_BALANCING;
165 else
166 desc->status |= IRQ_MOVE_PCNTXT;
167
168 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
169 irq_name);
170
171 mmr_value = 0;
172 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
173 entry->vector = cfg->vector;
174 entry->delivery_mode = apic->irq_delivery_mode;
175 entry->dest_mode = apic->irq_dest_mode;
176 entry->polarity = 0;
177 entry->trigger = 0;
178 entry->mask = 0;
179 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
180
181 mmr_pnode = uv_blade_to_pnode(mmr_blade);
182 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
183
184 if (cfg->move_in_progress)
185 send_cleanup_vector(cfg);
186
187 return irq;
188}
189
190/*
191 * Disable the specified MMR located on the specified blade so that MSIs are
192 * no longer allowed to be sent.
193 */
194static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
195{
196 unsigned long mmr_value;
197 struct uv_IO_APIC_route_entry *entry;
198
199 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
200 sizeof(unsigned long));
201
202 mmr_value = 0;
203 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
204 entry->mask = 1;
205
206 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
207}
208
209static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
210{
211 struct irq_desc *desc = irq_to_desc(irq);
212 struct irq_cfg *cfg = desc->chip_data;
213 unsigned int dest;
214 unsigned long mmr_value;
215 struct uv_IO_APIC_route_entry *entry;
216 unsigned long mmr_offset;
217 int mmr_pnode;
218
219 if (set_desc_affinity(desc, mask, &dest))
220 return -1;
221
222 mmr_value = 0;
223 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
224
225 entry->vector = cfg->vector;
226 entry->delivery_mode = apic->irq_delivery_mode;
227 entry->dest_mode = apic->irq_dest_mode;
228 entry->polarity = 0;
229 entry->trigger = 0;
230 entry->mask = 0;
231 entry->dest = dest;
232
233 /* Get previously stored MMR and pnode of hub sourcing interrupts */
234 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
235 return -1;
236
237 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
238
239 if (cfg->move_in_progress)
240 send_cleanup_vector(cfg);
241
242 return 0;
243}
244
245/*
246 * Set up a mapping of an available irq and vector, and enable the specified
247 * MMR that defines the MSI that is to be sent to the specified CPU when an
248 * interrupt is raised.
249 */
250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
251 unsigned long mmr_offset, int limit)
252{
253 int irq, ret;
254
255 irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
256
257 if (irq <= 0)
258 return -EBUSY;
259
260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
261 limit);
262 if (ret == irq)
263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
264 else
265 destroy_irq(irq);
266
267 return ret;
268}
269EXPORT_SYMBOL_GPL(uv_setup_irq);
270
271/*
272 * Tear down a mapping of an irq and vector, and disable the specified MMR that
273 * defined the MSI that was to be sent to the specified CPU when an interrupt
274 * was raised.
275 *
276 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
277 */
278void uv_teardown_irq(unsigned int irq)
279{
280 struct uv_irq_2_mmr_pnode *e;
281 struct rb_node *n;
282 unsigned long irqflags;
283
284 spin_lock_irqsave(&uv_irq_lock, irqflags);
285 n = uv_irq_root.rb_node;
286 while (n) {
287 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
288 if (e->irq == irq) {
289 arch_disable_uv_irq(e->pnode, e->offset);
290 rb_erase(n, &uv_irq_root);
291 kfree(e);
292 break;
293 }
294 if (irq < e->irq)
295 n = n->rb_left;
296 else
297 n = n->rb_right;
298 }
299 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
300 destroy_irq(irq);
301}
302EXPORT_SYMBOL_GPL(uv_teardown_irq);
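
For context on the exported API removed with this file, a hypothetical caller sketch of uv_setup_irq() and uv_teardown_irq() as declared above; the handler, device name and MMR parameters are illustrative only (UV_AFFINITY_CPU is the limit value tested in arch_enable_uv_irq() above):

#include <linux/interrupt.h>
#include <linux/errno.h>
#include <asm/uv/uv_irq.h>

static irqreturn_t example_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int example_attach(int cpu, int blade, unsigned long mmr_offset)
{
	int irq = uv_setup_irq("example", cpu, blade, mmr_offset,
			       UV_AFFINITY_CPU);

	if (irq <= 0)
		return irq ? irq : -EBUSY;

	if (request_irq(irq, example_handler, 0, "example", NULL)) {
		uv_teardown_irq(irq);	/* also releases the vector mapping */
		return -EBUSY;
	}
	return irq;
}

static void example_detach(int irq)
{
	free_irq(irq, NULL);
	uv_teardown_irq(irq);
}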
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
deleted file mode 100644
index 309c70fb7759..000000000000
--- a/arch/x86/kernel/uv_sysfs.c
+++ /dev/null
@@ -1,76 +0,0 @@
1/*
2 * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson
20 */
21
22#include <linux/sysdev.h>
23#include <asm/uv/bios.h>
24#include <asm/uv/uv.h>
25
26struct kobject *sgi_uv_kobj;
27
28static ssize_t partition_id_show(struct kobject *kobj,
29 struct kobj_attribute *attr, char *buf)
30{
31 return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
32}
33
34static ssize_t coherence_id_show(struct kobject *kobj,
35 struct kobj_attribute *attr, char *buf)
36{
37 return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
38}
39
40static struct kobj_attribute partition_id_attr =
41 __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
42
43static struct kobj_attribute coherence_id_attr =
44 __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
45
46
47static int __init sgi_uv_sysfs_init(void)
48{
49 unsigned long ret;
50
51 if (!is_uv_system())
52 return -ENODEV;
53
54 if (!sgi_uv_kobj)
55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
56 if (!sgi_uv_kobj) {
57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
58 return -EINVAL;
59 }
60
61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
62 if (ret) {
63 printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
64 return ret;
65 }
66
67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
68 if (ret) {
69 printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
70 return ret;
71 }
72
73 return 0;
74}
75
76device_initcall(sgi_uv_sysfs_init);
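
The file above created two read-only attributes under /sys/firmware/sgi_uv. A small, hypothetical userspace reader for the partition_id attribute it exposed:

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/firmware/sgi_uv/partition_id", "r");

	if (!f) {
		perror("partition_id");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("SGI UV partition id: %s", buf);
	fclose(f);
	return 0;
}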
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
deleted file mode 100644
index 56e421bc379b..000000000000
--- a/arch/x86/kernel/uv_time.c
+++ /dev/null
@@ -1,423 +0,0 @@
1/*
2 * SGI RTC clock/timer routines.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Dimitri Sivanich
20 */
21#include <linux/clockchips.h>
22#include <linux/slab.h>
23
24#include <asm/uv/uv_mmrs.h>
25#include <asm/uv/uv_hub.h>
26#include <asm/uv/bios.h>
27#include <asm/uv/uv.h>
28#include <asm/apic.h>
29#include <asm/cpu.h>
30
31#define RTC_NAME "sgi_rtc"
32
33static cycle_t uv_read_rtc(struct clocksource *cs);
34static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
35static void uv_rtc_timer_setup(enum clock_event_mode,
36 struct clock_event_device *);
37
38static struct clocksource clocksource_uv = {
39 .name = RTC_NAME,
40 .rating = 400,
41 .read = uv_read_rtc,
42 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
43 .shift = 10,
44 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
45};
46
47static struct clock_event_device clock_event_device_uv = {
48 .name = RTC_NAME,
49 .features = CLOCK_EVT_FEAT_ONESHOT,
50 .shift = 20,
51 .rating = 400,
52 .irq = -1,
53 .set_next_event = uv_rtc_next_event,
54 .set_mode = uv_rtc_timer_setup,
55 .event_handler = NULL,
56};
57
58static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
59
60/* There is one of these allocated per node */
61struct uv_rtc_timer_head {
62 spinlock_t lock;
63 /* next cpu waiting for timer, local node relative: */
64 int next_cpu;
65 /* number of cpus on this node: */
66 int ncpus;
67 struct {
68 int lcpu; /* systemwide logical cpu number */
69 u64 expires; /* next timer expiration for this cpu */
70 } cpu[1];
71};
72
73/*
74 * Access to uv_rtc_timer_head via blade id.
75 */
76static struct uv_rtc_timer_head **blade_info __read_mostly;
77
78static int uv_rtc_evt_enable;
79
80/*
81 * Hardware interface routines
82 */
83
84/* Send IPIs to another node */
85static void uv_rtc_send_IPI(int cpu)
86{
87 unsigned long apicid, val;
88 int pnode;
89
90 apicid = cpu_physical_id(cpu);
91 pnode = uv_apicid_to_pnode(apicid);
92 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
93 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
94 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
95
96 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
97}
98
99/* Check for an RTC interrupt pending */
100static int uv_intr_pending(int pnode)
101{
102 return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
103 UVH_EVENT_OCCURRED0_RTC1_MASK;
104}
105
106/* Setup interrupt and return non-zero if early expiration occurred. */
107static int uv_setup_intr(int cpu, u64 expires)
108{
109 u64 val;
110 int pnode = uv_cpu_to_pnode(cpu);
111
112 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
113 UVH_RTC1_INT_CONFIG_M_MASK);
114 uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
115
116 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
117 UVH_EVENT_OCCURRED0_RTC1_MASK);
118
119 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
120 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
121
122 /* Set configuration */
123 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
124 /* Initialize comparator value */
125 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
126
127 if (uv_read_rtc(NULL) <= expires)
128 return 0;
129
130 return !uv_intr_pending(pnode);
131}
132
133/*
134 * Per-cpu timer tracking routines
135 */
136
137static __init void uv_rtc_deallocate_timers(void)
138{
139 int bid;
140
141 for_each_possible_blade(bid) {
142 kfree(blade_info[bid]);
143 }
144 kfree(blade_info);
145}
146
147/* Allocate per-node list of cpu timer expiration times. */
148static __init int uv_rtc_allocate_timers(void)
149{
150 int cpu;
151
152 blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
153 if (!blade_info)
154 return -ENOMEM;
155 memset(blade_info, 0, uv_possible_blades * sizeof(void *));
156
157 for_each_present_cpu(cpu) {
158 int nid = cpu_to_node(cpu);
159 int bid = uv_cpu_to_blade_id(cpu);
160 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
161 struct uv_rtc_timer_head *head = blade_info[bid];
162
163 if (!head) {
164 head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
165 (uv_blade_nr_possible_cpus(bid) *
166 2 * sizeof(u64)),
167 GFP_KERNEL, nid);
168 if (!head) {
169 uv_rtc_deallocate_timers();
170 return -ENOMEM;
171 }
172 spin_lock_init(&head->lock);
173 head->ncpus = uv_blade_nr_possible_cpus(bid);
174 head->next_cpu = -1;
175 blade_info[bid] = head;
176 }
177
178 head->cpu[bcpu].lcpu = cpu;
179 head->cpu[bcpu].expires = ULLONG_MAX;
180 }
181
182 return 0;
183}
184
185/* Find and set the next expiring timer. */
186static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
187{
188 u64 lowest = ULLONG_MAX;
189 int c, bcpu = -1;
190
191 head->next_cpu = -1;
192 for (c = 0; c < head->ncpus; c++) {
193 u64 exp = head->cpu[c].expires;
194 if (exp < lowest) {
195 bcpu = c;
196 lowest = exp;
197 }
198 }
199 if (bcpu >= 0) {
200 head->next_cpu = bcpu;
201 c = head->cpu[bcpu].lcpu;
202 if (uv_setup_intr(c, lowest))
203 /* If we didn't set it up in time, trigger */
204 uv_rtc_send_IPI(c);
205 } else {
206 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
207 UVH_RTC1_INT_CONFIG_M_MASK);
208 }
209}
210
211/*
212 * Set expiration time for current cpu.
213 *
214 * Returns 1 if we missed the expiration time.
215 */
216static int uv_rtc_set_timer(int cpu, u64 expires)
217{
218 int pnode = uv_cpu_to_pnode(cpu);
219 int bid = uv_cpu_to_blade_id(cpu);
220 struct uv_rtc_timer_head *head = blade_info[bid];
221 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
222 u64 *t = &head->cpu[bcpu].expires;
223 unsigned long flags;
224 int next_cpu;
225
226 spin_lock_irqsave(&head->lock, flags);
227
228 next_cpu = head->next_cpu;
229 *t = expires;
230
231 /* Will this one be next to go off? */
232 if (next_cpu < 0 || bcpu == next_cpu ||
233 expires < head->cpu[next_cpu].expires) {
234 head->next_cpu = bcpu;
235 if (uv_setup_intr(cpu, expires)) {
236 *t = ULLONG_MAX;
237 uv_rtc_find_next_timer(head, pnode);
238 spin_unlock_irqrestore(&head->lock, flags);
239 return -ETIME;
240 }
241 }
242
243 spin_unlock_irqrestore(&head->lock, flags);
244 return 0;
245}
246
247/*
248 * Unset expiration time for current cpu.
249 *
250 * Returns 1 if this timer was pending.
251 */
252static int uv_rtc_unset_timer(int cpu, int force)
253{
254 int pnode = uv_cpu_to_pnode(cpu);
255 int bid = uv_cpu_to_blade_id(cpu);
256 struct uv_rtc_timer_head *head = blade_info[bid];
257 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
258 u64 *t = &head->cpu[bcpu].expires;
259 unsigned long flags;
260 int rc = 0;
261
262 spin_lock_irqsave(&head->lock, flags);
263
264 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
265 rc = 1;
266
267 if (rc) {
268 *t = ULLONG_MAX;
269 /* Was the hardware setup for this timer? */
270 if (head->next_cpu == bcpu)
271 uv_rtc_find_next_timer(head, pnode);
272 }
273
274 spin_unlock_irqrestore(&head->lock, flags);
275
276 return rc;
277}
278
279
280/*
281 * Kernel interface routines.
282 */
283
284/*
285 * Read the RTC.
286 *
287 * Starting with HUB rev 2.0, the UV RTC register is replicated across all
288 * cachelines of its own page. This allows faster simultaneous reads
289 * from a given socket.
290 */
291static cycle_t uv_read_rtc(struct clocksource *cs)
292{
293 unsigned long offset;
294
295 if (uv_get_min_hub_revision_id() == 1)
296 offset = 0;
297 else
298 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
299
300 return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
301}
302
303/*
304 * Program the next event, relative to now
305 */
306static int uv_rtc_next_event(unsigned long delta,
307 struct clock_event_device *ced)
308{
309 int ced_cpu = cpumask_first(ced->cpumask);
310
311 return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc(NULL));
312}
313
314/*
315 * Setup the RTC timer in oneshot mode
316 */
317static void uv_rtc_timer_setup(enum clock_event_mode mode,
318 struct clock_event_device *evt)
319{
320 int ced_cpu = cpumask_first(evt->cpumask);
321
322 switch (mode) {
323 case CLOCK_EVT_MODE_PERIODIC:
324 case CLOCK_EVT_MODE_ONESHOT:
325 case CLOCK_EVT_MODE_RESUME:
326 /* Nothing to do here yet */
327 break;
328 case CLOCK_EVT_MODE_UNUSED:
329 case CLOCK_EVT_MODE_SHUTDOWN:
330 uv_rtc_unset_timer(ced_cpu, 1);
331 break;
332 }
333}
334
335static void uv_rtc_interrupt(void)
336{
337 int cpu = smp_processor_id();
338 struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
339
340 if (!ced || !ced->event_handler)
341 return;
342
343 if (uv_rtc_unset_timer(cpu, 0) != 1)
344 return;
345
346 ced->event_handler(ced);
347}
348
349static int __init uv_enable_evt_rtc(char *str)
350{
351 uv_rtc_evt_enable = 1;
352
353 return 1;
354}
355__setup("uvrtcevt", uv_enable_evt_rtc);
356
357static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
358{
359 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
360
361 *ced = clock_event_device_uv;
362 ced->cpumask = cpumask_of(smp_processor_id());
363 clockevents_register_device(ced);
364}
365
366static __init int uv_rtc_setup_clock(void)
367{
368 int rc;
369
370 if (!is_uv_system())
371 return -ENODEV;
372
373 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
374 clocksource_uv.shift);
375
376 /* If single blade, prefer tsc */
377 if (uv_num_possible_blades() == 1)
378 clocksource_uv.rating = 250;
379
380 rc = clocksource_register(&clocksource_uv);
381 if (rc)
382 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
383 else
384 printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
385 sn_rtc_cycles_per_second/(unsigned long)1E6);
386
387 if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
388 return rc;
389
390 /* Setup and register clockevents */
391 rc = uv_rtc_allocate_timers();
392 if (rc)
393 goto error;
394
395 x86_platform_ipi_callback = uv_rtc_interrupt;
396
397 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
398 NSEC_PER_SEC, clock_event_device_uv.shift);
399
400 clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
401 sn_rtc_cycles_per_second;
402
403 clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
404 (NSEC_PER_SEC / sn_rtc_cycles_per_second);
405
406 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
407 if (rc) {
408 x86_platform_ipi_callback = NULL;
409 uv_rtc_deallocate_timers();
410 goto error;
411 }
412
413 printk(KERN_INFO "UV RTC clockevents registered\n");
414
415 return 0;
416
417error:
418 clocksource_unregister(&clocksource_uv);
419 printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
420
421 return rc;
422}
423arch_initcall(uv_rtc_setup_clock);
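
uv_rtc_setup_clock() above derives the clockevent parameters from the BIOS-reported RTC rate. Purely illustrative arithmetic, assuming a hypothetical rate of 5 MHz (the real sn_rtc_cycles_per_second value is not visible in this diff):

/*
 * Assuming sn_rtc_cycles_per_second == 5,000,000 (illustrative only):
 *
 *   clock_event_device_uv.mult = div_sc(5,000,000, NSEC_PER_SEC, 20)
 *                              = (5,000,000 << 20) / 1,000,000,000
 *                              ~= 5242   (so cycles = ns * mult >> 20)
 *
 *   min_delta_ns = NSEC_PER_SEC / 5,000,000 = 200 ns, i.e. one RTC tick.
 */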
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S
index 56a8c2a867d9..b9242bacbe59 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -7,6 +7,7 @@
7 * Copyright (c) 2007 Andi Kleen (ak@suse.de) 7 * Copyright (c) 2007 Andi Kleen (ak@suse.de)
8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) 8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) 9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
10 * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
10 * 11 *
11 * This source code is licensed under the GNU General Public License, 12 * This source code is licensed under the GNU General Public License,
12 * Version 2. See the file COPYING for more details. 13 * Version 2. See the file COPYING for more details.
@@ -14,18 +15,17 @@
14 * This is a common code for verification whether CPU supports 15 * This is a common code for verification whether CPU supports
15 * long mode and SSE or not. It is not called directly instead this 16 * long mode and SSE or not. It is not called directly instead this
16 * file is included at various places and compiled in that context. 17 * file is included at various places and compiled in that context.
17 * Following are the current usage. 18 * This file is expected to run in 32bit code. Currently:
18 * 19 *
19 * This file is included by both 16bit and 32bit code. 20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification
21 * arch/x86/kernel/trampoline_64.S: secondary processor verification
22 * arch/x86/kernel/head_32.S: processor startup
20 * 23 *
21 * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) 24 * verify_cpu, returns the status of longmode and SSE in register %eax.
22 * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
23 * arch/x86_64/kernel/trampoline.S: secondary processor verification (16bit)
24 * arch/x86_64/kernel/acpi/wakeup.S: Verification at resume (16bit)
25 *
26 * verify_cpu, returns the status of cpu check in register %eax.
27 * 0: Success 1: Failure 25 * 0: Success 1: Failure
28 * 26 *
27 * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
28 *
29 * The caller needs to check for the error code and take the action 29 * The caller needs to check for the error code and take the action
30 * appropriately. Either display a message or halt. 30 * appropriately. Either display a message or halt.
31 */ 31 */
@@ -62,8 +62,41 @@ verify_cpu:
62 cmpl $0x444d4163,%ecx 62 cmpl $0x444d4163,%ecx
63 jnz verify_cpu_noamd 63 jnz verify_cpu_noamd
64 mov $1,%di # cpu is from AMD 64 mov $1,%di # cpu is from AMD
65 jmp verify_cpu_check
65 66
66verify_cpu_noamd: 67verify_cpu_noamd:
68 cmpl $0x756e6547,%ebx # GenuineIntel?
69 jnz verify_cpu_check
70 cmpl $0x49656e69,%edx
71 jnz verify_cpu_check
72 cmpl $0x6c65746e,%ecx
73 jnz verify_cpu_check
74
75 # only call IA32_MISC_ENABLE when:
76 # family > 6 || (family == 6 && model >= 0xd)
77 movl $0x1, %eax # check CPU family and model
78 cpuid
79 movl %eax, %ecx
80
81 andl $0x0ff00f00, %eax # mask family and extended family
82 shrl $8, %eax
83 cmpl $6, %eax
84 ja verify_cpu_clear_xd # family > 6, ok
85 jb verify_cpu_check # family < 6, skip
86
87 andl $0x000f00f0, %ecx # mask model and extended model
88 shrl $4, %ecx
89 cmpl $0xd, %ecx
90 jb verify_cpu_check # family == 6, model < 0xd, skip
91
92verify_cpu_clear_xd:
93 movl $MSR_IA32_MISC_ENABLE, %ecx
94 rdmsr
95 btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
96 jnc verify_cpu_check # only write MSR if bit was changed
97 wrmsr
98
99verify_cpu_check:
67 movl $0x1,%eax # Does the cpu have what it takes 100 movl $0x1,%eax # Does the cpu have what it takes
68 cpuid 101 cpuid
69 andl $REQUIRED_MASK0,%edx 102 andl $REQUIRED_MASK0,%edx
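
The new Intel branch above only clears XD_DISABLE when "family > 6 || (family == 6 && model >= 0xd)", computed from masked CPUID.1:EAX fields. A small C rendering of the same masking, with a hypothetical family-6/model-0x2a CPUID value, to show why both compares pass:

#include <stdio.h>

int main(void)
{
	unsigned int eax = 0x000206a7u;	/* hypothetical: family 6, ext. model 2, model 0xa */

	unsigned int fam = (eax & 0x0ff00f00u) >> 8;	/* ext. family + family -> 0x6    */
	unsigned int mod = (eax & 0x000f00f0u) >> 4;	/* ext. model  + model  -> 0x200a */

	/* Mirrors the ja/jb on 6 and the jb on 0xd in the assembly above. */
	int clear_xd = (fam > 6) || (fam == 6 && mod >= 0xd);

	printf("fam=%#x mod=%#x -> %s MSR_IA32_MISC_ENABLE_XD_DISABLE\n",
	       fam, mod, clear_xd ? "clear" : "leave");
	return 0;
}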
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
deleted file mode 100644
index e680ea52db9b..000000000000
--- a/arch/x86/kernel/visws_quirks.c
+++ /dev/null
@@ -1,666 +0,0 @@
1/*
2 * SGI Visual Workstation support and quirks, unmaintained.
3 *
4 * Split out from setup.c by davej@suse.de
5 *
6 * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
7 *
8 * SGI Visual Workstation interrupt controller
9 *
10 * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
11 * which serves as the main interrupt controller in the system. Non-legacy
12 * hardware in the system uses this controller directly. Legacy devices
13 * are connected to the PIIX4 which in turn has its 8259(s) connected to
14 * one of the Cobalt APIC entries.
15 *
16 * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
17 *
18 * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
19 */
20#include <linux/interrupt.h>
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/smp.h>
24
25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h>
27#include <asm/io_apic.h>
28#include <asm/fixmap.h>
29#include <asm/reboot.h>
30#include <asm/setup.h>
31#include <asm/apic.h>
32#include <asm/e820.h>
33#include <asm/time.h>
34#include <asm/io.h>
35
36#include <linux/kernel_stat.h>
37
38#include <asm/i8259.h>
39#include <asm/irq_vectors.h>
40#include <asm/visws/lithium.h>
41
42#include <linux/sched.h>
43#include <linux/kernel.h>
44#include <linux/pci.h>
45#include <linux/pci_ids.h>
46
47extern int no_broadcast;
48
49char visws_board_type = -1;
50char visws_board_rev = -1;
51
52static void __init visws_time_init(void)
53{
54 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
55
56 /* Set the countdown value */
57 co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
58
59 /* Start the timer */
60 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
61
62 /* Enable (unmask) the timer interrupt */
63 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
64
65 setup_default_timer_irq();
66}
67
68/* Replaces the default init_ISA_irqs in the generic setup */
69static void __init visws_pre_intr_init(void)
70{
71 init_VISWS_APIC_irqs();
72}
73
74/* Quirk for machine specific memory setup. */
75
76#define MB (1024 * 1024)
77
78unsigned long sgivwfb_mem_phys;
79unsigned long sgivwfb_mem_size;
80EXPORT_SYMBOL(sgivwfb_mem_phys);
81EXPORT_SYMBOL(sgivwfb_mem_size);
82
83long long mem_size __initdata = 0;
84
85static char * __init visws_memory_setup(void)
86{
87 long long gfx_mem_size = 8 * MB;
88
89 mem_size = boot_params.alt_mem_k;
90
91 if (!mem_size) {
92 printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
93 mem_size = 128 * MB;
94 }
95
96 /*
97 * this hardcodes the graphics memory to 8 MB
98 * it really should be sized dynamically (or at least
99 * set as a boot param)
100 */
101 if (!sgivwfb_mem_size) {
102 printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
103 sgivwfb_mem_size = 8 * MB;
104 }
105
106 /*
107 * Trim to nearest MB
108 */
109 sgivwfb_mem_size &= ~((1 << 20) - 1);
110 sgivwfb_mem_phys = mem_size - gfx_mem_size;
111
112 e820_add_region(0, LOWMEMSIZE(), E820_RAM);
113 e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
114 e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
115
116 return "PROM";
117}
118
119static void visws_machine_emergency_restart(void)
120{
121 /*
122 * Visual Workstations restart after this
123 * register is poked on the PIIX4
124 */
125 outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
126}
127
128static void visws_machine_power_off(void)
129{
130 unsigned short pm_status;
131/* extern unsigned int pci_bus0; */
132
133 while ((pm_status = inw(PMSTS_PORT)) & 0x100)
134 outw(pm_status, PMSTS_PORT);
135
136 outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
137
138 mdelay(10);
139
140#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
141 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
142
143/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
144 outl(PIIX_SPECIAL_STOP, 0xCFC);
145}
146
147static void __init visws_get_smp_config(unsigned int early)
148{
149}
150
151/*
152 * The Visual Workstation is Intel MP compliant in the hardware
153 * sense, but it doesn't have a BIOS(-configuration table).
154 * No problem for Linux.
155 */
156
157static void __init MP_processor_info(struct mpc_cpu *m)
158{
159 int ver, logical_apicid;
160 physid_mask_t apic_cpus;
161
162 if (!(m->cpuflag & CPU_ENABLED))
163 return;
164
165 logical_apicid = m->apicid;
166 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
167 m->cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
168 m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
169 (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver);
170
171 if (m->cpuflag & CPU_BOOTPROCESSOR)
172 boot_cpu_physical_apicid = m->apicid;
173
174 ver = m->apicver;
175 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
176 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
177 m->apicid, MAX_APICS);
178 return;
179 }
180
181 apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
182 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
183 /*
184 * Validate version
185 */
186 if (ver == 0x0) {
187 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
188 "fixing up to 0x10. (tell your hw vendor)\n",
189 m->apicid);
190 ver = 0x10;
191 }
192 apic_version[m->apicid] = ver;
193}
194
195static void __init visws_find_smp_config(void)
196{
197 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
198 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
199
200 if (ncpus > CO_CPU_MAX) {
201 printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
202 ncpus, mp);
203
204 ncpus = CO_CPU_MAX;
205 }
206
207 if (ncpus > setup_max_cpus)
208 ncpus = setup_max_cpus;
209
210#ifdef CONFIG_X86_LOCAL_APIC
211 smp_found_config = 1;
212#endif
213 while (ncpus--)
214 MP_processor_info(mp++);
215
216 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
217}
218
219static void visws_trap_init(void);
220
221void __init visws_early_detect(void)
222{
223 int raw;
224
225 visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
226 >> PIIX_GPI_BD_SHIFT;
227
228 if (visws_board_type < 0)
229 return;
230
231 /*
232 * Override the default platform setup functions
233 */
234 x86_init.resources.memory_setup = visws_memory_setup;
235 x86_init.mpparse.get_smp_config = visws_get_smp_config;
236 x86_init.mpparse.find_smp_config = visws_find_smp_config;
237 x86_init.irqs.pre_vector_init = visws_pre_intr_init;
238 x86_init.irqs.trap_init = visws_trap_init;
239 x86_init.timers.timer_init = visws_time_init;
240 x86_init.pci.init = pci_visws_init;
241 x86_init.pci.init_irq = x86_init_noop;
242
243 /*
244 * Install reboot quirks:
245 */
246 pm_power_off = visws_machine_power_off;
247 machine_ops.emergency_restart = visws_machine_emergency_restart;
248
249 /*
250 * Do not use broadcast IPIs:
251 */
252 no_broadcast = 0;
253
254#ifdef CONFIG_X86_IO_APIC
255 /*
256 * Turn off IO-APIC detection and initialization:
257 */
258 skip_ioapic_setup = 1;
259#endif
260
261 /*
262 * Get Board rev.
263 * First, we have to initialize the 307 part to allow us access
264 * to the GPIO registers. Let's map them at 0x0fc0 which is right
265 * after the PIIX4 PM section.
266 */
267 outb_p(SIO_DEV_SEL, SIO_INDEX);
268 outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
269
270 outb_p(SIO_DEV_MSB, SIO_INDEX);
271 outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
272
273 outb_p(SIO_DEV_LSB, SIO_INDEX);
274 outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
275
276 outb_p(SIO_DEV_ENB, SIO_INDEX);
277 outb_p(1, SIO_DATA); /* Enable GPIO registers. */
278
279 /*
280 * Now, we have to map the power management section to write
281 * a bit which enables access to the GPIO registers.
282 * What lunatic came up with this shit?
283 */
284 outb_p(SIO_DEV_SEL, SIO_INDEX);
285 outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */
286
287 outb_p(SIO_DEV_MSB, SIO_INDEX);
288 outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
289
290 outb_p(SIO_DEV_LSB, SIO_INDEX);
291 outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
292
293 outb_p(SIO_DEV_ENB, SIO_INDEX);
294 outb_p(1, SIO_DATA); /* Enable PM registers. */
295
296 /*
297 * Now, write the PM register which enables the GPIO registers.
298 */
299 outb_p(SIO_PM_FER2, SIO_PM_INDEX);
300 outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
301
302 /*
303 * Now, initialize the GPIO registers.
304 * We want them all to be inputs which is the
305 * power on default, so let's leave them alone.
306 * So, let's just read the board rev!
307 */
308 raw = inb_p(SIO_GP_DATA1);
309 raw &= 0x7f; /* 7 bits of valid board revision ID. */
310
311 if (visws_board_type == VISWS_320) {
312 if (raw < 0x6) {
313 visws_board_rev = 4;
314 } else if (raw < 0xc) {
315 visws_board_rev = 5;
316 } else {
317 visws_board_rev = 6;
318 }
319 } else if (visws_board_type == VISWS_540) {
320 visws_board_rev = 2;
321 } else {
322 visws_board_rev = raw;
323 }
324
325 printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
326 (visws_board_type == VISWS_320 ? "320" :
327 (visws_board_type == VISWS_540 ? "540" :
328 "unknown")), visws_board_rev);
329}
330
331#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
332#define BCD (LI_INTB | LI_INTC | LI_INTD)
333#define ALLDEVS (A01234 | BCD)
334
335static __init void lithium_init(void)
336{
337 set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
338 set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
339
340 if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
341 (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
342 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
343/* panic("This machine is not SGI Visual Workstation 320/540"); */
344 }
345
346 if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
347 (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
348 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
349/* panic("This machine is not SGI Visual Workstation 320/540"); */
350 }
351
352 li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
353 li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
354}
355
356static __init void cobalt_init(void)
357{
358 /*
359 * On normal SMP PC this is used only with SMP, but we have to
360 * use it and set it up here to start the Cobalt clock
361 */
362 set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
363 setup_local_APIC();
364 printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
365 (unsigned int)apic_read(APIC_LVR),
366 (unsigned int)apic_read(APIC_ID));
367
368 set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
369 set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
370 printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
371 co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
372
373 /* Enable Cobalt APIC being careful to NOT change the ID! */
374 co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
375
376 printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
377 co_apic_read(CO_APIC_ID));
378}
379
380static void __init visws_trap_init(void)
381{
382 lithium_init();
383 cobalt_init();
384}
385
386/*
387 * IRQ controller / APIC support:
388 */
389
390static DEFINE_SPINLOCK(cobalt_lock);
391
392/*
393 * Set the given Cobalt APIC Redirection Table entry to point
394 * to the given IDT vector/index.
395 */
396static inline void co_apic_set(int entry, int irq)
397{
398 co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
399 co_apic_write(CO_APIC_HI(entry), 0);
400}
401
402/*
403 * Cobalt (IO)-APIC functions to handle PCI devices.
404 */
405static inline int co_apic_ide0_hack(void)
406{
407 extern char visws_board_type;
408 extern char visws_board_rev;
409
410 if (visws_board_type == VISWS_320 && visws_board_rev == 5)
411 return 5;
412 return CO_APIC_IDE0;
413}
414
415static int is_co_apic(unsigned int irq)
416{
417 if (IS_CO_APIC(irq))
418 return CO_APIC(irq);
419
420 switch (irq) {
421 case 0: return CO_APIC_CPU;
422 case CO_IRQ_IDE0: return co_apic_ide0_hack();
423 case CO_IRQ_IDE1: return CO_APIC_IDE1;
424 default: return -1;
425 }
426}
427
428
429/*
430 * This is the SGI Cobalt (IO-)APIC:
431 */
432
433static void enable_cobalt_irq(unsigned int irq)
434{
435 co_apic_set(is_co_apic(irq), irq);
436}
437
438static void disable_cobalt_irq(unsigned int irq)
439{
440 int entry = is_co_apic(irq);
441
442 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
443 co_apic_read(CO_APIC_LO(entry));
444}
445
446/*
447 * "irq" really just serves to identify the device. Here is where we
448 * map this to the Cobalt APIC entry where it's physically wired.
449 * This is called via request_irq -> setup_irq -> irq_desc->startup()
450 */
451static unsigned int startup_cobalt_irq(unsigned int irq)
452{
453 unsigned long flags;
454 struct irq_desc *desc = irq_to_desc(irq);
455
456 spin_lock_irqsave(&cobalt_lock, flags);
457 if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
458 desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
459 enable_cobalt_irq(irq);
460 spin_unlock_irqrestore(&cobalt_lock, flags);
461 return 0;
462}
463
464static void ack_cobalt_irq(unsigned int irq)
465{
466 unsigned long flags;
467
468 spin_lock_irqsave(&cobalt_lock, flags);
469 disable_cobalt_irq(irq);
470 apic_write(APIC_EOI, APIC_EIO_ACK);
471 spin_unlock_irqrestore(&cobalt_lock, flags);
472}
473
474static void end_cobalt_irq(unsigned int irq)
475{
476 unsigned long flags;
477 struct irq_desc *desc = irq_to_desc(irq);
478
479 spin_lock_irqsave(&cobalt_lock, flags);
480 if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
481 enable_cobalt_irq(irq);
482 spin_unlock_irqrestore(&cobalt_lock, flags);
483}
484
485static struct irq_chip cobalt_irq_type = {
486 .name = "Cobalt-APIC",
487 .startup = startup_cobalt_irq,
488 .shutdown = disable_cobalt_irq,
489 .enable = enable_cobalt_irq,
490 .disable = disable_cobalt_irq,
491 .ack = ack_cobalt_irq,
492 .end = end_cobalt_irq,
493};
494
495
496/*
497 * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
498 * -- not the manner expected by the code in i8259.c.
499 *
500 * there is a 'master' physical interrupt source that gets sent to
501 * the CPU. But in the chipset there are various 'virtual' interrupts
502 * waiting to be handled. We represent this to Linux through a 'master'
503 * interrupt controller type, and through a special virtual interrupt-
504 * controller. Device drivers only see the virtual interrupt sources.
505 */
506static unsigned int startup_piix4_master_irq(unsigned int irq)
507{
508 legacy_pic->init(0);
509
510 return startup_cobalt_irq(irq);
511}
512
513static void end_piix4_master_irq(unsigned int irq)
514{
515 unsigned long flags;
516
517 spin_lock_irqsave(&cobalt_lock, flags);
518 enable_cobalt_irq(irq);
519 spin_unlock_irqrestore(&cobalt_lock, flags);
520}
521
522static struct irq_chip piix4_master_irq_type = {
523 .name = "PIIX4-master",
524 .startup = startup_piix4_master_irq,
525 .ack = ack_cobalt_irq,
526 .end = end_piix4_master_irq,
527};
528
529
530static struct irq_chip piix4_virtual_irq_type = {
531 .name = "PIIX4-virtual",
532};
533
534
535/*
536 * PIIX4-8259 master/virtual functions to handle interrupt requests
537 * from legacy devices: floppy, parallel, serial, rtc.
538 *
539 * None of these get Cobalt APIC entries, neither do they have IDT
540 * entries. These interrupts are purely virtual and distributed from
541 * the 'master' interrupt source: CO_IRQ_8259.
542 *
543 * When the 8259 interrupts its handler figures out which of these
544 * devices is interrupting and dispatches to its handler.
545 *
546 * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
547 * enable_irq gets the right irq. This 'master' irq is never directly
548 * manipulated by any driver.
549 */
550static irqreturn_t piix4_master_intr(int irq, void *dev_id)
551{
552 int realirq;
553 struct irq_desc *desc;
554 unsigned long flags;
555
556 raw_spin_lock_irqsave(&i8259A_lock, flags);
557
558 /* Find out what's interrupting in the PIIX4 master 8259 */
559 outb(0x0c, 0x20); /* OCW3 Poll command */
560 realirq = inb(0x20);
561
562 /*
563 * Bit 7 == 0 means invalid/spurious
564 */
565 if (unlikely(!(realirq & 0x80)))
566 goto out_unlock;
567
568 realirq &= 7;
569
570 if (unlikely(realirq == 2)) {
571 outb(0x0c, 0xa0);
572 realirq = inb(0xa0);
573
574 if (unlikely(!(realirq & 0x80)))
575 goto out_unlock;
576
577 realirq = (realirq & 7) + 8;
578 }
579
580 /* mask and ack interrupt */
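	/*
	 * 0x60 | n is an OCW2 specific EOI for line n; a slave IRQ also
	 * requires a specific EOI for the cascade line (IRQ2) on the master.
	 */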
581 cached_irq_mask |= 1 << realirq;
582 if (unlikely(realirq > 7)) {
583 inb(0xa1);
584 outb(cached_slave_mask, 0xa1);
585 outb(0x60 + (realirq & 7), 0xa0);
586 outb(0x60 + 2, 0x20);
587 } else {
588 inb(0x21);
589 outb(cached_master_mask, 0x21);
590 outb(0x60 + realirq, 0x20);
591 }
592
593 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
594
595 desc = irq_to_desc(realirq);
596
597 /*
598 * handle this 'virtual interrupt' as a Cobalt one now.
599 */
600 kstat_incr_irqs_this_cpu(realirq, desc);
601
602 if (likely(desc->action != NULL))
603 handle_IRQ_event(realirq, desc->action);
604
605 if (!(desc->status & IRQ_DISABLED))
606 legacy_pic->chip->unmask(realirq);
607
608 return IRQ_HANDLED;
609
610out_unlock:
611 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
612 return IRQ_NONE;
613}
614
615static struct irqaction master_action = {
616 .handler = piix4_master_intr,
617 .name = "PIIX4-8259",
618};
619
620static struct irqaction cascade_action = {
621 .handler = no_action,
622 .name = "cascade",
623};
624
625static inline void set_piix4_virtual_irq_type(void)
626{
627 piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
628 piix4_virtual_irq_type.enable = i8259A_chip.unmask;
629 piix4_virtual_irq_type.disable = i8259A_chip.mask;
630}
631
632void init_VISWS_APIC_irqs(void)
633{
634 int i;
635
636 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
637 struct irq_desc *desc = irq_to_desc(i);
638
639 desc->status = IRQ_DISABLED;
640 desc->action = 0;
641 desc->depth = 1;
642
643 if (i == 0) {
644 desc->chip = &cobalt_irq_type;
645 }
646 else if (i == CO_IRQ_IDE0) {
647 desc->chip = &cobalt_irq_type;
648 }
649 else if (i == CO_IRQ_IDE1) {
650 desc->chip = &cobalt_irq_type;
651 }
652 else if (i == CO_IRQ_8259) {
653 desc->chip = &piix4_master_irq_type;
654 }
655 else if (i < CO_IRQ_APIC0) {
656 set_piix4_virtual_irq_type();
657 desc->chip = &piix4_virtual_irq_type;
658 }
659 else if (IS_CO_APIC(i)) {
660 desc->chip = &cobalt_irq_type;
661 }
662 }
663
664 setup_irq(CO_IRQ_8259, &master_action);
665 setup_irq(2, &cascade_action);
666}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5ffb5622f793..863f8753ab0a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
179 if (pud_none_or_clear_bad(pud)) 179 if (pud_none_or_clear_bad(pud))
180 goto out; 180 goto out;
181 pmd = pmd_offset(pud, 0xA0000); 181 pmd = pmd_offset(pud, 0xA0000);
182 split_huge_page_pmd(mm, pmd);
182 if (pmd_none_or_clear_bad(pmd)) 183 if (pmd_none_or_clear_bad(pmd))
183 goto out; 184 goto out;
184 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 185 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
@@ -551,8 +552,14 @@ cannot_handle:
551int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) 552int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
552{ 553{
553 if (VMPI.is_vm86pus) { 554 if (VMPI.is_vm86pus) {
554 if ((trapno == 3) || (trapno == 1)) 555 if ((trapno == 3) || (trapno == 1)) {
555 return_to_32bit(regs, VM86_TRAP + (trapno << 8)); 556 KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
557 /* setting this flag forces the code in entry_32.S to
558 call save_v86_state() and change the stack pointer
559 to KVM86->regs32 */
560 set_thread_flag(TIF_IRET);
561 return 0;
562 }
556 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); 563 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
557 return 0; 564 return 0;
558 } 565 }
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index ce9fbacb7526..000000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/bootmem.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/sched.h>
31#include <linux/gfp.h>
32#include <asm/vmi.h>
33#include <asm/io.h>
34#include <asm/fixmap.h>
35#include <asm/apicdef.h>
36#include <asm/apic.h>
37#include <asm/pgalloc.h>
38#include <asm/processor.h>
39#include <asm/timer.h>
40#include <asm/vmi_time.h>
41#include <asm/kmap_types.h>
42#include <asm/setup.h>
43
44/* Convenient for calling VMI functions indirectly in the ROM */
45typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
46typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
47
48#define call_vrom_func(rom,func) \
49 (((VROMFUNC *)(rom->func))())
50
51#define call_vrom_long_func(rom,func,arg) \
52 (((VROMLONGFUNC *)(rom->func)) (arg))
53
54static struct vrom_header *vmi_rom;
55static int disable_pge;
56static int disable_pse;
57static int disable_sep;
58static int disable_tsc;
59static int disable_mtrr;
60static int disable_noidle;
61static int disable_vmi_timer;
62
63/* Cached VMI operations */
64static struct {
65 void (*cpuid)(void /* non-c */);
66 void (*_set_ldt)(u32 selector);
67 void (*set_tr)(u32 selector);
68 void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
69 void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
70 void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
71 void (*set_kernel_stack)(u32 selector, u32 sp0);
72 void (*allocate_page)(u32, u32, u32, u32, u32);
73 void (*release_page)(u32, u32);
74 void (*set_pte)(pte_t, pte_t *, unsigned);
75 void (*update_pte)(pte_t *, unsigned);
76 void (*set_linear_mapping)(int, void *, u32, u32);
77 void (*_flush_tlb)(int);
78 void (*set_initial_ap_state)(int, int);
79 void (*halt)(void);
80 void (*set_lazy_mode)(int mode);
81} vmi_ops;
82
83/* Cached VMI operations */
84struct vmi_timer_ops vmi_timer_ops;
85
86/*
87 * VMI patching routines.
88 */
89#define MNEM_CALL 0xe8
90#define MNEM_JMP 0xe9
91#define MNEM_RET 0xc3
92
93#define IRQ_PATCH_INT_MASK 0
94#define IRQ_PATCH_DISABLE 5
95
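/*
 * A 5-byte CALL/JMP encodes a rel32 displacement measured from the end of
 * the instruction, hence the "dest - ip - 5" computed below.
 */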
96static inline void patch_offset(void *insnbuf,
97 unsigned long ip, unsigned long dest)
98{
99 *(unsigned long *)(insnbuf+1) = dest-ip-5;
100}
101
102static unsigned patch_internal(int call, unsigned len, void *insnbuf,
103 unsigned long ip)
104{
105 u64 reloc;
106 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
107 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
108 switch(rel->type) {
109 case VMI_RELOCATION_CALL_REL:
110 BUG_ON(len < 5);
111 *(char *)insnbuf = MNEM_CALL;
112 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
113 return 5;
114
115 case VMI_RELOCATION_JUMP_REL:
116 BUG_ON(len < 5);
117 *(char *)insnbuf = MNEM_JMP;
118 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
119 return 5;
120
121 case VMI_RELOCATION_NOP:
122 /* obliterate the whole thing */
123 return 0;
124
125 case VMI_RELOCATION_NONE:
126 /* leave native code in place */
127 break;
128
129 default:
130 BUG();
131 }
132 return len;
133}
134
135/*
136 * Apply patch if appropriate, return length of new instruction
137 * sequence. The callee does nop padding for us.
138 */
139static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
140 unsigned long ip, unsigned len)
141{
142 switch (type) {
143 case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
144 return patch_internal(VMI_CALL_DisableInterrupts, len,
145 insns, ip);
146 case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
147 return patch_internal(VMI_CALL_EnableInterrupts, len,
148 insns, ip);
149 case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
150 return patch_internal(VMI_CALL_SetInterruptMask, len,
151 insns, ip);
152 case PARAVIRT_PATCH(pv_irq_ops.save_fl):
153 return patch_internal(VMI_CALL_GetInterruptMask, len,
154 insns, ip);
155 case PARAVIRT_PATCH(pv_cpu_ops.iret):
156 return patch_internal(VMI_CALL_IRET, len, insns, ip);
157 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
158 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
159 default:
160 break;
161 }
162 return len;
163}
164
165/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
166static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
167 unsigned int *cx, unsigned int *dx)
168{
169 int override = 0;
170 if (*ax == 1)
171 override = 1;
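	/*
	 * The "0" and "2" constraints below tie the inputs to the eax/ecx
	 * outputs; the ROM entry point follows the hardware CPUID register
	 * convention rather than the regular C calling convention.
	 */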
172 asm volatile ("call *%6"
173 : "=a" (*ax),
174 "=b" (*bx),
175 "=c" (*cx),
176 "=d" (*dx)
177 : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
178 if (override) {
179 if (disable_pse)
180 *dx &= ~X86_FEATURE_PSE;
181 if (disable_pge)
182 *dx &= ~X86_FEATURE_PGE;
183 if (disable_sep)
184 *dx &= ~X86_FEATURE_SEP;
185 if (disable_tsc)
186 *dx &= ~X86_FEATURE_TSC;
187 if (disable_mtrr)
188 *dx &= ~X86_FEATURE_MTRR;
189 }
190}
191
192static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
193{
194 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
195 write_gdt_entry(gdt, nr, new, 0);
196}
197
198static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
199{
200 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
201 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
202 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
203 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
204}
205
206static void vmi_set_ldt(const void *addr, unsigned entries)
207{
208 unsigned cpu = smp_processor_id();
209 struct desc_struct desc;
210
211 pack_descriptor(&desc, (unsigned long)addr,
212 entries * sizeof(struct desc_struct) - 1,
213 DESC_LDT, 0);
214 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
215 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
216}
217
218static void vmi_set_tr(void)
219{
220 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
221}
222
223static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
224{
225 u32 *idt_entry = (u32 *)g;
226 vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
227}
228
229static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
230 const void *desc, int type)
231{
232 u32 *gdt_entry = (u32 *)desc;
233 vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
234}
235
236static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
237 const void *desc)
238{
239 u32 *ldt_entry = (u32 *)desc;
240 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
241}
242
243static void vmi_load_sp0(struct tss_struct *tss,
244 struct thread_struct *thread)
245{
246 tss->x86_tss.sp0 = thread->sp0;
247
248 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
249 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
250 tss->x86_tss.ss1 = thread->sysenter_cs;
251 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
252 }
253 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
254}
255
256static void vmi_flush_tlb_user(void)
257{
258 vmi_ops._flush_tlb(VMI_FLUSH_TLB);
259}
260
261static void vmi_flush_tlb_kernel(void)
262{
263 vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
264}
265
266/* Stub to do nothing at all; used for delays and unimplemented calls */
267static void vmi_nop(void)
268{
269}
270
271static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
272{
273 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
274}
275
276static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
277{
278 /*
279 * This call comes in very early, before mem_map is set up.
280 * It is called only for swapper_pg_dir, which already has
281 * data on it.
282 */
283 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
284}
285
286static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
287{
288 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
289}
290
291static void vmi_release_pte(unsigned long pfn)
292{
293 vmi_ops.release_page(pfn, VMI_PAGE_L1);
294}
295
296static void vmi_release_pmd(unsigned long pfn)
297{
298 vmi_ops.release_page(pfn, VMI_PAGE_L2);
299}
300
301/*
302 * We use the pgd_free hook for releasing the pgd page:
303 */
304static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
305{
306 unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
307
308 vmi_ops.release_page(pfn, VMI_PAGE_L2);
309}
310
311/*
312 * Helper macros for MMU update flags. We can defer updates until a flush
313 * or page invalidation only if the update is to the current address space
314 * (otherwise, there is no flush). We must check against init_mm, since
315 * this could be a kernel update, which usually passes init_mm, although
316 * sometimes this check can be skipped if we know the particular function
317 * is only called on user mode PTEs. We could change the kernel to pass
318 * current->active_mm here, but in particular, I was unsure if changing
319 * mm/highmem.c to do this would still be correct on other architectures.
320 */
321#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
322 (!mustbeuser && (mm) == &init_mm))
323#define vmi_flags_addr(mm, addr, level, user) \
324 ((level) | (is_current_as(mm, user) ? \
325 (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
326#define vmi_flags_addr_defer(mm, addr, level, user) \
327 ((level) | (is_current_as(mm, user) ? \
328 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
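/*
 * e.g. vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0) evaluates to
 * VMI_PAGE_PT | VMI_PAGE_CURRENT_AS | (addr & VMI_PAGE_VA_MASK) when mm is
 * the current (or init) address space, and to plain VMI_PAGE_PT otherwise.
 */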
329
330static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
331{
332 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
333}
334
335static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
336{
337 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
338}
339
340static void vmi_set_pte(pte_t *ptep, pte_t pte)
341{
342 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
343 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
344}
345
346static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
347{
348 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
349}
350
351static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
352{
353#ifdef CONFIG_X86_PAE
354 const pte_t pte = { .pte = pmdval.pmd };
355#else
356 const pte_t pte = { pmdval.pud.pgd.pgd };
357#endif
358 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
359}
360
361#ifdef CONFIG_X86_PAE
362
363static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
364{
365 /*
366 * XXX This is called from set_pmd_pte, but at both PT
367 * and PD layers so the VMI_PAGE_PT flag is wrong. But
368 * it is only called for large page mapping changes,
369 * the Xen backend doesn't support large pages, and the
370 * ESX backend doesn't depend on the flag.
371 */
372 set_64bit((unsigned long long *)ptep,pte_val(pteval));
373 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
374}
375
376static void vmi_set_pud(pud_t *pudp, pud_t pudval)
377{
378 /* Um, eww */
379 const pte_t pte = { .pte = pudval.pgd.pgd };
380 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
381}
382
383static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
384{
385 const pte_t pte = { .pte = 0 };
386 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
387}
388
389static void vmi_pmd_clear(pmd_t *pmd)
390{
391 const pte_t pte = { .pte = 0 };
392 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
393}
394#endif
395
396#ifdef CONFIG_SMP
397static void __devinit
398vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
399 unsigned long start_esp)
400{
401 struct vmi_ap_state ap;
402
403 /* Default everything to zero. This is fine for most GPRs. */
404 memset(&ap, 0, sizeof(struct vmi_ap_state));
405
406 ap.gdtr_limit = GDT_SIZE - 1;
407 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
408
409 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
410 ap.idtr_base = (unsigned long) idt_table;
411
412 ap.ldtr = 0;
413
414 ap.cs = __KERNEL_CS;
415 ap.eip = (unsigned long) start_eip;
416 ap.ss = __KERNEL_DS;
417 ap.esp = (unsigned long) start_esp;
418
419 ap.ds = __USER_DS;
420 ap.es = __USER_DS;
421 ap.fs = __KERNEL_PERCPU;
422 ap.gs = __KERNEL_STACK_CANARY;
423
424 ap.eflags = 0;
425
426#ifdef CONFIG_X86_PAE
427 /* efer should match BSP efer. */
428 if (cpu_has_nx) {
429 unsigned l, h;
430 rdmsr(MSR_EFER, l, h);
431 ap.efer = (unsigned long long) h << 32 | l;
432 }
433#endif
434
435 ap.cr3 = __pa(swapper_pg_dir);
436 /* Protected mode, paging, AM, WP, NE, MP. */
437 ap.cr0 = 0x80050023;
438 ap.cr4 = mmu_cr4_features;
439 vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
440}
441#endif
442
443static void vmi_start_context_switch(struct task_struct *prev)
444{
445 paravirt_start_context_switch(prev);
446 vmi_ops.set_lazy_mode(2);
447}
448
449static void vmi_end_context_switch(struct task_struct *next)
450{
451 vmi_ops.set_lazy_mode(0);
452 paravirt_end_context_switch(next);
453}
454
455static void vmi_enter_lazy_mmu(void)
456{
457 paravirt_enter_lazy_mmu();
458 vmi_ops.set_lazy_mode(1);
459}
460
461static void vmi_leave_lazy_mmu(void)
462{
463 vmi_ops.set_lazy_mode(0);
464 paravirt_leave_lazy_mmu();
465}
466
467static inline int __init check_vmi_rom(struct vrom_header *rom)
468{
469 struct pci_header *pci;
470 struct pnp_header *pnp;
471 const char *manufacturer = "UNKNOWN";
472 const char *product = "UNKNOWN";
473 const char *license = "unspecified";
474
475 if (rom->rom_signature != 0xaa55)
476 return 0;
477 if (rom->vrom_signature != VMI_SIGNATURE)
478 return 0;
479 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
480 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
481 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
482 rom->api_version_maj,
483 rom->api_version_min);
484 return 0;
485 }
486
487 /*
488 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
489 * the PCI header and device type to make sure this is really a
490 * VMI device.
491 */
492 if (!rom->pci_header_offs) {
493 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
494 return 0;
495 }
496
497 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
498 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
499 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
500 /* Allow it to run... anyway, but warn */
501 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
502 }
503
504 if (rom->pnp_header_offs) {
505 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
506 if (pnp->manufacturer_offset)
507 manufacturer = (const char *)rom+pnp->manufacturer_offset;
508 if (pnp->product_offset)
509 product = (const char *)rom+pnp->product_offset;
510 }
511
512 if (rom->license_offs)
513 license = (char *)rom+rom->license_offs;
514
515 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
516 manufacturer, product,
517 rom->api_version_maj, rom->api_version_min,
518 pci->rom_version_maj, pci->rom_version_min);
519
520 /* Don't allow BSD/MIT here for now because we don't want to end up
521 with any binary-only shim layers */
522 if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
523 printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
524 license);
525 return 0;
526 }
527
528 return 1;
529}
530
531/*
532 * Probe for the VMI option ROM
533 */
534static inline int __init probe_vmi_rom(void)
535{
536 unsigned long base;
537
538 /* VMI ROM is in option ROM area, check signature */
539 for (base = 0xC0000; base < 0xE0000; base += 2048) {
540 struct vrom_header *romstart;
541 romstart = (struct vrom_header *)isa_bus_to_virt(base);
542 if (check_vmi_rom(romstart)) {
543 vmi_rom = romstart;
544 return 1;
545 }
546 }
547 return 0;
548}
549
550/*
551 * VMI setup common to all processors
552 */
553void vmi_bringup(void)
554{
555 /* We must establish the lowmem mapping for MMU ops to work */
556 if (vmi_ops.set_linear_mapping)
557 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
558}
559
560/*
561 * Return a pointer to a VMI function or NULL if unimplemented
562 */
563static void *vmi_get_function(int vmicall)
564{
565 u64 reloc;
566 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
567 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
568 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
569 if (rel->type == VMI_RELOCATION_CALL_REL)
570 return (void *)rel->eip;
571 else
572 return NULL;
573}
574
575/*
576 * Helper macro for making the VMI paravirt-ops fill code readable.
577 * For unimplemented operations, fall back to default, unless nop
578 * is returned by the ROM.
579 */
580#define para_fill(opname, vmicall) \
581do { \
582 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
583 VMI_CALL_##vmicall); \
584 if (rel->type == VMI_RELOCATION_CALL_REL) \
585 opname = (void *)rel->eip; \
586 else if (rel->type == VMI_RELOCATION_NOP) \
587 opname = (void *)vmi_nop; \
588 else if (rel->type != VMI_RELOCATION_NONE) \
589 printk(KERN_WARNING "VMI: Unknown relocation " \
590 "type %d for " #vmicall"\n",\
591 rel->type); \
592} while (0)
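/*
 * e.g. para_fill(pv_cpu_ops.clts, CLTS) looks up VMI_CALL_CLTS in the ROM
 * and points pv_cpu_ops.clts at the ROM entry, at vmi_nop, or leaves the
 * native implementation in place, depending on the relocation type.
 */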
593
594/*
595 * Helper macro for making the VMI paravirt-ops fill code readable.
596 * For cached operations which do not match the VMI ROM ABI and must
597 * go through a translation stub. Ignore NOPs, since it is not clear that
598 * a NOP VMI function corresponds to a NOP paravirt-op when the
599 * functions are not in 1-1 correspondence.
600 */
601#define para_wrap(opname, wrapper, cache, vmicall) \
602do { \
603 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
604 VMI_CALL_##vmicall); \
605 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
606 if (rel->type == VMI_RELOCATION_CALL_REL) { \
607 opname = wrapper; \
608 vmi_ops.cache = (void *)rel->eip; \
609 } \
610} while (0)
611
612/*
613 * Activate the VMI interface and switch into paravirtualized mode
614 */
615static inline int __init activate_vmi(void)
616{
617 short kernel_cs;
618 u64 reloc;
619 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
620
621 /*
622 * Prevent page tables from being allocated in highmem, even if
623 * CONFIG_HIGHPTE is enabled.
624 */
625 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
626
627 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
628 printk(KERN_ERR "VMI ROM failed to initialize!");
629 return 0;
630 }
631 savesegment(cs, kernel_cs);
632
633 pv_info.paravirt_enabled = 1;
634 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
635 pv_info.name = "vmi [deprecated]";
636
637 pv_init_ops.patch = vmi_patch;
638
639 /*
640 * Many of these operations are ABI compatible with VMI.
641 * This means we can fill in the paravirt-ops with direct
642 * pointers into the VMI ROM. If the calling convention for
643 * these operations changes, this code needs to be updated.
644 *
645 * Exceptions
646 * CPUID paravirt-op uses pointers, not the native ISA
647 * halt has no VMI equivalent; all VMI halts are "safe"
648 * no MSR support yet - just trap and emulate. VMI uses the
649 * same ABI as the native ISA, but Linux wants exceptions
650 * from bogus MSR read / write handled
651 * rdpmc is not yet used in Linux
652 */
653
654 /* CPUID is special, so very special it gets wrapped like a present */
655 para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
656
657 para_fill(pv_cpu_ops.clts, CLTS);
658 para_fill(pv_cpu_ops.get_debugreg, GetDR);
659 para_fill(pv_cpu_ops.set_debugreg, SetDR);
660 para_fill(pv_cpu_ops.read_cr0, GetCR0);
661 para_fill(pv_mmu_ops.read_cr2, GetCR2);
662 para_fill(pv_mmu_ops.read_cr3, GetCR3);
663 para_fill(pv_cpu_ops.read_cr4, GetCR4);
664 para_fill(pv_cpu_ops.write_cr0, SetCR0);
665 para_fill(pv_mmu_ops.write_cr2, SetCR2);
666 para_fill(pv_mmu_ops.write_cr3, SetCR3);
667 para_fill(pv_cpu_ops.write_cr4, SetCR4);
668
669 para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
670 para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
671 para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
672 para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
673
674 para_fill(pv_cpu_ops.wbinvd, WBINVD);
675 para_fill(pv_cpu_ops.read_tsc, RDTSC);
676
677 /* The following we emulate with trap and emulate for now */
678 /* paravirt_ops.read_msr = vmi_rdmsr */
679 /* paravirt_ops.write_msr = vmi_wrmsr */
680 /* paravirt_ops.rdpmc = vmi_rdpmc */
681
682 /* TR interface doesn't pass TR value, wrap */
683 para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
684
685 /* LDT is special, too */
686 para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
687
688 para_fill(pv_cpu_ops.load_gdt, SetGDT);
689 para_fill(pv_cpu_ops.load_idt, SetIDT);
690 para_fill(pv_cpu_ops.store_gdt, GetGDT);
691 para_fill(pv_cpu_ops.store_idt, GetIDT);
692 para_fill(pv_cpu_ops.store_tr, GetTR);
693 pv_cpu_ops.load_tls = vmi_load_tls;
694 para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
695 write_ldt_entry, WriteLDTEntry);
696 para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
697 write_gdt_entry, WriteGDTEntry);
698 para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
699 write_idt_entry, WriteIDTEntry);
700 para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
701 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
702 para_fill(pv_cpu_ops.io_delay, IODelay);
703
704 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
705 set_lazy_mode, SetLazyMode);
706 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
707 set_lazy_mode, SetLazyMode);
708
709 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
710 set_lazy_mode, SetLazyMode);
711 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
712 set_lazy_mode, SetLazyMode);
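	/*
	 * All four SetLazyMode wrappers above share one ROM call and differ
	 * only in the mode argument: 2 while switching context, 1 for batched
	 * MMU updates, 0 to leave lazy mode.
	 */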
713
714 /* user and kernel flush are just handled with different flags to FlushTLB */
715 para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
716 para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
717 para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
718
719 /*
720 * Until a standard flag format can be agreed on, we need to
721 * implement these as wrappers in Linux. Get the VMI ROM
722 * function pointers for the two backend calls.
723 */
724#ifdef CONFIG_X86_PAE
725 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
726 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
727#else
728 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
729 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
730#endif
731
732 if (vmi_ops.set_pte) {
733 pv_mmu_ops.set_pte = vmi_set_pte;
734 pv_mmu_ops.set_pte_at = vmi_set_pte_at;
735 pv_mmu_ops.set_pmd = vmi_set_pmd;
736#ifdef CONFIG_X86_PAE
737 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
738 pv_mmu_ops.set_pud = vmi_set_pud;
739 pv_mmu_ops.pte_clear = vmi_pte_clear;
740 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
741#endif
742 }
743
744 if (vmi_ops.update_pte) {
745 pv_mmu_ops.pte_update = vmi_update_pte;
746 pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
747 }
748
749 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
750 if (vmi_ops.allocate_page) {
751 pv_mmu_ops.alloc_pte = vmi_allocate_pte;
752 pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
753 pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
754 }
755
756 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
757 if (vmi_ops.release_page) {
758 pv_mmu_ops.release_pte = vmi_release_pte;
759 pv_mmu_ops.release_pmd = vmi_release_pmd;
760 pv_mmu_ops.pgd_free = vmi_pgd_free;
761 }
762
763 /* Set linear is needed in all cases */
764 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
765
766 /*
767 * These MUST always be patched. Don't support indirect jumps
768 * through these operations, as the VMI interface may use either
769 * a jump or a call to get to these operations, depending on
770 * the backend. They are performance critical anyway, so requiring
771 * a patch is not a big problem.
772 */
773 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
774 pv_cpu_ops.iret = (void *)0xbadbab0;
775
776#ifdef CONFIG_SMP
777 para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
778#endif
779
780#ifdef CONFIG_X86_LOCAL_APIC
781 para_fill(apic->read, APICRead);
782 para_fill(apic->write, APICWrite);
783#endif
784
785 /*
786 * Check for VMI timer functionality by probing for a cycle frequency method
787 */
788 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
789 if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
790 vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
791 vmi_timer_ops.get_cycle_counter =
792 vmi_get_function(VMI_CALL_GetCycleCounter);
793 vmi_timer_ops.get_wallclock =
794 vmi_get_function(VMI_CALL_GetWallclockTime);
795 vmi_timer_ops.wallclock_updated =
796 vmi_get_function(VMI_CALL_WallclockUpdated);
797 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
798 vmi_timer_ops.cancel_alarm =
799 vmi_get_function(VMI_CALL_CancelAlarm);
800 x86_init.timers.timer_init = vmi_time_init;
801#ifdef CONFIG_X86_LOCAL_APIC
802 x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
803 x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
804#endif
805 pv_time_ops.sched_clock = vmi_sched_clock;
806 x86_platform.calibrate_tsc = vmi_tsc_khz;
807 x86_platform.get_wallclock = vmi_get_wallclock;
808 x86_platform.set_wallclock = vmi_set_wallclock;
809
810 /* We have true wallclock functions; disable CMOS clock sync */
811 no_sync_cmos_clock = 1;
812 } else {
813 disable_noidle = 1;
814 disable_vmi_timer = 1;
815 }
816
817 para_fill(pv_irq_ops.safe_halt, Halt);
818
819 /*
820 * Alternative instruction rewriting doesn't happen soon enough
821 * to convert VMI_IRET to a call instead of a jump; so we have
822 * to do this before IRQs get reenabled. Fortunately, it is
823 * idempotent.
824 */
825 apply_paravirt(__parainstructions, __parainstructions_end);
826
827 vmi_bringup();
828
829 return 1;
830}
831
832#undef para_fill
833
834void __init vmi_init(void)
835{
836 if (!vmi_rom)
837 probe_vmi_rom();
838 else
839 check_vmi_rom(vmi_rom);
840
841 /* In case probing for or validating the ROM failed, bail */
842 if (!vmi_rom)
843 return;
844
845 reserve_top_address(-vmi_rom->virtual_top);
846
847#ifdef CONFIG_X86_IO_APIC
848 /* This is virtual hardware; timer routing is wired correctly */
849 no_timer_check = 1;
850#endif
851}
852
853void __init vmi_activate(void)
854{
855 unsigned long flags;
856
857 if (!vmi_rom)
858 return;
859
860 local_irq_save(flags);
861 activate_vmi();
862 local_irq_restore(flags & X86_EFLAGS_IF);
863}
864
865static int __init parse_vmi(char *arg)
866{
867 if (!arg)
868 return -EINVAL;
869
870 if (!strcmp(arg, "disable_pge")) {
871 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
872 disable_pge = 1;
873 } else if (!strcmp(arg, "disable_pse")) {
874 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
875 disable_pse = 1;
876 } else if (!strcmp(arg, "disable_sep")) {
877 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
878 disable_sep = 1;
879 } else if (!strcmp(arg, "disable_tsc")) {
880 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
881 disable_tsc = 1;
882 } else if (!strcmp(arg, "disable_mtrr")) {
883 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
884 disable_mtrr = 1;
885 } else if (!strcmp(arg, "disable_timer")) {
886 disable_vmi_timer = 1;
887 disable_noidle = 1;
888 } else if (!strcmp(arg, "disable_noidle"))
889 disable_noidle = 1;
890 return 0;
891}
892
893early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 5e1ff66ecd73..000000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2007, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 */
22
23#include <linux/smp.h>
24#include <linux/interrupt.h>
25#include <linux/cpumask.h>
26#include <linux/clocksource.h>
27#include <linux/clockchips.h>
28
29#include <asm/vmi.h>
30#include <asm/vmi_time.h>
31#include <asm/apicdef.h>
32#include <asm/apic.h>
33#include <asm/timer.h>
34#include <asm/i8253.h>
35#include <asm/irq_vectors.h>
36
37#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
38#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
39
40static DEFINE_PER_CPU(struct clock_event_device, local_events);
41
42static inline u32 vmi_counter(u32 flags)
43{
44 /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
45 * cycle counter. */
46 return flags & VMI_ALARM_COUNTER_MASK;
47}
48
49/* paravirt_ops.get_wallclock = vmi_get_wallclock */
50unsigned long vmi_get_wallclock(void)
51{
52 unsigned long long wallclock;
53 wallclock = vmi_timer_ops.get_wallclock(); // nsec
54 (void)do_div(wallclock, 1000000000); // sec
55
56 return wallclock;
57}
58
59/* paravirt_ops.set_wallclock = vmi_set_wallclock */
60int vmi_set_wallclock(unsigned long now)
61{
62 return 0;
63}
64
65/* paravirt_ops.sched_clock = vmi_sched_clock */
66unsigned long long vmi_sched_clock(void)
67{
68 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
69}
70
71/* x86_platform.calibrate_tsc = vmi_tsc_khz */
72unsigned long vmi_tsc_khz(void)
73{
74 unsigned long long khz;
75 khz = vmi_timer_ops.get_cycle_frequency();
76 (void)do_div(khz, 1000);
77 return khz;
78}
79
80static inline unsigned int vmi_get_timer_vector(void)
81{
82 return IRQ0_VECTOR;
83}
84
85/** vmi clockchip */
86#ifdef CONFIG_X86_LOCAL_APIC
87static unsigned int startup_timer_irq(unsigned int irq)
88{
89 unsigned long val = apic_read(APIC_LVTT);
90 apic_write(APIC_LVTT, vmi_get_timer_vector());
91
92 return (val & APIC_SEND_PENDING);
93}
94
95static void mask_timer_irq(unsigned int irq)
96{
97 unsigned long val = apic_read(APIC_LVTT);
98 apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
99}
100
101static void unmask_timer_irq(unsigned int irq)
102{
103 unsigned long val = apic_read(APIC_LVTT);
104 apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
105}
106
107static void ack_timer_irq(unsigned int irq)
108{
109 ack_APIC_irq();
110}
111
112static struct irq_chip vmi_chip __read_mostly = {
113 .name = "VMI-LOCAL",
114 .startup = startup_timer_irq,
115 .mask = mask_timer_irq,
116 .unmask = unmask_timer_irq,
117 .ack = ack_timer_irq
118};
119#endif
120
121/** vmi clockevent */
122#define VMI_ALARM_WIRED_IRQ0 0x00000000
123#define VMI_ALARM_WIRED_LVTT 0x00010000
124static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
125
126static inline int vmi_get_alarm_wiring(void)
127{
128 return vmi_wiring;
129}
130
131static void vmi_timer_set_mode(enum clock_event_mode mode,
132 struct clock_event_device *evt)
133{
134 cycle_t now, cycles_per_hz;
135 BUG_ON(!irqs_disabled());
136
137 switch (mode) {
138 case CLOCK_EVT_MODE_ONESHOT:
139 case CLOCK_EVT_MODE_RESUME:
140 break;
141 case CLOCK_EVT_MODE_PERIODIC:
142 cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
143 (void)do_div(cycles_per_hz, HZ);
144 now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
145 vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
146 break;
147 case CLOCK_EVT_MODE_UNUSED:
148 case CLOCK_EVT_MODE_SHUTDOWN:
149 switch (evt->mode) {
150 case CLOCK_EVT_MODE_ONESHOT:
151 vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
152 break;
153 case CLOCK_EVT_MODE_PERIODIC:
154 vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
155 break;
156 default:
157 break;
158 }
159 break;
160 default:
161 break;
162 }
163}
164
165static int vmi_timer_next_event(unsigned long delta,
166 struct clock_event_device *evt)
167{
168 /* Unfortunately, set_next_event interface only passes relative
169 * expiry, but we want absolute expiry. It'd be better if we
170 * were passed an absolute expiry, since a bunch of time may
171 * have been stolen between the time the delta is computed and
172 * when we set the alarm below. */
173 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
174
175 BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
176 vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
177 return 0;
178}
179
180static struct clock_event_device vmi_clockevent = {
181 .name = "vmi-timer",
182 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
183 .shift = 22,
184 .set_mode = vmi_timer_set_mode,
185 .set_next_event = vmi_timer_next_event,
186 .rating = 1000,
187 .irq = 0,
188};
189
190static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
191{
192 struct clock_event_device *evt = &__get_cpu_var(local_events);
193 evt->event_handler(evt);
194 return IRQ_HANDLED;
195}
196
197static struct irqaction vmi_clock_action = {
198 .name = "vmi-timer",
199 .handler = vmi_timer_interrupt,
200 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
201};
202
203static void __devinit vmi_time_init_clockevent(void)
204{
205 cycle_t cycles_per_msec;
206 struct clock_event_device *evt;
207
208 int cpu = smp_processor_id();
209 evt = &__get_cpu_var(local_events);
210
211 /* Use cycles_per_msec since div_sc params are 32-bits. */
212 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
213 (void)do_div(cycles_per_msec, 1000);
214
215 memcpy(evt, &vmi_clockevent, sizeof(*evt));
216 /* Must pick .shift such that .mult fits in 32-bits. Choosing
217 * .shift to be 22 allows 2^(32-22) cycles per nanosecond
218 * before overflow. */
219 evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
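	/*
	 * e.g. for a hypothetical 1 GHz cycle counter cycles_per_msec is 10^6,
	 * so mult = (10^6 << 22) / NSEC_PER_MSEC = 1 << 22, well within 32 bits.
	 */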
220 /* Upper bound is clockevent's use of ulong for cycle deltas. */
221 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
222 evt->min_delta_ns = clockevent_delta2ns(1, evt);
223 evt->cpumask = cpumask_of(cpu);
224
225 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
226 evt->name, evt->mult, evt->shift);
227 clockevents_register_device(evt);
228}
229
230void __init vmi_time_init(void)
231{
232 unsigned int cpu;
233 /* Disable PIT: BIOSes start PIT CH0 with an 18.2 Hz periodic tick. */
234 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
235
236 vmi_time_init_clockevent();
237 setup_irq(0, &vmi_clock_action);
238 for_each_possible_cpu(cpu)
239 per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
240}
241
242#ifdef CONFIG_X86_LOCAL_APIC
243void __devinit vmi_time_bsp_init(void)
244{
245 /*
246 * On APIC systems, we want local timers to fire on each cpu. We do
247 * this by programming LVTT to deliver timer events to the IRQ handler
248 * for IRQ-0, since we can't re-use the APIC local timer handler
249 * without interfering with that code.
250 */
251 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
252 local_irq_disable();
253#ifdef CONFIG_SMP
254 /*
255 * XXX handle_percpu_irq only defined for SMP; we need to switch over
256 * to using it, since this is a local interrupt, which each CPU must
257 * handle individually without locking out or dropping simultaneous
258 * local timers on other CPUs. We also don't want to trigger the
259 * quirk workaround code for interrupts which gets invoked from
260 * handle_percpu_irq via eoi, so we use our own IRQ chip.
261 */
262 set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
263#else
264 set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
265#endif
266 vmi_wiring = VMI_ALARM_WIRED_LVTT;
267 apic_write(APIC_LVTT, vmi_get_timer_vector());
268 local_irq_enable();
269 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
270}
271
272void __devinit vmi_time_ap_init(void)
273{
274 vmi_time_init_clockevent();
275 apic_write(APIC_LVTT, vmi_get_timer_vector());
276}
277#endif
278
279/** vmi clocksource */
280static struct clocksource clocksource_vmi;
281
282static cycle_t read_real_cycles(struct clocksource *cs)
283{
284 cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
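	/*
	 * Never return a value behind cycle_last; this keeps the clocksource
	 * monotonic even if the cycle counters seen on different CPUs drift
	 * slightly apart.
	 */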
285 return max(ret, clocksource_vmi.cycle_last);
286}
287
288static struct clocksource clocksource_vmi = {
289 .name = "vmi-timer",
290 .rating = 450,
291 .read = read_real_cycles,
292 .mask = CLOCKSOURCE_MASK(64),
293 .mult = 0, /* to be set */
294 .shift = 22,
295 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
296};
297
298static int __init init_vmi_clocksource(void)
299{
300 cycle_t cycles_per_msec;
301
302 if (!vmi_timer_ops.get_cycle_frequency)
303 return 0;
304 /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
305 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
306 (void)do_div(cycles_per_msec, 1000);
307
308 /* Note that clocksource.{mult, shift} converts in the opposite direction
309 * from clockevents: cycles to nanoseconds rather than nanoseconds to cycles. */
310 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
311 clocksource_vmi.shift);
312
313 printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
314 return clocksource_register(&clocksource_vmi);
315
316}
317module_init(init_vmi_clocksource);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d0bb52296fa3..89aed99aafce 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
69 69
70PHDRS { 70PHDRS {
71 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
72 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(6); /* RW_ */
73#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
74 user PT_LOAD FLAGS(5); /* R_E */ 74 user PT_LOAD FLAGS(5); /* R_E */
75#ifdef CONFIG_SMP 75#ifdef CONFIG_SMP
@@ -105,6 +105,7 @@ SECTIONS
105 SCHED_TEXT 105 SCHED_TEXT
106 LOCK_TEXT 106 LOCK_TEXT
107 KPROBES_TEXT 107 KPROBES_TEXT
108 ENTRY_TEXT
108 IRQENTRY_TEXT 109 IRQENTRY_TEXT
109 *(.fixup) 110 *(.fixup)
110 *(.gnu.warning) 111 *(.gnu.warning)
@@ -116,6 +117,10 @@ SECTIONS
116 117
117 EXCEPTION_TABLE(16) :text = 0x9090 118 EXCEPTION_TABLE(16) :text = 0x9090
118 119
120#if defined(CONFIG_DEBUG_RODATA)
121 /* .text should occupy whole number of pages */
122 . = ALIGN(PAGE_SIZE);
123#endif
119 X64_ALIGN_DEBUG_RODATA_BEGIN 124 X64_ALIGN_DEBUG_RODATA_BEGIN
120 RO_DATA(PAGE_SIZE) 125 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END 126 X64_ALIGN_DEBUG_RODATA_END
@@ -156,6 +161,12 @@ SECTIONS
156 161
157#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) 162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
158#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
164#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
165 ADDR(.vsyscall_0) + offset \
166 : AT(VLOAD(.vsyscall_var_ ## x)) { \
167 *(.vsyscall_var_ ## x) \
168 } \
169 x = VVIRT(.vsyscall_var_ ## x);
159 170
160 . = ALIGN(4096); 171 . = ALIGN(4096);
161 __vsyscall_0 = .; 172 __vsyscall_0 = .;
@@ -170,18 +181,6 @@ SECTIONS
170 *(.vsyscall_fn) 181 *(.vsyscall_fn)
171 } 182 }
172 183
173 . = ALIGN(L1_CACHE_BYTES);
174 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
175 *(.vsyscall_gtod_data)
176 }
177
178 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
179 .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
180 *(.vsyscall_clock)
181 }
182 vsyscall_clock = VVIRT(.vsyscall_clock);
183
184
185 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { 184 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
186 *(.vsyscall_1) 185 *(.vsyscall_1)
187 } 186 }
@@ -189,21 +188,14 @@ SECTIONS
189 *(.vsyscall_2) 188 *(.vsyscall_2)
190 } 189 }
191 190
192 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
193 *(.vgetcpu_mode)
194 }
195 vgetcpu_mode = VVIRT(.vgetcpu_mode);
196
197 . = ALIGN(L1_CACHE_BYTES);
198 .jiffies : AT(VLOAD(.jiffies)) {
199 *(.jiffies)
200 }
201 jiffies = VVIRT(.jiffies);
202
203 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { 191 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
204 *(.vsyscall_3) 192 *(.vsyscall_3)
205 } 193 }
206 194
195#define __VVAR_KERNEL_LDS
196#include <asm/vvar.h>
197#undef __VVAR_KERNEL_LDS
198
207 . = __vsyscall_0 + PAGE_SIZE; 199 . = __vsyscall_0 + PAGE_SIZE;
208 200
209#undef VSYSCALL_ADDR 201#undef VSYSCALL_ADDR
@@ -211,6 +203,7 @@ SECTIONS
211#undef VLOAD 203#undef VLOAD
212#undef VVIRT_OFFSET 204#undef VVIRT_OFFSET
213#undef VVIRT 205#undef VVIRT
206#undef EMIT_VVAR
214 207
215#endif /* CONFIG_X86_64 */ 208#endif /* CONFIG_X86_64 */
216 209
@@ -226,7 +219,7 @@ SECTIONS
226 * output PHDR, so the next output section - .init.text - should 219 * output PHDR, so the next output section - .init.text - should
227 * start another segment - init. 220 * start another segment - init.
228 */ 221 */
229 PERCPU_VADDR(0, :percpu) 222 PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
230#endif 223#endif
231 224
232 INIT_TEXT_SECTION(PAGE_SIZE) 225 INIT_TEXT_SECTION(PAGE_SIZE)
@@ -236,12 +229,30 @@ SECTIONS
236 229
237 INIT_DATA_SECTION(16) 230 INIT_DATA_SECTION(16)
238 231
232 /*
233 * Code and data for a variety of lowlevel trampolines, to be
234 * copied into base memory (< 1 MiB) during initialization.
235 * Since it is copied early, the main copy can be discarded
236 * afterwards.
237 */
238 .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
239 x86_trampoline_start = .;
240 *(.x86_trampoline)
241 x86_trampoline_end = .;
242 }
243
239 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 244 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
240 __x86_cpu_dev_start = .; 245 __x86_cpu_dev_start = .;
241 *(.x86_cpu_dev.init) 246 *(.x86_cpu_dev.init)
242 __x86_cpu_dev_end = .; 247 __x86_cpu_dev_end = .;
243 } 248 }
244 249
250 /*
251 * start address and size of operations which during runtime
252 * can be patched with virtualization friendly instructions or
253 * baremetal native ones. Think page table operations.
254 * Details in paravirt_types.h
255 */
245 . = ALIGN(8); 256 . = ALIGN(8);
246 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 257 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
247 __parainstructions = .; 258 __parainstructions = .;
@@ -249,6 +260,11 @@ SECTIONS
249 __parainstructions_end = .; 260 __parainstructions_end = .;
250 } 261 }
251 262
263 /*
264 * struct alt_inst entries. From the header (alternative.h):
265 * "Alternative instructions for different CPU types or capabilities"
266 * Think locking instructions on spinlocks.
267 */
252 . = ALIGN(8); 268 . = ALIGN(8);
253 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 269 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
254 __alt_instructions = .; 270 __alt_instructions = .;
@@ -256,11 +272,36 @@ SECTIONS
256 __alt_instructions_end = .; 272 __alt_instructions_end = .;
257 } 273 }
258 274
275 /*
276 * And here are the replacement instructions. The linker sticks
277 * them as binary blobs. The .altinstructions has enough data to
278 * get the address and the length of them to patch the kernel safely.
279 */
259 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 280 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
260 *(.altinstr_replacement) 281 *(.altinstr_replacement)
261 } 282 }
262 283
263 /* 284 /*
285 * struct iommu_table_entry entries are injected in this section.
286 * It is an array of IOMMUs which during run time gets sorted depending
287 * on its dependency order. After rootfs_initcall is complete
288 * this section can be safely removed.
289 */
290 .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) {
291 __iommu_table = .;
292 *(.iommu_table)
293 __iommu_table_end = .;
294 }
295
296 . = ALIGN(8);
297 .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
298 __apicdrivers = .;
299 *(.apicdrivers);
300 __apicdrivers_end = .;
301 }
302
303 . = ALIGN(8);
304 /*
264 * .exit.text is discard at runtime, not link time, to deal with 305 * .exit.text is discard at runtime, not link time, to deal with
265 * references from .altinstructions and .eh_frame 306 * references from .altinstructions and .eh_frame
266 */ 307 */
@@ -273,7 +314,7 @@ SECTIONS
273 } 314 }
274 315
275#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 316#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
276 PERCPU(PAGE_SIZE) 317 PERCPU_SECTION(INTERNODE_CACHE_BYTES)
277#endif 318#endif
278 319
279 . = ALIGN(PAGE_SIZE); 320 . = ALIGN(PAGE_SIZE);
@@ -307,7 +348,7 @@ SECTIONS
307 __bss_start = .; 348 __bss_start = .;
308 *(.bss..page_aligned) 349 *(.bss..page_aligned)
309 *(.bss) 350 *(.bss)
310 . = ALIGN(4); 351 . = ALIGN(PAGE_SIZE);
311 __bss_stop = .; 352 __bss_stop = .;
312 } 353 }
313 354
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
new file mode 100644
index 000000000000..a81aa9e9894c
--- /dev/null
+++ b/arch/x86/kernel/vread_tsc_64.c
@@ -0,0 +1,36 @@
1/* This code runs in userspace. */
2
3#define DISABLE_BRANCH_PROFILING
4#include <asm/vgtod.h>
5
6notrace cycle_t __vsyscall_fn vread_tsc(void)
7{
8 cycle_t ret;
9 u64 last;
10
11 /*
12 * Empirically, a fence (of type that depends on the CPU)
13 * before rdtsc is enough to ensure that rdtsc is ordered
14 * with respect to loads. The various CPU manuals are unclear
15 * as to whether rdtsc can be reordered with later loads,
16 * but no one has ever seen it happen.
17 */
18 rdtsc_barrier();
19 ret = (cycle_t)vget_cycles();
20
21 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
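	/*
	 * cycle_last is the most recent TSC value the timekeeping code has
	 * accounted for; clamping to it below keeps the vsyscall clock
	 * monotonic.
	 */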
22
23 if (likely(ret >= last))
24 return ret;
25
26 /*
27 * GCC likes to generate cmov here, but this branch is extremely
28 * predictable (it's just a function of time and the likely is
29 * very likely) and there's a data dependence, so force GCC
30 * to generate a branch instead. I don't barrier() because
31 * we don't actually need a barrier, and if this function
32 * ever gets inlined it will generate worse code.
33 */
34 asm volatile ("");
35 return last;
36}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b694..3e682184d76c 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -49,17 +49,10 @@
49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace 49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
50#define __syscall_clobber "r11","cx","memory" 50#define __syscall_clobber "r11","cx","memory"
51 51
52/* 52DEFINE_VVAR(int, vgetcpu_mode);
53 * vsyscall_gtod_data contains data that is : 53DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
54 * - readonly from vsyscalls
55 * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
56 * Try to keep this structure as small as possible to avoid cache line ping pongs
57 */
58int __vgetcpu_mode __section_vgetcpu_mode;
59
60struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
61{ 54{
62 .lock = SEQLOCK_UNLOCKED, 55 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
63 .sysctl_enabled = 1, 56 .sysctl_enabled = 1,
64}; 57};
65 58
@@ -97,7 +90,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
97 */ 90 */
98static __always_inline void do_get_tz(struct timezone * tz) 91static __always_inline void do_get_tz(struct timezone * tz)
99{ 92{
100 *tz = __vsyscall_gtod_data.sys_tz; 93 *tz = VVAR(vsyscall_gtod_data).sys_tz;
101} 94}
102 95
103static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -126,23 +119,24 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
126 unsigned long mult, shift, nsec; 119 unsigned long mult, shift, nsec;
127 cycle_t (*vread)(void); 120 cycle_t (*vread)(void);
128 do { 121 do {
129 seq = read_seqbegin(&__vsyscall_gtod_data.lock); 122 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
130 123
131 vread = __vsyscall_gtod_data.clock.vread; 124 vread = VVAR(vsyscall_gtod_data).clock.vread;
132 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { 125 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
126 !vread)) {
133 gettimeofday(tv,NULL); 127 gettimeofday(tv,NULL);
134 return; 128 return;
135 } 129 }
136 130
137 now = vread(); 131 now = vread();
138 base = __vsyscall_gtod_data.clock.cycle_last; 132 base = VVAR(vsyscall_gtod_data).clock.cycle_last;
139 mask = __vsyscall_gtod_data.clock.mask; 133 mask = VVAR(vsyscall_gtod_data).clock.mask;
140 mult = __vsyscall_gtod_data.clock.mult; 134 mult = VVAR(vsyscall_gtod_data).clock.mult;
141 shift = __vsyscall_gtod_data.clock.shift; 135 shift = VVAR(vsyscall_gtod_data).clock.shift;
142 136
143 tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; 137 tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
144 nsec = __vsyscall_gtod_data.wall_time_nsec; 138 nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
145 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); 139 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
146 140
147 /* calculate interval: */ 141 /* calculate interval: */
148 cycle_delta = (now - base) & mask; 142 cycle_delta = (now - base) & mask;
@@ -171,15 +165,15 @@ time_t __vsyscall(1) vtime(time_t *t)
171{ 165{
172 unsigned seq; 166 unsigned seq;
173 time_t result; 167 time_t result;
174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 168 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
175 return time_syscall(t); 169 return time_syscall(t);
176 170
177 do { 171 do {
178 seq = read_seqbegin(&__vsyscall_gtod_data.lock); 172 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
179 173
180 result = __vsyscall_gtod_data.wall_time_sec; 174 result = VVAR(vsyscall_gtod_data).wall_time_sec;
181 175
182 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); 176 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
183 177
184 if (t) 178 if (t)
185 *t = result; 179 *t = result;
@@ -208,9 +202,9 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
208 We do this here because otherwise user space would do it on 202 We do this here because otherwise user space would do it on
209 its own in a likely inferior way (no access to jiffies). 203 its own in a likely inferior way (no access to jiffies).
210 If you don't like it pass NULL. */ 204 If you don't like it pass NULL. */
211 if (tcache && tcache->blob[0] == (j = __jiffies)) { 205 if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
212 p = tcache->blob[1]; 206 p = tcache->blob[1];
213 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { 207 } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
214 /* Load per CPU data from RDTSCP */ 208 /* Load per CPU data from RDTSCP */
215 native_read_tscp(&p); 209 native_read_tscp(&p);
216 } else { 210 } else {
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1b950d151e58..9796c2f3d074 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(memmove);
 
 EXPORT_SYMBOL(empty_zero_page);
 #ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index cd6da6bf3eca..6f164bd5e14d 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -6,10 +6,12 @@
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/module.h>
+#include <linux/pci.h>
 
 #include <asm/bios_ebda.h>
 #include <asm/paravirt.h>
 #include <asm/pci_x86.h>
+#include <asm/pci.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
 #include <asm/apic.h>
@@ -33,7 +35,7 @@ void iommu_shutdown_noop(void) { }
 struct x86_init_ops x86_init __initdata = {
 
 	.resources = {
-		.probe_roms = x86_init_noop,
+		.probe_roms = probe_roms,
 		.reserve_resources = reserve_standard_io_resources,
 		.memory_setup = default_machine_specific_memory_setup,
 	},
@@ -59,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
 		.banner = default_banner,
 	},
 
+	.mapping = {
+		.pagetable_reserve = native_pagetable_reserve,
+	},
+
 	.paging = {
 		.pagetable_setup_start = native_pagetable_setup_start,
 		.pagetable_setup_done = native_pagetable_setup_done,
@@ -68,6 +74,7 @@ struct x86_init_ops x86_init __initdata = {
 		.setup_percpu_clockev = setup_boot_APIC_clock,
 		.tsc_pre_init = x86_init_noop,
 		.timer_init = hpet_time_init,
+		.wallclock_init = x86_init_noop,
 	},
 
 	.iommu = {
@@ -99,3 +106,8 @@ struct x86_platform_ops x86_platform = {
 };
 
 EXPORT_SYMBOL_GPL(x86_platform);
+struct x86_msi_ops x86_msi = {
+	.setup_msi_irqs = native_setup_msi_irqs,
+	.teardown_msi_irq = native_teardown_msi_irq,
+	.teardown_msi_irqs = default_teardown_msi_irqs,
+};
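The x86_init.c hunks above add new default hooks (probe_roms, pagetable_reserve, wallclock_init) and introduce an x86_msi ops table whose entries default to the native MSI routines. The value of such a table is that a platform can replace individual function pointers at boot without patching the call sites. A minimal stand-alone sketch of that pattern follows (illustrative only, not kernel code; every name in it is invented):

/* Ops-table pattern: native defaults that a platform may override. */
#include <stdio.h>

struct demo_msi_ops {
	int  (*setup_msi_irqs)(int dev, int nvec);
	void (*teardown_msi_irq)(int irq);
};

static int native_setup(int dev, int nvec)
{
	printf("native setup: dev=%d nvec=%d\n", dev, nvec);
	return 0;
}

static void native_teardown(int irq)
{
	printf("native teardown: irq=%d\n", irq);
}

/* Defaults, analogous to the native_* entries installed above. */
static struct demo_msi_ops demo_msi = {
	.setup_msi_irqs   = native_setup,
	.teardown_msi_irq = native_teardown,
};

static int para_setup(int dev, int nvec)
{
	printf("overridden setup: dev=%d nvec=%d\n", dev, nvec);
	return 0;
}

int main(void)
{
	demo_msi.setup_msi_irqs(1, 2);		/* default path */
	demo_msi.setup_msi_irqs = para_setup;	/* a platform swaps in its own hook */
	demo_msi.setup_msi_irqs(1, 2);		/* overridden path */
	demo_msi.teardown_msi_irq(42);
	return 0;
}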
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 9c253bd65e24..a3911343976b 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -53,7 +53,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
 
 	/*
 	 * None of the feature bits are in init state. So nothing else
-	 * to do for us, as the memory layout is upto date.
+	 * to do for us, as the memory layout is up to date.
 	 */
 	if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
 		return;
@@ -394,7 +394,8 @@ static void __init setup_xstate_init(void)
 	 * Setup init_xstate_buf to represent the init state of
 	 * all the features managed by the xsave
 	 */
-	init_xstate_buf = alloc_bootmem(xstate_size);
+	init_xstate_buf = alloc_bootmem_align(xstate_size,
+					      __alignof__(struct xsave_struct));
 	init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
 
 	clts();
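The xsave.c hunk above replaces a plain alloc_bootmem() with alloc_bootmem_align(), so that init_xstate_buf is allocated with the alignment that struct xsave_struct itself declares rather than whatever the allocator happens to return. The stand-alone C sketch below (illustrative only, not from the patch; the type and sizes are invented) shows the same idea in user space, allocating a buffer with the alignment required by its type:

/* Allocate storage that satisfies the type's declared alignment. */
#define _POSIX_C_SOURCE 200112L
#include <stdalign.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for a structure with a strict alignment requirement. */
struct demo_xsave {
	alignas(64) unsigned char area[832];
};

int main(void)
{
	void *buf;

	/* posix_memalign(): alignment must be a power-of-two multiple of sizeof(void *). */
	if (posix_memalign(&buf, alignof(struct demo_xsave), sizeof(struct demo_xsave)))
		return 1;

	printf("buffer at %p, alignment %zu\n", buf, (size_t)alignof(struct demo_xsave));
	free(buf);
	return 0;
}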