Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 3
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 2
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S | 2
-rw-r--r--  arch/x86/kernel/alternative.c | 1
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 12
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 67
-rw-r--r--  arch/x86/kernel/apb_timer.c | 37
-rw-r--r--  arch/x86/kernel/aperture_64.c | 4
-rw-r--r--  arch/x86/kernel/apic/Makefile | 7
-rw-r--r--  arch/x86/kernel/apic/apic.c | 2
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 107
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 15
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 7
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 10
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 77
-rw-r--r--  arch/x86/kernel/cpu/cmpxchg.c | 72
-rw-r--r--  arch/x86/kernel/cpu/common.c | 30
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 20
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 3
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 1
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 108
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 34
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 213
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 56
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 133
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 96
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 171
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 64
-rw-r--r--  arch/x86/kernel/cpu/topology.c (renamed from arch/x86/kernel/cpu/addon_cpuid_features.c) | 58
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 9
-rw-r--r--  arch/x86/kernel/crash.c | 3
-rw-r--r--  arch/x86/kernel/dumpstack.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack.h | 56
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 2
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 1
-rw-r--r--  arch/x86/kernel/early-quirks.c | 18
-rw-r--r--  arch/x86/kernel/entry_32.S | 16
-rw-r--r--  arch/x86/kernel/entry_64.S | 13
-rw-r--r--  arch/x86/kernel/head_32.S | 14
-rw-r--r--  arch/x86/kernel/head_64.S | 5
-rw-r--r--  arch/x86/kernel/hpet.c | 48
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 79
-rw-r--r--  arch/x86/kernel/i387.c | 40
-rw-r--r--  arch/x86/kernel/kgdb.c | 191
-rw-r--r--  arch/x86/kernel/kprobes.c | 58
-rw-r--r--  arch/x86/kernel/module.c | 3
-rw-r--r--  arch/x86/kernel/mpparse.c | 16
-rw-r--r--  arch/x86/kernel/mrst.c | 105
-rw-r--r--  arch/x86/kernel/olpc.c | 20
-rw-r--r--  arch/x86/kernel/olpc_ofw.c | 106
-rw-r--r--  arch/x86/kernel/pci-dma.c | 7
-rw-r--r--  arch/x86/kernel/process.c | 50
-rw-r--r--  arch/x86/kernel/process_32.c | 4
-rw-r--r--  arch/x86/kernel/process_64.c | 5
-rw-r--r--  arch/x86/kernel/setup.c | 8
-rw-r--r--  arch/x86/kernel/smpboot.c | 66
-rw-r--r--  arch/x86/kernel/stacktrace.c | 31
-rw-r--r--  arch/x86/kernel/sys_i386_32.c | 4
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 3
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 760
-rw-r--r--  arch/x86/kernel/trampoline.c | 17
-rw-r--r--  arch/x86/kernel/traps.c | 7
-rw-r--r--  arch/x86/kernel/tsc.c | 43
-rw-r--r--  arch/x86/kernel/verify_cpu_64.S | 3
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 17
-rw-r--r--  arch/x86/kernel/xsave.c | 195
76 files changed, 2329 insertions(+), 1142 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77b22083721..fedf32a8c3ec 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -11,6 +11,8 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
 CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_pvclock.o = -pg
+CFLAGS_REMOVE_kvmclock.o = -pg
 CFLAGS_REMOVE_ftrace.o = -pg
 CFLAGS_REMOVE_early_printk.o = -pg
 endif
@@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200) += scx200.o
 scx200-y += scx200_32.o
 
 obj-$(CONFIG_OLPC) += olpc.o
+obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
 obj-$(CONFIG_X86_MRST) += mrst.o
 
 microcode-y := microcode_core.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb7a5f052e2b..fb16f17e59be 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -61,7 +61,7 @@ struct cstate_entry {
         unsigned int ecx;
 } states[ACPI_PROCESSOR_MAX_POWER];
 };
-static struct cstate_entry *cpu_cstate_entry;   /* per CPU ptr */
+static struct cstate_entry __percpu *cpu_cstate_entry;  /* per CPU ptr */
 
 static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
 
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e296010..28595d6df47c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
         movl    %eax, %ecx
         orl     %edx, %ecx
         jz      1f
-        movl    $0xc0000080, %ecx
+        movl    $MSR_EFER, %ecx
         wrmsr
 1:
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 70237732a6c7..f65ab8b014c4 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
         u8 *instr = a->instr;
         BUG_ON(a->replacementlen > a->instrlen);
         BUG_ON(a->instrlen > sizeof(insnbuf));
+        BUG_ON(a->cpuid >= NCAPINTS*32);
         if (!boot_cpu_has(a->cpuid))
             continue;
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0d20286d78c6..679b6450382b 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1953,6 +1953,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
                            size_t size,
                            int dir)
 {
+        dma_addr_t flush_addr;
         dma_addr_t i, start;
         unsigned int pages;
 
@@ -1960,6 +1961,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
             (dma_addr + size > dma_dom->aperture_size))
                 return;
 
+        flush_addr = dma_addr;
         pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
         dma_addr &= PAGE_MASK;
         start = dma_addr;
@@ -1974,7 +1976,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
         dma_ops_free_addresses(dma_dom, dma_addr, pages);
 
         if (amd_iommu_unmap_flush || dma_dom->need_flush) {
-                iommu_flush_pages(&dma_dom->domain, dma_addr, size);
+                iommu_flush_pages(&dma_dom->domain, flush_addr, size);
                 dma_dom->need_flush = false;
         }
 }
@@ -2572,6 +2574,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
                                     unsigned long cap)
 {
+        switch (cap) {
+        case IOMMU_CAP_CACHE_COHERENCY:
+                return 1;
+        }
+
         return 0;
 }
 
@@ -2609,8 +2616,7 @@ int __init amd_iommu_init_passthrough(void)
 
         pt_domain->mode |= PAGE_MODE_NONE;
 
-        while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-
+        for_each_pci_dev(dev) {
                 if (!check_device(&dev->dev))
                         continue;
 
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 3cc63e2b8dd4..5a170cbbbed8 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -632,6 +632,13 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
         iommu->last_device = calc_devid(MMIO_GET_BUS(range),
                                         MMIO_GET_LD(range));
         iommu->evt_msi_num = MMIO_MSI_NUM(misc);
+
+        if (is_rd890_iommu(iommu->dev)) {
+                pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]);
+                pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]);
+                pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]);
+                pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]);
+        }
 }
 
 /*
@@ -649,29 +656,9 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
         struct ivhd_entry *e;
 
         /*
-         * First set the recommended feature enable bits from ACPI
-         * into the IOMMU control registers
+         * First save the recommended feature enable bits from ACPI
          */
-        h->flags & IVHD_FLAG_HT_TUN_EN_MASK ?
-                iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
-                iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
-        h->flags & IVHD_FLAG_PASSPW_EN_MASK ?
-                iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
-                iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
-        h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
-                iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
-                iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
-        h->flags & IVHD_FLAG_ISOC_EN_MASK ?
-                iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
-                iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
-        /*
-         * make IOMMU memory accesses cache coherent
-         */
-        iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+        iommu->acpi_flags = h->flags;
 
         /*
          * Done. Now parse the device entries
@@ -1116,6 +1103,40 @@ static void init_device_table(void)
         }
 }
 
+static void iommu_init_flags(struct amd_iommu *iommu)
+{
+        iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
+                iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+                iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+        iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
+                iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+                iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+        iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
+                iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+                iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+        iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
+                iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+                iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+        /*
+         * make IOMMU memory accesses cache coherent
+         */
+        iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+}
+
+static void iommu_apply_quirks(struct amd_iommu *iommu)
+{
+        if (is_rd890_iommu(iommu->dev)) {
+                pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
+                pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
+                pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
+                pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
+        }
+}
+
 /*
  * This function finally enables all IOMMUs found in the system after
  * they have been initialized
@@ -1126,6 +1147,8 @@ static void enable_iommus(void)
 
         for_each_iommu(iommu) {
                 iommu_disable(iommu);
+                iommu_apply_quirks(iommu);
+                iommu_init_flags(iommu);
                 iommu_set_device_table(iommu);
                 iommu_enable_command_buffer(iommu);
                 iommu_enable_event_buffer(iommu);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d36..8dd77800ff5d 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
 
 #include <asm/fixmap.h>
 #include <asm/apb_timer.h>
+#include <asm/mrst.h>
 
 #define APBT_MASK                       CLOCKSOURCE_MASK(32)
 #define APBT_SHIFT                      22
-#define APBT_CLOCKEVENT_RATING          150
+#define APBT_CLOCKEVENT_RATING          110
 #define APBT_CLOCKSOURCE_RATING         250
 #define APBT_MIN_DELTA_USEC             200
 
@@ -83,8 +84,6 @@ struct apbt_dev {
         char name[10];
 };
 
-int disable_apbt_percpu __cpuinitdata;
-
 static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
 
 #ifdef CONFIG_SMP
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = {
 };
 
 /*
- * if user does not want to use per CPU apb timer, just give it a lower rating
- * than local apic timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_mrst_timer(char *arg)
-{
-        if (!arg)
-                return -EINVAL;
-
-        if (strcmp("apbt_only", arg) == 0)
-                disable_apbt_percpu = 0;
-        else if (strcmp("lapic_and_apbt", arg) == 0)
-                disable_apbt_percpu = 1;
-        else {
-                pr_warning("X86 MRST timer option %s not recognised"
-                           " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
-                           arg);
-                return -EINVAL;
-        }
-        return 0;
-}
-__setup("x86_mrst_timer=", setup_x86_mrst_timer);
-
-/*
  * start count down from 0xffff_ffff. this is done by toggling the enable bit
  * then load initial load count to ~0.
  */
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void)
         adev->num = smp_processor_id();
         memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
 
-        if (disable_apbt_percpu) {
+        if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
                 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
                 global_clock_event = &adev->evt;
                 printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 
 static __init int apbt_late_init(void)
 {
-        if (disable_apbt_percpu || !apb_timer_block_enabled)
+        if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+                !apb_timer_block_enabled)
                 return 0;
         /* This notifier should be called after workqueue is ready */
         hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
         int timer_num;
         struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
 
+        BUG_ON(!apbt_virt_address);
+
         timer_num = adev->num;
         pr_debug("%s CPU %d timer %d mode=%d\n",
                  __func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -676,7 +655,7 @@ void __init apbt_time_init(void)
         }
 #ifdef CONFIG_SMP
         /* kernel cmdline disable apb timer, so we will use lapic timers */
-        if (disable_apbt_percpu) {
+        if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
                 printk(KERN_INFO "apbt: disabled per cpu timer\n");
                 return;
         }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf235..a2e0caf26e17 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
          * or BIOS forget to put that in reserved.
          * try to update e820 to make that region as reserved.
          */
-        u32 agp_aper_base = 0, agp_aper_order = 0;
+        u32 agp_aper_order = 0;
         int i, fix, slot, valid_agp = 0;
         u32 ctl;
         u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
                 return;
 
         /* This is mostly duplicate of iommu_hole_init */
-        agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+        search_agp_bridge(&agp_aper_order, &valid_agp);
 
         fix = 0;
         for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507d..910f20b457c4 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,12 @@
 # Makefile for local APIC drivers and for the IO-APIC code
 #
 
-obj-$(CONFIG_X86_LOCAL_APIC)    += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC)    += apic.o apic_noop.o probe_$(BITS).o ipi.o
+ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
+obj-$(CONFIG_X86_LOCAL_APIC)    += nmi.o
+endif
+obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
+
 obj-$(CONFIG_X86_IO_APIC)       += io_apic.o
 obj-$(CONFIG_SMP)               += ipi.o
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 980508c79082..e3b534cda49a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1606,7 +1606,7 @@ void __init init_apic_mappings(void)
          * acpi lapic path already maps that address in
          * acpi_register_lapic_address()
          */
-        if (!acpi_lapic)
+        if (!acpi_lapic && !smp_found_config)
                 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
 
         apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 425e53a87feb..8593582d8022 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -129,7 +129,6 @@ int es7000_plat;
  * GSI override for ES7000 platforms.
  */
 
-static unsigned int base;
 
 static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000000..cefd6942f0e9
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,107 @@
+/*
+ * HW NMI watchdog support
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Arch specific calls to support NMI watchdog
+ *
+ * Bits copied from original nmi.c file
+ *
+ */
+#include <asm/apic.h>
+
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+
+u64 hw_nmi_get_sample_period(void)
+{
+        return (u64)(cpu_khz) * 1000 * 60;
+}
+
+#ifdef ARCH_HAS_NMI_WATCHDOG
+void arch_trigger_all_cpu_backtrace(void)
+{
+        int i;
+
+        cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+
+        printk(KERN_INFO "sending NMI to all CPUs:\n");
+        apic->send_IPI_all(NMI_VECTOR);
+
+        /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+        for (i = 0; i < 10 * 1000; i++) {
+                if (cpumask_empty(to_cpumask(backtrace_mask)))
+                        break;
+                mdelay(1);
+        }
+}
+
+static int __kprobes
+arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
+                         unsigned long cmd, void *__args)
+{
+        struct die_args *args = __args;
+        struct pt_regs *regs;
+        int cpu = smp_processor_id();
+
+        switch (cmd) {
+        case DIE_NMI:
+        case DIE_NMI_IPI:
+                break;
+
+        default:
+                return NOTIFY_DONE;
+        }
+
+        regs = args->regs;
+
+        if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+                static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+                arch_spin_lock(&lock);
+                printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+                show_regs(regs);
+                dump_stack();
+                arch_spin_unlock(&lock);
+                cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+                return NOTIFY_STOP;
+        }
+
+        return NOTIFY_DONE;
+}
+
+static __read_mostly struct notifier_block backtrace_notifier = {
+        .notifier_call          = arch_trigger_all_cpu_backtrace_handler,
+        .next                   = NULL,
+        .priority               = 1
+};
+
+static int __init register_trigger_all_cpu_backtrace(void)
+{
+        register_die_notifier(&backtrace_notifier);
+        return 0;
+}
+early_initcall(register_trigger_all_cpu_backtrace);
+#endif
+
+/* STUB calls to mimic old nmi_watchdog behaviour */
+#if defined(CONFIG_X86_LOCAL_APIC)
+unsigned int nmi_watchdog = NMI_NONE;
+EXPORT_SYMBOL(nmi_watchdog);
+void acpi_nmi_enable(void) { return; }
+void acpi_nmi_disable(void) { return; }
+#endif
+atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
+EXPORT_SYMBOL(nmi_active);
+int unknown_nmi_panic;
+void cpu_nmi_set_wd_enabled(void) { return; }
+void stop_apic_nmi_watchdog(void *unused) { return; }
+void setup_apic_nmi_watchdog(void *unused) { return; }
+int __init check_nmi_watchdog(void) { return 0; }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e41ed24ab26d..5c5b8f3dddb5 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -306,14 +306,19 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
 
         old_cfg = old_desc->chip_data;
 
-        memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+        cfg->vector = old_cfg->vector;
+        cfg->move_in_progress = old_cfg->move_in_progress;
+        cpumask_copy(cfg->domain, old_cfg->domain);
+        cpumask_copy(cfg->old_domain, old_cfg->old_domain);
 
         init_copy_irq_2_pin(old_cfg, cfg, node);
 }
 
-static void free_irq_cfg(struct irq_cfg *old_cfg)
+static void free_irq_cfg(struct irq_cfg *cfg)
 {
-        kfree(old_cfg);
+        free_cpumask_var(cfg->domain);
+        free_cpumask_var(cfg->old_domain);
+        kfree(cfg);
 }
 
 void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -1728,6 +1733,8 @@ __apicdebuginit(void) print_IO_APIC(void)
                 struct irq_pin_list *entry;
 
                 cfg = desc->chip_data;
+                if (!cfg)
+                        continue;
                 entry = cfg->irq_2_pin;
                 if (!entry)
                         continue;
@@ -3397,7 +3404,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 
         cfg = desc->chip_data;
 
-        read_msi_msg_desc(desc, &msg);
+        get_cached_msi_msg_desc(desc, &msg);
 
         msg.data &= ~MSI_DATA_VECTOR_MASK;
         msg.data |= MSI_DATA_VECTOR(cfg->vector);
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 1edaf15c0b8e..a43f71cb30f8 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
         int cpu = smp_processor_id();
         int rc = 0;
 
-        /* check for other users first */
-        if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
-                        == NOTIFY_STOP) {
-                rc = 1;
-                touched = 1;
-        }
-
         sum = get_timer_irqs(cpu);
 
         if (__get_cpu_var(nmi_touch)) {
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index e46f98f36e31..f744f54cb248 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -604,6 +604,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
 {
         if (reason != DIE_NMI_IPI)
                 return NOTIFY_OK;
+
+        if (in_crash_kexec)
+                /* do nothing if entering the crash kernel */
+                return NOTIFY_OK;
         /*
          * Use a lock so only one cpu prints at a time
          * to prevent intermixed output.
@@ -694,9 +698,11 @@ void __init uv_system_init(void)
                 for (j = 0; j < 64; j++) {
                         if (!test_bit(j, &present))
                                 continue;
-                        uv_blade_info[blade].pnode = (i * 64 + j);
+                        pnode = (i * 64 + j);
+                        uv_blade_info[blade].pnode = pnode;
                         uv_blade_info[blade].nr_possible_cpus = 0;
                         uv_blade_info[blade].nr_online_cpus = 0;
+                        max_pnode = max(pnode, max_pnode);
                         blade++;
                 }
         }
@@ -734,7 +740,6 @@ void __init uv_system_init(void)
                 uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
                 uv_node_to_blade[nid] = blade;
                 uv_cpu_to_blade[cpu] = blade;
-                max_pnode = max(pnode, max_pnode);
         }
 
         /* Add blade/pnode info for nodes without cpus */
@@ -746,7 +751,6 @@ void __init uv_system_init(void)
                 pnode = (paddr >> m_val) & pnode_mask;
                 blade = boot_pnode_to_blade(pnode);
                 uv_node_to_blade[nid] = blade;
-                max_pnode = max(pnode, max_pnode);
         }
 
         map_gru_high(max_pnode);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3a785da34b6f..3f0ebe429a01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_common.o         := $(nostackp)
 
-obj-y                   := intel_cacheinfo.o addon_cpuid_features.o
+obj-y                   := intel_cacheinfo.o scattered.o topology.o
 obj-y                   += proc.o capflags.o powerflags.o common.o
 obj-y                   += vmware.o hypervisor.o sched.o mshyperv.o
 
-obj-$(CONFIG_X86_32)    += bugs.o cmpxchg.o
+obj-$(CONFIG_X86_32)    += bugs.o
 obj-$(CONFIG_X86_64)    += bugs_64.o
 
 obj-$(CONFIG_CPU_SUP_INTEL)     += intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..ba5f62f45f01 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
         }
 
         }
-        if (c->x86 == 0x10 || c->x86 == 0x11)
+        if (c->x86 >= 0x10)
                 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 
         /* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                         num_cache_leaves = 3;
         }
 
-        if (c->x86 >= 0xf && c->x86 <= 0x11)
+        if (c->x86 >= 0xf)
                 set_cpu_cap(c, X86_FEATURE_K8);
 
         if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                 fam10h_check_enable_mmcfg();
         }
 
-        if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+        if (c == &boot_cpu_data && c->x86 >= 0xf) {
                 unsigned long long tseg;
 
                 /*
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
 };
 
 cpu_dev_register(amd_cpu_dev);
+
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
+ * int[] in arch/x86/include/asm/processor.h.
+ *
+ * Example:
+ *
+ * const int amd_erratum_319[] =
+ *      AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ *                         AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ *                         AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+const int amd_erratum_400[] =
+        AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+                            AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_400);
+
+const int amd_erratum_383[] =
+        AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_383);
+
+bool cpu_has_amd_erratum(const int *erratum)
+{
+        struct cpuinfo_x86 *cpu = &current_cpu_data;
+        int osvw_id = *erratum++;
+        u32 range;
+        u32 ms;
+
+        /*
+         * If called early enough that current_cpu_data hasn't been initialized
+         * yet, fall back to boot_cpu_data.
+         */
+        if (cpu->x86 == 0)
+                cpu = &boot_cpu_data;
+
+        if (cpu->x86_vendor != X86_VENDOR_AMD)
+                return false;
+
+        if (osvw_id >= 0 && osvw_id < 65536 &&
+            cpu_has(cpu, X86_FEATURE_OSVW)) {
+                u64 osvw_len;
+
+                rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+                if (osvw_id < osvw_len) {
+                        u64 osvw_bits;
+
+                        rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+                            osvw_bits);
+                        return osvw_bits & (1ULL << (osvw_id & 0x3f));
+                }
+        }
+
+        /* OSVW unavailable or ID unknown, match family-model-stepping range */
+        ms = (cpu->x86_model << 4) | cpu->x86_mask;
+        while ((range = *erratum++))
+                if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+                    (ms >= AMD_MODEL_RANGE_START(range)) &&
+                    (ms <= AMD_MODEL_RANGE_END(range)))
+                        return true;
+
+        return false;
+}
+
+EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
deleted file mode 100644
index 2056ccf572cc..000000000000
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * cmpxchg*() fallbacks for CPU not supporting these instructions
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-
-#ifndef CONFIG_X86_CMPXCHG
-unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
-{
-        u8 prev;
-        unsigned long flags;
-
-        /* Poor man's cmpxchg for 386. Unsuitable for SMP */
-        local_irq_save(flags);
-        prev = *(u8 *)ptr;
-        if (prev == old)
-                *(u8 *)ptr = new;
-        local_irq_restore(flags);
-        return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u8);
-
-unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
-{
-        u16 prev;
-        unsigned long flags;
-
-        /* Poor man's cmpxchg for 386. Unsuitable for SMP */
-        local_irq_save(flags);
-        prev = *(u16 *)ptr;
-        if (prev == old)
-                *(u16 *)ptr = new;
-        local_irq_restore(flags);
-        return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u16);
-
-unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
-{
-        u32 prev;
-        unsigned long flags;
-
-        /* Poor man's cmpxchg for 386. Unsuitable for SMP */
-        local_irq_save(flags);
-        prev = *(u32 *)ptr;
-        if (prev == old)
-                *(u32 *)ptr = new;
-        local_irq_restore(flags);
-        return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u32);
-#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
-        u64 prev;
-        unsigned long flags;
-
-        /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
-        local_irq_save(flags);
-        prev = *(u64 *)ptr;
-        if (prev == old)
-                *(u64 *)ptr = new;
-        local_irq_restore(flags);
-        return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68e4a6f2211e..f2f9ac7da25c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 static int __init x86_xsave_setup(char *s)
 {
         setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+        setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
         return 1;
 }
 __setup("noxsave", x86_xsave_setup);
 
+static int __init x86_xsaveopt_setup(char *s)
+{
+        setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+        return 1;
+}
+__setup("noxsaveopt", x86_xsaveopt_setup);
+
 #ifdef CONFIG_X86_32
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -537,7 +545,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
         }
 }
 
-static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 {
         u32 tfms, xlvl;
         u32 ebx;
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
                 c->x86_capability[4] = excap;
         }
 
+        /* Additional Intel-defined flags: level 0x00000007 */
+        if (c->cpuid_level >= 0x00000007) {
+                u32 eax, ebx, ecx, edx;
+
+                cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+
+                if (eax > 0)
+                        c->x86_capability[9] = ebx;
+        }
+
         /* AMD-defined flags: level 0x80000001 */
         xlvl = cpuid_eax(0x80000000);
         c->extended_cpuid_level = xlvl;
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
         if (c->extended_cpuid_level >= 0x80000007)
                 c->x86_power = cpuid_edx(0x80000007);
 
+        init_scattered_cpuid_features(c);
 }
 
 static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
 
         get_model_name(c); /* Default name */
 
-        init_scattered_cpuid_features(c);
         detect_nopl(c);
 }
 
@@ -1192,6 +1210,7 @@ void __cpuinit cpu_init(void)
         dbg_restore_debug_regs();
 
         fpu_init();
+        xsave_init();
 
         raw_local_save_flags(kernel_eflags);
 
@@ -1252,12 +1271,7 @@ void __cpuinit cpu_init(void)
         clear_used_math();
         mxcsr_feature_mask_init();
 
-        /*
-         * Boot processor to setup the FP and extended state context info.
-         */
-        if (smp_processor_id() == boot_cpu_id)
-                init_thread_xstate();
-
+        fpu_init();
         xsave_init();
 }
 #endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 3624e8a0f71b..f668bb1f7d43 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -33,5 +33,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
                             *const __x86_cpu_dev_end[];
 
 extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
 
 #endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 246cd3afbb5f..cd8da247dda1 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -72,7 +72,7 @@ struct acpi_cpufreq_data {
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
 
 /* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance *acpi_perf_data;
+static struct acpi_processor_performance __percpu *acpi_perf_data;
 
 static struct cpufreq_driver acpi_cpufreq_driver;
 
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index a36de5bbb622..4f6f679f2799 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -110,7 +110,7 @@ struct pcc_cpu {
         u32 output_offset;
 };
 
-static struct pcc_cpu *pcc_cpu_info;
+static struct pcc_cpu __percpu *pcc_cpu_info;
 
 static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
 {
@@ -368,16 +368,22 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
                 return -ENODEV;
 
         out_obj = output.pointer;
-        if (out_obj->type != ACPI_TYPE_BUFFER)
-                return -ENODEV;
+        if (out_obj->type != ACPI_TYPE_BUFFER) {
+                ret = -ENODEV;
+                goto out_free;
+        }
 
         errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
-        if (errors)
-                return -ENODEV;
+        if (errors) {
+                ret = -ENODEV;
+                goto out_free;
+        }
 
         supported = *((u32 *)(out_obj->buffer.pointer + 4));
-        if (!(supported & 0x1))
-                return -ENODEV;
+        if (!(supported & 0x1)) {
+                ret = -ENODEV;
+                goto out_free;
+        }
 
 out_free:
         kfree(output.pointer);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index dd531cc56a8f..8095f8611f8a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,6 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
         &x86_hyper_vmware,
         &x86_hyper_ms_hyperv,
+#ifdef CONFIG_XEN_PVHVM
+        &x86_hyper_xen_hvm,
+#endif
 };
 
 const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 85f69cdeae10..b4389441efbb 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -39,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
                         misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
                         wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
                         c->cpuid_level = cpuid_eax(0);
+                        get_cpu_cap(c);
                 }
         }
 
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 33eae2062cf5..898c2f4eab88 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
         return l3;
 }
 
-static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
+                                           int index)
 {
         int node;
 
@@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
         this_leaf->l3 = l3_caches[node];
 }
 
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
+{
+        unsigned int reg = 0;
+
+        pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+
+        /* check whether this slot is activated already */
+        if (reg & (3UL << 30))
+                return reg & 0xfff;
+
+        return -1;
+}
+
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
                                   unsigned int slot)
 {
-        struct pci_dev *dev = this_leaf->l3->dev;
-        unsigned int reg = 0;
+        int index;
 
         if (!this_leaf->l3 || !this_leaf->l3->can_disable)
                 return -EINVAL;
 
-        if (!dev)
-                return -EINVAL;
+        index = amd_get_l3_disable_slot(this_leaf->l3, slot);
+        if (index >= 0)
+                return sprintf(buf, "%d\n", index);
 
-        pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
-        return sprintf(buf, "0x%08x\n", reg);
+        return sprintf(buf, "FREE\n");
 }
 
 #define SHOW_CACHE_DISABLE(slot) \
@@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
         }
 }
 
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
-                                  const char *buf, size_t count,
-                                  unsigned int slot)
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3:    L3 cache descriptor
+ * @cpu:   A CPU on the node containing the L3 cache
+ * @slot:  slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
+                            unsigned long index)
 {
-        struct pci_dev *dev = this_leaf->l3->dev;
-        int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-        unsigned long val = 0;
+        int ret = 0;
 
 #define SUBCACHE_MASK   (3UL << 20)
 #define SUBCACHE_INDEX  0xfff
 
-        if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+        /*
+         * check whether this slot is already used or
+         * the index is already disabled
+         */
+        ret = amd_get_l3_disable_slot(l3, slot);
+        if (ret >= 0)
                 return -EINVAL;
 
+        /*
+         * check whether the other slot has disabled the
+         * same index already
+         */
+        if (index == amd_get_l3_disable_slot(l3, !slot))
+                return -EINVAL;
+
+        /* do not allow writes outside of allowed bits */
+        if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
+            ((index & SUBCACHE_INDEX) > l3->indices))
+                return -EINVAL;
+
+        amd_l3_disable_index(l3, cpu, slot, index);
+
+        return 0;
+}
+
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+                                  const char *buf, size_t count,
+                                  unsigned int slot)
+{
+        unsigned long val = 0;
+        int cpu, err = 0;
+
         if (!capable(CAP_SYS_ADMIN))
                 return -EPERM;
 
-        if (!dev)
+        if (!this_leaf->l3 || !this_leaf->l3->can_disable)
                 return -EINVAL;
 
-        if (strict_strtoul(buf, 10, &val) < 0)
-                return -EINVAL;
+        cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
 
-        /* do not allow writes outside of allowed bits */
-        if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
-            ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
+        if (strict_strtoul(buf, 10, &val) < 0)
                 return -EINVAL;
 
-        amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
-
+        err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
+        if (err) {
+                if (err == -EEXIST)
+                        printk(KERN_WARNING "L3 disable slot %d in use!\n",
+                                            slot);
+                return err;
+        }
         return count;
 }
 
@@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
 
 #else   /* CONFIG_CPU_SUP_AMD */
 static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
 {
 };
 #endif /* CONFIG_CPU_SUP_AMD */
@@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 
         if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
                 amd_cpuid4(index, &eax, &ebx, &ecx);
-                amd_check_l3_disable(index, this_leaf);
+                amd_check_l3_disable(this_leaf, index);
         } else {
                 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
         }
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 745b54f9be89..8209472b27a5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -80,7 +80,7 @@ int apei_write_mce(struct mce *m)
         rcd.hdr.revision = CPER_RECORD_REV;
         rcd.hdr.signature_end = CPER_SIG_END;
         rcd.hdr.section_count = 1;
-        rcd.hdr.error_severity = CPER_SER_FATAL;
+        rcd.hdr.error_severity = CPER_SEV_FATAL;
         /* timestamp, platform_id, partition_id are all invalid */
         rcd.hdr.validation_bits = 0;
         rcd.hdr.record_length = sizeof(rcd);
@@ -96,7 +96,7 @@ int apei_write_mce(struct mce *m)
         rcd.sec_hdr.validation_bits = 0;
         rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
         rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
-        rcd.sec_hdr.section_severity = CPER_SER_FATAL;
+        rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
 
         memcpy(&rcd.mce, m, sizeof(*m));
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1970ef911c99..ed41562909fe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,7 +51,7 @@
 static DEFINE_MUTEX(mce_read_mutex);
 
 #define rcu_dereference_check_mce(p) \
-        rcu_dereference_check((p), \
+        rcu_dereference_index_check((p), \
                               rcu_read_lock_sched_held() || \
                               lockdep_is_held(&mce_read_mutex))
 
@@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
 static int default_decode_mce(struct notifier_block *nb, unsigned long val,
                                void *data)
 {
-        pr_emerg("No human readable MCE decoding support on this CPU type.\n");
-        pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+        pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
+        pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
 
         return NOTIFY_STOP;
 }
@@ -211,11 +211,11 @@ void mce_log(struct mce *mce)
 
 static void print_mce(struct mce *m)
 {
-        pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+        pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
                m->extcpu, m->mcgstatus, m->bank, m->status);
 
         if (m->ip) {
-                pr_emerg("RIP%s %02x:<%016Lx> ",
+                pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
                         !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                         m->cs, m->ip);
 
@@ -224,14 +224,14 @@ static void print_mce(struct mce *m)
                 pr_cont("\n");
         }
 
-        pr_emerg("TSC %llx ", m->tsc);
+        pr_emerg(HW_ERR "TSC %llx ", m->tsc);
         if (m->addr)
                 pr_cont("ADDR %llx ", m->addr);
         if (m->misc)
                 pr_cont("MISC %llx ", m->misc);
 
         pr_cont("\n");
-        pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+        pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
 
         /*
@@ -241,16 +241,6 @@ static void print_mce(struct mce *m)
         atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 }
 
-static void print_mce_head(void)
-{
-        pr_emerg("\nHARDWARE ERROR\n");
-}
-
-static void print_mce_tail(void)
-{
-        pr_emerg("This is not a software problem!\n");
-}
-
 #define PANIC_TIMEOUT 5 /* 5 seconds */
 
 static atomic_t mce_paniced;
@@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
                 if (atomic_inc_return(&mce_fake_paniced) > 1)
                         return;
         }
-        print_mce_head();
         /* First print corrected ones that are still unlogged */
         for (i = 0; i < MCE_LOG_LEN; i++) {
                 struct mce *m = &mcelog.entry[i];
@@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
                         apei_err = apei_write_mce(final);
         }
         if (cpu_missing)
-                printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
-        print_mce_tail();
+                pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
         if (exp)
-                printk(KERN_EMERG "Machine check: %s\n", exp);
+                pr_emerg(HW_ERR "Machine check: %s\n", exp);
         if (!fake_panic) {
                 if (panic_timeout == 0)
                         panic_timeout = mce_panic_timeout;
                 panic(msg);
         } else
-                printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
+                pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 }
 
 /* Support code for software error injection */
@@ -1221,7 +1209,7 @@ int mce_notify_irq(void)
                 schedule_work(&mce_trigger_work);
 
                 if (__ratelimit(&ratelimit))
-                        printk(KERN_INFO "Machine check events logged\n");
+                        pr_info(HW_ERR "Machine check events logged\n");
 
                 return 1;
         }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 224392d8fe8c..5e975298fa81 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -530,7 +530,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
                 err = -ENOMEM;
                 goto out;
         }
-        if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
+        if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
                 kfree(b);
                 err = -ENOMEM;
                 goto out;
@@ -543,7 +543,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 #ifndef CONFIG_SMP
         cpumask_setall(b->cpus);
 #else
-        cpumask_copy(b->cpus, c->llc_shared_map);
+        cpumask_set_cpu(cpu, b->cpus);
 #endif
 
         per_cpu(threshold_banks, cpu)[bank] = b;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 62b48e40920a..6fcd0936194f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot)
                 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
 
                 /* Already owned by someone else? */
-                if (val & CMCI_EN) {
+                if (val & MCI_CTL2_CMCI_EN) {
                         if (test_and_clear_bit(i, owned) && !boot)
                                 print_update("SHD", &hdr, i);
                         __clear_bit(i, __get_cpu_var(mce_poll_banks));
                         continue;
                 }
 
-                val |= CMCI_EN | CMCI_THRESHOLD;
+                val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+                val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
                 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
                 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
 
                 /* Did the enable bit stick? -- the bank supports CMCI */
-                if (val & CMCI_EN) {
+                if (val & MCI_CTL2_CMCI_EN) {
                         if (!test_and_set_bit(i, owned) && !boot)
                                 print_update("CMCI", &hdr, i);
                         __clear_bit(i, __get_cpu_var(mce_poll_banks));
@@ -155,7 +156,7 @@ void cmci_clear(void)
                         continue;
                 /* Disable CMCI */
                 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-                val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+                val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
                 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
                 __clear_bit(i, __get_cpu_var(mce_banks_owned));
         }
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e1a0a3bf9716..d9368eeda309 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
34/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
35#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
36 36
37#define THERMAL_THROTTLING_EVENT 0
38#define POWER_LIMIT_EVENT 1
39
37/* 40/*
38 * Current thermal throttling state: 41 * Current thermal event state:
39 */ 42 */
40struct thermal_state { 43struct _thermal_state {
41 bool is_throttled; 44 bool new_event;
42 45 int event;
43 u64 next_check; 46 u64 next_check;
44 unsigned long throttle_count; 47 unsigned long count;
45 unsigned long last_throttle_count; 48 unsigned long last_count;
49};
50
51struct thermal_state {
52 struct _thermal_state core_throttle;
53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit;
46}; 56};
47 57
48static DEFINE_PER_CPU(struct thermal_state, thermal_state); 58static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
53 63
54#ifdef CONFIG_SYSFS 64#ifdef CONFIG_SYSFS
55#define define_therm_throt_sysdev_one_ro(_name) \ 65#define define_therm_throt_sysdev_one_ro(_name) \
56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 66 static SYSDEV_ATTR(_name, 0444, \
67 therm_throt_sysdev_show_##_name, \
68 NULL) \
57 69
58#define define_therm_throt_sysdev_show_func(name) \ 70#define define_therm_throt_sysdev_show_func(event, name) \
59 \ 71 \
60static ssize_t therm_throt_sysdev_show_##name( \ 72static ssize_t therm_throt_sysdev_show_##event##_##name( \
61 struct sys_device *dev, \ 73 struct sys_device *dev, \
62 struct sysdev_attribute *attr, \ 74 struct sysdev_attribute *attr, \
63 char *buf) \ 75 char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
66 ssize_t ret; \ 78 ssize_t ret; \
67 \ 79 \
68 preempt_disable(); /* CPU hotplug */ \ 80 preempt_disable(); /* CPU hotplug */ \
69 if (cpu_online(cpu)) \ 81 if (cpu_online(cpu)) { \
70 ret = sprintf(buf, "%lu\n", \ 82 ret = sprintf(buf, "%lu\n", \
71 per_cpu(thermal_state, cpu).name); \ 83 per_cpu(thermal_state, cpu).event.name); \
72 else \ 84 } else \
73 ret = 0; \ 85 ret = 0; \
74 preempt_enable(); \ 86 preempt_enable(); \
75 \ 87 \
76 return ret; \ 88 return ret; \
77} 89}
78 90
79define_therm_throt_sysdev_show_func(throttle_count); 91define_therm_throt_sysdev_show_func(core_throttle, count);
80define_therm_throt_sysdev_one_ro(throttle_count); 92define_therm_throt_sysdev_one_ro(core_throttle_count);
93
94define_therm_throt_sysdev_show_func(core_power_limit, count);
95define_therm_throt_sysdev_one_ro(core_power_limit_count);
96
97define_therm_throt_sysdev_show_func(package_throttle, count);
98define_therm_throt_sysdev_one_ro(package_throttle_count);
99
100define_therm_throt_sysdev_show_func(package_power_limit, count);
101define_therm_throt_sysdev_one_ro(package_power_limit_count);
81 102
82static struct attribute *thermal_throttle_attrs[] = { 103static struct attribute *thermal_throttle_attrs[] = {
83 &attr_throttle_count.attr, 104 &attr_core_throttle_count.attr,
84 NULL 105 NULL
85}; 106};
86 107
87static struct attribute_group thermal_throttle_attr_group = { 108static struct attribute_group thermal_attr_group = {
88 .attrs = thermal_throttle_attrs, 109 .attrs = thermal_throttle_attrs,
89 .name = "thermal_throttle" 110 .name = "thermal_throttle"
90}; 111};
91#endif /* CONFIG_SYSFS */ 112#endif /* CONFIG_SYSFS */
92 113
114#define CORE_LEVEL 0
115#define PACKAGE_LEVEL 1
116
93/*** 117/***
94 * therm_throt_process - Process thermal throttling event from interrupt 118 * therm_throt_process - Process thermal throttling event from interrupt
95 * @curr: Whether the condition is current or not (boolean), since the 119 * @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
106 * 1 : Event should be logged further, and a message has been 130 * 1 : Event should be logged further, and a message has been
107 * printed to the syslog. 131 * printed to the syslog.
108 */ 132 */
109static int therm_throt_process(bool is_throttled) 133static int therm_throt_process(bool new_event, int event, int level)
110{ 134{
111 struct thermal_state *state; 135 struct _thermal_state *state;
112 unsigned int this_cpu; 136 unsigned int this_cpu = smp_processor_id();
113 bool was_throttled; 137 bool old_event;
114 u64 now; 138 u64 now;
139 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
115 140
116 this_cpu = smp_processor_id();
117 now = get_jiffies_64(); 141 now = get_jiffies_64();
118 state = &per_cpu(thermal_state, this_cpu); 142 if (level == CORE_LEVEL) {
143 if (event == THERMAL_THROTTLING_EVENT)
144 state = &pstate->core_throttle;
145 else if (event == POWER_LIMIT_EVENT)
146 state = &pstate->core_power_limit;
147 else
148 return 0;
149 } else if (level == PACKAGE_LEVEL) {
150 if (event == THERMAL_THROTTLING_EVENT)
151 state = &pstate->package_throttle;
152 else if (event == POWER_LIMIT_EVENT)
153 state = &pstate->package_power_limit;
154 else
155 return 0;
156 } else
157 return 0;
119 158
120 was_throttled = state->is_throttled; 159 old_event = state->new_event;
121 state->is_throttled = is_throttled; 160 state->new_event = new_event;
122 161
123 if (is_throttled) 162 if (new_event)
124 state->throttle_count++; 163 state->count++;
125 164
126 if (time_before64(now, state->next_check) && 165 if (time_before64(now, state->next_check) &&
127 state->throttle_count != state->last_throttle_count) 166 state->count != state->last_count)
128 return 0; 167 return 0;
129 168
130 state->next_check = now + CHECK_INTERVAL; 169 state->next_check = now + CHECK_INTERVAL;
131 state->last_throttle_count = state->throttle_count; 170 state->last_count = state->count;
132 171
133 /* if we just entered the thermal event */ 172 /* if we just entered the thermal event */
134 if (is_throttled) { 173 if (new_event) {
135 printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); 174 if (event == THERMAL_THROTTLING_EVENT)
175 printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
176 this_cpu,
177 level == CORE_LEVEL ? "Core" : "Package",
178 state->count);
179 else
180 printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
181 this_cpu,
182 level == CORE_LEVEL ? "Core" : "Package",
183 state->count);
136 184
137 add_taint(TAINT_MACHINE_CHECK); 185 add_taint(TAINT_MACHINE_CHECK);
138 return 1; 186 return 1;
139 } 187 }
140 if (was_throttled) { 188 if (old_event) {
141 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); 189 if (event == THERMAL_THROTTLING_EVENT)
190 printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
191 this_cpu,
192 level == CORE_LEVEL ? "Core" : "Package");
193 else
194 printk(KERN_INFO "CPU%d: %s power limit normal\n",
195 this_cpu,
196 level == CORE_LEVEL ? "Core" : "Package");
142 return 1; 197 return 1;
143 } 198 }
144 199
@@ -147,15 +202,35 @@ static int therm_throt_process(bool is_throttled)
147 202
148#ifdef CONFIG_SYSFS 203#ifdef CONFIG_SYSFS
149/* Add/Remove thermal_throttle interface for CPU device: */ 204/* Add/Remove thermal_throttle interface for CPU device: */
150static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
206 unsigned int cpu)
151{ 207{
152 return sysfs_create_group(&sys_dev->kobj, 208 int err;
153 &thermal_throttle_attr_group); 209 struct cpuinfo_x86 *c = &cpu_data(cpu);
210
211 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
212 if (err)
213 return err;
214
215 if (cpu_has(c, X86_FEATURE_PLN))
216 err = sysfs_add_file_to_group(&sys_dev->kobj,
217 &attr_core_power_limit_count.attr,
218 thermal_attr_group.name);
219 if (cpu_has(c, X86_FEATURE_PTS))
220 err = sysfs_add_file_to_group(&sys_dev->kobj,
221 &attr_package_throttle_count.attr,
222 thermal_attr_group.name);
223 if (cpu_has(c, X86_FEATURE_PLN))
224 err = sysfs_add_file_to_group(&sys_dev->kobj,
225 &attr_package_power_limit_count.attr,
226 thermal_attr_group.name);
227
228 return err;
154} 229}
155 230
156static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 231static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
157{ 232{
158 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 233 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
159} 234}
160 235
161/* Mutex protecting device creation against CPU hotplug: */ 236/* Mutex protecting device creation against CPU hotplug: */
@@ -177,7 +252,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
177 case CPU_UP_PREPARE: 252 case CPU_UP_PREPARE:
178 case CPU_UP_PREPARE_FROZEN: 253 case CPU_UP_PREPARE_FROZEN:
179 mutex_lock(&therm_cpu_lock); 254 mutex_lock(&therm_cpu_lock);
180 err = thermal_throttle_add_dev(sys_dev); 255 err = thermal_throttle_add_dev(sys_dev, cpu);
181 mutex_unlock(&therm_cpu_lock); 256 mutex_unlock(&therm_cpu_lock);
182 WARN_ON(err); 257 WARN_ON(err);
183 break; 258 break;
@@ -213,7 +288,7 @@ static __init int thermal_throttle_init_device(void)
213#endif 288#endif
214 /* connect live CPUs to sysfs */ 289 /* connect live CPUs to sysfs */
215 for_each_online_cpu(cpu) { 290 for_each_online_cpu(cpu) {
216 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu)); 291 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
217 WARN_ON(err); 292 WARN_ON(err);
218 } 293 }
219#ifdef CONFIG_HOTPLUG_CPU 294#ifdef CONFIG_HOTPLUG_CPU
@@ -226,14 +301,50 @@ device_initcall(thermal_throttle_init_device);
226 301
227#endif /* CONFIG_SYSFS */ 302#endif /* CONFIG_SYSFS */
228 303
304/*
 305 * Set up the two most significant bits to notify the mce log of the
 306 * thermal event type.
 307 * This is a temporary solution and may be changed in the future with the
 308 * mce log infrastructure.
309 */
310#define CORE_THROTTLED (0)
311#define CORE_POWER_LIMIT ((__u64)1 << 62)
312#define PACKAGE_THROTTLED ((__u64)2 << 62)
313#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
314
229/* Thermal transition interrupt handler */ 315/* Thermal transition interrupt handler */
230static void intel_thermal_interrupt(void) 316static void intel_thermal_interrupt(void)
231{ 317{
232 __u64 msr_val; 318 __u64 msr_val;
319 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
233 320
234 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 321 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
235 if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) 322
236 mce_log_therm_throt_event(msr_val); 323 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
324 THERMAL_THROTTLING_EVENT,
325 CORE_LEVEL) != 0)
326 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
327
328 if (cpu_has(c, X86_FEATURE_PLN))
329 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
330 POWER_LIMIT_EVENT,
331 CORE_LEVEL) != 0)
332 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
333
334 if (cpu_has(c, X86_FEATURE_PTS)) {
335 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
336 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
337 THERMAL_THROTTLING_EVENT,
338 PACKAGE_LEVEL) != 0)
339 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
340 if (cpu_has(c, X86_FEATURE_PLN))
341 if (therm_throt_process(msr_val &
342 PACKAGE_THERM_STATUS_POWER_LIMIT,
343 POWER_LIMIT_EVENT,
344 PACKAGE_LEVEL) != 0)
345 mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
346 | msr_val);
347 }
237} 348}
238 349
239static void unexpected_thermal_interrupt(void) 350static void unexpected_thermal_interrupt(void)
@@ -335,8 +446,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
335 apic_write(APIC_LVTTHMR, h); 446 apic_write(APIC_LVTTHMR, h);
336 447
337 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 448 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
338 wrmsr(MSR_IA32_THERM_INTERRUPT, 449 if (cpu_has(c, X86_FEATURE_PLN))
339 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); 450 wrmsr(MSR_IA32_THERM_INTERRUPT,
451 l | (THERM_INT_LOW_ENABLE
452 | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
453 else
454 wrmsr(MSR_IA32_THERM_INTERRUPT,
455 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
456
457 if (cpu_has(c, X86_FEATURE_PTS)) {
458 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
459 if (cpu_has(c, X86_FEATURE_PLN))
460 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
461 l | (PACKAGE_THERM_INT_LOW_ENABLE
462 | PACKAGE_THERM_INT_HIGH_ENABLE
463 | PACKAGE_THERM_INT_PLN_ENABLE), h);
464 else
465 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
466 l | (PACKAGE_THERM_INT_LOW_ENABLE
467 | PACKAGE_THERM_INT_HIGH_ENABLE), h);
468 }
340 469
341 smp_thermal_vector = intel_thermal_interrupt; 470 smp_thermal_vector = intel_thermal_interrupt;
342 471
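therm_throt.c now tags the value handed to mce_log_therm_throt_event() with the event type in bits 63:62, as defined by CORE_THROTTLED through PACKAGE_POWER_LIMIT above. A small sketch, assuming only those encodings, of how a log consumer could split such a value back into event type and raw THERM_STATUS bits:

#include <stdint.h>
#include <stdio.h>

#define THERM_EVENT_SHIFT 62
#define THERM_EVENT_MASK  (3ULL << THERM_EVENT_SHIFT)

static const char * const therm_event_name[4] = {
        "core throttled",       /* CORE_THROTTLED      = 0 << 62 */
        "core power limit",     /* CORE_POWER_LIMIT    = 1 << 62 */
        "package throttled",    /* PACKAGE_THROTTLED   = 2 << 62 */
        "package power limit",  /* PACKAGE_POWER_LIMIT = 3 << 62 */
};

int main(void)
{
        /* example: a package-throttled event with PROCHOT (bit 0) set */
        uint64_t logged = (2ULL << THERM_EVENT_SHIFT) | 0x1;
        unsigned int type = logged >> THERM_EVENT_SHIFT;
        uint64_t status = logged & ~THERM_EVENT_MASK;

        printf("%s, THERM_STATUS bits %#llx\n",
               therm_event_name[type], (unsigned long long)status);
        return 0;
}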
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 16f41bbe46b6..d944bf6c50e9 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
18#include <asm/mshyperv.h> 18#include <asm/mshyperv.h>
19 19
20struct ms_hyperv_info ms_hyperv; 20struct ms_hyperv_info ms_hyperv;
21EXPORT_SYMBOL_GPL(ms_hyperv);
21 22
22static bool __init ms_hyperv_platform(void) 23static bool __init ms_hyperv_platform(void)
23{ 24{
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f012..c5f59d071425 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
632 unsigned long gran_base, chunk_base, lose_base; 632 unsigned long gran_base, chunk_base, lose_base;
633 char gran_factor, chunk_factor, lose_factor; 633 char gran_factor, chunk_factor, lose_factor;
634 634
635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), 637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
638 638
639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", 639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
640 result[i].bad ? "*BAD*" : " ", 640 result[i].bad ? "*BAD*" : " ",
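The mtrr_print_out_one_result() fix above replaces trailing commas with semicolons: the old lines compiled because the comma operator glues the three assignments into one expression statement, but that is easy to misread. A tiny demonstration that both forms behave identically, which is why this is purely a readability fix:

#include <stdio.h>

static int f(void) { return 1; }
static int g(void) { return 2; }

int main(void)
{
        int a, b;

        /* legal C: one expression statement joined by the comma operator */
        a = f(), b = g();
        printf("%d %d\n", a, b);        /* prints "1 2", same as with semicolons */
        return 0;
}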
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61c..7d28d7d03885 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
433{ 433{
434 unsigned int mask_lo, mask_hi, base_lo, base_hi; 434 unsigned int mask_lo, mask_hi, base_lo, base_hi;
435 unsigned int tmp, hi; 435 unsigned int tmp, hi;
436 int cpu;
437 436
438 /* 437 /*
439 * get_mtrr doesn't need to update mtrr_state, also it could be called 438 * get_mtrr doesn't need to update mtrr_state, also it could be called
440 * from any cpu, so try to print it out directly. 439 * from any cpu, so try to print it out directly.
441 */ 440 */
442 cpu = get_cpu(); 441 get_cpu();
443 442
444 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 443 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
445 444
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79556bd9b602..01c0f3ee6cc3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/types.h> /* FIXME: kvm_para.h needs this */ 36#include <linux/types.h> /* FIXME: kvm_para.h needs this */
37 37
38#include <linux/stop_machine.h>
38#include <linux/kvm_para.h> 39#include <linux/kvm_para.h>
39#include <linux/uaccess.h> 40#include <linux/uaccess.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -143,22 +144,28 @@ struct set_mtrr_data {
143 mtrr_type smp_type; 144 mtrr_type smp_type;
144}; 145};
145 146
147static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
148
146/** 149/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 150 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data 151 * @info: pointer to mtrr configuration data
149 * 152 *
150 * Returns nothing. 153 * Returns nothing.
151 */ 154 */
152static void ipi_handler(void *info) 155static int mtrr_work_handler(void *info)
153{ 156{
154#ifdef CONFIG_SMP 157#ifdef CONFIG_SMP
155 struct set_mtrr_data *data = info; 158 struct set_mtrr_data *data = info;
156 unsigned long flags; 159 unsigned long flags;
157 160
161 atomic_dec(&data->count);
162 while (!atomic_read(&data->gate))
163 cpu_relax();
164
158 local_irq_save(flags); 165 local_irq_save(flags);
159 166
160 atomic_dec(&data->count); 167 atomic_dec(&data->count);
161 while (!atomic_read(&data->gate)) 168 while (atomic_read(&data->gate))
162 cpu_relax(); 169 cpu_relax();
163 170
164 /* The master has cleared me to execute */ 171 /* The master has cleared me to execute */
@@ -173,12 +180,13 @@ static void ipi_handler(void *info)
173 } 180 }
174 181
175 atomic_dec(&data->count); 182 atomic_dec(&data->count);
176 while (atomic_read(&data->gate)) 183 while (!atomic_read(&data->gate))
177 cpu_relax(); 184 cpu_relax();
178 185
179 atomic_dec(&data->count); 186 atomic_dec(&data->count);
180 local_irq_restore(flags); 187 local_irq_restore(flags);
181#endif 188#endif
189 return 0;
182} 190}
183 191
184static inline int types_compatible(mtrr_type type1, mtrr_type type2) 192static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
198 * 206 *
199 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: 207 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
200 * 208 *
201 * 1. Send IPI to do the following: 209 * 1. Queue work to do the following on all processors:
202 * 2. Disable Interrupts 210 * 2. Disable Interrupts
203 * 3. Wait for all procs to do so 211 * 3. Wait for all procs to do so
204 * 4. Enter no-fill cache mode 212 * 4. Enter no-fill cache mode
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
215 * 15. Enable interrupts. 223 * 15. Enable interrupts.
216 * 224 *
217 * What does that mean for us? Well, first we set data.count to the number 225 * What does that mean for us? Well, first we set data.count to the number
218 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait 226 * of CPUs. As each CPU announces that it started the rendezvous handler by
219 * until it hits 0 and proceed. We set the data.gate flag and reset data.count. 227 * decrementing the count, we reset data.count and set the data.gate flag
220 * Meanwhile, they are waiting for that flag to be set. Once it's set, each 228 * allowing all the CPUs to proceed with the work. As each CPU disables
229 * interrupts, it'll decrement data.count once. We wait until it hits 0 and
230 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
231 * are waiting for that flag to be cleared. Once it's cleared, each
221 * CPU goes through the transition of updating MTRRs. 232 * CPU goes through the transition of updating MTRRs.
222 * The CPU vendors may each do it differently, 233 * The CPU vendors may each do it differently,
223 * so we call mtrr_if->set() callback and let them take care of it. 234 * so we call mtrr_if->set() callback and let them take care of it.
224 * When they're done, they again decrement data->count and wait for data.gate 235 * When they're done, they again decrement data->count and wait for data.gate
225 * to be reset. 236 * to be set.
226 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag 237 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
227 * Everyone then enables interrupts and we all continue on. 238 * Everyone then enables interrupts and we all continue on.
228 * 239 *
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
234{ 245{
235 struct set_mtrr_data data; 246 struct set_mtrr_data data;
236 unsigned long flags; 247 unsigned long flags;
248 int cpu;
249
250 preempt_disable();
237 251
238 data.smp_reg = reg; 252 data.smp_reg = reg;
239 data.smp_base = base; 253 data.smp_base = base;
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
246 atomic_set(&data.gate, 0); 260 atomic_set(&data.gate, 0);
247 261
248 /* Start the ball rolling on other CPUs */ 262 /* Start the ball rolling on other CPUs */
249 if (smp_call_function(ipi_handler, &data, 0) != 0) 263 for_each_online_cpu(cpu) {
250 panic("mtrr: timed out waiting for other CPUs\n"); 264 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
265
266 if (cpu == smp_processor_id())
267 continue;
268
269 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
270 }
251 271
252 local_irq_save(flags);
253 272
254 while (atomic_read(&data.count)) 273 while (atomic_read(&data.count))
255 cpu_relax(); 274 cpu_relax();
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
259 smp_wmb(); 278 smp_wmb();
260 atomic_set(&data.gate, 1); 279 atomic_set(&data.gate, 1);
261 280
281 local_irq_save(flags);
282
283 while (atomic_read(&data.count))
284 cpu_relax();
285
286 /* Ok, reset count and toggle gate */
287 atomic_set(&data.count, num_booting_cpus() - 1);
288 smp_wmb();
289 atomic_set(&data.gate, 0);
290
262 /* Do our MTRR business */ 291 /* Do our MTRR business */
263 292
264 /* 293 /*
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
279 308
280 atomic_set(&data.count, num_booting_cpus() - 1); 309 atomic_set(&data.count, num_booting_cpus() - 1);
281 smp_wmb(); 310 smp_wmb();
282 atomic_set(&data.gate, 0); 311 atomic_set(&data.gate, 1);
283 312
284 /* 313 /*
285 * Wait here for everyone to have seen the gate change 314 * Wait here for everyone to have seen the gate change
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
289 cpu_relax(); 318 cpu_relax();
290 319
291 local_irq_restore(flags); 320 local_irq_restore(flags);
321 preempt_enable();
292} 322}
293 323
294/** 324/**
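The rewritten comment above describes the count/gate hand-shake that set_mtrr() now drives through stop_one_cpu_nowait() instead of an IPI. A compressed pthread model of that rendezvous, using C11 atomics and busy-waiting in place of the kernel's atomic_t and cpu_relax() (both assumptions of the sketch; the real code also disables interrupts and updates the MTRRs between the two phases):

/* build: cc -pthread -std=c11 rendezvous.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int count;
static atomic_int gate;

static void *worker(void *arg)
{
        (void)arg;

        /* phase 1: announce arrival, wait for the master to open the gate */
        atomic_fetch_sub(&count, 1);
        while (!atomic_load(&gate))
                ;       /* cpu_relax() in the kernel */

        /* ...interrupt disabling and the MTRR update would happen here... */

        /* phase 2: announce completion, wait for the gate to close again */
        atomic_fetch_sub(&count, 1);
        while (atomic_load(&gate))
                ;
        return NULL;
}

int main(void)
{
        pthread_t tid[NCPUS - 1];
        int i;

        atomic_store(&count, NCPUS - 1);
        atomic_store(&gate, 0);
        for (i = 0; i < NCPUS - 1; i++)
                pthread_create(&tid[i], NULL, worker, NULL);

        /* wait for everyone to arrive, then re-arm the counter and open the gate */
        while (atomic_load(&count))
                ;
        atomic_store(&count, NCPUS - 1);
        atomic_store(&gate, 1);

        /* wait for everyone to finish, then close the gate to release them */
        while (atomic_load(&count))
                ;
        atomic_store(&gate, 0);

        for (i = 0; i < NCPUS - 1; i++)
                pthread_join(tid[i], NULL);
        puts("rendezvous complete");
        return 0;
}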
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5db5b7d65a18..03a5b0385ad6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -102,6 +102,7 @@ struct cpu_hw_events {
102 */ 102 */
103 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ 103 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
104 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 104 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
105 unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
105 int enabled; 106 int enabled;
106 107
107 int n_events; 108 int n_events;
@@ -220,6 +221,7 @@ struct x86_pmu {
220 struct perf_event *event); 221 struct perf_event *event);
221 struct event_constraint *event_constraints; 222 struct event_constraint *event_constraints;
222 void (*quirks)(void); 223 void (*quirks)(void);
224 int perfctr_second_write;
223 225
224 int (*cpu_prepare)(int cpu); 226 int (*cpu_prepare)(int cpu);
225 void (*cpu_starting)(int cpu); 227 void (*cpu_starting)(int cpu);
@@ -295,10 +297,10 @@ x86_perf_event_update(struct perf_event *event)
295 * count to the generic event atomically: 297 * count to the generic event atomically:
296 */ 298 */
297again: 299again:
298 prev_raw_count = atomic64_read(&hwc->prev_count); 300 prev_raw_count = local64_read(&hwc->prev_count);
299 rdmsrl(hwc->event_base + idx, new_raw_count); 301 rdmsrl(hwc->event_base + idx, new_raw_count);
300 302
301 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, 303 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
302 new_raw_count) != prev_raw_count) 304 new_raw_count) != prev_raw_count)
303 goto again; 305 goto again;
304 306
@@ -313,8 +315,8 @@ again:
313 delta = (new_raw_count << shift) - (prev_raw_count << shift); 315 delta = (new_raw_count << shift) - (prev_raw_count << shift);
314 delta >>= shift; 316 delta >>= shift;
315 317
316 atomic64_add(delta, &event->count); 318 local64_add(delta, &event->count);
317 atomic64_sub(delta, &hwc->period_left); 319 local64_sub(delta, &hwc->period_left);
318 320
319 return new_raw_count; 321 return new_raw_count;
320} 322}
@@ -438,7 +440,7 @@ static int x86_setup_perfctr(struct perf_event *event)
438 if (!hwc->sample_period) { 440 if (!hwc->sample_period) {
439 hwc->sample_period = x86_pmu.max_period; 441 hwc->sample_period = x86_pmu.max_period;
440 hwc->last_period = hwc->sample_period; 442 hwc->last_period = hwc->sample_period;
441 atomic64_set(&hwc->period_left, hwc->sample_period); 443 local64_set(&hwc->period_left, hwc->sample_period);
442 } else { 444 } else {
443 /* 445 /*
444 * If we have a PMU initialized but no APIC 446 * If we have a PMU initialized but no APIC
@@ -885,7 +887,7 @@ static int
885x86_perf_event_set_period(struct perf_event *event) 887x86_perf_event_set_period(struct perf_event *event)
886{ 888{
887 struct hw_perf_event *hwc = &event->hw; 889 struct hw_perf_event *hwc = &event->hw;
888 s64 left = atomic64_read(&hwc->period_left); 890 s64 left = local64_read(&hwc->period_left);
889 s64 period = hwc->sample_period; 891 s64 period = hwc->sample_period;
890 int ret = 0, idx = hwc->idx; 892 int ret = 0, idx = hwc->idx;
891 893
@@ -897,14 +899,14 @@ x86_perf_event_set_period(struct perf_event *event)
897 */ 899 */
898 if (unlikely(left <= -period)) { 900 if (unlikely(left <= -period)) {
899 left = period; 901 left = period;
900 atomic64_set(&hwc->period_left, left); 902 local64_set(&hwc->period_left, left);
901 hwc->last_period = period; 903 hwc->last_period = period;
902 ret = 1; 904 ret = 1;
903 } 905 }
904 906
905 if (unlikely(left <= 0)) { 907 if (unlikely(left <= 0)) {
906 left += period; 908 left += period;
907 atomic64_set(&hwc->period_left, left); 909 local64_set(&hwc->period_left, left);
908 hwc->last_period = period; 910 hwc->last_period = period;
909 ret = 1; 911 ret = 1;
910 } 912 }
@@ -923,10 +925,19 @@ x86_perf_event_set_period(struct perf_event *event)
923 * The hw event starts counting from this event offset, 925 * The hw event starts counting from this event offset,
924 * mark it to be able to extra future deltas: 926 * mark it to be able to extra future deltas:
925 */ 927 */
926 atomic64_set(&hwc->prev_count, (u64)-left); 928 local64_set(&hwc->prev_count, (u64)-left);
927 929
928 wrmsrl(hwc->event_base + idx, 930 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
931
932 /*
 933 * Due to an erratum on certain CPUs we need
934 * a second write to be sure the register
935 * is updated properly
936 */
937 if (x86_pmu.perfctr_second_write) {
938 wrmsrl(hwc->event_base + idx,
929 (u64)(-left) & x86_pmu.cntval_mask); 939 (u64)(-left) & x86_pmu.cntval_mask);
940 }
930 941
931 perf_event_update_userpage(event); 942 perf_event_update_userpage(event);
932 943
@@ -969,7 +980,7 @@ static int x86_pmu_enable(struct perf_event *event)
969 * skip the schedulability test here, it will be performed 980 * skip the schedulability test here, it will be performed
970 * at commit time (->commit_txn) as a whole 981 * at commit time (->commit_txn) as a whole
971 */ 982 */
972 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) 983 if (cpuc->group_flag & PERF_EVENT_TXN)
973 goto out; 984 goto out;
974 985
975 ret = x86_pmu.schedule_events(cpuc, n, assign); 986 ret = x86_pmu.schedule_events(cpuc, n, assign);
@@ -1000,6 +1011,7 @@ static int x86_pmu_start(struct perf_event *event)
1000 x86_perf_event_set_period(event); 1011 x86_perf_event_set_period(event);
1001 cpuc->events[idx] = event; 1012 cpuc->events[idx] = event;
1002 __set_bit(idx, cpuc->active_mask); 1013 __set_bit(idx, cpuc->active_mask);
1014 __set_bit(idx, cpuc->running);
1003 x86_pmu.enable(event); 1015 x86_pmu.enable(event);
1004 perf_event_update_userpage(event); 1016 perf_event_update_userpage(event);
1005 1017
@@ -1096,7 +1108,7 @@ static void x86_pmu_disable(struct perf_event *event)
1096 * The events never got scheduled and ->cancel_txn will truncate 1108 * The events never got scheduled and ->cancel_txn will truncate
1097 * the event_list. 1109 * the event_list.
1098 */ 1110 */
1099 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) 1111 if (cpuc->group_flag & PERF_EVENT_TXN)
1100 return; 1112 return;
1101 1113
1102 x86_pmu_stop(event); 1114 x86_pmu_stop(event);
@@ -1131,8 +1143,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1131 cpuc = &__get_cpu_var(cpu_hw_events); 1143 cpuc = &__get_cpu_var(cpu_hw_events);
1132 1144
1133 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1145 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1134 if (!test_bit(idx, cpuc->active_mask)) 1146 if (!test_bit(idx, cpuc->active_mask)) {
1147 /*
1148 * Though we deactivated the counter, some CPUs
1149 * might still deliver spurious interrupts that were
1150 * already in flight. Catch them:
1151 */
1152 if (__test_and_clear_bit(idx, cpuc->running))
1153 handled++;
1135 continue; 1154 continue;
1155 }
1136 1156
1137 event = cpuc->events[idx]; 1157 event = cpuc->events[idx];
1138 hwc = &event->hw; 1158 hwc = &event->hw;
@@ -1144,7 +1164,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1144 /* 1164 /*
1145 * event overflow 1165 * event overflow
1146 */ 1166 */
1147 handled = 1; 1167 handled++;
1148 data.period = event->hw.last_period; 1168 data.period = event->hw.last_period;
1149 1169
1150 if (!x86_perf_event_set_period(event)) 1170 if (!x86_perf_event_set_period(event))
@@ -1190,12 +1210,20 @@ void perf_events_lapic_init(void)
1190 apic_write(APIC_LVTPC, APIC_DM_NMI); 1210 apic_write(APIC_LVTPC, APIC_DM_NMI);
1191} 1211}
1192 1212
1213struct pmu_nmi_state {
1214 unsigned int marked;
1215 int handled;
1216};
1217
1218static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
1219
1193static int __kprobes 1220static int __kprobes
1194perf_event_nmi_handler(struct notifier_block *self, 1221perf_event_nmi_handler(struct notifier_block *self,
1195 unsigned long cmd, void *__args) 1222 unsigned long cmd, void *__args)
1196{ 1223{
1197 struct die_args *args = __args; 1224 struct die_args *args = __args;
1198 struct pt_regs *regs; 1225 unsigned int this_nmi;
1226 int handled;
1199 1227
1200 if (!atomic_read(&active_events)) 1228 if (!atomic_read(&active_events))
1201 return NOTIFY_DONE; 1229 return NOTIFY_DONE;
@@ -1204,22 +1232,47 @@ perf_event_nmi_handler(struct notifier_block *self,
1204 case DIE_NMI: 1232 case DIE_NMI:
1205 case DIE_NMI_IPI: 1233 case DIE_NMI_IPI:
1206 break; 1234 break;
1207 1235 case DIE_NMIUNKNOWN:
1236 this_nmi = percpu_read(irq_stat.__nmi_count);
1237 if (this_nmi != __get_cpu_var(pmu_nmi).marked)
1238 /* let the kernel handle the unknown nmi */
1239 return NOTIFY_DONE;
1240 /*
1241 * This one is a PMU back-to-back nmi. Two events
1242 * trigger 'simultaneously' raising two back-to-back
1243 * NMIs. If the first NMI handles both, the latter
1244 * will be empty and daze the CPU. So, we drop it to
1245 * avoid false-positive 'unknown nmi' messages.
1246 */
1247 return NOTIFY_STOP;
1208 default: 1248 default:
1209 return NOTIFY_DONE; 1249 return NOTIFY_DONE;
1210 } 1250 }
1211 1251
1212 regs = args->regs;
1213
1214 apic_write(APIC_LVTPC, APIC_DM_NMI); 1252 apic_write(APIC_LVTPC, APIC_DM_NMI);
1215 /* 1253
1216 * Can't rely on the handled return value to say it was our NMI, two 1254 handled = x86_pmu.handle_irq(args->regs);
1217 * events could trigger 'simultaneously' raising two back-to-back NMIs. 1255 if (!handled)
1218 * 1256 return NOTIFY_DONE;
1219 * If the first NMI handles both, the latter will be empty and daze 1257
1220 * the CPU. 1258 this_nmi = percpu_read(irq_stat.__nmi_count);
1221 */ 1259 if ((handled > 1) ||
1222 x86_pmu.handle_irq(regs); 1260 /* the next nmi could be a back-to-back nmi */
1261 ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
1262 (__get_cpu_var(pmu_nmi).handled > 1))) {
1263 /*
1264 * We could have two subsequent back-to-back nmis: The
1265 * first handles more than one counter, the 2nd
1266 * handles only one counter and the 3rd handles no
1267 * counter.
1268 *
1269 * This is the 2nd nmi because the previous was
1270 * handling more than one counter. We will mark the
1271 * next (3rd) and then drop it if unhandled.
1272 */
1273 __get_cpu_var(pmu_nmi).marked = this_nmi + 1;
1274 __get_cpu_var(pmu_nmi).handled = handled;
1275 }
1223 1276
1224 return NOTIFY_STOP; 1277 return NOTIFY_STOP;
1225} 1278}
@@ -1388,7 +1441,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
1388{ 1441{
1389 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1442 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1390 1443
1391 cpuc->group_flag |= PERF_EVENT_TXN_STARTED; 1444 cpuc->group_flag |= PERF_EVENT_TXN;
1392 cpuc->n_txn = 0; 1445 cpuc->n_txn = 0;
1393} 1446}
1394 1447
@@ -1401,7 +1454,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
1401{ 1454{
1402 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1455 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1403 1456
1404 cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; 1457 cpuc->group_flag &= ~PERF_EVENT_TXN;
1405 /* 1458 /*
1406 * Truncate the collected events. 1459 * Truncate the collected events.
1407 */ 1460 */
@@ -1435,11 +1488,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
1435 */ 1488 */
1436 memcpy(cpuc->assign, assign, n*sizeof(int)); 1489 memcpy(cpuc->assign, assign, n*sizeof(int));
1437 1490
1438 /* 1491 cpuc->group_flag &= ~PERF_EVENT_TXN;
1439 * Clear out the txn count so that ->cancel_txn() which gets
1440 * run after ->commit_txn() doesn't undo things.
1441 */
1442 cpuc->n_txn = 0;
1443 1492
1444 return 0; 1493 return 0;
1445} 1494}
@@ -1607,8 +1656,6 @@ static const struct stacktrace_ops backtrace_ops = {
1607 .walk_stack = print_context_stack_bp, 1656 .walk_stack = print_context_stack_bp,
1608}; 1657};
1609 1658
1610#include "../dumpstack.h"
1611
1612static void 1659static void
1613perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1660perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1614{ 1661{
@@ -1730,22 +1777,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1730 return entry; 1777 return entry;
1731} 1778}
1732 1779
1733void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
1734{
1735 regs->ip = ip;
1736 /*
1737 * perf_arch_fetch_caller_regs adds another call, we need to increment
1738 * the skip level
1739 */
1740 regs->bp = rewind_frame_pointer(skip + 1);
1741 regs->cs = __KERNEL_CS;
1742 /*
1743 * We abuse bit 3 to pass exact information, see perf_misc_flags
1744 * and the comment with PERF_EFLAGS_EXACT.
1745 */
1746 regs->flags = 0;
1747}
1748
1749unsigned long perf_instruction_pointer(struct pt_regs *regs) 1780unsigned long perf_instruction_pointer(struct pt_regs *regs)
1750{ 1781{
1751 unsigned long ip; 1782 unsigned long ip;
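perf_event_nmi_handler() above starts using the per-CPU NMI count to predict and swallow one potential back-to-back PMU NMI: whenever a single NMI handled more than one counter, the next NMI number is "marked", and if that exact NMI later shows up as DIE_NMIUNKNOWN it is dropped. A user-space model of just that bookkeeping, with the NMI number passed in explicitly (an assumption of the sketch, standing in for irq_stat.__nmi_count; the helper names are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

struct pmu_nmi_state {
        unsigned int marked;    /* NMI number expected to be a back-to-back NMI */
        int handled;            /* counters handled by the NMI that marked it */
};

static struct pmu_nmi_state pmu_nmi;

/* called for an unknown NMI: true means "drop it, it was ours" */
static bool drop_unknown_nmi(unsigned int this_nmi)
{
        return this_nmi == pmu_nmi.marked;
}

/* called only when the PMU actually handled 'handled' counters in NMI 'this_nmi' */
static void account_pmu_nmi(unsigned int this_nmi, int handled)
{
        if (handled > 1 ||
            (pmu_nmi.marked == this_nmi && pmu_nmi.handled > 1)) {
                /* the next NMI may be an empty back-to-back one: mark it */
                pmu_nmi.marked = this_nmi + 1;
                pmu_nmi.handled = handled;
        }
}

int main(void)
{
        account_pmu_nmi(10, 2);         /* NMI 10 handled two counters */
        printf("drop NMI 11? %d\n", drop_unknown_nmi(11));      /* 1: swallowed */
        printf("drop NMI 12? %d\n", drop_unknown_nmi(12));      /* 0: real unknown NMI */
        return 0;
}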
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 214ac860ebe0..ee05c90012d2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -491,33 +491,78 @@ static void intel_pmu_enable_all(int added)
491 * Intel Errata AAP53 (model 30) 491 * Intel Errata AAP53 (model 30)
492 * Intel Errata BD53 (model 44) 492 * Intel Errata BD53 (model 44)
493 * 493 *
494 * These chips need to be 'reset' when adding counters by programming 494 * The official story:
495 * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5 495 * These chips need to be 'reset' when adding counters by programming the
496 * either in sequence on the same PMC or on different PMCs. 496 * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
497 * in sequence on the same PMC or on different PMCs.
498 *
 499 * In practice it appears some of these events do in fact count, and
 500 * we need to program all 4 events.
497 */ 501 */
498static void intel_pmu_nhm_enable_all(int added) 502static void intel_pmu_nhm_workaround(void)
499{ 503{
500 if (added) { 504 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
501 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 505 static const unsigned long nhm_magic[4] = {
502 int i; 506 0x4300B5,
507 0x4300D2,
508 0x4300B1,
509 0x4300B1
510 };
511 struct perf_event *event;
512 int i;
513
514 /*
 515 * The erratum requires the steps below:
516 * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
517 * 2) Configure 4 PERFEVTSELx with the magic events and clear
518 * the corresponding PMCx;
519 * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
520 * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
 521 * 5) Clear 4 pairs of PERFEVTSELx and PMCx;
522 */
503 523
504 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2); 524 /*
505 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1); 525 * The real steps we choose are a little different from above.
506 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5); 526 * A) To reduce MSR operations, we don't run step 1) as they
527 * are already cleared before this function is called;
528 * B) Call x86_perf_event_update to save PMCx before configuring
529 * PERFEVTSELx with magic number;
 530 * C) With step 5), we clear a pair only when its PERFEVTSELx is
 531 * not currently in use;
532 * D) Call x86_perf_event_set_period to restore PMCx;
533 */
534
535 /* We always operate 4 pairs of PERF Counters */
536 for (i = 0; i < 4; i++) {
537 event = cpuc->events[i];
538 if (event)
539 x86_perf_event_update(event);
540 }
507 541
508 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3); 542 for (i = 0; i < 4; i++) {
509 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); 543 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
544 wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
545 }
510 546
511 for (i = 0; i < 3; i++) { 547 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
512 struct perf_event *event = cpuc->events[i]; 548 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
513 549
514 if (!event) 550 for (i = 0; i < 4; i++) {
515 continue; 551 event = cpuc->events[i];
516 552
553 if (event) {
554 x86_perf_event_set_period(event);
517 __x86_pmu_enable_event(&event->hw, 555 __x86_pmu_enable_event(&event->hw,
518 ARCH_PERFMON_EVENTSEL_ENABLE); 556 ARCH_PERFMON_EVENTSEL_ENABLE);
519 } 557 } else
558 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
520 } 559 }
560}
561
562static void intel_pmu_nhm_enable_all(int added)
563{
564 if (added)
565 intel_pmu_nhm_workaround();
521 intel_pmu_enable_all(added); 566 intel_pmu_enable_all(added);
522} 567}
523 568
@@ -667,7 +712,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
667 struct perf_sample_data data; 712 struct perf_sample_data data;
668 struct cpu_hw_events *cpuc; 713 struct cpu_hw_events *cpuc;
669 int bit, loops; 714 int bit, loops;
670 u64 ack, status; 715 u64 status;
716 int handled = 0;
671 717
672 perf_sample_data_init(&data, 0); 718 perf_sample_data_init(&data, 0);
673 719
@@ -683,6 +729,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
683 729
684 loops = 0; 730 loops = 0;
685again: 731again:
732 intel_pmu_ack_status(status);
686 if (++loops > 100) { 733 if (++loops > 100) {
687 WARN_ONCE(1, "perfevents: irq loop stuck!\n"); 734 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
688 perf_event_print_debug(); 735 perf_event_print_debug();
@@ -691,19 +738,22 @@ again:
691 } 738 }
692 739
693 inc_irq_stat(apic_perf_irqs); 740 inc_irq_stat(apic_perf_irqs);
694 ack = status;
695 741
696 intel_pmu_lbr_read(); 742 intel_pmu_lbr_read();
697 743
698 /* 744 /*
699 * PEBS overflow sets bit 62 in the global status register 745 * PEBS overflow sets bit 62 in the global status register
700 */ 746 */
701 if (__test_and_clear_bit(62, (unsigned long *)&status)) 747 if (__test_and_clear_bit(62, (unsigned long *)&status)) {
748 handled++;
702 x86_pmu.drain_pebs(regs); 749 x86_pmu.drain_pebs(regs);
750 }
703 751
704 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 752 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
705 struct perf_event *event = cpuc->events[bit]; 753 struct perf_event *event = cpuc->events[bit];
706 754
755 handled++;
756
707 if (!test_bit(bit, cpuc->active_mask)) 757 if (!test_bit(bit, cpuc->active_mask))
708 continue; 758 continue;
709 759
@@ -716,8 +766,6 @@ again:
716 x86_pmu_stop(event); 766 x86_pmu_stop(event);
717 } 767 }
718 768
719 intel_pmu_ack_status(ack);
720
721 /* 769 /*
722 * Repeat if there is more work to be done: 770 * Repeat if there is more work to be done:
723 */ 771 */
@@ -727,7 +775,7 @@ again:
727 775
728done: 776done:
729 intel_pmu_enable_all(0); 777 intel_pmu_enable_all(0);
730 return 1; 778 return handled;
731} 779}
732 780
733static struct event_constraint * 781static struct event_constraint *
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ae85d69644d1..249015173992 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -21,22 +21,36 @@ struct p4_event_bind {
21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ 21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
22}; 22};
23 23
24struct p4_cache_event_bind { 24struct p4_pebs_bind {
25 unsigned int metric_pebs; 25 unsigned int metric_pebs;
26 unsigned int metric_vert; 26 unsigned int metric_vert;
27}; 27};
28 28
29#define P4_GEN_CACHE_EVENT_BIND(name) \ 29/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
30 [P4_CACHE__##name] = { \ 30#define P4_GEN_PEBS_BIND(name, pebs, vert) \
31 .metric_pebs = P4_PEBS__##name, \ 31 [P4_PEBS_METRIC__##name] = { \
32 .metric_vert = P4_VERT__##name, \ 32 .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
33 .metric_vert = vert, \
33 } 34 }
34 35
35static struct p4_cache_event_bind p4_cache_event_bind_map[] = { 36/*
36 P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired), 37 * note we have P4_PEBS_ENABLE_UOP_TAG always set here
37 P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired), 38 *
38 P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired), 39 * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
39 P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired), 40 * event configuration to find out which values are to be
41 * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
 42 * registers
43 */
44static struct p4_pebs_bind p4_pebs_bind_map[] = {
45 P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
46 P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
47 P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
48 P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
49 P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
50 P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
51 P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
52 P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
53 P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
40}; 54};
41 55
42/* 56/*
@@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = {
281 }, 295 },
282}; 296};
283 297
284#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \ 298#define P4_GEN_CACHE_EVENT(event, bit, metric) \
285 p4_config_pack_escr(P4_ESCR_EVENT(event) | \ 299 p4_config_pack_escr(P4_ESCR_EVENT(event) | \
286 P4_ESCR_EMASK_BIT(event, bit)) | \ 300 P4_ESCR_EMASK_BIT(event, bit)) | \
287 p4_config_pack_cccr(cache_event | \ 301 p4_config_pack_cccr(metric | \
288 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) 302 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
289 303
290static __initconst const u64 p4_hw_cache_event_ids 304static __initconst const u64 p4_hw_cache_event_ids
@@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids
296 [ C(OP_READ) ] = { 310 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0, 311 [ C(RESULT_ACCESS) ] = 0x0,
298 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
299 P4_CACHE__1stl_cache_load_miss_retired), 313 P4_PEBS_METRIC__1stl_cache_load_miss_retired),
300 }, 314 },
301 }, 315 },
302 [ C(LL ) ] = { 316 [ C(LL ) ] = {
303 [ C(OP_READ) ] = { 317 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0, 318 [ C(RESULT_ACCESS) ] = 0x0,
305 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 319 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
306 P4_CACHE__2ndl_cache_load_miss_retired), 320 P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
307 }, 321 },
308}, 322},
309 [ C(DTLB) ] = { 323 [ C(DTLB) ] = {
310 [ C(OP_READ) ] = { 324 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x0, 325 [ C(RESULT_ACCESS) ] = 0x0,
312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 326 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
313 P4_CACHE__dtlb_load_miss_retired), 327 P4_PEBS_METRIC__dtlb_load_miss_retired),
314 }, 328 },
315 [ C(OP_WRITE) ] = { 329 [ C(OP_WRITE) ] = {
316 [ C(RESULT_ACCESS) ] = 0x0, 330 [ C(RESULT_ACCESS) ] = 0x0,
317 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 331 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
318 P4_CACHE__dtlb_store_miss_retired), 332 P4_PEBS_METRIC__dtlb_store_miss_retired),
319 }, 333 },
320 }, 334 },
321 [ C(ITLB) ] = { 335 [ C(ITLB) ] = {
322 [ C(OP_READ) ] = { 336 [ C(OP_READ) ] = {
323 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, 337 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
324 P4_CACHE__itlb_reference_hit), 338 P4_PEBS_METRIC__none),
325 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, 339 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
326 P4_CACHE__itlb_reference_miss), 340 P4_PEBS_METRIC__none),
327 }, 341 },
328 [ C(OP_WRITE) ] = { 342 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1, 343 [ C(RESULT_ACCESS) ] = -1,
@@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event)
414 return config; 428 return config;
415} 429}
416 430
431static int p4_validate_raw_event(struct perf_event *event)
432{
433 unsigned int v;
434
435 /* user data may have out-of-bound event index */
436 v = p4_config_unpack_event(event->attr.config);
437 if (v >= ARRAY_SIZE(p4_event_bind_map)) {
438 pr_warning("P4 PMU: Unknown event code: %d\n", v);
439 return -EINVAL;
440 }
441
442 /*
443 * it may have some screwed PEBS bits
444 */
445 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
446 pr_warning("P4 PMU: PEBS are not supported yet\n");
447 return -EINVAL;
448 }
449 v = p4_config_unpack_metric(event->attr.config);
450 if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
451 pr_warning("P4 PMU: Unknown metric code: %d\n", v);
452 return -EINVAL;
453 }
454
455 return 0;
456}
457
417static int p4_hw_config(struct perf_event *event) 458static int p4_hw_config(struct perf_event *event)
418{ 459{
419 int cpu = get_cpu(); 460 int cpu = get_cpu();
420 int rc = 0; 461 int rc = 0;
421 unsigned int evnt;
422 u32 escr, cccr; 462 u32 escr, cccr;
423 463
424 /* 464 /*
@@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event)
438 478
439 if (event->attr.type == PERF_TYPE_RAW) { 479 if (event->attr.type == PERF_TYPE_RAW) {
440 480
441 /* user data may have out-of-bound event index */ 481 rc = p4_validate_raw_event(event);
442 evnt = p4_config_unpack_event(event->attr.config); 482 if (rc)
443 if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
444 rc = -EINVAL;
445 goto out; 483 goto out;
446 }
447 484
448 /* 485 /*
449 * We don't control raw events so it's up to the caller 486 * We don't control raw events so it's up to the caller
@@ -451,12 +488,17 @@ static int p4_hw_config(struct perf_event *event)
451 * on HT machine but allow HT-compatible specifics to be 488 * on HT machine but allow HT-compatible specifics to be
452 * passed on) 489 * passed on)
453 * 490 *
 491 * Note that for RAW events we allow the user to use P4_CCCR_RESERVED
 492 * bits since we keep additional info here (for cache events etc.)
493 *
454 * XXX: HT wide things should check perf_paranoid_cpu() && 494 * XXX: HT wide things should check perf_paranoid_cpu() &&
455 * CAP_SYS_ADMIN 495 * CAP_SYS_ADMIN
456 */ 496 */
457 event->hw.config |= event->attr.config & 497 event->hw.config |= event->attr.config &
458 (p4_config_pack_escr(P4_ESCR_MASK_HT) | 498 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
459 p4_config_pack_cccr(P4_CCCR_MASK_HT)); 499 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
500
501 event->hw.config &= ~P4_CCCR_FORCE_OVF;
460 } 502 }
461 503
462 rc = x86_setup_perfctr(event); 504 rc = x86_setup_perfctr(event);
@@ -482,6 +524,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
482 return overflow; 524 return overflow;
483} 525}
484 526
527static void p4_pmu_disable_pebs(void)
528{
529 /*
530 * FIXME
531 *
 532 * It's still allowed that two threads set up the same cache
 533 * events, so we can't simply clear the metrics until we know
 534 * no one is depending on us; we would need some kind of counter
 535 * for "ReplayEvent" users.
 536 *
 537 * What is more complex -- with RAW events, if the user (for some
 538 * reason) passes a cache event metric with an improper
 539 * event opcode -- it's fine from the hardware's point of view
 540 * but complete nonsense as far as the "meaning" of the action goes.
 541 *
 542 * So for the moment we leave the metrics turned on forever -- it's
 543 * ok for now but this needs to be revisited!
544 *
545 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
546 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
547 */
548}
549
485static inline void p4_pmu_disable_event(struct perf_event *event) 550static inline void p4_pmu_disable_event(struct perf_event *event)
486{ 551{
487 struct hw_perf_event *hwc = &event->hw; 552 struct hw_perf_event *hwc = &event->hw;
@@ -507,6 +572,26 @@ static void p4_pmu_disable_all(void)
507 continue; 572 continue;
508 p4_pmu_disable_event(event); 573 p4_pmu_disable_event(event);
509 } 574 }
575
576 p4_pmu_disable_pebs();
577}
578
579/* configuration must be valid */
580static void p4_pmu_enable_pebs(u64 config)
581{
582 struct p4_pebs_bind *bind;
583 unsigned int idx;
584
585 BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
586
587 idx = p4_config_unpack_metric(config);
588 if (idx == P4_PEBS_METRIC__none)
589 return;
590
591 bind = &p4_pebs_bind_map[idx];
592
593 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
594 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
510} 595}
511 596
512static void p4_pmu_enable_event(struct perf_event *event) 597static void p4_pmu_enable_event(struct perf_event *event)
@@ -515,9 +600,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
515 int thread = p4_ht_config_thread(hwc->config); 600 int thread = p4_ht_config_thread(hwc->config);
516 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); 601 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
517 unsigned int idx = p4_config_unpack_event(hwc->config); 602 unsigned int idx = p4_config_unpack_event(hwc->config);
518 unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
519 struct p4_event_bind *bind; 603 struct p4_event_bind *bind;
520 struct p4_cache_event_bind *bind_cache;
521 u64 escr_addr, cccr; 604 u64 escr_addr, cccr;
522 605
523 bind = &p4_event_bind_map[idx]; 606 bind = &p4_event_bind_map[idx];
@@ -537,16 +620,10 @@ static void p4_pmu_enable_event(struct perf_event *event)
537 cccr = p4_config_unpack_cccr(hwc->config); 620 cccr = p4_config_unpack_cccr(hwc->config);
538 621
539 /* 622 /*
540 * it could be Cache event so that we need to 623 * it could be Cache event so we need to write metrics
541 * set metrics into additional MSRs 624 * into additional MSRs
542 */ 625 */
543 BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK); 626 p4_pmu_enable_pebs(hwc->config);
544 if (idx_cache > P4_CACHE__NONE &&
545 idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
546 bind_cache = &p4_cache_event_bind_map[idx_cache];
547 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
548 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
549 }
550 627
551 (void)checking_wrmsrl(escr_addr, escr_conf); 628 (void)checking_wrmsrl(escr_addr, escr_conf);
552 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 629 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
@@ -581,9 +658,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
581 cpuc = &__get_cpu_var(cpu_hw_events); 658 cpuc = &__get_cpu_var(cpu_hw_events);
582 659
583 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 660 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
661 int overflow;
584 662
585 if (!test_bit(idx, cpuc->active_mask)) 663 if (!test_bit(idx, cpuc->active_mask)) {
664 /* catch in-flight IRQs */
665 if (__test_and_clear_bit(idx, cpuc->running))
666 handled++;
586 continue; 667 continue;
668 }
587 669
588 event = cpuc->events[idx]; 670 event = cpuc->events[idx];
589 hwc = &event->hw; 671 hwc = &event->hw;
@@ -591,12 +673,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
591 WARN_ON_ONCE(hwc->idx != idx); 673 WARN_ON_ONCE(hwc->idx != idx);
592 674
593 /* it might be unflagged overflow */ 675 /* it might be unflagged overflow */
594 handled = p4_pmu_clear_cccr_ovf(hwc); 676 overflow = p4_pmu_clear_cccr_ovf(hwc);
595 677
596 val = x86_perf_event_update(event); 678 val = x86_perf_event_update(event);
597 if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) 679 if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
598 continue; 680 continue;
599 681
682 handled += overflow;
683
600 /* event overflow for sure */ 684 /* event overflow for sure */
601 data.period = event->hw.last_period; 685 data.period = event->hw.last_period;
602 686
@@ -829,6 +913,15 @@ static __initconst const struct x86_pmu p4_pmu = {
829 .max_period = (1ULL << 39) - 1, 913 .max_period = (1ULL << 39) - 1,
830 .hw_config = p4_hw_config, 914 .hw_config = p4_hw_config,
831 .schedule_events = p4_pmu_schedule_events, 915 .schedule_events = p4_pmu_schedule_events,
916 /*
 917 * This handles erratum N15 in Intel doc 249199-029:
 918 * the counter may not be updated correctly on a write,
 919 * so we need a second write operation to do the trick
 920 * (the official workaround didn't work).
 921 *
 922 * The idea is taken from the OProfile code.
923 */
924 .perfctr_second_write = 1,
832}; 925};
833 926
834static __init int p4_pmu_init(void) 927static __init int p4_pmu_init(void)
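The p4_pmu structure above sets perfctr_second_write for the erratum described in its comment; the matching generic code (in the perf_event.c hunk earlier in this diff) simply issues the masked counter write a second time. A tiny sketch of that pattern against a fake MSR array (wrmsrl() here is a stub standing in for the kernel macro, and the mask width is an assumption for the example):

#include <stdint.h>
#include <stdio.h>

static uint64_t fake_msr[4];                    /* stand-in for the perfctr MSRs */
static const uint64_t cntval_mask = (1ULL << 40) - 1;
static const int perfctr_second_write = 1;      /* quirk flag, as on P4 */

static void wrmsrl(unsigned int msr, uint64_t val)
{
        fake_msr[msr] = val;    /* stub: a real wrmsrl programs the MSR */
}

static void program_period(unsigned int idx, int64_t left)
{
        wrmsrl(idx, (uint64_t)(-left) & cntval_mask);
        /* erratum workaround: some CPUs may not latch the first write */
        if (perfctr_second_write)
                wrmsrl(idx, (uint64_t)(-left) & cntval_mask);
}

int main(void)
{
        program_period(0, 100000);
        printf("counter 0 programmed to %#llx\n",
               (unsigned long long)fake_msr[0]);
        return 0;
}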
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..d49079515122
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,64 @@
1/*
 2 * Routines to identify additional cpu features that are scattered in
3 * cpuid space.
4 */
5#include <linux/cpu.h>
6
7#include <asm/pat.h>
8#include <asm/processor.h>
9
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17 u32 sub_leaf;
18};
19
20enum cpuid_regs {
21 CR_EAX = 0,
22 CR_ECX,
23 CR_EDX,
24 CR_EBX
25};
26
27void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
28{
29 u32 max_level;
30 u32 regs[4];
31 const struct cpuid_bit *cb;
32
33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
34 { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 },
35 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
36 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
37 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
38 { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
39 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
40 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
41 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
42 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
43 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
47 { 0, 0, 0, 0, 0 }
48 };
49
50 for (cb = cpuid_bits; cb->feature; cb++) {
51
52 /* Verify that the level is valid */
53 max_level = cpuid_eax(cb->level & 0xffff0000);
54 if (max_level < cb->level ||
55 max_level > (cb->level | 0xffff))
56 continue;
57
58 cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
59 &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
60
61 if (regs[cb->reg] & (1 << cb->bit))
62 set_cpu_cap(c, cb->feature);
63 }
64}
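
The level check above works because cb->level & 0xffff0000 yields the base leaf of the cpuid range (0x00000000 for standard leaves, 0x80000000 for extended ones), and cpuid_eax() on that base returns the highest leaf the CPU implements; a max_level outside base..(base | 0xffff) means the range is absent altogether. Adding a scattered bit is then a one-line table edit. A purely hypothetical example entry (X86_FEATURE_FOO is not a real flag), assuming it lived at CPUID.(EAX=7,ECX=0):EBX bit 0:

	{ X86_FEATURE_FOO, CR_EBX, 0, 0x00000007, 0 },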
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c
index 10fa5684a662..4397e987a1cf 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,62 +1,14 @@
1/* 1/*
2 * Routines to identify additional cpu features that are scattered in 2 * Check for extended topology enumeration cpuid leaf 0xb and if it
3 * cpuid space. 3 * exists, use it for populating initial_apicid and cpu topology
4 * detection.
4 */ 5 */
5#include <linux/cpu.h>
6 6
7#include <linux/cpu.h>
8#include <asm/apic.h>
7#include <asm/pat.h> 9#include <asm/pat.h>
8#include <asm/processor.h> 10#include <asm/processor.h>
9 11
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17};
18
19enum cpuid_regs {
20 CR_EAX = 0,
21 CR_ECX,
22 CR_EDX,
23 CR_EBX
24};
25
26void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
27{
28 u32 max_level;
29 u32 regs[4];
30 const struct cpuid_bit *cb;
31
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
36 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
37 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
38 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
39 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
40 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
41 { 0, 0, 0, 0 }
42 };
43
44 for (cb = cpuid_bits; cb->feature; cb++) {
45
46 /* Verify that the level is valid */
47 max_level = cpuid_eax(cb->level & 0xffff0000);
48 if (max_level < cb->level ||
49 max_level > (cb->level | 0xffff))
50 continue;
51
52 cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
53 &regs[CR_ECX], &regs[CR_EDX]);
54
55 if (regs[cb->reg] & (1 << cb->bit))
56 set_cpu_cap(c, cb->feature);
57 }
58}
59
60/* leaf 0xb SMT level */ 12/* leaf 0xb SMT level */
61#define SMT_LEVEL 0 13#define SMT_LEVEL 0
62 14
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index b9d1ff588445..227b0448960d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void)
51 51
52static unsigned long vmware_get_tsc_khz(void) 52static unsigned long vmware_get_tsc_khz(void)
53{ 53{
54 uint64_t tsc_hz; 54 uint64_t tsc_hz, lpj;
55 uint32_t eax, ebx, ecx, edx; 55 uint32_t eax, ebx, ecx, edx;
56 56
57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); 57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void)
62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", 62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
63 (unsigned long) tsc_hz / 1000, 63 (unsigned long) tsc_hz / 1000,
64 (unsigned long) tsc_hz % 1000); 64 (unsigned long) tsc_hz % 1000);
65
66 if (!preset_lpj) {
67 lpj = ((u64)tsc_hz * 1000);
68 do_div(lpj, HZ);
69 preset_lpj = lpj;
70 }
71
65 return tsc_hz; 72 return tsc_hz;
66} 73}
67 74
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ebd4c51d096a..764c7c2b1811 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -28,6 +28,8 @@
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30 30
31int in_crash_kexec;
32
31#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
32 34
33static void kdump_nmi_callback(int cpu, struct die_args *args) 35static void kdump_nmi_callback(int cpu, struct die_args *args)
@@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
61 63
62static void kdump_nmi_shootdown_cpus(void) 64static void kdump_nmi_shootdown_cpus(void)
63{ 65{
66 in_crash_kexec = 1;
64 nmi_shootdown_cpus(kdump_nmi_callback); 67 nmi_shootdown_cpus(kdump_nmi_callback);
65 68
66 disable_local_APIC(); 69 disable_local_APIC();
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c89a386930b7..6e8752c1bd52 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,7 +18,6 @@
18 18
19#include <asm/stacktrace.h> 19#include <asm/stacktrace.h>
20 20
21#include "dumpstack.h"
22 21
23int panic_on_unrecovered_nmi; 22int panic_on_unrecovered_nmi;
24int panic_on_io_nmi; 23int panic_on_io_nmi;
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
deleted file mode 100644
index e1a93be4fd44..000000000000
--- a/arch/x86/kernel/dumpstack.h
+++ /dev/null
@@ -1,56 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5
6#ifndef DUMPSTACK_H
7#define DUMPSTACK_H
8
9#ifdef CONFIG_X86_32
10#define STACKSLOTS_PER_LINE 8
11#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
12#else
13#define STACKSLOTS_PER_LINE 4
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif
16
17#include <linux/uaccess.h>
18
19extern void
20show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
21 unsigned long *stack, unsigned long bp, char *log_lvl);
22
23extern void
24show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *sp, unsigned long bp, char *log_lvl);
26
27extern unsigned int code_bytes;
28
29/* The form of the top of the frame on the stack */
30struct stack_frame {
31 struct stack_frame *next_frame;
32 unsigned long return_address;
33};
34
35struct stack_frame_ia32 {
36 u32 next_frame;
37 u32 return_address;
38};
39
40static inline unsigned long rewind_frame_pointer(int n)
41{
42 struct stack_frame *frame;
43
44 get_bp(frame);
45
46#ifdef CONFIG_FRAME_POINTER
47 while (n--) {
48 if (probe_kernel_address(&frame->next_frame, frame))
49 break;
50 }
51#endif
52
53 return (unsigned long)frame;
54}
55
56#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 11540a189d93..0f6376ffa2d9 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,8 +16,6 @@
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h"
20
21 19
22void dump_trace(struct task_struct *task, struct pt_regs *regs, 20void dump_trace(struct task_struct *task, struct pt_regs *regs,
23 unsigned long *stack, unsigned long bp, 21 unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 272c9f1f05f3..57a21f11c791 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,7 +16,6 @@
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h"
20 19
21#define N_EXCEPTION_STACKS_END \ 20#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) 21 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index e5cc7e82e60d..ebdb85cf2686 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,7 +18,6 @@
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <asm/iommu.h> 19#include <asm/iommu.h>
20#include <asm/gart.h> 20#include <asm/gart.h>
21#include <asm/hpet.h>
22 21
23static void __init fix_hypertransport_config(int num, int slot, int func) 22static void __init fix_hypertransport_config(int num, int slot, int func)
24{ 23{
@@ -192,21 +191,6 @@ static void __init ati_bugs_contd(int num, int slot, int func)
192} 191}
193#endif 192#endif
194 193
195/*
196 * Force the read back of the CMP register in hpet_next_event()
197 * to work around the problem that the CMP register write seems to be
198 * delayed. See hpet_next_event() for details.
199 *
200 * We do this on all SMBUS incarnations for now until we have more
201 * information about the affected chipsets.
202 */
203static void __init ati_hpet_bugs(int num, int slot, int func)
204{
205#ifdef CONFIG_HPET_TIMER
206 hpet_readback_cmp = 1;
207#endif
208}
209
210#define QFLAG_APPLY_ONCE 0x1 194#define QFLAG_APPLY_ONCE 0x1
211#define QFLAG_APPLIED 0x2 195#define QFLAG_APPLIED 0x2
212#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 196#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -236,8 +220,6 @@ static struct chipset early_qrk[] __initdata = {
236 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, 220 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
237 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, 221 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
238 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, 222 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
239 { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
240 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs },
241 {} 223 {}
242}; 224};
243 225
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index cd49141cf153..227d00920d2f 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -611,14 +611,14 @@ ldt_ss:
611 * compensating for the offset by changing to the ESPFIX segment with 611 * compensating for the offset by changing to the ESPFIX segment with
612 * a base address that matches for the difference. 612 * a base address that matches for the difference.
613 */ 613 */
614#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
614 mov %esp, %edx /* load kernel esp */ 615 mov %esp, %edx /* load kernel esp */
615 mov PT_OLDESP(%esp), %eax /* load userspace esp */ 616 mov PT_OLDESP(%esp), %eax /* load userspace esp */
616 mov %dx, %ax /* eax: new kernel esp */ 617 mov %dx, %ax /* eax: new kernel esp */
617 sub %eax, %edx /* offset (low word is 0) */ 618 sub %eax, %edx /* offset (low word is 0) */
618 PER_CPU(gdt_page, %ebx)
619 shr $16, %edx 619 shr $16, %edx
620 mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ 620 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
621 mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ 621 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
622 pushl $__ESPFIX_SS 622 pushl $__ESPFIX_SS
623 CFI_ADJUST_CFA_OFFSET 4 623 CFI_ADJUST_CFA_OFFSET 4
624 push %eax /* new kernel esp */ 624 push %eax /* new kernel esp */
@@ -791,9 +791,8 @@ ptregs_clone:
791 * normal stack and adjusts ESP with the matching offset. 791 * normal stack and adjusts ESP with the matching offset.
792 */ 792 */
793 /* fixup the stack */ 793 /* fixup the stack */
794 PER_CPU(gdt_page, %ebx) 794 mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
795 mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ 795 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
796 mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
797 shl $16, %eax 796 shl $16, %eax
798 addl %esp, %eax /* the adjusted stack pointer */ 797 addl %esp, %eax /* the adjusted stack pointer */
799 pushl $__KERNEL_DS 798 pushl $__KERNEL_DS
@@ -914,7 +913,7 @@ ENTRY(simd_coprocessor_error)
914 .balign 4 913 .balign 4
915 .long 661b 914 .long 661b
916 .long 663f 915 .long 663f
917 .byte X86_FEATURE_XMM 916 .word X86_FEATURE_XMM
918 .byte 662b-661b 917 .byte 662b-661b
919 .byte 664f-663f 918 .byte 664f-663f
920.previous 919.previous
@@ -1166,6 +1165,9 @@ ENTRY(xen_failsafe_callback)
1166.previous 1165.previous
1167ENDPROC(xen_failsafe_callback) 1166ENDPROC(xen_failsafe_callback)
1168 1167
1168BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
1169 xen_evtchn_do_upcall)
1170
1169#endif /* CONFIG_XEN */ 1171#endif /* CONFIG_XEN */
1170 1172
1171#ifdef CONFIG_FUNCTION_TRACER 1173#ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4db7c4d12ffa..17be5ec7cbba 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1065,6 +1065,7 @@ ENTRY(\sym)
1065END(\sym) 1065END(\sym)
1066.endm 1066.endm
1067 1067
1068#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
1068.macro paranoidzeroentry_ist sym do_sym ist 1069.macro paranoidzeroentry_ist sym do_sym ist
1069ENTRY(\sym) 1070ENTRY(\sym)
1070 INTR_FRAME 1071 INTR_FRAME
@@ -1076,10 +1077,9 @@ ENTRY(\sym)
1076 TRACE_IRQS_OFF 1077 TRACE_IRQS_OFF
1077 movq %rsp,%rdi /* pt_regs pointer */ 1078 movq %rsp,%rdi /* pt_regs pointer */
1078 xorl %esi,%esi /* no error code */ 1079 xorl %esi,%esi /* no error code */
1079 PER_CPU(init_tss, %r12) 1080 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1081 call \do_sym 1081 call \do_sym
1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) 1082 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1083 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1084 CFI_ENDPROC 1084 CFI_ENDPROC
1085END(\sym) 1085END(\sym)
@@ -1185,13 +1185,13 @@ END(kernel_thread_helper)
1185 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1185 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
1186 * 1186 *
1187 * C extern interface: 1187 * C extern interface:
1188 * extern long execve(char *name, char **argv, char **envp) 1188 * extern long execve(const char *name, char **argv, char **envp)
1189 * 1189 *
1190 * asm input arguments: 1190 * asm input arguments:
1191 * rdi: name, rsi: argv, rdx: envp 1191 * rdi: name, rsi: argv, rdx: envp
1192 * 1192 *
1193 * We want to fallback into: 1193 * We want to fallback into:
1194 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) 1194 * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
1195 * 1195 *
1196 * do_sys_execve asm fallback arguments: 1196 * do_sys_execve asm fallback arguments:
1197 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack 1197 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback)
1329 CFI_ENDPROC 1329 CFI_ENDPROC
1330END(xen_failsafe_callback) 1330END(xen_failsafe_callback)
1331 1331
1332apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
1333 xen_hvm_callback_vector xen_evtchn_do_upcall
1334
1332#endif /* CONFIG_XEN */ 1335#endif /* CONFIG_XEN */
1333 1336
1334/* 1337/*
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 37c3d4b17d85..fa8c1b8e09fb 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -131,6 +131,12 @@ ENTRY(startup_32)
131 movsl 131 movsl
1321: 1321:
133 133
134#ifdef CONFIG_OLPC_OPENFIRMWARE
135 /* save OFW's pgdir table for later use when calling into OFW */
136 movl %cr3, %eax
137 movl %eax, pa(olpc_ofw_pgd)
138#endif
139
134#ifdef CONFIG_PARAVIRT 140#ifdef CONFIG_PARAVIRT
135 /* This can only trip for a broken bootloader... */ 141
136 cmpw $0x207, pa(boot_params + BP_version) 142 cmpw $0x207, pa(boot_params + BP_version)
@@ -328,7 +334,7 @@ ENTRY(startup_32_smp)
328/* 334/*
329 * Enable paging 335 * Enable paging
330 */ 336 */
331 movl $pa(swapper_pg_dir),%eax 337 movl pa(initial_page_table), %eax
332 movl %eax,%cr3 /* set the page table pointer.. */ 338 movl %eax,%cr3 /* set the page table pointer.. */
333 movl %cr0,%eax 339 movl %cr0,%eax
334 orl $X86_CR0_PG,%eax 340 orl $X86_CR0_PG,%eax
@@ -608,6 +614,8 @@ ignore_int:
608.align 4 614.align 4
609ENTRY(initial_code) 615ENTRY(initial_code)
610 .long i386_start_kernel 616 .long i386_start_kernel
617ENTRY(initial_page_table)
618 .long pa(swapper_pg_dir)
611 619
612/* 620/*
613 * BSS section 621 * BSS section
@@ -623,6 +631,10 @@ ENTRY(swapper_pg_dir)
623#endif 631#endif
624swapper_pg_fixmap: 632swapper_pg_fixmap:
625 .fill 1024,4,0 633 .fill 1024,4,0
634#ifdef CONFIG_X86_TRAMPOLINE
635ENTRY(trampoline_pg_dir)
636 .fill 1024,4,0
637#endif
626ENTRY(empty_zero_page) 638ENTRY(empty_zero_page)
627 .fill 4096,1,0 639 .fill 4096,1,0
628 640
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3d1e6f16b7a6..239046bd447f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64)
234 * init data section till per cpu areas are set up. 234 * init data section till per cpu areas are set up.
235 */ 235 */
236 movl $MSR_GS_BASE,%ecx 236 movl $MSR_GS_BASE,%ecx
237 movq initial_gs(%rip),%rax 237 movl initial_gs(%rip),%eax
238 movq %rax,%rdx 238 movl initial_gs+4(%rip),%edx
239 shrq $32,%rdx
240 wrmsr 239 wrmsr
241 240
242 /* esi is pointer to real mode structure with interesting info. 241 /* esi is pointer to real mode structure with interesting info.
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba390d731175..7494999141b3 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -16,7 +16,6 @@
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17 17
18#define HPET_MASK CLOCKSOURCE_MASK(32) 18#define HPET_MASK CLOCKSOURCE_MASK(32)
19#define HPET_SHIFT 22
20 19
21/* FSEC = 10^-15 20/* FSEC = 10^-15
22 NSEC = 10^-9 */ 21 NSEC = 10^-9 */
@@ -36,7 +35,6 @@
36unsigned long hpet_address; 35unsigned long hpet_address;
37u8 hpet_blockid; /* OS timer block num */ 36u8 hpet_blockid; /* OS timer block num */
38u8 hpet_msi_disable; 37u8 hpet_msi_disable;
39u8 hpet_readback_cmp;
40 38
41#ifdef CONFIG_PCI_MSI 39#ifdef CONFIG_PCI_MSI
42static unsigned long hpet_num_timers; 40static unsigned long hpet_num_timers;
@@ -396,23 +394,27 @@ static int hpet_next_event(unsigned long delta,
396 * at that point and we would wait for the next hpet interrupt 394 * at that point and we would wait for the next hpet interrupt
397 * forever. We found out that reading the CMP register back 395 * forever. We found out that reading the CMP register back
398 * forces the transfer so we can rely on the comparison with 396 * forces the transfer so we can rely on the comparison with
399 * the counter register below. 397 * the counter register below. If the read back from the
398 * compare register does not match the value we programmed
399 * then we might have a real hardware problem. We can not do
400 * much about it here, but at least alert the user/admin with
401 * a prominent warning.
400 * 402 *
401 * That works fine on those ATI chipsets, but on newer Intel 403 * An erratum on some chipsets (ICH9, ...) results in the
402 * chipsets (ICH9...) this triggers due to an erratum: Reading 404 * comparator read immediately following a write returning
403 * the comparator immediately following a write is returning 405 * the old value. The workaround for this is to read the
404 * the old value. 406 * value a second time, when the first read returns the old value.
405 * 407 *
406 * We restrict the read back to the affected ATI chipsets (set 408 * In fact the write to the comparator register is delayed up
407 * by quirks) and also run it with hpet=verbose for debugging 409 * to two HPET cycles, so the workaround we tried, restricting
408 * purposes. 410 * the readback to the ATI chipsets known to be borked,
411 * failed miserably. So we give up on optimizations forever
412 * and penalize all HPET incarnations unconditionally.
409 */ 413 */
410 if (hpet_readback_cmp || hpet_verbose) { 414 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
411 u32 cmp = hpet_readl(HPET_Tn_CMP(timer)); 415 if (hpet_readl(HPET_Tn_CMP(timer)) != cnt)
412
413 if (cmp != cnt)
414 printk_once(KERN_WARNING 416 printk_once(KERN_WARNING
415 "hpet: compare register read back failed.\n"); 417 "hpet: compare register read back failed.\n");
416 } 418 }
417 419
418 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 420 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
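
The -ETIME return above leans on wrap-safe 32-bit arithmetic: the subtraction happens modulo 2^32 and the result is reinterpreted as signed, so the comparison stays correct across counter rollover. A standalone illustration (plain C, hypothetical values):

	#include <stdint.h>
	#include <stdio.h>

	/* nonzero if 'counter' has already passed 'cmp', modulo 2^32 */
	static int hpet_passed(uint32_t counter, uint32_t cmp)
	{
		return (int32_t)(counter - cmp) >= 0;
	}

	int main(void)
	{
		printf("%d\n", hpet_passed(0x00000005, 0xfffffffb)); /* 1: past it */
		printf("%d\n", hpet_passed(0xfffffffb, 0x00000005)); /* 0: 10 ticks away */
		return 0;
	}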
@@ -504,7 +506,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
504{ 506{
505 unsigned int irq; 507 unsigned int irq;
506 508
507 irq = create_irq(); 509 irq = create_irq_nr(0, -1);
508 if (!irq) 510 if (!irq)
509 return -EINVAL; 511 return -EINVAL;
510 512
@@ -583,7 +585,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
583 * scaled math multiplication factor for nanosecond to hpet tick 585 * scaled math multiplication factor for nanosecond to hpet tick
584 * conversion. 586 * conversion.
585 */ 587 */
586 hpet_freq = 1000000000000000ULL; 588 hpet_freq = FSEC_PER_SEC;
587 do_div(hpet_freq, hpet_period); 589 do_div(hpet_freq, hpet_period);
588 evt->mult = div_sc((unsigned long) hpet_freq, 590 evt->mult = div_sc((unsigned long) hpet_freq,
589 NSEC_PER_SEC, evt->shift); 591 NSEC_PER_SEC, evt->shift);
@@ -787,7 +789,6 @@ static struct clocksource clocksource_hpet = {
787 .rating = 250, 789 .rating = 250,
788 .read = read_hpet, 790 .read = read_hpet,
789 .mask = HPET_MASK, 791 .mask = HPET_MASK,
790 .shift = HPET_SHIFT,
791 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 792 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
792 .resume = hpet_resume_counter, 793 .resume = hpet_resume_counter,
793#ifdef CONFIG_X86_64 794#ifdef CONFIG_X86_64
@@ -798,6 +799,7 @@ static struct clocksource clocksource_hpet = {
798static int hpet_clocksource_register(void) 799static int hpet_clocksource_register(void)
799{ 800{
800 u64 start, now; 801 u64 start, now;
802 u64 hpet_freq;
801 cycle_t t1; 803 cycle_t t1;
802 804
803 /* Start the counter */ 805 /* Start the counter */
@@ -832,9 +834,15 @@ static int hpet_clocksource_register(void)
832 * mult = (hpet_period * 2^shift)/10^6 834 * mult = (hpet_period * 2^shift)/10^6
833 * mult = (hpet_period << shift)/FSEC_PER_NSEC 835 * mult = (hpet_period << shift)/FSEC_PER_NSEC
834 */ 836 */
835 clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
836 837
837 clocksource_register(&clocksource_hpet); 838 /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
839 *
840 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
841 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
842 */
843 hpet_freq = FSEC_PER_SEC;
844 do_div(hpet_freq, hpet_period);
845 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
838 846
839 return 0; 847 return 0;
840} 848}
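
To make the conversion concrete: hpet_period is the tick length in femtoseconds, so the value handed to clocksource_register_hz() is FSEC_PER_SEC / hpet_period. With the common PC period of 69841279 fs that is 10^15 / 69841279 = 14318179 Hz, i.e. the classic 14.31818 MHz HPET. A throwaway check in plain C:

	#include <stdio.h>

	#define FSEC_PER_SEC 1000000000000000ULL

	int main(void)
	{
		unsigned long long hpet_period = 69841279ULL; /* fs per tick */
		printf("%llu Hz\n", FSEC_PER_SEC / hpet_period); /* 14318179 */
		return 0;
	}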
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a8f1b803d2fd..ff15c9dcc25d 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -206,6 +206,25 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
206int arch_bp_generic_fields(int x86_len, int x86_type, 206int arch_bp_generic_fields(int x86_len, int x86_type,
207 int *gen_len, int *gen_type) 207 int *gen_len, int *gen_type)
208{ 208{
209 /* Type */
210 switch (x86_type) {
211 case X86_BREAKPOINT_EXECUTE:
212 if (x86_len != X86_BREAKPOINT_LEN_X)
213 return -EINVAL;
214
215 *gen_type = HW_BREAKPOINT_X;
216 *gen_len = sizeof(long);
217 return 0;
218 case X86_BREAKPOINT_WRITE:
219 *gen_type = HW_BREAKPOINT_W;
220 break;
221 case X86_BREAKPOINT_RW:
222 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
223 break;
224 default:
225 return -EINVAL;
226 }
227
209 /* Len */ 228 /* Len */
210 switch (x86_len) { 229 switch (x86_len) {
211 case X86_BREAKPOINT_LEN_1: 230 case X86_BREAKPOINT_LEN_1:
@@ -226,21 +245,6 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
226 return -EINVAL; 245 return -EINVAL;
227 } 246 }
228 247
229 /* Type */
230 switch (x86_type) {
231 case X86_BREAKPOINT_EXECUTE:
232 *gen_type = HW_BREAKPOINT_X;
233 break;
234 case X86_BREAKPOINT_WRITE:
235 *gen_type = HW_BREAKPOINT_W;
236 break;
237 case X86_BREAKPOINT_RW:
238 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
239 break;
240 default:
241 return -EINVAL;
242 }
243
244 return 0; 248 return 0;
245} 249}
246 250
@@ -251,6 +255,29 @@ static int arch_build_bp_info(struct perf_event *bp)
251 255
252 info->address = bp->attr.bp_addr; 256 info->address = bp->attr.bp_addr;
253 257
258 /* Type */
259 switch (bp->attr.bp_type) {
260 case HW_BREAKPOINT_W:
261 info->type = X86_BREAKPOINT_WRITE;
262 break;
263 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
264 info->type = X86_BREAKPOINT_RW;
265 break;
266 case HW_BREAKPOINT_X:
267 info->type = X86_BREAKPOINT_EXECUTE;
268 /*
269 * x86 inst breakpoints need to have a specific undefined len.
270 * But we still need to check that userspace is not trying to
271 * set up an unsupported length, e.g. to get a range breakpoint.
272 */
273 if (bp->attr.bp_len == sizeof(long)) {
274 info->len = X86_BREAKPOINT_LEN_X;
275 return 0;
276 }
277 default:
278 return -EINVAL;
279 }
280
254 /* Len */ 281 /* Len */
255 switch (bp->attr.bp_len) { 282 switch (bp->attr.bp_len) {
256 case HW_BREAKPOINT_LEN_1: 283 case HW_BREAKPOINT_LEN_1:
@@ -271,21 +298,6 @@ static int arch_build_bp_info(struct perf_event *bp)
271 return -EINVAL; 298 return -EINVAL;
272 } 299 }
273 300
274 /* Type */
275 switch (bp->attr.bp_type) {
276 case HW_BREAKPOINT_W:
277 info->type = X86_BREAKPOINT_WRITE;
278 break;
279 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
280 info->type = X86_BREAKPOINT_RW;
281 break;
282 case HW_BREAKPOINT_X:
283 info->type = X86_BREAKPOINT_EXECUTE;
284 break;
285 default:
286 return -EINVAL;
287 }
288
289 return 0; 301 return 0;
290} 302}
291/* 303/*
@@ -466,6 +478,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
466 478
467 perf_bp_event(bp, args->regs); 479 perf_bp_event(bp, args->regs);
468 480
481 /*
482 * Set up resume flag to avoid breakpoint recursion when
483 * returning to the origin.
484 */
485 if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
486 args->regs->flags |= X86_EFLAGS_RF;
487
469 rcu_read_unlock(); 488 rcu_read_unlock();
470 } 489 }
471 /* 490 /*
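
One consequence of the execute-breakpoint handling above is visible from userspace: a perf instruction breakpoint has to pass bp_len == sizeof(long), which arch_build_bp_info() maps to X86_BREAKPOINT_LEN_X. A hedged sketch of such an attribute setup, using the uapi field names (the address argument is hypothetical):

	#include <linux/hw_breakpoint.h>
	#include <linux/perf_event.h>
	#include <string.h>

	static void init_exec_bp_attr(struct perf_event_attr *attr, unsigned long addr)
	{
		memset(attr, 0, sizeof(*attr));
		attr->type = PERF_TYPE_BREAKPOINT;
		attr->size = sizeof(*attr);
		attr->bp_type = HW_BREAKPOINT_X;
		attr->bp_addr = addr;		/* e.g. a function's address */
		attr->bp_len = sizeof(long);	/* required sentinel for execute bps */
	}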
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index c4444bce8469..a46cb3522c0c 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -40,6 +40,7 @@
40 40
41static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; 41static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
42unsigned int xstate_size; 42unsigned int xstate_size;
43EXPORT_SYMBOL_GPL(xstate_size);
43unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); 44unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
44static struct i387_fxsave_struct fx_scratch __cpuinitdata; 45static struct i387_fxsave_struct fx_scratch __cpuinitdata;
45 46
@@ -59,18 +60,18 @@ void __cpuinit mxcsr_feature_mask_init(void)
59 stts(); 60 stts();
60} 61}
61 62
62void __cpuinit init_thread_xstate(void) 63static void __cpuinit init_thread_xstate(void)
63{ 64{
65 /*
66 * Note that xstate_size might be overwritten later during
67 * xsave_init().
68 */
69
64 if (!HAVE_HWFP) { 70 if (!HAVE_HWFP) {
65 xstate_size = sizeof(struct i387_soft_struct); 71 xstate_size = sizeof(struct i387_soft_struct);
66 return; 72 return;
67 } 73 }
68 74
69 if (cpu_has_xsave) {
70 xsave_cntxt_init();
71 return;
72 }
73
74 if (cpu_has_fxsr) 75 if (cpu_has_fxsr)
75 xstate_size = sizeof(struct i387_fxsave_struct); 76 xstate_size = sizeof(struct i387_fxsave_struct);
76#ifdef CONFIG_X86_32 77#ifdef CONFIG_X86_32
@@ -84,6 +85,7 @@ void __cpuinit init_thread_xstate(void)
84 * Called at bootup to set up the initial FPU state that is later cloned 85 * Called at bootup to set up the initial FPU state that is later cloned
85 * into all processes. 86 * into all processes.
86 */ 87 */
88
87void __cpuinit fpu_init(void) 89void __cpuinit fpu_init(void)
88{ 90{
89 unsigned long oldcr0 = read_cr0(); 91 unsigned long oldcr0 = read_cr0();
@@ -93,19 +95,24 @@ void __cpuinit fpu_init(void)
93 95
94 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 96 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
95 97
96 /*
97 * Boot processor to setup the FP and extended state context info.
98 */
99 if (!smp_processor_id()) 98 if (!smp_processor_id())
100 init_thread_xstate(); 99 init_thread_xstate();
101 xsave_init();
102 100
103 mxcsr_feature_mask_init(); 101 mxcsr_feature_mask_init();
104 /* clean state in init */ 102 /* clean state in init */
105 current_thread_info()->status = 0; 103 current_thread_info()->status = 0;
106 clear_used_math(); 104 clear_used_math();
107} 105}
108#endif /* CONFIG_X86_64 */ 106
107#else /* CONFIG_X86_64 */
108
109void __cpuinit fpu_init(void)
110{
111 if (!smp_processor_id())
112 init_thread_xstate();
113}
114
115#endif /* CONFIG_X86_32 */
109 116
110void fpu_finit(struct fpu *fpu) 117void fpu_finit(struct fpu *fpu)
111{ 118{
@@ -191,6 +198,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
191 if (ret) 198 if (ret)
192 return ret; 199 return ret;
193 200
201 sanitize_i387_state(target);
202
194 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 203 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
195 &target->thread.fpu.state->fxsave, 0, -1); 204 &target->thread.fpu.state->fxsave, 0, -1);
196} 205}
@@ -208,6 +217,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
208 if (ret) 217 if (ret)
209 return ret; 218 return ret;
210 219
220 sanitize_i387_state(target);
221
211 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 222 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
212 &target->thread.fpu.state->fxsave, 0, -1); 223 &target->thread.fpu.state->fxsave, 0, -1);
213 224
@@ -447,6 +458,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
447 -1); 458 -1);
448 } 459 }
449 460
461 sanitize_i387_state(target);
462
450 if (kbuf && pos == 0 && count == sizeof(env)) { 463 if (kbuf && pos == 0 && count == sizeof(env)) {
451 convert_from_fxsr(kbuf, target); 464 convert_from_fxsr(kbuf, target);
452 return 0; 465 return 0;
@@ -468,6 +481,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
468 if (ret) 481 if (ret)
469 return ret; 482 return ret;
470 483
484 sanitize_i387_state(target);
485
471 if (!HAVE_HWFP) 486 if (!HAVE_HWFP)
472 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 487 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
473 488
@@ -534,6 +549,9 @@ static int save_i387_xsave(void __user *buf)
534 struct _fpstate_ia32 __user *fx = buf; 549 struct _fpstate_ia32 __user *fx = buf;
535 int err = 0; 550 int err = 0;
536 551
552
553 sanitize_i387_state(tsk);
554
537 /* 555 /*
538 * For legacy compatible, we always set FP/SSE bits in the bit 556 * For legacy compatible, we always set FP/SSE bits in the bit
539 * vector while saving the state to the user context. 557 * vector while saving the state to the user context.
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 01ab17ae2ae7..852b81967a37 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -49,55 +49,94 @@
49#include <asm/system.h> 49#include <asm/system.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51 51
52/** 52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
53 * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
54 * @gdb_regs: A pointer to hold the registers in the order GDB wants.
55 * @regs: The &struct pt_regs of the current process.
56 *
57 * Convert the pt_regs in @regs into the format for registers that
58 * GDB expects, stored in @gdb_regs.
59 */
60void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
61{ 53{
62#ifndef CONFIG_X86_32 54#ifdef CONFIG_X86_32
63 u32 *gdb_regs32 = (u32 *)gdb_regs; 55 { "ax", 4, offsetof(struct pt_regs, ax) },
56 { "cx", 4, offsetof(struct pt_regs, cx) },
57 { "dx", 4, offsetof(struct pt_regs, dx) },
58 { "bx", 4, offsetof(struct pt_regs, bx) },
59 { "sp", 4, offsetof(struct pt_regs, sp) },
60 { "bp", 4, offsetof(struct pt_regs, bp) },
61 { "si", 4, offsetof(struct pt_regs, si) },
62 { "di", 4, offsetof(struct pt_regs, di) },
63 { "ip", 4, offsetof(struct pt_regs, ip) },
64 { "flags", 4, offsetof(struct pt_regs, flags) },
65 { "cs", 4, offsetof(struct pt_regs, cs) },
66 { "ss", 4, offsetof(struct pt_regs, ss) },
67 { "ds", 4, offsetof(struct pt_regs, ds) },
68 { "es", 4, offsetof(struct pt_regs, es) },
69 { "fs", 4, -1 },
70 { "gs", 4, -1 },
71#else
72 { "ax", 8, offsetof(struct pt_regs, ax) },
73 { "bx", 8, offsetof(struct pt_regs, bx) },
74 { "cx", 8, offsetof(struct pt_regs, cx) },
75 { "dx", 8, offsetof(struct pt_regs, dx) },
76 { "si", 8, offsetof(struct pt_regs, dx) },
77 { "di", 8, offsetof(struct pt_regs, di) },
78 { "bp", 8, offsetof(struct pt_regs, bp) },
79 { "sp", 8, offsetof(struct pt_regs, sp) },
80 { "r8", 8, offsetof(struct pt_regs, r8) },
81 { "r9", 8, offsetof(struct pt_regs, r9) },
82 { "r10", 8, offsetof(struct pt_regs, r10) },
83 { "r11", 8, offsetof(struct pt_regs, r11) },
84 { "r12", 8, offsetof(struct pt_regs, r12) },
85 { "r13", 8, offsetof(struct pt_regs, r13) },
86 { "r14", 8, offsetof(struct pt_regs, r14) },
87 { "r15", 8, offsetof(struct pt_regs, r15) },
88 { "ip", 8, offsetof(struct pt_regs, ip) },
89 { "flags", 4, offsetof(struct pt_regs, flags) },
90 { "cs", 4, offsetof(struct pt_regs, cs) },
91 { "ss", 4, offsetof(struct pt_regs, ss) },
64#endif 92#endif
65 gdb_regs[GDB_AX] = regs->ax; 93};
66 gdb_regs[GDB_BX] = regs->bx; 94
67 gdb_regs[GDB_CX] = regs->cx; 95int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
68 gdb_regs[GDB_DX] = regs->dx; 96{
69 gdb_regs[GDB_SI] = regs->si; 97 if (
70 gdb_regs[GDB_DI] = regs->di;
71 gdb_regs[GDB_BP] = regs->bp;
72 gdb_regs[GDB_PC] = regs->ip;
73#ifdef CONFIG_X86_32 98#ifdef CONFIG_X86_32
74 gdb_regs[GDB_PS] = regs->flags; 99 regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
75 gdb_regs[GDB_DS] = regs->ds; 100#endif
76 gdb_regs[GDB_ES] = regs->es; 101 regno == GDB_SP || regno == GDB_ORIG_AX)
77 gdb_regs[GDB_CS] = regs->cs; 102 return 0;
78 gdb_regs[GDB_FS] = 0xFFFF; 103
79 gdb_regs[GDB_GS] = 0xFFFF; 104 if (dbg_reg_def[regno].offset != -1)
80 if (user_mode_vm(regs)) { 105 memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
81 gdb_regs[GDB_SS] = regs->ss; 106 dbg_reg_def[regno].size);
82 gdb_regs[GDB_SP] = regs->sp; 107 return 0;
83 } else { 108}
84 gdb_regs[GDB_SS] = __KERNEL_DS; 109
85 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 110char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
111{
112 if (regno == GDB_ORIG_AX) {
113 memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
114 return "orig_ax";
86 } 115 }
87#else 116 if (regno >= DBG_MAX_REG_NUM || regno < 0)
88 gdb_regs[GDB_R8] = regs->r8; 117 return NULL;
89 gdb_regs[GDB_R9] = regs->r9; 118
90 gdb_regs[GDB_R10] = regs->r10; 119 if (dbg_reg_def[regno].offset != -1)
91 gdb_regs[GDB_R11] = regs->r11; 120 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
92 gdb_regs[GDB_R12] = regs->r12; 121 dbg_reg_def[regno].size);
93 gdb_regs[GDB_R13] = regs->r13; 122
94 gdb_regs[GDB_R14] = regs->r14; 123 switch (regno) {
95 gdb_regs[GDB_R15] = regs->r15; 124#ifdef CONFIG_X86_32
96 gdb_regs32[GDB_PS] = regs->flags; 125 case GDB_SS:
97 gdb_regs32[GDB_CS] = regs->cs; 126 if (!user_mode_vm(regs))
98 gdb_regs32[GDB_SS] = regs->ss; 127 *(unsigned long *)mem = __KERNEL_DS;
99 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 128 break;
129 case GDB_SP:
130 if (!user_mode_vm(regs))
131 *(unsigned long *)mem = kernel_stack_pointer(regs);
132 break;
133 case GDB_GS:
134 case GDB_FS:
135 *(unsigned long *)mem = 0xFFFF;
136 break;
100#endif 137#endif
138 }
139 return dbg_reg_def[regno].name;
101} 140}
102 141
103/** 142/**
@@ -150,54 +189,13 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
150 gdb_regs[GDB_SP] = p->thread.sp; 189 gdb_regs[GDB_SP] = p->thread.sp;
151} 190}
152 191
153/**
154 * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
155 * @gdb_regs: A pointer to hold the registers we've received from GDB.
156 * @regs: A pointer to a &struct pt_regs to hold these values in.
157 *
158 * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
159 * in @regs.
160 */
161void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
162{
163#ifndef CONFIG_X86_32
164 u32 *gdb_regs32 = (u32 *)gdb_regs;
165#endif
166 regs->ax = gdb_regs[GDB_AX];
167 regs->bx = gdb_regs[GDB_BX];
168 regs->cx = gdb_regs[GDB_CX];
169 regs->dx = gdb_regs[GDB_DX];
170 regs->si = gdb_regs[GDB_SI];
171 regs->di = gdb_regs[GDB_DI];
172 regs->bp = gdb_regs[GDB_BP];
173 regs->ip = gdb_regs[GDB_PC];
174#ifdef CONFIG_X86_32
175 regs->flags = gdb_regs[GDB_PS];
176 regs->ds = gdb_regs[GDB_DS];
177 regs->es = gdb_regs[GDB_ES];
178 regs->cs = gdb_regs[GDB_CS];
179#else
180 regs->r8 = gdb_regs[GDB_R8];
181 regs->r9 = gdb_regs[GDB_R9];
182 regs->r10 = gdb_regs[GDB_R10];
183 regs->r11 = gdb_regs[GDB_R11];
184 regs->r12 = gdb_regs[GDB_R12];
185 regs->r13 = gdb_regs[GDB_R13];
186 regs->r14 = gdb_regs[GDB_R14];
187 regs->r15 = gdb_regs[GDB_R15];
188 regs->flags = gdb_regs32[GDB_PS];
189 regs->cs = gdb_regs32[GDB_CS];
190 regs->ss = gdb_regs32[GDB_SS];
191#endif
192}
193
194static struct hw_breakpoint { 192static struct hw_breakpoint {
195 unsigned enabled; 193 unsigned enabled;
196 unsigned long addr; 194 unsigned long addr;
197 int len; 195 int len;
198 int type; 196 int type;
199 struct perf_event **pev; 197 struct perf_event * __percpu *pev;
200} breakinfo[4]; 198} breakinfo[HBP_NUM];
201 199
202static unsigned long early_dr7; 200static unsigned long early_dr7;
203 201
@@ -205,7 +203,7 @@ static void kgdb_correct_hw_break(void)
205{ 203{
206 int breakno; 204 int breakno;
207 205
208 for (breakno = 0; breakno < 4; breakno++) { 206 for (breakno = 0; breakno < HBP_NUM; breakno++) {
209 struct perf_event *bp; 207 struct perf_event *bp;
210 struct arch_hw_breakpoint *info; 208 struct arch_hw_breakpoint *info;
211 int val; 209 int val;
@@ -292,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
292{ 290{
293 int i; 291 int i;
294 292
295 for (i = 0; i < 4; i++) 293 for (i = 0; i < HBP_NUM; i++)
296 if (breakinfo[i].addr == addr && breakinfo[i].enabled) 294 if (breakinfo[i].addr == addr && breakinfo[i].enabled)
297 break; 295 break;
298 if (i == 4) 296 if (i == HBP_NUM)
299 return -1; 297 return -1;
300 298
301 if (hw_break_release_slot(i)) { 299 if (hw_break_release_slot(i)) {
@@ -313,7 +311,7 @@ static void kgdb_remove_all_hw_break(void)
313 int cpu = raw_smp_processor_id(); 311 int cpu = raw_smp_processor_id();
314 struct perf_event *bp; 312 struct perf_event *bp;
315 313
316 for (i = 0; i < 4; i++) { 314 for (i = 0; i < HBP_NUM; i++) {
317 if (!breakinfo[i].enabled) 315 if (!breakinfo[i].enabled)
318 continue; 316 continue;
319 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
@@ -333,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
333{ 331{
334 int i; 332 int i;
335 333
336 for (i = 0; i < 4; i++) 334 for (i = 0; i < HBP_NUM; i++)
337 if (!breakinfo[i].enabled) 335 if (!breakinfo[i].enabled)
338 break; 336 break;
339 if (i == 4) 337 if (i == HBP_NUM)
340 return -1; 338 return -1;
341 339
342 switch (bptype) { 340 switch (bptype) {
@@ -397,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
397 395
398 /* Disable hardware debugging while we are in kgdb: */ 396 /* Disable hardware debugging while we are in kgdb: */
399 set_debugreg(0UL, 7); 397 set_debugreg(0UL, 7);
400 for (i = 0; i < 4; i++) { 398 for (i = 0; i < HBP_NUM; i++) {
401 if (!breakinfo[i].enabled) 399 if (!breakinfo[i].enabled)
402 continue; 400 continue;
403 if (dbg_is_early) { 401 if (dbg_is_early) {
@@ -458,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
458{ 456{
459 unsigned long addr; 457 unsigned long addr;
460 char *ptr; 458 char *ptr;
461 int newPC;
462 459
463 switch (remcomInBuffer[0]) { 460 switch (remcomInBuffer[0]) {
464 case 'c': 461 case 'c':
@@ -469,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
469 linux_regs->ip = addr; 466 linux_regs->ip = addr;
470 case 'D': 467 case 'D':
471 case 'k': 468 case 'k':
472 newPC = linux_regs->ip;
473
474 /* clear the trace bit */ 469 /* clear the trace bit */
475 linux_regs->flags &= ~X86_EFLAGS_TF; 470 linux_regs->flags &= ~X86_EFLAGS_TF;
476 atomic_set(&kgdb_cpu_doing_single_step, -1); 471 atomic_set(&kgdb_cpu_doing_single_step, -1);
@@ -645,7 +640,7 @@ void kgdb_arch_late(void)
645 attr.bp_len = HW_BREAKPOINT_LEN_1; 640 attr.bp_len = HW_BREAKPOINT_LEN_1;
646 attr.bp_type = HW_BREAKPOINT_W; 641 attr.bp_type = HW_BREAKPOINT_W;
647 attr.disabled = 1; 642 attr.disabled = 1;
648 for (i = 0; i < 4; i++) { 643 for (i = 0; i < HBP_NUM; i++) {
649 if (breakinfo[i].pev) 644 if (breakinfo[i].pev)
650 continue; 645 continue;
651 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 646 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 675879b65ce6..770ebfb349e9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to)
126} 126}
127 127
128/* 128/*
129 * Check for the REX prefix which can only exist on X86_64 129 * Skip the prefixes of the instruction.
130 * X86_32 always returns 0
131 */ 130 */
132static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) 131static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
133{ 132{
133 insn_attr_t attr;
134
135 attr = inat_get_opcode_attribute((insn_byte_t)*insn);
136 while (inat_is_legacy_prefix(attr)) {
137 insn++;
138 attr = inat_get_opcode_attribute((insn_byte_t)*insn);
139 }
134#ifdef CONFIG_X86_64 140#ifdef CONFIG_X86_64
135 if ((*insn & 0xf0) == 0x40) 141 if (inat_is_rex_prefix(attr))
136 return 1; 142 insn++;
137#endif 143#endif
138 return 0; 144 return insn;
139} 145}
140 146
141/* 147/*
@@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr)
272 */ 278 */
273static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) 279static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
274{ 280{
281 /* Skip prefixes */
282 insn = skip_prefixes(insn);
283
275 switch (*insn) { 284 switch (*insn) {
276 case 0xfa: /* cli */ 285 case 0xfa: /* cli */
277 case 0xfb: /* sti */ 286 case 0xfb: /* sti */
@@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
280 return 1; 289 return 1;
281 } 290 }
282 291
283 /*
284 * on X86_64, 0x40-0x4f are REX prefixes so we need to look
285 * at the next byte instead.. but of course not recurse infinitely
286 */
287 if (is_REX_prefix(insn))
288 return is_IF_modifier(++insn);
289
290 return 0; 292 return 0;
291} 293}
292 294
@@ -707,6 +709,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
707 struct hlist_node *node, *tmp; 709 struct hlist_node *node, *tmp;
708 unsigned long flags, orig_ret_address = 0; 710 unsigned long flags, orig_ret_address = 0;
709 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; 711 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
712 kprobe_opcode_t *correct_ret_addr = NULL;
710 713
711 INIT_HLIST_HEAD(&empty_rp); 714 INIT_HLIST_HEAD(&empty_rp);
712 kretprobe_hash_lock(current, &head, &flags); 715 kretprobe_hash_lock(current, &head, &flags);
@@ -738,14 +741,34 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
738 /* another task is sharing our hash bucket */ 741 /* another task is sharing our hash bucket */
739 continue; 742 continue;
740 743
744 orig_ret_address = (unsigned long)ri->ret_addr;
745
746 if (orig_ret_address != trampoline_address)
747 /*
748 * This is the real return address. Any other
749 * instances associated with this task are for
750 * other calls deeper on the call stack
751 */
752 break;
753 }
754
755 kretprobe_assert(ri, orig_ret_address, trampoline_address);
756
757 correct_ret_addr = ri->ret_addr;
758 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
759 if (ri->task != current)
760 /* another task is sharing our hash bucket */
761 continue;
762
763 orig_ret_address = (unsigned long)ri->ret_addr;
741 if (ri->rp && ri->rp->handler) { 764 if (ri->rp && ri->rp->handler) {
742 __get_cpu_var(current_kprobe) = &ri->rp->kp; 765 __get_cpu_var(current_kprobe) = &ri->rp->kp;
743 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; 766 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
767 ri->ret_addr = correct_ret_addr;
744 ri->rp->handler(ri, regs); 768 ri->rp->handler(ri, regs);
745 __get_cpu_var(current_kprobe) = NULL; 769 __get_cpu_var(current_kprobe) = NULL;
746 } 770 }
747 771
748 orig_ret_address = (unsigned long)ri->ret_addr;
749 recycle_rp_inst(ri, &empty_rp); 772 recycle_rp_inst(ri, &empty_rp);
750 773
751 if (orig_ret_address != trampoline_address) 774 if (orig_ret_address != trampoline_address)
@@ -757,8 +780,6 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
757 break; 780 break;
758 } 781 }
759 782
760 kretprobe_assert(ri, orig_ret_address, trampoline_address);
761
762 kretprobe_hash_unlock(current, &flags); 783 kretprobe_hash_unlock(current, &flags);
763 784
764 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 785 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
@@ -803,9 +824,8 @@ static void __kprobes resume_execution(struct kprobe *p,
803 unsigned long orig_ip = (unsigned long)p->addr; 824 unsigned long orig_ip = (unsigned long)p->addr;
804 kprobe_opcode_t *insn = p->ainsn.insn; 825 kprobe_opcode_t *insn = p->ainsn.insn;
805 826
806 /*skip the REX prefix*/ 827 /* Skip prefixes */
807 if (is_REX_prefix(insn)) 828 insn = skip_prefixes(insn);
808 insn++;
809 829
810 regs->flags &= ~X86_EFLAGS_TF; 830 regs->flags &= ~X86_EFLAGS_TF;
811 switch (*insn) { 831 switch (*insn) {
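
As an illustration of skip_prefixes() above: on x86-64 the byte sequence 2e 48 ff e0 (a cs segment override, a REX.W prefix, then jmp *%rax) yields the opcode byte 0xff after two skips. A toy userspace re-implementation, for illustration only, since the real code consults the kernel's inat_* attribute tables:

	#include <stdio.h>

	static int is_legacy_prefix(unsigned char b)
	{
		switch (b) {
		case 0x26: case 0x2e: case 0x36: case 0x3e:	/* segment overrides */
		case 0x64: case 0x65:				/* fs/gs overrides */
		case 0x66: case 0x67:				/* opsize/addrsize */
		case 0xf0: case 0xf2: case 0xf3:		/* lock/repne/rep */
			return 1;
		}
		return 0;
	}

	static const unsigned char *skip_prefixes_toy(const unsigned char *insn)
	{
		while (is_legacy_prefix(*insn))
			insn++;
		if ((*insn & 0xf0) == 0x40)	/* x86-64 REX prefix */
			insn++;
		return insn;
	}

	int main(void)
	{
		const unsigned char insn[] = { 0x2e, 0x48, 0xff, 0xe0 };
		printf("opcode byte: 0x%02x\n", *skip_prefixes_toy(insn)); /* 0xff */
		return 0;
	}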
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e0bc186d7501..1c355c550960 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -239,11 +239,10 @@ int module_finalize(const Elf_Ehdr *hdr,
239 apply_paravirt(pseg, pseg + para->sh_size); 239 apply_paravirt(pseg, pseg + para->sh_size);
240 } 240 }
241 241
242 return module_bug_finalize(hdr, sechdrs, me); 242 return 0;
243} 243}
244 244
245void module_arch_cleanup(struct module *mod) 245void module_arch_cleanup(struct module *mod)
246{ 246{
247 alternatives_smp_module_del(mod); 247 alternatives_smp_module_del(mod);
248 module_bug_cleanup(mod);
249} 248}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d86dbf7e54be..d7b6f7fb4fec 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
274 274
275void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } 275void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
276 276
277static void __init smp_register_lapic_address(unsigned long address)
278{
279 mp_lapic_addr = address;
280
281 set_fixmap_nocache(FIX_APIC_BASE, address);
282 if (boot_cpu_physical_apicid == -1U) {
283 boot_cpu_physical_apicid = read_apic_id();
284 apic_version[boot_cpu_physical_apicid] =
285 GET_APIC_VERSION(apic_read(APIC_LVR));
286 }
287}
288
277static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 289static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
278{ 290{
279 char str[16]; 291 char str[16];
@@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
295 if (early) 307 if (early)
296 return 1; 308 return 1;
297 309
310 /* Initialize the lapic mapping */
311 if (!acpi_lapic)
312 smp_register_lapic_address(mpc->lapic);
313
298 if (mpc->oemptr) 314 if (mpc->oemptr)
299 x86_init.mpparse.smp_read_mpc_oem(mpc); 315 x86_init.mpparse.smp_read_mpc_oem(mpc);
300 316
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 5915e0b33303..79ae68154e87 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -25,8 +25,34 @@
25#include <asm/i8259.h> 25#include <asm/i8259.h>
26#include <asm/apb_timer.h> 26#include <asm/apb_timer.h>
27 27
28/*
29 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
30 * cmdline option x86_mrst_timer can be used to override the configuration
31 * to prefer one or the other.
32 * at runtime, there are basically three timer configurations:
33 * 1. per cpu apbt clock only
34 * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
35 * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
36 *
37 * by default (without cmdline option), platform code first detects cpu type
38 * to see if we are on lincroft or penwell, then set up both lapic or apbt
39 * clocks accordingly.
40 * i.e. by default, medfield uses configuration #2, moorestown uses #1.
41 * config #3 is supported but not recommended on medfield.
42 *
43 * rating and feature summary:
44 * lapic (with C3STOP) --------- 100
45 * apbt (always-on) ------------ 110
46 * lapic (always-on,ARAT) ------ 150
47 */
48
49__cpuinitdata enum mrst_timer_options mrst_timer_options;
50
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; 51static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; 52static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
53enum mrst_cpu_type __mrst_cpu_chip;
54EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
55
30int sfi_mtimer_num; 56int sfi_mtimer_num;
31 57
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; 58struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
167 return 0; 193 return 0;
168} 194}
169 195
170/*
171 * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
172 * APBT but cmdline option can also override it.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void) 196static unsigned long __init mrst_calibrate_tsc(void)
183{ 197{
184 unsigned long flags, fast_calibrate; 198 unsigned long flags, fast_calibrate;
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void)
195 209
196void __init mrst_time_init(void) 210void __init mrst_time_init(void)
197{ 211{
212 switch (mrst_timer_options) {
213 case MRST_TIMER_APBT_ONLY:
214 break;
215 case MRST_TIMER_LAPIC_APBT:
216 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
217 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
218 break;
219 default:
220 if (!boot_cpu_has(X86_FEATURE_ARAT))
221 break;
222 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
223 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
224 return;
225 }
226 /* we need at least one APB timer */
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); 227 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0(); 228 pre_init_apic_IRQ0();
200 apbt_time_init(); 229 apbt_time_init();
@@ -205,16 +234,21 @@ void __init mrst_rtc_init(void)
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); 234 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206} 235}
207 236
208/* 237void __cpuinit mrst_arch_setup(void)
209 * if we use per cpu apb timer, the bootclock already setup. if we use lapic
210 * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{ 238{
214 pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); 239 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
215 if (disable_apbt_percpu) 240 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
216 setup_boot_APIC_clock(); 241 else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
217}; 242 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
243 else {
244 pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
245 boot_cpu_data.x86, boot_cpu_data.x86_model);
246 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
247 }
248 pr_debug("Moorestown CPU %s identified\n",
249 (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
250 "Lincroft" : "Penwell");
251}
218 252
219/* MID systems don't have i8042 controller */ 253/* MID systems don't have i8042 controller */
220static int mrst_i8042_detect(void) 254static int mrst_i8042_detect(void)
@@ -232,11 +266,13 @@ void __init x86_mrst_early_setup(void)
232 x86_init.resources.reserve_resources = x86_init_noop; 266 x86_init.resources.reserve_resources = x86_init_noop;
233 267
234 x86_init.timers.timer_init = mrst_time_init; 268 x86_init.timers.timer_init = mrst_time_init;
235 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; 269 x86_init.timers.setup_percpu_clockev = x86_init_noop;
236 270
237 x86_init.irqs.pre_vector_init = x86_init_noop; 271 x86_init.irqs.pre_vector_init = x86_init_noop;
238 272
239 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; 273 x86_init.oem.arch_setup = mrst_arch_setup;
274
275 x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
240 276
241 x86_platform.calibrate_tsc = mrst_calibrate_tsc; 277 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
242 x86_platform.i8042_detect = mrst_i8042_detect; 278 x86_platform.i8042_detect = mrst_i8042_detect;
@@ -250,3 +286,26 @@ void __init x86_mrst_early_setup(void)
250 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 286 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
251 287
252} 288}
289
290/*
291 * if the user does not want to use the per CPU apb timer, just give it a
292 * lower rating than the local apic timer and skip the late per cpu timer init.
293 */
294static inline int __init setup_x86_mrst_timer(char *arg)
295{
296 if (!arg)
297 return -EINVAL;
298
299 if (strcmp("apbt_only", arg) == 0)
300 mrst_timer_options = MRST_TIMER_APBT_ONLY;
301 else if (strcmp("lapic_and_apbt", arg) == 0)
302 mrst_timer_options = MRST_TIMER_LAPIC_APBT;
303 else {
304 pr_warning("X86 MRST timer option %s not recognised,"
305 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
306 arg);
307 return -EINVAL;
308 }
309 return 0;
310}
311__setup("x86_mrst_timer=", setup_x86_mrst_timer);
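
For reference, the two strings above are the complete set of accepted values: booting with the command line option below selects configuration #1 from the comment at the top of this file,

	x86_mrst_timer=apbt_only

while lapic_and_apbt selects configuration #3, and anything else falls through to the ARAT-based default in mrst_time_init().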
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 8297160c41b3..0e0cdde519be 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -21,10 +21,7 @@
21#include <asm/geode.h> 21#include <asm/geode.h>
22#include <asm/setup.h> 22#include <asm/setup.h>
23#include <asm/olpc.h> 23#include <asm/olpc.h>
24 24#include <asm/olpc_ofw.h>
25#ifdef CONFIG_OPEN_FIRMWARE
26#include <asm/ofw.h>
27#endif
28 25
29struct olpc_platform_t olpc_platform_info; 26struct olpc_platform_t olpc_platform_info;
30EXPORT_SYMBOL_GPL(olpc_platform_info); 27EXPORT_SYMBOL_GPL(olpc_platform_info);
@@ -145,7 +142,7 @@ restart:
145 * The OBF flag will sometimes misbehave due to what we believe 142 * The OBF flag will sometimes misbehave due to what we believe
146 * is a hardware quirk.. 143 * is a hardware quirk..
147 */ 144 */
148 printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); 145 pr_devel("olpc-ec: running cmd 0x%x\n", cmd);
149 outb(cmd, 0x6c); 146 outb(cmd, 0x6c);
150 147
151 if (wait_on_ibf(0x6c, 0)) { 148 if (wait_on_ibf(0x6c, 0)) {
@@ -162,8 +159,7 @@ restart:
162 " EC accept data!\n"); 159 " EC accept data!\n");
163 goto err; 160 goto err;
164 } 161 }
165 printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", 162 pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
166 inbuf[i]);
167 outb(inbuf[i], 0x68); 163 outb(inbuf[i], 0x68);
168 } 164 }
169 } 165 }
@@ -176,8 +172,7 @@ restart:
176 goto restart; 172 goto restart;
177 } 173 }
178 outbuf[i] = inb(0x68); 174 outbuf[i] = inb(0x68);
179 printk(KERN_DEBUG "olpc-ec: received 0x%x\n", 175 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
180 outbuf[i]);
181 } 176 }
182 } 177 }
183 178
@@ -188,14 +183,15 @@ err:
188} 183}
189EXPORT_SYMBOL_GPL(olpc_ec_cmd); 184EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190 185
191#ifdef CONFIG_OPEN_FIRMWARE 186#ifdef CONFIG_OLPC_OPENFIRMWARE
192static void __init platform_detect(void) 187static void __init platform_detect(void)
193{ 188{
194 size_t propsize; 189 size_t propsize;
195 __be32 rev; 190 __be32 rev;
191 const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
192 void *res[] = { &propsize };
196 193
197 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 194 if (olpc_ofw("getprop", args, res) || propsize != 4) {
198 &propsize) || propsize != 4) {
199 printk(KERN_ERR "ofw: getprop call failed!\n"); 195 printk(KERN_ERR "ofw: getprop call failed!\n");
200 rev = cpu_to_be32(0); 196 rev = cpu_to_be32(0);
201 } 197 }
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
new file mode 100644
index 000000000000..3218aa71ab5e
--- /dev/null
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -0,0 +1,106 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <asm/page.h>
5#include <asm/setup.h>
6#include <asm/io.h>
7#include <asm/pgtable.h>
8#include <asm/olpc_ofw.h>
9
10/* address of OFW callback interface; will be NULL if OFW isn't found */
11static int (*olpc_ofw_cif)(int *);
12
13/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
14u32 olpc_ofw_pgd __initdata;
15
16static DEFINE_SPINLOCK(ofw_lock);
17
18#define MAXARGS 10
19
20void __init setup_olpc_ofw_pgd(void)
21{
22 pgd_t *base, *ofw_pde;
23
24 if (!olpc_ofw_cif)
25 return;
26
27 /* fetch OFW's PDE */
28 base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
29 if (!base) {
30 printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
31 olpc_ofw_cif = NULL;
32 return;
33 }
34 ofw_pde = &base[OLPC_OFW_PDE_NR];
35
36 /* install OFW's PDE permanently into the kernel's pgtable */
37 set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
38 /* implicit optimization barrier here due to uninline function return */
39
40 early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
41}
42
43int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
44 void **res)
45{
46 int ofw_args[MAXARGS + 3];
47 unsigned long flags;
48 int ret, i, *p;
49
50 BUG_ON(nr_args + nr_res > MAXARGS);
51
52 if (!olpc_ofw_cif)
53 return -EIO;
54
55 ofw_args[0] = (int)name;
56 ofw_args[1] = nr_args;
57 ofw_args[2] = nr_res;
58
59 p = &ofw_args[3];
60 for (i = 0; i < nr_args; i++, p++)
61 *p = (int)args[i];
62
63 /* call into ofw */
64 spin_lock_irqsave(&ofw_lock, flags);
65 ret = olpc_ofw_cif(ofw_args);
66 spin_unlock_irqrestore(&ofw_lock, flags);
67
68 if (!ret) {
69 for (i = 0; i < nr_res; i++, p++)
70 *((int *)res[i]) = *p;
71 }
72
73 return ret;
74}
75EXPORT_SYMBOL_GPL(__olpc_ofw);
76
77/* OFW cif _should_ be above this address */
78#define OFW_MIN 0xff000000
79
80/* OFW starts on a 1MB boundary */
81#define OFW_BOUND (1<<20)
82
83void __init olpc_ofw_detect(void)
84{
85 struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
86 unsigned long start;
87
88 /* ensure OFW booted us by checking for "OFW " string */
89 if (hdr->ofw_magic != OLPC_OFW_SIG)
90 return;
91
92 olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
93
94 if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
95 printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
96 (unsigned long)olpc_ofw_cif);
97 olpc_ofw_cif = NULL;
98 return;
99 }
100
101 /* determine where OFW starts in memory */
102 start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
103 printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
104 (unsigned long)olpc_ofw_cif, (-start) >> 20);
105 reserve_top_address(-start);
106}
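As a usage sketch of the new CIF entry point: callers drive __olpc_ofw() with plain pointer arrays, and the olpc.c hunk above suggests olpc_ofw() in <asm/olpc_ofw.h> is a convenience macro that forwards the two array sizes. A minimal, hedged caller (illustrative only, mirroring the getprop call in platform_detect(); the helper name is hypothetical) might look like:

	#include <linux/types.h>
	#include <asm/olpc_ofw.h>

	/* illustrative only: query the board revision property */
	static int __init board_revision(__be32 *rev)
	{
		size_t propsize;
		/* args follow the original call: node handle, property name,
		 * output buffer, buffer length */
		const void *args[] = { NULL, "board-revision-int", rev, (void *)4 };
		void *res[] = { &propsize };

		/* olpc_ofw() is assumed to expand to
		 * __olpc_ofw(name, ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) */
		if (olpc_ofw("getprop", args, res) || propsize != 4)
			return -EIO;
		return 0;
	}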
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 4b7e3d8b01dd..9f07cfcbd3a5 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -13,6 +13,7 @@
13#include <asm/calgary.h> 13#include <asm/calgary.h>
14#include <asm/amd_iommu.h> 14#include <asm/amd_iommu.h>
15#include <asm/x86_init.h> 15#include <asm/x86_init.h>
16#include <asm/xen/swiotlb-xen.h>
16 17
17static int forbid_dac __read_mostly; 18static int forbid_dac __read_mostly;
18 19
@@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void)
132 /* free the range so iommu could get some range less than 4G */ 133 /* free the range so iommu could get some range less than 4G */
133 dma32_free_bootmem(); 134 dma32_free_bootmem();
134 135
135 if (pci_swiotlb_detect()) 136 if (pci_xen_swiotlb_detect() || pci_swiotlb_detect())
136 goto out; 137 goto out;
137 138
138 gart_iommu_hole_init(); 139 gart_iommu_hole_init();
@@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void)
144 /* needs to be called after gart_iommu_hole_init */ 145 /* needs to be called after gart_iommu_hole_init */
145 amd_iommu_detect(); 146 amd_iommu_detect();
146out: 147out:
148 pci_xen_swiotlb_init();
149
147 pci_swiotlb_init(); 150 pci_swiotlb_init();
148} 151}
149 152
@@ -296,7 +299,7 @@ static int __init pci_iommu_init(void)
296#endif 299#endif
297 x86_init.iommu.iommu_init(); 300 x86_init.iommu.iommu_init();
298 301
299 if (swiotlb) { 302 if (swiotlb || xen_swiotlb) {
300 printk(KERN_INFO "PCI-DMA: " 303 printk(KERN_INFO "PCI-DMA: "
301 "Using software bounce buffering for IO (SWIOTLB)\n"); 304 "Using software bounce buffering for IO (SWIOTLB)\n");
302 swiotlb_print_info(); 305 swiotlb_print_info();
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cbcf013a0ec6..57d1868a86aa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -301,8 +301,9 @@ EXPORT_SYMBOL(kernel_thread);
301/* 301/*
302 * sys_execve() executes a new program. 302 * sys_execve() executes a new program.
303 */ 303 */
304long sys_execve(char __user *name, char __user * __user *argv, 304long sys_execve(const char __user *name,
305 char __user * __user *envp, struct pt_regs *regs) 305 const char __user *const __user *argv,
306 const char __user *const __user *envp, struct pt_regs *regs)
306{ 307{
307 long error; 308 long error;
308 char *filename; 309 char *filename;
@@ -526,44 +527,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
526 return (edx & MWAIT_EDX_C1); 527 return (edx & MWAIT_EDX_C1);
527} 528}
528 529
529/* 530bool c1e_detected;
530 * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. 531EXPORT_SYMBOL(c1e_detected);
531 * For more information see
532 * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
533 * - Erratum #365 for family 0x11 (not affected because C1e not in use)
534 */
535static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
536{
537 u64 val;
538 if (c->x86_vendor != X86_VENDOR_AMD)
539 goto no_c1e_idle;
540
541 /* Family 0x0f models < rev F do not have C1E */
542 if (c->x86 == 0x0F && c->x86_model >= 0x40)
543 return 1;
544
545 if (c->x86 == 0x10) {
546 /*
547 * check OSVW bit for CPUs that are not affected
548 * by erratum #400
549 */
550 if (cpu_has(c, X86_FEATURE_OSVW)) {
551 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
552 if (val >= 2) {
553 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
554 if (!(val & BIT(1)))
555 goto no_c1e_idle;
556 }
557 }
558 return 1;
559 }
560
561no_c1e_idle:
562 return 0;
563}
564 532
565static cpumask_var_t c1e_mask; 533static cpumask_var_t c1e_mask;
566static int c1e_detected;
567 534
568void c1e_remove_cpu(int cpu) 535void c1e_remove_cpu(int cpu)
569{ 536{
@@ -585,12 +552,12 @@ static void c1e_idle(void)
585 u32 lo, hi; 552 u32 lo, hi;
586 553
587 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 554 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
555
588 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 556 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
589 c1e_detected = 1; 557 c1e_detected = true;
590 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 558 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
591 mark_tsc_unstable("TSC halt in AMD C1E"); 559 mark_tsc_unstable("TSC halt in AMD C1E");
592 printk(KERN_INFO "System has AMD C1E enabled\n"); 560 printk(KERN_INFO "System has AMD C1E enabled\n");
593 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
594 } 561 }
595 } 562 }
596 563
@@ -639,7 +606,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
639 */ 606 */
640 printk(KERN_INFO "using mwait in idle threads.\n"); 607 printk(KERN_INFO "using mwait in idle threads.\n");
641 pm_idle = mwait_idle; 608 pm_idle = mwait_idle;
642 } else if (check_c1e_idle(c)) { 609 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
610 /* E400: APIC timer interrupt does not wake up CPU from C1e */
643 printk(KERN_INFO "using C1E aware idle routine\n"); 611 printk(KERN_INFO "using C1E aware idle routine\n");
644 pm_idle = c1e_idle; 612 pm_idle = c1e_idle;
645 } else 613 } else
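The check_c1e_idle() logic removed above is what the new cpu_has_amd_erratum(amd_erratum_400) test folds into cpu/amd.c. A minimal sketch of the OSVW half of that check, following the deleted code (the real helper additionally matches family/model ranges; the function name here is illustrative):

	/* sketch only: OSVW-based "is this CPU affected by erratum N?" test */
	static bool osvw_affected(const struct cpuinfo_x86 *c, unsigned int osvw_id)
	{
		u64 len, status;

		if (!cpu_has(c, X86_FEATURE_OSVW))
			return true;		/* no OSVW support: assume affected */

		rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, len);
		if (osvw_id >= len)
			return true;		/* id not described: assume affected */

		rdmsrl(MSR_AMD64_OSVW_STATUS, status);
		return status & BIT(osvw_id);	/* erratum 400 uses bit 1 */
	}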
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af47..96586c3cbbbf 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,6 +57,8 @@
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/debugreg.h> 58#include <asm/debugreg.h>
59 59
60#include <trace/events/power.h>
61
60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
61 63
62/* 64/*
@@ -111,6 +113,8 @@ void cpu_idle(void)
111 stop_critical_timings(); 113 stop_critical_timings();
112 pm_idle(); 114 pm_idle();
113 start_critical_timings(); 115 start_critical_timings();
116
117 trace_power_end(smp_processor_id());
114 } 118 }
115 tick_nohz_restart_sched_tick(); 119 tick_nohz_restart_sched_tick();
116 preempt_enable_no_resched(); 120 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3c2422a99f1f..3d9ea531ddd1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,6 +51,8 @@
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/debugreg.h> 52#include <asm/debugreg.h>
53 53
54#include <trace/events/power.h>
55
54asmlinkage extern void ret_from_fork(void); 56asmlinkage extern void ret_from_fork(void);
55 57
56DEFINE_PER_CPU(unsigned long, old_rsp); 58DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -138,6 +140,9 @@ void cpu_idle(void)
138 stop_critical_timings(); 140 stop_critical_timings();
139 pm_idle(); 141 pm_idle();
140 start_critical_timings(); 142 start_critical_timings();
143
144 trace_power_end(smp_processor_id());
145
141 /* In many cases the interrupt that ended idle 146 /* In many cases the interrupt that ended idle
142 has already called exit_idle. But some idle 147 has already called exit_idle. But some idle
143 loops can be woken up without interrupt. */ 148 loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4ae4acbd031..c3a4fbb2b996 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -102,6 +102,7 @@
102 102
103#include <asm/paravirt.h> 103#include <asm/paravirt.h>
104#include <asm/hypervisor.h> 104#include <asm/hypervisor.h>
105#include <asm/olpc_ofw.h>
105 106
106#include <asm/percpu.h> 107#include <asm/percpu.h>
107#include <asm/topology.h> 108#include <asm/topology.h>
@@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p)
736 /* VMI may relocate the fixmap; do this before touching ioremap area */ 737 /* VMI may relocate the fixmap; do this before touching ioremap area */
737 vmi_init(); 738 vmi_init();
738 739
740 /* OFW also may relocate the fixmap */
741 olpc_ofw_detect();
742
739 early_trap_init(); 743 early_trap_init();
740 early_cpu_init(); 744 early_cpu_init();
741 early_ioremap_init(); 745 early_ioremap_init();
742 746
747 setup_olpc_ofw_pgd();
748
743 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 749 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
744 screen_info = boot_params.screen_info; 750 screen_info = boot_params.screen_info;
745 edid_info = boot_params.edid_info; 751 edid_info = boot_params.edid_info;
@@ -1008,6 +1014,8 @@ void __init setup_arch(char **cmdline_p)
1008 paging_init(); 1014 paging_init();
1009 x86_init.paging.pagetable_setup_done(swapper_pg_dir); 1015 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
1010 1016
1017 setup_trampoline_page_table();
1018
1011 tboot_probe(); 1019 tboot_probe();
1012 1020
1013#ifdef CONFIG_X86_64 1021#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c4f33b2e77d6..8b3bfc4dd708 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -73,7 +73,6 @@
73 73
74#ifdef CONFIG_X86_32 74#ifdef CONFIG_X86_32
75u8 apicid_2_node[MAX_APICID]; 75u8 apicid_2_node[MAX_APICID];
76static int low_mappings;
77#endif 76#endif
78 77
79/* State of each CPU */ 78/* State of each CPU */
@@ -91,6 +90,25 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
91static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); 90static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
92#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) 91#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
93#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) 92#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
93
94/*
95 * We need this for trampoline_base protection from concurrent accesses when
96 * off- and onlining cores wildly.
97 */
98static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
99
100void cpu_hotplug_driver_lock()
101{
102 mutex_lock(&x86_cpu_hotplug_driver_mutex);
103}
104
105void cpu_hotplug_driver_unlock()
106{
107 mutex_unlock(&x86_cpu_hotplug_driver_mutex);
108}
109
110ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
111ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
94#else 112#else
95static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; 113static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
96#define get_idle_for_cpu(x) (idle_thread_array[(x)]) 114#define get_idle_for_cpu(x) (idle_thread_array[(x)])
@@ -281,6 +299,18 @@ notrace static void __cpuinit start_secondary(void *unused)
281 * fragile that we want to limit the things done here to the 299 * fragile that we want to limit the things done here to the
282 * most necessary things. 300 * most necessary things.
283 */ 301 */
302
303#ifdef CONFIG_X86_32
304 /*
305 * Switch away from the trampoline page-table
306 *
307 * Do this before cpu_init() because it needs to access per-cpu
308 * data which may not be mapped in the trampoline page-table.
309 */
310 load_cr3(swapper_pg_dir);
311 __flush_tlb_all();
312#endif
313
284 vmi_bringup(); 314 vmi_bringup();
285 cpu_init(); 315 cpu_init();
286 preempt_disable(); 316 preempt_disable();
@@ -299,12 +329,6 @@ notrace static void __cpuinit start_secondary(void *unused)
299 legacy_pic->chip->unmask(0); 329 legacy_pic->chip->unmask(0);
300 } 330 }
301 331
302#ifdef CONFIG_X86_32
303 while (low_mappings)
304 cpu_relax();
305 __flush_tlb_all();
306#endif
307
308 /* This must be done before setting cpu_online_mask */ 332 /* This must be done before setting cpu_online_mask */
309 set_cpu_sibling_map(raw_smp_processor_id()); 333 set_cpu_sibling_map(raw_smp_processor_id());
310 wmb(); 334 wmb();
@@ -735,12 +759,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
735 goto do_rest; 759 goto do_rest;
736 } 760 }
737 761
738 if (!keventd_up() || current_is_keventd()) 762 schedule_work(&c_idle.work);
739 c_idle.work.func(&c_idle.work); 763 wait_for_completion(&c_idle.done);
740 else {
741 schedule_work(&c_idle.work);
742 wait_for_completion(&c_idle.done);
743 }
744 764
745 if (IS_ERR(c_idle.idle)) { 765 if (IS_ERR(c_idle.idle)) {
746 printk("failed fork for CPU %d\n", cpu); 766 printk("failed fork for CPU %d\n", cpu);
@@ -754,6 +774,7 @@ do_rest:
754#ifdef CONFIG_X86_32 774#ifdef CONFIG_X86_32
755 /* Stack for startup_32 can be just as for start_secondary onwards */ 775 /* Stack for startup_32 can be just as for start_secondary onwards */
756 irq_ctx_init(cpu); 776 irq_ctx_init(cpu);
777 initial_page_table = __pa(&trampoline_pg_dir);
757#else 778#else
758 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 779 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
759 initial_gs = per_cpu_offset(cpu); 780 initial_gs = per_cpu_offset(cpu);
@@ -816,6 +837,13 @@ do_rest:
816 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 837 if (cpumask_test_cpu(cpu, cpu_callin_mask))
817 break; /* It has booted */ 838 break; /* It has booted */
818 udelay(100); 839 udelay(100);
840 /*
841 * Allow other tasks to run while we wait for the
842 * AP to come online. This also gives a chance
 843			 * for the MTRR work (triggered by the AP coming online)
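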
844 * to be completed in the stop machine context.
845 */
846 schedule();
819 } 847 }
820 848
821 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 849 if (cpumask_test_cpu(cpu, cpu_callin_mask))
@@ -894,20 +922,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
894 922
895 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 923 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
896 924
897#ifdef CONFIG_X86_32
898 /* init low mem mapping */
899 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
900 min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
901 flush_tlb_all();
902 low_mappings = 1;
903
904 err = do_boot_cpu(apicid, cpu); 925 err = do_boot_cpu(apicid, cpu);
905 926
906 zap_low_mappings(false);
907 low_mappings = 0;
908#else
909 err = do_boot_cpu(apicid, cpu);
910#endif
911 if (err) { 927 if (err) {
912 pr_debug("do_boot_cpu failed %d\n", err); 928 pr_debug("do_boot_cpu failed %d\n", err);
913 return -EIO; 929 return -EIO;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 922eefbb3f6c..b53c525368a7 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name)
23 return 0; 23 return 0;
24} 24}
25 25
26static void save_stack_address(void *data, unsigned long addr, int reliable) 26static void
27__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
27{ 28{
28 struct stack_trace *trace = data; 29 struct stack_trace *trace = data;
30#ifdef CONFIG_FRAME_POINTER
29 if (!reliable) 31 if (!reliable)
30 return; 32 return;
33#endif
34 if (nosched && in_sched_functions(addr))
35 return;
31 if (trace->skip > 0) { 36 if (trace->skip > 0) {
32 trace->skip--; 37 trace->skip--;
33 return; 38 return;
@@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
36 trace->entries[trace->nr_entries++] = addr; 41 trace->entries[trace->nr_entries++] = addr;
37} 42}
38 43
44static void save_stack_address(void *data, unsigned long addr, int reliable)
45{
46 return __save_stack_address(data, addr, reliable, false);
47}
48
39static void 49static void
40save_stack_address_nosched(void *data, unsigned long addr, int reliable) 50save_stack_address_nosched(void *data, unsigned long addr, int reliable)
41{ 51{
42 struct stack_trace *trace = (struct stack_trace *)data; 52 return __save_stack_address(data, addr, reliable, true);
43 if (!reliable)
44 return;
45 if (in_sched_functions(addr))
46 return;
47 if (trace->skip > 0) {
48 trace->skip--;
49 return;
50 }
51 if (trace->nr_entries < trace->max_entries)
52 trace->entries[trace->nr_entries++] = addr;
53} 53}
54 54
55static const struct stacktrace_ops save_stack_ops = { 55static const struct stacktrace_ops save_stack_ops = {
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
96 96
97/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ 97/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
98 98
99struct stack_frame { 99struct stack_frame_user {
100 const void __user *next_fp; 100 const void __user *next_fp;
101 unsigned long ret_addr; 101 unsigned long ret_addr;
102}; 102};
103 103
104static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) 104static int
105copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
105{ 106{
106 int ret; 107 int ret;
107 108
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace)
126 trace->entries[trace->nr_entries++] = regs->ip; 127 trace->entries[trace->nr_entries++] = regs->ip;
127 128
128 while (trace->nr_entries < trace->max_entries) { 129 while (trace->nr_entries < trace->max_entries) {
129 struct stack_frame frame; 130 struct stack_frame_user frame;
130 131
131 frame.next_fp = NULL; 132 frame.next_fp = NULL;
132 frame.ret_addr = 0; 133 frame.ret_addr = 0;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 196552bb412c..d5e06624e34a 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -28,7 +28,9 @@
28 * Do a system call from kernel instead of calling sys_execve so we 28 * Do a system call from kernel instead of calling sys_execve so we
29 * end up with proper pt_regs. 29 * end up with proper pt_regs.
30 */ 30 */
31int kernel_execve(const char *filename, char *const argv[], char *const envp[]) 31int kernel_execve(const char *filename,
32 const char *const argv[],
33 const char *const envp[])
32{ 34{
33 long __res; 35 long __res;
34 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" 36 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b3729341216..b35786dc9b8f 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,6 @@ ENTRY(sys_call_table)
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg 339 .long sys_recvmmsg
340 .long sys_fanotify_init
341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */
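The three new entries land at syscall numbers 338 (fanotify_init), 339 (fanotify_mark) and 340 (prlimit64) on 32-bit x86. A hedged userspace sketch calling prlimit64 through the raw syscall number, for use before a libc wrapper exists (the local struct definition and the constant 7 for RLIMIT_NOFILE are spelled out only for illustration):

	#include <stdio.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	struct rlimit64 { uint64_t rlim_cur, rlim_max; };

	int main(void)
	{
		struct rlimit64 old;

		/* prlimit64(pid = 0 (self), resource = 7 (RLIMIT_NOFILE),
		 *           new_rlim = NULL, old_rlim = &old) */
		if (syscall(340, 0, 7, NULL, &old) == 0)
			printf("nofile: cur=%llu max=%llu\n",
			       (unsigned long long)old.rlim_cur,
			       (unsigned long long)old.rlim_max);
		return 0;
	}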
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 7fea555929e2..312ef0292815 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/debugfs.h>
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13 14
@@ -22,19 +23,37 @@
22#include <asm/irq_vectors.h> 23#include <asm/irq_vectors.h>
23#include <asm/timer.h> 24#include <asm/timer.h>
24 25
25struct msg_desc { 26/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
26 struct bau_payload_queue_entry *msg; 27static int timeout_base_ns[] = {
27 int msg_slot; 28 20,
28 int sw_ack_slot; 29 160,
29 struct bau_payload_queue_entry *va_queue_first; 30 1280,
30 struct bau_payload_queue_entry *va_queue_last; 31 10240,
32 81920,
33 655360,
34 5242880,
35 167772160
31}; 36};
32 37static int timeout_us;
33#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
34
35static int uv_bau_max_concurrent __read_mostly;
36
37static int nobau; 38static int nobau;
39static int baudisabled;
40static spinlock_t disable_lock;
41static cycles_t congested_cycles;
42
43/* tunables: */
44static int max_bau_concurrent = MAX_BAU_CONCURRENT;
45static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
46static int plugged_delay = PLUGGED_DELAY;
47static int plugsb4reset = PLUGSB4RESET;
48static int timeoutsb4reset = TIMEOUTSB4RESET;
49static int ipi_reset_limit = IPI_RESET_LIMIT;
50static int complete_threshold = COMPLETE_THRESHOLD;
51static int congested_response_us = CONGESTED_RESPONSE_US;
52static int congested_reps = CONGESTED_REPS;
53static int congested_period = CONGESTED_PERIOD;
54static struct dentry *tunables_dir;
55static struct dentry *tunables_file;
56
38static int __init setup_nobau(char *arg) 57static int __init setup_nobau(char *arg)
39{ 58{
40 nobau = 1; 59 nobau = 1;
@@ -52,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
52static DEFINE_PER_CPU(struct bau_control, bau_control); 71static DEFINE_PER_CPU(struct bau_control, bau_control);
53static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); 72static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
54 73
55struct reset_args {
56 int sender;
57};
58
59/* 74/*
60 * Determine the first node on a uvhub. 'Nodes' are used for kernel 75 * Determine the first node on a uvhub. 'Nodes' are used for kernel
61 * memory allocation. 76 * memory allocation.
@@ -126,7 +141,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
126 struct ptc_stats *stat; 141 struct ptc_stats *stat;
127 142
128 msg = mdp->msg; 143 msg = mdp->msg;
129 stat = &per_cpu(ptcstats, bcp->cpu); 144 stat = bcp->statp;
130 stat->d_retries++; 145 stat->d_retries++;
131 /* 146 /*
132 * cancel any message from msg+1 to the retry itself 147 * cancel any message from msg+1 to the retry itself
@@ -146,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
146 slot2 = msg2 - mdp->va_queue_first; 161 slot2 = msg2 - mdp->va_queue_first;
147 mmr = uv_read_local_mmr 162 mmr = uv_read_local_mmr
148 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 163 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
149 msg_res = ((msg2->sw_ack_vector << 8) | 164 msg_res = msg2->sw_ack_vector;
150 msg2->sw_ack_vector);
151 /* 165 /*
152 * This is a message retry; clear the resources held 166 * This is a message retry; clear the resources held
153 * by the previous message only if they timed out. 167 * by the previous message only if they timed out.
154 * If it has not timed out we have an unexpected 168 * If it has not timed out we have an unexpected
155 * situation to report. 169 * situation to report.
156 */ 170 */
157 if (mmr & (msg_res << 8)) { 171 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
158 /* 172 /*
159 * is the resource timed out? 173 * is the resource timed out?
160 * make everyone ignore the cancelled message. 174 * make everyone ignore the cancelled message.
@@ -164,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
164 cancel_count++; 178 cancel_count++;
165 uv_write_local_mmr( 179 uv_write_local_mmr(
166 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, 180 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
167 (msg_res << 8) | msg_res); 181 (msg_res << UV_SW_ACK_NPENDING) |
168 } else 182 msg_res);
169 printk(KERN_INFO "note bau retry: no effect\n"); 183 }
170 } 184 }
171 } 185 }
172 if (!cancel_count) 186 if (!cancel_count)
@@ -190,7 +204,7 @@ static void uv_bau_process_message(struct msg_desc *mdp,
190 * This must be a normal message, or retry of a normal message 204 * This must be a normal message, or retry of a normal message
191 */ 205 */
192 msg = mdp->msg; 206 msg = mdp->msg;
193 stat = &per_cpu(ptcstats, bcp->cpu); 207 stat = bcp->statp;
194 if (msg->address == TLB_FLUSH_ALL) { 208 if (msg->address == TLB_FLUSH_ALL) {
195 local_flush_tlb(); 209 local_flush_tlb();
196 stat->d_alltlb++; 210 stat->d_alltlb++;
@@ -274,7 +288,7 @@ uv_do_reset(void *ptr)
274 288
275 bcp = &per_cpu(bau_control, smp_processor_id()); 289 bcp = &per_cpu(bau_control, smp_processor_id());
276 rap = (struct reset_args *)ptr; 290 rap = (struct reset_args *)ptr;
277 stat = &per_cpu(ptcstats, bcp->cpu); 291 stat = bcp->statp;
278 stat->d_resets++; 292 stat->d_resets++;
279 293
280 /* 294 /*
@@ -302,13 +316,13 @@ uv_do_reset(void *ptr)
302 */ 316 */
303 mmr = uv_read_local_mmr 317 mmr = uv_read_local_mmr
304 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 318 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
305 msg_res = ((msg->sw_ack_vector << 8) | 319 msg_res = msg->sw_ack_vector;
306 msg->sw_ack_vector);
307 if (mmr & msg_res) { 320 if (mmr & msg_res) {
308 stat->d_rcanceled++; 321 stat->d_rcanceled++;
309 uv_write_local_mmr( 322 uv_write_local_mmr(
310 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, 323 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
311 msg_res); 324 (msg_res << UV_SW_ACK_NPENDING) |
325 msg_res);
312 } 326 }
313 } 327 }
314 } 328 }
@@ -386,17 +400,12 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
386 unsigned long mmr_offset, int right_shift, int this_cpu, 400 unsigned long mmr_offset, int right_shift, int this_cpu,
387 struct bau_control *bcp, struct bau_control *smaster, long try) 401 struct bau_control *bcp, struct bau_control *smaster, long try)
388{ 402{
389 int relaxes = 0;
390 unsigned long descriptor_status; 403 unsigned long descriptor_status;
391 unsigned long mmr;
392 unsigned long mask;
393 cycles_t ttime; 404 cycles_t ttime;
394 cycles_t timeout_time; 405 struct ptc_stats *stat = bcp->statp;
395 struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
396 struct bau_control *hmaster; 406 struct bau_control *hmaster;
397 407
398 hmaster = bcp->uvhub_master; 408 hmaster = bcp->uvhub_master;
399 timeout_time = get_cycles() + bcp->timeout_interval;
400 409
401 /* spin on the status MMR, waiting for it to go idle */ 410 /* spin on the status MMR, waiting for it to go idle */
402 while ((descriptor_status = (((unsigned long) 411 while ((descriptor_status = (((unsigned long)
@@ -423,7 +432,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
423 * pending. In that case hardware returns the 432 * pending. In that case hardware returns the
424 * ERROR that looks like a destination timeout. 433 * ERROR that looks like a destination timeout.
425 */ 434 */
426 if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { 435 if (cycles_2_us(ttime - bcp->send_message) <
436 timeout_us) {
427 bcp->conseccompletes = 0; 437 bcp->conseccompletes = 0;
428 return FLUSH_RETRY_PLUGGED; 438 return FLUSH_RETRY_PLUGGED;
429 } 439 }
@@ -435,26 +445,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
435 * descriptor_status is still BUSY 445 * descriptor_status is still BUSY
436 */ 446 */
437 cpu_relax(); 447 cpu_relax();
438 relaxes++;
439 if (relaxes >= 10000) {
440 relaxes = 0;
441 if (get_cycles() > timeout_time) {
442 quiesce_local_uvhub(hmaster);
443
444 /* single-thread the register change */
445 spin_lock(&hmaster->masks_lock);
446 mmr = uv_read_local_mmr(mmr_offset);
447 mask = 0UL;
448 mask |= (3UL < right_shift);
449 mask = ~mask;
450 mmr &= mask;
451 uv_write_local_mmr(mmr_offset, mmr);
452 spin_unlock(&hmaster->masks_lock);
453 end_uvhub_quiesce(hmaster);
454 stat->s_busy++;
455 return FLUSH_GIVEUP;
456 }
457 }
458 } 448 }
459 } 449 }
460 bcp->conseccompletes++; 450 bcp->conseccompletes++;
@@ -494,56 +484,116 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
494 return 1; 484 return 1;
495} 485}
496 486
487/*
488 * Our retries are blocked by all destination swack resources being
489 * in use, and a timeout is pending. In that case hardware immediately
490 * returns the ERROR that looks like a destination timeout.
491 */
492static void
493destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
494 struct bau_control *hmaster, struct ptc_stats *stat)
495{
496 udelay(bcp->plugged_delay);
497 bcp->plugged_tries++;
498 if (bcp->plugged_tries >= bcp->plugsb4reset) {
499 bcp->plugged_tries = 0;
500 quiesce_local_uvhub(hmaster);
501 spin_lock(&hmaster->queue_lock);
502 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
503 spin_unlock(&hmaster->queue_lock);
504 end_uvhub_quiesce(hmaster);
505 bcp->ipi_attempts++;
506 stat->s_resets_plug++;
507 }
508}
509
510static void
511destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
512 struct bau_control *hmaster, struct ptc_stats *stat)
513{
514 hmaster->max_bau_concurrent = 1;
515 bcp->timeout_tries++;
516 if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
517 bcp->timeout_tries = 0;
518 quiesce_local_uvhub(hmaster);
519 spin_lock(&hmaster->queue_lock);
520 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
521 spin_unlock(&hmaster->queue_lock);
522 end_uvhub_quiesce(hmaster);
523 bcp->ipi_attempts++;
524 stat->s_resets_timeout++;
525 }
526}
527
528/*
529 * Completions are taking a very long time due to a congested numalink
530 * network.
531 */
532static void
533disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
534{
535 int tcpu;
536 struct bau_control *tbcp;
537
538 /* let only one cpu do this disabling */
539 spin_lock(&disable_lock);
540 if (!baudisabled && bcp->period_requests &&
541 ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
542 /* it becomes this cpu's job to turn on the use of the
543 BAU again */
544 baudisabled = 1;
545 bcp->set_bau_off = 1;
546 bcp->set_bau_on_time = get_cycles() +
547 sec_2_cycles(bcp->congested_period);
548 stat->s_bau_disabled++;
549 for_each_present_cpu(tcpu) {
550 tbcp = &per_cpu(bau_control, tcpu);
551 tbcp->baudisabled = 1;
552 }
553 }
554 spin_unlock(&disable_lock);
555}
556
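The condition guarded by disable_lock above can be read as a simple average-latency predicate; a minimal sketch (congested_cycles is congested_response_us converted to cycles, and the helper name here is illustrative):

	/* sketch: sends are considered congested when the mean send time over
	 * the current measurement period exceeds the response-time threshold */
	static bool bau_period_congested(struct bau_control *bcp,
					 cycles_t congested_cycles)
	{
		if (!bcp->period_requests)
			return false;
		return (bcp->period_time / bcp->period_requests) > congested_cycles;
	}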
497/** 557/**
498 * uv_flush_send_and_wait 558 * uv_flush_send_and_wait
499 * 559 *
500 * Send a broadcast and wait for it to complete. 560 * Send a broadcast and wait for it to complete.
501 * 561 *
502 * The flush_mask contains the cpus the broadcast is to be sent to, plus 562 * The flush_mask contains the cpus the broadcast is to be sent to including
503 * cpus that are on the local uvhub. 563 * cpus that are on the local uvhub.
504 * 564 *
505 * Returns NULL if all flushing represented in the mask was done. The mask 565 * Returns 0 if all flushing represented in the mask was done.
506 * is zeroed. 566 * Returns 1 if it gives up entirely and the original cpu mask is to be
507 * Returns @flush_mask if some remote flushing remains to be done. The 567 * returned to the kernel.
508 * mask will have some bits still set, representing any cpus on the local
509 * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
510 */ 568 */
511const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, 569int uv_flush_send_and_wait(struct bau_desc *bau_desc,
512 struct cpumask *flush_mask, 570 struct cpumask *flush_mask, struct bau_control *bcp)
513 struct bau_control *bcp)
514{ 571{
515 int right_shift; 572 int right_shift;
516 int uvhub;
517 int bit;
518 int completion_status = 0; 573 int completion_status = 0;
519 int seq_number = 0; 574 int seq_number = 0;
520 long try = 0; 575 long try = 0;
521 int cpu = bcp->uvhub_cpu; 576 int cpu = bcp->uvhub_cpu;
522 int this_cpu = bcp->cpu; 577 int this_cpu = bcp->cpu;
523 int this_uvhub = bcp->uvhub;
524 unsigned long mmr_offset; 578 unsigned long mmr_offset;
525 unsigned long index; 579 unsigned long index;
526 cycles_t time1; 580 cycles_t time1;
527 cycles_t time2; 581 cycles_t time2;
528 struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); 582 cycles_t elapsed;
583 struct ptc_stats *stat = bcp->statp;
529 struct bau_control *smaster = bcp->socket_master; 584 struct bau_control *smaster = bcp->socket_master;
530 struct bau_control *hmaster = bcp->uvhub_master; 585 struct bau_control *hmaster = bcp->uvhub_master;
531 586
532 /*
533 * Spin here while there are hmaster->max_concurrent or more active
534 * descriptors. This is the per-uvhub 'throttle'.
535 */
536 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, 587 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
537 &hmaster->active_descriptor_count, 588 &hmaster->active_descriptor_count,
538 hmaster->max_concurrent)) { 589 hmaster->max_bau_concurrent)) {
539 stat->s_throttles++; 590 stat->s_throttles++;
540 do { 591 do {
541 cpu_relax(); 592 cpu_relax();
542 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, 593 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
543 &hmaster->active_descriptor_count, 594 &hmaster->active_descriptor_count,
544 hmaster->max_concurrent)); 595 hmaster->max_bau_concurrent));
545 } 596 }
546
547 while (hmaster->uvhub_quiesce) 597 while (hmaster->uvhub_quiesce)
548 cpu_relax(); 598 cpu_relax();
549 599
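The per-uvhub throttle above relies on atomic_inc_unless_ge(); its intended semantics, as a hedged sketch following the signature shown earlier in this file (the real helper takes the hub spinlock so the test and increment form one atomic step):

	/* sketch: increment *v only while it is still below the limit u */
	static int inc_unless_ge_sketch(spinlock_t *lock, atomic_t *v, int u)
	{
		int ok = 0;

		spin_lock(lock);
		if (atomic_read(v) < u) {
			atomic_inc(v);
			ok = 1;
		}
		spin_unlock(lock);
		return ok;
	}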
@@ -557,23 +607,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
557 } 607 }
558 time1 = get_cycles(); 608 time1 = get_cycles();
559 do { 609 do {
560 /*
561 * Every message from any given cpu gets a unique message
562 * sequence number. But retries use that same number.
563 * Our message may have timed out at the destination because
564 * all sw-ack resources are in use and there is a timeout
565 * pending there. In that case, our last send never got
566 * placed into the queue and we need to persist until it
567 * does.
568 *
569 * Make any retry a type MSG_RETRY so that the destination will
570 * free any resource held by a previous message from this cpu.
571 */
572 if (try == 0) { 610 if (try == 0) {
573 /* use message type set by the caller the first time */ 611 bau_desc->header.msg_type = MSG_REGULAR;
574 seq_number = bcp->message_number++; 612 seq_number = bcp->message_number++;
575 } else { 613 } else {
576 /* use RETRY type on all the rest; same sequence */
577 bau_desc->header.msg_type = MSG_RETRY; 614 bau_desc->header.msg_type = MSG_RETRY;
578 stat->s_retry_messages++; 615 stat->s_retry_messages++;
579 } 616 }
@@ -581,50 +618,17 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
581 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | 618 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
582 bcp->uvhub_cpu; 619 bcp->uvhub_cpu;
583 bcp->send_message = get_cycles(); 620 bcp->send_message = get_cycles();
584
585 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 621 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
586
587 try++; 622 try++;
588 completion_status = uv_wait_completion(bau_desc, mmr_offset, 623 completion_status = uv_wait_completion(bau_desc, mmr_offset,
589 right_shift, this_cpu, bcp, smaster, try); 624 right_shift, this_cpu, bcp, smaster, try);
590 625
591 if (completion_status == FLUSH_RETRY_PLUGGED) { 626 if (completion_status == FLUSH_RETRY_PLUGGED) {
592 /* 627 destination_plugged(bau_desc, bcp, hmaster, stat);
593 * Our retries may be blocked by all destination swack
594 * resources being consumed, and a timeout pending. In
595 * that case hardware immediately returns the ERROR
596 * that looks like a destination timeout.
597 */
598 udelay(TIMEOUT_DELAY);
599 bcp->plugged_tries++;
600 if (bcp->plugged_tries >= PLUGSB4RESET) {
601 bcp->plugged_tries = 0;
602 quiesce_local_uvhub(hmaster);
603 spin_lock(&hmaster->queue_lock);
604 uv_reset_with_ipi(&bau_desc->distribution,
605 this_cpu);
606 spin_unlock(&hmaster->queue_lock);
607 end_uvhub_quiesce(hmaster);
608 bcp->ipi_attempts++;
609 stat->s_resets_plug++;
610 }
611 } else if (completion_status == FLUSH_RETRY_TIMEOUT) { 628 } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
612 hmaster->max_concurrent = 1; 629 destination_timeout(bau_desc, bcp, hmaster, stat);
613 bcp->timeout_tries++;
614 udelay(TIMEOUT_DELAY);
615 if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
616 bcp->timeout_tries = 0;
617 quiesce_local_uvhub(hmaster);
618 spin_lock(&hmaster->queue_lock);
619 uv_reset_with_ipi(&bau_desc->distribution,
620 this_cpu);
621 spin_unlock(&hmaster->queue_lock);
622 end_uvhub_quiesce(hmaster);
623 bcp->ipi_attempts++;
624 stat->s_resets_timeout++;
625 }
626 } 630 }
627 if (bcp->ipi_attempts >= 3) { 631 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
628 bcp->ipi_attempts = 0; 632 bcp->ipi_attempts = 0;
629 completion_status = FLUSH_GIVEUP; 633 completion_status = FLUSH_GIVEUP;
630 break; 634 break;
@@ -633,49 +637,36 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
633 } while ((completion_status == FLUSH_RETRY_PLUGGED) || 637 } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
634 (completion_status == FLUSH_RETRY_TIMEOUT)); 638 (completion_status == FLUSH_RETRY_TIMEOUT));
635 time2 = get_cycles(); 639 time2 = get_cycles();
636 640 bcp->plugged_tries = 0;
637 if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) 641 bcp->timeout_tries = 0;
638 && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) 642 if ((completion_status == FLUSH_COMPLETE) &&
639 hmaster->max_concurrent++; 643 (bcp->conseccompletes > bcp->complete_threshold) &&
640 644 (hmaster->max_bau_concurrent <
641 /* 645 hmaster->max_bau_concurrent_constant))
642 * hold any cpu not timing out here; no other cpu currently held by 646 hmaster->max_bau_concurrent++;
643 * the 'throttle' should enter the activation code
644 */
645 while (hmaster->uvhub_quiesce) 647 while (hmaster->uvhub_quiesce)
646 cpu_relax(); 648 cpu_relax();
647 atomic_dec(&hmaster->active_descriptor_count); 649 atomic_dec(&hmaster->active_descriptor_count);
648 650 if (time2 > time1) {
649 /* guard against cycles wrap */ 651 elapsed = time2 - time1;
650 if (time2 > time1) 652 stat->s_time += elapsed;
651 stat->s_time += (time2 - time1); 653 if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
652 else 654 bcp->period_requests++;
653 stat->s_requestor--; /* don't count this one */ 655 bcp->period_time += elapsed;
656 if ((elapsed > congested_cycles) &&
657 (bcp->period_requests > bcp->congested_reps)) {
658 disable_for_congestion(bcp, stat);
659 }
660 }
661 } else
662 stat->s_requestor--;
654 if (completion_status == FLUSH_COMPLETE && try > 1) 663 if (completion_status == FLUSH_COMPLETE && try > 1)
655 stat->s_retriesok++; 664 stat->s_retriesok++;
656 else if (completion_status == FLUSH_GIVEUP) { 665 else if (completion_status == FLUSH_GIVEUP) {
657 /*
658 * Cause the caller to do an IPI-style TLB shootdown on
659 * the target cpu's, all of which are still in the mask.
660 */
661 stat->s_giveup++; 666 stat->s_giveup++;
662 return flush_mask; 667 return 1;
663 }
664
665 /*
666 * Success, so clear the remote cpu's from the mask so we don't
667 * use the IPI method of shootdown on them.
668 */
669 for_each_cpu(bit, flush_mask) {
670 uvhub = uv_cpu_to_blade_id(bit);
671 if (uvhub == this_uvhub)
672 continue;
673 cpumask_clear_cpu(bit, flush_mask);
674 } 668 }
675 if (!cpumask_empty(flush_mask)) 669 return 0;
676 return flush_mask;
677
678 return NULL;
679} 670}
680 671
681/** 672/**
@@ -707,70 +698,89 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
707 struct mm_struct *mm, 698 struct mm_struct *mm,
708 unsigned long va, unsigned int cpu) 699 unsigned long va, unsigned int cpu)
709{ 700{
710 int remotes;
711 int tcpu; 701 int tcpu;
712 int uvhub; 702 int uvhub;
713 int locals = 0; 703 int locals = 0;
704 int remotes = 0;
705 int hubs = 0;
714 struct bau_desc *bau_desc; 706 struct bau_desc *bau_desc;
715 struct cpumask *flush_mask; 707 struct cpumask *flush_mask;
716 struct ptc_stats *stat; 708 struct ptc_stats *stat;
717 struct bau_control *bcp; 709 struct bau_control *bcp;
710 struct bau_control *tbcp;
718 711
712 /* kernel was booted 'nobau' */
719 if (nobau) 713 if (nobau)
720 return cpumask; 714 return cpumask;
721 715
722 bcp = &per_cpu(bau_control, cpu); 716 bcp = &per_cpu(bau_control, cpu);
717 stat = bcp->statp;
718
719 /* bau was disabled due to slow response */
720 if (bcp->baudisabled) {
721 /* the cpu that disabled it must re-enable it */
722 if (bcp->set_bau_off) {
723 if (get_cycles() >= bcp->set_bau_on_time) {
724 stat->s_bau_reenabled++;
725 baudisabled = 0;
726 for_each_present_cpu(tcpu) {
727 tbcp = &per_cpu(bau_control, tcpu);
728 tbcp->baudisabled = 0;
729 tbcp->period_requests = 0;
730 tbcp->period_time = 0;
731 }
732 }
733 }
734 return cpumask;
735 }
736
723 /* 737 /*
724 * Each sending cpu has a per-cpu mask which it fills from the caller's 738 * Each sending cpu has a per-cpu mask which it fills from the caller's
725 * cpu mask. Only remote cpus are converted to uvhubs and copied. 739 * cpu mask. All cpus are converted to uvhubs and copied to the
740 * activation descriptor.
726 */ 741 */
727 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); 742 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
728 /* 743 /* don't actually do a shootdown of the local cpu */
729 * copy cpumask to flush_mask, removing current cpu
730 * (current cpu should already have been flushed by the caller and
731 * should never be returned if we return flush_mask)
732 */
733 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 744 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
734 if (cpu_isset(cpu, *cpumask)) 745 if (cpu_isset(cpu, *cpumask))
735 locals++; /* current cpu was targeted */ 746 stat->s_ntargself++;
736 747
737 bau_desc = bcp->descriptor_base; 748 bau_desc = bcp->descriptor_base;
738 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; 749 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
739
740 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 750 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
741 remotes = 0; 751
752 /* cpu statistics */
742 for_each_cpu(tcpu, flush_mask) { 753 for_each_cpu(tcpu, flush_mask) {
743 uvhub = uv_cpu_to_blade_id(tcpu); 754 uvhub = uv_cpu_to_blade_id(tcpu);
744 if (uvhub == bcp->uvhub) {
745 locals++;
746 continue;
747 }
748 bau_uvhub_set(uvhub, &bau_desc->distribution); 755 bau_uvhub_set(uvhub, &bau_desc->distribution);
749 remotes++; 756 if (uvhub == bcp->uvhub)
750 } 757 locals++;
751 if (remotes == 0) {
752 /*
753 * No off_hub flushing; return status for local hub.
754 * Return the caller's mask if all were local (the current
755 * cpu may be in that mask).
756 */
757 if (locals)
758 return cpumask;
759 else 758 else
760 return NULL; 759 remotes++;
761 } 760 }
762 stat = &per_cpu(ptcstats, cpu); 761 if ((locals + remotes) == 0)
762 return NULL;
763 stat->s_requestor++; 763 stat->s_requestor++;
764 stat->s_ntargcpu += remotes; 764 stat->s_ntargcpu += remotes + locals;
765 stat->s_ntargremotes += remotes;
766 stat->s_ntarglocals += locals;
765 remotes = bau_uvhub_weight(&bau_desc->distribution); 767 remotes = bau_uvhub_weight(&bau_desc->distribution);
766 stat->s_ntarguvhub += remotes; 768
767 if (remotes >= 16) 769 /* uvhub statistics */
770 hubs = bau_uvhub_weight(&bau_desc->distribution);
771 if (locals) {
772 stat->s_ntarglocaluvhub++;
773 stat->s_ntargremoteuvhub += (hubs - 1);
774 } else
775 stat->s_ntargremoteuvhub += hubs;
776 stat->s_ntarguvhub += hubs;
777 if (hubs >= 16)
768 stat->s_ntarguvhub16++; 778 stat->s_ntarguvhub16++;
769 else if (remotes >= 8) 779 else if (hubs >= 8)
770 stat->s_ntarguvhub8++; 780 stat->s_ntarguvhub8++;
771 else if (remotes >= 4) 781 else if (hubs >= 4)
772 stat->s_ntarguvhub4++; 782 stat->s_ntarguvhub4++;
773 else if (remotes >= 2) 783 else if (hubs >= 2)
774 stat->s_ntarguvhub2++; 784 stat->s_ntarguvhub2++;
775 else 785 else
776 stat->s_ntarguvhub1++; 786 stat->s_ntarguvhub1++;
@@ -779,10 +789,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
779 bau_desc->payload.sending_cpu = cpu; 789 bau_desc->payload.sending_cpu = cpu;
780 790
781 /* 791 /*
782 * uv_flush_send_and_wait returns null if all cpu's were messaged, or 792 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
783 * the adjusted flush_mask if any cpu's were not messaged. 793 * or 1 if it gave up and the original cpumask should be returned.
784 */ 794 */
785 return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); 795 if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
796 return NULL;
797 else
798 return cpumask;
786} 799}
787 800
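For context on the NULL-vs-cpumask convention, a hedged caller-side sketch; the wrapper name is hypothetical and flush_tlb_others_ipi() stands in for the generic IPI fallback in the flush path, which is not part of this patch:

	static void flush_with_bau_or_ipi(const struct cpumask *cpumask,
					  struct mm_struct *mm,
					  unsigned long va, unsigned int cpu)
	{
		const struct cpumask *rest;

		rest = uv_flush_tlb_others(cpumask, mm, va, cpu);
		if (rest)	/* BAU disabled or gave up: fall back to IPIs */
			flush_tlb_others_ipi(rest, mm, va);
	}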
788/* 801/*
@@ -810,7 +823,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
810 823
811 time_start = get_cycles(); 824 time_start = get_cycles();
812 bcp = &per_cpu(bau_control, smp_processor_id()); 825 bcp = &per_cpu(bau_control, smp_processor_id());
813 stat = &per_cpu(ptcstats, smp_processor_id()); 826 stat = bcp->statp;
814 msgdesc.va_queue_first = bcp->va_queue_first; 827 msgdesc.va_queue_first = bcp->va_queue_first;
815 msgdesc.va_queue_last = bcp->va_queue_last; 828 msgdesc.va_queue_last = bcp->va_queue_last;
816 msg = bcp->bau_msg_head; 829 msg = bcp->bau_msg_head;
@@ -908,12 +921,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
908} 921}
909 922
910static inline unsigned long long 923static inline unsigned long long
911millisec_2_cycles(unsigned long millisec) 924microsec_2_cycles(unsigned long microsec)
912{ 925{
913 unsigned long ns; 926 unsigned long ns;
914 unsigned long long cyc; 927 unsigned long long cyc;
915 928
916 ns = millisec * 1000; 929 ns = microsec * 1000;
917 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); 930 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
918 return cyc; 931 return cyc;
919} 932}
@@ -931,15 +944,19 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
931 944
932 if (!cpu) { 945 if (!cpu) {
933 seq_printf(file, 946 seq_printf(file,
934 "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); 947 "# cpu sent stime self locals remotes ncpus localhub ");
948 seq_printf(file,
949 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
935 seq_printf(file, 950 seq_printf(file,
936 "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); 951 "numuvhubs4 numuvhubs2 numuvhubs1 dto ");
937 seq_printf(file, 952 seq_printf(file,
938 "retries rok resetp resett giveup sto bz throt "); 953 "retries rok resetp resett giveup sto bz throt ");
939 seq_printf(file, 954 seq_printf(file,
940 "sw_ack recv rtime all "); 955 "sw_ack recv rtime all ");
941 seq_printf(file, 956 seq_printf(file,
942 "one mult none retry canc nocan reset rcan\n"); 957 "one mult none retry canc nocan reset rcan ");
958 seq_printf(file,
959 "disable enable\n");
943 } 960 }
944 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 961 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
945 stat = &per_cpu(ptcstats, cpu); 962 stat = &per_cpu(ptcstats, cpu);
@@ -947,18 +964,23 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
947 seq_printf(file, 964 seq_printf(file,
948 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 965 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
949 cpu, stat->s_requestor, cycles_2_us(stat->s_time), 966 cpu, stat->s_requestor, cycles_2_us(stat->s_time),
950 stat->s_ntarguvhub, stat->s_ntarguvhub16, 967 stat->s_ntargself, stat->s_ntarglocals,
968 stat->s_ntargremotes, stat->s_ntargcpu,
969 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
970 stat->s_ntarguvhub, stat->s_ntarguvhub16);
971 seq_printf(file, "%ld %ld %ld %ld %ld ",
951 stat->s_ntarguvhub8, stat->s_ntarguvhub4, 972 stat->s_ntarguvhub8, stat->s_ntarguvhub4,
952 stat->s_ntarguvhub2, stat->s_ntarguvhub1, 973 stat->s_ntarguvhub2, stat->s_ntarguvhub1,
953 stat->s_ntargcpu, stat->s_dtimeout); 974 stat->s_dtimeout);
954 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", 975 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
955 stat->s_retry_messages, stat->s_retriesok, 976 stat->s_retry_messages, stat->s_retriesok,
956 stat->s_resets_plug, stat->s_resets_timeout, 977 stat->s_resets_plug, stat->s_resets_timeout,
957 stat->s_giveup, stat->s_stimeout, 978 stat->s_giveup, stat->s_stimeout,
958 stat->s_busy, stat->s_throttles); 979 stat->s_busy, stat->s_throttles);
980
959 /* destination side statistics */ 981 /* destination side statistics */
960 seq_printf(file, 982 seq_printf(file,
961 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", 983 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
962 uv_read_global_mmr64(uv_cpu_to_pnode(cpu), 984 uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
963 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), 985 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
964 stat->d_requestee, cycles_2_us(stat->d_time), 986 stat->d_requestee, cycles_2_us(stat->d_time),
@@ -966,15 +988,36 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
966 stat->d_nomsg, stat->d_retries, stat->d_canceled, 988 stat->d_nomsg, stat->d_retries, stat->d_canceled,
967 stat->d_nocanceled, stat->d_resets, 989 stat->d_nocanceled, stat->d_resets,
968 stat->d_rcanceled); 990 stat->d_rcanceled);
991 seq_printf(file, "%ld %ld\n",
992 stat->s_bau_disabled, stat->s_bau_reenabled);
969 } 993 }
970 994
971 return 0; 995 return 0;
972} 996}
973 997
974/* 998/*
999 * Display the tunables thru debugfs
1000 */
1001static ssize_t tunables_read(struct file *file, char __user *userbuf,
1002 size_t count, loff_t *ppos)
1003{
1004 char buf[300];
1005 int ret;
1006
1007 ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1008 "max_bau_concurrent plugged_delay plugsb4reset",
1009 "timeoutsb4reset ipi_reset_limit complete_threshold",
1010 "congested_response_us congested_reps congested_period",
1011 max_bau_concurrent, plugged_delay, plugsb4reset,
1012 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1013 congested_response_us, congested_reps, congested_period);
1014
1015 return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
1016}
1017
1018/*
 975 * -1: reset the statistics 1019
976 * 0: display meaning of the statistics 1020 * 0: display meaning of the statistics
977 * >0: maximum concurrent active descriptors per uvhub (throttle)
978 */ 1021 */
979static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, 1022static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
980 size_t count, loff_t *data) 1023 size_t count, loff_t *data)
@@ -983,7 +1026,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
983 long input_arg; 1026 long input_arg;
984 char optstr[64]; 1027 char optstr[64];
985 struct ptc_stats *stat; 1028 struct ptc_stats *stat;
986 struct bau_control *bcp;
987 1029
988 if (count == 0 || count > sizeof(optstr)) 1030 if (count == 0 || count > sizeof(optstr))
989 return -EINVAL; 1031 return -EINVAL;
@@ -1059,29 +1101,158 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
1059 "reset: number of ipi-style reset requests processed\n"); 1101 "reset: number of ipi-style reset requests processed\n");
1060 printk(KERN_DEBUG 1102 printk(KERN_DEBUG
1061 "rcan: number messages canceled by reset requests\n"); 1103 "rcan: number messages canceled by reset requests\n");
1104 printk(KERN_DEBUG
1105 "disable: number times use of the BAU was disabled\n");
1106 printk(KERN_DEBUG
1107 "enable: number times use of the BAU was re-enabled\n");
1062 } else if (input_arg == -1) { 1108 } else if (input_arg == -1) {
1063 for_each_present_cpu(cpu) { 1109 for_each_present_cpu(cpu) {
1064 stat = &per_cpu(ptcstats, cpu); 1110 stat = &per_cpu(ptcstats, cpu);
1065 memset(stat, 0, sizeof(struct ptc_stats)); 1111 memset(stat, 0, sizeof(struct ptc_stats));
1066 } 1112 }
1067 } else { 1113 }
1068 uv_bau_max_concurrent = input_arg; 1114
1069 bcp = &per_cpu(bau_control, smp_processor_id()); 1115 return count;
1070 if (uv_bau_max_concurrent < 1 || 1116}
1071 uv_bau_max_concurrent > bcp->cpus_in_uvhub) { 1117
1072 printk(KERN_DEBUG 1118static int local_atoi(const char *name)
1073 "Error: BAU max concurrent %d; %d is invalid\n", 1119{
1074 bcp->max_concurrent, uv_bau_max_concurrent); 1120 int val = 0;
1075 return -EINVAL; 1121
1076 } 1122 for (;; name++) {
1077 printk(KERN_DEBUG "Set BAU max concurrent:%d\n", 1123 switch (*name) {
1078 uv_bau_max_concurrent); 1124 case '0' ... '9':
1079 for_each_present_cpu(cpu) { 1125 val = 10*val+(*name-'0');
1080 bcp = &per_cpu(bau_control, cpu); 1126 break;
1081 bcp->max_concurrent = uv_bau_max_concurrent; 1127 default:
1128 return val;
1082 } 1129 }
1083 } 1130 }
1131}
1132
1133/*
1134 * set the tunables
1135 * 0 values reset them to defaults
1136 */
1137static ssize_t tunables_write(struct file *file, const char __user *user,
1138 size_t count, loff_t *data)
1139{
1140 int cpu;
1141 int cnt = 0;
1142 int val;
1143 char *p;
1144 char *q;
1145 char instr[64];
1146 struct bau_control *bcp;
1147
1148 if (count == 0 || count > sizeof(instr)-1)
1149 return -EINVAL;
1150 if (copy_from_user(instr, user, count))
1151 return -EFAULT;
1084 1152
1153 instr[count] = '\0';
1154 /* count the fields */
1155 p = instr + strspn(instr, WHITESPACE);
1156 q = p;
1157 for (; *p; p = q + strspn(q, WHITESPACE)) {
1158 q = p + strcspn(p, WHITESPACE);
1159 cnt++;
1160 if (q == p)
1161 break;
1162 }
1163 if (cnt != 9) {
1164 printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
1165 return -EINVAL;
1166 }
1167
1168 p = instr + strspn(instr, WHITESPACE);
1169 q = p;
1170 for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
1171 q = p + strcspn(p, WHITESPACE);
1172 val = local_atoi(p);
1173 switch (cnt) {
1174 case 0:
1175 if (val == 0) {
1176 max_bau_concurrent = MAX_BAU_CONCURRENT;
1177 max_bau_concurrent_constant =
1178 MAX_BAU_CONCURRENT;
1179 continue;
1180 }
1181 bcp = &per_cpu(bau_control, smp_processor_id());
1182 if (val < 1 || val > bcp->cpus_in_uvhub) {
1183 printk(KERN_DEBUG
1184 "Error: BAU max concurrent %d is invalid\n",
1185 val);
1186 return -EINVAL;
1187 }
1188 max_bau_concurrent = val;
1189 max_bau_concurrent_constant = val;
1190 continue;
1191 case 1:
1192 if (val == 0)
1193 plugged_delay = PLUGGED_DELAY;
1194 else
1195 plugged_delay = val;
1196 continue;
1197 case 2:
1198 if (val == 0)
1199 plugsb4reset = PLUGSB4RESET;
1200 else
1201 plugsb4reset = val;
1202 continue;
1203 case 3:
1204 if (val == 0)
1205 timeoutsb4reset = TIMEOUTSB4RESET;
1206 else
1207 timeoutsb4reset = val;
1208 continue;
1209 case 4:
1210 if (val == 0)
1211 ipi_reset_limit = IPI_RESET_LIMIT;
1212 else
1213 ipi_reset_limit = val;
1214 continue;
1215 case 5:
1216 if (val == 0)
1217 complete_threshold = COMPLETE_THRESHOLD;
1218 else
1219 complete_threshold = val;
1220 continue;
1221 case 6:
1222 if (val == 0)
1223 congested_response_us = CONGESTED_RESPONSE_US;
1224 else
1225 congested_response_us = val;
1226 continue;
1227 case 7:
1228 if (val == 0)
1229 congested_reps = CONGESTED_REPS;
1230 else
1231 congested_reps = val;
1232 continue;
1233 case 8:
1234 if (val == 0)
1235 congested_period = CONGESTED_PERIOD;
1236 else
1237 congested_period = val;
1238 continue;
1239 }
1240 if (q == p)
1241 break;
1242 }
1243 for_each_present_cpu(cpu) {
1244 bcp = &per_cpu(bau_control, cpu);
1245 bcp->max_bau_concurrent = max_bau_concurrent;
1246 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1247 bcp->plugged_delay = plugged_delay;
1248 bcp->plugsb4reset = plugsb4reset;
1249 bcp->timeoutsb4reset = timeoutsb4reset;
1250 bcp->ipi_reset_limit = ipi_reset_limit;
1251 bcp->complete_threshold = complete_threshold;
1252 bcp->congested_response_us = congested_response_us;
1253 bcp->congested_reps = congested_reps;
1254 bcp->congested_period = congested_period;
1255 }
1085 return count; 1256 return count;
1086} 1257}
1087 1258
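As a usage example for the nine-field tunables format parsed above (hedged: the debugfs path assumes the usual mount point and the UV_BAU_TUNABLES_DIR/UV_BAU_TUNABLES_FILE names created below, and the values are purely illustrative; a 0 in any field restores that field's default):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* max_bau_concurrent plugged_delay plugsb4reset timeoutsb4reset
		 * ipi_reset_limit complete_threshold congested_response_us
		 * congested_reps congested_period */
		const char *vals = "2 10 100 3 2 5 1000 10 30\n";
		int fd = open("/sys/kernel/debug/sgi_uv/bau_tunables", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, vals, strlen(vals)) < 0)
			return 1;
		close(fd);
		return 0;
	}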
@@ -1097,6 +1268,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file)
1097 return seq_open(file, &uv_ptc_seq_ops); 1268 return seq_open(file, &uv_ptc_seq_ops);
1098} 1269}
1099 1270
1271static int tunables_open(struct inode *inode, struct file *file)
1272{
1273 return 0;
1274}
1275
1100static const struct file_operations proc_uv_ptc_operations = { 1276static const struct file_operations proc_uv_ptc_operations = {
1101 .open = uv_ptc_proc_open, 1277 .open = uv_ptc_proc_open,
1102 .read = seq_read, 1278 .read = seq_read,
@@ -1105,6 +1281,12 @@ static const struct file_operations proc_uv_ptc_operations = {
1105 .release = seq_release, 1281 .release = seq_release,
1106}; 1282};
1107 1283
1284static const struct file_operations tunables_fops = {
1285 .open = tunables_open,
1286 .read = tunables_read,
1287 .write = tunables_write,
1288};
1289
1108static int __init uv_ptc_init(void) 1290static int __init uv_ptc_init(void)
1109{ 1291{
1110 struct proc_dir_entry *proc_uv_ptc; 1292 struct proc_dir_entry *proc_uv_ptc;
@@ -1119,6 +1301,20 @@ static int __init uv_ptc_init(void)
1119 UV_PTC_BASENAME); 1301 UV_PTC_BASENAME);
1120 return -EINVAL; 1302 return -EINVAL;
1121 } 1303 }
1304
1305 tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
1306 if (!tunables_dir) {
1307 printk(KERN_ERR "unable to create debugfs directory %s\n",
1308 UV_BAU_TUNABLES_DIR);
1309 return -EINVAL;
1310 }
1311 tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
1312 tunables_dir, NULL, &tunables_fops);
1313 if (!tunables_file) {
1314 printk(KERN_ERR "unable to create debugfs file %s\n",
1315 UV_BAU_TUNABLES_FILE);
1316 return -EINVAL;
1317 }
1122 return 0; 1318 return 0;
1123} 1319}
1124 1320
@@ -1259,15 +1455,45 @@ static void __init uv_init_uvhub(int uvhub, int vector)
1259} 1455}
1260 1456
1261/* 1457/*
1458 * We will set BAU_MISC_CONTROL with a timeout period.
1459 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
 1460 * So the destination timeout period has to be calculated from them.
1461 */
1462static int
1463calculate_destination_timeout(void)
1464{
1465 unsigned long mmr_image;
1466 int mult1;
1467 int mult2;
1468 int index;
1469 int base;
1470 int ret;
1471 unsigned long ts_ns;
1472
1473 mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1474 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1475 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1476 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1477 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1478 base = timeout_base_ns[index];
1479 ts_ns = base * mult1 * mult2;
1480 ret = ts_ns / 1000;
1481 return ret;
1482}
1483
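calculate_destination_timeout() above combines a base period, selected by the urgency index the BIOS programmed into UVH_AGING_PRESCALE_SEL, with two multipliers and converts the product from nanoseconds to microseconds. A worked userspace sketch of that arithmetic; the table contents and field values below are placeholders, not the real UV register values:

#include <stdio.h>

/* placeholder base periods indexed by the urgency field, in nanoseconds */
static const int timeout_base_ns[] = { 20, 160, 1280, 10240 };

int main(void)
{
        int mult1 = 3;          /* low bits of the soft-ack timeout period */
        int index = 2;          /* urgency field from UVH_AGING_PRESCALE_SEL */
        int mult2 = 10;         /* field from UVH_TRANSACTION_TIMEOUT */
        unsigned long ts_ns;

        ts_ns = (unsigned long)timeout_base_ns[index] * mult1 * mult2;
        printf("destination timeout: %lu ns = %lu us\n", ts_ns, ts_ns / 1000);
        return 0;
}
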
1484/*
1262 * initialize the bau_control structure for each cpu 1485 * initialize the bau_control structure for each cpu
1263 */ 1486 */
1264static void uv_init_per_cpu(int nuvhubs) 1487static void __init uv_init_per_cpu(int nuvhubs)
1265{ 1488{
1266 int i, j, k; 1489 int i;
1267 int cpu; 1490 int cpu;
1268 int pnode; 1491 int pnode;
1269 int uvhub; 1492 int uvhub;
1493 int have_hmaster;
1270 short socket = 0; 1494 short socket = 0;
1495 unsigned short socket_mask;
1496 unsigned char *uvhub_mask;
1271 struct bau_control *bcp; 1497 struct bau_control *bcp;
1272 struct uvhub_desc *bdp; 1498 struct uvhub_desc *bdp;
1273 struct socket_desc *sdp; 1499 struct socket_desc *sdp;
@@ -1278,7 +1504,7 @@ static void uv_init_per_cpu(int nuvhubs)
1278 short cpu_number[16]; 1504 short cpu_number[16];
1279 }; 1505 };
1280 struct uvhub_desc { 1506 struct uvhub_desc {
1281 short num_sockets; 1507 unsigned short socket_mask;
1282 short num_cpus; 1508 short num_cpus;
1283 short uvhub; 1509 short uvhub;
1284 short pnode; 1510 short pnode;
@@ -1286,57 +1512,84 @@ static void uv_init_per_cpu(int nuvhubs)
1286 }; 1512 };
1287 struct uvhub_desc *uvhub_descs; 1513 struct uvhub_desc *uvhub_descs;
1288 1514
1515 timeout_us = calculate_destination_timeout();
1516
1289 uvhub_descs = (struct uvhub_desc *) 1517 uvhub_descs = (struct uvhub_desc *)
1290 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); 1518 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1291 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); 1519 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1520 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1292 for_each_present_cpu(cpu) { 1521 for_each_present_cpu(cpu) {
1293 bcp = &per_cpu(bau_control, cpu); 1522 bcp = &per_cpu(bau_control, cpu);
1294 memset(bcp, 0, sizeof(struct bau_control)); 1523 memset(bcp, 0, sizeof(struct bau_control));
1295 spin_lock_init(&bcp->masks_lock);
1296 bcp->max_concurrent = uv_bau_max_concurrent;
1297 pnode = uv_cpu_hub_info(cpu)->pnode; 1524 pnode = uv_cpu_hub_info(cpu)->pnode;
1298 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; 1525 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1526 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1299 bdp = &uvhub_descs[uvhub]; 1527 bdp = &uvhub_descs[uvhub];
1300 bdp->num_cpus++; 1528 bdp->num_cpus++;
1301 bdp->uvhub = uvhub; 1529 bdp->uvhub = uvhub;
1302 bdp->pnode = pnode; 1530 bdp->pnode = pnode;
1303 /* time interval to catch a hardware stay-busy bug */ 1531 /* kludge: 'assuming' one node per socket, and assuming that
1304 bcp->timeout_interval = millisec_2_cycles(3); 1532 disabling a socket just leaves a gap in node numbers */
1305 /* kludge: assume uv_hub.h is constant */ 1533 socket = (cpu_to_node(cpu) & 1);
1306 socket = (cpu_physical_id(cpu)>>5)&1; 1534 bdp->socket_mask |= (1 << socket);
1307 if (socket >= bdp->num_sockets)
1308 bdp->num_sockets = socket+1;
1309 sdp = &bdp->socket[socket]; 1535 sdp = &bdp->socket[socket];
1310 sdp->cpu_number[sdp->num_cpus] = cpu; 1536 sdp->cpu_number[sdp->num_cpus] = cpu;
1311 sdp->num_cpus++; 1537 sdp->num_cpus++;
1312 } 1538 }
1313 socket = 0; 1539 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1314 for_each_possible_blade(uvhub) { 1540 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
1541 continue;
1542 have_hmaster = 0;
1315 bdp = &uvhub_descs[uvhub]; 1543 bdp = &uvhub_descs[uvhub];
1316 for (i = 0; i < bdp->num_sockets; i++) { 1544 socket_mask = bdp->socket_mask;
1317 sdp = &bdp->socket[i]; 1545 socket = 0;
1318 for (j = 0; j < sdp->num_cpus; j++) { 1546 while (socket_mask) {
1319 cpu = sdp->cpu_number[j]; 1547 if (!(socket_mask & 1))
1548 goto nextsocket;
1549 sdp = &bdp->socket[socket];
1550 for (i = 0; i < sdp->num_cpus; i++) {
1551 cpu = sdp->cpu_number[i];
1320 bcp = &per_cpu(bau_control, cpu); 1552 bcp = &per_cpu(bau_control, cpu);
1321 bcp->cpu = cpu; 1553 bcp->cpu = cpu;
1322 if (j == 0) { 1554 if (i == 0) {
1323 smaster = bcp; 1555 smaster = bcp;
1324 if (i == 0) 1556 if (!have_hmaster) {
1557 have_hmaster++;
1325 hmaster = bcp; 1558 hmaster = bcp;
1559 }
1326 } 1560 }
1327 bcp->cpus_in_uvhub = bdp->num_cpus; 1561 bcp->cpus_in_uvhub = bdp->num_cpus;
1328 bcp->cpus_in_socket = sdp->num_cpus; 1562 bcp->cpus_in_socket = sdp->num_cpus;
1329 bcp->socket_master = smaster; 1563 bcp->socket_master = smaster;
1564 bcp->uvhub = bdp->uvhub;
1330 bcp->uvhub_master = hmaster; 1565 bcp->uvhub_master = hmaster;
1331 for (k = 0; k < DEST_Q_SIZE; k++) 1566 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1332 bcp->socket_acknowledge_count[k] = 0; 1567 blade_processor_id;
1333 bcp->uvhub_cpu =
1334 uv_cpu_hub_info(cpu)->blade_processor_id;
1335 } 1568 }
1569nextsocket:
1336 socket++; 1570 socket++;
1571 socket_mask = (socket_mask >> 1);
1337 } 1572 }
1338 } 1573 }
1339 kfree(uvhub_descs); 1574 kfree(uvhub_descs);
1575 kfree(uvhub_mask);
1576 for_each_present_cpu(cpu) {
1577 bcp = &per_cpu(bau_control, cpu);
1578 bcp->baudisabled = 0;
1579 bcp->statp = &per_cpu(ptcstats, cpu);
1580 /* time interval to catch a hardware stay-busy bug */
1581 bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
1582 bcp->max_bau_concurrent = max_bau_concurrent;
1583 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1584 bcp->plugged_delay = plugged_delay;
1585 bcp->plugsb4reset = plugsb4reset;
1586 bcp->timeoutsb4reset = timeoutsb4reset;
1587 bcp->ipi_reset_limit = ipi_reset_limit;
1588 bcp->complete_threshold = complete_threshold;
1589 bcp->congested_response_us = congested_response_us;
1590 bcp->congested_reps = congested_reps;
1591 bcp->congested_period = congested_period;
1592 }
1340} 1593}
1341 1594
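uv_init_per_cpu() above records the populated hubs and sockets in bitmasks on its first pass and then walks socket_mask bit by bit on the second, skipping the gaps left by disabled sockets. The shift-and-test idiom in isolation (standalone sketch, not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned short socket_mask = 0x5;       /* sockets 0 and 2 populated */
        int socket = 0;

        while (socket_mask) {
                if (socket_mask & 1)
                        printf("initializing socket %d\n", socket);
                /* else: gap left by a disabled socket, nothing to do */
                socket++;
                socket_mask >>= 1;
        }
        return 0;
}
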
1342/* 1595/*
@@ -1361,10 +1614,11 @@ static int __init uv_bau_init(void)
1361 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 1614 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
1362 GFP_KERNEL, cpu_to_node(cur_cpu)); 1615 GFP_KERNEL, cpu_to_node(cur_cpu));
1363 1616
1364 uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
1365 uv_nshift = uv_hub_info->m_val; 1617 uv_nshift = uv_hub_info->m_val;
1366 uv_mmask = (1UL << uv_hub_info->m_val) - 1; 1618 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1367 nuvhubs = uv_num_possible_blades(); 1619 nuvhubs = uv_num_possible_blades();
1620 spin_lock_init(&disable_lock);
1621 congested_cycles = microsec_2_cycles(congested_response_us);
1368 1622
1369 uv_init_per_cpu(nuvhubs); 1623 uv_init_per_cpu(nuvhubs);
1370 1624
@@ -1383,15 +1637,19 @@ static int __init uv_bau_init(void)
1383 alloc_intr_gate(vector, uv_bau_message_intr1); 1637 alloc_intr_gate(vector, uv_bau_message_intr1);
1384 1638
1385 for_each_possible_blade(uvhub) { 1639 for_each_possible_blade(uvhub) {
1386 pnode = uv_blade_to_pnode(uvhub); 1640 if (uv_blade_nr_possible_cpus(uvhub)) {
1387 /* INIT the bau */ 1641 pnode = uv_blade_to_pnode(uvhub);
1388 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, 1642 /* INIT the bau */
1389 ((unsigned long)1 << 63)); 1643 uv_write_global_mmr64(pnode,
1390 mmr = 1; /* should be 1 to broadcast to both sockets */ 1644 UVH_LB_BAU_SB_ACTIVATION_CONTROL,
1391 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); 1645 ((unsigned long)1 << 63));
1646 mmr = 1; /* should be 1 to broadcast to both sockets */
1647 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
1648 mmr);
1649 }
1392 } 1650 }
1393 1651
1394 return 0; 1652 return 0;
1395} 1653}
1396core_initcall(uv_bau_init); 1654core_initcall(uv_bau_init);
1397core_initcall(uv_ptc_init); 1655fs_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index c652ef62742d..e2a595257390 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,6 +1,7 @@
1#include <linux/io.h> 1#include <linux/io.h>
2 2
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/pgtable.h>
4#include <asm/e820.h> 5#include <asm/e820.h>
5 6
6#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) 7#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
@@ -37,3 +38,19 @@ unsigned long __trampinit setup_trampoline(void)
37 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 38 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
38 return virt_to_phys(trampoline_base); 39 return virt_to_phys(trampoline_base);
39} 40}
41
42void __init setup_trampoline_page_table(void)
43{
44#ifdef CONFIG_X86_32
45 /* Copy kernel address range */
46 clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
47 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
48 KERNEL_PGD_PTRS);
49
50 /* Initialize low mappings */
51 clone_pgd_range(trampoline_pg_dir,
52 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
53 min_t(unsigned long, KERNEL_PGD_PTRS,
54 KERNEL_PGD_BOUNDARY));
55#endif
56}
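
setup_trampoline_page_table() copies the kernel portion of swapper_pg_dir into trampoline_pg_dir and then reuses those kernel entries to populate the low slots, and clone_pgd_range() amounts to a memcpy of page-directory entries. A simplified userspace sketch of the same copy pattern; the entry counts, the boundary and the pgd_t stand-in below are illustrative, not the real mm definitions:

#include <stdio.h>
#include <string.h>

#define PGD_ENTRIES             1024    /* illustrative */
#define KERNEL_PGD_BOUNDARY     768     /* illustrative 3G/1G split */
#define KERNEL_PGD_PTRS         (PGD_ENTRIES - KERNEL_PGD_BOUNDARY)

typedef unsigned long pgd_t;            /* stand-in for the real pgd_t */

static pgd_t swapper_pg_dir[PGD_ENTRIES];
static pgd_t trampoline_pg_dir[PGD_ENTRIES];

/* clone_pgd_range() boils down to copying page-directory entries */
static void clone_pgd_range_sketch(pgd_t *dst, const pgd_t *src, unsigned int n)
{
        memcpy(dst, src, n * sizeof(pgd_t));
}

int main(void)
{
        unsigned int low = KERNEL_PGD_PTRS < KERNEL_PGD_BOUNDARY ?
                           KERNEL_PGD_PTRS : KERNEL_PGD_BOUNDARY;

        /* kernel address range, as in setup_trampoline_page_table() */
        clone_pgd_range_sketch(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
                               swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                               KERNEL_PGD_PTRS);
        /* low mappings: reuse the kernel entries for the low slots */
        clone_pgd_range_sketch(trampoline_pg_dir,
                               swapper_pg_dir + KERNEL_PGD_BOUNDARY, low);

        printf("copied %d + %u page-directory entries\n",
               KERNEL_PGD_PTRS, low);
        return 0;
}
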
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 725ef4d17cd5..60788dee0f8a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -392,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
393 == NOTIFY_STOP) 393 == NOTIFY_STOP)
394 return; 394 return;
395
395#ifdef CONFIG_X86_LOCAL_APIC 396#ifdef CONFIG_X86_LOCAL_APIC
397 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
398 == NOTIFY_STOP)
399 return;
400
401#ifndef CONFIG_LOCKUP_DETECTOR
396 /* 402 /*
397 * Ok, so this is none of the documented NMI sources, 403 * Ok, so this is none of the documented NMI sources,
398 * so it must be the NMI watchdog. 404 * so it must be the NMI watchdog.
@@ -400,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
400 if (nmi_watchdog_tick(regs, reason)) 406 if (nmi_watchdog_tick(regs, reason))
401 return; 407 return;
402 if (!do_nmi_callback(regs, cpu)) 408 if (!do_nmi_callback(regs, cpu))
409#endif /* !CONFIG_LOCKUP_DETECTOR */
403 unknown_nmi_error(reason, regs); 410 unknown_nmi_error(reason, regs);
404#else 411#else
405 unknown_nmi_error(reason, regs); 412 unknown_nmi_error(reason, regs);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9faf91ae1841..26a863a9c2a8 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -626,6 +626,44 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
626 local_irq_restore(flags); 626 local_irq_restore(flags);
627} 627}
628 628
629static unsigned long long cyc2ns_suspend;
630
631void save_sched_clock_state(void)
632{
633 if (!sched_clock_stable)
634 return;
635
636 cyc2ns_suspend = sched_clock();
637}
638
639/*
 640 * Even on processors with invariant TSC, the TSC gets reset in some of the
 641 * ACPI system sleep states. And in some systems the BIOS seems to reinit the
 642 * TSC to an arbitrary value (still sync'd across cpus) during resume from such
 643 * sleep states. To cope with this, recompute the cyc2ns_offset for each cpu so
644 * that sched_clock() continues from the point where it was left off during
645 * suspend.
646 */
647void restore_sched_clock_state(void)
648{
649 unsigned long long offset;
650 unsigned long flags;
651 int cpu;
652
653 if (!sched_clock_stable)
654 return;
655
656 local_irq_save(flags);
657
658 __get_cpu_var(cyc2ns_offset) = 0;
659 offset = cyc2ns_suspend - sched_clock();
660
661 for_each_possible_cpu(cpu)
662 per_cpu(cyc2ns_offset, cpu) = offset;
663
664 local_irq_restore(flags);
665}
666
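save_sched_clock_state() records sched_clock() at suspend time; restore_sched_clock_state() zeroes the per-cpu offset and recomputes it so the clock continues from the saved value even though the underlying counter was reset. The arithmetic in isolation (standalone sketch; a plain variable stands in for the TSC-derived clock):

#include <stdio.h>

static unsigned long long counter;      /* stand-in for the raw TSC-based clock */
static long long offset;                /* stand-in for cyc2ns_offset */

static unsigned long long sched_clock_sketch(void)
{
        return counter + offset;
}

int main(void)
{
        unsigned long long saved;

        counter = 1000;                         /* clock has been running */
        saved = sched_clock_sketch();           /* save_sched_clock_state() */

        counter = 7;                            /* counter reset across suspend */
        offset = 0;                             /* clear the stale offset first */
        offset = saved - sched_clock_sketch();  /* restore_sched_clock_state() */

        printf("clock resumes at %llu (saved %llu)\n",
               sched_clock_sketch(), saved);
        return 0;
}
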
629#ifdef CONFIG_CPU_FREQ 667#ifdef CONFIG_CPU_FREQ
630 668
631/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency 669/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -751,7 +789,6 @@ static struct clocksource clocksource_tsc = {
751 .read = read_tsc, 789 .read = read_tsc,
752 .resume = resume_tsc, 790 .resume = resume_tsc,
753 .mask = CLOCKSOURCE_MASK(64), 791 .mask = CLOCKSOURCE_MASK(64),
754 .shift = 22,
755 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 792 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
756 CLOCK_SOURCE_MUST_VERIFY, 793 CLOCK_SOURCE_MUST_VERIFY,
757#ifdef CONFIG_X86_64 794#ifdef CONFIG_X86_64
@@ -845,8 +882,6 @@ __cpuinit int unsynchronized_tsc(void)
845 882
846static void __init init_tsc_clocksource(void) 883static void __init init_tsc_clocksource(void)
847{ 884{
848 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
849 clocksource_tsc.shift);
850 if (tsc_clocksource_reliable) 885 if (tsc_clocksource_reliable)
851 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 886 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
852 /* lower the rating if we already know its unstable: */ 887 /* lower the rating if we already know its unstable: */
@@ -854,7 +889,7 @@ static void __init init_tsc_clocksource(void)
854 clocksource_tsc.rating = 0; 889 clocksource_tsc.rating = 0;
855 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 890 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
856 } 891 }
857 clocksource_register(&clocksource_tsc); 892 clocksource_register_khz(&clocksource_tsc, tsc_khz);
858} 893}
859 894
860#ifdef CONFIG_X86_64 895#ifdef CONFIG_X86_64
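
The tsc.c hunks also drop the hard-coded shift and the manual clocksource_khz2mult() call and let clocksource_register_khz() choose the scaling factors. The underlying conversion is ns = (cycles * mult) >> shift, with mult picked so that a counter rated in kHz maps to nanoseconds; a worked userspace example of that arithmetic with illustrative values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t khz = 2400000;         /* a 2.4 GHz TSC */
        uint32_t shift = 22;
        /* mult chosen so that (cycles * mult) >> shift yields nanoseconds */
        uint32_t mult = (uint32_t)(((uint64_t)1000000 << shift) / khz);
        uint64_t cycles = 2400000000ULL;        /* one second's worth */

        printf("mult=%u, 1s of cycles -> %llu ns\n", mult,
               (unsigned long long)((cycles * mult) >> shift));
        return 0;
}
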
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
index 45b6f8a975a1..56a8c2a867d9 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <asm/cpufeature.h> 33#include <asm/cpufeature.h>
34#include <asm/msr-index.h>
34 35
35verify_cpu: 36verify_cpu:
36 pushfl # Save caller passed flags 37 pushfl # Save caller passed flags
@@ -88,7 +89,7 @@ verify_cpu_sse_test:
88 je verify_cpu_sse_ok 89 je verify_cpu_sse_ok
89 test %di,%di 90 test %di,%di
90 jz verify_cpu_no_longmode # only try to force SSE on AMD 91 jz verify_cpu_no_longmode # only try to force SSE on AMD
91 movl $0xc0010015,%ecx # HWCR 92 movl $MSR_K7_HWCR,%ecx
92 rdmsr 93 rdmsr
93 btr $15,%eax # enable SSE 94 btr $15,%eax # enable SSE
94 wrmsr 95 wrmsr
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1c0c6ab9c60f..dcbb28c4b694 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, 76void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
77 u32 mult) 77 struct clocksource *clock, u32 mult)
78{ 78{
79 unsigned long flags; 79 unsigned long flags;
80 80
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
87 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
90 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; 90 vsyscall_gtod_data.wall_to_monotonic = *wtm;
91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
93} 93}
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
169 * unlikely */ 169 * unlikely */
170time_t __vsyscall(1) vtime(time_t *t) 170time_t __vsyscall(1) vtime(time_t *t)
171{ 171{
172 struct timeval tv; 172 unsigned seq;
173 time_t result; 173 time_t result;
174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
175 return time_syscall(t); 175 return time_syscall(t);
176 176
177 vgettimeofday(&tv, NULL); 177 do {
178 result = tv.tv_sec; 178 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
179
180 result = __vsyscall_gtod_data.wall_time_sec;
181
182 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
183
179 if (t) 184 if (t)
180 *t = result; 185 *t = result;
181 return result; 186 return result;
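
vtime() now reads wall_time_sec directly under the vsyscall gtod seqlock, retrying if a writer updated the data mid-read, instead of paying for a full vgettimeofday() call. A userspace sketch of that lockless reader protocol; C11 atomics and a bare sequence counter stand in for the kernel's read_seqbegin()/read_seqretry(), which add the barriers needed for real cross-CPU use:

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static atomic_uint seq;
static time_t wall_time_sec;

/* writer: bump the counter to odd, update the data, bump back to even */
static void writer_update(time_t now)
{
        atomic_fetch_add_explicit(&seq, 1, memory_order_release);
        wall_time_sec = now;
        atomic_fetch_add_explicit(&seq, 1, memory_order_release);
}

/* reader: retry if the counter was odd or changed while we were reading */
static time_t reader_vtime(void)
{
        unsigned int start;
        time_t result;

        do {
                start = atomic_load_explicit(&seq, memory_order_acquire);
                result = wall_time_sec;
        } while ((start & 1) ||
                 start != atomic_load_explicit(&seq, memory_order_acquire));

        return result;
}

int main(void)
{
        writer_update(time(NULL));
        printf("vtime sketch: %ld\n", (long)reader_vtime());
        return 0;
}
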
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 37e68fc5e24a..9c253bd65e24 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -16,11 +16,88 @@
16 */ 16 */
17u64 pcntxt_mask; 17u64 pcntxt_mask;
18 18
19/*
20 * Represents init state for the supported extended state.
21 */
22static struct xsave_struct *init_xstate_buf;
23
19struct _fpx_sw_bytes fx_sw_reserved; 24struct _fpx_sw_bytes fx_sw_reserved;
20#ifdef CONFIG_IA32_EMULATION 25#ifdef CONFIG_IA32_EMULATION
21struct _fpx_sw_bytes fx_sw_reserved_ia32; 26struct _fpx_sw_bytes fx_sw_reserved_ia32;
22#endif 27#endif
23 28
29static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
30
31/*
 32 * If a processor implementation discerns that a processor state component is
 33 * in its initialized state it may modify the corresponding bit in the
 34 * xsave_hdr.xstate_bv to '0', without modifying the corresponding memory
35 * layout in the case of xsaveopt. While presenting the xstate information to
36 * the user, we always ensure that the memory layout of a feature will be in
37 * the init state if the corresponding header bit is zero. This is to ensure
38 * that the user doesn't see some stale state in the memory layout during
39 * signal handling, debugging etc.
40 */
41void __sanitize_i387_state(struct task_struct *tsk)
42{
43 u64 xstate_bv;
44 int feature_bit = 0x2;
45 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
46
47 if (!fx)
48 return;
49
50 BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
51
52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
53
54 /*
55 * None of the feature bits are in init state. So nothing else
 56 * to do for us, as the memory layout is up to date.
57 */
58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
59 return;
60
61 /*
62 * FP is in init state
63 */
64 if (!(xstate_bv & XSTATE_FP)) {
65 fx->cwd = 0x37f;
66 fx->swd = 0;
67 fx->twd = 0;
68 fx->fop = 0;
69 fx->rip = 0;
70 fx->rdp = 0;
71 memset(&fx->st_space[0], 0, 128);
72 }
73
74 /*
75 * SSE is in init state
76 */
77 if (!(xstate_bv & XSTATE_SSE))
78 memset(&fx->xmm_space[0], 0, 256);
79
80 xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
81
82 /*
83 * Update all the other memory layouts for which the corresponding
84 * header bit is in the init state.
85 */
86 while (xstate_bv) {
87 if (xstate_bv & 0x1) {
88 int offset = xstate_offsets[feature_bit];
89 int size = xstate_sizes[feature_bit];
90
91 memcpy(((void *) fx) + offset,
92 ((void *) init_xstate_buf) + offset,
93 size);
94 }
95
96 xstate_bv >>= 1;
97 feature_bit++;
98 }
99}
100
24/* 101/*
25 * Check for the presence of extended state information in the 102 * Check for the presence of extended state information in the
26 * user fpstate pointer in the sigcontext. 103 * user fpstate pointer in the sigcontext.
@@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
36 113
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], 114 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes)); 115 sizeof(struct _fpx_sw_bytes));
39
40 if (err) 116 if (err)
41 return err; 117 return -EFAULT;
42 118
43 /* 119 /*
44 * First Magic check failed. 120 * First Magic check failed.
45 */ 121 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) 122 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1; 123 return -EINVAL;
48 124
49 /* 125 /*
50 * Check for error scenarios. 126 * Check for error scenarios.
@@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
52 if (fx_sw_user->xstate_size < min_xstate_size || 128 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size || 129 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size) 130 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1; 131 return -EINVAL;
56 132
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) + 133 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size - 134 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE)); 135 FP_XSTATE_MAGIC2_SIZE));
136 if (err)
137 return err;
60 /* 138 /*
61 * Check for the presence of second magic word at the end of memory 139 * Check for the presence of second magic word at the end of memory
62 * layout. This detects the case where the user just copied the legacy 140 * layout. This detects the case where the user just copied the legacy
 63 * fpstate layout without copying the extended state information 141
64 * in the memory layout. 142 * in the memory layout.
65 */ 143 */
66 if (err || magic2 != FP_XSTATE_MAGIC2) 144 if (magic2 != FP_XSTATE_MAGIC2)
67 return -1; 145 return -EFAULT;
68 146
69 return 0; 147 return 0;
70} 148}
@@ -91,14 +169,6 @@ int save_i387_xstate(void __user *buf)
91 return 0; 169 return 0;
92 170
93 if (task_thread_info(tsk)->status & TS_USEDFPU) { 171 if (task_thread_info(tsk)->status & TS_USEDFPU) {
94 /*
95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave.
97 */
98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
101
102 if (use_xsave()) 172 if (use_xsave())
103 err = xsave_user(buf); 173 err = xsave_user(buf);
104 else 174 else
@@ -109,6 +179,7 @@ int save_i387_xstate(void __user *buf)
109 task_thread_info(tsk)->status &= ~TS_USEDFPU; 179 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts(); 180 stts();
111 } else { 181 } else {
182 sanitize_i387_state(tsk);
112 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, 183 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
113 xstate_size)) 184 xstate_size))
114 return -1; 185 return -1;
@@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf)
184 * init the state skipped by the user. 255 * init the state skipped by the user.
185 */ 256 */
186 mask = pcntxt_mask & ~mask; 257 mask = pcntxt_mask & ~mask;
187 258 if (unlikely(mask))
188 xrstor_state(init_xstate_buf, mask); 259 xrstor_state(init_xstate_buf, mask);
189 260
190 return 0; 261 return 0;
191 262
@@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void)
274#endif 345#endif
275} 346}
276 347
277/*
278 * Represents init state for the supported extended state.
279 */
280struct xsave_struct *init_xstate_buf;
281
282#ifdef CONFIG_X86_64 348#ifdef CONFIG_X86_64
283unsigned int sig_xstate_size = sizeof(struct _fpstate); 349unsigned int sig_xstate_size = sizeof(struct _fpstate);
284#endif 350#endif
@@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
286/* 352/*
287 * Enable the extended processor state save/restore feature 353 * Enable the extended processor state save/restore feature
288 */ 354 */
289void __cpuinit xsave_init(void) 355static inline void xstate_enable(void)
290{ 356{
291 if (!cpu_has_xsave)
292 return;
293
294 set_in_cr4(X86_CR4_OSXSAVE); 357 set_in_cr4(X86_CR4_OSXSAVE);
295
296 /*
297 * Enable all the features that the HW is capable of
298 * and the Linux kernel is aware of.
299 */
300 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); 358 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
301} 359}
302 360
303/* 361/*
362 * Record the offsets and sizes of different state managed by the xsave
363 * memory layout.
364 */
365static void __init setup_xstate_features(void)
366{
367 int eax, ebx, ecx, edx, leaf = 0x2;
368
369 xstate_features = fls64(pcntxt_mask);
370 xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
371 xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
372
373 do {
374 cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
375
376 if (eax == 0)
377 break;
378
379 xstate_offsets[leaf] = ebx;
380 xstate_sizes[leaf] = eax;
381
382 leaf++;
383 } while (1);
384}
385
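setup_xstate_features() above walks the subleaves of CPUID leaf 0xD, recording each extended-state component's size (EAX) and offset (EBX) until a subleaf reports a zero size. The same enumeration can be done from userspace, assuming an x86 compiler whose <cpuid.h> provides __get_cpuid_count():

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int subleaf;

        /* subleaves 2 and up describe the extended states beyond x87/SSE */
        for (subleaf = 2; ; subleaf++) {
                if (!__get_cpuid_count(0x0d, subleaf, &eax, &ebx, &ecx, &edx))
                        break;          /* CPUID leaf 0xd not supported */
                if (eax == 0)
                        break;          /* no more populated subleaves */
                printf("xstate component %u: size %u, offset %u\n",
                       subleaf, eax, ebx);
        }
        return 0;
}
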
386/*
304 * setup the xstate image representing the init state 387 * setup the xstate image representing the init state
305 */ 388 */
306static void __init setup_xstate_init(void) 389static void __init setup_xstate_init(void)
307{ 390{
391 setup_xstate_features();
392
393 /*
394 * Setup init_xstate_buf to represent the init state of
395 * all the features managed by the xsave
396 */
308 init_xstate_buf = alloc_bootmem(xstate_size); 397 init_xstate_buf = alloc_bootmem(xstate_size);
309 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; 398 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
399
400 clts();
401 /*
402 * Init all the features state with header_bv being 0x0
403 */
404 xrstor_state(init_xstate_buf, -1);
405 /*
406 * Dump the init state again. This is to identify the init state
 407 * of any feature which is not represented by all zeros.
408 */
409 xsave_state(init_xstate_buf, -1);
410 stts();
310} 411}
311 412
312/* 413/*
313 * Enable and initialize the xsave feature. 414 * Enable and initialize the xsave feature.
314 */ 415 */
315void __ref xsave_cntxt_init(void) 416static void __init xstate_enable_boot_cpu(void)
316{ 417{
317 unsigned int eax, ebx, ecx, edx; 418 unsigned int eax, ebx, ecx, edx;
318 419
319 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 420 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
421 WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
422 return;
423 }
424
425 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
320 pcntxt_mask = eax + ((u64)edx << 32); 426 pcntxt_mask = eax + ((u64)edx << 32);
321 427
322 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { 428 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
@@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void)
329 * Support only the state known to OS. 435 * Support only the state known to OS.
330 */ 436 */
331 pcntxt_mask = pcntxt_mask & XCNTXT_MASK; 437 pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
332 xsave_init(); 438
439 xstate_enable();
333 440
334 /* 441 /*
335 * Recompute the context size for enabled features 442 * Recompute the context size for enabled features
336 */ 443 */
337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 444 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
338 xstate_size = ebx; 445 xstate_size = ebx;
339 446
340 update_regset_xstate_info(xstate_size, pcntxt_mask); 447 update_regset_xstate_info(xstate_size, pcntxt_mask);
@@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void)
346 "cntxt size 0x%x\n", 453 "cntxt size 0x%x\n",
347 pcntxt_mask, xstate_size); 454 pcntxt_mask, xstate_size);
348} 455}
456
457/*
458 * For the very first instance, this calls xstate_enable_boot_cpu();
459 * for all subsequent instances, this calls xstate_enable().
460 *
461 * This is somewhat obfuscated due to the lack of powerful enough
462 * overrides for the section checks.
463 */
464void __cpuinit xsave_init(void)
465{
466 static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
467 void (*this_func)(void);
468
469 if (!cpu_has_xsave)
470 return;
471
472 this_func = next_func;
473 next_func = xstate_enable;
474 this_func();
475}
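
xsave_init() dispatches through a static function pointer that starts out aimed at the boot-cpu path and rewrites itself to the lightweight per-cpu path after the first call, so no separate "already initialized" flag is needed. The pattern in isolation (standalone sketch):

#include <stdio.h>

static void enable_only(void)
{
        printf("secondary cpu: just enable the feature\n");
}

static void enable_boot_cpu(void)
{
        printf("boot cpu: detect and allocate, then enable\n");
        enable_only();
}

static void feature_init(void)
{
        /* first call takes the boot-cpu path, later calls the cheap path */
        static void (*next_func)(void) = enable_boot_cpu;
        void (*this_func)(void);

        this_func = next_func;
        next_func = enable_only;
        this_func();
}

int main(void)
{
        feature_init();         /* boot cpu */
        feature_init();         /* a secondary cpu */
        return 0;
}

The first call pays the detection and allocation cost once; every later caller sees only the cheap enable path.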