58 files changed, 4257 insertions, 3980 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e0..e77b22083721 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y                           += process.o
 obj-y                           += i387.o xsave.o
 obj-y                           += ptrace.o
-obj-$(CONFIG_X86_DS)            += ds.o
-obj-$(CONFIG_X86_DS_SELFTEST)           += ds_selftest.o
 obj-$(CONFIG_X86_32)            += tls.o
 obj-$(CONFIG_IA32_EMULATION)    += tls.o
 obj-y                           += step.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa95..9a5ed58f09dc 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -94,6 +94,53 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
 /*
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
+ */
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+/*
+ * Translate a gsi into the irq number used for it: a gsi that one of
+ * the ISA irqs is routed to yields that ISA irq, any other gsi at or
+ * above NR_IRQS_LEGACY is used unchanged, and a remaining low gsi is
+ * placed in the window that starts at gsi_end + 1.
+ */
+static unsigned int gsi_to_irq(unsigned int gsi)
+{
+        unsigned int isa_irq = 0;
+        /* An ISA irq routed to this gsi takes precedence. */
+        while (isa_irq < NR_IRQS_LEGACY) {
+                if (isa_irq_to_gsi[isa_irq] == gsi)
+                        return isa_irq;
+                isa_irq++;
+        }
+        /*
+         * Truly weird platforms have non isa irqs in the first 16
+         * gsis; those cannot be identity mapped, so they are moved
+         * past the last gsi.
+         */
+        if (gsi < NR_IRQS_LEGACY)
+                return gsi_end + 1 + gsi;
+        /* Everything else is an identity mapping of gsi == irq. */
+        return gsi;
+}
+/*
+ * Translate an irq number back into its gsi; the inverse of gsi_to_irq().
+ *
+ *   irq < NR_IRQS_LEGACY              -> gsi recorded for that ISA irq
+ *   NR_IRQS_LEGACY <= irq <= gsi_end  -> identity mapped
+ *   gsi_end < irq <= gsi_end + NR_IRQS_LEGACY
+ *                                     -> low gsi that gsi_to_irq() placed
+ *                                        at gsi_end + 1 + gsi
+ *
+ * Returns 0xffffffff for a negative irq or one outside all three ranges.
+ */
+static u32 irq_to_gsi(int irq)
+{
+        u32 uirq = irq;
+        /* Reject negative irqs before they are used as an array index. */
+        if (irq < 0)
+                return 0xffffffff;
+        if (uirq < NR_IRQS_LEGACY)
+                return isa_irq_to_gsi[uirq];
+        if (uirq <= gsi_end)
+                return uirq;
+        /* Undo irq = gsi_end + 1 + gsi; the "- 1" keeps the result in 0..15. */
+        if (uirq <= gsi_end + NR_IRQS_LEGACY)
+                return uirq - gsi_end - 1;
+        return 0xffffffff;
+}
+/*
 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
 * to map the target physical address. The problem is that set_fixmap()
 * provides a single page, and it is possible that the page is not
@@ -313,7 +360,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 /*
 * Parse Interrupt Source Override for the ACPI SCI
 */
-static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
 {
        if (trigger == 0)       /* compatible SCI trigger is level */
                trigger = 3;
@@ -333,7 +380,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
         * If GSI is < 16, this will update its flags,
         * else it will create a new mp_irqs[] entry.
         */
-        mp_override_legacy_irq(gsi, polarity, trigger, gsi);
+        mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
        /*
         * stash over-ride to indicate we've been here
@@ -357,9 +404,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
        acpi_table_print_madt_entry(header);
        if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
-                acpi_sci_ioapic_setup(intsrc->global_irq,
+                acpi_sci_ioapic_setup(intsrc->source_irq,
                                      intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
-                                      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
+                                      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+                                      intsrc->global_irq);
                return 0;
        }
@@ -448,7 +496,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
-        *irq = gsi;
+        *irq = gsi_to_irq(gsi);
 #ifdef CONFIG_X86_IO_APIC
        if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
@@ -458,6 +506,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
        return 0;
 }
+/*
+ * Look up the gsi that a legacy ISA irq is routed to.  On success the
+ * gsi is stored through @gsi and 0 is returned; an isa_irq of 16 or
+ * more is rejected with -1 and *gsi is left untouched.
+ */
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+        if (isa_irq < 16) {
+                *gsi = irq_to_gsi(isa_irq);
+                return 0;
+        }
+        return -1;
+}
 /*
 * success: return IRQ number (>=0)
 * failure: return < 0
@@ -482,7 +538,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
                plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
        }
 #endif
-        irq = plat_gsi;
+        irq = gsi_to_irq(plat_gsi);
        return irq;
 }
@@ -867,29 +923,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
 extern int es7000_plat;
 #endif
-int __init acpi_probe_gsi(void)
-{
-        int idx;
-        int gsi;
-        int max_gsi = 0;
-        if (acpi_disabled)
-                return 0;
-        if (!acpi_ioapic)
-                return 0;
-        max_gsi = 0;
-        for (idx = 0; idx < nr_ioapics; idx++) {
-                gsi = mp_gsi_routing[idx].gsi_end;
-                if (gsi > max_gsi)
-                        max_gsi = gsi;
-        }
-        return max_gsi + 1;
-}
 static void assign_to_mp_irq(struct mpc_intsrc *m,
                                    struct mpc_intsrc *mp_irq)
 {
@@ -947,13 +980,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
        mp_irq.dstirq = pin;    /* INTIN# */
        save_mp_irq(&mp_irq);
+        isa_irq_to_gsi[bus_irq] = gsi;
 }
 void __init mp_config_acpi_legacy_irqs(void)
 {
        int i;
-        int ioapic;
-        unsigned int dstapic;
        struct mpc_intsrc mp_irq;
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
@@ -974,19 +1007,27 @@ void __init mp_config_acpi_legacy_irqs(void)
 #endif
        /*
-         * Locate the IOAPIC that manages the ISA IRQs (0-15).
-         */
-        ioapic = mp_find_ioapic(0);
-        if (ioapic < 0)
-                return;
-        dstapic = mp_ioapics[ioapic].apicid;
-        /*
         * Use the default configuration for the IRQs 0-15.  Unless
         * overridden by (MADT) interrupt source override entries.
         */
        for (i = 0; i < 16; i++) {
+                int ioapic, pin;
+                unsigned int dstapic;
                int idx;
+                u32 gsi;
+                /* Locate the gsi that irq i maps to. */
+                if (acpi_isa_irq_to_gsi(i, &gsi))
+                        continue;
+                /*
+                 * Locate the IOAPIC that manages the ISA IRQ.
+                 */
+                ioapic = mp_find_ioapic(gsi);
+                if (ioapic < 0)
+                        continue;
+                pin = mp_find_ioapic_pin(ioapic, gsi);
+                dstapic = mp_ioapics[ioapic].apicid;
                for (idx = 0; idx < mp_irq_entries; idx++) {
                        struct mpc_intsrc *irq = mp_irqs + idx;
@@ -996,7 +1037,7 @@ void __init mp_config_acpi_legacy_irqs(void)
                                break;
                        /* Do we already have a mapping for this IOAPIC pin */
-                        if (irq->dstapic == dstapic && irq->dstirq == i)
+                        if (irq->dstapic == dstapic && irq->dstirq == pin)
                                break;
                }
@@ -1011,7 +1052,7 @@ void __init mp_config_acpi_legacy_irqs(void)
                mp_irq.dstapic = dstapic;
                mp_irq.irqtype = mp_INT;
                mp_irq.srcbusirq = i; /* Identity mapped */
-                mp_irq.dstirq = i;
+                mp_irq.dstirq = pin;
                save_mp_irq(&mp_irq);
        }
@@ -1076,11 +1117,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
        ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
-#ifdef CONFIG_X86_32
-        if (ioapic_renumber_irq)
-                gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
        if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
                printk(KERN_ERR "Invalid reference to IOAPIC pin "
                       "%d-%d\n", mp_ioapics[ioapic].apicid,
@@ -1094,7 +1130,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
        set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
                             trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
                             polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-        io_apic_set_pci_routing(dev, gsi, &irq_attr);
+        io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
        return gsi;
 }
@@ -1154,7 +1190,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
         * pretend we got one so we can set the SCI flags.
         */
        if (!acpi_sci_override_gsi)
-                acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
+                acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+                                      acpi_gbl_FADT.sci_interrupt);
        /* Fill in identity legacy mappings where no override */
        mp_config_acpi_legacy_irqs();
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5d44d0..70237732a6c7 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 }
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
 static void *text_poke_early(void *addr, const void *opcode, size_t len);
 /* Replace instructions with better alternatives for this CPU type.
@@ -235,37 +235,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 #ifdef CONFIG_SMP
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+                                  u8 *text, u8 *text_end)
 {
-        u8 **ptr;
+        const s32 *poff; /* cursor over the s32 offset table [start, end) */
        mutex_lock(&text_mutex);
-        for (ptr = start; ptr < end; ptr++) {
+        for (poff = start; poff < end; poff++) {
-                if (*ptr < text)
+                u8 *ptr = (u8 *)poff + *poff; /* entry is relative to its own address */
-                        continue;
-                if (*ptr > text_end)
+                if (!*poff || ptr < text || ptr >= text_end) /* zero entry, or target outside [text, text_end) */
                        continue;
                /* turn DS segment override prefix into lock prefix */
-                text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+                if (*ptr == 0x3e) /* patch only bytes that currently hold the DS prefix */
+                        text_poke(ptr, ((unsigned char []){0xf0}), 1);
        };
        mutex_unlock(&text_mutex);
 }
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+                                    u8 *text, u8 *text_end)
 {
-        u8 **ptr;
+        const s32 *poff; /* cursor over the s32 offset table [start, end) */
        if (noreplace_smp)
                return;
        mutex_lock(&text_mutex);
-        for (ptr = start; ptr < end; ptr++) {
+        for (poff = start; poff < end; poff++) {
-                if (*ptr < text)
+                u8 *ptr = (u8 *)poff + *poff; /* entry is relative to its own address */
-                        continue;
-                if (*ptr > text_end)
+                if (!*poff || ptr < text || ptr >= text_end) /* zero entry, or target outside [text, text_end) */
                        continue;
                /* turn lock prefix into DS segment override prefix */
-                text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+                if (*ptr == 0xf0) /* patch only bytes that currently hold the lock prefix */
+                        text_poke(ptr, ((unsigned char []){0x3E}), 1);
        };
        mutex_unlock(&text_mutex);
 }
@@ -276,8 +280,8 @@ struct smp_alt_module {
        char            *name;
        /* ptrs to lock prefixes */
-        u8              **locks;
+        const s32       *locks;
-        u8              **locks_end;
+        const s32       *locks_end;
        /* .text segment, needed to avoid patching init code ;) */
        u8              *text;
@@ -398,16 +402,19 @@ void alternatives_smp_switch(int smp)
 int alternatives_text_reserved(void *start, void *end)
 {
        struct smp_alt_module *mod;
-        u8 **ptr;
+        const s32 *poff;
        u8 *text_start = start;
        u8 *text_end = end;
        list_for_each_entry(mod, &smp_alt_modules, next) {
                if (mod->text > text_end || mod->text_end < text_start)
                        continue;
-                for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
+                for (poff = mod->locks; poff < mod->locks_end; poff++) {
-                        if (text_start <= *ptr && text_end >= *ptr)
+                        const u8 *ptr = (const u8 *)poff + *poff;
+                        if (text_start <= ptr && text_end > ptr)
                                return 1;
+                }
        }
        return 0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f854d89b7edf..fa5a1474cd18 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -731,18 +731,22 @@ static bool increase_address_space(struct protection_domain *domain,
 static u64 *alloc_pte(struct protection_domain *domain,
                      unsigned long address,
-                      int end_lvl,
+                      unsigned long page_size,
                      u64 **pte_page,
                      gfp_t gfp)
 {
+        int level, end_lvl;
        u64 *pte, *page;
-        int level;
+        BUG_ON(!is_power_of_2(page_size));
        while (address > PM_LEVEL_SIZE(domain->mode))
                increase_address_space(domain, gfp);
-        level =  domain->mode - 1;
+        level   = domain->mode - 1;
-        pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+        pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+        address = PAGE_SIZE_ALIGN(address, page_size);
+        end_lvl = PAGE_SIZE_LEVEL(page_size);
        while (level > end_lvl) {
                if (!IOMMU_PTE_PRESENT(*pte)) {
@@ -752,6 +756,10 @@ static u64 *alloc_pte(struct protection_domain *domain,
                        *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
                }
+                /* No level skipping support yet */
+                if (PM_PTE_LEVEL(*pte) != level)
+                        return NULL;
                level -= 1;
                pte = IOMMU_PTE_PAGE(*pte);
@@ -769,28 +777,47 @@ static u64 *alloc_pte(struct protection_domain *domain,
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
-static u64 *fetch_pte(struct protection_domain *domain,
+static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
-                      unsigned long address, int map_size)
 {
        int level;
        u64 *pte;
-        level =  domain->mode - 1;
+        if (address > PM_LEVEL_SIZE(domain->mode))
-        pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+                return NULL;
+        level   =  domain->mode - 1;
+        pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-        while (level > map_size) {
+        while (level > 0) {
+                /* Not Present */
                if (!IOMMU_PTE_PRESENT(*pte))
                        return NULL;
+                /* Large PTE */
+                if (PM_PTE_LEVEL(*pte) == 0x07) {
+                        unsigned long pte_mask, __pte;
+                        /*
+                         * If we have a series of large PTEs, make
+                         * sure to return a pointer to the first one.
+                         */
+                        pte_mask = PTE_PAGE_SIZE(*pte);
+                        pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
+                        __pte    = ((unsigned long)pte) & pte_mask;
+                        return (u64 *)__pte;
+                }
+                /* No level skipping support yet */
+                if (PM_PTE_LEVEL(*pte) != level)
+                        return NULL;
                level -= 1;
+                /* Walk to the next level */
                pte = IOMMU_PTE_PAGE(*pte);
                pte = &pte[PM_LEVEL_INDEX(level, address)];
-                if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
-                        pte = NULL;
-                        break;
-                }
        }
        return pte;
@@ -807,44 +834,84 @@ static int iommu_map_page(struct protection_domain *dom,
                          unsigned long bus_addr,
                          unsigned long phys_addr,
                          int prot,
-                          int map_size)
+                          unsigned long page_size)
 {
        u64 __pte, *pte;
+        int i, count;
-        bus_addr  = PAGE_ALIGN(bus_addr);
-        phys_addr = PAGE_ALIGN(phys_addr);
-        BUG_ON(!PM_ALIGNED(map_size, bus_addr));
-        BUG_ON(!PM_ALIGNED(map_size, phys_addr));
        if (!(prot & IOMMU_PROT_MASK))
                return -EINVAL;
-        pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
+        bus_addr  = PAGE_ALIGN(bus_addr);
+        phys_addr = PAGE_ALIGN(phys_addr);
+        count     = PAGE_SIZE_PTE_COUNT(page_size);
+        pte       = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
+        for (i = 0; i < count; ++i)
+                if (IOMMU_PTE_PRESENT(pte[i]))
+                        return -EBUSY;
-        if (IOMMU_PTE_PRESENT(*pte))
+        if (page_size > PAGE_SIZE) {
-                return -EBUSY;
+                __pte = PAGE_SIZE_PTE(phys_addr, page_size);
+                __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
+        } else
+                __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
-        __pte = phys_addr | IOMMU_PTE_P;
        if (prot & IOMMU_PROT_IR)
                __pte |= IOMMU_PTE_IR;
        if (prot & IOMMU_PROT_IW)
                __pte |= IOMMU_PTE_IW;
-        *pte = __pte;
+        for (i = 0; i < count; ++i)
+                pte[i] = __pte;
        update_domain(dom);
        return 0;
 }
-static void iommu_unmap_page(struct protection_domain *dom,
+static unsigned long iommu_unmap_page(struct protection_domain *dom,
-                             unsigned long bus_addr, int map_size)
+                                      unsigned long bus_addr,
+                                      unsigned long page_size)
 {
-        u64 *pte = fetch_pte(dom, bus_addr, map_size);
+        unsigned long long unmap_size, unmapped; /* bytes covered this step / in total */
+        u64 *pte;
+        BUG_ON(!is_power_of_2(page_size));
+        unmapped = 0;
-        if (pte)
+        while (unmapped < page_size) { /* no clamping: a large PTE may push the total past page_size */
-                *pte = 0;
+                pte = fetch_pte(dom, bus_addr);
+                if (!pte) {
+                        /*
+                         * No PTE for this address
+                         * move forward in 4kb steps
+                         */
+                        unmap_size = PAGE_SIZE;
+                } else if (PM_PTE_LEVEL(*pte) == 0) {
+                        /* 4kb PTE found for this address */
+                        unmap_size = PAGE_SIZE;
+                        *pte       = 0ULL;
+                } else {
+                        int count, i;
+                        /* Large PTE found which maps this address */
+                        unmap_size = PTE_PAGE_SIZE(*pte);
+                        count      = PAGE_SIZE_PTE_COUNT(unmap_size);
+                        for (i = 0; i < count; i++) /* clear every PTE slot backing the large page */
+                                pte[i] = 0ULL;
+                }
+                bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size; /* start of the next unmap_size-aligned region */
+                unmapped += unmap_size;
+        }
+        BUG_ON(!is_power_of_2(unmapped));
+        return unmapped; /* bytes stepped over, holes without a PTE included */
 }
 /*
@@ -878,7 +945,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
        for (addr = e->address_start; addr < e->address_end;
             addr += PAGE_SIZE) {
                ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
-                                     PM_MAP_4k);
+                                     PAGE_SIZE);
                if (ret)
                        return ret;
                /*
@@ -1006,7 +1073,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
                u64 *pte, *pte_page;
                for (i = 0; i < num_ptes; ++i) {
-                        pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
+                        pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
                                        &pte_page, gfp);
                        if (!pte)
                                goto out_free;
@@ -1042,7 +1109,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
        for (i = dma_dom->aperture[index]->offset;
             i < dma_dom->aperture_size;
             i += PAGE_SIZE) {
-                u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
+                u64 *pte = fetch_pte(&dma_dom->domain, i);
                if (!pte || !IOMMU_PTE_PRESENT(*pte))
                        continue;
@@ -1712,7 +1779,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
        pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
        if (!pte) {
-                pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+                pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
                                GFP_ATOMIC);
                aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
        } else
@@ -2439,12 +2506,11 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
        return ret;
 }
-static int amd_iommu_map_range(struct iommu_domain *dom,
+static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
-                               unsigned long iova, phys_addr_t paddr,
+                         phys_addr_t paddr, int gfp_order, int iommu_prot)
-                               size_t size, int iommu_prot)
 {
+        unsigned long page_size = 0x1000UL << gfp_order;
        struct protection_domain *domain = dom->priv;
-        unsigned long i,  npages = iommu_num_pages(paddr, size, PAGE_SIZE);
        int prot = 0;
        int ret;
@@ -2453,61 +2519,50 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
        if (iommu_prot & IOMMU_WRITE)
                prot |= IOMMU_PROT_IW;
-        iova  &= PAGE_MASK;
-        paddr &= PAGE_MASK;
        mutex_lock(&domain->api_lock);
+        ret = iommu_map_page(domain, iova, paddr, prot, page_size);
-        for (i = 0; i < npages; ++i) {
-                ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
-                if (ret)
-                        return ret;
-                iova  += PAGE_SIZE;
-                paddr += PAGE_SIZE;
-        }
        mutex_unlock(&domain->api_lock);
-        return 0;
+        return ret;
 }
-static void amd_iommu_unmap_range(struct iommu_domain *dom,
+static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
-                                  unsigned long iova, size_t size)
+                           int gfp_order)
 {
        struct protection_domain *domain = dom->priv;
-        unsigned long i,  npages = iommu_num_pages(iova, size, PAGE_SIZE);
+        unsigned long page_size, unmap_size;
-        iova  &= PAGE_MASK;
+        page_size  = 0x1000UL << gfp_order; /* 0x1000 bytes scaled by 2^gfp_order */
        mutex_lock(&domain->api_lock);
+        unmap_size = iommu_unmap_page(domain, iova, page_size); /* bytes actually covered; may exceed page_size */
-        for (i = 0; i < npages; ++i) {
+        mutex_unlock(&domain->api_lock); /* NOTE(review): the flush below now runs without api_lock held - confirm intended */
-                iommu_unmap_page(domain, iova, PM_MAP_4k);
-                iova  += PAGE_SIZE;
-        }
        iommu_flush_tlb_pde(domain);
-        mutex_unlock(&domain->api_lock);
+        return get_order(unmap_size); /* report the covered size back as an order */
 }
 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
                                          unsigned long iova)
 {
        struct protection_domain *domain = dom->priv;
-        unsigned long offset = iova & ~PAGE_MASK;
+        unsigned long offset_mask; /* low iova bits that select a byte inside the mapped page */
        phys_addr_t paddr;
-        u64 *pte;
+        u64 *pte, __pte;
-        pte = fetch_pte(domain, iova, PM_MAP_4k);
+        pte = fetch_pte(domain, iova);
        if (!pte || !IOMMU_PTE_PRESENT(*pte))
                return 0;
-        paddr  = *pte & IOMMU_PAGE_MASK;
+        if (PM_PTE_LEVEL(*pte) == 0)
-        paddr |= offset;
+                offset_mask = PAGE_SIZE - 1; /* level 0 entry: offset within PAGE_SIZE */
+        else
+                offset_mask = PTE_PAGE_SIZE(*pte) - 1; /* otherwise the page size is taken from the PTE */
+        __pte = *pte & PM_ADDR_MASK;
+        paddr = (__pte & ~offset_mask) | (iova & offset_mask); /* page base from the PTE, offset from the iova */
        return paddr;
 }
@@ -2523,8 +2578,8 @@ static struct iommu_ops amd_iommu_ops = {
        .domain_destroy = amd_iommu_domain_destroy,
        .attach_dev = amd_iommu_attach_device,
        .detach_dev = amd_iommu_detach_device,
-        .map = amd_iommu_map_range,
+        .map = amd_iommu_map,
-        .unmap = amd_iommu_unmap_range,
+        .unmap = amd_iommu_unmap,
        .iova_to_phys = amd_iommu_iova_to_phys,
        .domain_has_cap = amd_iommu_domain_has_cap,
 };
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6360abf993d4..3bacb4d0844c 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -120,6 +120,7 @@ struct ivmd_header {
 bool amd_iommu_dump;
 static int __initdata amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;
 u16 amd_iommu_last_bdf;                 /* largest PCI device id we have
                                           to handle */
@@ -1372,6 +1373,9 @@ void __init amd_iommu_detect(void)
        if (no_iommu || (iommu_detected && !gart_iommu_aperture))
                return;
+        if (amd_iommu_disabled)
+                return;
        if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
                iommu_detected = 1;
                amd_iommu_detected = 1;
@@ -1401,6 +1405,8 @@ static int __init parse_amd_iommu_options(char *str)
        for (; *str; ++str) {
                if (strncmp(str, "fullflush", 9) == 0)
                        amd_iommu_unmap_flush = true;
+                if (strncmp(str, "off", 3) == 0)
+                        amd_iommu_disabled = true;
        }
        return 1;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 03ba1b895f5e..425e53a87feb 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -131,24 +131,6 @@ int					es7000_plat;
 static unsigned int                     base;
-static int
-es7000_rename_gsi(int ioapic, int gsi)
-{
-        if (es7000_plat == ES7000_ZORRO)
-                return gsi;
-        if (!base) {
-                int i;
-                for (i = 0; i < nr_ioapics; i++)
-                        base += nr_ioapic_registers[i];
-        }
-        if (!ioapic && (gsi < 16))
-                gsi += base;
-        return gsi;
-}
 static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
        unsigned long vect = 0, psaival = 0;
@@ -190,7 +172,6 @@ static void setup_unisys(void)
                es7000_plat = ES7000_ZORRO;
        else
                es7000_plat = ES7000_CLASSIC;
-        ioapic_renumber_irq = es7000_rename_gsi;
 }
 /*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index eb2789c3f721..33f3563a2a52 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,6 +89,9 @@ int nr_ioapics;
 /* IO APIC gsi routing info */
 struct mp_ioapic_gsi  mp_gsi_routing[MAX_IO_APICS];
+/* The last gsi number used */
+u32 gsi_end;
 /* MP IRQ source entries */
 struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
@@ -1013,10 +1016,9 @@ static inline int irq_trigger(int idx)
        return MPBIOS_trigger(idx);
 }
-int (*ioapic_renumber_irq)(int ioapic, int irq);
 static int pin_2_irq(int idx, int apic, int pin)
 {
-        int irq, i;
+        int irq;
        int bus = mp_irqs[idx].srcbus;
        /*
@@ -1028,18 +1030,12 @@ static int pin_2_irq(int idx, int apic, int pin)
        if (test_bit(bus, mp_bus_not_pci)) {
                irq = mp_irqs[idx].srcbusirq;
        } else {
-                /*
+                u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
-                 * PCI IRQs are mapped in order
-                 */
+                if (gsi >= NR_IRQS_LEGACY)
-                i = irq = 0;
+                        irq = gsi;
-                while (i < apic)
+                else
-                        irq += nr_ioapic_registers[i++];
+                        irq = gsi_end + 1 + gsi;
-                irq += pin;
-                /*
-                 * For MPS mode, so far only needed by ES7000 platform
-                 */
-                if (ioapic_renumber_irq)
-                        irq = ioapic_renumber_irq(apic, irq);
        }
 #ifdef CONFIG_X86_32
@@ -1950,20 +1946,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 void __init enable_IO_APIC(void)
 {
-        union IO_APIC_reg_01 reg_01;
        int i8259_apic, i8259_pin;
        int apic;
-        unsigned long flags;
-        /*
-         * The number of IO-APIC IRQ registers (== #pins):
-         */
-        for (apic = 0; apic < nr_ioapics; apic++) {
-                raw_spin_lock_irqsave(&ioapic_lock, flags);
-                reg_01.raw = io_apic_read(apic, 1);
-                raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-                nr_ioapic_registers[apic] = reg_01.bits.entries+1;
-        }
        if (!legacy_pic->nr_legacy_irqs)
                return;
@@ -3858,27 +3842,20 @@ int __init io_apic_get_redir_entries (int ioapic)
        reg_01.raw = io_apic_read(ioapic, 1);
        raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-        return reg_01.bits.entries;
+        /* The register returns the maximum redir index
+         * supported, which is one less than the total number of redir
+         * entries.
+         */
+        return reg_01.bits.entries + 1;
 }
 void __init probe_nr_irqs_gsi(void)
 {
-        int nr = 0;
+        int nr;
-        nr = acpi_probe_gsi();
+        nr = gsi_end + 1 + NR_IRQS_LEGACY; /* gsis 0..gsi_end plus NR_IRQS_LEGACY irqs above gsi_end for remapped low gsis */
-        if (nr > nr_irqs_gsi) {
+        if (nr > nr_irqs_gsi) /* nr_irqs_gsi is only ever raised here */
                nr_irqs_gsi = nr;
-        } else {
-                /* for acpi=off or acpi is not compiled in */
-                int idx;
-                nr = 0;
-                for (idx = 0; idx < nr_ioapics; idx++)
-                        nr += io_apic_get_redir_entries(idx) + 1;
-                if (nr > nr_irqs_gsi)
-                        nr_irqs_gsi = nr;
-        }
        printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
 }
@@ -4085,22 +4062,27 @@ int __init io_apic_get_version(int ioapic)
        return reg_01.bits.version;
 }
-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
 {
-        int i;
+        int ioapic, pin, idx;
        if (skip_ioapic_setup)
                return -1;
-        for (i = 0; i < mp_irq_entries; i++)
+        ioapic = mp_find_ioapic(gsi); /* presumably the IO-APIC whose gsi range contains gsi - confirm */
-                if (mp_irqs[i].irqtype == mp_INT &&
+        if (ioapic < 0)
-                    mp_irqs[i].srcbusirq == bus_irq)
-                        break;
-        if (i >= mp_irq_entries)
                return -1;
-        *trigger = irq_trigger(i);
+        pin = mp_find_ioapic_pin(ioapic, gsi); /* presumably the pin of that IO-APIC the gsi lands on - confirm */
-        *polarity = irq_polarity(i);
+        if (pin < 0)
+                return -1;
+        idx = find_irq_entry(ioapic, pin, mp_INT); /* lookup is now keyed by (ioapic, pin) rather than by bus irq */
+        if (idx < 0)
+                return -1;
+        *trigger = irq_trigger(idx); /* outputs are written only when every lookup succeeded */
+        *polarity = irq_polarity(idx);
        return 0;
 }
@@ -4241,7 +4223,7 @@ void __init ioapic_insert_resources(void)
        }
 }
-int mp_find_ioapic(int gsi)
+int mp_find_ioapic(u32 gsi)
 {
        int i = 0;
@@ -4256,7 +4238,7 @@ int mp_find_ioapic(int gsi)
        return -1;
 }
-int mp_find_ioapic_pin(int ioapic, int gsi)
+int mp_find_ioapic_pin(int ioapic, u32 gsi)
 {
        if (WARN_ON(ioapic == -1))
                return -1;
@@ -4284,6 +4266,7 @@ static int bad_ioapic(unsigned long address)
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 {
        int idx = 0;
+        int entries;
        if (bad_ioapic(address))
                return;
@@ -4302,9 +4285,17 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
         * Build basic GSI lookup table to facilitate gsi->io_apic lookups
         * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
         */
+        entries = io_apic_get_redir_entries(idx);
        mp_gsi_routing[idx].gsi_base = gsi_base;
-        mp_gsi_routing[idx].gsi_end = gsi_base +
+        mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
-            io_apic_get_redir_entries(idx);
+        /*
+         * The number of IO-APIC IRQ registers (== #pins):
+         */
+        nr_ioapic_registers[idx] = entries;
+        if (mp_gsi_routing[idx].gsi_end > gsi_end)
+                gsi_end = mp_gsi_routing[idx].gsi_end;
        printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
               "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c085d52dbaf2..e46f98f36e31 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -735,9 +735,6 @@ void __init uv_system_init(void)
                uv_node_to_blade[nid] = blade;
                uv_cpu_to_blade[cpu] = blade;
                max_pnode = max(pnode, max_pnode);
-                printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
-                        cpu, apicid, pnode, nid, lcpu, blade);
        }
        /* Add blade/pnode info for nodes without cpus */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 031aa887b0eb..c4f9182ca3ac 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1224,7 +1224,7 @@ static void reinit_timer(void)
 #ifdef INIT_TIMER_AFTER_SUSPEND
        unsigned long flags;
-        spin_lock_irqsave(&i8253_lock, flags);
+        raw_spin_lock_irqsave(&i8253_lock, flags);
        /* set the clock to HZ */
        outb_pit(0x34, PIT_MODE);               /* binary, mode 2, LSB/MSB, ch 0 */
        udelay(10);
@@ -1232,7 +1232,7 @@ static void reinit_timer(void)
        udelay(10);
        outb_pit(LATCH >> 8, PIT_CH0);  /* MSB */
        udelay(10);
-        spin_unlock_irqrestore(&i8253_lock, flags);
+        raw_spin_unlock_irqrestore(&i8253_lock, flags);
 #endif
 }
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f3671..3a785da34b6f 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o		:= $(nostackp)
 obj-y                   := intel_cacheinfo.o addon_cpuid_features.o
 obj-y                   += proc.o capflags.o powerflags.o common.o
-obj-y                   += vmware.o hypervisor.o sched.o
+obj-y                   += vmware.o hypervisor.o sched.o mshyperv.o
 obj-$(CONFIG_X86_32)    += bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)    += bugs_64.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 97ad79cdf688..10fa5684a662 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,12 +30,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
        const struct cpuid_bit *cb;
        static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-                { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
+                { X86_FEATURE_IDA,              CR_EAX, 1, 0x00000006 },
-                { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
+                { X86_FEATURE_ARAT,             CR_EAX, 2, 0x00000006 },
-                { X86_FEATURE_NPT,   CR_EDX, 0, 0x8000000a },
+                { X86_FEATURE_APERFMPERF,       CR_ECX, 0, 0x00000006 },
-                { X86_FEATURE_LBRV,  CR_EDX, 1, 0x8000000a },
+                { X86_FEATURE_CPB,              CR_EDX, 9, 0x80000007 },
-                { X86_FEATURE_SVML,  CR_EDX, 2, 0x8000000a },
+                { X86_FEATURE_NPT,              CR_EDX, 0, 0x8000000a },
-                { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
+                { X86_FEATURE_LBRV,             CR_EDX, 1, 0x8000000a },
+                { X86_FEATURE_SVML,             CR_EDX, 2, 0x8000000a },
+                { X86_FEATURE_NRIPS,            CR_EDX, 3, 0x8000000a },
                { 0, 0, 0, 0 }
        };
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01a265212395..c39576cb3018 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
 static void __init check_hlt(void)
 {
-        if (paravirt_enabled())
+        if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
                return;
        printk(KERN_INFO "Checking 'hlt' instruction... ");
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a951ee..c1c00d0b1692 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1243,10 +1243,7 @@ void __cpuinit cpu_init(void)
        /*
         * Force FPU initialization:
         */
-        if (cpu_has_xsave)
+        current_thread_info()->status = 0;
-                current_thread_info()->status = TS_XSAVE;
-        else
-                current_thread_info()->status = 0;
        clear_used_math();
        mxcsr_feature_mask_init();
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a5170b..bd54bf67e6fb 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
 # K8 systems. ACPI is preferred to all other hardware-specific drivers.
 # speedstep-* is preferred over p4-clockmod.
-obj-$(CONFIG_X86_POWERNOW_K8)           += powernow-k8.o
+obj-$(CONFIG_X86_POWERNOW_K8)           += powernow-k8.o mperf.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ)          += acpi-cpufreq.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ)          += acpi-cpufreq.o mperf.o
 obj-$(CONFIG_X86_PCC_CPUFREQ)           += pcc-cpufreq.o
 obj-$(CONFIG_X86_POWERNOW_K6)           += powernow-k6.o
 obj-$(CONFIG_X86_POWERNOW_K7)           += powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 459168083b77..1d3cddaa40ee 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -46,6 +46,7 @@
 #include <asm/msr.h>
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
+#include "mperf.h"
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
                "acpi-cpufreq", msg)
@@ -71,8 +72,6 @@ struct acpi_cpufreq_data {
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
@@ -240,45 +239,6 @@ static u32 get_cur_val(const struct cpumask *mask)
        return cmd.val;
 }
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
-        struct aperfmperf *am = _cur;
-        get_aperfmperf(am);
-}
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-static unsigned int get_measured_perf(struct cpufreq_policy *policy,
-                                      unsigned int cpu)
-{
-        struct aperfmperf perf;
-        unsigned long ratio;
-        unsigned int retval;
-        if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
-                return 0;
-        ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
-        per_cpu(acfreq_old_perf, cpu) = perf;
-        retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-        return retval;
-}
 static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 {
        struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -702,7 +662,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
        /* Check for APERF/MPERF support in hardware */
        if (cpu_has(c, X86_FEATURE_APERFMPERF))
-                acpi_cpufreq_driver.getavg = get_measured_perf;
+                acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
        dprintk("CPU%u - ACPI performance management activated.\n", cpu);
        for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 000000000000..911e193018ae
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/slab.h>
+#include "mperf.h"
+static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
+/*
+ * Callback for smp_call_function_single(): executes on the CPU being
+ * sampled and hands the caller-supplied struct aperfmperf (@_cur) to
+ * get_aperfmperf().
+ */
+static void read_measured_perf_ctrs(void *_cur)
+{
+        get_aperfmperf((struct aperfmperf *)_cur);
+}
+/*
+ * Report the measured active (C0) frequency of @cpu over the interval
+ * since the previous call for that cpu.
+ *
+ * @policy: supplies cpuinfo.max_freq, the value the ratio is scaled by
+ * @cpu:    cpu whose counters are sampled
+ *
+ * Return: average frequency expressed in terms of the maximum frequency,
+ *         or zero when the cross-cpu call fails.
+ *
+ * The measurement relies on the IA32_MPERF and IA32_APERF MSRs while the
+ * CPU is in C0: IA32_MPERF ticks at the maximum advertised frequency,
+ * IA32_APERF at the actual frequency. Only the APERF/MPERF ratio is
+ * architecturally defined; the absolute counter values carry no meaning.
+ */
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+                                        unsigned int cpu)
+{
+        struct aperfmperf *prev = &per_cpu(acfreq_old_perf, cpu);
+        struct aperfmperf now;
+        unsigned long ratio;
+        if (smp_call_function_single(cpu, read_measured_perf_ctrs, &now, 1))
+                return 0;
+        ratio = calc_aperfmperf_ratio(prev, &now);
+        /* remember this sample as the baseline for the next call */
+        *prev = now;
+        return (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
+}
+EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 000000000000..5dbf2950dc22
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
+/*
+ *  (c) 2010 Advanced Micro Devices, Inc.
+ *  Your use of this code is subject to the terms and conditions of the
+ *  GNU general public license version 2. See "COPYING" or
+ *  http://www.gnu.org/licenses/gpl.html
+ */
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+                                        unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index b6215b9798e2..6f3dc8fbbfdc 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
 /*
- *   (c) 2003-2006 Advanced Micro Devices, Inc.
+ *   (c) 2003-2010 Advanced Micro Devices, Inc.
 *  Your use of this code is subject to the terms and conditions of the
 *  GNU general public license version 2. See "COPYING" or
 *  http://www.gnu.org/licenses/gpl.html
@@ -46,6 +45,7 @@
 #define PFX "powernow-k8: "
 #define VERSION "version 2.20.00"
 #include "powernow-k8.h"
+#include "mperf.h"
 /* serialize freq changes  */
 static DEFINE_MUTEX(fidvid_mutex);
@@ -54,6 +54,12 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
 static int cpu_family = CPU_OPTERON;
+/* core performance boost */
+static bool cpb_capable, cpb_enabled;
+static struct msr __percpu *msrs;
+static struct cpufreq_driver cpufreq_amd64_driver;
 #ifndef CONFIG_SMP
 static inline const struct cpumask *cpu_core_mask(int cpu)
 {
@@ -1249,6 +1255,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
        struct powernow_k8_data *data;
        struct init_on_cpu init_on_cpu;
        int rc;
+        struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
        if (!cpu_online(pol->cpu))
                return -ENODEV;
@@ -1323,6 +1330,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
                return -EINVAL;
        }
+        /* Check for APERF/MPERF support in hardware */
+        if (cpu_has(c, X86_FEATURE_APERFMPERF))
+                cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
        cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
        if (cpu_family == CPU_HW_PSTATE)
@@ -1394,8 +1405,77 @@ out:
        return khz;
 }
+/*
+ * Read MSR_K7_HWCR on all online cpus into the msrs buffer, update
+ * bit 25 of the low word (cleared when @t is true, set when @t is false)
+ * and write the values back. The whole sequence is bracketed by
+ * get_online_cpus()/put_online_cpus().
+ */
+static void _cpb_toggle_msrs(bool t)
+{
+        int cpu;
+        get_online_cpus();
+        rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+        for_each_cpu(cpu, cpu_online_mask) {
+                struct msr *hwcr = per_cpu_ptr(msrs, cpu);
+                if (!t)
+                        hwcr->l |= BIT(25);
+                else
+                        hwcr->l &= ~BIT(25);
+        }
+        wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+        put_online_cpus();
+}
+/*
+ * Switch core performance boosting on or off.
+ *
+ * @t: true (1) to enable, false (0) to disable.
+ *
+ * Does nothing when the hardware is not CPB capable or when the requested
+ * state already matches cpb_enabled.
+ */
+static void cpb_toggle(bool t)
+{
+        if (!cpb_capable || t == cpb_enabled)
+                return;
+        cpb_enabled = t;
+        _cpb_toggle_msrs(t);
+        if (t)
+                printk(KERN_INFO PFX "Core Boosting enabled.\n");
+        else
+                printk(KERN_INFO PFX "Core Boosting disabled.\n");
+}
+/*
+ * sysfs store handler for the "cpb" attribute.
+ *
+ * Accepts the decimal values 0 and 1 only and forwards them to
+ * cpb_toggle(). Returns @count on success and -EINVAL when the input does
+ * not parse, is out of range, or the hardware is not CPB capable.
+ */
+static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
+                                 size_t count)
+{
+        unsigned long val = 0;
+        if (strict_strtoul(buf, 10, &val))
+                return -EINVAL;
+        if (val > 1 || !cpb_capable)
+                return -EINVAL;
+        cpb_toggle(val);
+        return count;
+}
+/*
+ * sysfs show handler for the "cpb" attribute: prints the current value of
+ * cpb_enabled followed by a newline and returns the number of characters
+ * written to @buf.
+ */
+static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
+{
+        return sprintf(buf, "%u\n", cpb_enabled);
+}
+#define define_one_rw(_name) \
+static struct freq_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+define_one_rw(cpb);
 static struct freq_attr *powernow_k8_attr[] = {
        &cpufreq_freq_attr_scaling_available_freqs,
+        &cpb,
        NULL,
 };
@@ -1411,10 +1491,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
        .attr           = powernow_k8_attr,
 };
+/*
+ * CPU hotplug callback keeping the boost-disable flag (bit 25 of
+ * MSR_K7_HWCR) consistent.
+ *
+ * On the CPU_DOWN path the flag is cleared so that the departing cpu
+ * cannot block the remaining ones from boosting. On the CPU_UP path the
+ * flag is set only when boosting is currently globally disabled, which
+ * keeps the cpu in sync with the global state. All other actions are
+ * ignored. Always returns NOTIFY_OK.
+ */
+static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
+                                void *hcpu)
+{
+        unsigned cpu = (long)hcpu;
+        bool set_flag;
+        u32 lo, hi;
+        switch (action) {
+        case CPU_UP_PREPARE:
+        case CPU_UP_PREPARE_FROZEN:
+                if (cpb_enabled)
+                        return NOTIFY_OK;
+                set_flag = true;
+                break;
+        case CPU_DOWN_PREPARE:
+        case CPU_DOWN_PREPARE_FROZEN:
+                set_flag = false;
+                break;
+        default:
+                return NOTIFY_OK;
+        }
+        rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+        if (set_flag)
+                lo |= BIT(25);
+        else
+                lo &= ~BIT(25);
+        wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+        return NOTIFY_OK;
+}
+/* Notifier block that routes CPU hotplug events to cpb_notify(). */
+static struct notifier_block __cpuinitdata cpb_nb = {
+        .notifier_call          = cpb_notify,
+};
 /* driver entry point for init */
 static int __cpuinit powernowk8_init(void)
 {
-        unsigned int i, supported_cpus = 0;
+        unsigned int i, supported_cpus = 0, cpu;
+        int rv;
         for_each_online_cpu(i) {
                 int rc;
@@ -1423,15 +1544,36 @@ static int __cpuinit powernowk8_init(void)
                         supported_cpus++;
         }
-        if (supported_cpus == num_online_cpus()) {
+        if (supported_cpus != num_online_cpus())
-                printk(KERN_INFO PFX "Found %d %s "
+                return -ENODEV;
-                        "processors (%d cpu cores) (" VERSION ")\n",
-                        num_online_nodes(),
+        printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
-                        boot_cpu_data.x86_model_id, supported_cpus);
+                num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
-                return cpufreq_register_driver(&cpufreq_amd64_driver);
+        if (boot_cpu_has(X86_FEATURE_CPB)) {
+                cpb_capable = true;
+                /*
+                 * Allocate before registering the hotplug notifier so that
+                 * an allocation failure leaves nothing behind to undo.
+                 */
+                msrs = msrs_alloc();
+                if (!msrs) {
+                        printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
+                        return -ENOMEM;
+                }
+                register_cpu_notifier(&cpb_nb);
+                rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+                /* boosting counts as enabled if any cpu has bit 25 clear */
+                for_each_cpu(cpu, cpu_online_mask) {
+                        struct msr *reg = per_cpu_ptr(msrs, cpu);
+                        cpb_enabled |= !(!!(reg->l & BIT(25)));
+                }
+                printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
+                        (cpb_enabled ? "on" : "off"));
         }
-        return -ENODEV;
+        rv = cpufreq_register_driver(&cpufreq_amd64_driver);
+        /*
+         * Driver registration failed: undo the CPB setup done above so the
+         * notifier does not stay registered and msrs is not leaked.
+         */
+        if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) {
+                unregister_cpu_notifier(&cpb_nb);
+                msrs_free(msrs);
+                msrs = NULL;
+        }
+        return rv;
 }
 /* driver entry point for term */
@@ -1439,6 +1581,13 @@ static void __exit powernowk8_exit(void)
 {
        dprintk("exit\n");
+        if (boot_cpu_has(X86_FEATURE_CPB)) {
+                msrs_free(msrs);
+                msrs = NULL;
+                unregister_cpu_notifier(&cpb_nb);
+        }
        cpufreq_unregister_driver(&cpufreq_amd64_driver);
 }
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073cb..df3529b1c02d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
 *  http://www.gnu.org/licenses/gpl.html
 */
 enum pstate {
        HW_PSTATE_INVALID = 0xff,
        HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
        struct cpumask *available_cores;
 };
 /* processor's cpuid instruction support */
 #define CPUID_PROCESSOR_SIGNATURE       1       /* function 1 */
 #define CPUID_XFAM                      0x0ff00000      /* extended family */
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33a..dd531cc56a8f 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,37 +21,55 @@
 *
 */
+#include <linux/module.h>
 #include <asm/processor.h>
-#include <asm/vmware.h>
 #include <asm/hypervisor.h>
-static inline void __cpuinit
+/*
-detect_hypervisor_vendor(struct cpuinfo_x86 *c)
+ * Hypervisor detect order.  This is specified explicitly here because
+ * some hypervisors might implement compatibility modes for other
+ * hypervisors and therefore need to be detected in specific sequence.
+ */
+static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
-        if (vmware_platform())
+        &x86_hyper_vmware,
-                c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
+        &x86_hyper_ms_hyperv,
-        else
+};
-                c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
-}
-static inline void __cpuinit
+const struct hypervisor_x86 *x86_hyper;
-hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
+EXPORT_SYMBOL(x86_hyper);
+static inline void __init
+detect_hypervisor_vendor(void)
 {
-        if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
+        const struct hypervisor_x86 *h, * const *p;
-                vmware_set_feature_bits(c);
-                return;
+        for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
+                h = *p;
+                if (h->detect()) {
+                        x86_hyper = h;
+                        printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
+                        break;
+                }
        }
 }
 void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
 {
-        detect_hypervisor_vendor(c);
+        if (x86_hyper && x86_hyper->set_cpu_features)
-        hypervisor_set_feature_bits(c);
+                x86_hyper->set_cpu_features(c);
 }
 void __init init_hypervisor_platform(void)
 {
+        detect_hypervisor_vendor();
+        if (!x86_hyper)
+                return;
        init_hypervisor(&boot_cpu_data);
-        if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
-                vmware_platform_setup();
+        if (x86_hyper->init_platform)
+                x86_hyper->init_platform();
 }
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1366c7cfd483..85f69cdeae10 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
 #include <asm/processor.h>
 #include <asm/pgtable.h>
 #include <asm/msr.h>
-#include <asm/ds.h>
 #include <asm/bugs.h>
 #include <asm/cpu.h>
@@ -373,12 +372,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
                        set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
        }
-        if (c->cpuid_level > 6) {
-                unsigned ecx = cpuid_ecx(6);
-                if (ecx & 0x01)
-                        set_cpu_cap(c, X86_FEATURE_APERFMPERF);
-        }
        if (cpu_has_xmm2)
                set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
        if (cpu_has_ds) {
@@ -388,7 +381,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
                        set_cpu_cap(c, X86_FEATURE_BTS);
                if (!(l1 & (1<<12)))
                        set_cpu_cap(c, X86_FEATURE_PEBS);
-                ds_init_intel(c);
        }
        if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66c0a51..33eae2062cf5 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
        u32 full;
 };
+/*
+ * Descriptor of one AMD L3 cache. Instances are created by
+ * amd_init_l3_cache() and referenced from the cpuid4 leaf structures
+ * through their ->l3 pointer.
+ */
+struct amd_l3_cache {
+        struct   pci_dev *dev;          /* device returned by node_to_k8_nb_misc() for the node */
+        bool     can_disable;           /* set by amd_check_l3_disable() when it creates the descriptor */
+        unsigned indices;               /* upper bound for an index accepted by store_cache_disable() */
+        u8       subcaches[4];          /* per-subcache value from config reg 0x1C4; 0 = skipped when disabling */
+};
 struct _cpuid4_info {
        union _cpuid4_leaf_eax eax;
        union _cpuid4_leaf_ebx ebx;
        union _cpuid4_leaf_ecx ecx;
        unsigned long size;
-        bool can_disable;
+        struct amd_l3_cache *l3;
-        unsigned int l3_indices;
        DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
 };
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
        union _cpuid4_leaf_ebx ebx;
        union _cpuid4_leaf_ecx ecx;
        unsigned long size;
-        bool can_disable;
+        struct amd_l3_cache *l3;
-        unsigned int l3_indices;
 };
 unsigned short                  num_cache_leaves;
@@ -302,87 +307,163 @@ struct _cache_attr {
 };
 #ifdef CONFIG_CPU_SUP_AMD
-static unsigned int __cpuinit amd_calc_l3_indices(void)
+/*
+ * L3 cache descriptors
+ */
+static struct amd_l3_cache **__cpuinitdata l3_caches;
+static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
-        /*
-         * We're called over smp_call_function_single() and therefore
-         * are on the correct cpu.
-         */
-        int cpu = smp_processor_id();
-        int node = cpu_to_node(cpu);
-        struct pci_dev *dev = node_to_k8_nb_misc(node);
        unsigned int sc0, sc1, sc2, sc3;
        u32 val = 0;
-        pci_read_config_dword(dev, 0x1C4, &val);
+        pci_read_config_dword(l3->dev, 0x1C4, &val);
        /* calculate subcache sizes */
-        sc0 = !(val & BIT(0));
+        l3->subcaches[0] = sc0 = !(val & BIT(0));
-        sc1 = !(val & BIT(4));
+        l3->subcaches[1] = sc1 = !(val & BIT(4));
-        sc2 = !(val & BIT(8))  + !(val & BIT(9));
+        l3->subcaches[2] = sc2 = !(val & BIT(8))  + !(val & BIT(9));
-        sc3 = !(val & BIT(12)) + !(val & BIT(13));
+        l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
-        return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+        l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+}
+/*
+ * Allocate and fill in the L3 descriptor for @node.
+ *
+ * The descriptor's PCI device is the one node_to_k8_nb_misc() reports for
+ * @node; the subcache and index information is filled in by
+ * amd_calc_l3_indices(). Returns NULL (after logging a warning) when the
+ * allocation fails.
+ */
+static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
+{
+        struct pci_dev *nb_misc = node_to_k8_nb_misc(node);
+        struct amd_l3_cache *cache = kzalloc(sizeof(*cache), GFP_ATOMIC);
+        if (cache) {
+                cache->dev = nb_misc;
+                amd_calc_l3_indices(cache);
+                return cache;
+        }
+        printk(KERN_WARNING "Error allocating L3 struct\n");
+        return NULL;
 }
 static void __cpuinit
 amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
-        if (index < 3)
+        /*
+         * Attach the L3 descriptor of the current cpu's node to @this_leaf
+         * when @index refers to a cache leaf >= 3 on a family 0x10 part that
+         * passes the errata checks below. On every early return
+         * this_leaf->l3 is left untouched.
+         */
+        int node;
+        if (boot_cpu_data.x86 != 0x10)
                 return;
-        if (boot_cpu_data.x86 == 0x11)
+        if (index < 3)
                 return;
         /* see errata #382 and #388 */
-        if ((boot_cpu_data.x86 == 0x10) &&
+        if (boot_cpu_data.x86_model < 0x8)
-            ((boot_cpu_data.x86_model < 0x8) ||
+                return;
-             (boot_cpu_data.x86_mask  < 0x1)))
+        if ((boot_cpu_data.x86_model == 0x8 ||
+             boot_cpu_data.x86_model == 0x9)
+                &&
+             boot_cpu_data.x86_mask < 0x1)
+                        return;
+        /* not in virtualized environments */
+        if (num_k8_northbridges == 0)
                 return;
-        this_leaf->can_disable = true;
+        /*
-        this_leaf->l3_indices  = amd_calc_l3_indices();
+         * Strictly speaking, the amount in @size below is leaked since it is
+         * never freed but this is done only on shutdown so it doesn't matter.
+         */
+        if (!l3_caches) {
+                int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+                l3_caches = kzalloc(size, GFP_ATOMIC);
+                if (!l3_caches)
+                        return;
+        }
+        node = amd_get_nb_id(smp_processor_id());
+        if (!l3_caches[node]) {
+                l3_caches[node] = amd_init_l3_cache(node);
+                /*
+                 * amd_init_l3_cache() returns NULL when its allocation
+                 * fails: warn and bail out instead of dereferencing it.
+                 */
+                if (WARN_ON(!l3_caches[node]))
+                        return;
+                l3_caches[node]->can_disable = true;
+        }
+        this_leaf->l3 = l3_caches[node];
 }
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
-                                  unsigned int index)
+                                  unsigned int slot)
 {
-        int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+        /*
+         * Print the content of L3 disable slot @slot (config register
+         * 0x1BC + slot * 4) into @buf. Returns -EINVAL when the leaf has no
+         * L3 descriptor, the descriptor is not marked can_disable, or the
+         * descriptor has no PCI device.
+         */
+        struct pci_dev *dev;
-        int node = amd_get_nb_id(cpu);
-        struct pci_dev *dev = node_to_k8_nb_misc(node);
         unsigned int reg = 0;
-        if (!this_leaf->can_disable)
+        if (!this_leaf->l3 || !this_leaf->l3->can_disable)
                 return -EINVAL;
+        /* only touch ->l3 after the NULL check above */
+        dev = this_leaf->l3->dev;
         if (!dev)
                 return -EINVAL;
-        pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+        pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
         return sprintf(buf, "0x%08x\n", reg);
 }
-#define SHOW_CACHE_DISABLE(index)                                       \
+#define SHOW_CACHE_DISABLE(slot)                                        \
 static ssize_t                                                          \
-show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf)   \
+show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf)    \
 {                                                                       \
-        return show_cache_disable(this_leaf, buf, index);               \
+        return show_cache_disable(this_leaf, buf, slot);                \
 }
 SHOW_CACHE_DISABLE(0)
 SHOW_CACHE_DISABLE(1)
+/*
+ * Disable cache index @idx via disable slot @slot of the L3 cache
+ * described by @l3. @cpu is the cpu on which the WBINVD is executed.
+ */
+static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
+                                 unsigned slot, unsigned long idx)
+{
+        const unsigned where = 0x1BC + slot * 4;
+        const unsigned long base = idx | BIT(30);
+        int sc;
+        /*
+         *  walk all 4 subcaches, skipping those with a zero entry in
+         *  l3->subcaches[]
+         */
+        for (sc = 0; sc < 4; sc++) {
+                u32 val = base | (sc << 20);
+                if (!l3->subcaches[sc])
+                        continue;
+                pci_write_config_dword(l3->dev, where, val);
+                /*
+                 * The WBINVD has to run on a core of the node that contains
+                 * the L3 cache whose indices are being disabled, so a plain
+                 * local wbinvd() is not sufficient.
+                 */
+                wbinvd_on_cpu(cpu);
+                pci_write_config_dword(l3->dev, where, val | BIT(31));
+        }
+}
 static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
-        const char *buf, size_t count, unsigned int index)
+                                   const char *buf, size_t count,
+                                   unsigned int slot)
 {
+        /*
+         * Validate the value written by the user and disable the requested
+         * L3 index through amd_l3_disable_index(). Returns @count on success
+         * and -EINVAL when the leaf has no usable L3 descriptor or the value
+         * has bits outside SUBCACHE_MASK | SUBCACHE_INDEX or exceeds
+         * l3->indices.
+         */
+        struct pci_dev *dev;
         int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-        int node = amd_get_nb_id(cpu);
-        struct pci_dev *dev = node_to_k8_nb_misc(node);
         unsigned long val = 0;
 #define SUBCACHE_MASK   (3UL << 20)
 #define SUBCACHE_INDEX  0xfff
-        if (!this_leaf->can_disable)
+        if (!this_leaf->l3 || !this_leaf->l3->can_disable)
                 return -EINVAL;
+        /* only touch ->l3 after the NULL check above */
+        dev = this_leaf->l3->dev;
         if (!capable(CAP_SYS_ADMIN))
@@ -396,26 +477,20 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
         /* do not allow writes outside of allowed bits */
         if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
-            ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
+            ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
                 return -EINVAL;
-        val |= BIT(30);
+        amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
-        pci_write_config_dword(dev, 0x1BC + index * 4, val);
-        /*
-         * We need to WBINVD on a core on the node containing the L3 cache which
-         * indices we disable therefore a simple wbinvd() is not sufficient.
-         */
-        wbinvd_on_cpu(cpu);
-        pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
         return count;
 }
-#define STORE_CACHE_DISABLE(index)                                      \
+#define STORE_CACHE_DISABLE(slot)                                       \
 static ssize_t                                                          \
-store_cache_disable_##index(struct _cpuid4_info *this_leaf,             \
+store_cache_disable_##slot(struct _cpuid4_info *this_leaf,              \
                            const char *buf, size_t count)              \
 {                                                                       \
-        return store_cache_disable(this_leaf, buf, count, index);       \
+        return store_cache_disable(this_leaf, buf, count, slot);        \
 }
 STORE_CACHE_DISABLE(0)
 STORE_CACHE_DISABLE(1)
@@ -443,8 +518,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
                amd_cpuid4(index, &eax, &ebx, &ecx);
-                if (boot_cpu_data.x86 >= 0x10)
+                amd_check_l3_disable(index, this_leaf);
-                        amd_check_l3_disable(index, this_leaf);
        } else {
                cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
        }
@@ -701,6 +775,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
        for (i = 0; i < num_cache_leaves; i++)
                cache_remove_shared_cpu_map(cpu, i);
+        kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
        kfree(per_cpu(ici_cpuid4_info, cpu));
        per_cpu(ici_cpuid4_info, cpu) = NULL;
 }
@@ -985,7 +1060,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
                this_leaf = CPUID4_INFO_IDX(cpu, i);
-                if (this_leaf->can_disable)
+                if (this_leaf->l3 && this_leaf->l3->can_disable)
                        ktype_cache.default_attrs = default_l3_attrs;
                else
                        ktype_cache.default_attrs = default_attrs;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767e..7a355ddcc64b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -539,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
        struct mce m;
        int i;
-        __get_cpu_var(mce_poll_count)++;
+        percpu_inc(mce_poll_count);
        mce_setup(&m);
@@ -934,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
        atomic_inc(&mce_entry);
-        __get_cpu_var(mce_exception_count)++;
+        percpu_inc(mce_exception_count);
        if (notify_die(DIE_NMI, "machine check", regs, error_code,
                           18, SIGKILL) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 000000000000..16f41bbe46b6
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,55 @@
+/*
+ * HyperV detection code.
+ *
+ * Copyright (C) 2010, Novell, Inc.
+ * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ */
+#include <linux/types.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+#include <asm/hyperv.h>
+#include <asm/mshyperv.h>
+struct ms_hyperv_info ms_hyperv;
+static bool __init ms_hyperv_platform(void)
+{
+        u32 eax;
+        u32 hyp_signature[3];
+        if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+                return false;
+        cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
+              &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
+        return eax >= HYPERV_CPUID_MIN &&
+                eax <= HYPERV_CPUID_MAX &&
+                !memcmp("Microsoft Hv", hyp_signature, 12);
+}
+static void __init ms_hyperv_init_platform(void)
+{
+        /*
+         * Extract the features and hints
+         */
+        ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+        ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
+        printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
+               ms_hyperv.features, ms_hyperv.hints);
+}
+const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+        .name                   = "Microsoft HyperV",
+        .detect                 = ms_hyperv_platform,
+        .init_platform          = ms_hyperv_init_platform,
+};
+EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index db5bdc8addf8..fd4db0db3708 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,46 +31,51 @@
 #include <asm/nmi.h>
 #include <asm/compat.h>
-static u64 perf_event_mask __read_mostly;
+#if 0
+#undef wrmsrl
+#define wrmsrl(msr, val)                                        \
+do {                                                            \
+        trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
+                        (unsigned long)(val));                  \
+        native_write_msr((msr), (u32)((u64)(val)),              \
+                        (u32)((u64)(val) >> 32));               \
+} while (0)
+#endif
-/* The maximal number of PEBS events: */
+/*
-#define MAX_PEBS_EVENTS 4
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+        unsigned long offset, addr = (unsigned long)from;
+        int type = in_nmi() ? KM_NMI : KM_IRQ0;
+        unsigned long size, len = 0;
+        struct page *page;
+        void *map;
+        int ret;
-/* The size of a BTS record in bytes: */
+        do {
-#define BTS_RECORD_SIZE         24
+                ret = __get_user_pages_fast(addr, 1, 0, &page);
+                if (!ret)
+                        break;
-/* The size of a per-cpu BTS buffer in bytes: */
+                offset = addr & (PAGE_SIZE - 1);
-#define BTS_BUFFER_SIZE         (BTS_RECORD_SIZE * 2048)
+                size = min(PAGE_SIZE - offset, n - len);
-/* The BTS overflow threshold in bytes from the end of the buffer: */
+                map = kmap_atomic(page, type);
-#define BTS_OVFL_TH             (BTS_RECORD_SIZE * 128)
+                memcpy(to, map+offset, size);
+                kunmap_atomic(map, type);
+                put_page(page);
+                len  += size;
+                to   += size;
+                addr += size;
-/*
+        } while (len < n);
- * Bits in the debugctlmsr controlling branch tracing.
- */
-#define X86_DEBUGCTL_TR                 (1 << 6)
-#define X86_DEBUGCTL_BTS                (1 << 7)
-#define X86_DEBUGCTL_BTINT              (1 << 8)
-#define X86_DEBUGCTL_BTS_OFF_OS         (1 << 9)
-#define X86_DEBUGCTL_BTS_OFF_USR        (1 << 10)
-/*
+        return len;
- * A debug store configuration.
+}
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-        u64     bts_buffer_base;
-        u64     bts_index;
-        u64     bts_absolute_maximum;
-        u64     bts_interrupt_threshold;
-        u64     pebs_buffer_base;
-        u64     pebs_index;
-        u64     pebs_absolute_maximum;
-        u64     pebs_interrupt_threshold;
-        u64     pebs_event_reset[MAX_PEBS_EVENTS];
-};
 struct event_constraint {
        union {
@@ -89,18 +94,41 @@ struct amd_nb {
        struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
+#define MAX_LBR_ENTRIES         16
 struct cpu_hw_events {
+        /*
+         * Generic x86 PMC bits
+         */
        struct perf_event       *events[X86_PMC_IDX_MAX]; /* in counter order */
        unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-        unsigned long           interrupts;
        int                     enabled;
-        struct debug_store      *ds;
        int                     n_events;
        int                     n_added;
        int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
        u64                     tags[X86_PMC_IDX_MAX];
        struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+        unsigned int            group_flag;
+        /*
+         * Intel DebugStore bits
+         */
+        struct debug_store      *ds;
+        u64                     pebs_enabled;
+        /*
+         * Intel LBR bits
+         */
+        int                             lbr_users;
+        void                            *lbr_context;
+        struct perf_branch_stack        lbr_stack;
+        struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
+        /*
+         * AMD specific bits
+         */
        struct amd_nb           *amd_nb;
 };
@@ -114,44 +142,75 @@ struct cpu_hw_events {
 #define EVENT_CONSTRAINT(c, n, m)       \
        __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
+/*
+ * Constraint on the Event code.
+ */
 #define INTEL_EVENT_CONSTRAINT(c, n)    \
-        EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
+        EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ *
+ * Filter mask to validate fixed counter events.
+ * The following filters disqualify an event from the fixed counters:
+ *  - inv
+ *  - edge
+ *  - cnt-mask
+ *  The other filters are supported by fixed counters.
+ *  The any-thread option is supported starting with v3.
+ */
 #define FIXED_EVENT_CONSTRAINT(c, n)    \
-        EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
+        EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
+/*
+ * Constraint on the Event code + UMask
+ */
+#define PEBS_EVENT_CONSTRAINT(c, n)     \
+        EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
 #define EVENT_CONSTRAINT_END            \
        EVENT_CONSTRAINT(0, 0, 0)
 #define for_each_event_constraint(e, c) \
-        for ((e) = (c); (e)->cmask; (e)++)
+        for ((e) = (c); (e)->weight; (e)++)
+union perf_capabilities {
+        struct {
+                u64     lbr_format    : 6;
+                u64     pebs_trap     : 1;
+                u64     pebs_arch_reg : 1;
+                u64     pebs_format   : 4;
+                u64     smm_freeze    : 1;
+        };
+        u64     capabilities;
+};
 /*
 * struct x86_pmu - generic x86 pmu
 */
 struct x86_pmu {
+        /*
+         * Generic x86 PMC bits
+         */
        const char      *name;
        int             version;
        int             (*handle_irq)(struct pt_regs *);
        void            (*disable_all)(void);
-        void            (*enable_all)(void);
+        void            (*enable_all)(int added);
        void            (*enable)(struct perf_event *);
        void            (*disable)(struct perf_event *);
+        int             (*hw_config)(struct perf_event *event);
+        int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
        unsigned        eventsel;
        unsigned        perfctr;
        u64             (*event_map)(int);
-        u64             (*raw_event)(u64);
        int             max_events;
-        int             num_events;
+        int             num_counters;
-        int             num_events_fixed;
+        int             num_counters_fixed;
-        int             event_bits;
+        int             cntval_bits;
-        u64             event_mask;
+        u64             cntval_mask;
        int             apic;
        u64             max_period;
-        u64             intel_ctrl;
-        void            (*enable_bts)(u64 config);
-        void            (*disable_bts)(void);
        struct event_constraint *
                        (*get_event_constraints)(struct cpu_hw_events *cpuc,
                                                 struct perf_event *event);
@@ -159,11 +218,32 @@ struct x86_pmu {
        void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
                                                 struct perf_event *event);
        struct event_constraint *event_constraints;
+        void            (*quirks)(void);
        int             (*cpu_prepare)(int cpu);
        void            (*cpu_starting)(int cpu);
        void            (*cpu_dying)(int cpu);
        void            (*cpu_dead)(int cpu);
+        /*
+         * Intel Arch Perfmon v2+
+         */
+        u64                     intel_ctrl;
+        union perf_capabilities intel_cap;
+        /*
+         * Intel DebugStore bits
+         */
+        int             bts, pebs;
+        int             pebs_record_size;
+        void            (*drain_pebs)(struct pt_regs *regs);
+        struct event_constraint *pebs_constraints;
+        /*
+         * Intel LBR
+         */
+        unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
+        int             lbr_nr;                    /* hardware stack size */
 };
 static struct x86_pmu x86_pmu __read_mostly;
@@ -198,7 +278,7 @@ static u64
 x86_perf_event_update(struct perf_event *event)
 {
        struct hw_perf_event *hwc = &event->hw;
-        int shift = 64 - x86_pmu.event_bits;
+        int shift = 64 - x86_pmu.cntval_bits;
        u64 prev_raw_count, new_raw_count;
        int idx = hwc->idx;
        s64 delta;
@@ -241,33 +321,32 @@ again:
 static atomic_t active_events;
 static DEFINE_MUTEX(pmc_reserve_mutex);
+#ifdef CONFIG_X86_LOCAL_APIC
 static bool reserve_pmc_hardware(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
        int i;
        if (nmi_watchdog == NMI_LOCAL_APIC)
                disable_lapic_nmi_watchdog();
-        for (i = 0; i < x86_pmu.num_events; i++) {
+        for (i = 0; i < x86_pmu.num_counters; i++) {
                if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
                        goto perfctr_fail;
        }
-        for (i = 0; i < x86_pmu.num_events; i++) {
+        for (i = 0; i < x86_pmu.num_counters; i++) {
                if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
                        goto eventsel_fail;
        }
-#endif
        return true;
-#ifdef CONFIG_X86_LOCAL_APIC
 eventsel_fail:
        for (i--; i >= 0; i--)
                release_evntsel_nmi(x86_pmu.eventsel + i);
-        i = x86_pmu.num_events;
+        i = x86_pmu.num_counters;
 perfctr_fail:
        for (i--; i >= 0; i--)
@@ -277,128 +356,36 @@ perfctr_fail:
                enable_lapic_nmi_watchdog();
        return false;
-#endif
 }
 static void release_pmc_hardware(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
        int i;
-        for (i = 0; i < x86_pmu.num_events; i++) {
+        for (i = 0; i < x86_pmu.num_counters; i++) {
                release_perfctr_nmi(x86_pmu.perfctr + i);
                release_evntsel_nmi(x86_pmu.eventsel + i);
        }
        if (nmi_watchdog == NMI_LOCAL_APIC)
                enable_lapic_nmi_watchdog();
-#endif
-}
-static inline bool bts_available(void)
-{
-        return x86_pmu.enable_bts != NULL;
 }
-static void init_debug_store_on_cpu(int cpu)
+#else
-{
-        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-        if (!ds)
-                return;
-        wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
-                     (u32)((u64)(unsigned long)ds),
-                     (u32)((u64)(unsigned long)ds >> 32));
-}
-static void fini_debug_store_on_cpu(int cpu)
-{
-        if (!per_cpu(cpu_hw_events, cpu).ds)
-                return;
-        wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-static void release_bts_hardware(void)
-{
-        int cpu;
-        if (!bts_available())
-                return;
-        get_online_cpus();
-        for_each_online_cpu(cpu)
-                fini_debug_store_on_cpu(cpu);
-        for_each_possible_cpu(cpu) {
-                struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-                if (!ds)
-                        continue;
-                per_cpu(cpu_hw_events, cpu).ds = NULL;
-                kfree((void *)(unsigned long)ds->bts_buffer_base);
-                kfree(ds);
-        }
-        put_online_cpus();
-}
-static int reserve_bts_hardware(void)
-{
-        int cpu, err = 0;
-        if (!bts_available())
-                return 0;
-        get_online_cpus();
-        for_each_possible_cpu(cpu) {
-                struct debug_store *ds;
-                void *buffer;
-                err = -ENOMEM;
-                buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
-                if (unlikely(!buffer))
-                        break;
-                ds = kzalloc(sizeof(*ds), GFP_KERNEL);
-                if (unlikely(!ds)) {
-                        kfree(buffer);
-                        break;
-                }
-                ds->bts_buffer_base = (u64)(unsigned long)buffer;
-                ds->bts_index = ds->bts_buffer_base;
-                ds->bts_absolute_maximum =
-                        ds->bts_buffer_base + BTS_BUFFER_SIZE;
-                ds->bts_interrupt_threshold =
-                        ds->bts_absolute_maximum - BTS_OVFL_TH;
-                per_cpu(cpu_hw_events, cpu).ds = ds;
-                err = 0;
-        }
-        if (err)
+static bool reserve_pmc_hardware(void) { return true; }
-                release_bts_hardware();
+static void release_pmc_hardware(void) {}
-        else {
-                for_each_online_cpu(cpu)
-                        init_debug_store_on_cpu(cpu);
-        }
-        put_online_cpus();
+#endif
-        return err;
+static int reserve_ds_buffers(void);
-}
+static void release_ds_buffers(void);
 static void hw_perf_event_destroy(struct perf_event *event)
 {
        if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
                release_pmc_hardware();
-                release_bts_hardware();
+                release_ds_buffers();
                mutex_unlock(&pmc_reserve_mutex);
        }
 }
@@ -441,54 +428,11 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
        return 0;
 }
-/*
+static int x86_setup_perfctr(struct perf_event *event)
- * Setup the hardware configuration for a given attr_type
- */
-static int __hw_perf_event_init(struct perf_event *event)
 {
        struct perf_event_attr *attr = &event->attr;
        struct hw_perf_event *hwc = &event->hw;
        u64 config;
-        int err;
-        if (!x86_pmu_initialized())
-                return -ENODEV;
-        err = 0;
-        if (!atomic_inc_not_zero(&active_events)) {
-                mutex_lock(&pmc_reserve_mutex);
-                if (atomic_read(&active_events) == 0) {
-                        if (!reserve_pmc_hardware())
-                                err = -EBUSY;
-                        else
-                                err = reserve_bts_hardware();
-                }
-                if (!err)
-                        atomic_inc(&active_events);
-                mutex_unlock(&pmc_reserve_mutex);
-        }
-        if (err)
-                return err;
-        event->destroy = hw_perf_event_destroy;
-        /*
-         * Generate PMC IRQs:
-         * (keep 'enabled' bit clear for now)
-         */
-        hwc->config = ARCH_PERFMON_EVENTSEL_INT;
-        hwc->idx = -1;
-        hwc->last_cpu = -1;
-        hwc->last_tag = ~0ULL;
-        /*
-         * Count user and OS events unless requested not to.
-         */
-        if (!attr->exclude_user)
-                hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-        if (!attr->exclude_kernel)
-                hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
        if (!hwc->sample_period) {
                hwc->sample_period = x86_pmu.max_period;
@@ -505,16 +449,8 @@ static int __hw_perf_event_init(struct perf_event *event)
                        return -EOPNOTSUPP;
        }
-        /*
+        if (attr->type == PERF_TYPE_RAW)
-         * Raw hw_event type provide the config in the hw_event structure
-         */
-        if (attr->type == PERF_TYPE_RAW) {
-                hwc->config |= x86_pmu.raw_event(attr->config);
-                if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
-                    perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-                        return -EACCES;
                return 0;
-        }
        if (attr->type == PERF_TYPE_HW_CACHE)
                return set_ext_hw_attr(hwc, attr);
@@ -539,11 +475,11 @@ static int __hw_perf_event_init(struct perf_event *event)
        if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
            (hwc->sample_period == 1)) {
                /* BTS is not supported by this architecture. */
-                if (!bts_available())
+                if (!x86_pmu.bts)
                        return -EOPNOTSUPP;
                /* BTS is currently only allowed for user-mode. */
-                if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+                if (!attr->exclude_kernel)
                        return -EOPNOTSUPP;
        }
@@ -552,12 +488,87 @@ static int __hw_perf_event_init(struct perf_event *event)
        return 0;
 }
+static int x86_pmu_hw_config(struct perf_event *event)
+{
+        if (event->attr.precise_ip) {
+                int precise = 0;
+                /* Support for constant skid */
+                if (x86_pmu.pebs)
+                        precise++;
+                /* Support for IP fixup */
+                if (x86_pmu.lbr_nr)
+                        precise++;
+                if (event->attr.precise_ip > precise)
+                        return -EOPNOTSUPP;
+        }
+        /*
+         * Generate PMC IRQs:
+         * (keep 'enabled' bit clear for now)
+         */
+        event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
+        /*
+         * Count user and OS events unless requested not to
+         */
+        if (!event->attr.exclude_user)
+                event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
+        if (!event->attr.exclude_kernel)
+                event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
+        if (event->attr.type == PERF_TYPE_RAW)
+                event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+        return x86_setup_perfctr(event);
+}
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __hw_perf_event_init(struct perf_event *event)
+{
+        int err;
+        if (!x86_pmu_initialized())
+                return -ENODEV;
+        err = 0;
+        if (!atomic_inc_not_zero(&active_events)) {
+                mutex_lock(&pmc_reserve_mutex);
+                if (atomic_read(&active_events) == 0) {
+                        if (!reserve_pmc_hardware())
+                                err = -EBUSY;
+                        else {
+                                err = reserve_ds_buffers();
+                                if (err)
+                                        release_pmc_hardware();
+                        }
+                }
+                if (!err)
+                        atomic_inc(&active_events);
+                mutex_unlock(&pmc_reserve_mutex);
+        }
+        if (err)
+                return err;
+        event->destroy = hw_perf_event_destroy;
+        event->hw.idx = -1;
+        event->hw.last_cpu = -1;
+        event->hw.last_tag = ~0ULL;
+        return x86_pmu.hw_config(event);
+}
 static void x86_pmu_disable_all(void)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        int idx;
-        for (idx = 0; idx < x86_pmu.num_events; idx++) {
+        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
                u64 val;
                if (!test_bit(idx, cpuc->active_mask))
@@ -587,12 +598,12 @@ void hw_perf_disable(void)
        x86_pmu.disable_all();
 }
-static void x86_pmu_enable_all(void)
+static void x86_pmu_enable_all(int added)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        int idx;
-        for (idx = 0; idx < x86_pmu.num_events; idx++) {
+        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
                struct perf_event *event = cpuc->events[idx];
                u64 val;
@@ -667,14 +678,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
         * assign events to counters starting with most
         * constrained events.
         */
-        wmax = x86_pmu.num_events;
+        wmax = x86_pmu.num_counters;
        /*
         * when fixed event counters are present,
         * wmax is incremented by 1 to account
         * for one more choice
         */
-        if (x86_pmu.num_events_fixed)
+        if (x86_pmu.num_counters_fixed)
                wmax++;
        for (w = 1, num = n; num && w <= wmax; w++) {
@@ -724,7 +735,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
        struct perf_event *event;
        int n, max_count;
-        max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
+        max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
        /* current number of events already accepted */
        n = cpuc->n_events;
@@ -795,7 +806,7 @@ void hw_perf_enable(void)
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        struct perf_event *event;
        struct hw_perf_event *hwc;
-        int i;
+        int i, added = cpuc->n_added;
        if (!x86_pmu_initialized())
                return;
@@ -847,19 +858,20 @@ void hw_perf_enable(void)
        cpuc->enabled = 1;
        barrier();
-        x86_pmu.enable_all();
+        x86_pmu.enable_all(added);
 }
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+                                          u64 enable_mask)
 {
-        (void)checking_wrmsrl(hwc->config_base + hwc->idx,
+        wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
-                              hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 static inline void x86_pmu_disable_event(struct perf_event *event)
 {
        struct hw_perf_event *hwc = &event->hw;
-        (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
+        wrmsrl(hwc->config_base + hwc->idx, hwc->config);
 }
 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -874,7 +886,7 @@ x86_perf_event_set_period(struct perf_event *event)
        struct hw_perf_event *hwc = &event->hw;
        s64 left = atomic64_read(&hwc->period_left);
        s64 period = hwc->sample_period;
-        int err, ret = 0, idx = hwc->idx;
+        int ret = 0, idx = hwc->idx;
        if (idx == X86_PMC_IDX_FIXED_BTS)
                return 0;
@@ -912,8 +924,8 @@ x86_perf_event_set_period(struct perf_event *event)
         */
        atomic64_set(&hwc->prev_count, (u64)-left);
-        err = checking_wrmsrl(hwc->event_base + idx,
+        wrmsrl(hwc->event_base + idx,
-                             (u64)(-left) & x86_pmu.event_mask);
+                        (u64)(-left) & x86_pmu.cntval_mask);
        perf_event_update_userpage(event);
@@ -924,7 +936,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        if (cpuc->enabled)
-                __x86_pmu_enable_event(&event->hw);
+                __x86_pmu_enable_event(&event->hw,
+                                       ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 /*
@@ -950,7 +963,15 @@ static int x86_pmu_enable(struct perf_event *event)
        if (n < 0)
                return n;
-        ret = x86_schedule_events(cpuc, n, assign);
+        /*
+         * If group events scheduling transaction was started,
+         * skip the schedulability test here, it will be performed
+         * at commit time (->commit_txn) as a whole
+         */
+        if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+                goto out;
+        ret = x86_pmu.schedule_events(cpuc, n, assign);
        if (ret)
                return ret;
        /*
@@ -959,6 +980,7 @@ static int x86_pmu_enable(struct perf_event *event)
         */
        memcpy(cpuc->assign, assign, n*sizeof(int));
+out:
        cpuc->n_events = n;
        cpuc->n_added += n - n0;
@@ -991,11 +1013,12 @@ static void x86_pmu_unthrottle(struct perf_event *event)
 void perf_event_print_debug(void)
 {
        u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+        u64 pebs;
        struct cpu_hw_events *cpuc;
        unsigned long flags;
        int cpu, idx;
-        if (!x86_pmu.num_events)
+        if (!x86_pmu.num_counters)
                return;
        local_irq_save(flags);
@@ -1008,16 +1031,18 @@ void perf_event_print_debug(void)
                rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
                rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+                rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
                pr_info("\n");
                pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
                pr_info("CPU#%d: status:     %016llx\n", cpu, status);
                pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
                pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
+                pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
        }
-        pr_info("CPU#%d: active:       %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+        pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
-        for (idx = 0; idx < x86_pmu.num_events; idx++) {
+        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
                rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
                rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
@@ -1030,7 +1055,7 @@ void perf_event_print_debug(void)
                pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
                        cpu, idx, prev_left);
        }
-        for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+        for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
                pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1095,7 +1120,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
        cpuc = &__get_cpu_var(cpu_hw_events);
-        for (idx = 0; idx < x86_pmu.num_events; idx++) {
+        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
                if (!test_bit(idx, cpuc->active_mask))
                        continue;
@@ -1103,7 +1128,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
                hwc = &event->hw;
                val = x86_perf_event_update(event);
-                if (val & (1ULL << (x86_pmu.event_bits - 1)))
+                if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
                        continue;
                /*
@@ -1146,7 +1171,6 @@ void set_perf_event_pending(void)
 void perf_events_lapic_init(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
        if (!x86_pmu.apic || !x86_pmu_initialized())
                return;
@@ -1154,7 +1178,6 @@ void perf_events_lapic_init(void)
         * Always use NMI for PMU
         */
        apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
 }
 static int __kprobes
@@ -1178,9 +1201,7 @@ perf_event_nmi_handler(struct notifier_block *self,
        regs = args->regs;
-#ifdef CONFIG_X86_LOCAL_APIC
        apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
        /*
         * Can't rely on the handled return value to say it was our NMI, two
         * events could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1217,118 +1238,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
        return &unconstrained;
 }
-static int x86_event_sched_in(struct perf_event *event,
-                          struct perf_cpu_context *cpuctx)
-{
-        int ret = 0;
-        event->state = PERF_EVENT_STATE_ACTIVE;
-        event->oncpu = smp_processor_id();
-        event->tstamp_running += event->ctx->time - event->tstamp_stopped;
-        if (!is_x86_event(event))
-                ret = event->pmu->enable(event);
-        if (!ret && !is_software_event(event))
-                cpuctx->active_oncpu++;
-        if (!ret && event->attr.exclusive)
-                cpuctx->exclusive = 1;
-        return ret;
-}
-static void x86_event_sched_out(struct perf_event *event,
-                            struct perf_cpu_context *cpuctx)
-{
-        event->state = PERF_EVENT_STATE_INACTIVE;
-        event->oncpu = -1;
-        if (!is_x86_event(event))
-                event->pmu->disable(event);
-        event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
-        if (!is_software_event(event))
-                cpuctx->active_oncpu--;
-        if (event->attr.exclusive || !cpuctx->active_oncpu)
-                cpuctx->exclusive = 0;
-}
-/*
- * Called to enable a whole group of events.
- * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
- * Assumes the caller has disabled interrupts and has
- * frozen the PMU with hw_perf_save_disable.
- *
- * called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
- */
-int hw_perf_group_sched_in(struct perf_event *leader,
-               struct perf_cpu_context *cpuctx,
-               struct perf_event_context *ctx)
-{
-        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-        struct perf_event *sub;
-        int assign[X86_PMC_IDX_MAX];
-        int n0, n1, ret;
-        /* n0 = total number of events */
-        n0 = collect_events(cpuc, leader, true);
-        if (n0 < 0)
-                return n0;
-        ret = x86_schedule_events(cpuc, n0, assign);
-        if (ret)
-                return ret;
-        ret = x86_event_sched_in(leader, cpuctx);
-        if (ret)
-                return ret;
-        n1 = 1;
-        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-                if (sub->state > PERF_EVENT_STATE_OFF) {
-                        ret = x86_event_sched_in(sub, cpuctx);
-                        if (ret)
-                                goto undo;
-                        ++n1;
-                }
-        }
-        /*
-         * copy new assignment, now we know it is possible
-         * will be used by hw_perf_enable()
-         */
-        memcpy(cpuc->assign, assign, n0*sizeof(int));
-        cpuc->n_events  = n0;
-        cpuc->n_added  += n1;
-        ctx->nr_active += n1;
-        /*
-         * 1 means successful and events are active
-         * This is not quite true because we defer
-         * actual activation until hw_perf_enable() but
-         * this way we* ensure caller won't try to enable
-         * individual events
-         */
-        return 1;
-undo:
-        x86_event_sched_out(leader, cpuctx);
-        n0  = 1;
-        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
-                        x86_event_sched_out(sub, cpuctx);
-                        if (++n0 == n1)
-                                break;
-                }
-        }
-        return ret;
-}
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
+#include "perf_event_p4.c"
+#include "perf_event_intel_lbr.c"
+#include "perf_event_intel_ds.c"
 #include "perf_event_intel.c"
 static int __cpuinit
@@ -1402,48 +1316,50 @@ void __init init_hw_perf_events(void)
        pr_cont("%s PMU driver.\n", x86_pmu.name);
-        if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
+        if (x86_pmu.quirks)
+                x86_pmu.quirks();
+        if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
                WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
-                     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
+                     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
-                x86_pmu.num_events = X86_PMC_MAX_GENERIC;
+                x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
        }
-        perf_event_mask = (1 << x86_pmu.num_events) - 1;
+        x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-        perf_max_events = x86_pmu.num_events;
+        perf_max_events = x86_pmu.num_counters;
-        if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
+        if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
                WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
-                     x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
+                     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
-                x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
+                x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
        }
-        perf_event_mask |=
+        x86_pmu.intel_ctrl |=
-                ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
+                ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
-        x86_pmu.intel_ctrl = perf_event_mask;
        perf_events_lapic_init();
        register_die_notifier(&perf_event_nmi_notifier);
        unconstrained = (struct event_constraint)
-                __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
+                __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
-                                   0, x86_pmu.num_events);
+                                   0, x86_pmu.num_counters);
        if (x86_pmu.event_constraints) {
                for_each_event_constraint(c, x86_pmu.event_constraints) {
-                        if (c->cmask != INTEL_ARCH_FIXED_MASK)
+                        if (c->cmask != X86_RAW_EVENT_MASK)
                                continue;
-                        c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
+                        c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
-                        c->weight += x86_pmu.num_events;
+                        c->weight += x86_pmu.num_counters;
                }
        }
        pr_info("... version:                %d\n",     x86_pmu.version);
-        pr_info("... bit width:              %d\n",     x86_pmu.event_bits);
+        pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
-        pr_info("... generic registers:      %d\n",     x86_pmu.num_events);
+        pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
-        pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
+        pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
        pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
-        pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
+        pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
-        pr_info("... event mask:             %016Lx\n", perf_event_mask);
+        pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
        perf_cpu_notifier(x86_pmu_notifier);
 }
@@ -1453,6 +1369,59 @@ static inline void x86_pmu_read(struct perf_event *event)
        x86_perf_event_update(event);
 }
+/*
+ * Start group events scheduling transaction
+ * Set the flag to make pmu::enable() not perform the
+ * schedulability test, it will be performed at commit time
+ *
+ * Concretely: sets PERF_EVENT_TXN_STARTED in group_flag of the
+ * cpu_hw_events obtained through __get_cpu_var(). @pmu is not used.
+ */
+static void x86_pmu_start_txn(const struct pmu *pmu)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+}
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag and pmu::enable() will perform the
+ * schedulability test.
+ *
+ * Concretely: clears PERF_EVENT_TXN_STARTED in group_flag of the
+ * cpu_hw_events obtained through __get_cpu_var(); no other state is
+ * touched here. @pmu is not used.
+ */
+static void x86_pmu_cancel_txn(const struct pmu *pmu)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+}
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 if success
+ *
+ * Returns -EAGAIN when x86_pmu_initialized() is false; otherwise
+ * returns whatever x86_pmu.schedule_events(cpuc, n, assign) returns,
+ * with n = cpuc->n_events. Only when that result is 0 are the first
+ * n entries of the local assignment copied into cpuc->assign.
+ *
+ * This function does not clear PERF_EVENT_TXN_STARTED itself.
+ * NOTE(review): presumably the caller ends the transaction through
+ * cancel_txn() afterwards -- confirm against the core perf code.
+ * @pmu is not used.
+ */
+static int x86_pmu_commit_txn(const struct pmu *pmu)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        int assign[X86_PMC_IDX_MAX]; /* scratch result; published to cpuc->assign only on success */
+        int n, ret;
+        n = cpuc->n_events;
+        if (!x86_pmu_initialized())
+                return -EAGAIN;
+        ret = x86_pmu.schedule_events(cpuc, n, assign);
+        if (ret)
+                return ret;
+        /*
+         * copy new assignment, now we know it is possible
+         * will be used by hw_perf_enable()
+         */
+        memcpy(cpuc->assign, assign, n*sizeof(int));
+        return 0;
+}
 static const struct pmu pmu = {
        .enable         = x86_pmu_enable,
        .disable        = x86_pmu_disable,
@@ -1460,9 +1429,38 @@ static const struct pmu pmu = {
        .stop           = x86_pmu_stop,
        .read           = x86_pmu_read,
        .unthrottle     = x86_pmu_unthrottle,
+        .start_txn      = x86_pmu_start_txn,
+        .cancel_txn     = x86_pmu_cancel_txn,
+        .commit_txn     = x86_pmu_commit_txn,
 };
 /*
+ * validate that we can schedule this event
+ *
+ * Asks x86_pmu.get_event_constraints() for the constraint of @event
+ * against a temporary, zero-filled cpu_hw_events, so no real per-cpu
+ * state is passed in.
+ *
+ * Returns:
+ *   -ENOMEM  if the temporary cpu_hw_events cannot be allocated
+ *   -ENOSPC  if no constraint is returned or its weight is 0
+ *   0        otherwise
+ *
+ * x86_pmu.put_event_constraints(), when set, is called on the same
+ * temporary structure before it is freed, regardless of the result.
+ */
+static int validate_event(struct perf_event *event)
+{
+        struct cpu_hw_events *fake_cpuc;
+        struct event_constraint *c;
+        int ret = 0;
+        fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
+        if (!fake_cpuc)
+                return -ENOMEM;
+        c = x86_pmu.get_event_constraints(fake_cpuc, event);
+        if (!c || !c->weight)
+                ret = -ENOSPC;
+        if (x86_pmu.put_event_constraints)
+                x86_pmu.put_event_constraints(fake_cpuc, event);
+        kfree(fake_cpuc);
+        return ret;
+}
+/*
 * validate a single event group
 *
 * validation include:
@@ -1502,7 +1500,7 @@ static int validate_group(struct perf_event *event)
        fake_cpuc->n_events = n;
-        ret = x86_schedule_events(fake_cpuc, n, NULL);
+        ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
 out_free:
        kfree(fake_cpuc);
@@ -1527,6 +1525,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
                if (event->group_leader != event)
                        err = validate_group(event);
+                else
+                        err = validate_event(event);
                event->pmu = tmp;
        }
@@ -1574,8 +1574,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
 {
        struct perf_callchain_entry *entry = data;
-        if (reliable)
+        callchain_store(entry, addr);
-                callchain_store(entry, addr);
 }
 static const struct stacktrace_ops backtrace_ops = {
@@ -1597,41 +1596,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
        dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
 }
-/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
- */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
-        unsigned long offset, addr = (unsigned long)from;
-        int type = in_nmi() ? KM_NMI : KM_IRQ0;
-        unsigned long size, len = 0;
-        struct page *page;
-        void *map;
-        int ret;
-        do {
-                ret = __get_user_pages_fast(addr, 1, 0, &page);
-                if (!ret)
-                        break;
-                offset = addr & (PAGE_SIZE - 1);
-                size = min(PAGE_SIZE - offset, n - len);
-                map = kmap_atomic(page, type);
-                memcpy(to, map+offset, size);
-                kunmap_atomic(map, type);
-                put_page(page);
-                len  += size;
-                to   += size;
-                addr += size;
-        } while (len < n);
-        return len;
-}
 #ifdef CONFIG_COMPAT
 static inline int
 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -1727,6 +1691,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 {
        struct perf_callchain_entry *entry;
+        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+                /* TODO: We don't support guest os callchain now */
+                return NULL;
+        }
        if (in_nmi())
                entry = &__get_cpu_var(pmc_nmi_entry);
        else
@@ -1750,3 +1719,37 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
        regs->cs = __KERNEL_CS;
        local_save_flags(regs->flags);
 }
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+        unsigned long ip; /* value returned: guest IP or the IP taken from @regs */
+        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) /* callbacks registered and they report guest context */
+                ip = perf_guest_cbs->get_guest_ip();
+        else /* no callbacks, or not in a guest */
+                ip = instruction_pointer(regs);
+        return ip;
+}
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+        int misc = 0; /* PERF_RECORD_MISC_* bits built up below and returned */
+        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* callbacks registered and they report guest context */
+                if (perf_guest_cbs->is_user_mode())
+                        misc |= PERF_RECORD_MISC_GUEST_USER;
+                else
+                        misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+        } else { /* not in a guest: classify by user_mode(regs) */
+                if (user_mode(regs))
+                        misc |= PERF_RECORD_MISC_USER;
+                else
+                        misc |= PERF_RECORD_MISC_KERNEL;
+        }
+        if (regs->flags & PERF_EFLAGS_EXACT) /* tested on both the guest and non-guest paths */
+                misc |= PERF_RECORD_MISC_EXACT_IP;
+        return misc;
+}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index db6f7d4056e1..611df11ba15e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
 static DEFINE_RAW_SPINLOCK(amd_nb_lock);
-static __initconst u64 amd_hw_cache_event_ids
+static __initconst const u64 amd_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -111,22 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
        return amd_perfmon_event_map[hw_event];
 }
-static u64 amd_pmu_raw_event(u64 hw_event)
+static int amd_pmu_hw_config(struct perf_event *event)
 {
-#define K7_EVNTSEL_EVENT_MASK   0xF000000FFULL
+        int ret = x86_pmu_hw_config(event); /* common x86 configuration runs first; its error is returned as-is */
-#define K7_EVNTSEL_UNIT_MASK    0x00000FF00ULL
-#define K7_EVNTSEL_EDGE_MASK    0x000040000ULL
+        if (ret)
-#define K7_EVNTSEL_INV_MASK     0x000800000ULL
+                return ret;
-#define K7_EVNTSEL_REG_MASK     0x0FF000000ULL
+        if (event->attr.type != PERF_TYPE_RAW) /* only raw events get the extra config bits below */
-#define K7_EVNTSEL_MASK                 \
+                return 0;
-        (K7_EVNTSEL_EVENT_MASK |        \
-         K7_EVNTSEL_UNIT_MASK  |        \
+        event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; /* OR in only the attr.config bits covered by the mask */
-         K7_EVNTSEL_EDGE_MASK  |        \
-         K7_EVNTSEL_INV_MASK   |        \
+        return 0;
-         K7_EVNTSEL_REG_MASK)
-        return hw_event & K7_EVNTSEL_MASK;
 }
 /*
@@ -165,7 +162,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
         * be removed on one CPU at a time AND PMU is disabled
         * when we come here
         */
-        for (i = 0; i < x86_pmu.num_events; i++) {
+        for (i = 0; i < x86_pmu.num_counters; i++) {
                if (nb->owners[i] == event) {
                        cmpxchg(nb->owners+i, event, NULL);
                        break;
@@ -215,7 +212,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
        struct hw_perf_event *hwc = &event->hw;
        struct amd_nb *nb = cpuc->amd_nb;
        struct perf_event *old = NULL;
-        int max = x86_pmu.num_events;
+        int max = x86_pmu.num_counters;
        int i, j, k = -1;
        /*
@@ -293,7 +290,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
        /*
         * initialize all possible NB constraints
         */
-        for (i = 0; i < x86_pmu.num_events; i++) {
+        for (i = 0; i < x86_pmu.num_counters; i++) {
                __set_bit(i, nb->event_constraints[i].idxmsk);
                nb->event_constraints[i].weight = 1;
        }
@@ -371,21 +368,22 @@ static void amd_pmu_cpu_dead(int cpu)
        raw_spin_unlock(&amd_nb_lock);
 }
-static __initconst struct x86_pmu amd_pmu = {
+static __initconst const struct x86_pmu amd_pmu = {
        .name                   = "AMD",
        .handle_irq             = x86_pmu_handle_irq,
        .disable_all            = x86_pmu_disable_all,
        .enable_all             = x86_pmu_enable_all,
        .enable                 = x86_pmu_enable_event,
        .disable                = x86_pmu_disable_event,
+        .hw_config              = amd_pmu_hw_config,
+        .schedule_events        = x86_schedule_events,
        .eventsel               = MSR_K7_EVNTSEL0,
        .perfctr                = MSR_K7_PERFCTR0,
        .event_map              = amd_pmu_event_map,
-        .raw_event              = amd_pmu_raw_event,
        .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
-        .num_events             = 4,
+        .num_counters           = 4,
-        .event_bits             = 48,
+        .cntval_bits            = 48,
-        .event_mask             = (1ULL << 48) - 1,
+        .cntval_mask            = (1ULL << 48) - 1,
        .apic                   = 1,
        /* use highest bit to detect overflow */
        .max_period             = (1ULL << 47) - 1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9c794ac87837..fdbc652d3feb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -88,7 +88,7 @@ static u64 intel_pmu_event_map(int hw_event)
        return intel_perfmon_event_map[hw_event];
 }
-static __initconst u64 westmere_hw_cache_event_ids
+static __initconst const u64 westmere_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +179,7 @@ static __initconst u64 westmere_hw_cache_event_ids
 },
 };
-static __initconst u64 nehalem_hw_cache_event_ids
+static __initconst const u64 nehalem_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +270,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
 },
 };
-static __initconst u64 core2_hw_cache_event_ids
+static __initconst const u64 core2_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +361,7 @@ static __initconst u64 core2_hw_cache_event_ids
 },
 };
-static __initconst u64 atom_hw_cache_event_ids
+static __initconst const u64 atom_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -452,60 +452,6 @@ static __initconst u64 atom_hw_cache_event_ids
 },
 };
-static u64 intel_pmu_raw_event(u64 hw_event)
-{
-#define CORE_EVNTSEL_EVENT_MASK         0x000000FFULL
-#define CORE_EVNTSEL_UNIT_MASK          0x0000FF00ULL
-#define CORE_EVNTSEL_EDGE_MASK          0x00040000ULL
-#define CORE_EVNTSEL_INV_MASK           0x00800000ULL
-#define CORE_EVNTSEL_REG_MASK           0xFF000000ULL
-#define CORE_EVNTSEL_MASK               \
-        (INTEL_ARCH_EVTSEL_MASK |       \
-         INTEL_ARCH_UNIT_MASK   |       \
-         INTEL_ARCH_EDGE_MASK   |       \
-         INTEL_ARCH_INV_MASK    |       \
-         INTEL_ARCH_CNT_MASK)
-        return hw_event & CORE_EVNTSEL_MASK;
-}
-static void intel_pmu_enable_bts(u64 config)
-{
-        unsigned long debugctlmsr;
-        debugctlmsr = get_debugctlmsr();
-        debugctlmsr |= X86_DEBUGCTL_TR;
-        debugctlmsr |= X86_DEBUGCTL_BTS;
-        debugctlmsr |= X86_DEBUGCTL_BTINT;
-        if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-                debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
-        if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-                debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
-        update_debugctlmsr(debugctlmsr);
-}
-static void intel_pmu_disable_bts(void)
-{
-        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-        unsigned long debugctlmsr;
-        if (!cpuc->ds)
-                return;
-        debugctlmsr = get_debugctlmsr();
-        debugctlmsr &=
-                ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
-                  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
-        update_debugctlmsr(debugctlmsr);
-}
 static void intel_pmu_disable_all(void)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,12 +460,17 @@ static void intel_pmu_disable_all(void)
        if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
                intel_pmu_disable_bts();
+        intel_pmu_pebs_disable_all();
+        intel_pmu_lbr_disable_all();
 }
-static void intel_pmu_enable_all(void)
+static void intel_pmu_enable_all(int added)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        intel_pmu_pebs_enable_all();
+        intel_pmu_lbr_enable_all();
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
        if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -533,6 +484,42 @@ static void intel_pmu_enable_all(void)
        }
 }
+/*
+ * Workaround for:
+ *   Intel Errata AAK100 (model 26)
+ *   Intel Errata AAP53  (model 30)
+ *   Intel Errata BD53   (model 44)
+ *
+ * These chips need to be 'reset' when adding counters by programming
+ * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
+ * either in sequence on the same PMC or on different PMCs.
+ *
+ * @added: when non-zero the workaround sequence below runs before the
+ *         regular enable; when zero only intel_pmu_enable_all() runs.
+ *
+ * Sequence when @added is set:
+ *   1. write the three magic values to EVENTSEL0..EVENTSEL2
+ *   2. write 0x3, then 0x0, to MSR_CORE_PERF_GLOBAL_CTRL
+ *   3. for each of cpuc->events[0..2] that is set, call
+ *      __x86_pmu_enable_event() -- presumably to restore the event
+ *      selectors overwritten in step 1
+ * intel_pmu_enable_all(added) is always called last.
+ *
+ * NOTE(review): 0x3 sets only the two lowest GLOBAL_CTRL bits while
+ * three EVENTSEL MSRs are programmed; confirm against the errata
+ * text whether enabling only two of them is intended.
+ */
+static void intel_pmu_nhm_enable_all(int added)
+{
+        if (added) {
+                struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+                int i;
+                wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
+                wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
+                wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
+                wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
+                wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+                for (i = 0; i < 3; i++) { /* the same three indices whose EVENTSELs were written above */
+                        struct perf_event *event = cpuc->events[i];
+                        if (!event)
+                                continue;
+                        __x86_pmu_enable_event(&event->hw,
+                                               ARCH_PERFMON_EVENTSEL_ENABLE);
+                }
+        }
+        intel_pmu_enable_all(added);
+}
 static inline u64 intel_pmu_get_status(void)
 {
        u64 status;
@@ -547,8 +534,7 @@ static inline void intel_pmu_ack_status(u64 ack)
        wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
-static inline void
+static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
-intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 {
        int idx = hwc->idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, mask;
@@ -557,71 +543,10 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
-        (void)checking_wrmsrl(hwc->config_base, ctrl_val);
+        wrmsrl(hwc->config_base, ctrl_val);
-}
-static void intel_pmu_drain_bts_buffer(void)
-{
-        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-        struct debug_store *ds = cpuc->ds;
-        struct bts_record {
-                u64     from;
-                u64     to;
-                u64     flags;
-        };
-        struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
-        struct bts_record *at, *top;
-        struct perf_output_handle handle;
-        struct perf_event_header header;
-        struct perf_sample_data data;
-        struct pt_regs regs;
-        if (!event)
-                return;
-        if (!ds)
-                return;
-        at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
-        top = (struct bts_record *)(unsigned long)ds->bts_index;
-        if (top <= at)
-                return;
-        ds->bts_index = ds->bts_buffer_base;
-        perf_sample_data_init(&data, 0);
-        data.period     = event->hw.last_period;
-        regs.ip         = 0;
-        /*
-         * Prepare a generic sample, i.e. fill in the invariant fields.
-         * We will overwrite the from and to address before we output
-         * the sample.
-         */
-        perf_prepare_sample(&header, &data, event, &regs);
-        if (perf_output_begin(&handle, event,
-                              header.size * (top - at), 1, 1))
-                return;
-        for (; at < top; at++) {
-                data.ip         = at->from;
-                data.addr       = at->to;
-                perf_output_sample(&handle, &header, &data, event);
-        }
-        perf_output_end(&handle);
-        /* There's new data available. */
-        event->hw.interrupts++;
-        event->pending_kill = POLL_IN;
 }
-static inline void
+static void intel_pmu_disable_event(struct perf_event *event)
-intel_pmu_disable_event(struct perf_event *event)
 {
        struct hw_perf_event *hwc = &event->hw;
@@ -637,14 +562,15 @@ intel_pmu_disable_event(struct perf_event *event)
        }
        x86_pmu_disable_event(event);
+        if (unlikely(event->attr.precise_ip))
+                intel_pmu_pebs_disable(event);
 }
-static inline void
+static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
-intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 {
        int idx = hwc->idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, bits, mask;
-        int err;
        /*
         * Enable IRQ generation (0x8),
@@ -669,7 +595,7 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc)
        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        ctrl_val |= bits;
-        err = checking_wrmsrl(hwc->config_base, ctrl_val);
+        wrmsrl(hwc->config_base, ctrl_val);
 }
 static void intel_pmu_enable_event(struct perf_event *event)
@@ -689,7 +615,10 @@ static void intel_pmu_enable_event(struct perf_event *event)
                return;
        }
-        __x86_pmu_enable_event(hwc);
+        if (unlikely(event->attr.precise_ip))
+                intel_pmu_pebs_enable(event);
+        __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 /*
@@ -708,20 +637,20 @@ static void intel_pmu_reset(void)
        unsigned long flags;
        int idx;
-        if (!x86_pmu.num_events)
+        if (!x86_pmu.num_counters)
                return;
        local_irq_save(flags);
        printk("clearing PMU state on CPU#%d\n", smp_processor_id());
-        for (idx = 0; idx < x86_pmu.num_events; idx++) {
+        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
                checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
                checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
        }
-        for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+        for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
                checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-        }
        if (ds)
                ds->bts_index = ds->bts_buffer_base;
@@ -747,7 +676,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
        intel_pmu_drain_bts_buffer();
        status = intel_pmu_get_status();
        if (!status) {
-                intel_pmu_enable_all();
+                intel_pmu_enable_all(0);
                return 0;
        }
@@ -762,6 +691,15 @@ again:
        inc_irq_stat(apic_perf_irqs);
        ack = status;
+        intel_pmu_lbr_read();
+        /*
+         * PEBS overflow sets bit 62 in the global status register
+         */
+        if (__test_and_clear_bit(62, (unsigned long *)&status))
+                x86_pmu.drain_pebs(regs);
        for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
                struct perf_event *event = cpuc->events[bit];
@@ -787,26 +725,22 @@ again:
                goto again;
 done:
-        intel_pmu_enable_all();
+        intel_pmu_enable_all(0);
        return 1;
 }
-static struct event_constraint bts_constraint =
-        EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
 static struct event_constraint *
-intel_special_constraints(struct perf_event *event)
+intel_bts_constraints(struct perf_event *event)
 {
-        unsigned int hw_event;
+        struct hw_perf_event *hwc = &event->hw;
+        unsigned int hw_event, bts_event;
-        hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
-        if (unlikely((hw_event ==
+        hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
-                      x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+        bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
-                     (event->hw.sample_period == 1))) {
+        if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
                return &bts_constraint;
-        }
        return NULL;
 }
@@ -815,24 +749,53 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 {
        struct event_constraint *c;
-        c = intel_special_constraints(event);
+        c = intel_bts_constraints(event);
+        if (c)
+                return c;
+        c = intel_pebs_constraints(event);
        if (c)
                return c;
        return x86_get_event_constraints(cpuc, event);
 }
-static __initconst struct x86_pmu core_pmu = {
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+        int ret = x86_pmu_hw_config(event); /* common x86 configuration runs first; its error is returned as-is */
+        if (ret)
+                return ret;
+        if (event->attr.type != PERF_TYPE_RAW) /* the checks below apply to raw events only */
+                return 0;
+        if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) /* ANY bit not requested: nothing more to do */
+                return 0;
+        if (x86_pmu.version < 3) /* ANY bit is rejected on PMU versions below 3 */
+                return -EINVAL;
+        if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) /* paranoid setting demands CAP_SYS_ADMIN for the ANY bit */
+                return -EACCES;
+        event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; /* all checks passed: carry the ANY bit into hw.config */
+        return 0;
+}
+static __initconst const struct x86_pmu core_pmu = {
        .name                   = "core",
        .handle_irq             = x86_pmu_handle_irq,
        .disable_all            = x86_pmu_disable_all,
        .enable_all             = x86_pmu_enable_all,
        .enable                 = x86_pmu_enable_event,
        .disable                = x86_pmu_disable_event,
+        .hw_config              = x86_pmu_hw_config,
+        .schedule_events        = x86_schedule_events,
        .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
        .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
        .event_map              = intel_pmu_event_map,
-        .raw_event              = intel_pmu_raw_event,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
        .apic                   = 1,
        /*
@@ -845,17 +808,32 @@ static __initconst struct x86_pmu core_pmu = {
        .event_constraints      = intel_core_event_constraints,
 };
-static __initconst struct x86_pmu intel_pmu = {
+static void intel_pmu_cpu_starting(int cpu)
+{
+        init_debug_store_on_cpu(cpu); /* debug-store setup for @cpu */
+        /*
+         * Deal with CPUs that don't clear their LBRs on power-up.
+         *
+         * NOTE(review): intel_pmu_lbr_reset() takes no cpu argument;
+         * presumably it acts on the CPU executing this callback --
+         * confirm that this runs on @cpu itself.
+         */
+        intel_pmu_lbr_reset();
+}
+static void intel_pmu_cpu_dying(int cpu)
+{
+        fini_debug_store_on_cpu(cpu); /* presumably undoes init_debug_store_on_cpu() from intel_pmu_cpu_starting(); no LBR handling here */
+}
+static __initconst const struct x86_pmu intel_pmu = {
        .name                   = "Intel",
        .handle_irq             = intel_pmu_handle_irq,
        .disable_all            = intel_pmu_disable_all,
        .enable_all             = intel_pmu_enable_all,
        .enable                 = intel_pmu_enable_event,
        .disable                = intel_pmu_disable_event,
+        .hw_config              = intel_pmu_hw_config,
+        .schedule_events        = x86_schedule_events,
        .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
        .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
        .event_map              = intel_pmu_event_map,
-        .raw_event              = intel_pmu_raw_event,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
        .apic                   = 1,
        /*
@@ -864,14 +842,38 @@ static __initconst struct x86_pmu intel_pmu = {
         * the generic event period:
         */
        .max_period             = (1ULL << 31) - 1,
-        .enable_bts             = intel_pmu_enable_bts,
-        .disable_bts            = intel_pmu_disable_bts,
        .get_event_constraints  = intel_get_event_constraints,
-        .cpu_starting           = init_debug_store_on_cpu,
+        .cpu_starting           = intel_pmu_cpu_starting,
-        .cpu_dying              = fini_debug_store_on_cpu,
+        .cpu_dying              = intel_pmu_cpu_dying,
 };
+/*
+ * Quirk handler installed through x86_pmu.quirks for model 15 parts:
+ * turns PEBS off again (flag and constraint table) and logs a warning.
+ */
+static void intel_clovertown_quirks(void)
+{
+        /*
+         * PEBS is unreliable due to:
+         *
+         *   AJ67  - PEBS may experience CPL leaks
+         *   AJ68  - PEBS PMI may be delayed by one event
+         *   AJ69  - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
+         *   AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
+         *
+         * AJ67 could be worked around by restricting the OS/USR flags.
+         * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
+         *
+         * AJ106 could possibly be worked around by not allowing LBR
+         *       usage from PEBS, including the fixup.
+         * AJ68  could possibly be worked around by always programming
+         *       a pebs_event_reset[0] value and coping with the lost events.
+         *
+         * But taken together it might just make sense to not enable PEBS on
+         * these chips.
+         */
+        printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
+        x86_pmu.pebs = 0;
+        x86_pmu.pebs_constraints = NULL;
+}
 static __init int intel_pmu_init(void)
 {
        union cpuid10_edx edx;
@@ -881,12 +883,13 @@ static __init int intel_pmu_init(void)
        int version;
        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-                /* check for P6 processor family */
+                switch (boot_cpu_data.x86) {
-           if (boot_cpu_data.x86 == 6) {
+                case 0x6:
-                return p6_pmu_init();
+                        return p6_pmu_init();
-           } else {
+                case 0xf:
+                        return p4_pmu_init();
+                }
                return -ENODEV;
-           }
        }
        /*
@@ -904,16 +907,28 @@ static __init int intel_pmu_init(void)
                x86_pmu = intel_pmu;
        x86_pmu.version                 = version;
-        x86_pmu.num_events              = eax.split.num_events;
+        x86_pmu.num_counters            = eax.split.num_counters;
-        x86_pmu.event_bits              = eax.split.bit_width;
+        x86_pmu.cntval_bits             = eax.split.bit_width;
-        x86_pmu.event_mask              = (1ULL << eax.split.bit_width) - 1;
+        x86_pmu.cntval_mask             = (1ULL << eax.split.bit_width) - 1;
        /*
         * Quirk: v2 perfmon does not report fixed-purpose events, so
         * assume at least 3 events:
         */
        if (version > 1)
-                x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+                x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+        /*
+         * v2 and above have a perf capabilities MSR
+         */
+        if (version > 1) {
+                u64 capabilities;
+                rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+                x86_pmu.intel_cap.capabilities = capabilities;
+        }
+        intel_ds_init();
        /*
         * Install the hw-cache-events table:
@@ -924,12 +939,15 @@ static __init int intel_pmu_init(void)
                break;
        case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+                x86_pmu.quirks = intel_clovertown_quirks;
        case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
        case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
        case 29: /* six-core 45 nm xeon "Dunnington" */
                memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
                       sizeof(hw_cache_event_ids));
+                intel_pmu_lbr_init_core();
                x86_pmu.event_constraints = intel_core2_event_constraints;
                pr_cont("Core2 events, ");
                break;
@@ -940,13 +958,19 @@ static __init int intel_pmu_init(void)
                memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
                       sizeof(hw_cache_event_ids));
+                intel_pmu_lbr_init_nhm();
                x86_pmu.event_constraints = intel_nehalem_event_constraints;
-                pr_cont("Nehalem/Corei7 events, ");
+                x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+                pr_cont("Nehalem events, ");
                break;
        case 28: /* Atom */
                memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
                       sizeof(hw_cache_event_ids));
+                intel_pmu_lbr_init_atom();
                x86_pmu.event_constraints = intel_gen_event_constraints;
                pr_cont("Atom events, ");
                break;
@@ -956,7 +980,10 @@ static __init int intel_pmu_init(void)
                memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
                       sizeof(hw_cache_event_ids));
+                intel_pmu_lbr_init_nhm();
                x86_pmu.event_constraints = intel_westmere_event_constraints;
+                x86_pmu.enable_all = intel_pmu_nhm_enable_all;
                pr_cont("Westmere events, ");
                break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 000000000000..18018d1311cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,641 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS         4
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE         24
+/* Per-CPU BTS buffer: 16 pages (PAGE_SIZE << 4) */
+#define BTS_BUFFER_SIZE         (PAGE_SIZE << 4)
+/* Per-CPU PEBS buffer: a single page */
+#define PEBS_BUFFER_SIZE        PAGE_SIZE
+/*
+ * pebs_record_32 for p4 and core not supported
+struct pebs_record_32 {
+        u32 flags, ip;
+        u32 ax, bx, cx, dx;
+        u32 si, di, bp, sp;
+};
+ */
+/*
+ * PEBS record layout used for pebs_format 0 (see intel_ds_init()):
+ * flags, ip and the general purpose registers, all as 64bit fields.
+ */
+struct pebs_record_core {
+        u64 flags, ip;
+        u64 ax, bx, cx, dx;
+        u64 si, di, bp, sp;
+        u64 r8,  r9,  r10, r11;
+        u64 r12, r13, r14, r15;
+};
+/*
+ * PEBS record layout used for pebs_format 1 (see intel_ds_init()).
+ * Starts with the same fields as pebs_record_core and appends four more;
+ * the drain code reads 'status' as a bitmask of counter indices.
+ */
+struct pebs_record_nhm {
+        u64 flags, ip;
+        u64 ax, bx, cx, dx;
+        u64 si, di, bp, sp;
+        u64 r8,  r9,  r10, r11;
+        u64 r12, r13, r14, r15;
+        u64 status, dla, dse, lat;
+};
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ *
+ * One instance is allocated per CPU by reserve_ds_buffers() and its
+ * address is written to MSR_IA32_DS_AREA by init_debug_store_on_cpu().
+ * The *_buffer_base, *_index, *_absolute_maximum and *_interrupt_threshold
+ * fields hold addresses inside the respective BTS / PEBS buffer.
+ */
+struct debug_store {
+        u64     bts_buffer_base;
+        u64     bts_index;
+        u64     bts_absolute_maximum;
+        u64     bts_interrupt_threshold;
+        u64     pebs_buffer_base;
+        u64     pebs_index;
+        u64     pebs_absolute_maximum;
+        u64     pebs_interrupt_threshold;
+        u64     pebs_event_reset[MAX_PEBS_EVENTS];
+};
+/*
+ * Point MSR_IA32_DS_AREA of @cpu at that CPU's debug_store.
+ * Does nothing when no debug_store has been allocated for @cpu.
+ */
+static void init_debug_store_on_cpu(int cpu)
+{
+        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+        if (!ds)
+                return;
+        /* the address is passed as separate low and high 32bit halves */
+        wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+                     (u32)((u64)(unsigned long)ds),
+                     (u32)((u64)(unsigned long)ds >> 32));
+}
+/*
+ * Clear MSR_IA32_DS_AREA on @cpu.
+ * Does nothing when no debug_store has been allocated for @cpu.
+ */
+static void fini_debug_store_on_cpu(int cpu)
+{
+        if (!per_cpu(cpu_hw_events, cpu).ds)
+                return;
+        wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+/*
+ * Undo reserve_ds_buffers(): clear MSR_IA32_DS_AREA on all online CPUs,
+ * then free the debug_store and its PEBS/BTS buffers of every possible CPU.
+ * No-op when neither BTS nor PEBS is enabled in x86_pmu.
+ */
+static void release_ds_buffers(void)
+{
+        int cpu;
+        if (!x86_pmu.bts && !x86_pmu.pebs)
+                return;
+        get_online_cpus();
+        /* detach the hardware before the memory goes away */
+        for_each_online_cpu(cpu)
+                fini_debug_store_on_cpu(cpu);
+        for_each_possible_cpu(cpu) {
+                struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+                if (!ds)
+                        continue;
+                per_cpu(cpu_hw_events, cpu).ds = NULL;
+                kfree((void *)(unsigned long)ds->pebs_buffer_base);
+                kfree((void *)(unsigned long)ds->bts_buffer_base);
+                kfree(ds);
+        }
+        put_online_cpus();
+}
+/*
+ * Allocate a debug_store plus a BTS and/or PEBS buffer for every possible
+ * CPU and program MSR_IA32_DS_AREA on all online CPUs.
+ *
+ * Returns 0 on success, or when neither BTS nor PEBS is enabled in x86_pmu.
+ * Returns -ENOMEM if any allocation fails; in that case everything that was
+ * allocated so far is released again via release_ds_buffers().
+ */
+static int reserve_ds_buffers(void)
+{
+        int cpu, err = 0;
+        if (!x86_pmu.bts && !x86_pmu.pebs)
+                return 0;
+        get_online_cpus();
+        for_each_possible_cpu(cpu) {
+                struct debug_store *ds;
+                void *buffer;
+                int max, thresh;
+                /* every 'break' below leaves the loop with this error */
+                err = -ENOMEM;
+                ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+                if (unlikely(!ds))
+                        break;
+                /* publish right away so the error path can find and free it */
+                per_cpu(cpu_hw_events, cpu).ds = ds;
+                if (x86_pmu.bts) {
+                        buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+                        if (unlikely(!buffer))
+                                break;
+                        /* max: records that fit; threshold is max/16 records before the end */
+                        max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+                        thresh = max / 16;
+                        ds->bts_buffer_base = (u64)(unsigned long)buffer;
+                        ds->bts_index = ds->bts_buffer_base;
+                        ds->bts_absolute_maximum = ds->bts_buffer_base +
+                                max * BTS_RECORD_SIZE;
+                        ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+                                thresh * BTS_RECORD_SIZE;
+                }
+                if (x86_pmu.pebs) {
+                        buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
+                        if (unlikely(!buffer))
+                                break;
+                        max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
+                        ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+                        ds->pebs_index = ds->pebs_buffer_base;
+                        ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+                                max * x86_pmu.pebs_record_size;
+                        /*
+                         * Always use single record PEBS
+                         */
+                        ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
+                                x86_pmu.pebs_record_size;
+                }
+                /* this CPU is fully set up */
+                err = 0;
+        }
+        if (err)
+                release_ds_buffers();
+        else {
+                for_each_online_cpu(cpu)
+                        init_debug_store_on_cpu(cpu);
+        }
+        put_online_cpus();
+        return err;
+}
+/*
+ * BTS
+ */
+/* Constraint whose counter mask contains only the X86_PMC_IDX_FIXED_BTS slot. */
+static struct event_constraint bts_constraint =
+        EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
+/*
+ * Turn on branch trace store in the debug control MSR of this CPU.
+ * @config: event config; its OS and USR bits select which privilege
+ *          levels are traced.
+ */
+static void intel_pmu_enable_bts(u64 config)
+{
+        unsigned long debugctlmsr;
+        debugctlmsr = get_debugctlmsr();
+        debugctlmsr |= DEBUGCTLMSR_TR;
+        debugctlmsr |= DEBUGCTLMSR_BTS;
+        debugctlmsr |= DEBUGCTLMSR_BTINT;
+        /* the hardware bits are "off" switches, hence the inverted tests */
+        if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+                debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
+        if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+                debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
+        update_debugctlmsr(debugctlmsr);
+}
+/*
+ * Clear every debug control bit that intel_pmu_enable_bts() may have set.
+ * Does nothing when this CPU has no debug_store.
+ */
+static void intel_pmu_disable_bts(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        unsigned long debugctlmsr;
+        if (!cpuc->ds)
+                return;
+        debugctlmsr = get_debugctlmsr();
+        debugctlmsr &=
+                ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
+                  DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
+        update_debugctlmsr(debugctlmsr);
+}
+/*
+ * Flush the branch records collected in this CPU's BTS buffer into the
+ * perf output of the event in the X86_PMC_IDX_FIXED_BTS slot. Each record
+ * becomes one sample with ip = branch source and addr = branch target.
+ * Returns early if there is no BTS event, no debug_store or no record.
+ */
+static void intel_pmu_drain_bts_buffer(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        struct debug_store *ds = cpuc->ds;
+        struct bts_record {
+                u64     from;
+                u64     to;
+                u64     flags;
+        };
+        struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+        struct bts_record *at, *top;
+        struct perf_output_handle handle;
+        struct perf_event_header header;
+        struct perf_sample_data data;
+        struct pt_regs regs;
+        if (!event)
+                return;
+        if (!ds)
+                return;
+        at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+        top = (struct bts_record *)(unsigned long)ds->bts_index;
+        if (top <= at)
+                return;
+        /* rewind the write index; [at, top) is processed below */
+        ds->bts_index = ds->bts_buffer_base;
+        perf_sample_data_init(&data, 0);
+        data.period = event->hw.last_period;
+        regs.ip     = 0;
+        /*
+         * Prepare a generic sample, i.e. fill in the invariant fields.
+         * We will overwrite the from and to address before we output
+         * the sample.
+         */
+        perf_prepare_sample(&header, &data, event, &regs);
+        /* reserve output space for all records in one go */
+        if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+                return;
+        for (; at < top; at++) {
+                data.ip         = at->from;
+                data.addr       = at->to;
+                perf_output_sample(&handle, &header, &data, event);
+        }
+        perf_output_end(&handle);
+        /* There's new data available. */
+        event->hw.interrupts++;
+        event->pending_kill = POLL_IN;
+}
+/*
+ * PEBS
+ */
+/*
+ * PEBS constraint table installed for pebs_format 0 (see intel_ds_init()).
+ * Every entry uses counter mask 0x1.
+ */
+static struct event_constraint intel_core_pebs_events[] = {
+        PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
+        PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+        PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+        PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
+        PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
+        PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+        PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
+        PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+        PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
+        EVENT_CONSTRAINT_END
+};
+/*
+ * PEBS constraint table installed for pebs_format 1 (see intel_ds_init()).
+ * Every entry uses counter mask 0xf.
+ */
+static struct event_constraint intel_nehalem_pebs_events[] = {
+        PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
+        PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
+        PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
+        PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETIRED.ANY */
+        PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
+        PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+        PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
+        PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+        PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
+        EVENT_CONSTRAINT_END
+};
+/*
+ * Look up the PEBS constraint for @event.
+ *
+ * Returns NULL when the event did not ask for precise_ip, the matching
+ * entry of x86_pmu.pebs_constraints when there is one, and
+ * &emptyconstraint otherwise (including when no table is installed).
+ */
+static struct event_constraint *
+intel_pebs_constraints(struct perf_event *event)
+{
+        struct event_constraint *c;
+        if (!event->attr.precise_ip)
+                return NULL;
+        if (x86_pmu.pebs_constraints) {
+                for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+                        if ((event->hw.config & c->cmask) == c->code)
+                                return c;
+                }
+        }
+        return &emptyconstraint;
+}
+/*
+ * Mark @event's counter as PEBS enabled in the per-CPU software state.
+ * MSR_IA32_PEBS_ENABLE is not written here; intel_pmu_pebs_enable_all()
+ * does that. Warns once if called while cpuc->enabled is set.
+ */
+static void intel_pmu_pebs_enable(struct perf_event *event)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        struct hw_perf_event *hwc = &event->hw;
+        /* drop the counter's own interrupt bit while it is under PEBS */
+        hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+        cpuc->pebs_enabled |= 1ULL << hwc->idx;
+        WARN_ON_ONCE(cpuc->enabled);
+        /* same condition as in intel_pmu_pebs_disable(), keeps LBR users balanced */
+        if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+                intel_pmu_lbr_enable(event);
+}
+/*
+ * Counterpart of intel_pmu_pebs_enable(): clear @event's bit in the
+ * per-CPU PEBS mask, restore the interrupt bit in its config and drop
+ * the LBR reference taken on enable.
+ */
+static void intel_pmu_pebs_disable(struct perf_event *event)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        struct hw_perf_event *hwc = &event->hw;
+        cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+        /* only touch the MSR while cpuc->enabled is set */
+        if (cpuc->enabled)
+                wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+        hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+        if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+                intel_pmu_lbr_disable(event);
+}
+/*
+ * Write this CPU's PEBS enable mask to MSR_IA32_PEBS_ENABLE.
+ * Skipped when no counter has PEBS enabled.
+ */
+static void intel_pmu_pebs_enable_all(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        if (cpuc->pebs_enabled)
+                wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+/*
+ * Write 0 to MSR_IA32_PEBS_ENABLE; the software mask in
+ * cpuc->pebs_enabled is left untouched. Skipped when the mask is empty.
+ */
+static void intel_pmu_pebs_disable_all(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        if (cpuc->pebs_enabled)
+                wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+#include <asm/insn.h>
+/*
+ * Classify @ip as a kernel address: above PAGE_OFFSET on 32bit,
+ * top bit set (negative when read as signed) on 64bit.
+ */
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+        return ip > PAGE_OFFSET;
+#else
+        return (long)ip < 0;
+#endif
+}
+/*
+ * Rewind the sampled IP in @regs to the instruction preceding it.
+ *
+ * The most recent LBR entry gives the start of the current basic block
+ * (its 'to' address); instructions are decoded from there until the
+ * sampled IP is reached, and regs->ip is set to the instruction just
+ * before it. If the sampled IP is the branch target itself, regs->ip
+ * becomes the branch source.
+ *
+ * Returns 1 when regs->ip can be considered exact (including the case
+ * where the PEBS assist is fault like and no fixup is needed), 0 when
+ * the IP could not be fixed up.
+ */
+static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        unsigned long from = cpuc->lbr_entries[0].from;
+        unsigned long old_to, to = cpuc->lbr_entries[0].to;
+        unsigned long ip = regs->ip;
+        /*
+         * We don't need to fixup if the PEBS assist is fault like
+         */
+        if (!x86_pmu.intel_cap.pebs_trap)
+                return 1;
+        /*
+         * No LBR entry, no basic block, no rewinding
+         */
+        if (!cpuc->lbr_stack.nr || !from || !to)
+                return 0;
+        /*
+         * Basic blocks should never cross user/kernel boundaries
+         */
+        if (kernel_ip(ip) != kernel_ip(to))
+                return 0;
+        /*
+         * unsigned math, either ip is before the start (impossible) or
+         * the basic block is larger than 1 page (sanity)
+         */
+        if ((ip - to) > PAGE_SIZE)
+                return 0;
+        /*
+         * We sampled a branch insn, rewind using the LBR stack
+         */
+        if (ip == to) {
+                regs->ip = from;
+                return 1;
+        }
+        do {
+                struct insn insn;
+                u8 buf[MAX_INSN_SIZE];
+                void *kaddr;
+                old_to = to;
+                if (!kernel_ip(ip)) {
+                        int bytes, size = MAX_INSN_SIZE;
+                        /* user text has to be copied before it can be decoded */
+                        bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+                        if (bytes != size)
+                                return 0;
+                        kaddr = buf;
+                } else
+                        kaddr = (void *)to;
+                kernel_insn_init(&insn, kaddr);
+                insn_get_length(&insn);
+                /*
+                 * If the decoder could not determine a length, 'to' would
+                 * never advance and this loop would never terminate.
+                 * Treat that as a failed fixup.
+                 */
+                if (!insn.length)
+                        return 0;
+                to += insn.length;
+        } while (to < ip);
+        if (to == ip) {
+                regs->ip = old_to;
+                return 1;
+        }
+        /*
+         * Even though we decoded the basic block, the instruction stream
+         * never matched the given IP, either the TO or the IP got corrupted.
+         */
+        return 0;
+}
+static int intel_pmu_save_and_restart(struct perf_event *event);
+/*
+ * Deliver one PEBS record as an overflow sample for @event.
+ * @iregs:  interrupt register set, used as the base of the sample regs
+ * @__pebs: the record; only the fields common to both formats are read
+ *
+ * Nothing is reported when intel_pmu_save_and_restart() returns 0.
+ * The event is stopped when perf_event_overflow() returns non-zero.
+ */
+static void __intel_pmu_pebs_event(struct perf_event *event,
+                                   struct pt_regs *iregs, void *__pebs)
+{
+        /*
+         * We cast to pebs_record_core since that is a subset of
+         * both formats and we don't use the other fields in this
+         * routine.
+         */
+        struct pebs_record_core *pebs = __pebs;
+        struct perf_sample_data data;
+        struct pt_regs regs;
+        if (!intel_pmu_save_and_restart(event))
+                return;
+        perf_sample_data_init(&data, 0);
+        data.period = event->hw.last_period;
+        /*
+         * We use the interrupt regs as a base because the PEBS record
+         * does not contain a full regs set, specifically it seems to
+         * lack segment descriptors, which get used by things like
+         * user_mode().
+         *
+         * In the simple case fix up only the IP and BP,SP regs, for
+         * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+         * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+         */
+        regs = *iregs;
+        regs.ip = pebs->ip;
+        regs.bp = pebs->bp;
+        regs.sp = pebs->sp;
+        /* flag the sample as exact only if it was requested and the fixup worked */
+        if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
+                regs.flags |= PERF_EFLAGS_EXACT;
+        else
+                regs.flags &= ~PERF_EFLAGS_EXACT;
+        if (perf_event_overflow(event, 1, &data, &regs))
+                x86_pmu_stop(event);
+}
+/*
+ * Drain handler for pebs_format 0 (installed as x86_pmu.drain_pebs).
+ * Only counter 0 is considered. The buffer index is always rewound;
+ * the last record is then reported if counter 0 is active and its
+ * event asked for precise_ip.
+ */
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        struct debug_store *ds = cpuc->ds;
+        struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+        struct pebs_record_core *at, *top;
+        int n;
+        if (!ds || !x86_pmu.pebs)
+                return;
+        at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+        top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+        /*
+         * Whatever else happens, drain the thing
+         */
+        ds->pebs_index = ds->pebs_buffer_base;
+        if (!test_bit(0, cpuc->active_mask))
+                return;
+        WARN_ON_ONCE(!event);
+        if (!event->attr.precise_ip)
+                return;
+        /* number of records that were in the buffer */
+        n = top - at;
+        if (n <= 0)
+                return;
+        /*
+         * Should not happen, we program the threshold at 1 and do not
+         * set a reset value.
+         */
+        WARN_ON_ONCE(n > 1);
+        /* report the most recent record only */
+        at += n - 1;
+        __intel_pmu_pebs_event(event, iregs, at);
+}
+/*
+ * Drain handler for pebs_format 1 (installed as x86_pmu.drain_pebs).
+ * Rewinds the buffer index, then walks all records. For each record the
+ * first counter bit in its status field that is active, belongs to a
+ * precise_ip event and has not been reported yet in this drain is picked,
+ * and the record is reported for that event.
+ */
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        struct debug_store *ds = cpuc->ds;
+        struct pebs_record_nhm *at, *top;
+        struct perf_event *event = NULL;
+        /* counters already reported during this drain */
+        u64 status = 0;
+        int bit, n;
+        if (!ds || !x86_pmu.pebs)
+                return;
+        at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+        top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+        ds->pebs_index = ds->pebs_buffer_base;
+        n = top - at;
+        if (n <= 0)
+                return;
+        /*
+         * Should not happen, we program the threshold at 1 and do not
+         * set a reset value.
+         */
+        WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
+        for ( ; at < top; at++) {
+                for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
+                        event = cpuc->events[bit];
+                        if (!test_bit(bit, cpuc->active_mask))
+                                continue;
+                        WARN_ON_ONCE(!event);
+                        if (!event->attr.precise_ip)
+                                continue;
+                        if (__test_and_set_bit(bit, (unsigned long *)&status))
+                                continue;
+                        break;
+                }
+                /* bit >= MAX_PEBS_EVENTS: the inner loop found no usable counter */
+                if (!event || bit >= MAX_PEBS_EVENTS)
+                        continue;
+                __intel_pmu_pebs_event(event, iregs, at);
+        }
+}
+/*
+ * BTS, PEBS probe and setup
+ */
+/*
+ * Probe BTS and PEBS support from the boot CPU feature flags and, for
+ * PEBS, select record size, drain handler and constraint table from
+ * x86_pmu.intel_cap.pebs_format. Unknown formats switch PEBS off again.
+ * Reads x86_pmu.intel_cap, so that must have been filled in beforehand.
+ */
+static void intel_ds_init(void)
+{
+        /*
+         * No support for 32bit formats
+         */
+        if (!boot_cpu_has(X86_FEATURE_DTES64))
+                return;
+        x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
+        x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+        if (x86_pmu.pebs) {
+                /* printed after the format number: '+' if pebs_trap is set, '-' if not */
+                char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
+                int format = x86_pmu.intel_cap.pebs_format;
+                switch (format) {
+                case 0:
+                        printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
+                        x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+                        x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+                        x86_pmu.pebs_constraints = intel_core_pebs_events;
+                        break;
+                case 1:
+                        printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
+                        x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
+                        x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+                        x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
+                        break;
+                default:
+                        printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
+                        x86_pmu.pebs = 0;
+                        break;
+                }
+        }
+}
+#else /* CONFIG_CPU_SUP_INTEL */
+/* Stub for builds without CONFIG_CPU_SUP_INTEL: nothing to reserve, always succeeds. */
+static int reserve_ds_buffers(void)
+{
+        return 0;
+}
+/* Stub for builds without CONFIG_CPU_SUP_INTEL: nothing to release. */
+static void release_ds_buffers(void)
+{
+}
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 000000000000..d202c1bece1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,218 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+/*
+ * Values compared against x86_pmu.intel_cap.lbr_format to choose
+ * between the 32bit and 64bit LBR read/reset paths below.
+ */
+enum {
+        LBR_FORMAT_32           = 0x00,
+        LBR_FORMAT_LIP          = 0x01,
+        LBR_FORMAT_EIP          = 0x02,
+        LBR_FORMAT_EIP_FLAGS    = 0x03,
+};
+/*
+ * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
+ * otherwise it becomes near impossible to get a reliable stack.
+ *
+ * Sets both the LBR and the FREEZE_LBRS_ON_PMI bit in the debug
+ * control MSR of this CPU.
+ */
+static void __intel_pmu_lbr_enable(void)
+{
+        u64 debugctl;
+        rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+        debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+        wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+/*
+ * Clear the LBR and FREEZE_LBRS_ON_PMI bits in the debug control MSR
+ * of this CPU; all other bits are preserved.
+ */
+static void __intel_pmu_lbr_disable(void)
+{
+        u64 debugctl;
+        rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+        debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+        wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+/*
+ * Zero all LBR entries in the 32bit format, where only the 'from' MSRs
+ * are written (one MSR carries both addresses, see intel_pmu_lbr_read_32()).
+ */
+static void intel_pmu_lbr_reset_32(void)
+{
+        int i;
+        for (i = 0; i < x86_pmu.lbr_nr; i++)
+                wrmsrl(x86_pmu.lbr_from + i, 0);
+}
+/* Zero all LBR entries in the 64bit formats: separate 'from' and 'to' MSRs. */
+static void intel_pmu_lbr_reset_64(void)
+{
+        int i;
+        for (i = 0; i < x86_pmu.lbr_nr; i++) {
+                wrmsrl(x86_pmu.lbr_from + i, 0);
+                wrmsrl(x86_pmu.lbr_to   + i, 0);
+        }
+}
+/*
+ * Clear the LBR stack of this CPU using the reset routine matching the
+ * LBR format. No-op when no LBR entries are configured (lbr_nr == 0).
+ */
+static void intel_pmu_lbr_reset(void)
+{
+        if (!x86_pmu.lbr_nr)
+                return;
+        if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+                intel_pmu_lbr_reset_32();
+        else
+                intel_pmu_lbr_reset_64();
+}
+/*
+ * Register @event as an LBR user on this CPU (software state only; the
+ * MSR is written by intel_pmu_lbr_enable_all()). Warns once if called
+ * while cpuc->enabled is set. No-op when lbr_nr == 0.
+ */
+static void intel_pmu_lbr_enable(struct perf_event *event)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        if (!x86_pmu.lbr_nr)
+                return;
+        WARN_ON_ONCE(cpuc->enabled);
+        /*
+         * Reset the LBR stack if we changed task context to
+         * avoid data leaks.
+         */
+        if (event->ctx->task && cpuc->lbr_context != event->ctx) {
+                intel_pmu_lbr_reset();
+                cpuc->lbr_context = event->ctx;
+        }
+        cpuc->lbr_users++;
+}
+/*
+ * Drop one LBR user on this CPU. When the last user goes away while
+ * cpuc->enabled is set, LBR recording is switched off in hardware.
+ * No-op when lbr_nr == 0.
+ */
+static void intel_pmu_lbr_disable(struct perf_event *event)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        if (!x86_pmu.lbr_nr)
+                return;
+        cpuc->lbr_users--;
+        /* a negative count means enable/disable calls were unbalanced */
+        WARN_ON_ONCE(cpuc->lbr_users < 0);
+        if (cpuc->enabled && !cpuc->lbr_users)
+                __intel_pmu_lbr_disable();
+}
+/* Switch LBR recording on in hardware if this CPU has any LBR user. */
+static void intel_pmu_lbr_enable_all(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        if (cpuc->lbr_users)
+                __intel_pmu_lbr_enable();
+}
+/*
+ * Switch LBR recording off in hardware if this CPU has any LBR user;
+ * the user count itself is not changed.
+ */
+static void intel_pmu_lbr_disable_all(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        if (cpuc->lbr_users)
+                __intel_pmu_lbr_disable();
+}
+/* Read and return the value of the LBR top-of-stack MSR (x86_pmu.lbr_tos). */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+        u64 tos;
+        rdmsrl(x86_pmu.lbr_tos, tos);
+        return tos;
+}
+/*
+ * Copy the LBR stack into cpuc->lbr_entries for the 32bit format, where a
+ * single 64bit MSR holds both the 'from' and the 'to' address as two u32.
+ * Entry 0 is the one at top-of-stack, later entries are older. Flags are
+ * always 0 in this format.
+ */
+static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+{
+        /* index wrap-around mask; relies on lbr_nr being a power of two */
+        unsigned long mask = x86_pmu.lbr_nr - 1;
+        u64 tos = intel_pmu_lbr_tos();
+        int i;
+        for (i = 0; i < x86_pmu.lbr_nr; i++) {
+                /* walk backwards from the top of stack */
+                unsigned long lbr_idx = (tos - i) & mask;
+                union {
+                        struct {
+                                u32 from;
+                                u32 to;
+                        };
+                        u64     lbr;
+                } msr_lastbranch;
+                rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
+                cpuc->lbr_entries[i].from  = msr_lastbranch.from;
+                cpuc->lbr_entries[i].to    = msr_lastbranch.to;
+                cpuc->lbr_entries[i].flags = 0;
+        }
+        cpuc->lbr_stack.nr = i;
+}
+/* Bit 63 of the 'from' value: mispredict flag in LBR_FORMAT_EIP_FLAGS */
+#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+/*
+ * Due to lack of segmentation in Linux the effective address (offset)
+ * is the same as the linear address, allowing us to merge the LIP and EIP
+ * LBR formats.
+ *
+ * Copies the LBR stack into cpuc->lbr_entries, entry 0 being the one at
+ * top-of-stack. In LBR_FORMAT_EIP_FLAGS the mispredict bit is moved from
+ * the 'from' address into the entry's flags.
+ */
+static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+{
+        /* index wrap-around mask; relies on lbr_nr being a power of two */
+        unsigned long mask = x86_pmu.lbr_nr - 1;
+        int lbr_format = x86_pmu.intel_cap.lbr_format;
+        u64 tos = intel_pmu_lbr_tos();
+        int i;
+        for (i = 0; i < x86_pmu.lbr_nr; i++) {
+                /* walk backwards from the top of stack */
+                unsigned long lbr_idx = (tos - i) & mask;
+                u64 from, to, flags = 0;
+                rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
+                rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
+                if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
+                        flags = !!(from & LBR_FROM_FLAG_MISPRED);
+                        /* drop the flag bit by sign-extending from bit 62 */
+                        from = (u64)((((s64)from) << 1) >> 1);
+                }
+                cpuc->lbr_entries[i].from  = from;
+                cpuc->lbr_entries[i].to    = to;
+                cpuc->lbr_entries[i].flags = flags;
+        }
+        cpuc->lbr_stack.nr = i;
+}
+/*
+ * Snapshot the LBR stack of this CPU into its cpu_hw_events, using the
+ * reader matching the LBR format. Skipped when there is no LBR user.
+ */
+static void intel_pmu_lbr_read(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        if (!cpuc->lbr_users)
+                return;
+        if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+                intel_pmu_lbr_read_32(cpuc);
+        else
+                intel_pmu_lbr_read_64(cpuc);
+}
+/*
+ * LBR setup used by the Core2 branch of intel_pmu_init(): 4 entries,
+ * plus the MSR numbers of top-of-stack and of the first from/to entry.
+ */
+static void intel_pmu_lbr_init_core(void)
+{
+        x86_pmu.lbr_nr     = 4;
+        x86_pmu.lbr_tos    = 0x01c9;
+        x86_pmu.lbr_from   = 0x40;
+        x86_pmu.lbr_to     = 0x60;
+}
+/*
+ * LBR setup used by the Nehalem and Westmere branches of intel_pmu_init():
+ * 16 entries, plus the MSR numbers of top-of-stack and of the first
+ * from/to entry.
+ */
+static void intel_pmu_lbr_init_nhm(void)
+{
+        x86_pmu.lbr_nr     = 16;
+        x86_pmu.lbr_tos    = 0x01c9;
+        x86_pmu.lbr_from   = 0x680;
+        x86_pmu.lbr_to     = 0x6c0;
+}
+/*
+ * LBR setup used by the Atom branch of intel_pmu_init(): 8 entries,
+ * plus the MSR numbers of top-of-stack and of the first from/to entry.
+ */
+static void intel_pmu_lbr_init_atom(void)
+{
+        x86_pmu.lbr_nr     = 8;
+        x86_pmu.lbr_tos    = 0x01c9;
+        x86_pmu.lbr_from   = 0x40;
+        x86_pmu.lbr_to     = 0x60;
+}
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 000000000000..424fc8de68e4
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,857 @@
+/*
+ * Netburst Performance Events (P4, old Xeon)
+ *
+ *  Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
+ *  Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifdef CONFIG_CPU_SUP_INTEL
+#include <asm/perf_event_p4.h>
+#define P4_CNTR_LIMIT 3
+/*
+ * Resources a single P4 event may be bound to.
+ *
+ * array indices: 0,1 - HT threads, used with HT enabled cpu
+ */
+struct p4_event_bind {
+        unsigned int opcode;                    /* Event code and ESCR selector */
+        unsigned int escr_msr[2];               /* ESCR MSR for this event */
+        /*
+         * counter index (offset), -1 on absence; explicitly signed so
+         * that the -1 sentinel still compares equal to -1 on builds
+         * where plain char is unsigned
+         */
+        signed char cntr[2][P4_CNTR_LIMIT];
+};
+/*
+ * Extra metric values carried by a cache event.  When such an event is
+ * enabled, metric_pebs is written to MSR_IA32_PEBS_ENABLE and
+ * metric_vert to MSR_P4_PEBS_MATRIX_VERT (see p4_pmu_enable_event()).
+ */
+struct p4_cache_event_bind {
+        unsigned int metric_pebs;
+        unsigned int metric_vert;
+};
+/*
+ * Emit the table slot for cache event <name>: the slot index is
+ * P4_CACHE__<name> and the values are P4_PEBS__<name> / P4_VERT__<name>.
+ */
+#define P4_GEN_CACHE_EVENT_BIND(name)           \
+        [P4_CACHE__##name] = {                  \
+                .metric_pebs = P4_PEBS__##name, \
+                .metric_vert = P4_VERT__##name, \
+        }
+/* indexed by the P4_CACHE__* identifier packed into the event config */
+static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
+        P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
+        P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
+        P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
+        P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
+};
+/*
+ * Per-event resource bindings, indexed by the P4_EVENT_* identifier.
+ * For each HT thread (index 0/1) an entry names the ESCR MSR to program
+ * and up to P4_CNTR_LIMIT candidate counter indices (-1 == unused slot).
+ *
+ * Note that we don't use CCCR1 here, there is an
+ * exception for P4_BSQ_ALLOCATION but we just have
+ * no workaround
+ *
+ * consider this binding as resources which particular
+ * event may borrow, it doesn't contain EventMask,
+ * Tags and friends -- they are left to a caller
+ */
+static struct p4_event_bind p4_event_bind_map[] = {
+        [P4_EVENT_TC_DELIVER_MODE] = {
+                .opcode         = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
+                .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+                .cntr           = { {4, 5, -1}, {6, 7, -1} },
+        },
+        [P4_EVENT_BPU_FETCH_REQUEST] = {
+                .opcode         = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
+                .escr_msr       = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_ITLB_REFERENCE] = {
+                .opcode         = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
+                .escr_msr       = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_MEMORY_CANCEL] = {
+                .opcode         = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
+                .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_MEMORY_COMPLETE] = {
+                .opcode         = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
+                .escr_msr       = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_LOAD_PORT_REPLAY] = {
+                .opcode         = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
+                .escr_msr       = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_STORE_PORT_REPLAY] = {
+                .opcode         = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
+                .escr_msr       = { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_MOB_LOAD_REPLAY] = {
+                .opcode         = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
+                .escr_msr       = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_PAGE_WALK_TYPE] = {
+                .opcode         = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
+                .escr_msr       = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_BSQ_CACHE_REFERENCE] = {
+                .opcode         = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
+                .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_IOQ_ALLOCATION] = {
+                .opcode         = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
+                .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_IOQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
+                .opcode         = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
+                .escr_msr       = { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
+                .cntr           = { {2, -1, -1}, {3, -1, -1} },
+        },
+        [P4_EVENT_FSB_DATA_ACTIVITY] = {
+                .opcode         = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
+                .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_BSQ_ALLOCATION] = {           /* shared ESCR, broken CCCR1 */
+                .opcode         = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
+                .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+                .cntr           = { {0, -1, -1}, {1, -1, -1} },
+        },
+        [P4_EVENT_BSQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
+                .opcode         = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
+                .escr_msr       = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+                .cntr           = { {2, -1, -1}, {3, -1, -1} },
+        },
+        [P4_EVENT_SSE_INPUT_ASSIST] = {
+                .opcode         = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
+                .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_PACKED_SP_UOP] = {
+                .opcode         = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
+                .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_PACKED_DP_UOP] = {
+                .opcode         = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
+                .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_SCALAR_SP_UOP] = {
+                .opcode         = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
+                .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_SCALAR_DP_UOP] = {
+                .opcode         = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
+                .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_64BIT_MMX_UOP] = {
+                .opcode         = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
+                .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_128BIT_MMX_UOP] = {
+                .opcode         = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
+                .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_X87_FP_UOP] = {
+                .opcode         = P4_OPCODE(P4_EVENT_X87_FP_UOP),
+                .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_TC_MISC] = {
+                .opcode         = P4_OPCODE(P4_EVENT_TC_MISC),
+                .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+                .cntr           = { {4, 5, -1}, {6, 7, -1} },
+        },
+        [P4_EVENT_GLOBAL_POWER_EVENTS] = {
+                .opcode         = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
+                .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_TC_MS_XFER] = {
+                .opcode         = P4_OPCODE(P4_EVENT_TC_MS_XFER),
+                .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+                .cntr           = { {4, 5, -1}, {6, 7, -1} },
+        },
+        [P4_EVENT_UOP_QUEUE_WRITES] = {
+                .opcode         = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
+                .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+                .cntr           = { {4, 5, -1}, {6, 7, -1} },
+        },
+        /*
+         * NOTE(review): both threads name MSR_P4_TBPU_ESCR0 here, yet the
+         * entry is not tagged "shared ESCR" like the other entries that
+         * repeat one MSR, and the neighbouring RETIRED_BRANCH_TYPE uses
+         * ESCR0/ESCR1 -- confirm against the CPU manual whether the second
+         * slot should be MSR_P4_TBPU_ESCR1
+         */
+        [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
+                .opcode         = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
+                .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+                .cntr           = { {4, 5, -1}, {6, 7, -1} },
+        },
+        [P4_EVENT_RETIRED_BRANCH_TYPE] = {
+                .opcode         = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
+                .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+                .cntr           = { {4, 5, -1}, {6, 7, -1} },
+        },
+        [P4_EVENT_RESOURCE_STALL] = {
+                .opcode         = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
+                .escr_msr       = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+        [P4_EVENT_WC_BUFFER] = {
+                .opcode         = P4_OPCODE(P4_EVENT_WC_BUFFER),
+                .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+                .cntr           = { {8, 9, -1}, {10, 11, -1} },
+        },
+        [P4_EVENT_B2B_CYCLES] = {
+                .opcode         = P4_OPCODE(P4_EVENT_B2B_CYCLES),
+                .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_BNR] = {
+                .opcode         = P4_OPCODE(P4_EVENT_BNR),
+                .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_SNOOP] = {
+                .opcode         = P4_OPCODE(P4_EVENT_SNOOP),
+                .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_RESPONSE] = {
+                .opcode         = P4_OPCODE(P4_EVENT_RESPONSE),
+                .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+                .cntr           = { {0, -1, -1}, {2, -1, -1} },
+        },
+        [P4_EVENT_FRONT_END_EVENT] = {
+                .opcode         = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
+                .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+        [P4_EVENT_EXECUTION_EVENT] = {
+                .opcode         = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
+                .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+        [P4_EVENT_REPLAY_EVENT] = {
+                .opcode         = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
+                .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+        [P4_EVENT_INSTR_RETIRED] = {
+                .opcode         = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
+                .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+        [P4_EVENT_UOPS_RETIRED] = {
+                .opcode         = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
+                .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+        [P4_EVENT_UOP_TYPE] = {
+                .opcode         = P4_OPCODE(P4_EVENT_UOP_TYPE),
+                .escr_msr       = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+        [P4_EVENT_BRANCH_RETIRED] = {
+                .opcode         = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
+                .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+        [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
+                .opcode         = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
+                .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+        [P4_EVENT_X87_ASSIST] = {
+                .opcode         = P4_OPCODE(P4_EVENT_X87_ASSIST),
+                .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+        [P4_EVENT_MACHINE_CLEAR] = {
+                .opcode         = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
+                .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+        [P4_EVENT_INSTR_COMPLETED] = {
+                .opcode         = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
+                .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+                .cntr           = { {12, 13, 16}, {14, 15, 17} },
+        },
+};
+/*
+ * Build the config of a cache event: the ESCR half holds the event and a
+ * single event-mask bit, the CCCR half holds the cache event identifier
+ * together with the ESCR select derived from the event opcode.
+ */
+#define P4_GEN_CACHE_EVENT(event, bit, cache_event)                       \
+        p4_config_pack_escr(P4_ESCR_EVENT(event)                        | \
+                            P4_ESCR_EMASK_BIT(event, bit))              | \
+        p4_config_pack_cccr(cache_event                                 | \
+                            P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
+/*
+ * Configs for the generic cache events, indexed [cache][op][result];
+ * p4_pmu_init() copies this table into hw_cache_event_ids.  Slots that
+ * are not listed stay zero-initialized.
+ */
+static __initconst const u64 p4_hw_cache_event_ids
+                                [PERF_COUNT_HW_CACHE_MAX]
+                                [PERF_COUNT_HW_CACHE_OP_MAX]
+                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+        [ C(OP_READ) ] = {
+                [ C(RESULT_ACCESS) ] = 0x0,
+                [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                                P4_CACHE__1stl_cache_load_miss_retired),
+        },
+ },
+ [ C(LL  ) ] = {
+        [ C(OP_READ) ] = {
+                [ C(RESULT_ACCESS) ] = 0x0,
+                [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                                P4_CACHE__2ndl_cache_load_miss_retired),
+        },
+},
+ [ C(DTLB) ] = {
+        [ C(OP_READ) ] = {
+                [ C(RESULT_ACCESS) ] = 0x0,
+                [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                                P4_CACHE__dtlb_load_miss_retired),
+        },
+        [ C(OP_WRITE) ] = {
+                [ C(RESULT_ACCESS) ] = 0x0,
+                [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                                P4_CACHE__dtlb_store_miss_retired),
+        },
+ },
+ /*
+  * NOTE(review): p4_cache_event_bind_map[] has no slot for the two
+  * itlb_reference identifiers used below -- presumably these events need
+  * no PEBS metric setup; confirm against the P4_CACHE__* enum order
+  */
+ [ C(ITLB) ] = {
+        [ C(OP_READ) ] = {
+                [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
+                                                P4_CACHE__itlb_reference_hit),
+                [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
+                                                P4_CACHE__itlb_reference_miss),
+        },
+        [ C(OP_WRITE) ] = {
+                [ C(RESULT_ACCESS) ] = -1,
+                [ C(RESULT_MISS)   ] = -1,
+        },
+        [ C(OP_PREFETCH) ] = {
+                [ C(RESULT_ACCESS) ] = -1,
+                [ C(RESULT_MISS)   ] = -1,
+        },
+ },
+};
+/*
+ * Configs for the generic PERF_COUNT_HW_* events.  Only the ESCR half
+ * (event + event mask) is given here, plus EDGE|COMPARE in the CCCR half
+ * for bus cycles; p4_pmu_event_map() ORs in the ESCR select taken from
+ * the event binding.
+ */
+static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
+  /* non-halted CPU clocks */
+  [PERF_COUNT_HW_CPU_CYCLES] =
+        p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
+                P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+  /*
+   * retired instructions
+   * for the sake of simplicity we don't use the FSB tagging
+   */
+  [PERF_COUNT_HW_INSTRUCTIONS] =
+        p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED)               |
+                P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
+                P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
+  /* cache hits */
+  [PERF_COUNT_HW_CACHE_REFERENCES] =
+        p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
+                P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
+                P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
+                P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
+                P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
+                P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
+                P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
+  /* cache misses */
+  [PERF_COUNT_HW_CACHE_MISSES] =
+        p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
+                P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
+                P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
+                P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
+  /* branch instructions retired */
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
+        p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE)         |
+                P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
+                P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
+                P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
+                P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
+  /* mispredicted branches retired */
+  [PERF_COUNT_HW_BRANCH_MISSES] =
+        p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED)      |
+                P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
+  /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
+  [PERF_COUNT_HW_BUS_CYCLES] =
+        p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY)           |
+                P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
+                P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN))        |
+        p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+};
+/*
+ * Look up the resource binding of the event encoded in @config.
+ * Returns NULL when the event index lies outside p4_event_bind_map[].
+ */
+static struct p4_event_bind *p4_config_get_bind(u64 config)
+{
+        unsigned int event_idx = p4_config_unpack_event(config);
+        if (event_idx >= ARRAY_SIZE(p4_event_bind_map))
+                return NULL;
+        return &p4_event_bind_map[event_idx];
+}
+/*
+ * Translate a generic hardware event index into a full P4 config: the
+ * table entry from p4_general_events[] plus the ESCR select that belongs
+ * to the event's binding.  @hw_event is used as an index unchecked and
+ * the binding is dereferenced unchecked, so the caller has to pass an
+ * index that is valid for p4_general_events[].
+ */
+static u64 p4_pmu_event_map(int hw_event)
+{
+        u64 config = p4_general_events[hw_event];
+        struct p4_event_bind *bind = p4_config_get_bind(config);
+        unsigned int esel = P4_OPCODE_ESEL(bind->opcode);
+        return config | p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+}
+/*
+ * Compose the initial hw.config of @event from the per-cpu default
+ * ESCR/CCCR values and, for raw events, the HT-compatible bits supplied
+ * by the user, then hand over to x86_setup_perfctr().
+ *
+ * Returns -EINVAL for a raw event whose event index is out of range,
+ * otherwise the result of x86_setup_perfctr().
+ */
+static int p4_hw_config(struct perf_event *event)
+{
+        u32 escr, cccr;
+        int cpu, rc;
+        /*
+         * the cpu is picked up this early on purpose: if the event is
+         * scheduled for the first time on this very cpu there is no need
+         * to swap the thread specific flags in config later (which saves
+         * some cpu cycles)
+         */
+        cpu = get_cpu();
+        cccr = p4_default_cccr_conf(cpu);
+        escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
+                                         event->attr.exclude_user);
+        event->hw.config = p4_config_pack_escr(escr) |
+                           p4_config_pack_cccr(cccr);
+        if (p4_ht_active() && p4_ht_thread(cpu))
+                event->hw.config = p4_set_ht_bit(event->hw.config);
+        if (event->attr.type == PERF_TYPE_RAW) {
+                unsigned int evnt = p4_config_unpack_event(event->attr.config);
+                /* user data may have out-of-bound event index */
+                if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
+                        rc = -EINVAL;
+                        goto out_put_cpu;
+                }
+                /*
+                 * Raw events are not under our control, the caller is
+                 * expected to pass sane values (the thread number on an
+                 * HT machine is not taken from the user, but HT-compatible
+                 * specifics are let through)
+                 *
+                 * XXX: HT wide things should check perf_paranoid_cpu() &&
+                 *      CAP_SYS_ADMIN
+                 */
+                event->hw.config |= event->attr.config &
+                        (p4_config_pack_escr(P4_ESCR_MASK_HT) |
+                         p4_config_pack_cccr(P4_CCCR_MASK_HT));
+        }
+        rc = x86_setup_perfctr(event);
+out_put_cpu:
+        put_cpu();
+        return rc;
+}
+/*
+ * Read the CCCR of the counter behind @hwc and, if its overflow flag is
+ * set, write the value back with P4_CCCR_OVF cleared.
+ *
+ * NOTE(review): the MSR value lands in an unsigned long, which is only
+ * 32 bits wide on 32-bit builds -- presumably fine because the flag sits
+ * in the low half; confirm before relying on the upper bits.
+ */
+static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+{
+        unsigned long cccr;
+        rdmsrl(hwc->config_base + hwc->idx, cccr);
+        if (!(cccr & P4_CCCR_OVF))
+                return;
+        (void)checking_wrmsrl(hwc->config_base + hwc->idx,
+                ((u64)cccr) & ~P4_CCCR_OVF);
+}
+/*
+ * Stop the counter of @event by rewriting its CCCR without the enable
+ * bit.
+ */
+static inline void p4_pmu_disable_event(struct perf_event *event)
+{
+        struct hw_perf_event *hwc = &event->hw;
+        u64 cccr = (u64)(p4_config_unpack_cccr(hwc->config));
+        cccr &= ~P4_CCCR_ENABLE;
+        /*
+         * The event may get disabled while its counter is in overflowed
+         * state; P4_CCCR_OVF has to go as well, otherwise the interrupt
+         * would be asserted again and again
+         */
+        cccr &= ~P4_CCCR_OVF;
+        cccr &= ~P4_CCCR_RESERVED;
+        (void)checking_wrmsrl(hwc->config_base + hwc->idx, cccr);
+}
+/* Disable every counter that is marked active on the current cpu */
+static void p4_pmu_disable_all(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        int cntr;
+        for (cntr = 0; cntr < x86_pmu.num_counters; cntr++) {
+                if (test_bit(cntr, cpuc->active_mask))
+                        p4_pmu_disable_event(cpuc->events[cntr]);
+        }
+}
+/*
+ * Program the hardware for @event: optional cache metric MSRs first,
+ * then the ESCR of the thread encoded in the config, and finally the
+ * CCCR with the enable bit set.
+ */
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+        struct hw_perf_event *hwc = &event->hw;
+        unsigned int evnt = p4_config_unpack_event(hwc->config);
+        unsigned int cache = p4_config_unpack_cache_event(hwc->config);
+        struct p4_event_bind *bind = &p4_event_bind_map[evnt];
+        int thread = p4_ht_config_thread(hwc->config);
+        u64 escr_addr = (u64)bind->escr_msr[thread];
+        u64 escr_conf, cccr;
+        /*
+         * - we dont support cascaded counters yet
+         * - and counter 1 is broken (erratum)
+         */
+        WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
+        WARN_ON_ONCE(hwc->idx == 1);
+        /* drop the HT bit, then put the real Event value in place */
+        escr_conf  = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
+        escr_conf &= ~P4_ESCR_EVENT_MASK;
+        escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
+        cccr = p4_config_unpack_cccr(hwc->config);
+        /*
+         * a Cache event carries metrics which have to be
+         * written into additional MSRs
+         */
+        BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
+        if (cache > P4_CACHE__NONE &&
+                cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
+                struct p4_cache_event_bind *metrics = &p4_cache_event_bind_map[cache];
+                (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)metrics->metric_pebs);
+                (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)metrics->metric_vert);
+        }
+        (void)checking_wrmsrl(escr_addr, escr_conf);
+        (void)checking_wrmsrl(hwc->config_base + hwc->idx,
+                                (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
+}
+/* Enable every counter that is marked active on the current cpu; @added is not used */
+static void p4_pmu_enable_all(int added)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        int cntr;
+        for (cntr = 0; cntr < x86_pmu.num_counters; cntr++) {
+                if (test_bit(cntr, cpuc->active_mask))
+                        p4_pmu_enable_event(cpuc->events[cntr]);
+        }
+}
+/*
+ * PMU interrupt handler: walk all active counters of this cpu, process
+ * the ones that overflowed and report whether any did.
+ *
+ * Returns 1 when at least one overflow was handled, 0 otherwise.
+ */
+static int p4_pmu_handle_irq(struct pt_regs *regs)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+        struct perf_sample_data data;
+        int idx, handled = 0;
+        data.addr = 0;
+        data.raw = NULL;
+        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+                struct perf_event *event;
+                struct hw_perf_event *hwc;
+                u64 val;
+                if (!test_bit(idx, cpuc->active_mask))
+                        continue;
+                event = cpuc->events[idx];
+                hwc = &event->hw;
+                WARN_ON_ONCE(hwc->idx != idx);
+                /*
+                 * FIXME: Redundant call, actually not needed
+                 * but just to check if we're screwed
+                 */
+                p4_pmu_clear_cccr_ovf(hwc);
+                val = x86_perf_event_update(event);
+                /* top counter bit still set: this one did not overflow */
+                if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+                        continue;
+                /*
+                 * event overflow
+                 */
+                handled = 1;
+                data.period = hwc->last_period;
+                if (x86_perf_event_set_period(event) &&
+                    perf_event_overflow(event, 1, &data, regs))
+                        p4_pmu_disable_event(event);
+        }
+        if (!handled)
+                return 0;
+        /* p4 quirk: unmask it again */
+        apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+        inc_irq_stat(apic_perf_irqs);
+        return 1;
+}
+/*
+ * Re-target the thread specific fields of hwc->config (PMI routing,
+ * OS/USR bits, HT marker) to the HT thread of @cpu.  Nothing is done
+ * when p4_should_swap_ts() says no swap is required.
+ */
+static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
+{
+        u32 escr, cccr;
+        u64 config;
+        int second_thread;
+        /*
+         * either we are lucky and stay on the same cpu, or there is no
+         * HT support
+         */
+        if (!p4_should_swap_ts(hwc->config, cpu))
+                return;
+        /*
+         * the event has migrated from another logical
+         * cpu, so the thread specific flags need swapping
+         */
+        escr = p4_config_unpack_escr(hwc->config);
+        cccr = p4_config_unpack_cccr(hwc->config);
+        second_thread = !!p4_ht_thread(cpu);
+        if (second_thread) {
+                /* T0 -> T1 */
+                cccr &= ~P4_CCCR_OVF_PMI_T0;
+                cccr |= P4_CCCR_OVF_PMI_T1;
+                if (escr & P4_ESCR_T0_OS) {
+                        escr &= ~P4_ESCR_T0_OS;
+                        escr |= P4_ESCR_T1_OS;
+                }
+                if (escr & P4_ESCR_T0_USR) {
+                        escr &= ~P4_ESCR_T0_USR;
+                        escr |= P4_ESCR_T1_USR;
+                }
+        } else {
+                /* T1 -> T0 */
+                cccr &= ~P4_CCCR_OVF_PMI_T1;
+                cccr |= P4_CCCR_OVF_PMI_T0;
+                if (escr & P4_ESCR_T1_OS) {
+                        escr &= ~P4_ESCR_T1_OS;
+                        escr |= P4_ESCR_T0_OS;
+                }
+                if (escr & P4_ESCR_T1_USR) {
+                        escr &= ~P4_ESCR_T1_USR;
+                        escr |= P4_ESCR_T0_USR;
+                }
+        }
+        /* repack both halves, then record which thread the config is for */
+        config  = p4_config_pack_escr(escr);
+        config |= p4_config_pack_cccr(cccr);
+        if (second_thread)
+                config |= P4_CONFIG_HT;
+        else
+                config &= ~P4_CONFIG_HT;
+        hwc->config = config;
+}
+/*
+ * ESCR address hashing is tricky: ESCRs are not sequential
+ * in memory, but they all start from MSR_P4_BSU_ESCR0 (0x03a0) and
+ * the low byte of any ESCR address lies in the range [0xa0,0xe1]
+ *
+ * so we make a ~70% filled hashtable
+ */
+/* lowest / highest ESCR MSR address covered by the lookup table */
+#define P4_ESCR_MSR_BASE                0x000003a0
+#define P4_ESCR_MSR_MAX                 0x000003e1
+#define P4_ESCR_MSR_TABLE_SIZE          (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
+/* table slot of an ESCR MSR address; the argument is parenthesized so any expression may be passed */
+#define P4_ESCR_MSR_IDX(msr)            ((msr) - P4_ESCR_MSR_BASE)
+/* designated initializer storing the MSR address in its own slot */
+#define P4_ESCR_MSR_TABLE_ENTRY(msr)    [P4_ESCR_MSR_IDX(msr)] = (msr)
+/*
+ * Sparse lookup table of the known ESCR MSRs: slot
+ * (msr - P4_ESCR_MSR_BASE) holds the MSR address itself, every other
+ * slot stays zero.  p4_get_escr_idx() treats a zero slot as "not an
+ * ESCR address".
+ */
+static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
+        P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
+};
+/*
+ * Map the ESCR MSR address @addr to its slot in p4_escr_table[].
+ * Returns the slot, or -1 (after a one-time warning) when @addr is out
+ * of the table range or the slot is empty.
+ */
+static int p4_get_escr_idx(unsigned int addr)
+{
+        unsigned int slot = P4_ESCR_MSR_IDX(addr);
+        if (likely(slot < P4_ESCR_MSR_TABLE_SIZE && p4_escr_table[slot]))
+                return slot;
+        WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
+        return -1;
+}
+/*
+ * Pick the first counter listed in @bind for @thread that is not yet
+ * set in @used_mask.  Returns the counter index, or -1 when every
+ * candidate is either absent (-1) or already taken.
+ */
+static int p4_next_cntr(int thread, unsigned long *used_mask,
+                        struct p4_event_bind *bind)
+{
+        int slot;
+        for (slot = 0; slot < P4_CNTR_LIMIT; slot++) {
+                int cntr = bind->cntr[thread][slot];
+                if (cntr == -1)
+                        continue;
+                if (!test_bit(cntr, used_mask))
+                        return cntr;
+        }
+        return -1;
+}
+/*
+ * Assign a counter and an ESCR to each of the first @n events in
+ * cpuc->event_list[].
+ *
+ * @cpuc:   per-cpu event state whose event_list[] is walked
+ * @n:      number of events to place
+ * @assign: optional output, assign[i] receives the counter index chosen
+ *          for event i; may be NULL when only a feasibility check is
+ *          wanted
+ *
+ * Returns 0 when every event got its resources, -ENOSPC as soon as one
+ * event cannot be placed (unknown event, unknown ESCR, no free counter
+ * or ESCR already taken).
+ */
+static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+        unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+        unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
+        int cpu = raw_smp_processor_id();
+        /* cpu is fixed for the whole walk, so look the thread up once */
+        unsigned int thread = p4_ht_thread(cpu);
+        struct hw_perf_event *hwc;
+        struct p4_event_bind *bind;
+        unsigned int i, num;
+        int cntr_idx, escr_idx;
+        bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+        bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
+        /* num counts the events still unplaced; non-zero at exit == failure */
+        for (i = 0, num = n; i < n; i++, num--) {
+                hwc = &cpuc->event_list[i]->hw;
+                bind = p4_config_get_bind(hwc->config);
+                /* p4_config_get_bind() yields NULL for an unknown event index */
+                if (unlikely(!bind))
+                        goto done;
+                escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
+                if (unlikely(escr_idx == -1))
+                        goto done;
+                /* already placed and still on the right thread: keep the counter */
+                if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+                        cntr_idx = hwc->idx;
+                        if (assign)
+                                assign[i] = hwc->idx;
+                        goto reserve;
+                }
+                cntr_idx = p4_next_cntr(thread, used_mask, bind);
+                if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
+                        goto done;
+                p4_pmu_swap_config_ts(hwc, cpu);
+                if (assign)
+                        assign[i] = cntr_idx;
+reserve:
+                set_bit(cntr_idx, used_mask);
+                set_bit(escr_idx, escr_mask);
+        }
+done:
+        return num ? -ENOSPC : 0;
+}
+/* PMU description installed into x86_pmu by p4_pmu_init() */
+static __initconst const struct x86_pmu p4_pmu = {
+        .name                   = "Netburst P4/Xeon",
+        /* callbacks */
+        .handle_irq             = p4_pmu_handle_irq,
+        .enable_all             = p4_pmu_enable_all,
+        .disable_all            = p4_pmu_disable_all,
+        .enable                 = p4_pmu_enable_event,
+        .disable                = p4_pmu_disable_event,
+        .hw_config              = p4_hw_config,
+        .schedule_events        = p4_pmu_schedule_events,
+        .event_map              = p4_pmu_event_map,
+        .get_event_constraints  = x86_get_event_constraints,
+        /* first CCCR / first counter MSR */
+        .eventsel               = MSR_P4_BPU_CCCR0,
+        .perfctr                = MSR_P4_BPU_PERFCTR0,
+        .max_events             = ARRAY_SIZE(p4_general_events),
+        /*
+         * If HT is disabled we may need to use all
+         * ARCH_P4_MAX_CCCR counters simultaneously,
+         * though it is left restricted for the moment,
+         * assuming HT is on
+         */
+        .num_counters           = ARCH_P4_MAX_CCCR,
+        .apic                   = 1,
+        /* 40 bit wide counters */
+        .cntval_bits            = 40,
+        .cntval_mask            = (1ULL << 40) - 1,
+        .max_period             = (1ULL << 39) - 1,
+};
+/*
+ * Install the P4 PMU: requires bit 7 of the low word of
+ * MSR_IA32_MISC_ENABLE to be set, then copies the cache event table and
+ * the PMU description into place.
+ *
+ * Returns 0 on success, -ENODEV when the bit is clear.
+ */
+static __init int p4_pmu_init(void)
+{
+        unsigned int low, high;
+        /* If we get stripped -- indexing fails */
+        BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
+        rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+        if (low & (1 << 7)) {
+                memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+                        sizeof(hw_cache_event_ids));
+                pr_cont("Netburst events, ");
+                x86_pmu = p4_pmu;
+                return 0;
+        }
+        pr_cont("unsupported Netburst CPU model %d ",
+                boot_cpu_data.x86_model);
+        return -ENODEV;
+}
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14da..34ba07be2cda 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
 */
 #define P6_NOP_EVENT                    0x0000002EULL
-static u64 p6_pmu_raw_event(u64 hw_event)
-{
-#define P6_EVNTSEL_EVENT_MASK           0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK            0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK            0x00040000ULL
-#define P6_EVNTSEL_INV_MASK             0x00800000ULL
-#define P6_EVNTSEL_REG_MASK             0xFF000000ULL
-#define P6_EVNTSEL_MASK                 \
-        (P6_EVNTSEL_EVENT_MASK |        \
-         P6_EVNTSEL_UNIT_MASK  |        \
-         P6_EVNTSEL_EDGE_MASK  |        \
-         P6_EVNTSEL_INV_MASK   |        \
-         P6_EVNTSEL_REG_MASK)
-        return hw_event & P6_EVNTSEL_MASK;
-}
 static struct event_constraint p6_event_constraints[] =
 {
        INTEL_EVENT_CONSTRAINT(0xc1, 0x1),      /* FLOPS */
@@ -66,7 +48,7 @@ static void p6_pmu_disable_all(void)
        wrmsrl(MSR_P6_EVNTSEL0, val);
 }
-static void p6_pmu_enable_all(void)
+static void p6_pmu_enable_all(int added)
 {
        unsigned long val;
@@ -102,22 +84,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
        (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
 }
-static __initconst struct x86_pmu p6_pmu = {
+static __initconst const struct x86_pmu p6_pmu = {
        .name                   = "p6",
        .handle_irq             = x86_pmu_handle_irq,
        .disable_all            = p6_pmu_disable_all,
        .enable_all             = p6_pmu_enable_all,
        .enable                 = p6_pmu_enable_event,
        .disable                = p6_pmu_disable_event,
+        .hw_config              = x86_pmu_hw_config,
+        .schedule_events        = x86_schedule_events,
        .eventsel               = MSR_P6_EVNTSEL0,
        .perfctr                = MSR_P6_PERFCTR0,
        .event_map              = p6_pmu_event_map,
-        .raw_event              = p6_pmu_raw_event,
        .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
        .apic                   = 1,
        .max_period             = (1ULL << 31) - 1,
        .version                = 0,
-        .num_events             = 2,
+        .num_counters           = 2,
        /*
         * Events have 40 bits implemented. However they are designed such
         * that bits [32-39] are sign extensions of bit 31. As such the
@@ -125,8 +108,8 @@ static __initconst struct x86_pmu p6_pmu = {
         *
         * See IA-32 Intel Architecture Software developer manual Vol 3B
         */
-        .event_bits             = 32,
+        .cntval_bits            = 32,
-        .event_mask             = (1ULL << 32) - 1,
+        .cntval_mask            = (1ULL << 32) - 1,
        .get_event_constraints  = x86_get_event_constraints,
        .event_constraints      = p6_event_constraints,
 };
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index dfdb4dba2320..b9d1ff588445 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,8 +24,8 @@
 #include <linux/dmi.h>
 #include <linux/module.h>
 #include <asm/div64.h>
-#include <asm/vmware.h>
 #include <asm/x86_init.h>
+#include <asm/hypervisor.h>
 #define CPUID_VMWARE_INFO_LEAF  0x40000000
 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -65,7 +65,7 @@ static unsigned long vmware_get_tsc_khz(void)
        return tsc_hz;
 }
-void __init vmware_platform_setup(void)
+static void __init vmware_platform_setup(void)
 {
        uint32_t eax, ebx, ecx, edx;
@@ -83,26 +83,22 @@ void __init vmware_platform_setup(void)
 * serial key should be enough, as this will always have a VMware
 * specific string when running under VMware hypervisor.
 */
-int vmware_platform(void)
+static bool __init vmware_platform(void)
 {
        if (cpu_has_hypervisor) {
-                unsigned int eax, ebx, ecx, edx;
+                unsigned int eax;
-                char hyper_vendor_id[13];
+                unsigned int hyper_vendor_id[3];
-                cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
+                cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
-                memcpy(hyper_vendor_id + 0, &ebx, 4);
+                      &hyper_vendor_id[1], &hyper_vendor_id[2]);
-                memcpy(hyper_vendor_id + 4, &ecx, 4);
+                if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
-                memcpy(hyper_vendor_id + 8, &edx, 4);
+                        return true;
-                hyper_vendor_id[12] = '\0';
-                if (!strcmp(hyper_vendor_id, "VMwareVMware"))
-                        return 1;
        } else if (dmi_available && dmi_name_in_serial("VMware") &&
                   __vmware_platform())
-                return 1;
+                return true;
-        return 0;
+        return false;
 }
-EXPORT_SYMBOL(vmware_platform);
 /*
 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -116,8 +112,16 @@ EXPORT_SYMBOL(vmware_platform);
 * so that the kernel could just trust the hypervisor with providing a
 * reliable virtual TSC that is suitable for timekeeping.
 */
-void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
+static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
 {
        set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
        set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
 }
+const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+        .name                   = "VMware",
+        .detect                 = vmware_platform,
+        .set_cpu_features       = vmware_set_cpu_features,
+        .init_platform          = vmware_platform_setup,
+};
+EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 1c47390dd0e5..000000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
-/*
- * Debug Store support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/trace_clock.h>
-#include <asm/ds.h>
-#include "ds_selftest.h"
-/*
- * The configuration for a particular DS hardware implementation:
- */
-struct ds_configuration {
-        /* The name of the configuration: */
-        const char              *name;
-        /* The size of pointer-typed fields in DS, BTS, and PEBS: */
-        unsigned char           sizeof_ptr_field;
-        /* The size of a BTS/PEBS record in bytes: */
-        unsigned char           sizeof_rec[2];
-        /* The number of pebs counter reset values in the DS structure. */
-        unsigned char           nr_counter_reset;
-        /* Control bit-masks indexed by enum ds_feature: */
-        unsigned long           ctl[dsf_ctl_max];
-};
-static struct ds_configuration ds_cfg __read_mostly;
-/* Maximal size of a DS configuration: */
-#define MAX_SIZEOF_DS           0x80
-/* Maximal size of a BTS record: */
-#define MAX_SIZEOF_BTS          (3 * 8)
-/* BTS and PEBS buffer alignment: */
-#define DS_ALIGNMENT            (1 << 3)
-/* Number of buffer pointers in DS: */
-#define NUM_DS_PTR_FIELDS       8
-/* Size of a pebs reset value in DS: */
-#define PEBS_RESET_FIELD_SIZE   8
-/* Mask of control bits in the DS MSR register: */
-#define BTS_CONTROL                               \
-        ( ds_cfg.ctl[dsf_bts]                   | \
-          ds_cfg.ctl[dsf_bts_kernel]            | \
-          ds_cfg.ctl[dsf_bts_user]              | \
-          ds_cfg.ctl[dsf_bts_overflow] )
-/*
- * A BTS or PEBS tracer.
- *
- * This holds the configuration of the tracer and serves as a handle
- * to identify tracers.
- */
-struct ds_tracer {
-        /* The DS context (partially) owned by this tracer. */
-        struct ds_context       *context;
-        /* The buffer provided on ds_request() and its size in bytes. */
-        void                    *buffer;
-        size_t                  size;
-};
-struct bts_tracer {
-        /* The common DS part: */
-        struct ds_tracer        ds;
-        /* The trace including the DS configuration: */
-        struct bts_trace        trace;
-        /* Buffer overflow notification function: */
-        bts_ovfl_callback_t     ovfl;
-        /* Active flags affecting trace collection. */
-        unsigned int            flags;
-};
-struct pebs_tracer {
-        /* The common DS part: */
-        struct ds_tracer        ds;
-        /* The trace including the DS configuration: */
-        struct pebs_trace       trace;
-        /* Buffer overflow notification function: */
-        pebs_ovfl_callback_t    ovfl;
-};
-/*
- * Debug Store (DS) save area configuration (see Intel64 and IA32
- * Architectures Software Developer's Manual, section 18.5)
- *
- * The DS configuration consists of the following fields; different
- * architetures vary in the size of those fields.
- *
- * - double-word aligned base linear address of the BTS buffer
- * - write pointer into the BTS buffer
- * - end linear address of the BTS buffer (one byte beyond the end of
- *   the buffer)
- * - interrupt pointer into BTS buffer
- *   (interrupt occurs when write pointer passes interrupt pointer)
- * - double-word aligned base linear address of the PEBS buffer
- * - write pointer into the PEBS buffer
- * - end linear address of the PEBS buffer (one byte beyond the end of
- *   the buffer)
- * - interrupt pointer into PEBS buffer
- *   (interrupt occurs when write pointer passes interrupt pointer)
- * - value to which counter is reset following counter overflow
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- * - an offset giving the start of the respective region
- *
- * This offset is further used to index various arrays holding
- * information for BTS and PEBS at the respective index.
- *
- * On later 32bit processors, we only access the lower 32bit of the
- * 64bit pointer fields. The upper halves will be zeroed out.
- */
-enum ds_field {
-        ds_buffer_base = 0,
-        ds_index,
-        ds_absolute_maximum,
-        ds_interrupt_threshold,
-};
-enum ds_qualifier {
-        ds_bts = 0,
-        ds_pebs
-};
-static inline unsigned long
-ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
-{
-        base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
-        return *(unsigned long *)base;
-}
-static inline void
-ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
-       unsigned long value)
-{
-        base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
-        (*(unsigned long *)base) = value;
-}
-/*
- * Locking is done only for allocating BTS or PEBS resources.
- */
-static DEFINE_SPINLOCK(ds_lock);
-/*
- * We either support (system-wide) per-cpu or per-thread allocation.
- * We distinguish the two based on the task_struct pointer, where a
- * NULL pointer indicates per-cpu allocation for the current cpu.
- *
- * Allocations are use-counted. As soon as resources are allocated,
- * further allocations must be of the same type (per-cpu or
- * per-thread). We model this by counting allocations (i.e. the number
- * of tracers of a certain type) for one type negatively:
- *   =0  no tracers
- *   >0  number of per-thread tracers
- *   <0  number of per-cpu tracers
- *
- * Tracers essentially gives the number of ds contexts for a certain
- * type of allocation.
- */
-static atomic_t tracers = ATOMIC_INIT(0);
-static inline int get_tracer(struct task_struct *task)
-{
-        int error;
-        spin_lock_irq(&ds_lock);
-        if (task) {
-                error = -EPERM;
-                if (atomic_read(&tracers) < 0)
-                        goto out;
-                atomic_inc(&tracers);
-        } else {
-                error = -EPERM;
-                if (atomic_read(&tracers) > 0)
-                        goto out;
-                atomic_dec(&tracers);
-        }
-        error = 0;
-out:
-        spin_unlock_irq(&ds_lock);
-        return error;
-}
-static inline void put_tracer(struct task_struct *task)
-{
-        if (task)
-                atomic_dec(&tracers);
-        else
-                atomic_inc(&tracers);
-}
-/*
- * The DS context is either attached to a thread or to a cpu:
- * - in the former case, the thread_struct contains a pointer to the
- *   attached context.
- * - in the latter case, we use a static array of per-cpu context
- *   pointers.
- *
- * Contexts are use-counted. They are allocated on first access and
- * deallocated when the last user puts the context.
- */
-struct ds_context {
-        /* The DS configuration; goes into MSR_IA32_DS_AREA: */
-        unsigned char           ds[MAX_SIZEOF_DS];
-        /* The owner of the BTS and PEBS configuration, respectively: */
-        struct bts_tracer       *bts_master;
-        struct pebs_tracer      *pebs_master;
-        /* Use count: */
-        unsigned long           count;
-        /* Pointer to the context pointer field: */
-        struct ds_context       **this;
-        /* The traced task; NULL for cpu tracing: */
-        struct task_struct      *task;
-        /* The traced cpu; only valid if task is NULL: */
-        int                     cpu;
-};
-static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
-static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
-{
-        struct ds_context **p_context =
-                (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
-        struct ds_context *context = NULL;
-        struct ds_context *new_context = NULL;
-        /* Chances are small that we already have a context. */
-        new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
-        if (!new_context)
-                return NULL;
-        spin_lock_irq(&ds_lock);
-        context = *p_context;
-        if (likely(!context)) {
-                context = new_context;
-                context->this = p_context;
-                context->task = task;
-                context->cpu = cpu;
-                context->count = 0;
-                *p_context = context;
-        }
-        context->count++;
-        spin_unlock_irq(&ds_lock);
-        if (context != new_context)
-                kfree(new_context);
-        return context;
-}
-static void ds_put_context(struct ds_context *context)
-{
-        struct task_struct *task;
-        unsigned long irq;
-        if (!context)
-                return;
-        spin_lock_irqsave(&ds_lock, irq);
-        if (--context->count) {
-                spin_unlock_irqrestore(&ds_lock, irq);
-                return;
-        }
-        *(context->this) = NULL;
-        task = context->task;
-        if (task)
-                clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-        /*
-         * We leave the (now dangling) pointer to the DS configuration in
-         * the DS_AREA msr. This is as good or as bad as replacing it with
-         * NULL - the hardware would crash if we enabled tracing.
-         *
-         * This saves us some problems with having to write an msr on a
-         * different cpu while preventing others from doing the same for the
-         * next context for that same cpu.
-         */
-        spin_unlock_irqrestore(&ds_lock, irq);
-        /* The context might still be in use for context switching. */
-        if (task && (task != current))
-                wait_task_context_switch(task);
-        kfree(context);
-}
-static void ds_install_ds_area(struct ds_context *context)
-{
-        unsigned long ds;
-        ds = (unsigned long)context->ds;
-        /*
-         * There is a race between the bts master and the pebs master.
-         *
-         * The thread/cpu access is synchronized via get/put_cpu() for
-         * task tracing and via wrmsr_on_cpu for cpu tracing.
-         *
-         * If bts and pebs are collected for the same task or same cpu,
-         * the same confiuration is written twice.
-         */
-        if (context->task) {
-                get_cpu();
-                if (context->task == current)
-                        wrmsrl(MSR_IA32_DS_AREA, ds);
-                set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
-                put_cpu();
-        } else
-                wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
-                             (u32)((u64)ds), (u32)((u64)ds >> 32));
-}
-/*
- * Call the tracer's callback on a buffer overflow.
- *
- * context: the ds context
- * qual: the buffer type
- */
-static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
-{
-        switch (qual) {
-        case ds_bts:
-                if (context->bts_master &&
-                    context->bts_master->ovfl)
-                        context->bts_master->ovfl(context->bts_master);
-                break;
-        case ds_pebs:
-                if (context->pebs_master &&
-                    context->pebs_master->ovfl)
-                        context->pebs_master->ovfl(context->pebs_master);
-                break;
-        }
-}
-/*
- * Write raw data into the BTS or PEBS buffer.
- *
- * The remainder of any partially written record is zeroed out.
- *
- * context: the DS context
- * qual:    the buffer type
- * record:  the data to write
- * size:    the size of the data
- */
-static int ds_write(struct ds_context *context, enum ds_qualifier qual,
-                    const void *record, size_t size)
-{
-        int bytes_written = 0;
-        if (!record)
-                return -EINVAL;
-        while (size) {
-                unsigned long base, index, end, write_end, int_th;
-                unsigned long write_size, adj_write_size;
-                /*
-                 * Write as much as possible without producing an
-                 * overflow interrupt.
-                 *
-                 * Interrupt_threshold must either be
-                 * - bigger than absolute_maximum or
-                 * - point to a record between buffer_base and absolute_maximum
-                 *
-                 * Index points to a valid record.
-                 */
-                base   = ds_get(context->ds, qual, ds_buffer_base);
-                index  = ds_get(context->ds, qual, ds_index);
-                end    = ds_get(context->ds, qual, ds_absolute_maximum);
-                int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
-                write_end = min(end, int_th);
-                /*
-                 * If we are already beyond the interrupt threshold,
-                 * we fill the entire buffer.
-                 */
-                if (write_end <= index)
-                        write_end = end;
-                if (write_end <= index)
-                        break;
-                write_size = min((unsigned long) size, write_end - index);
-                memcpy((void *)index, record, write_size);
-                record = (const char *)record + write_size;
-                size -= write_size;
-                bytes_written += write_size;
-                adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
-                adj_write_size *= ds_cfg.sizeof_rec[qual];
-                /* Zero out trailing bytes. */
-                memset((char *)index + write_size, 0,
-                       adj_write_size - write_size);
-                index += adj_write_size;
-                if (index >= end)
-                        index = base;
-                ds_set(context->ds, qual, ds_index, index);
-                if (index >= int_th)
-                        ds_overflow(context, qual);
-        }
-        return bytes_written;
-}
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- *
- * We use two levels of abstraction:
- * - the raw data level defined here
- * - an arch-independent level defined in ds.h
- */
-enum bts_field {
-        bts_from,
-        bts_to,
-        bts_flags,
-        bts_qual                = bts_from,
-        bts_clock               = bts_to,
-        bts_pid                 = bts_flags,
-        bts_qual_mask           = (bts_qual_max - 1),
-        bts_escape              = ((unsigned long)-1 & ~bts_qual_mask)
-};
-static inline unsigned long bts_get(const char *base, unsigned long field)
-{
-        base += (ds_cfg.sizeof_ptr_field * field);
-        return *(unsigned long *)base;
-}
-static inline void bts_set(char *base, unsigned long field, unsigned long val)
-{
-        base += (ds_cfg.sizeof_ptr_field * field);
-        (*(unsigned long *)base) = val;
-}
-/*
- * The raw BTS data is architecture dependent.
- *
- * For higher-level users, we give an arch-independent view.
- * - ds.h defines struct bts_struct
- * - bts_read translates one raw bts record into a bts_struct
- * - bts_write translates one bts_struct into the raw format and
- *   writes it into the top of the parameter tracer's buffer.
- *
- * return: bytes read/written on success; -Eerrno, otherwise
- */
-static int
-bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
-{
-        if (!tracer)
-                return -EINVAL;
-        if (at < tracer->trace.ds.begin)
-                return -EINVAL;
-        if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
-                return -EINVAL;
-        memset(out, 0, sizeof(*out));
-        if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
-                out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
-                out->variant.event.clock = bts_get(at, bts_clock);
-                out->variant.event.pid = bts_get(at, bts_pid);
-        } else {
-                out->qualifier = bts_branch;
-                out->variant.lbr.from = bts_get(at, bts_from);
-                out->variant.lbr.to   = bts_get(at, bts_to);
-                if (!out->variant.lbr.from && !out->variant.lbr.to)
-                        out->qualifier = bts_invalid;
-        }
-        return ds_cfg.sizeof_rec[ds_bts];
-}
-static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
-{
-        unsigned char raw[MAX_SIZEOF_BTS];
-        if (!tracer)
-                return -EINVAL;
-        if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
-                return -EOVERFLOW;
-        switch (in->qualifier) {
-        case bts_invalid:
-                bts_set(raw, bts_from, 0);
-                bts_set(raw, bts_to, 0);
-                bts_set(raw, bts_flags, 0);
-                break;
-        case bts_branch:
-                bts_set(raw, bts_from, in->variant.lbr.from);
-                bts_set(raw, bts_to,   in->variant.lbr.to);
-                bts_set(raw, bts_flags, 0);
-                break;
-        case bts_task_arrives:
-        case bts_task_departs:
-                bts_set(raw, bts_qual, (bts_escape | in->qualifier));
-                bts_set(raw, bts_clock, in->variant.event.clock);
-                bts_set(raw, bts_pid, in->variant.event.pid);
-                break;
-        default:
-                return -EINVAL;
-        }
-        return ds_write(tracer->ds.context, ds_bts, raw,
-                        ds_cfg.sizeof_rec[ds_bts]);
-}
-static void ds_write_config(struct ds_context *context,
-                            struct ds_trace *cfg, enum ds_qualifier qual)
-{
-        unsigned char *ds = context->ds;
-        ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
-        ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
-        ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
-        ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
-}
-static void ds_read_config(struct ds_context *context,
-                           struct ds_trace *cfg, enum ds_qualifier qual)
-{
-        unsigned char *ds = context->ds;
-        cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
-        cfg->top = (void *)ds_get(ds, qual, ds_index);
-        cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
-        cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
-}
-static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
-                             void *base, size_t size, size_t ith,
-                             unsigned int flags) {
-        unsigned long buffer, adj;
-        /*
-         * Adjust the buffer address and size to meet alignment
-         * constraints:
-         * - buffer is double-word aligned
-         * - size is multiple of record size
-         *
-         * We checked the size at the very beginning; we have enough
-         * space to do the adjustment.
-         */
-        buffer = (unsigned long)base;
-        adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
-        buffer += adj;
-        size   -= adj;
-        trace->n = size / ds_cfg.sizeof_rec[qual];
-        trace->size = ds_cfg.sizeof_rec[qual];
-        size = (trace->n * trace->size);
-        trace->begin = (void *)buffer;
-        trace->top = trace->begin;
-        trace->end = (void *)(buffer + size);
-        /*
-         * The value for 'no threshold' is -1, which will set the
-         * threshold outside of the buffer, just like we want it.
-         */
-        ith *= ds_cfg.sizeof_rec[qual];
-        trace->ith = (void *)(buffer + size - ith);
-        trace->flags = flags;
-}
-static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
-                      enum ds_qualifier qual, struct task_struct *task,
-                      int cpu, void *base, size_t size, size_t th)
-{
-        struct ds_context *context;
-        int error;
-        size_t req_size;
-        error = -EOPNOTSUPP;
-        if (!ds_cfg.sizeof_rec[qual])
-                goto out;
-        error = -EINVAL;
-        if (!base)
-                goto out;
-        req_size = ds_cfg.sizeof_rec[qual];
-        /* We might need space for alignment adjustments. */
-        if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
-                req_size += DS_ALIGNMENT;
-        error = -EINVAL;
-        if (size < req_size)
-                goto out;
-        if (th != (size_t)-1) {
-                th *= ds_cfg.sizeof_rec[qual];
-                error = -EINVAL;
-                if (size <= th)
-                        goto out;
-        }
-        tracer->buffer = base;
-        tracer->size = size;
-        error = -ENOMEM;
-        context = ds_get_context(task, cpu);
-        if (!context)
-                goto out;
-        tracer->context = context;
-        /*
-         * Defer any tracer-specific initialization work for the context until
-         * context ownership has been clarified.
-         */
-        error = 0;
- out:
-        return error;
-}
-static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
-                                         void *base, size_t size,
-                                         bts_ovfl_callback_t ovfl, size_t th,
-                                         unsigned int flags)
-{
-        struct bts_tracer *tracer;
-        int error;
-        /* Buffer overflow notification is not yet implemented. */
-        error = -EOPNOTSUPP;
-        if (ovfl)
-                goto out;
-        error = get_tracer(task);
-        if (error < 0)
-                goto out;
-        error = -ENOMEM;
-        tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
-        if (!tracer)
-                goto out_put_tracer;
-        tracer->ovfl = ovfl;
-        /* Do some more error checking and acquire a tracing context. */
-        error = ds_request(&tracer->ds, &tracer->trace.ds,
-                           ds_bts, task, cpu, base, size, th);
-        if (error < 0)
-                goto out_tracer;
-        /* Claim the bts part of the tracing context we acquired above. */
-        spin_lock_irq(&ds_lock);
-        error = -EPERM;
-        if (tracer->ds.context->bts_master)
-                goto out_unlock;
-        tracer->ds.context->bts_master = tracer;
-        spin_unlock_irq(&ds_lock);
-        /*
-         * Now that we own the bts part of the context, let's complete the
-         * initialization for that part.
-         */
-        ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
-        ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
-        ds_install_ds_area(tracer->ds.context);
-        tracer->trace.read  = bts_read;
-        tracer->trace.write = bts_write;
-        /* Start tracing. */
-        ds_resume_bts(tracer);
-        return tracer;
- out_unlock:
-        spin_unlock_irq(&ds_lock);
-        ds_put_context(tracer->ds.context);
- out_tracer:
-        kfree(tracer);
- out_put_tracer:
-        put_tracer(task);
- out:
-        return ERR_PTR(error);
-}
-struct bts_tracer *ds_request_bts_task(struct task_struct *task,
-                                       void *base, size_t size,
-                                       bts_ovfl_callback_t ovfl,
-                                       size_t th, unsigned int flags)
-{
-        return ds_request_bts(task, 0, base, size, ovfl, th, flags);
-}
-struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
-                                      bts_ovfl_callback_t ovfl,
-                                      size_t th, unsigned int flags)
-{
-        return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
-}
-static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
-                                           void *base, size_t size,
-                                           pebs_ovfl_callback_t ovfl, size_t th,
-                                           unsigned int flags)
-{
-        struct pebs_tracer *tracer;
-        int error;
-        /* Buffer overflow notification is not yet implemented. */
-        error = -EOPNOTSUPP;
-        if (ovfl)
-                goto out;
-        error = get_tracer(task);
-        if (error < 0)
-                goto out;
-        error = -ENOMEM;
-        tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
-        if (!tracer)
-                goto out_put_tracer;
-        tracer->ovfl = ovfl;
-        /* Do some more error checking and acquire a tracing context. */
-        error = ds_request(&tracer->ds, &tracer->trace.ds,
-                           ds_pebs, task, cpu, base, size, th);
-        if (error < 0)
-                goto out_tracer;
-        /* Claim the pebs part of the tracing context we acquired above. */
-        spin_lock_irq(&ds_lock);
-        error = -EPERM;
-        if (tracer->ds.context->pebs_master)
-                goto out_unlock;
-        tracer->ds.context->pebs_master = tracer;
-        spin_unlock_irq(&ds_lock);
-        /*
-         * Now that we own the pebs part of the context, let's complete the
-         * initialization for that part.
-         */
-        ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
-        ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-        ds_install_ds_area(tracer->ds.context);
-        /* Start tracing. */
-        ds_resume_pebs(tracer);
-        return tracer;
- out_unlock:
-        spin_unlock_irq(&ds_lock);
-        ds_put_context(tracer->ds.context);
- out_tracer:
-        kfree(tracer);
- out_put_tracer:
-        put_tracer(task);
- out:
-        return ERR_PTR(error);
-}
-struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
-                                         void *base, size_t size,
-                                         pebs_ovfl_callback_t ovfl,
-                                         size_t th, unsigned int flags)
-{
-        return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
-}
-struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
-                                        pebs_ovfl_callback_t ovfl,
-                                        size_t th, unsigned int flags)
-{
-        return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
-}
-static void ds_free_bts(struct bts_tracer *tracer)
-{
-        struct task_struct *task;
-        task = tracer->ds.context->task;
-        WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
-        tracer->ds.context->bts_master = NULL;
-        /* Make sure tracing stopped and the tracer is not in use. */
-        if (task && (task != current))
-                wait_task_context_switch(task);
-        ds_put_context(tracer->ds.context);
-        put_tracer(task);
-        kfree(tracer);
-}
-void ds_release_bts(struct bts_tracer *tracer)
-{
-        might_sleep();
-        if (!tracer)
-                return;
-        ds_suspend_bts(tracer);
-        ds_free_bts(tracer);
-}
-int ds_release_bts_noirq(struct bts_tracer *tracer)
-{
-        struct task_struct *task;
-        unsigned long irq;
-        int error;
-        if (!tracer)
-                return 0;
-        task = tracer->ds.context->task;
-        local_irq_save(irq);
-        error = -EPERM;
-        if (!task &&
-            (tracer->ds.context->cpu != smp_processor_id()))
-                goto out;
-        error = -EPERM;
-        if (task && (task != current))
-                goto out;
-        ds_suspend_bts_noirq(tracer);
-        ds_free_bts(tracer);
-        error = 0;
- out:
-        local_irq_restore(irq);
-        return error;
-}
-static void update_task_debugctlmsr(struct task_struct *task,
-                                    unsigned long debugctlmsr)
-{
-        task->thread.debugctlmsr = debugctlmsr;
-        get_cpu();
-        if (task == current)
-                update_debugctlmsr(debugctlmsr);
-        put_cpu();
-}
-void ds_suspend_bts(struct bts_tracer *tracer)
-{
-        struct task_struct *task;
-        unsigned long debugctlmsr;
-        int cpu;
-        if (!tracer)
-                return;
-        tracer->flags = 0;
-        task = tracer->ds.context->task;
-        cpu  = tracer->ds.context->cpu;
-        WARN_ON(!task && irqs_disabled());
-        debugctlmsr = (task ?
-                       task->thread.debugctlmsr :
-                       get_debugctlmsr_on_cpu(cpu));
-        debugctlmsr &= ~BTS_CONTROL;
-        if (task)
-                update_task_debugctlmsr(task, debugctlmsr);
-        else
-                update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-int ds_suspend_bts_noirq(struct bts_tracer *tracer)
-{
-        struct task_struct *task;
-        unsigned long debugctlmsr, irq;
-        int cpu, error = 0;
-        if (!tracer)
-                return 0;
-        tracer->flags = 0;
-        task = tracer->ds.context->task;
-        cpu  = tracer->ds.context->cpu;
-        local_irq_save(irq);
-        error = -EPERM;
-        if (!task && (cpu != smp_processor_id()))
-                goto out;
-        debugctlmsr = (task ?
-                       task->thread.debugctlmsr :
-                       get_debugctlmsr());
-        debugctlmsr &= ~BTS_CONTROL;
-        if (task)
-                update_task_debugctlmsr(task, debugctlmsr);
-        else
-                update_debugctlmsr(debugctlmsr);
-        error = 0;
- out:
-        local_irq_restore(irq);
-        return error;
-}
-static unsigned long ds_bts_control(struct bts_tracer *tracer)
-{
-        unsigned long control;
-        control = ds_cfg.ctl[dsf_bts];
-        if (!(tracer->trace.ds.flags & BTS_KERNEL))
-                control |= ds_cfg.ctl[dsf_bts_kernel];
-        if (!(tracer->trace.ds.flags & BTS_USER))
-                control |= ds_cfg.ctl[dsf_bts_user];
-        return control;
-}
-void ds_resume_bts(struct bts_tracer *tracer)
-{
-        struct task_struct *task;
-        unsigned long debugctlmsr;
-        int cpu;
-        if (!tracer)
-                return;
-        tracer->flags = tracer->trace.ds.flags;
-        task = tracer->ds.context->task;
-        cpu  = tracer->ds.context->cpu;
-        WARN_ON(!task && irqs_disabled());
-        debugctlmsr = (task ?
-                       task->thread.debugctlmsr :
-                       get_debugctlmsr_on_cpu(cpu));
-        debugctlmsr |= ds_bts_control(tracer);
-        if (task)
-                update_task_debugctlmsr(task, debugctlmsr);
-        else
-                update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-int ds_resume_bts_noirq(struct bts_tracer *tracer)
-{
-        struct task_struct *task;
-        unsigned long debugctlmsr, irq;
-        int cpu, error = 0;
-        if (!tracer)
-                return 0;
-        tracer->flags = tracer->trace.ds.flags;
-        task = tracer->ds.context->task;
-        cpu  = tracer->ds.context->cpu;
-        local_irq_save(irq);
-        error = -EPERM;
-        if (!task && (cpu != smp_processor_id()))
-                goto out;
-        debugctlmsr = (task ?
-                       task->thread.debugctlmsr :
-                       get_debugctlmsr());
-        debugctlmsr |= ds_bts_control(tracer);
-        if (task)
-                update_task_debugctlmsr(task, debugctlmsr);
-        else
-                update_debugctlmsr(debugctlmsr);
-        error = 0;
- out:
-        local_irq_restore(irq);
-        return error;
-}
-static void ds_free_pebs(struct pebs_tracer *tracer)
-{
-        struct task_struct *task;
-        task = tracer->ds.context->task;
-        WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
-        tracer->ds.context->pebs_master = NULL;
-        ds_put_context(tracer->ds.context);
-        put_tracer(task);
-        kfree(tracer);
-}
-void ds_release_pebs(struct pebs_tracer *tracer)
-{
-        might_sleep();
-        if (!tracer)
-                return;
-        ds_suspend_pebs(tracer);
-        ds_free_pebs(tracer);
-}
-int ds_release_pebs_noirq(struct pebs_tracer *tracer)
-{
-        struct task_struct *task;
-        unsigned long irq;
-        int error;
-        if (!tracer)
-                return 0;
-        task = tracer->ds.context->task;
-        local_irq_save(irq);
-        error = -EPERM;
-        if (!task &&
-            (tracer->ds.context->cpu != smp_processor_id()))
-                goto out;
-        error = -EPERM;
-        if (task && (task != current))
-                goto out;
-        ds_suspend_pebs_noirq(tracer);
-        ds_free_pebs(tracer);
-        error = 0;
- out:
-        local_irq_restore(irq);
-        return error;
-}
-void ds_suspend_pebs(struct pebs_tracer *tracer)
-{
-}
-int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
-{
-        return 0;
-}
-void ds_resume_pebs(struct pebs_tracer *tracer)
-{
-}
-int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
-{
-        return 0;
-}
-const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
-{
-        if (!tracer)
-                return NULL;
-        ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
-        return &tracer->trace;
-}
-const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
-{
-        if (!tracer)
-                return NULL;
-        ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-        tracer->trace.counters = ds_cfg.nr_counter_reset;
-        memcpy(tracer->trace.counter_reset,
-               tracer->ds.context->ds +
-               (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
-               ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
-        return &tracer->trace;
-}
-int ds_reset_bts(struct bts_tracer *tracer)
-{
-        if (!tracer)
-                return -EINVAL;
-        tracer->trace.ds.top = tracer->trace.ds.begin;
-        ds_set(tracer->ds.context->ds, ds_bts, ds_index,
-               (unsigned long)tracer->trace.ds.top);
-        return 0;
-}
-int ds_reset_pebs(struct pebs_tracer *tracer)
-{
-        if (!tracer)
-                return -EINVAL;
-        tracer->trace.ds.top = tracer->trace.ds.begin;
-        ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
-               (unsigned long)tracer->trace.ds.top);
-        return 0;
-}
-int ds_set_pebs_reset(struct pebs_tracer *tracer,
-                      unsigned int counter, u64 value)
-{
-        if (!tracer)
-                return -EINVAL;
-        if (ds_cfg.nr_counter_reset < counter)
-                return -EINVAL;
-        *(u64 *)(tracer->ds.context->ds +
-                 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
-                 (counter * PEBS_RESET_FIELD_SIZE)) = value;
-        return 0;
-}
-static const struct ds_configuration ds_cfg_netburst = {
-        .name = "Netburst",
-        .ctl[dsf_bts]           = (1 << 2) | (1 << 3),
-        .ctl[dsf_bts_kernel]    = (1 << 5),
-        .ctl[dsf_bts_user]      = (1 << 6),
-        .nr_counter_reset       = 1,
-};
-static const struct ds_configuration ds_cfg_pentium_m = {
-        .name = "Pentium M",
-        .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
-        .nr_counter_reset       = 1,
-};
-static const struct ds_configuration ds_cfg_core2_atom = {
-        .name = "Core 2/Atom",
-        .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
-        .ctl[dsf_bts_kernel]    = (1 << 9),
-        .ctl[dsf_bts_user]      = (1 << 10),
-        .nr_counter_reset       = 1,
-};
-static const struct ds_configuration ds_cfg_core_i7 = {
-        .name = "Core i7",
-        .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
-        .ctl[dsf_bts_kernel]    = (1 << 9),
-        .ctl[dsf_bts_user]      = (1 << 10),
-        .nr_counter_reset       = 4,
-};
-static void
-ds_configure(const struct ds_configuration *cfg,
-             struct cpuinfo_x86 *cpu)
-{
-        unsigned long nr_pebs_fields = 0;
-        printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
-#ifdef __i386__
-        nr_pebs_fields = 10;
-#else
-        nr_pebs_fields = 18;
-#endif
-        /*
-         * Starting with version 2, architectural performance
-         * monitoring supports a format specifier.
-         */
-        if ((cpuid_eax(0xa) & 0xff) > 1) {
-                unsigned long perf_capabilities, format;
-                rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
-                format = (perf_capabilities >> 8) & 0xf;
-                switch (format) {
-                case 0:
-                        nr_pebs_fields = 18;
-                        break;
-                case 1:
-                        nr_pebs_fields = 22;
-                        break;
-                default:
-                        printk(KERN_INFO
-                               "[ds] unknown PEBS format: %lu\n", format);
-                        nr_pebs_fields = 0;
-                        break;
-                }
-        }
-        memset(&ds_cfg, 0, sizeof(ds_cfg));
-        ds_cfg = *cfg;
-        ds_cfg.sizeof_ptr_field =
-                (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
-        ds_cfg.sizeof_rec[ds_bts]  = ds_cfg.sizeof_ptr_field * 3;
-        ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
-        if (!cpu_has(cpu, X86_FEATURE_BTS)) {
-                ds_cfg.sizeof_rec[ds_bts] = 0;
-                printk(KERN_INFO "[ds] bts not available\n");
-        }
-        if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
-                ds_cfg.sizeof_rec[ds_pebs] = 0;
-                printk(KERN_INFO "[ds] pebs not available\n");
-        }
-        printk(KERN_INFO "[ds] sizes: address: %u bit, ",
-               8 * ds_cfg.sizeof_ptr_field);
-        printk("bts/pebs record: %u/%u bytes\n",
-               ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
-        WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
-}
-void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
-{
-        /* Only configure the first cpu. Others are identical. */
-        if (ds_cfg.name)
-                return;
-        switch (c->x86) {
-        case 0x6:
-                switch (c->x86_model) {
-                case 0x9:
-                case 0xd: /* Pentium M */
-                        ds_configure(&ds_cfg_pentium_m, c);
-                        break;
-                case 0xf:
-                case 0x17: /* Core2 */
-                case 0x1c: /* Atom */
-                        ds_configure(&ds_cfg_core2_atom, c);
-                        break;
-                case 0x1a: /* Core i7 */
-                        ds_configure(&ds_cfg_core_i7, c);
-                        break;
-                default:
-                        /* Sorry, don't know about them. */
-                        break;
-                }
-                break;
-        case 0xf:
-                switch (c->x86_model) {
-                case 0x0:
-                case 0x1:
-                case 0x2: /* Netburst */
-                        ds_configure(&ds_cfg_netburst, c);
-                        break;
-                default:
-                        /* Sorry, don't know about them. */
-                        break;
-                }
-                break;
-        default:
-                /* Sorry, don't know about them. */
-                break;
-        }
-}
-static inline void ds_take_timestamp(struct ds_context *context,
-                                     enum bts_qualifier qualifier,
-                                     struct task_struct *task)
-{
-        struct bts_tracer *tracer = context->bts_master;
-        struct bts_struct ts;
-        /* Prevent compilers from reading the tracer pointer twice. */
-        barrier();
-        if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
-                return;
-        memset(&ts, 0, sizeof(ts));
-        ts.qualifier            = qualifier;
-        ts.variant.event.clock  = trace_clock_global();
-        ts.variant.event.pid    = task->pid;
-        bts_write(tracer, &ts);
-}
-/*
- * Change the DS configuration from tracing prev to tracing next.
- */
-void ds_switch_to(struct task_struct *prev, struct task_struct *next)
-{
-        struct ds_context *prev_ctx     = prev->thread.ds_ctx;
-        struct ds_context *next_ctx     = next->thread.ds_ctx;
-        unsigned long debugctlmsr       = next->thread.debugctlmsr;
-        /* Make sure all data is read before we start. */
-        barrier();
-        if (prev_ctx) {
-                update_debugctlmsr(0);
-                ds_take_timestamp(prev_ctx, bts_task_departs, prev);
-        }
-        if (next_ctx) {
-                ds_take_timestamp(next_ctx, bts_task_arrives, next);
-                wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
-        }
-        update_debugctlmsr(debugctlmsr);
-}
-static __init int ds_selftest(void)
-{
-        if (ds_cfg.sizeof_rec[ds_bts]) {
-                int error;
-                error = ds_selftest_bts();
-                if (error) {
-                        WARN(1, "[ds] selftest failed. disabling bts.\n");
-                        ds_cfg.sizeof_rec[ds_bts] = 0;
-                }
-        }
-        if (ds_cfg.sizeof_rec[ds_pebs]) {
-                int error;
-                error = ds_selftest_pebs();
-                if (error) {
-                        WARN(1, "[ds] selftest failed. disabling pebs.\n");
-                        ds_cfg.sizeof_rec[ds_pebs] = 0;
-                }
-        }
-        return 0;
-}
-device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab99..000000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-#include "ds_selftest.h"
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-#include <asm/ds.h>
-#define BUFFER_SIZE             521     /* Intentionally chose an odd size. */
-#define SMALL_BUFFER_SIZE       24      /* A single bts entry. */
-struct ds_selftest_bts_conf {
-        struct bts_tracer *tracer;
-        int error;
-        int (*suspend)(struct bts_tracer *);
-        int (*resume)(struct bts_tracer *);
-};
-static int ds_selftest_bts_consistency(const struct bts_trace *trace)
-{
-        int error = 0;
-        if (!trace) {
-                printk(KERN_CONT "failed to access trace...");
-                /* Bail out. Other tests are pointless. */
-                return -1;
-        }
-        if (!trace->read) {
-                printk(KERN_CONT "bts read not available...");
-                error = -1;
-        }
-        /* Do some sanity checks on the trace configuration. */
-        if (!trace->ds.n) {
-                printk(KERN_CONT "empty bts buffer...");
-                error = -1;
-        }
-        if (!trace->ds.size) {
-                printk(KERN_CONT "bad bts trace setup...");
-                error = -1;
-        }
-        if (trace->ds.end !=
-            (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
-                printk(KERN_CONT "bad bts buffer setup...");
-                error = -1;
-        }
-        /*
-         * We allow top in [begin; end], since its not clear when the
-         * overflow adjustment happens: after the increment or before the
-         * write.
-         */
-        if ((trace->ds.top < trace->ds.begin) ||
-            (trace->ds.end < trace->ds.top)) {
-                printk(KERN_CONT "bts top out of bounds...");
-                error = -1;
-        }
-        return error;
-}
-static int ds_selftest_bts_read(struct bts_tracer *tracer,
-                                const struct bts_trace *trace,
-                                const void *from, const void *to)
-{
-        const unsigned char *at;
-        /*
-         * Check a few things which do not belong to this test.
-         * They should be covered by other tests.
-         */
-        if (!trace)
-                return -1;
-        if (!trace->read)
-                return -1;
-        if (to < from)
-                return -1;
-        if (from < trace->ds.begin)
-                return -1;
-        if (trace->ds.end < to)
-                return -1;
-        if (!trace->ds.size)
-                return -1;
-        /* Now to the test itself. */
-        for (at = from; (void *)at < to; at += trace->ds.size) {
-                struct bts_struct bts;
-                unsigned long index;
-                int error;
-                if (((void *)at - trace->ds.begin) % trace->ds.size) {
-                        printk(KERN_CONT
-                               "read from non-integer index...");
-                        return -1;
-                }
-                index = ((void *)at - trace->ds.begin) / trace->ds.size;
-                memset(&bts, 0, sizeof(bts));
-                error = trace->read(tracer, at, &bts);
-                if (error < 0) {
-                        printk(KERN_CONT
-                               "error reading bts trace at [%lu] (0x%p)...",
-                               index, at);
-                        return error;
-                }
-                switch (bts.qualifier) {
-                case BTS_BRANCH:
-                        break;
-                default:
-                        printk(KERN_CONT
-                               "unexpected bts entry %llu at [%lu] (0x%p)...",
-                               bts.qualifier, index, at);
-                        return -1;
-                }
-        }
-        return 0;
-}
-static void ds_selftest_bts_cpu(void *arg)
-{
-        struct ds_selftest_bts_conf *conf = arg;
-        const struct bts_trace *trace;
-        void *top;
-        if (IS_ERR(conf->tracer)) {
-                conf->error = PTR_ERR(conf->tracer);
-                conf->tracer = NULL;
-                printk(KERN_CONT
-                       "initialization failed (err: %d)...", conf->error);
-                return;
-        }
-        /* We should meanwhile have enough trace. */
-        conf->error = conf->suspend(conf->tracer);
-        if (conf->error < 0)
-                return;
-        /* Let's see if we can access the trace. */
-        trace = ds_read_bts(conf->tracer);
-        conf->error = ds_selftest_bts_consistency(trace);
-        if (conf->error < 0)
-                return;
-        /* If everything went well, we should have a few trace entries. */
-        if (trace->ds.top == trace->ds.begin) {
-                /*
-                 * It is possible but highly unlikely that we got a
-                 * buffer overflow and end up at exactly the same
-                 * position we started from.
-                 * Let's issue a warning, but continue.
-                 */
-                printk(KERN_CONT "no trace/overflow...");
-        }
-        /* Let's try to read the trace we collected. */
-        conf->error =
-                ds_selftest_bts_read(conf->tracer, trace,
-                                     trace->ds.begin, trace->ds.top);
-        if (conf->error < 0)
-                return;
-        /*
-         * Let's read the trace again.
-         * Since we suspended tracing, we should get the same result.
-         */
-        top = trace->ds.top;
-        trace = ds_read_bts(conf->tracer);
-        conf->error = ds_selftest_bts_consistency(trace);
-        if (conf->error < 0)
-                return;
-        if (top != trace->ds.top) {
-                printk(KERN_CONT "suspend not working...");
-                conf->error = -1;
-                return;
-        }
-        /* Let's collect some more trace - see if resume is working. */
-        conf->error = conf->resume(conf->tracer);
-        if (conf->error < 0)
-                return;
-        conf->error = conf->suspend(conf->tracer);
-        if (conf->error < 0)
-                return;
-        trace = ds_read_bts(conf->tracer);
-        conf->error = ds_selftest_bts_consistency(trace);
-        if (conf->error < 0)
-                return;
-        if (trace->ds.top == top) {
-                /*
-                 * It is possible but highly unlikely that we got a
-                 * buffer overflow and end up at exactly the same
-                 * position we started from.
-                 * Let's issue a warning and check the full trace.
-                 */
-                printk(KERN_CONT
-                       "no resume progress/overflow...");
-                conf->error =
-                        ds_selftest_bts_read(conf->tracer, trace,
-                                             trace->ds.begin, trace->ds.end);
-        } else if (trace->ds.top < top) {
-                /*
-                 * We had a buffer overflow - the entire buffer should
-                 * contain trace records.
-                 */
-                conf->error =
-                        ds_selftest_bts_read(conf->tracer, trace,
-                                             trace->ds.begin, trace->ds.end);
-        } else {
-                /*
-                 * It is quite likely that the buffer did not overflow.
-                 * Let's just check the delta trace.
-                 */
-                conf->error =
-                        ds_selftest_bts_read(conf->tracer, trace, top,
-                                             trace->ds.top);
-        }
-        if (conf->error < 0)
-                return;
-        conf->error = 0;
-}
-static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
-{
-        ds_suspend_bts(tracer);
-        return 0;
-}
-static int ds_resume_bts_wrap(struct bts_tracer *tracer)
-{
-        ds_resume_bts(tracer);
-        return 0;
-}
-static void ds_release_bts_noirq_wrap(void *tracer)
-{
-        (void)ds_release_bts_noirq(tracer);
-}
-static int ds_selftest_bts_bad_release_noirq(int cpu,
-                                             struct bts_tracer *tracer)
-{
-        int error = -EPERM;
-        /* Try to release the tracer on the wrong cpu. */
-        get_cpu();
-        if (cpu != smp_processor_id()) {
-                error = ds_release_bts_noirq(tracer);
-                if (error != -EPERM)
-                        printk(KERN_CONT "release on wrong cpu...");
-        }
-        put_cpu();
-        return error ? 0 : -1;
-}
-static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
-{
-        struct bts_tracer *tracer;
-        int error;
-        /* Try to request cpu tracing while task tracing is active. */
-        tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
-                                    (size_t)-1, BTS_KERNEL);
-        error = PTR_ERR(tracer);
-        if (!IS_ERR(tracer)) {
-                ds_release_bts(tracer);
-                error = 0;
-        }
-        if (error != -EPERM)
-                printk(KERN_CONT "cpu/task tracing overlap...");
-        return error ? 0 : -1;
-}
-static int ds_selftest_bts_bad_request_task(void *buffer)
-{
-        struct bts_tracer *tracer;
-        int error;
-        /* Try to request cpu tracing while task tracing is active. */
-        tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
-                                    (size_t)-1, BTS_KERNEL);
-        error = PTR_ERR(tracer);
-        if (!IS_ERR(tracer)) {
-                error = 0;
-                ds_release_bts(tracer);
-        }
-        if (error != -EPERM)
-                printk(KERN_CONT "task/cpu tracing overlap...");
-        return error ? 0 : -1;
-}
-int ds_selftest_bts(void)
-{
-        struct ds_selftest_bts_conf conf;
-        unsigned char buffer[BUFFER_SIZE], *small_buffer;
-        unsigned long irq;
-        int cpu;
-        printk(KERN_INFO "[ds] bts selftest...");
-        conf.error = 0;
-        small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
-        get_online_cpus();
-        for_each_online_cpu(cpu) {
-                conf.suspend = ds_suspend_bts_wrap;
-                conf.resume = ds_resume_bts_wrap;
-                conf.tracer =
-                        ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
-                                           NULL, (size_t)-1, BTS_KERNEL);
-                ds_selftest_bts_cpu(&conf);
-                if (conf.error >= 0)
-                        conf.error = ds_selftest_bts_bad_request_task(buffer);
-                ds_release_bts(conf.tracer);
-                if (conf.error < 0)
-                        goto out;
-                conf.suspend = ds_suspend_bts_noirq;
-                conf.resume = ds_resume_bts_noirq;
-                conf.tracer =
-                        ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
-                                           NULL, (size_t)-1, BTS_KERNEL);
-                smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
-                if (conf.error >= 0) {
-                        conf.error =
-                                ds_selftest_bts_bad_release_noirq(cpu,
-                                                                  conf.tracer);
-                        /* We must not release the tracer twice. */
-                        if (conf.error < 0)
-                                conf.tracer = NULL;
-                }
-                if (conf.error >= 0)
-                        conf.error = ds_selftest_bts_bad_request_task(buffer);
-                smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
-                                         conf.tracer, 1);
-                if (conf.error < 0)
-                        goto out;
-        }
-        conf.suspend = ds_suspend_bts_wrap;
-        conf.resume = ds_resume_bts_wrap;
-        conf.tracer =
-                ds_request_bts_task(current, buffer, BUFFER_SIZE,
-                                    NULL, (size_t)-1, BTS_KERNEL);
-        ds_selftest_bts_cpu(&conf);
-        if (conf.error >= 0)
-                conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
-        ds_release_bts(conf.tracer);
-        if (conf.error < 0)
-                goto out;
-        conf.suspend = ds_suspend_bts_noirq;
-        conf.resume = ds_resume_bts_noirq;
-        conf.tracer =
-                ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
-                                   NULL, (size_t)-1, BTS_KERNEL);
-        local_irq_save(irq);
-        ds_selftest_bts_cpu(&conf);
-        if (conf.error >= 0)
-                conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
-        ds_release_bts_noirq(conf.tracer);
-        local_irq_restore(irq);
-        if (conf.error < 0)
-                goto out;
-        conf.error = 0;
- out:
-        put_online_cpus();
-        printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
-        return conf.error;
-}
-int ds_selftest_pebs(void)
-{
-        return 0;
-}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c6663..000000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-#ifdef CONFIG_X86_DS_SELFTEST
-extern int ds_selftest_bts(void);
-extern int ds_selftest_pebs(void);
-#else
-static inline int ds_selftest_bts(void) { return 0; }
-static inline int ds_selftest_pebs(void) { return 0; }
-#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6d817554780a..c89a386930b7 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -224,11 +224,6 @@ unsigned __kprobes long oops_begin(void)
        int cpu;
        unsigned long flags;
-        /* notify the hw-branch tracer so it may disable tracing and
-           add the last trace to the trace buffer -
-           the earlier this happens, the more useful the trace. */
-        trace_hw_branch_oops();
        oops_enter();
        /* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 44a8e0dc6737..cd49141cf153 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -53,6 +53,7 @@
 #include <asm/processor-flags.h>
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpufeature.h>
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -905,7 +906,25 @@ ENTRY(simd_coprocessor_error)
        RING0_INT_FRAME
        pushl $0
        CFI_ADJUST_CFA_OFFSET 4
+#ifdef CONFIG_X86_INVD_BUG
+        /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
+661:    pushl $do_general_protection
+662:
+.section .altinstructions,"a"
+        .balign 4
+        .long 661b
+        .long 663f
+        .byte X86_FEATURE_XMM
+        .byte 662b-661b
+        .byte 664f-663f
+.previous
+.section .altinstr_replacement,"ax"
+663:    pushl $do_simd_coprocessor_error
+664:
+.previous
+#else
        pushl $do_simd_coprocessor_error
+#endif
        CFI_ADJUST_CFA_OFFSET 4
        jmp error_code
        CFI_ENDPROC
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index d6cc065f519f..a8f1b803d2fd 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -189,25 +189,16 @@ static int get_hbp_len(u8 hbp_len)
 }
 /*
- * Check for virtual address in user space.
- */
-int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
-{
-        unsigned int len;
-        len = get_hbp_len(hbp_len);
-        return (va <= TASK_SIZE - len);
-}
-/*
 * Check for virtual address in kernel space.
 */
-static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+int arch_check_bp_in_kernelspace(struct perf_event *bp)
 {
        unsigned int len;
+        unsigned long va;
+        struct arch_hw_breakpoint *info = counter_arch_bp(bp);
-        len = get_hbp_len(hbp_len);
+        va = info->address;
+        len = get_hbp_len(info->len);
        return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
 }
@@ -300,8 +291,7 @@ static int arch_build_bp_info(struct perf_event *bp)
 /*
 * Validate the arch-specific HW Breakpoint register settings
 */
-int arch_validate_hwbkpt_settings(struct perf_event *bp,
+int arch_validate_hwbkpt_settings(struct perf_event *bp)
-                                  struct task_struct *tsk)
 {
        struct arch_hw_breakpoint *info = counter_arch_bp(bp);
        unsigned int align;
@@ -314,16 +304,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
        ret = -EINVAL;
-        if (info->type == X86_BREAKPOINT_EXECUTE)
-                /*
-                 * Ptrace-refactoring code
-                 * For now, we'll allow instruction breakpoint only for user-space
-                 * addresses
-                 */
-                if ((!arch_check_va_in_userspace(info->address, info->len)) &&
-                        info->len != X86_BREAKPOINT_EXECUTE)
-                        return ret;
        switch (info->len) {
        case X86_BREAKPOINT_LEN_1:
                align = 0;
@@ -350,15 +330,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
        if (info->address & align)
                return -EINVAL;
-        /* Check that the virtual address is in the proper range */
-        if (tsk) {
-                if (!arch_check_va_in_userspace(info->address, info->len))
-                        return -EFAULT;
-        } else {
-                if (!arch_check_va_in_kernelspace(info->address, info->len))
-                        return -EFAULT;
-        }
        return 0;
 }
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 54c31c285488..86cef6b32253 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -102,65 +102,62 @@ void __cpuinit fpu_init(void)
        mxcsr_feature_mask_init();
        /* clean state in init */
-        if (cpu_has_xsave)
+        current_thread_info()->status = 0;
-                current_thread_info()->status = TS_XSAVE;
-        else
-                current_thread_info()->status = 0;
        clear_used_math();
 }
 #endif  /* CONFIG_X86_64 */
-/*
+static void fpu_finit(struct fpu *fpu)
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
- */
-int init_fpu(struct task_struct *tsk)
 {
-        if (tsk_used_math(tsk)) {
-                if (HAVE_HWFP && tsk == current)
-                        unlazy_fpu(tsk);
-                return 0;
-        }
-        /*
-         * Memory allocation at the first usage of the FPU and other state.
-         */
-        if (!tsk->thread.xstate) {
-                tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
-                                                      GFP_KERNEL);
-                if (!tsk->thread.xstate)
-                        return -ENOMEM;
-        }
 #ifdef CONFIG_X86_32
        if (!HAVE_HWFP) {
-                memset(tsk->thread.xstate, 0, xstate_size);
+                finit_soft_fpu(&fpu->state->soft);
-                finit_task(tsk);
+                return;
-                set_stopped_child_used_math(tsk);
-                return 0;
        }
 #endif
        if (cpu_has_fxsr) {
-                struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+                struct i387_fxsave_struct *fx = &fpu->state->fxsave;
                memset(fx, 0, xstate_size);
                fx->cwd = 0x37f;
                if (cpu_has_xmm)
                        fx->mxcsr = MXCSR_DEFAULT;
        } else {
-                struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+                struct i387_fsave_struct *fp = &fpu->state->fsave;
                memset(fp, 0, xstate_size);
                fp->cwd = 0xffff037fu;
                fp->swd = 0xffff0000u;
                fp->twd = 0xffffffffu;
                fp->fos = 0xffff0000u;
        }
+}
+/*
+ * The _current_ task is using the FPU for the first time
+ * so initialize it and set the mxcsr to its default
+ * value at reset if we support XMM instructions and then
+ * remember the current task has used the FPU.
+ */
+int init_fpu(struct task_struct *tsk)
+{
+        int ret;
+        if (tsk_used_math(tsk)) {
+                if (HAVE_HWFP && tsk == current)
+                        unlazy_fpu(tsk);
+                return 0;
+        }
        /*
-         * Only the device not available exception or ptrace can call init_fpu.
+         * Memory allocation at the first usage of the FPU and other state.
         */
+        ret = fpu_alloc(&tsk->thread.fpu);
+        if (ret)
+                return ret;
+        fpu_finit(&tsk->thread.fpu);
        set_stopped_child_used_math(tsk);
        return 0;
 }
@@ -194,7 +191,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
                return ret;
        return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-                                   &target->thread.xstate->fxsave, 0, -1);
+                                   &target->thread.fpu.state->fxsave, 0, -1);
 }
 int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -211,19 +208,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
                return ret;
        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-                                 &target->thread.xstate->fxsave, 0, -1);
+                                 &target->thread.fpu.state->fxsave, 0, -1);
        /*
         * mxcsr reserved bits must be masked to zero for security reasons.
         */
-        target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+        target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
        /*
         * update the header bits in the xsave header, indicating the
         * presence of FP and SSE state.
         */
        if (cpu_has_xsave)
-                target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+                target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
        return ret;
 }
@@ -246,14 +243,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
         * memory layout in the thread struct, so that we can copy the entire
         * xstateregs to the user using one user_regset_copyout().
         */
-        memcpy(&target->thread.xstate->fxsave.sw_reserved,
+        memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
               xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
        /*
         * Copy the xstate memory layout.
         */
        ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-                                  &target->thread.xstate->xsave, 0, -1);
+                                  &target->thread.fpu.state->xsave, 0, -1);
        return ret;
 }
@@ -272,14 +269,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
                return ret;
        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-                                 &target->thread.xstate->xsave, 0, -1);
+                                 &target->thread.fpu.state->xsave, 0, -1);
        /*
         * mxcsr reserved bits must be masked to zero for security reasons.
         */
-        target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+        target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
-        xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
+        xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
        xsave_hdr->xstate_bv &= pcntxt_mask;
        /*
@@ -365,7 +362,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
 static void
 convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
 {
-        struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+        struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
        struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
        struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
        int i;
@@ -405,7 +402,7 @@ static void convert_to_fxsr(struct task_struct *tsk,
                            const struct user_i387_ia32_struct *env)
 {
-        struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+        struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
        struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
        struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
        int i;
@@ -445,7 +442,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
        if (!cpu_has_fxsr) {
                return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-                                           &target->thread.xstate->fsave, 0,
+                                           &target->thread.fpu.state->fsave, 0,
                                           -1);
        }
@@ -475,7 +472,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
        if (!cpu_has_fxsr) {
                return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-                                          &target->thread.xstate->fsave, 0, -1);
+                                          &target->thread.fpu.state->fsave, 0, -1);
        }
        if (pos > 0 || count < sizeof(env))
@@ -490,7 +487,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
         * presence of FP.
         */
        if (cpu_has_xsave)
-                target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
+                target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
        return ret;
 }
@@ -501,7 +498,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
 {
        struct task_struct *tsk = current;
-        struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+        struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
        fp->status = fp->swd;
        if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
@@ -512,7 +509,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
 static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
 {
        struct task_struct *tsk = current;
-        struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+        struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
        struct user_i387_ia32_struct env;
        int err = 0;
@@ -547,7 +544,7 @@ static int save_i387_xsave(void __user *buf)
         * header as well as change any contents in the memory layout.
         * xrestore as part of sigreturn will capture all the changes.
         */
-        tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+        tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
        if (save_i387_fxsave(fx) < 0)
                return -1;
@@ -599,7 +596,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
 {
        struct task_struct *tsk = current;
-        return __copy_from_user(&tsk->thread.xstate->fsave, buf,
+        return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
                                sizeof(struct i387_fsave_struct));
 }
@@ -610,10 +607,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
        struct user_i387_ia32_struct env;
        int err;
-        err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
+        err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
                               size);
        /* mxcsr reserved bits must be masked to zero for security reasons */
-        tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+        tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
        if (err || __copy_from_user(&env, buf, sizeof(env)))
                return 1;
        convert_to_fxsr(tsk, &env);
@@ -629,7 +626,7 @@ static int restore_i387_xsave(void __user *buf)
        struct i387_fxsave_struct __user *fx =
                (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
        struct xsave_hdr_struct *xsave_hdr =
-                                &current->thread.xstate->xsave.xsave_hdr;
+                                &current->thread.fpu.state->xsave.xsave_hdr;
        u64 mask;
        int err;
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 23c167925a5c..2dfd31597443 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -16,7 +16,7 @@
 #include <asm/hpet.h>
 #include <asm/smp.h>
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 /*
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event;
 static void init_pit_timer(enum clock_event_mode mode,
                           struct clock_event_device *evt)
 {
-        spin_lock(&i8253_lock);
+        raw_spin_lock(&i8253_lock);
        switch (mode) {
        case CLOCK_EVT_MODE_PERIODIC:
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode,
                /* Nothing to do here */
                break;
        }
-        spin_unlock(&i8253_lock);
+        raw_spin_unlock(&i8253_lock);
 }
 /*
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode,
 */
 static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
 {
-        spin_lock(&i8253_lock);
+        raw_spin_lock(&i8253_lock);
        outb_pit(delta & 0xff , PIT_CH0);       /* LSB */
        outb_pit(delta >> 8 , PIT_CH0);         /* MSB */
-        spin_unlock(&i8253_lock);
+        raw_spin_unlock(&i8253_lock);
        return 0;
 }
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs)
        int count;
        u32 jifs;
-        spin_lock_irqsave(&i8253_lock, flags);
+        raw_spin_lock_irqsave(&i8253_lock, flags);
        /*
         * Although our caller may have the read side of xtime_lock,
         * this is now a seqlock, and we are cheating in this routine
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs)
        old_count = count;
        old_jifs = jifs;
-        spin_unlock_irqrestore(&i8253_lock, flags);
+        raw_spin_unlock_irqrestore(&i8253_lock, flags);
        count = (LATCH - 1) - count;
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 0ed2d300cd46..990ae7cfc578 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
        outb(0, 0xF0);
        if (ignore_fpu_irq || !boot_cpu_data.hard_math)
                return IRQ_NONE;
-        math_error((void __user *)get_irq_regs()->ip);
+        math_error(get_irq_regs(), 0, 16);
        return IRQ_HANDLED;
 }
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 1658efdfb4e5..345a4b1fe144 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,14 +422,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 static void __kprobes clear_btf(void)
 {
-        if (test_thread_flag(TIF_DEBUGCTLMSR))
+        if (test_thread_flag(TIF_BLOCKSTEP)) {
-                update_debugctlmsr(0);
+                unsigned long debugctl = get_debugctlmsr();
+                debugctl &= ~DEBUGCTLMSR_BTF;
+                update_debugctlmsr(debugctl);
+        }
 }
 static void __kprobes restore_btf(void)
 {
-        if (test_thread_flag(TIF_DEBUGCTLMSR))
+        if (test_thread_flag(TIF_BLOCKSTEP)) {
-                update_debugctlmsr(current->thread.debugctlmsr);
+                unsigned long debugctl = get_debugctlmsr();
+                debugctl |= DEBUGCTLMSR_BTF;
+                update_debugctlmsr(debugctl);
+        }
 }
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index cceb5bc3c3c2..2cd8c544e41a 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size)
        return error;
 }
-static int microcode_open(struct inode *unused1, struct file *unused2)
+static int microcode_open(struct inode *inode, struct file *file)
 {
-        return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+        return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
 }
 static ssize_t microcode_write(struct file *file, const char __user *buf,
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 85a343e28937..356170262a93 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
                                int (*get_ucode_data)(void *, const void *, size_t))
 {
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-        u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+        u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
        int new_rev = uci->cpu_sig.rev;
        unsigned int leftover = size;
        enum ucode_state state = UCODE_OK;
+        unsigned int curr_mc_size = 0;
        while (leftover) {
                struct microcode_header_intel mc_header;
@@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
                        break;
                }
-                mc = vmalloc(mc_size);
+                /* For performance reasons, reuse mc area when possible */
-                if (!mc)
+                if (!mc || mc_size > curr_mc_size) {
-                        break;
+                        if (mc)
+                                vfree(mc);
+                        mc = vmalloc(mc_size);
+                        if (!mc)
+                                break;
+                        curr_mc_size = mc_size;
+                }
                if (get_ucode_data(mc, ucode_ptr, mc_size) ||
                    microcode_sanity_check(mc) < 0) {
@@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
                                vfree(new_mc);
                        new_rev = mc_header.rev;
                        new_mc  = mc;
-                } else
+                        mc = NULL;      /* trigger new vmalloc */
-                        vfree(mc);
+                }
                ucode_ptr += mc_size;
                leftover  -= mc_size;
        }
+        if (mc)
+                vfree(mc);
        if (leftover) {
                if (new_mc)
                        vfree(new_mc);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e81030f71a8f..5ae5d2426edf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
                printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 }
-static int bad_ioapic(unsigned long address)
-{
-        if (nr_ioapics >= MAX_IO_APICS) {
-                printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
-                panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-        }
-        if (!address) {
-                printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-                       " found in table, skipping!\n");
-                return 1;
-        }
-        return 0;
-}
 static void __init MP_ioapic_info(struct mpc_ioapic *m)
 {
        if (!(m->flags & MPC_APIC_USABLE))
@@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
        printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
               m->apicid, m->apicver, m->apicaddr);
-        if (bad_ioapic(m->apicaddr))
+        mp_register_ioapic(m->apicid, m->apicaddr, gsi_end + 1);
-                return;
-        mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
-        mp_ioapics[nr_ioapics].apicid = m->apicid;
-        mp_ioapics[nr_ioapics].type = m->type;
-        mp_ioapics[nr_ioapics].apicver = m->apicver;
-        mp_ioapics[nr_ioapics].flags = m->flags;
-        nr_ioapics++;
 }
 static void print_MP_intsrc_info(struct mpc_intsrc *m)
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 0aad8670858e..e796448f0eb5 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -237,4 +237,9 @@ void __init x86_mrst_early_setup(void)
        x86_init.pci.fixup_irqs = x86_init_noop;
        legacy_pic = &null_legacy_pic;
+        /* Avoid searching for BIOS MP tables */
+        x86_init.mpparse.find_smp_config = x86_init_noop;
+        x86_init.mpparse.get_smp_config = x86_init_uint_noop;
 }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4d8b94..e7e35219b32f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,7 +20,6 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 unsigned long idle_halt;
@@ -32,26 +31,22 @@ struct kmem_cache *task_xstate_cachep;
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
+        int ret;
        *dst = *src;
-        if (src->thread.xstate) {
+        if (fpu_allocated(&src->thread.fpu)) {
-                dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
+                memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
-                                                      GFP_KERNEL);
+                ret = fpu_alloc(&dst->thread.fpu);
-                if (!dst->thread.xstate)
+                if (ret)
-                        return -ENOMEM;
+                        return ret;
-                WARN_ON((unsigned long)dst->thread.xstate & 15);
+                fpu_copy(&dst->thread.fpu, &src->thread.fpu);
-                memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
        }
        return 0;
 }
 void free_thread_xstate(struct task_struct *tsk)
 {
-        if (tsk->thread.xstate) {
+        fpu_free(&tsk->thread.fpu);
-                kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
-                tsk->thread.xstate = NULL;
-        }
-        WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 void free_thread_info(struct thread_info *ti)
@@ -198,11 +193,16 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
        prev = &prev_p->thread;
        next = &next_p->thread;
-        if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+        if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
-            test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+            test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
-                ds_switch_to(prev_p, next_p);
+                unsigned long debugctl = get_debugctlmsr();
-        else if (next->debugctlmsr != prev->debugctlmsr)
-                update_debugctlmsr(next->debugctlmsr);
+                debugctl &= ~DEBUGCTLMSR_BTF;
+                if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
+                        debugctl |= DEBUGCTLMSR_BTF;
+                update_debugctlmsr(debugctl);
+        }
        if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
            test_tsk_thread_flag(next_p, TIF_NOTSC)) {
@@ -546,11 +546,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
                 * check OSVW bit for CPUs that are not affected
                 * by erratum #400
                 */
-                rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
+                if (cpu_has(c, X86_FEATURE_OSVW)) {
-                if (val >= 2) {
+                        rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
-                        rdmsrl(MSR_AMD64_OSVW_STATUS, val);
+                        if (val >= 2) {
-                        if (!(val & BIT(1)))
+                                rdmsrl(MSR_AMD64_OSVW_STATUS, val);
-                                goto no_c1e_idle;
+                                if (!(val & BIT(1)))
+                                        goto no_c1e_idle;
+                        }
                }
                return 1;
        }
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30c..8d128783af47 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,7 +55,6 @@
 #include <asm/cpu.h>
 #include <asm/idle.h>
 #include <asm/syscalls.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -238,13 +237,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
-        clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
-        p->thread.ds_ctx = NULL;
-        clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
-        p->thread.debugctlmsr = 0;
        return err;
 }
@@ -317,7 +309,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        /* we're going to use this soon, after a few expensive things */
        if (preload_fpu)
-                prefetch(next->xstate);
+                prefetch(next->fpu.state);
        /*
         * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 17cb3295cbf7..3c2422a99f1f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,7 +49,6 @@
 #include <asm/ia32.h>
 #include <asm/idle.h>
 #include <asm/syscalls.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 asmlinkage extern void ret_from_fork(void);
@@ -313,13 +312,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
                if (err)
                        goto out;
        }
-        clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
-        p->thread.ds_ctx = NULL;
-        clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
-        p->thread.debugctlmsr = 0;
        err = 0;
 out:
        if (err && p->thread.io_bitmap_ptr) {
@@ -396,7 +388,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        /* we're going to use this soon, after a few expensive things */
        if (preload_fpu)
-                prefetch(next->xstate);
+                prefetch(next->fpu.state);
        /*
         * Reload esp0, LDT and the page table pointer:
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2e9b55027b7e..70c4872cd8aa 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -2,9 +2,6 @@
 /*
 * Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
- *
- * BTS tracing
- *      Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
 */
 #include <linux/kernel.h>
@@ -22,7 +19,6 @@
 #include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/signal.h>
-#include <linux/workqueue.h>
 #include <linux/perf_event.h>
 #include <linux/hw_breakpoint.h>
@@ -36,7 +32,6 @@
 #include <asm/desc.h>
 #include <asm/prctl.h>
 #include <asm/proto.h>
-#include <asm/ds.h>
 #include <asm/hw_breakpoint.h>
 #include "tls.h"
@@ -693,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
        struct perf_event_attr attr;
        if (!t->ptrace_bps[nr]) {
-                hw_breakpoint_init(&attr);
+                ptrace_breakpoint_init(&attr);
                /*
                 * Put stub len and type to register (reserve) an inactive but
                 * correct bp
@@ -789,342 +784,6 @@ static int ioperm_get(struct task_struct *target,
                                   0, IO_BITMAP_BYTES);
 }
-#ifdef CONFIG_X86_PTRACE_BTS
-/*
- * A branch trace store context.
- *
- * Contexts may only be installed by ptrace_bts_config() and only for
- * ptraced tasks.
- *
- * Contexts are destroyed when the tracee is detached from the tracer.
- * The actual destruction work requires interrupts enabled, so the
- * work is deferred and will be scheduled during __ptrace_unlink().
- *
- * Contexts hold an additional task_struct reference on the traced
- * task, as well as a reference on the tracer's mm.
- *
- * Ptrace already holds a task_struct for the duration of ptrace operations,
- * but since destruction is deferred, it may be executed after both
- * tracer and tracee exited.
- */
-struct bts_context {
-        /* The branch trace handle. */
-        struct bts_tracer       *tracer;
-        /* The buffer used to store the branch trace and its size. */
-        void                    *buffer;
-        unsigned int            size;
-        /* The mm that paid for the above buffer. */
-        struct mm_struct        *mm;
-        /* The task this context belongs to. */
-        struct task_struct      *task;
-        /* The signal to send on a bts buffer overflow. */
-        unsigned int            bts_ovfl_signal;
-        /* The work struct to destroy a context. */
-        struct work_struct      work;
-};
-static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
-{
-        void *buffer = NULL;
-        int err = -ENOMEM;
-        err = account_locked_memory(current->mm, current->signal->rlim, size);
-        if (err < 0)
-                return err;
-        buffer = kzalloc(size, GFP_KERNEL);
-        if (!buffer)
-                goto out_refund;
-        context->buffer = buffer;
-        context->size = size;
-        context->mm = get_task_mm(current);
-        return 0;
- out_refund:
-        refund_locked_memory(current->mm, size);
-        return err;
-}
-static inline void free_bts_buffer(struct bts_context *context)
-{
-        if (!context->buffer)
-                return;
-        kfree(context->buffer);
-        context->buffer = NULL;
-        refund_locked_memory(context->mm, context->size);
-        context->size = 0;
-        mmput(context->mm);
-        context->mm = NULL;
-}
-static void free_bts_context_work(struct work_struct *w)
-{
-        struct bts_context *context;
-        context = container_of(w, struct bts_context, work);
-        ds_release_bts(context->tracer);
-        put_task_struct(context->task);
-        free_bts_buffer(context);
-        kfree(context);
-}
-static inline void free_bts_context(struct bts_context *context)
-{
-        INIT_WORK(&context->work, free_bts_context_work);
-        schedule_work(&context->work);
-}
-static inline struct bts_context *alloc_bts_context(struct task_struct *task)
-{
-        struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
-        if (context) {
-                context->task = task;
-                task->bts = context;
-                get_task_struct(task);
-        }
-        return context;
-}
-static int ptrace_bts_read_record(struct task_struct *child, size_t index,
-                                  struct bts_struct __user *out)
-{
-        struct bts_context *context;
-        const struct bts_trace *trace;
-        struct bts_struct bts;
-        const unsigned char *at;
-        int error;
-        context = child->bts;
-        if (!context)
-                return -ESRCH;
-        trace = ds_read_bts(context->tracer);
-        if (!trace)
-                return -ESRCH;
-        at = trace->ds.top - ((index + 1) * trace->ds.size);
-        if ((void *)at < trace->ds.begin)
-                at += (trace->ds.n * trace->ds.size);
-        if (!trace->read)
-                return -EOPNOTSUPP;
-        error = trace->read(context->tracer, at, &bts);
-        if (error < 0)
-                return error;
-        if (copy_to_user(out, &bts, sizeof(bts)))
-                return -EFAULT;
-        return sizeof(bts);
-}
-static int ptrace_bts_drain(struct task_struct *child,
-                            long size,
-                            struct bts_struct __user *out)
-{
-        struct bts_context *context;
-        const struct bts_trace *trace;
-        const unsigned char *at;
-        int error, drained = 0;
-        context = child->bts;
-        if (!context)
-                return -ESRCH;
-        trace = ds_read_bts(context->tracer);
-        if (!trace)
-                return -ESRCH;
-        if (!trace->read)
-                return -EOPNOTSUPP;
-        if (size < (trace->ds.top - trace->ds.begin))
-                return -EIO;
-        for (at = trace->ds.begin; (void *)at < trace->ds.top;
-             out++, drained++, at += trace->ds.size) {
-                struct bts_struct bts;
-                error = trace->read(context->tracer, at, &bts);
-                if (error < 0)
-                        return error;
-                if (copy_to_user(out, &bts, sizeof(bts)))
-                        return -EFAULT;
-        }
-        memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-        error = ds_reset_bts(context->tracer);
-        if (error < 0)
-                return error;
-        return drained;
-}
-static int ptrace_bts_config(struct task_struct *child,
-                             long cfg_size,
-                             const struct ptrace_bts_config __user *ucfg)
-{
-        struct bts_context *context;
-        struct ptrace_bts_config cfg;
-        unsigned int flags = 0;
-        if (cfg_size < sizeof(cfg))
-                return -EIO;
-        if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
-                return -EFAULT;
-        context = child->bts;
-        if (!context)
-                context = alloc_bts_context(child);
-        if (!context)
-                return -ENOMEM;
-        if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
-                if (!cfg.signal)
-                        return -EINVAL;
-                return -EOPNOTSUPP;
-                context->bts_ovfl_signal = cfg.signal;
-        }
-        ds_release_bts(context->tracer);
-        context->tracer = NULL;
-        if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
-                int err;
-                free_bts_buffer(context);
-                if (!cfg.size)
-                        return 0;
-                err = alloc_bts_buffer(context, cfg.size);
-                if (err < 0)
-                        return err;
-        }
-        if (cfg.flags & PTRACE_BTS_O_TRACE)
-                flags |= BTS_USER;
-        if (cfg.flags & PTRACE_BTS_O_SCHED)
-                flags |= BTS_TIMESTAMPS;
-        context->tracer =
-                ds_request_bts_task(child, context->buffer, context->size,
-                                    NULL, (size_t)-1, flags);
-        if (unlikely(IS_ERR(context->tracer))) {
-                int error = PTR_ERR(context->tracer);
-                free_bts_buffer(context);
-                context->tracer = NULL;
-                return error;
-        }
-        return sizeof(cfg);
-}
-static int ptrace_bts_status(struct task_struct *child,
-                             long cfg_size,
-                             struct ptrace_bts_config __user *ucfg)
-{
-        struct bts_context *context;
-        const struct bts_trace *trace;
-        struct ptrace_bts_config cfg;
-        context = child->bts;
-        if (!context)
-                return -ESRCH;
-        if (cfg_size < sizeof(cfg))
-                return -EIO;
-        trace = ds_read_bts(context->tracer);
-        if (!trace)
-                return -ESRCH;
-        memset(&cfg, 0, sizeof(cfg));
-        cfg.size        = trace->ds.end - trace->ds.begin;
-        cfg.signal      = context->bts_ovfl_signal;
-        cfg.bts_size    = sizeof(struct bts_struct);
-        if (cfg.signal)
-                cfg.flags |= PTRACE_BTS_O_SIGNAL;
-        if (trace->ds.flags & BTS_USER)
-                cfg.flags |= PTRACE_BTS_O_TRACE;
-        if (trace->ds.flags & BTS_TIMESTAMPS)
-                cfg.flags |= PTRACE_BTS_O_SCHED;
-        if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
-                return -EFAULT;
-        return sizeof(cfg);
-}
-static int ptrace_bts_clear(struct task_struct *child)
-{
-        struct bts_context *context;
-        const struct bts_trace *trace;
-        context = child->bts;
-        if (!context)
-                return -ESRCH;
-        trace = ds_read_bts(context->tracer);
-        if (!trace)
-                return -ESRCH;
-        memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-        return ds_reset_bts(context->tracer);
-}
-static int ptrace_bts_size(struct task_struct *child)
-{
-        struct bts_context *context;
-        const struct bts_trace *trace;
-        context = child->bts;
-        if (!context)
-                return -ESRCH;
-        trace = ds_read_bts(context->tracer);
-        if (!trace)
-                return -ESRCH;
-        return (trace->ds.top - trace->ds.begin) / trace->ds.size;
-}
-/*
- * Called from __ptrace_unlink() after the child has been moved back
- * to its original parent.
- */
-void ptrace_bts_untrace(struct task_struct *child)
-{
-        if (unlikely(child->bts)) {
-                free_bts_context(child->bts);
-                child->bts = NULL;
-        }
-}
-#endif /* CONFIG_X86_PTRACE_BTS */
 /*
 * Called by kernel/ptrace.c when detaching..
 *
@@ -1252,39 +911,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
                break;
 #endif
-        /*
-         * These bits need more cooking - not enabled yet:
-         */
-#ifdef CONFIG_X86_PTRACE_BTS
-        case PTRACE_BTS_CONFIG:
-                ret = ptrace_bts_config
-                        (child, data, (struct ptrace_bts_config __user *)addr);
-                break;
-        case PTRACE_BTS_STATUS:
-                ret = ptrace_bts_status
-                        (child, data, (struct ptrace_bts_config __user *)addr);
-                break;
-        case PTRACE_BTS_SIZE:
-                ret = ptrace_bts_size(child);
-                break;
-        case PTRACE_BTS_GET:
-                ret = ptrace_bts_read_record
-                        (child, data, (struct bts_struct __user *) addr);
-                break;
-        case PTRACE_BTS_CLEAR:
-                ret = ptrace_bts_clear(child);
-                break;
-        case PTRACE_BTS_DRAIN:
-                ret = ptrace_bts_drain
-                        (child, data, (struct bts_struct __user *) addr);
-                break;
-#endif /* CONFIG_X86_PTRACE_BTS */
        default:
                ret = ptrace_request(child, request, addr, data);
                break;
@@ -1544,14 +1170,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
        case PTRACE_GET_THREAD_AREA:
        case PTRACE_SET_THREAD_AREA:
-#ifdef CONFIG_X86_PTRACE_BTS
-        case PTRACE_BTS_CONFIG:
-        case PTRACE_BTS_STATUS:
-        case PTRACE_BTS_SIZE:
-        case PTRACE_BTS_GET:
-        case PTRACE_BTS_CLEAR:
-        case PTRACE_BTS_DRAIN:
-#endif /* CONFIG_X86_PTRACE_BTS */
                return arch_ptrace(child, request, addr, data);
        default:
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index 34e099382651..7ded57896c0a 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
 #endif /* CONFIG_X86_LOCAL_APIC */
 #ifdef CONFIG_X86_IO_APIC
-static u32 gsi_base;
 static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 {
@@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
        pentry = (struct sfi_apic_table_entry *)sb->pentry;
        for (i = 0; i < num; i++) {
-                mp_register_ioapic(i, pentry->phys_addr, gsi_base);
+                mp_register_ioapic(i, pentry->phys_addr, gsi_end + 1);
-                gsi_base += io_apic_get_redir_entries(i);
                pentry++;
        }
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 3149032ff107..58de45ee08b6 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -158,22 +158,6 @@ static int enable_single_step(struct task_struct *child)
 }
 /*
- * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
- */
-static void write_debugctlmsr(struct task_struct *child, unsigned long val)
-{
-        if (child->thread.debugctlmsr == val)
-                return;
-        child->thread.debugctlmsr = val;
-        if (child != current)
-                return;
-        update_debugctlmsr(val);
-}
-/*
 * Enable single or block step.
 */
 static void enable_step(struct task_struct *child, bool block)
@@ -186,15 +170,17 @@ static void enable_step(struct task_struct *child, bool block)
         * that uses user-mode single stepping itself.
         */
        if (enable_single_step(child) && block) {
-                set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+                unsigned long debugctl = get_debugctlmsr();
-                write_debugctlmsr(child,
-                                  child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
+                debugctl |= DEBUGCTLMSR_BTF;
-        } else {
+                update_debugctlmsr(debugctl);
-                write_debugctlmsr(child,
+                set_tsk_thread_flag(child, TIF_BLOCKSTEP);
-                                  child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
+        } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
+                unsigned long debugctl = get_debugctlmsr();
-                if (!child->thread.debugctlmsr)
-                        clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+                debugctl &= ~DEBUGCTLMSR_BTF;
+                update_debugctlmsr(debugctl);
+                clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
        }
 }
@@ -213,11 +199,13 @@ void user_disable_single_step(struct task_struct *child)
        /*
         * Make sure block stepping (BTF) is disabled.
         */
-        write_debugctlmsr(child,
+        if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
-                          child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
+                unsigned long debugctl = get_debugctlmsr();
-        if (!child->thread.debugctlmsr)
+                debugctl &= ~DEBUGCTLMSR_BTF;
-                clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+                update_debugctlmsr(debugctl);
+                clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
+        }
        /* Always clear TIF_SINGLESTEP... */
        clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91b48ae..cc2c60474fd0 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -175,6 +175,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
        struct tboot_mac_region *mr;
        phys_addr_t end = start + size;
+        if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
+                panic("tboot: Too many MAC regions\n");
        if (start && size) {
                mr = &tboot->mac_regions[tboot->num_mac_regions++];
                mr->start = round_down(start, PAGE_SIZE);
@@ -184,18 +187,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
 static int tboot_setup_sleep(void)
 {
+        int i;
        tboot->num_mac_regions = 0;
-        /* S3 resume code */
+        for (i = 0; i < e820.nr_map; i++) {
-        add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
+                if ((e820.map[i].type != E820_RAM)
+                 && (e820.map[i].type != E820_RESERVED_KERN))
+                        continue;
-#ifdef CONFIG_X86_TRAMPOLINE
+                add_mac_region(e820.map[i].addr, e820.map[i].size);
-        /* AP trampoline code */
+        }
-        add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
-#endif
-        /* kernel code + data + bss */
-        add_mac_region(virt_to_phys(_text), _end - _text);
        tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 17b03dd3a6b5..7fea555929e2 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1,7 +1,7 @@
 /*
 *      SGI UltraViolet TLB flush routines.
 *
- *      (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
+ *      (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
 *
 *      This code is released under the GNU General Public License version 2 or
 *      later.
@@ -20,42 +20,67 @@
 #include <asm/idle.h>
 #include <asm/tsc.h>
 #include <asm/irq_vectors.h>
+#include <asm/timer.h>
-static struct bau_control       **uv_bau_table_bases __read_mostly;
+struct msg_desc {
-static int                      uv_bau_retry_limit __read_mostly;
+        struct bau_payload_queue_entry *msg;
+        int msg_slot;
+        int sw_ack_slot;
+        struct bau_payload_queue_entry *va_queue_first;
+        struct bau_payload_queue_entry *va_queue_last;
+};
-/* base pnode in this partition */
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
-static int                      uv_partition_base_pnode __read_mostly;
+static int uv_bau_max_concurrent __read_mostly;
+static int nobau;
+static int __init setup_nobau(char *arg)
+{
+        nobau = 1;
+        return 0;
+}
+early_param("nobau", setup_nobau);
-static unsigned long            uv_mmask __read_mostly;
+/* base pnode in this partition */
+static int uv_partition_base_pnode __read_mostly;
+/* position of pnode (which is nasid>>1): */
+static int uv_nshift __read_mostly;
+static unsigned long uv_mmask __read_mostly;
 static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
 static DEFINE_PER_CPU(struct bau_control, bau_control);
+static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
+struct reset_args {
+        int sender;
+};
 /*
- * Determine the first node on a blade.
+ * Determine the first node on a uvhub. 'Nodes' are used for kernel
+ * memory allocation.
 */
-static int __init blade_to_first_node(int blade)
+static int __init uvhub_to_first_node(int uvhub)
 {
        int node, b;
        for_each_online_node(node) {
                b = uv_node_to_blade_id(node);
-                if (blade == b)
+                if (uvhub == b)
                        return node;
        }
-        return -1; /* shouldn't happen */
+        return -1;
 }
 /*
- * Determine the apicid of the first cpu on a blade.
+ * Determine the apicid of the first cpu on a uvhub.
 */
-static int __init blade_to_first_apicid(int blade)
+static int __init uvhub_to_first_apicid(int uvhub)
 {
        int cpu;
        for_each_present_cpu(cpu)
-                if (blade == uv_cpu_to_blade_id(cpu))
+                if (uvhub == uv_cpu_to_blade_id(cpu))
                        return per_cpu(x86_cpu_to_apicid, cpu);
        return -1;
 }
@@ -68,195 +93,459 @@ static int __init blade_to_first_apicid(int blade)
 * clear of the Timeout bit (as well) will free the resource. No reply will
 * be sent (the hardware will only do one reply per message).
 */
-static void uv_reply_to_message(int resource,
+static inline void uv_reply_to_message(struct msg_desc *mdp,
-                                struct bau_payload_queue_entry *msg,
+                                       struct bau_control *bcp)
-                                struct bau_msg_status *msp)
 {
        unsigned long dw;
+        struct bau_payload_queue_entry *msg;
-        dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
+        msg = mdp->msg;
+        if (!msg->canceled) {
+                dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
+                                                msg->sw_ack_vector;
+                uv_write_local_mmr(
+                                UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
+        }
        msg->replied_to = 1;
        msg->sw_ack_vector = 0;
-        if (msp)
-                msp->seen_by.bits = 0;
-        uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
 }
 /*
- * Do all the things a cpu should do for a TLB shootdown message.
+ * Process the receipt of a RETRY message
- * Other cpu's may come here at the same time for this message.
 */
-static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
+static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
-                                   int msg_slot, int sw_ack_slot)
+                                            struct bau_control *bcp)
 {
-        unsigned long this_cpu_mask;
+        int i;
-        struct bau_msg_status *msp;
+        int cancel_count = 0;
-        int cpu;
+        int slot2;
+        unsigned long msg_res;
+        unsigned long mmr = 0;
+        struct bau_payload_queue_entry *msg;
+        struct bau_payload_queue_entry *msg2;
+        struct ptc_stats *stat;
-        msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
+        msg = mdp->msg;
-        cpu = uv_blade_processor_id();
+        stat = &per_cpu(ptcstats, bcp->cpu);
-        msg->number_of_cpus =
+        stat->d_retries++;
-                uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
+        /*
-        this_cpu_mask = 1UL << cpu;
+         * cancel any message from msg+1 to the retry itself
-        if (msp->seen_by.bits & this_cpu_mask)
+         */
-                return;
+        for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
-        atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
+                if (msg2 > mdp->va_queue_last)
+                        msg2 = mdp->va_queue_first;
+                if (msg2 == msg)
+                        break;
+                /* same conditions for cancellation as uv_do_reset */
+                if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
+                    (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
+                        msg->sw_ack_vector) == 0) &&
+                    (msg2->sending_cpu == msg->sending_cpu) &&
+                    (msg2->msg_type != MSG_NOOP)) {
+                        slot2 = msg2 - mdp->va_queue_first;
+                        mmr = uv_read_local_mmr
+                                (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+                        msg_res = ((msg2->sw_ack_vector << 8) |
+                                   msg2->sw_ack_vector);
+                        /*
+                         * This is a message retry; clear the resources held
+                         * by the previous message only if they timed out.
+                         * If it has not timed out we have an unexpected
+                         * situation to report.
+                         */
+                        if (mmr & (msg_res << 8)) {
+                                /*
+                                 * is the resource timed out?
+                                 * make everyone ignore the cancelled message.
+                                 */
+                                msg2->canceled = 1;
+                                stat->d_canceled++;
+                                cancel_count++;
+                                uv_write_local_mmr(
+                                    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+                                        (msg_res << 8) | msg_res);
+                        } else
+                                printk(KERN_INFO "note bau retry: no effect\n");
+                }
+        }
+        if (!cancel_count)
+                stat->d_nocanceled++;
+}
-        if (msg->replied_to == 1)
+/*
-                return;
+ * Do all the things a cpu should do for a TLB shootdown message.
+ * Other cpus may come here at the same time for this message.
+ */
+static void uv_bau_process_message(struct msg_desc *mdp,
+                                   struct bau_control *bcp)
+{
+        int msg_ack_count;
+        short socket_ack_count = 0;
+        struct ptc_stats *stat;
+        struct bau_payload_queue_entry *msg;
+        struct bau_control *smaster = bcp->socket_master;
+        /*
+         * This must be a normal message, or retry of a normal message
+         */
+        msg = mdp->msg;
+        stat = &per_cpu(ptcstats, bcp->cpu);
        if (msg->address == TLB_FLUSH_ALL) {
                local_flush_tlb();
-                __get_cpu_var(ptcstats).alltlb++;
+                stat->d_alltlb++;
        } else {
                __flush_tlb_one(msg->address);
-                __get_cpu_var(ptcstats).onetlb++;
+                stat->d_onetlb++;
        }
+        stat->d_requestee++;
+        /*
+         * One cpu on each uvhub has the additional job on a RETRY
+         * of releasing the resource held by the message that is
+         * being retried.  That message is identified by sending
+         * cpu number.
+         */
+        if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
+                uv_bau_process_retry_msg(mdp, bcp);
-        __get_cpu_var(ptcstats).requestee++;
+        /*
+         * This is a sw_ack message, so we have to reply to it.
+         * Count each responding cpu on the socket. This avoids
+         * pinging the count's cache line back and forth between
+         * the sockets.
+         */
+        socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
+                        &smaster->socket_acknowledge_count[mdp->msg_slot]);
+        if (socket_ack_count == bcp->cpus_in_socket) {
+                /*
+                 * Both sockets dump their completed count total into
+                 * the message's count.
+                 */
+                smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
+                msg_ack_count = atomic_add_short_return(socket_ack_count,
+                                (struct atomic_short *)&msg->acknowledge_count);
+                if (msg_ack_count == bcp->cpus_in_uvhub) {
+                        /*
+                         * All cpus in uvhub saw it; reply
+                         */
+                        uv_reply_to_message(mdp, bcp);
+                }
+        }
-        atomic_inc_short(&msg->acknowledge_count);
+        return;
-        if (msg->number_of_cpus == msg->acknowledge_count)
-                uv_reply_to_message(sw_ack_slot, msg, msp);
 }
 /*
- * Examine the payload queue on one distribution node to see
+ * Determine the first cpu on a uvhub.
- * which messages have not been seen, and which cpu(s) have not seen them.
+ */
+static int uvhub_to_first_cpu(int uvhub)
+{
+        int cpu;
+        for_each_present_cpu(cpu)
+                if (uvhub == uv_cpu_to_blade_id(cpu))
+                        return cpu;
+        return -1;
+}
+/*
+ * Last resort when we get a large number of destination timeouts is
+ * to clear resources held by a given cpu.
+ * Do this with IPI so that all messages in the BAU message queue
+ * can be identified by their nonzero sw_ack_vector field.
 *
- * Returns the number of cpu's that have not responded.
+ * This is entered for a single cpu on the uvhub.
+ * The sender wants this uvhub to free a specific message's
+ * sw_ack resources.
 */
-static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
+static void
+uv_do_reset(void *ptr)
 {
-        struct bau_payload_queue_entry *msg;
-        struct bau_msg_status *msp;
-        int count = 0;
        int i;
-        int j;
+        int slot;
+        int count = 0;
+        unsigned long mmr;
+        unsigned long msg_res;
+        struct bau_control *bcp;
+        struct reset_args *rap;
+        struct bau_payload_queue_entry *msg;
+        struct ptc_stats *stat;
-        for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
+        bcp = &per_cpu(bau_control, smp_processor_id());
-             msg++, i++) {
+        rap = (struct reset_args *)ptr;
-                if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
+        stat = &per_cpu(ptcstats, bcp->cpu);
-                        msp = bau_tablesp->msg_statuses + i;
+        stat->d_resets++;
-                        printk(KERN_DEBUG
-                               "blade %d: address:%#lx %d of %d, not cpu(s): ",
+        /*
-                               i, msg->address, msg->acknowledge_count,
+         * We're looking for the given sender, and
-                               msg->number_of_cpus);
+         * will free its sw_ack resource.
-                        for (j = 0; j < msg->number_of_cpus; j++) {
+         * If all cpus finally responded after the timeout, its
-                                if (!((1L << j) & msp->seen_by.bits)) {
+         * message 'replied_to' was set.
-                                        count++;
+         */
-                                        printk("%d ", j);
+        for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
-                                }
+                /* uv_do_reset: same conditions for cancellation as
+                   uv_bau_process_retry_msg() */
+                if ((msg->replied_to == 0) &&
+                    (msg->canceled == 0) &&
+                    (msg->sending_cpu == rap->sender) &&
+                    (msg->sw_ack_vector) &&
+                    (msg->msg_type != MSG_NOOP)) {
+                        /*
+                         * make everyone else ignore this message
+                         */
+                        msg->canceled = 1;
+                        slot = msg - bcp->va_queue_first;
+                        count++;
+                        /*
+                         * only reset the resource if it is still pending
+                         */
+                        mmr = uv_read_local_mmr
+                                        (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+                        msg_res = ((msg->sw_ack_vector << 8) |
+                                                   msg->sw_ack_vector);
+                        if (mmr & msg_res) {
+                                stat->d_rcanceled++;
+                                uv_write_local_mmr(
+                                    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+                                                        msg_res);
                        }
-                        printk("\n");
                }
        }
-        return count;
+        return;
 }
 /*
- * Examine the payload queue on all the distribution nodes to see
+ * Use IPI to get all target uvhubs to release resources held by
- * which messages have not been seen, and which cpu(s) have not seen them.
+ * a given sending cpu number.
- *
- * Returns the number of cpu's that have not responded.
 */
-static int uv_examine_destinations(struct bau_target_nodemask *distribution)
+static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
+                              int sender)
 {
-        int sender;
+        int uvhub;
-        int i;
+        int cpu;
-        int count = 0;
+        cpumask_t mask;
+        struct reset_args reset_args;
+        /* the sender's cpu number is the argument passed to uv_do_reset() */
+        reset_args.sender = sender;
-        sender = smp_processor_id();
+        cpus_clear(mask);
-        for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
+        /*
+         * find a single cpu for each uvhub in this distribution mask;
+         * every bit position of the uvhubmask is examined
+         */
-                if (!bau_node_isset(i, distribution))
+        for (uvhub = 0;
+                    uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
+                    uvhub++) {
+                if (!bau_uvhub_isset(uvhub, distribution))
                         continue;
-                count += uv_examine_destination(uv_bau_table_bases[i], sender);
+                /* find a cpu for this uvhub and add it to the IPI mask */
+                cpu = uvhub_to_first_cpu(uvhub);
+                cpu_set(cpu, mask);
         }
-        return count;
+        /*
+         * IPI the cpus collected in mask (one per targeted uvhub), running
+         * uv_do_reset() on each; Preemption is already disabled
+         */
+        smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
+        return;
+}
+/*
+ * Convert a cycle count to microseconds, scaling with the cyc2ns value
+ * of the cpu this runs on.
+ */
+static inline unsigned long
+cycles_2_us(unsigned long long cyc)
+{
+        unsigned long long nsec;
+        /* cycles -> nanoseconds: multiply, then drop the scale factor */
+        nsec = cyc * per_cpu(cyc2ns, smp_processor_id());
+        nsec >>= CYC2NS_SCALE_FACTOR;
+        /* nanoseconds -> microseconds; narrowed to unsigned long on return */
+        return nsec / 1000;
 }
 /*
- * wait for completion of a broadcast message
+ * wait for all cpus on this hub to finish their sends and go quiet
- *
+ * leaves uvhub_quiesce set so that no new broadcasts are started by
- * return COMPLETE, RETRY or GIVEUP
+ * uv_flush_send_and_wait()
+ */
+static inline void
+quiesce_local_uvhub(struct bau_control *hmaster)
+{
+        struct atomic_short *quiesce_count;
+        /* atomically add one to the hub master's uvhub_quiesce count */
+        quiesce_count = (struct atomic_short *)&hmaster->uvhub_quiesce;
+        atomic_add_short_return(1, quiesce_count);
+}
+/*
+ * mark this quiet-requestor as done
+ */
+static inline void
+end_uvhub_quiesce(struct bau_control *hmaster)
+{
+        struct atomic_short *quiesce_count;
+        /* atomically subtract one from the hub master's uvhub_quiesce count */
+        quiesce_count = (struct atomic_short *)&hmaster->uvhub_quiesce;
+        atomic_add_short_return(-1, quiesce_count);
+}
+/*
+ * Wait for completion of a broadcast software ack message
+ * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
+ *
+ * Polls the status field read at bit position 'right_shift' of the local
+ * MMR 'mmr_offset' until it is DESC_STATUS_IDLE:
+ *  - source timeout: counted in s_stimeout, returns FLUSH_GIVEUP
+ *  - destination timeout: returns FLUSH_RETRY_PLUGGED when fewer than
+ *    BIOS_TO microseconds have passed since bcp->send_message, otherwise
+ *    FLUSH_RETRY_TIMEOUT
+ *  - still busy once bcp->timeout_interval cycles have passed: the status
+ *    field is cleared by hand and FLUSH_GIVEUP is returned
+ * bau_desc, smaster and try are accepted but not used in this function.
 */
 static int uv_wait_completion(struct bau_desc *bau_desc,
-                              unsigned long mmr_offset, int right_shift)
+        unsigned long mmr_offset, int right_shift, int this_cpu,
+        struct bau_control *bcp, struct bau_control *smaster, long try)
 {
-        int exams = 0;
+        int relaxes = 0;
-        long destination_timeouts = 0;
-        long source_timeouts = 0;
        unsigned long descriptor_status;
+        unsigned long mmr;
+        unsigned long mask;
+        cycles_t ttime;
+        cycles_t timeout_time;
+        struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
+        struct bau_control *hmaster;
+        hmaster = bcp->uvhub_master;
+        timeout_time = get_cycles() + bcp->timeout_interval;
+        /* spin on the status MMR, waiting for it to go idle */
        while ((descriptor_status = (((unsigned long)
                uv_read_local_mmr(mmr_offset) >>
                        right_shift) & UV_ACT_STATUS_MASK)) !=
                        DESC_STATUS_IDLE) {
-                if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
-                        source_timeouts++;
-                        if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
-                                source_timeouts = 0;
-                        __get_cpu_var(ptcstats).s_retry++;
-                        return FLUSH_RETRY;
-                }
                /*
-                 * spin here looking for progress at the destinations
+                 * Our software ack messages may be blocked because there are
+                 * no swack resources available.  As long as none of them
+                 * has timed out hardware will NACK our message and its
+                 * state will stay IDLE.
                 */
-                if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
+                if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
-                        destination_timeouts++;
+                        stat->s_stimeout++;
-                        if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
+                        return FLUSH_GIVEUP;
-                                /*
+                } else if (descriptor_status ==
-                                 * returns number of cpus not responding
+                                        DESC_STATUS_DESTINATION_TIMEOUT) {
-                                 */
+                        stat->s_dtimeout++;
-                                if (uv_examine_destinations
+                        ttime = get_cycles();
-                                    (&bau_desc->distribution) == 0) {
-                                        __get_cpu_var(ptcstats).d_retry++;
+                        /*
-                                        return FLUSH_RETRY;
+                         * Our retries may be blocked by all destination
-                                }
+                         * swack resources being consumed, and a timeout
-                                exams++;
+                         * pending.  In that case hardware returns the
-                                if (exams >= uv_bau_retry_limit) {
+                         * ERROR that looks like a destination timeout.
-                                        printk(KERN_DEBUG
+                         */
-                                               "uv_flush_tlb_others");
+                        if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
-                                        printk("giving up on cpu %d\n",
+                                bcp->conseccompletes = 0;
-                                               smp_processor_id());
+                                return FLUSH_RETRY_PLUGGED;
+                        }
+                        bcp->conseccompletes = 0;
+                        return FLUSH_RETRY_TIMEOUT;
+                } else {
+                        /*
+                         * descriptor_status is still BUSY
+                         */
+                        cpu_relax();
+                        relaxes++;
+                        if (relaxes >= 10000) {
+                                relaxes = 0;
+                                if (get_cycles() > timeout_time) {
+                                        quiesce_local_uvhub(hmaster);
+                                        /* single-thread the register change */
+                                        spin_lock(&hmaster->masks_lock);
+                                        mmr = uv_read_local_mmr(mmr_offset);
+                                        /*
+                                         * clear the two bits at right_shift,
+                                         * the position the loop condition
+                                         * reads the status from (shift, not
+                                         * a '<' comparison)
+                                         */
+                                        mask = 0UL;
+                                        mask |= (3UL << right_shift);
+                                        mask = ~mask;
+                                        mmr &= mask;
+                                        uv_write_local_mmr(mmr_offset, mmr);
+                                        spin_unlock(&hmaster->masks_lock);
+                                        end_uvhub_quiesce(hmaster);
+                                        stat->s_busy++;
                                        return FLUSH_GIVEUP;
                                }
-                                /*
-                                 * delays can hang the simulator
-                                   udelay(1000);
-                                 */
-                                destination_timeouts = 0;
                        }
                }
-                cpu_relax();
        }
+        bcp->conseccompletes++;
        return FLUSH_COMPLETE;
 }
+/*
+ * Convert seconds to cycles, scaling with the cyc2ns value of the cpu
+ * this runs on.
+ */
+static inline cycles_t
+sec_2_cycles(unsigned long sec)
+{
+        unsigned long nsec = sec * 1000000000;
+        unsigned long scaled = nsec << CYC2NS_SCALE_FACTOR;
+        /* inverse of the cycles -> ns conversion: scale up, then divide */
+        return scaled / per_cpu(cyc2ns, smp_processor_id());
+}
+/*
+ * conditionally add 1 to *v, unless *v is >= u
+ * return 0 if we cannot add 1 to *v because it is >= u
+ * return 1 if we can add 1 to *v because it is < u
+ * the add is atomic
+ *
+ * This is close to atomic_add_unless(), but this allows the 'u' value
+ * to be lowered below the current 'v'.  atomic_add_unless can only stop
+ * on equal.
+ */
+static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+{
+        int incremented = 0;
+        /* the lock makes the compare and the increment one atomic step */
+        spin_lock(lock);
+        if (atomic_read(v) < u) {
+                atomic_inc(v);
+                incremented = 1;
+        }
+        spin_unlock(lock);
+        return incremented;
+}
 /**
 * uv_flush_send_and_wait
 *
- * Send a broadcast and wait for a broadcast message to complete.
+ * Send a broadcast and wait for it to complete.
 *
- * The flush_mask contains the cpus the broadcast was sent to.
+ * The flush_mask contains the cpus the broadcast is to be sent to, plus
+ * cpus that are on the local uvhub.
 *
- * Returns NULL if all remote flushing was done. The mask is zeroed.
+ * Returns NULL if all flushing represented in the mask was done. The mask
+ * is zeroed.
 * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set.
+ * mask will have some bits still set, representing any cpus on the local
+ * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
 */
-const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
+const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
-                                             struct bau_desc *bau_desc,
+                                             struct cpumask *flush_mask,
-                                             struct cpumask *flush_mask)
+                                             struct bau_control *bcp)
 {
-        int completion_status = 0;
        int right_shift;
-        int tries = 0;
+        int uvhub;
-        int pnode;
        int bit;
+        int completion_status = 0;
+        int seq_number = 0;
+        long try = 0;
+        int cpu = bcp->uvhub_cpu;
+        int this_cpu = bcp->cpu;
+        int this_uvhub = bcp->uvhub;
        unsigned long mmr_offset;
        unsigned long index;
        cycles_t time1;
        cycles_t time2;
+        struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
+        struct bau_control *smaster = bcp->socket_master;
+        struct bau_control *hmaster = bcp->uvhub_master;
+        /*
+         * Spin here while there are hmaster->max_concurrent or more active
+         * descriptors. This is the per-uvhub 'throttle'.
+         */
+        if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+                        &hmaster->active_descriptor_count,
+                        hmaster->max_concurrent)) {
+                stat->s_throttles++;
+                do {
+                        cpu_relax();
+                } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+                        &hmaster->active_descriptor_count,
+                        hmaster->max_concurrent));
+        }
+        while (hmaster->uvhub_quiesce)
+                cpu_relax();
        if (cpu < UV_CPUS_PER_ACT_STATUS) {
                mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -268,24 +557,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
        }
        time1 = get_cycles();
        do {
-                tries++;
+                /*
+                 * Every message from any given cpu gets a unique message
+                 * sequence number. But retries use that same number.
+                 * Our message may have timed out at the destination because
+                 * all sw-ack resources are in use and there is a timeout
+                 * pending there.  In that case, our last send never got
+                 * placed into the queue and we need to persist until it
+                 * does.
+                 *
+                 * Make any retry a type MSG_RETRY so that the destination will
+                 * free any resource held by a previous message from this cpu.
+                 */
+                if (try == 0) {
+                        /* use message type set by the caller the first time */
+                        seq_number = bcp->message_number++;
+                } else {
+                        /* use RETRY type on all the rest; same sequence */
+                        bau_desc->header.msg_type = MSG_RETRY;
+                        stat->s_retry_messages++;
+                }
+                bau_desc->header.sequence = seq_number;
                index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
-                        cpu;
+                        bcp->uvhub_cpu;
+                bcp->send_message = get_cycles();
                uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+                try++;
                completion_status = uv_wait_completion(bau_desc, mmr_offset,
-                                        right_shift);
+                        right_shift, this_cpu, bcp, smaster, try);
-        } while (completion_status == FLUSH_RETRY);
+                if (completion_status == FLUSH_RETRY_PLUGGED) {
+                        /*
+                         * Our retries may be blocked by all destination swack
+                         * resources being consumed, and a timeout pending. In
+                         * that case hardware immediately returns the ERROR
+                         * that looks like a destination timeout.
+                         */
+                        udelay(TIMEOUT_DELAY);
+                        bcp->plugged_tries++;
+                        if (bcp->plugged_tries >= PLUGSB4RESET) {
+                                bcp->plugged_tries = 0;
+                                quiesce_local_uvhub(hmaster);
+                                spin_lock(&hmaster->queue_lock);
+                                uv_reset_with_ipi(&bau_desc->distribution,
+                                                        this_cpu);
+                                spin_unlock(&hmaster->queue_lock);
+                                end_uvhub_quiesce(hmaster);
+                                bcp->ipi_attempts++;
+                                stat->s_resets_plug++;
+                        }
+                } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
+                        hmaster->max_concurrent = 1;
+                        bcp->timeout_tries++;
+                        udelay(TIMEOUT_DELAY);
+                        if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
+                                bcp->timeout_tries = 0;
+                                quiesce_local_uvhub(hmaster);
+                                spin_lock(&hmaster->queue_lock);
+                                uv_reset_with_ipi(&bau_desc->distribution,
+                                                                this_cpu);
+                                spin_unlock(&hmaster->queue_lock);
+                                end_uvhub_quiesce(hmaster);
+                                bcp->ipi_attempts++;
+                                stat->s_resets_timeout++;
+                        }
+                }
+                if (bcp->ipi_attempts >= 3) {
+                        bcp->ipi_attempts = 0;
+                        completion_status = FLUSH_GIVEUP;
+                        break;
+                }
+                cpu_relax();
+        } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
+                 (completion_status == FLUSH_RETRY_TIMEOUT));
        time2 = get_cycles();
-        __get_cpu_var(ptcstats).sflush += (time2 - time1);
-        if (tries > 1)
-                __get_cpu_var(ptcstats).retriesok++;
-        if (completion_status == FLUSH_GIVEUP) {
+        if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
+            && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
+                        hmaster->max_concurrent++;
+        /*
+         * hold any cpu not timing out here; no other cpu currently held by
+         * the 'throttle' should enter the activation code
+         */
+        while (hmaster->uvhub_quiesce)
+                cpu_relax();
+        atomic_dec(&hmaster->active_descriptor_count);
+        /* guard against cycles wrap */
+        if (time2 > time1)
+                stat->s_time += (time2 - time1);
+        else
+                stat->s_requestor--; /* don't count this one */
+        if (completion_status == FLUSH_COMPLETE && try > 1)
+                stat->s_retriesok++;
+        else if (completion_status == FLUSH_GIVEUP) {
                /*
                 * Cause the caller to do an IPI-style TLB shootdown on
-                 * the cpu's, all of which are still in the mask.
+                 * the target cpu's, all of which are still in the mask.
                 */
-                __get_cpu_var(ptcstats).ptc_i++;
+                stat->s_giveup++;
                return flush_mask;
        }
@@ -294,18 +667,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
         * use the IPI method of shootdown on them.
         */
        for_each_cpu(bit, flush_mask) {
-                pnode = uv_cpu_to_pnode(bit);
+                uvhub = uv_cpu_to_blade_id(bit);
-                if (pnode == this_pnode)
+                if (uvhub == this_uvhub)
                        continue;
                cpumask_clear_cpu(bit, flush_mask);
        }
        if (!cpumask_empty(flush_mask))
                return flush_mask;
        return NULL;
 }
-static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
 /**
 * uv_flush_tlb_others - globally purge translation cache of a virtual
 * address or all TLB's
@@ -322,8 +694,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
 * The caller has derived the cpumask from the mm_struct.  This function
 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
 *
- * The cpumask is converted into a nodemask of the nodes containing
+ * The cpumask is converted into a uvhubmask of the uvhubs containing
- * the cpus.
+ * those cpus.
 *
 * Note that this function should be called with preemption disabled.
 *
@@ -335,52 +707,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
                                          struct mm_struct *mm,
                                          unsigned long va, unsigned int cpu)
 {
-        struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
+        int remotes;
-        int i;
+        int tcpu;
-        int bit;
+        int uvhub;
-        int pnode;
-        int uv_cpu;
-        int this_pnode;
        int locals = 0;
        struct bau_desc *bau_desc;
+        struct cpumask *flush_mask;
+        struct ptc_stats *stat;
+        struct bau_control *bcp;
-        cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+        if (nobau)
+                return cpumask;
-        uv_cpu = uv_blade_processor_id();
+        bcp = &per_cpu(bau_control, cpu);
-        this_pnode = uv_hub_info->pnode;
+        /*
-        bau_desc = __get_cpu_var(bau_control).descriptor_base;
+         * Each sending cpu has a per-cpu mask which it fills from the caller's
-        bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
+         * cpu mask.  Only remote cpus are converted to uvhubs and copied.
+         */
+        flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
+        /*
+         * copy cpumask to flush_mask, removing current cpu
+         * (current cpu should already have been flushed by the caller and
+         *  should never be returned if we return flush_mask)
+         */
+        cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+        if (cpu_isset(cpu, *cpumask))
+                locals++;  /* current cpu was targeted */
-        bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+        bau_desc = bcp->descriptor_base;
+        bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
-        i = 0;
+        bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
-        for_each_cpu(bit, flush_mask) {
+        remotes = 0;
-                pnode = uv_cpu_to_pnode(bit);
+        for_each_cpu(tcpu, flush_mask) {
-                BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
+                uvhub = uv_cpu_to_blade_id(tcpu);
-                if (pnode == this_pnode) {
+                if (uvhub == bcp->uvhub) {
                        locals++;
                        continue;
                }
-                bau_node_set(pnode - uv_partition_base_pnode,
+                bau_uvhub_set(uvhub, &bau_desc->distribution);
-                                &bau_desc->distribution);
+                remotes++;
-                i++;
        }
-        if (i == 0) {
+        if (remotes == 0) {
                /*
-                 * no off_node flushing; return status for local node
+                 * No off_hub flushing; return status for local hub.
+                 * Return the caller's mask if all were local (the current
+                 * cpu may be in that mask).
                 */
                if (locals)
-                        return flush_mask;
+                        return cpumask;
                else
                        return NULL;
        }
-        __get_cpu_var(ptcstats).requestor++;
+        stat = &per_cpu(ptcstats, cpu);
-        __get_cpu_var(ptcstats).ntargeted += i;
+        stat->s_requestor++;
+        stat->s_ntargcpu += remotes;
+        remotes = bau_uvhub_weight(&bau_desc->distribution);
+        stat->s_ntarguvhub += remotes;
+        if (remotes >= 16)
+                stat->s_ntarguvhub16++;
+        else if (remotes >= 8)
+                stat->s_ntarguvhub8++;
+        else if (remotes >= 4)
+                stat->s_ntarguvhub4++;
+        else if (remotes >= 2)
+                stat->s_ntarguvhub2++;
+        else
+                stat->s_ntarguvhub1++;
        bau_desc->payload.address = va;
        bau_desc->payload.sending_cpu = cpu;
-        return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
+        /*
+         * uv_flush_send_and_wait returns null if all cpu's were messaged, or
+         * the adjusted flush_mask if any cpu's were not messaged.
+         */
+        return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
 }
 /*
@@ -389,87 +791,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 *
 * We received a broadcast assist message.
 *
- * Interrupts may have been disabled; this interrupt could represent
+ * Interrupts are disabled; this interrupt could represent
 * the receipt of several messages.
 *
- * All cores/threads on this node get this interrupt.
+ * All cores/threads on this hub get this interrupt.
- * The last one to see it does the s/w ack.
+ * The last one to see it does the software ack.
 * (the resource will not be freed until noninterruptable cpus see this
- *  interrupt; hardware will timeout the s/w ack and reply ERROR)
+ *  interrupt; hardware may timeout the s/w ack and reply ERROR)
 */
 void uv_bau_message_interrupt(struct pt_regs *regs)
 {
-        struct bau_payload_queue_entry *va_queue_first;
-        struct bau_payload_queue_entry *va_queue_last;
-        struct bau_payload_queue_entry *msg;
-        struct pt_regs *old_regs = set_irq_regs(regs);
-        cycles_t time1;
-        cycles_t time2;
-        int msg_slot;
-        int sw_ack_slot;
-        int fw;
        int count = 0;
-        unsigned long local_pnode;
+        cycles_t time_start;
+        struct bau_payload_queue_entry *msg;
-        ack_APIC_irq();
+        struct bau_control *bcp;
-        exit_idle();
+        struct ptc_stats *stat;
-        irq_enter();
+        struct msg_desc msgdesc;
-        time1 = get_cycles();
+        time_start = get_cycles();
+        /* per-cpu control structure and statistics of the receiving cpu */
+        bcp = &per_cpu(bau_control, smp_processor_id());
-        local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
+        stat = &per_cpu(ptcstats, smp_processor_id());
+        msgdesc.va_queue_first = bcp->va_queue_first;
-        va_queue_first = __get_cpu_var(bau_control).va_queue_first;
+        msgdesc.va_queue_last = bcp->va_queue_last;
-        va_queue_last = __get_cpu_var(bau_control).va_queue_last;
+        msg = bcp->bau_msg_head;
-        msg = __get_cpu_var(bau_control).bau_msg_head;
+        /* process queue entries from the saved head while sw_ack_vector is set */
        while (msg->sw_ack_vector) {
                count++;
-                fw = msg->sw_ack_vector;
+                msgdesc.msg_slot = msg - msgdesc.va_queue_first;
-                msg_slot = msg - va_queue_first;
+                msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
-                sw_ack_slot = ffs(fw) - 1;
+                msgdesc.msg = msg;
+                uv_bau_process_message(&msgdesc, bcp);
-                uv_bau_process_message(msg, msg_slot, sw_ack_slot);
                msg++;
+                /* wrap to the first entry after passing the last one */
-                if (msg > va_queue_last)
+                if (msg > msgdesc.va_queue_last)
-                        msg = va_queue_first;
+                        msg = msgdesc.va_queue_first;
-                __get_cpu_var(bau_control).bau_msg_head = msg;
+                bcp->bau_msg_head = msg;
        }
+        /* cycles spent in this handler, then counts of empty/multi-message calls */
+        stat->d_time += (get_cycles() - time_start);
        if (!count)
-                __get_cpu_var(ptcstats).nomsg++;
+                stat->d_nomsg++;
        else if (count > 1)
-                __get_cpu_var(ptcstats).multmsg++;
+                stat->d_multmsg++;
+        /* the APIC interrupt is acknowledged only after the queue is drained */
+        ack_APIC_irq();
-        time2 = get_cycles();
-        __get_cpu_var(ptcstats).dflush += (time2 - time1);
-        irq_exit();
-        set_irq_regs(old_regs);
 }
 /*
 * uv_enable_timeouts
 *
- * Each target blade (i.e. blades that have cpu's) needs to have
+ * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
 * shootdown message timeouts enabled.  The timeout does not cause
 * an interrupt, but causes an error message to be returned to
 * the sender.
 */
 static void uv_enable_timeouts(void)
 {
-        int blade;
+        int uvhub;
-        int nblades;
+        int nuvhubs;
        int pnode;
        unsigned long mmr_image;
-        nblades = uv_num_possible_blades();
+        nuvhubs = uv_num_possible_blades();
-        for (blade = 0; blade < nblades; blade++) {
+        for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
-                if (!uv_blade_nr_possible_cpus(blade))
+                if (!uv_blade_nr_possible_cpus(uvhub))
                        continue;
-                pnode = uv_blade_to_pnode(blade);
+                pnode = uv_blade_to_pnode(uvhub);
                mmr_image =
                    uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
                /*
@@ -479,16 +864,16 @@ static void uv_enable_timeouts(void)
                 * To program the period, the SOFT_ACK_MODE must be off.
                 */
                mmr_image &= ~((unsigned long)1 <<
-                               UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+                    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
                uv_write_global_mmr64
                    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
                /*
                 * Set the 4-bit period.
                 */
                mmr_image &= ~((unsigned long)0xf <<
-                        UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+                     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
                mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
-                             UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+                     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
                uv_write_global_mmr64
                    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
                /*
@@ -497,7 +882,7 @@ static void uv_enable_timeouts(void)
                 * indicated in bits 2:0 (7 causes all of them to timeout).
                 */
                mmr_image |= ((unsigned long)1 <<
-                              UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+                    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
                uv_write_global_mmr64
                    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
        }
@@ -522,9 +907,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
 {
 }
+/*
+ * Convert milliseconds to cycles, scaling with the cyc2ns value of the
+ * cpu this runs on.
+ */
+static inline unsigned long long
+millisec_2_cycles(unsigned long millisec)
+{
+        unsigned long long ns;
+        unsigned long long cyc;
+        /*
+         * 1 millisecond is 1,000,000 ns (a factor of 1000 would treat the
+         * argument as microseconds); use a 64-bit intermediate throughout
+         */
+        ns = (unsigned long long)millisec * 1000000ULL;
+        cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+        return cyc;
+}
 /*
- * Display the statistics thru /proc
+ * Display the statistics thru /proc.
- * data points to the cpu number
+ * 'data' points to the cpu number
 */
 static int uv_ptc_seq_show(struct seq_file *file, void *data)
 {
@@ -535,78 +931,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
        if (!cpu) {
                seq_printf(file,
-                "# cpu requestor requestee one all sretry dretry ptc_i ");
+                        "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
                seq_printf(file,
-                "sw_ack sflush dflush sok dnomsg dmult starget\n");
+                        "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+                seq_printf(file,
+                        "retries rok resetp resett giveup sto bz throt ");
+                seq_printf(file,
+                        "sw_ack recv rtime all ");
+                seq_printf(file,
+                        "one mult none retry canc nocan reset rcan\n");
        }
        if (cpu < num_possible_cpus() && cpu_online(cpu)) {
                stat = &per_cpu(ptcstats, cpu);
-                seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
+                /* source side statistics */
-                           cpu, stat->requestor,
+                seq_printf(file,
-                           stat->requestee, stat->onetlb, stat->alltlb,
+                        "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
-                           stat->s_retry, stat->d_retry, stat->ptc_i);
+                           cpu, stat->s_requestor, cycles_2_us(stat->s_time),
-                seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
+                           stat->s_ntarguvhub, stat->s_ntarguvhub16,
+                           stat->s_ntarguvhub8, stat->s_ntarguvhub4,
+                           stat->s_ntarguvhub2, stat->s_ntarguvhub1,
+                           stat->s_ntargcpu, stat->s_dtimeout);
+                seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
+                           stat->s_retry_messages, stat->s_retriesok,
+                           stat->s_resets_plug, stat->s_resets_timeout,
+                           stat->s_giveup, stat->s_stimeout,
+                           stat->s_busy, stat->s_throttles);
+                /* destination side statistics */
+                seq_printf(file,
+                           "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
                           uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
                                        UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
-                           stat->sflush, stat->dflush,
+                           stat->d_requestee, cycles_2_us(stat->d_time),
-                           stat->retriesok, stat->nomsg,
+                           stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
-                           stat->multmsg, stat->ntargeted);
+                           stat->d_nomsg, stat->d_retries, stat->d_canceled,
+                           stat->d_nocanceled, stat->d_resets,
+                           stat->d_rcanceled);
        }
        return 0;
 }
 /*
+ * -1: reset the statistics
 *  0: display meaning of the statistics
- * >0: retry limit
+ * >0: maximum concurrent active descriptors per uvhub (throttle)
 */
 static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
                                 size_t count, loff_t *data)
 {
-        long newmode;
+        int cpu;
+        long input_arg;
        char optstr[64];
+        struct ptc_stats *stat;
+        struct bau_control *bcp;
        if (count == 0 || count > sizeof(optstr))
                return -EINVAL;
        if (copy_from_user(optstr, user, count))
                return -EFAULT;
        optstr[count - 1] = '\0';
-        if (strict_strtoul(optstr, 10, &newmode) < 0) {
+        if (strict_strtol(optstr, 10, &input_arg) < 0) {
                printk(KERN_DEBUG "%s is invalid\n", optstr);
                return -EINVAL;
        }
-        if (newmode == 0) {
+        if (input_arg == 0) {
                printk(KERN_DEBUG "# cpu:      cpu number\n");
+                printk(KERN_DEBUG "Sender statistics:\n");
+                printk(KERN_DEBUG
+                "sent:     number of shootdown messages sent\n");
+                printk(KERN_DEBUG
+                "stime:    time spent sending messages\n");
+                printk(KERN_DEBUG
+                "numuvhubs: number of hubs targeted with shootdown\n");
+                printk(KERN_DEBUG
+                "numuvhubs16: number times 16 or more hubs targeted\n");
+                printk(KERN_DEBUG
+                "numuvhubs8: number times 8 or more hubs targeted\n");
+                printk(KERN_DEBUG
+                "numuvhubs4: number times 4 or more hubs targeted\n");
+                printk(KERN_DEBUG
+                "numuvhubs2: number times 2 or more hubs targeted\n");
+                printk(KERN_DEBUG
+                "numuvhubs1: number times 1 hub targeted\n");
+                printk(KERN_DEBUG
+                "numcpus:  number of cpus targeted with shootdown\n");
+                printk(KERN_DEBUG
+                "dto:      number of destination timeouts\n");
+                printk(KERN_DEBUG
+                "retries:  destination timeout retries sent\n");
+                printk(KERN_DEBUG
+                "rok:   :  destination timeouts successfully retried\n");
+                printk(KERN_DEBUG
+                "resetp:   ipi-style resource resets for plugs\n");
+                printk(KERN_DEBUG
+                "resett:   ipi-style resource resets for timeouts\n");
+                printk(KERN_DEBUG
+                "giveup:   fall-backs to ipi-style shootdowns\n");
+                printk(KERN_DEBUG
+                "sto:      number of source timeouts\n");
+                printk(KERN_DEBUG
+                "bz:       number of stay-busy's\n");
+                printk(KERN_DEBUG
+                "throt:    number times spun in throttle\n");
+                printk(KERN_DEBUG "Destination side statistics:\n");
                printk(KERN_DEBUG
-                "requestor:  times this cpu was the flush requestor\n");
+                "sw_ack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
                printk(KERN_DEBUG
-                "requestee:  times this cpu was requested to flush its TLBs\n");
+                "recv:     shootdown messages received\n");
                printk(KERN_DEBUG
-                "one:        times requested to flush a single address\n");
+                "rtime:    time spent processing messages\n");
                printk(KERN_DEBUG
-                "all:        times requested to flush all TLB's\n");
+                "all:      shootdown all-tlb messages\n");
                printk(KERN_DEBUG
-                "sretry:     number of retries of source-side timeouts\n");
+                "one:      shootdown one-tlb messages\n");
                printk(KERN_DEBUG
-                "dretry:     number of retries of destination-side timeouts\n");
+                "mult:     interrupts that found multiple messages\n");
                printk(KERN_DEBUG
-                "ptc_i:      times UV fell through to IPI-style flushes\n");
+                "none:     interrupts that found no messages\n");
                printk(KERN_DEBUG
-                "sw_ack:     image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
+                "retry:    number of retry messages processed\n");
                printk(KERN_DEBUG
-                "sflush_us:  cycles spent in uv_flush_tlb_others()\n");
+                "canc:     number messages canceled by retries\n");
                printk(KERN_DEBUG
-                "dflush_us:  cycles spent in handling flush requests\n");
+                "nocan:    number retries that found nothing to cancel\n");
-                printk(KERN_DEBUG "sok:        successes on retry\n");
-                printk(KERN_DEBUG "dnomsg:     interrupts with no message\n");
                printk(KERN_DEBUG
-                "dmult:      interrupts with multiple messages\n");
+                "reset:    number of ipi-style reset requests processed\n");
-                printk(KERN_DEBUG "starget:    nodes targeted\n");
+                printk(KERN_DEBUG
+                "rcan:     number messages canceled by reset requests\n");
+        } else if (input_arg == -1) {
+                for_each_present_cpu(cpu) {
+                        stat = &per_cpu(ptcstats, cpu);
+                        memset(stat, 0, sizeof(struct ptc_stats));
+                }
        } else {
-                uv_bau_retry_limit = newmode;
+                uv_bau_max_concurrent = input_arg;
-                printk(KERN_DEBUG "timeout retry limit:%d\n",
+                bcp = &per_cpu(bau_control, smp_processor_id());
-                       uv_bau_retry_limit);
+                if (uv_bau_max_concurrent < 1 ||
+                    uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
+                        printk(KERN_DEBUG
+                                "Error: BAU max concurrent %d; %d is invalid\n",
+                                bcp->max_concurrent, uv_bau_max_concurrent);
+                        return -EINVAL;
+                }
+                printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
+                       uv_bau_max_concurrent);
+                for_each_present_cpu(cpu) {
+                        bcp = &per_cpu(bau_control, cpu);
+                        bcp->max_concurrent = uv_bau_max_concurrent;
+                }
        }
        return count;
@@ -650,79 +1123,30 @@ static int __init uv_ptc_init(void)
 }
 /*
- * begin the initialization of the per-blade control structures
- */
-static struct bau_control * __init uv_table_bases_init(int blade, int node)
-{
-        int i;
-        struct bau_msg_status *msp;
-        struct bau_control *bau_tabp;
-        bau_tabp =
-            kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
-        BUG_ON(!bau_tabp);
-        bau_tabp->msg_statuses =
-            kmalloc_node(sizeof(struct bau_msg_status) *
-                         DEST_Q_SIZE, GFP_KERNEL, node);
-        BUG_ON(!bau_tabp->msg_statuses);
-        for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
-                bau_cpubits_clear(&msp->seen_by, (int)
-                                  uv_blade_nr_possible_cpus(blade));
-        uv_bau_table_bases[blade] = bau_tabp;
-        return bau_tabp;
-}
-/*
- * finish the initialization of the per-blade control structures
- */
-static void __init
-uv_table_bases_finish(int blade,
-                      struct bau_control *bau_tablesp,
-                      struct bau_desc *adp)
-{
-        struct bau_control *bcp;
-        int cpu;
-        for_each_present_cpu(cpu) {
-                if (blade != uv_cpu_to_blade_id(cpu))
-                        continue;
-                bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
-                bcp->bau_msg_head       = bau_tablesp->va_queue_first;
-                bcp->va_queue_first     = bau_tablesp->va_queue_first;
-                bcp->va_queue_last      = bau_tablesp->va_queue_last;
-                bcp->msg_statuses       = bau_tablesp->msg_statuses;
-                bcp->descriptor_base    = adp;
-        }
-}
-/*
 * initialize the sending side's sending buffers
 */
-static struct bau_desc * __init
+static void
 uv_activation_descriptor_init(int node, int pnode)
 {
        int i;
+        int cpu;
        unsigned long pa;
        unsigned long m;
        unsigned long n;
-        struct bau_desc *adp;
+        struct bau_desc *bau_desc;
-        struct bau_desc *ad2;
+        struct bau_desc *bd2;
+        struct bau_control *bcp;
        /*
         * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
-         * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+         * per cpu; and up to 32 (UV_ADP_SIZE) cpus per uvhub
         */
-        adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+        bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
                UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
-        BUG_ON(!adp);
+        BUG_ON(!bau_desc);
-        pa = uv_gpa(adp); /* need the real nasid*/
+        pa = uv_gpa(bau_desc); /* need the real nasid*/
-        n = uv_gpa_to_pnode(pa);
+        n = pa >> uv_nshift;
        m = pa & uv_mmask;
        uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -731,96 +1155,188 @@ uv_activation_descriptor_init(int node, int pnode)
        /*
         * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
         * cpu even though we only use the first one; one descriptor can
-         * describe a broadcast to 256 nodes.
+         * describe a broadcast to 256 uv hubs.
         */
-        for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+        for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
-                i++, ad2++) {
+                i++, bd2++) {
-                memset(ad2, 0, sizeof(struct bau_desc));
+                memset(bd2, 0, sizeof(struct bau_desc));
-                ad2->header.sw_ack_flag = 1;
+                bd2->header.sw_ack_flag = 1;
                /*
-                 * base_dest_nodeid is the first node in the partition, so
+                 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
-                 * the bit map will indicate partition-relative node numbers.
+                 * in the partition. The bit map will indicate uvhub numbers,
-                 * note that base_dest_nodeid is actually a nasid.
+                 * which are 0-N in a partition. Pnodes are unique system-wide.
                 */
-                ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
+                bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
-                ad2->header.dest_subnodeid = 0x10; /* the LB */
+                bd2->header.dest_subnodeid = 0x10; /* the LB */
-                ad2->header.command = UV_NET_ENDPOINT_INTD;
+                bd2->header.command = UV_NET_ENDPOINT_INTD;
-                ad2->header.int_both = 1;
+                bd2->header.int_both = 1;
                /*
                 * all others need to be set to zero:
                 *   fairness chaining multilevel count replied_to
                 */
        }
-        return adp;
+        for_each_present_cpu(cpu) {
+                if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
+                        continue;
+                bcp = &per_cpu(bau_control, cpu);
+                bcp->descriptor_base = bau_desc;
+        }
 }
 /*
 * initialize the destination side's receiving buffers
+ * entered for each uvhub in the partition
+ * - node is first node (kernel memory notion) on the uvhub
+ * - pnode is the uvhub's physical identifier
 */
-static struct bau_payload_queue_entry * __init
+static void
-uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
+uv_payload_queue_init(int node, int pnode)
 {
-        struct bau_payload_queue_entry *pqp;
-        unsigned long pa;
        int pn;
+        int cpu;
        char *cp;
+        unsigned long pa;
+        struct bau_payload_queue_entry *pqp;
+        struct bau_payload_queue_entry *pqp_malloc;
+        struct bau_control *bcp;
        pqp = (struct bau_payload_queue_entry *) kmalloc_node(
                (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
                GFP_KERNEL, node);
        BUG_ON(!pqp);
+        pqp_malloc = pqp;
        cp = (char *)pqp + 31;
        pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
-        bau_tablesp->va_queue_first = pqp;
+        for_each_present_cpu(cpu) {
+                if (pnode != uv_cpu_to_pnode(cpu))
+                        continue;
+                /* for every cpu on this pnode: */
+                bcp = &per_cpu(bau_control, cpu);
+                bcp->va_queue_first = pqp;
+                bcp->bau_msg_head = pqp;
+                bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
+        }
        /*
         * need the pnode of where the memory was really allocated
         */
        pa = uv_gpa(pqp);
-        pn = uv_gpa_to_pnode(pa);
+        pn = pa >> uv_nshift;
        uv_write_global_mmr64(pnode,
                              UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
                              ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
                              uv_physnodeaddr(pqp));
        uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
                              uv_physnodeaddr(pqp));
-        bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
        uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
                              (unsigned long)
-                              uv_physnodeaddr(bau_tablesp->va_queue_last));
+                              uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
+        /* in effect, all msg_type's are set to MSG_NOOP */
        memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
-        return pqp;
 }
 /*
- * Initialization of each UV blade's structures
+ * Initialization of each UV hub's structures
 */
-static int __init uv_init_blade(int blade)
+static void __init uv_init_uvhub(int uvhub, int vector)
 {
        int node;
        int pnode;
-        unsigned long pa;
        unsigned long apicid;
-        struct bau_desc *adp;
-        struct bau_payload_queue_entry *pqp;
+        node = uvhub_to_first_node(uvhub);
-        struct bau_control *bau_tablesp;
+        pnode = uv_blade_to_pnode(uvhub);
+        uv_activation_descriptor_init(node, pnode);
-        node = blade_to_first_node(blade);
+        uv_payload_queue_init(node, pnode);
-        bau_tablesp = uv_table_bases_init(blade, node);
-        pnode = uv_blade_to_pnode(blade);
-        adp = uv_activation_descriptor_init(node, pnode);
-        pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
-        uv_table_bases_finish(blade, bau_tablesp, adp);
        /*
         * the below initialization can't be in firmware because the
         * messaging IRQ will be determined by the OS
         */
-        apicid = blade_to_first_apicid(blade);
+        apicid = uvhub_to_first_apicid(uvhub);
-        pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
        uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
-                                      ((apicid << 32) | UV_BAU_MESSAGE));
+                                      ((apicid << 32) | vector));
-        return 0;
+}
+/*
+ * initialize the bau_control structure for each cpu
+ */
+static void uv_init_per_cpu(int nuvhubs)
+{
+        int i, j, k;
+        int cpu;
+        int pnode;
+        int uvhub;
+        short socket = 0;
+        struct bau_control *bcp;
+        struct uvhub_desc *bdp;
+        struct socket_desc *sdp;
+        struct bau_control *hmaster = NULL;
+        struct bau_control *smaster = NULL;
+        struct socket_desc {
+                short num_cpus;
+                short cpu_number[16];
+        };
+        struct uvhub_desc {
+                short num_sockets;
+                short num_cpus;
+                short uvhub;
+                short pnode;
+                struct socket_desc socket[2];
+        };
+        struct uvhub_desc *uvhub_descs;
+        uvhub_descs = (struct uvhub_desc *)
+                kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
+        memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+        for_each_present_cpu(cpu) {
+                bcp = &per_cpu(bau_control, cpu);
+                memset(bcp, 0, sizeof(struct bau_control));
+                spin_lock_init(&bcp->masks_lock);
+                bcp->max_concurrent = uv_bau_max_concurrent;
+                pnode = uv_cpu_hub_info(cpu)->pnode;
+                uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+                bdp = &uvhub_descs[uvhub];
+                bdp->num_cpus++;
+                bdp->uvhub = uvhub;
+                bdp->pnode = pnode;
+                /* time interval to catch a hardware stay-busy bug */
+                bcp->timeout_interval = millisec_2_cycles(3);
+                /* kludge: assume uv_hub.h is constant */
+                socket = (cpu_physical_id(cpu)>>5)&1;
+                if (socket >= bdp->num_sockets)
+                        bdp->num_sockets = socket+1;
+                sdp = &bdp->socket[socket];
+                sdp->cpu_number[sdp->num_cpus] = cpu;
+                sdp->num_cpus++;
+        }
+        socket = 0;
+        for_each_possible_blade(uvhub) {
+                bdp = &uvhub_descs[uvhub];
+                for (i = 0; i < bdp->num_sockets; i++) {
+                        sdp = &bdp->socket[i];
+                        for (j = 0; j < sdp->num_cpus; j++) {
+                                cpu = sdp->cpu_number[j];
+                                bcp = &per_cpu(bau_control, cpu);
+                                bcp->cpu = cpu;
+                                if (j == 0) {
+                                        smaster = bcp;
+                                        if (i == 0)
+                                                hmaster = bcp;
+                                }
+                                bcp->cpus_in_uvhub = bdp->num_cpus;
+                                bcp->cpus_in_socket = sdp->num_cpus;
+                                bcp->socket_master = smaster;
+                                bcp->uvhub_master = hmaster;
+                                for (k = 0; k < DEST_Q_SIZE; k++)
+                                        bcp->socket_acknowledge_count[k] = 0;
+                                bcp->uvhub_cpu =
+                                  uv_cpu_hub_info(cpu)->blade_processor_id;
+                        }
+                        socket++;
+                }
+        }
+        kfree(uvhub_descs);
 }
 /*
@@ -828,38 +1344,54 @@ static int __init uv_init_blade(int blade)
 */
 static int __init uv_bau_init(void)
 {
-        int blade;
+        int uvhub;
-        int nblades;
+        int pnode;
+        int nuvhubs;
        int cur_cpu;
+        int vector;
+        unsigned long mmr;
        if (!is_uv_system())
                return 0;
+        if (nobau)
+                return 0;
        for_each_possible_cpu(cur_cpu)
                zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
                                       GFP_KERNEL, cpu_to_node(cur_cpu));
-        uv_bau_retry_limit = 1;
+        uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
+        uv_nshift = uv_hub_info->m_val;
        uv_mmask = (1UL << uv_hub_info->m_val) - 1;
-        nblades = uv_num_possible_blades();
+        nuvhubs = uv_num_possible_blades();
-        uv_bau_table_bases = (struct bau_control **)
+        uv_init_per_cpu(nuvhubs);
-            kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
-        BUG_ON(!uv_bau_table_bases);
        uv_partition_base_pnode = 0x7fffffff;
-        for (blade = 0; blade < nblades; blade++)
+        for (uvhub = 0; uvhub < nuvhubs; uvhub++)
-                if (uv_blade_nr_possible_cpus(blade) &&
+                if (uv_blade_nr_possible_cpus(uvhub) &&
-                        (uv_blade_to_pnode(blade) < uv_partition_base_pnode))
+                        (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
-                        uv_partition_base_pnode = uv_blade_to_pnode(blade);
+                        uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
-        for (blade = 0; blade < nblades; blade++)
-                if (uv_blade_nr_possible_cpus(blade))
+        vector = UV_BAU_MESSAGE;
-                        uv_init_blade(blade);
+        for_each_possible_blade(uvhub)
+                if (uv_blade_nr_possible_cpus(uvhub))
-        alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
+                        uv_init_uvhub(uvhub, vector);
        uv_enable_timeouts();
+        alloc_intr_gate(vector, uv_bau_message_intr1);
+        for_each_possible_blade(uvhub) {
+                pnode = uv_blade_to_pnode(uvhub);
+                /* INIT the bau */
+                uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
+                                      ((unsigned long)1 << 63));
+                mmr = 1; /* should be 1 to broadcast to both sockets */
+                uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
+        }
        return 0;
 }
-__initcall(uv_bau_init);
+core_initcall(uv_bau_init);
-__initcall(uv_ptc_init);
+core_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e4454188..02cfb9b8f5b1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,15 +108,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
        dec_preempt_count();
 }
-#ifdef CONFIG_X86_32
-static inline void
-die_if_kernel(const char *str, struct pt_regs *regs, long err)
-{
-        if (!user_mode_vm(regs))
-                die(str, regs, err);
-}
-#endif
 static void __kprobes
 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
        long error_code, siginfo_t *info)
@@ -543,11 +534,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
        /* DR6 may or may not be cleared by the CPU */
        set_debugreg(0, 6);
        /*
         * The processor cleared BTF, so don't mark that we need it set.
         */
-        clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
+        clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
-        tsk->thread.debugctlmsr = 0;
        /* Store the virtualized DR6 value */
        tsk->thread.debugreg6 = dr6;
@@ -585,55 +576,67 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
        return;
 }
-#ifdef CONFIG_X86_64
-static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
-{
-        if (fixup_exception(regs))
-                return 1;
-        notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
-        /* Illegal floating point operation in the kernel */
-        current->thread.trap_no = trapnr;
-        die(str, regs, 0);
-        return 0;
-}
-#endif
 /*
 * Note that we play around with the 'TS' bit in an attempt to get
 * the correct behaviour even in the presence of the asynchronous
 * IRQ13 behaviour
 */
-void math_error(void __user *ip)
+void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
-        struct task_struct *task;
+        struct task_struct *task = current;
        siginfo_t info;
-        unsigned short cwd, swd, err;
+        unsigned short err;
+        char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
+        if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
+                return;
+        conditional_sti(regs);
+        if (!user_mode_vm(regs))
+        {
+                if (!fixup_exception(regs)) {
+                        task->thread.error_code = error_code;
+                        task->thread.trap_no = trapnr;
+                        die(str, regs, error_code);
+                }
+                return;
+        }
        /*
         * Save the info for the exception handler and clear the error.
         */
-        task = current;
        save_init_fpu(task);
-        task->thread.trap_no = 16;
+        task->thread.trap_no = trapnr;
-        task->thread.error_code = 0;
+        task->thread.error_code = error_code;
        info.si_signo = SIGFPE;
        info.si_errno = 0;
-        info.si_addr = ip;
+        info.si_addr = (void __user *)regs->ip;
-        /*
+        if (trapnr == 16) {
-         * (~cwd & swd) will mask out exceptions that are not set to unmasked
+                unsigned short cwd, swd;
-         * status.  0x3f is the exception bits in these regs, 0x200 is the
+                /*
-         * C1 reg you need in case of a stack fault, 0x040 is the stack
+                 * (~cwd & swd) will mask out exceptions that are not set to unmasked
-         * fault bit.  We should only be taking one exception at a time,
+                 * status.  0x3f is the exception bits in these regs, 0x200 is the
-         * so if this combination doesn't produce any single exception,
+                 * C1 reg you need in case of a stack fault, 0x040 is the stack
-         * then we have a bad program that isn't synchronizing its FPU usage
+                 * fault bit.  We should only be taking one exception at a time,
-         * and it will suffer the consequences since we won't be able to
+                 * so if this combination doesn't produce any single exception,
-         * fully reproduce the context of the exception
+                 * then we have a bad program that isn't synchronizing its FPU usage
-         */
+                 * and it will suffer the consequences since we won't be able to
-        cwd = get_fpu_cwd(task);
+                 * fully reproduce the context of the exception
-        swd = get_fpu_swd(task);
+                 */
+                cwd = get_fpu_cwd(task);
+                swd = get_fpu_swd(task);
-        err = swd & ~cwd;
+                err = swd & ~cwd;
+        } else {
+                /*
+                 * The SIMD FPU exceptions are handled a little differently, as there
+                 * is only a single status/control register.  Thus, to determine which
+                 * unmasked exception was caught we must mask the exception mask bits
+                 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+                 */
+                unsigned short mxcsr = get_fpu_mxcsr(task);
+                err = ~(mxcsr >> 7) & mxcsr;
+        }
        if (err & 0x001) {      /* Invalid op */
                /*
@@ -662,97 +665,17 @@ void math_error(void __user *ip)
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-        conditional_sti(regs);
 #ifdef CONFIG_X86_32
        ignore_fpu_irq = 1;
-#else
-        if (!user_mode(regs) &&
-            kernel_math_error(regs, "kernel x87 math error", 16))
-                return;
 #endif
-        math_error((void __user *)regs->ip);
+        math_error(regs, error_code, 16);
-}
-static void simd_math_error(void __user *ip)
-{
-        struct task_struct *task;
-        siginfo_t info;
-        unsigned short mxcsr;
-        /*
-         * Save the info for the exception handler and clear the error.
-         */
-        task = current;
-        save_init_fpu(task);
-        task->thread.trap_no = 19;
-        task->thread.error_code = 0;
-        info.si_signo = SIGFPE;
-        info.si_errno = 0;
-        info.si_code = __SI_FAULT;
-        info.si_addr = ip;
-        /*
-         * The SIMD FPU exceptions are handled a little differently, as there
-         * is only a single status/control register.  Thus, to determine which
-         * unmasked exception was caught we must mask the exception mask bits
-         * at 0x1f80, and then use these to mask the exception bits at 0x3f.
-         */
-        mxcsr = get_fpu_mxcsr(task);
-        switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
-        case 0x000:
-        default:
-                break;
-        case 0x001: /* Invalid Op */
-                info.si_code = FPE_FLTINV;
-                break;
-        case 0x002: /* Denormalize */
-        case 0x010: /* Underflow */
-                info.si_code = FPE_FLTUND;
-                break;
-        case 0x004: /* Zero Divide */
-                info.si_code = FPE_FLTDIV;
-                break;
-        case 0x008: /* Overflow */
-                info.si_code = FPE_FLTOVF;
-                break;
-        case 0x020: /* Precision */
-                info.si_code = FPE_FLTRES;
-                break;
-        }
-        force_sig_info(SIGFPE, &info, task);
 }
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-        conditional_sti(regs);
+        math_error(regs, error_code, 19);
-#ifdef CONFIG_X86_32
-        if (cpu_has_xmm) {
-                /* Handle SIMD FPU exceptions on PIII+ processors. */
-                ignore_fpu_irq = 1;
-                simd_math_error((void __user *)regs->ip);
-                return;
-        }
-        /*
-         * Handle strange cache flush from user space exception
-         * in all other cases.  This is undocumented behaviour.
-         */
-        if (regs->flags & X86_VM_MASK) {
-                handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
-                return;
-        }
-        current->thread.trap_no = 19;
-        current->thread.error_code = error_code;
-        die_if_kernel("cache flush denied", regs, error_code);
-        force_sig(SIGSEGV, current);
-#else
-        if (!user_mode(regs) &&
-                        kernel_math_error(regs, "kernel simd math error", 19))
-                return;
-        simd_math_error((void __user *)regs->ip);
-#endif
 }
 dotraplinkage void
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index 1d40336b030a..1132129db792 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -44,7 +44,7 @@ static void uv_ack_apic(unsigned int irq)
        ack_APIC_irq();
 }
-struct irq_chip uv_irq_chip = {
+static struct irq_chip uv_irq_chip = {
        .name           = "UV-CORE",
        .startup        = uv_noop_ret,
        .shutdown       = uv_noop,
@@ -141,7 +141,7 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
 */
 static int
 arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
-                       unsigned long mmr_offset, int restrict)
+                       unsigned long mmr_offset, int limit)
 {
        const struct cpumask *eligible_cpu = cpumask_of(cpu);
        struct irq_desc *desc = irq_to_desc(irq);
@@ -160,7 +160,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
        if (err != 0)
                return err;
-        if (restrict == UV_AFFINITY_CPU)
+        if (limit == UV_AFFINITY_CPU)
                desc->status |= IRQ_NO_BALANCING;
        else
                desc->status |= IRQ_MOVE_PCNTXT;
@@ -214,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
        unsigned long mmr_value;
        struct uv_IO_APIC_route_entry *entry;
        unsigned long mmr_offset;
-        unsigned mmr_pnode;
+        int mmr_pnode;
        if (set_desc_affinity(desc, mask, &dest))
                return -1;
@@ -248,7 +248,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
 * interrupt is raised.
 */
 int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
-                 unsigned long mmr_offset, int restrict)
+                 unsigned long mmr_offset, int limit)
 {
        int irq, ret;
@@ -258,7 +258,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
                return -EBUSY;
        ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
-                restrict);
+                limit);
        if (ret == irq)
                uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
        else
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 693920b22496..1b950d151e58 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(init_level4_pgt);
 #ifndef CONFIG_PARAVIRT
 EXPORT_SYMBOL(native_load_gs_index);
 #endif
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 782c3a362ec6..37e68fc5e24a 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -99,7 +99,7 @@ int save_i387_xstate(void __user *buf)
                if (err)
                        return err;
-                if (task_thread_info(tsk)->status & TS_XSAVE)
+                if (use_xsave())
                        err = xsave_user(buf);
                else
                        err = fxsave_user(buf);
@@ -109,14 +109,14 @@ int save_i387_xstate(void __user *buf)
                task_thread_info(tsk)->status &= ~TS_USEDFPU;
                stts();
        } else {
-                if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
+                if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
                                   xstate_size))
                        return -1;
        }
        clear_used_math(); /* trigger finit */
-        if (task_thread_info(tsk)->status & TS_XSAVE) {
+        if (use_xsave()) {
                struct _fpstate __user *fx = buf;
                struct _xstate __user *x = buf;
                u64 xstate_bv;
@@ -225,7 +225,7 @@ int restore_i387_xstate(void __user *buf)
                clts();
                task_thread_info(current)->status |= TS_USEDFPU;
        }
-        if (task_thread_info(tsk)->status & TS_XSAVE)
+        if (use_xsave())
                err = restore_user_xstate(buf);
        else
                err = fxrstor_checking((__force struct i387_fxsave_struct *)