Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 9
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S | 14
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.h | 6
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 6
-rw-r--r--  arch/x86/kernel/alternative.c | 23
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 2764
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 1572
-rw-r--r--  arch/x86/kernel/apb_timer.c | 410
-rw-r--r--  arch/x86/kernel/apic/apic.c | 27
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 91
-rw-r--r--  arch/x86/kernel/apm_32.c | 8
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 1
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 4
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 18
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 152
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 288
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 10
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 182
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 168
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 14
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 385
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 4
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 119
-rw-r--r--  arch/x86/kernel/devicetree.c | 60
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 37
-rw-r--r--  arch/x86/kernel/entry_64.S | 84
-rw-r--r--  arch/x86/kernel/hpet.c | 14
-rw-r--r--  arch/x86/kernel/i8253.c | 99
-rw-r--r--  arch/x86/kernel/irqinit.c | 3
-rw-r--r--  arch/x86/kernel/kgdb.c | 4
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 21
-rw-r--r--  arch/x86/kernel/ptrace.c | 5
-rw-r--r--  arch/x86/kernel/quirks.c | 5
-rw-r--r--  arch/x86/kernel/reboot.c | 32
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S | 2
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 2
-rw-r--r--  arch/x86/kernel/signal.c | 56
-rw-r--r--  arch/x86/kernel/smpboot.c | 2
-rw-r--r--  arch/x86/kernel/stacktrace.c | 2
-rw-r--r--  arch/x86/kernel/tboot.c | 1
-rw-r--r--  arch/x86/kernel/time.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 6
-rw-r--r--  arch/x86/kernel/tsc.c | 26
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 49
-rw-r--r--  arch/x86/kernel/vread_tsc_64.c | 36
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 310
-rw-r--r--  arch/x86/kernel/vsyscall_emu_64.S | 27
47 files changed, 1335 insertions, 5825 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 90b06d4daee2..04105574c8e9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,17 +24,12 @@ endif
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o := $(nostackp)
-CFLAGS_vread_tsc_64.o := $(nostackp)
 CFLAGS_paravirt.o := $(nostackp)
 GCOV_PROFILE_vsyscall_64.o := n
 GCOV_PROFILE_hpet.o := n
 GCOV_PROFILE_tsc.o := n
-GCOV_PROFILE_vread_tsc_64.o := n
 GCOV_PROFILE_paravirt.o := n
 
-# vread_tsc_64 is hot and should be fully optimized:
-CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
-
 obj-y := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y += time.o ioport.o ldt.o dumpstack.o
@@ -43,7 +38,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-y += probe_roms.o
 obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
+obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
 obj-y += bootflag.o e820.o
 obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
 obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
@@ -123,7 +119,6 @@ ifeq ($(CONFIG_X86_64),y)
 
         obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
         obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
-        obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
 
         obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
         obj-y += vsmp_64.o
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index ead21b663117..b4fd836e4053 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -28,6 +28,8 @@ pmode_cr3: .long 0 /* Saved %cr3 */
 pmode_cr4:      .long   0       /* Saved %cr4 */
 pmode_efer:     .quad   0       /* Saved EFER */
 pmode_gdt:      .quad   0
+pmode_misc_en:  .quad   0       /* Saved MISC_ENABLE MSR */
+pmode_behavior: .long   0       /* Wakeup behavior flags */
 realmode_flags: .long   0
 real_magic:     .long   0
 trampoline_segment:     .word 0
@@ -91,6 +93,18 @@ wakeup_code:
         /* Call the C code */
         calll   main
 
+        /* Restore MISC_ENABLE before entering protected mode, in case
+           BIOS decided to clear XD_DISABLE during S3. */
+        movl    pmode_behavior, %eax
+        btl     $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
+        jnc     1f
+
+        movl    pmode_misc_en, %eax
+        movl    pmode_misc_en + 4, %edx
+        movl    $MSR_IA32_MISC_ENABLE, %ecx
+        wrmsr
+1:
+
         /* Do any other stuff... */
 
 #ifndef CONFIG_64BIT
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index e1828c07e79c..97a29e1430e3 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -21,6 +21,9 @@ struct wakeup_header {
         u32 pmode_efer_low;     /* Protected mode EFER */
         u32 pmode_efer_high;
         u64 pmode_gdt;
+        u32 pmode_misc_en_low;  /* Protected mode MISC_ENABLE */
+        u32 pmode_misc_en_high;
+        u32 pmode_behavior;     /* Wakeup routine behavior flags */
         u32 realmode_flags;
         u32 real_magic;
         u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
@@ -39,4 +42,7 @@ extern struct wakeup_header wakeup_header;
 #define WAKEUP_HEADER_SIGNATURE 0x51ee1111
 #define WAKEUP_END_SIGNATURE    0x65a22c82
 
+/* Wakeup behavior bits */
+#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE     0
+
 #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 18a857ba7a25..103b6ab368d3 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -77,6 +77,12 @@ int acpi_suspend_lowlevel(void)
 
         header->pmode_cr0 = read_cr0();
         header->pmode_cr4 = read_cr4_safe();
+        header->pmode_behavior = 0;
+        if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
+                        &header->pmode_misc_en_low,
+                        &header->pmode_misc_en_high))
+                header->pmode_behavior |=
+                        (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
         header->realmode_flags = acpi_realmode_flags;
         header->real_magic = 0x12345678;
 
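
Taken together, the wakeup.h, wakeup.S and sleep.c hunks above implement a save-on-suspend / conditionally-restore-on-resume scheme for MSR_IA32_MISC_ENABLE: the suspend path records the MSR only if it can be read, notes that fact in a behavior flag word, and the real-mode wakeup stub restores the MSR only when that flag bit is set. Below is a minimal, self-contained C sketch of the same pattern; the helper names and values are placeholders for illustration, not the kernel's own API (the real code uses rdmsr_safe()/wrmsr and the assembly in wakeup.S).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RESTORE_MISC_ENABLE_BIT 0  /* mirrors WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE */

struct wakeup_state {
        uint32_t misc_en_low, misc_en_high;  /* saved MSR halves */
        uint32_t behavior;                   /* flags consumed by the resume path */
};

/* Placeholder accessors for this sketch only. */
static bool read_msr(uint32_t *lo, uint32_t *hi)
{
        *lo = 0x800; *hi = 0;   /* pretend the read succeeded with some value */
        return true;            /* rdmsr_safe() instead returns 0 on success */
}

static void write_msr(uint32_t lo, uint32_t hi)
{
        printf("restoring MSR: hi=%#x lo=%#x\n", (unsigned)hi, (unsigned)lo);
}

static void save_on_suspend(struct wakeup_state *s)
{
        s->behavior = 0;
        /* Only promise a restore if the MSR could actually be read. */
        if (read_msr(&s->misc_en_low, &s->misc_en_high))
                s->behavior |= 1u << RESTORE_MISC_ENABLE_BIT;
}

static void restore_on_resume(const struct wakeup_state *s)
{
        /* Equivalent of the btl/jnc test added to wakeup.S. */
        if (s->behavior & (1u << RESTORE_MISC_ENABLE_BIT))
                write_msr(s->misc_en_low, s->misc_en_high);
}

int main(void)
{
        struct wakeup_state s;

        save_on_suspend(&s);
        restore_on_resume(&s);
        return 0;
}
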
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a81f2d52f869..c63822816249 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -14,7 +14,6 @@
 #include <asm/pgtable.h>
 #include <asm/mce.h>
 #include <asm/nmi.h>
-#include <asm/vsyscall.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
@@ -250,7 +249,6 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
-extern char __vsyscall_0;
 void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
@@ -263,6 +261,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
                                          struct alt_instr *end)
 {
         struct alt_instr *a;
+        u8 *instr, *replacement;
         u8 insnbuf[MAX_PATCH_LEN];
 
         DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
@@ -276,25 +275,23 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
          * order.
          */
         for (a = start; a < end; a++) {
-                u8 *instr = a->instr;
+                instr = (u8 *)&a->instr_offset + a->instr_offset;
+                replacement = (u8 *)&a->repl_offset + a->repl_offset;
                 BUG_ON(a->replacementlen > a->instrlen);
                 BUG_ON(a->instrlen > sizeof(insnbuf));
                 BUG_ON(a->cpuid >= NCAPINTS*32);
                 if (!boot_cpu_has(a->cpuid))
                         continue;
-#ifdef CONFIG_X86_64
-                /* vsyscall code is not mapped yet. resolve it manually. */
-                if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
-                        instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
-                        DPRINTK("%s: vsyscall fixup: %p => %p\n",
-                                __func__, a->instr, instr);
-                }
-#endif
-                memcpy(insnbuf, a->replacement, a->replacementlen);
+
+                memcpy(insnbuf, replacement, a->replacementlen);
+
+                /* 0xe8 is a relative jump; fix the offset. */
                 if (*insnbuf == 0xe8 && a->replacementlen == 5)
-                        *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
+                        *(s32 *)(insnbuf + 1) += replacement - instr;
+
                 add_nops(insnbuf + a->replacementlen,
                          a->instrlen - a->replacementlen);
+
                 text_poke_early(instr, insnbuf, a->instrlen);
         }
 }
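
In the apply_alternatives() hunk above, the patch site and replacement bytes are now located through self-relative s32 offsets (pointer = address of the offset field + stored offset), which is why the old vsyscall address fixup can be dropped. The surviving special case handles a 5-byte relative call (opcode 0xe8): when the replacement bytes are copied to a different address, the rel32 displacement must be re-biased by (replacement - instr) so the call still reaches the same absolute target. A small self-contained C sketch of that displacement arithmetic, illustrative only and not the kernel code:

#include <stdint.h>
#include <string.h>

/*
 * Copy a 5-byte "call rel32" (opcode 0xe8) from src to dst while keeping it
 * aimed at the same absolute target.  Since target = location + 5 + rel32,
 * moving the instruction means rel32 must grow by (src - dst), which is
 * exactly the "+= replacement - instr" adjustment in the hunk above.
 */
static void copy_call(uint8_t *dst, const uint8_t *src)
{
        int32_t rel;

        memcpy(dst, src, 5);
        if (dst[0] == 0xe8) {
                memcpy(&rel, src + 1, sizeof(rel));
                rel += (int32_t)(src - dst);
                memcpy(dst + 1, &rel, sizeof(rel));
        }
}

int main(void)
{
        uint8_t buf[16] = { 0xe8, 0, 0, 0, 0 };  /* call at buf, rel32 == 0 */

        copy_call(buf + 8, buf);                 /* move it 8 bytes forward */
        /* buf[9..12] now holds rel32 == -8, so (buf+8) + 5 - 8 == buf + 5,
         * the original target. */
        return 0;
}
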
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
deleted file mode 100644
index 7c3a95e54ec5..000000000000
--- a/arch/x86/kernel/amd_iommu.c
+++ /dev/null
@@ -1,2764 +0,0 @@
1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/bitmap.h>
23#include <linux/slab.h>
24#include <linux/debugfs.h>
25#include <linux/scatterlist.h>
26#include <linux/dma-mapping.h>
27#include <linux/iommu-helper.h>
28#include <linux/iommu.h>
29#include <linux/delay.h>
30#include <asm/proto.h>
31#include <asm/iommu.h>
32#include <asm/gart.h>
33#include <asm/dma.h>
34#include <asm/amd_iommu_proto.h>
35#include <asm/amd_iommu_types.h>
36#include <asm/amd_iommu.h>
37
38#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
39
40#define LOOP_TIMEOUT 100000
41
42static DEFINE_RWLOCK(amd_iommu_devtable_lock);
43
44/* A list of preallocated protection domains */
45static LIST_HEAD(iommu_pd_list);
46static DEFINE_SPINLOCK(iommu_pd_list_lock);
47
48/*
49 * Domain for untranslated devices - only allocated
50 * if iommu=pt passed on kernel cmd line.
51 */
52static struct protection_domain *pt_domain;
53
54static struct iommu_ops amd_iommu_ops;
55
56/*
57 * general struct to manage commands send to an IOMMU
58 */
59struct iommu_cmd {
60 u32 data[4];
61};
62
63static void update_domain(struct protection_domain *domain);
64
65/****************************************************************************
66 *
67 * Helper functions
68 *
69 ****************************************************************************/
70
71static inline u16 get_device_id(struct device *dev)
72{
73 struct pci_dev *pdev = to_pci_dev(dev);
74
75 return calc_devid(pdev->bus->number, pdev->devfn);
76}
77
78static struct iommu_dev_data *get_dev_data(struct device *dev)
79{
80 return dev->archdata.iommu;
81}
82
83/*
84 * In this function the list of preallocated protection domains is traversed to
85 * find the domain for a specific device
86 */
87static struct dma_ops_domain *find_protection_domain(u16 devid)
88{
89 struct dma_ops_domain *entry, *ret = NULL;
90 unsigned long flags;
91 u16 alias = amd_iommu_alias_table[devid];
92
93 if (list_empty(&iommu_pd_list))
94 return NULL;
95
96 spin_lock_irqsave(&iommu_pd_list_lock, flags);
97
98 list_for_each_entry(entry, &iommu_pd_list, list) {
99 if (entry->target_dev == devid ||
100 entry->target_dev == alias) {
101 ret = entry;
102 break;
103 }
104 }
105
106 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
107
108 return ret;
109}
110
111/*
112 * This function checks if the driver got a valid device from the caller to
113 * avoid dereferencing invalid pointers.
114 */
115static bool check_device(struct device *dev)
116{
117 u16 devid;
118
119 if (!dev || !dev->dma_mask)
120 return false;
121
122 /* No device or no PCI device */
123 if (dev->bus != &pci_bus_type)
124 return false;
125
126 devid = get_device_id(dev);
127
128 /* Out of our scope? */
129 if (devid > amd_iommu_last_bdf)
130 return false;
131
132 if (amd_iommu_rlookup_table[devid] == NULL)
133 return false;
134
135 return true;
136}
137
138static int iommu_init_device(struct device *dev)
139{
140 struct iommu_dev_data *dev_data;
141 struct pci_dev *pdev;
142 u16 devid, alias;
143
144 if (dev->archdata.iommu)
145 return 0;
146
147 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
148 if (!dev_data)
149 return -ENOMEM;
150
151 dev_data->dev = dev;
152
153 devid = get_device_id(dev);
154 alias = amd_iommu_alias_table[devid];
155 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
156 if (pdev)
157 dev_data->alias = &pdev->dev;
158 else {
159 kfree(dev_data);
160 return -ENOTSUPP;
161 }
162
163 atomic_set(&dev_data->bind, 0);
164
165 dev->archdata.iommu = dev_data;
166
167
168 return 0;
169}
170
171static void iommu_ignore_device(struct device *dev)
172{
173 u16 devid, alias;
174
175 devid = get_device_id(dev);
176 alias = amd_iommu_alias_table[devid];
177
178 memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
179 memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
180
181 amd_iommu_rlookup_table[devid] = NULL;
182 amd_iommu_rlookup_table[alias] = NULL;
183}
184
185static void iommu_uninit_device(struct device *dev)
186{
187 kfree(dev->archdata.iommu);
188}
189
190void __init amd_iommu_uninit_devices(void)
191{
192 struct pci_dev *pdev = NULL;
193
194 for_each_pci_dev(pdev) {
195
196 if (!check_device(&pdev->dev))
197 continue;
198
199 iommu_uninit_device(&pdev->dev);
200 }
201}
202
203int __init amd_iommu_init_devices(void)
204{
205 struct pci_dev *pdev = NULL;
206 int ret = 0;
207
208 for_each_pci_dev(pdev) {
209
210 if (!check_device(&pdev->dev))
211 continue;
212
213 ret = iommu_init_device(&pdev->dev);
214 if (ret == -ENOTSUPP)
215 iommu_ignore_device(&pdev->dev);
216 else if (ret)
217 goto out_free;
218 }
219
220 return 0;
221
222out_free:
223
224 amd_iommu_uninit_devices();
225
226 return ret;
227}
228#ifdef CONFIG_AMD_IOMMU_STATS
229
230/*
231 * Initialization code for statistics collection
232 */
233
234DECLARE_STATS_COUNTER(compl_wait);
235DECLARE_STATS_COUNTER(cnt_map_single);
236DECLARE_STATS_COUNTER(cnt_unmap_single);
237DECLARE_STATS_COUNTER(cnt_map_sg);
238DECLARE_STATS_COUNTER(cnt_unmap_sg);
239DECLARE_STATS_COUNTER(cnt_alloc_coherent);
240DECLARE_STATS_COUNTER(cnt_free_coherent);
241DECLARE_STATS_COUNTER(cross_page);
242DECLARE_STATS_COUNTER(domain_flush_single);
243DECLARE_STATS_COUNTER(domain_flush_all);
244DECLARE_STATS_COUNTER(alloced_io_mem);
245DECLARE_STATS_COUNTER(total_map_requests);
246
247static struct dentry *stats_dir;
248static struct dentry *de_fflush;
249
250static void amd_iommu_stats_add(struct __iommu_counter *cnt)
251{
252 if (stats_dir == NULL)
253 return;
254
255 cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
256 &cnt->value);
257}
258
259static void amd_iommu_stats_init(void)
260{
261 stats_dir = debugfs_create_dir("amd-iommu", NULL);
262 if (stats_dir == NULL)
263 return;
264
265 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
266 (u32 *)&amd_iommu_unmap_flush);
267
268 amd_iommu_stats_add(&compl_wait);
269 amd_iommu_stats_add(&cnt_map_single);
270 amd_iommu_stats_add(&cnt_unmap_single);
271 amd_iommu_stats_add(&cnt_map_sg);
272 amd_iommu_stats_add(&cnt_unmap_sg);
273 amd_iommu_stats_add(&cnt_alloc_coherent);
274 amd_iommu_stats_add(&cnt_free_coherent);
275 amd_iommu_stats_add(&cross_page);
276 amd_iommu_stats_add(&domain_flush_single);
277 amd_iommu_stats_add(&domain_flush_all);
278 amd_iommu_stats_add(&alloced_io_mem);
279 amd_iommu_stats_add(&total_map_requests);
280}
281
282#endif
283
284/****************************************************************************
285 *
286 * Interrupt handling functions
287 *
288 ****************************************************************************/
289
290static void dump_dte_entry(u16 devid)
291{
292 int i;
293
294 for (i = 0; i < 8; ++i)
295 pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
296 amd_iommu_dev_table[devid].data[i]);
297}
298
299static void dump_command(unsigned long phys_addr)
300{
301 struct iommu_cmd *cmd = phys_to_virt(phys_addr);
302 int i;
303
304 for (i = 0; i < 4; ++i)
305 pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
306}
307
308static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
309{
310 u32 *event = __evt;
311 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
312 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
313 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
314 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
315 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
316
317 printk(KERN_ERR "AMD-Vi: Event logged [");
318
319 switch (type) {
320 case EVENT_TYPE_ILL_DEV:
321 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
322 "address=0x%016llx flags=0x%04x]\n",
323 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
324 address, flags);
325 dump_dte_entry(devid);
326 break;
327 case EVENT_TYPE_IO_FAULT:
328 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
329 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
330 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
331 domid, address, flags);
332 break;
333 case EVENT_TYPE_DEV_TAB_ERR:
334 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
335 "address=0x%016llx flags=0x%04x]\n",
336 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
337 address, flags);
338 break;
339 case EVENT_TYPE_PAGE_TAB_ERR:
340 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
341 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
342 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
343 domid, address, flags);
344 break;
345 case EVENT_TYPE_ILL_CMD:
346 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
347 dump_command(address);
348 break;
349 case EVENT_TYPE_CMD_HARD_ERR:
350 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
351 "flags=0x%04x]\n", address, flags);
352 break;
353 case EVENT_TYPE_IOTLB_INV_TO:
354 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
355 "address=0x%016llx]\n",
356 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
357 address);
358 break;
359 case EVENT_TYPE_INV_DEV_REQ:
360 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
361 "address=0x%016llx flags=0x%04x]\n",
362 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
363 address, flags);
364 break;
365 default:
366 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
367 }
368}
369
370static void iommu_poll_events(struct amd_iommu *iommu)
371{
372 u32 head, tail;
373 unsigned long flags;
374
375 spin_lock_irqsave(&iommu->lock, flags);
376
377 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
378 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
379
380 while (head != tail) {
381 iommu_print_event(iommu, iommu->evt_buf + head);
382 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
383 }
384
385 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
386
387 spin_unlock_irqrestore(&iommu->lock, flags);
388}
389
390irqreturn_t amd_iommu_int_thread(int irq, void *data)
391{
392 struct amd_iommu *iommu;
393
394 for_each_iommu(iommu)
395 iommu_poll_events(iommu);
396
397 return IRQ_HANDLED;
398}
399
400irqreturn_t amd_iommu_int_handler(int irq, void *data)
401{
402 return IRQ_WAKE_THREAD;
403}
404
405/****************************************************************************
406 *
407 * IOMMU command queuing functions
408 *
409 ****************************************************************************/
410
411static int wait_on_sem(volatile u64 *sem)
412{
413 int i = 0;
414
415 while (*sem == 0 && i < LOOP_TIMEOUT) {
416 udelay(1);
417 i += 1;
418 }
419
420 if (i == LOOP_TIMEOUT) {
421 pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
422 return -EIO;
423 }
424
425 return 0;
426}
427
428static void copy_cmd_to_buffer(struct amd_iommu *iommu,
429 struct iommu_cmd *cmd,
430 u32 tail)
431{
432 u8 *target;
433
434 target = iommu->cmd_buf + tail;
435 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
436
437 /* Copy command to buffer */
438 memcpy(target, cmd, sizeof(*cmd));
439
440 /* Tell the IOMMU about it */
441 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
442}
443
444static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
445{
446 WARN_ON(address & 0x7ULL);
447
448 memset(cmd, 0, sizeof(*cmd));
449 cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
450 cmd->data[1] = upper_32_bits(__pa(address));
451 cmd->data[2] = 1;
452 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
453}
454
455static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
456{
457 memset(cmd, 0, sizeof(*cmd));
458 cmd->data[0] = devid;
459 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
460}
461
462static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
463 size_t size, u16 domid, int pde)
464{
465 u64 pages;
466 int s;
467
468 pages = iommu_num_pages(address, size, PAGE_SIZE);
469 s = 0;
470
471 if (pages > 1) {
472 /*
473 * If we have to flush more than one page, flush all
474 * TLB entries for this domain
475 */
476 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
477 s = 1;
478 }
479
480 address &= PAGE_MASK;
481
482 memset(cmd, 0, sizeof(*cmd));
483 cmd->data[1] |= domid;
484 cmd->data[2] = lower_32_bits(address);
485 cmd->data[3] = upper_32_bits(address);
486 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
487 if (s) /* size bit - we flush more than one 4kb page */
488 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
489 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
490 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
491}
492
493static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
494 u64 address, size_t size)
495{
496 u64 pages;
497 int s;
498
499 pages = iommu_num_pages(address, size, PAGE_SIZE);
500 s = 0;
501
502 if (pages > 1) {
503 /*
504 * If we have to flush more than one page, flush all
505 * TLB entries for this domain
506 */
507 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
508 s = 1;
509 }
510
511 address &= PAGE_MASK;
512
513 memset(cmd, 0, sizeof(*cmd));
514 cmd->data[0] = devid;
515 cmd->data[0] |= (qdep & 0xff) << 24;
516 cmd->data[1] = devid;
517 cmd->data[2] = lower_32_bits(address);
518 cmd->data[3] = upper_32_bits(address);
519 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
520 if (s)
521 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
522}
523
524static void build_inv_all(struct iommu_cmd *cmd)
525{
526 memset(cmd, 0, sizeof(*cmd));
527 CMD_SET_TYPE(cmd, CMD_INV_ALL);
528}
529
530/*
531 * Writes the command to the IOMMUs command buffer and informs the
532 * hardware about the new command.
533 */
534static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
535{
536 u32 left, tail, head, next_tail;
537 unsigned long flags;
538
539 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
540
541again:
542 spin_lock_irqsave(&iommu->lock, flags);
543
544 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
545 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
546 next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
547 left = (head - next_tail) % iommu->cmd_buf_size;
548
549 if (left <= 2) {
550 struct iommu_cmd sync_cmd;
551 volatile u64 sem = 0;
552 int ret;
553
554 build_completion_wait(&sync_cmd, (u64)&sem);
555 copy_cmd_to_buffer(iommu, &sync_cmd, tail);
556
557 spin_unlock_irqrestore(&iommu->lock, flags);
558
559 if ((ret = wait_on_sem(&sem)) != 0)
560 return ret;
561
562 goto again;
563 }
564
565 copy_cmd_to_buffer(iommu, cmd, tail);
566
567 /* We need to sync now to make sure all commands are processed */
568 iommu->need_sync = true;
569
570 spin_unlock_irqrestore(&iommu->lock, flags);
571
572 return 0;
573}
574
575/*
576 * This function queues a completion wait command into the command
577 * buffer of an IOMMU
578 */
579static int iommu_completion_wait(struct amd_iommu *iommu)
580{
581 struct iommu_cmd cmd;
582 volatile u64 sem = 0;
583 int ret;
584
585 if (!iommu->need_sync)
586 return 0;
587
588 build_completion_wait(&cmd, (u64)&sem);
589
590 ret = iommu_queue_command(iommu, &cmd);
591 if (ret)
592 return ret;
593
594 return wait_on_sem(&sem);
595}
596
597static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
598{
599 struct iommu_cmd cmd;
600
601 build_inv_dte(&cmd, devid);
602
603 return iommu_queue_command(iommu, &cmd);
604}
605
606static void iommu_flush_dte_all(struct amd_iommu *iommu)
607{
608 u32 devid;
609
610 for (devid = 0; devid <= 0xffff; ++devid)
611 iommu_flush_dte(iommu, devid);
612
613 iommu_completion_wait(iommu);
614}
615
616/*
617 * This function uses heavy locking and may disable irqs for some time. But
618 * this is no issue because it is only called during resume.
619 */
620static void iommu_flush_tlb_all(struct amd_iommu *iommu)
621{
622 u32 dom_id;
623
624 for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
625 struct iommu_cmd cmd;
626 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
627 dom_id, 1);
628 iommu_queue_command(iommu, &cmd);
629 }
630
631 iommu_completion_wait(iommu);
632}
633
634static void iommu_flush_all(struct amd_iommu *iommu)
635{
636 struct iommu_cmd cmd;
637
638 build_inv_all(&cmd);
639
640 iommu_queue_command(iommu, &cmd);
641 iommu_completion_wait(iommu);
642}
643
644void iommu_flush_all_caches(struct amd_iommu *iommu)
645{
646 if (iommu_feature(iommu, FEATURE_IA)) {
647 iommu_flush_all(iommu);
648 } else {
649 iommu_flush_dte_all(iommu);
650 iommu_flush_tlb_all(iommu);
651 }
652}
653
654/*
655 * Command send function for flushing on-device TLB
656 */
657static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
658{
659 struct pci_dev *pdev = to_pci_dev(dev);
660 struct amd_iommu *iommu;
661 struct iommu_cmd cmd;
662 u16 devid;
663 int qdep;
664
665 qdep = pci_ats_queue_depth(pdev);
666 devid = get_device_id(dev);
667 iommu = amd_iommu_rlookup_table[devid];
668
669 build_inv_iotlb_pages(&cmd, devid, qdep, address, size);
670
671 return iommu_queue_command(iommu, &cmd);
672}
673
674/*
675 * Command send function for invalidating a device table entry
676 */
677static int device_flush_dte(struct device *dev)
678{
679 struct amd_iommu *iommu;
680 struct pci_dev *pdev;
681 u16 devid;
682 int ret;
683
684 pdev = to_pci_dev(dev);
685 devid = get_device_id(dev);
686 iommu = amd_iommu_rlookup_table[devid];
687
688 ret = iommu_flush_dte(iommu, devid);
689 if (ret)
690 return ret;
691
692 if (pci_ats_enabled(pdev))
693 ret = device_flush_iotlb(dev, 0, ~0UL);
694
695 return ret;
696}
697
698/*
699 * TLB invalidation function which is called from the mapping functions.
700 * It invalidates a single PTE if the range to flush is within a single
701 * page. Otherwise it flushes the whole TLB of the IOMMU.
702 */
703static void __domain_flush_pages(struct protection_domain *domain,
704 u64 address, size_t size, int pde)
705{
706 struct iommu_dev_data *dev_data;
707 struct iommu_cmd cmd;
708 int ret = 0, i;
709
710 build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
711
712 for (i = 0; i < amd_iommus_present; ++i) {
713 if (!domain->dev_iommu[i])
714 continue;
715
716 /*
717 * Devices of this domain are behind this IOMMU
718 * We need a TLB flush
719 */
720 ret |= iommu_queue_command(amd_iommus[i], &cmd);
721 }
722
723 list_for_each_entry(dev_data, &domain->dev_list, list) {
724 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
725
726 if (!pci_ats_enabled(pdev))
727 continue;
728
729 ret |= device_flush_iotlb(dev_data->dev, address, size);
730 }
731
732 WARN_ON(ret);
733}
734
735static void domain_flush_pages(struct protection_domain *domain,
736 u64 address, size_t size)
737{
738 __domain_flush_pages(domain, address, size, 0);
739}
740
741/* Flush the whole IO/TLB for a given protection domain */
742static void domain_flush_tlb(struct protection_domain *domain)
743{
744 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
745}
746
747/* Flush the whole IO/TLB for a given protection domain - including PDE */
748static void domain_flush_tlb_pde(struct protection_domain *domain)
749{
750 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
751}
752
753static void domain_flush_complete(struct protection_domain *domain)
754{
755 int i;
756
757 for (i = 0; i < amd_iommus_present; ++i) {
758 if (!domain->dev_iommu[i])
759 continue;
760
761 /*
762 * Devices of this domain are behind this IOMMU
763 * We need to wait for completion of all commands.
764 */
765 iommu_completion_wait(amd_iommus[i]);
766 }
767}
768
769
770/*
771 * This function flushes the DTEs for all devices in domain
772 */
773static void domain_flush_devices(struct protection_domain *domain)
774{
775 struct iommu_dev_data *dev_data;
776 unsigned long flags;
777
778 spin_lock_irqsave(&domain->lock, flags);
779
780 list_for_each_entry(dev_data, &domain->dev_list, list)
781 device_flush_dte(dev_data->dev);
782
783 spin_unlock_irqrestore(&domain->lock, flags);
784}
785
786/****************************************************************************
787 *
788 * The functions below are used the create the page table mappings for
789 * unity mapped regions.
790 *
791 ****************************************************************************/
792
793/*
794 * This function is used to add another level to an IO page table. Adding
795 * another level increases the size of the address space by 9 bits to a size up
796 * to 64 bits.
797 */
798static bool increase_address_space(struct protection_domain *domain,
799 gfp_t gfp)
800{
801 u64 *pte;
802
803 if (domain->mode == PAGE_MODE_6_LEVEL)
804 /* address space already 64 bit large */
805 return false;
806
807 pte = (void *)get_zeroed_page(gfp);
808 if (!pte)
809 return false;
810
811 *pte = PM_LEVEL_PDE(domain->mode,
812 virt_to_phys(domain->pt_root));
813 domain->pt_root = pte;
814 domain->mode += 1;
815 domain->updated = true;
816
817 return true;
818}
819
820static u64 *alloc_pte(struct protection_domain *domain,
821 unsigned long address,
822 unsigned long page_size,
823 u64 **pte_page,
824 gfp_t gfp)
825{
826 int level, end_lvl;
827 u64 *pte, *page;
828
829 BUG_ON(!is_power_of_2(page_size));
830
831 while (address > PM_LEVEL_SIZE(domain->mode))
832 increase_address_space(domain, gfp);
833
834 level = domain->mode - 1;
835 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
836 address = PAGE_SIZE_ALIGN(address, page_size);
837 end_lvl = PAGE_SIZE_LEVEL(page_size);
838
839 while (level > end_lvl) {
840 if (!IOMMU_PTE_PRESENT(*pte)) {
841 page = (u64 *)get_zeroed_page(gfp);
842 if (!page)
843 return NULL;
844 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
845 }
846
847 /* No level skipping support yet */
848 if (PM_PTE_LEVEL(*pte) != level)
849 return NULL;
850
851 level -= 1;
852
853 pte = IOMMU_PTE_PAGE(*pte);
854
855 if (pte_page && level == end_lvl)
856 *pte_page = pte;
857
858 pte = &pte[PM_LEVEL_INDEX(level, address)];
859 }
860
861 return pte;
862}
863
864/*
865 * This function checks if there is a PTE for a given dma address. If
866 * there is one, it returns the pointer to it.
867 */
868static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
869{
870 int level;
871 u64 *pte;
872
873 if (address > PM_LEVEL_SIZE(domain->mode))
874 return NULL;
875
876 level = domain->mode - 1;
877 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
878
879 while (level > 0) {
880
881 /* Not Present */
882 if (!IOMMU_PTE_PRESENT(*pte))
883 return NULL;
884
885 /* Large PTE */
886 if (PM_PTE_LEVEL(*pte) == 0x07) {
887 unsigned long pte_mask, __pte;
888
889 /*
890 * If we have a series of large PTEs, make
891 * sure to return a pointer to the first one.
892 */
893 pte_mask = PTE_PAGE_SIZE(*pte);
894 pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
895 __pte = ((unsigned long)pte) & pte_mask;
896
897 return (u64 *)__pte;
898 }
899
900 /* No level skipping support yet */
901 if (PM_PTE_LEVEL(*pte) != level)
902 return NULL;
903
904 level -= 1;
905
906 /* Walk to the next level */
907 pte = IOMMU_PTE_PAGE(*pte);
908 pte = &pte[PM_LEVEL_INDEX(level, address)];
909 }
910
911 return pte;
912}
913
914/*
915 * Generic mapping functions. It maps a physical address into a DMA
916 * address space. It allocates the page table pages if necessary.
917 * In the future it can be extended to a generic mapping function
918 * supporting all features of AMD IOMMU page tables like level skipping
919 * and full 64 bit address spaces.
920 */
921static int iommu_map_page(struct protection_domain *dom,
922 unsigned long bus_addr,
923 unsigned long phys_addr,
924 int prot,
925 unsigned long page_size)
926{
927 u64 __pte, *pte;
928 int i, count;
929
930 if (!(prot & IOMMU_PROT_MASK))
931 return -EINVAL;
932
933 bus_addr = PAGE_ALIGN(bus_addr);
934 phys_addr = PAGE_ALIGN(phys_addr);
935 count = PAGE_SIZE_PTE_COUNT(page_size);
936 pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
937
938 for (i = 0; i < count; ++i)
939 if (IOMMU_PTE_PRESENT(pte[i]))
940 return -EBUSY;
941
942 if (page_size > PAGE_SIZE) {
943 __pte = PAGE_SIZE_PTE(phys_addr, page_size);
944 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
945 } else
946 __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
947
948 if (prot & IOMMU_PROT_IR)
949 __pte |= IOMMU_PTE_IR;
950 if (prot & IOMMU_PROT_IW)
951 __pte |= IOMMU_PTE_IW;
952
953 for (i = 0; i < count; ++i)
954 pte[i] = __pte;
955
956 update_domain(dom);
957
958 return 0;
959}
960
961static unsigned long iommu_unmap_page(struct protection_domain *dom,
962 unsigned long bus_addr,
963 unsigned long page_size)
964{
965 unsigned long long unmap_size, unmapped;
966 u64 *pte;
967
968 BUG_ON(!is_power_of_2(page_size));
969
970 unmapped = 0;
971
972 while (unmapped < page_size) {
973
974 pte = fetch_pte(dom, bus_addr);
975
976 if (!pte) {
977 /*
978 * No PTE for this address
979 * move forward in 4kb steps
980 */
981 unmap_size = PAGE_SIZE;
982 } else if (PM_PTE_LEVEL(*pte) == 0) {
983 /* 4kb PTE found for this address */
984 unmap_size = PAGE_SIZE;
985 *pte = 0ULL;
986 } else {
987 int count, i;
988
989 /* Large PTE found which maps this address */
990 unmap_size = PTE_PAGE_SIZE(*pte);
991 count = PAGE_SIZE_PTE_COUNT(unmap_size);
992 for (i = 0; i < count; i++)
993 pte[i] = 0ULL;
994 }
995
996 bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
997 unmapped += unmap_size;
998 }
999
1000 BUG_ON(!is_power_of_2(unmapped));
1001
1002 return unmapped;
1003}
1004
1005/*
1006 * This function checks if a specific unity mapping entry is needed for
1007 * this specific IOMMU.
1008 */
1009static int iommu_for_unity_map(struct amd_iommu *iommu,
1010 struct unity_map_entry *entry)
1011{
1012 u16 bdf, i;
1013
1014 for (i = entry->devid_start; i <= entry->devid_end; ++i) {
1015 bdf = amd_iommu_alias_table[i];
1016 if (amd_iommu_rlookup_table[bdf] == iommu)
1017 return 1;
1018 }
1019
1020 return 0;
1021}
1022
1023/*
1024 * This function actually applies the mapping to the page table of the
1025 * dma_ops domain.
1026 */
1027static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
1028 struct unity_map_entry *e)
1029{
1030 u64 addr;
1031 int ret;
1032
1033 for (addr = e->address_start; addr < e->address_end;
1034 addr += PAGE_SIZE) {
1035 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
1036 PAGE_SIZE);
1037 if (ret)
1038 return ret;
1039 /*
1040 * if unity mapping is in aperture range mark the page
1041 * as allocated in the aperture
1042 */
1043 if (addr < dma_dom->aperture_size)
1044 __set_bit(addr >> PAGE_SHIFT,
1045 dma_dom->aperture[0]->bitmap);
1046 }
1047
1048 return 0;
1049}
1050
1051/*
1052 * Init the unity mappings for a specific IOMMU in the system
1053 *
1054 * Basically iterates over all unity mapping entries and applies them to
1055 * the default domain DMA of that IOMMU if necessary.
1056 */
1057static int iommu_init_unity_mappings(struct amd_iommu *iommu)
1058{
1059 struct unity_map_entry *entry;
1060 int ret;
1061
1062 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
1063 if (!iommu_for_unity_map(iommu, entry))
1064 continue;
1065 ret = dma_ops_unity_map(iommu->default_dom, entry);
1066 if (ret)
1067 return ret;
1068 }
1069
1070 return 0;
1071}
1072
1073/*
1074 * Inits the unity mappings required for a specific device
1075 */
1076static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
1077 u16 devid)
1078{
1079 struct unity_map_entry *e;
1080 int ret;
1081
1082 list_for_each_entry(e, &amd_iommu_unity_map, list) {
1083 if (!(devid >= e->devid_start && devid <= e->devid_end))
1084 continue;
1085 ret = dma_ops_unity_map(dma_dom, e);
1086 if (ret)
1087 return ret;
1088 }
1089
1090 return 0;
1091}
1092
1093/****************************************************************************
1094 *
1095 * The next functions belong to the address allocator for the dma_ops
1096 * interface functions. They work like the allocators in the other IOMMU
1097 * drivers. Its basically a bitmap which marks the allocated pages in
1098 * the aperture. Maybe it could be enhanced in the future to a more
1099 * efficient allocator.
1100 *
1101 ****************************************************************************/
1102
1103/*
1104 * The address allocator core functions.
1105 *
1106 * called with domain->lock held
1107 */
1108
1109/*
1110 * Used to reserve address ranges in the aperture (e.g. for exclusion
1111 * ranges.
1112 */
1113static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1114 unsigned long start_page,
1115 unsigned int pages)
1116{
1117 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1118
1119 if (start_page + pages > last_page)
1120 pages = last_page - start_page;
1121
1122 for (i = start_page; i < start_page + pages; ++i) {
1123 int index = i / APERTURE_RANGE_PAGES;
1124 int page = i % APERTURE_RANGE_PAGES;
1125 __set_bit(page, dom->aperture[index]->bitmap);
1126 }
1127}
1128
1129/*
1130 * This function is used to add a new aperture range to an existing
1131 * aperture in case of dma_ops domain allocation or address allocation
1132 * failure.
1133 */
1134static int alloc_new_range(struct dma_ops_domain *dma_dom,
1135 bool populate, gfp_t gfp)
1136{
1137 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
1138 struct amd_iommu *iommu;
1139 unsigned long i;
1140
1141#ifdef CONFIG_IOMMU_STRESS
1142 populate = false;
1143#endif
1144
1145 if (index >= APERTURE_MAX_RANGES)
1146 return -ENOMEM;
1147
1148 dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
1149 if (!dma_dom->aperture[index])
1150 return -ENOMEM;
1151
1152 dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
1153 if (!dma_dom->aperture[index]->bitmap)
1154 goto out_free;
1155
1156 dma_dom->aperture[index]->offset = dma_dom->aperture_size;
1157
1158 if (populate) {
1159 unsigned long address = dma_dom->aperture_size;
1160 int i, num_ptes = APERTURE_RANGE_PAGES / 512;
1161 u64 *pte, *pte_page;
1162
1163 for (i = 0; i < num_ptes; ++i) {
1164 pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
1165 &pte_page, gfp);
1166 if (!pte)
1167 goto out_free;
1168
1169 dma_dom->aperture[index]->pte_pages[i] = pte_page;
1170
1171 address += APERTURE_RANGE_SIZE / 64;
1172 }
1173 }
1174
1175 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
1176
1177 /* Initialize the exclusion range if necessary */
1178 for_each_iommu(iommu) {
1179 if (iommu->exclusion_start &&
1180 iommu->exclusion_start >= dma_dom->aperture[index]->offset
1181 && iommu->exclusion_start < dma_dom->aperture_size) {
1182 unsigned long startpage;
1183 int pages = iommu_num_pages(iommu->exclusion_start,
1184 iommu->exclusion_length,
1185 PAGE_SIZE);
1186 startpage = iommu->exclusion_start >> PAGE_SHIFT;
1187 dma_ops_reserve_addresses(dma_dom, startpage, pages);
1188 }
1189 }
1190
1191 /*
1192 * Check for areas already mapped as present in the new aperture
1193 * range and mark those pages as reserved in the allocator. Such
1194 * mappings may already exist as a result of requested unity
1195 * mappings for devices.
1196 */
1197 for (i = dma_dom->aperture[index]->offset;
1198 i < dma_dom->aperture_size;
1199 i += PAGE_SIZE) {
1200 u64 *pte = fetch_pte(&dma_dom->domain, i);
1201 if (!pte || !IOMMU_PTE_PRESENT(*pte))
1202 continue;
1203
1204 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
1205 }
1206
1207 update_domain(&dma_dom->domain);
1208
1209 return 0;
1210
1211out_free:
1212 update_domain(&dma_dom->domain);
1213
1214 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
1215
1216 kfree(dma_dom->aperture[index]);
1217 dma_dom->aperture[index] = NULL;
1218
1219 return -ENOMEM;
1220}
1221
1222static unsigned long dma_ops_area_alloc(struct device *dev,
1223 struct dma_ops_domain *dom,
1224 unsigned int pages,
1225 unsigned long align_mask,
1226 u64 dma_mask,
1227 unsigned long start)
1228{
1229 unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
1230 int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
1231 int i = start >> APERTURE_RANGE_SHIFT;
1232 unsigned long boundary_size;
1233 unsigned long address = -1;
1234 unsigned long limit;
1235
1236 next_bit >>= PAGE_SHIFT;
1237
1238 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
1239 PAGE_SIZE) >> PAGE_SHIFT;
1240
1241 for (;i < max_index; ++i) {
1242 unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
1243
1244 if (dom->aperture[i]->offset >= dma_mask)
1245 break;
1246
1247 limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
1248 dma_mask >> PAGE_SHIFT);
1249
1250 address = iommu_area_alloc(dom->aperture[i]->bitmap,
1251 limit, next_bit, pages, 0,
1252 boundary_size, align_mask);
1253 if (address != -1) {
1254 address = dom->aperture[i]->offset +
1255 (address << PAGE_SHIFT);
1256 dom->next_address = address + (pages << PAGE_SHIFT);
1257 break;
1258 }
1259
1260 next_bit = 0;
1261 }
1262
1263 return address;
1264}
1265
1266static unsigned long dma_ops_alloc_addresses(struct device *dev,
1267 struct dma_ops_domain *dom,
1268 unsigned int pages,
1269 unsigned long align_mask,
1270 u64 dma_mask)
1271{
1272 unsigned long address;
1273
1274#ifdef CONFIG_IOMMU_STRESS
1275 dom->next_address = 0;
1276 dom->need_flush = true;
1277#endif
1278
1279 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
1280 dma_mask, dom->next_address);
1281
1282 if (address == -1) {
1283 dom->next_address = 0;
1284 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
1285 dma_mask, 0);
1286 dom->need_flush = true;
1287 }
1288
1289 if (unlikely(address == -1))
1290 address = DMA_ERROR_CODE;
1291
1292 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
1293
1294 return address;
1295}
1296
1297/*
1298 * The address free function.
1299 *
1300 * called with domain->lock held
1301 */
1302static void dma_ops_free_addresses(struct dma_ops_domain *dom,
1303 unsigned long address,
1304 unsigned int pages)
1305{
1306 unsigned i = address >> APERTURE_RANGE_SHIFT;
1307 struct aperture_range *range = dom->aperture[i];
1308
1309 BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
1310
1311#ifdef CONFIG_IOMMU_STRESS
1312 if (i < 4)
1313 return;
1314#endif
1315
1316 if (address >= dom->next_address)
1317 dom->need_flush = true;
1318
1319 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
1320
1321 bitmap_clear(range->bitmap, address, pages);
1322
1323}
1324
1325/****************************************************************************
1326 *
1327 * The next functions belong to the domain allocation. A domain is
1328 * allocated for every IOMMU as the default domain. If device isolation
1329 * is enabled, every device get its own domain. The most important thing
1330 * about domains is the page table mapping the DMA address space they
1331 * contain.
1332 *
1333 ****************************************************************************/
1334
1335/*
1336 * This function adds a protection domain to the global protection domain list
1337 */
1338static void add_domain_to_list(struct protection_domain *domain)
1339{
1340 unsigned long flags;
1341
1342 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1343 list_add(&domain->list, &amd_iommu_pd_list);
1344 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1345}
1346
1347/*
1348 * This function removes a protection domain to the global
1349 * protection domain list
1350 */
1351static void del_domain_from_list(struct protection_domain *domain)
1352{
1353 unsigned long flags;
1354
1355 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1356 list_del(&domain->list);
1357 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1358}
1359
1360static u16 domain_id_alloc(void)
1361{
1362 unsigned long flags;
1363 int id;
1364
1365 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1366 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1367 BUG_ON(id == 0);
1368 if (id > 0 && id < MAX_DOMAIN_ID)
1369 __set_bit(id, amd_iommu_pd_alloc_bitmap);
1370 else
1371 id = 0;
1372 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1373
1374 return id;
1375}
1376
1377static void domain_id_free(int id)
1378{
1379 unsigned long flags;
1380
1381 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1382 if (id > 0 && id < MAX_DOMAIN_ID)
1383 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
1384 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1385}
1386
1387static void free_pagetable(struct protection_domain *domain)
1388{
1389 int i, j;
1390 u64 *p1, *p2, *p3;
1391
1392 p1 = domain->pt_root;
1393
1394 if (!p1)
1395 return;
1396
1397 for (i = 0; i < 512; ++i) {
1398 if (!IOMMU_PTE_PRESENT(p1[i]))
1399 continue;
1400
1401 p2 = IOMMU_PTE_PAGE(p1[i]);
1402 for (j = 0; j < 512; ++j) {
1403 if (!IOMMU_PTE_PRESENT(p2[j]))
1404 continue;
1405 p3 = IOMMU_PTE_PAGE(p2[j]);
1406 free_page((unsigned long)p3);
1407 }
1408
1409 free_page((unsigned long)p2);
1410 }
1411
1412 free_page((unsigned long)p1);
1413
1414 domain->pt_root = NULL;
1415}
1416
1417/*
1418 * Free a domain, only used if something went wrong in the
1419 * allocation path and we need to free an already allocated page table
1420 */
1421static void dma_ops_domain_free(struct dma_ops_domain *dom)
1422{
1423 int i;
1424
1425 if (!dom)
1426 return;
1427
1428 del_domain_from_list(&dom->domain);
1429
1430 free_pagetable(&dom->domain);
1431
1432 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
1433 if (!dom->aperture[i])
1434 continue;
1435 free_page((unsigned long)dom->aperture[i]->bitmap);
1436 kfree(dom->aperture[i]);
1437 }
1438
1439 kfree(dom);
1440}
1441
1442/*
1443 * Allocates a new protection domain usable for the dma_ops functions.
1444 * It also initializes the page table and the address allocator data
1445 * structures required for the dma_ops interface
1446 */
1447static struct dma_ops_domain *dma_ops_domain_alloc(void)
1448{
1449 struct dma_ops_domain *dma_dom;
1450
1451 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
1452 if (!dma_dom)
1453 return NULL;
1454
1455 spin_lock_init(&dma_dom->domain.lock);
1456
1457 dma_dom->domain.id = domain_id_alloc();
1458 if (dma_dom->domain.id == 0)
1459 goto free_dma_dom;
1460 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1461 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1462 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1463 dma_dom->domain.flags = PD_DMA_OPS_MASK;
1464 dma_dom->domain.priv = dma_dom;
1465 if (!dma_dom->domain.pt_root)
1466 goto free_dma_dom;
1467
1468 dma_dom->need_flush = false;
1469 dma_dom->target_dev = 0xffff;
1470
1471 add_domain_to_list(&dma_dom->domain);
1472
1473 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1474 goto free_dma_dom;
1475
1476 /*
1477 * mark the first page as allocated so we never return 0 as
1478 * a valid dma-address. So we can use 0 as error value
1479 */
1480 dma_dom->aperture[0]->bitmap[0] = 1;
1481 dma_dom->next_address = 0;
1482
1483
1484 return dma_dom;
1485
1486free_dma_dom:
1487 dma_ops_domain_free(dma_dom);
1488
1489 return NULL;
1490}
1491
1492/*
1493 * little helper function to check whether a given protection domain is a
1494 * dma_ops domain
1495 */
1496static bool dma_ops_domain(struct protection_domain *domain)
1497{
1498 return domain->flags & PD_DMA_OPS_MASK;
1499}
1500
1501static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
1502{
1503 u64 pte_root = virt_to_phys(domain->pt_root);
1504 u32 flags = 0;
1505
1506 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1507 << DEV_ENTRY_MODE_SHIFT;
1508 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1509
1510 if (ats)
1511 flags |= DTE_FLAG_IOTLB;
1512
1513 amd_iommu_dev_table[devid].data[3] |= flags;
1514 amd_iommu_dev_table[devid].data[2] = domain->id;
1515 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1516 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1517}
1518
1519static void clear_dte_entry(u16 devid)
1520{
1521 /* remove entry from the device table seen by the hardware */
1522 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1523 amd_iommu_dev_table[devid].data[1] = 0;
1524 amd_iommu_dev_table[devid].data[2] = 0;
1525
1526 amd_iommu_apply_erratum_63(devid);
1527}
1528
1529static void do_attach(struct device *dev, struct protection_domain *domain)
1530{
1531 struct iommu_dev_data *dev_data;
1532 struct amd_iommu *iommu;
1533 struct pci_dev *pdev;
1534 bool ats = false;
1535 u16 devid;
1536
1537 devid = get_device_id(dev);
1538 iommu = amd_iommu_rlookup_table[devid];
1539 dev_data = get_dev_data(dev);
1540 pdev = to_pci_dev(dev);
1541
1542 if (amd_iommu_iotlb_sup)
1543 ats = pci_ats_enabled(pdev);
1544
1545 /* Update data structures */
1546 dev_data->domain = domain;
1547 list_add(&dev_data->list, &domain->dev_list);
1548 set_dte_entry(devid, domain, ats);
1549
1550 /* Do reference counting */
1551 domain->dev_iommu[iommu->index] += 1;
1552 domain->dev_cnt += 1;
1553
1554 /* Flush the DTE entry */
1555 device_flush_dte(dev);
1556}
1557
1558static void do_detach(struct device *dev)
1559{
1560 struct iommu_dev_data *dev_data;
1561 struct amd_iommu *iommu;
1562 u16 devid;
1563
1564 devid = get_device_id(dev);
1565 iommu = amd_iommu_rlookup_table[devid];
1566 dev_data = get_dev_data(dev);
1567
1568 /* decrease reference counters */
1569 dev_data->domain->dev_iommu[iommu->index] -= 1;
1570 dev_data->domain->dev_cnt -= 1;
1571
1572 /* Update data structures */
1573 dev_data->domain = NULL;
1574 list_del(&dev_data->list);
1575 clear_dte_entry(devid);
1576
1577 /* Flush the DTE entry */
1578 device_flush_dte(dev);
1579}
1580
1581/*
1582 * If a device is not yet associated with a domain, this function does
1583 * assigns it visible for the hardware
1584 */
1585static int __attach_device(struct device *dev,
1586 struct protection_domain *domain)
1587{
1588 struct iommu_dev_data *dev_data, *alias_data;
1589 int ret;
1590
1591 dev_data = get_dev_data(dev);
1592 alias_data = get_dev_data(dev_data->alias);
1593
1594 if (!alias_data)
1595 return -EINVAL;
1596
1597 /* lock domain */
1598 spin_lock(&domain->lock);
1599
1600 /* Some sanity checks */
1601 ret = -EBUSY;
1602 if (alias_data->domain != NULL &&
1603 alias_data->domain != domain)
1604 goto out_unlock;
1605
1606 if (dev_data->domain != NULL &&
1607 dev_data->domain != domain)
1608 goto out_unlock;
1609
1610 /* Do real assignment */
1611 if (dev_data->alias != dev) {
1612 alias_data = get_dev_data(dev_data->alias);
1613 if (alias_data->domain == NULL)
1614 do_attach(dev_data->alias, domain);
1615
1616 atomic_inc(&alias_data->bind);
1617 }
1618
1619 if (dev_data->domain == NULL)
1620 do_attach(dev, domain);
1621
1622 atomic_inc(&dev_data->bind);
1623
1624 ret = 0;
1625
1626out_unlock:
1627
1628 /* ready */
1629 spin_unlock(&domain->lock);
1630
1631 return ret;
1632}
1633
1634/*
1635 * If a device is not yet associated with a domain, this function does
1636 * assigns it visible for the hardware
1637 */
1638static int attach_device(struct device *dev,
1639 struct protection_domain *domain)
1640{
1641 struct pci_dev *pdev = to_pci_dev(dev);
1642 unsigned long flags;
1643 int ret;
1644
1645 if (amd_iommu_iotlb_sup)
1646 pci_enable_ats(pdev, PAGE_SHIFT);
1647
1648 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1649 ret = __attach_device(dev, domain);
1650 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1651
1652 /*
1653 * We might boot into a crash-kernel here. The crashed kernel
1654 * left the caches in the IOMMU dirty. So we have to flush
1655 * here to evict all dirty stuff.
1656 */
1657 domain_flush_tlb_pde(domain);
1658
1659 return ret;
1660}
1661
1662/*
1663 * Removes a device from a protection domain (unlocked)
1664 */
1665static void __detach_device(struct device *dev)
1666{
1667 struct iommu_dev_data *dev_data = get_dev_data(dev);
1668 struct iommu_dev_data *alias_data;
1669 struct protection_domain *domain;
1670 unsigned long flags;
1671
1672 BUG_ON(!dev_data->domain);
1673
1674 domain = dev_data->domain;
1675
1676 spin_lock_irqsave(&domain->lock, flags);
1677
1678 if (dev_data->alias != dev) {
1679 alias_data = get_dev_data(dev_data->alias);
1680 if (atomic_dec_and_test(&alias_data->bind))
1681 do_detach(dev_data->alias);
1682 }
1683
1684 if (atomic_dec_and_test(&dev_data->bind))
1685 do_detach(dev);
1686
1687 spin_unlock_irqrestore(&domain->lock, flags);
1688
1689 /*
1690 * If we run in passthrough mode the device must be assigned to the
1691 * passthrough domain if it is detached from any other domain.
1692 * Make sure we can deassign from the pt_domain itself.
1693 */
1694 if (iommu_pass_through &&
1695 (dev_data->domain == NULL && domain != pt_domain))
1696 __attach_device(dev, pt_domain);
1697}
1698
1699/*
1700 * Removes a device from a protection domain (with devtable_lock held)
1701 */
1702static void detach_device(struct device *dev)
1703{
1704 struct pci_dev *pdev = to_pci_dev(dev);
1705 unsigned long flags;
1706
1707 /* lock device table */
1708 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1709 __detach_device(dev);
1710 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1711
1712 if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
1713 pci_disable_ats(pdev);
1714}
1715
1716/*
1717 * Find out the protection domain structure for a given PCI device. This
1718 * will give us the pointer to the page table root for example.
1719 */
1720static struct protection_domain *domain_for_device(struct device *dev)
1721{
1722 struct protection_domain *dom;
1723 struct iommu_dev_data *dev_data, *alias_data;
1724 unsigned long flags;
1725 u16 devid;
1726
1727 devid = get_device_id(dev);
1728 dev_data = get_dev_data(dev);
1729 alias_data = get_dev_data(dev_data->alias);
1730 if (!alias_data)
1731 return NULL;
1732
1733 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1734 dom = dev_data->domain;
1735 if (dom == NULL &&
1736 alias_data->domain != NULL) {
1737 __attach_device(dev, alias_data->domain);
1738 dom = alias_data->domain;
1739 }
1740
1741 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1742
1743 return dom;
1744}
1745
1746static int device_change_notifier(struct notifier_block *nb,
1747 unsigned long action, void *data)
1748{
1749 struct device *dev = data;
1750 u16 devid;
1751 struct protection_domain *domain;
1752 struct dma_ops_domain *dma_domain;
1753 struct amd_iommu *iommu;
1754 unsigned long flags;
1755
1756 if (!check_device(dev))
1757 return 0;
1758
1759 devid = get_device_id(dev);
1760 iommu = amd_iommu_rlookup_table[devid];
1761
1762 switch (action) {
1763 case BUS_NOTIFY_UNBOUND_DRIVER:
1764
1765 domain = domain_for_device(dev);
1766
1767 if (!domain)
1768 goto out;
1769 if (iommu_pass_through)
1770 break;
1771 detach_device(dev);
1772 break;
1773 case BUS_NOTIFY_ADD_DEVICE:
1774
1775 iommu_init_device(dev);
1776
1777 domain = domain_for_device(dev);
1778
1779 /* allocate a protection domain if a device is added */
1780 dma_domain = find_protection_domain(devid);
1781 if (dma_domain)
1782 goto out;
1783 dma_domain = dma_ops_domain_alloc();
1784 if (!dma_domain)
1785 goto out;
1786 dma_domain->target_dev = devid;
1787
1788 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1789 list_add_tail(&dma_domain->list, &iommu_pd_list);
1790 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1791
1792 break;
1793 case BUS_NOTIFY_DEL_DEVICE:
1794
1795 iommu_uninit_device(dev);
1796
1797 default:
1798 goto out;
1799 }
1800
1801 device_flush_dte(dev);
1802 iommu_completion_wait(iommu);
1803
1804out:
1805 return 0;
1806}
1807
1808static struct notifier_block device_nb = {
1809 .notifier_call = device_change_notifier,
1810};
1811
1812void amd_iommu_init_notifier(void)
1813{
1814 bus_register_notifier(&pci_bus_type, &device_nb);
1815}
1816
1817/*****************************************************************************
1818 *
1819 * The next functions belong to the dma_ops mapping/unmapping code.
1820 *
1821 *****************************************************************************/
1822
1823/*
1824 * In the dma_ops path we only have the struct device. This function
1825 * finds the corresponding IOMMU, the protection domain and the
1826 * requestor id for a given device.
1827 * If the device is not yet associated with a domain this is also done
1828 * in this function.
1829 */
1830static struct protection_domain *get_domain(struct device *dev)
1831{
1832 struct protection_domain *domain;
1833 struct dma_ops_domain *dma_dom;
1834 u16 devid = get_device_id(dev);
1835
1836 if (!check_device(dev))
1837 return ERR_PTR(-EINVAL);
1838
1839 domain = domain_for_device(dev);
1840 if (domain != NULL && !dma_ops_domain(domain))
1841 return ERR_PTR(-EBUSY);
1842
1843 if (domain != NULL)
1844 return domain;
1845
1846	/* Device not bound yet - bind it */
1847 dma_dom = find_protection_domain(devid);
1848 if (!dma_dom)
1849 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1850 attach_device(dev, &dma_dom->domain);
1851 DUMP_printk("Using protection domain %d for device %s\n",
1852 dma_dom->domain.id, dev_name(dev));
1853
1854 return &dma_dom->domain;
1855}
1856
1857static void update_device_table(struct protection_domain *domain)
1858{
1859 struct iommu_dev_data *dev_data;
1860
1861 list_for_each_entry(dev_data, &domain->dev_list, list) {
1862 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
1863 u16 devid = get_device_id(dev_data->dev);
1864 set_dte_entry(devid, domain, pci_ats_enabled(pdev));
1865 }
1866}
1867
1868static void update_domain(struct protection_domain *domain)
1869{
1870 if (!domain->updated)
1871 return;
1872
1873 update_device_table(domain);
1874
1875 domain_flush_devices(domain);
1876 domain_flush_tlb_pde(domain);
1877
1878 domain->updated = false;
1879}
1880
1881/*
1882 * This function fetches the PTE for a given address in the aperture
1883 */
1884static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1885 unsigned long address)
1886{
1887 struct aperture_range *aperture;
1888 u64 *pte, *pte_page;
1889
1890 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1891 if (!aperture)
1892 return NULL;
1893
1894 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1895 if (!pte) {
1896 pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
1897 GFP_ATOMIC);
1898 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1899 } else
1900 pte += PM_LEVEL_INDEX(0, address);
1901
1902 update_domain(&dom->domain);
1903
1904 return pte;
1905}
1906
1907/*
1908 * This is the generic map function. It maps one 4kb page at paddr to
1909 * the given address in the DMA address space for the domain.
1910 */
1911static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1912 unsigned long address,
1913 phys_addr_t paddr,
1914 int direction)
1915{
1916 u64 *pte, __pte;
1917
1918 WARN_ON(address > dom->aperture_size);
1919
1920 paddr &= PAGE_MASK;
1921
1922 pte = dma_ops_get_pte(dom, address);
1923 if (!pte)
1924 return DMA_ERROR_CODE;
1925
1926 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1927
1928 if (direction == DMA_TO_DEVICE)
1929 __pte |= IOMMU_PTE_IR;
1930 else if (direction == DMA_FROM_DEVICE)
1931 __pte |= IOMMU_PTE_IW;
1932 else if (direction == DMA_BIDIRECTIONAL)
1933 __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
1934
1935 WARN_ON(*pte);
1936
1937 *pte = __pte;
1938
1939 return (dma_addr_t)address;
1940}
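
A minimal sketch of the PTE value the function above writes, shown for the
DMA_BIDIRECTIONAL case; the helper name is illustrative and not part of the
original file, and it assumes the IOMMU_PTE_* masks from the driver headers:

	/* Sketch only: same PTE composition as dma_ops_domain_map() above,
	 * for a bidirectional mapping of the page containing paddr. */
	static u64 example_bidir_pte(phys_addr_t paddr)
	{
		return (paddr & PAGE_MASK) | IOMMU_PTE_P | IOMMU_PTE_FC |
		       IOMMU_PTE_IR | IOMMU_PTE_IW;
	}
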
1941
1942/*
1943 * The generic unmapping function for one page in the DMA address space.
1944 */
1945static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1946 unsigned long address)
1947{
1948 struct aperture_range *aperture;
1949 u64 *pte;
1950
1951 if (address >= dom->aperture_size)
1952 return;
1953
1954 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1955 if (!aperture)
1956 return;
1957
1958 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1959 if (!pte)
1960 return;
1961
1962 pte += PM_LEVEL_INDEX(0, address);
1963
1964 WARN_ON(!*pte);
1965
1966 *pte = 0ULL;
1967}
1968
1969/*
1970 * This function contains common code for mapping of a physically
1971 * contiguous memory region into DMA address space. It is used by all
1972 * mapping functions provided with this IOMMU driver.
1973 * Must be called with the domain lock held.
1974 */
1975static dma_addr_t __map_single(struct device *dev,
1976 struct dma_ops_domain *dma_dom,
1977 phys_addr_t paddr,
1978 size_t size,
1979 int dir,
1980 bool align,
1981 u64 dma_mask)
1982{
1983 dma_addr_t offset = paddr & ~PAGE_MASK;
1984 dma_addr_t address, start, ret;
1985 unsigned int pages;
1986 unsigned long align_mask = 0;
1987 int i;
1988
1989 pages = iommu_num_pages(paddr, size, PAGE_SIZE);
1990 paddr &= PAGE_MASK;
1991
1992 INC_STATS_COUNTER(total_map_requests);
1993
1994 if (pages > 1)
1995 INC_STATS_COUNTER(cross_page);
1996
1997 if (align)
1998 align_mask = (1UL << get_order(size)) - 1;
1999
2000retry:
2001 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
2002 dma_mask);
2003 if (unlikely(address == DMA_ERROR_CODE)) {
2004 /*
2005 * setting next_address here will let the address
2006		 * allocator only scan the newly allocated range in the
2007 * first run. This is a small optimization.
2008 */
2009 dma_dom->next_address = dma_dom->aperture_size;
2010
2011 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
2012 goto out;
2013
2014 /*
2015 * aperture was successfully enlarged by 128 MB, try
2016 * allocation again
2017 */
2018 goto retry;
2019 }
2020
2021 start = address;
2022 for (i = 0; i < pages; ++i) {
2023 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
2024 if (ret == DMA_ERROR_CODE)
2025 goto out_unmap;
2026
2027 paddr += PAGE_SIZE;
2028 start += PAGE_SIZE;
2029 }
2030 address += offset;
2031
2032 ADD_STATS_COUNTER(alloced_io_mem, size);
2033
2034 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
2035 domain_flush_tlb(&dma_dom->domain);
2036 dma_dom->need_flush = false;
2037 } else if (unlikely(amd_iommu_np_cache))
2038 domain_flush_pages(&dma_dom->domain, address, size);
2039
2040out:
2041 return address;
2042
2043out_unmap:
2044
2045 for (--i; i >= 0; --i) {
2046 start -= PAGE_SIZE;
2047 dma_ops_domain_unmap(dma_dom, start);
2048 }
2049
2050 dma_ops_free_addresses(dma_dom, address, pages);
2051
2052 return DMA_ERROR_CODE;
2053}
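
	/*
	 * Worked example of the offset/page arithmetic above (numbers are
	 * illustrative): for paddr 0x12345678 and size 0x1800, offset is
	 * 0x678 and pages = iommu_num_pages(...) = 2 (0x678 + 0x1800
	 * rounded up to whole 4k pages); the returned dma_addr is the
	 * aperture address handed out by dma_ops_alloc_addresses() plus
	 * the 0x678 offset.
	 */
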
2054
2055/*
2056 * Does the reverse of the __map_single function. Must be called with
2057 * the domain lock held too
2058 */
2059static void __unmap_single(struct dma_ops_domain *dma_dom,
2060 dma_addr_t dma_addr,
2061 size_t size,
2062 int dir)
2063{
2064 dma_addr_t flush_addr;
2065 dma_addr_t i, start;
2066 unsigned int pages;
2067
2068 if ((dma_addr == DMA_ERROR_CODE) ||
2069 (dma_addr + size > dma_dom->aperture_size))
2070 return;
2071
2072 flush_addr = dma_addr;
2073 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
2074 dma_addr &= PAGE_MASK;
2075 start = dma_addr;
2076
2077 for (i = 0; i < pages; ++i) {
2078 dma_ops_domain_unmap(dma_dom, start);
2079 start += PAGE_SIZE;
2080 }
2081
2082 SUB_STATS_COUNTER(alloced_io_mem, size);
2083
2084 dma_ops_free_addresses(dma_dom, dma_addr, pages);
2085
2086 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
2087 domain_flush_pages(&dma_dom->domain, flush_addr, size);
2088 dma_dom->need_flush = false;
2089 }
2090}
2091
2092/*
2093 * The exported map_single function for dma_ops.
2094 */
2095static dma_addr_t map_page(struct device *dev, struct page *page,
2096 unsigned long offset, size_t size,
2097 enum dma_data_direction dir,
2098 struct dma_attrs *attrs)
2099{
2100 unsigned long flags;
2101 struct protection_domain *domain;
2102 dma_addr_t addr;
2103 u64 dma_mask;
2104 phys_addr_t paddr = page_to_phys(page) + offset;
2105
2106 INC_STATS_COUNTER(cnt_map_single);
2107
2108 domain = get_domain(dev);
2109 if (PTR_ERR(domain) == -EINVAL)
2110 return (dma_addr_t)paddr;
2111 else if (IS_ERR(domain))
2112 return DMA_ERROR_CODE;
2113
2114 dma_mask = *dev->dma_mask;
2115
2116 spin_lock_irqsave(&domain->lock, flags);
2117
2118 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
2119 dma_mask);
2120 if (addr == DMA_ERROR_CODE)
2121 goto out;
2122
2123 domain_flush_complete(domain);
2124
2125out:
2126 spin_unlock_irqrestore(&domain->lock, flags);
2127
2128 return addr;
2129}
2130
2131/*
2132 * The exported unmap_single function for dma_ops.
2133 */
2134static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
2135 enum dma_data_direction dir, struct dma_attrs *attrs)
2136{
2137 unsigned long flags;
2138 struct protection_domain *domain;
2139
2140 INC_STATS_COUNTER(cnt_unmap_single);
2141
2142 domain = get_domain(dev);
2143 if (IS_ERR(domain))
2144 return;
2145
2146 spin_lock_irqsave(&domain->lock, flags);
2147
2148 __unmap_single(domain->priv, dma_addr, size, dir);
2149
2150 domain_flush_complete(domain);
2151
2152 spin_unlock_irqrestore(&domain->lock, flags);
2153}
2154
2155/*
2156 * This is a special map_sg function which is used if we have to map a
2157 * device which is not handled by an AMD IOMMU in the system.
2158 */
2159static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
2160 int nelems, int dir)
2161{
2162 struct scatterlist *s;
2163 int i;
2164
2165 for_each_sg(sglist, s, nelems, i) {
2166 s->dma_address = (dma_addr_t)sg_phys(s);
2167 s->dma_length = s->length;
2168 }
2169
2170 return nelems;
2171}
2172
2173/*
2174 * The exported map_sg function for dma_ops (handles scatter-gather
2175 * lists).
2176 */
2177static int map_sg(struct device *dev, struct scatterlist *sglist,
2178 int nelems, enum dma_data_direction dir,
2179 struct dma_attrs *attrs)
2180{
2181 unsigned long flags;
2182 struct protection_domain *domain;
2183 int i;
2184 struct scatterlist *s;
2185 phys_addr_t paddr;
2186 int mapped_elems = 0;
2187 u64 dma_mask;
2188
2189 INC_STATS_COUNTER(cnt_map_sg);
2190
2191 domain = get_domain(dev);
2192 if (PTR_ERR(domain) == -EINVAL)
2193 return map_sg_no_iommu(dev, sglist, nelems, dir);
2194 else if (IS_ERR(domain))
2195 return 0;
2196
2197 dma_mask = *dev->dma_mask;
2198
2199 spin_lock_irqsave(&domain->lock, flags);
2200
2201 for_each_sg(sglist, s, nelems, i) {
2202 paddr = sg_phys(s);
2203
2204 s->dma_address = __map_single(dev, domain->priv,
2205 paddr, s->length, dir, false,
2206 dma_mask);
2207
2208 if (s->dma_address) {
2209 s->dma_length = s->length;
2210 mapped_elems++;
2211 } else
2212 goto unmap;
2213 }
2214
2215 domain_flush_complete(domain);
2216
2217out:
2218 spin_unlock_irqrestore(&domain->lock, flags);
2219
2220 return mapped_elems;
2221unmap:
2222 for_each_sg(sglist, s, mapped_elems, i) {
2223 if (s->dma_address)
2224 __unmap_single(domain->priv, s->dma_address,
2225 s->dma_length, dir);
2226 s->dma_address = s->dma_length = 0;
2227 }
2228
2229 mapped_elems = 0;
2230
2231 goto out;
2232}
2233
2234/*
2235 * The exported unmap_sg function for dma_ops (handles scatter-gather
2236 * lists).
2237 */
2238static void unmap_sg(struct device *dev, struct scatterlist *sglist,
2239 int nelems, enum dma_data_direction dir,
2240 struct dma_attrs *attrs)
2241{
2242 unsigned long flags;
2243 struct protection_domain *domain;
2244 struct scatterlist *s;
2245 int i;
2246
2247 INC_STATS_COUNTER(cnt_unmap_sg);
2248
2249 domain = get_domain(dev);
2250 if (IS_ERR(domain))
2251 return;
2252
2253 spin_lock_irqsave(&domain->lock, flags);
2254
2255 for_each_sg(sglist, s, nelems, i) {
2256 __unmap_single(domain->priv, s->dma_address,
2257 s->dma_length, dir);
2258 s->dma_address = s->dma_length = 0;
2259 }
2260
2261 domain_flush_complete(domain);
2262
2263 spin_unlock_irqrestore(&domain->lock, flags);
2264}
2265
2266/*
2267 * The exported alloc_coherent function for dma_ops.
2268 */
2269static void *alloc_coherent(struct device *dev, size_t size,
2270 dma_addr_t *dma_addr, gfp_t flag)
2271{
2272 unsigned long flags;
2273 void *virt_addr;
2274 struct protection_domain *domain;
2275 phys_addr_t paddr;
2276 u64 dma_mask = dev->coherent_dma_mask;
2277
2278 INC_STATS_COUNTER(cnt_alloc_coherent);
2279
2280 domain = get_domain(dev);
2281 if (PTR_ERR(domain) == -EINVAL) {
2282 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2283 *dma_addr = __pa(virt_addr);
2284 return virt_addr;
2285 } else if (IS_ERR(domain))
2286 return NULL;
2287
2288 dma_mask = dev->coherent_dma_mask;
2289 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2290 flag |= __GFP_ZERO;
2291
2292 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2293 if (!virt_addr)
2294 return NULL;
2295
2296 paddr = virt_to_phys(virt_addr);
2297
2298 if (!dma_mask)
2299 dma_mask = *dev->dma_mask;
2300
2301 spin_lock_irqsave(&domain->lock, flags);
2302
2303 *dma_addr = __map_single(dev, domain->priv, paddr,
2304 size, DMA_BIDIRECTIONAL, true, dma_mask);
2305
2306 if (*dma_addr == DMA_ERROR_CODE) {
2307 spin_unlock_irqrestore(&domain->lock, flags);
2308 goto out_free;
2309 }
2310
2311 domain_flush_complete(domain);
2312
2313 spin_unlock_irqrestore(&domain->lock, flags);
2314
2315 return virt_addr;
2316
2317out_free:
2318
2319 free_pages((unsigned long)virt_addr, get_order(size));
2320
2321 return NULL;
2322}
2323
2324/*
2325 * The exported free_coherent function for dma_ops.
2326 */
2327static void free_coherent(struct device *dev, size_t size,
2328 void *virt_addr, dma_addr_t dma_addr)
2329{
2330 unsigned long flags;
2331 struct protection_domain *domain;
2332
2333 INC_STATS_COUNTER(cnt_free_coherent);
2334
2335 domain = get_domain(dev);
2336 if (IS_ERR(domain))
2337 goto free_mem;
2338
2339 spin_lock_irqsave(&domain->lock, flags);
2340
2341 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2342
2343 domain_flush_complete(domain);
2344
2345 spin_unlock_irqrestore(&domain->lock, flags);
2346
2347free_mem:
2348 free_pages((unsigned long)virt_addr, get_order(size));
2349}
2350
2351/*
2352 * This function is called by the DMA layer to find out if we can handle a
2353 * particular device. It is part of the dma_ops.
2354 */
2355static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2356{
2357 return check_device(dev);
2358}
2359
2360/*
2361 * The function for pre-allocating protection domains.
2362 *
2363 * Once the driver core informs the DMA layer when a driver grabs a
2364 * device, we won't need to preallocate the protection domains anymore.
2365 * For now we have to.
2366 */
2367static void prealloc_protection_domains(void)
2368{
2369 struct pci_dev *dev = NULL;
2370 struct dma_ops_domain *dma_dom;
2371 u16 devid;
2372
2373 for_each_pci_dev(dev) {
2374
2375 /* Do we handle this device? */
2376 if (!check_device(&dev->dev))
2377 continue;
2378
2379 /* Is there already any domain for it? */
2380 if (domain_for_device(&dev->dev))
2381 continue;
2382
2383 devid = get_device_id(&dev->dev);
2384
2385 dma_dom = dma_ops_domain_alloc();
2386 if (!dma_dom)
2387 continue;
2388 init_unity_mappings_for_device(dma_dom, devid);
2389 dma_dom->target_dev = devid;
2390
2391 attach_device(&dev->dev, &dma_dom->domain);
2392
2393 list_add_tail(&dma_dom->list, &iommu_pd_list);
2394 }
2395}
2396
2397static struct dma_map_ops amd_iommu_dma_ops = {
2398 .alloc_coherent = alloc_coherent,
2399 .free_coherent = free_coherent,
2400 .map_page = map_page,
2401 .unmap_page = unmap_page,
2402 .map_sg = map_sg,
2403 .unmap_sg = unmap_sg,
2404 .dma_supported = amd_iommu_dma_supported,
2405};
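
As a rough sketch (the helper below is illustrative, not part of this file):
once device_dma_ops_init() has pointed dev->archdata.dma_ops at the structure
above, an ordinary streaming mapping made by a driver is dispatched through
the per-device dma_ops and ends up in map_page():

	/* Illustrative only: dma_map_page() resolves the per-device dma_ops
	 * installed above and calls ->map_page(), i.e. map_page(). */
	static dma_addr_t example_stream_map(struct device *dev,
					     struct page *page)
	{
		return dma_map_page(dev, page, 0, PAGE_SIZE, DMA_TO_DEVICE);
	}
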
2406
2407static unsigned device_dma_ops_init(void)
2408{
2409 struct pci_dev *pdev = NULL;
2410 unsigned unhandled = 0;
2411
2412 for_each_pci_dev(pdev) {
2413 if (!check_device(&pdev->dev)) {
2414 unhandled += 1;
2415 continue;
2416 }
2417
2418 pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
2419 }
2420
2421 return unhandled;
2422}
2423
2424/*
2425 * The function which hooks the AMD IOMMU driver into dma_ops.
2426 */
2427
2428void __init amd_iommu_init_api(void)
2429{
2430 register_iommu(&amd_iommu_ops);
2431}
2432
2433int __init amd_iommu_init_dma_ops(void)
2434{
2435 struct amd_iommu *iommu;
2436 int ret, unhandled;
2437
2438 /*
2439 * first allocate a default protection domain for every IOMMU we
2440 * found in the system. Devices not assigned to any other
2441 * protection domain will be assigned to the default one.
2442 */
2443 for_each_iommu(iommu) {
2444 iommu->default_dom = dma_ops_domain_alloc();
2445 if (iommu->default_dom == NULL)
2446 return -ENOMEM;
2447 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
2448 ret = iommu_init_unity_mappings(iommu);
2449 if (ret)
2450 goto free_domains;
2451 }
2452
2453 /*
2454 * Pre-allocate the protection domains for each device.
2455 */
2456 prealloc_protection_domains();
2457
2458 iommu_detected = 1;
2459 swiotlb = 0;
2460
2461	/* Make our dma_ops finally visible to drivers */
2462 unhandled = device_dma_ops_init();
2463 if (unhandled && max_pfn > MAX_DMA32_PFN) {
2464 /* There are unhandled devices - initialize swiotlb for them */
2465 swiotlb = 1;
2466 }
2467
2468 amd_iommu_stats_init();
2469
2470 return 0;
2471
2472free_domains:
2473
2474 for_each_iommu(iommu) {
2475 if (iommu->default_dom)
2476 dma_ops_domain_free(iommu->default_dom);
2477 }
2478
2479 return ret;
2480}
2481
2482/*****************************************************************************
2483 *
2484 * The following functions belong to the exported interface of AMD IOMMU
2485 *
2486 * This interface allows access to lower level functions of the IOMMU
2487 * like protection domain handling and assignment of devices to domains
2488 * which is not possible with the dma_ops interface.
2489 *
2490 *****************************************************************************/
2491
2492static void cleanup_domain(struct protection_domain *domain)
2493{
2494 struct iommu_dev_data *dev_data, *next;
2495 unsigned long flags;
2496
2497 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2498
2499 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2500 struct device *dev = dev_data->dev;
2501
2502 __detach_device(dev);
2503 atomic_set(&dev_data->bind, 0);
2504 }
2505
2506 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2507}
2508
2509static void protection_domain_free(struct protection_domain *domain)
2510{
2511 if (!domain)
2512 return;
2513
2514 del_domain_from_list(domain);
2515
2516 if (domain->id)
2517 domain_id_free(domain->id);
2518
2519 kfree(domain);
2520}
2521
2522static struct protection_domain *protection_domain_alloc(void)
2523{
2524 struct protection_domain *domain;
2525
2526 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2527 if (!domain)
2528 return NULL;
2529
2530 spin_lock_init(&domain->lock);
2531 mutex_init(&domain->api_lock);
2532 domain->id = domain_id_alloc();
2533 if (!domain->id)
2534 goto out_err;
2535 INIT_LIST_HEAD(&domain->dev_list);
2536
2537 add_domain_to_list(domain);
2538
2539 return domain;
2540
2541out_err:
2542 kfree(domain);
2543
2544 return NULL;
2545}
2546
2547static int amd_iommu_domain_init(struct iommu_domain *dom)
2548{
2549 struct protection_domain *domain;
2550
2551 domain = protection_domain_alloc();
2552 if (!domain)
2553 goto out_free;
2554
2555 domain->mode = PAGE_MODE_3_LEVEL;
2556 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2557 if (!domain->pt_root)
2558 goto out_free;
2559
2560 dom->priv = domain;
2561
2562 return 0;
2563
2564out_free:
2565 protection_domain_free(domain);
2566
2567 return -ENOMEM;
2568}
2569
2570static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2571{
2572 struct protection_domain *domain = dom->priv;
2573
2574 if (!domain)
2575 return;
2576
2577 if (domain->dev_cnt > 0)
2578 cleanup_domain(domain);
2579
2580 BUG_ON(domain->dev_cnt != 0);
2581
2582 free_pagetable(domain);
2583
2584 protection_domain_free(domain);
2585
2586 dom->priv = NULL;
2587}
2588
2589static void amd_iommu_detach_device(struct iommu_domain *dom,
2590 struct device *dev)
2591{
2592 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2593 struct amd_iommu *iommu;
2594 u16 devid;
2595
2596 if (!check_device(dev))
2597 return;
2598
2599 devid = get_device_id(dev);
2600
2601 if (dev_data->domain != NULL)
2602 detach_device(dev);
2603
2604 iommu = amd_iommu_rlookup_table[devid];
2605 if (!iommu)
2606 return;
2607
2608 device_flush_dte(dev);
2609 iommu_completion_wait(iommu);
2610}
2611
2612static int amd_iommu_attach_device(struct iommu_domain *dom,
2613 struct device *dev)
2614{
2615 struct protection_domain *domain = dom->priv;
2616 struct iommu_dev_data *dev_data;
2617 struct amd_iommu *iommu;
2618 int ret;
2619 u16 devid;
2620
2621 if (!check_device(dev))
2622 return -EINVAL;
2623
2624 dev_data = dev->archdata.iommu;
2625
2626 devid = get_device_id(dev);
2627
2628 iommu = amd_iommu_rlookup_table[devid];
2629 if (!iommu)
2630 return -EINVAL;
2631
2632 if (dev_data->domain)
2633 detach_device(dev);
2634
2635 ret = attach_device(dev, domain);
2636
2637 iommu_completion_wait(iommu);
2638
2639 return ret;
2640}
2641
2642static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
2643 phys_addr_t paddr, int gfp_order, int iommu_prot)
2644{
2645 unsigned long page_size = 0x1000UL << gfp_order;
2646 struct protection_domain *domain = dom->priv;
2647 int prot = 0;
2648 int ret;
2649
2650 if (iommu_prot & IOMMU_READ)
2651 prot |= IOMMU_PROT_IR;
2652 if (iommu_prot & IOMMU_WRITE)
2653 prot |= IOMMU_PROT_IW;
2654
2655 mutex_lock(&domain->api_lock);
2656 ret = iommu_map_page(domain, iova, paddr, prot, page_size);
2657 mutex_unlock(&domain->api_lock);
2658
2659 return ret;
2660}
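
	/*
	 * The gfp_order argument encodes the mapping size as
	 * 0x1000UL << gfp_order: order 0 is a single 4k page, order 9
	 * would be 2M (0x200000), which iommu_map_page() then installs
	 * if that page size is usable.
	 */
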
2661
2662static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2663 int gfp_order)
2664{
2665 struct protection_domain *domain = dom->priv;
2666 unsigned long page_size, unmap_size;
2667
2668 page_size = 0x1000UL << gfp_order;
2669
2670 mutex_lock(&domain->api_lock);
2671 unmap_size = iommu_unmap_page(domain, iova, page_size);
2672 mutex_unlock(&domain->api_lock);
2673
2674 domain_flush_tlb_pde(domain);
2675
2676 return get_order(unmap_size);
2677}
2678
2679static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2680 unsigned long iova)
2681{
2682 struct protection_domain *domain = dom->priv;
2683 unsigned long offset_mask;
2684 phys_addr_t paddr;
2685 u64 *pte, __pte;
2686
2687 pte = fetch_pte(domain, iova);
2688
2689 if (!pte || !IOMMU_PTE_PRESENT(*pte))
2690 return 0;
2691
2692 if (PM_PTE_LEVEL(*pte) == 0)
2693 offset_mask = PAGE_SIZE - 1;
2694 else
2695 offset_mask = PTE_PAGE_SIZE(*pte) - 1;
2696
2697 __pte = *pte & PM_ADDR_MASK;
2698 paddr = (__pte & ~offset_mask) | (iova & offset_mask);
2699
2700 return paddr;
2701}
2702
2703static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
2704 unsigned long cap)
2705{
2706 switch (cap) {
2707 case IOMMU_CAP_CACHE_COHERENCY:
2708 return 1;
2709 }
2710
2711 return 0;
2712}
2713
2714static struct iommu_ops amd_iommu_ops = {
2715 .domain_init = amd_iommu_domain_init,
2716 .domain_destroy = amd_iommu_domain_destroy,
2717 .attach_dev = amd_iommu_attach_device,
2718 .detach_dev = amd_iommu_detach_device,
2719 .map = amd_iommu_map,
2720 .unmap = amd_iommu_unmap,
2721 .iova_to_phys = amd_iommu_iova_to_phys,
2722 .domain_has_cap = amd_iommu_domain_has_cap,
2723};
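
A hedged sketch of how a consumer of this interface (device assignment code,
for example) would exercise the ops registered above; the helper name is
illustrative and the iommu_* wrapper signatures are assumed to be the
gfp_order-based ones of this kernel generation:

	/* Sketch: allocate a domain, attach a device, map one 4k page. */
	static int example_assign(struct device *dev, unsigned long iova,
				  phys_addr_t paddr)
	{
		struct iommu_domain *dom;
		int ret;

		dom = iommu_domain_alloc();		/* -> amd_iommu_domain_init() */
		if (!dom)
			return -ENOMEM;

		ret = iommu_attach_device(dom, dev);	/* -> amd_iommu_attach_device() */
		if (ret)
			goto out_free;

		/* gfp_order 0 == one 4k page; -> amd_iommu_map() */
		ret = iommu_map(dom, iova, paddr, 0, IOMMU_READ | IOMMU_WRITE);
		if (ret)
			iommu_detach_device(dom, dev);
	out_free:
		if (ret)
			iommu_domain_free(dom);
		return ret;
	}
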
2724
2725/*****************************************************************************
2726 *
2727 * The next functions do a basic initialization of IOMMU for pass through
2728 * mode
2729 *
2730 * In passthrough mode the IOMMU is initialized and enabled but not used for
2731 * DMA-API translation.
2732 *
2733 *****************************************************************************/
2734
2735int __init amd_iommu_init_passthrough(void)
2736{
2737 struct amd_iommu *iommu;
2738 struct pci_dev *dev = NULL;
2739 u16 devid;
2740
2741 /* allocate passthrough domain */
2742 pt_domain = protection_domain_alloc();
2743 if (!pt_domain)
2744 return -ENOMEM;
2745
2746 pt_domain->mode |= PAGE_MODE_NONE;
2747
2748 for_each_pci_dev(dev) {
2749 if (!check_device(&dev->dev))
2750 continue;
2751
2752 devid = get_device_id(&dev->dev);
2753
2754 iommu = amd_iommu_rlookup_table[devid];
2755 if (!iommu)
2756 continue;
2757
2758 attach_device(&dev->dev, pt_domain);
2759 }
2760
2761 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
2762
2763 return 0;
2764}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
deleted file mode 100644
index bfc8453bd98d..000000000000
--- a/arch/x86/kernel/amd_iommu_init.c
+++ /dev/null
@@ -1,1572 +0,0 @@
1/*
2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/pci.h>
21#include <linux/acpi.h>
22#include <linux/list.h>
23#include <linux/slab.h>
24#include <linux/syscore_ops.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_proto.h>
29#include <asm/amd_iommu_types.h>
30#include <asm/amd_iommu.h>
31#include <asm/iommu.h>
32#include <asm/gart.h>
33#include <asm/x86_init.h>
34#include <asm/iommu_table.h>
35/*
36 * definitions for the ACPI scanning code
37 */
38#define IVRS_HEADER_LENGTH 48
39
40#define ACPI_IVHD_TYPE 0x10
41#define ACPI_IVMD_TYPE_ALL 0x20
42#define ACPI_IVMD_TYPE 0x21
43#define ACPI_IVMD_TYPE_RANGE 0x22
44
45#define IVHD_DEV_ALL 0x01
46#define IVHD_DEV_SELECT 0x02
47#define IVHD_DEV_SELECT_RANGE_START 0x03
48#define IVHD_DEV_RANGE_END 0x04
49#define IVHD_DEV_ALIAS 0x42
50#define IVHD_DEV_ALIAS_RANGE 0x43
51#define IVHD_DEV_EXT_SELECT 0x46
52#define IVHD_DEV_EXT_SELECT_RANGE 0x47
53
54#define IVHD_FLAG_HT_TUN_EN_MASK 0x01
55#define IVHD_FLAG_PASSPW_EN_MASK 0x02
56#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04
57#define IVHD_FLAG_ISOC_EN_MASK 0x08
58
59#define IVMD_FLAG_EXCL_RANGE 0x08
60#define IVMD_FLAG_UNITY_MAP 0x01
61
62#define ACPI_DEVFLAG_INITPASS 0x01
63#define ACPI_DEVFLAG_EXTINT 0x02
64#define ACPI_DEVFLAG_NMI 0x04
65#define ACPI_DEVFLAG_SYSMGT1 0x10
66#define ACPI_DEVFLAG_SYSMGT2 0x20
67#define ACPI_DEVFLAG_LINT0 0x40
68#define ACPI_DEVFLAG_LINT1 0x80
69#define ACPI_DEVFLAG_ATSDIS 0x10000000
70
71/*
72 * ACPI table definitions
73 *
74 * These data structures are laid over the table to parse the important values
75 * out of it.
76 */
77
78/*
79 * structure describing one IOMMU in the ACPI table. Typically followed by one
80 * or more ivhd_entrys.
81 */
82struct ivhd_header {
83 u8 type;
84 u8 flags;
85 u16 length;
86 u16 devid;
87 u16 cap_ptr;
88 u64 mmio_phys;
89 u16 pci_seg;
90 u16 info;
91 u32 reserved;
92} __attribute__((packed));
93
94/*
95 * A device entry describing which devices a specific IOMMU translates and
96 * which requestor ids they use.
97 */
98struct ivhd_entry {
99 u8 type;
100 u16 devid;
101 u8 flags;
102 u32 ext;
103} __attribute__((packed));
104
105/*
106 * An AMD IOMMU memory definition structure. It defines things like exclusion
107 * ranges for devices and regions that should be unity mapped.
108 */
109struct ivmd_header {
110 u8 type;
111 u8 flags;
112 u16 length;
113 u16 devid;
114 u16 aux;
115 u64 resv;
116 u64 range_start;
117 u64 range_length;
118} __attribute__((packed));
119
120bool amd_iommu_dump;
121
122static int __initdata amd_iommu_detected;
123static bool __initdata amd_iommu_disabled;
124
125u16 amd_iommu_last_bdf; /* largest PCI device id we have
126 to handle */
127LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
128 we find in ACPI */
129bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
130
131LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
132 system */
133
134/* Array to assign indices to IOMMUs */
135struct amd_iommu *amd_iommus[MAX_IOMMUS];
136int amd_iommus_present;
137
138/* IOMMUs have a non-present cache? */
139bool amd_iommu_np_cache __read_mostly;
140bool amd_iommu_iotlb_sup __read_mostly = true;
141
142/*
143 * The ACPI table parsing functions set this variable on an error
144 */
145static int __initdata amd_iommu_init_err;
146
147/*
148 * List of protection domains - used during resume
149 */
150LIST_HEAD(amd_iommu_pd_list);
151spinlock_t amd_iommu_pd_lock;
152
153/*
154 * Pointer to the device table which is shared by all AMD IOMMUs.
155 * It is indexed by the PCI device id or the HT unit id and contains
156 * information about the domain the device belongs to as well as the
157 * page table root pointer.
158 */
159struct dev_table_entry *amd_iommu_dev_table;
160
161/*
162 * The alias table is a driver specific data structure which contains the
163 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
164 * More than one device can share the same requestor id.
165 */
166u16 *amd_iommu_alias_table;
167
168/*
169 * The rlookup table is used to find the IOMMU which is responsible
170 * for a specific device. It is also indexed by the PCI device id.
171 */
172struct amd_iommu **amd_iommu_rlookup_table;
173
174/*
175 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
176 * to know which ones are already in use.
177 */
178unsigned long *amd_iommu_pd_alloc_bitmap;
179
180static u32 dev_table_size; /* size of the device table */
181static u32 alias_table_size; /* size of the alias table */
182static u32 rlookup_table_size; /* size of the rlookup table */
183
184/*
185 * This function flushes all internal caches of
186 * the IOMMU used by this driver.
187 */
188extern void iommu_flush_all_caches(struct amd_iommu *iommu);
189
190static inline void update_last_devid(u16 devid)
191{
192 if (devid > amd_iommu_last_bdf)
193 amd_iommu_last_bdf = devid;
194}
195
196static inline unsigned long tbl_size(int entry_size)
197{
198 unsigned shift = PAGE_SHIFT +
199 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
200
201 return 1UL << shift;
202}
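
/*
 * Worked example (illustrative numbers): with amd_iommu_last_bdf == 0xffff
 * and a 32-byte device table entry, tbl_size(32) rounds 0x10000 * 32 bytes
 * up to a whole power-of-two number of pages and returns 2MB.
 */
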
203
204/* Access to l1 and l2 indexed register spaces */
205
206static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
207{
208 u32 val;
209
210 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
211 pci_read_config_dword(iommu->dev, 0xfc, &val);
212 return val;
213}
214
215static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
216{
217 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
218 pci_write_config_dword(iommu->dev, 0xfc, val);
219 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
220}
221
222static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
223{
224 u32 val;
225
226 pci_write_config_dword(iommu->dev, 0xf0, address);
227 pci_read_config_dword(iommu->dev, 0xf4, &val);
228 return val;
229}
230
231static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
232{
233 pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
234 pci_write_config_dword(iommu->dev, 0xf4, val);
235}
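
/*
 * Both register spaces above are reached through an index/data window in
 * PCI config space: the register address (plus a write-enable bit for
 * writes) goes into the index register (0xf8 for L1, 0xf0 for L2) and the
 * payload is then read from or written to the data register (0xfc for L1,
 * 0xf4 for L2).
 */
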
236
237/****************************************************************************
238 *
239 * AMD IOMMU MMIO register space handling functions
240 *
241 * These functions are used to program the IOMMU device registers in
242 * MMIO space required for that driver.
243 *
244 ****************************************************************************/
245
246/*
247 * This function sets the exclusion range in the IOMMU. DMA accesses to the
248 * exclusion range are passed through untranslated
249 */
250static void iommu_set_exclusion_range(struct amd_iommu *iommu)
251{
252 u64 start = iommu->exclusion_start & PAGE_MASK;
253 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
254 u64 entry;
255
256 if (!iommu->exclusion_start)
257 return;
258
259 entry = start | MMIO_EXCL_ENABLE_MASK;
260 memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
261 &entry, sizeof(entry));
262
263 entry = limit;
264 memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
265 &entry, sizeof(entry));
266}
267
268/* Programs the physical address of the device table into the IOMMU hardware */
269static void __init iommu_set_device_table(struct amd_iommu *iommu)
270{
271 u64 entry;
272
273 BUG_ON(iommu->mmio_base == NULL);
274
275 entry = virt_to_phys(amd_iommu_dev_table);
276 entry |= (dev_table_size >> 12) - 1;
277 memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
278 &entry, sizeof(entry));
279}
280
281/* Generic functions to enable/disable certain features of the IOMMU. */
282static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
283{
284 u32 ctrl;
285
286 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
287 ctrl |= (1 << bit);
288 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
289}
290
291static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
292{
293 u32 ctrl;
294
295 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
296 ctrl &= ~(1 << bit);
297 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
298}
299
300/* Function to enable the hardware */
301static void iommu_enable(struct amd_iommu *iommu)
302{
303 static const char * const feat_str[] = {
304 "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
305 "IA", "GA", "HE", "PC", NULL
306 };
307 int i;
308
309 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
310 dev_name(&iommu->dev->dev), iommu->cap_ptr);
311
312 if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
313 printk(KERN_CONT " extended features: ");
314 for (i = 0; feat_str[i]; ++i)
315 if (iommu_feature(iommu, (1ULL << i)))
316 printk(KERN_CONT " %s", feat_str[i]);
317 }
318 printk(KERN_CONT "\n");
319
320 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
321}
322
323static void iommu_disable(struct amd_iommu *iommu)
324{
325 /* Disable command buffer */
326 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
327
328 /* Disable event logging and event interrupts */
329 iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
330 iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
331
332 /* Disable IOMMU hardware itself */
333 iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
334}
335
336/*
337 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
338 * the system has one.
339 */
340static u8 * __init iommu_map_mmio_space(u64 address)
341{
342 u8 *ret;
343
344 if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
345 pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
346 address);
347 pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
348 return NULL;
349 }
350
351 ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
352 if (ret != NULL)
353 return ret;
354
355 release_mem_region(address, MMIO_REGION_LENGTH);
356
357 return NULL;
358}
359
360static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
361{
362 if (iommu->mmio_base)
363 iounmap(iommu->mmio_base);
364 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
365}
366
367/****************************************************************************
368 *
369 * The functions below belong to the first pass of AMD IOMMU ACPI table
370 * parsing. In this pass we try to find out the highest device id this
371 * code has to handle. Upon this information the size of the shared data
372 * structures is determined later.
373 *
374 ****************************************************************************/
375
376/*
377 * This function calculates the length of a given IVHD entry
378 */
379static inline int ivhd_entry_length(u8 *ivhd)
380{
381 return 0x04 << (*ivhd >> 6);
382}
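
/*
 * Worked example: the top two bits of the entry type select the entry
 * length as 0x04 << (type >> 6). Type 0x02 (IVHD_DEV_SELECT) gives 4
 * bytes, type 0x42 (IVHD_DEV_ALIAS, top bits 01) gives 8 bytes, and top
 * bits 10/11 give 16/32 bytes.
 */
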
383
384/*
385 * This function reads the last device id the IOMMU has to handle from the PCI
386 * capability header for this IOMMU
387 */
388static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
389{
390 u32 cap;
391
392 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
393 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
394
395 return 0;
396}
397
398/*
399 * After reading the highest device id from the IOMMU PCI capability header
400 * this function checks if there is a higher device id defined in the ACPI table
401 */
402static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
403{
404 u8 *p = (void *)h, *end = (void *)h;
405 struct ivhd_entry *dev;
406
407 p += sizeof(*h);
408 end += h->length;
409
410 find_last_devid_on_pci(PCI_BUS(h->devid),
411 PCI_SLOT(h->devid),
412 PCI_FUNC(h->devid),
413 h->cap_ptr);
414
415 while (p < end) {
416 dev = (struct ivhd_entry *)p;
417 switch (dev->type) {
418 case IVHD_DEV_SELECT:
419 case IVHD_DEV_RANGE_END:
420 case IVHD_DEV_ALIAS:
421 case IVHD_DEV_EXT_SELECT:
422 /* all the above subfield types refer to device ids */
423 update_last_devid(dev->devid);
424 break;
425 default:
426 break;
427 }
428 p += ivhd_entry_length(p);
429 }
430
431 WARN_ON(p != end);
432
433 return 0;
434}
435
436/*
437 * Iterate over all IVHD entries in the ACPI table and find the highest device
438 * id which we need to handle. This is the first of three functions which parse
439 * the ACPI table. So we check the checksum here.
440 */
441static int __init find_last_devid_acpi(struct acpi_table_header *table)
442{
443 int i;
444 u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
445 struct ivhd_header *h;
446
447 /*
448 * Validate checksum here so we don't need to do it when
449 * we actually parse the table
450 */
451 for (i = 0; i < table->length; ++i)
452 checksum += p[i];
453 if (checksum != 0) {
454 /* ACPI table corrupt */
455 amd_iommu_init_err = -ENODEV;
456 return 0;
457 }
458
459 p += IVRS_HEADER_LENGTH;
460
461 end += table->length;
462 while (p < end) {
463 h = (struct ivhd_header *)p;
464 switch (h->type) {
465 case ACPI_IVHD_TYPE:
466 find_last_devid_from_ivhd(h);
467 break;
468 default:
469 break;
470 }
471 p += h->length;
472 }
473 WARN_ON(p != end);
474
475 return 0;
476}
477
478/****************************************************************************
479 *
480 * The following functions belong to the code path which parses the ACPI table
481 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
482 * data structures, initialize the device/alias/rlookup table and also
483 * basically initialize the hardware.
484 *
485 ****************************************************************************/
486
487/*
488 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
489 * write commands to that buffer later and the IOMMU will execute them
490 * asynchronously
491 */
492static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
493{
494 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
495 get_order(CMD_BUFFER_SIZE));
496
497 if (cmd_buf == NULL)
498 return NULL;
499
500 iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
501
502 return cmd_buf;
503}
504
505/*
506 * This function resets the command buffer if the IOMMU stopped fetching
507 * commands from it.
508 */
509void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
510{
511 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
512
513 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
514 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
515
516 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
517}
518
519/*
520 * This function writes the command buffer address to the hardware and
521 * enables it.
522 */
523static void iommu_enable_command_buffer(struct amd_iommu *iommu)
524{
525 u64 entry;
526
527 BUG_ON(iommu->cmd_buf == NULL);
528
529 entry = (u64)virt_to_phys(iommu->cmd_buf);
530 entry |= MMIO_CMD_SIZE_512;
531
532 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
533 &entry, sizeof(entry));
534
535 amd_iommu_reset_cmd_buffer(iommu);
536 iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
537}
538
539static void __init free_command_buffer(struct amd_iommu *iommu)
540{
541 free_pages((unsigned long)iommu->cmd_buf,
542 get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
543}
544
545/* allocates the memory where the IOMMU will log its events to */
546static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
547{
548 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
549 get_order(EVT_BUFFER_SIZE));
550
551 if (iommu->evt_buf == NULL)
552 return NULL;
553
554 iommu->evt_buf_size = EVT_BUFFER_SIZE;
555
556 return iommu->evt_buf;
557}
558
559static void iommu_enable_event_buffer(struct amd_iommu *iommu)
560{
561 u64 entry;
562
563 BUG_ON(iommu->evt_buf == NULL);
564
565 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
566
567 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
568 &entry, sizeof(entry));
569
570 /* set head and tail to zero manually */
571 writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
572 writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
573
574 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
575}
576
577static void __init free_event_buffer(struct amd_iommu *iommu)
578{
579 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
580}
581
582/* sets a specific bit in the device table entry. */
583static void set_dev_entry_bit(u16 devid, u8 bit)
584{
585 int i = (bit >> 5) & 0x07;
586 int _bit = bit & 0x1f;
587
588 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
589}
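
/*
 * Worked example (bit number is illustrative): for bit 98 (0x62),
 * i = (98 >> 5) & 0x07 == 3 and _bit = 98 & 0x1f == 2, so bit 2 of
 * data[3] in the 256-bit device table entry gets set.
 */
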
590
591static int get_dev_entry_bit(u16 devid, u8 bit)
592{
593 int i = (bit >> 5) & 0x07;
594 int _bit = bit & 0x1f;
595
596 return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
597}
598
599
600void amd_iommu_apply_erratum_63(u16 devid)
601{
602 int sysmgt;
603
604 sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
605 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
606
607 if (sysmgt == 0x01)
608 set_dev_entry_bit(devid, DEV_ENTRY_IW);
609}
610
611/* Writes the specific IOMMU for a device into the rlookup table */
612static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
613{
614 amd_iommu_rlookup_table[devid] = iommu;
615}
616
617/*
618 * This function takes the device specific flags read from the ACPI
619 * table and sets up the device table entry with that information
620 */
621static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
622 u16 devid, u32 flags, u32 ext_flags)
623{
624 if (flags & ACPI_DEVFLAG_INITPASS)
625 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
626 if (flags & ACPI_DEVFLAG_EXTINT)
627 set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
628 if (flags & ACPI_DEVFLAG_NMI)
629 set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
630 if (flags & ACPI_DEVFLAG_SYSMGT1)
631 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
632 if (flags & ACPI_DEVFLAG_SYSMGT2)
633 set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
634 if (flags & ACPI_DEVFLAG_LINT0)
635 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
636 if (flags & ACPI_DEVFLAG_LINT1)
637 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
638
639 amd_iommu_apply_erratum_63(devid);
640
641 set_iommu_for_device(iommu, devid);
642}
643
644/*
645 * Reads the device exclusion range from ACPI and initializes the IOMMU with
646 * it
647 */
648static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
649{
650 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
651
652 if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
653 return;
654
655 if (iommu) {
656 /*
656		 * We can only configure exclusion ranges per IOMMU, not
658 * per device. But we can enable the exclusion range per
659 * device. This is done here
660 */
661 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
662 iommu->exclusion_start = m->range_start;
663 iommu->exclusion_length = m->range_length;
664 }
665}
666
667/*
668 * This function reads some important data from the IOMMU PCI space and
669 * initializes the driver data structure with it. It reads the hardware
670 * capabilities and the first/last device entries
671 */
672static void __init init_iommu_from_pci(struct amd_iommu *iommu)
673{
674 int cap_ptr = iommu->cap_ptr;
675 u32 range, misc, low, high;
676 int i, j;
677
678 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
679 &iommu->cap);
680 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
681 &range);
682 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
683 &misc);
684
685 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
686 MMIO_GET_FD(range));
687 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
688 MMIO_GET_LD(range));
689 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
690
691 if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
692 amd_iommu_iotlb_sup = false;
693
694 /* read extended feature bits */
695 low = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
696 high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
697
698 iommu->features = ((u64)high << 32) | low;
699
700 if (!is_rd890_iommu(iommu->dev))
701 return;
702
703 /*
704 * Some rd890 systems may not be fully reconfigured by the BIOS, so
705 * it's necessary for us to store this information so it can be
706 * reprogrammed on resume
707 */
708
709 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
710 &iommu->stored_addr_lo);
711 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
712 &iommu->stored_addr_hi);
713
714 /* Low bit locks writes to configuration space */
715 iommu->stored_addr_lo &= ~1;
716
717 for (i = 0; i < 6; i++)
718 for (j = 0; j < 0x12; j++)
719 iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
720
721 for (i = 0; i < 0x83; i++)
722 iommu->stored_l2[i] = iommu_read_l2(iommu, i);
723}
724
725/*
726 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
727 * initializes the hardware and our data structures with it.
728 */
729static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
730 struct ivhd_header *h)
731{
732 u8 *p = (u8 *)h;
733 u8 *end = p, flags = 0;
734 u16 devid = 0, devid_start = 0, devid_to = 0;
735 u32 dev_i, ext_flags = 0;
736 bool alias = false;
737 struct ivhd_entry *e;
738
739 /*
740 * First save the recommended feature enable bits from ACPI
741 */
742 iommu->acpi_flags = h->flags;
743
744 /*
745 * Done. Now parse the device entries
746 */
747 p += sizeof(struct ivhd_header);
748 end += h->length;
749
750
751 while (p < end) {
752 e = (struct ivhd_entry *)p;
753 switch (e->type) {
754 case IVHD_DEV_ALL:
755
756 DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
757 " last device %02x:%02x.%x flags: %02x\n",
758 PCI_BUS(iommu->first_device),
759 PCI_SLOT(iommu->first_device),
760 PCI_FUNC(iommu->first_device),
761 PCI_BUS(iommu->last_device),
762 PCI_SLOT(iommu->last_device),
763 PCI_FUNC(iommu->last_device),
764 e->flags);
765
766 for (dev_i = iommu->first_device;
767 dev_i <= iommu->last_device; ++dev_i)
768 set_dev_entry_from_acpi(iommu, dev_i,
769 e->flags, 0);
770 break;
771 case IVHD_DEV_SELECT:
772
773 DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
774 "flags: %02x\n",
775 PCI_BUS(e->devid),
776 PCI_SLOT(e->devid),
777 PCI_FUNC(e->devid),
778 e->flags);
779
780 devid = e->devid;
781 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
782 break;
783 case IVHD_DEV_SELECT_RANGE_START:
784
785 DUMP_printk(" DEV_SELECT_RANGE_START\t "
786 "devid: %02x:%02x.%x flags: %02x\n",
787 PCI_BUS(e->devid),
788 PCI_SLOT(e->devid),
789 PCI_FUNC(e->devid),
790 e->flags);
791
792 devid_start = e->devid;
793 flags = e->flags;
794 ext_flags = 0;
795 alias = false;
796 break;
797 case IVHD_DEV_ALIAS:
798
799 DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
800 "flags: %02x devid_to: %02x:%02x.%x\n",
801 PCI_BUS(e->devid),
802 PCI_SLOT(e->devid),
803 PCI_FUNC(e->devid),
804 e->flags,
805 PCI_BUS(e->ext >> 8),
806 PCI_SLOT(e->ext >> 8),
807 PCI_FUNC(e->ext >> 8));
808
809 devid = e->devid;
810 devid_to = e->ext >> 8;
811 set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
812 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
813 amd_iommu_alias_table[devid] = devid_to;
814 break;
815 case IVHD_DEV_ALIAS_RANGE:
816
817 DUMP_printk(" DEV_ALIAS_RANGE\t\t "
818 "devid: %02x:%02x.%x flags: %02x "
819 "devid_to: %02x:%02x.%x\n",
820 PCI_BUS(e->devid),
821 PCI_SLOT(e->devid),
822 PCI_FUNC(e->devid),
823 e->flags,
824 PCI_BUS(e->ext >> 8),
825 PCI_SLOT(e->ext >> 8),
826 PCI_FUNC(e->ext >> 8));
827
828 devid_start = e->devid;
829 flags = e->flags;
830 devid_to = e->ext >> 8;
831 ext_flags = 0;
832 alias = true;
833 break;
834 case IVHD_DEV_EXT_SELECT:
835
836 DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
837 "flags: %02x ext: %08x\n",
838 PCI_BUS(e->devid),
839 PCI_SLOT(e->devid),
840 PCI_FUNC(e->devid),
841 e->flags, e->ext);
842
843 devid = e->devid;
844 set_dev_entry_from_acpi(iommu, devid, e->flags,
845 e->ext);
846 break;
847 case IVHD_DEV_EXT_SELECT_RANGE:
848
849 DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
850 "%02x:%02x.%x flags: %02x ext: %08x\n",
851 PCI_BUS(e->devid),
852 PCI_SLOT(e->devid),
853 PCI_FUNC(e->devid),
854 e->flags, e->ext);
855
856 devid_start = e->devid;
857 flags = e->flags;
858 ext_flags = e->ext;
859 alias = false;
860 break;
861 case IVHD_DEV_RANGE_END:
862
863 DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
864 PCI_BUS(e->devid),
865 PCI_SLOT(e->devid),
866 PCI_FUNC(e->devid));
867
868 devid = e->devid;
869 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
870 if (alias) {
871 amd_iommu_alias_table[dev_i] = devid_to;
872 set_dev_entry_from_acpi(iommu,
873 devid_to, flags, ext_flags);
874 }
875 set_dev_entry_from_acpi(iommu, dev_i,
876 flags, ext_flags);
877 }
878 break;
879 default:
880 break;
881 }
882
883 p += ivhd_entry_length(p);
884 }
885}
886
887/* Initializes the device->iommu mapping for the driver */
888static int __init init_iommu_devices(struct amd_iommu *iommu)
889{
890 u32 i;
891
892 for (i = iommu->first_device; i <= iommu->last_device; ++i)
893 set_iommu_for_device(iommu, i);
894
895 return 0;
896}
897
898static void __init free_iommu_one(struct amd_iommu *iommu)
899{
900 free_command_buffer(iommu);
901 free_event_buffer(iommu);
902 iommu_unmap_mmio_space(iommu);
903}
904
905static void __init free_iommu_all(void)
906{
907 struct amd_iommu *iommu, *next;
908
909 for_each_iommu_safe(iommu, next) {
910 list_del(&iommu->list);
911 free_iommu_one(iommu);
912 kfree(iommu);
913 }
914}
915
916/*
917 * This function glues the initialization of one IOMMU together,
918 * allocates the command buffer and programs the hardware. It does
919 * NOT enable the IOMMU. This is done afterwards.
920 */
921static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
922{
923 spin_lock_init(&iommu->lock);
924
925 /* Add IOMMU to internal data structures */
926 list_add_tail(&iommu->list, &amd_iommu_list);
927 iommu->index = amd_iommus_present++;
928
929 if (unlikely(iommu->index >= MAX_IOMMUS)) {
930 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
931 return -ENOSYS;
932 }
933
934 /* Index is fine - add IOMMU to the array */
935 amd_iommus[iommu->index] = iommu;
936
937 /*
938 * Copy data from ACPI table entry to the iommu struct
939 */
940 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
941 if (!iommu->dev)
942 return 1;
943
944 iommu->cap_ptr = h->cap_ptr;
945 iommu->pci_seg = h->pci_seg;
946 iommu->mmio_phys = h->mmio_phys;
947 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
948 if (!iommu->mmio_base)
949 return -ENOMEM;
950
951 iommu->cmd_buf = alloc_command_buffer(iommu);
952 if (!iommu->cmd_buf)
953 return -ENOMEM;
954
955 iommu->evt_buf = alloc_event_buffer(iommu);
956 if (!iommu->evt_buf)
957 return -ENOMEM;
958
959 iommu->int_enabled = false;
960
961 init_iommu_from_pci(iommu);
962 init_iommu_from_acpi(iommu, h);
963 init_iommu_devices(iommu);
964
965 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
966 amd_iommu_np_cache = true;
967
968 return pci_enable_device(iommu->dev);
969}
970
971/*
972 * Iterates over all IOMMU entries in the ACPI table, allocates the
973 * IOMMU structure and initializes it with init_iommu_one()
974 */
975static int __init init_iommu_all(struct acpi_table_header *table)
976{
977 u8 *p = (u8 *)table, *end = (u8 *)table;
978 struct ivhd_header *h;
979 struct amd_iommu *iommu;
980 int ret;
981
982 end += table->length;
983 p += IVRS_HEADER_LENGTH;
984
985 while (p < end) {
986 h = (struct ivhd_header *)p;
987 switch (*p) {
988 case ACPI_IVHD_TYPE:
989
990 DUMP_printk("device: %02x:%02x.%01x cap: %04x "
991 "seg: %d flags: %01x info %04x\n",
992 PCI_BUS(h->devid), PCI_SLOT(h->devid),
993 PCI_FUNC(h->devid), h->cap_ptr,
994 h->pci_seg, h->flags, h->info);
995 DUMP_printk(" mmio-addr: %016llx\n",
996 h->mmio_phys);
997
998 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
999 if (iommu == NULL) {
1000 amd_iommu_init_err = -ENOMEM;
1001 return 0;
1002 }
1003
1004 ret = init_iommu_one(iommu, h);
1005 if (ret) {
1006 amd_iommu_init_err = ret;
1007 return 0;
1008 }
1009 break;
1010 default:
1011 break;
1012 }
1013 p += h->length;
1014
1015 }
1016 WARN_ON(p != end);
1017
1018 return 0;
1019}
1020
1021/****************************************************************************
1022 *
1023 * The following functions initialize the MSI interrupts for all IOMMUs
1024 * in the system. It's a bit challenging because there could be multiple
1025 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
1026 * pci_dev.
1027 *
1028 ****************************************************************************/
1029
1030static int iommu_setup_msi(struct amd_iommu *iommu)
1031{
1032 int r;
1033
1034 if (pci_enable_msi(iommu->dev))
1035 return 1;
1036
1037 r = request_threaded_irq(iommu->dev->irq,
1038 amd_iommu_int_handler,
1039 amd_iommu_int_thread,
1040 0, "AMD-Vi",
1041 iommu->dev);
1042
1043 if (r) {
1044 pci_disable_msi(iommu->dev);
1045 return 1;
1046 }
1047
1048 iommu->int_enabled = true;
1049 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
1050
1051 return 0;
1052}
1053
1054static int iommu_init_msi(struct amd_iommu *iommu)
1055{
1056 if (iommu->int_enabled)
1057 return 0;
1058
1059 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
1060 return iommu_setup_msi(iommu);
1061
1062 return 1;
1063}
1064
1065/****************************************************************************
1066 *
1067 * The next functions belong to the third pass of parsing the ACPI
1068 * table. In this last pass the memory mapping requirements are
1069 * gathered (like exclusion and unity mapping ranges).
1070 *
1071 ****************************************************************************/
1072
1073static void __init free_unity_maps(void)
1074{
1075 struct unity_map_entry *entry, *next;
1076
1077 list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
1078 list_del(&entry->list);
1079 kfree(entry);
1080 }
1081}
1082
1083/* called when we find an exclusion range definition in ACPI */
1084static int __init init_exclusion_range(struct ivmd_header *m)
1085{
1086 int i;
1087
1088 switch (m->type) {
1089 case ACPI_IVMD_TYPE:
1090 set_device_exclusion_range(m->devid, m);
1091 break;
1092 case ACPI_IVMD_TYPE_ALL:
1093 for (i = 0; i <= amd_iommu_last_bdf; ++i)
1094 set_device_exclusion_range(i, m);
1095 break;
1096 case ACPI_IVMD_TYPE_RANGE:
1097 for (i = m->devid; i <= m->aux; ++i)
1098 set_device_exclusion_range(i, m);
1099 break;
1100 default:
1101 break;
1102 }
1103
1104 return 0;
1105}
1106
1107/* called for unity map ACPI definition */
1108static int __init init_unity_map_range(struct ivmd_header *m)
1109{
1110	struct unity_map_entry *e = NULL;
1111 char *s;
1112
1113 e = kzalloc(sizeof(*e), GFP_KERNEL);
1114 if (e == NULL)
1115 return -ENOMEM;
1116
1117 switch (m->type) {
1118 default:
1119 kfree(e);
1120 return 0;
1121 case ACPI_IVMD_TYPE:
1122		s = "IVMD_TYPE\t\t\t";
1123 e->devid_start = e->devid_end = m->devid;
1124 break;
1125 case ACPI_IVMD_TYPE_ALL:
1126 s = "IVMD_TYPE_ALL\t\t";
1127 e->devid_start = 0;
1128 e->devid_end = amd_iommu_last_bdf;
1129 break;
1130 case ACPI_IVMD_TYPE_RANGE:
1131 s = "IVMD_TYPE_RANGE\t\t";
1132 e->devid_start = m->devid;
1133 e->devid_end = m->aux;
1134 break;
1135 }
1136 e->address_start = PAGE_ALIGN(m->range_start);
1137 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
1138 e->prot = m->flags >> 1;
1139
1140 DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
1141 " range_start: %016llx range_end: %016llx flags: %x\n", s,
1142 PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
1143 PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
1144 PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
1145 e->address_start, e->address_end, m->flags);
1146
1147 list_add_tail(&e->list, &amd_iommu_unity_map);
1148
1149 return 0;
1150}
1151
1152/* iterates over all memory definitions we find in the ACPI table */
1153static int __init init_memory_definitions(struct acpi_table_header *table)
1154{
1155 u8 *p = (u8 *)table, *end = (u8 *)table;
1156 struct ivmd_header *m;
1157
1158 end += table->length;
1159 p += IVRS_HEADER_LENGTH;
1160
1161 while (p < end) {
1162 m = (struct ivmd_header *)p;
1163 if (m->flags & IVMD_FLAG_EXCL_RANGE)
1164 init_exclusion_range(m);
1165 else if (m->flags & IVMD_FLAG_UNITY_MAP)
1166 init_unity_map_range(m);
1167
1168 p += m->length;
1169 }
1170
1171 return 0;
1172}
1173
1174/*
1175 * Init the device table to not allow DMA access for devices and
1176 * suppress all page faults
1177 */
1178static void init_device_table(void)
1179{
1180 u32 devid;
1181
1182 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
1183 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
1184 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
1185 }
1186}
1187
1188static void iommu_init_flags(struct amd_iommu *iommu)
1189{
1190 iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
1191 iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
1192 iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
1193
1194 iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
1195 iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
1196 iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
1197
1198 iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
1199 iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
1200 iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
1201
1202 iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
1203 iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
1204 iommu_feature_disable(iommu, CONTROL_ISOC_EN);
1205
1206 /*
1207 * make IOMMU memory accesses cache coherent
1208 */
1209 iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
1210}
1211
1212static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
1213{
1214 int i, j;
1215 u32 ioc_feature_control;
1216 struct pci_dev *pdev = NULL;
1217
1218 /* RD890 BIOSes may not have completely reconfigured the iommu */
1219 if (!is_rd890_iommu(iommu->dev))
1220 return;
1221
1222 /*
1223 * First, we need to ensure that the iommu is enabled. This is
1224 * controlled by a register in the northbridge
1225 */
1226 pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
1227
1228 if (!pdev)
1229 return;
1230
1231 /* Select Northbridge indirect register 0x75 and enable writing */
1232 pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
1233 pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
1234
1235 /* Enable the iommu */
1236 if (!(ioc_feature_control & 0x1))
1237 pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
1238
1239 pci_dev_put(pdev);
1240
1241 /* Restore the iommu BAR */
1242 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1243 iommu->stored_addr_lo);
1244 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
1245 iommu->stored_addr_hi);
1246
1247 /* Restore the l1 indirect regs for each of the 6 l1s */
1248 for (i = 0; i < 6; i++)
1249 for (j = 0; j < 0x12; j++)
1250 iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
1251
1252 /* Restore the l2 indirect regs */
1253 for (i = 0; i < 0x83; i++)
1254 iommu_write_l2(iommu, i, iommu->stored_l2[i]);
1255
1256 /* Lock PCI setup registers */
1257 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1258 iommu->stored_addr_lo | 1);
1259}
1260
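As an aside, the RD890 quirk above goes through the northbridge's indirect register interface: an index is written to config offset 0x60 and the data is accessed at offset 0x64. A minimal sketch of that access pattern follows; the helper names are hypothetical and the role of bit 7 in the index (write enable) is inferred from the comment in the function above.

/* Illustrative sketch only -- not part of this file. */
#include <linux/pci.h>

static u32 nb_indirect_read(struct pci_dev *nb, u8 reg)
{
	u32 val;

	/* Select the indirect register; bit 7 clear means read access. */
	pci_write_config_dword(nb, 0x60, reg);
	pci_read_config_dword(nb, 0x64, &val);

	return val;
}

static void nb_indirect_write(struct pci_dev *nb, u8 reg, u32 val)
{
	/* Bit 7 of the index enables writing through the data register. */
	pci_write_config_dword(nb, 0x60, reg | (1 << 7));
	pci_write_config_dword(nb, 0x64, val);
}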
1261/*
1262 * This function finally enables all IOMMUs found in the system after
1263 * they have been initialized
1264 */
1265static void enable_iommus(void)
1266{
1267 struct amd_iommu *iommu;
1268
1269 for_each_iommu(iommu) {
1270 iommu_disable(iommu);
1271 iommu_init_flags(iommu);
1272 iommu_set_device_table(iommu);
1273 iommu_enable_command_buffer(iommu);
1274 iommu_enable_event_buffer(iommu);
1275 iommu_set_exclusion_range(iommu);
1276 iommu_init_msi(iommu);
1277 iommu_enable(iommu);
1278 iommu_flush_all_caches(iommu);
1279 }
1280}
1281
1282static void disable_iommus(void)
1283{
1284 struct amd_iommu *iommu;
1285
1286 for_each_iommu(iommu)
1287 iommu_disable(iommu);
1288}
1289
1290/*
1291 * Suspend/Resume support
1292 * the IOMMUs are disabled on suspend and re-initialized on resume
1293 */
1294
1295static void amd_iommu_resume(void)
1296{
1297 struct amd_iommu *iommu;
1298
1299 for_each_iommu(iommu)
1300 iommu_apply_resume_quirks(iommu);
1301
1302 /* re-load the hardware */
1303 enable_iommus();
1304
1305 /*
1306 * we have to flush after the IOMMUs are enabled because a
1307 * disabled IOMMU will never execute the commands we send
1308 */
1309 for_each_iommu(iommu)
1310 iommu_flush_all_caches(iommu);
1311}
1312
1313static int amd_iommu_suspend(void)
1314{
1315 /* disable IOMMUs to go out of the way for BIOS */
1316 disable_iommus();
1317
1318 return 0;
1319}
1320
1321static struct syscore_ops amd_iommu_syscore_ops = {
1322 .suspend = amd_iommu_suspend,
1323 .resume = amd_iommu_resume,
1324};
1325
1326/*
1327 * This is the core init function for AMD IOMMU hardware in the system.
1328 * This function is called from the generic x86 DMA layer initialization
1329 * code.
1330 *
1331 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
1332 * three times:
1333 *
1334 * 1 pass) Find the highest PCI device id the driver has to handle.
1335 * Based on this information the size of the data structures that
1336 * need to be allocated is determined.
1337 *
1338 * 2 pass) Initialize the data structures just allocated with the
1339 * information in the ACPI table about available AMD IOMMUs
1340 * in the system. It also maps the PCI devices in the
1341 * system to specific IOMMUs
1342 *
1343 * 3 pass) After the basic data structures are allocated and
1344 * initialized we update them with information about memory
1345 * remapping requirements parsed out of the ACPI table in
1346 * this last pass.
1347 *
1348 * After that the hardware is initialized and ready to go. In the last
1349 * step we do some Linux specific things like registering the driver in
1350 * the dma_ops interface and initializing the suspend/resume support
1351 * functions. Finally we print some information about the AMD IOMMUs and
1352 * the driver state and enable the hardware.
1353 */
1354static int __init amd_iommu_init(void)
1355{
1356 int i, ret = 0;
1357
1358 /*
1359 * First parse ACPI tables to find the largest Bus/Dev/Func
1360 * we need to handle. Based on this information the shared data
1361 * structures for the IOMMUs in the system will be allocated.
1362 */
1363 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
1364 return -ENODEV;
1365
1366 ret = amd_iommu_init_err;
1367 if (ret)
1368 goto out;
1369
1370 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
1371 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
1372 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
1373
1374 ret = -ENOMEM;
1375
1376 /* Device table - directly used by all IOMMUs */
1377 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1378 get_order(dev_table_size));
1379 if (amd_iommu_dev_table == NULL)
1380 goto out;
1381
1382 /*
1383 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
1384 * IOMMU sees for that device
1385 */
1386 amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
1387 get_order(alias_table_size));
1388 if (amd_iommu_alias_table == NULL)
1389 goto free;
1390
1391 /* IOMMU rlookup table - find the IOMMU for a specific device */
1392 amd_iommu_rlookup_table = (void *)__get_free_pages(
1393 GFP_KERNEL | __GFP_ZERO,
1394 get_order(rlookup_table_size));
1395 if (amd_iommu_rlookup_table == NULL)
1396 goto free;
1397
1398 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1399 GFP_KERNEL | __GFP_ZERO,
1400 get_order(MAX_DOMAIN_ID/8));
1401 if (amd_iommu_pd_alloc_bitmap == NULL)
1402 goto free;
1403
1404 /* init the device table */
1405 init_device_table();
1406
1407 /*
1408 * let all alias entries point to themselves
1409 */
1410 for (i = 0; i <= amd_iommu_last_bdf; ++i)
1411 amd_iommu_alias_table[i] = i;
1412
1413 /*
1414 * never allocate domain 0 because it's used as the non-allocated and
1415 * error value placeholder
1416 */
1417 amd_iommu_pd_alloc_bitmap[0] = 1;
1418
1419 spin_lock_init(&amd_iommu_pd_lock);
1420
1421 /*
1422 * now that the data structures are allocated and basically initialized,
1423 * start the real ACPI table scan
1424 */
1425 ret = -ENODEV;
1426 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1427 goto free;
1428
1429 if (amd_iommu_init_err) {
1430 ret = amd_iommu_init_err;
1431 goto free;
1432 }
1433
1434 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1435 goto free;
1436
1437 if (amd_iommu_init_err) {
1438 ret = amd_iommu_init_err;
1439 goto free;
1440 }
1441
1442 ret = amd_iommu_init_devices();
1443 if (ret)
1444 goto free;
1445
1446 enable_iommus();
1447
1448 if (iommu_pass_through)
1449 ret = amd_iommu_init_passthrough();
1450 else
1451 ret = amd_iommu_init_dma_ops();
1452
1453 if (ret)
1454 goto free_disable;
1455
1456 amd_iommu_init_api();
1457
1458 amd_iommu_init_notifier();
1459
1460 register_syscore_ops(&amd_iommu_syscore_ops);
1461
1462 if (iommu_pass_through)
1463 goto out;
1464
1465 if (amd_iommu_unmap_flush)
1466 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1467 else
1468 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1469
1470 x86_platform.iommu_shutdown = disable_iommus;
1471out:
1472 return ret;
1473
1474free_disable:
1475 disable_iommus();
1476
1477free:
1478 amd_iommu_uninit_devices();
1479
1480 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1481 get_order(MAX_DOMAIN_ID/8));
1482
1483 free_pages((unsigned long)amd_iommu_rlookup_table,
1484 get_order(rlookup_table_size));
1485
1486 free_pages((unsigned long)amd_iommu_alias_table,
1487 get_order(alias_table_size));
1488
1489 free_pages((unsigned long)amd_iommu_dev_table,
1490 get_order(dev_table_size));
1491
1492 free_iommu_all();
1493
1494 free_unity_maps();
1495
1496#ifdef CONFIG_GART_IOMMU
1497 /*
1498 * We failed to initialize the AMD IOMMU - try fallback to GART
1499 * if possible.
1500 */
1501 gart_iommu_init();
1502
1503#endif
1504
1505 goto out;
1506}
1507
1508/****************************************************************************
1509 *
1510 * Early detect code. This code runs at IOMMU detection time in the DMA
1511 * layer. It just checks whether there is an IVRS ACPI table to detect AMD
1512 * IOMMUs
1513 *
1514 ****************************************************************************/
1515static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1516{
1517 return 0;
1518}
1519
1520int __init amd_iommu_detect(void)
1521{
1522 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1523 return -ENODEV;
1524
1525 if (amd_iommu_disabled)
1526 return -ENODEV;
1527
1528 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1529 iommu_detected = 1;
1530 amd_iommu_detected = 1;
1531 x86_init.iommu.iommu_init = amd_iommu_init;
1532
1533 /* Make sure ACS will be enabled */
1534 pci_request_acs();
1535 return 1;
1536 }
1537 return -ENODEV;
1538}
1539
1540/****************************************************************************
1541 *
1542 * Parsing functions for the AMD IOMMU specific kernel command line
1543 * options.
1544 *
1545 ****************************************************************************/
1546
1547static int __init parse_amd_iommu_dump(char *str)
1548{
1549 amd_iommu_dump = true;
1550
1551 return 1;
1552}
1553
1554static int __init parse_amd_iommu_options(char *str)
1555{
1556 for (; *str; ++str) {
1557 if (strncmp(str, "fullflush", 9) == 0)
1558 amd_iommu_unmap_flush = true;
1559 if (strncmp(str, "off", 3) == 0)
1560 amd_iommu_disabled = true;
1561 }
1562
1563 return 1;
1564}
1565
1566__setup("amd_iommu_dump", parse_amd_iommu_dump);
1567__setup("amd_iommu=", parse_amd_iommu_options);
1568
1569IOMMU_INIT_FINISH(amd_iommu_detect,
1570 gart_iommu_hole_init,
1571 0,
1572 0);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 289e92862fd9..afdc3f756dea 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -27,15 +27,12 @@
27 * timer, but by default APB timer has higher rating than local APIC timers. 27 * timer, but by default APB timer has higher rating than local APIC timers.
28 */ 28 */
29 29
30#include <linux/clocksource.h>
31#include <linux/clockchips.h>
32#include <linux/delay.h> 30#include <linux/delay.h>
31#include <linux/dw_apb_timer.h>
33#include <linux/errno.h> 32#include <linux/errno.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysdev.h>
36#include <linux/slab.h> 34#include <linux/slab.h>
37#include <linux/pm.h> 35#include <linux/pm.h>
38#include <linux/pci.h>
39#include <linux/sfi.h> 36#include <linux/sfi.h>
40#include <linux/interrupt.h> 37#include <linux/interrupt.h>
41#include <linux/cpu.h> 38#include <linux/cpu.h>
@@ -44,76 +41,48 @@
44#include <asm/fixmap.h> 41#include <asm/fixmap.h>
45#include <asm/apb_timer.h> 42#include <asm/apb_timer.h>
46#include <asm/mrst.h> 43#include <asm/mrst.h>
44#include <asm/time.h>
47 45
48#define APBT_MASK CLOCKSOURCE_MASK(32)
49#define APBT_SHIFT 22
50#define APBT_CLOCKEVENT_RATING 110 46#define APBT_CLOCKEVENT_RATING 110
51#define APBT_CLOCKSOURCE_RATING 250 47#define APBT_CLOCKSOURCE_RATING 250
52#define APBT_MIN_DELTA_USEC 200
53 48
54#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
55#define APBT_CLOCKEVENT0_NUM (0) 49#define APBT_CLOCKEVENT0_NUM (0)
56#define APBT_CLOCKEVENT1_NUM (1)
57#define APBT_CLOCKSOURCE_NUM (2) 50#define APBT_CLOCKSOURCE_NUM (2)
58 51
59static unsigned long apbt_address; 52static phys_addr_t apbt_address;
60static int apb_timer_block_enabled; 53static int apb_timer_block_enabled;
61static void __iomem *apbt_virt_address; 54static void __iomem *apbt_virt_address;
62static int phy_cs_timer_id;
63 55
64/* 56/*
65 * Common DW APB timer info 57 * Common DW APB timer info
66 */ 58 */
67static uint64_t apbt_freq; 59static unsigned long apbt_freq;
68
69static void apbt_set_mode(enum clock_event_mode mode,
70 struct clock_event_device *evt);
71static int apbt_next_event(unsigned long delta,
72 struct clock_event_device *evt);
73static cycle_t apbt_read_clocksource(struct clocksource *cs);
74static void apbt_restart_clocksource(struct clocksource *cs);
75 60
76struct apbt_dev { 61struct apbt_dev {
77 struct clock_event_device evt; 62 struct dw_apb_clock_event_device *timer;
78 unsigned int num; 63 unsigned int num;
79 int cpu; 64 int cpu;
80 unsigned int irq; 65 unsigned int irq;
81 unsigned int tick; 66 char name[10];
82 unsigned int count;
83 unsigned int flags;
84 char name[10];
85}; 67};
86 68
87static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); 69static struct dw_apb_clocksource *clocksource_apbt;
88 70
89#ifdef CONFIG_SMP 71static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
90static unsigned int apbt_num_timers_used;
91static struct apbt_dev *apbt_devs;
92#endif
93
94static inline unsigned long apbt_readl_reg(unsigned long a)
95{ 72{
96 return readl(apbt_virt_address + a); 73 return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
97} 74}
98 75
99static inline void apbt_writel_reg(unsigned long d, unsigned long a) 76static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
100{
101 writel(d, apbt_virt_address + a);
102}
103
104static inline unsigned long apbt_readl(int n, unsigned long a)
105{
106 return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
107}
108 77
109static inline void apbt_writel(int n, unsigned long d, unsigned long a) 78#ifdef CONFIG_SMP
110{ 79static unsigned int apbt_num_timers_used;
111 writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); 80#endif
112}
113 81
114static inline void apbt_set_mapping(void) 82static inline void apbt_set_mapping(void)
115{ 83{
116 struct sfi_timer_table_entry *mtmr; 84 struct sfi_timer_table_entry *mtmr;
85 int phy_cs_timer_id = 0;
117 86
118 if (apbt_virt_address) { 87 if (apbt_virt_address) {
119 pr_debug("APBT base already mapped\n"); 88 pr_debug("APBT base already mapped\n");
@@ -125,21 +94,18 @@ static inline void apbt_set_mapping(void)
125 APBT_CLOCKEVENT0_NUM); 94 APBT_CLOCKEVENT0_NUM);
126 return; 95 return;
127 } 96 }
128 apbt_address = (unsigned long)mtmr->phys_addr; 97 apbt_address = (phys_addr_t)mtmr->phys_addr;
129 if (!apbt_address) { 98 if (!apbt_address) {
130 printk(KERN_WARNING "No timer base from SFI, use default\n"); 99 printk(KERN_WARNING "No timer base from SFI, use default\n");
131 apbt_address = APBT_DEFAULT_BASE; 100 apbt_address = APBT_DEFAULT_BASE;
132 } 101 }
133 apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); 102 apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
134 if (apbt_virt_address) { 103 if (!apbt_virt_address) {
135 pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ 104 pr_debug("Failed mapping APBT phy address at %lu\n",\
136 (void *)apbt_address, (void *)apbt_virt_address); 105 (unsigned long)apbt_address);
137 } else {
138 pr_debug("Failed mapping APBT phy address at %p\n",\
139 (void *)apbt_address);
140 goto panic_noapbt; 106 goto panic_noapbt;
141 } 107 }
142 apbt_freq = mtmr->freq_hz / USEC_PER_SEC; 108 apbt_freq = mtmr->freq_hz;
143 sfi_free_mtmr(mtmr); 109 sfi_free_mtmr(mtmr);
144 110
145 /* Now figure out the physical timer id for clocksource device */ 111 /* Now figure out the physical timer id for clocksource device */
@@ -148,9 +114,14 @@ static inline void apbt_set_mapping(void)
148 goto panic_noapbt; 114 goto panic_noapbt;
149 115
150 /* Now figure out the physical timer id */ 116 /* Now figure out the physical timer id */
151 phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) 117 pr_debug("Use timer %d for clocksource\n",
152 / APBTMRS_REG_SIZE; 118 (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
153 pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); 119 phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
120 APBTMRS_REG_SIZE;
121
122 clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
123 "apbt0", apbt_virt_address + phy_cs_timer_id *
124 APBTMRS_REG_SIZE, apbt_freq);
154 return; 125 return;
155 126
156panic_noapbt: 127panic_noapbt:
@@ -172,82 +143,6 @@ static inline int is_apbt_capable(void)
172 return apbt_virt_address ? 1 : 0; 143 return apbt_virt_address ? 1 : 0;
173} 144}
174 145
175static struct clocksource clocksource_apbt = {
176 .name = "apbt",
177 .rating = APBT_CLOCKSOURCE_RATING,
178 .read = apbt_read_clocksource,
179 .mask = APBT_MASK,
180 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
181 .resume = apbt_restart_clocksource,
182};
183
184/* boot APB clock event device */
185static struct clock_event_device apbt_clockevent = {
186 .name = "apbt0",
187 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
188 .set_mode = apbt_set_mode,
189 .set_next_event = apbt_next_event,
190 .shift = APBT_SHIFT,
191 .irq = 0,
192 .rating = APBT_CLOCKEVENT_RATING,
193};
194
195/*
196 * start count down from 0xffff_ffff. this is done by toggling the enable bit
197 * then load initial load count to ~0.
198 */
199static void apbt_start_counter(int n)
200{
201 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
202
203 ctrl &= ~APBTMR_CONTROL_ENABLE;
204 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
205 apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
206 /* enable, mask interrupt */
207 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
208 ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
209 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
210 /* read it once to get cached counter value initialized */
211 apbt_read_clocksource(&clocksource_apbt);
212}
213
214static irqreturn_t apbt_interrupt_handler(int irq, void *data)
215{
216 struct apbt_dev *dev = (struct apbt_dev *)data;
217 struct clock_event_device *aevt = &dev->evt;
218
219 if (!aevt->event_handler) {
220 printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
221 dev->num);
222 return IRQ_NONE;
223 }
224 aevt->event_handler(aevt);
225 return IRQ_HANDLED;
226}
227
228static void apbt_restart_clocksource(struct clocksource *cs)
229{
230 apbt_start_counter(phy_cs_timer_id);
231}
232
233static void apbt_enable_int(int n)
234{
235 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
236 /* clear pending intr */
237 apbt_readl(n, APBTMR_N_EOI);
238 ctrl &= ~APBTMR_CONTROL_INT;
239 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
240}
241
242static void apbt_disable_int(int n)
243{
244 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
245
246 ctrl |= APBTMR_CONTROL_INT;
247 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
248}
249
250
251static int __init apbt_clockevent_register(void) 146static int __init apbt_clockevent_register(void)
252{ 147{
253 struct sfi_timer_table_entry *mtmr; 148 struct sfi_timer_table_entry *mtmr;
@@ -260,45 +155,21 @@ static int __init apbt_clockevent_register(void)
260 return -ENODEV; 155 return -ENODEV;
261 } 156 }
262 157
263 /*
264 * We need to calculate the scaled math multiplication factor for
265 * nanosecond to apbt tick conversion.
266 * mult = (nsec/cycle)*2^APBT_SHIFT
267 */
268 apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
269 , NSEC_PER_SEC, APBT_SHIFT);
270
271 /* Calculate the min / max delta */
272 apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
273 &apbt_clockevent);
274 apbt_clockevent.min_delta_ns = clockevent_delta2ns(
275 APBT_MIN_DELTA_USEC*apbt_freq,
276 &apbt_clockevent);
277 /*
278 * Start apbt with the boot cpu mask and make it
279 * global if not used for per cpu timer.
280 */
281 apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
282 adev->num = smp_processor_id(); 158 adev->num = smp_processor_id();
283 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); 159 adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
160 mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
161 APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
162 adev_virt_addr(adev), 0, apbt_freq);
163 /* Firmware does EOI handling for us. */
164 adev->timer->eoi = NULL;
284 165
285 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { 166 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
286 adev->evt.rating = APBT_CLOCKEVENT_RATING - 100; 167 global_clock_event = &adev->timer->ced;
287 global_clock_event = &adev->evt;
288 printk(KERN_DEBUG "%s clockevent registered as global\n", 168 printk(KERN_DEBUG "%s clockevent registered as global\n",
289 global_clock_event->name); 169 global_clock_event->name);
290 } 170 }
291 171
292 if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, 172 dw_apb_clockevent_register(adev->timer);
293 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
294 apbt_clockevent.name, adev)) {
295 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
296 apbt_clockevent.irq);
297 }
298
299 clockevents_register_device(&adev->evt);
300 /* Start APBT 0 interrupts */
301 apbt_enable_int(APBT_CLOCKEVENT0_NUM);
302 173
303 sfi_free_mtmr(mtmr); 174 sfi_free_mtmr(mtmr);
304 return 0; 175 return 0;
@@ -316,52 +187,34 @@ static void apbt_setup_irq(struct apbt_dev *adev)
316 irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); 187 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
317 /* APB timer irqs are set up as mp_irqs, timer is edge type */ 188 /* APB timer irqs are set up as mp_irqs, timer is edge type */
318 __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge"); 189 __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
319
320 if (system_state == SYSTEM_BOOTING) {
321 if (request_irq(adev->irq, apbt_interrupt_handler,
322 IRQF_TIMER | IRQF_DISABLED |
323 IRQF_NOBALANCING,
324 adev->name, adev)) {
325 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
326 adev->num);
327 }
328 } else
329 enable_irq(adev->irq);
330} 190}
331 191
332/* Should be called with per cpu */ 192/* Should be called with per cpu */
333void apbt_setup_secondary_clock(void) 193void apbt_setup_secondary_clock(void)
334{ 194{
335 struct apbt_dev *adev; 195 struct apbt_dev *adev;
336 struct clock_event_device *aevt;
337 int cpu; 196 int cpu;
338 197
339 /* Don't register boot CPU clockevent */ 198 /* Don't register boot CPU clockevent */
340 cpu = smp_processor_id(); 199 cpu = smp_processor_id();
341 if (!cpu) 200 if (!cpu)
342 return; 201 return;
343 /*
344 * We need to calculate the scaled math multiplication factor for
345 * nanosecond to apbt tick conversion.
346 * mult = (nsec/cycle)*2^APBT_SHIFT
347 */
348 printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
349 adev = &per_cpu(cpu_apbt_dev, cpu);
350 aevt = &adev->evt;
351 202
352 memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); 203 adev = &__get_cpu_var(cpu_apbt_dev);
353 aevt->cpumask = cpumask_of(cpu); 204 if (!adev->timer) {
354 aevt->name = adev->name; 205 adev->timer = dw_apb_clockevent_init(cpu, adev->name,
355 aevt->mode = CLOCK_EVT_MODE_UNUSED; 206 APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
207 adev->irq, apbt_freq);
208 adev->timer->eoi = NULL;
209 } else {
210 dw_apb_clockevent_resume(adev->timer);
211 }
356 212
357 printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", 213 printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
358 cpu, aevt->name, *(u32 *)aevt->cpumask); 214 cpu, adev->name, adev->cpu);
359 215
360 apbt_setup_irq(adev); 216 apbt_setup_irq(adev);
361 217 dw_apb_clockevent_register(adev->timer);
362 clockevents_register_device(aevt);
363
364 apbt_enable_int(cpu);
365 218
366 return; 219 return;
367} 220}
@@ -384,13 +237,12 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
384 237
385 switch (action & 0xf) { 238 switch (action & 0xf) {
386 case CPU_DEAD: 239 case CPU_DEAD:
387 disable_irq(adev->irq); 240 dw_apb_clockevent_pause(adev->timer);
388 apbt_disable_int(cpu);
389 if (system_state == SYSTEM_RUNNING) { 241 if (system_state == SYSTEM_RUNNING) {
390 pr_debug("skipping APBT CPU %lu offline\n", cpu); 242 pr_debug("skipping APBT CPU %lu offline\n", cpu);
391 } else if (adev) { 243 } else if (adev) {
392 pr_debug("APBT clockevent for cpu %lu offline\n", cpu); 244 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
393 free_irq(adev->irq, adev); 245 dw_apb_clockevent_stop(adev->timer);
394 } 246 }
395 break; 247 break;
396 default: 248 default:
@@ -415,116 +267,16 @@ void apbt_setup_secondary_clock(void) {}
415 267
416#endif /* CONFIG_SMP */ 268#endif /* CONFIG_SMP */
417 269
418static void apbt_set_mode(enum clock_event_mode mode,
419 struct clock_event_device *evt)
420{
421 unsigned long ctrl;
422 uint64_t delta;
423 int timer_num;
424 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
425
426 BUG_ON(!apbt_virt_address);
427
428 timer_num = adev->num;
429 pr_debug("%s CPU %d timer %d mode=%d\n",
430 __func__, first_cpu(*evt->cpumask), timer_num, mode);
431
432 switch (mode) {
433 case CLOCK_EVT_MODE_PERIODIC:
434 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
435 delta >>= apbt_clockevent.shift;
436 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
437 ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
438 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
439 /*
440 * DW APB p. 46, have to disable timer before load counter,
441 * may cause sync problem.
442 */
443 ctrl &= ~APBTMR_CONTROL_ENABLE;
444 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
445 udelay(1);
446 pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
447 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
448 ctrl |= APBTMR_CONTROL_ENABLE;
449 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
450 break;
451 /* APB timer does not have one-shot mode, use free running mode */
452 case CLOCK_EVT_MODE_ONESHOT:
453 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
454 /*
455 * set free running mode, this mode will let timer reload max
456 * timeout which will give time (3min on 25MHz clock) to rearm
457 * the next event, therefore emulate the one-shot mode.
458 */
459 ctrl &= ~APBTMR_CONTROL_ENABLE;
460 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
461
462 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
463 /* write again to set free running mode */
464 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
465
466 /*
467 * DW APB p. 46, load counter with all 1s before starting free
468 * running mode.
469 */
470 apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
471 ctrl &= ~APBTMR_CONTROL_INT;
472 ctrl |= APBTMR_CONTROL_ENABLE;
473 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
474 break;
475
476 case CLOCK_EVT_MODE_UNUSED:
477 case CLOCK_EVT_MODE_SHUTDOWN:
478 apbt_disable_int(timer_num);
479 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
480 ctrl &= ~APBTMR_CONTROL_ENABLE;
481 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
482 break;
483
484 case CLOCK_EVT_MODE_RESUME:
485 apbt_enable_int(timer_num);
486 break;
487 }
488}
489
490static int apbt_next_event(unsigned long delta,
491 struct clock_event_device *evt)
492{
493 unsigned long ctrl;
494 int timer_num;
495
496 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
497
498 timer_num = adev->num;
499 /* Disable timer */
500 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
501 ctrl &= ~APBTMR_CONTROL_ENABLE;
502 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
503 /* write new count */
504 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
505 ctrl |= APBTMR_CONTROL_ENABLE;
506 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
507 return 0;
508}
509
510static cycle_t apbt_read_clocksource(struct clocksource *cs)
511{
512 unsigned long current_count;
513
514 current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
515 return (cycle_t)~current_count;
516}
517
518static int apbt_clocksource_register(void) 270static int apbt_clocksource_register(void)
519{ 271{
520 u64 start, now; 272 u64 start, now;
521 cycle_t t1; 273 cycle_t t1;
522 274
523 /* Start the counter, use timer 2 as source, timer 0/1 for event */ 275 /* Start the counter, use timer 2 as source, timer 0/1 for event */
524 apbt_start_counter(phy_cs_timer_id); 276 dw_apb_clocksource_start(clocksource_apbt);
525 277
526 /* Verify whether apbt counter works */ 278 /* Verify whether apbt counter works */
527 t1 = apbt_read_clocksource(&clocksource_apbt); 279 t1 = dw_apb_clocksource_read(clocksource_apbt);
528 rdtscll(start); 280 rdtscll(start);
529 281
530 /* 282 /*
@@ -539,10 +291,10 @@ static int apbt_clocksource_register(void)
539 } while ((now - start) < 200000UL); 291 } while ((now - start) < 200000UL);
540 292
541 /* APBT is the only always on clocksource, it has to work! */ 293 /* APBT is the only always on clocksource, it has to work! */
542 if (t1 == apbt_read_clocksource(&clocksource_apbt)) 294 if (t1 == dw_apb_clocksource_read(clocksource_apbt))
543 panic("APBT counter not counting. APBT disabled\n"); 295 panic("APBT counter not counting. APBT disabled\n");
544 296
545 clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000); 297 dw_apb_clocksource_register(clocksource_apbt);
546 298
547 return 0; 299 return 0;
548} 300}
@@ -566,10 +318,7 @@ void __init apbt_time_init(void)
566 if (apb_timer_block_enabled) 318 if (apb_timer_block_enabled)
567 return; 319 return;
568 apbt_set_mapping(); 320 apbt_set_mapping();
569 if (apbt_virt_address) { 321 if (!apbt_virt_address)
570 pr_debug("Found APBT version 0x%lx\n",\
571 apbt_readl_reg(APBTMRS_COMP_VERSION));
572 } else
573 goto out_noapbt; 322 goto out_noapbt;
574 /* 323 /*
575 * Read the frequency and check for a sane value, for ESL model 324 * Read the frequency and check for a sane value, for ESL model
@@ -577,7 +326,7 @@ void __init apbt_time_init(void)
577 */ 326 */
578 327
579 if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { 328 if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
580 pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); 329 pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
581 goto out_noapbt; 330 goto out_noapbt;
582 } 331 }
583 if (apbt_clocksource_register()) { 332 if (apbt_clocksource_register()) {
@@ -603,30 +352,20 @@ void __init apbt_time_init(void)
603 } else { 352 } else {
604 percpu_timer = 0; 353 percpu_timer = 0;
605 apbt_num_timers_used = 1; 354 apbt_num_timers_used = 1;
606 adev = &per_cpu(cpu_apbt_dev, 0);
607 adev->flags &= ~APBT_DEV_USED;
608 } 355 }
609 pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); 356 pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
610 357
611 /* here we set up per CPU timer data structure */ 358 /* here we set up per CPU timer data structure */
612 apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
613 GFP_KERNEL);
614 if (!apbt_devs) {
615 printk(KERN_ERR "Failed to allocate APB timer devices\n");
616 return;
617 }
618 for (i = 0; i < apbt_num_timers_used; i++) { 359 for (i = 0; i < apbt_num_timers_used; i++) {
619 adev = &per_cpu(cpu_apbt_dev, i); 360 adev = &per_cpu(cpu_apbt_dev, i);
620 adev->num = i; 361 adev->num = i;
621 adev->cpu = i; 362 adev->cpu = i;
622 p_mtmr = sfi_get_mtmr(i); 363 p_mtmr = sfi_get_mtmr(i);
623 if (p_mtmr) { 364 if (p_mtmr)
624 adev->tick = p_mtmr->freq_hz;
625 adev->irq = p_mtmr->irq; 365 adev->irq = p_mtmr->irq;
626 } else 366 else
627 printk(KERN_ERR "Failed to get timer for cpu %d\n", i); 367 printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
628 adev->count = 0; 368 snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
629 sprintf(adev->name, "apbt%d", i);
630 } 369 }
631#endif 370#endif
632 371
@@ -638,17 +377,8 @@ out_noapbt:
638 panic("failed to enable APB timer\n"); 377 panic("failed to enable APB timer\n");
639} 378}
640 379
641static inline void apbt_disable(int n)
642{
643 if (is_apbt_capable()) {
644 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
645 ctrl &= ~APBTMR_CONTROL_ENABLE;
646 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
647 }
648}
649
650/* called before apb_timer_enable, use early map */ 380/* called before apb_timer_enable, use early map */
651unsigned long apbt_quick_calibrate() 381unsigned long apbt_quick_calibrate(void)
652{ 382{
653 int i, scale; 383 int i, scale;
654 u64 old, new; 384 u64 old, new;
@@ -657,31 +387,31 @@ unsigned long apbt_quick_calibrate()
657 u32 loop, shift; 387 u32 loop, shift;
658 388
659 apbt_set_mapping(); 389 apbt_set_mapping();
660 apbt_start_counter(phy_cs_timer_id); 390 dw_apb_clocksource_start(clocksource_apbt);
661 391
662 /* check if the timer can count down, otherwise return */ 392 /* check if the timer can count down, otherwise return */
663 old = apbt_read_clocksource(&clocksource_apbt); 393 old = dw_apb_clocksource_read(clocksource_apbt);
664 i = 10000; 394 i = 10000;
665 while (--i) { 395 while (--i) {
666 if (old != apbt_read_clocksource(&clocksource_apbt)) 396 if (old != dw_apb_clocksource_read(clocksource_apbt))
667 break; 397 break;
668 } 398 }
669 if (!i) 399 if (!i)
670 goto failed; 400 goto failed;
671 401
672 /* count 16 ms */ 402 /* count 16 ms */
673 loop = (apbt_freq * 1000) << 4; 403 loop = (apbt_freq / 1000) << 4;
674 404
675 /* restart the timer to ensure it won't get to 0 in the calibration */ 405 /* restart the timer to ensure it won't get to 0 in the calibration */
676 apbt_start_counter(phy_cs_timer_id); 406 dw_apb_clocksource_start(clocksource_apbt);
677 407
678 old = apbt_read_clocksource(&clocksource_apbt); 408 old = dw_apb_clocksource_read(clocksource_apbt);
679 old += loop; 409 old += loop;
680 410
681 t1 = __native_read_tsc(); 411 t1 = __native_read_tsc();
682 412
683 do { 413 do {
684 new = apbt_read_clocksource(&clocksource_apbt); 414 new = dw_apb_clocksource_read(clocksource_apbt);
685 } while (new < old); 415 } while (new < old);
686 416
687 t2 = __native_read_tsc(); 417 t2 = __native_read_tsc();
@@ -693,7 +423,7 @@ unsigned long apbt_quick_calibrate()
693 return 0; 423 return 0;
694 } 424 }
695 scale = (int)div_u64((t2 - t1), loop >> shift); 425 scale = (int)div_u64((t2 - t1), loop >> shift);
696 khz = (scale * apbt_freq * 1000) >> shift; 426 khz = (scale * (apbt_freq / 1000)) >> shift;
697 printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); 427 printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
698 return khz; 428 return khz;
699failed: 429failed:
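The hunks above convert apb_timer.c from open-coded register accessors to the shared dw_apb_timer driver. A minimal sketch of the resulting registration sequence, using only the dw_apb_* calls visible in the diff; the base address, IRQ and frequency are assumed to come from firmware (SFI) as in apbt_set_mapping(), and the ratings and names are illustrative:

/* Sketch only, under the assumptions stated above. */
#include <linux/dw_apb_timer.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/smp.h>

static void __init example_apbt_register(void __iomem *base, int irq,
					 unsigned long freq)
{
	struct dw_apb_clocksource *cs;
	struct dw_apb_clock_event_device *ced;

	/* One free-running timer channel becomes the clocksource. */
	cs = dw_apb_clocksource_init(250, "apbt-cs", base, freq);
	dw_apb_clocksource_start(cs);
	dw_apb_clocksource_register(cs);

	/* Another channel drives a (per-CPU) clock event device. */
	ced = dw_apb_clockevent_init(smp_processor_id(), "apbt-evt",
				     110, base, irq, freq);
	dw_apb_clockevent_register(ced);
}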
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b9338b8cf420..b24be38c8cf8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -27,6 +27,7 @@
27#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
28#include <linux/delay.h> 28#include <linux/delay.h>
29#include <linux/timex.h> 29#include <linux/timex.h>
30#include <linux/i8253.h>
30#include <linux/dmar.h> 31#include <linux/dmar.h>
31#include <linux/init.h> 32#include <linux/init.h>
32#include <linux/cpu.h> 33#include <linux/cpu.h>
@@ -39,7 +40,6 @@
39#include <asm/pgalloc.h> 40#include <asm/pgalloc.h>
40#include <asm/atomic.h> 41#include <asm/atomic.h>
41#include <asm/mpspec.h> 42#include <asm/mpspec.h>
42#include <asm/i8253.h>
43#include <asm/i8259.h> 43#include <asm/i8259.h>
44#include <asm/proto.h> 44#include <asm/proto.h>
45#include <asm/apic.h> 45#include <asm/apic.h>
@@ -48,6 +48,7 @@
48#include <asm/hpet.h> 48#include <asm/hpet.h>
49#include <asm/idle.h> 49#include <asm/idle.h>
50#include <asm/mtrr.h> 50#include <asm/mtrr.h>
51#include <asm/time.h>
51#include <asm/smp.h> 52#include <asm/smp.h>
52#include <asm/mce.h> 53#include <asm/mce.h>
53#include <asm/tsc.h> 54#include <asm/tsc.h>
@@ -1429,7 +1430,7 @@ void enable_x2apic(void)
1429 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1430 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1430 if (!(msr & X2APIC_ENABLE)) { 1431 if (!(msr & X2APIC_ENABLE)) {
1431 printk_once(KERN_INFO "Enabling x2apic\n"); 1432 printk_once(KERN_INFO "Enabling x2apic\n");
1432 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1433 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
1433 } 1434 }
1434} 1435}
1435#endif /* CONFIG_X86_X2APIC */ 1436#endif /* CONFIG_X86_X2APIC */
@@ -1943,10 +1944,28 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1943 1944
1944void __cpuinit generic_processor_info(int apicid, int version) 1945void __cpuinit generic_processor_info(int apicid, int version)
1945{ 1946{
1946 int cpu; 1947 int cpu, max = nr_cpu_ids;
1948 bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
1949 phys_cpu_present_map);
1950
1951 /*
1952 * If boot cpu has not been detected yet, then only allow upto
1953 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
1954 */
1955 if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 &&
1956 apicid != boot_cpu_physical_apicid) {
1957 int thiscpu = max + disabled_cpus - 1;
1958
1959 pr_warning(
1960 "ACPI: NR_CPUS/possible_cpus limit of %i almost"
1961 " reached. Keeping one slot for boot cpu."
1962 " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
1963
1964 disabled_cpus++;
1965 return;
1966 }
1947 1967
1948 if (num_processors >= nr_cpu_ids) { 1968 if (num_processors >= nr_cpu_ids) {
1949 int max = nr_cpu_ids;
1950 int thiscpu = max + disabled_cpus; 1969 int thiscpu = max + disabled_cpus;
1951 1970
1952 pr_warning( 1971 pr_warning(
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e5293394b548..8eb863e27ea6 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1295,6 +1295,16 @@ static int setup_ioapic_entry(int apic_id, int irq,
1295 * irq handler will do the explicit EOI to the io-apic. 1295 * irq handler will do the explicit EOI to the io-apic.
1296 */ 1296 */
1297 ir_entry->vector = pin; 1297 ir_entry->vector = pin;
1298
1299 apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
1300 "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
1301 "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
1302 "Avail:%X Vector:%02X Dest:%08X "
1303 "SID:%04X SQ:%X SVT:%X)\n",
1304 apic_id, irte.present, irte.fpd, irte.dst_mode,
1305 irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
1306 irte.avail, irte.vector, irte.dest_id,
1307 irte.sid, irte.sq, irte.svt);
1298 } else { 1308 } else {
1299 entry->delivery_mode = apic->irq_delivery_mode; 1309 entry->delivery_mode = apic->irq_delivery_mode;
1300 entry->dest_mode = apic->irq_dest_mode; 1310 entry->dest_mode = apic->irq_dest_mode;
@@ -1337,9 +1347,9 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1337 1347
1338 apic_printk(APIC_VERBOSE,KERN_DEBUG 1348 apic_printk(APIC_VERBOSE,KERN_DEBUG
1339 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1349 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1340 "IRQ %d Mode:%i Active:%i)\n", 1350 "IRQ %d Mode:%i Active:%i Dest:%d)\n",
1341 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector, 1351 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
1342 irq, trigger, polarity); 1352 irq, trigger, polarity, dest);
1343 1353
1344 1354
1345 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry, 1355 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
@@ -1522,10 +1532,12 @@ __apicdebuginit(void) print_IO_APIC(void)
1522 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); 1532 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1523 1533
1524 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01); 1534 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
1525 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); 1535 printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
1536 reg_01.bits.entries);
1526 1537
1527 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); 1538 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1528 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); 1539 printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
1540 reg_01.bits.version);
1529 1541
1530 /* 1542 /*
1531 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, 1543 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1550,31 +1562,60 @@ __apicdebuginit(void) print_IO_APIC(void)
1550 1562
1551 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1563 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1552 1564
1553 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1565 if (intr_remapping_enabled) {
1554 " Stat Dmod Deli Vect:\n"); 1566 printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
1567 " Pol Stat Indx2 Zero Vect:\n");
1568 } else {
1569 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
1570 " Stat Dmod Deli Vect:\n");
1571 }
1555 1572
1556 for (i = 0; i <= reg_01.bits.entries; i++) { 1573 for (i = 0; i <= reg_01.bits.entries; i++) {
1557 struct IO_APIC_route_entry entry; 1574 if (intr_remapping_enabled) {
1558 1575 struct IO_APIC_route_entry entry;
1559 entry = ioapic_read_entry(apic, i); 1576 struct IR_IO_APIC_route_entry *ir_entry;
1560 1577
1561 printk(KERN_DEBUG " %02x %03X ", 1578 entry = ioapic_read_entry(apic, i);
1562 i, 1579 ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
1563 entry.dest 1580 printk(KERN_DEBUG " %02x %04X ",
1564 ); 1581 i,
1582 ir_entry->index
1583 );
1584 printk("%1d %1d %1d %1d %1d "
1585 "%1d %1d %X %02X\n",
1586 ir_entry->format,
1587 ir_entry->mask,
1588 ir_entry->trigger,
1589 ir_entry->irr,
1590 ir_entry->polarity,
1591 ir_entry->delivery_status,
1592 ir_entry->index2,
1593 ir_entry->zero,
1594 ir_entry->vector
1595 );
1596 } else {
1597 struct IO_APIC_route_entry entry;
1565 1598
1566 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", 1599 entry = ioapic_read_entry(apic, i);
1567 entry.mask, 1600 printk(KERN_DEBUG " %02x %02X ",
1568 entry.trigger, 1601 i,
1569 entry.irr, 1602 entry.dest
1570 entry.polarity, 1603 );
1571 entry.delivery_status, 1604 printk("%1d %1d %1d %1d %1d "
1572 entry.dest_mode, 1605 "%1d %1d %02X\n",
1573 entry.delivery_mode, 1606 entry.mask,
1574 entry.vector 1607 entry.trigger,
1575 ); 1608 entry.irr,
1609 entry.polarity,
1610 entry.delivery_status,
1611 entry.dest_mode,
1612 entry.delivery_mode,
1613 entry.vector
1614 );
1615 }
1576 } 1616 }
1577 } 1617 }
1618
1578 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1619 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1579 for_each_active_irq(irq) { 1620 for_each_active_irq(irq) {
1580 struct irq_pin_list *entry; 1621 struct irq_pin_list *entry;
@@ -1792,7 +1833,7 @@ __apicdebuginit(int) print_ICs(void)
1792 return 0; 1833 return 0;
1793} 1834}
1794 1835
1795fs_initcall(print_ICs); 1836late_initcall(print_ICs);
1796 1837
1797 1838
1798/* Where if anywhere is the i8259 connect in external int mode */ 1839/* Where if anywhere is the i8259 connect in external int mode */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 965a7666c283..0371c484bb8a 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -229,11 +229,11 @@
229#include <linux/jiffies.h> 229#include <linux/jiffies.h>
230#include <linux/acpi.h> 230#include <linux/acpi.h>
231#include <linux/syscore_ops.h> 231#include <linux/syscore_ops.h>
232#include <linux/i8253.h>
232 233
233#include <asm/system.h> 234#include <asm/system.h>
234#include <asm/uaccess.h> 235#include <asm/uaccess.h>
235#include <asm/desc.h> 236#include <asm/desc.h>
236#include <asm/i8253.h>
237#include <asm/olpc.h> 237#include <asm/olpc.h>
238#include <asm/paravirt.h> 238#include <asm/paravirt.h>
239#include <asm/reboot.h> 239#include <asm/reboot.h>
@@ -1220,11 +1220,11 @@ static void reinit_timer(void)
1220 1220
1221 raw_spin_lock_irqsave(&i8253_lock, flags); 1221 raw_spin_lock_irqsave(&i8253_lock, flags);
1222 /* set the clock to HZ */ 1222 /* set the clock to HZ */
1223 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 1223 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1224 udelay(10); 1224 udelay(10);
1225 outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */ 1225 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
1226 udelay(10); 1226 udelay(10);
1227 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ 1227 outb_p(LATCH >> 8, PIT_CH0); /* MSB */
1228 udelay(10); 1228 udelay(10);
1229 raw_spin_unlock_irqrestore(&i8253_lock, flags); 1229 raw_spin_unlock_irqrestore(&i8253_lock, flags);
1230#endif 1230#endif
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c29d631af6fc..395a10e68067 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -63,7 +63,6 @@ void foo(void)
63 BLANK(); 63 BLANK();
64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
65 OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); 65 OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
66 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
67 66
68 BLANK(); 67 BLANK();
69 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 68 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 8095f8611f8a..755f64fb0743 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -32,11 +32,11 @@
32 */ 32 */
33static const __initconst struct hypervisor_x86 * const hypervisors[] = 33static const __initconst struct hypervisor_x86 * const hypervisors[] =
34{ 34{
35 &x86_hyper_vmware,
36 &x86_hyper_ms_hyperv,
37#ifdef CONFIG_XEN_PVHVM 35#ifdef CONFIG_XEN_PVHVM
38 &x86_hyper_xen_hvm, 36 &x86_hyper_xen_hvm,
39#endif 37#endif
38 &x86_hyper_vmware,
39 &x86_hyper_ms_hyperv,
40}; 40};
41 41
42const struct hypervisor_x86 *x86_hyper; 42const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1edf5ba4fb2b..ed6086eedf1d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -456,6 +456,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
456 456
457 if (cpu_has(c, X86_FEATURE_VMX)) 457 if (cpu_has(c, X86_FEATURE_VMX))
458 detect_vmx_virtcap(c); 458 detect_vmx_virtcap(c);
459
460 /*
461 * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
462 * x86_energy_perf_policy(8) is available to change it at run-time
463 */
464 if (cpu_has(c, X86_FEATURE_EPB)) {
465 u64 epb;
466
467 rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
468 if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
469 printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
470 " Set to 'normal', was 'performance'\n"
471 "ENERGY_PERF_BIAS: View and update with"
472 " x86_energy_perf_policy(8)\n");
473 epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
474 wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
475 }
476 }
459} 477}
460 478
461#ifdef CONFIG_X86_32 479#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1e8d66c1336a..7395d5f4272d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -43,61 +43,105 @@ static struct severity {
43 unsigned char covered; 43 unsigned char covered;
44 char *msg; 44 char *msg;
45} severities[] = { 45} severities[] = {
46#define KERNEL .context = IN_KERNEL 46#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
47#define USER .context = IN_USER 47#define KERNEL .context = IN_KERNEL
48#define SER .ser = SER_REQUIRED 48#define USER .context = IN_USER
49#define NOSER .ser = NO_SER 49#define SER .ser = SER_REQUIRED
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY 50#define NOSER .ser = NO_SER
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } 51#define BITCLR(x) .mask = x, .result = 0
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } 52#define BITSET(x) .mask = x, .result = x
53#define MCGMASK(x, res, s, m, r...) \ 53#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } 54#define MASK(x, y) .mask = x, .result = y
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) 55#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) 56#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff 57#define MCACOD 0xffff
60 58
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"), 59 MCESEV(
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"), 60 NO, "Invalid",
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), 61 BITCLR(MCI_STATUS_VAL)
62 ),
63 MCESEV(
64 NO, "Not enabled",
65 BITCLR(MCI_STATUS_EN)
66 ),
67 MCESEV(
68 PANIC, "Processor context corrupt",
69 BITSET(MCI_STATUS_PCC)
70 ),
64 /* When MCIP is not set something is very confused */ 71 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), 72 MCESEV(
73 PANIC, "MCIP not set in MCA handler",
74 MCGMASK(MCG_STATUS_MCIP, 0)
75 ),
66 /* Neither return not error IP -- no chance to recover -> PANIC */ 76 /* Neither return not error IP -- no chance to recover -> PANIC */
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, 77 MCESEV(
68 "Neither restart nor error IP"), 78 PANIC, "Neither restart nor error IP",
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", 79 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
70 KERNEL), 80 ),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), 81 MCESEV(
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, 82 PANIC, "In kernel and no restart IP",
73 "Spurious not enabled", SER), 83 KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
84 ),
85 MCESEV(
86 KEEP, "Corrected error",
87 NOSER, BITCLR(MCI_STATUS_UC)
88 ),
74 89
75 /* ignore OVER for UCNA */ 90 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, 91 MCESEV(
77 "Uncorrected no action required", SER), 92 KEEP, "Uncorrected no action required",
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, 93 SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
79 "Illegal combination (UCNA with AR=1)", SER), 94 ),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), 95 MCESEV(
96 PANIC, "Illegal combination (UCNA with AR=1)",
97 SER,
98 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
99 ),
100 MCESEV(
101 KEEP, "Non signalled machine check",
102 SER, BITCLR(MCI_STATUS_S)
103 ),
81 104
82 /* AR add known MCACODs here */ 105 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, 106 MCESEV(
84 "Action required with lost events", SER), 107 PANIC, "Action required with lost events",
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, 108 SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
86 "Action required; unknown MCACOD", SER), 109 ),
110 MCESEV(
111 PANIC, "Action required: unknown MCACOD",
112 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
113 ),
87 114
88 /* known AO MCACODs: */ 115 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, 116 MCESEV(
90 "Action optional: memory scrubbing error", SER), 117 AO, "Action optional: memory scrubbing error",
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, 118 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
92 "Action optional: last level cache writeback error", SER), 119 ),
93 120 MCESEV(
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, 121 AO, "Action optional: last level cache writeback error",
95 "Action optional unknown MCACOD", SER), 122 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, 123 ),
97 "Action optional with lost events", SER), 124 MCESEV(
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), 125 SOME, "Action optional: unknown MCACOD",
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"), 126 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
100 BITSET(0, SOME, "No match") /* always matches. keep at end */ 127 ),
128 MCESEV(
129 SOME, "Action optional with lost events",
130 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
131 ),
132
133 MCESEV(
134 PANIC, "Overflowed uncorrected",
135 BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
136 ),
137 MCESEV(
138 UC, "Uncorrected",
139 BITSET(MCI_STATUS_UC)
140 ),
141 MCESEV(
142 SOME, "No match",
143 BITSET(0)
144 ) /* always matches. keep at end */
101}; 145};
102 146
103/* 147/*
@@ -112,15 +156,15 @@ static int error_context(struct mce *m)
112 return IN_KERNEL; 156 return IN_KERNEL;
113} 157}
114 158
115int mce_severity(struct mce *a, int tolerant, char **msg) 159int mce_severity(struct mce *m, int tolerant, char **msg)
116{ 160{
117 enum context ctx = error_context(a); 161 enum context ctx = error_context(m);
118 struct severity *s; 162 struct severity *s;
119 163
120 for (s = severities;; s++) { 164 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result) 165 if ((m->status & s->mask) != s->result)
122 continue; 166 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres) 167 if ((m->mcgstatus & s->mcgmask) != s->mcgres)
124 continue; 168 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser) 169 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue; 170 continue;
@@ -197,15 +241,15 @@ static const struct file_operations severities_coverage_fops = {
197 241
198static int __init severities_debugfs_init(void) 242static int __init severities_debugfs_init(void)
199{ 243{
200 struct dentry *dmce = NULL, *fseverities_coverage = NULL; 244 struct dentry *dmce, *fsev;
201 245
202 dmce = mce_get_debugfs_dir(); 246 dmce = mce_get_debugfs_dir();
203 if (dmce == NULL) 247 if (!dmce)
204 goto err_out; 248 goto err_out;
205 fseverities_coverage = debugfs_create_file("severities-coverage", 249
206 0444, dmce, NULL, 250 fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
207 &severities_coverage_fops); 251 &severities_coverage_fops);
208 if (fseverities_coverage == NULL) 252 if (!fsev)
209 goto err_out; 253 goto err_out;
210 254
211 return 0; 255 return 0;
@@ -214,4 +258,4 @@ err_out:
214 return -ENOMEM; 258 return -ENOMEM;
215} 259}
216late_initcall(severities_debugfs_init); 260late_initcall(severities_debugfs_init);
217#endif 261#endif /* CONFIG_DEBUG_FS */
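
An aside on the table-driven matching that both the old MASK()/MCGMASK() entries and the new MCESEV() entries express: an entry matches when (status & mask) == result, subject to the optional MCG_STATUS, kernel-context and SER_REQUIRED conditions, and the first matching entry wins, which is why the catch-all BITSET(0) entry has to stay at the end. A minimal userspace sketch of the idea, with simplified, illustrative field names and bit values rather than the kernel's:

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified, illustrative rule table: not the kernel's field layout. */
    /* Bit 62 stands in for "overflow", bit 61 for "uncorrected".          */
    #define ST_OVER (1ULL << 62)
    #define ST_UC   (1ULL << 61)

    enum severity { SEV_SOME, SEV_UC, SEV_PANIC };

    struct sev_rule {
        uint64_t mask;      /* bits of the status word to inspect */
        uint64_t result;    /* value those bits must have         */
        enum severity sev;
        const char *msg;
    };

    static const struct sev_rule rules[] = {
        { ST_OVER | ST_UC, ST_OVER | ST_UC, SEV_PANIC, "overflowed uncorrected" },
        { ST_UC,           ST_UC,           SEV_UC,    "uncorrected"            },
        { 0,               0,               SEV_SOME,  "no match" }, /* catch-all: keep last */
    };

    static enum severity classify(uint64_t status, const char **msg)
    {
        /* First match wins; the catch-all entry guarantees termination. */
        for (int i = 0; ; i++) {
            if ((status & rules[i].mask) != rules[i].result)
                continue;
            *msg = rules[i].msg;
            return rules[i].sev;
        }
    }

    int main(void)
    {
        const char *msg;
        enum severity sev = classify(ST_UC, &msg);

        printf("severity %d: %s\n", sev, msg);
        return 0;
    }

The MCESEV() rewrite above changes only how such entries are written down (severity and message first, conditions after), not this matching model.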
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ff1ae9b6464d..08363b042122 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -10,7 +10,6 @@
10#include <linux/thread_info.h> 10#include <linux/thread_info.h>
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/miscdevice.h> 12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h> 13#include <linux/ratelimit.h>
15#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
16#include <linux/rcupdate.h> 15#include <linux/rcupdate.h>
@@ -38,23 +37,20 @@
38#include <linux/mm.h> 37#include <linux/mm.h>
39#include <linux/debugfs.h> 38#include <linux/debugfs.h>
40#include <linux/edac_mce.h> 39#include <linux/edac_mce.h>
40#include <linux/irq_work.h>
41 41
42#include <asm/processor.h> 42#include <asm/processor.h>
43#include <asm/hw_irq.h>
44#include <asm/apic.h>
45#include <asm/idle.h>
46#include <asm/ipi.h>
47#include <asm/mce.h> 43#include <asm/mce.h>
48#include <asm/msr.h> 44#include <asm/msr.h>
49 45
50#include "mce-internal.h" 46#include "mce-internal.h"
51 47
52static DEFINE_MUTEX(mce_read_mutex); 48static DEFINE_MUTEX(mce_chrdev_read_mutex);
53 49
54#define rcu_dereference_check_mce(p) \ 50#define rcu_dereference_check_mce(p) \
55 rcu_dereference_index_check((p), \ 51 rcu_dereference_index_check((p), \
56 rcu_read_lock_sched_held() || \ 52 rcu_read_lock_sched_held() || \
57 lockdep_is_held(&mce_read_mutex)) 53 lockdep_is_held(&mce_chrdev_read_mutex))
58 54
59#define CREATE_TRACE_POINTS 55#define CREATE_TRACE_POINTS
60#include <trace/events/mce.h> 56#include <trace/events/mce.h>
@@ -94,7 +90,8 @@ static unsigned long mce_need_notify;
94static char mce_helper[128]; 90static char mce_helper[128];
95static char *mce_helper_argv[2] = { mce_helper, NULL }; 91static char *mce_helper_argv[2] = { mce_helper, NULL };
96 92
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 93static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
94
98static DEFINE_PER_CPU(struct mce, mces_seen); 95static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing; 96static int cpu_missing;
100 97
@@ -373,6 +370,31 @@ static void mce_wrmsrl(u32 msr, u64 v)
373} 370}
374 371
375/* 372/*
373 * Collect all global (w.r.t. this processor) status about this machine
374 * check into our "mce" struct so that we can use it later to assess
375 * the severity of the problem as we read per-bank specific details.
376 */
377static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
378{
379 mce_setup(m);
380
381 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
382 if (regs) {
383 /*
384 * Get the address of the instruction at the time of
385 * the machine check error.
386 */
387 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
388 m->ip = regs->ip;
389 m->cs = regs->cs;
390 }
391 /* Use accurate RIP reporting if available. */
392 if (rip_msr)
393 m->ip = mce_rdmsrl(rip_msr);
394 }
395}
396
397/*
376 * Simple lockless ring to communicate PFNs from the exception handler with the 398 * Simple lockless ring to communicate PFNs from the exception handler with the
377 * process context work function. This is vastly simplified because there's 399 * process context work function. This is vastly simplified because there's
378 * only a single reader and a single writer. 400 * only a single reader and a single writer.
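
The "simple lockless ring" referred to in that comment works precisely because there is exactly one producer (the exception handler) and one consumer (the process-context work function), so each index is only ever written by one side. A minimal single-producer/single-consumer ring in plain C11, with illustrative names and sizes and none of the kernel's NMI-nesting concerns, to show why no lock is needed:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define RING_SIZE 16    /* power of two, illustrative */

    struct pfn_ring {
        uint64_t slots[RING_SIZE];
        _Atomic unsigned head;  /* written only by the producer */
        _Atomic unsigned tail;  /* written only by the consumer */
    };

    /* Producer side: returns false when the ring is full (event dropped). */
    static bool ring_add(struct pfn_ring *r, uint64_t pfn)
    {
        unsigned head = atomic_load_explicit(&r->head, memory_order_relaxed);
        unsigned tail = atomic_load_explicit(&r->tail, memory_order_acquire);

        if (head - tail == RING_SIZE)
            return false;
        r->slots[head % RING_SIZE] = pfn;
        /* publish the slot contents before moving head forward */
        atomic_store_explicit(&r->head, head + 1, memory_order_release);
        return true;
    }

    /* Consumer side: returns false when the ring is empty. */
    static bool ring_get(struct pfn_ring *r, uint64_t *pfn)
    {
        unsigned tail = atomic_load_explicit(&r->tail, memory_order_relaxed);
        unsigned head = atomic_load_explicit(&r->head, memory_order_acquire);

        if (tail == head)
            return false;
        *pfn = r->slots[tail % RING_SIZE];
        atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
        return true;
    }

    static struct pfn_ring ring;

    int main(void)
    {
        uint64_t pfn = 0;

        ring_add(&ring, 0x1234);
        return (ring_get(&ring, &pfn) && pfn == 0x1234) ? 0 : 1;
    }

The free-running head/tail indices (reduced modulo the power-of-two size only on access) are a common way to distinguish "full" from "empty" without wasting a slot.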
@@ -443,40 +465,13 @@ static void mce_schedule_work(void)
443 } 465 }
444} 466}
445 467
446/* 468DEFINE_PER_CPU(struct irq_work, mce_irq_work);
447 * Get the address of the instruction at the time of the machine check
448 * error.
449 */
450static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
451{
452
453 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
454 m->ip = regs->ip;
455 m->cs = regs->cs;
456 } else {
457 m->ip = 0;
458 m->cs = 0;
459 }
460 if (rip_msr)
461 m->ip = mce_rdmsrl(rip_msr);
462}
463 469
464#ifdef CONFIG_X86_LOCAL_APIC 470static void mce_irq_work_cb(struct irq_work *entry)
465/*
466 * Called after interrupts have been reenabled again
467 * when a MCE happened during an interrupts off region
468 * in the kernel.
469 */
470asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
471{ 471{
472 ack_APIC_irq();
473 exit_idle();
474 irq_enter();
475 mce_notify_irq(); 472 mce_notify_irq();
476 mce_schedule_work(); 473 mce_schedule_work();
477 irq_exit();
478} 474}
479#endif
480 475
481static void mce_report_event(struct pt_regs *regs) 476static void mce_report_event(struct pt_regs *regs)
482{ 477{
@@ -492,29 +487,7 @@ static void mce_report_event(struct pt_regs *regs)
492 return; 487 return;
493 } 488 }
494 489
495#ifdef CONFIG_X86_LOCAL_APIC 490 irq_work_queue(&__get_cpu_var(mce_irq_work));
496 /*
497 * Without APIC do not notify. The event will be picked
498 * up eventually.
499 */
500 if (!cpu_has_apic)
501 return;
502
503 /*
504 * When interrupts are disabled we cannot use
505 * kernel services safely. Trigger an self interrupt
506 * through the APIC to instead do the notification
507 * after interrupts are reenabled again.
508 */
509 apic->send_IPI_self(MCE_SELF_VECTOR);
510
511 /*
512 * Wait for idle afterwards again so that we don't leave the
513 * APIC in a non idle state because the normal APIC writes
514 * cannot exclude us.
515 */
516 apic_wait_icr_idle();
517#endif
518} 491}
519 492
520DEFINE_PER_CPU(unsigned, mce_poll_count); 493DEFINE_PER_CPU(unsigned, mce_poll_count);
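
The switch from the hand-rolled self-IPI to irq_work above boils down to one idea: from a context where the real work cannot be done safely, just record that work is pending and let a safe context pick it up later. A rough userspace analogue of that deferral pattern only (not the kernel's irq_work API), using a signal handler as the "unsafe" context and the main loop as the "safe" one; all names here are made up for the sketch:

    #include <signal.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static atomic_bool work_pending;

    /* "Unsafe" context: only note that something needs doing. */
    static void on_alarm(int sig)
    {
        (void)sig;
        atomic_store(&work_pending, true);
    }

    /* "Safe" context: drain the pending flag and do the real work. */
    static void do_pending_work(void)
    {
        if (atomic_exchange(&work_pending, false))
            printf("handling deferred event\n");
    }

    int main(void)
    {
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = on_alarm;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGALRM, &sa, NULL);
        alarm(1);

        for (int i = 0; i < 3; i++) {
            sleep(1);           /* stand-in for "until the next safe point" */
            do_pending_work();
        }
        return 0;
    }

The test-and-clear in do_pending_work() plays the same role as the test_and_clear_bit() in mce_notify_irq(): the flag can be set any number of times before the safe context runs, but the work is done once per drain.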
@@ -541,9 +514,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
541 514
542 percpu_inc(mce_poll_count); 515 percpu_inc(mce_poll_count);
543 516
544 mce_setup(&m); 517 mce_gather_info(&m, NULL);
545 518
546 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
547 for (i = 0; i < banks; i++) { 519 for (i = 0; i < banks; i++) {
548 if (!mce_banks[i].ctl || !test_bit(i, *b)) 520 if (!mce_banks[i].ctl || !test_bit(i, *b))
549 continue; 521 continue;
@@ -879,9 +851,9 @@ static int mce_usable_address(struct mce *m)
879{ 851{
880 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) 852 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
881 return 0; 853 return 0;
882 if ((m->misc & 0x3f) > PAGE_SHIFT) 854 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
883 return 0; 855 return 0;
884 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) 856 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
885 return 0; 857 return 0;
886 return 1; 858 return 1;
887} 859}
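
The mce_usable_address() change above replaces open-coded shifts with named accessors; from the removed lines, the address LSB lives in bits 0-5 of MCi_MISC and the address mode in bits 6-8. A sketch of equivalent extraction helpers, where the surrounding test program and the exact value used for "physical address" are assumptions for illustration, not taken from the kernel headers:

    #include <stdint.h>
    #include <stdio.h>

    /* Field layout taken from the open-coded version being removed:     */
    /*   bits 0-5: least significant valid bit of the recorded address   */
    /*   bits 6-8: address mode                                          */
    #define ADDR_LSB(misc)   ((misc) & 0x3f)
    #define ADDR_MODE(misc)  (((misc) >> 6) & 0x7)
    #define ADDR_MODE_PHYS   2   /* assumed value, standing in for MCI_MISC_ADDR_PHYS */

    #define PAGE_SHIFT       12

    static int usable_address(uint64_t misc)
    {
        if (ADDR_LSB(misc) > PAGE_SHIFT)
            return 0;           /* address too imprecise to name a page */
        if (ADDR_MODE(misc) != ADDR_MODE_PHYS)
            return 0;           /* only physical addresses are usable here */
        return 1;
    }

    int main(void)
    {
        uint64_t misc = ((uint64_t)ADDR_MODE_PHYS << 6) | 6;  /* physical, valid down to bit 6 */

        printf("usable: %d\n", usable_address(misc));
        return 0;
    }

Naming the fields makes the two checks read as policy ("is the address precise enough, and is it physical?") rather than as magic masks.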
@@ -942,9 +914,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
942 if (!banks) 914 if (!banks)
943 goto out; 915 goto out;
944 916
945 mce_setup(&m); 917 mce_gather_info(&m, regs);
946 918
947 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
948 final = &__get_cpu_var(mces_seen); 919 final = &__get_cpu_var(mces_seen);
949 *final = m; 920 *final = m;
950 921
@@ -1028,7 +999,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1028 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 999 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1029 mce_ring_add(m.addr >> PAGE_SHIFT); 1000 mce_ring_add(m.addr >> PAGE_SHIFT);
1030 1001
1031 mce_get_rip(&m, regs);
1032 mce_log(&m); 1002 mce_log(&m);
1033 1003
1034 if (severity > worst) { 1004 if (severity > worst) {
@@ -1190,7 +1160,8 @@ int mce_notify_irq(void)
1190 clear_thread_flag(TIF_MCE_NOTIFY); 1160 clear_thread_flag(TIF_MCE_NOTIFY);
1191 1161
1192 if (test_and_clear_bit(0, &mce_need_notify)) { 1162 if (test_and_clear_bit(0, &mce_need_notify)) {
1193 wake_up_interruptible(&mce_wait); 1163 /* wake processes polling /dev/mcelog */
1164 wake_up_interruptible(&mce_chrdev_wait);
1194 1165
1195 /* 1166 /*
1196 * There is no risk of missing notifications because 1167 * There is no risk of missing notifications because
@@ -1363,18 +1334,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1363 return 0; 1334 return 0;
1364} 1335}
1365 1336
1366static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) 1337static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1367{ 1338{
1368 if (c->x86 != 5) 1339 if (c->x86 != 5)
1369 return; 1340 return 0;
1341
1370 switch (c->x86_vendor) { 1342 switch (c->x86_vendor) {
1371 case X86_VENDOR_INTEL: 1343 case X86_VENDOR_INTEL:
1372 intel_p5_mcheck_init(c); 1344 intel_p5_mcheck_init(c);
1345 return 1;
1373 break; 1346 break;
1374 case X86_VENDOR_CENTAUR: 1347 case X86_VENDOR_CENTAUR:
1375 winchip_mcheck_init(c); 1348 winchip_mcheck_init(c);
1349 return 1;
1376 break; 1350 break;
1377 } 1351 }
1352
1353 return 0;
1378} 1354}
1379 1355
1380static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) 1356static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
@@ -1428,7 +1404,8 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1428 if (mce_disabled) 1404 if (mce_disabled)
1429 return; 1405 return;
1430 1406
1431 __mcheck_cpu_ancient_init(c); 1407 if (__mcheck_cpu_ancient_init(c))
1408 return;
1432 1409
1433 if (!mce_available(c)) 1410 if (!mce_available(c))
1434 return; 1411 return;
@@ -1444,44 +1421,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1444 __mcheck_cpu_init_vendor(c); 1421 __mcheck_cpu_init_vendor(c);
1445 __mcheck_cpu_init_timer(); 1422 __mcheck_cpu_init_timer();
1446 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1423 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1447 1424 init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
1448} 1425}
1449 1426
1450/* 1427/*
1451 * Character device to read and clear the MCE log. 1428 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1452 */ 1429 */
1453 1430
1454static DEFINE_SPINLOCK(mce_state_lock); 1431static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1455static int open_count; /* #times opened */ 1432static int mce_chrdev_open_count; /* #times opened */
1456static int open_exclu; /* already open exclusive? */ 1433static int mce_chrdev_open_exclu; /* already open exclusive? */
1457 1434
1458static int mce_open(struct inode *inode, struct file *file) 1435static int mce_chrdev_open(struct inode *inode, struct file *file)
1459{ 1436{
1460 spin_lock(&mce_state_lock); 1437 spin_lock(&mce_chrdev_state_lock);
1461 1438
1462 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { 1439 if (mce_chrdev_open_exclu ||
1463 spin_unlock(&mce_state_lock); 1440 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1441 spin_unlock(&mce_chrdev_state_lock);
1464 1442
1465 return -EBUSY; 1443 return -EBUSY;
1466 } 1444 }
1467 1445
1468 if (file->f_flags & O_EXCL) 1446 if (file->f_flags & O_EXCL)
1469 open_exclu = 1; 1447 mce_chrdev_open_exclu = 1;
1470 open_count++; 1448 mce_chrdev_open_count++;
1471 1449
1472 spin_unlock(&mce_state_lock); 1450 spin_unlock(&mce_chrdev_state_lock);
1473 1451
1474 return nonseekable_open(inode, file); 1452 return nonseekable_open(inode, file);
1475} 1453}
1476 1454
1477static int mce_release(struct inode *inode, struct file *file) 1455static int mce_chrdev_release(struct inode *inode, struct file *file)
1478{ 1456{
1479 spin_lock(&mce_state_lock); 1457 spin_lock(&mce_chrdev_state_lock);
1480 1458
1481 open_count--; 1459 mce_chrdev_open_count--;
1482 open_exclu = 0; 1460 mce_chrdev_open_exclu = 0;
1483 1461
1484 spin_unlock(&mce_state_lock); 1462 spin_unlock(&mce_chrdev_state_lock);
1485 1463
1486 return 0; 1464 return 0;
1487} 1465}
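
The renamed open/release pair above implements a small policy worth spelling out: any number of plain opens may coexist, but an O_EXCL open only succeeds when nobody else has the device open, and while an exclusive open is active every other open fails with -EBUSY. A userspace sketch of the same accounting with a mutex in place of the spinlock; function and variable names are illustrative:

    #include <errno.h>
    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
    static int  open_count;     /* number of current opens           */
    static bool open_exclu;     /* is one of them an exclusive open? */

    static int log_open(bool exclusive)
    {
        int ret = 0;

        pthread_mutex_lock(&state_lock);
        if (open_exclu || (open_count && exclusive)) {
            /* held exclusively, or we want exclusivity and it is already open */
            ret = -EBUSY;
        } else {
            if (exclusive)
                open_exclu = true;
            open_count++;
        }
        pthread_mutex_unlock(&state_lock);
        return ret;
    }

    static void log_release(void)
    {
        pthread_mutex_lock(&state_lock);
        open_count--;
        open_exclu = false;
        pthread_mutex_unlock(&state_lock);
    }

    int main(void)
    {
        int a = log_open(false);    /* plain open: ok                 */
        int b = log_open(true);     /* exclusive while open: refused  */

        log_release();
        return (a == 0 && b == -EBUSY) ? 0 : 1;
    }

The rename in the diff does not change this logic; it only groups the state and helpers under the mce_chrdev_* prefix.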
@@ -1530,8 +1508,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
1530 return 0; 1508 return 0;
1531} 1509}
1532 1510
1533static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1511static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1534 loff_t *off) 1512 size_t usize, loff_t *off)
1535{ 1513{
1536 char __user *buf = ubuf; 1514 char __user *buf = ubuf;
1537 unsigned long *cpu_tsc; 1515 unsigned long *cpu_tsc;
@@ -1542,7 +1520,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1542 if (!cpu_tsc) 1520 if (!cpu_tsc)
1543 return -ENOMEM; 1521 return -ENOMEM;
1544 1522
1545 mutex_lock(&mce_read_mutex); 1523 mutex_lock(&mce_chrdev_read_mutex);
1546 1524
1547 if (!mce_apei_read_done) { 1525 if (!mce_apei_read_done) {
1548 err = __mce_read_apei(&buf, usize); 1526 err = __mce_read_apei(&buf, usize);
@@ -1562,19 +1540,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1562 do { 1540 do {
1563 for (i = prev; i < next; i++) { 1541 for (i = prev; i < next; i++) {
1564 unsigned long start = jiffies; 1542 unsigned long start = jiffies;
1543 struct mce *m = &mcelog.entry[i];
1565 1544
1566 while (!mcelog.entry[i].finished) { 1545 while (!m->finished) {
1567 if (time_after_eq(jiffies, start + 2)) { 1546 if (time_after_eq(jiffies, start + 2)) {
1568 memset(mcelog.entry + i, 0, 1547 memset(m, 0, sizeof(*m));
1569 sizeof(struct mce));
1570 goto timeout; 1548 goto timeout;
1571 } 1549 }
1572 cpu_relax(); 1550 cpu_relax();
1573 } 1551 }
1574 smp_rmb(); 1552 smp_rmb();
1575 err |= copy_to_user(buf, mcelog.entry + i, 1553 err |= copy_to_user(buf, m, sizeof(*m));
1576 sizeof(struct mce)); 1554 buf += sizeof(*m);
1577 buf += sizeof(struct mce);
1578timeout: 1555timeout:
1579 ; 1556 ;
1580 } 1557 }
@@ -1594,13 +1571,13 @@ timeout:
1594 on_each_cpu(collect_tscs, cpu_tsc, 1); 1571 on_each_cpu(collect_tscs, cpu_tsc, 1);
1595 1572
1596 for (i = next; i < MCE_LOG_LEN; i++) { 1573 for (i = next; i < MCE_LOG_LEN; i++) {
1597 if (mcelog.entry[i].finished && 1574 struct mce *m = &mcelog.entry[i];
1598 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { 1575
1599 err |= copy_to_user(buf, mcelog.entry+i, 1576 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1600 sizeof(struct mce)); 1577 err |= copy_to_user(buf, m, sizeof(*m));
1601 smp_rmb(); 1578 smp_rmb();
1602 buf += sizeof(struct mce); 1579 buf += sizeof(*m);
1603 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 1580 memset(m, 0, sizeof(*m));
1604 } 1581 }
1605 } 1582 }
1606 1583
@@ -1608,15 +1585,15 @@ timeout:
1608 err = -EFAULT; 1585 err = -EFAULT;
1609 1586
1610out: 1587out:
1611 mutex_unlock(&mce_read_mutex); 1588 mutex_unlock(&mce_chrdev_read_mutex);
1612 kfree(cpu_tsc); 1589 kfree(cpu_tsc);
1613 1590
1614 return err ? err : buf - ubuf; 1591 return err ? err : buf - ubuf;
1615} 1592}
1616 1593
1617static unsigned int mce_poll(struct file *file, poll_table *wait) 1594static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1618{ 1595{
1619 poll_wait(file, &mce_wait, wait); 1596 poll_wait(file, &mce_chrdev_wait, wait);
1620 if (rcu_access_index(mcelog.next)) 1597 if (rcu_access_index(mcelog.next))
1621 return POLLIN | POLLRDNORM; 1598 return POLLIN | POLLRDNORM;
1622 if (!mce_apei_read_done && apei_check_mce()) 1599 if (!mce_apei_read_done && apei_check_mce())
@@ -1624,7 +1601,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
1624 return 0; 1601 return 0;
1625} 1602}
1626 1603
1627static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 1604static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1605 unsigned long arg)
1628{ 1606{
1629 int __user *p = (int __user *)arg; 1607 int __user *p = (int __user *)arg;
1630 1608
@@ -1652,16 +1630,16 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1652 1630
1653/* Modified in mce-inject.c, so not static or const */ 1631/* Modified in mce-inject.c, so not static or const */
1654struct file_operations mce_chrdev_ops = { 1632struct file_operations mce_chrdev_ops = {
1655 .open = mce_open, 1633 .open = mce_chrdev_open,
1656 .release = mce_release, 1634 .release = mce_chrdev_release,
1657 .read = mce_read, 1635 .read = mce_chrdev_read,
1658 .poll = mce_poll, 1636 .poll = mce_chrdev_poll,
1659 .unlocked_ioctl = mce_ioctl, 1637 .unlocked_ioctl = mce_chrdev_ioctl,
1660 .llseek = no_llseek, 1638 .llseek = no_llseek,
1661}; 1639};
1662EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1640EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1663 1641
1664static struct miscdevice mce_log_device = { 1642static struct miscdevice mce_chrdev_device = {
1665 MISC_MCELOG_MINOR, 1643 MISC_MCELOG_MINOR,
1666 "mcelog", 1644 "mcelog",
1667 &mce_chrdev_ops, 1645 &mce_chrdev_ops,
@@ -1719,7 +1697,7 @@ int __init mcheck_init(void)
1719} 1697}
1720 1698
1721/* 1699/*
1722 * Sysfs support 1700 * mce_syscore: PM support
1723 */ 1701 */
1724 1702
1725/* 1703/*
@@ -1739,12 +1717,12 @@ static int mce_disable_error_reporting(void)
1739 return 0; 1717 return 0;
1740} 1718}
1741 1719
1742static int mce_suspend(void) 1720static int mce_syscore_suspend(void)
1743{ 1721{
1744 return mce_disable_error_reporting(); 1722 return mce_disable_error_reporting();
1745} 1723}
1746 1724
1747static void mce_shutdown(void) 1725static void mce_syscore_shutdown(void)
1748{ 1726{
1749 mce_disable_error_reporting(); 1727 mce_disable_error_reporting();
1750} 1728}
@@ -1754,18 +1732,22 @@ static void mce_shutdown(void)
1754 * Only one CPU is active at this time, the others get re-added later using 1732 * Only one CPU is active at this time, the others get re-added later using
1755 * CPU hotplug: 1733 * CPU hotplug:
1756 */ 1734 */
1757static void mce_resume(void) 1735static void mce_syscore_resume(void)
1758{ 1736{
1759 __mcheck_cpu_init_generic(); 1737 __mcheck_cpu_init_generic();
1760 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); 1738 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1761} 1739}
1762 1740
1763static struct syscore_ops mce_syscore_ops = { 1741static struct syscore_ops mce_syscore_ops = {
1764 .suspend = mce_suspend, 1742 .suspend = mce_syscore_suspend,
1765 .shutdown = mce_shutdown, 1743 .shutdown = mce_syscore_shutdown,
1766 .resume = mce_resume, 1744 .resume = mce_syscore_resume,
1767}; 1745};
1768 1746
1747/*
1748 * mce_sysdev: Sysfs support
1749 */
1750
1769static void mce_cpu_restart(void *data) 1751static void mce_cpu_restart(void *data)
1770{ 1752{
1771 del_timer_sync(&__get_cpu_var(mce_timer)); 1753 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1801,11 +1783,11 @@ static void mce_enable_ce(void *all)
1801 __mcheck_cpu_init_timer(); 1783 __mcheck_cpu_init_timer();
1802} 1784}
1803 1785
1804static struct sysdev_class mce_sysclass = { 1786static struct sysdev_class mce_sysdev_class = {
1805 .name = "machinecheck", 1787 .name = "machinecheck",
1806}; 1788};
1807 1789
1808DEFINE_PER_CPU(struct sys_device, mce_dev); 1790DEFINE_PER_CPU(struct sys_device, mce_sysdev);
1809 1791
1810__cpuinitdata 1792__cpuinitdata
1811void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1793void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -1934,7 +1916,7 @@ static struct sysdev_ext_attribute attr_cmci_disabled = {
1934 &mce_cmci_disabled 1916 &mce_cmci_disabled
1935}; 1917};
1936 1918
1937static struct sysdev_attribute *mce_attrs[] = { 1919static struct sysdev_attribute *mce_sysdev_attrs[] = {
1938 &attr_tolerant.attr, 1920 &attr_tolerant.attr,
1939 &attr_check_interval.attr, 1921 &attr_check_interval.attr,
1940 &attr_trigger, 1922 &attr_trigger,
@@ -1945,66 +1927,67 @@ static struct sysdev_attribute *mce_attrs[] = {
1945 NULL 1927 NULL
1946}; 1928};
1947 1929
1948static cpumask_var_t mce_dev_initialized; 1930static cpumask_var_t mce_sysdev_initialized;
1949 1931
1950/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 1932/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1951static __cpuinit int mce_create_device(unsigned int cpu) 1933static __cpuinit int mce_sysdev_create(unsigned int cpu)
1952{ 1934{
1935 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
1953 int err; 1936 int err;
1954 int i, j; 1937 int i, j;
1955 1938
1956 if (!mce_available(&boot_cpu_data)) 1939 if (!mce_available(&boot_cpu_data))
1957 return -EIO; 1940 return -EIO;
1958 1941
1959 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); 1942 memset(&sysdev->kobj, 0, sizeof(struct kobject));
1960 per_cpu(mce_dev, cpu).id = cpu; 1943 sysdev->id = cpu;
1961 per_cpu(mce_dev, cpu).cls = &mce_sysclass; 1944 sysdev->cls = &mce_sysdev_class;
1962 1945
1963 err = sysdev_register(&per_cpu(mce_dev, cpu)); 1946 err = sysdev_register(sysdev);
1964 if (err) 1947 if (err)
1965 return err; 1948 return err;
1966 1949
1967 for (i = 0; mce_attrs[i]; i++) { 1950 for (i = 0; mce_sysdev_attrs[i]; i++) {
1968 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1951 err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
1969 if (err) 1952 if (err)
1970 goto error; 1953 goto error;
1971 } 1954 }
1972 for (j = 0; j < banks; j++) { 1955 for (j = 0; j < banks; j++) {
1973 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1956 err = sysdev_create_file(sysdev, &mce_banks[j].attr);
1974 &mce_banks[j].attr);
1975 if (err) 1957 if (err)
1976 goto error2; 1958 goto error2;
1977 } 1959 }
1978 cpumask_set_cpu(cpu, mce_dev_initialized); 1960 cpumask_set_cpu(cpu, mce_sysdev_initialized);
1979 1961
1980 return 0; 1962 return 0;
1981error2: 1963error2:
1982 while (--j >= 0) 1964 while (--j >= 0)
1983 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1965 sysdev_remove_file(sysdev, &mce_banks[j].attr);
1984error: 1966error:
1985 while (--i >= 0) 1967 while (--i >= 0)
1986 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1968 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
1987 1969
1988 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1970 sysdev_unregister(sysdev);
1989 1971
1990 return err; 1972 return err;
1991} 1973}
1992 1974
1993static __cpuinit void mce_remove_device(unsigned int cpu) 1975static __cpuinit void mce_sysdev_remove(unsigned int cpu)
1994{ 1976{
1977 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
1995 int i; 1978 int i;
1996 1979
1997 if (!cpumask_test_cpu(cpu, mce_dev_initialized)) 1980 if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
1998 return; 1981 return;
1999 1982
2000 for (i = 0; mce_attrs[i]; i++) 1983 for (i = 0; mce_sysdev_attrs[i]; i++)
2001 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1984 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
2002 1985
2003 for (i = 0; i < banks; i++) 1986 for (i = 0; i < banks; i++)
2004 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1987 sysdev_remove_file(sysdev, &mce_banks[i].attr);
2005 1988
2006 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1989 sysdev_unregister(sysdev);
2007 cpumask_clear_cpu(cpu, mce_dev_initialized); 1990 cpumask_clear_cpu(cpu, mce_sysdev_initialized);
2008} 1991}
2009 1992
2010/* Make sure there are no machine checks on offlined CPUs. */ 1993/* Make sure there are no machine checks on offlined CPUs. */
@@ -2054,7 +2037,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2054 switch (action) { 2037 switch (action) {
2055 case CPU_ONLINE: 2038 case CPU_ONLINE:
2056 case CPU_ONLINE_FROZEN: 2039 case CPU_ONLINE_FROZEN:
2057 mce_create_device(cpu); 2040 mce_sysdev_create(cpu);
2058 if (threshold_cpu_callback) 2041 if (threshold_cpu_callback)
2059 threshold_cpu_callback(action, cpu); 2042 threshold_cpu_callback(action, cpu);
2060 break; 2043 break;
@@ -2062,7 +2045,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2062 case CPU_DEAD_FROZEN: 2045 case CPU_DEAD_FROZEN:
2063 if (threshold_cpu_callback) 2046 if (threshold_cpu_callback)
2064 threshold_cpu_callback(action, cpu); 2047 threshold_cpu_callback(action, cpu);
2065 mce_remove_device(cpu); 2048 mce_sysdev_remove(cpu);
2066 break; 2049 break;
2067 case CPU_DOWN_PREPARE: 2050 case CPU_DOWN_PREPARE:
2068 case CPU_DOWN_PREPARE_FROZEN: 2051 case CPU_DOWN_PREPARE_FROZEN:
@@ -2116,27 +2099,28 @@ static __init int mcheck_init_device(void)
2116 if (!mce_available(&boot_cpu_data)) 2099 if (!mce_available(&boot_cpu_data))
2117 return -EIO; 2100 return -EIO;
2118 2101
2119 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 2102 zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
2120 2103
2121 mce_init_banks(); 2104 mce_init_banks();
2122 2105
2123 err = sysdev_class_register(&mce_sysclass); 2106 err = sysdev_class_register(&mce_sysdev_class);
2124 if (err) 2107 if (err)
2125 return err; 2108 return err;
2126 2109
2127 for_each_online_cpu(i) { 2110 for_each_online_cpu(i) {
2128 err = mce_create_device(i); 2111 err = mce_sysdev_create(i);
2129 if (err) 2112 if (err)
2130 return err; 2113 return err;
2131 } 2114 }
2132 2115
2133 register_syscore_ops(&mce_syscore_ops); 2116 register_syscore_ops(&mce_syscore_ops);
2134 register_hotcpu_notifier(&mce_cpu_notifier); 2117 register_hotcpu_notifier(&mce_cpu_notifier);
2135 misc_register(&mce_log_device); 2118
2119 /* register character device /dev/mcelog */
2120 misc_register(&mce_chrdev_device);
2136 2121
2137 return err; 2122 return err;
2138} 2123}
2139
2140device_initcall(mcheck_init_device); 2124device_initcall(mcheck_init_device);
2141 2125
2142/* 2126/*
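
One pattern in mce_sysdev_create() earlier in this file's diff recurs all over the kernel and is easy to miss in the rename noise: attributes are registered in a loop, and on failure everything registered so far is torn down in reverse with while (--i >= 0) before the device itself is unregistered. A standalone sketch of that unwind shape, with made-up resource names standing in for sysdev_create_file()/sysdev_remove_file():

    #include <stdio.h>

    #define NR_ATTRS 4

    static int create_attr(int i)
    {
        if (i == 2)
            return -1;      /* simulate a failure partway through */
        printf("created attr %d\n", i);
        return 0;
    }

    static void remove_attr(int i)
    {
        printf("removed attr %d\n", i);
    }

    static int create_device(void)
    {
        int i, err = 0;

        for (i = 0; i < NR_ATTRS; i++) {
            err = create_attr(i);
            if (err)
                goto error;
        }
        return 0;

    error:
        /* unwind in reverse: only what was actually created is removed */
        while (--i >= 0)
            remove_attr(i);
        return err;
    }

    int main(void)
    {
        return create_device() ? 1 : 0;
    }

Caching the per-CPU device pointer in a local (sysdev) as the patch does keeps each of those unwind lines short enough to stay on one line.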
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index bb0adad35143..f5474218cffe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -548,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
548 if (!b) 548 if (!b)
549 goto out; 549 goto out;
550 550
551 err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, 551 err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
552 b->kobj, name); 552 b->kobj, name);
553 if (err) 553 if (err)
554 goto out; 554 goto out;
@@ -571,7 +571,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
571 goto out; 571 goto out;
572 } 572 }
573 573
574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); 574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
575 if (!b->kobj) 575 if (!b->kobj)
576 goto out_free; 576 goto out_free;
577 577
@@ -591,7 +591,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
591 if (i == cpu) 591 if (i == cpu)
592 continue; 592 continue;
593 593
594 err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, 594 err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
595 b->kobj, name); 595 b->kobj, name);
596 if (err) 596 if (err)
597 goto out; 597 goto out;
@@ -669,7 +669,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
669#ifdef CONFIG_SMP 669#ifdef CONFIG_SMP
670 /* sibling symlink */ 670 /* sibling symlink */
671 if (shared_bank[bank] && b->blocks->cpu != cpu) { 671 if (shared_bank[bank] && b->blocks->cpu != cpu) {
672 sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); 672 sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
673 per_cpu(threshold_banks, cpu)[bank] = NULL; 673 per_cpu(threshold_banks, cpu)[bank] = NULL;
674 674
675 return; 675 return;
@@ -681,7 +681,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
681 if (i == cpu) 681 if (i == cpu)
682 continue; 682 continue;
683 683
684 sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); 684 sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
685 per_cpu(threshold_banks, i)[bank] = NULL; 685 per_cpu(threshold_banks, i)[bank] = NULL;
686 } 686 }
687 687
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a653d1..08119a37e53c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -79,7 +79,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops)
79static int have_wrcomb(void) 79static int have_wrcomb(void)
80{ 80{
81 struct pci_dev *dev; 81 struct pci_dev *dev;
82 u8 rev;
83 82
84 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); 83 dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
85 if (dev != NULL) { 84 if (dev != NULL) {
@@ -89,13 +88,11 @@ static int have_wrcomb(void)
89 * chipsets to be tagged 88 * chipsets to be tagged
90 */ 89 */
91 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && 90 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
92 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { 91 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
93 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 92 dev->revision <= 5) {
94 if (rev <= 5) { 93 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
95 pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); 94 pci_dev_put(dev);
96 pci_dev_put(dev); 95 return 0;
97 return 0;
98 }
99 } 96 }
100 /* 97 /*
101 * Intel 450NX errata # 23. Non ascending cacheline evictions to 98 * Intel 450NX errata # 23. Non ascending cacheline evictions to
@@ -137,55 +134,43 @@ static void __init init_table(void)
137} 134}
138 135
139struct set_mtrr_data { 136struct set_mtrr_data {
140 atomic_t count;
141 atomic_t gate;
142 unsigned long smp_base; 137 unsigned long smp_base;
143 unsigned long smp_size; 138 unsigned long smp_size;
144 unsigned int smp_reg; 139 unsigned int smp_reg;
145 mtrr_type smp_type; 140 mtrr_type smp_type;
146}; 141};
147 142
148static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
149
150/** 143/**
151 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. 144 * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
145 * by all the CPUs.
152 * @info: pointer to mtrr configuration data 146 * @info: pointer to mtrr configuration data
153 * 147 *
154 * Returns nothing. 148 * Returns nothing.
155 */ 149 */
156static int mtrr_work_handler(void *info) 150static int mtrr_rendezvous_handler(void *info)
157{ 151{
158#ifdef CONFIG_SMP 152#ifdef CONFIG_SMP
159 struct set_mtrr_data *data = info; 153 struct set_mtrr_data *data = info;
160 unsigned long flags;
161
162 atomic_dec(&data->count);
163 while (!atomic_read(&data->gate))
164 cpu_relax();
165
166 local_irq_save(flags);
167
168 atomic_dec(&data->count);
169 while (atomic_read(&data->gate))
170 cpu_relax();
171 154
172 /* The master has cleared me to execute */ 155 /*
156 * We use this same function to initialize the mtrrs during boot,
157 * resume, runtime cpu online and on an explicit request to set a
158 * specific MTRR.
159 *
160 * During boot or suspend, the state of the boot cpu's mtrrs has been
161 * saved, and we want to replicate that across all the cpus that come
162 * online (either at the end of boot or resume or during a runtime cpu
163 * online). If we're doing that, @reg is set to something special and on
164 * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
165 * started the boot/resume sequence, this might be a duplicate
166 * set_all()).
167 */
173 if (data->smp_reg != ~0U) { 168 if (data->smp_reg != ~0U) {
174 mtrr_if->set(data->smp_reg, data->smp_base, 169 mtrr_if->set(data->smp_reg, data->smp_base,
175 data->smp_size, data->smp_type); 170 data->smp_size, data->smp_type);
176 } else if (mtrr_aps_delayed_init) { 171 } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
177 /*
178 * Initialize the MTRRs inaddition to the synchronisation.
179 */
180 mtrr_if->set_all(); 172 mtrr_if->set_all();
181 } 173 }
182
183 atomic_dec(&data->count);
184 while (!atomic_read(&data->gate))
185 cpu_relax();
186
187 atomic_dec(&data->count);
188 local_irq_restore(flags);
189#endif 174#endif
190 return 0; 175 return 0;
191} 176}
@@ -223,20 +208,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
223 * 14. Wait for buddies to catch up 208 * 14. Wait for buddies to catch up
224 * 15. Enable interrupts. 209 * 15. Enable interrupts.
225 * 210 *
226 * What does that mean for us? Well, first we set data.count to the number 211 * What does that mean for us? Well, stop_machine() will ensure that
227 * of CPUs. As each CPU announces that it started the rendezvous handler by 212 * the rendezvous handler is started on each CPU. And in lockstep they
228 * decrementing the count, We reset data.count and set the data.gate flag 213 * do the state transition of disabling interrupts, updating MTRR's
229 * allowing all the cpu's to proceed with the work. As each cpu disables 214 * (the CPU vendors may each do it differently, so we call mtrr_if->set()
230 * interrupts, it'll decrement data.count once. We wait until it hits 0 and 215 * callback and let them take care of it.) and enabling interrupts.
231 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
232 * are waiting for that flag to be cleared. Once it's cleared, each
233 * CPU goes through the transition of updating MTRRs.
234 * The CPU vendors may each do it differently,
235 * so we call mtrr_if->set() callback and let them take care of it.
236 * When they're done, they again decrement data->count and wait for data.gate
237 * to be set.
238 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
239 * Everyone then enables interrupts and we all continue on.
240 * 216 *
241 * Note that the mechanism is the same for UP systems, too; all the SMP stuff 217 * Note that the mechanism is the same for UP systems, too; all the SMP stuff
242 * becomes nops. 218 * becomes nops.
@@ -244,92 +220,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
244static void 220static void
245set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) 221set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
246{ 222{
247 struct set_mtrr_data data; 223 struct set_mtrr_data data = { .smp_reg = reg,
248 unsigned long flags; 224 .smp_base = base,
249 int cpu; 225 .smp_size = size,
250 226 .smp_type = type
251 preempt_disable(); 227 };
252
253 data.smp_reg = reg;
254 data.smp_base = base;
255 data.smp_size = size;
256 data.smp_type = type;
257 atomic_set(&data.count, num_booting_cpus() - 1);
258
259 /* Make sure data.count is visible before unleashing other CPUs */
260 smp_wmb();
261 atomic_set(&data.gate, 0);
262
263 /* Start the ball rolling on other CPUs */
264 for_each_online_cpu(cpu) {
265 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
266
267 if (cpu == smp_processor_id())
268 continue;
269
270 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
271 }
272
273
274 while (atomic_read(&data.count))
275 cpu_relax();
276
277 /* Ok, reset count and toggle gate */
278 atomic_set(&data.count, num_booting_cpus() - 1);
279 smp_wmb();
280 atomic_set(&data.gate, 1);
281
282 local_irq_save(flags);
283
284 while (atomic_read(&data.count))
285 cpu_relax();
286
287 /* Ok, reset count and toggle gate */
288 atomic_set(&data.count, num_booting_cpus() - 1);
289 smp_wmb();
290 atomic_set(&data.gate, 0);
291
292 /* Do our MTRR business */
293
294 /*
295 * HACK!
296 *
297 * We use this same function to initialize the mtrrs during boot,
298 * resume, runtime cpu online and on an explicit request to set a
299 * specific MTRR.
300 *
301 * During boot or suspend, the state of the boot cpu's mtrrs has been
302 * saved, and we want to replicate that across all the cpus that come
303 * online (either at the end of boot or resume or during a runtime cpu
304 * online). If we're doing that, @reg is set to something special and on
305 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
306 * is unnecessary if at this point we are still on the cpu that started
307 * the boot/resume sequence. But there is no guarantee that we are still
308 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
309 * sure that we are in sync with everyone else.
310 */
311 if (reg != ~0U)
312 mtrr_if->set(reg, base, size, type);
313 else
314 mtrr_if->set_all();
315 228
316 /* Wait for the others */ 229 stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
317 while (atomic_read(&data.count)) 230}
318 cpu_relax();
319
320 atomic_set(&data.count, num_booting_cpus() - 1);
321 smp_wmb();
322 atomic_set(&data.gate, 1);
323
324 /*
325 * Wait here for everyone to have seen the gate change
326 * So we're the last ones to touch 'data'
327 */
328 while (atomic_read(&data.count))
329 cpu_relax();
330 231
331 local_irq_restore(flags); 232static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
332 preempt_enable(); 233 unsigned long size, mtrr_type type)
234{
235 struct set_mtrr_data data = { .smp_reg = reg,
236 .smp_base = base,
237 .smp_size = size,
238 .smp_type = type
239 };
240
241 stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
242 cpu_callout_mask);
333} 243}
334 244
335/** 245/**
@@ -783,7 +693,7 @@ void mtrr_ap_init(void)
783 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug 693 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
784 * lock to prevent mtrr entry changes 694 * lock to prevent mtrr entry changes
785 */ 695 */
786 set_mtrr(~0U, 0, 0, 0); 696 set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
787} 697}
788 698
789/** 699/**
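
The net effect of switching set_mtrr() to stop_machine() above is that the hand-rolled count/gate rendezvous disappears: stop_machine() already guarantees that the handler runs on every CPU at the same time with interrupts off, so each CPU only has to decide whether to program one register or replay the whole saved MTRR state. As a loose userspace analogue of the rendezvous idea only (no interrupt masking, and not the kernel API), a barrier makes every thread apply its update inside the same window:

    #include <pthread.h>
    #include <stdio.h>

    #define NR_THREADS 4

    static pthread_barrier_t gate;

    static void *rendezvous(void *arg)
    {
        long id = (long)arg;

        pthread_barrier_wait(&gate);    /* everyone enters together       */
        printf("thread %ld applying update\n", id);
        pthread_barrier_wait(&gate);    /* nobody leaves until all are done */
        return NULL;
    }

    int main(void)
    {
        pthread_t t[NR_THREADS];

        pthread_barrier_init(&gate, NULL, NR_THREADS);
        for (long i = 0; i < NR_THREADS; i++)
            pthread_create(&t[i], NULL, rendezvous, (void *)i);
        for (int i = 0; i < NR_THREADS; i++)
            pthread_join(t[i], NULL);
        pthread_barrier_destroy(&gate);
        return 0;
    }

Pushing the synchronization into one well-tested primitive is exactly what lets the patch delete the atomic count, the gate flag and the long comment describing their handshaking.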
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3a0338b4b179..4ee3abf20ed6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -22,7 +22,6 @@
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/highmem.h>
26#include <linux/cpu.h> 25#include <linux/cpu.h>
27#include <linux/bitops.h> 26#include <linux/bitops.h>
28 27
@@ -45,38 +44,27 @@ do { \
45#endif 44#endif
46 45
47/* 46/*
48 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context 47 * | NHM/WSM | SNB |
48 * register -------------------------------
49 * | HT | no HT | HT | no HT |
50 *-----------------------------------------
51 * offcore | core | core | cpu | core |
52 * lbr_sel | core | core | cpu | core |
53 * ld_lat | cpu | core | cpu | core |
54 *-----------------------------------------
55 *
56 * Given that there is a small number of shared regs,
57 * we can pre-allocate their slot in the per-cpu
58 * per-core reg tables.
49 */ 59 */
50static unsigned long 60enum extra_reg_type {
51copy_from_user_nmi(void *to, const void __user *from, unsigned long n) 61 EXTRA_REG_NONE = -1, /* not used */
52{
53 unsigned long offset, addr = (unsigned long)from;
54 unsigned long size, len = 0;
55 struct page *page;
56 void *map;
57 int ret;
58
59 do {
60 ret = __get_user_pages_fast(addr, 1, 0, &page);
61 if (!ret)
62 break;
63
64 offset = addr & (PAGE_SIZE - 1);
65 size = min(PAGE_SIZE - offset, n - len);
66
67 map = kmap_atomic(page);
68 memcpy(to, map+offset, size);
69 kunmap_atomic(map);
70 put_page(page);
71 62
72 len += size; 63 EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
73 to += size; 64 EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
74 addr += size;
75 65
76 } while (len < n); 66 EXTRA_REG_MAX /* number of entries needed */
77 67};
78 return len;
79}
80 68
81struct event_constraint { 69struct event_constraint {
82 union { 70 union {
@@ -132,11 +120,10 @@ struct cpu_hw_events {
132 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 120 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
133 121
134 /* 122 /*
135 * Intel percore register state. 123 * manage shared (per-core, per-cpu) registers
136 * Coordinate shared resources between HT threads. 124 * used on Intel NHM/WSM/SNB
137 */ 125 */
138 int percore_used; /* Used by this CPU? */ 126 struct intel_shared_regs *shared_regs;
139 struct intel_percore *per_core;
140 127
141 /* 128 /*
142 * AMD specific bits 129 * AMD specific bits
@@ -187,26 +174,45 @@ struct cpu_hw_events {
187 for ((e) = (c); (e)->weight; (e)++) 174 for ((e) = (c); (e)->weight; (e)++)
188 175
189/* 176/*
177 * Per register state.
178 */
179struct er_account {
180 raw_spinlock_t lock; /* per-core: protect structure */
181 u64 config; /* extra MSR config */
182 u64 reg; /* extra MSR number */
183 atomic_t ref; /* reference count */
184};
185
186/*
190 * Extra registers for specific events. 187 * Extra registers for specific events.
188 *
191 * Some events need large masks and require external MSRs. 189 * Some events need large masks and require external MSRs.
192 * Define a mapping to these extra registers. 190 * Those extra MSRs end up being shared for all events on
191 * a PMU and sometimes between PMU of sibling HT threads.
192 * In either case, the kernel needs to handle conflicting
193 * accesses to those extra, shared, regs. The data structure
194 * to manage those registers is stored in cpu_hw_event.
193 */ 195 */
194struct extra_reg { 196struct extra_reg {
195 unsigned int event; 197 unsigned int event;
196 unsigned int msr; 198 unsigned int msr;
197 u64 config_mask; 199 u64 config_mask;
198 u64 valid_mask; 200 u64 valid_mask;
201 int idx; /* per_xxx->regs[] reg index */
199}; 202};
200 203
201#define EVENT_EXTRA_REG(e, ms, m, vm) { \ 204#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
202 .event = (e), \ 205 .event = (e), \
203 .msr = (ms), \ 206 .msr = (ms), \
204 .config_mask = (m), \ 207 .config_mask = (m), \
205 .valid_mask = (vm), \ 208 .valid_mask = (vm), \
209 .idx = EXTRA_REG_##i \
206 } 210 }
207#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \ 211
208 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm) 212#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
209#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0) 213 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
214
215#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
210 216
211union perf_capabilities { 217union perf_capabilities {
212 struct { 218 struct {
@@ -252,7 +258,6 @@ struct x86_pmu {
252 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 258 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
253 struct perf_event *event); 259 struct perf_event *event);
254 struct event_constraint *event_constraints; 260 struct event_constraint *event_constraints;
255 struct event_constraint *percore_constraints;
256 void (*quirks)(void); 261 void (*quirks)(void);
257 int perfctr_second_write; 262 int perfctr_second_write;
258 263
@@ -286,8 +291,12 @@ struct x86_pmu {
286 * Extra registers for events 291 * Extra registers for events
287 */ 292 */
288 struct extra_reg *extra_regs; 293 struct extra_reg *extra_regs;
294 unsigned int er_flags;
289}; 295};
290 296
297#define ERF_NO_HT_SHARING 1
298#define ERF_HAS_RSP_1 2
299
291static struct x86_pmu x86_pmu __read_mostly; 300static struct x86_pmu x86_pmu __read_mostly;
292 301
293static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 302static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -393,10 +402,10 @@ static inline unsigned int x86_pmu_event_addr(int index)
393 */ 402 */
394static int x86_pmu_extra_regs(u64 config, struct perf_event *event) 403static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
395{ 404{
405 struct hw_perf_event_extra *reg;
396 struct extra_reg *er; 406 struct extra_reg *er;
397 407
398 event->hw.extra_reg = 0; 408 reg = &event->hw.extra_reg;
399 event->hw.extra_config = 0;
400 409
401 if (!x86_pmu.extra_regs) 410 if (!x86_pmu.extra_regs)
402 return 0; 411 return 0;
@@ -406,8 +415,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
406 continue; 415 continue;
407 if (event->attr.config1 & ~er->valid_mask) 416 if (event->attr.config1 & ~er->valid_mask)
408 return -EINVAL; 417 return -EINVAL;
409 event->hw.extra_reg = er->msr; 418
410 event->hw.extra_config = event->attr.config1; 419 reg->idx = er->idx;
420 reg->config = event->attr.config1;
421 reg->reg = er->msr;
411 break; 422 break;
412 } 423 }
413 return 0; 424 return 0;
@@ -706,6 +717,9 @@ static int __x86_pmu_event_init(struct perf_event *event)
706 event->hw.last_cpu = -1; 717 event->hw.last_cpu = -1;
707 event->hw.last_tag = ~0ULL; 718 event->hw.last_tag = ~0ULL;
708 719
720 /* mark unused */
721 event->hw.extra_reg.idx = EXTRA_REG_NONE;
722
709 return x86_pmu.hw_config(event); 723 return x86_pmu.hw_config(event);
710} 724}
711 725
@@ -747,8 +761,8 @@ static void x86_pmu_disable(struct pmu *pmu)
747static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, 761static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
748 u64 enable_mask) 762 u64 enable_mask)
749{ 763{
750 if (hwc->extra_reg) 764 if (hwc->extra_reg.reg)
751 wrmsrl(hwc->extra_reg, hwc->extra_config); 765 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
752 wrmsrl(hwc->config_base, hwc->config | enable_mask); 766 wrmsrl(hwc->config_base, hwc->config | enable_mask);
753} 767}
754 768
@@ -1332,7 +1346,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1332 if (!x86_perf_event_set_period(event)) 1346 if (!x86_perf_event_set_period(event))
1333 continue; 1347 continue;
1334 1348
1335 if (perf_event_overflow(event, 1, &data, regs)) 1349 if (perf_event_overflow(event, &data, regs))
1336 x86_pmu_stop(event, 0); 1350 x86_pmu_stop(event, 0);
1337 } 1351 }
1338 1352
@@ -1637,6 +1651,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
1637 perf_pmu_enable(pmu); 1651 perf_pmu_enable(pmu);
1638 return 0; 1652 return 0;
1639} 1653}
1654/*
1655 * a fake_cpuc is used to validate event groups. Due to
1656 * the extra reg logic, we need to also allocate a fake
1657 * per_core and per_cpu structure. Otherwise, group events
1658 * using extra reg may conflict without the kernel being
1659 * able to catch this when the last event gets added to
1660 * the group.
1661 */
1662static void free_fake_cpuc(struct cpu_hw_events *cpuc)
1663{
1664 kfree(cpuc->shared_regs);
1665 kfree(cpuc);
1666}
1667
1668static struct cpu_hw_events *allocate_fake_cpuc(void)
1669{
1670 struct cpu_hw_events *cpuc;
1671 int cpu = raw_smp_processor_id();
1672
1673 cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
1674 if (!cpuc)
1675 return ERR_PTR(-ENOMEM);
1676
1677 /* only needed, if we have extra_regs */
1678 if (x86_pmu.extra_regs) {
1679 cpuc->shared_regs = allocate_shared_regs(cpu);
1680 if (!cpuc->shared_regs)
1681 goto error;
1682 }
1683 return cpuc;
1684error:
1685 free_fake_cpuc(cpuc);
1686 return ERR_PTR(-ENOMEM);
1687}
1640 1688
1641/* 1689/*
1642 * validate that we can schedule this event 1690 * validate that we can schedule this event
@@ -1647,9 +1695,9 @@ static int validate_event(struct perf_event *event)
1647 struct event_constraint *c; 1695 struct event_constraint *c;
1648 int ret = 0; 1696 int ret = 0;
1649 1697
1650 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); 1698 fake_cpuc = allocate_fake_cpuc();
1651 if (!fake_cpuc) 1699 if (IS_ERR(fake_cpuc))
1652 return -ENOMEM; 1700 return PTR_ERR(fake_cpuc);
1653 1701
1654 c = x86_pmu.get_event_constraints(fake_cpuc, event); 1702 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1655 1703
@@ -1659,7 +1707,7 @@ static int validate_event(struct perf_event *event)
1659 if (x86_pmu.put_event_constraints) 1707 if (x86_pmu.put_event_constraints)
1660 x86_pmu.put_event_constraints(fake_cpuc, event); 1708 x86_pmu.put_event_constraints(fake_cpuc, event);
1661 1709
1662 kfree(fake_cpuc); 1710 free_fake_cpuc(fake_cpuc);
1663 1711
1664 return ret; 1712 return ret;
1665} 1713}
@@ -1679,36 +1727,32 @@ static int validate_group(struct perf_event *event)
1679{ 1727{
1680 struct perf_event *leader = event->group_leader; 1728 struct perf_event *leader = event->group_leader;
1681 struct cpu_hw_events *fake_cpuc; 1729 struct cpu_hw_events *fake_cpuc;
1682 int ret, n; 1730 int ret = -ENOSPC, n;
1683
1684 ret = -ENOMEM;
1685 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1686 if (!fake_cpuc)
1687 goto out;
1688 1731
1732 fake_cpuc = allocate_fake_cpuc();
1733 if (IS_ERR(fake_cpuc))
1734 return PTR_ERR(fake_cpuc);
1689 /* 1735 /*
1690 * the event is not yet connected with its 1736 * the event is not yet connected with its
1691 * siblings therefore we must first collect 1737 * siblings therefore we must first collect
1692 * existing siblings, then add the new event 1738 * existing siblings, then add the new event
1693 * before we can simulate the scheduling 1739 * before we can simulate the scheduling
1694 */ 1740 */
1695 ret = -ENOSPC;
1696 n = collect_events(fake_cpuc, leader, true); 1741 n = collect_events(fake_cpuc, leader, true);
1697 if (n < 0) 1742 if (n < 0)
1698 goto out_free; 1743 goto out;
1699 1744
1700 fake_cpuc->n_events = n; 1745 fake_cpuc->n_events = n;
1701 n = collect_events(fake_cpuc, event, false); 1746 n = collect_events(fake_cpuc, event, false);
1702 if (n < 0) 1747 if (n < 0)
1703 goto out_free; 1748 goto out;
1704 1749
1705 fake_cpuc->n_events = n; 1750 fake_cpuc->n_events = n;
1706 1751
1707 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); 1752 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1708 1753
1709out_free:
1710 kfree(fake_cpuc);
1711out: 1754out:
1755 free_fake_cpuc(fake_cpuc);
1712 return ret; 1756 return ret;
1713} 1757}
1714 1758
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index fe29c1d2219e..941caa2e449b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids
89 [ C(RESULT_MISS) ] = -1, 89 [ C(RESULT_MISS) ] = -1,
90 }, 90 },
91 }, 91 },
92 [ C(NODE) ] = {
93 [ C(OP_READ) ] = {
94 [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
95 [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
96 },
97 [ C(OP_WRITE) ] = {
98 [ C(RESULT_ACCESS) ] = -1,
99 [ C(RESULT_MISS) ] = -1,
100 },
101 [ C(OP_PREFETCH) ] = {
102 [ C(RESULT_ACCESS) ] = -1,
103 [ C(RESULT_MISS) ] = -1,
104 },
105 },
92}; 106};
93 107
94/* 108/*
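
The C(NODE) entries added above extend the generic hardware-cache event table, which is indexed by cache type, operation and result; an entry of -1 marks a combination the CPU cannot count, and the 0xb8e9/0x98e9 codes are the local+remote and remote-only "CPU Request to Memory" events named in the comments. A small sketch of how such a three-dimensional table is consumed, with the enum and error values chosen for illustration rather than taken from the perf core:

    #include <stdint.h>
    #include <stdio.h>

    enum cache_type { C_L1D, C_NODE, C_MAX };
    enum cache_op   { OP_READ, OP_WRITE, OP_PREFETCH, OP_MAX };
    enum cache_res  { RES_ACCESS, RES_MISS, RES_MAX };

    /* Illustrative table: -1 marks combinations the CPU cannot count. */
    static const int64_t cache_event_ids[C_MAX][OP_MAX][RES_MAX] = {
        [C_NODE] = {
            [OP_READ]     = { [RES_ACCESS] = 0xb8e9, [RES_MISS] = 0x98e9 },
            [OP_WRITE]    = { [RES_ACCESS] = -1,     [RES_MISS] = -1     },
            [OP_PREFETCH] = { [RES_ACCESS] = -1,     [RES_MISS] = -1     },
        },
    };

    /* Returns the raw event code, or a negative value if unsupported/unset. */
    static int64_t lookup(enum cache_type t, enum cache_op op, enum cache_res res)
    {
        int64_t config = cache_event_ids[t][op][res];

        if (config == -1)
            return -1;      /* combination not supported on this CPU  */
        if (config == 0)
            return -2;      /* no event wired up for this combination */
        return config;
    }

    int main(void)
    {
        printf("node read access: %#llx\n",
               (unsigned long long)lookup(C_NODE, OP_READ, RES_ACCESS));
        printf("node write miss:  %lld\n",
               (long long)lookup(C_NODE, OP_WRITE, RES_MISS));
        return 0;
    }

The Intel variants of the same change (below) instead point the NODE entries at the 0x01b7 offcore-response event, whose detail lives in the shared extra MSRs discussed earlier.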
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 41178c826c48..45fbb8f7f549 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,25 +1,15 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/* 3/*
15 * Per core state 4 * Per core/cpu state
16 * This used to coordinate shared registers for HT threads. 5 *
6 * Used to coordinate shared registers between HT threads or
7 * among events on a single PMU.
17 */ 8 */
18struct intel_percore { 9struct intel_shared_regs {
19 raw_spinlock_t lock; /* protect structure */ 10 struct er_account regs[EXTRA_REG_MAX];
20 struct er_account regs[MAX_EXTRA_REGS]; 11 int refcnt; /* per-core: #HT threads */
21 int refcnt; /* number of threads */ 12 unsigned core_id; /* per-core: core id */
22 unsigned core_id;
23}; 13};
24 14
25/* 15/*
@@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
88 78
89static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = 79static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
90{ 80{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 81 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
92 EVENT_EXTRA_END 82 EVENT_EXTRA_END
93}; 83};
94 84
95static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
101static struct event_constraint intel_westmere_event_constraints[] __read_mostly = 85static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
102{ 86{
103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 87 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -116,8 +100,6 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 100 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 101 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ 102 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ 103 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ 104 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END 105 EVENT_CONSTRAINT_END
@@ -125,15 +107,13 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
125 107
126static struct extra_reg intel_westmere_extra_regs[] __read_mostly = 108static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
127{ 109{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 110 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), 111 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
130 EVENT_EXTRA_END 112 EVENT_EXTRA_END
131}; 113};
132 114
133static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = 115static struct event_constraint intel_v1_event_constraints[] __read_mostly =
134{ 116{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END 117 EVENT_CONSTRAINT_END
138}; 118};
139 119
@@ -145,6 +125,12 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
145 EVENT_CONSTRAINT_END 125 EVENT_CONSTRAINT_END
146}; 126};
147 127
128static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
129 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
130 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
131 EVENT_EXTRA_END
132};
133
148static u64 intel_pmu_event_map(int hw_event) 134static u64 intel_pmu_event_map(int hw_event)
149{ 135{
150 return intel_perfmon_event_map[hw_event]; 136 return intel_perfmon_event_map[hw_event];
@@ -245,6 +231,21 @@ static __initconst const u64 snb_hw_cache_event_ids
245 [ C(RESULT_MISS) ] = -1, 231 [ C(RESULT_MISS) ] = -1,
246 }, 232 },
247 }, 233 },
234 [ C(NODE) ] = {
235 [ C(OP_READ) ] = {
236 [ C(RESULT_ACCESS) ] = -1,
237 [ C(RESULT_MISS) ] = -1,
238 },
239 [ C(OP_WRITE) ] = {
240 [ C(RESULT_ACCESS) ] = -1,
241 [ C(RESULT_MISS) ] = -1,
242 },
243 [ C(OP_PREFETCH) ] = {
244 [ C(RESULT_ACCESS) ] = -1,
245 [ C(RESULT_MISS) ] = -1,
246 },
247 },
248
248}; 249};
249 250
250static __initconst const u64 westmere_hw_cache_event_ids 251static __initconst const u64 westmere_hw_cache_event_ids
@@ -346,6 +347,20 @@ static __initconst const u64 westmere_hw_cache_event_ids
346 [ C(RESULT_MISS) ] = -1, 347 [ C(RESULT_MISS) ] = -1,
347 }, 348 },
348 }, 349 },
350 [ C(NODE) ] = {
351 [ C(OP_READ) ] = {
352 [ C(RESULT_ACCESS) ] = 0x01b7,
353 [ C(RESULT_MISS) ] = 0x01b7,
354 },
355 [ C(OP_WRITE) ] = {
356 [ C(RESULT_ACCESS) ] = 0x01b7,
357 [ C(RESULT_MISS) ] = 0x01b7,
358 },
359 [ C(OP_PREFETCH) ] = {
360 [ C(RESULT_ACCESS) ] = 0x01b7,
361 [ C(RESULT_MISS) ] = 0x01b7,
362 },
363 },
349}; 364};
350 365
351/* 366/*
@@ -398,7 +413,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
398 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, 413 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
399 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, 414 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
400 }, 415 },
401 } 416 },
417 [ C(NODE) ] = {
418 [ C(OP_READ) ] = {
419 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
420 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
421 },
422 [ C(OP_WRITE) ] = {
423 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
424 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
425 },
426 [ C(OP_PREFETCH) ] = {
427 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
428 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
429 },
430 },
402}; 431};
403 432
404static __initconst const u64 nehalem_hw_cache_event_ids 433static __initconst const u64 nehalem_hw_cache_event_ids
@@ -500,6 +529,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids
500 [ C(RESULT_MISS) ] = -1, 529 [ C(RESULT_MISS) ] = -1,
501 }, 530 },
502 }, 531 },
532 [ C(NODE) ] = {
533 [ C(OP_READ) ] = {
534 [ C(RESULT_ACCESS) ] = 0x01b7,
535 [ C(RESULT_MISS) ] = 0x01b7,
536 },
537 [ C(OP_WRITE) ] = {
538 [ C(RESULT_ACCESS) ] = 0x01b7,
539 [ C(RESULT_MISS) ] = 0x01b7,
540 },
541 [ C(OP_PREFETCH) ] = {
542 [ C(RESULT_ACCESS) ] = 0x01b7,
543 [ C(RESULT_MISS) ] = 0x01b7,
544 },
545 },
503}; 546};
504 547
505static __initconst const u64 core2_hw_cache_event_ids 548static __initconst const u64 core2_hw_cache_event_ids
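
The new C(NODE) rows above follow the same [cache][op][result] layout as the rest of these tables: -1 marks a combination the hardware can never count, 0 means no encoding on this model, and 0x01b7 routes the request through OFF_CORE_RESPONSE_0, whose detail bits live in the separate extra-regs tables. A minimal userspace sketch of that lookup, using simplified stand-in enums rather than the kernel's real definitions:

/*
 * Minimal sketch of the [cache][op][result] lookup these tables implement.
 * Enum and variable names are simplified stand-ins, not kernel definitions.
 */
#include <stdint.h>
#include <stdio.h>

enum cache_id  { C_L1D, C_NODE, C_MAX };
enum cache_op  { OP_READ, OP_WRITE, OP_PREFETCH, OP_MAX };
enum cache_res { RES_ACCESS, RES_MISS, RES_MAX };

/* -1: can never be counted; 0: no encoding on this model;
 * 0x01b7: go through OFF_CORE_RESPONSE_0 plus an extra-reg value */
static const int64_t cache_events[C_MAX][OP_MAX][RES_MAX] = {
	[C_NODE] = {
		[OP_READ]     = { [RES_ACCESS] = 0x01b7, [RES_MISS] = 0x01b7 },
		[OP_WRITE]    = { [RES_ACCESS] = 0x01b7, [RES_MISS] = 0x01b7 },
		[OP_PREFETCH] = { [RES_ACCESS] = 0x01b7, [RES_MISS] = 0x01b7 },
	},
};

static int lookup(enum cache_id c, enum cache_op op, enum cache_res r,
		  uint64_t *config)
{
	int64_t v = cache_events[c][op][r];

	if (v == -1 || v == 0)	/* invalid or unsupported combination */
		return -1;
	*config = (uint64_t)v;
	return 0;
}

int main(void)
{
	uint64_t config;

	if (!lookup(C_NODE, OP_READ, RES_MISS, &config))
		printf("NODE read miss -> raw event 0x%llx\n",
		       (unsigned long long)config);
	return 0;
}
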
@@ -1003,7 +1046,7 @@ again:
1003 1046
1004 data.period = event->hw.last_period; 1047 data.period = event->hw.last_period;
1005 1048
1006 if (perf_event_overflow(event, 1, &data, regs)) 1049 if (perf_event_overflow(event, &data, regs))
1007 x86_pmu_stop(event, 0); 1050 x86_pmu_stop(event, 0);
1008 } 1051 }
1009 1052
@@ -1037,65 +1080,121 @@ intel_bts_constraints(struct perf_event *event)
1037 return NULL; 1080 return NULL;
1038} 1081}
1039 1082
1083static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
1084{
1085 if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
1086 return false;
1087
1088 if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
1089 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1090 event->hw.config |= 0x01bb;
1091 event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
1092 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
1093 } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
1094 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1095 event->hw.config |= 0x01b7;
1096 event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
1097 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
1098 }
1099
1100 if (event->hw.extra_reg.idx == orig_idx)
1101 return false;
1102
1103 return true;
1104}
1105
1106/*
1107 * manage allocation of shared extra msr for certain events
1108 *
1109 * sharing can be:
1110 * per-cpu: to be shared between the various events on a single PMU
1111 * per-core: per-cpu + shared by HT threads
1112 */
1040static struct event_constraint * 1113static struct event_constraint *
1041intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1114__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
1115 struct perf_event *event)
1042{ 1116{
1043 struct hw_perf_event *hwc = &event->hw; 1117 struct event_constraint *c = &emptyconstraint;
1044 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; 1118 struct hw_perf_event_extra *reg = &event->hw.extra_reg;
1045 struct event_constraint *c;
1046 struct intel_percore *pc;
1047 struct er_account *era; 1119 struct er_account *era;
1048 int i; 1120 unsigned long flags;
1049 int free_slot; 1121 int orig_idx = reg->idx;
1050 int found;
1051 1122
1052 if (!x86_pmu.percore_constraints || hwc->extra_alloc) 1123 /* already allocated shared msr */
1053 return NULL; 1124 if (reg->alloc)
1125 return &unconstrained;
1054 1126
1055 for (c = x86_pmu.percore_constraints; c->cmask; c++) { 1127again:
1056 if (e != c->code) 1128 era = &cpuc->shared_regs->regs[reg->idx];
1057 continue; 1129 /*
1130 * we use spin_lock_irqsave() to avoid lockdep issues when
1131 * passing a fake cpuc
1132 */
1133 raw_spin_lock_irqsave(&era->lock, flags);
1134
1135 if (!atomic_read(&era->ref) || era->config == reg->config) {
1136
1137 /* lock in msr value */
1138 era->config = reg->config;
1139 era->reg = reg->reg;
1140
1141 /* one more user */
1142 atomic_inc(&era->ref);
1143
1144 /* no need to reallocate during incremental event scheduling */
1145 reg->alloc = 1;
1058 1146
1059 /* 1147 /*
1060 * Allocate resource per core. 1148 * All events using extra_reg are unconstrained.
1149 * Avoids calling x86_get_event_constraints()
1150 *
1151 * Must revisit if extra_reg controlling events
1152 * ever have constraints. Worst case we go through
1153 * the regular event constraint table.
1061 */ 1154 */
1062 pc = cpuc->per_core; 1155 c = &unconstrained;
1063 if (!pc) 1156 } else if (intel_try_alt_er(event, orig_idx)) {
1064 break; 1157 raw_spin_unlock(&era->lock);
1065 c = &emptyconstraint; 1158 goto again;
1066 raw_spin_lock(&pc->lock);
1067 free_slot = -1;
1068 found = 0;
1069 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1070 era = &pc->regs[i];
1071 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1072 /* Allow sharing same config */
1073 if (hwc->extra_config == era->extra_config) {
1074 era->ref++;
1075 cpuc->percore_used = 1;
1076 hwc->extra_alloc = 1;
1077 c = NULL;
1078 }
1079 /* else conflict */
1080 found = 1;
1081 break;
1082 } else if (era->ref == 0 && free_slot == -1)
1083 free_slot = i;
1084 }
1085 if (!found && free_slot != -1) {
1086 era = &pc->regs[free_slot];
1087 era->ref = 1;
1088 era->extra_reg = hwc->extra_reg;
1089 era->extra_config = hwc->extra_config;
1090 cpuc->percore_used = 1;
1091 hwc->extra_alloc = 1;
1092 c = NULL;
1093 }
1094 raw_spin_unlock(&pc->lock);
1095 return c;
1096 } 1159 }
1160 raw_spin_unlock_irqrestore(&era->lock, flags);
1097 1161
1098 return NULL; 1162 return c;
1163}
1164
1165static void
1166__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
1167 struct hw_perf_event_extra *reg)
1168{
1169 struct er_account *era;
1170
1171 /*
1172 * only put the constraint if the extra reg was actually
1173 * allocated. This also takes care of events which do
1174 * not use an extra shared reg.
1175 */
1176 if (!reg->alloc)
1177 return;
1178
1179 era = &cpuc->shared_regs->regs[reg->idx];
1180
1181 /* one fewer user */
1182 atomic_dec(&era->ref);
1183
1184 /* allocate again next time */
1185 reg->alloc = 0;
1186}
1187
1188static struct event_constraint *
1189intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
1190 struct perf_event *event)
1191{
1192 struct event_constraint *c = NULL;
1193
1194 if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
1195 c = __intel_shared_reg_get_constraints(cpuc, event);
1196
1197 return c;
1099} 1198}
1100 1199
1101static struct event_constraint * 1200static struct event_constraint *
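
Taken together, intel_try_alt_er() and __intel_shared_reg_get/put_constraints() turn each OFFCORE_RSP MSR into a refcounted lease: an event may take the register if it is free or already programmed with the same value, may fall back to the sibling MSR where ERF_HAS_RSP_1 is set, and otherwise receives the empty constraint. A rough userspace sketch of the lease logic, with stand-in types rather than the kernel structures:

/*
 * Userspace sketch of the shared extra-MSR accounting: an event may grab
 * the shared register if it is unused or already programmed with the same
 * value; otherwise it gets the empty constraint. Simplified stand-in types.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct er_account {
	pthread_mutex_t lock;
	int             ref;     /* number of events using this MSR */
	uint64_t        config;  /* value currently programmed */
};

/* try to take a reference; mirrors __intel_shared_reg_get_constraints() */
static bool shared_reg_get(struct er_account *era, uint64_t config)
{
	bool ok;

	pthread_mutex_lock(&era->lock);
	ok = (era->ref == 0 || era->config == config);
	if (ok) {
		era->config = config;
		era->ref++;
	}
	pthread_mutex_unlock(&era->lock);
	return ok;   /* false == "emptyconstraint": cannot schedule */
}

/* drop the reference again; mirrors __intel_shared_reg_put_constraints() */
static void shared_reg_put(struct er_account *era)
{
	pthread_mutex_lock(&era->lock);
	era->ref--;
	pthread_mutex_unlock(&era->lock);
}

int main(void)
{
	struct er_account rsp0 = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	printf("A(0x10): %d\n", shared_reg_get(&rsp0, 0x10)); /* 1: free      */
	printf("B(0x10): %d\n", shared_reg_get(&rsp0, 0x10)); /* 1: same cfg  */
	printf("C(0x20): %d\n", shared_reg_get(&rsp0, 0x20)); /* 0: conflict  */
	shared_reg_put(&rsp0);
	shared_reg_put(&rsp0);
	return 0;
}

The third call models the conflict that makes intel_try_alt_er() retry with the other OFFCORE_RSP register before giving up.
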
@@ -1111,49 +1210,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
1111 if (c) 1210 if (c)
1112 return c; 1211 return c;
1113 1212
1114 c = intel_percore_constraints(cpuc, event); 1213 c = intel_shared_regs_constraints(cpuc, event);
1115 if (c) 1214 if (c)
1116 return c; 1215 return c;
1117 1216
1118 return x86_get_event_constraints(cpuc, event); 1217 return x86_get_event_constraints(cpuc, event);
1119} 1218}
1120 1219
1121static void intel_put_event_constraints(struct cpu_hw_events *cpuc, 1220static void
1221intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
1122 struct perf_event *event) 1222 struct perf_event *event)
1123{ 1223{
1124 struct extra_reg *er; 1224 struct hw_perf_event_extra *reg;
1125 struct intel_percore *pc;
1126 struct er_account *era;
1127 struct hw_perf_event *hwc = &event->hw;
1128 int i, allref;
1129 1225
1130 if (!cpuc->percore_used) 1226 reg = &event->hw.extra_reg;
1131 return; 1227 if (reg->idx != EXTRA_REG_NONE)
1132 1228 __intel_shared_reg_put_constraints(cpuc, reg);
1133 for (er = x86_pmu.extra_regs; er->msr; er++) { 1229}
1134 if (er->event != (hwc->config & er->config_mask))
1135 continue;
1136 1230
1137 pc = cpuc->per_core; 1231static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1138 raw_spin_lock(&pc->lock); 1232 struct perf_event *event)
1139 for (i = 0; i < MAX_EXTRA_REGS; i++) { 1233{
1140 era = &pc->regs[i]; 1234 intel_put_shared_regs_event_constraints(cpuc, event);
1141 if (era->ref > 0 &&
1142 era->extra_config == hwc->extra_config &&
1143 era->extra_reg == er->msr) {
1144 era->ref--;
1145 hwc->extra_alloc = 0;
1146 break;
1147 }
1148 }
1149 allref = 0;
1150 for (i = 0; i < MAX_EXTRA_REGS; i++)
1151 allref += pc->regs[i].ref;
1152 if (allref == 0)
1153 cpuc->percore_used = 0;
1154 raw_spin_unlock(&pc->lock);
1155 break;
1156 }
1157} 1235}
1158 1236
1159static int intel_pmu_hw_config(struct perf_event *event) 1237static int intel_pmu_hw_config(struct perf_event *event)
@@ -1231,20 +1309,36 @@ static __initconst const struct x86_pmu core_pmu = {
1231 .event_constraints = intel_core_event_constraints, 1309 .event_constraints = intel_core_event_constraints,
1232}; 1310};
1233 1311
1312static struct intel_shared_regs *allocate_shared_regs(int cpu)
1313{
1314 struct intel_shared_regs *regs;
1315 int i;
1316
1317 regs = kzalloc_node(sizeof(struct intel_shared_regs),
1318 GFP_KERNEL, cpu_to_node(cpu));
1319 if (regs) {
1320 /*
1321 * initialize the locks to keep lockdep happy
1322 */
1323 for (i = 0; i < EXTRA_REG_MAX; i++)
1324 raw_spin_lock_init(&regs->regs[i].lock);
1325
1326 regs->core_id = -1;
1327 }
1328 return regs;
1329}
1330
1234static int intel_pmu_cpu_prepare(int cpu) 1331static int intel_pmu_cpu_prepare(int cpu)
1235{ 1332{
1236 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1333 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1237 1334
1238 if (!cpu_has_ht_siblings()) 1335 if (!x86_pmu.extra_regs)
1239 return NOTIFY_OK; 1336 return NOTIFY_OK;
1240 1337
1241 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), 1338 cpuc->shared_regs = allocate_shared_regs(cpu);
1242 GFP_KERNEL, cpu_to_node(cpu)); 1339 if (!cpuc->shared_regs)
1243 if (!cpuc->per_core)
1244 return NOTIFY_BAD; 1340 return NOTIFY_BAD;
1245 1341
1246 raw_spin_lock_init(&cpuc->per_core->lock);
1247 cpuc->per_core->core_id = -1;
1248 return NOTIFY_OK; 1342 return NOTIFY_OK;
1249} 1343}
1250 1344
@@ -1260,32 +1354,34 @@ static void intel_pmu_cpu_starting(int cpu)
1260 */ 1354 */
1261 intel_pmu_lbr_reset(); 1355 intel_pmu_lbr_reset();
1262 1356
1263 if (!cpu_has_ht_siblings()) 1357 if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
1264 return; 1358 return;
1265 1359
1266 for_each_cpu(i, topology_thread_cpumask(cpu)) { 1360 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1267 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; 1361 struct intel_shared_regs *pc;
1268 1362
1363 pc = per_cpu(cpu_hw_events, i).shared_regs;
1269 if (pc && pc->core_id == core_id) { 1364 if (pc && pc->core_id == core_id) {
1270 kfree(cpuc->per_core); 1365 kfree(cpuc->shared_regs);
1271 cpuc->per_core = pc; 1366 cpuc->shared_regs = pc;
1272 break; 1367 break;
1273 } 1368 }
1274 } 1369 }
1275 1370
1276 cpuc->per_core->core_id = core_id; 1371 cpuc->shared_regs->core_id = core_id;
1277 cpuc->per_core->refcnt++; 1372 cpuc->shared_regs->refcnt++;
1278} 1373}
1279 1374
1280static void intel_pmu_cpu_dying(int cpu) 1375static void intel_pmu_cpu_dying(int cpu)
1281{ 1376{
1282 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1377 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1283 struct intel_percore *pc = cpuc->per_core; 1378 struct intel_shared_regs *pc;
1284 1379
1380 pc = cpuc->shared_regs;
1285 if (pc) { 1381 if (pc) {
1286 if (pc->core_id == -1 || --pc->refcnt == 0) 1382 if (pc->core_id == -1 || --pc->refcnt == 0)
1287 kfree(pc); 1383 kfree(pc);
1288 cpuc->per_core = NULL; 1384 cpuc->shared_regs = NULL;
1289 } 1385 }
1290 1386
1291 fini_debug_store_on_cpu(cpu); 1387 fini_debug_store_on_cpu(cpu);
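
The cpu_prepare/cpu_starting/cpu_dying changes give every CPU its own intel_shared_regs allocation up front, let a later hyperthread of the same core discard its copy and reuse its sibling's (matched via core_id), and free the block when the last user goes away. A standalone sketch of that hand-off, without locking and with stand-in names:

/*
 * Sketch of how siblings end up sharing one intel_shared_regs object:
 * the first CPU of a core keeps its allocation, later siblings free
 * theirs and take a reference on the existing one. Stand-in types.
 */
#include <stdio.h>
#include <stdlib.h>

struct shared_regs {
	int core_id;   /* -1 until claimed by a core */
	int refcnt;
};

struct cpu_state {
	struct shared_regs *shared;
};

#define NR_CPUS 4

static struct cpu_state cpus[NR_CPUS];
static const int cpu_core[NR_CPUS] = { 0, 0, 1, 1 };  /* two HT pairs */

static void cpu_starting(int cpu)
{
	int core = cpu_core[cpu], i;

	for (i = 0; i < NR_CPUS; i++) {
		struct shared_regs *pc = cpus[i].shared;

		if (i != cpu && pc && pc->core_id == core) {
			free(cpus[cpu].shared);      /* drop our duplicate */
			cpus[cpu].shared = pc;       /* share the sibling's */
			break;
		}
	}
	cpus[cpu].shared->core_id = core;
	cpus[cpu].shared->refcnt++;
}

static void cpu_dying(int cpu)
{
	struct shared_regs *pc = cpus[cpu].shared;

	if (pc && --pc->refcnt == 0)
		free(pc);
	cpus[cpu].shared = NULL;
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {            /* "cpu_prepare" */
		cpus[cpu].shared = calloc(1, sizeof(struct shared_regs));
		cpus[cpu].shared->core_id = -1;
	}
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		cpu_starting(cpu);

	printf("cpu0 and cpu1 share: %s\n",
	       cpus[0].shared == cpus[1].shared ? "yes" : "no");

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		cpu_dying(cpu);
	return 0;
}
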
@@ -1436,7 +1532,6 @@ static __init int intel_pmu_init(void)
1436 1532
1437 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1533 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1438 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; 1534 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1439 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1535 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1441 x86_pmu.extra_regs = intel_nehalem_extra_regs; 1536 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1442 1537
@@ -1481,10 +1576,10 @@ static __init int intel_pmu_init(void)
1481 intel_pmu_lbr_init_nhm(); 1576 intel_pmu_lbr_init_nhm();
1482 1577
1483 x86_pmu.event_constraints = intel_westmere_event_constraints; 1578 x86_pmu.event_constraints = intel_westmere_event_constraints;
1484 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1485 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1579 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1486 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; 1580 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1487 x86_pmu.extra_regs = intel_westmere_extra_regs; 1581 x86_pmu.extra_regs = intel_westmere_extra_regs;
1582 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1488 1583
1489 /* UOPS_ISSUED.STALLED_CYCLES */ 1584 /* UOPS_ISSUED.STALLED_CYCLES */
1490 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1585 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1502,6 +1597,10 @@ static __init int intel_pmu_init(void)
1502 1597
1503 x86_pmu.event_constraints = intel_snb_event_constraints; 1598 x86_pmu.event_constraints = intel_snb_event_constraints;
1504 x86_pmu.pebs_constraints = intel_snb_pebs_events; 1599 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1600 x86_pmu.extra_regs = intel_snb_extra_regs;
1601 /* all extra regs are per-cpu when HT is on */
1602 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1603 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
1505 1604
1506 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ 1605 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1507 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1606 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1512,11 +1611,19 @@ static __init int intel_pmu_init(void)
1512 break; 1611 break;
1513 1612
1514 default: 1613 default:
1515 /* 1614 switch (x86_pmu.version) {
1516 * default constraints for v2 and up 1615 case 1:
1517 */ 1616 x86_pmu.event_constraints = intel_v1_event_constraints;
1518 x86_pmu.event_constraints = intel_gen_event_constraints; 1617 pr_cont("generic architected perfmon v1, ");
1519 pr_cont("generic architected perfmon, "); 1618 break;
1619 default:
1620 /*
1621 * default constraints for v2 and up
1622 */
1623 x86_pmu.event_constraints = intel_gen_event_constraints;
1624 pr_cont("generic architected perfmon, ");
1625 break;
1626 }
1520 } 1627 }
1521 return 0; 1628 return 0;
1522} 1629}
@@ -1528,4 +1635,8 @@ static int intel_pmu_init(void)
1528 return 0; 1635 return 0;
1529} 1636}
1530 1637
1638static struct intel_shared_regs *allocate_shared_regs(int cpu)
1639{
1640 return NULL;
1641}
1531#endif /* CONFIG_CPU_SUP_INTEL */ 1642#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b8ee25..1b1ef3addcfd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
340 */ 340 */
341 perf_prepare_sample(&header, &data, event, &regs); 341 perf_prepare_sample(&header, &data, event, &regs);
342 342
343 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) 343 if (perf_output_begin(&handle, event, header.size * (top - at)))
344 return 1; 344 return 1;
345 345
346 for (; at < top; at++) { 346 for (; at < top; at++) {
@@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
616 else 616 else
617 regs.flags &= ~PERF_EFLAGS_EXACT; 617 regs.flags &= ~PERF_EFLAGS_EXACT;
618 618
619 if (perf_event_overflow(event, 1, &data, &regs)) 619 if (perf_event_overflow(event, &data, &regs))
620 x86_pmu_stop(event, 0); 620 x86_pmu_stop(event, 0);
621} 621}
622 622
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ead584fb6a7d..7809d2bcb209 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -554,13 +554,102 @@ static __initconst const u64 p4_hw_cache_event_ids
554 [ C(RESULT_MISS) ] = -1, 554 [ C(RESULT_MISS) ] = -1,
555 }, 555 },
556 }, 556 },
557 [ C(NODE) ] = {
558 [ C(OP_READ) ] = {
559 [ C(RESULT_ACCESS) ] = -1,
560 [ C(RESULT_MISS) ] = -1,
561 },
562 [ C(OP_WRITE) ] = {
563 [ C(RESULT_ACCESS) ] = -1,
564 [ C(RESULT_MISS) ] = -1,
565 },
566 [ C(OP_PREFETCH) ] = {
567 [ C(RESULT_ACCESS) ] = -1,
568 [ C(RESULT_MISS) ] = -1,
569 },
570 },
557}; 571};
558 572
573/*
574 * Because Netburst is quite restricted in how many
575 * identical events may run simultaneously, we introduce event aliases,
576 * i.e. different events which have the same functionality but
577 * use non-intersecting resources (ESCR/CCCR/counter registers).
578 *
579 * This allows us to relax the restrictions a bit and run two or more
580 * identical events together.
581 *
582 * Never set any custom internal bits such as P4_CONFIG_HT,
583 * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC; they are
584 * either kept up to date automatically or not applicable at all.
585 */
586struct p4_event_alias {
587 u64 original;
588 u64 alternative;
589} p4_event_aliases[] = {
590 {
591 /*
592 * Non-halted cycles can be substituted with non-sleeping cycles (see
593 * Intel SDM Vol3b for details). We need this alias to be able
594 * to run nmi-watchdog and 'perf top' (or any other user space tool
595 * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
596 * simultaneously.
597 */
598 .original =
599 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
600 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
601 .alternative =
602 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
603 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
604 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
605 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
606 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
607 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
608 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
609 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
610 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
611 p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
612 P4_CCCR_COMPARE),
613 },
614};
615
616static u64 p4_get_alias_event(u64 config)
617{
618 u64 config_match;
619 int i;
620
621 /*
622 * Only an event carrying the special mark is allowed,
623 * so we can be sure it didn't come in as a malformed
624 * RAW event.
625 */
626 if (!(config & P4_CONFIG_ALIASABLE))
627 return 0;
628
629 config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
630
631 for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
632 if (config_match == p4_event_aliases[i].original) {
633 config_match = p4_event_aliases[i].alternative;
634 break;
635 } else if (config_match == p4_event_aliases[i].alternative) {
636 config_match = p4_event_aliases[i].original;
637 break;
638 }
639 }
640
641 if (i >= ARRAY_SIZE(p4_event_aliases))
642 return 0;
643
644 return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
645}
646
559static u64 p4_general_events[PERF_COUNT_HW_MAX] = { 647static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
560 /* non-halted CPU clocks */ 648 /* non-halted CPU clocks */
561 [PERF_COUNT_HW_CPU_CYCLES] = 649 [PERF_COUNT_HW_CPU_CYCLES] =
562 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | 650 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
563 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), 651 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) |
652 P4_CONFIG_ALIASABLE,
564 653
565 /* 654 /*
566 * retired instructions 655 * retired instructions
@@ -945,7 +1034,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
945 1034
946 if (!x86_perf_event_set_period(event)) 1035 if (!x86_perf_event_set_period(event))
947 continue; 1036 continue;
948 if (perf_event_overflow(event, 1, &data, regs)) 1037 if (perf_event_overflow(event, &data, regs))
949 x86_pmu_stop(event, 0); 1038 x86_pmu_stop(event, 0);
950 } 1039 }
951 1040
@@ -1120,6 +1209,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1120 struct p4_event_bind *bind; 1209 struct p4_event_bind *bind;
1121 unsigned int i, thread, num; 1210 unsigned int i, thread, num;
1122 int cntr_idx, escr_idx; 1211 int cntr_idx, escr_idx;
1212 u64 config_alias;
1213 int pass;
1123 1214
1124 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 1215 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1125 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); 1216 bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
@@ -1128,6 +1219,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1128 1219
1129 hwc = &cpuc->event_list[i]->hw; 1220 hwc = &cpuc->event_list[i]->hw;
1130 thread = p4_ht_thread(cpu); 1221 thread = p4_ht_thread(cpu);
1222 pass = 0;
1223
1224again:
1225 /*
1226 * It's possible to hit a circular lock
1227 * between original and alternative events
1228 * if both are scheduled already.
1229 */
1230 if (pass > 2)
1231 goto done;
1232
1131 bind = p4_config_get_bind(hwc->config); 1233 bind = p4_config_get_bind(hwc->config);
1132 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); 1234 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
1133 if (unlikely(escr_idx == -1)) 1235 if (unlikely(escr_idx == -1))
@@ -1141,8 +1243,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
1141 } 1243 }
1142 1244
1143 cntr_idx = p4_next_cntr(thread, used_mask, bind); 1245 cntr_idx = p4_next_cntr(thread, used_mask, bind);
1144 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) 1246 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
1145 goto done; 1247 /*
1248 * Check whether an event alias is still available.
1249 */
1250 config_alias = p4_get_alias_event(hwc->config);
1251 if (!config_alias)
1252 goto done;
1253 hwc->config = config_alias;
1254 pass++;
1255 goto again;
1256 }
1146 1257
1147 p4_pmu_swap_config_ts(hwc, cpu); 1258 p4_pmu_swap_config_ts(hwc, cpu);
1148 if (assign) 1259 if (assign)
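
The two p4 hunks work together: p4_get_alias_event() maps an aliasable config onto its counterpart encoding, and the scheduler retries with that counterpart, at most twice, when the original ESCR/counter pair is already in use. A toy model of the control flow, with made-up config values standing in for the packed ESCR/CCCR encodings:

/*
 * Sketch of the aliasing idea: if the resources needed by an event are
 * busy, look up an alternative encoding that counts the same thing on
 * different resources and retry a bounded number of times.
 * Values and helpers are illustrative stand-ins only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct event_alias {
	uint64_t original;
	uint64_t alternative;
};

static const struct event_alias aliases[] = {
	{ .original = 0x100, .alternative = 0x200 },   /* stand-in configs */
};

/* return the "other" encoding for an aliasable config, 0 if none */
static uint64_t get_alias(uint64_t config)
{
	unsigned int i;

	for (i = 0; i < sizeof(aliases) / sizeof(aliases[0]); i++) {
		if (config == aliases[i].original)
			return aliases[i].alternative;
		if (config == aliases[i].alternative)
			return aliases[i].original;
	}
	return 0;
}

/* pretend resource check: only config 0x200 still has a free counter */
static bool resources_free(uint64_t config)
{
	return config == 0x200;
}

static bool schedule_event(uint64_t config)
{
	int pass = 0;

	while (pass <= 2) {                 /* avoid ping-ponging forever */
		if (resources_free(config))
			return true;
		config = get_alias(config);
		if (!config)
			return false;       /* no alias to fall back to */
		pass++;
	}
	return false;
}

int main(void)
{
	printf("scheduled via alias: %s\n",
	       schedule_event(0x100) ? "yes" : "no");
	return 0;
}
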
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 9aeb78a23de4..a621f3427685 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -134,6 +134,24 @@ static int __init add_bus_probe(void)
134module_init(add_bus_probe); 134module_init(add_bus_probe);
135 135
136#ifdef CONFIG_PCI 136#ifdef CONFIG_PCI
137struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
138{
139 struct device_node *np;
140
141 for_each_node_by_type(np, "pci") {
142 const void *prop;
143 unsigned int bus_min;
144
145 prop = of_get_property(np, "bus-range", NULL);
146 if (!prop)
147 continue;
148 bus_min = be32_to_cpup(prop);
149 if (bus->number == bus_min)
150 return np;
151 }
152 return NULL;
153}
154
137static int x86_of_pci_irq_enable(struct pci_dev *dev) 155static int x86_of_pci_irq_enable(struct pci_dev *dev)
138{ 156{
139 struct of_irq oirq; 157 struct of_irq oirq;
@@ -165,50 +183,8 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev)
165 183
166void __cpuinit x86_of_pci_init(void) 184void __cpuinit x86_of_pci_init(void)
167{ 185{
168 struct device_node *np;
169
170 pcibios_enable_irq = x86_of_pci_irq_enable; 186 pcibios_enable_irq = x86_of_pci_irq_enable;
171 pcibios_disable_irq = x86_of_pci_irq_disable; 187 pcibios_disable_irq = x86_of_pci_irq_disable;
172
173 for_each_node_by_type(np, "pci") {
174 const void *prop;
175 struct pci_bus *bus;
176 unsigned int bus_min;
177 struct device_node *child;
178
179 prop = of_get_property(np, "bus-range", NULL);
180 if (!prop)
181 continue;
182 bus_min = be32_to_cpup(prop);
183
184 bus = pci_find_bus(0, bus_min);
185 if (!bus) {
186 printk(KERN_ERR "Can't find a node for bus %s.\n",
187 np->full_name);
188 continue;
189 }
190
191 if (bus->self)
192 bus->self->dev.of_node = np;
193 else
194 bus->dev.of_node = np;
195
196 for_each_child_of_node(np, child) {
197 struct pci_dev *dev;
198 u32 devfn;
199
200 prop = of_get_property(child, "reg", NULL);
201 if (!prop)
202 continue;
203
204 devfn = (be32_to_cpup(prop) >> 8) & 0xff;
205 dev = pci_get_slot(bus, devfn);
206 if (!dev)
207 continue;
208 dev->dev.of_node = child;
209 pci_dev_put(dev);
210 }
211 }
212} 188}
213#endif 189#endif
214 190
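
The refactor moves the bus matching out of x86_of_pci_init() and into the pcibios_get_phb_of_node() hook, so the PCI core attaches the of_node while it builds the bus. The match itself is only "first big-endian cell of the bus-range property equals the bus number"; a tiny sketch of that decoding against fake property bytes, with no real OF calls:

/*
 * Sketch of the "bus-range" match: the property is a pair of big-endian
 * 32-bit cells, and the host bridge node is the one whose first cell
 * equals the bus number. Device-tree access is faked with a byte array.
 */
#include <stdint.h>
#include <stdio.h>

/* equivalent of be32_to_cpup() for a raw property pointer */
static uint32_t be32_read(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

int main(void)
{
	/* a fake "bus-range" property: <0x00 0x3f> */
	static const uint8_t bus_range[8] = { 0, 0, 0, 0x00, 0, 0, 0, 0x3f };
	unsigned int bus_number = 0;
	uint32_t bus_min = be32_read(bus_range);

	if (bus_number == bus_min)
		printf("bus %u is served by this host bridge node\n",
		       bus_number);
	return 0;
}
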
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index e71c98d3c0d2..19853ad8afc5 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -105,34 +105,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
105} 105}
106 106
107/* 107/*
108 * We are returning from the irq stack and go to the previous one.
109 * If the previous stack is also in the irq stack, then bp in the first
110 * frame of the irq stack points to the previous, interrupted one.
111 * Otherwise we have another level of indirection: We first save
112 * the bp of the previous stack, then we switch the stack to the irq one
113 * and save a new bp that links to the previous one.
114 * (See save_args())
115 */
116static inline unsigned long
117fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
118 unsigned long *irq_stack, unsigned long *irq_stack_end)
119{
120#ifdef CONFIG_FRAME_POINTER
121 struct stack_frame *frame = (struct stack_frame *)bp;
122 unsigned long next;
123
124 if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
125 if (!probe_kernel_address(&frame->next_frame, next))
126 return next;
127 else
128 WARN_ONCE(1, "Perf: bad frame pointer = %p in "
129 "callchain\n", &frame->next_frame);
130 }
131#endif
132 return bp;
133}
134
135/*
136 * x86-64 can have up to three kernel stacks: 108 * x86-64 can have up to three kernel stacks:
137 * process stack 109 * process stack
138 * interrupt stack 110 * interrupt stack
@@ -155,9 +127,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
155 task = current; 127 task = current;
156 128
157 if (!stack) { 129 if (!stack) {
158 stack = &dummy; 130 if (regs)
159 if (task && task != current) 131 stack = (unsigned long *)regs->sp;
132 else if (task && task != current)
160 stack = (unsigned long *)task->thread.sp; 133 stack = (unsigned long *)task->thread.sp;
134 else
135 stack = &dummy;
161 } 136 }
162 137
163 if (!bp) 138 if (!bp)
@@ -205,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
205 * pointer (index -1 to end) in the IRQ stack: 180 * pointer (index -1 to end) in the IRQ stack:
206 */ 181 */
207 stack = (unsigned long *) (irq_stack_end[-1]); 182 stack = (unsigned long *) (irq_stack_end[-1]);
208 bp = fixup_bp_irq_link(bp, stack, irq_stack,
209 irq_stack_end);
210 irq_stack_end = NULL; 183 irq_stack_end = NULL;
211 ops->stack(data, "EOI"); 184 ops->stack(data, "EOI");
212 continue; 185 continue;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8a445a0c989e..e13329d800c8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -9,6 +9,8 @@
9/* 9/*
10 * entry.S contains the system-call and fault low-level handling routines. 10 * entry.S contains the system-call and fault low-level handling routines.
11 * 11 *
12 * Some of this is documented in Documentation/x86/entry_64.txt
13 *
12 * NOTE: This code handles signal-recognition, which happens every time 14 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call. 15 * after an interrupt and after each system call.
14 * 16 *
@@ -297,27 +299,26 @@ ENDPROC(native_usergs_sysret64)
297 .endm 299 .endm
298 300
299/* save partial stack frame */ 301/* save partial stack frame */
300 .pushsection .kprobes.text, "ax" 302 .macro SAVE_ARGS_IRQ
301ENTRY(save_args)
302 XCPT_FRAME
303 cld 303 cld
304 /* 304 /* start from rbp in pt_regs and jump over */
305 * start from rbp in pt_regs and jump over 305 movq_cfi rdi, RDI-RBP
306 * return address. 306 movq_cfi rsi, RSI-RBP
307 */ 307 movq_cfi rdx, RDX-RBP
308 movq_cfi rdi, RDI+8-RBP 308 movq_cfi rcx, RCX-RBP
309 movq_cfi rsi, RSI+8-RBP 309 movq_cfi rax, RAX-RBP
310 movq_cfi rdx, RDX+8-RBP 310 movq_cfi r8, R8-RBP
311 movq_cfi rcx, RCX+8-RBP 311 movq_cfi r9, R9-RBP
312 movq_cfi rax, RAX+8-RBP 312 movq_cfi r10, R10-RBP
313 movq_cfi r8, R8+8-RBP 313 movq_cfi r11, R11-RBP
314 movq_cfi r9, R9+8-RBP 314
315 movq_cfi r10, R10+8-RBP 315 /* Save rbp so that we can unwind from get_irq_regs() */
316 movq_cfi r11, R11+8-RBP 316 movq_cfi rbp, 0
317 317
318 leaq -RBP+8(%rsp),%rdi /* arg1 for handler */ 318 /* Save previous stack value */
319 movq_cfi rbp, 8 /* push %rbp */ 319 movq %rsp, %rsi
320 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ 320
321 leaq -RBP(%rsp),%rdi /* arg1 for handler */
321 testl $3, CS(%rdi) 322 testl $3, CS(%rdi)
322 je 1f 323 je 1f
323 SWAPGS 324 SWAPGS
@@ -329,19 +330,14 @@ ENTRY(save_args)
329 */ 330 */
3301: incl PER_CPU_VAR(irq_count) 3311: incl PER_CPU_VAR(irq_count)
331 jne 2f 332 jne 2f
332 popq_cfi %rax /* move return address... */
333 mov PER_CPU_VAR(irq_stack_ptr),%rsp 333 mov PER_CPU_VAR(irq_stack_ptr),%rsp
334 EMPTY_FRAME 0 334 EMPTY_FRAME 0
335 pushq_cfi %rbp /* backlink for unwinder */ 335
336 pushq_cfi %rax /* ... to the new stack */ 3362: /* Store previous stack value */
337 /* 337 pushq %rsi
338 * We entered an interrupt context - irqs are off: 338 /* We entered an interrupt context - irqs are off: */
339 */ 339 TRACE_IRQS_OFF
3402: TRACE_IRQS_OFF 340 .endm
341 ret
342 CFI_ENDPROC
343END(save_args)
344 .popsection
345 341
346ENTRY(save_rest) 342ENTRY(save_rest)
347 PARTIAL_FRAME 1 REST_SKIP+8 343 PARTIAL_FRAME 1 REST_SKIP+8
@@ -473,7 +469,7 @@ ENTRY(system_call_after_swapgs)
473 * and short: 469 * and short:
474 */ 470 */
475 ENABLE_INTERRUPTS(CLBR_NONE) 471 ENABLE_INTERRUPTS(CLBR_NONE)
476 SAVE_ARGS 8,1 472 SAVE_ARGS 8,0
477 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 473 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
478 movq %rcx,RIP-ARGOFFSET(%rsp) 474 movq %rcx,RIP-ARGOFFSET(%rsp)
479 CFI_REL_OFFSET rip,RIP-ARGOFFSET 475 CFI_REL_OFFSET rip,RIP-ARGOFFSET
@@ -508,7 +504,7 @@ sysret_check:
508 TRACE_IRQS_ON 504 TRACE_IRQS_ON
509 movq RIP-ARGOFFSET(%rsp),%rcx 505 movq RIP-ARGOFFSET(%rsp),%rcx
510 CFI_REGISTER rip,rcx 506 CFI_REGISTER rip,rcx
511 RESTORE_ARGS 0,-ARG_SKIP,1 507 RESTORE_ARGS 1,-ARG_SKIP,0
512 /*CFI_REGISTER rflags,r11*/ 508 /*CFI_REGISTER rflags,r11*/
513 movq PER_CPU_VAR(old_rsp), %rsp 509 movq PER_CPU_VAR(old_rsp), %rsp
514 USERGS_SYSRET64 510 USERGS_SYSRET64
@@ -791,7 +787,7 @@ END(interrupt)
791 /* reserve pt_regs for scratch regs and rbp */ 787 /* reserve pt_regs for scratch regs and rbp */
792 subq $ORIG_RAX-RBP, %rsp 788 subq $ORIG_RAX-RBP, %rsp
793 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP 789 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
794 call save_args 790 SAVE_ARGS_IRQ
795 PARTIAL_FRAME 0 791 PARTIAL_FRAME 0
796 call \func 792 call \func
797 .endm 793 .endm
@@ -814,15 +810,14 @@ ret_from_intr:
814 DISABLE_INTERRUPTS(CLBR_NONE) 810 DISABLE_INTERRUPTS(CLBR_NONE)
815 TRACE_IRQS_OFF 811 TRACE_IRQS_OFF
816 decl PER_CPU_VAR(irq_count) 812 decl PER_CPU_VAR(irq_count)
817 leaveq
818 813
819 CFI_RESTORE rbp 814 /* Restore saved previous stack */
815 popq %rsi
816 leaq 16(%rsi), %rsp
817
820 CFI_DEF_CFA_REGISTER rsp 818 CFI_DEF_CFA_REGISTER rsp
821 CFI_ADJUST_CFA_OFFSET -8 819 CFI_ADJUST_CFA_OFFSET -16
822 820
823 /* we did not save rbx, restore only from ARGOFFSET */
824 addq $8, %rsp
825 CFI_ADJUST_CFA_OFFSET -8
826exit_intr: 821exit_intr:
827 GET_THREAD_INFO(%rcx) 822 GET_THREAD_INFO(%rcx)
828 testl $3,CS-ARGOFFSET(%rsp) 823 testl $3,CS-ARGOFFSET(%rsp)
@@ -858,7 +853,7 @@ retint_restore_args: /* return to kernel space */
858 */ 853 */
859 TRACE_IRQS_IRETQ 854 TRACE_IRQS_IRETQ
860restore_args: 855restore_args:
861 RESTORE_ARGS 0,8,0 856 RESTORE_ARGS 1,8,1
862 857
863irq_return: 858irq_return:
864 INTERRUPT_RETURN 859 INTERRUPT_RETURN
@@ -991,11 +986,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
991apicinterrupt THERMAL_APIC_VECTOR \ 986apicinterrupt THERMAL_APIC_VECTOR \
992 thermal_interrupt smp_thermal_interrupt 987 thermal_interrupt smp_thermal_interrupt
993 988
994#ifdef CONFIG_X86_MCE
995apicinterrupt MCE_SELF_VECTOR \
996 mce_self_interrupt smp_mce_self_interrupt
997#endif
998
999#ifdef CONFIG_SMP 989#ifdef CONFIG_SMP
1000apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ 990apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1001 call_function_single_interrupt smp_call_function_single_interrupt 991 call_function_single_interrupt smp_call_function_single_interrupt
@@ -1121,6 +1111,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1121zeroentry coprocessor_error do_coprocessor_error 1111zeroentry coprocessor_error do_coprocessor_error
1122errorentry alignment_check do_alignment_check 1112errorentry alignment_check do_alignment_check
1123zeroentry simd_coprocessor_error do_simd_coprocessor_error 1113zeroentry simd_coprocessor_error do_simd_coprocessor_error
1114zeroentry emulate_vsyscall do_emulate_vsyscall
1115
1124 1116
1125 /* Reload gs selector with exception handling */ 1117 /* Reload gs selector with exception handling */
1126 /* edi: new selector */ 1118 /* edi: new selector */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 6781765b3a0d..4aecc54236a9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
4#include <linux/sysdev.h> 4#include <linux/sysdev.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/i8253.h>
7#include <linux/slab.h> 8#include <linux/slab.h>
8#include <linux/hpet.h> 9#include <linux/hpet.h>
9#include <linux/init.h> 10#include <linux/init.h>
@@ -12,8 +13,8 @@
12#include <linux/io.h> 13#include <linux/io.h>
13 14
14#include <asm/fixmap.h> 15#include <asm/fixmap.h>
15#include <asm/i8253.h>
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17#include <asm/time.h>
17 18
18#define HPET_MASK CLOCKSOURCE_MASK(32) 19#define HPET_MASK CLOCKSOURCE_MASK(32)
19 20
@@ -71,7 +72,7 @@ static inline void hpet_set_mapping(void)
71{ 72{
72 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); 73 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
73#ifdef CONFIG_X86_64 74#ifdef CONFIG_X86_64
74 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); 75 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
75#endif 76#endif
76} 77}
77 78
@@ -738,13 +739,6 @@ static cycle_t read_hpet(struct clocksource *cs)
738 return (cycle_t)hpet_readl(HPET_COUNTER); 739 return (cycle_t)hpet_readl(HPET_COUNTER);
739} 740}
740 741
741#ifdef CONFIG_X86_64
742static cycle_t __vsyscall_fn vread_hpet(void)
743{
744 return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
745}
746#endif
747
748static struct clocksource clocksource_hpet = { 742static struct clocksource clocksource_hpet = {
749 .name = "hpet", 743 .name = "hpet",
750 .rating = 250, 744 .rating = 250,
@@ -753,7 +747,7 @@ static struct clocksource clocksource_hpet = {
753 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 747 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
754 .resume = hpet_resume_counter, 748 .resume = hpet_resume_counter,
755#ifdef CONFIG_X86_64 749#ifdef CONFIG_X86_64
756 .vread = vread_hpet, 750 .archdata = { .vclock_mode = VCLOCK_HPET },
757#endif 751#endif
758}; 752};
759 753
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index fb66dc9e36cb..f2b96de3c7c1 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,113 +3,24 @@
3 * 3 *
4 */ 4 */
5#include <linux/clockchips.h> 5#include <linux/clockchips.h>
6#include <linux/interrupt.h>
7#include <linux/spinlock.h>
8#include <linux/jiffies.h>
9#include <linux/module.h> 6#include <linux/module.h>
10#include <linux/timex.h> 7#include <linux/timex.h>
11#include <linux/delay.h> 8#include <linux/i8253.h>
12#include <linux/init.h>
13#include <linux/io.h>
14 9
15#include <asm/i8253.h>
16#include <asm/hpet.h> 10#include <asm/hpet.h>
11#include <asm/time.h>
17#include <asm/smp.h> 12#include <asm/smp.h>
18 13
19DEFINE_RAW_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock);
21
22/* 14/*
23 * HPET replaces the PIT, when enabled. So we need to know, which of 15 * HPET replaces the PIT, when enabled. So we need to know, which of
24 * the two timers is used 16 * the two timers is used
25 */ 17 */
26struct clock_event_device *global_clock_event; 18struct clock_event_device *global_clock_event;
27 19
28/*
29 * Initialize the PIT timer.
30 *
31 * This is also called after resume to bring the PIT into operation again.
32 */
33static void init_pit_timer(enum clock_event_mode mode,
34 struct clock_event_device *evt)
35{
36 raw_spin_lock(&i8253_lock);
37
38 switch (mode) {
39 case CLOCK_EVT_MODE_PERIODIC:
40 /* binary, mode 2, LSB/MSB, ch 0 */
41 outb_pit(0x34, PIT_MODE);
42 outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */
43 outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */
44 break;
45
46 case CLOCK_EVT_MODE_SHUTDOWN:
47 case CLOCK_EVT_MODE_UNUSED:
48 if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
49 evt->mode == CLOCK_EVT_MODE_ONESHOT) {
50 outb_pit(0x30, PIT_MODE);
51 outb_pit(0, PIT_CH0);
52 outb_pit(0, PIT_CH0);
53 }
54 break;
55
56 case CLOCK_EVT_MODE_ONESHOT:
57 /* One shot setup */
58 outb_pit(0x38, PIT_MODE);
59 break;
60
61 case CLOCK_EVT_MODE_RESUME:
62 /* Nothing to do here */
63 break;
64 }
65 raw_spin_unlock(&i8253_lock);
66}
67
68/*
69 * Program the next event in oneshot mode
70 *
71 * Delta is given in PIT ticks
72 */
73static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
74{
75 raw_spin_lock(&i8253_lock);
76 outb_pit(delta & 0xff , PIT_CH0); /* LSB */
77 outb_pit(delta >> 8 , PIT_CH0); /* MSB */
78 raw_spin_unlock(&i8253_lock);
79
80 return 0;
81}
82
83/*
84 * On UP the PIT can serve all of the possible timer functions. On SMP systems
85 * it can be solely used for the global tick.
86 *
87 * The profiling and update capabilities are switched off once the local apic is
88 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
89 * !using_apic_timer decisions in do_timer_interrupt_hook()
90 */
91static struct clock_event_device pit_ce = {
92 .name = "pit",
93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
94 .set_mode = init_pit_timer,
95 .set_next_event = pit_next_event,
96 .irq = 0,
97};
98
99/*
100 * Initialize the conversion factor and the min/max deltas of the clock event
101 * structure and register the clock event source with the framework.
102 */
103void __init setup_pit_timer(void) 20void __init setup_pit_timer(void)
104{ 21{
105 /* 22 clockevent_i8253_init(true);
106 * Start pit with the boot cpu mask and make it global after the 23 global_clock_event = &i8253_clockevent;
107 * IO_APIC has been initialized.
108 */
109 pit_ce.cpumask = cpumask_of(smp_processor_id());
110
111 clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
112 global_clock_event = &pit_ce;
113} 24}
114 25
115#ifndef CONFIG_X86_64 26#ifndef CONFIG_X86_64
@@ -123,7 +34,7 @@ static int __init init_pit_clocksource(void)
123 * - when local APIC timer is active (PIT is switched off) 34 * - when local APIC timer is active (PIT is switched off)
124 */ 35 */
125 if (num_possible_cpus() > 1 || is_hpet_enabled() || 36 if (num_possible_cpus() > 1 || is_hpet_enabled() ||
126 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) 37 i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
127 return 0; 38 return 0;
128 39
129 return clocksource_i8253_init(); 40 return clocksource_i8253_init();
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f470e4ef993e..f09d4bbe2d2d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -272,9 +272,6 @@ static void __init apic_intr_init(void)
272#ifdef CONFIG_X86_MCE_THRESHOLD 272#ifdef CONFIG_X86_MCE_THRESHOLD
273 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 273 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
274#endif 274#endif
275#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
276 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
277#endif
278 275
279#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 276#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
280 /* self generated IPI for local APIC timer */ 277 /* self generated IPI for local APIC timer */
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5f9ecff328b5..00354d4919a9 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -608,7 +608,7 @@ int kgdb_arch_init(void)
608 return register_die_notifier(&kgdb_notifier); 608 return register_die_notifier(&kgdb_notifier);
609} 609}
610 610
611static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, 611static void kgdb_hw_overflow_handler(struct perf_event *event,
612 struct perf_sample_data *data, struct pt_regs *regs) 612 struct perf_sample_data *data, struct pt_regs *regs)
613{ 613{
614 struct task_struct *tsk = current; 614 struct task_struct *tsk = current;
@@ -638,7 +638,7 @@ void kgdb_arch_late(void)
638 for (i = 0; i < HBP_NUM; i++) { 638 for (i = 0; i < HBP_NUM; i++) {
639 if (breakinfo[i].pev) 639 if (breakinfo[i].pev)
640 continue; 640 continue;
641 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 641 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
642 if (IS_ERR((void * __force)breakinfo[i].pev)) { 642 if (IS_ERR((void * __force)breakinfo[i].pev)) {
643 printk(KERN_ERR "kgdb: Could not allocate hw" 643 printk(KERN_ERR "kgdb: Could not allocate hw"
644 "breakpoints\nDisabling the kernel debugger\n"); 644 "breakpoints\nDisabling the kernel debugger\n");
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c5610384ab16..591be0ee1934 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,8 +66,8 @@ struct microcode_amd {
66 unsigned int mpb[0]; 66 unsigned int mpb[0];
67}; 67};
68 68
69#define UCODE_CONTAINER_SECTION_HDR 8 69#define SECTION_HDR_SIZE 8
70#define UCODE_CONTAINER_HEADER_SIZE 12 70#define CONTAINER_HDR_SZ 12
71 71
72static struct equiv_cpu_entry *equiv_cpu_table; 72static struct equiv_cpu_entry *equiv_cpu_table;
73 73
@@ -157,7 +157,7 @@ static int apply_microcode_amd(int cpu)
157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) 157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
158{ 158{
159 struct cpuinfo_x86 *c = &cpu_data(cpu); 159 struct cpuinfo_x86 *c = &cpu_data(cpu);
160 unsigned int max_size, actual_size; 160 u32 max_size, actual_size;
161 161
162#define F1XH_MPB_MAX_SIZE 2048 162#define F1XH_MPB_MAX_SIZE 2048
163#define F14H_MPB_MAX_SIZE 1824 163#define F14H_MPB_MAX_SIZE 1824
@@ -175,9 +175,9 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
175 break; 175 break;
176 } 176 }
177 177
178 actual_size = buf[4] + (buf[5] << 8); 178 actual_size = *(u32 *)(buf + 4);
179 179
180 if (actual_size > size || actual_size > max_size) { 180 if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
181 pr_err("section size mismatch\n"); 181 pr_err("section size mismatch\n");
182 return 0; 182 return 0;
183 } 183 }
@@ -191,7 +191,7 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
191 struct microcode_header_amd *mc = NULL; 191 struct microcode_header_amd *mc = NULL;
192 unsigned int actual_size = 0; 192 unsigned int actual_size = 0;
193 193
194 if (buf[0] != UCODE_UCODE_TYPE) { 194 if (*(u32 *)buf != UCODE_UCODE_TYPE) {
195 pr_err("invalid type field in container file section header\n"); 195 pr_err("invalid type field in container file section header\n");
196 goto out; 196 goto out;
197 } 197 }
@@ -204,8 +204,8 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
204 if (!mc) 204 if (!mc)
205 goto out; 205 goto out;
206 206
207 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size); 207 get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
208 *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR; 208 *mc_size = actual_size + SECTION_HDR_SIZE;
209 209
210out: 210out:
211 return mc; 211 return mc;
@@ -229,9 +229,10 @@ static int install_equiv_cpu_table(const u8 *buf)
229 return -ENOMEM; 229 return -ENOMEM;
230 } 230 }
231 231
232 get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size); 232 get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
233 233
234 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 234 /* add header length */
235 return size + CONTAINER_HDR_SZ;
235} 236}
236 237
237static void free_equiv_cpu_table(void) 238static void free_equiv_cpu_table(void)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 807c2a2b80f1..82528799c5de 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target,
528 return ret; 528 return ret;
529} 529}
530 530
531static void ptrace_triggered(struct perf_event *bp, int nmi, 531static void ptrace_triggered(struct perf_event *bp,
532 struct perf_sample_data *data, 532 struct perf_sample_data *data,
533 struct pt_regs *regs) 533 struct pt_regs *regs)
534{ 534{
@@ -715,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
715 attr.bp_type = HW_BREAKPOINT_W; 715 attr.bp_type = HW_BREAKPOINT_W;
716 attr.disabled = 1; 716 attr.disabled = 1;
717 717
718 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); 718 bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
719 NULL, tsk);
719 720
720 /* 721 /*
721 * CHECKME: the previous code returned -EIO if the addr wasn't 722 * CHECKME: the previous code returned -EIO if the addr wasn't
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 8bbe8c56916d..b78643d0f9a5 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -10,7 +10,7 @@
10 10
11static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) 11static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
12{ 12{
13 u8 config, rev; 13 u8 config;
14 u16 word; 14 u16 word;
15 15
16 /* BIOS may enable hardware IRQ balancing for 16 /* BIOS may enable hardware IRQ balancing for
@@ -18,8 +18,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
18 * based platforms. 18 * based platforms.
19 * Disable SW irqbalance/affinity on those platforms. 19 * Disable SW irqbalance/affinity on those platforms.
20 */ 20 */
21 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 21 if (dev->revision > 0x9)
22 if (rev > 0x9)
23 return; 22 return;
24 23
25 /* enable access to config space*/ 24 /* enable access to config space*/
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 0c016f727695..9242436e9937 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -294,6 +294,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
294 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"), 294 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
295 }, 295 },
296 }, 296 },
297 { /* Handle reboot issue on Acer Aspire one */
298 .callback = set_bios_reboot,
299 .ident = "Acer Aspire One A110",
300 .matches = {
301 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
302 DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
303 },
304 },
297 { } 305 { }
298}; 306};
299 307
@@ -411,6 +419,30 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
411 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), 419 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
412 }, 420 },
413 }, 421 },
422 { /* Handle problems with rebooting on the Latitude E6320. */
423 .callback = set_pci_reboot,
424 .ident = "Dell Latitude E6320",
425 .matches = {
426 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
427 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
428 },
429 },
430 { /* Handle problems with rebooting on the Latitude E5420. */
431 .callback = set_pci_reboot,
432 .ident = "Dell Latitude E5420",
433 .matches = {
434 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
435 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
436 },
437 },
438 { /* Handle problems with rebooting on the Latitude E6420. */
439 .callback = set_pci_reboot,
440 .ident = "Dell Latitude E6420",
441 .matches = {
442 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
443 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
444 },
445 },
414 { } 446 { }
415}; 447};
416 448
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 41235531b11c..36818f8ec2be 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -97,6 +97,8 @@ relocate_kernel:
97 ret 97 ret
98 98
99identity_mapped: 99identity_mapped:
100 /* set return address to 0 if not preserving context */
101 pushl $0
100 /* store the start address on the stack */ 102 /* store the start address on the stack */
101 pushl %edx 103 pushl %edx
102 104
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4de8f5b3d476..7a6f3b3be3cf 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -100,6 +100,8 @@ relocate_kernel:
100 ret 100 ret
101 101
102identity_mapped: 102identity_mapped:
103 /* set return address to 0 if not preserving context */
104 pushq $0
103 /* store the start address on the stack */ 105 /* store the start address on the stack */
104 pushq %rdx 106 pushq %rdx
105 107
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 40a24932a8a1..54ddaeb221c1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -485,17 +485,18 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
485asmlinkage int 485asmlinkage int
486sys_sigsuspend(int history0, int history1, old_sigset_t mask) 486sys_sigsuspend(int history0, int history1, old_sigset_t mask)
487{ 487{
488 mask &= _BLOCKABLE; 488 sigset_t blocked;
489 spin_lock_irq(&current->sighand->siglock); 489
490 current->saved_sigmask = current->blocked; 490 current->saved_sigmask = current->blocked;
491 siginitset(&current->blocked, mask); 491
492 recalc_sigpending(); 492 mask &= _BLOCKABLE;
493 spin_unlock_irq(&current->sighand->siglock); 493 siginitset(&blocked, mask);
494 set_current_blocked(&blocked);
494 495
495 current->state = TASK_INTERRUPTIBLE; 496 current->state = TASK_INTERRUPTIBLE;
496 schedule(); 497 schedule();
497 set_restore_sigmask();
498 498
499 set_restore_sigmask();
499 return -ERESTARTNOHAND; 500 return -ERESTARTNOHAND;
500} 501}
501 502
@@ -572,10 +573,7 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
572 goto badframe; 573 goto badframe;
573 574
574 sigdelsetmask(&set, ~_BLOCKABLE); 575 sigdelsetmask(&set, ~_BLOCKABLE);
575 spin_lock_irq(&current->sighand->siglock); 576 set_current_blocked(&set);
576 current->blocked = set;
577 recalc_sigpending();
578 spin_unlock_irq(&current->sighand->siglock);
579 577
580 if (restore_sigcontext(regs, &frame->sc, &ax)) 578 if (restore_sigcontext(regs, &frame->sc, &ax))
581 goto badframe; 579 goto badframe;
@@ -653,11 +651,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
653 651
654static int 652static int
655setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 653setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
656 sigset_t *set, struct pt_regs *regs) 654 struct pt_regs *regs)
657{ 655{
658 int usig = signr_convert(sig); 656 int usig = signr_convert(sig);
657 sigset_t *set = &current->blocked;
659 int ret; 658 int ret;
660 659
660 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
661 set = &current->saved_sigmask;
662
661 /* Set up the stack frame */ 663 /* Set up the stack frame */
662 if (is_ia32) { 664 if (is_ia32) {
663 if (ka->sa.sa_flags & SA_SIGINFO) 665 if (ka->sa.sa_flags & SA_SIGINFO)
@@ -672,12 +674,13 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
672 return -EFAULT; 674 return -EFAULT;
673 } 675 }
674 676
677 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
675 return ret; 678 return ret;
676} 679}
677 680
678static int 681static int
679handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
680 sigset_t *oldset, struct pt_regs *regs) 683 struct pt_regs *regs)
681{ 684{
682 sigset_t blocked; 685 sigset_t blocked;
683 int ret; 686 int ret;
@@ -712,20 +715,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
712 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 715 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
713 regs->flags &= ~X86_EFLAGS_TF; 716 regs->flags &= ~X86_EFLAGS_TF;
714 717
715 ret = setup_rt_frame(sig, ka, info, oldset, regs); 718 ret = setup_rt_frame(sig, ka, info, regs);
716 719
717 if (ret) 720 if (ret)
718 return ret; 721 return ret;
719 722
720#ifdef CONFIG_X86_64
721 /*
722 * This has nothing to do with segment registers,
723 * despite the name. This magic affects uaccess.h
724 * macros' behavior. Reset it to the normal setting.
725 */
726 set_fs(USER_DS);
727#endif
728
729 /* 723 /*
730 * Clear the direction flag as per the ABI for function entry. 724 * Clear the direction flag as per the ABI for function entry.
731 */ 725 */
@@ -767,7 +761,6 @@ static void do_signal(struct pt_regs *regs)
767 struct k_sigaction ka; 761 struct k_sigaction ka;
768 siginfo_t info; 762 siginfo_t info;
769 int signr; 763 int signr;
770 sigset_t *oldset;
771 764
772 /* 765 /*
773 * We want the common case to go fast, which is why we may in certain 766 * We want the common case to go fast, which is why we may in certain
@@ -779,23 +772,10 @@ static void do_signal(struct pt_regs *regs)
779 if (!user_mode(regs)) 772 if (!user_mode(regs))
780 return; 773 return;
781 774
782 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
783 oldset = &current->saved_sigmask;
784 else
785 oldset = &current->blocked;
786
787 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 775 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
788 if (signr > 0) { 776 if (signr > 0) {
789 /* Whee! Actually deliver the signal. */ 777 /* Whee! Actually deliver the signal. */
790 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 778 handle_signal(signr, &info, &ka, regs);
791 /*
792 * A signal was successfully delivered; the saved
793 * sigmask will have been stored in the signal frame,
794 * and will be restored by sigreturn, so we can simply
795 * clear the TS_RESTORE_SIGMASK flag.
796 */
797 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
798 }
799 return; 779 return;
800 } 780 }
801 781
@@ -823,7 +803,7 @@ static void do_signal(struct pt_regs *regs)
823 */ 803 */
824 if (current_thread_info()->status & TS_RESTORE_SIGMASK) { 804 if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
825 current_thread_info()->status &= ~TS_RESTORE_SIGMASK; 805 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
826 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); 806 set_current_blocked(&current->saved_sigmask);
827 } 807 }
828} 808}
829 809
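
The signal.c changes concentrate the sigmask bookkeeping: setup_rt_frame() itself now picks the saved mask when TS_RESTORE_SIGMASK is pending and clears the flag once the frame has been written, while the open-coded siglock sequences are replaced by set_current_blocked(). A simplified single-task sketch of that flow, with stand-in fields rather than the real thread_info flags:

/*
 * Sketch of the sigmask bookkeeping after this refactor: the frame is
 * built from the saved mask when a restore is pending, and the flag is
 * cleared only once the frame has been written. Stand-in model.
 */
#include <stdbool.h>
#include <stdio.h>

struct task {
	unsigned long blocked;
	unsigned long saved_sigmask;
	bool          restore_sigmask;   /* stands in for TS_RESTORE_SIGMASK */
};

static int setup_rt_frame(struct task *t, unsigned long *frame_mask)
{
	/* pick the mask that must be restored on sigreturn */
	*frame_mask = t->restore_sigmask ? t->saved_sigmask : t->blocked;

	/* ... write the user-space frame here; pretend it succeeded ... */

	t->restore_sigmask = false;      /* the frame now owns the restore */
	return 0;
}

int main(void)
{
	struct task t = { .blocked = 0x2, .saved_sigmask = 0x1,
			  .restore_sigmask = true };
	unsigned long mask;

	setup_rt_frame(&t, &mask);
	printf("mask stored in frame: %#lx (restore flag now %d)\n",
	       mask, t.restore_sigmask);
	return 0;
}
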
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9fd3137230d4..9f548cb4a958 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -438,7 +438,7 @@ static void impress_friends(void)
438void __inquire_remote_apic(int apicid) 438void __inquire_remote_apic(int apicid)
439{ 439{
440 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 440 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
441 char *names[] = { "ID", "VERSION", "SPIV" }; 441 const char * const names[] = { "ID", "VERSION", "SPIV" };
442 int timeout; 442 int timeout;
443 u32 status; 443 u32 status;
444 444
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 55d9bc03f696..fdd0c6430e5a 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -66,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace)
66} 66}
67EXPORT_SYMBOL_GPL(save_stack_trace); 67EXPORT_SYMBOL_GPL(save_stack_trace);
68 68
69void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) 69void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
70{ 70{
71 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); 71 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
72 if (trace->nr_entries < trace->max_entries) 72 if (trace->nr_entries < trace->max_entries)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 30ac65df7d4e..e07a2fc876b9 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -36,6 +36,7 @@
36#include <asm/bootparam.h> 36#include <asm/bootparam.h>
37#include <asm/pgtable.h> 37#include <asm/pgtable.h>
38#include <asm/pgalloc.h> 38#include <asm/pgalloc.h>
39#include <asm/swiotlb.h>
39#include <asm/fixmap.h> 40#include <asm/fixmap.h>
40#include <asm/proto.h> 41#include <asm/proto.h>
41#include <asm/setup.h> 42#include <asm/setup.h>
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 00cbb272627f..5a64d057be57 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -11,13 +11,13 @@
11 11
12#include <linux/clockchips.h> 12#include <linux/clockchips.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/i8253.h>
14#include <linux/time.h> 15#include <linux/time.h>
15#include <linux/mca.h> 16#include <linux/mca.h>
16 17
17#include <asm/vsyscall.h> 18#include <asm/vsyscall.h>
18#include <asm/x86_init.h> 19#include <asm/x86_init.h>
19#include <asm/i8259.h> 20#include <asm/i8259.h>
20#include <asm/i8253.h>
21#include <asm/timer.h> 21#include <asm/timer.h>
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/time.h> 23#include <asm/time.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b9b67166f9de..fbc097a085ca 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -872,6 +872,12 @@ void __init trap_init(void)
872 set_bit(SYSCALL_VECTOR, used_vectors); 872 set_bit(SYSCALL_VECTOR, used_vectors);
873#endif 873#endif
874 874
875#ifdef CONFIG_X86_64
876 BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors));
877 set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall);
878 set_bit(VSYSCALL_EMU_VECTOR, used_vectors);
879#endif
880
875 /* 881 /*
876 * Should be a barrier for any external CPU state: 882 * Should be a barrier for any external CPU state:
877 */ 883 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6cc6922262af..db483369f10b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -5,7 +5,6 @@
5#include <linux/timer.h> 5#include <linux/timer.h>
6#include <linux/acpi_pmtmr.h> 6#include <linux/acpi_pmtmr.h>
7#include <linux/cpufreq.h> 7#include <linux/cpufreq.h>
8#include <linux/dmi.h>
9#include <linux/delay.h> 8#include <linux/delay.h>
10#include <linux/clocksource.h> 9#include <linux/clocksource.h>
11#include <linux/percpu.h> 10#include <linux/percpu.h>
@@ -777,7 +776,7 @@ static struct clocksource clocksource_tsc = {
777 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 776 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
778 CLOCK_SOURCE_MUST_VERIFY, 777 CLOCK_SOURCE_MUST_VERIFY,
779#ifdef CONFIG_X86_64 778#ifdef CONFIG_X86_64
780 .vread = vread_tsc, 779 .archdata = { .vclock_mode = VCLOCK_TSC },
781#endif 780#endif
782}; 781};
783 782
@@ -800,27 +799,6 @@ void mark_tsc_unstable(char *reason)
800 799
801EXPORT_SYMBOL_GPL(mark_tsc_unstable); 800EXPORT_SYMBOL_GPL(mark_tsc_unstable);
802 801
803static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
804{
805 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
806 d->ident);
807 tsc_unstable = 1;
808 return 0;
809}
810
811/* List of systems that have known TSC problems */
812static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
813 {
814 .callback = dmi_mark_tsc_unstable,
815 .ident = "IBM Thinkpad 380XD",
816 .matches = {
817 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
818 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
819 },
820 },
821 {}
822};
823
824static void __init check_system_tsc_reliable(void) 802static void __init check_system_tsc_reliable(void)
825{ 803{
826#ifdef CONFIG_MGEODE_LX 804#ifdef CONFIG_MGEODE_LX
@@ -1010,8 +988,6 @@ void __init tsc_init(void)
1010 lpj_fine = lpj; 988 lpj_fine = lpj;
1011 989
1012 use_tsc_delay(); 990 use_tsc_delay();
1013 /* Check and install the TSC clocksource */
1014 dmi_check_system(bad_tsc_dmi_table);
1015 991
1016 if (unsynchronized_tsc()) 992 if (unsynchronized_tsc())
1017 mark_tsc_unstable("TSCs unsynchronized"); 993 mark_tsc_unstable("TSCs unsynchronized");
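With the .vread function pointer gone, the TSC clocksource advertises its user-readability through archdata.vclock_mode, and update_vsyscall() (in the vsyscall_64.c hunks below) copies that mode into vsyscall_gtod_data. A hedged sketch of how a user-space reader is assumed to dispatch on the mode instead of calling through a kernel pointer (only VCLOCK_TSC appears in this diff; the fallback behaviour is an assumption):

notrace static cycle_t vread_cycles_sketch(void)
{
	switch (VVAR(vsyscall_gtod_data).clock.vclock_mode) {
	case VCLOCK_TSC:
		/* see the read_tsc_clamped() sketch after vread_tsc_64.c below */
		return read_tsc_clamped();
	default:
		/* no user-space clock: the caller falls back to a real syscall */
		return 0;
	}
}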
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 89aed99aafce..4aa9c54a9b76 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -161,50 +161,47 @@ SECTIONS
161 161
162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) 162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
164#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
165 ADDR(.vsyscall_0) + offset \
166 : AT(VLOAD(.vsyscall_var_ ## x)) { \
167 *(.vsyscall_var_ ## x) \
168 } \
169 x = VVIRT(.vsyscall_var_ ## x);
170 164
171 . = ALIGN(4096); 165 . = ALIGN(4096);
172 __vsyscall_0 = .; 166 __vsyscall_0 = .;
173 167
174 . = VSYSCALL_ADDR; 168 . = VSYSCALL_ADDR;
175 .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { 169 .vsyscall : AT(VLOAD(.vsyscall)) {
176 *(.vsyscall_0) 170 *(.vsyscall_0)
177 } :user
178 171
179 . = ALIGN(L1_CACHE_BYTES); 172 . = 1024;
180 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
181 *(.vsyscall_fn)
182 }
183
184 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
185 *(.vsyscall_1) 173 *(.vsyscall_1)
186 }
187 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
188 *(.vsyscall_2)
189 }
190 174
191 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { 175 . = 2048;
192 *(.vsyscall_3) 176 *(.vsyscall_2)
193 }
194
195#define __VVAR_KERNEL_LDS
196#include <asm/vvar.h>
197#undef __VVAR_KERNEL_LDS
198 177
199 . = __vsyscall_0 + PAGE_SIZE; 178 . = 4096; /* Pad the whole page. */
179 } :user =0xcc
180 . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE);
200 181
201#undef VSYSCALL_ADDR 182#undef VSYSCALL_ADDR
202#undef VLOAD_OFFSET 183#undef VLOAD_OFFSET
203#undef VLOAD 184#undef VLOAD
204#undef VVIRT_OFFSET 185#undef VVIRT_OFFSET
205#undef VVIRT 186#undef VVIRT
187
188 __vvar_page = .;
189
190 .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
191
192 /* Place all vvars at the offsets in asm/vvar.h. */
193#define EMIT_VVAR(name, offset) \
194 . = offset; \
195 *(.vvar_ ## name)
196#define __VVAR_KERNEL_LDS
197#include <asm/vvar.h>
198#undef __VVAR_KERNEL_LDS
206#undef EMIT_VVAR 199#undef EMIT_VVAR
207 200
201 } :data
202
203 . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
204
208#endif /* CONFIG_X86_64 */ 205#endif /* CONFIG_X86_64 */
209 206
210 /* Init code and data - will be freed after init */ 207 /* Init code and data - will be freed after init */
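The simplified script collapses the per-slot output sections into a single .vsyscall section: three 1 KB slots at fixed offsets in one user-mapped page, the remainder filled with 0xcc (int3) by the "=0xcc" fill, and the vvar variables moved out to their own __vvar_page. Assuming the conventional fixmap base of 0xffffffffff600000 for VSYSCALL_ADDR (the value itself is not shown in this diff), the slot addresses work out to:

#define VSYSCALL_BASE	0xffffffffff600000UL	/* assumed VSYSCALL_ADDR */

static const unsigned long vgettimeofday_addr = VSYSCALL_BASE + 0 * 1024; /* .vsyscall_0 */
static const unsigned long vtime_addr         = VSYSCALL_BASE + 1 * 1024; /* .vsyscall_1 */
static const unsigned long vgetcpu_addr       = VSYSCALL_BASE + 2 * 1024; /* .vsyscall_2 */
/* Any other offset in the page hits 0xcc padding and traps. */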
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
deleted file mode 100644
index a81aa9e9894c..000000000000
--- a/arch/x86/kernel/vread_tsc_64.c
+++ /dev/null
@@ -1,36 +0,0 @@
1/* This code runs in userspace. */
2
3#define DISABLE_BRANCH_PROFILING
4#include <asm/vgtod.h>
5
6notrace cycle_t __vsyscall_fn vread_tsc(void)
7{
8 cycle_t ret;
9 u64 last;
10
11 /*
12 * Empirically, a fence (of type that depends on the CPU)
13 * before rdtsc is enough to ensure that rdtsc is ordered
14 * with respect to loads. The various CPU manuals are unclear
15 * as to whether rdtsc can be reordered with later loads,
16 * but no one has ever seen it happen.
17 */
18 rdtsc_barrier();
19 ret = (cycle_t)vget_cycles();
20
21 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
22
23 if (likely(ret >= last))
24 return ret;
25
26 /*
27 * GCC likes to generate cmov here, but this branch is extremely
28 * predictable (it's just a function of time and the likely is
29 * very likely) and there's a data dependence, so force GCC
30 * to generate a branch instead. I don't barrier() because
31 * we don't actually need a barrier, and if this function
32 * ever gets inlined it will generate worse code.
33 */
34 asm volatile ("");
35 return last;
36}
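The deleted barrier-then-clamp logic does not go away: it is assumed to move to the vDSO side, selected when vclock_mode == VCLOCK_TSC. A minimal sketch of the same monotonicity clamp, reusing the names from the file removed above:

static notrace cycle_t read_tsc_clamped(void)
{
	cycle_t ret, last;

	rdtsc_barrier();		/* order rdtsc against earlier loads */
	ret  = (cycle_t)vget_cycles();	/* raw TSC read */
	last = VVAR(vsyscall_gtod_data).clock.cycle_last;

	if (likely(ret >= last))
		return ret;

	/* A slightly stale TSC on this CPU must not make time go backwards. */
	return last;
}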
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3e682184d76c..dda7dff9cef7 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 2 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
3 * Copyright 2003 Andi Kleen, SuSE Labs. 3 * Copyright 2003 Andi Kleen, SuSE Labs.
4 * 4 *
5 * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
6 *
5 * Thanks to hpa@transmeta.com for some useful hint. 7 * Thanks to hpa@transmeta.com for some useful hint.
6 * Special thanks to Ingo Molnar for his early experience with 8 * Special thanks to Ingo Molnar for his early experience with
7 * a different vsyscall implementation for Linux/IA32 and for the name. 9 * a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,10 +13,9 @@
11 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid 13 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
12 * jumping out of line if necessary. We cannot add more with this 14 * jumping out of line if necessary. We cannot add more with this
13 * mechanism because older kernels won't return -ENOSYS. 15 * mechanism because older kernels won't return -ENOSYS.
14 * If we want more than four we need a vDSO.
15 * 16 *
16 * Note: the concept clashes with user mode linux. If you use UML and 17 * Note: the concept clashes with user mode linux. UML users should
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0. 18 * use the vDSO.
18 */ 19 */
19 20
20/* Disable profiling for userspace code: */ 21/* Disable profiling for userspace code: */
@@ -32,9 +33,12 @@
32#include <linux/cpu.h> 33#include <linux/cpu.h>
33#include <linux/smp.h> 34#include <linux/smp.h>
34#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/syscalls.h>
37#include <linux/ratelimit.h>
35 38
36#include <asm/vsyscall.h> 39#include <asm/vsyscall.h>
37#include <asm/pgtable.h> 40#include <asm/pgtable.h>
41#include <asm/compat.h>
38#include <asm/page.h> 42#include <asm/page.h>
39#include <asm/unistd.h> 43#include <asm/unistd.h>
40#include <asm/fixmap.h> 44#include <asm/fixmap.h>
@@ -44,16 +48,12 @@
44#include <asm/desc.h> 48#include <asm/desc.h>
45#include <asm/topology.h> 49#include <asm/topology.h>
46#include <asm/vgtod.h> 50#include <asm/vgtod.h>
47 51#include <asm/traps.h>
48#define __vsyscall(nr) \
49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
50#define __syscall_clobber "r11","cx","memory"
51 52
52DEFINE_VVAR(int, vgetcpu_mode); 53DEFINE_VVAR(int, vgetcpu_mode);
53DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = 54DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
54{ 55{
55 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), 56 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
56 .sysctl_enabled = 1,
57}; 57};
58 58
59void update_vsyscall_tz(void) 59void update_vsyscall_tz(void)
@@ -72,179 +72,149 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
72 unsigned long flags; 72 unsigned long flags;
73 73
74 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); 74 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
75
75 /* copy vsyscall data */ 76 /* copy vsyscall data */
76 vsyscall_gtod_data.clock.vread = clock->vread; 77 vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
77 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 78 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
78 vsyscall_gtod_data.clock.mask = clock->mask; 79 vsyscall_gtod_data.clock.mask = clock->mask;
79 vsyscall_gtod_data.clock.mult = mult; 80 vsyscall_gtod_data.clock.mult = mult;
80 vsyscall_gtod_data.clock.shift = clock->shift; 81 vsyscall_gtod_data.clock.shift = clock->shift;
81 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 82 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
82 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 83 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
83 vsyscall_gtod_data.wall_to_monotonic = *wtm; 84 vsyscall_gtod_data.wall_to_monotonic = *wtm;
84 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 85 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
86
85 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 87 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
86} 88}
87 89
88/* RED-PEN may want to readd seq locking, but then the variable should be 90static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
89 * write-once. 91 const char *message)
90 */
91static __always_inline void do_get_tz(struct timezone * tz)
92{ 92{
93 *tz = VVAR(vsyscall_gtod_data).sys_tz; 93 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
94} 94 struct task_struct *tsk;
95 95
96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 96 if (!show_unhandled_signals || !__ratelimit(&rs))
97{ 97 return;
98 int ret;
99 asm volatile("syscall"
100 : "=a" (ret)
101 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
102 : __syscall_clobber );
103 return ret;
104}
105 98
106static __always_inline long time_syscall(long *t) 99 tsk = current;
107{
108 long secs;
109 asm volatile("syscall"
110 : "=a" (secs)
111 : "0" (__NR_time),"D" (t) : __syscall_clobber);
112 return secs;
113}
114 100
115static __always_inline void do_vgettimeofday(struct timeval * tv) 101 printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
116{ 102 level, tsk->comm, task_pid_nr(tsk),
117 cycle_t now, base, mask, cycle_delta; 103 message, regs->ip - 2, regs->cs,
118 unsigned seq; 104 regs->sp, regs->ax, regs->si, regs->di);
119 unsigned long mult, shift, nsec;
120 cycle_t (*vread)(void);
121 do {
122 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
123
124 vread = VVAR(vsyscall_gtod_data).clock.vread;
125 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
126 !vread)) {
127 gettimeofday(tv,NULL);
128 return;
129 }
130
131 now = vread();
132 base = VVAR(vsyscall_gtod_data).clock.cycle_last;
133 mask = VVAR(vsyscall_gtod_data).clock.mask;
134 mult = VVAR(vsyscall_gtod_data).clock.mult;
135 shift = VVAR(vsyscall_gtod_data).clock.shift;
136
137 tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
138 nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
139 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
140
141 /* calculate interval: */
142 cycle_delta = (now - base) & mask;
143 /* convert to nsecs: */
144 nsec += (cycle_delta * mult) >> shift;
145
146 while (nsec >= NSEC_PER_SEC) {
147 tv->tv_sec += 1;
148 nsec -= NSEC_PER_SEC;
149 }
150 tv->tv_usec = nsec / NSEC_PER_USEC;
151} 105}
152 106
153int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) 107static int addr_to_vsyscall_nr(unsigned long addr)
154{ 108{
155 if (tv) 109 int nr;
156 do_vgettimeofday(tv);
157 if (tz)
158 do_get_tz(tz);
159 return 0;
160}
161 110
162/* This will break when the xtime seconds get inaccurate, but that is 111 if ((addr & ~0xC00UL) != VSYSCALL_START)
163 * unlikely */ 112 return -EINVAL;
164time_t __vsyscall(1) vtime(time_t *t)
165{
166 unsigned seq;
167 time_t result;
168 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
169 return time_syscall(t);
170 113
171 do { 114 nr = (addr & 0xC00UL) >> 10;
172 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); 115 if (nr >= 3)
116 return -EINVAL;
173 117
174 result = VVAR(vsyscall_gtod_data).wall_time_sec; 118 return nr;
119}
175 120
176 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); 121void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
122{
123 struct task_struct *tsk;
124 unsigned long caller;
125 int vsyscall_nr;
126 long ret;
127
128 local_irq_enable();
129
130 /*
131 * Real 64-bit user mode code has cs == __USER_CS. Anything else
132 * is bogus.
133 */
134 if (regs->cs != __USER_CS) {
135 /*
136 * If we trapped from kernel mode, we might as well OOPS now
137 * instead of returning to some random address and OOPSing
138 * then.
139 */
140 BUG_ON(!user_mode(regs));
141
142 /* Compat mode and non-compat 32-bit CS should both segfault. */
143 warn_bad_vsyscall(KERN_WARNING, regs,
144 "illegal int 0xcc from 32-bit mode");
145 goto sigsegv;
146 }
177 147
178 if (t) 148 /*
179 *t = result; 149 * x86-ism here: regs->ip points to the instruction after the int 0xcc,
180 return result; 150 * and int 0xcc is two bytes long.
181} 151 */
152 vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
153 if (vsyscall_nr < 0) {
154 warn_bad_vsyscall(KERN_WARNING, regs,
155 "illegal int 0xcc (exploit attempt?)");
156 goto sigsegv;
157 }
182 158
183/* Fast way to get current CPU and node. 159 if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
184 This helps to do per node and per CPU caches in user space. 160 warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
185 The result is not guaranteed without CPU affinity, but usually 161 goto sigsegv;
186 works out because the scheduler tries to keep a thread on the same 162 }
187 CPU.
188 163
189 tcache must point to a two element sized long array. 164 tsk = current;
190 All arguments can be NULL. */ 165 if (seccomp_mode(&tsk->seccomp))
191long __vsyscall(2) 166 do_exit(SIGKILL);
192vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) 167
193{ 168 switch (vsyscall_nr) {
194 unsigned int p; 169 case 0:
195 unsigned long j = 0; 170 ret = sys_gettimeofday(
196 171 (struct timeval __user *)regs->di,
197 /* Fast cache - only recompute value once per jiffies and avoid 172 (struct timezone __user *)regs->si);
198 relatively costly rdtscp/cpuid otherwise. 173 break;
199 This works because the scheduler usually keeps the process 174
200 on the same CPU and this syscall doesn't guarantee its 175 case 1:
201 results anyways. 176 ret = sys_time((time_t __user *)regs->di);
202 We do this here because otherwise user space would do it on 177 break;
203 its own in a likely inferior way (no access to jiffies). 178
204 If you don't like it pass NULL. */ 179 case 2:
205 if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { 180 ret = sys_getcpu((unsigned __user *)regs->di,
206 p = tcache->blob[1]; 181 (unsigned __user *)regs->si,
207 } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { 182 0);
208 /* Load per CPU data from RDTSCP */ 183 break;
209 native_read_tscp(&p);
210 } else {
211 /* Load per CPU data from GDT */
212 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
213 } 184 }
214 if (tcache) { 185
215 tcache->blob[0] = j; 186 if (ret == -EFAULT) {
216 tcache->blob[1] = p; 187 /*
188 * Bad news -- userspace fed a bad pointer to a vsyscall.
189 *
190 * With a real vsyscall, that would have caused SIGSEGV.
191 * To make writing reliable exploits using the emulated
192 * vsyscalls harder, generate SIGSEGV here as well.
193 */
194 warn_bad_vsyscall(KERN_INFO, regs,
195 "vsyscall fault (exploit attempt?)");
196 goto sigsegv;
217 } 197 }
218 if (cpu)
219 *cpu = p & 0xfff;
220 if (node)
221 *node = p >> 12;
222 return 0;
223}
224 198
225static long __vsyscall(3) venosys_1(void) 199 regs->ax = ret;
226{
227 return -ENOSYS;
228}
229 200
230#ifdef CONFIG_SYSCTL 201 /* Emulate a ret instruction. */
231static ctl_table kernel_table2[] = { 202 regs->ip = caller;
232 { .procname = "vsyscall64", 203 regs->sp += 8;
233 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
234 .mode = 0644,
235 .proc_handler = proc_dointvec },
236 {}
237};
238 204
239static ctl_table kernel_root_table2[] = { 205 local_irq_disable();
240 { .procname = "kernel", .mode = 0555, 206 return;
241 .child = kernel_table2 }, 207
242 {} 208sigsegv:
243}; 209 regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */
244#endif 210 force_sig(SIGSEGV, current);
211 local_irq_disable();
212}
245 213
246/* Assume __initcall executes before all user space. Hopefully kmod 214/*
247 doesn't violate that. We'll find out if it does. */ 215 * Assume __initcall executes before all user space. Hopefully kmod
216 * doesn't violate that. We'll find out if it does.
217 */
248static void __cpuinit vsyscall_set_cpu(int cpu) 218static void __cpuinit vsyscall_set_cpu(int cpu)
249{ 219{
250 unsigned long d; 220 unsigned long d;
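update_vsyscall() above remains the only writer of vsyscall_gtod_data and still publishes it under the seqlock; the do_vgettimeofday() code removed above shows the matching reader pattern, which any consumer of the data is assumed to keep (sketch restricted to fields visible in this diff):

static void read_walltime_sketch(unsigned long *secs, unsigned long *nsec)
{
	unsigned seq;

	do {
		seq   = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
		*secs = VVAR(vsyscall_gtod_data).wall_time_sec;
		*nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
	} while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
}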
@@ -255,13 +225,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
255 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) 225 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
256 write_rdtscp_aux((node << 12) | cpu); 226 write_rdtscp_aux((node << 12) | cpu);
257 227
258 /* Store cpu number in limit so that it can be loaded quickly 228 /*
259 in user space in vgetcpu. 229 * Store cpu number in limit so that it can be loaded quickly
260 12 bits for the CPU and 8 bits for the node. */ 230 * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
231 */
261 d = 0x0f40000000000ULL; 232 d = 0x0f40000000000ULL;
262 d |= cpu; 233 d |= cpu;
263 d |= (node & 0xf) << 12; 234 d |= (node & 0xf) << 12;
264 d |= (node >> 4) << 48; 235 d |= (node >> 4) << 48;
236
265 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); 237 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
266} 238}
267 239
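The GDT limit written above packs the CPU number into the low 12 bits and the node number above them; the decode side, taken from the vgetcpu() code removed earlier and assumed to live in user space from now on, reads it back like this:

static void user_getcpu_sketch(unsigned *cpu, unsigned *node)
{
	unsigned int p;

	if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP)
		native_read_tscp(&p);	/* RDTSCP returns the same encoding */
	else
		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));

	*cpu  = p & 0xfff;	/* low 12 bits: CPU */
	*node = p >> 12;	/* bits above: node */
}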
@@ -275,8 +247,10 @@ static int __cpuinit
275cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) 247cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
276{ 248{
277 long cpu = (long)arg; 249 long cpu = (long)arg;
250
278 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 251 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
279 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); 252 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
253
280 return NOTIFY_DONE; 254 return NOTIFY_DONE;
281} 255}
282 256
@@ -284,25 +258,23 @@ void __init map_vsyscall(void)
284{ 258{
285 extern char __vsyscall_0; 259 extern char __vsyscall_0;
286 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); 260 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
261 extern char __vvar_page;
262 unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
287 263
288 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ 264 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
289 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); 265 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
266 __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
267 BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
290} 268}
291 269
292static int __init vsyscall_init(void) 270static int __init vsyscall_init(void)
293{ 271{
294 BUG_ON(((unsigned long) &vgettimeofday != 272 BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
295 VSYSCALL_ADDR(__NR_vgettimeofday))); 273
296 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
297 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
298 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
299#ifdef CONFIG_SYSCTL
300 register_sysctl_table(kernel_root_table2);
301#endif
302 on_each_cpu(cpu_vsyscall_init, NULL, 1); 274 on_each_cpu(cpu_vsyscall_init, NULL, 1);
303 /* notifier priority > KVM */ 275 /* notifier priority > KVM */
304 hotcpu_notifier(cpu_vsyscall_notifier, 30); 276 hotcpu_notifier(cpu_vsyscall_notifier, 30);
277
305 return 0; 278 return 0;
306} 279}
307
308__initcall(vsyscall_init); 280__initcall(vsyscall_init);
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
new file mode 100644
index 000000000000..ffa845eae5ca
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -0,0 +1,27 @@
1/*
2 * vsyscall_emu_64.S: Vsyscall emulation page
3 *
4 * Copyright (c) 2011 Andy Lutomirski
5 *
6 * Subject to the GNU General Public License, version 2
7 */
8
9#include <linux/linkage.h>
10#include <asm/irq_vectors.h>
11
12/* The unused parts of the page are filled with 0xcc by the linker script. */
13
14.section .vsyscall_0, "a"
15ENTRY(vsyscall_0)
16 int $VSYSCALL_EMU_VECTOR
17END(vsyscall_0)
18
19.section .vsyscall_1, "a"
20ENTRY(vsyscall_1)
21 int $VSYSCALL_EMU_VECTOR
22END(vsyscall_1)
23
24.section .vsyscall_2, "a"
25ENTRY(vsyscall_2)
26 int $VSYSCALL_EMU_VECTOR
27END(vsyscall_2)
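Taken together, each 1 KB slot now holds only the int $VSYSCALL_EMU_VECTOR stub above, so a legacy binary calling the fixed vsyscall addresses traps into do_emulate_vsyscall(), which performs the real syscall and emulates the missing ret. A hedged user-space illustration (not part of the patch; the base address is the conventional one assumed in the linker-script note earlier):

#include <stddef.h>
#include <sys/time.h>

int main(void)
{
	struct timeval tv;
	long (*vgtod)(struct timeval *, struct timezone *) =
		(void *)0xffffffffff600000UL;	/* slot 0: gettimeofday */

	vgtod(&tv, NULL);	/* executes int $0xcc; the kernel emulates the ret */
	return 0;
}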