Diffstat (limited to 'arch/x86/kernel')
 arch/x86/kernel/Makefile | 13
 arch/x86/kernel/acpi/cstate.c | 23
 arch/x86/kernel/apic/apic.c | 66
 arch/x86/kernel/apic/apic_flat_64.c | 1
 arch/x86/kernel/apic/apic_noop.c | 1
 arch/x86/kernel/apic/io_apic.c | 20
 arch/x86/kernel/apic/ipi.c | 1
 arch/x86/kernel/apic/summit_32.c | 1
 arch/x86/kernel/apic/x2apic_cluster.c | 1
 arch/x86/kernel/apic/x2apic_phys.c | 1
 arch/x86/kernel/check.c | 2
 arch/x86/kernel/cpu/Makefile | 3
 arch/x86/kernel/cpu/amd.c | 23
 arch/x86/kernel/cpu/centaur.c | 1
 arch/x86/kernel/cpu/common.c | 7
 arch/x86/kernel/cpu/cyrix.c | 1
 arch/x86/kernel/cpu/intel.c | 32
 arch/x86/kernel/cpu/mcheck/mce-apei.c | 14
 arch/x86/kernel/cpu/mcheck/mce.c | 12
 arch/x86/kernel/cpu/mcheck/mce_intel.c | 1
 arch/x86/kernel/cpu/mcheck/p5.c | 1
 arch/x86/kernel/cpu/mcheck/winchip.c | 1
 arch/x86/kernel/cpu/microcode/Makefile | 7
 arch/x86/kernel/cpu/microcode/amd.c (renamed from arch/x86/kernel/microcode_amd.c) | 15
 arch/x86/kernel/cpu/microcode/amd_early.c (renamed from arch/x86/kernel/microcode_amd_early.c) | 239
 arch/x86/kernel/cpu/microcode/core.c (renamed from arch/x86/kernel/microcode_core.c) | 0
 arch/x86/kernel/cpu/microcode/core_early.c (renamed from arch/x86/kernel/microcode_core_early.c) | 0
 arch/x86/kernel/cpu/microcode/intel.c (renamed from arch/x86/kernel/microcode_intel.c) | 2
 arch/x86/kernel/cpu/microcode/intel_early.c (renamed from arch/x86/kernel/microcode_intel_early.c) | 10
 arch/x86/kernel/cpu/microcode/intel_lib.c (renamed from arch/x86/kernel/microcode_intel_lib.c) | 0
 arch/x86/kernel/cpu/perf_event.c | 16
 arch/x86/kernel/cpu/perf_event.h | 15
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 53
 arch/x86/kernel/cpu/perf_event_intel_rapl.c | 679
 arch/x86/kernel/cpu/rdrand.c | 14
 arch/x86/kernel/cpu/transmeta.c | 1
 arch/x86/kernel/cpu/umc.c | 1
 arch/x86/kernel/crash.c | 1
 arch/x86/kernel/doublefault.c | 1
 arch/x86/kernel/e820.c | 2
 arch/x86/kernel/entry_32.S | 4
 arch/x86/kernel/entry_64.S | 2
 arch/x86/kernel/hw_breakpoint.c | 1
 arch/x86/kernel/iosf_mbi.c | 226
 arch/x86/kernel/irq.c | 89
 arch/x86/kernel/irqinit.c | 4
 arch/x86/kernel/kgdb.c | 1
 arch/x86/kernel/ksysfs.c | 340
 arch/x86/kernel/machine_kexec_32.c | 1
 arch/x86/kernel/pci-nommu.c | 1
 arch/x86/kernel/process_32.c | 1
 arch/x86/kernel/reboot.c | 11
 arch/x86/kernel/setup.c | 54
 arch/x86/kernel/smpboot.c | 8
 arch/x86/kernel/traps.c | 22
 arch/x86/kernel/tsc.c | 328
 arch/x86/kernel/tsc_msr.c | 127
 arch/x86/kernel/tsc_sync.c | 1
 arch/x86/kernel/xsave.c | 10
 59 files changed, 2189 insertions(+), 324 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9b0a34e2cd79..cb648c84b327 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,10 +29,11 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
 obj-y += syscall_$(BITS).o
 obj-$(CONFIG_X86_64) += vsyscall_64.o
 obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
+obj-$(CONFIG_SYSFS) += ksysfs.o
 obj-y += bootflag.o e820.o
 obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
 obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
-obj-y += tsc.o io_delay.o rtc.o
+obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
 obj-y += pci-iommu_table.o
 obj-y += resource.o
 
@@ -91,15 +92,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
 
-obj-$(CONFIG_MICROCODE_EARLY) += microcode_core_early.o
-obj-$(CONFIG_MICROCODE_INTEL_EARLY) += microcode_intel_early.o
-obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o
-microcode-y := microcode_core.o
-microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
-microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
-obj-$(CONFIG_MICROCODE_AMD_EARLY) += microcode_amd_early.o
-obj-$(CONFIG_MICROCODE) += microcode.o
-
 obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 
 obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
@@ -111,6 +103,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
 
 obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
 obj-$(CONFIG_TRACING) += tracepoint.o
+obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index d2b7f27781bc..e69182fd01cf 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
-	if (!need_resched()) {
-		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
-			clflush((void *)&current_thread_info()->flags);
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
-		if (!need_resched())
-			__mwait(ax, cx);
-	}
-}
-
 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
 	unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d278736bf774..7f26c9a70a9e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -75,6 +75,13 @@ unsigned int max_physical_apicid;
 physid_mask_t phys_cpu_present_map;
 
 /*
+ * Processor to be disabled specified by kernel parameter
+ * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
+ * avoid undefined behaviour caused by sending INIT from AP to BSP.
+ */
+static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
+
+/*
  * Map cpu index to physical APIC ID
  */
 DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
@@ -1968,7 +1975,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
  */
 static inline void __smp_error_interrupt(struct pt_regs *regs)
 {
-	u32 v0, v1;
+	u32 v;
 	u32 i = 0;
 	static const char * const error_interrupt_reason[] = {
 		"Send CS error", /* APIC Error Bit 0 */
@@ -1982,21 +1989,20 @@ static inline void __smp_error_interrupt(struct pt_regs *regs)
 	};
 
 	/* First tickle the hardware, only then report what went on. -- REW */
-	v0 = apic_read(APIC_ESR);
 	apic_write(APIC_ESR, 0);
-	v1 = apic_read(APIC_ESR);
+	v = apic_read(APIC_ESR);
 	ack_APIC_irq();
 	atomic_inc(&irq_err_count);
 
-	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
-		    smp_processor_id(), v0 , v1);
+	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
+		    smp_processor_id(), v);
 
-	v1 = v1 & 0xff;
-	while (v1) {
-		if (v1 & 0x1)
+	v &= 0xff;
+	while (v) {
+		if (v & 0x1)
 			apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
 		i++;
-		v1 >>= 1;
+		v >>= 1;
 	}
 
 	apic_printk(APIC_DEBUG, KERN_CONT "\n");
@@ -2115,6 +2121,39 @@ int generic_processor_info(int apicid, int version)
 		phys_cpu_present_map);
 
 	/*
+	 * boot_cpu_physical_apicid is designed to have the apicid
+	 * returned by read_apic_id(), i.e, the apicid of the
+	 * currently booting-up processor. However, on some platforms,
+	 * it is temporarily modified by the apicid reported as BSP
+	 * through MP table. Concretely:
+	 *
+	 * - arch/x86/kernel/mpparse.c: MP_processor_info()
+	 * - arch/x86/mm/amdtopology.c: amd_numa_init()
+	 * - arch/x86/platform/visws/visws_quirks.c: MP_processor_info()
+	 *
+	 * This function is executed with the modified
+	 * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel
+	 * parameter doesn't work to disable APs on kdump 2nd kernel.
+	 *
+	 * Since fixing handling of boot_cpu_physical_apicid requires
+	 * another discussion and tests on each platform, we leave it
+	 * for now and here we use read_apic_id() directly in this
+	 * function, generic_processor_info().
+	 */
+	if (disabled_cpu_apicid != BAD_APICID &&
+	    disabled_cpu_apicid != read_apic_id() &&
+	    disabled_cpu_apicid == apicid) {
+		int thiscpu = num_processors + disabled_cpus;
+
+		pr_warning("APIC: Disabling requested cpu."
+			   " Processor %d/0x%x ignored.\n",
+			   thiscpu, apicid);
+
+		disabled_cpus++;
+		return -ENODEV;
+	}
+
+	/*
 	 * If boot cpu has not been detected yet, then only allow upto
 	 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
 	 */
@@ -2592,3 +2631,12 @@ static int __init lapic_insert_resource(void)
  * that is using request_resource
  */
 late_initcall(lapic_insert_resource);
+
+static int __init apic_set_disabled_cpu_apicid(char *arg)
+{
+	if (!arg || !get_option(&arg, &disabled_cpu_apicid))
+		return -EINVAL;
+
+	return 0;
+}
+early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 00c77cf78e9e..5d5b9eb2b7a4 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -14,7 +14,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/hardirq.h>
 #include <linux/module.h>
 #include <asm/smp.h>
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e145f28b4099..191ce75c0e54 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -15,7 +15,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e63a5bd2a78f..a43f068ebec1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1142,9 +1142,10 @@ next:
 	if (test_bit(vector, used_vectors))
 		goto next;
 
-	for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
-		if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+	for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+		if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED)
 			goto next;
+	}
 	/* Found one! */
 	current_vector = vector;
 	current_offset = offset;
@@ -1183,7 +1184,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 
 	vector = cfg->vector;
 	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
-		per_cpu(vector_irq, cpu)[vector] = -1;
+		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 
 	cfg->vector = 0;
 	cpumask_clear(cfg->domain);
@@ -1191,11 +1192,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 	if (likely(!cfg->move_in_progress))
 		return;
 	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
-		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
-		     vector++) {
+		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
 				continue;
-			per_cpu(vector_irq, cpu)[vector] = -1;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 			break;
 		}
 	}
@@ -1228,12 +1228,12 @@ void __setup_vector_irq(int cpu)
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
-		if (irq < 0)
+		if (irq <= VECTOR_UNDEFINED)
 			continue;
 
 		cfg = irq_cfg(irq);
 		if (!cpumask_test_cpu(cpu, cfg->domain))
-			per_cpu(vector_irq, cpu)[vector] = -1;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 	}
 	raw_spin_unlock(&vector_lock);
 }
@@ -2202,13 +2202,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		unsigned int irq;
+		int irq;
 		unsigned int irr;
 		struct irq_desc *desc;
 		struct irq_cfg *cfg;
 		irq = __this_cpu_read(vector_irq[vector]);
 
-		if (irq == -1)
+		if (irq <= VECTOR_UNDEFINED)
 			continue;
 
 		desc = irq_to_desc(irq);
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 7434d8556d09..62071569bd50 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,6 +1,5 @@
 #include <linux/cpumask.h>
 #include <linux/interrupt.h>
-#include <linux/init.h>
 
 #include <linux/mm.h>
 #include <linux/delay.h>
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 77c95c0e1bf7..00146f9b0254 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -29,7 +29,6 @@
 #define pr_fmt(fmt) "summit: %s: " fmt, __func__
 
 #include <linux/mm.h>
-#include <linux/init.h>
 #include <asm/io.h>
 #include <asm/bios_ebda.h>
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 140e29db478d..cac85ee6913f 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -3,7 +3,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/dmar.h>
 #include <linux/cpu.h>
 
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 562a76d433c8..de231e328cae 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -3,7 +3,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index e2dbcb7dabdd..83a7995625a6 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void)
 
 	corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
 
-	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
 		start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
 				PAGE_SIZE, corruption_check_size);
 		end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 47b56a7e99cb..7fd54f09b011 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -36,12 +36,13 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
 endif
 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o
 endif
 
 
 obj-$(CONFIG_X86_MCE) += mcheck/
 obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_MICROCODE) += microcode/
 
 obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bca023bdd6b2..d3153e281d72 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,5 +1,4 @@
 #include <linux/export.h>
-#include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/elf.h>
 #include <linux/mm.h>
@@ -487,7 +486,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 		if (!check_tsc_unstable())
-			sched_clock_stable = 1;
+			set_sched_clock_stable();
 	}
 
 #ifdef CONFIG_X86_64
@@ -508,6 +507,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
 	}
 #endif
+
+	/* F16h erratum 793, CVE-2013-6885 */
+	if (c->x86 == 0x16 && c->x86_model <= 0xf) {
+		u64 val;
+
+		rdmsrl(MSR_AMD64_LS_CFG, val);
+		if (!(val & BIT(15)))
+			wrmsrl(MSR_AMD64_LS_CFG, val | BIT(15));
+	}
+
 }
 
 static const int amd_erratum_383[];
@@ -790,14 +799,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
 	}
 
 	/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
-	if (!((eax >> 16) & mask)) {
-		u32 a, b, c, d;
-
-		cpuid(0x80000005, &a, &b, &c, &d);
-		tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff;
-	} else {
+	if (!((eax >> 16) & mask))
+		tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+	else
 		tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
-	}
 
 	/* a 4M entry uses two 2M entries */
 	tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 8d5652dc99dd..8779edab684e 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,6 +1,5 @@
 #include <linux/bitops.h>
 #include <linux/kernel.h>
-#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/e820.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6abc172b8258..24b6fd10625a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -472,6 +472,7 @@ u16 __read_mostly tlb_lli_4m[NR_INFO];
 u16 __read_mostly tlb_lld_4k[NR_INFO];
 u16 __read_mostly tlb_lld_2m[NR_INFO];
 u16 __read_mostly tlb_lld_4m[NR_INFO];
+u16 __read_mostly tlb_lld_1g[NR_INFO];
 
 /*
  * tlb_flushall_shift shows the balance point in replacing cr3 write
@@ -486,13 +487,13 @@ void cpu_detect_tlb(struct cpuinfo_x86 *c)
 	if (this_cpu->c_detect_tlb)
 		this_cpu->c_detect_tlb(c);
 
-	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
-		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
+	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
+		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
 		"tlb_flushall_shift: %d\n",
 		tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
 		tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
 		tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
-		tlb_flushall_shift);
+		tlb_lld_1g[ENTRIES], tlb_flushall_shift);
 }
 
 void detect_ht(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index d0969c75ab54..aaf152e79637 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/delay.h>
 #include <linux/pci.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index dc1ec0dff939..3db61c644e44 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/kernel.h>
 
 #include <linux/string.h>
@@ -93,7 +92,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 		if (!check_tsc_unstable())
-			sched_clock_stable = 1;
+			set_sched_clock_stable();
 	}
 
 	/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
@@ -387,7 +386,8 @@ static void init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_PEBS);
 	}
 
-	if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
+	if (c->x86 == 6 && cpu_has_clflush &&
+	    (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
 		set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
 
 #ifdef CONFIG_X86_64
@@ -505,6 +505,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 #define TLB_DATA0_2M_4M	0x23
 
 #define STLB_4K		0x41
+#define STLB_4K_2M	0x42
 
 static const struct _tlb_table intel_tlb_table[] = {
 	{ 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
@@ -525,13 +526,20 @@ static const struct _tlb_table intel_tlb_table[] = {
 	{ 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
 	{ 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
 	{ 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
+	{ 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
+	{ 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
+	{ 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
 	{ 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
 	{ 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
 	{ 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
 	{ 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
 	{ 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
+	{ 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" },
+	{ 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" },
 	{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
 	{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
+	{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
+	{ 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
 	{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
 	{ 0x00, 0, 0 }
 };
@@ -557,6 +565,20 @@ static void intel_tlb_lookup(const unsigned char desc)
 		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
 			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
 		break;
+	case STLB_4K_2M:
+		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
 	case TLB_INST_ALL:
 		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
 			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
@@ -602,6 +624,10 @@ static void intel_tlb_lookup(const unsigned char desc)
 		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
 			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
 		break;
+	case TLB_DATA_1G:
+		if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+		break;
 	}
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index de8b60a53f69..a1aef9533154 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -33,22 +33,28 @@
 #include <linux/acpi.h>
 #include <linux/cper.h>
 #include <acpi/apei.h>
+#include <acpi/ghes.h>
 #include <asm/mce.h>
 
 #include "mce-internal.h"
 
-void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
 {
 	struct mce m;
 
-	/* Only corrected MC is reported */
-	if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
 		return;
 
 	mce_setup(&m);
 	m.bank = 1;
-	/* Fake a memory read corrected error with unknown channel */
+	/* Fake a memory read error with unknown channel */
 	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+
+	if (severity >= GHES_SEV_RECOVERABLE)
+		m.status |= MCI_STATUS_UC;
+	if (severity >= GHES_SEV_PANIC)
+		m.status |= MCI_STATUS_PCC;
+
 	m.addr = mem_err->physical_addr;
 	mce_log(&m);
 	mce_notify_irq();
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b3218cdee95f..4d5419b249da 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1638,15 +1638,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 
 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 {
-	unsigned long iv = mce_adjust_timer(check_interval * HZ);
-
-	__this_cpu_write(mce_next_interval, iv);
+	unsigned long iv = check_interval * HZ;
 
 	if (mca_cfg.ignore_ce || !iv)
 		return;
 
+	per_cpu(mce_next_interval, cpu) = iv;
+
 	t->expires = round_jiffies(jiffies + iv);
-	add_timer_on(t, smp_processor_id());
+	add_timer_on(t, cpu);
 }
 
 static void __mcheck_cpu_init_timer(void)
@@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu)
 	dev->release = &mce_device_release;
 
 	err = device_register(dev);
-	if (err)
+	if (err) {
+		put_device(dev);
 		return err;
+	}
 
 	for (i = 0; mce_device_attrs[i]; i++) {
 		err = device_create_file(dev, mce_device_attrs[i]);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 4cfe0458ca66..fb6156fee6f7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -6,7 +6,6 @@
  */
 
 #include <linux/gfp.h>
-#include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <linux/sched.h>
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 1c044b1ccc59..a3042989398c 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -5,7 +5,6 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index e9a701aecaa1..7dc5564d0cdf 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -5,7 +5,6 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/mce.h>
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 000000000000..285c85427c32
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,7 @@
+microcode-y := core.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
+microcode-$(CONFIG_MICROCODE_AMD) += amd.o
+obj-$(CONFIG_MICROCODE_EARLY) += core_early.o
+obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o
+obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index c3d4cc972eca..8fffd845e22b 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -182,10 +182,10 @@ int __apply_microcode_amd(struct microcode_amd *mc_amd)
 {
 	u32 rev, dummy;
 
-	wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
+	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
 
 	/* verify patch application was successful */
-	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
 	if (rev != mc_amd->hdr.patch_id)
 		return -1;
 
@@ -332,6 +332,9 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
 	patch->patch_id = mc_hdr->patch_id;
 	patch->equiv_cpu = proc_id;
 
+	pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+		 __func__, patch->patch_id, proc_id);
+
 	/* ... and add to cache. */
 	update_cache(patch);
 
@@ -390,9 +393,9 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
 	if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
 		struct ucode_patch *p = find_patch(smp_processor_id());
 		if (p) {
-			memset(amd_bsp_mpb, 0, MPB_MAX_SIZE);
-			memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data),
-			       MPB_MAX_SIZE));
+			memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
+			memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
+			       PATCH_MAX_SIZE));
 		}
 	}
 #endif
@@ -430,7 +433,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
 	if (c->x86 >= 0x15)
 		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
 
-	if (request_firmware(&fw, (const char *)fw_name, device)) {
+	if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
 		pr_debug("failed to load file %s\n", fw_name);
 		goto out;
 	}
diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 6073104ccaa3..8384c0fa206f 100644
--- a/arch/x86/kernel/microcode_amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 Advanced Micro Devices, Inc.
  *
  * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -15,10 +16,18 @@
 #include <asm/setup.h>
 #include <asm/microcode_amd.h>
 
-static bool ucode_loaded;
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd before jettisoning its contents.
+ */
+static u8 *container;
+static size_t container_size;
+
 static u32 ucode_new_rev;
-static unsigned long ucode_offset;
-static size_t ucode_size;
+u8 amd_ucode_patch[PATCH_MAX_SIZE];
+static u16 this_equiv_id;
+
+struct cpio_data ucode_cpio;
 
 /*
  * Microcode patch container file is prepended to the initrd in cpio format.
@@ -32,9 +41,6 @@ static struct cpio_data __init find_ucode_in_initrd(void)
 	char *path;
 	void *start;
 	size_t size;
-	unsigned long *uoffset;
-	size_t *usize;
-	struct cpio_data cd;
 
 #ifdef CONFIG_X86_32
 	struct boot_params *p;
@@ -47,30 +53,50 @@ static struct cpio_data __init find_ucode_in_initrd(void)
 	path = (char *)__pa_nodebug(ucode_path);
 	start = (void *)p->hdr.ramdisk_image;
 	size = p->hdr.ramdisk_size;
-	uoffset = (unsigned long *)__pa_nodebug(&ucode_offset);
-	usize = (size_t *)__pa_nodebug(&ucode_size);
 #else
 	path = ucode_path;
 	start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
 	size = boot_params.hdr.ramdisk_size;
-	uoffset = &ucode_offset;
-	usize = &ucode_size;
 #endif
 
-	cd = find_cpio_data(path, start, size, &offset);
-	if (!cd.data)
-		return cd;
+	return find_cpio_data(path, start, size, &offset);
+}
 
-	if (*(u32 *)cd.data != UCODE_MAGIC) {
-		cd.data = NULL;
-		cd.size = 0;
-		return cd;
-	}
+static size_t compute_container_size(u8 *data, u32 total_size)
+{
+	size_t size = 0;
+	u32 *header = (u32 *)data;
 
-	*uoffset = (u8 *)cd.data - (u8 *)start;
-	*usize = cd.size;
+	if (header[0] != UCODE_MAGIC ||
+	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+	    header[2] == 0) /* size */
+		return size;
 
-	return cd;
+	size = header[2] + CONTAINER_HDR_SZ;
+	total_size -= size;
+	data += size;
+
+	while (total_size) {
+		u16 patch_size;
+
+		header = (u32 *)data;
+
+		if (header[0] != UCODE_UCODE_TYPE)
+			break;
+
+		/*
+		 * Sanity-check patch size.
+		 */
+		patch_size = header[1];
+		if (patch_size > PATCH_MAX_SIZE)
+			break;
+
+		size += patch_size + SECTION_HDR_SIZE;
+		data += patch_size + SECTION_HDR_SIZE;
+		total_size -= patch_size + SECTION_HDR_SIZE;
+	}
+
+	return size;
 }
 
 /*
@@ -85,23 +111,22 @@ static struct cpio_data __init find_ucode_in_initrd(void)
 static void apply_ucode_in_initrd(void *ucode, size_t size)
 {
 	struct equiv_cpu_entry *eq;
+	size_t *cont_sz;
 	u32 *header;
-	u8 *data;
+	u8 *data, **cont;
 	u16 eq_id = 0;
 	int offset, left;
-	u32 rev, eax;
+	u32 rev, eax, ebx, ecx, edx;
 	u32 *new_rev;
-	unsigned long *uoffset;
-	size_t *usize;
 
 #ifdef CONFIG_X86_32
 	new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
-	uoffset = (unsigned long *)__pa_nodebug(&ucode_offset);
-	usize = (size_t *)__pa_nodebug(&ucode_size);
+	cont_sz = (size_t *)__pa_nodebug(&container_size);
+	cont = (u8 **)__pa_nodebug(&container);
 #else
 	new_rev = &ucode_new_rev;
-	uoffset = &ucode_offset;
-	usize = &ucode_size;
+	cont_sz = &container_size;
+	cont = &container;
 #endif
 
 	data = ucode;
@@ -109,23 +134,37 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
 	header = (u32 *)data;
 
 	/* find equiv cpu table */
-
-	if (header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+	if (header[0] != UCODE_MAGIC ||
+	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
 	    header[2] == 0) /* size */
 		return;
 
-	eax = cpuid_eax(0x00000001);
+	eax = 0x00000001;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
 
 	while (left > 0) {
 		eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
 
+		*cont = data;
+
+		/* Advance past the container header */
 		offset = header[2] + CONTAINER_HDR_SZ;
 		data += offset;
 		left -= offset;
 
 		eq_id = find_equiv_id(eq, eax);
-		if (eq_id)
+		if (eq_id) {
+			this_equiv_id = eq_id;
+			*cont_sz = compute_container_size(*cont, left + offset);
+
+			/*
+			 * truncate how much we need to iterate over in the
+			 * ucode update loop below
+			 */
+			left = *cont_sz - offset;
 			break;
+		}
 
 		/*
 		 * support multiple container files appended together. if this
@@ -145,19 +184,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
 
 		/* mark where the next microcode container file starts */
 		offset = data - (u8 *)ucode;
-		*uoffset += offset;
-		*usize -= offset;
 		ucode = data;
 	}
 
 	if (!eq_id) {
-		*usize = 0;
+		*cont = NULL;
+		*cont_sz = 0;
 		return;
 	}
 
 	/* find ucode and update if needed */
 
-	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
 
 	while (left > 0) {
 		struct microcode_amd *mc;
@@ -168,73 +206,83 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
 			break;
 
 		mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
-		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id)
-			if (__apply_microcode_amd(mc) == 0) {
+
+		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
+
+			if (!__apply_microcode_amd(mc)) {
 				rev = mc->hdr.patch_id;
 				*new_rev = rev;
+
+				/* save ucode patch */
+				memcpy(amd_ucode_patch, mc,
+				       min_t(u32, header[1], PATCH_MAX_SIZE));
 			}
+		}
 
 		offset = header[1] + SECTION_HDR_SIZE;
 		data += offset;
 		left -= offset;
 	}
-
-	/* mark where this microcode container file ends */
-	offset = *usize - (data - (u8 *)ucode);
-	*usize -= offset;
-
-	if (!(*new_rev))
-		*usize = 0;
 }
 
 void __init load_ucode_amd_bsp(void)
 {
-	struct cpio_data cd = find_ucode_in_initrd();
-	if (!cd.data)
+	struct cpio_data cp;
+	void **data;
+	size_t *size;
+
+#ifdef CONFIG_X86_32
+	data = (void **)__pa_nodebug(&ucode_cpio.data);
+	size = (size_t *)__pa_nodebug(&ucode_cpio.size);
+#else
+	data = &ucode_cpio.data;
+	size = &ucode_cpio.size;
+#endif
+
+	cp = find_ucode_in_initrd();
+	if (!cp.data)
 		return;
 
-	apply_ucode_in_initrd(cd.data, cd.size);
+	*data = cp.data;
+	*size = cp.size;
+
+	apply_ucode_in_initrd(cp.data, cp.size);
 }
 
 #ifdef CONFIG_X86_32
-u8 amd_bsp_mpb[MPB_MAX_SIZE];
-
 /*
  * On 32-bit, since AP's early load occurs before paging is turned on, we
  * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
  * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_bsp_mpb, which
- * is used upon resume from suspend.
+ * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
+ * which is used upon resume from suspend.
  */
 void load_ucode_amd_ap(void)
 {
 	struct microcode_amd *mc;
-	unsigned long *initrd;
-	unsigned long *uoffset;
 	size_t *usize;
-	void *ucode;
+	void **ucode;
 
-	mc = (struct microcode_amd *)__pa(amd_bsp_mpb);
+	mc = (struct microcode_amd *)__pa(amd_ucode_patch);
 	if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
 		__apply_microcode_amd(mc);
 		return;
 	}
 
-	initrd = (unsigned long *)__pa(&initrd_start);
-	uoffset = (unsigned long *)__pa(&ucode_offset);
-	usize = (size_t *)__pa(&ucode_size);
+	ucode = (void *)__pa_nodebug(&container);
+	usize = (size_t *)__pa_nodebug(&container_size);
 
-	if (!*usize || !*initrd)
+	if (!*ucode || !*usize)
 		return;
 
-	ucode = (void *)((unsigned long)__pa(*initrd) + *uoffset);
-	apply_ucode_in_initrd(ucode, *usize);
+	apply_ucode_in_initrd(*ucode, *usize);
 }
 
 static void __init collect_cpu_sig_on_bsp(void *arg)
 {
 	unsigned int cpu = smp_processor_id();
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
 	uci->cpu_sig.sig = cpuid_eax(0x00000001);
 }
 #else
@@ -242,36 +290,54 @@ void load_ucode_amd_ap(void)
 {
 	unsigned int cpu = smp_processor_id();
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct equiv_cpu_entry *eq;
+	struct microcode_amd *mc;
 	u32 rev, eax;
+	u16 eq_id;
+
+	/* Exit if called on the BSP. */
+	if (!cpu)
+		return;
+
+	if (!container)
+		return;
 
 	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
-	eax = cpuid_eax(0x00000001);
 
 	uci->cpu_sig.rev = rev;
 	uci->cpu_sig.sig = eax;
 
-	if (cpu && !ucode_loaded) {
-		void *ucode;
+	eax = cpuid_eax(0x00000001);
+	eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
 
-		if (!ucode_size || !initrd_start)
-			return;
+	eq_id = find_equiv_id(eq, eax);
+	if (!eq_id)
+		return;
+
+	if (eq_id == this_equiv_id) {
+		mc = (struct microcode_amd *)amd_ucode_patch;
 
-		ucode = (void *)(initrd_start + ucode_offset);
-		eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
-		if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK)
+		if (mc && rev < mc->hdr.patch_id) {
+			if (!__apply_microcode_amd(mc))
+				ucode_new_rev = mc->hdr.patch_id;
+		}
+
+	} else {
+		if (!ucode_cpio.data)
 			return;
 
-		ucode_loaded = true;
+		/*
+		 * AP has a different equivalence ID than BSP, looks like
+		 * mixed-steppings silicon so go through the ucode blob anew.
+		 */
+		apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
 	}
-
-	apply_microcode_amd(cpu);
 }
 #endif
 
 int __init save_microcode_in_initrd_amd(void)
 {
 	enum ucode_state ret;
-	void *ucode;
 	u32 eax;
 
 #ifdef CONFIG_X86_32
@@ -280,22 +346,35 @@ int __init save_microcode_in_initrd_amd(void)
 
 	if (!uci->cpu_sig.sig)
 		smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+
+	/*
+	 * Take into account the fact that the ramdisk might get relocated
+	 * and therefore we need to recompute the container's position in
+	 * virtual memory space.
+	 */
+	container = (u8 *)(__va((u32)relocated_ramdisk) +
+			   ((u32)container - boot_params.hdr.ramdisk_image));
 #endif
 	if (ucode_new_rev)
 		pr_info("microcode: updated early to new patch_level=0x%08x\n",
 			ucode_new_rev);
 
-	if (ucode_loaded || !ucode_size || !initrd_start)
-		return 0;
+	if (!container)
+		return -EINVAL;
 
-	ucode = (void *)(initrd_start + ucode_offset);
 	eax = cpuid_eax(0x00000001);
 	eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
 
-	ret = load_microcode_amd(eax, ucode, ucode_size);
+	ret = load_microcode_amd(eax, container, container_size);
 	if (ret != UCODE_OK)
 		return -EINVAL;
 
-	ucode_loaded = true;
+	/*
+	 * This will be freed any msec now, stash patches for the current
+	 * family and switch to patch cache for cpu hotplug, etc later.
+	 */
+	container = NULL;
+	container_size = 0;
+
 	return 0;
 }
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/cpu/microcode/core.c
index 15c987698b0f..15c987698b0f 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index be7f8514f577..be7f8514f577 100644
--- a/arch/x86/kernel/microcode_core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 5fb2cebf556b..a276fa75d9b5 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -278,7 +278,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
 	sprintf(name, "intel-ucode/%02x-%02x-%02x",
 		c->x86, c->x86_model, c->x86_mask);
 
-	if (request_firmware(&firmware, name, device)) {
+	if (request_firmware_direct(&firmware, name, device)) {
 		pr_debug("data file %s load failed\n", name);
 		return UCODE_NFOUND;
 	}
diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 1575deb2e636..18f739129e72 100644
--- a/arch/x86/kernel/microcode_intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -365,16 +365,6 @@ out:
 	return state;
 }
 
-#define native_rdmsr(msr, val1, val2)		\
-do {						\
-	u64 __val = native_read_msr((msr));	\
-	(void)((val1) = (u32)__val);		\
-	(void)((val2) = (u32)(__val >> 32));	\
-} while (0)
-
-#define native_wrmsr(msr, low, high)		\
-	native_write_msr(msr, low, high);
-
 static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 {
 	unsigned int val[2];
diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320d0179..ce69320d0179 100644
--- a/arch/x86/kernel/microcode_intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8e132931614d..b88645191fe5 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1883,21 +1883,27 @@ static struct pmu pmu = {
1883 1883
1884void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 1884void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
1885{ 1885{
1886 struct cyc2ns_data *data;
1887
1886 userpg->cap_user_time = 0; 1888 userpg->cap_user_time = 0;
1887 userpg->cap_user_time_zero = 0; 1889 userpg->cap_user_time_zero = 0;
1888 userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; 1890 userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
1889 userpg->pmc_width = x86_pmu.cntval_bits; 1891 userpg->pmc_width = x86_pmu.cntval_bits;
1890 1892
1891 if (!sched_clock_stable) 1893 if (!sched_clock_stable())
1892 return; 1894 return;
1893 1895
1896 data = cyc2ns_read_begin();
1897
1894 userpg->cap_user_time = 1; 1898 userpg->cap_user_time = 1;
1895 userpg->time_mult = this_cpu_read(cyc2ns); 1899 userpg->time_mult = data->cyc2ns_mul;
1896 userpg->time_shift = CYC2NS_SCALE_FACTOR; 1900 userpg->time_shift = data->cyc2ns_shift;
1897 userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; 1901 userpg->time_offset = data->cyc2ns_offset - now;
1898 1902
1899 userpg->cap_user_time_zero = 1; 1903 userpg->cap_user_time_zero = 1;
1900 userpg->time_zero = this_cpu_read(cyc2ns_offset); 1904 userpg->time_zero = data->cyc2ns_offset;
1905
1906 cyc2ns_read_end(data);
1901} 1907}
1902 1908
1903/* 1909/*
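The mult/shift/offset triple filled in above follows the perf mmap ABI: userspace recovers the time with the seqlock-style loop documented in include/uapi/linux/perf_event.h. A minimal userspace sketch (not part of this patch; pc is the mmap'ed perf_event_mmap_page and __rdtsc() stands in for reading the counter):

	#include <linux/perf_event.h>	/* struct perf_event_mmap_page */
	#include <x86intrin.h>		/* __rdtsc() */

	#define barrier() __asm__ __volatile__("" ::: "memory")

	static unsigned long long
	read_user_time(volatile struct perf_event_mmap_page *pc)
	{
		unsigned long long cyc, quot, rem, t;
		unsigned int seq;

		do {
			seq = pc->lock;	/* retry if the kernel updates underneath us */
			barrier();
			cyc  = __rdtsc();
			quot = cyc >> pc->time_shift;
			rem  = cyc & (((unsigned long long)1 << pc->time_shift) - 1);
			t    = pc->time_zero + quot * pc->time_mult +
			       ((rem * pc->time_mult) >> pc->time_shift);
			barrier();
		} while (pc->lock != seq);

		return t;	/* nanoseconds, valid while cap_user_time_zero is set */
	}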
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index fd00bb29425d..c1a861829d81 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -262,11 +262,20 @@ struct cpu_hw_events {
262 __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ 262 __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
263 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) 263 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
264 264
265#define EVENT_CONSTRAINT_END \ 265/*
266 EVENT_CONSTRAINT(0, 0, 0) 266 * We define the end marker as having a weight of -1
267 * to enable blacklisting of events using a counter bitmask
268 * of zero and thus a weight of zero.
269 * The end marker has a weight that cannot possibly be
270 * obtained from counting the bits in the bitmask.
271 */
272#define EVENT_CONSTRAINT_END { .weight = -1 }
267 273
274/*
275 * Check for end marker with weight == -1
276 */
268#define for_each_event_constraint(e, c) \ 277#define for_each_event_constraint(e, c) \
269 for ((e) = (c); (e)->weight; (e)++) 278 for ((e) = (c); (e)->weight != -1; (e)++)
270 279
271/* 280/*
272 * Extra registers for specific events. 281 * Extra registers for specific events.
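With the -1 end marker above, an event can now be blacklisted by giving it a constraint with an empty counter bitmask: its weight is 0, yet for_each_event_constraint() keeps iterating. An illustrative entry (hypothetical event code, not from this patch):

	/* event 0xd0 is allowed no counters at all: cmask weight 0 */
	EVENT_CONSTRAINT(0xd0, 0, INTEL_ARCH_EVENT_MASK),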
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index e09f0bfb7b8f..4b8e4d3cd6ea 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/syscore_ops.h>
13 14
14#include <asm/apic.h> 15#include <asm/apic.h>
15 16
@@ -816,6 +817,18 @@ out:
816 return ret; 817 return ret;
817} 818}
818 819
820static void ibs_eilvt_setup(void)
821{
822 /*
823 * Force LVT offset assignment for family 10h: The offsets are
824 * not assigned by the BIOS for this family, so the OS is
825 * responsible for doing it. If the OS assignment fails, fall
826 * back to BIOS settings and try to setup this.
827 */
828 if (boot_cpu_data.x86 == 0x10)
829 force_ibs_eilvt_setup();
830}
831
819static inline int get_ibs_lvt_offset(void) 832static inline int get_ibs_lvt_offset(void)
820{ 833{
821 u64 val; 834 u64 val;
@@ -851,6 +864,36 @@ static void clear_APIC_ibs(void *dummy)
851 setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); 864 setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
852} 865}
853 866
867#ifdef CONFIG_PM
868
869static int perf_ibs_suspend(void)
870{
871 clear_APIC_ibs(NULL);
872 return 0;
873}
874
875static void perf_ibs_resume(void)
876{
877 ibs_eilvt_setup();
878 setup_APIC_ibs(NULL);
879}
880
881static struct syscore_ops perf_ibs_syscore_ops = {
882 .resume = perf_ibs_resume,
883 .suspend = perf_ibs_suspend,
884};
885
886static void perf_ibs_pm_init(void)
887{
888 register_syscore_ops(&perf_ibs_syscore_ops);
889}
890
891#else
892
893static inline void perf_ibs_pm_init(void) { }
894
895#endif
896
854static int 897static int
855perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) 898perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
856{ 899{
@@ -877,18 +920,12 @@ static __init int amd_ibs_init(void)
877 if (!caps) 920 if (!caps)
878 return -ENODEV; /* ibs not supported by the cpu */ 921 return -ENODEV; /* ibs not supported by the cpu */
879 922
880 /* 923 ibs_eilvt_setup();
881 * Force LVT offset assignment for family 10h: The offsets are
882 * not assigned by the BIOS for this family, so the OS is
883 * responsible for doing it. If the OS assignment fails, fall
884 * back to BIOS settings and try to setup this.
885 */
886 if (boot_cpu_data.x86 == 0x10)
887 force_ibs_eilvt_setup();
888 924
889 if (!ibs_eilvt_valid()) 925 if (!ibs_eilvt_valid())
890 goto out; 926 goto out;
891 927
928 perf_ibs_pm_init();
892 get_online_cpus(); 929 get_online_cpus();
893 ibs_caps = caps; 930 ibs_caps = caps;
894 /* make ibs_caps visible to other cpus: */ 931 /* make ibs_caps visible to other cpus: */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
new file mode 100644
index 000000000000..5ad35ad94d0f
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -0,0 +1,679 @@
1/*
2 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
3 * Copyright (C) 2013 Google, Inc., Stephane Eranian
4 *
5 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
6 * section 14.7.1 (September 2013)
7 *
 8 * RAPL provides more controls than just reporting energy consumption;
 9 * however, here we only expose the 3 free-running energy consumption
 10 * counters (pp0, pkg, dram).
11 *
 12 * Each of those counters increments in an energy unit defined by the
 13 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules,
14 * but it can vary.
15 *
 16 * Counter-to-rapl-event mappings:
17 *
18 * pp0 counter: consumption of all physical cores (power plane 0)
19 * event: rapl_energy_cores
20 * perf code: 0x1
21 *
22 * pkg counter: consumption of the whole processor package
23 * event: rapl_energy_pkg
24 * perf code: 0x2
25 *
26 * dram counter: consumption of the dram domain (servers only)
27 * event: rapl_energy_dram
28 * perf code: 0x3
29 *
 30 * gpu counter: consumption of the builtin-gpu domain (clients only)
31 * event: rapl_energy_gpu
32 * perf code: 0x4
33 *
34 * We manage those counters as free running (read-only). They may be
 35 * used simultaneously by other tools, such as turbostat.
36 *
37 * The events only support system-wide mode counting. There is no
38 * sampling support because it does not make sense and is not
39 * supported by the RAPL hardware.
40 *
41 * Because we want to avoid floating-point operations in the kernel,
42 * the events are all reported in fixed point arithmetic (32.32).
43 * Tools must adjust the counts to convert them to Watts using
44 * the duration of the measurement. Tools may use a function such as
45 * ldexp(raw_count, -32);
46 */
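To make the 32.32 fixed-point convention above concrete, a userspace tool might convert the raw counts like this (a minimal sketch, not part of this file; compile with -lm):

	#include <math.h>
	#include <stdio.h>

	/* raw is the 32.32 fixed-point energy count read via perf */
	static void report_energy(unsigned long long raw, double seconds)
	{
		double joules = ldexp((double)raw, -32);	/* raw * 2^-32 */

		printf("%.6f J over %.3f s = %.3f W\n",
		       joules, seconds, joules / seconds);
	}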
47#include <linux/module.h>
48#include <linux/slab.h>
49#include <linux/perf_event.h>
50#include <asm/cpu_device_id.h>
51#include "perf_event.h"
52
53/*
54 * RAPL energy status counters
55 */
56#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */
57#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */
58#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */
59#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */
60#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */
61#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
 62#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
63#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
64
 65/* Clients have PP0, PKG, PP1 */
66#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
67 1<<RAPL_IDX_PKG_NRG_STAT|\
68 1<<RAPL_IDX_PP1_NRG_STAT)
69
70/* Servers have PP0, PKG, RAM */
71#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\
72 1<<RAPL_IDX_PKG_NRG_STAT|\
73 1<<RAPL_IDX_RAM_NRG_STAT)
74
75/*
76 * event code: LSB 8 bits, passed in attr->config
77 * any other bit is reserved
78 */
79#define RAPL_EVENT_MASK 0xFFULL
80
81#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
82static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
83 struct kobj_attribute *attr, \
84 char *page) \
85{ \
86 BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
87 return sprintf(page, _format "\n"); \
88} \
89static struct kobj_attribute format_attr_##_var = \
90 __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
91
92#define RAPL_EVENT_DESC(_name, _config) \
93{ \
94 .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
95 .config = _config, \
96}
97
98#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
99
100struct rapl_pmu {
101 spinlock_t lock;
102 int hw_unit; /* 1/2^hw_unit Joule */
103 int n_active; /* number of active events */
104 struct list_head active_list;
105 struct pmu *pmu; /* pointer to rapl_pmu_class */
106 ktime_t timer_interval; /* in ktime_t unit */
107 struct hrtimer hrtimer;
108};
109
110static struct pmu rapl_pmu_class;
111static cpumask_t rapl_cpu_mask;
112static int rapl_cntr_mask;
113
114static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
115static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
116
117static inline u64 rapl_read_counter(struct perf_event *event)
118{
119 u64 raw;
120 rdmsrl(event->hw.event_base, raw);
121 return raw;
122}
123
124static inline u64 rapl_scale(u64 v)
125{
126 /*
127 * scale delta to smallest unit (1/2^32)
 128 * users must then scale back: count * 2^-32 to get Joules,
129 * or use ldexp(count, -32).
130 * Watts = Joules/Time delta
131 */
132 return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit);
133}
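Worked example: on SandyBridge hw_unit is 16, so rapl_scale() shifts left by 16 and one hardware increment of 2^-16 J becomes 2^16 units of 2^-32 J: the same energy, renormalized to the fixed API unit.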
134
135static u64 rapl_event_update(struct perf_event *event)
136{
137 struct hw_perf_event *hwc = &event->hw;
138 u64 prev_raw_count, new_raw_count;
139 s64 delta, sdelta;
140 int shift = RAPL_CNTR_WIDTH;
141
142again:
143 prev_raw_count = local64_read(&hwc->prev_count);
144 rdmsrl(event->hw.event_base, new_raw_count);
145
146 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
147 new_raw_count) != prev_raw_count) {
148 cpu_relax();
149 goto again;
150 }
151
152 /*
153 * Now we have the new raw value and have updated the prev
154 * timestamp already. We can now calculate the elapsed delta
155 * (event-)time and add that to the generic event.
156 *
157 * Careful, not all hw sign-extends above the physical width
158 * of the count.
159 */
160 delta = (new_raw_count << shift) - (prev_raw_count << shift);
161 delta >>= shift;
162
163 sdelta = rapl_scale(delta);
164
165 local64_add(sdelta, &event->count);
166
167 return new_raw_count;
168}
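Worked example of the shift trick above: with shift = 32, a counter wrap from prev = 0xffffffff to new = 0 gives (0 << 32) - (0xffffffff << 32), which as a signed 64-bit value is +2^32; the arithmetic right shift by 32 then yields a delta of 1, as expected.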
169
170static void rapl_start_hrtimer(struct rapl_pmu *pmu)
171{
172 __hrtimer_start_range_ns(&pmu->hrtimer,
173 pmu->timer_interval, 0,
174 HRTIMER_MODE_REL_PINNED, 0);
175}
176
177static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
178{
179 hrtimer_cancel(&pmu->hrtimer);
180}
181
182static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
183{
184 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
185 struct perf_event *event;
186 unsigned long flags;
187
188 if (!pmu->n_active)
189 return HRTIMER_NORESTART;
190
191 spin_lock_irqsave(&pmu->lock, flags);
192
193 list_for_each_entry(event, &pmu->active_list, active_entry) {
194 rapl_event_update(event);
195 }
196
197 spin_unlock_irqrestore(&pmu->lock, flags);
198
199 hrtimer_forward_now(hrtimer, pmu->timer_interval);
200
201 return HRTIMER_RESTART;
202}
203
204static void rapl_hrtimer_init(struct rapl_pmu *pmu)
205{
206 struct hrtimer *hr = &pmu->hrtimer;
207
208 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
209 hr->function = rapl_hrtimer_handle;
210}
211
212static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
213 struct perf_event *event)
214{
215 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
216 return;
217
218 event->hw.state = 0;
219
220 list_add_tail(&event->active_entry, &pmu->active_list);
221
222 local64_set(&event->hw.prev_count, rapl_read_counter(event));
223
224 pmu->n_active++;
225 if (pmu->n_active == 1)
226 rapl_start_hrtimer(pmu);
227}
228
229static void rapl_pmu_event_start(struct perf_event *event, int mode)
230{
231 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
232 unsigned long flags;
233
234 spin_lock_irqsave(&pmu->lock, flags);
235 __rapl_pmu_event_start(pmu, event);
236 spin_unlock_irqrestore(&pmu->lock, flags);
237}
238
239static void rapl_pmu_event_stop(struct perf_event *event, int mode)
240{
241 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
242 struct hw_perf_event *hwc = &event->hw;
243 unsigned long flags;
244
245 spin_lock_irqsave(&pmu->lock, flags);
246
247 /* mark event as deactivated and stopped */
248 if (!(hwc->state & PERF_HES_STOPPED)) {
249 WARN_ON_ONCE(pmu->n_active <= 0);
250 pmu->n_active--;
251 if (pmu->n_active == 0)
252 rapl_stop_hrtimer(pmu);
253
254 list_del(&event->active_entry);
255
256 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
257 hwc->state |= PERF_HES_STOPPED;
258 }
259
260 /* check if update of sw counter is necessary */
261 if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
262 /*
263 * Drain the remaining delta count out of a event
264 * that we are disabling:
265 */
266 rapl_event_update(event);
267 hwc->state |= PERF_HES_UPTODATE;
268 }
269
270 spin_unlock_irqrestore(&pmu->lock, flags);
271}
272
273static int rapl_pmu_event_add(struct perf_event *event, int mode)
274{
275 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
276 struct hw_perf_event *hwc = &event->hw;
277 unsigned long flags;
278
279 spin_lock_irqsave(&pmu->lock, flags);
280
281 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
282
283 if (mode & PERF_EF_START)
284 __rapl_pmu_event_start(pmu, event);
285
286 spin_unlock_irqrestore(&pmu->lock, flags);
287
288 return 0;
289}
290
291static void rapl_pmu_event_del(struct perf_event *event, int flags)
292{
293 rapl_pmu_event_stop(event, PERF_EF_UPDATE);
294}
295
296static int rapl_pmu_event_init(struct perf_event *event)
297{
298 u64 cfg = event->attr.config & RAPL_EVENT_MASK;
299 int bit, msr, ret = 0;
300
301 /* only look at RAPL events */
302 if (event->attr.type != rapl_pmu_class.type)
303 return -ENOENT;
304
305 /* check only supported bits are set */
306 if (event->attr.config & ~RAPL_EVENT_MASK)
307 return -EINVAL;
308
309 /*
310 * check event is known (determines counter)
311 */
312 switch (cfg) {
313 case INTEL_RAPL_PP0:
314 bit = RAPL_IDX_PP0_NRG_STAT;
315 msr = MSR_PP0_ENERGY_STATUS;
316 break;
317 case INTEL_RAPL_PKG:
318 bit = RAPL_IDX_PKG_NRG_STAT;
319 msr = MSR_PKG_ENERGY_STATUS;
320 break;
321 case INTEL_RAPL_RAM:
322 bit = RAPL_IDX_RAM_NRG_STAT;
323 msr = MSR_DRAM_ENERGY_STATUS;
324 break;
325 case INTEL_RAPL_PP1:
326 bit = RAPL_IDX_PP1_NRG_STAT;
327 msr = MSR_PP1_ENERGY_STATUS;
328 break;
329 default:
330 return -EINVAL;
331 }
332 /* check event supported */
333 if (!(rapl_cntr_mask & (1 << bit)))
334 return -EINVAL;
335
336 /* unsupported modes and filters */
337 if (event->attr.exclude_user ||
338 event->attr.exclude_kernel ||
339 event->attr.exclude_hv ||
340 event->attr.exclude_idle ||
341 event->attr.exclude_host ||
342 event->attr.exclude_guest ||
343 event->attr.sample_period) /* no sampling */
344 return -EINVAL;
345
346 /* must be done before validate_group */
347 event->hw.event_base = msr;
348 event->hw.config = cfg;
349 event->hw.idx = bit;
350
351 return ret;
352}
353
354static void rapl_pmu_event_read(struct perf_event *event)
355{
356 rapl_event_update(event);
357}
358
359static ssize_t rapl_get_attr_cpumask(struct device *dev,
360 struct device_attribute *attr, char *buf)
361{
362 int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
363
364 buf[n++] = '\n';
365 buf[n] = '\0';
366 return n;
367}
368
369static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
370
371static struct attribute *rapl_pmu_attrs[] = {
372 &dev_attr_cpumask.attr,
373 NULL,
374};
375
376static struct attribute_group rapl_pmu_attr_group = {
377 .attrs = rapl_pmu_attrs,
378};
379
380EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
381EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
382EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
383EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
384
385EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
386EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
387EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
388EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
389
390/*
 391 * we report in 2^-32 Joule (~0.23 nJ) increments regardless of the MSR unit
392 */
393EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
394EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
395EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
396EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
397
398static struct attribute *rapl_events_srv_attr[] = {
399 EVENT_PTR(rapl_cores),
400 EVENT_PTR(rapl_pkg),
401 EVENT_PTR(rapl_ram),
402
403 EVENT_PTR(rapl_cores_unit),
404 EVENT_PTR(rapl_pkg_unit),
405 EVENT_PTR(rapl_ram_unit),
406
407 EVENT_PTR(rapl_cores_scale),
408 EVENT_PTR(rapl_pkg_scale),
409 EVENT_PTR(rapl_ram_scale),
410 NULL,
411};
412
413static struct attribute *rapl_events_cln_attr[] = {
414 EVENT_PTR(rapl_cores),
415 EVENT_PTR(rapl_pkg),
416 EVENT_PTR(rapl_gpu),
417
418 EVENT_PTR(rapl_cores_unit),
419 EVENT_PTR(rapl_pkg_unit),
420 EVENT_PTR(rapl_gpu_unit),
421
422 EVENT_PTR(rapl_cores_scale),
423 EVENT_PTR(rapl_pkg_scale),
424 EVENT_PTR(rapl_gpu_scale),
425 NULL,
426};
427
428static struct attribute_group rapl_pmu_events_group = {
429 .name = "events",
430 .attrs = NULL, /* patched at runtime */
431};
432
433DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
434static struct attribute *rapl_formats_attr[] = {
435 &format_attr_event.attr,
436 NULL,
437};
438
439static struct attribute_group rapl_pmu_format_group = {
440 .name = "format",
441 .attrs = rapl_formats_attr,
442};
443
444const struct attribute_group *rapl_attr_groups[] = {
445 &rapl_pmu_attr_group,
446 &rapl_pmu_format_group,
447 &rapl_pmu_events_group,
448 NULL,
449};
450
451static struct pmu rapl_pmu_class = {
452 .attr_groups = rapl_attr_groups,
453 .task_ctx_nr = perf_invalid_context, /* system-wide only */
454 .event_init = rapl_pmu_event_init,
455 .add = rapl_pmu_event_add, /* must have */
456 .del = rapl_pmu_event_del, /* must have */
457 .start = rapl_pmu_event_start,
458 .stop = rapl_pmu_event_stop,
459 .read = rapl_pmu_event_read,
460};
461
462static void rapl_cpu_exit(int cpu)
463{
464 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
465 int i, phys_id = topology_physical_package_id(cpu);
466 int target = -1;
467
468 /* find a new cpu on same package */
469 for_each_online_cpu(i) {
470 if (i == cpu)
471 continue;
472 if (phys_id == topology_physical_package_id(i)) {
473 target = i;
474 break;
475 }
476 }
477 /*
 478 * clear the cpu from the cpumask; if it was set and another
 479 * cpu on the same package is still online, then move the
 480 * monitoring role over to that cpu
481 */
482 if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
483 cpumask_set_cpu(target, &rapl_cpu_mask);
484
485 WARN_ON(cpumask_empty(&rapl_cpu_mask));
486 /*
487 * migrate events and context to new cpu
488 */
489 if (target >= 0)
490 perf_pmu_migrate_context(pmu->pmu, cpu, target);
491
492 /* cancel overflow polling timer for CPU */
493 rapl_stop_hrtimer(pmu);
494}
495
496static void rapl_cpu_init(int cpu)
497{
498 int i, phys_id = topology_physical_package_id(cpu);
499
 500 /* check if phys_id is already covered */
501 for_each_cpu(i, &rapl_cpu_mask) {
502 if (phys_id == topology_physical_package_id(i))
503 return;
504 }
505 /* was not found, so add it */
506 cpumask_set_cpu(cpu, &rapl_cpu_mask);
507}
508
509static int rapl_cpu_prepare(int cpu)
510{
511 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
512 int phys_id = topology_physical_package_id(cpu);
513 u64 ms;
514
515 if (pmu)
516 return 0;
517
518 if (phys_id < 0)
519 return -1;
520
521 pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
522 if (!pmu)
523 return -1;
524
525 spin_lock_init(&pmu->lock);
526
527 INIT_LIST_HEAD(&pmu->active_list);
528
529 /*
 530 * grab the power unit: counters increment in 1/2^unit Joules;
 531 *
 532 * we cache it in the local PMU instance
533 */
534 rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit);
535 pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL;
536 pmu->pmu = &rapl_pmu_class;
537
538 /*
 539 * Use a reference power of 200 W to scale the timeout so
 540 * that counter overflows are not missed.
 541 * 200 W = 200 Joules/sec.
 542 * Divide the interval by 2 to avoid lockstep (2 * 100).
 543 * If the hw unit is 32, we use 2 ms: 1/(200*2).
544 */
545 if (pmu->hw_unit < 32)
546 ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
547 else
548 ms = 2;
549
550 pmu->timer_interval = ms_to_ktime(ms);
551
552 rapl_hrtimer_init(pmu);
553
554 /* set RAPL pmu for this cpu for now */
555 per_cpu(rapl_pmu, cpu) = pmu;
556 per_cpu(rapl_pmu_to_free, cpu) = NULL;
557
558 return 0;
559}
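Worked example of the interval above, assuming the common SandyBridge unit hw_unit = 16: the 32-bit counter wraps after 2^32 * 2^-16 J = 65536 J, which takes ~328 s at the 200 W reference; halving to avoid lockstep gives ms = 5 * 2^(32 - 16 - 1) = 163840 ms, roughly 164 s.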
560
561static void rapl_cpu_kfree(int cpu)
562{
563 struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
564
565 kfree(pmu);
566
567 per_cpu(rapl_pmu_to_free, cpu) = NULL;
568}
569
570static int rapl_cpu_dying(int cpu)
571{
572 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
573
574 if (!pmu)
575 return 0;
576
577 per_cpu(rapl_pmu, cpu) = NULL;
578
579 per_cpu(rapl_pmu_to_free, cpu) = pmu;
580
581 return 0;
582}
583
584static int rapl_cpu_notifier(struct notifier_block *self,
585 unsigned long action, void *hcpu)
586{
587 unsigned int cpu = (long)hcpu;
588
589 switch (action & ~CPU_TASKS_FROZEN) {
590 case CPU_UP_PREPARE:
591 rapl_cpu_prepare(cpu);
592 break;
593 case CPU_STARTING:
594 rapl_cpu_init(cpu);
595 break;
596 case CPU_UP_CANCELED:
597 case CPU_DYING:
598 rapl_cpu_dying(cpu);
599 break;
600 case CPU_ONLINE:
601 case CPU_DEAD:
602 rapl_cpu_kfree(cpu);
603 break;
604 case CPU_DOWN_PREPARE:
605 rapl_cpu_exit(cpu);
606 break;
607 default:
608 break;
609 }
610
611 return NOTIFY_OK;
612}
613
614static const struct x86_cpu_id rapl_cpu_match[] = {
615 [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
616 [1] = {},
617};
618
619static int __init rapl_pmu_init(void)
620{
621 struct rapl_pmu *pmu;
622 int cpu, ret;
623
624 /*
625 * check for Intel processor family 6
626 */
627 if (!x86_match_cpu(rapl_cpu_match))
628 return 0;
629
630 /* check supported CPU */
631 switch (boot_cpu_data.x86_model) {
632 case 42: /* Sandy Bridge */
633 case 58: /* Ivy Bridge */
634 case 60: /* Haswell */
635 case 69: /* Haswell-Celeron */
636 rapl_cntr_mask = RAPL_IDX_CLN;
637 rapl_pmu_events_group.attrs = rapl_events_cln_attr;
638 break;
639 case 45: /* Sandy Bridge-EP */
640 case 62: /* IvyTown */
641 rapl_cntr_mask = RAPL_IDX_SRV;
642 rapl_pmu_events_group.attrs = rapl_events_srv_attr;
643 break;
644
645 default:
646 /* unsupported */
647 return 0;
648 }
649 get_online_cpus();
650
651 for_each_online_cpu(cpu) {
652 rapl_cpu_prepare(cpu);
653 rapl_cpu_init(cpu);
654 }
655
656 perf_cpu_notifier(rapl_cpu_notifier);
657
658 ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
659 if (WARN_ON(ret)) {
660 pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
661 put_online_cpus();
662 return -1;
663 }
664
665 pmu = __get_cpu_var(rapl_pmu);
666
667 pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
668 " API unit is 2^-32 Joules,"
669 " %d fixed counters"
670 " %llu ms ovfl timer\n",
671 pmu->hw_unit,
672 hweight32(rapl_cntr_mask),
673 ktime_to_ms(pmu->timer_interval));
674
675 put_online_cpus();
676
677 return 0;
678}
679device_initcall(rapl_pmu_init);
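Once rapl_pmu_init() succeeds, the counters are visible under the "power" PMU with the event aliases defined above, so a perf tool that parses the sysfs aliases can read package energy system-wide with, e.g., `perf stat -a -e power/energy-pkg/ -- sleep 10` (illustrative invocation, not from this patch).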
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 88db010845cb..384df5105fbc 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -31,20 +31,6 @@ static int __init x86_rdrand_setup(char *s)
31} 31}
32__setup("nordrand", x86_rdrand_setup); 32__setup("nordrand", x86_rdrand_setup);
33 33
34/* We can't use arch_get_random_long() here since alternatives haven't run */
35static inline int rdrand_long(unsigned long *v)
36{
37 int ok;
38 asm volatile("1: " RDRAND_LONG "\n\t"
39 "jc 2f\n\t"
40 "decl %0\n\t"
41 "jnz 1b\n\t"
42 "2:"
43 : "=r" (ok), "=a" (*v)
44 : "0" (RDRAND_RETRY_LOOPS));
45 return ok;
46}
47
48/* 34/*
49 * Force a reseed cycle; we are architecturally guaranteed a reseed 35 * Force a reseed cycle; we are architecturally guaranteed a reseed
50 * after no more than 512 128-bit chunks of random data. This also 36 * after no more than 512 128-bit chunks of random data. This also
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index aa0430d69b90..3fa0e5ad86b4 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,6 +1,5 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/mm.h> 2#include <linux/mm.h>
3#include <linux/init.h>
4#include <asm/processor.h> 3#include <asm/processor.h>
5#include <asm/msr.h> 4#include <asm/msr.h>
6#include "cpu.h" 5#include "cpu.h"
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index 75c5ad5d35cc..ef9c2a0078bd 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -1,5 +1,4 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <asm/processor.h> 2#include <asm/processor.h>
4#include "cpu.h" 3#include "cpu.h"
5 4
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 18677a90d6a3..a57902efe2d5 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -7,7 +7,6 @@
7 * 7 *
8 */ 8 */
9 9
10#include <linux/init.h>
11#include <linux/types.h> 10#include <linux/types.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/smp.h> 12#include <linux/smp.h>
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index 5d3fe8d36e4a..f6dfd9334b67 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -1,6 +1,5 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/init_task.h> 3#include <linux/init_task.h>
5#include <linux/fs.h> 4#include <linux/fs.h>
6 5
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 174da5fc5a7b..988c00a1f60d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void)
1120 nr_pages += end_pfn - start_pfn; 1120 nr_pages += end_pfn - start_pfn;
1121 } 1121 }
1122 1122
1123 for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { 1123 for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
1124 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); 1124 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
1125 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); 1125 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
1126 if (start_pfn < end_pfn) 1126 if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 51e2988c5728..a2a4f4697889 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1082,7 +1082,7 @@ ENTRY(ftrace_caller)
1082 pushl $0 /* Pass NULL as regs pointer */ 1082 pushl $0 /* Pass NULL as regs pointer */
1083 movl 4*4(%esp), %eax 1083 movl 4*4(%esp), %eax
1084 movl 0x4(%ebp), %edx 1084 movl 0x4(%ebp), %edx
1085 leal function_trace_op, %ecx 1085 movl function_trace_op, %ecx
1086 subl $MCOUNT_INSN_SIZE, %eax 1086 subl $MCOUNT_INSN_SIZE, %eax
1087 1087
1088.globl ftrace_call 1088.globl ftrace_call
@@ -1140,7 +1140,7 @@ ENTRY(ftrace_regs_caller)
1140 movl 12*4(%esp), %eax /* Load ip (1st parameter) */ 1140 movl 12*4(%esp), %eax /* Load ip (1st parameter) */
1141 subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ 1141 subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
1142 movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ 1142 movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
1143 leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ 1143 movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
1144 pushl %esp /* Save pt_regs as 4th parameter */ 1144 pushl %esp /* Save pt_regs as 4th parameter */
1145 1145
1146GLOBAL(ftrace_regs_call) 1146GLOBAL(ftrace_regs_call)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e21b0785a85b..1e96c3628bf2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -88,7 +88,7 @@ END(function_hook)
88 MCOUNT_SAVE_FRAME \skip 88 MCOUNT_SAVE_FRAME \skip
89 89
90 /* Load the ftrace_ops into the 3rd parameter */ 90 /* Load the ftrace_ops into the 3rd parameter */
91 leaq function_trace_op, %rdx 91 movq function_trace_op(%rip), %rdx
92 92
93 /* Load ip into the first parameter */ 93 /* Load ip into the first parameter */
94 movq RIP(%rsp), %rdi 94 movq RIP(%rsp), %rdi
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index f66ff162dce8..a67b47c31314 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -38,7 +38,6 @@
38#include <linux/kernel.h> 38#include <linux/kernel.h>
39#include <linux/module.h> 39#include <linux/module.h>
40#include <linux/sched.h> 40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h> 41#include <linux/smp.h>
43 42
44#include <asm/hw_breakpoint.h> 43#include <asm/hw_breakpoint.h>
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
new file mode 100644
index 000000000000..c3aae6672843
--- /dev/null
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -0,0 +1,226 @@
1/*
2 * IOSF-SB MailBox Interface Driver
3 * Copyright (c) 2013, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 *
 15 * The IOSF-SB is a fabric bus available on Atom-based SoCs that uses a
 16 * mailbox interface (MBI) to communicate with multiple devices. This
17 * driver implements access to this interface for those platforms that can
18 * enumerate the device using PCI.
19 */
20
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/spinlock.h>
24#include <linux/pci.h>
25
26#include <asm/iosf_mbi.h>
27
28static DEFINE_SPINLOCK(iosf_mbi_lock);
29
30static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
31{
32 return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
33}
34
35static struct pci_dev *mbi_pdev; /* one mbi device */
36
37static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
38{
39 int result;
40
41 if (!mbi_pdev)
42 return -ENODEV;
43
44 if (mcrx) {
45 result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
46 mcrx);
47 if (result < 0)
48 goto fail_read;
49 }
50
51 result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
52 if (result < 0)
53 goto fail_read;
54
55 result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
56 if (result < 0)
57 goto fail_read;
58
59 return 0;
60
61fail_read:
62 dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
63 return result;
64}
65
66static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
67{
68 int result;
69
70 if (!mbi_pdev)
71 return -ENODEV;
72
73 result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
74 if (result < 0)
75 goto fail_write;
76
77 if (mcrx) {
78 result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
79 mcrx);
80 if (result < 0)
81 goto fail_write;
82 }
83
84 result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
85 if (result < 0)
86 goto fail_write;
87
88 return 0;
89
90fail_write:
91 dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
92 return result;
93}
94
95int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
96{
97 u32 mcr, mcrx;
98 unsigned long flags;
99 int ret;
100
 101 /* Access to the GFX unit is handled by GPU code */
102 if (port == BT_MBI_UNIT_GFX) {
103 WARN_ON(1);
104 return -EPERM;
105 }
106
107 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
108 mcrx = offset & MBI_MASK_HI;
109
110 spin_lock_irqsave(&iosf_mbi_lock, flags);
111 ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr);
112 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
113
114 return ret;
115}
116EXPORT_SYMBOL(iosf_mbi_read);
117
118int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
119{
120 u32 mcr, mcrx;
121 unsigned long flags;
122 int ret;
123
 124 /* Access to the GFX unit is handled by GPU code */
125 if (port == BT_MBI_UNIT_GFX) {
126 WARN_ON(1);
127 return -EPERM;
128 }
129
130 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
131 mcrx = offset & MBI_MASK_HI;
132
133 spin_lock_irqsave(&iosf_mbi_lock, flags);
134 ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr);
135 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
136
137 return ret;
138}
139EXPORT_SYMBOL(iosf_mbi_write);
140
141int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
142{
143 u32 mcr, mcrx;
144 u32 value;
145 unsigned long flags;
146 int ret;
147
 148 /* Access to the GFX unit is handled by GPU code */
149 if (port == BT_MBI_UNIT_GFX) {
150 WARN_ON(1);
151 return -EPERM;
152 }
153
154 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
155 mcrx = offset & MBI_MASK_HI;
156
157 spin_lock_irqsave(&iosf_mbi_lock, flags);
158
159 /* Read current mdr value */
160 ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value);
161 if (ret < 0) {
162 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
163 return ret;
164 }
165
166 /* Apply mask */
167 value &= ~mask;
168 mdr &= mask;
169 value |= mdr;
170
171 /* Write back */
172 ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value);
173
174 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
175
176 return ret;
177}
178EXPORT_SYMBOL(iosf_mbi_modify);
179
180static int iosf_mbi_probe(struct pci_dev *pdev,
181 const struct pci_device_id *unused)
182{
183 int ret;
184
185 ret = pci_enable_device(pdev);
186 if (ret < 0) {
187 dev_err(&pdev->dev, "error: could not enable device\n");
188 return ret;
189 }
190
191 mbi_pdev = pci_dev_get(pdev);
192 return 0;
193}
194
195static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = {
196 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) },
197 { 0, },
198};
199MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
200
201static struct pci_driver iosf_mbi_pci_driver = {
202 .name = "iosf_mbi_pci",
203 .probe = iosf_mbi_probe,
204 .id_table = iosf_mbi_pci_ids,
205};
206
207static int __init iosf_mbi_init(void)
208{
209 return pci_register_driver(&iosf_mbi_pci_driver);
210}
211
212static void __exit iosf_mbi_exit(void)
213{
214 pci_unregister_driver(&iosf_mbi_pci_driver);
215 if (mbi_pdev) {
216 pci_dev_put(mbi_pdev);
217 mbi_pdev = NULL;
218 }
219}
220
221module_init(iosf_mbi_init);
222module_exit(iosf_mbi_exit);
223
224MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
225MODULE_DESCRIPTION("IOSF Mailbox Interface accessor");
226MODULE_LICENSE("GPL v2");
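A sketch of how a platform driver might use this accessor; the port, opcode, and offset values below are illustrative placeholders, not taken from this patch:

	#include <linux/printk.h>
	#include <asm/iosf_mbi.h>

	static int example_set_bit0(u8 port, u8 rd_op, u8 wr_op, u32 offset)
	{
		u32 val;
		int ret;

		ret = iosf_mbi_read(port, rd_op, offset, &val);
		if (ret)
			return ret;
		pr_info("reg %#x = %#x\n", offset, val);

		/* locked read-modify-write: set bit 0, leave the rest alone */
		return iosf_mbi_modify(port, wr_op, offset, 0x1, 0x1);
	}

Note that iosf_mbi_modify() holds iosf_mbi_lock across the read with MBI_RD_MASK and the write-back with MBI_WR_MASK, so the update is atomic with respect to other mailbox users.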
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 22d0687e7fda..dbb60878b744 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -193,9 +193,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
193 if (!handle_irq(irq, regs)) { 193 if (!handle_irq(irq, regs)) {
194 ack_APIC_irq(); 194 ack_APIC_irq();
195 195
196 if (printk_ratelimit()) 196 if (irq != VECTOR_RETRIGGERED) {
197 pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", 197 pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
198 __func__, smp_processor_id(), vector, irq); 198 __func__, smp_processor_id(),
199 vector, irq);
200 } else {
201 __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
202 }
199 } 203 }
200 204
201 irq_exit(); 205 irq_exit();
@@ -262,6 +266,76 @@ __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
262EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 266EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
263 267
264#ifdef CONFIG_HOTPLUG_CPU 268#ifdef CONFIG_HOTPLUG_CPU
269/*
270 * This cpu is going to be removed and its vectors migrated to the remaining
271 * online cpus. Check to see if there are enough vectors in the remaining cpus.
272 * This function is protected by stop_machine().
273 */
274int check_irq_vectors_for_cpu_disable(void)
275{
276 int irq, cpu;
277 unsigned int this_cpu, vector, this_count, count;
278 struct irq_desc *desc;
279 struct irq_data *data;
280 struct cpumask affinity_new, online_new;
281
282 this_cpu = smp_processor_id();
283 cpumask_copy(&online_new, cpu_online_mask);
284 cpu_clear(this_cpu, online_new);
285
286 this_count = 0;
287 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
288 irq = __this_cpu_read(vector_irq[vector]);
289 if (irq >= 0) {
290 desc = irq_to_desc(irq);
291 data = irq_desc_get_irq_data(desc);
292 cpumask_copy(&affinity_new, data->affinity);
293 cpu_clear(this_cpu, affinity_new);
294
295 /* Do not count inactive or per-cpu irqs. */
296 if (!irq_has_action(irq) || irqd_is_per_cpu(data))
297 continue;
298
299 /*
300 * A single irq may be mapped to multiple
301 * cpu's vector_irq[] (for example IOAPIC cluster
302 * mode). In this case we have two
303 * possibilities:
304 *
 305 * 1) the resulting affinity mask is empty; that is,
 306 * the down'd cpu is the last cpu in the irq's
307 * affinity mask, or
308 *
309 * 2) the resulting affinity mask is no longer
310 * a subset of the online cpus but the affinity
 311 * mask is not zero; that is, the down'd cpu is the
 312 * last online cpu in a user-set affinity mask.
313 */
314 if (cpumask_empty(&affinity_new) ||
315 !cpumask_subset(&affinity_new, &online_new))
316 this_count++;
317 }
318 }
319
320 count = 0;
321 for_each_online_cpu(cpu) {
322 if (cpu == this_cpu)
323 continue;
324 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
325 vector++) {
326 if (per_cpu(vector_irq, cpu)[vector] < 0)
327 count++;
328 }
329 }
330
331 if (count < this_count) {
332 pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n",
333 this_cpu, this_count, count);
334 return -ERANGE;
335 }
336 return 0;
337}
338
265/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 339/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
266void fixup_irqs(void) 340void fixup_irqs(void)
267{ 341{
@@ -344,7 +418,7 @@ void fixup_irqs(void)
344 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 418 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
345 unsigned int irr; 419 unsigned int irr;
346 420
347 if (__this_cpu_read(vector_irq[vector]) < 0) 421 if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
348 continue; 422 continue;
349 423
350 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); 424 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -355,11 +429,14 @@ void fixup_irqs(void)
355 data = irq_desc_get_irq_data(desc); 429 data = irq_desc_get_irq_data(desc);
356 chip = irq_data_get_irq_chip(data); 430 chip = irq_data_get_irq_chip(data);
357 raw_spin_lock(&desc->lock); 431 raw_spin_lock(&desc->lock);
358 if (chip->irq_retrigger) 432 if (chip->irq_retrigger) {
359 chip->irq_retrigger(data); 433 chip->irq_retrigger(data);
434 __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
435 }
360 raw_spin_unlock(&desc->lock); 436 raw_spin_unlock(&desc->lock);
361 } 437 }
362 __this_cpu_write(vector_irq[vector], -1); 438 if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
439 __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
363 } 440 }
364} 441}
365#endif 442#endif
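Worked example of check_irq_vectors_for_cpu_disable(): if the CPU being taken down is the last cpu in the affinity mask of 3 active irqs (this_count = 3) while the remaining online CPUs have only 2 unassigned vector_irq[] slots between them (count = 2), native_cpu_disable() now fails with -ERANGE instead of stranding an interrupt.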
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a2a1fbc594ff..7f50156542fb 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
52}; 52};
53 53
54DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 54DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
55 [0 ... NR_VECTORS - 1] = -1, 55 [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
56}; 56};
57 57
58int vector_used_by_percpu_irq(unsigned int vector) 58int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
60 int cpu; 60 int cpu;
61 61
62 for_each_online_cpu(cpu) { 62 for_each_online_cpu(cpu) {
63 if (per_cpu(vector_irq, cpu)[vector] != -1) 63 if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
64 return 1; 64 return 1;
65 } 65 }
66 66
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 836f8322960e..7ec1d5f8d283 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -39,7 +39,6 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/delay.h> 40#include <linux/delay.h>
41#include <linux/kgdb.h> 41#include <linux/kgdb.h>
42#include <linux/init.h>
43#include <linux/smp.h> 42#include <linux/smp.h>
44#include <linux/nmi.h> 43#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h> 44#include <linux/hw_breakpoint.h>
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
new file mode 100644
index 000000000000..c2bedaea11f7
--- /dev/null
+++ b/arch/x86/kernel/ksysfs.c
@@ -0,0 +1,340 @@
1/*
2 * Architecture specific sysfs attributes in /sys/kernel
3 *
4 * Copyright (C) 2007, Intel Corp.
5 * Huang Ying <ying.huang@intel.com>
 6 * Copyright (C) 2013 Red Hat, Inc.
7 * Dave Young <dyoung@redhat.com>
8 *
9 * This file is released under the GPLv2
10 */
11
12#include <linux/kobject.h>
13#include <linux/string.h>
14#include <linux/sysfs.h>
15#include <linux/init.h>
16#include <linux/stat.h>
17#include <linux/slab.h>
18#include <linux/mm.h>
19
20#include <asm/io.h>
21#include <asm/setup.h>
22
23static ssize_t version_show(struct kobject *kobj,
24 struct kobj_attribute *attr, char *buf)
25{
26 return sprintf(buf, "0x%04x\n", boot_params.hdr.version);
27}
28
29static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
30
31static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
32 struct bin_attribute *bin_attr,
33 char *buf, loff_t off, size_t count)
34{
35 memcpy(buf, (void *)&boot_params + off, count);
36 return count;
37}
38
39static struct bin_attribute boot_params_data_attr = {
40 .attr = {
41 .name = "data",
42 .mode = S_IRUGO,
43 },
44 .read = boot_params_data_read,
45 .size = sizeof(boot_params),
46};
47
48static struct attribute *boot_params_version_attrs[] = {
49 &boot_params_version_attr.attr,
50 NULL,
51};
52
53static struct bin_attribute *boot_params_data_attrs[] = {
54 &boot_params_data_attr,
55 NULL,
56};
57
58static struct attribute_group boot_params_attr_group = {
59 .attrs = boot_params_version_attrs,
60 .bin_attrs = boot_params_data_attrs,
61};
62
63static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
64{
65 const char *name;
66
67 name = kobject_name(kobj);
68 return kstrtoint(name, 10, nr);
69}
70
71static int get_setup_data_paddr(int nr, u64 *paddr)
72{
73 int i = 0;
74 struct setup_data *data;
75 u64 pa_data = boot_params.hdr.setup_data;
76
77 while (pa_data) {
78 if (nr == i) {
79 *paddr = pa_data;
80 return 0;
81 }
82 data = ioremap_cache(pa_data, sizeof(*data));
83 if (!data)
84 return -ENOMEM;
85
86 pa_data = data->next;
87 iounmap(data);
88 i++;
89 }
90 return -EINVAL;
91}
92
93static int __init get_setup_data_size(int nr, size_t *size)
94{
95 int i = 0;
96 struct setup_data *data;
97 u64 pa_data = boot_params.hdr.setup_data;
98
99 while (pa_data) {
100 data = ioremap_cache(pa_data, sizeof(*data));
101 if (!data)
102 return -ENOMEM;
103 if (nr == i) {
104 *size = data->len;
105 iounmap(data);
106 return 0;
107 }
108
109 pa_data = data->next;
110 iounmap(data);
111 i++;
112 }
113 return -EINVAL;
114}
115
116static ssize_t type_show(struct kobject *kobj,
117 struct kobj_attribute *attr, char *buf)
118{
119 int nr, ret;
120 u64 paddr;
121 struct setup_data *data;
122
123 ret = kobj_to_setup_data_nr(kobj, &nr);
124 if (ret)
125 return ret;
126
127 ret = get_setup_data_paddr(nr, &paddr);
128 if (ret)
129 return ret;
130 data = ioremap_cache(paddr, sizeof(*data));
131 if (!data)
132 return -ENOMEM;
133
134 ret = sprintf(buf, "0x%x\n", data->type);
135 iounmap(data);
136 return ret;
137}
138
139static ssize_t setup_data_data_read(struct file *fp,
140 struct kobject *kobj,
141 struct bin_attribute *bin_attr,
142 char *buf,
143 loff_t off, size_t count)
144{
145 int nr, ret = 0;
146 u64 paddr;
147 struct setup_data *data;
148 void *p;
149
150 ret = kobj_to_setup_data_nr(kobj, &nr);
151 if (ret)
152 return ret;
153
154 ret = get_setup_data_paddr(nr, &paddr);
155 if (ret)
156 return ret;
157 data = ioremap_cache(paddr, sizeof(*data));
158 if (!data)
159 return -ENOMEM;
160
161 if (off > data->len) {
162 ret = -EINVAL;
163 goto out;
164 }
165
166 if (count > data->len - off)
167 count = data->len - off;
168
169 if (!count)
170 goto out;
171
172 ret = count;
173 p = ioremap_cache(paddr + sizeof(*data), data->len);
174 if (!p) {
175 ret = -ENOMEM;
176 goto out;
177 }
178 memcpy(buf, p + off, count);
179 iounmap(p);
180out:
181 iounmap(data);
182 return ret;
183}
184
185static struct kobj_attribute type_attr = __ATTR_RO(type);
186
187static struct bin_attribute data_attr = {
188 .attr = {
189 .name = "data",
190 .mode = S_IRUGO,
191 },
192 .read = setup_data_data_read,
193};
194
195static struct attribute *setup_data_type_attrs[] = {
196 &type_attr.attr,
197 NULL,
198};
199
200static struct bin_attribute *setup_data_data_attrs[] = {
201 &data_attr,
202 NULL,
203};
204
205static struct attribute_group setup_data_attr_group = {
206 .attrs = setup_data_type_attrs,
207 .bin_attrs = setup_data_data_attrs,
208};
209
210static int __init create_setup_data_node(struct kobject *parent,
211 struct kobject **kobjp, int nr)
212{
213 int ret = 0;
214 size_t size;
215 struct kobject *kobj;
 216 char name[16]; /* should be enough for setup_data node numbers */
217 snprintf(name, 16, "%d", nr);
218
219 kobj = kobject_create_and_add(name, parent);
220 if (!kobj)
221 return -ENOMEM;
222
223 ret = get_setup_data_size(nr, &size);
224 if (ret)
225 goto out_kobj;
226
227 data_attr.size = size;
228 ret = sysfs_create_group(kobj, &setup_data_attr_group);
229 if (ret)
230 goto out_kobj;
231 *kobjp = kobj;
232
233 return 0;
234out_kobj:
235 kobject_put(kobj);
236 return ret;
237}
238
239static void __init cleanup_setup_data_node(struct kobject *kobj)
240{
241 sysfs_remove_group(kobj, &setup_data_attr_group);
242 kobject_put(kobj);
243}
244
245static int __init get_setup_data_total_num(u64 pa_data, int *nr)
246{
247 int ret = 0;
248 struct setup_data *data;
249
250 *nr = 0;
251 while (pa_data) {
252 *nr += 1;
253 data = ioremap_cache(pa_data, sizeof(*data));
254 if (!data) {
255 ret = -ENOMEM;
256 goto out;
257 }
258 pa_data = data->next;
259 iounmap(data);
260 }
261
262out:
263 return ret;
264}
265
266static int __init create_setup_data_nodes(struct kobject *parent)
267{
268 struct kobject *setup_data_kobj, **kobjp;
269 u64 pa_data;
270 int i, j, nr, ret = 0;
271
272 pa_data = boot_params.hdr.setup_data;
273 if (!pa_data)
274 return 0;
275
276 setup_data_kobj = kobject_create_and_add("setup_data", parent);
277 if (!setup_data_kobj) {
278 ret = -ENOMEM;
279 goto out;
280 }
281
282 ret = get_setup_data_total_num(pa_data, &nr);
283 if (ret)
284 goto out_setup_data_kobj;
285
286 kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL);
287 if (!kobjp) {
288 ret = -ENOMEM;
289 goto out_setup_data_kobj;
290 }
291
292 for (i = 0; i < nr; i++) {
293 ret = create_setup_data_node(setup_data_kobj, kobjp + i, i);
294 if (ret)
295 goto out_clean_nodes;
296 }
297
298 kfree(kobjp);
299 return 0;
300
301out_clean_nodes:
 302 for (j = i - 1; j >= 0; j--)
303 cleanup_setup_data_node(*(kobjp + j));
304 kfree(kobjp);
305out_setup_data_kobj:
306 kobject_put(setup_data_kobj);
307out:
308 return ret;
309}
310
311static int __init boot_params_ksysfs_init(void)
312{
313 int ret;
314 struct kobject *boot_params_kobj;
315
316 boot_params_kobj = kobject_create_and_add("boot_params",
317 kernel_kobj);
318 if (!boot_params_kobj) {
319 ret = -ENOMEM;
320 goto out;
321 }
322
323 ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group);
324 if (ret)
325 goto out_boot_params_kobj;
326
327 ret = create_setup_data_nodes(boot_params_kobj);
328 if (ret)
329 goto out_create_group;
330
331 return 0;
332out_create_group:
333 sysfs_remove_group(boot_params_kobj, &boot_params_attr_group);
334out_boot_params_kobj:
335 kobject_put(boot_params_kobj);
336out:
337 return ret;
338}
339
340arch_initcall(boot_params_ksysfs_init);
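The resulting sysfs layout, with paths derived from the kobject and attribute names above (the number of setup_data nodes depends on what the bootloader passed):

	/sys/kernel/boot_params/version             boot protocol version, e.g. "0x020c"
	/sys/kernel/boot_params/data                raw struct boot_params image
	/sys/kernel/boot_params/setup_data/0/type   type of the first setup_data node
	/sys/kernel/boot_params/setup_data/0/data   payload of the first setup_data node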
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 5b19e4d78b00..1667b1de8d5d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -9,7 +9,6 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/kexec.h> 10#include <linux/kexec.h>
11#include <linux/delay.h> 11#include <linux/delay.h>
12#include <linux/init.h>
13#include <linux/numa.h> 12#include <linux/numa.h>
14#include <linux/ftrace.h> 13#include <linux/ftrace.h>
15#include <linux/suspend.h> 14#include <linux/suspend.h>
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 871be4a84c7d..da15918d1c81 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -3,7 +3,6 @@
3#include <linux/dma-mapping.h> 3#include <linux/dma-mapping.h>
4#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <linux/init.h>
7#include <linux/gfp.h> 6#include <linux/gfp.h>
8#include <linux/pci.h> 7#include <linux/pci.h>
9#include <linux/mm.h> 8#include <linux/mm.h>
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 6f1236c29c4b..0de43e98ce08 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -24,7 +24,6 @@
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/reboot.h> 26#include <linux/reboot.h>
27#include <linux/init.h>
28#include <linux/mc146818rtc.h> 27#include <linux/mc146818rtc.h>
29#include <linux/module.h> 28#include <linux/module.h>
30#include <linux/kallsyms.h> 29#include <linux/kallsyms.h>
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index da3c599584a3..c752cb43e52f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -558,6 +558,17 @@ void native_machine_shutdown(void)
558{ 558{
559 /* Stop the cpus and apics */ 559 /* Stop the cpus and apics */
560#ifdef CONFIG_X86_IO_APIC 560#ifdef CONFIG_X86_IO_APIC
561 /*
562 * Disabling IO APIC before local APIC is a workaround for
563 * erratum AVR31 in "Intel Atom Processor C2000 Product Family
564 * Specification Update". In this situation, interrupts that target
565 * a Logical Processor whose Local APIC is either in the process of
566 * being hardware disabled or software disabled are neither delivered
567 * nor discarded. When this erratum occurs, the processor may hang.
568 *
569 * Even without the erratum, it still makes sense to quiet IO APIC
570 * before disabling Local APIC.
571 */
561 disable_IO_APIC(); 572 disable_IO_APIC();
562#endif 573#endif
563 574
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cb233bc9dee3..c9675594d7ca 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -295,6 +295,8 @@ static void __init reserve_brk(void)
295 _brk_start = 0; 295 _brk_start = 0;
296} 296}
297 297
298u64 relocated_ramdisk;
299
298#ifdef CONFIG_BLK_DEV_INITRD 300#ifdef CONFIG_BLK_DEV_INITRD
299 301
300static u64 __init get_ramdisk_image(void) 302static u64 __init get_ramdisk_image(void)
@@ -321,25 +323,24 @@ static void __init relocate_initrd(void)
321 u64 ramdisk_image = get_ramdisk_image(); 323 u64 ramdisk_image = get_ramdisk_image();
322 u64 ramdisk_size = get_ramdisk_size(); 324 u64 ramdisk_size = get_ramdisk_size();
323 u64 area_size = PAGE_ALIGN(ramdisk_size); 325 u64 area_size = PAGE_ALIGN(ramdisk_size);
324 u64 ramdisk_here;
325 unsigned long slop, clen, mapaddr; 326 unsigned long slop, clen, mapaddr;
326 char *p, *q; 327 char *p, *q;
327 328
328 /* We need to move the initrd down into directly mapped mem */ 329 /* We need to move the initrd down into directly mapped mem */
329 ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 330 relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
330 area_size, PAGE_SIZE); 331 area_size, PAGE_SIZE);
331 332
332 if (!ramdisk_here) 333 if (!relocated_ramdisk)
333 panic("Cannot find place for new RAMDISK of size %lld\n", 334 panic("Cannot find place for new RAMDISK of size %lld\n",
334 ramdisk_size); 335 ramdisk_size);
335 336
336 /* Note: this includes all the mem currently occupied by 337 /* Note: this includes all the mem currently occupied by
337 the initrd, we rely on that fact to keep the data intact. */ 338 the initrd, we rely on that fact to keep the data intact. */
338 memblock_reserve(ramdisk_here, area_size); 339 memblock_reserve(relocated_ramdisk, area_size);
339 initrd_start = ramdisk_here + PAGE_OFFSET; 340 initrd_start = relocated_ramdisk + PAGE_OFFSET;
340 initrd_end = initrd_start + ramdisk_size; 341 initrd_end = initrd_start + ramdisk_size;
341 printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", 342 printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
342 ramdisk_here, ramdisk_here + ramdisk_size - 1); 343 relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
343 344
344 q = (char *)initrd_start; 345 q = (char *)initrd_start;
345 346
@@ -363,7 +364,7 @@ static void __init relocate_initrd(void)
363 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" 364 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
364 " [mem %#010llx-%#010llx]\n", 365 " [mem %#010llx-%#010llx]\n",
365 ramdisk_image, ramdisk_image + ramdisk_size - 1, 366 ramdisk_image, ramdisk_image + ramdisk_size - 1,
366 ramdisk_here, ramdisk_here + ramdisk_size - 1); 367 relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
367} 368}
368 369
369static void __init early_reserve_initrd(void) 370static void __init early_reserve_initrd(void)
@@ -447,6 +448,9 @@ static void __init parse_setup_data(void)
447 case SETUP_DTB: 448 case SETUP_DTB:
448 add_dtb(pa_data); 449 add_dtb(pa_data);
449 break; 450 break;
451 case SETUP_EFI:
452 parse_efi_setup(pa_data, data_len);
453 break;
450 default: 454 default:
451 break; 455 break;
452 } 456 }
@@ -824,6 +828,20 @@ static void __init trim_low_memory_range(void)
824} 828}
825 829
826/* 830/*
831 * Dump out kernel offset information on panic.
832 */
833static int
834dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
835{
836 pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
837 "(relocation range: 0x%lx-0x%lx)\n",
838 (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
839 __START_KERNEL_map, MODULES_VADDR-1);
840
841 return 0;
842}
843
844/*
827 * Determine if we were loaded by an EFI loader. If so, then we have also been 845 * Determine if we were loaded by an EFI loader. If so, then we have also been
828 * passed the efi memmap, systab, etc., so we should use these data structures 846 * passed the efi memmap, systab, etc., so we should use these data structures
829 * for initialization. Note, the efi init code path is determined by the 847 * for initialization. Note, the efi init code path is determined by the
@@ -924,8 +942,6 @@ void __init setup_arch(char **cmdline_p)
924 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; 942 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
925 setup_memory_map(); 943 setup_memory_map();
926 parse_setup_data(); 944 parse_setup_data();
927 /* update the e820_saved too */
928 e820_reserve_setup_data();
929 945
930 copy_edd(); 946 copy_edd();
931 947
@@ -987,6 +1003,8 @@ void __init setup_arch(char **cmdline_p)
987 early_dump_pci_devices(); 1003 early_dump_pci_devices();
988#endif 1004#endif
989 1005
1006 /* update the e820_saved too */
1007 e820_reserve_setup_data();
990 finish_e820_parsing(); 1008 finish_e820_parsing();
991 1009
992 if (efi_enabled(EFI_BOOT)) 1010 if (efi_enabled(EFI_BOOT))
@@ -1101,7 +1119,7 @@ void __init setup_arch(char **cmdline_p)
1101 1119
1102 setup_real_mode(); 1120 setup_real_mode();
1103 1121
1104 memblock_set_current_limit(get_max_mapped()); 1122 memblock_set_current_limit(get_max_low_mapped());
1105 dma_contiguous_reserve(0); 1123 dma_contiguous_reserve(0);
1106 1124
1107 /* 1125 /*
@@ -1248,3 +1266,15 @@ void __init i386_reserve_resources(void)
1248} 1266}
1249 1267
1250#endif /* CONFIG_X86_32 */ 1268#endif /* CONFIG_X86_32 */
1269
1270static struct notifier_block kernel_offset_notifier = {
1271 .notifier_call = dump_kernel_offset
1272};
1273
1274static int __init register_kernel_offset_dumper(void)
1275{
1276 atomic_notifier_chain_register(&panic_notifier_list,
1277 &kernel_offset_notifier);
1278 return 0;
1279}
1280__initcall(register_kernel_offset_dumper);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85dc05a3aa02..a32da804252e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1312,6 +1312,12 @@ void cpu_disable_common(void)
1312 1312
1313int native_cpu_disable(void) 1313int native_cpu_disable(void)
1314{ 1314{
1315 int ret;
1316
1317 ret = check_irq_vectors_for_cpu_disable();
1318 if (ret)
1319 return ret;
1320
1315 clear_local_APIC(); 1321 clear_local_APIC();
1316 1322
1317 cpu_disable_common(); 1323 cpu_disable_common();
@@ -1417,7 +1423,9 @@ static inline void mwait_play_dead(void)
1417 * The WBINVD is insufficient due to the spurious-wakeup 1423 * The WBINVD is insufficient due to the spurious-wakeup
1418 * case where we return around the loop. 1424 * case where we return around the loop.
1419 */ 1425 */
1426 mb();
1420 clflush(mwait_ptr); 1427 clflush(mwait_ptr);
1428 mb();
1421 __monitor(mwait_ptr, 0, 0); 1429 __monitor(mwait_ptr, 0, 0);
1422 mb(); 1430 mb();
1423 __mwait(eax, 0); 1431 __mwait(eax, 0);
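
The rationale for the added mb() pair: on some CPUs CLFLUSH is not ordered with respect to the surrounding operations unless it is fenced, so without the barriers the flush of the monitored line could be reordered around the MONITOR that re-arms it. A hedged sketch of the resulting sequence as a standalone helper; clflush_and_monitor() is a hypothetical name, not an existing kernel API.

/* Sketch only, assuming the same semantics as mwait_play_dead() above. */
static inline void clflush_and_monitor(void *line)
{
        mb();                   /* order earlier accesses before the flush */
        clflush(line);          /* CLFLUSH alone is weakly ordered */
        mb();                   /* flush completes before MONITOR is armed */
        __monitor(line, 0, 0);  /* arm the monitor on the flushed line */
        mb();
}
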
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b857ed890b4c..57409f6b8c62 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -211,21 +211,17 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
211 exception_exit(prev_state); \ 211 exception_exit(prev_state); \
212} 212}
213 213
214DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, 214DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip )
215 regs->ip) 215DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow )
216DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) 216DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds )
217DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) 217DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip )
218DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, 218DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun )
219 regs->ip) 219DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS )
220DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", 220DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present )
221 coprocessor_segment_overrun)
222DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
223DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
224#ifdef CONFIG_X86_32 221#ifdef CONFIG_X86_32
225DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) 222DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment )
226#endif 223#endif
227DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, 224DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 )
228 BUS_ADRALN, 0)
229 225
230#ifdef CONFIG_X86_64 226#ifdef CONFIG_X86_64
231/* Runs on IST stack */ 227/* Runs on IST stack */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d48f560..a3acbac2ee72 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
11#include <linux/clocksource.h> 11#include <linux/clocksource.h>
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/timex.h> 13#include <linux/timex.h>
14#include <linux/static_key.h>
14 15
15#include <asm/hpet.h> 16#include <asm/hpet.h>
16#include <asm/timer.h> 17#include <asm/timer.h>
@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;
37 erroneous rdtsc usage on !cpu_has_tsc processors */ 38 erroneous rdtsc usage on !cpu_has_tsc processors */
38static int __read_mostly tsc_disabled = -1; 39static int __read_mostly tsc_disabled = -1;
39 40
41static struct static_key __use_tsc = STATIC_KEY_INIT;
42
40int tsc_clocksource_reliable; 43int tsc_clocksource_reliable;
44
45/*
 46 * Use a ring-buffer-like data structure, where a writer advances the head by
47 * writing a new data entry and a reader advances the tail when it observes a
48 * new entry.
49 *
50 * Writers are made to wait on readers until there's space to write a new
51 * entry.
52 *
53 * This means that we can always use an {offset, mul} pair to compute a ns
54 * value that is 'roughly' in the right direction, even if we're writing a new
55 * {offset, mul} pair during the clock read.
56 *
 57 * The down-side is that we can no longer guarantee strict monotonicity
 58 * (assuming the TSC was monotonic to begin with): although we compute the
 59 * intersection point of the two clock slopes and make sure time is
 60 * continuous at the point of switching, we can no longer guarantee a reader
 61 * is strictly before or after the switch point.
62 *
 63 * It does mean a reader no longer needs to disable IRQs in order to avoid
 64 * cpufreq updates messing with its readings, and similarly an NMI reader
 65 * no longer runs the risk of hitting half-written state.
66 */
67
68struct cyc2ns {
69 struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
70 struct cyc2ns_data *head; /* 48 + 8 = 56 */
71 struct cyc2ns_data *tail; /* 56 + 8 = 64 */
72}; /* exactly fits one cacheline */
73
74static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
75
76struct cyc2ns_data *cyc2ns_read_begin(void)
77{
78 struct cyc2ns_data *head;
79
80 preempt_disable();
81
82 head = this_cpu_read(cyc2ns.head);
83 /*
84 * Ensure we observe the entry when we observe the pointer to it.
 85 * Matches the smp_wmb() in cyc2ns_write_end().
86 */
87 smp_read_barrier_depends();
88 head->__count++;
89 barrier();
90
91 return head;
92}
93
94void cyc2ns_read_end(struct cyc2ns_data *head)
95{
96 barrier();
97 /*
 98 * If we're the outermost nested read, update the tail pointer
99 * when we're done. This notifies possible pending writers
100 * that we've observed the head pointer and that the other
101 * entry is now free.
102 */
103 if (!--head->__count) {
104 /*
105 * x86-TSO does not reorder writes with older reads;
106 * therefore once this write becomes visible to another
 107 * CPU, we must be finished reading the cyc2ns_data.
 108 *
 109 * Matches the wait loop in cyc2ns_write_begin().
110 */
111 this_cpu_write(cyc2ns.tail, head);
112 }
113 preempt_enable();
114}
115
116/*
117 * Begin writing a new @data entry for @cpu.
118 *
119 * Assumes some sort of write side lock; currently 'provided' by the assumption
120 * that cpufreq will call its notifiers sequentially.
121 */
122static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
123{
124 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
125 struct cyc2ns_data *data = c2n->data;
126
127 if (data == c2n->head)
128 data++;
129
130 /* XXX send an IPI to @cpu in order to guarantee a read? */
131
132 /*
133 * When we observe the tail write from cyc2ns_read_end(),
 134 * the CPU must be done with that entry and it's safe
135 * to start writing to it.
136 */
137 while (c2n->tail == data)
138 cpu_relax();
139
140 return data;
141}
142
143static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
144{
145 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
146
147 /*
148 * Ensure the @data writes are visible before we publish the
 149 * entry. Matches the data dependency in cyc2ns_read_begin().
150 */
151 smp_wmb();
152
153 ACCESS_ONCE(c2n->head) = data;
154}
155
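
Together, the pair above gives a reader a stable {mul, shift, offset} snapshot without disabling IRQs. A hedged sketch of the intended API usage follows; example_cycles_to_ns() is illustrative only, and cycles_2_ns() below open-codes the same steps to save a few instructions.

/* Sketch only: consume a cyc2ns snapshot via the read-side API above. */
static u64 example_cycles_to_ns(u64 cyc)
{
        struct cyc2ns_data *data = cyc2ns_read_begin(); /* pins an entry */
        u64 ns;

        ns = data->cyc2ns_offset +
             mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);

        cyc2ns_read_end(data);  /* lets pending writers reuse the entry */
        return ns;
}
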
156/*
157 * Accelerators for sched_clock()
158 * convert from cycles(64bits) => nanoseconds (64bits)
159 * basic equation:
160 * ns = cycles / (freq / ns_per_sec)
161 * ns = cycles * (ns_per_sec / freq)
162 * ns = cycles * (10^9 / (cpu_khz * 10^3))
163 * ns = cycles * (10^6 / cpu_khz)
164 *
165 * Then we use scaling math (suggested by george@mvista.com) to get:
166 * ns = cycles * (10^6 * SC / cpu_khz) / SC
167 * ns = cycles * cyc2ns_scale / SC
168 *
169 * And since SC is a constant power of two, we can convert the div
170 * into a shift.
171 *
172 * We can use khz divisor instead of mhz to keep a better precision, since
173 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
174 * (mathieu.desnoyers@polymtl.ca)
175 *
176 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
177 */
178
179#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
180
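
A worked instance of the scaling math, assuming a hypothetical 2 GHz part (cpu_khz = 2,000,000):

/*
 * cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz)
 *            = (1000000 << 10) / 2000000, rounded
 *            = 512
 *
 * ns = (cycles * 512) >> 10 = cycles / 2,
 * i.e. 0.5 ns per cycle, as expected at 2 GHz.
 */
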
181static void cyc2ns_data_init(struct cyc2ns_data *data)
182{
183 data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR;
184 data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
185 data->cyc2ns_offset = 0;
186 data->__count = 0;
187}
188
189static void cyc2ns_init(int cpu)
190{
191 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
192
193 cyc2ns_data_init(&c2n->data[0]);
194 cyc2ns_data_init(&c2n->data[1]);
195
196 c2n->head = c2n->data;
197 c2n->tail = c2n->data;
198}
199
200static inline unsigned long long cycles_2_ns(unsigned long long cyc)
201{
202 struct cyc2ns_data *data, *tail;
203 unsigned long long ns;
204
205 /*
 206 * See cyc2ns_read_*() for details; replicated here to avoid the
 207 * few extra instructions that the abstraction would add.
 208 * Notably, it allows us to do the __count and tail update dance
 209 * only when it's actually needed.
210 */
211
212 preempt_disable();
213 data = this_cpu_read(cyc2ns.head);
214 tail = this_cpu_read(cyc2ns.tail);
215
216 if (likely(data == tail)) {
217 ns = data->cyc2ns_offset;
218 ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
219 } else {
220 data->__count++;
221
222 barrier();
223
224 ns = data->cyc2ns_offset;
225 ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
226
227 barrier();
228
229 if (!--data->__count)
230 this_cpu_write(cyc2ns.tail, data);
231 }
232 preempt_enable();
233
234 return ns;
235}
236
237/* XXX surely we already have this someplace in the kernel?! */
238#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
239
240static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
241{
242 unsigned long long tsc_now, ns_now;
243 struct cyc2ns_data *data;
244 unsigned long flags;
245
246 local_irq_save(flags);
247 sched_clock_idle_sleep_event();
248
249 if (!cpu_khz)
250 goto done;
251
252 data = cyc2ns_write_begin(cpu);
253
254 rdtscll(tsc_now);
255 ns_now = cycles_2_ns(tsc_now);
256
257 /*
258 * Compute a new multiplier as per the above comment and ensure our
259 * time function is continuous; see the comment near struct
260 * cyc2ns_data.
261 */
262 data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
263 data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
264 data->cyc2ns_offset = ns_now -
265 mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
266
267 cyc2ns_write_end(cpu, data);
268
269done:
270 sched_clock_idle_wakeup_event(0);
271 local_irq_restore(flags);
272}
41/* 273/*
42 * Scheduler clock - returns current time in nanosec units. 274 * Scheduler clock - returns current time in nanosec units.
43 */ 275 */
44u64 native_sched_clock(void) 276u64 native_sched_clock(void)
45{ 277{
46 u64 this_offset; 278 u64 tsc_now;
47 279
48 /* 280 /*
49 * Fall back to jiffies if there's no TSC available: 281 * Fall back to jiffies if there's no TSC available:
@@ -53,16 +285,16 @@ u64 native_sched_clock(void)
53 * very important for it to be as fast as the platform 285 * very important for it to be as fast as the platform
54 * can achieve it. ) 286 * can achieve it. )
55 */ 287 */
56 if (unlikely(tsc_disabled)) { 288 if (!static_key_false(&__use_tsc)) {
57 /* No locking but a rare wrong value is not a big deal: */ 289 /* No locking but a rare wrong value is not a big deal: */
58 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); 290 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
59 } 291 }
60 292
61 /* read the Time Stamp Counter: */ 293 /* read the Time Stamp Counter: */
62 rdtscll(this_offset); 294 rdtscll(tsc_now);
63 295
64 /* return the value in ns */ 296 /* return the value in ns */
65 return __cycles_2_ns(this_offset); 297 return cycles_2_ns(tsc_now);
66} 298}
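
The __use_tsc conversion above is the jump-label idiom: static_key_false() compiles to a patchable branch that is a no-op on the fast path, so the jiffies fallback costs nothing once the key is enabled. A minimal hedged sketch of the same idiom with the 3.13-era API; feature_key and the stub functions are illustrative, not part of the patch.

/* Sketch only: default-off key, flipped once at init. */
static struct static_key feature_key = STATIC_KEY_INIT;

static u64 slow_fallback_clock(void);   /* illustrative stubs */
static u64 fast_clock(void);

static u64 read_clock(void)
{
        if (!static_key_false(&feature_key))    /* patched out once enabled */
                return slow_fallback_clock();
        return fast_clock();
}

static void __init enable_fast_clock(void)
{
        static_key_slow_inc(&feature_key);      /* rewrites every call site */
}
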
67 299
68/* We need to define a real function for sched_clock, to override the 300/* We need to define a real function for sched_clock, to override the
@@ -419,6 +651,16 @@ unsigned long native_calibrate_tsc(void)
419 unsigned long flags, latch, ms, fast_calibrate; 651 unsigned long flags, latch, ms, fast_calibrate;
420 int hpet = is_hpet_enabled(), i, loopmin; 652 int hpet = is_hpet_enabled(), i, loopmin;
421 653
654 /* Calibrate TSC using MSR for Intel Atom SoCs */
655 local_irq_save(flags);
656 i = try_msr_calibrate_tsc(&fast_calibrate);
657 local_irq_restore(flags);
658 if (i >= 0) {
659 if (i == 0)
660 pr_warn("Fast TSC calibration using MSR failed\n");
661 return fast_calibrate;
662 }
663
422 local_irq_save(flags); 664 local_irq_save(flags);
423 fast_calibrate = quick_pit_calibrate(); 665 fast_calibrate = quick_pit_calibrate();
424 local_irq_restore(flags); 666 local_irq_restore(flags);
@@ -589,61 +831,11 @@ int recalibrate_cpu_khz(void)
589EXPORT_SYMBOL(recalibrate_cpu_khz); 831EXPORT_SYMBOL(recalibrate_cpu_khz);
590 832
591 833
592/* Accelerators for sched_clock()
593 * convert from cycles(64bits) => nanoseconds (64bits)
594 * basic equation:
595 * ns = cycles / (freq / ns_per_sec)
596 * ns = cycles * (ns_per_sec / freq)
597 * ns = cycles * (10^9 / (cpu_khz * 10^3))
598 * ns = cycles * (10^6 / cpu_khz)
599 *
600 * Then we use scaling math (suggested by george@mvista.com) to get:
601 * ns = cycles * (10^6 * SC / cpu_khz) / SC
602 * ns = cycles * cyc2ns_scale / SC
603 *
604 * And since SC is a constant power of two, we can convert the div
605 * into a shift.
606 *
607 * We can use khz divisor instead of mhz to keep a better precision, since
608 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
609 * (mathieu.desnoyers@polymtl.ca)
610 *
611 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
612 */
613
614DEFINE_PER_CPU(unsigned long, cyc2ns);
615DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
616
617static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
618{
619 unsigned long long tsc_now, ns_now, *offset;
620 unsigned long flags, *scale;
621
622 local_irq_save(flags);
623 sched_clock_idle_sleep_event();
624
625 scale = &per_cpu(cyc2ns, cpu);
626 offset = &per_cpu(cyc2ns_offset, cpu);
627
628 rdtscll(tsc_now);
629 ns_now = __cycles_2_ns(tsc_now);
630
631 if (cpu_khz) {
632 *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
633 cpu_khz / 2) / cpu_khz;
634 *offset = ns_now - mult_frac(tsc_now, *scale,
635 (1UL << CYC2NS_SCALE_FACTOR));
636 }
637
638 sched_clock_idle_wakeup_event(0);
639 local_irq_restore(flags);
640}
641
642static unsigned long long cyc2ns_suspend; 834static unsigned long long cyc2ns_suspend;
643 835
644void tsc_save_sched_clock_state(void) 836void tsc_save_sched_clock_state(void)
645{ 837{
646 if (!sched_clock_stable) 838 if (!sched_clock_stable())
647 return; 839 return;
648 840
649 cyc2ns_suspend = sched_clock(); 841 cyc2ns_suspend = sched_clock();
@@ -663,16 +855,26 @@ void tsc_restore_sched_clock_state(void)
663 unsigned long flags; 855 unsigned long flags;
664 int cpu; 856 int cpu;
665 857
666 if (!sched_clock_stable) 858 if (!sched_clock_stable())
667 return; 859 return;
668 860
669 local_irq_save(flags); 861 local_irq_save(flags);
670 862
671 __this_cpu_write(cyc2ns_offset, 0); 863 /*
 864 * We're coming out of suspend, there's no concurrency yet; don't
865 * bother being nice about the RCU stuff, just write to both
866 * data fields.
867 */
868
869 this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
870 this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
871
672 offset = cyc2ns_suspend - sched_clock(); 872 offset = cyc2ns_suspend - sched_clock();
673 873
674 for_each_possible_cpu(cpu) 874 for_each_possible_cpu(cpu) {
675 per_cpu(cyc2ns_offset, cpu) = offset; 875 per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
876 per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
877 }
676 878
677 local_irq_restore(flags); 879 local_irq_restore(flags);
678} 880}
@@ -795,7 +997,7 @@ void mark_tsc_unstable(char *reason)
795{ 997{
796 if (!tsc_unstable) { 998 if (!tsc_unstable) {
797 tsc_unstable = 1; 999 tsc_unstable = 1;
798 sched_clock_stable = 0; 1000 clear_sched_clock_stable();
799 disable_sched_clock_irqtime(); 1001 disable_sched_clock_irqtime();
800 pr_info("Marking TSC unstable due to %s\n", reason); 1002 pr_info("Marking TSC unstable due to %s\n", reason);
801 /* Change only the rating, when not registered */ 1003 /* Change only the rating, when not registered */
@@ -995,14 +1197,18 @@ void __init tsc_init(void)
995 * speed as the bootup CPU. (cpufreq notifiers will fix this 1197 * speed as the bootup CPU. (cpufreq notifiers will fix this
996 * up if their speed diverges) 1198 * up if their speed diverges)
997 */ 1199 */
998 for_each_possible_cpu(cpu) 1200 for_each_possible_cpu(cpu) {
1201 cyc2ns_init(cpu);
999 set_cyc2ns_scale(cpu_khz, cpu); 1202 set_cyc2ns_scale(cpu_khz, cpu);
1203 }
1000 1204
1001 if (tsc_disabled > 0) 1205 if (tsc_disabled > 0)
1002 return; 1206 return;
1003 1207
1004 /* now allow native_sched_clock() to use rdtsc */ 1208 /* now allow native_sched_clock() to use rdtsc */
1209
1005 tsc_disabled = 0; 1210 tsc_disabled = 0;
1211 static_key_slow_inc(&__use_tsc);
1006 1212
1007 if (!no_sched_irq_time) 1213 if (!no_sched_irq_time)
1008 enable_sched_clock_irqtime(); 1214 enable_sched_clock_irqtime();
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 000000000000..8b5434f4389f
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,127 @@
1/*
 2 * tsc_msr.c - MSR-based TSC calibration on Intel Atom SoC platforms.
3 *
 4 * The TSC in Intel Atom SoCs runs at a constant rate, which can be computed
 5 * by this formula:
 6 * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency>
 7 * See the Intel 64 and IA-32 System Programming Guide, sections 16.12 and
 8 * 30.11.5, for details.
 9 * In particular, some Intel Atom SoCs don't have a PIT (i8254) or HPET, so
 10 * MSR-based calibration is the only option.
11 *
12 *
13 * Copyright (C) 2013 Intel Corporation
14 * Author: Bin Gao <bin.gao@intel.com>
15 *
16 * This file is released under the GPLv2.
17 */
18
19#include <linux/kernel.h>
20#include <asm/processor.h>
21#include <asm/setup.h>
22#include <asm/apic.h>
23#include <asm/param.h>
24
25/* CPU reference clock frequency: in KHz */
26#define FREQ_83 83200
27#define FREQ_100 99840
28#define FREQ_133 133200
29#define FREQ_166 166400
30
31#define MAX_NUM_FREQS 8
32
33/*
34 * According to Intel 64 and IA-32 System Programming Guide,
35 * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
36 * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
 37 * Unfortunately some Intel Atom SoCs aren't quite compliant with this,
 38 * so we need to manually differentiate SoC families. This is what the
39 * field msr_plat does.
40 */
41struct freq_desc {
42 u8 x86_family; /* CPU family */
43 u8 x86_model; /* model */
44 u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */
45 u32 freqs[MAX_NUM_FREQS];
46};
47
48static struct freq_desc freq_desc_tables[] = {
49 /* PNW */
50 { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
51 /* CLV+ */
52 { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
53 /* TNG */
54 { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } },
55 /* VLV2 */
56 { 6, 0x37, 1, { 0, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } },
57 /* ANN */
58 { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } },
59};
60
61static int match_cpu(u8 family, u8 model)
62{
63 int i;
64
65 for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) {
66 if ((family == freq_desc_tables[i].x86_family) &&
67 (model == freq_desc_tables[i].x86_model))
68 return i;
69 }
70
71 return -1;
72}
73
74/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */
75#define id_to_freq(cpu_index, freq_id) \
76 (freq_desc_tables[cpu_index].freqs[freq_id])
77
78/*
79 * Do MSR calibration only for known/supported CPUs.
80 * Return values:
 81 * -1: CPU is unknown/unsupported for MSR-based calibration
82 * 0: CPU is known/supported, but calibration failed
83 * 1: CPU is known/supported, and calibration succeeded
84 */
85int try_msr_calibrate_tsc(unsigned long *fast_calibrate)
86{
87 int cpu_index;
88 u32 lo, hi, ratio, freq_id, freq;
89
90 cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model);
91 if (cpu_index < 0)
92 return -1;
93
94 *fast_calibrate = 0;
95
96 if (freq_desc_tables[cpu_index].msr_plat) {
97 rdmsr(MSR_PLATFORM_INFO, lo, hi);
98 ratio = (lo >> 8) & 0x1f;
99 } else {
100 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
101 ratio = (hi >> 8) & 0x1f;
102 }
103 pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio);
104
105 if (!ratio)
106 return 0;
107
108 /* Get FSB FREQ ID */
109 rdmsr(MSR_FSB_FREQ, lo, hi);
110 freq_id = lo & 0x7;
111 freq = id_to_freq(cpu_index, freq_id);
112 pr_info("Resolved frequency ID: %u, frequency: %u KHz\n",
113 freq_id, freq);
114 if (!freq)
115 return 0;
116
117 /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
118 *fast_calibrate = freq * ratio;
119 pr_info("TSC runs at %lu KHz\n", *fast_calibrate);
120
121#ifdef CONFIG_X86_LOCAL_APIC
122 lapic_timer_frequency = (freq * 1000) / HZ;
123 pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency);
124#endif
125
126 return 1;
127}
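
A worked example of the calibration formula, using hypothetical register values for a VLV2 part (family 6, model 0x37): suppose MSR_PLATFORM_INFO[12:8] reads 0x10 (ratio 16) and MSR_FSB_FREQ[2:0] reads 1, so freq = FREQ_100 = 99840 KHz. Then:

/*
 * *fast_calibrate = freq * ratio = 99840 * 16 = 1597440 KHz  (~1.6 GHz TSC)
 * lapic_timer_frequency = (freq * 1000) / HZ
 *                       = 99840000 / 250 = 399360   (assuming HZ = 250)
 */
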
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index adfdf56a3714..26488487bc61 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -16,7 +16,6 @@
16 */ 16 */
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/smp.h> 19#include <linux/smp.h>
21#include <linux/nmi.h> 20#include <linux/nmi.h>
22#include <asm/tsc.h> 21#include <asm/tsc.h>
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 422fd8223470..a4b451c6addf 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -562,6 +562,16 @@ static void __init xstate_enable_boot_cpu(void)
562 if (cpu_has_xsaveopt && eagerfpu != DISABLE) 562 if (cpu_has_xsaveopt && eagerfpu != DISABLE)
563 eagerfpu = ENABLE; 563 eagerfpu = ENABLE;
564 564
565 if (pcntxt_mask & XSTATE_EAGER) {
566 if (eagerfpu == DISABLE) {
567 pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n",
568 pcntxt_mask & XSTATE_EAGER);
569 pcntxt_mask &= ~XSTATE_EAGER;
570 } else {
571 eagerfpu = ENABLE;
572 }
573 }
574
565 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", 575 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
566 pcntxt_mask, xstate_size); 576 pcntxt_mask, xstate_size);
567} 577}