path: root/arch/x86/kernel
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile                        1
-rw-r--r--  arch/x86/kernel/cpu/amd.c                       3
-rw-r--r--  arch/x86/kernel/cpu/common.c                    5
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c          44
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c            2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c              167
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h               58
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c           40
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c        158
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c      22
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c    526
-rw-r--r--  arch/x86/kernel/entry_64.S                      9
-rw-r--r--  arch/x86/kernel/irq_32.c                       11
-rw-r--r--  arch/x86/kernel/kprobes-common.h              102
-rw-r--r--  arch/x86/kernel/kprobes-opt.c                 512
-rw-r--r--  arch/x86/kernel/kprobes.c                     664
-rw-r--r--  arch/x86/kernel/kvm.c                           4
-rw-r--r--  arch/x86/kernel/microcode_amd.c                 1
-rw-r--r--  arch/x86/kernel/paravirt.c                      4
-rw-r--r--  arch/x86/kernel/process.c                      24
-rw-r--r--  arch/x86/kernel/process_32.c                   30
-rw-r--r--  arch/x86/kernel/process_64.c                   34
-rw-r--r--  arch/x86/kernel/smpboot.c                      13
-rw-r--r--  arch/x86/kernel/traps.c                        38
-rw-r--r--  arch/x86/kernel/tsc.c                           3
-rw-r--r--  arch/x86/kernel/xsave.c                        12
26 files changed, 1673 insertions, 814 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5369059c07a9..532d2e090e6f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)             += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)        += crash_dump_$(BITS).o
 obj-$(CONFIG_KPROBES)           += kprobes.o
+obj-$(CONFIG_OPTPROBES)         += kprobes-opt.o
 obj-$(CONFIG_MODULES)           += module.o
 obj-$(CONFIG_DOUBLEFAULT)       += doublefault_32.o
 obj-$(CONFIG_KGDB)              += kgdb.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f4773f4aae35..0a44b90602b0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>

 #include <linux/io.h>
+#include <linux/sched.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
@@ -456,6 +457,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
         if (c->x86_power & (1 << 8)) {
                 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+                if (!check_tsc_unstable())
+                        sched_clock_stable = 1;
         }

 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d43cad74f166..c0f7d68d318f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1044,6 +1044,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
+
 /*
  * Special IST stacks which the CPU switches to when it calls
  * an IST-marked descriptor entry. Up to 7 stacks (hardware
@@ -1111,6 +1114,8 @@ void debug_stack_reset(void)
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b45e5e7a901..73d08ed98a64 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)
         l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }
 
-static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
-                                        int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
 {
         int node;
 
@@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
 #define CPUID4_INFO_IDX(x, y)   (&((per_cpu(ici_cpuid4_info, x))[y]))
 
 #ifdef CONFIG_SMP
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+
+static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
 {
-        struct _cpuid4_info *this_leaf, *sibling_leaf;
-        unsigned long num_threads_sharing;
-        int index_msb, i, sibling;
+        struct _cpuid4_info *this_leaf;
+        int ret, i, sibling;
         struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-        if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
+        ret = 0;
+        if (index == 3) {
+                ret = 1;
                 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
                         if (!per_cpu(ici_cpuid4_info, i))
                                 continue;
@@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
                                 set_bit(sibling, this_leaf->shared_cpu_map);
                         }
                 }
-                return;
+        } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) {
+                ret = 1;
+                for_each_cpu(i, cpu_sibling_mask(cpu)) {
+                        if (!per_cpu(ici_cpuid4_info, i))
+                                continue;
+                        this_leaf = CPUID4_INFO_IDX(i, index);
+                        for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+                                if (!cpu_online(sibling))
+                                        continue;
+                                set_bit(sibling, this_leaf->shared_cpu_map);
+                        }
+                }
         }
+
+        return ret;
+}
+
+static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+{
+        struct _cpuid4_info *this_leaf, *sibling_leaf;
+        unsigned long num_threads_sharing;
+        int index_msb, i;
+        struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+        if (c->x86_vendor == X86_VENDOR_AMD) {
+                if (cache_shared_amd_cpu_map_setup(cpu, index))
+                        return;
+        }
+
         this_leaf = CPUID4_INFO_IDX(cpu, index);
         num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 786e76a86322..e4eeaaf58a47 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -528,6 +528,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
         sprintf(name, "threshold_bank%i", bank);
 
+#ifdef CONFIG_SMP
         if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {   /* symlink */
                 i = cpumask_first(cpu_llc_shared_mask(cpu));
 
@@ -553,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
                 goto out;
         }
+#endif
 
         b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
         if (!b) {
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5adce1040b11..0a18d16cb58d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/bitops.h>
+#include <linux/device.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
@@ -31,6 +32,7 @@
 #include <asm/compat.h>
 #include <asm/smp.h>
 #include <asm/alternative.h>
+#include <asm/timer.h>
 
 #include "perf_event.h"
 
@@ -351,6 +353,36 @@ int x86_setup_perfctr(struct perf_event *event)
         return 0;
 }
 
+/*
+ * check that branch_sample_type is compatible with
+ * settings needed for precise_ip > 1 which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+        u64 m = event->attr.branch_sample_type;
+        u64 b = 0;
+
+        /* must capture all branches */
+        if (!(m & PERF_SAMPLE_BRANCH_ANY))
+                return 0;
+
+        m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+        if (!event->attr.exclude_user)
+                b |= PERF_SAMPLE_BRANCH_USER;
+
+        if (!event->attr.exclude_kernel)
+                b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+        /*
+         * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+         */
+
+        return m == b;
+}
+
 int x86_pmu_hw_config(struct perf_event *event)
 {
         if (event->attr.precise_ip) {
@@ -367,6 +399,36 @@ int x86_pmu_hw_config(struct perf_event *event)
 
                 if (event->attr.precise_ip > precise)
                         return -EOPNOTSUPP;
+                /*
+                 * check that PEBS LBR correction does not conflict with
+                 * whatever the user is asking with attr->branch_sample_type
+                 */
+                if (event->attr.precise_ip > 1) {
+                        u64 *br_type = &event->attr.branch_sample_type;
+
+                        if (has_branch_stack(event)) {
+                                if (!precise_br_compat(event))
+                                        return -EOPNOTSUPP;
+
+                                /* branch_sample_type is compatible */
+
+                        } else {
+                                /*
+                                 * user did not specify branch_sample_type
+                                 *
+                                 * For PEBS fixups, we capture all
+                                 * the branches at the priv level of the
+                                 * event.
+                                 */
+                                *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+                                if (!event->attr.exclude_user)
+                                        *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+                                if (!event->attr.exclude_kernel)
+                                        *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+                        }
+                }
         }
 
         /*
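
For context, the rule enforced by precise_br_compat() above is: a caller that combines precise_ip > 1 with an explicit branch_sample_type must request PERF_SAMPLE_BRANCH_ANY and match the branch privilege bits to the event's exclude_user/exclude_kernel settings. A minimal userspace sketch of a compatible request, using the uapi constants introduced alongside this series (illustrative only, error handling omitted):

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Open a cycles event with PEBS fixups (precise_ip = 2) plus explicit
     * LBR sampling; the branch filter matches the event's priv levels so
     * precise_br_compat() accepts it. */
    static int open_precise_branch_event(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000;
            attr.precise_ip = 2;                    /* PEBS + LBR-based fixup */
            attr.exclude_kernel = 1;                /* user level only ...   */
            attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
            attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
                                      PERF_SAMPLE_BRANCH_USER; /* ... so only USER here */

            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }
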
@@ -424,6 +486,10 @@ static int __x86_pmu_event_init(struct perf_event *event)
         /* mark unused */
         event->hw.extra_reg.idx = EXTRA_REG_NONE;
 
+        /* mark not used */
+        event->hw.extra_reg.idx = EXTRA_REG_NONE;
+        event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
         return x86_pmu.hw_config(event);
 }
 
@@ -1210,6 +1276,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
                 break;
 
         case CPU_STARTING:
+                if (x86_pmu.attr_rdpmc)
+                        set_in_cr4(X86_CR4_PCE);
                 if (x86_pmu.cpu_starting)
                         x86_pmu.cpu_starting(cpu);
                 break;
@@ -1319,6 +1387,8 @@ static int __init init_hw_perf_events(void)
                 }
         }
 
+        x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
         pr_info("... version:                %d\n",     x86_pmu.version);
         pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
         pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
@@ -1542,23 +1612,106 @@ static int x86_pmu_event_init(struct perf_event *event)
         return err;
 }
 
+static int x86_pmu_event_idx(struct perf_event *event)
+{
+        int idx = event->hw.idx;
+
+        if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
+                idx -= X86_PMC_IDX_FIXED;
+                idx |= 1 << 30;
+        }
+
+        return idx + 1;
+}
+
+static ssize_t get_attr_rdpmc(struct device *cdev,
+                              struct device_attribute *attr,
+                              char *buf)
+{
+        return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
+static void change_rdpmc(void *info)
+{
+        bool enable = !!(unsigned long)info;
+
+        if (enable)
+                set_in_cr4(X86_CR4_PCE);
+        else
+                clear_in_cr4(X86_CR4_PCE);
+}
+
+static ssize_t set_attr_rdpmc(struct device *cdev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t count)
+{
+        unsigned long val = simple_strtoul(buf, NULL, 0);
+
+        if (!!val != !!x86_pmu.attr_rdpmc) {
+                x86_pmu.attr_rdpmc = !!val;
+                smp_call_function(change_rdpmc, (void *)val, 1);
+        }
+
+        return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+        &dev_attr_rdpmc.attr,
+        NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group = {
+        .attrs = x86_pmu_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+        &x86_pmu_attr_group,
+        NULL,
+};
+
+static void x86_pmu_flush_branch_stack(void)
+{
+        if (x86_pmu.flush_branch_stack)
+                x86_pmu.flush_branch_stack();
+}
+
 static struct pmu pmu = {
         .pmu_enable             = x86_pmu_enable,
         .pmu_disable            = x86_pmu_disable,
+
+        .attr_groups            = x86_pmu_attr_groups,
 
         .event_init             = x86_pmu_event_init,
 
         .add                    = x86_pmu_add,
         .del                    = x86_pmu_del,
         .start                  = x86_pmu_start,
         .stop                   = x86_pmu_stop,
         .read                   = x86_pmu_read,
 
         .start_txn              = x86_pmu_start_txn,
         .cancel_txn             = x86_pmu_cancel_txn,
         .commit_txn             = x86_pmu_commit_txn,
+
+        .event_idx              = x86_pmu_event_idx,
+        .flush_branch_stack     = x86_pmu_flush_branch_stack,
 };
 
+void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+{
+        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+                return;
+
+        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+                return;
+
+        userpg->time_mult = this_cpu_read(cyc2ns);
+        userpg->time_shift = CYC2NS_SCALE_FACTOR;
+        userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
+}
+
 /*
  * callchain support
  */
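
The rdpmc attribute and x86_pmu_event_idx() above exist to support self-monitoring from user space: the index published in the mmap'ed perf_event_mmap_page (1-based, with bit 30 set for fixed counters) is what a task feeds to the RDPMC instruction, and the sysfs knob (exposed under the PMU's event_source device directory, typically named rdpmc) controls whether CR4.PCE allows it. A hedged sketch of the intended usage, relying only on the long-standing lock/index/offset mmap-page fields (error handling and sign-extension to the counter width omitted; the time_mult/time_shift/time_offset fields written by perf_update_user_clock() extend the same scheme to timestamps):

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static inline uint64_t rdpmc(unsigned int counter)
    {
            unsigned int lo, hi;

            /* counter index = mmap-page index - 1 */
            asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
            return lo | ((uint64_t)hi << 32);
    }

    /* Read an event's count without a syscall; fd is a perf_event_open() fd. */
    static uint64_t self_read(int fd)
    {
            struct perf_event_mmap_page *pc;
            unsigned int seq, idx;
            uint64_t count;

            pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);

            do {
                    seq = pc->lock;
                    __sync_synchronize();
                    idx = pc->index;        /* 0 means RDPMC not usable here */
                    count = pc->offset;
                    if (idx)
                            count += rdpmc(idx - 1);
                    __sync_synchronize();
            } while (pc->lock != seq);      /* retry if the kernel updated the page */

            return count;
    }
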
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8944062f46e2..8484e77c211e 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -33,6 +33,7 @@ enum extra_reg_type {
 
         EXTRA_REG_RSP_0 = 0,    /* offcore_response_0 */
         EXTRA_REG_RSP_1 = 1,    /* offcore_response_1 */
+        EXTRA_REG_LBR   = 2,    /* lbr_select */
 
         EXTRA_REG_MAX           /* number of entries needed */
 };
@@ -130,6 +131,8 @@ struct cpu_hw_events {
         void                            *lbr_context;
         struct perf_branch_stack        lbr_stack;
         struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
+        struct er_account               *lbr_sel;
+        u64                             br_sel;
 
         /*
          * Intel host/guest exclude bits
@@ -147,7 +150,9 @@ struct cpu_hw_events {
         /*
          * AMD specific bits
          */
         struct amd_nb                   *amd_nb;
+        /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+        u64                             perf_ctr_virt_mask;
 
         void                            *kfree_on_online;
 };
@@ -266,6 +271,29 @@ struct x86_pmu_quirk {
         void (*func)(void);
 };
 
+union x86_pmu_config {
+        struct {
+                u64 event:8,
+                    umask:8,
+                    usr:1,
+                    os:1,
+                    edge:1,
+                    pc:1,
+                    interrupt:1,
+                    __reserved1:1,
+                    en:1,
+                    inv:1,
+                    cmask:8,
+                    event2:4,
+                    __reserved2:4,
+                    go:1,
+                    ho:1;
+        } bits;
+        u64 value;
+};
+
+#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
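
As a sanity check on the bit layout above, the encoding used later in this series for INST_RETIRED.TOTAL_CYCLES can be reproduced with a standalone copy of the union (illustrative sketch only; assumes GCC's LSB-first bit-field layout on x86):

    #include <assert.h>
    #include <stdint.h>

    /* Local copy of the layout from perf_event.h above, for illustration. */
    union x86_pmu_config {
            struct {
                    uint64_t event:8, umask:8, usr:1, os:1, edge:1, pc:1,
                             interrupt:1, __reserved1:1, en:1, inv:1, cmask:8,
                             event2:4, __reserved2:4, go:1, ho:1;
            } bits;
            uint64_t value;
    };

    #define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value

    int main(void)
    {
            /* .event=0xc0, .inv=1, .cmask=16 is the readable spelling of the
             * old magic constant 0x108000c0 (event 0xc0, inv bit 23, cmask 0x10
             * in bits 24-31). */
            assert(X86_CONFIG(.event=0xc0, .inv=1, .cmask=16) == 0x108000c0ULL);
            return 0;
    }
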
@@ -307,10 +335,19 @@ struct x86_pmu {
         struct x86_pmu_quirk *quirks;
         int             perfctr_second_write;
 
+        /*
+         * sysfs attrs
+         */
+        int             attr_rdpmc;
+
+        /*
+         * CPU Hotplug hooks
+         */
         int             (*cpu_prepare)(int cpu);
         void            (*cpu_starting)(int cpu);
         void            (*cpu_dying)(int cpu);
         void            (*cpu_dead)(int cpu);
+        void            (*flush_branch_stack)(void);
 
         /*
          * Intel Arch Perfmon v2+
@@ -332,6 +369,8 @@ struct x86_pmu {
          */
         unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
         int             lbr_nr;                    /* hardware stack size */
+        u64             lbr_sel_mask;              /* LBR_SELECT valid bits */
+        const int       *lbr_sel_map;              /* lbr_select mappings */
 
         /*
          * Extra registers for events
@@ -417,9 +456,11 @@ void x86_pmu_disable_all(void);
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
                                           u64 enable_mask)
 {
+        u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+
         if (hwc->extra_reg.reg)
                 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
-        wrmsrl(hwc->config_base, hwc->config | enable_mask);
+        wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
 }
 
 void x86_pmu_enable_all(int added);
@@ -443,6 +484,15 @@ extern struct event_constraint emptyconstraint;
 
 extern struct event_constraint unconstrained;
 
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+        return ip > PAGE_OFFSET;
+#else
+        return (long)ip < 0;
+#endif
+}
+
 #ifdef CONFIG_CPU_SUP_AMD
 
 int amd_pmu_init(void);
@@ -523,6 +573,10 @@ void intel_pmu_lbr_init_nhm(void);
 
 void intel_pmu_lbr_init_atom(void);
 
+void intel_pmu_lbr_init_snb(void);
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event);
+
 int p4_pmu_init(void);
 
 int p6_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 0397b23be8e9..dd002faff7a6 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,4 +1,5 @@
 #include <linux/perf_event.h>
+#include <linux/export.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -138,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event)
         if (ret)
                 return ret;
 
+        if (has_branch_stack(event))
+                return -EOPNOTSUPP;
+
         if (event->attr.exclude_host && event->attr.exclude_guest)
                 /*
                  * When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -357,7 +361,9 @@ static void amd_pmu_cpu_starting(int cpu)
         struct amd_nb *nb;
         int i, nb_id;
 
-        if (boot_cpu_data.x86_max_cores < 2)
+        cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+
+        if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15)
                 return;
 
         nb_id = amd_get_nb_id(cpu);
@@ -587,9 +593,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
         .put_event_constraints  = amd_put_event_constraints,
 
         .cpu_prepare            = amd_pmu_cpu_prepare,
-        .cpu_starting           = amd_pmu_cpu_starting,
         .cpu_dead               = amd_pmu_cpu_dead,
 #endif
+        .cpu_starting           = amd_pmu_cpu_starting,
 };
 
 __init int amd_pmu_init(void)
@@ -621,3 +627,33 @@ __init int amd_pmu_init(void)
 
         return 0;
 }
+
+void amd_pmu_enable_virt(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+        cpuc->perf_ctr_virt_mask = 0;
+
+        /* Reload all events */
+        x86_pmu_disable_all();
+        x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+        /*
+         * We only mask out the Host-only bit so that host-only counting works
+         * when SVM is disabled. If someone sets up a guest-only counter when
+         * SVM is disabled the Guest-only bits still gets set and the counter
+         * will not count anything.
+         */
+        cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+
+        /* Reload all events */
+        x86_pmu_disable_all();
+        x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
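
These two exports are meant to be paired with SVM enable/disable on the host, since the Host-only/Guest-only event-select bits are only meaningful while SVM is active. A hedged sketch of the intended call pattern from a hypervisor module; the surrounding function names and the header providing the declarations are assumptions for illustration, not taken from this diff:

    #include <asm/perf_event.h>     /* assumed home of the amd_pmu_*_virt() declarations */

    /* Illustrative only: roughly how an SVM host would bracket the
     * EFER.SVME toggle so host-only/guest-only counters behave correctly. */
    static void example_svm_hardware_enable(void)
    {
            /* ... set EFER.SVME, program the host save area ... */
            amd_pmu_enable_virt();  /* clear perf_ctr_virt_mask: HO/GO now honoured */
    }

    static void example_svm_hardware_disable(void)
    {
            amd_pmu_disable_virt(); /* mask the Host-only bit again */
            /* ... clear EFER.SVME ... */
    }
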
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3bd37bdf1b8e..6a84e7f28f05 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -385,14 +385,15 @@ static __initconst const u64 westmere_hw_cache_event_ids
 #define NHM_LOCAL_DRAM          (1 << 14)
 #define NHM_NON_DRAM            (1 << 15)
 
-#define NHM_ALL_DRAM            (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
+#define NHM_LOCAL               (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_REMOTE              (NHM_REMOTE_DRAM)
 
 #define NHM_DMND_READ           (NHM_DMND_DATA_RD)
 #define NHM_DMND_WRITE          (NHM_DMND_RFO|NHM_DMND_WB)
 #define NHM_DMND_PREFETCH       (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
 
 #define NHM_L3_HIT      (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
-#define NHM_L3_MISS     (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_MISS     (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
 #define NHM_L3_ACCESS   (NHM_L3_HIT|NHM_L3_MISS)
 
 static __initconst const u64 nehalem_hw_cache_extra_regs
@@ -416,16 +417,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
         },
  [ C(NODE) ] = {
         [ C(OP_READ) ] = {
-                [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
-                [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
+                [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
+                [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE,
         },
         [ C(OP_WRITE) ] = {
-                [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
-                [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
+                [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
+                [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE,
         },
         [ C(OP_PREFETCH) ] = {
-                [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
-                [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
+                [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
+                [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE,
         },
  },
 };
@@ -727,6 +728,19 @@ static __initconst const u64 atom_hw_cache_event_ids
         },
 };
 
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+        /* user explicitly requested branch sampling */
+        if (has_branch_stack(event))
+                return true;
+
+        /* implicit branch sampling to correct PEBS skid */
+        if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+                return true;
+
+        return false;
+}
+
 static void intel_pmu_disable_all(void)
 {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -881,6 +895,13 @@ static void intel_pmu_disable_event(struct perf_event *event)
         cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
         cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
 
+        /*
+         * must disable before any actual event
+         * because any event may be combined with LBR
+         */
+        if (intel_pmu_needs_lbr_smpl(event))
+                intel_pmu_lbr_disable(event);
+
         if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                 intel_pmu_disable_fixed(hwc);
                 return;
@@ -935,6 +956,12 @@ static void intel_pmu_enable_event(struct perf_event *event)
                 intel_pmu_enable_bts(hwc->config);
                 return;
         }
+        /*
+         * must enabled before any actual event
+         * because any event may be combined with LBR
+         */
+        if (intel_pmu_needs_lbr_smpl(event))
+                intel_pmu_lbr_enable(event);
 
         if (event->attr.exclude_host)
                 cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
@@ -1057,6 +1084,9 @@ again:
 
                 data.period = event->hw.last_period;
 
+                if (has_branch_stack(event))
+                        data.br_stack = &cpuc->lbr_stack;
+
                 if (perf_event_overflow(event, &data, regs))
                         x86_pmu_stop(event, 0);
         }
@@ -1123,17 +1153,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
  */
 static struct event_constraint *
 __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
-                                   struct perf_event *event)
+                                   struct perf_event *event,
+                                   struct hw_perf_event_extra *reg)
 {
         struct event_constraint *c = &emptyconstraint;
-        struct hw_perf_event_extra *reg = &event->hw.extra_reg;
         struct er_account *era;
         unsigned long flags;
         int orig_idx = reg->idx;
 
         /* already allocated shared msr */
         if (reg->alloc)
-                return &unconstrained;
+                return NULL; /* call x86_get_event_constraint() */
 
 again:
         era = &cpuc->shared_regs->regs[reg->idx];
@@ -1156,14 +1186,10 @@ again:
                 reg->alloc = 1;
 
                 /*
-                 * All events using extra_reg are unconstrained.
-                 * Avoids calling x86_get_event_constraints()
-                 *
-                 * Must revisit if extra_reg controlling events
-                 * ever have constraints. Worst case we go through
-                 * the regular event constraint table.
+                 * need to call x86_get_event_constraint()
+                 * to check if associated event has constraints
                  */
-                c = &unconstrained;
+                c = NULL;
         } else if (intel_try_alt_er(event, orig_idx)) {
                 raw_spin_unlock_irqrestore(&era->lock, flags);
                 goto again;
@@ -1200,11 +1226,23 @@ static struct event_constraint *
 intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
                               struct perf_event *event)
 {
-        struct event_constraint *c = NULL;
-
-        if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
-                c = __intel_shared_reg_get_constraints(cpuc, event);
-
+        struct event_constraint *c = NULL, *d;
+        struct hw_perf_event_extra *xreg, *breg;
+
+        xreg = &event->hw.extra_reg;
+        if (xreg->idx != EXTRA_REG_NONE) {
+                c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
+                if (c == &emptyconstraint)
+                        return c;
+        }
+        breg = &event->hw.branch_reg;
+        if (breg->idx != EXTRA_REG_NONE) {
+                d = __intel_shared_reg_get_constraints(cpuc, event, breg);
+                if (d == &emptyconstraint) {
+                        __intel_shared_reg_put_constraints(cpuc, xreg);
+                        c = d;
+                }
+        }
         return c;
 }
 
@@ -1252,6 +1290,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
         reg = &event->hw.extra_reg;
         if (reg->idx != EXTRA_REG_NONE)
                 __intel_shared_reg_put_constraints(cpuc, reg);
+
+        reg = &event->hw.branch_reg;
+        if (reg->idx != EXTRA_REG_NONE)
+                __intel_shared_reg_put_constraints(cpuc, reg);
 }
 
 static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
@@ -1287,12 +1329,19 @@ static int intel_pmu_hw_config(struct perf_event *event)
                  *
                  * Thereby we gain a PEBS capable cycle counter.
                  */
-                u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
+                u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+
 
                 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
                 event->hw.config = alt_config;
         }
 
+        if (intel_pmu_needs_lbr_smpl(event)) {
+                ret = intel_pmu_setup_lbr_filter(event);
+                if (ret)
+                        return ret;
+        }
+
         if (event->attr.type != PERF_TYPE_RAW)
                 return 0;
 
@@ -1431,7 +1480,7 @@ static int intel_pmu_cpu_prepare(int cpu)
 {
         struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 
-        if (!x86_pmu.extra_regs)
+        if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
                 return NOTIFY_OK;
 
         cpuc->shared_regs = allocate_shared_regs(cpu);
@@ -1453,22 +1502,28 @@ static void intel_pmu_cpu_starting(int cpu)
          */
         intel_pmu_lbr_reset();
 
-        if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
+        cpuc->lbr_sel = NULL;
+
+        if (!cpuc->shared_regs)
                 return;
 
-        for_each_cpu(i, topology_thread_cpumask(cpu)) {
-                struct intel_shared_regs *pc;
+        if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+                for_each_cpu(i, topology_thread_cpumask(cpu)) {
+                        struct intel_shared_regs *pc;
 
                         pc = per_cpu(cpu_hw_events, i).shared_regs;
                         if (pc && pc->core_id == core_id) {
                                 cpuc->kfree_on_online = cpuc->shared_regs;
                                 cpuc->shared_regs = pc;
                                 break;
+                        }
                 }
+                cpuc->shared_regs->core_id = core_id;
+                cpuc->shared_regs->refcnt++;
         }
 
-        cpuc->shared_regs->core_id = core_id;
-        cpuc->shared_regs->refcnt++;
+        if (x86_pmu.lbr_sel_map)
+                cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
 }
 
 static void intel_pmu_cpu_dying(int cpu)
@@ -1486,6 +1541,18 @@ static void intel_pmu_cpu_dying(int cpu)
         fini_debug_store_on_cpu(cpu);
 }
 
+static void intel_pmu_flush_branch_stack(void)
+{
+        /*
+         * Intel LBR does not tag entries with the
+         * PID of the current task, then we need to
+         * flush it on ctxsw
+         * For now, we simply reset it
+         */
+        if (x86_pmu.lbr_nr)
+                intel_pmu_lbr_reset();
+}
+
 static __initconst const struct x86_pmu intel_pmu = {
         .name                   = "Intel",
         .handle_irq             = intel_pmu_handle_irq,
@@ -1513,6 +1580,7 @@ static __initconst const struct x86_pmu intel_pmu = {
         .cpu_starting           = intel_pmu_cpu_starting,
         .cpu_dying              = intel_pmu_cpu_dying,
         .guest_get_msrs         = intel_guest_get_msrs,
+        .flush_branch_stack     = intel_pmu_flush_branch_stack,
 };
 
 static __init void intel_clovertown_quirk(void)
@@ -1689,9 +1757,11 @@ __init int intel_pmu_init(void)
                 x86_pmu.extra_regs = intel_nehalem_extra_regs;
 
                 /* UOPS_ISSUED.STALLED_CYCLES */
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                        X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
                 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                        X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
                 x86_add_quirk(intel_nehalem_quirk);
 
@@ -1726,9 +1796,11 @@ __init int intel_pmu_init(void)
                 x86_pmu.er_flags |= ERF_HAS_RSP_1;
 
                 /* UOPS_ISSUED.STALLED_CYCLES */
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                        X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
                 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                        X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
                 pr_cont("Westmere events, ");
                 break;
@@ -1739,7 +1811,7 @@ __init int intel_pmu_init(void)
                 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
                        sizeof(hw_cache_event_ids));
 
-                intel_pmu_lbr_init_nhm();
+                intel_pmu_lbr_init_snb();
 
                 x86_pmu.event_constraints = intel_snb_event_constraints;
                 x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
@@ -1749,9 +1821,11 @@ __init int intel_pmu_init(void)
                 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
 
                 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                        X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
                 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                        X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
 
                 pr_cont("SandyBridge events, ");
                 break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index d6bd49faa40c..7f64df19e7dd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -3,6 +3,7 @@
 #include <linux/slab.h>
 
 #include <asm/perf_event.h>
+#include <asm/insn.h>
 
 #include "perf_event.h"
 
@@ -439,9 +440,6 @@ void intel_pmu_pebs_enable(struct perf_event *event)
         hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
         cpuc->pebs_enabled |= 1ULL << hwc->idx;
-
-        if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
-                intel_pmu_lbr_enable(event);
 }
 
 void intel_pmu_pebs_disable(struct perf_event *event)
@@ -454,9 +452,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
         wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
         hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
-
-        if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
-                intel_pmu_lbr_disable(event);
 }
 
 void intel_pmu_pebs_enable_all(void)
@@ -475,17 +470,6 @@ void intel_pmu_pebs_disable_all(void)
         wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 }
 
-#include <asm/insn.h>
-
-static inline bool kernel_ip(unsigned long ip)
-{
-#ifdef CONFIG_X86_32
-        return ip > PAGE_OFFSET;
-#else
-        return (long)ip < 0;
-#endif
-}
-
 static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -572,6 +556,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
          * both formats and we don't use the other fields in this
          * routine.
          */
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct pebs_record_core *pebs = __pebs;
         struct perf_sample_data data;
         struct pt_regs regs;
@@ -602,6 +587,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
         else
                 regs.flags &= ~PERF_EFLAGS_EXACT;
 
+        if (has_branch_stack(event))
+                data.br_stack = &cpuc->lbr_stack;
+
         if (perf_event_overflow(event, &data, &regs))
                 x86_pmu_stop(event, 0);
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 47a7e63bfe54..520b4265fcd2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -3,6 +3,7 @@
 
 #include <asm/perf_event.h>
 #include <asm/msr.h>
+#include <asm/insn.h>
 
 #include "perf_event.h"
 
@@ -14,6 +15,100 @@ enum {
 };
 
 /*
+ * Intel LBR_SELECT bits
+ * Intel Vol3a, April 2011, Section 16.7 Table 16-10
+ *
+ * Hardware branch filter (not available on all CPUs)
+ */
+#define LBR_KERNEL_BIT          0 /* do not capture at ring0 */
+#define LBR_USER_BIT            1 /* do not capture at ring > 0 */
+#define LBR_JCC_BIT             2 /* do not capture conditional branches */
+#define LBR_REL_CALL_BIT        3 /* do not capture relative calls */
+#define LBR_IND_CALL_BIT        4 /* do not capture indirect calls */
+#define LBR_RETURN_BIT          5 /* do not capture near returns */
+#define LBR_IND_JMP_BIT         6 /* do not capture indirect jumps */
+#define LBR_REL_JMP_BIT         7 /* do not capture relative jumps */
+#define LBR_FAR_BIT             8 /* do not capture far branches */
+
+#define LBR_KERNEL      (1 << LBR_KERNEL_BIT)
+#define LBR_USER        (1 << LBR_USER_BIT)
+#define LBR_JCC         (1 << LBR_JCC_BIT)
+#define LBR_REL_CALL    (1 << LBR_REL_CALL_BIT)
+#define LBR_IND_CALL    (1 << LBR_IND_CALL_BIT)
+#define LBR_RETURN      (1 << LBR_RETURN_BIT)
+#define LBR_REL_JMP     (1 << LBR_REL_JMP_BIT)
+#define LBR_IND_JMP     (1 << LBR_IND_JMP_BIT)
+#define LBR_FAR         (1 << LBR_FAR_BIT)
+
+#define LBR_PLM (LBR_KERNEL | LBR_USER)
+
+#define LBR_SEL_MASK    0x1ff   /* valid bits in LBR_SELECT */
+#define LBR_NOT_SUPP    -1      /* LBR filter not supported */
+#define LBR_IGN         0       /* ignored */
+
+#define LBR_ANY          \
+        (LBR_JCC        |\
+         LBR_REL_CALL   |\
+         LBR_IND_CALL   |\
+         LBR_RETURN     |\
+         LBR_REL_JMP    |\
+         LBR_IND_JMP    |\
+         LBR_FAR)
+
+#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+
+#define for_each_branch_sample_type(x) \
+        for ((x) = PERF_SAMPLE_BRANCH_USER; \
+             (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
+
+/*
+ * x86control flow change classification
+ * x86control flow changes include branches, interrupts, traps, faults
+ */
+enum {
+        X86_BR_NONE     = 0,      /* unknown */
+
+        X86_BR_USER     = 1 << 0, /* branch target is user */
+        X86_BR_KERNEL   = 1 << 1, /* branch target is kernel */
+
+        X86_BR_CALL     = 1 << 2, /* call */
+        X86_BR_RET      = 1 << 3, /* return */
+        X86_BR_SYSCALL  = 1 << 4, /* syscall */
+        X86_BR_SYSRET   = 1 << 5, /* syscall return */
+        X86_BR_INT      = 1 << 6, /* sw interrupt */
+        X86_BR_IRET     = 1 << 7, /* return from interrupt */
+        X86_BR_JCC      = 1 << 8, /* conditional */
+        X86_BR_JMP      = 1 << 9, /* jump */
+        X86_BR_IRQ      = 1 << 10,/* hw interrupt or trap or fault */
+        X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+
+#define X86_BR_ANY       \
+        (X86_BR_CALL    |\
+         X86_BR_RET     |\
+         X86_BR_SYSCALL |\
+         X86_BR_SYSRET  |\
+         X86_BR_INT     |\
+         X86_BR_IRET    |\
+         X86_BR_JCC     |\
+         X86_BR_JMP     |\
+         X86_BR_IRQ     |\
+         X86_BR_IND_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL          \
+        (X86_BR_CALL            |\
+         X86_BR_IND_CALL        |\
+         X86_BR_SYSCALL         |\
+         X86_BR_IRQ             |\
+         X86_BR_INT)
+
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+
+/*
  * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
  * otherwise it becomes near impossible to get a reliable stack.
  */
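
To make the suppress-mode encoding concrete: a requested set of branch classes is OR-ed together from the LBR_* bits above and then inverted against LBR_SEL_MASK before being written to MSR_LBR_SELECT, exactly as intel_pmu_setup_hw_lbr_filter() does later in this file. A standalone sketch (constants copied from the defines above; the example mapping mirrors the Sandy Bridge table, so treat it as illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* Copies of the defines introduced above, for illustration only. */
    #define LBR_REL_CALL    (1 << 3)
    #define LBR_IND_CALL    (1 << 4)
    #define LBR_FAR         (1 << 8)
    #define LBR_SEL_MASK    0x1ff

    /* LBR_SELECT is a suppress-mode register: set bits *disable* a branch
     * class, so the wanted classes are OR-ed together and then inverted. */
    static uint64_t lbr_select_for(uint64_t wanted_classes)
    {
            return ~wanted_classes & LBR_SEL_MASK;
    }

    int main(void)
    {
            /* e.g. PERF_SAMPLE_BRANCH_ANY_CALL on SNB maps to CALL|IND_CALL|FAR */
            uint64_t mask = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR;

            printf("MSR_LBR_SELECT = %#llx\n",
                   (unsigned long long)lbr_select_for(mask));
            return 0;
    }
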
@@ -21,6 +116,10 @@ enum {
 static void __intel_pmu_lbr_enable(void)
 {
         u64 debugctl;
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+        if (cpuc->lbr_sel)
+                wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
 
         rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
         debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
@@ -76,11 +175,11 @@ void intel_pmu_lbr_enable(struct perf_event *event)
          * Reset the LBR stack if we changed task context to
          * avoid data leaks.
          */
-
         if (event->ctx->task && cpuc->lbr_context != event->ctx) {
                 intel_pmu_lbr_reset();
                 cpuc->lbr_context = event->ctx;
         }
+        cpuc->br_sel = event->hw.branch_reg.reg;
 
         cpuc->lbr_users++;
 }
@@ -95,8 +194,11 @@ void intel_pmu_lbr_disable(struct perf_event *event)
         cpuc->lbr_users--;
         WARN_ON_ONCE(cpuc->lbr_users < 0);
 
-        if (cpuc->enabled && !cpuc->lbr_users)
+        if (cpuc->enabled && !cpuc->lbr_users) {
                 __intel_pmu_lbr_disable();
+                /* avoid stale pointer */
+                cpuc->lbr_context = NULL;
+        }
 }
 
 void intel_pmu_lbr_enable_all(void)
@@ -115,6 +217,9 @@ void intel_pmu_lbr_disable_all(void)
                 __intel_pmu_lbr_disable();
 }
 
+/*
+ * TOS = most recently recorded branch
+ */
 static inline u64 intel_pmu_lbr_tos(void)
 {
         u64 tos;
@@ -142,15 +247,15 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 
                 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
 
                 cpuc->lbr_entries[i].from       = msr_lastbranch.from;
                 cpuc->lbr_entries[i].to         = msr_lastbranch.to;
-                cpuc->lbr_entries[i].flags      = 0;
+                cpuc->lbr_entries[i].mispred    = 0;
+                cpuc->lbr_entries[i].predicted  = 0;
+                cpuc->lbr_entries[i].reserved   = 0;
         }
         cpuc->lbr_stack.nr = i;
 }
 
-#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
-
 /*
  * Due to lack of segmentation in Linux the effective address (offset)
  * is the same as the linear address, allowing us to merge the LIP and EIP
@@ -165,19 +270,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 
         for (i = 0; i < x86_pmu.lbr_nr; i++) {
                 unsigned long lbr_idx = (tos - i) & mask;
-                u64 from, to, flags = 0;
+                u64 from, to, mis = 0, pred = 0;
 
                 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
                 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
 
                 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
-                        flags = !!(from & LBR_FROM_FLAG_MISPRED);
+                        mis = !!(from & LBR_FROM_FLAG_MISPRED);
+                        pred = !mis;
                         from = (u64)((((s64)from) << 1) >> 1);
                 }
 
                 cpuc->lbr_entries[i].from       = from;
                 cpuc->lbr_entries[i].to         = to;
-                cpuc->lbr_entries[i].flags      = flags;
+                cpuc->lbr_entries[i].mispred    = mis;
+                cpuc->lbr_entries[i].predicted  = pred;
+                cpuc->lbr_entries[i].reserved   = 0;
         }
         cpuc->lbr_stack.nr = i;
 }
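
On the consumer side, the mispred/predicted bits recovered here surface in each struct perf_branch_entry of a PERF_SAMPLE_BRANCH_STACK sample. A hedged sketch of walking such a record in userspace (field names follow the uapi added with this series; decoding of the surrounding sample record is omitted):

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>

    /* 'nr' and 'entries' come from the PERF_SAMPLE_BRANCH_STACK portion of a
     * decoded sample record. */
    static void print_branch_stack(uint64_t nr, struct perf_branch_entry *entries)
    {
            uint64_t i;

            for (i = 0; i < nr; i++)
                    printf("%#llx -> %#llx %s\n",
                           (unsigned long long)entries[i].from,
                           (unsigned long long)entries[i].to,
                           entries[i].mispred ? "(mispredicted)" :
                           entries[i].predicted ? "(predicted)" : "");
    }
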
@@ -193,28 +301,404 @@ void intel_pmu_lbr_read(void)
193 intel_pmu_lbr_read_32(cpuc); 301 intel_pmu_lbr_read_32(cpuc);
194 else 302 else
195 intel_pmu_lbr_read_64(cpuc); 303 intel_pmu_lbr_read_64(cpuc);
304
305 intel_pmu_lbr_filter(cpuc);
306}
307
308/*
309 * SW filter is used:
310 * - in case there is no HW filter
311 * - in case the HW filter has errata or limitations
312 */
313static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
314{
315 u64 br_type = event->attr.branch_sample_type;
316 int mask = 0;
317
318 if (br_type & PERF_SAMPLE_BRANCH_USER)
319 mask |= X86_BR_USER;
320
321 if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
322 mask |= X86_BR_KERNEL;
323
324 /* we ignore BRANCH_HV here */
325
326 if (br_type & PERF_SAMPLE_BRANCH_ANY)
327 mask |= X86_BR_ANY;
328
329 if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
330 mask |= X86_BR_ANY_CALL;
331
332 if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
333 mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
334
335 if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
336 mask |= X86_BR_IND_CALL;
337 /*
338 * stash actual user request into reg, it may
339 * be used by fixup code for some CPU
340 */
341 event->hw.branch_reg.reg = mask;
342}
343
344/*
345 * setup the HW LBR filter
346 * Used only when available, may not be enough to disambiguate
347 * all branches, may need the help of the SW filter
348 */
349static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
350{
351 struct hw_perf_event_extra *reg;
352 u64 br_type = event->attr.branch_sample_type;
353 u64 mask = 0, m;
354 u64 v;
355
356 for_each_branch_sample_type(m) {
357 if (!(br_type & m))
358 continue;
359
360 v = x86_pmu.lbr_sel_map[m];
361 if (v == LBR_NOT_SUPP)
362 return -EOPNOTSUPP;
363
364 if (v != LBR_IGN)
365 mask |= v;
366 }
367 reg = &event->hw.branch_reg;
368 reg->idx = EXTRA_REG_LBR;
369
370 /* LBR_SELECT operates in suppress mode so invert mask */
371 reg->config = ~mask & x86_pmu.lbr_sel_mask;
372
373 return 0;
374}
375
376int intel_pmu_setup_lbr_filter(struct perf_event *event)
377{
378 int ret = 0;
379
380 /*
381 * no LBR on this PMU
382 */
383 if (!x86_pmu.lbr_nr)
384 return -EOPNOTSUPP;
385
386 /*
387 * setup SW LBR filter
388 */
389 intel_pmu_setup_sw_lbr_filter(event);
390
391 /*
392 * setup HW LBR filter, if any
393 */
394 if (x86_pmu.lbr_sel_map)
395 ret = intel_pmu_setup_hw_lbr_filter(event);
396
397 return ret;
196} 398}
197 399
400/*
401 * return the type of control flow change at address "from"
402 * intruction is not necessarily a branch (in case of interrupt).
403 *
404 * The branch type returned also includes the priv level of the
405 * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
406 *
407 * If a branch type is unknown OR the instruction cannot be
408 * decoded (e.g., text page not present), then X86_BR_NONE is
409 * returned.
410 */
411static int branch_type(unsigned long from, unsigned long to)
412{
413 struct insn insn;
414 void *addr;
415 int bytes, size = MAX_INSN_SIZE;
416 int ret = X86_BR_NONE;
417 int ext, to_plm, from_plm;
418 u8 buf[MAX_INSN_SIZE];
419 int is64 = 0;
420
421 to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
422 from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
423
424 /*
425 * maybe zero if lbr did not fill up after a reset by the time
426 * we get a PMU interrupt
427 */
428 if (from == 0 || to == 0)
429 return X86_BR_NONE;
430
431 if (from_plm == X86_BR_USER) {
432 /*
433 * can happen if measuring at the user level only
434 * and we interrupt in a kernel thread, e.g., idle.
435 */
436 if (!current->mm)
437 return X86_BR_NONE;
438
439 /* may fail if text not present */
440 bytes = copy_from_user_nmi(buf, (void __user *)from, size);
441 if (bytes != size)
442 return X86_BR_NONE;
443
444 addr = buf;
445 } else
446 addr = (void *)from;
447
448 /*
449 * decoder needs to know the ABI especially
450 * on 64-bit systems running 32-bit apps
451 */
452#ifdef CONFIG_X86_64
453 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
454#endif
455 insn_init(&insn, addr, is64);
456 insn_get_opcode(&insn);
457
458 switch (insn.opcode.bytes[0]) {
459 case 0xf:
460 switch (insn.opcode.bytes[1]) {
461 case 0x05: /* syscall */
462 case 0x34: /* sysenter */
463 ret = X86_BR_SYSCALL;
464 break;
465 case 0x07: /* sysret */
466 case 0x35: /* sysexit */
467 ret = X86_BR_SYSRET;
468 break;
469 case 0x80 ... 0x8f: /* conditional */
470 ret = X86_BR_JCC;
471 break;
472 default:
473 ret = X86_BR_NONE;
474 }
475 break;
476 case 0x70 ... 0x7f: /* conditional */
477 ret = X86_BR_JCC;
478 break;
479 case 0xc2: /* near ret */
480 case 0xc3: /* near ret */
481 case 0xca: /* far ret */
482 case 0xcb: /* far ret */
483 ret = X86_BR_RET;
484 break;
485 case 0xcf: /* iret */
486 ret = X86_BR_IRET;
487 break;
488 case 0xcc ... 0xce: /* int */
489 ret = X86_BR_INT;
490 break;
491 case 0xe8: /* call near rel */
492 case 0x9a: /* call far absolute */
493 ret = X86_BR_CALL;
494 break;
495 case 0xe0 ... 0xe3: /* loop jmp */
496 ret = X86_BR_JCC;
497 break;
498 case 0xe9 ... 0xeb: /* jmp */
499 ret = X86_BR_JMP;
500 break;
501 case 0xff: /* call near absolute, call far absolute ind */
502 insn_get_modrm(&insn);
503 ext = (insn.modrm.bytes[0] >> 3) & 0x7;
504 switch (ext) {
505 case 2: /* near ind call */
506 case 3: /* far ind call */
507 ret = X86_BR_IND_CALL;
508 break;
509 case 4:
510 case 5:
511 ret = X86_BR_JMP;
512 break;
513 }
514 break;
515 default:
516 ret = X86_BR_NONE;
517 }
518 /*
519 * interrupts, traps, faults (and thus ring transition) may
520 * occur on any instructions. Thus, to classify them correctly,
521 * we need to first look at the from and to priv levels. If they
522 * are different and to is in the kernel, then it indicates
523 * a ring transition. If the from instruction is not a ring
524 * transition instr (syscall, systenter, int), then it means
525 * it was a irq, trap or fault.
526 *
527 * we have no way of detecting kernel to kernel faults.
528 */
529 if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
530 && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
531 ret = X86_BR_IRQ;
532
533 /*
534 * branch priv level determined by target as
535 * is done by HW when LBR_SELECT is implemented
536 */
537 if (ret != X86_BR_NONE)
538 ret |= to_plm;
539
540 return ret;
541}
542
543/*
544 * implement actual branch filter based on user demand.
545 * Hardware may not exactly satisfy that request, thus
546 * we need to inspect opcodes. Mismatched branches are
547 * discarded. Therefore, the number of branches returned
548 * in PERF_SAMPLE_BRANCH_STACK sample may vary.
549 */
550static void
551intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
552{
553 u64 from, to;
554 int br_sel = cpuc->br_sel;
555 int i, j, type;
556 bool compress = false;
557
558 /* if sampling all branches, then nothing to filter */
559 if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
560 return;
561
562 for (i = 0; i < cpuc->lbr_stack.nr; i++) {
563
564 from = cpuc->lbr_entries[i].from;
565 to = cpuc->lbr_entries[i].to;
566
567 type = branch_type(from, to);
568
569 /* if type does not correspond, then discard */
570 if (type == X86_BR_NONE || (br_sel & type) != type) {
571 cpuc->lbr_entries[i].from = 0;
572 compress = true;
573 }
574 }
575
576 if (!compress)
577 return;
578
579 /* remove all entries with from=0 */
580 for (i = 0; i < cpuc->lbr_stack.nr; ) {
581 if (!cpuc->lbr_entries[i].from) {
582 j = i;
583 while (++j < cpuc->lbr_stack.nr)
584 cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
585 cpuc->lbr_stack.nr--;
586 if (!cpuc->lbr_entries[i].from)
587 continue;
588 }
589 i++;
590 }
591}
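
The removal loop above keeps surviving entries in order by shifting the tail down whenever it finds a discarded record. The same result can be obtained with a simpler write-cursor compaction; the sketch below is stand-alone and only mirrors the two fields it needs from struct perf_branch_entry.

struct lbr_rec { unsigned long long from, to; };

/* keep every entry whose 'from' survived the filter, preserving order */
static int compact_lbr(struct lbr_rec *e, int nr)
{
	int i, out = 0;

	for (i = 0; i < nr; i++)
		if (e[i].from)
			e[out++] = e[i];

	return out;	/* becomes the new lbr_stack.nr */
}
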
592
593/*
594 * Map interface branch filters onto LBR filters
595 */
596static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
597 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
598 [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
599 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
600 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
601 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
602 | LBR_IND_JMP | LBR_FAR,
603 /*
604 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
605 */
606 [PERF_SAMPLE_BRANCH_ANY_CALL] =
607 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
608 /*
609 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
610 */
611 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
612};
613
614static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
615 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
616 [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
617 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
618 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
619 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
620 [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
621 | LBR_FAR,
622 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
623};
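
The maps are indexed by the PERF_SAMPLE_BRANCH_* bit value itself, so turning a user's branch_sample_type into an LBR_SELECT candidate amounts to walking the set bits and OR-ing the mapped LBR_* bits together. The helper below is a hedged sketch of that translation (kernel-style types assumed, function name illustrative); the in-tree setup code additionally rejects entries the hardware cannot express and masks the result against LBR_SEL_MASK.

static u64 lbr_map_branch_type(u64 br_type, const int *sel_map)
{
	u64 m, mask = 0;

	/* sel_map is one of the tables above, indexed by the bit value */
	for (m = PERF_SAMPLE_BRANCH_USER; m < PERF_SAMPLE_BRANCH_MAX; m <<= 1)
		if (br_type & m)
			mask |= sel_map[m];

	return mask;	/* candidate value for MSR_LBR_SELECT */
}
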
624
625/* core */
198void intel_pmu_lbr_init_core(void) 626void intel_pmu_lbr_init_core(void)
199{ 627{
200 x86_pmu.lbr_nr = 4; 628 x86_pmu.lbr_nr = 4;
201 x86_pmu.lbr_tos = 0x01c9; 629 x86_pmu.lbr_tos = MSR_LBR_TOS;
202 x86_pmu.lbr_from = 0x40; 630 x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
203 x86_pmu.lbr_to = 0x60; 631 x86_pmu.lbr_to = MSR_LBR_CORE_TO;
632
633 /*
634 * SW branch filter usage:
635 * - compensate for lack of HW filter
636 */
637 pr_cont("4-deep LBR, ");
204} 638}
205 639
640/* nehalem/westmere */
206void intel_pmu_lbr_init_nhm(void) 641void intel_pmu_lbr_init_nhm(void)
207{ 642{
208 x86_pmu.lbr_nr = 16; 643 x86_pmu.lbr_nr = 16;
209 x86_pmu.lbr_tos = 0x01c9; 644 x86_pmu.lbr_tos = MSR_LBR_TOS;
210 x86_pmu.lbr_from = 0x680; 645 x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
211 x86_pmu.lbr_to = 0x6c0; 646 x86_pmu.lbr_to = MSR_LBR_NHM_TO;
647
648 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
649 x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
650
651 /*
652 * SW branch filter usage:
653 * - workaround LBR_SEL errata (see above)
654 * - support syscall, sysret capture.
655 * That requires LBR_FAR but that means far
656	 * jmps need to be filtered out
657 */
658 pr_cont("16-deep LBR, ");
659}
660
661/* sandy bridge */
662void intel_pmu_lbr_init_snb(void)
663{
664 x86_pmu.lbr_nr = 16;
665 x86_pmu.lbr_tos = MSR_LBR_TOS;
666 x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
667 x86_pmu.lbr_to = MSR_LBR_NHM_TO;
668
669 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
670 x86_pmu.lbr_sel_map = snb_lbr_sel_map;
671
672 /*
673 * SW branch filter usage:
674 * - support syscall, sysret capture.
675 * That requires LBR_FAR but that means far
676	 * jmps need to be filtered out
677 */
678 pr_cont("16-deep LBR, ");
212} 679}
213 680
681/* atom */
214void intel_pmu_lbr_init_atom(void) 682void intel_pmu_lbr_init_atom(void)
215{ 683{
684 /*
685	 * only models starting at stepping 10 seem
686 * to have an operational LBR which can freeze
687 * on PMU interrupt
688 */
689 if (boot_cpu_data.x86_mask < 10) {
690 pr_cont("LBR disabled due to erratum");
691 return;
692 }
693
216 x86_pmu.lbr_nr = 8; 694 x86_pmu.lbr_nr = 8;
217 x86_pmu.lbr_tos = 0x01c9; 695 x86_pmu.lbr_tos = MSR_LBR_TOS;
218 x86_pmu.lbr_from = 0x40; 696 x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
219 x86_pmu.lbr_to = 0x60; 697 x86_pmu.lbr_to = MSR_LBR_CORE_TO;
698
699 /*
700 * SW branch filter usage:
701 * - compensate for lack of HW filter
702 */
703 pr_cont("8-deep LBR, ");
220} 704}
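
From user space the filtering above is reached through perf_event_open(). The sketch below shows a minimal attribute setup; it is a hedged example (field and flag names follow the uapi added in the same cycle, error handling and the mmap/read-out path are omitted).

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_branch_sampling(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	/*
	 * any taken branch, user level only; the software filter above
	 * enforces whatever LBR_SELECT cannot express in hardware
	 */
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
				  PERF_SAMPLE_BRANCH_USER;

	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}
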
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3fe8239fd8fb..1333d9851778 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1532,10 +1532,17 @@ ENTRY(nmi)
1532 pushq_cfi %rdx 1532 pushq_cfi %rdx
1533 1533
1534 /* 1534 /*
1535 * If %cs was not the kernel segment, then the NMI triggered in user
1536 * space, which means it is definitely not nested.
1537 */
1538 cmpl $__KERNEL_CS, 16(%rsp)
1539 jne first_nmi
1540
1541 /*
1535 * Check the special variable on the stack to see if NMIs are 1542 * Check the special variable on the stack to see if NMIs are
1536 * executing. 1543 * executing.
1537 */ 1544 */
1538 cmp $1, -8(%rsp) 1545 cmpl $1, -8(%rsp)
1539 je nested_nmi 1546 je nested_nmi
1540 1547
1541 /* 1548 /*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 40fc86161d92..58b7f27cb3e9 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -100,13 +100,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
100 irqctx->tinfo.task = curctx->tinfo.task; 100 irqctx->tinfo.task = curctx->tinfo.task;
101 irqctx->tinfo.previous_esp = current_stack_pointer; 101 irqctx->tinfo.previous_esp = current_stack_pointer;
102 102
103 /* 103 /* Copy the preempt_count so that the [soft]irq checks work. */
104 * Copy the softirq bits in preempt_count so that the 104 irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
105 * softirq checks work in the hardirq context.
106 */
107 irqctx->tinfo.preempt_count =
108 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
109 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
110 105
111 if (unlikely(overflow)) 106 if (unlikely(overflow))
112 call_on_stack(print_stack_overflow, isp); 107 call_on_stack(print_stack_overflow, isp);
@@ -196,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
196 if (unlikely(!desc)) 191 if (unlikely(!desc))
197 return false; 192 return false;
198 193
199 if (!execute_on_irq_stack(overflow, desc, irq)) { 194 if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
200 if (unlikely(overflow)) 195 if (unlikely(overflow))
201 print_stack_overflow(); 196 print_stack_overflow();
202 desc->handle_irq(irq, desc); 197 desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h
new file mode 100644
index 000000000000..3230b68ef29a
--- /dev/null
+++ b/arch/x86/kernel/kprobes-common.h
@@ -0,0 +1,102 @@
1#ifndef __X86_KERNEL_KPROBES_COMMON_H
2#define __X86_KERNEL_KPROBES_COMMON_H
3
4/* Kprobes and Optprobes common header */
5
6#ifdef CONFIG_X86_64
7#define SAVE_REGS_STRING \
8 /* Skip cs, ip, orig_ax. */ \
9 " subq $24, %rsp\n" \
10 " pushq %rdi\n" \
11 " pushq %rsi\n" \
12 " pushq %rdx\n" \
13 " pushq %rcx\n" \
14 " pushq %rax\n" \
15 " pushq %r8\n" \
16 " pushq %r9\n" \
17 " pushq %r10\n" \
18 " pushq %r11\n" \
19 " pushq %rbx\n" \
20 " pushq %rbp\n" \
21 " pushq %r12\n" \
22 " pushq %r13\n" \
23 " pushq %r14\n" \
24 " pushq %r15\n"
25#define RESTORE_REGS_STRING \
26 " popq %r15\n" \
27 " popq %r14\n" \
28 " popq %r13\n" \
29 " popq %r12\n" \
30 " popq %rbp\n" \
31 " popq %rbx\n" \
32 " popq %r11\n" \
33 " popq %r10\n" \
34 " popq %r9\n" \
35 " popq %r8\n" \
36 " popq %rax\n" \
37 " popq %rcx\n" \
38 " popq %rdx\n" \
39 " popq %rsi\n" \
40 " popq %rdi\n" \
41 /* Skip orig_ax, ip, cs */ \
42 " addq $24, %rsp\n"
43#else
44#define SAVE_REGS_STRING \
45 /* Skip cs, ip, orig_ax and gs. */ \
46 " subl $16, %esp\n" \
47 " pushl %fs\n" \
48 " pushl %es\n" \
49 " pushl %ds\n" \
50 " pushl %eax\n" \
51 " pushl %ebp\n" \
52 " pushl %edi\n" \
53 " pushl %esi\n" \
54 " pushl %edx\n" \
55 " pushl %ecx\n" \
56 " pushl %ebx\n"
57#define RESTORE_REGS_STRING \
58 " popl %ebx\n" \
59 " popl %ecx\n" \
60 " popl %edx\n" \
61 " popl %esi\n" \
62 " popl %edi\n" \
63 " popl %ebp\n" \
64 " popl %eax\n" \
65 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
66 " addl $24, %esp\n"
67#endif
68
69/* Check whether the instruction can be boosted */
70extern int can_boost(kprobe_opcode_t *instruction);
71/* Recover the instruction if the given address is probed */
72extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
73 unsigned long addr);
74/*
75 * Copy an instruction and adjust the displacement if the instruction
76 * uses the %rip-relative addressing mode.
77 */
78extern int __copy_instruction(u8 *dest, u8 *src);
79
80/* Generate a relative-jump/call instruction */
81extern void synthesize_reljump(void *from, void *to);
82extern void synthesize_relcall(void *from, void *to);
83
84#ifdef CONFIG_OPTPROBES
85extern int arch_init_optprobes(void);
86extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
87extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
88#else /* !CONFIG_OPTPROBES */
89static inline int arch_init_optprobes(void)
90{
91 return 0;
92}
93static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
94{
95 return 0;
96}
97static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
98{
99 return addr;
100}
101#endif
102#endif
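
The #else block exists so that kprobes.c can call the optprobe hooks unconditionally: with CONFIG_OPTPROBES disabled the static inline stubs return their defaults and the compiler discards the dead paths. The snippet below is a generic sketch of that pattern; CONFIG_FOO, foo_init and subsys_init are placeholder names, not kernel symbols.

#ifdef CONFIG_FOO
extern int foo_init(void);
#else
static inline int foo_init(void) { return 0; }	/* no-op fallback */
#endif

int subsys_init(void)
{
	/* compiles and links either way; no #ifdef at the call site */
	return foo_init();
}
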
diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c
new file mode 100644
index 000000000000..c5e410eed403
--- /dev/null
+++ b/arch/x86/kernel/kprobes-opt.c
@@ -0,0 +1,512 @@
1/*
2 * Kernel Probes Jump Optimization (Optprobes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 * Copyright (C) Hitachi Ltd., 2012
20 */
21#include <linux/kprobes.h>
22#include <linux/ptrace.h>
23#include <linux/string.h>
24#include <linux/slab.h>
25#include <linux/hardirq.h>
26#include <linux/preempt.h>
27#include <linux/module.h>
28#include <linux/kdebug.h>
29#include <linux/kallsyms.h>
30#include <linux/ftrace.h>
31
32#include <asm/cacheflush.h>
33#include <asm/desc.h>
34#include <asm/pgtable.h>
35#include <asm/uaccess.h>
36#include <asm/alternative.h>
37#include <asm/insn.h>
38#include <asm/debugreg.h>
39
40#include "kprobes-common.h"
41
42unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
43{
44 struct optimized_kprobe *op;
45 struct kprobe *kp;
46 long offs;
47 int i;
48
49 for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
50 kp = get_kprobe((void *)addr - i);
51 /* This function only handles jump-optimized kprobe */
52 if (kp && kprobe_optimized(kp)) {
53 op = container_of(kp, struct optimized_kprobe, kp);
54			/* If op->list is not empty, op is still being optimized */
55 if (list_empty(&op->list))
56 goto found;
57 }
58 }
59
60 return addr;
61found:
62 /*
63	 * If the kprobe is optimized, the original bytes are overwritten
64	 * by the jump destination address. In this case, the original
65	 * bytes must be recovered from the op->optinsn.copied_insn buffer.
66 */
67 memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
68 if (addr == (unsigned long)kp->addr) {
69 buf[0] = kp->opcode;
70 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
71 } else {
72 offs = addr - (unsigned long)kp->addr - 1;
73 memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
74 }
75
76 return (unsigned long)buf;
77}
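
A worked example of the offset arithmetic above, using the x86 constants RELATIVEJUMP_SIZE == 5 and RELATIVE_ADDR_SIZE == 4 (the byte names are illustrative):

/*
 * Optimizing a probe at kp->addr rewrites five bytes of kernel text:
 *
 *   before:  kp->addr:  orig0 orig1 orig2 orig3 orig4 ...
 *   after:   kp->addr:  0xe9  rel0  rel1  rel2  rel3  ...
 *
 * orig0 is kept in kp->opcode and orig1..orig4 in op->optinsn.copied_insn.
 * Recovering addr == kp->addr rebuilds orig0..orig4 in buf, while
 * recovering addr == kp->addr + 2 computes offs = 1 and copies
 * copied_insn[1..3] (the original bytes orig2..orig4) into buf.
 */
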
78
79/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
80static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
81{
82#ifdef CONFIG_X86_64
83 *addr++ = 0x48;
84 *addr++ = 0xbf;
85#else
86 *addr++ = 0xb8;
87#endif
88 *(unsigned long *)addr = val;
89}
90
91static void __used __kprobes kprobes_optinsn_template_holder(void)
92{
93 asm volatile (
94 ".global optprobe_template_entry\n"
95 "optprobe_template_entry:\n"
96#ifdef CONFIG_X86_64
97 /* We don't bother saving the ss register */
98 " pushq %rsp\n"
99 " pushfq\n"
100 SAVE_REGS_STRING
101 " movq %rsp, %rsi\n"
102 ".global optprobe_template_val\n"
103 "optprobe_template_val:\n"
104 ASM_NOP5
105 ASM_NOP5
106 ".global optprobe_template_call\n"
107 "optprobe_template_call:\n"
108 ASM_NOP5
109 /* Move flags to rsp */
110 " movq 144(%rsp), %rdx\n"
111 " movq %rdx, 152(%rsp)\n"
112 RESTORE_REGS_STRING
113 /* Skip flags entry */
114 " addq $8, %rsp\n"
115 " popfq\n"
116#else /* CONFIG_X86_32 */
117 " pushf\n"
118 SAVE_REGS_STRING
119 " movl %esp, %edx\n"
120 ".global optprobe_template_val\n"
121 "optprobe_template_val:\n"
122 ASM_NOP5
123 ".global optprobe_template_call\n"
124 "optprobe_template_call:\n"
125 ASM_NOP5
126 RESTORE_REGS_STRING
127 " addl $4, %esp\n" /* skip cs */
128 " popf\n"
129#endif
130 ".global optprobe_template_end\n"
131 "optprobe_template_end:\n");
132}
133
134#define TMPL_MOVE_IDX \
135 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
136#define TMPL_CALL_IDX \
137 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
138#define TMPL_END_IDX \
139 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
140
141#define INT3_SIZE sizeof(kprobe_opcode_t)
142
143/* Optimized kprobe call back function: called from optinsn */
144static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
145{
146 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
147 unsigned long flags;
148
149	/* This is possible if op is being delay-unoptimized */
150 if (kprobe_disabled(&op->kp))
151 return;
152
153 local_irq_save(flags);
154 if (kprobe_running()) {
155 kprobes_inc_nmissed_count(&op->kp);
156 } else {
157 /* Save skipped registers */
158#ifdef CONFIG_X86_64
159 regs->cs = __KERNEL_CS;
160#else
161 regs->cs = __KERNEL_CS | get_kernel_rpl();
162 regs->gs = 0;
163#endif
164 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
165 regs->orig_ax = ~0UL;
166
167 __this_cpu_write(current_kprobe, &op->kp);
168 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
169 opt_pre_handler(&op->kp, regs);
170 __this_cpu_write(current_kprobe, NULL);
171 }
172 local_irq_restore(flags);
173}
174
175static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
176{
177 int len = 0, ret;
178
179 while (len < RELATIVEJUMP_SIZE) {
180 ret = __copy_instruction(dest + len, src + len);
181 if (!ret || !can_boost(dest + len))
182 return -EINVAL;
183 len += ret;
184 }
185 /* Check whether the address range is reserved */
186 if (ftrace_text_reserved(src, src + len - 1) ||
187 alternatives_text_reserved(src, src + len - 1) ||
188 jump_label_text_reserved(src, src + len - 1))
189 return -EBUSY;
190
191 return len;
192}
193
194/* Check whether insn is indirect jump */
195static int __kprobes insn_is_indirect_jump(struct insn *insn)
196{
197 return ((insn->opcode.bytes[0] == 0xff &&
198 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
199 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
200}
201
202/* Check whether insn jumps into specified address range */
203static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
204{
205 unsigned long target = 0;
206
207 switch (insn->opcode.bytes[0]) {
208 case 0xe0: /* loopne */
209 case 0xe1: /* loope */
210 case 0xe2: /* loop */
211 case 0xe3: /* jcxz */
212 case 0xe9: /* near relative jump */
213 case 0xeb: /* short relative jump */
214 break;
215 case 0x0f:
216 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
217 break;
218 return 0;
219 default:
220 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
221 break;
222 return 0;
223 }
224 target = (unsigned long)insn->next_byte + insn->immediate.value;
225
226 return (start <= target && target <= start + len);
227}
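
For concreteness, the target arithmetic above applied to a short conditional jump (addresses and opcode bytes are illustrative):

/*
 * A two-byte "je +0x10" (74 10) decoded at address A has
 * insn->next_byte == A + 2 and insn->immediate.value == 0x10, so
 * target == A + 0x12.  can_optimize() calls this helper with
 * start == paddr + INT3_SIZE and len == RELATIVE_ADDR_SIZE, i.e. it
 * refuses to optimize if any branch in the function lands in the bytes
 * that the rel32 of the new jump would occupy.
 */
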
228
229/* Decode the whole function to ensure no instruction jumps into the target */
230static int __kprobes can_optimize(unsigned long paddr)
231{
232 unsigned long addr, size = 0, offset = 0;
233 struct insn insn;
234 kprobe_opcode_t buf[MAX_INSN_SIZE];
235
236 /* Lookup symbol including addr */
237 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
238 return 0;
239
240 /*
241 * Do not optimize in the entry code due to the unstable
242 * stack handling.
243 */
244 if ((paddr >= (unsigned long)__entry_text_start) &&
245 (paddr < (unsigned long)__entry_text_end))
246 return 0;
247
248 /* Check there is enough space for a relative jump. */
249 if (size - offset < RELATIVEJUMP_SIZE)
250 return 0;
251
252 /* Decode instructions */
253 addr = paddr - offset;
254 while (addr < paddr - offset + size) { /* Decode until function end */
255 if (search_exception_tables(addr))
256 /*
257			 * Since some fixup code will jump into this function,
258			 * we can't optimize a kprobe in this function.
259 */
260 return 0;
261 kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr));
262 insn_get_length(&insn);
263 /* Another subsystem puts a breakpoint */
264 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
265 return 0;
266 /* Recover address */
267 insn.kaddr = (void *)addr;
268 insn.next_byte = (void *)(addr + insn.length);
269		/* Check that no instruction jumps into the target */
270 if (insn_is_indirect_jump(&insn) ||
271 insn_jump_into_range(&insn, paddr + INT3_SIZE,
272 RELATIVE_ADDR_SIZE))
273 return 0;
274 addr += insn.length;
275 }
276
277 return 1;
278}
279
280/* Check optimized_kprobe can actually be optimized. */
281int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
282{
283 int i;
284 struct kprobe *p;
285
286 for (i = 1; i < op->optinsn.size; i++) {
287 p = get_kprobe(op->kp.addr + i);
288 if (p && !kprobe_disabled(p))
289 return -EEXIST;
290 }
291
292 return 0;
293}
294
295/* Check the addr is within the optimized instructions. */
296int __kprobes
297arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr)
298{
299 return ((unsigned long)op->kp.addr <= addr &&
300 (unsigned long)op->kp.addr + op->optinsn.size > addr);
301}
302
303/* Free optimized instruction slot */
304static __kprobes
305void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
306{
307 if (op->optinsn.insn) {
308 free_optinsn_slot(op->optinsn.insn, dirty);
309 op->optinsn.insn = NULL;
310 op->optinsn.size = 0;
311 }
312}
313
314void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
315{
316 __arch_remove_optimized_kprobe(op, 1);
317}
318
319/*
320 * Copy replacing target instructions
321 * Target instructions MUST be relocatable (checked inside)
322 * This is called when new aggr(opt)probe is allocated or reused.
323 */
324int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
325{
326 u8 *buf;
327 int ret;
328 long rel;
329
330 if (!can_optimize((unsigned long)op->kp.addr))
331 return -EILSEQ;
332
333 op->optinsn.insn = get_optinsn_slot();
334 if (!op->optinsn.insn)
335 return -ENOMEM;
336
337 /*
338 * Verify if the address gap is in 2GB range, because this uses
339 * a relative jump.
340 */
341 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
342 if (abs(rel) > 0x7fffffff)
343 return -ERANGE;
344
345 buf = (u8 *)op->optinsn.insn;
346
347 /* Copy instructions into the out-of-line buffer */
348 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
349 if (ret < 0) {
350 __arch_remove_optimized_kprobe(op, 0);
351 return ret;
352 }
353 op->optinsn.size = ret;
354
355 /* Copy arch-dep-instance from template */
356 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
357
358 /* Set probe information */
359 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
360
361 /* Set probe function call */
362 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
363
364 /* Set returning jmp instruction at the tail of out-of-line buffer */
365 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
366 (u8 *)op->kp.addr + op->optinsn.size);
367
368 flush_icache_range((unsigned long) buf,
369 (unsigned long) buf + TMPL_END_IDX +
370 op->optinsn.size + RELATIVEJUMP_SIZE);
371 return 0;
372}
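
The detour buffer assembled above ends up with the following layout (a descriptive sketch; the exact offsets depend on the template size TMPL_END_IDX):

/*
 *   op->optinsn.insn
 *   +------------------------------+  offset 0
 *   | register save/restore        |  template copy, with &op patched in
 *   | template                     |  at TMPL_MOVE_IDX and a call to
 *   |                              |  optimized_callback at TMPL_CALL_IDX
 *   +------------------------------+  offset TMPL_END_IDX
 *   | relocated copy of the        |  filled by
 *   | probed instructions          |  copy_optimized_instructions()
 *   +------------------------------+  offset TMPL_END_IDX + optinsn.size
 *   | jmp back to                  |  written by synthesize_reljump()
 *   | kp->addr + optinsn.size      |
 *   +------------------------------+
 */
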
373
374#define MAX_OPTIMIZE_PROBES 256
375static struct text_poke_param *jump_poke_params;
376static struct jump_poke_buffer {
377 u8 buf[RELATIVEJUMP_SIZE];
378} *jump_poke_bufs;
379
380static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
381 u8 *insn_buf,
382 struct optimized_kprobe *op)
383{
384 s32 rel = (s32)((long)op->optinsn.insn -
385 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
386
387 /* Backup instructions which will be replaced by jump address */
388 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
389 RELATIVE_ADDR_SIZE);
390
391 insn_buf[0] = RELATIVEJUMP_OPCODE;
392 *(s32 *)(&insn_buf[1]) = rel;
393
394 tprm->addr = op->kp.addr;
395 tprm->opcode = insn_buf;
396 tprm->len = RELATIVEJUMP_SIZE;
397}
398
399/*
400 * Replace breakpoints (int3) with relative jumps.
401 * Caller must hold kprobe_mutex and text_mutex.
402 */
403void __kprobes arch_optimize_kprobes(struct list_head *oplist)
404{
405 struct optimized_kprobe *op, *tmp;
406 int c = 0;
407
408 list_for_each_entry_safe(op, tmp, oplist, list) {
409 WARN_ON(kprobe_disabled(&op->kp));
410 /* Setup param */
411 setup_optimize_kprobe(&jump_poke_params[c],
412 jump_poke_bufs[c].buf, op);
413 list_del_init(&op->list);
414 if (++c >= MAX_OPTIMIZE_PROBES)
415 break;
416 }
417
418 /*
419 * text_poke_smp doesn't support NMI/MCE code modifying.
420 * However, since kprobes itself also doesn't support NMI/MCE
421 * code probing, it's not a problem.
422 */
423 text_poke_smp_batch(jump_poke_params, c);
424}
425
426static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
427 u8 *insn_buf,
428 struct optimized_kprobe *op)
429{
430 /* Set int3 to first byte for kprobes */
431 insn_buf[0] = BREAKPOINT_INSTRUCTION;
432 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
433
434 tprm->addr = op->kp.addr;
435 tprm->opcode = insn_buf;
436 tprm->len = RELATIVEJUMP_SIZE;
437}
438
439/*
440 * Recover original instructions and breakpoints from relative jumps.
441 * Caller must hold kprobe_mutex.
442 */
443extern void arch_unoptimize_kprobes(struct list_head *oplist,
444 struct list_head *done_list)
445{
446 struct optimized_kprobe *op, *tmp;
447 int c = 0;
448
449 list_for_each_entry_safe(op, tmp, oplist, list) {
450 /* Setup param */
451 setup_unoptimize_kprobe(&jump_poke_params[c],
452 jump_poke_bufs[c].buf, op);
453 list_move(&op->list, done_list);
454 if (++c >= MAX_OPTIMIZE_PROBES)
455 break;
456 }
457
458 /*
459 * text_poke_smp doesn't support NMI/MCE code modifying.
460 * However, since kprobes itself also doesn't support NMI/MCE
461 * code probing, it's not a problem.
462 */
463 text_poke_smp_batch(jump_poke_params, c);
464}
465
466/* Replace a relative jump with a breakpoint (int3). */
467void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
468{
469 u8 buf[RELATIVEJUMP_SIZE];
470
471 /* Set int3 to first byte for kprobes */
472 buf[0] = BREAKPOINT_INSTRUCTION;
473 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
474 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
475}
476
477int __kprobes
478setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
479{
480 struct optimized_kprobe *op;
481
482 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
483 /* This kprobe is really able to run optimized path. */
484 op = container_of(p, struct optimized_kprobe, kp);
485 /* Detour through copied instructions */
486 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
487 if (!reenter)
488 reset_current_kprobe();
489 preempt_enable_no_resched();
490 return 1;
491 }
492 return 0;
493}
494
495int __kprobes arch_init_optprobes(void)
496{
497 /* Allocate code buffer and parameter array */
498 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
499 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
500 if (!jump_poke_bufs)
501 return -ENOMEM;
502
503 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
504 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
505 if (!jump_poke_params) {
506 kfree(jump_poke_bufs);
507 jump_poke_bufs = NULL;
508 return -ENOMEM;
509 }
510
511 return 0;
512}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7da647d8b64c..e213fc8408d2 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -30,16 +30,15 @@
30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi 30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
31 * <prasanna@in.ibm.com> added function-return probes. 31 * <prasanna@in.ibm.com> added function-return probes.
32 * 2005-May Rusty Lynch <rusty.lynch@intel.com> 32 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
33 * Added function return probes functionality 33 * Added function return probes functionality
34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added 34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
35 * kprobe-booster and kretprobe-booster for i386. 35 * kprobe-booster and kretprobe-booster for i386.
36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster 36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
37 * and kretprobe-booster for x86-64 37 * and kretprobe-booster for x86-64
38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven 38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> 39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
40 * unified x86 kprobes code. 40 * unified x86 kprobes code.
41 */ 41 */
42
43#include <linux/kprobes.h> 42#include <linux/kprobes.h>
44#include <linux/ptrace.h> 43#include <linux/ptrace.h>
45#include <linux/string.h> 44#include <linux/string.h>
@@ -59,6 +58,8 @@
59#include <asm/insn.h> 58#include <asm/insn.h>
60#include <asm/debugreg.h> 59#include <asm/debugreg.h>
61 60
61#include "kprobes-common.h"
62
62void jprobe_return_end(void); 63void jprobe_return_end(void);
63 64
64DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 65DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -108,6 +109,7 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
108 doesn't switch kernel stack.*/ 109 doesn't switch kernel stack.*/
109 {NULL, NULL} /* Terminator */ 110 {NULL, NULL} /* Terminator */
110}; 111};
112
111const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 113const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
112 114
113static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) 115static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
@@ -123,11 +125,17 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
123} 125}
124 126
125/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 127/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
126static void __kprobes synthesize_reljump(void *from, void *to) 128void __kprobes synthesize_reljump(void *from, void *to)
127{ 129{
128 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); 130 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
129} 131}
130 132
133/* Insert a call instruction at address 'from', which calls address 'to'.*/
134void __kprobes synthesize_relcall(void *from, void *to)
135{
136 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
137}
138
131/* 139/*
132 * Skip the prefixes of the instruction. 140 * Skip the prefixes of the instruction.
133 */ 141 */
@@ -151,7 +159,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
151 * Returns non-zero if opcode is boostable. 159 * Returns non-zero if opcode is boostable.
152 * RIP relative instructions are adjusted at copying time in 64 bits mode 160 * RIP relative instructions are adjusted at copying time in 64 bits mode
153 */ 161 */
154static int __kprobes can_boost(kprobe_opcode_t *opcodes) 162int __kprobes can_boost(kprobe_opcode_t *opcodes)
155{ 163{
156 kprobe_opcode_t opcode; 164 kprobe_opcode_t opcode;
157 kprobe_opcode_t *orig_opcodes = opcodes; 165 kprobe_opcode_t *orig_opcodes = opcodes;
@@ -207,13 +215,15 @@ retry:
207 } 215 }
208} 216}
209 217
210/* Recover the probed instruction at addr for further analysis. */ 218static unsigned long
211static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) 219__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
212{ 220{
213 struct kprobe *kp; 221 struct kprobe *kp;
222
214 kp = get_kprobe((void *)addr); 223 kp = get_kprobe((void *)addr);
224 /* There is no probe, return original address */
215 if (!kp) 225 if (!kp)
216 return -EINVAL; 226 return addr;
217 227
218 /* 228 /*
219 * Basically, kp->ainsn.insn has an original instruction. 229 * Basically, kp->ainsn.insn has an original instruction.
@@ -230,14 +240,29 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
230 */ 240 */
231 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 241 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
232 buf[0] = kp->opcode; 242 buf[0] = kp->opcode;
233 return 0; 243 return (unsigned long)buf;
244}
245
246/*
247 * Recover the probed instruction at addr for further analysis.
248 * Caller must lock kprobes via kprobe_mutex, or disable preemption
249 * to prevent the referenced kprobes from being released.
250 */
251unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
252{
253 unsigned long __addr;
254
255 __addr = __recover_optprobed_insn(buf, addr);
256 if (__addr != addr)
257 return __addr;
258
259 return __recover_probed_insn(buf, addr);
234} 260}
235 261
236/* Check if paddr is at an instruction boundary */ 262/* Check if paddr is at an instruction boundary */
237static int __kprobes can_probe(unsigned long paddr) 263static int __kprobes can_probe(unsigned long paddr)
238{ 264{
239 int ret; 265 unsigned long addr, __addr, offset = 0;
240 unsigned long addr, offset = 0;
241 struct insn insn; 266 struct insn insn;
242 kprobe_opcode_t buf[MAX_INSN_SIZE]; 267 kprobe_opcode_t buf[MAX_INSN_SIZE];
243 268
@@ -247,26 +272,24 @@ static int __kprobes can_probe(unsigned long paddr)
247 /* Decode instructions */ 272 /* Decode instructions */
248 addr = paddr - offset; 273 addr = paddr - offset;
249 while (addr < paddr) { 274 while (addr < paddr) {
250 kernel_insn_init(&insn, (void *)addr);
251 insn_get_opcode(&insn);
252
253 /* 275 /*
254 * Check if the instruction has been modified by another 276 * Check if the instruction has been modified by another
255 * kprobe, in which case we replace the breakpoint by the 277 * kprobe, in which case we replace the breakpoint by the
256 * original instruction in our buffer. 278 * original instruction in our buffer.
279 * Also, jump optimization will change the breakpoint to a
280 * relative jump. Since a relative jump is a normal
281 * instruction, we just pass it through if there is no kprobe.
257 */ 282 */
258 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { 283 __addr = recover_probed_instruction(buf, addr);
259 ret = recover_probed_instruction(buf, addr); 284 kernel_insn_init(&insn, (void *)__addr);
260 if (ret)
261 /*
262 * Another debugging subsystem might insert
263 * this breakpoint. In that case, we can't
264 * recover it.
265 */
266 return 0;
267 kernel_insn_init(&insn, buf);
268 }
269 insn_get_length(&insn); 285 insn_get_length(&insn);
286
287 /*
288 * Another debugging subsystem might insert this breakpoint.
289 * In that case, we can't recover it.
290 */
291 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
292 return 0;
270 addr += insn.length; 293 addr += insn.length;
271 } 294 }
272 295
@@ -299,24 +322,16 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
299 * If not, return null. 322 * If not, return null.
300 * Only applicable to 64-bit x86. 323 * Only applicable to 64-bit x86.
301 */ 324 */
302static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) 325int __kprobes __copy_instruction(u8 *dest, u8 *src)
303{ 326{
304 struct insn insn; 327 struct insn insn;
305 int ret;
306 kprobe_opcode_t buf[MAX_INSN_SIZE]; 328 kprobe_opcode_t buf[MAX_INSN_SIZE];
307 329
308 kernel_insn_init(&insn, src); 330 kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src));
309 if (recover) {
310 insn_get_opcode(&insn);
311 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
312 ret = recover_probed_instruction(buf,
313 (unsigned long)src);
314 if (ret)
315 return 0;
316 kernel_insn_init(&insn, buf);
317 }
318 }
319 insn_get_length(&insn); 331 insn_get_length(&insn);
332	/* Another subsystem put a breakpoint here; we failed to recover it */
333 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
334 return 0;
320 memcpy(dest, insn.kaddr, insn.length); 335 memcpy(dest, insn.kaddr, insn.length);
321 336
322#ifdef CONFIG_X86_64 337#ifdef CONFIG_X86_64
@@ -337,8 +352,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
337 * extension of the original signed 32-bit displacement would 352 * extension of the original signed 32-bit displacement would
338 * have given. 353 * have given.
339 */ 354 */
340 newdisp = (u8 *) src + (s64) insn.displacement.value - 355 newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
341 (u8 *) dest;
342 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ 356 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
343 disp = (u8 *) dest + insn_offset_displacement(&insn); 357 disp = (u8 *) dest + insn_offset_displacement(&insn);
344 *(s32 *) disp = (s32) newdisp; 358 *(s32 *) disp = (s32) newdisp;
@@ -349,18 +363,20 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
349 363
350static void __kprobes arch_copy_kprobe(struct kprobe *p) 364static void __kprobes arch_copy_kprobe(struct kprobe *p)
351{ 365{
366	/* Copy the instruction, recovering it if another optprobe has modified it. */
367 __copy_instruction(p->ainsn.insn, p->addr);
368
352 /* 369 /*
353 * Copy an instruction without recovering int3, because it will be 370 * __copy_instruction can modify the displacement of the instruction,
354 * put by another subsystem. 371 * but it doesn't affect the boostable check.
355 */ 372 */
356 __copy_instruction(p->ainsn.insn, p->addr, 0); 373 if (can_boost(p->ainsn.insn))
357
358 if (can_boost(p->addr))
359 p->ainsn.boostable = 0; 374 p->ainsn.boostable = 0;
360 else 375 else
361 p->ainsn.boostable = -1; 376 p->ainsn.boostable = -1;
362 377
363 p->opcode = *p->addr; 378 /* Also, displacement change doesn't affect the first byte */
379 p->opcode = p->ainsn.insn[0];
364} 380}
365 381
366int __kprobes arch_prepare_kprobe(struct kprobe *p) 382int __kprobes arch_prepare_kprobe(struct kprobe *p)
@@ -442,8 +458,8 @@ static void __kprobes restore_btf(void)
442 } 458 }
443} 459}
444 460
445void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 461void __kprobes
446 struct pt_regs *regs) 462arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
447{ 463{
448 unsigned long *sara = stack_addr(regs); 464 unsigned long *sara = stack_addr(regs);
449 465
@@ -453,16 +469,8 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
453 *sara = (unsigned long) &kretprobe_trampoline; 469 *sara = (unsigned long) &kretprobe_trampoline;
454} 470}
455 471
456#ifdef CONFIG_OPTPROBES 472static void __kprobes
457static int __kprobes setup_detour_execution(struct kprobe *p, 473setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
458 struct pt_regs *regs,
459 int reenter);
460#else
461#define setup_detour_execution(p, regs, reenter) (0)
462#endif
463
464static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
465 struct kprobe_ctlblk *kcb, int reenter)
466{ 474{
467 if (setup_detour_execution(p, regs, reenter)) 475 if (setup_detour_execution(p, regs, reenter))
468 return; 476 return;
@@ -504,8 +512,8 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
504 * within the handler. We save the original kprobes variables and just single 512 * within the handler. We save the original kprobes variables and just single
505 * step on the instruction of the new probe without calling any user handlers. 513 * step on the instruction of the new probe without calling any user handlers.
506 */ 514 */
507static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, 515static int __kprobes
508 struct kprobe_ctlblk *kcb) 516reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
509{ 517{
510 switch (kcb->kprobe_status) { 518 switch (kcb->kprobe_status) {
511 case KPROBE_HIT_SSDONE: 519 case KPROBE_HIT_SSDONE:
@@ -600,69 +608,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
600 return 0; 608 return 0;
601} 609}
602 610
603#ifdef CONFIG_X86_64
604#define SAVE_REGS_STRING \
605 /* Skip cs, ip, orig_ax. */ \
606 " subq $24, %rsp\n" \
607 " pushq %rdi\n" \
608 " pushq %rsi\n" \
609 " pushq %rdx\n" \
610 " pushq %rcx\n" \
611 " pushq %rax\n" \
612 " pushq %r8\n" \
613 " pushq %r9\n" \
614 " pushq %r10\n" \
615 " pushq %r11\n" \
616 " pushq %rbx\n" \
617 " pushq %rbp\n" \
618 " pushq %r12\n" \
619 " pushq %r13\n" \
620 " pushq %r14\n" \
621 " pushq %r15\n"
622#define RESTORE_REGS_STRING \
623 " popq %r15\n" \
624 " popq %r14\n" \
625 " popq %r13\n" \
626 " popq %r12\n" \
627 " popq %rbp\n" \
628 " popq %rbx\n" \
629 " popq %r11\n" \
630 " popq %r10\n" \
631 " popq %r9\n" \
632 " popq %r8\n" \
633 " popq %rax\n" \
634 " popq %rcx\n" \
635 " popq %rdx\n" \
636 " popq %rsi\n" \
637 " popq %rdi\n" \
638 /* Skip orig_ax, ip, cs */ \
639 " addq $24, %rsp\n"
640#else
641#define SAVE_REGS_STRING \
642 /* Skip cs, ip, orig_ax and gs. */ \
643 " subl $16, %esp\n" \
644 " pushl %fs\n" \
645 " pushl %es\n" \
646 " pushl %ds\n" \
647 " pushl %eax\n" \
648 " pushl %ebp\n" \
649 " pushl %edi\n" \
650 " pushl %esi\n" \
651 " pushl %edx\n" \
652 " pushl %ecx\n" \
653 " pushl %ebx\n"
654#define RESTORE_REGS_STRING \
655 " popl %ebx\n" \
656 " popl %ecx\n" \
657 " popl %edx\n" \
658 " popl %esi\n" \
659 " popl %edi\n" \
660 " popl %ebp\n" \
661 " popl %eax\n" \
662 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
663 " addl $24, %esp\n"
664#endif
665
666/* 611/*
667 * When a retprobed function returns, this code saves registers and 612 * When a retprobed function returns, this code saves registers and
668 * calls trampoline_handler() runs, which calls the kretprobe's handler. 613 * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -816,8 +761,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
816 * jump instruction after the copied instruction, that jumps to the next 761 * jump instruction after the copied instruction, that jumps to the next
817 * instruction after the probepoint. 762 * instruction after the probepoint.
818 */ 763 */
819static void __kprobes resume_execution(struct kprobe *p, 764static void __kprobes
820 struct pt_regs *regs, struct kprobe_ctlblk *kcb) 765resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
821{ 766{
822 unsigned long *tos = stack_addr(regs); 767 unsigned long *tos = stack_addr(regs);
823 unsigned long copy_ip = (unsigned long)p->ainsn.insn; 768 unsigned long copy_ip = (unsigned long)p->ainsn.insn;
@@ -996,8 +941,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
996/* 941/*
997 * Wrapper routine for handling exceptions. 942 * Wrapper routine for handling exceptions.
998 */ 943 */
999int __kprobes kprobe_exceptions_notify(struct notifier_block *self, 944int __kprobes
1000 unsigned long val, void *data) 945kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data)
1001{ 946{
1002 struct die_args *args = data; 947 struct die_args *args = data;
1003 int ret = NOTIFY_DONE; 948 int ret = NOTIFY_DONE;
@@ -1107,466 +1052,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1107 return 0; 1052 return 0;
1108} 1053}
1109 1054
1110
1111#ifdef CONFIG_OPTPROBES
1112
1113/* Insert a call instruction at address 'from', which calls address 'to'.*/
1114static void __kprobes synthesize_relcall(void *from, void *to)
1115{
1116 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1117}
1118
1119/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1120static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1121 unsigned long val)
1122{
1123#ifdef CONFIG_X86_64
1124 *addr++ = 0x48;
1125 *addr++ = 0xbf;
1126#else
1127 *addr++ = 0xb8;
1128#endif
1129 *(unsigned long *)addr = val;
1130}
1131
1132static void __used __kprobes kprobes_optinsn_template_holder(void)
1133{
1134 asm volatile (
1135 ".global optprobe_template_entry\n"
1136 "optprobe_template_entry: \n"
1137#ifdef CONFIG_X86_64
1138 /* We don't bother saving the ss register */
1139 " pushq %rsp\n"
1140 " pushfq\n"
1141 SAVE_REGS_STRING
1142 " movq %rsp, %rsi\n"
1143 ".global optprobe_template_val\n"
1144 "optprobe_template_val: \n"
1145 ASM_NOP5
1146 ASM_NOP5
1147 ".global optprobe_template_call\n"
1148 "optprobe_template_call: \n"
1149 ASM_NOP5
1150 /* Move flags to rsp */
1151 " movq 144(%rsp), %rdx\n"
1152 " movq %rdx, 152(%rsp)\n"
1153 RESTORE_REGS_STRING
1154 /* Skip flags entry */
1155 " addq $8, %rsp\n"
1156 " popfq\n"
1157#else /* CONFIG_X86_32 */
1158 " pushf\n"
1159 SAVE_REGS_STRING
1160 " movl %esp, %edx\n"
1161 ".global optprobe_template_val\n"
1162 "optprobe_template_val: \n"
1163 ASM_NOP5
1164 ".global optprobe_template_call\n"
1165 "optprobe_template_call: \n"
1166 ASM_NOP5
1167 RESTORE_REGS_STRING
1168 " addl $4, %esp\n" /* skip cs */
1169 " popf\n"
1170#endif
1171 ".global optprobe_template_end\n"
1172 "optprobe_template_end: \n");
1173}
1174
1175#define TMPL_MOVE_IDX \
1176 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1177#define TMPL_CALL_IDX \
1178 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1179#define TMPL_END_IDX \
1180 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1181
1182#define INT3_SIZE sizeof(kprobe_opcode_t)
1183
1184/* Optimized kprobe call back function: called from optinsn */
1185static void __kprobes optimized_callback(struct optimized_kprobe *op,
1186 struct pt_regs *regs)
1187{
1188 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1189 unsigned long flags;
1190
1191 /* This is possible if op is under delayed unoptimizing */
1192 if (kprobe_disabled(&op->kp))
1193 return;
1194
1195 local_irq_save(flags);
1196 if (kprobe_running()) {
1197 kprobes_inc_nmissed_count(&op->kp);
1198 } else {
1199 /* Save skipped registers */
1200#ifdef CONFIG_X86_64
1201 regs->cs = __KERNEL_CS;
1202#else
1203 regs->cs = __KERNEL_CS | get_kernel_rpl();
1204 regs->gs = 0;
1205#endif
1206 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1207 regs->orig_ax = ~0UL;
1208
1209 __this_cpu_write(current_kprobe, &op->kp);
1210 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1211 opt_pre_handler(&op->kp, regs);
1212 __this_cpu_write(current_kprobe, NULL);
1213 }
1214 local_irq_restore(flags);
1215}
1216
1217static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1218{
1219 int len = 0, ret;
1220
1221 while (len < RELATIVEJUMP_SIZE) {
1222 ret = __copy_instruction(dest + len, src + len, 1);
1223 if (!ret || !can_boost(dest + len))
1224 return -EINVAL;
1225 len += ret;
1226 }
1227 /* Check whether the address range is reserved */
1228 if (ftrace_text_reserved(src, src + len - 1) ||
1229 alternatives_text_reserved(src, src + len - 1) ||
1230 jump_label_text_reserved(src, src + len - 1))
1231 return -EBUSY;
1232
1233 return len;
1234}
1235
1236/* Check whether insn is indirect jump */
1237static int __kprobes insn_is_indirect_jump(struct insn *insn)
1238{
1239 return ((insn->opcode.bytes[0] == 0xff &&
1240 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1241 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1242}
1243
1244/* Check whether insn jumps into specified address range */
1245static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1246{
1247 unsigned long target = 0;
1248
1249 switch (insn->opcode.bytes[0]) {
1250 case 0xe0: /* loopne */
1251 case 0xe1: /* loope */
1252 case 0xe2: /* loop */
1253 case 0xe3: /* jcxz */
1254 case 0xe9: /* near relative jump */
1255 case 0xeb: /* short relative jump */
1256 break;
1257 case 0x0f:
1258 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1259 break;
1260 return 0;
1261 default:
1262 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1263 break;
1264 return 0;
1265 }
1266 target = (unsigned long)insn->next_byte + insn->immediate.value;
1267
1268 return (start <= target && target <= start + len);
1269}
1270
1271/* Decode whole function to ensure any instructions don't jump into target */
1272static int __kprobes can_optimize(unsigned long paddr)
1273{
1274 int ret;
1275 unsigned long addr, size = 0, offset = 0;
1276 struct insn insn;
1277 kprobe_opcode_t buf[MAX_INSN_SIZE];
1278
1279 /* Lookup symbol including addr */
1280 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1281 return 0;
1282
1283 /*
1284 * Do not optimize in the entry code due to the unstable
1285 * stack handling.
1286 */
1287 if ((paddr >= (unsigned long )__entry_text_start) &&
1288 (paddr < (unsigned long )__entry_text_end))
1289 return 0;
1290
1291 /* Check there is enough space for a relative jump. */
1292 if (size - offset < RELATIVEJUMP_SIZE)
1293 return 0;
1294
1295 /* Decode instructions */
1296 addr = paddr - offset;
1297 while (addr < paddr - offset + size) { /* Decode until function end */
1298 if (search_exception_tables(addr))
1299 /*
1300 * Since some fixup code will jumps into this function,
1301 * we can't optimize kprobe in this function.
1302 */
1303 return 0;
1304 kernel_insn_init(&insn, (void *)addr);
1305 insn_get_opcode(&insn);
1306 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1307 ret = recover_probed_instruction(buf, addr);
1308 if (ret)
1309 return 0;
1310 kernel_insn_init(&insn, buf);
1311 }
1312 insn_get_length(&insn);
1313 /* Recover address */
1314 insn.kaddr = (void *)addr;
1315 insn.next_byte = (void *)(addr + insn.length);
1316 /* Check any instructions don't jump into target */
1317 if (insn_is_indirect_jump(&insn) ||
1318 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1319 RELATIVE_ADDR_SIZE))
1320 return 0;
1321 addr += insn.length;
1322 }
1323
1324 return 1;
1325}
1326
1327/* Check optimized_kprobe can actually be optimized. */
1328int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1329{
1330 int i;
1331 struct kprobe *p;
1332
1333 for (i = 1; i < op->optinsn.size; i++) {
1334 p = get_kprobe(op->kp.addr + i);
1335 if (p && !kprobe_disabled(p))
1336 return -EEXIST;
1337 }
1338
1339 return 0;
1340}
1341
1342/* Check the addr is within the optimized instructions. */
1343int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1344 unsigned long addr)
1345{
1346 return ((unsigned long)op->kp.addr <= addr &&
1347 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1348}
1349
1350/* Free optimized instruction slot */
1351static __kprobes
1352void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1353{
1354 if (op->optinsn.insn) {
1355 free_optinsn_slot(op->optinsn.insn, dirty);
1356 op->optinsn.insn = NULL;
1357 op->optinsn.size = 0;
1358 }
1359}
1360
1361void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1362{
1363 __arch_remove_optimized_kprobe(op, 1);
1364}
1365
1366/*
1367 * Copy replacing target instructions
1368 * Target instructions MUST be relocatable (checked inside)
1369 */
1370int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1371{
1372 u8 *buf;
1373 int ret;
1374 long rel;
1375
1376 if (!can_optimize((unsigned long)op->kp.addr))
1377 return -EILSEQ;
1378
1379 op->optinsn.insn = get_optinsn_slot();
1380 if (!op->optinsn.insn)
1381 return -ENOMEM;
1382
1383 /*
1384 * Verify if the address gap is in 2GB range, because this uses
1385 * a relative jump.
1386 */
1387 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1388 if (abs(rel) > 0x7fffffff)
1389 return -ERANGE;
1390
1391 buf = (u8 *)op->optinsn.insn;
1392
1393 /* Copy instructions into the out-of-line buffer */
1394 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1395 if (ret < 0) {
1396 __arch_remove_optimized_kprobe(op, 0);
1397 return ret;
1398 }
1399 op->optinsn.size = ret;
1400
1401 /* Copy arch-dep-instance from template */
1402 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1403
1404 /* Set probe information */
1405 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1406
1407 /* Set probe function call */
1408 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1409
1410 /* Set returning jmp instruction at the tail of out-of-line buffer */
1411 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1412 (u8 *)op->kp.addr + op->optinsn.size);
1413
1414 flush_icache_range((unsigned long) buf,
1415 (unsigned long) buf + TMPL_END_IDX +
1416 op->optinsn.size + RELATIVEJUMP_SIZE);
1417 return 0;
1418}
1419
1420#define MAX_OPTIMIZE_PROBES 256
1421static struct text_poke_param *jump_poke_params;
1422static struct jump_poke_buffer {
1423 u8 buf[RELATIVEJUMP_SIZE];
1424} *jump_poke_bufs;
1425
1426static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
1427 u8 *insn_buf,
1428 struct optimized_kprobe *op)
1429{
1430 s32 rel = (s32)((long)op->optinsn.insn -
1431 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1432
1433 /* Backup instructions which will be replaced by jump address */
1434 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1435 RELATIVE_ADDR_SIZE);
1436
1437 insn_buf[0] = RELATIVEJUMP_OPCODE;
1438 *(s32 *)(&insn_buf[1]) = rel;
1439
1440 tprm->addr = op->kp.addr;
1441 tprm->opcode = insn_buf;
1442 tprm->len = RELATIVEJUMP_SIZE;
1443}
1444
1445/*
1446 * Replace breakpoints (int3) with relative jumps.
1447 * Caller must call with locking kprobe_mutex and text_mutex.
1448 */
1449void __kprobes arch_optimize_kprobes(struct list_head *oplist)
1450{
1451 struct optimized_kprobe *op, *tmp;
1452 int c = 0;
1453
1454 list_for_each_entry_safe(op, tmp, oplist, list) {
1455 WARN_ON(kprobe_disabled(&op->kp));
1456 /* Setup param */
1457 setup_optimize_kprobe(&jump_poke_params[c],
1458 jump_poke_bufs[c].buf, op);
1459 list_del_init(&op->list);
1460 if (++c >= MAX_OPTIMIZE_PROBES)
1461 break;
1462 }
1463
1464 /*
1465 * text_poke_smp doesn't support NMI/MCE code modifying.
1466 * However, since kprobes itself also doesn't support NMI/MCE
1467 * code probing, it's not a problem.
1468 */
1469 text_poke_smp_batch(jump_poke_params, c);
1470}
1471
1472static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
1473 u8 *insn_buf,
1474 struct optimized_kprobe *op)
1475{
1476 /* Set int3 to first byte for kprobes */
1477 insn_buf[0] = BREAKPOINT_INSTRUCTION;
1478 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1479
1480 tprm->addr = op->kp.addr;
1481 tprm->opcode = insn_buf;
1482 tprm->len = RELATIVEJUMP_SIZE;
1483}
1484
1485/*
1486 * Recover original instructions and breakpoints from relative jumps.
1487 * Caller must call with locking kprobe_mutex.
1488 */
1489extern void arch_unoptimize_kprobes(struct list_head *oplist,
1490 struct list_head *done_list)
1491{
1492 struct optimized_kprobe *op, *tmp;
1493 int c = 0;
1494
1495 list_for_each_entry_safe(op, tmp, oplist, list) {
1496 /* Setup param */
1497 setup_unoptimize_kprobe(&jump_poke_params[c],
1498 jump_poke_bufs[c].buf, op);
1499 list_move(&op->list, done_list);
1500 if (++c >= MAX_OPTIMIZE_PROBES)
1501 break;
1502 }
1503
1504 /*
1505 * text_poke_smp doesn't support NMI/MCE code modifying.
1506 * However, since kprobes itself also doesn't support NMI/MCE
1507 * code probing, it's not a problem.
1508 */
1509 text_poke_smp_batch(jump_poke_params, c);
1510}
1511
1512/* Replace a relative jump with a breakpoint (int3). */
1513void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1514{
1515 u8 buf[RELATIVEJUMP_SIZE];
1516
1517 /* Set int3 to first byte for kprobes */
1518 buf[0] = BREAKPOINT_INSTRUCTION;
1519 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1520 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1521}
1522
1523static int __kprobes setup_detour_execution(struct kprobe *p,
1524 struct pt_regs *regs,
1525 int reenter)
1526{
1527 struct optimized_kprobe *op;
1528
1529 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1530 /* This kprobe is really able to run optimized path. */
1531 op = container_of(p, struct optimized_kprobe, kp);
1532 /* Detour through copied instructions */
1533 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1534 if (!reenter)
1535 reset_current_kprobe();
1536 preempt_enable_no_resched();
1537 return 1;
1538 }
1539 return 0;
1540}
1541
1542static int __kprobes init_poke_params(void)
1543{
1544 /* Allocate code buffer and parameter array */
1545 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
1546 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1547 if (!jump_poke_bufs)
1548 return -ENOMEM;
1549
1550 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
1551 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1552 if (!jump_poke_params) {
1553 kfree(jump_poke_bufs);
1554 jump_poke_bufs = NULL;
1555 return -ENOMEM;
1556 }
1557
1558 return 0;
1559}
1560#else /* !CONFIG_OPTPROBES */
1561static int __kprobes init_poke_params(void)
1562{
1563 return 0;
1564}
1565#endif
1566
1567int __init arch_init_kprobes(void) 1055int __init arch_init_kprobes(void)
1568{ 1056{
1569 return init_poke_params(); 1057 return arch_init_optprobes();
1570} 1058}
1571 1059
1572int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1060int __kprobes arch_trampoline_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f0c6fd6f176b..694d801bf606 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -438,9 +438,9 @@ void __init kvm_guest_init(void)
438static __init int activate_jump_labels(void) 438static __init int activate_jump_labels(void)
439{ 439{
440 if (has_steal_clock) { 440 if (has_steal_clock) {
441 jump_label_inc(&paravirt_steal_enabled); 441 static_key_slow_inc(&paravirt_steal_enabled);
442 if (steal_acc) 442 if (steal_acc)
443 jump_label_inc(&paravirt_steal_rq_enabled); 443 static_key_slow_inc(&paravirt_steal_rq_enabled);
444 } 444 }
445 445
446 return 0; 446 return 0;
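
The jump_label_inc() -> static_key_slow_inc() rename here follows the static-key API introduced in the same cycle. Below is a hedged sketch of how such a key is typically declared and tested; my_feature and do_slow_accounting are placeholder names, not kernel symbols.

#include <linux/jump_label.h>

static struct static_key my_feature = STATIC_KEY_INIT_FALSE;

extern void do_slow_accounting(void);	/* placeholder */

void my_feature_enable(void)
{
	static_key_slow_inc(&my_feature);	/* patches the branch site(s) in */
}

void my_hot_path(void)
{
	/* compiles to a single no-op until the key is enabled */
	if (static_key_false(&my_feature))
		do_slow_accounting();
}
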
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index ac0417be9131..73465aab28f8 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -360,7 +360,6 @@ out:
360static enum ucode_state 360static enum ucode_state
361request_microcode_user(int cpu, const void __user *buf, size_t size) 361request_microcode_user(int cpu, const void __user *buf, size_t size)
362{ 362{
363 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
364 return UCODE_ERROR; 363 return UCODE_ERROR;
365} 364}
366 365
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index d90272e6bc40..ada2f99388dd 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,8 +202,8 @@ static void native_flush_tlb_single(unsigned long addr)
202 __native_flush_tlb_single(addr); 202 __native_flush_tlb_single(addr);
203} 203}
204 204
205struct jump_label_key paravirt_steal_enabled; 205struct static_key paravirt_steal_enabled;
206struct jump_label_key paravirt_steal_rq_enabled; 206struct static_key paravirt_steal_rq_enabled;
207 207
208static u64 native_steal_clock(int cpu) 208static u64 native_steal_clock(int cpu)
209{ 209{
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 15763af7bfe3..44eefde92109 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -377,8 +377,8 @@ static inline int hlt_use_halt(void)
377void default_idle(void) 377void default_idle(void)
378{ 378{
379 if (hlt_use_halt()) { 379 if (hlt_use_halt()) {
380 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 380 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
381 trace_cpu_idle(1, smp_processor_id()); 381 trace_cpu_idle_rcuidle(1, smp_processor_id());
382 current_thread_info()->status &= ~TS_POLLING; 382 current_thread_info()->status &= ~TS_POLLING;
383 /* 383 /*
384 * TS_POLLING-cleared state must be visible before we 384 * TS_POLLING-cleared state must be visible before we
@@ -391,8 +391,8 @@ void default_idle(void)
391 else 391 else
392 local_irq_enable(); 392 local_irq_enable();
393 current_thread_info()->status |= TS_POLLING; 393 current_thread_info()->status |= TS_POLLING;
394 trace_power_end(smp_processor_id()); 394 trace_power_end_rcuidle(smp_processor_id());
395 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 395 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
396 } else { 396 } else {
397 local_irq_enable(); 397 local_irq_enable();
398 /* loop is done by the caller */ 398 /* loop is done by the caller */
@@ -450,8 +450,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
450static void mwait_idle(void) 450static void mwait_idle(void)
451{ 451{
452 if (!need_resched()) { 452 if (!need_resched()) {
453 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 453 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
454 trace_cpu_idle(1, smp_processor_id()); 454 trace_cpu_idle_rcuidle(1, smp_processor_id());
455 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) 455 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
456 clflush((void *)&current_thread_info()->flags); 456 clflush((void *)&current_thread_info()->flags);
457 457
@@ -461,8 +461,8 @@ static void mwait_idle(void)
461 __sti_mwait(0, 0); 461 __sti_mwait(0, 0);
462 else 462 else
463 local_irq_enable(); 463 local_irq_enable();
464 trace_power_end(smp_processor_id()); 464 trace_power_end_rcuidle(smp_processor_id());
465 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 465 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
466 } else 466 } else
467 local_irq_enable(); 467 local_irq_enable();
468} 468}
@@ -474,13 +474,13 @@ static void mwait_idle(void)
474 */ 474 */
475static void poll_idle(void) 475static void poll_idle(void)
476{ 476{
477 trace_power_start(POWER_CSTATE, 0, smp_processor_id()); 477 trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id());
478 trace_cpu_idle(0, smp_processor_id()); 478 trace_cpu_idle_rcuidle(0, smp_processor_id());
479 local_irq_enable(); 479 local_irq_enable();
480 while (!need_resched()) 480 while (!need_resched())
481 cpu_relax(); 481 cpu_relax();
482 trace_power_end(smp_processor_id()); 482 trace_power_end_rcuidle(smp_processor_id());
483 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 483 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
484} 484}
485 485
486/* 486/*
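The default_idle(), mwait_idle() and poll_idle() hunks above switch the power/idle tracepoints to their _rcuidle variants, which the tracepoint machinery provides alongside the plain trace_* calls and which remain legal once the CPU has entered RCU's idle (extended quiescent) state. A minimal sketch of the resulting shape of an idle routine; arch_idle_body() is a placeholder for the actual hlt/mwait/poll body:

#include <linux/smp.h>
#include <trace/events/power.h>

extern void arch_idle_body(void);       /* placeholder: hlt, mwait or poll loop */

static void my_idle(void)
{
        /* the _rcuidle variants may fire while RCU considers this CPU idle */
        trace_cpu_idle_rcuidle(1, smp_processor_id());
        arch_idle_body();
        trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}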
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 485204f58cda..49888fefe794 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -119,9 +119,7 @@ void cpu_idle(void)
119 } 119 }
120 rcu_idle_exit(); 120 rcu_idle_exit();
121 tick_nohz_idle_exit(); 121 tick_nohz_idle_exit();
122 preempt_enable_no_resched(); 122 schedule_preempt_disabled();
123 schedule();
124 preempt_disable();
125 } 123 }
126} 124}
127 125
@@ -214,6 +212,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
214 212
215 task_user_gs(p) = get_user_gs(regs); 213 task_user_gs(p) = get_user_gs(regs);
216 214
215 p->fpu_counter = 0;
217 p->thread.io_bitmap_ptr = NULL; 216 p->thread.io_bitmap_ptr = NULL;
218 tsk = current; 217 tsk = current;
219 err = -ENOMEM; 218 err = -ENOMEM;
@@ -299,22 +298,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
299 *next = &next_p->thread; 298 *next = &next_p->thread;
300 int cpu = smp_processor_id(); 299 int cpu = smp_processor_id();
301 struct tss_struct *tss = &per_cpu(init_tss, cpu); 300 struct tss_struct *tss = &per_cpu(init_tss, cpu);
302 bool preload_fpu; 301 fpu_switch_t fpu;
303 302
304 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 303 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
305 304
306 /* 305 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
307 * If the task has used fpu the last 5 timeslices, just do a full
308 * restore of the math state immediately to avoid the trap; the
309 * chances of needing FPU soon are obviously high now
310 */
311 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
312
313 __unlazy_fpu(prev_p);
314
315 /* we're going to use this soon, after a few expensive things */
316 if (preload_fpu)
317 prefetch(next->fpu.state);
318 306
319 /* 307 /*
320 * Reload esp0. 308 * Reload esp0.
@@ -354,11 +342,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
354 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) 342 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
355 __switch_to_xtra(prev_p, next_p, tss); 343 __switch_to_xtra(prev_p, next_p, tss);
356 344
357 /* If we're going to preload the fpu context, make sure clts
358 is run while we're batching the cpu state updates. */
359 if (preload_fpu)
360 clts();
361
362 /* 345 /*
363 * Leave lazy mode, flushing any hypercalls made here. 346 * Leave lazy mode, flushing any hypercalls made here.
364 * This must be done before restoring TLS segments so 347 * This must be done before restoring TLS segments so
@@ -368,15 +351,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
368 */ 351 */
369 arch_end_context_switch(next_p); 352 arch_end_context_switch(next_p);
370 353
371 if (preload_fpu)
372 __math_state_restore();
373
374 /* 354 /*
375 * Restore %gs if needed (which is common) 355 * Restore %gs if needed (which is common)
376 */ 356 */
377 if (prev->gs | next->gs) 357 if (prev->gs | next->gs)
378 lazy_load_gs(next->gs); 358 lazy_load_gs(next->gs);
379 359
360 switch_fpu_finish(next_p, fpu);
361
380 percpu_write(current_task, next_p); 362 percpu_write(current_task, next_p);
381 363
382 return prev_p; 364 return prev_p;
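The cpu_idle() hunk above (and the matching one in process_64.c below) folds the open-coded preempt_enable_no_resched()/schedule()/preempt_disable() sequence into schedule_preempt_disabled(). The helper is equivalent to the removed lines; a sketch, with the real implementation living in the scheduler core:

#include <linux/preempt.h>
#include <linux/sched.h>

/* callers enter with preemption disabled and get it back disabled on return */
static void schedule_preempt_disabled_sketch(void)
{
        preempt_enable_no_resched();
        schedule();
        preempt_disable();
}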
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9b9fe4a85c87..e34257c70c28 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -156,9 +156,7 @@ void cpu_idle(void)
156 } 156 }
157 157
158 tick_nohz_idle_exit(); 158 tick_nohz_idle_exit();
159 preempt_enable_no_resched(); 159 schedule_preempt_disabled();
160 schedule();
161 preempt_disable();
162 } 160 }
163} 161}
164 162
@@ -286,6 +284,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
286 284
287 set_tsk_thread_flag(p, TIF_FORK); 285 set_tsk_thread_flag(p, TIF_FORK);
288 286
287 p->fpu_counter = 0;
289 p->thread.io_bitmap_ptr = NULL; 288 p->thread.io_bitmap_ptr = NULL;
290 289
291 savesegment(gs, p->thread.gsindex); 290 savesegment(gs, p->thread.gsindex);
@@ -386,18 +385,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
386 int cpu = smp_processor_id(); 385 int cpu = smp_processor_id();
387 struct tss_struct *tss = &per_cpu(init_tss, cpu); 386 struct tss_struct *tss = &per_cpu(init_tss, cpu);
388 unsigned fsindex, gsindex; 387 unsigned fsindex, gsindex;
389 bool preload_fpu; 388 fpu_switch_t fpu;
390 389
391 /* 390 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
392 * If the task has used fpu the last 5 timeslices, just do a full
393 * restore of the math state immediately to avoid the trap; the
394 * chances of needing FPU soon are obviously high now
395 */
396 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
397
398 /* we're going to use this soon, after a few expensive things */
399 if (preload_fpu)
400 prefetch(next->fpu.state);
401 391
402 /* 392 /*
403 * Reload esp0, LDT and the page table pointer: 393 * Reload esp0, LDT and the page table pointer:
@@ -427,13 +417,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
427 417
428 load_TLS(next, cpu); 418 load_TLS(next, cpu);
429 419
430 /* Must be after DS reload */
431 __unlazy_fpu(prev_p);
432
433 /* Make sure cpu is ready for new context */
434 if (preload_fpu)
435 clts();
436
437 /* 420 /*
438 * Leave lazy mode, flushing any hypercalls made here. 421 * Leave lazy mode, flushing any hypercalls made here.
439 * This must be done before restoring TLS segments so 422 * This must be done before restoring TLS segments so
@@ -474,6 +457,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
474 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 457 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
475 prev->gsindex = gsindex; 458 prev->gsindex = gsindex;
476 459
460 switch_fpu_finish(next_p, fpu);
461
477 /* 462 /*
478 * Switch the PDA and FPU contexts. 463 * Switch the PDA and FPU contexts.
479 */ 464 */
@@ -492,13 +477,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
492 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) 477 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
493 __switch_to_xtra(prev_p, next_p, tss); 478 __switch_to_xtra(prev_p, next_p, tss);
494 479
495 /*
496 * Preload the FPU context, now that we've determined that the
497 * task is likely to be using it.
498 */
499 if (preload_fpu)
500 __math_state_restore();
501
502 return prev_p; 480 return prev_p;
503} 481}
504 482
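Both __switch_to() hunks replace the open-coded lazy-FPU handling with a switch_fpu_prepare()/switch_fpu_finish() pair bracketing the rest of the context switch. The sketch below only restates, under the new names, the logic those hunks remove; the real helpers in the i387 headers also track FPU ownership, so treat this as an approximation rather than their implementation:

#include <linux/prefetch.h>
#include <linux/sched.h>
#include <asm/i387.h>

typedef struct { int preload; } fpu_switch_sketch_t;

/* before the register state is switched: save the outgoing task's FPU
 * state and decide whether the incoming task is worth preloading */
static inline fpu_switch_sketch_t
switch_fpu_prepare_sketch(struct task_struct *prev, struct task_struct *next)
{
        fpu_switch_sketch_t fpu;

        /* preload if the next task used the FPU in its last few timeslices */
        fpu.preload = tsk_used_math(next) && next->fpu_counter > 5;
        __unlazy_fpu(prev);
        if (fpu.preload) {
                prefetch(next->thread.fpu.state);
                clts();         /* allow FPU use without a #NM trap */
        }
        return fpu;
}

/* after the new task is current: restore its FPU state if we decided to */
static inline void
switch_fpu_finish_sketch(struct task_struct *next, fpu_switch_sketch_t fpu)
{
        if (fpu.preload)
                __math_state_restore();         /* as the removed code did */
}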
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c00d11..58f78165d308 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -291,19 +291,6 @@ notrace static void __cpuinit start_secondary(void *unused)
291 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 291 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
292 x86_platform.nmi_init(); 292 x86_platform.nmi_init();
293 293
294 /*
295 * Wait until the cpu which brought this one up marked it
296 * online before enabling interrupts. If we don't do that then
297 * we can end up waking up the softirq thread before this cpu
298 * reached the active state, which makes the scheduler unhappy
299 * and schedule the softirq thread on the wrong cpu. This is
300 * only observable with forced threaded interrupts, but in
301 * theory it could also happen w/o them. It's just way harder
302 * to achieve.
303 */
304 while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
305 cpu_relax();
306
307 /* enable local interrupts */ 294 /* enable local interrupts */
308 local_irq_enable(); 295 local_irq_enable();
309 296
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8ba27dbc107a..4bbe04d96744 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -571,28 +571,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
571} 571}
572 572
573/* 573/*
574 * __math_state_restore assumes that cr0.TS is already clear and the
575 * fpu state is all ready for use. Used during context switch.
576 */
577void __math_state_restore(void)
578{
579 struct thread_info *thread = current_thread_info();
580 struct task_struct *tsk = thread->task;
581
582 /*
583 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
584 */
585 if (unlikely(restore_fpu_checking(tsk))) {
586 stts();
587 force_sig(SIGSEGV, tsk);
588 return;
589 }
590
591 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
592 tsk->fpu_counter++;
593}
594
595/*
596 * 'math_state_restore()' saves the current math information in the 574 * 'math_state_restore()' saves the current math information in the
597 * old math state array, and gets the new ones from the current task 575 * old math state array, and gets the new ones from the current task
598 * 576 *
@@ -604,8 +582,7 @@ void __math_state_restore(void)
604 */ 582 */
605void math_state_restore(void) 583void math_state_restore(void)
606{ 584{
607 struct thread_info *thread = current_thread_info(); 585 struct task_struct *tsk = current;
608 struct task_struct *tsk = thread->task;
609 586
610 if (!tsk_used_math(tsk)) { 587 if (!tsk_used_math(tsk)) {
611 local_irq_enable(); 588 local_irq_enable();
@@ -622,16 +599,23 @@ void math_state_restore(void)
622 local_irq_disable(); 599 local_irq_disable();
623 } 600 }
624 601
625 clts(); /* Allow maths ops (or we recurse) */ 602 __thread_fpu_begin(tsk);
603 /*
604 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
605 */
606 if (unlikely(restore_fpu_checking(tsk))) {
607 __thread_fpu_end(tsk);
608 force_sig(SIGSEGV, tsk);
609 return;
610 }
626 611
627 __math_state_restore(); 612 tsk->fpu_counter++;
628} 613}
629EXPORT_SYMBOL_GPL(math_state_restore); 614EXPORT_SYMBOL_GPL(math_state_restore);
630 615
631dotraplinkage void __kprobes 616dotraplinkage void __kprobes
632do_device_not_available(struct pt_regs *regs, long error_code) 617do_device_not_available(struct pt_regs *regs, long error_code)
633{ 618{
634 WARN_ON_ONCE(!user_mode_vm(regs));
635#ifdef CONFIG_MATH_EMULATION 619#ifdef CONFIG_MATH_EMULATION
636 if (read_cr0() & X86_CR0_EM) { 620 if (read_cr0() & X86_CR0_EM) {
637 struct math_emu_info info = { }; 621 struct math_emu_info info = { };
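math_state_restore() above now brackets the restore with __thread_fpu_begin()/__thread_fpu_end() and absorbs the SIGSEGV fallback that used to live in the removed __math_state_restore(). Inferred only from the lines these hunks delete, not from the helpers' actual definitions, the pair roughly replaces the old clts()/stts() plus TS_USEDFPU bookkeeping:

#include <linux/sched.h>
#include <asm/i387.h>

/* rough equivalents of the bookkeeping the removed lines performed;
 * the real helpers in the i387 headers track ownership differently */
static inline void thread_fpu_begin_sketch(struct task_struct *tsk)
{
        clts();                         /* permit FPU instructions (no #NM) */
        task_thread_info(tsk)->status |= TS_USEDFPU;
}

static inline void thread_fpu_end_sketch(struct task_struct *tsk)
{
        task_thread_info(tsk)->status &= ~TS_USEDFPU;
        stts();                         /* re-arm the device-not-available trap */
}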
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a62c201c97ec..183c5925a9fe 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
620 620
621 if (cpu_khz) { 621 if (cpu_khz) {
622 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; 622 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
623 *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); 623 *offset = ns_now - mult_frac(tsc_now, *scale,
624 (1UL << CYC2NS_SCALE_FACTOR));
624 } 625 }
625 626
626 sched_clock_idle_wakeup_event(0); 627 sched_clock_idle_wakeup_event(0);
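The set_cyc2ns_scale() hunk above replaces the shift (tsc_now * *scale) >> CYC2NS_SCALE_FACTOR with mult_frac(tsc_now, *scale, 1UL << CYC2NS_SCALE_FACTOR), which computes x * numer / denom without letting the intermediate product overflow. The real mult_frac() is a type-generic macro in linux/kernel.h; the arithmetic it relies on, as a sketch:

static inline unsigned long long
mult_frac_sketch(unsigned long long x, unsigned long long numer,
                 unsigned long long denom)
{
        unsigned long long quot = x / denom;
        unsigned long long rem  = x % denom;

        /* x * numer / denom == quot * numer + (rem * numer) / denom,
         * and rem < denom keeps the second product small */
        return quot * numer + (rem * numer) / denom;
}

For the cyc2ns case denom is only 1 << CYC2NS_SCALE_FACTOR, so the result matches the old shift while the cycle count is small, but it stays correct after the 64-bit product tsc_now * *scale would have wrapped.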
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a3911343976b..711091114119 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
47 if (!fx) 47 if (!fx)
48 return; 48 return;
49 49
50 BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); 50 BUG_ON(__thread_has_fpu(tsk));
51 51
52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; 52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
53 53
@@ -168,7 +168,7 @@ int save_i387_xstate(void __user *buf)
168 if (!used_math()) 168 if (!used_math())
169 return 0; 169 return 0;
170 170
171 if (task_thread_info(tsk)->status & TS_USEDFPU) { 171 if (user_has_fpu()) {
172 if (use_xsave()) 172 if (use_xsave())
173 err = xsave_user(buf); 173 err = xsave_user(buf);
174 else 174 else
@@ -176,8 +176,7 @@ int save_i387_xstate(void __user *buf)
176 176
177 if (err) 177 if (err)
178 return err; 178 return err;
179 task_thread_info(tsk)->status &= ~TS_USEDFPU; 179 user_fpu_end();
180 stts();
181 } else { 180 } else {
182 sanitize_i387_state(tsk); 181 sanitize_i387_state(tsk);
183 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, 182 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
@@ -292,10 +291,7 @@ int restore_i387_xstate(void __user *buf)
292 return err; 291 return err;
293 } 292 }
294 293
295 if (!(task_thread_info(current)->status & TS_USEDFPU)) { 294 user_fpu_begin();
296 clts();
297 task_thread_info(current)->status |= TS_USEDFPU;
298 }
299 if (use_xsave()) 295 if (use_xsave())
300 err = restore_user_xstate(buf); 296 err = restore_user_xstate(buf);
301 else 297 else