path: root/arch/x86/kernel
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile                        1
-rw-r--r--  arch/x86/kernel/cpu/amd.c                       3
-rw-r--r--  arch/x86/kernel/cpu/common.c                    5
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c          44
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c            2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c              167
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h               58
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c           40
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c        158
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c      22
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c    526
-rw-r--r--  arch/x86/kernel/entry_64.S                      9
-rw-r--r--  arch/x86/kernel/irq_32.c                       11
-rw-r--r--  arch/x86/kernel/kprobes-common.h              102
-rw-r--r--  arch/x86/kernel/kprobes-opt.c                 512
-rw-r--r--  arch/x86/kernel/kprobes.c                     664
-rw-r--r--  arch/x86/kernel/kvm.c                           4
-rw-r--r--  arch/x86/kernel/microcode_amd.c                 1
-rw-r--r--  arch/x86/kernel/paravirt.c                      4
-rw-r--r--  arch/x86/kernel/process.c                      24
-rw-r--r--  arch/x86/kernel/process_32.c                   30
-rw-r--r--  arch/x86/kernel/process_64.c                   34
-rw-r--r--  arch/x86/kernel/smpboot.c                      13
-rw-r--r--  arch/x86/kernel/traps.c                        38
-rw-r--r--  arch/x86/kernel/tsc.c                           3
-rw-r--r--  arch/x86/kernel/xsave.c                        12
26 files changed, 1673 insertions, 814 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5369059c07a9..532d2e090e6f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)             += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)        += crash_dump_$(BITS).o
 obj-$(CONFIG_KPROBES)           += kprobes.o
+obj-$(CONFIG_OPTPROBES)         += kprobes-opt.o
 obj-$(CONFIG_MODULES)           += module.o
 obj-$(CONFIG_DOUBLEFAULT)       += doublefault_32.o
 obj-$(CONFIG_KGDB)              += kgdb.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f4773f4aae35..0a44b90602b0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>

 #include <linux/io.h>
+#include <linux/sched.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
@@ -456,6 +457,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
         if (c->x86_power & (1 << 8)) {
                 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+                if (!check_tsc_unstable())
+                        sched_clock_stable = 1;
         }

 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d43cad74f166..c0f7d68d318f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1044,6 +1044,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
+
 /*
  * Special IST stacks which the CPU switches to when it calls
  * an IST-marked descriptor entry. Up to 7 stacks (hardware
@@ -1111,6 +1114,8 @@ void debug_stack_reset(void)
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b45e5e7a901..73d08ed98a64 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)
         l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }
 
-static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
-                                        int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
 {
         int node;
 
@@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
 #define CPUID4_INFO_IDX(x, y)   (&((per_cpu(ici_cpuid4_info, x))[y]))
 
 #ifdef CONFIG_SMP
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+
+static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
 {
-        struct _cpuid4_info *this_leaf, *sibling_leaf;
-        unsigned long num_threads_sharing;
-        int index_msb, i, sibling;
+        struct _cpuid4_info *this_leaf;
+        int ret, i, sibling;
         struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-        if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
+        ret = 0;
+        if (index == 3) {
+                ret = 1;
                 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
                         if (!per_cpu(ici_cpuid4_info, i))
                                 continue;
@@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
                                 set_bit(sibling, this_leaf->shared_cpu_map);
                         }
                 }
-                return;
+        } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) {
+                ret = 1;
+                for_each_cpu(i, cpu_sibling_mask(cpu)) {
+                        if (!per_cpu(ici_cpuid4_info, i))
+                                continue;
+                        this_leaf = CPUID4_INFO_IDX(i, index);
+                        for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+                                if (!cpu_online(sibling))
+                                        continue;
+                                set_bit(sibling, this_leaf->shared_cpu_map);
+                        }
+                }
         }
+
+        return ret;
+}
+
+static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+{
+        struct _cpuid4_info *this_leaf, *sibling_leaf;
+        unsigned long num_threads_sharing;
+        int index_msb, i;
+        struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+        if (c->x86_vendor == X86_VENDOR_AMD) {
+                if (cache_shared_amd_cpu_map_setup(cpu, index))
+                        return;
+        }
+
         this_leaf = CPUID4_INFO_IDX(cpu, index);
         num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 786e76a86322..e4eeaaf58a47 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -528,6 +528,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
         sprintf(name, "threshold_bank%i", bank);
 
+#ifdef CONFIG_SMP
         if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {   /* symlink */
                 i = cpumask_first(cpu_llc_shared_mask(cpu));
 
@@ -553,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
                 goto out;
         }
+#endif
 
         b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
         if (!b) {
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5adce1040b11..0a18d16cb58d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/bitops.h>
+#include <linux/device.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
@@ -31,6 +32,7 @@
 #include <asm/compat.h>
 #include <asm/smp.h>
 #include <asm/alternative.h>
+#include <asm/timer.h>
 
 #include "perf_event.h"
 
@@ -351,6 +353,36 @@ int x86_setup_perfctr(struct perf_event *event)
         return 0;
 }
 
+/*
+ * check that branch_sample_type is compatible with
+ * settings needed for precise_ip > 1 which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+        u64 m = event->attr.branch_sample_type;
+        u64 b = 0;
+
+        /* must capture all branches */
+        if (!(m & PERF_SAMPLE_BRANCH_ANY))
+                return 0;
+
+        m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+        if (!event->attr.exclude_user)
+                b |= PERF_SAMPLE_BRANCH_USER;
+
+        if (!event->attr.exclude_kernel)
+                b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+        /*
+         * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+         */
+
+        return m == b;
+}
+
 int x86_pmu_hw_config(struct perf_event *event)
 {
         if (event->attr.precise_ip) {
@@ -367,6 +399,36 @@ int x86_pmu_hw_config(struct perf_event *event)
 
                 if (event->attr.precise_ip > precise)
                         return -EOPNOTSUPP;
+                /*
+                 * check that PEBS LBR correction does not conflict with
+                 * whatever the user is asking with attr->branch_sample_type
+                 */
+                if (event->attr.precise_ip > 1) {
+                        u64 *br_type = &event->attr.branch_sample_type;
+
+                        if (has_branch_stack(event)) {
+                                if (!precise_br_compat(event))
+                                        return -EOPNOTSUPP;
+
+                                /* branch_sample_type is compatible */
+
+                        } else {
+                                /*
+                                 * user did not specify branch_sample_type
+                                 *
+                                 * For PEBS fixups, we capture all
+                                 * the branches at the priv level of the
+                                 * event.
+                                 */
+                                *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+                                if (!event->attr.exclude_user)
+                                        *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+                                if (!event->attr.exclude_kernel)
+                                        *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+                        }
+                }
         }
 
         /*
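
For context, the rule enforced by precise_br_compat() above is: a caller that combines precise_ip > 1 with an explicit branch_sample_type must request PERF_SAMPLE_BRANCH_ANY and match the branch privilege bits to the event's exclude_user/exclude_kernel settings. A minimal userspace sketch of a compatible request, using the uapi constants introduced alongside this series (illustrative only, error handling omitted):

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Open a cycles event with PEBS fixups (precise_ip = 2) plus explicit
     * LBR sampling; the branch filter matches the event's priv levels so
     * precise_br_compat() accepts it. */
    static int open_precise_branch_event(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000;
            attr.precise_ip = 2;                    /* PEBS + LBR-based fixup */
            attr.exclude_kernel = 1;                /* user level only ...   */
            attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
            attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
                                      PERF_SAMPLE_BRANCH_USER; /* ... so only USER here */

            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }
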
@@ -424,6 +486,10 @@ static int __x86_pmu_event_init(struct perf_event *event)
         /* mark unused */
         event->hw.extra_reg.idx = EXTRA_REG_NONE;
 
+        /* mark not used */
+        event->hw.extra_reg.idx = EXTRA_REG_NONE;
+        event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
         return x86_pmu.hw_config(event);
 }
 
@@ -1210,6 +1276,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
                 break;
 
         case CPU_STARTING:
+                if (x86_pmu.attr_rdpmc)
+                        set_in_cr4(X86_CR4_PCE);
                 if (x86_pmu.cpu_starting)
                         x86_pmu.cpu_starting(cpu);
                 break;
@@ -1319,6 +1387,8 @@ static int __init init_hw_perf_events(void)
                 }
         }
 
+        x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
         pr_info("... version:                %d\n",     x86_pmu.version);
         pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
         pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
@@ -1542,23 +1612,106 @@ static int x86_pmu_event_init(struct perf_event *event)
         return err;
 }
 
+static int x86_pmu_event_idx(struct perf_event *event)
+{
+        int idx = event->hw.idx;
+
+        if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
+                idx -= X86_PMC_IDX_FIXED;
+                idx |= 1 << 30;
+        }
+
+        return idx + 1;
+}
+
+static ssize_t get_attr_rdpmc(struct device *cdev,
+                              struct device_attribute *attr,
+                              char *buf)
+{
+        return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
+static void change_rdpmc(void *info)
+{
+        bool enable = !!(unsigned long)info;
+
+        if (enable)
+                set_in_cr4(X86_CR4_PCE);
+        else
+                clear_in_cr4(X86_CR4_PCE);
+}
+
+static ssize_t set_attr_rdpmc(struct device *cdev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t count)
+{
+        unsigned long val = simple_strtoul(buf, NULL, 0);
+
+        if (!!val != !!x86_pmu.attr_rdpmc) {
+                x86_pmu.attr_rdpmc = !!val;
+                smp_call_function(change_rdpmc, (void *)val, 1);
+        }
+
+        return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+        &dev_attr_rdpmc.attr,
+        NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group = {
+        .attrs = x86_pmu_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+        &x86_pmu_attr_group,
+        NULL,
+};
+
+static void x86_pmu_flush_branch_stack(void)
+{
+        if (x86_pmu.flush_branch_stack)
+                x86_pmu.flush_branch_stack();
+}
+
 static struct pmu pmu = {
         .pmu_enable             = x86_pmu_enable,
         .pmu_disable            = x86_pmu_disable,
+
+        .attr_groups            = x86_pmu_attr_groups,
 
         .event_init             = x86_pmu_event_init,
 
         .add                    = x86_pmu_add,
         .del                    = x86_pmu_del,
         .start                  = x86_pmu_start,
         .stop                   = x86_pmu_stop,
         .read                   = x86_pmu_read,
 
         .start_txn              = x86_pmu_start_txn,
         .cancel_txn             = x86_pmu_cancel_txn,
         .commit_txn             = x86_pmu_commit_txn,
+
+        .event_idx              = x86_pmu_event_idx,
+        .flush_branch_stack     = x86_pmu_flush_branch_stack,
 };
 
+void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+{
+        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+                return;
+
+        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+                return;
+
+        userpg->time_mult = this_cpu_read(cyc2ns);
+        userpg->time_shift = CYC2NS_SCALE_FACTOR;
+        userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
+}
+
 /*
  * callchain support
  */
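
The rdpmc attribute and x86_pmu_event_idx() above exist to support self-monitoring from user space: the index published in the mmap'ed perf_event_mmap_page (1-based, with bit 30 set for fixed counters) is what a task feeds to the RDPMC instruction, and the sysfs knob (exposed under the PMU's event_source device directory, typically named rdpmc) controls whether CR4.PCE allows it. A hedged sketch of the intended usage, relying only on the long-standing lock/index/offset mmap-page fields (error handling and sign-extension to the counter width omitted; the time_mult/time_shift/time_offset fields written by perf_update_user_clock() extend the same scheme to timestamps):

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static inline uint64_t rdpmc(unsigned int counter)
    {
            unsigned int lo, hi;

            /* counter index = mmap-page index - 1 */
            asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
            return lo | ((uint64_t)hi << 32);
    }

    /* Read an event's count without a syscall; fd is a perf_event_open() fd. */
    static uint64_t self_read(int fd)
    {
            struct perf_event_mmap_page *pc;
            unsigned int seq, idx;
            uint64_t count;

            pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);

            do {
                    seq = pc->lock;
                    __sync_synchronize();
                    idx = pc->index;        /* 0 means RDPMC not usable here */
                    count = pc->offset;
                    if (idx)
                            count += rdpmc(idx - 1);
                    __sync_synchronize();
            } while (pc->lock != seq);      /* retry if the kernel updated the page */

            return count;
    }
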
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8944062f46e2..8484e77c211e 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -33,6 +33,7 @@ enum extra_reg_type {
 
         EXTRA_REG_RSP_0 = 0,    /* offcore_response_0 */
         EXTRA_REG_RSP_1 = 1,    /* offcore_response_1 */
+        EXTRA_REG_LBR   = 2,    /* lbr_select */
 
         EXTRA_REG_MAX           /* number of entries needed */
 };
@@ -130,6 +131,8 @@ struct cpu_hw_events {
         void                            *lbr_context;
         struct perf_branch_stack        lbr_stack;
         struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
+        struct er_account               *lbr_sel;
+        u64                             br_sel;
 
         /*
          * Intel host/guest exclude bits
@@ -147,7 +150,9 @@ struct cpu_hw_events {
         /*
          * AMD specific bits
          */
         struct amd_nb                   *amd_nb;
+        /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+        u64                             perf_ctr_virt_mask;
 
         void                            *kfree_on_online;
 };
@@ -266,6 +271,29 @@ struct x86_pmu_quirk {
         void (*func)(void);
 };
 
+union x86_pmu_config {
+        struct {
+                u64 event:8,
+                    umask:8,
+                    usr:1,
+                    os:1,
+                    edge:1,
+                    pc:1,
+                    interrupt:1,
+                    __reserved1:1,
+                    en:1,
+                    inv:1,
+                    cmask:8,
+                    event2:4,
+                    __reserved2:4,
+                    go:1,
+                    ho:1;
+        } bits;
+        u64 value;
+};
+
+#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
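
As a sanity check on the bit layout above, the encoding used later in this series for INST_RETIRED.TOTAL_CYCLES can be reproduced with a standalone copy of the union (illustrative sketch only; assumes GCC's LSB-first bit-field layout on x86):

    #include <assert.h>
    #include <stdint.h>

    /* Local copy of the layout from perf_event.h above, for illustration. */
    union x86_pmu_config {
            struct {
                    uint64_t event:8, umask:8, usr:1, os:1, edge:1, pc:1,
                             interrupt:1, __reserved1:1, en:1, inv:1, cmask:8,
                             event2:4, __reserved2:4, go:1, ho:1;
            } bits;
            uint64_t value;
    };

    #define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value

    int main(void)
    {
            /* .event=0xc0, .inv=1, .cmask=16 is the readable spelling of the
             * old magic constant 0x108000c0 (event 0xc0, inv bit 23, cmask 0x10
             * in bits 24-31). */
            assert(X86_CONFIG(.event=0xc0, .inv=1, .cmask=16) == 0x108000c0ULL);
            return 0;
    }
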
@@ -307,10 +335,19 @@ struct x86_pmu {
         struct x86_pmu_quirk *quirks;
         int             perfctr_second_write;
 
+        /*
+         * sysfs attrs
+         */
+        int             attr_rdpmc;
+
+        /*
+         * CPU Hotplug hooks
+         */
         int             (*cpu_prepare)(int cpu);
         void            (*cpu_starting)(int cpu);
         void            (*cpu_dying)(int cpu);
         void            (*cpu_dead)(int cpu);
+        void            (*flush_branch_stack)(void);
 
         /*
          * Intel Arch Perfmon v2+
@@ -332,6 +369,8 @@ struct x86_pmu {
          */
         unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
         int             lbr_nr;                    /* hardware stack size */
+        u64             lbr_sel_mask;              /* LBR_SELECT valid bits */
+        const int       *lbr_sel_map;              /* lbr_select mappings */
 
         /*
          * Extra registers for events
@@ -417,9 +456,11 @@ void x86_pmu_disable_all(void);
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
                                           u64 enable_mask)
 {
+        u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+
         if (hwc->extra_reg.reg)
                 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
-        wrmsrl(hwc->config_base, hwc->config | enable_mask);
+        wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
 }
 
 void x86_pmu_enable_all(int added);
@@ -443,6 +484,15 @@ extern struct event_constraint emptyconstraint;
 
 extern struct event_constraint unconstrained;
 
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+        return ip > PAGE_OFFSET;
+#else
+        return (long)ip < 0;
+#endif
+}
+
 #ifdef CONFIG_CPU_SUP_AMD
 
 int amd_pmu_init(void);
@@ -523,6 +573,10 @@ void intel_pmu_lbr_init_nhm(void);
 
 void intel_pmu_lbr_init_atom(void);
 
+void intel_pmu_lbr_init_snb(void);
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event);
+
 int p4_pmu_init(void);
 
 int p6_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 0397b23be8e9..dd002faff7a6 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,4 +1,5 @@
 #include <linux/perf_event.h>
+#include <linux/export.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -138,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event)
         if (ret)
                 return ret;
 
+        if (has_branch_stack(event))
+                return -EOPNOTSUPP;
+
         if (event->attr.exclude_host && event->attr.exclude_guest)
                 /*
                  * When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -357,7 +361,9 @@ static void amd_pmu_cpu_starting(int cpu)
         struct amd_nb *nb;
         int i, nb_id;
 
-        if (boot_cpu_data.x86_max_cores < 2)
+        cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+
+        if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15)
                 return;
 
         nb_id = amd_get_nb_id(cpu);
@@ -587,9 +593,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
         .put_event_constraints  = amd_put_event_constraints,
 
         .cpu_prepare            = amd_pmu_cpu_prepare,
-        .cpu_starting           = amd_pmu_cpu_starting,
         .cpu_dead               = amd_pmu_cpu_dead,
 #endif
+        .cpu_starting           = amd_pmu_cpu_starting,
 };
 
 __init int amd_pmu_init(void)
@@ -621,3 +627,33 @@ __init int amd_pmu_init(void)
 
         return 0;
 }
+
+void amd_pmu_enable_virt(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+        cpuc->perf_ctr_virt_mask = 0;
+
+        /* Reload all events */
+        x86_pmu_disable_all();
+        x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+        /*
+         * We only mask out the Host-only bit so that host-only counting works
+         * when SVM is disabled. If someone sets up a guest-only counter when
+         * SVM is disabled the Guest-only bits still gets set and the counter
+         * will not count anything.
+         */
+        cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+
+        /* Reload all events */
+        x86_pmu_disable_all();
+        x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
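
These two exports are meant to be paired with SVM enable/disable on the host, since the Host-only/Guest-only event-select bits are only meaningful while SVM is active. A hedged sketch of the intended call pattern from a hypervisor module; the surrounding function names and the header providing the declarations are assumptions for illustration, not taken from this diff:

    #include <asm/perf_event.h>     /* assumed home of the amd_pmu_*_virt() declarations */

    /* Illustrative only: roughly how an SVM host would bracket the
     * EFER.SVME toggle so host-only/guest-only counters behave correctly. */
    static void example_svm_hardware_enable(void)
    {
            /* ... set EFER.SVME, program the host save area ... */
            amd_pmu_enable_virt();  /* clear perf_ctr_virt_mask: HO/GO now honoured */
    }

    static void example_svm_hardware_disable(void)
    {
            amd_pmu_disable_virt(); /* mask the Host-only bit again */
            /* ... clear EFER.SVME ... */
    }
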
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3bd37bdf1b8e..6a84e7f28f05 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -385,14 +385,15 @@ static __initconst const u64 westmere_hw_cache_event_ids
 #define NHM_LOCAL_DRAM          (1 << 14)
 #define NHM_NON_DRAM            (1 << 15)
 
-#define NHM_ALL_DRAM            (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
+#define NHM_LOCAL               (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_REMOTE              (NHM_REMOTE_DRAM)
 
 #define NHM_DMND_READ           (NHM_DMND_DATA_RD)
 #define NHM_DMND_WRITE          (NHM_DMND_RFO|NHM_DMND_WB)
 #define NHM_DMND_PREFETCH       (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
 
 #define NHM_L3_HIT      (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
-#define NHM_L3_MISS     (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_MISS     (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
 #define NHM_L3_ACCESS   (NHM_L3_HIT|NHM_L3_MISS)
 
 static __initconst const u64 nehalem_hw_cache_extra_regs
@@ -416,16 +417,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
         },
  [ C(NODE) ] = {
         [ C(OP_READ) ] = {
-                [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
-                [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
+                [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
+                [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE,
         },
         [ C(OP_WRITE) ] = {
-                [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
-                [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
+                [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
+                [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE,
         },
         [ C(OP_PREFETCH) ] = {
-                [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
-                [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
+                [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
+                [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE,
         },
  },
 };
@@ -727,6 +728,19 @@ static __initconst const u64 atom_hw_cache_event_ids
         },
 };
 
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+        /* user explicitly requested branch sampling */
+        if (has_branch_stack(event))
+                return true;
+
+        /* implicit branch sampling to correct PEBS skid */
+        if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+                return true;
+
+        return false;
+}
+
 static void intel_pmu_disable_all(void)
 {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -881,6 +895,13 @@ static void intel_pmu_disable_event(struct perf_event *event)
         cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
         cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
 
+        /*
+         * must disable before any actual event
+         * because any event may be combined with LBR
+         */
+        if (intel_pmu_needs_lbr_smpl(event))
+                intel_pmu_lbr_disable(event);
+
         if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                 intel_pmu_disable_fixed(hwc);
                 return;
@@ -935,6 +956,12 @@ static void intel_pmu_enable_event(struct perf_event *event)
                 intel_pmu_enable_bts(hwc->config);
                 return;
         }
+        /*
+         * must enabled before any actual event
+         * because any event may be combined with LBR
+         */
+        if (intel_pmu_needs_lbr_smpl(event))
+                intel_pmu_lbr_enable(event);
 
         if (event->attr.exclude_host)
                 cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
@@ -1057,6 +1084,9 @@ again:
 
                 data.period = event->hw.last_period;
 
+                if (has_branch_stack(event))
+                        data.br_stack = &cpuc->lbr_stack;
+
                 if (perf_event_overflow(event, &data, regs))
                         x86_pmu_stop(event, 0);
         }
@@ -1123,17 +1153,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
  */
 static struct event_constraint *
 __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
-                                   struct perf_event *event)
+                                   struct perf_event *event,
+                                   struct hw_perf_event_extra *reg)
 {
         struct event_constraint *c = &emptyconstraint;
-        struct hw_perf_event_extra *reg = &event->hw.extra_reg;
         struct er_account *era;
         unsigned long flags;
         int orig_idx = reg->idx;
 
         /* already allocated shared msr */
         if (reg->alloc)
-                return &unconstrained;
+                return NULL; /* call x86_get_event_constraint() */
 
 again:
         era = &cpuc->shared_regs->regs[reg->idx];
@@ -1156,14 +1186,10 @@ again:
                 reg->alloc = 1;
 
                 /*
-                 * All events using extra_reg are unconstrained.
-                 * Avoids calling x86_get_event_constraints()
-                 *
-                 * Must revisit if extra_reg controlling events
-                 * ever have constraints. Worst case we go through
-                 * the regular event constraint table.
+                 * need to call x86_get_event_constraint()
+                 * to check if associated event has constraints
                  */
-                c = &unconstrained;
+                c = NULL;
         } else if (intel_try_alt_er(event, orig_idx)) {
                 raw_spin_unlock_irqrestore(&era->lock, flags);
                 goto again;
@@ -1200,11 +1226,23 @@ static struct event_constraint *
 intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
                               struct perf_event *event)
 {
-        struct event_constraint *c = NULL;
-
-        if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
-                c = __intel_shared_reg_get_constraints(cpuc, event);
-
+        struct event_constraint *c = NULL, *d;
+        struct hw_perf_event_extra *xreg, *breg;
+
+        xreg = &event->hw.extra_reg;
+        if (xreg->idx != EXTRA_REG_NONE) {
+                c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
+                if (c == &emptyconstraint)
+                        return c;
+        }
+        breg = &event->hw.branch_reg;
+        if (breg->idx != EXTRA_REG_NONE) {
+                d = __intel_shared_reg_get_constraints(cpuc, event, breg);
+                if (d == &emptyconstraint) {
+                        __intel_shared_reg_put_constraints(cpuc, xreg);
+                        c = d;
+                }
+        }
         return c;
 }
 
@@ -1252,6 +1290,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
         reg = &event->hw.extra_reg;
         if (reg->idx != EXTRA_REG_NONE)
                 __intel_shared_reg_put_constraints(cpuc, reg);
+
+        reg = &event->hw.branch_reg;
+        if (reg->idx != EXTRA_REG_NONE)
+                __intel_shared_reg_put_constraints(cpuc, reg);
 }
 
 static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
@@ -1287,12 +1329,19 @@ static int intel_pmu_hw_config(struct perf_event *event)
                  *
                  * Thereby we gain a PEBS capable cycle counter.
                  */
-                u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
+                u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+
 
                 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
                 event->hw.config = alt_config;
         }
 
+        if (intel_pmu_needs_lbr_smpl(event)) {
+                ret = intel_pmu_setup_lbr_filter(event);
+                if (ret)
+                        return ret;
+        }
+
         if (event->attr.type != PERF_TYPE_RAW)
                 return 0;
 
@@ -1431,7 +1480,7 @@ static int intel_pmu_cpu_prepare(int cpu)
 {
         struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 
-        if (!x86_pmu.extra_regs)
+        if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
                 return NOTIFY_OK;
 
         cpuc->shared_regs = allocate_shared_regs(cpu);
@@ -1453,22 +1502,28 @@ static void intel_pmu_cpu_starting(int cpu)
          */
         intel_pmu_lbr_reset();
 
-        if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
+        cpuc->lbr_sel = NULL;
+
+        if (!cpuc->shared_regs)
                 return;
 
-        for_each_cpu(i, topology_thread_cpumask(cpu)) {
-                struct intel_shared_regs *pc;
+        if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+                for_each_cpu(i, topology_thread_cpumask(cpu)) {
+                        struct intel_shared_regs *pc;
 
                         pc = per_cpu(cpu_hw_events, i).shared_regs;
                         if (pc && pc->core_id == core_id) {
                                 cpuc->kfree_on_online = cpuc->shared_regs;
                                 cpuc->shared_regs = pc;
                                 break;
+                        }
                 }
+                cpuc->shared_regs->core_id = core_id;
+                cpuc->shared_regs->refcnt++;
         }
 
-        cpuc->shared_regs->core_id = core_id;
-        cpuc->shared_regs->refcnt++;
+        if (x86_pmu.lbr_sel_map)
+                cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
 }
 
 static void intel_pmu_cpu_dying(int cpu)
@@ -1486,6 +1541,18 @@ static void intel_pmu_cpu_dying(int cpu)
         fini_debug_store_on_cpu(cpu);
 }
 
+static void intel_pmu_flush_branch_stack(void)
+{
+        /*
+         * Intel LBR does not tag entries with the
+         * PID of the current task, then we need to
+         * flush it on ctxsw
+         * For now, we simply reset it
+         */
+        if (x86_pmu.lbr_nr)
+                intel_pmu_lbr_reset();
+}
+
 static __initconst const struct x86_pmu intel_pmu = {
         .name                   = "Intel",
         .handle_irq             = intel_pmu_handle_irq,
@@ -1513,6 +1580,7 @@ static __initconst const struct x86_pmu intel_pmu = {
         .cpu_starting           = intel_pmu_cpu_starting,
         .cpu_dying              = intel_pmu_cpu_dying,
         .guest_get_msrs         = intel_guest_get_msrs,
+        .flush_branch_stack     = intel_pmu_flush_branch_stack,
 };
 
 static __init void intel_clovertown_quirk(void)
@@ -1689,9 +1757,11 @@ __init int intel_pmu_init(void)
                 x86_pmu.extra_regs = intel_nehalem_extra_regs;
 
                 /* UOPS_ISSUED.STALLED_CYCLES */
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                        X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
                 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                        X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
                 x86_add_quirk(intel_nehalem_quirk);
 
@@ -1726,9 +1796,11 @@ __init int intel_pmu_init(void)
                 x86_pmu.er_flags |= ERF_HAS_RSP_1;
 
                 /* UOPS_ISSUED.STALLED_CYCLES */
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                        X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
                 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                        X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
                 pr_cont("Westmere events, ");
                 break;
@@ -1739,7 +1811,7 @@ __init int intel_pmu_init(void)
                 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
                        sizeof(hw_cache_event_ids));
 
-                intel_pmu_lbr_init_nhm();
+                intel_pmu_lbr_init_snb();
 
                 x86_pmu.event_constraints = intel_snb_event_constraints;
                 x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
@@ -1749,9 +1821,11 @@ __init int intel_pmu_init(void)
                 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
 
                 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                        X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
                 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
-                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
+                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                        X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
 
                 pr_cont("SandyBridge events, ");
                 break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index d6bd49faa40c..7f64df19e7dd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -3,6 +3,7 @@
 #include <linux/slab.h>
 
 #include <asm/perf_event.h>
+#include <asm/insn.h>
 
 #include "perf_event.h"
 
@@ -439,9 +440,6 @@ void intel_pmu_pebs_enable(struct perf_event *event)
         hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
         cpuc->pebs_enabled |= 1ULL << hwc->idx;
-
-        if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
-                intel_pmu_lbr_enable(event);
 }
 
 void intel_pmu_pebs_disable(struct perf_event *event)
@@ -454,9 +452,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
         wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
         hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
-
-        if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
-                intel_pmu_lbr_disable(event);
 }
 
 void intel_pmu_pebs_enable_all(void)
@@ -475,17 +470,6 @@ void intel_pmu_pebs_disable_all(void)
         wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 }
 
-#include <asm/insn.h>
-
-static inline bool kernel_ip(unsigned long ip)
-{
-#ifdef CONFIG_X86_32
-        return ip > PAGE_OFFSET;
-#else
-        return (long)ip < 0;
-#endif
-}
-
 static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -572,6 +556,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
          * both formats and we don't use the other fields in this
          * routine.
          */
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct pebs_record_core *pebs = __pebs;
         struct perf_sample_data data;
         struct pt_regs regs;
@@ -602,6 +587,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
         else
                 regs.flags &= ~PERF_EFLAGS_EXACT;
 
+        if (has_branch_stack(event))
+                data.br_stack = &cpuc->lbr_stack;
+
         if (perf_event_overflow(event, &data, &regs))
                 x86_pmu_stop(event, 0);
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 47a7e63bfe54..520b4265fcd2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -3,6 +3,7 @@
 
 #include <asm/perf_event.h>
 #include <asm/msr.h>
+#include <asm/insn.h>
 
 #include "perf_event.h"
 
@@ -14,6 +15,100 @@ enum {
 };
 
 /*
+ * Intel LBR_SELECT bits
+ * Intel Vol3a, April 2011, Section 16.7 Table 16-10
+ *
+ * Hardware branch filter (not available on all CPUs)
+ */
+#define LBR_KERNEL_BIT          0 /* do not capture at ring0 */
+#define LBR_USER_BIT            1 /* do not capture at ring > 0 */
+#define LBR_JCC_BIT             2 /* do not capture conditional branches */
+#define LBR_REL_CALL_BIT        3 /* do not capture relative calls */
+#define LBR_IND_CALL_BIT        4 /* do not capture indirect calls */
+#define LBR_RETURN_BIT          5 /* do not capture near returns */
+#define LBR_IND_JMP_BIT         6 /* do not capture indirect jumps */
+#define LBR_REL_JMP_BIT         7 /* do not capture relative jumps */
+#define LBR_FAR_BIT             8 /* do not capture far branches */
+
+#define LBR_KERNEL      (1 << LBR_KERNEL_BIT)
+#define LBR_USER        (1 << LBR_USER_BIT)
+#define LBR_JCC         (1 << LBR_JCC_BIT)
+#define LBR_REL_CALL    (1 << LBR_REL_CALL_BIT)
+#define LBR_IND_CALL    (1 << LBR_IND_CALL_BIT)
+#define LBR_RETURN      (1 << LBR_RETURN_BIT)
+#define LBR_REL_JMP     (1 << LBR_REL_JMP_BIT)
+#define LBR_IND_JMP     (1 << LBR_IND_JMP_BIT)
+#define LBR_FAR         (1 << LBR_FAR_BIT)
+
+#define LBR_PLM (LBR_KERNEL | LBR_USER)
+
+#define LBR_SEL_MASK    0x1ff   /* valid bits in LBR_SELECT */
+#define LBR_NOT_SUPP    -1      /* LBR filter not supported */
+#define LBR_IGN         0       /* ignored */
+
+#define LBR_ANY          \
+        (LBR_JCC        |\
+         LBR_REL_CALL   |\
+         LBR_IND_CALL   |\
+         LBR_RETURN     |\
+         LBR_REL_JMP    |\
+         LBR_IND_JMP    |\
+         LBR_FAR)
+
+#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+
+#define for_each_branch_sample_type(x) \
+        for ((x) = PERF_SAMPLE_BRANCH_USER; \
+             (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
+
+/*
+ * x86control flow change classification
+ * x86control flow changes include branches, interrupts, traps, faults
+ */
+enum {
+        X86_BR_NONE     = 0,      /* unknown */
+
+        X86_BR_USER     = 1 << 0, /* branch target is user */
+        X86_BR_KERNEL   = 1 << 1, /* branch target is kernel */
+
+        X86_BR_CALL     = 1 << 2, /* call */
+        X86_BR_RET      = 1 << 3, /* return */
+        X86_BR_SYSCALL  = 1 << 4, /* syscall */
+        X86_BR_SYSRET   = 1 << 5, /* syscall return */
+        X86_BR_INT      = 1 << 6, /* sw interrupt */
+        X86_BR_IRET     = 1 << 7, /* return from interrupt */
+        X86_BR_JCC      = 1 << 8, /* conditional */
+        X86_BR_JMP      = 1 << 9, /* jump */
+        X86_BR_IRQ      = 1 << 10,/* hw interrupt or trap or fault */
+        X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+
+#define X86_BR_ANY       \
+        (X86_BR_CALL    |\
+         X86_BR_RET     |\
+         X86_BR_SYSCALL |\
+         X86_BR_SYSRET  |\
+         X86_BR_INT     |\
+         X86_BR_IRET    |\
+         X86_BR_JCC     |\
+         X86_BR_JMP     |\
+         X86_BR_IRQ     |\
+         X86_BR_IND_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL          \
+        (X86_BR_CALL            |\
+         X86_BR_IND_CALL        |\
+         X86_BR_SYSCALL         |\
+         X86_BR_IRQ             |\
+         X86_BR_INT)
+
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+
+/*
  * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
  * otherwise it becomes near impossible to get a reliable stack.
  */
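
To make the suppress-mode encoding concrete: a requested set of branch classes is OR-ed together from the LBR_* bits above and then inverted against LBR_SEL_MASK before being written to MSR_LBR_SELECT, exactly as intel_pmu_setup_hw_lbr_filter() does later in this file. A standalone sketch (constants copied from the defines above; the example mapping mirrors the Sandy Bridge table, so treat it as illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* Copies of the defines introduced above, for illustration only. */
    #define LBR_REL_CALL    (1 << 3)
    #define LBR_IND_CALL    (1 << 4)
    #define LBR_FAR         (1 << 8)
    #define LBR_SEL_MASK    0x1ff

    /* LBR_SELECT is a suppress-mode register: set bits *disable* a branch
     * class, so the wanted classes are OR-ed together and then inverted. */
    static uint64_t lbr_select_for(uint64_t wanted_classes)
    {
            return ~wanted_classes & LBR_SEL_MASK;
    }

    int main(void)
    {
            /* e.g. PERF_SAMPLE_BRANCH_ANY_CALL on SNB maps to CALL|IND_CALL|FAR */
            uint64_t mask = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR;

            printf("MSR_LBR_SELECT = %#llx\n",
                   (unsigned long long)lbr_select_for(mask));
            return 0;
    }
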
@@ -21,6 +116,10 @@ enum {
 static void __intel_pmu_lbr_enable(void)
 {
         u64 debugctl;
+        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+        if (cpuc->lbr_sel)
+                wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
 
         rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
         debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
@@ -76,11 +175,11 @@ void intel_pmu_lbr_enable(struct perf_event *event)
          * Reset the LBR stack if we changed task context to
          * avoid data leaks.
          */
-
         if (event->ctx->task && cpuc->lbr_context != event->ctx) {
                 intel_pmu_lbr_reset();
                 cpuc->lbr_context = event->ctx;
         }
+        cpuc->br_sel = event->hw.branch_reg.reg;
 
         cpuc->lbr_users++;
 }
@@ -95,8 +194,11 @@ void intel_pmu_lbr_disable(struct perf_event *event)
         cpuc->lbr_users--;
         WARN_ON_ONCE(cpuc->lbr_users < 0);
 
-        if (cpuc->enabled && !cpuc->lbr_users)
+        if (cpuc->enabled && !cpuc->lbr_users) {
                 __intel_pmu_lbr_disable();
+                /* avoid stale pointer */
+                cpuc->lbr_context = NULL;
+        }
 }
 
 void intel_pmu_lbr_enable_all(void)
@@ -115,6 +217,9 @@ void intel_pmu_lbr_disable_all(void)
                 __intel_pmu_lbr_disable();
 }
 
+/*
+ * TOS = most recently recorded branch
+ */
 static inline u64 intel_pmu_lbr_tos(void)
 {
         u64 tos;
@@ -142,15 +247,15 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 
                 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
 
                 cpuc->lbr_entries[i].from       = msr_lastbranch.from;
                 cpuc->lbr_entries[i].to         = msr_lastbranch.to;
-                cpuc->lbr_entries[i].flags      = 0;
+                cpuc->lbr_entries[i].mispred    = 0;
+                cpuc->lbr_entries[i].predicted  = 0;
+                cpuc->lbr_entries[i].reserved   = 0;
         }
         cpuc->lbr_stack.nr = i;
 }
 
-#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
-
 /*
  * Due to lack of segmentation in Linux the effective address (offset)
  * is the same as the linear address, allowing us to merge the LIP and EIP
@@ -165,19 +270,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 
         for (i = 0; i < x86_pmu.lbr_nr; i++) {
                 unsigned long lbr_idx = (tos - i) & mask;
-                u64 from, to, flags = 0;
+                u64 from, to, mis = 0, pred = 0;
 
                 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
                 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
 
                 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
-                        flags = !!(from & LBR_FROM_FLAG_MISPRED);
+                        mis = !!(from & LBR_FROM_FLAG_MISPRED);
+                        pred = !mis;
                         from = (u64)((((s64)from) << 1) >> 1);
                 }
 
                 cpuc->lbr_entries[i].from       = from;
                 cpuc->lbr_entries[i].to         = to;
-                cpuc->lbr_entries[i].flags      = flags;
+                cpuc->lbr_entries[i].mispred    = mis;
+                cpuc->lbr_entries[i].predicted  = pred;
+                cpuc->lbr_entries[i].reserved   = 0;
         }
         cpuc->lbr_stack.nr = i;
 }
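
On the consumer side, the mispred/predicted bits recovered here surface in each struct perf_branch_entry of a PERF_SAMPLE_BRANCH_STACK sample. A hedged sketch of walking such a record in userspace (field names follow the uapi added with this series; decoding of the surrounding sample record is omitted):

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>

    /* 'nr' and 'entries' come from the PERF_SAMPLE_BRANCH_STACK portion of a
     * decoded sample record. */
    static void print_branch_stack(uint64_t nr, struct perf_branch_entry *entries)
    {
            uint64_t i;

            for (i = 0; i < nr; i++)
                    printf("%#llx -> %#llx %s\n",
                           (unsigned long long)entries[i].from,
                           (unsigned long long)entries[i].to,
                           entries[i].mispred ? "(mispredicted)" :
                           entries[i].predicted ? "(predicted)" : "");
    }
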
@@ -193,28 +301,404 @@ void intel_pmu_lbr_read(void)
193 intel_pmu_lbr_read_32(cpuc); 301 intel_pmu_lbr_read_32(cpuc);
194 else 302 else
195 intel_pmu_lbr_read_64(cpuc); 303 intel_pmu_lbr_read_64(cpuc);
304
305 intel_pmu_lbr_filter(cpuc);
306}
307
308/*
309 * SW filter is used:
310 * - in case there is no HW filter
311 * - in case the HW filter has errata or limitations
312 */
313static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
314{
315 u64 br_type = event->attr.branch_sample_type;
316 int mask = 0;
317
318 if (br_type & PERF_SAMPLE_BRANCH_USER)
319 mask |= X86_BR_USER;
320
321 if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
322 mask |= X86_BR_KERNEL;
323
324 /* we ignore BRANCH_HV here */
325
326 if (br_type & PERF_SAMPLE_BRANCH_ANY)
327 mask |= X86_BR_ANY;
328
329 if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
330 mask |= X86_BR_ANY_CALL;
331
332 if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
333 mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
334
335 if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
336 mask |= X86_BR_IND_CALL;
337 /*
338 * stash actual user request into reg, it may
339 * be used by fixup code for some CPU
340 */
341 event->hw.branch_reg.reg = mask;
342}
343
344/*
345 * setup the HW LBR filter
346 * Used only when available, may not be enough to disambiguate
347 * all branches, may need the help of the SW filter
348 */
349static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
350{
351 struct hw_perf_event_extra *reg;
352 u64 br_type = event->attr.branch_sample_type;
353 u64 mask = 0, m;
354 u64 v;
355
356 for_each_branch_sample_type(m) {
357 if (!(br_type & m))
358 continue;
359
360 v = x86_pmu.lbr_sel_map[m];
361 if (v == LBR_NOT_SUPP)
362 return -EOPNOTSUPP;
363
364 if (v != LBR_IGN)
365 mask |= v;
366 }
367 reg = &event->hw.branch_reg;
368 reg->idx = EXTRA_REG_LBR;
369
370 /* LBR_SELECT operates in suppress mode so invert mask */
371 reg->config = ~mask & x86_pmu.lbr_sel_mask;
372
373 return 0;
374}
375
376int intel_pmu_setup_lbr_filter(struct perf_event *event)
377{
378 int ret = 0;
379
380 /*
381 * no LBR on this PMU
382 */
383 if (!x86_pmu.lbr_nr)
384 return -EOPNOTSUPP;
385
386 /*
387 * setup SW LBR filter
388 */
389 intel_pmu_setup_sw_lbr_filter(event);
390
391 /*
392 * setup HW LBR filter, if any
393 */
394 if (x86_pmu.lbr_sel_map)
395 ret = intel_pmu_setup_hw_lbr_filter(event);
396
397 return ret;
196} 398}
197 399
400/*
401 * return the type of control flow change at address "from"
402 * intruction is not necessarily a branch (in case of interrupt).
403 *
404 * The branch type returned also includes the priv level of the
405 * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
406 *
407 * If a branch type is unknown OR the instruction cannot be
408 * decoded (e.g., text page not present), then X86_BR_NONE is
409 * returned.
410 */
411static int branch_type(unsigned long from, unsigned long to)
412{
413 struct insn insn;
414 void *addr;
415 int bytes, size = MAX_INSN_SIZE;
416 int ret = X86_BR_NONE;
417 int ext, to_plm, from_plm;
418 u8 buf[MAX_INSN_SIZE];
419 int is64 = 0;
420
421 to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
422 from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
423
424 /*
425 * maybe zero if lbr did not fill up after a reset by the time
426 * we get a PMU interrupt
427 */
428 if (from == 0 || to == 0)
429 return X86_BR_NONE;
430
431 if (from_plm == X86_BR_USER) {
432 /*
433 * can happen if measuring at the user level only
434 * and we interrupt in a kernel thread, e.g., idle.
435 */
436 if (!current->mm)
437 return X86_BR_NONE;
438
439 /* may fail if text not present */
440 bytes = copy_from_user_nmi(buf, (void __user *)from, size);
441 if (bytes != size)
442 return X86_BR_NONE;
443
444 addr = buf;
445 } else
446 addr = (void *)from;
447
448 /*
449 * decoder needs to know the ABI especially
450 * on 64-bit systems running 32-bit apps
451 */
452#ifdef CONFIG_X86_64
453 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
454#endif
455 insn_init(&insn, addr, is64);
456 insn_get_opcode(&insn);
457
458 switch (insn.opcode.bytes[0]) {
459 case 0xf:
460 switch (insn.opcode.bytes[1]) {
461 case 0x05: /* syscall */
462 case 0x34: /* sysenter */
463 ret = X86_BR_SYSCALL;
464 break;
465 case 0x07: /* sysret */
466 case 0x35: /* sysexit */
467 ret = X86_BR_SYSRET;
468 break;
469 case 0x80 ... 0x8f: /* conditional */
470 ret = X86_BR_JCC;
471 break;
472 default:
473 ret = X86_BR_NONE;
474 }
475 break;
476 case 0x70 ... 0x7f: /* conditional */
477 ret = X86_BR_JCC;
478 break;
479 case 0xc2: /* near ret */
480 case 0xc3: /* near ret */
481 case 0xca: /* far ret */
482 case 0xcb: /* far ret */
483 ret = X86_BR_RET;
484 break;
485 case 0xcf: /* iret */
486 ret = X86_BR_IRET;
487 break;
488 case 0xcc ... 0xce: /* int */
489 ret = X86_BR_INT;
490 break;
491 case 0xe8: /* call near rel */
492 case 0x9a: /* call far absolute */
493 ret = X86_BR_CALL;
494 break;
495 case 0xe0 ... 0xe3: /* loop jmp */
496 ret = X86_BR_JCC;
497 break;
498 case 0xe9 ... 0xeb: /* jmp */
499 ret = X86_BR_JMP;
500 break;
501 case 0xff: /* call near absolute, call far absolute ind */
502 insn_get_modrm(&insn);
503 ext = (insn.modrm.bytes[0] >> 3) & 0x7;
504 switch (ext) {
505 case 2: /* near ind call */
506 case 3: /* far ind call */
507 ret = X86_BR_IND_CALL;
508 break;
509 case 4:
510 case 5:
511 ret = X86_BR_JMP;
512 break;
513 }
514 break;
515 default:
516 ret = X86_BR_NONE;
517 }
518 /*
519 * interrupts, traps, faults (and thus ring transition) may
520 * occur on any instructions. Thus, to classify them correctly,
521 * we need to first look at the from and to priv levels. If they
522 * are different and to is in the kernel, then it indicates
523 * a ring transition. If the from instruction is not a ring
524 * transition instr (syscall, systenter, int), then it means
525 * it was a irq, trap or fault.
526 *
527 * we have no way of detecting kernel to kernel faults.
528 */
529 if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
530 && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
531 ret = X86_BR_IRQ;
532
533 /*
534 * branch priv level determined by target as
535 * is done by HW when LBR_SELECT is implemented
536 */
537 if (ret != X86_BR_NONE)
538 ret |= to_plm;
539
540 return ret;
541}
542
543/*
544 * implement actual branch filter based on user demand.
545 * Hardware may not exactly satisfy that request, thus
546 * we need to inspect opcodes. Mismatched branches are
547 * discarded. Therefore, the number of branches returned
548 * in PERF_SAMPLE_BRANCH_STACK sample may vary.
549 */
550static void
551intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
552{
553 u64 from, to;
554 int br_sel = cpuc->br_sel;
555 int i, j, type;
556 bool compress = false;
557
558 /* if sampling all branches, then nothing to filter */
559 if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
560 return;
561
562 for (i = 0; i < cpuc->lbr_stack.nr; i++) {
563
564 from = cpuc->lbr_entries[i].from;
565 to = cpuc->lbr_entries[i].to;
566
567 type = branch_type(from, to);
568
569 /* if type does not correspond, then discard */
570 if (type == X86_BR_NONE || (br_sel & type) != type) {
571 cpuc->lbr_entries[i].from = 0;
572 compress = true;
573 }
574 }
575
576 if (!compress)
577 return;
578
579 /* remove all entries with from=0 */
580 for (i = 0; i < cpuc->lbr_stack.nr; ) {
581 if (!cpuc->lbr_entries[i].from) {
582 j = i;
583 while (++j < cpuc->lbr_stack.nr)
584 cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
585 cpuc->lbr_stack.nr--;
586 if (!cpuc->lbr_entries[i].from)
587 continue;
588 }
589 i++;
590 }
591}
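
The removal loop above keeps surviving entries in order by shifting the tail down whenever it finds a discarded record. The same result can be obtained with a simpler write-cursor compaction; the sketch below is stand-alone and only mirrors the two fields it needs from struct perf_branch_entry.

struct lbr_rec { unsigned long long from, to; };

/* keep every entry whose 'from' survived the filter, preserving order */
static int compact_lbr(struct lbr_rec *e, int nr)
{
	int i, out = 0;

	for (i = 0; i < nr; i++)
		if (e[i].from)
			e[out++] = e[i];

	return out;	/* becomes the new lbr_stack.nr */
}
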
592
593/*
594 * Map interface branch filters onto LBR filters
595 */
596static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
597 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
598 [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
599 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
600 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
601 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
602 | LBR_IND_JMP | LBR_FAR,
603 /*
604 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
605 */
606 [PERF_SAMPLE_BRANCH_ANY_CALL] =
607 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
608 /*
609 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
610 */
611 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
612};
613
614static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
615 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
616 [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
617 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
618 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
619 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
620 [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
621 | LBR_FAR,
622 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
623};
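
The maps are indexed by the PERF_SAMPLE_BRANCH_* bit value itself, so turning a user's branch_sample_type into an LBR_SELECT candidate amounts to walking the set bits and OR-ing the mapped LBR_* bits together. The helper below is a hedged sketch of that translation (kernel-style types assumed, function name illustrative); the in-tree setup code additionally rejects entries the hardware cannot express and masks the result against LBR_SEL_MASK.

static u64 lbr_map_branch_type(u64 br_type, const int *sel_map)
{
	u64 m, mask = 0;

	/* sel_map is one of the tables above, indexed by the bit value */
	for (m = PERF_SAMPLE_BRANCH_USER; m < PERF_SAMPLE_BRANCH_MAX; m <<= 1)
		if (br_type & m)
			mask |= sel_map[m];

	return mask;	/* candidate value for MSR_LBR_SELECT */
}
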
624
625/* core */
198void intel_pmu_lbr_init_core(void) 626void intel_pmu_lbr_init_core(void)
199{ 627{
200 x86_pmu.lbr_nr = 4; 628 x86_pmu.lbr_nr = 4;
201 x86_pmu.lbr_tos = 0x01c9; 629 x86_pmu.lbr_tos = MSR_LBR_TOS;
202 x86_pmu.lbr_from = 0x40; 630 x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
203 x86_pmu.lbr_to = 0x60; 631 x86_pmu.lbr_to = MSR_LBR_CORE_TO;
632
633 /*
634 * SW branch filter usage:
635 * - compensate for lack of HW filter
636 */
637 pr_cont("4-deep LBR, ");
204} 638}
205 639
640/* nehalem/westmere */
206void intel_pmu_lbr_init_nhm(void) 641void intel_pmu_lbr_init_nhm(void)
207{ 642{
208 x86_pmu.lbr_nr = 16; 643 x86_pmu.lbr_nr = 16;
209 x86_pmu.lbr_tos = 0x01c9; 644 x86_pmu.lbr_tos = MSR_LBR_TOS;
210 x86_pmu.lbr_from = 0x680; 645 x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
211 x86_pmu.lbr_to = 0x6c0; 646 x86_pmu.lbr_to = MSR_LBR_NHM_TO;
647
648 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
649 x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
650
651 /*
652 * SW branch filter usage:
653 * - workaround LBR_SEL errata (see above)
654 * - support syscall, sysret capture.
655 * That requires LBR_FAR but that means far
656	 * jmps need to be filtered out
657 */
658 pr_cont("16-deep LBR, ");
659}
660
661/* sandy bridge */
662void intel_pmu_lbr_init_snb(void)
663{
664 x86_pmu.lbr_nr = 16;
665 x86_pmu.lbr_tos = MSR_LBR_TOS;
666 x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
667 x86_pmu.lbr_to = MSR_LBR_NHM_TO;
668
669 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
670 x86_pmu.lbr_sel_map = snb_lbr_sel_map;
671
672 /*
673 * SW branch filter usage:
674 * - support syscall, sysret capture.
675 * That requires LBR_FAR but that means far
676	 * jmps need to be filtered out
677 */
678 pr_cont("16-deep LBR, ");
212} 679}
213 680
681/* atom */
214void intel_pmu_lbr_init_atom(void) 682void intel_pmu_lbr_init_atom(void)
215{ 683{
684 /*
685	 * only models starting at stepping 10 seem
686 * to have an operational LBR which can freeze
687 * on PMU interrupt
688 */
689 if (boot_cpu_data.x86_mask < 10) {
690 pr_cont("LBR disabled due to erratum");
691 return;
692 }
693
216 x86_pmu.lbr_nr = 8; 694 x86_pmu.lbr_nr = 8;
217 x86_pmu.lbr_tos = 0x01c9; 695 x86_pmu.lbr_tos = MSR_LBR_TOS;
218 x86_pmu.lbr_from = 0x40; 696 x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
219 x86_pmu.lbr_to = 0x60; 697 x86_pmu.lbr_to = MSR_LBR_CORE_TO;
698
699 /*
700 * SW branch filter usage:
701 * - compensate for lack of HW filter
702 */
703 pr_cont("8-deep LBR, ");
220} 704}
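
From user space the filtering above is reached through perf_event_open(). The sketch below shows a minimal attribute setup; it is a hedged example (field and flag names follow the uapi added in the same cycle, error handling and the mmap/read-out path are omitted).

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_branch_sampling(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	/*
	 * any taken branch, user level only; the software filter above
	 * enforces whatever LBR_SELECT cannot express in hardware
	 */
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
				  PERF_SAMPLE_BRANCH_USER;

	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}
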
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3fe8239fd8fb..1333d9851778 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1532,10 +1532,17 @@ ENTRY(nmi)
1532 pushq_cfi %rdx 1532 pushq_cfi %rdx
1533 1533
1534 /* 1534 /*
1535 * If %cs was not the kernel segment, then the NMI triggered in user
1536 * space, which means it is definitely not nested.
1537 */
1538 cmpl $__KERNEL_CS, 16(%rsp)
1539 jne first_nmi
1540
1541 /*
1535 * Check the special variable on the stack to see if NMIs are 1542 * Check the special variable on the stack to see if NMIs are
1536 * executing. 1543 * executing.
1537 */ 1544 */
1538 cmp $1, -8(%rsp) 1545 cmpl $1, -8(%rsp)
1539 je nested_nmi 1546 je nested_nmi
1540 1547
1541 /* 1548 /*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 40fc86161d92..58b7f27cb3e9 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -100,13 +100,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
100 irqctx->tinfo.task = curctx->tinfo.task; 100 irqctx->tinfo.task = curctx->tinfo.task;
101 irqctx->tinfo.previous_esp = current_stack_pointer; 101 irqctx->tinfo.previous_esp = current_stack_pointer;
102 102
103 /* 103 /* Copy the preempt_count so that the [soft]irq checks work. */
104 * Copy the softirq bits in preempt_count so that the 104 irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
105 * softirq checks work in the hardirq context.
106 */
107 irqctx->tinfo.preempt_count =
108 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
109 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
110 105
111 if (unlikely(overflow)) 106 if (unlikely(overflow))
112 call_on_stack(print_stack_overflow, isp); 107 call_on_stack(print_stack_overflow, isp);
@@ -196,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
196 if (unlikely(!desc)) 191 if (unlikely(!desc))
197 return false; 192 return false;
198 193
199 if (!execute_on_irq_stack(overflow, desc, irq)) { 194 if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
200 if (unlikely(overflow)) 195 if (unlikely(overflow))
201 print_stack_overflow(); 196 print_stack_overflow();
202 desc->handle_irq(irq, desc); 197 desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h
new file mode 100644
index 000000000000..3230b68ef29a
--- /dev/null
+++ b/arch/x86/kernel/kprobes-common.h
@@ -0,0 +1,102 @@
1#ifndef __X86_KERNEL_KPROBES_COMMON_H
2#define __X86_KERNEL_KPROBES_COMMON_H
3
4/* Kprobes and Optprobes common header */
5
6#ifdef CONFIG_X86_64
7#define SAVE_REGS_STRING \
8 /* Skip cs, ip, orig_ax. */ \
9 " subq $24, %rsp\n" \
10 " pushq %rdi\n" \
11 " pushq %rsi\n" \
12 " pushq %rdx\n" \
13 " pushq %rcx\n" \
14 " pushq %rax\n" \
15 " pushq %r8\n" \
16 " pushq %r9\n" \
17 " pushq %r10\n" \
18 " pushq %r11\n" \
19 " pushq %rbx\n" \
20 " pushq %rbp\n" \
21 " pushq %r12\n" \
22 " pushq %r13\n" \
23 " pushq %r14\n" \
24 " pushq %r15\n"
25#define RESTORE_REGS_STRING \
26 " popq %r15\n" \
27 " popq %r14\n" \
28 " popq %r13\n" \
29 " popq %r12\n" \
30 " popq %rbp\n" \
31 " popq %rbx\n" \
32 " popq %r11\n" \
33 " popq %r10\n" \
34 " popq %r9\n" \
35 " popq %r8\n" \
36 " popq %rax\n" \
37 " popq %rcx\n" \
38 " popq %rdx\n" \
39 " popq %rsi\n" \
40 " popq %rdi\n" \
41 /* Skip orig_ax, ip, cs */ \
42 " addq $24, %rsp\n"
43#else
44#define SAVE_REGS_STRING \
45 /* Skip cs, ip, orig_ax and gs. */ \
46 " subl $16, %esp\n" \
47 " pushl %fs\n" \
48 " pushl %es\n" \
49 " pushl %ds\n" \
50 " pushl %eax\n" \
51 " pushl %ebp\n" \
52 " pushl %edi\n" \
53 " pushl %esi\n" \
54 " pushl %edx\n" \
55 " pushl %ecx\n" \
56 " pushl %ebx\n"
57#define RESTORE_REGS_STRING \
58 " popl %ebx\n" \
59 " popl %ecx\n" \
60 " popl %edx\n" \
61 " popl %esi\n" \
62 " popl %edi\n" \
63 " popl %ebp\n" \
64 " popl %eax\n" \
65 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
66 " addl $24, %esp\n"
67#endif
68
69/* Check whether the instruction can be boosted */
70extern int can_boost(kprobe_opcode_t *instruction);
71/* Recover the instruction if the given address is probed */
72extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
73 unsigned long addr);
74/*
75 * Copy an instruction and adjust the displacement if the instruction
76 * uses the %rip-relative addressing mode.
77 */
78extern int __copy_instruction(u8 *dest, u8 *src);
79
80/* Generate a relative-jump/call instruction */
81extern void synthesize_reljump(void *from, void *to);
82extern void synthesize_relcall(void *from, void *to);
83
84#ifdef CONFIG_OPTPROBES
85extern int arch_init_optprobes(void);
86extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
87extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
88#else /* !CONFIG_OPTPROBES */
89static inline int arch_init_optprobes(void)
90{
91 return 0;
92}
93static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
94{
95 return 0;
96}
97static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
98{
99 return addr;
100}
101#endif
102#endif
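
The #else block exists so that kprobes.c can call the optprobe hooks unconditionally: with CONFIG_OPTPROBES disabled the static inline stubs return their defaults and the compiler discards the dead paths. The snippet below is a generic sketch of that pattern; CONFIG_FOO, foo_init and subsys_init are placeholder names, not kernel symbols.

#ifdef CONFIG_FOO
extern int foo_init(void);
#else
static inline int foo_init(void) { return 0; }	/* no-op fallback */
#endif

int subsys_init(void)
{
	/* compiles and links either way; no #ifdef at the call site */
	return foo_init();
}
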
diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c
new file mode 100644
index 000000000000..c5e410eed403
--- /dev/null
+++ b/arch/x86/kernel/kprobes-opt.c
@@ -0,0 +1,512 @@
1/*
2 * Kernel Probes Jump Optimization (Optprobes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 * Copyright (C) Hitachi Ltd., 2012
20 */
21#include <linux/kprobes.h>
22#include <linux/ptrace.h>
23#include <linux/string.h>
24#include <linux/slab.h>
25#include <linux/hardirq.h>
26#include <linux/preempt.h>
27#include <linux/module.h>
28#include <linux/kdebug.h>
29#include <linux/kallsyms.h>
30#include <linux/ftrace.h>
31
32#include <asm/cacheflush.h>
33#include <asm/desc.h>
34#include <asm/pgtable.h>
35#include <asm/uaccess.h>
36#include <asm/alternative.h>
37#include <asm/insn.h>
38#include <asm/debugreg.h>
39
40#include "kprobes-common.h"
41
42unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
43{
44 struct optimized_kprobe *op;
45 struct kprobe *kp;
46 long offs;
47 int i;
48
49 for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
50 kp = get_kprobe((void *)addr - i);
51 /* This function only handles jump-optimized kprobe */
52 if (kp && kprobe_optimized(kp)) {
53 op = container_of(kp, struct optimized_kprobe, kp);
54			/* If op->list is not empty, op is still being optimized */
55 if (list_empty(&op->list))
56 goto found;
57 }
58 }
59
60 return addr;
61found:
62 /*
63	 * If the kprobe is optimized, the original bytes are overwritten
64	 * by the jump destination address. In this case, the original
65	 * bytes must be recovered from the op->optinsn.copied_insn buffer.
66 */
67 memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
68 if (addr == (unsigned long)kp->addr) {
69 buf[0] = kp->opcode;
70 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
71 } else {
72 offs = addr - (unsigned long)kp->addr - 1;
73 memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
74 }
75
76 return (unsigned long)buf;
77}
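
A worked example of the offset arithmetic above, using the x86 constants RELATIVEJUMP_SIZE == 5 and RELATIVE_ADDR_SIZE == 4 (the byte names are illustrative):

/*
 * Optimizing a probe at kp->addr rewrites five bytes of kernel text:
 *
 *   before:  kp->addr:  orig0 orig1 orig2 orig3 orig4 ...
 *   after:   kp->addr:  0xe9  rel0  rel1  rel2  rel3  ...
 *
 * orig0 is kept in kp->opcode and orig1..orig4 in op->optinsn.copied_insn.
 * Recovering addr == kp->addr rebuilds orig0..orig4 in buf, while
 * recovering addr == kp->addr + 2 computes offs = 1 and copies
 * copied_insn[1..3] (the original bytes orig2..orig4) into buf.
 */
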
78
79/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
80static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
81{
82#ifdef CONFIG_X86_64
83 *addr++ = 0x48;
84 *addr++ = 0xbf;
85#else
86 *addr++ = 0xb8;
87#endif
88 *(unsigned long *)addr = val;
89}
90
91static void __used __kprobes kprobes_optinsn_template_holder(void)
92{
93 asm volatile (
94 ".global optprobe_template_entry\n"
95 "optprobe_template_entry:\n"
96#ifdef CONFIG_X86_64
97 /* We don't bother saving the ss register */
98 " pushq %rsp\n"
99 " pushfq\n"
100 SAVE_REGS_STRING
101 " movq %rsp, %rsi\n"
102 ".global optprobe_template_val\n"
103 "optprobe_template_val:\n"
104 ASM_NOP5
105 ASM_NOP5
106 ".global optprobe_template_call\n"
107 "optprobe_template_call:\n"
108 ASM_NOP5
109 /* Move flags to rsp */
110 " movq 144(%rsp), %rdx\n"
111 " movq %rdx, 152(%rsp)\n"
112 RESTORE_REGS_STRING
113 /* Skip flags entry */
114 " addq $8, %rsp\n"
115 " popfq\n"
116#else /* CONFIG_X86_32 */
117 " pushf\n"
118 SAVE_REGS_STRING
119 " movl %esp, %edx\n"
120 ".global optprobe_template_val\n"
121 "optprobe_template_val:\n"
122 ASM_NOP5
123 ".global optprobe_template_call\n"
124 "optprobe_template_call:\n"
125 ASM_NOP5
126 RESTORE_REGS_STRING
127 " addl $4, %esp\n" /* skip cs */
128 " popf\n"
129#endif
130 ".global optprobe_template_end\n"
131 "optprobe_template_end:\n");
132}
133
134#define TMPL_MOVE_IDX \
135 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
136#define TMPL_CALL_IDX \
137 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
138#define TMPL_END_IDX \
139 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
140
141#define INT3_SIZE sizeof(kprobe_opcode_t)
142
143/* Optimized kprobe call back function: called from optinsn */
144static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
145{
146 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
147 unsigned long flags;
148
149	/* This is possible if op is being delay-unoptimized */
150 if (kprobe_disabled(&op->kp))
151 return;
152
153 local_irq_save(flags);
154 if (kprobe_running()) {
155 kprobes_inc_nmissed_count(&op->kp);
156 } else {
157 /* Save skipped registers */
158#ifdef CONFIG_X86_64
159 regs->cs = __KERNEL_CS;
160#else
161 regs->cs = __KERNEL_CS | get_kernel_rpl();
162 regs->gs = 0;
163#endif
164 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
165 regs->orig_ax = ~0UL;
166
167 __this_cpu_write(current_kprobe, &op->kp);
168 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
169 opt_pre_handler(&op->kp, regs);
170 __this_cpu_write(current_kprobe, NULL);
171 }
172 local_irq_restore(flags);
173}
174
175static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
176{
177 int len = 0, ret;
178
179 while (len < RELATIVEJUMP_SIZE) {
180 ret = __copy_instruction(dest + len, src + len);
181 if (!ret || !can_boost(dest + len))
182 return -EINVAL;
183 len += ret;
184 }
185 /* Check whether the address range is reserved */
186 if (ftrace_text_reserved(src, src + len - 1) ||
187 alternatives_text_reserved(src, src + len - 1) ||
188 jump_label_text_reserved(src, src + len - 1))
189 return -EBUSY;
190
191 return len;
192}
193
194/* Check whether insn is indirect jump */
195static int __kprobes insn_is_indirect_jump(struct insn *insn)
196{
197 return ((insn->opcode.bytes[0] == 0xff &&
198 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
199 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
200}
201
202/* Check whether insn jumps into specified address range */
203static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
204{
205 unsigned long target = 0;
206
207 switch (insn->opcode.bytes[0]) {
208 case 0xe0: /* loopne */
209 case 0xe1: /* loope */
210 case 0xe2: /* loop */
211 case 0xe3: /* jcxz */
212 case 0xe9: /* near relative jump */
213 case 0xeb: /* short relative jump */
214 break;
215 case 0x0f:
216 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
217 break;
218 return 0;
219 default:
220 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
221 break;
222 return 0;
223 }
224 target = (unsigned long)insn->next_byte + insn->immediate.value;
225
226 return (start <= target && target <= start + len);
227}
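
For concreteness, the target arithmetic above applied to a short conditional jump (addresses and opcode bytes are illustrative):

/*
 * A two-byte "je +0x10" (74 10) decoded at address A has
 * insn->next_byte == A + 2 and insn->immediate.value == 0x10, so
 * target == A + 0x12.  can_optimize() calls this helper with
 * start == paddr + INT3_SIZE and len == RELATIVE_ADDR_SIZE, i.e. it
 * refuses to optimize if any branch in the function lands in the bytes
 * that the rel32 of the new jump would occupy.
 */
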
228
229/* Decode the whole function to ensure no instruction jumps into the target */
230static int __kprobes can_optimize(unsigned long paddr)
231{
232 unsigned long addr, size = 0, offset = 0;
233 struct insn insn;
234 kprobe_opcode_t buf[MAX_INSN_SIZE];
235
236 /* Lookup symbol including addr */
237 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
238 return 0;
239
240 /*
241 * Do not optimize in the entry code due to the unstable
242 * stack handling.
243 */
244 if ((paddr >= (unsigned long)__entry_text_start) &&
245 (paddr < (unsigned long)__entry_text_end))
246 return 0;
247
248 /* Check there is enough space for a relative jump. */
249 if (size - offset < RELATIVEJUMP_SIZE)
250 return 0;
251
252 /* Decode instructions */
253 addr = paddr - offset;
254 while (addr < paddr - offset + size) { /* Decode until function end */
255 if (search_exception_tables(addr))
256 /*
257			 * Since some fixup code will jump into this function,
258			 * we can't optimize a kprobe in this function.
259 */
260 return 0;
261 kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr));
262 insn_get_length(&insn);
263 /* Another subsystem puts a breakpoint */
264 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
265 return 0;
266 /* Recover address */
267 insn.kaddr = (void *)addr;
268 insn.next_byte = (void *)(addr + insn.length);
269		/* Check that no instruction jumps into the target */
270 if (insn_is_indirect_jump(&insn) ||
271 insn_jump_into_range(&insn, paddr + INT3_SIZE,
272 RELATIVE_ADDR_SIZE))
273 return 0;
274 addr += insn.length;
275 }
276
277 return 1;
278}
279
280/* Check optimized_kprobe can actually be optimized. */
281int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
282{
283 int i;
284 struct kprobe *p;
285
286 for (i = 1; i < op->optinsn.size; i++) {
287 p = get_kprobe(op->kp.addr + i);
288 if (p && !kprobe_disabled(p))
289 return -EEXIST;
290 }
291
292 return 0;
293}
294
295/* Check the addr is within the optimized instructions. */
296int __kprobes
297arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr)
298{
299 return ((unsigned long)op->kp.addr <= addr &&
300 (unsigned long)op->kp.addr + op->optinsn.size > addr);
301}
302
303/* Free optimized instruction slot */
304static __kprobes
305void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
306{
307 if (op->optinsn.insn) {
308 free_optinsn_slot(op->optinsn.insn, dirty);
309 op->optinsn.insn = NULL;
310 op->optinsn.size = 0;
311 }
312}
313
314void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
315{
316 __arch_remove_optimized_kprobe(op, 1);
317}
318
319/*
320 * Copy replacing target instructions
321 * Target instructions MUST be relocatable (checked inside)
322 * This is called when new aggr(opt)probe is allocated or reused.
323 */
324int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
325{
326 u8 *buf;
327 int ret;
328 long rel;
329
330 if (!can_optimize((unsigned long)op->kp.addr))
331 return -EILSEQ;
332
333 op->optinsn.insn = get_optinsn_slot();
334 if (!op->optinsn.insn)
335 return -ENOMEM;
336
337 /*
338 * Verify if the address gap is in 2GB range, because this uses
339 * a relative jump.
340 */
341 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
342 if (abs(rel) > 0x7fffffff)
343 return -ERANGE;
344
345 buf = (u8 *)op->optinsn.insn;
346
347 /* Copy instructions into the out-of-line buffer */
348 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
349 if (ret < 0) {
350 __arch_remove_optimized_kprobe(op, 0);
351 return ret;
352 }
353 op->optinsn.size = ret;
354
355 /* Copy arch-dep-instance from template */
356 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
357
358 /* Set probe information */
359 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
360
361 /* Set probe function call */
362 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
363
364 /* Set returning jmp instruction at the tail of out-of-line buffer */
365 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
366 (u8 *)op->kp.addr + op->optinsn.size);
367
368 flush_icache_range((unsigned long) buf,
369 (unsigned long) buf + TMPL_END_IDX +
370 op->optinsn.size + RELATIVEJUMP_SIZE);
371 return 0;
372}
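
The detour buffer assembled above ends up with the following layout (a descriptive sketch; the exact offsets depend on the template size TMPL_END_IDX):

/*
 *   op->optinsn.insn
 *   +------------------------------+  offset 0
 *   | register save/restore        |  template copy, with &op patched in
 *   | template                     |  at TMPL_MOVE_IDX and a call to
 *   |                              |  optimized_callback at TMPL_CALL_IDX
 *   +------------------------------+  offset TMPL_END_IDX
 *   | relocated copy of the        |  filled by
 *   | probed instructions          |  copy_optimized_instructions()
 *   +------------------------------+  offset TMPL_END_IDX + optinsn.size
 *   | jmp back to                  |  written by synthesize_reljump()
 *   | kp->addr + optinsn.size      |
 *   +------------------------------+
 */
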
373
374#define MAX_OPTIMIZE_PROBES 256
375static struct text_poke_param *jump_poke_params;
376static struct jump_poke_buffer {
377 u8 buf[RELATIVEJUMP_SIZE];
378} *jump_poke_bufs;
379
380static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
381 u8 *insn_buf,
382 struct optimized_kprobe *op)
383{
384 s32 rel = (s32)((long)op->optinsn.insn -
385 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
386
387 /* Backup instructions which will be replaced by jump address */
388 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
389 RELATIVE_ADDR_SIZE);
390
391 insn_buf[0] = RELATIVEJUMP_OPCODE;
392 *(s32 *)(&insn_buf[1]) = rel;
393
394 tprm->addr = op->kp.addr;
395 tprm->opcode = insn_buf;
396 tprm->len = RELATIVEJUMP_SIZE;
397}
398
399/*
400 * Replace breakpoints (int3) with relative jumps.
401 * Caller must hold kprobe_mutex and text_mutex.
402 */
403void __kprobes arch_optimize_kprobes(struct list_head *oplist)
404{
405 struct optimized_kprobe *op, *tmp;
406 int c = 0;
407
408 list_for_each_entry_safe(op, tmp, oplist, list) {
409 WARN_ON(kprobe_disabled(&op->kp));
410 /* Setup param */
411 setup_optimize_kprobe(&jump_poke_params[c],
412 jump_poke_bufs[c].buf, op);
413 list_del_init(&op->list);
414 if (++c >= MAX_OPTIMIZE_PROBES)
415 break;
416 }
417
418 /*
419 * text_poke_smp doesn't support NMI/MCE code modifying.
420 * However, since kprobes itself also doesn't support NMI/MCE
421 * code probing, it's not a problem.
422 */
423 text_poke_smp_batch(jump_poke_params, c);
424}
425
426static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
427 u8 *insn_buf,
428 struct optimized_kprobe *op)
429{
430 /* Set int3 to first byte for kprobes */
431 insn_buf[0] = BREAKPOINT_INSTRUCTION;
432 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
433
434 tprm->addr = op->kp.addr;
435 tprm->opcode = insn_buf;
436 tprm->len = RELATIVEJUMP_SIZE;
437}
438
439/*
440 * Recover original instructions and breakpoints from relative jumps.
441 * Caller must hold kprobe_mutex.
442 */
443extern void arch_unoptimize_kprobes(struct list_head *oplist,
444 struct list_head *done_list)
445{
446 struct optimized_kprobe *op, *tmp;
447 int c = 0;
448
449 list_for_each_entry_safe(op, tmp, oplist, list) {
450 /* Setup param */
451 setup_unoptimize_kprobe(&jump_poke_params[c],
452 jump_poke_bufs[c].buf, op);
453 list_move(&op->list, done_list);
454 if (++c >= MAX_OPTIMIZE_PROBES)
455 break;
456 }
457
458 /*
459 * text_poke_smp doesn't support NMI/MCE code modifying.
460 * However, since kprobes itself also doesn't support NMI/MCE
461 * code probing, it's not a problem.
462 */
463 text_poke_smp_batch(jump_poke_params, c);
464}
465
466/* Replace a relative jump with a breakpoint (int3). */
467void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
468{
469 u8 buf[RELATIVEJUMP_SIZE];
470
471 /* Set int3 to first byte for kprobes */
472 buf[0] = BREAKPOINT_INSTRUCTION;
473 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
474 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
475}
476
477int __kprobes
478setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
479{
480 struct optimized_kprobe *op;
481
482 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
483 /* This kprobe is really able to run optimized path. */
484 op = container_of(p, struct optimized_kprobe, kp);
485 /* Detour through copied instructions */
486 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
487 if (!reenter)
488 reset_current_kprobe();
489 preempt_enable_no_resched();
490 return 1;
491 }
492 return 0;
493}
494
495int __kprobes arch_init_optprobes(void)
496{
497 /* Allocate code buffer and parameter array */
498 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
499 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
500 if (!jump_poke_bufs)
501 return -ENOMEM;
502
503 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
504 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
505 if (!jump_poke_params) {
506 kfree(jump_poke_bufs);
507 jump_poke_bufs = NULL;
508 return -ENOMEM;
509 }
510
511 return 0;
512}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7da647d8b64c..e213fc8408d2 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -30,16 +30,15 @@
30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi 30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
31 * <prasanna@in.ibm.com> added function-return probes. 31 * <prasanna@in.ibm.com> added function-return probes.
32 * 2005-May Rusty Lynch <rusty.lynch@intel.com> 32 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
33 * Added function return probes functionality 33 * Added function return probes functionality
34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added 34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
35 * kprobe-booster and kretprobe-booster for i386. 35 * kprobe-booster and kretprobe-booster for i386.
36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster 36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
37 * and kretprobe-booster for x86-64 37 * and kretprobe-booster for x86-64
38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven 38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> 39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
40 * unified x86 kprobes code. 40 * unified x86 kprobes code.
41 */ 41 */
42
43#include <linux/kprobes.h> 42#include <linux/kprobes.h>
44#include <linux/ptrace.h> 43#include <linux/ptrace.h>
45#include <linux/string.h> 44#include <linux/string.h>
@@ -59,6 +58,8 @@
59#include <asm/insn.h> 58#include <asm/insn.h>
60#include <asm/debugreg.h> 59#include <asm/debugreg.h>
61 60
61#include "kprobes-common.h"
62
62void jprobe_return_end(void); 63void jprobe_return_end(void);
63 64
64DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 65DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -108,6 +109,7 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
108 doesn't switch kernel stack.*/ 109 doesn't switch kernel stack.*/
109 {NULL, NULL} /* Terminator */ 110 {NULL, NULL} /* Terminator */
110}; 111};
112
111const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 113const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
112 114
113static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) 115static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
@@ -123,11 +125,17 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
123} 125}
124 126
125/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 127/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
126static void __kprobes synthesize_reljump(void *from, void *to) 128void __kprobes synthesize_reljump(void *from, void *to)
127{ 129{
128 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); 130 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
129} 131}
130 132
133/* Insert a call instruction at address 'from', which calls address 'to'.*/
134void __kprobes synthesize_relcall(void *from, void *to)
135{
136 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
137}
138
131/* 139/*
132 * Skip the prefixes of the instruction. 140 * Skip the prefixes of the instruction.
133 */ 141 */
@@ -151,7 +159,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
151 * Returns non-zero if opcode is boostable. 159 * Returns non-zero if opcode is boostable.
152 * RIP relative instructions are adjusted at copying time in 64 bits mode 160 * RIP relative instructions are adjusted at copying time in 64 bits mode
153 */ 161 */
154static int __kprobes can_boost(kprobe_opcode_t *opcodes) 162int __kprobes can_boost(kprobe_opcode_t *opcodes)
155{ 163{
156 kprobe_opcode_t opcode; 164 kprobe_opcode_t opcode;
157 kprobe_opcode_t *orig_opcodes = opcodes; 165 kprobe_opcode_t *orig_opcodes = opcodes;
@@ -207,13 +215,15 @@ retry:
207 } 215 }
208} 216}
209 217
210/* Recover the probed instruction at addr for further analysis. */ 218static unsigned long
211static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) 219__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
212{ 220{
213 struct kprobe *kp; 221 struct kprobe *kp;
222
214 kp = get_kprobe((void *)addr); 223 kp = get_kprobe((void *)addr);
224 /* There is no probe, return original address */
215 if (!kp) 225 if (!kp)
216 return -EINVAL; 226 return addr;
217 227
218 /* 228 /*
219 * Basically, kp->ainsn.insn has an original instruction. 229 * Basically, kp->ainsn.insn has an original instruction.
@@ -230,14 +240,29 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
230 */ 240 */
231 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 241 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
232 buf[0] = kp->opcode; 242 buf[0] = kp->opcode;
233 return 0; 243 return (unsigned long)buf;
244}
245
246/*
247 * Recover the probed instruction at addr for further analysis.
248 * Caller must lock kprobes via kprobe_mutex, or disable preemption
249 * to prevent the referenced kprobes from being released.
250 */
251unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
252{
253 unsigned long __addr;
254
255 __addr = __recover_optprobed_insn(buf, addr);
256 if (__addr != addr)
257 return __addr;
258
259 return __recover_probed_insn(buf, addr);
234} 260}
235 261
236/* Check if paddr is at an instruction boundary */ 262/* Check if paddr is at an instruction boundary */
237static int __kprobes can_probe(unsigned long paddr) 263static int __kprobes can_probe(unsigned long paddr)
238{ 264{
239 int ret; 265 unsigned long addr, __addr, offset = 0;
240 unsigned long addr, offset = 0;
241 struct insn insn; 266 struct insn insn;
242 kprobe_opcode_t buf[MAX_INSN_SIZE]; 267 kprobe_opcode_t buf[MAX_INSN_SIZE];
243 268
@@ -247,26 +272,24 @@ static int __kprobes can_probe(unsigned long paddr)
247 /* Decode instructions */ 272 /* Decode instructions */
248 addr = paddr - offset; 273 addr = paddr - offset;
249 while (addr < paddr) { 274 while (addr < paddr) {
250 kernel_insn_init(&insn, (void *)addr);
251 insn_get_opcode(&insn);
252
253 /* 275 /*
254 * Check if the instruction has been modified by another 276 * Check if the instruction has been modified by another
255 * kprobe, in which case we replace the breakpoint by the 277 * kprobe, in which case we replace the breakpoint by the
256 * original instruction in our buffer. 278 * original instruction in our buffer.
279 * Also, jump optimization will change the breakpoint to a
280 * relative jump. Since a relative jump is a normal
281 * instruction, we just pass it through if there is no kprobe.
257 */ 282 */
258 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { 283 __addr = recover_probed_instruction(buf, addr);
259 ret = recover_probed_instruction(buf, addr); 284 kernel_insn_init(&insn, (void *)__addr);
260 if (ret)
261 /*
262 * Another debugging subsystem might insert
263 * this breakpoint. In that case, we can't
264 * recover it.
265 */
266 return 0;
267 kernel_insn_init(&insn, buf);
268 }
269 insn_get_length(&insn); 285 insn_get_length(&insn);
286
287 /*
288 * Another debugging subsystem might insert this breakpoint.
289 * In that case, we can't recover it.
290 */
291 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
292 return 0;
270 addr += insn.length; 293 addr += insn.length;
271 } 294 }
272 295
@@ -299,24 +322,16 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
299 * If not, return null. 322 * If not, return null.
300 * Only applicable to 64-bit x86. 323 * Only applicable to 64-bit x86.
301 */ 324 */
302static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) 325int __kprobes __copy_instruction(u8 *dest, u8 *src)
303{ 326{
304 struct insn insn; 327 struct insn insn;
305 int ret;
306 kprobe_opcode_t buf[MAX_INSN_SIZE]; 328 kprobe_opcode_t buf[MAX_INSN_SIZE];
307 329
308 kernel_insn_init(&insn, src); 330 kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src));
309 if (recover) {
310 insn_get_opcode(&insn);
311 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
312 ret = recover_probed_instruction(buf,
313 (unsigned long)src);
314 if (ret)
315 return 0;
316 kernel_insn_init(&insn, buf);
317 }
318 }
319 insn_get_length(&insn); 331 insn_get_length(&insn);
332	/* Another subsystem put a breakpoint here; we failed to recover it */
333 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
334 return 0;
320 memcpy(dest, insn.kaddr, insn.length); 335 memcpy(dest, insn.kaddr, insn.length);
321 336
322#ifdef CONFIG_X86_64 337#ifdef CONFIG_X86_64
@@ -337,8 +352,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
337 * extension of the original signed 32-bit displacement would 352 * extension of the original signed 32-bit displacement would
338 * have given. 353 * have given.
339 */ 354 */
340 newdisp = (u8 *) src + (s64) insn.displacement.value - 355 newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
341 (u8 *) dest;
342 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ 356 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
343 disp = (u8 *) dest + insn_offset_displacement(&insn); 357 disp = (u8 *) dest + insn_offset_displacement(&insn);
344 *(s32 *) disp = (s32) newdisp; 358 *(s32 *) disp = (s32) newdisp;
@@ -349,18 +363,20 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
349 363
350static void __kprobes arch_copy_kprobe(struct kprobe *p) 364static void __kprobes arch_copy_kprobe(struct kprobe *p)
351{ 365{
366	/* Copy the instruction, recovering it if another optprobe has modified it. */
367 __copy_instruction(p->ainsn.insn, p->addr);
368
352 /* 369 /*
353 * Copy an instruction without recovering int3, because it will be 370 * __copy_instruction can modify the displacement of the instruction,
354 * put by another subsystem. 371 * but it doesn't affect the boostable check.
355 */ 372 */
356 __copy_instruction(p->ainsn.insn, p->addr, 0); 373 if (can_boost(p->ainsn.insn))
357
358 if (can_boost(p->addr))
359 p->ainsn.boostable = 0; 374 p->ainsn.boostable = 0;
360 else 375 else
361 p->ainsn.boostable = -1; 376 p->ainsn.boostable = -1;
362 377
363 p->opcode = *p->addr; 378 /* Also, displacement change doesn't affect the first byte */
379 p->opcode = p->ainsn.insn[0];
364} 380}
365 381
366int __kprobes arch_prepare_kprobe(struct kprobe *p) 382int __kprobes arch_prepare_kprobe(struct kprobe *p)
@@ -442,8 +458,8 @@ static void __kprobes restore_btf(void)
442 } 458 }
443} 459}
444 460
445void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 461void __kprobes
446 struct pt_regs *regs) 462arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
447{ 463{
448 unsigned long *sara = stack_addr(regs); 464 unsigned long *sara = stack_addr(regs);
449 465
@@ -453,16 +469,8 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
453 *sara = (unsigned long) &kretprobe_trampoline; 469 *sara = (unsigned long) &kretprobe_trampoline;
454} 470}
455 471
456#ifdef CONFIG_OPTPROBES 472static void __kprobes
457static int __kprobes setup_detour_execution(struct kprobe *p, 473setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
458 struct pt_regs *regs,
459 int reenter);
460#else
461#define setup_detour_execution(p, regs, reenter) (0)
462#endif
463
464static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
465 struct kprobe_ctlblk *kcb, int reenter)
466{ 474{
467 if (setup_detour_execution(p, regs, reenter)) 475 if (setup_detour_execution(p, regs, reenter))
468 return; 476 return;
@@ -504,8 +512,8 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
504 * within the handler. We save the original kprobes variables and just single 512 * within the handler. We save the original kprobes variables and just single
505 * step on the instruction of the new probe without calling any user handlers. 513 * step on the instruction of the new probe without calling any user handlers.
506 */ 514 */
507static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, 515static int __kprobes
508 struct kprobe_ctlblk *kcb) 516reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
509{ 517{
510 switch (kcb->kprobe_status) { 518 switch (kcb->kprobe_status) {
511 case KPROBE_HIT_SSDONE: 519 case KPROBE_HIT_SSDONE:
@@ -600,69 +608,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
600 return 0; 608 return 0;
601} 609}
602 610
603#ifdef CONFIG_X86_64
604#define SAVE_REGS_STRING \
605 /* Skip cs, ip, orig_ax. */ \
606 " subq $24, %rsp\n" \
607 " pushq %rdi\n" \
608 " pushq %rsi\n" \
609 " pushq %rdx\n" \
610 " pushq %rcx\n" \
611 " pushq %rax\n" \
612 " pushq %r8\n" \
613 " pushq %r9\n" \
614 " pushq %r10\n" \
615 " pushq %r11\n" \
616 " pushq %rbx\n" \
617 " pushq %rbp\n" \
618 " pushq %r12\n" \
619 " pushq %r13\n" \
620 " pushq %r14\n" \
621 " pushq %r15\n"
622#define RESTORE_REGS_STRING \
623 " popq %r15\n" \
624 " popq %r14\n" \
625 " popq %r13\n" \
626 " popq %r12\n" \
627 " popq %rbp\n" \
628 " popq %rbx\n" \
629 " popq %r11\n" \
630 " popq %r10\n" \
631 " popq %r9\n" \
632 " popq %r8\n" \
633 " popq %rax\n" \
634 " popq %rcx\n" \
635 " popq %rdx\n" \
636 " popq %rsi\n" \
637 " popq %rdi\n" \
638 /* Skip orig_ax, ip, cs */ \
639 " addq $24, %rsp\n"
640#else
641#define SAVE_REGS_STRING \
642 /* Skip cs, ip, orig_ax and gs. */ \
643 " subl $16, %esp\n" \
644 " pushl %fs\n" \
645 " pushl %es\n" \
646 " pushl %ds\n" \
647 " pushl %eax\n" \
648 " pushl %ebp\n" \
649 " pushl %edi\n" \
650 " pushl %esi\n" \
651 " pushl %edx\n" \
652 " pushl %ecx\n" \
653 " pushl %ebx\n"
654#define RESTORE_REGS_STRING \
655 " popl %ebx\n" \
656 " popl %ecx\n" \
657 " popl %edx\n" \
658 " popl %esi\n" \
659 " popl %edi\n" \
660 " popl %ebp\n" \
661 " popl %eax\n" \
662 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
663 " addl $24, %esp\n"
664#endif
665
666/* 611/*
667 * When a retprobed function returns, this code saves registers and 612 * When a retprobed function returns, this code saves registers and
668 * calls trampoline_handler() runs, which calls the kretprobe's handler. 613 * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -816,8 +761,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
816 * jump instruction after the copied instruction, that jumps to the next 761 * jump instruction after the copied instruction, that jumps to the next
817 * instruction after the probepoint. 762 * instruction after the probepoint.
818 */ 763 */
819static void __kprobes resume_execution(struct kprobe *p, 764static void __kprobes
820 struct pt_regs *regs, struct kprobe_ctlblk *kcb) 765resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
821{ 766{
822 unsigned long *tos = stack_addr(regs); 767 unsigned long *tos = stack_addr(regs);
823 unsigned long copy_ip = (unsigned long)p->ainsn.insn; 768 unsigned long copy_ip = (unsigned long)p->ainsn.insn;
@@ -996,8 +941,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
996/* 941/*
997 * Wrapper routine for handling exceptions. 942 * Wrapper routine for handling exceptions.
998 */ 943 */
999int __kprobes kprobe_exceptions_notify(struct notifier_block *self, 944int __kprobes
1000 unsigned long val, void *data) 945kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data)
1001{ 946{
1002 struct die_args *args = data; 947 struct die_args *args = data;
1003 int ret = NOTIFY_DONE; 948 int ret = NOTIFY_DONE;
@@ -1107,466 +1052,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1107 return 0; 1052 return 0;
1108} 1053}
1109 1054
1110
1111#ifdef CONFIG_OPTPROBES
1112
1113/* Insert a call instruction at address 'from', which calls address 'to'.*/
1114static void __kprobes synthesize_relcall(void *from, void *to)
1115{
1116 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1117}
1118
1119/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1120static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1121 unsigned long val)
1122{
1123#ifdef CONFIG_X86_64
1124 *addr++ = 0x48;
1125 *addr++ = 0xbf;
1126#else
1127 *addr++ = 0xb8;
1128#endif
1129 *(unsigned long *)addr = val;
1130}
1131
1132static void __used __kprobes kprobes_optinsn_template_holder(void)
1133{
1134 asm volatile (
1135 ".global optprobe_template_entry\n"
1136 "optprobe_template_entry: \n"
1137#ifdef CONFIG_X86_64
1138 /* We don't bother saving the ss register */
1139 " pushq %rsp\n"
1140 " pushfq\n"
1141 SAVE_REGS_STRING
1142 " movq %rsp, %rsi\n"
1143 ".global optprobe_template_val\n"
1144 "optprobe_template_val: \n"
1145 ASM_NOP5
1146 ASM_NOP5
1147 ".global optprobe_template_call\n"
1148 "optprobe_template_call: \n"
1149 ASM_NOP5
1150 /* Move flags to rsp */
1151 " movq 144(%rsp), %rdx\n"
1152 " movq %rdx, 152(%rsp)\n"
1153 RESTORE_REGS_STRING
1154 /* Skip flags entry */
1155 " addq $8, %rsp\n"
1156 " popfq\n"
1157#else /* CONFIG_X86_32 */
1158 " pushf\n"
1159 SAVE_REGS_STRING
1160 " movl %esp, %edx\n"
1161 ".global optprobe_template_val\n"
1162 "optprobe_template_val: \n"
1163 ASM_NOP5
1164 ".global optprobe_template_call\n"
1165 "optprobe_template_call: \n"
1166 ASM_NOP5
1167 RESTORE_REGS_STRING
1168 " addl $4, %esp\n" /* skip cs */
1169 " popf\n"
1170#endif
1171 ".global optprobe_template_end\n"
1172 "optprobe_template_end: \n");
1173}
1174
1175#define TMPL_MOVE_IDX \
1176 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1177#define TMPL_CALL_IDX \
1178 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1179#define TMPL_END_IDX \
1180 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1181
1182#define INT3_SIZE sizeof(kprobe_opcode_t)
1183
1184/* Optimized kprobe call back function: called from optinsn */
1185static void __kprobes optimized_callback(struct optimized_kprobe *op,
1186 struct pt_regs *regs)
1187{
1188 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1189 unsigned long flags;
1190
1191 /* This is possible if op is under delayed unoptimizing */
1192 if (kprobe_disabled(&op->kp))
1193 return;
1194
1195 local_irq_save(flags);
1196 if (kprobe_running()) {
1197 kprobes_inc_nmissed_count(&op->kp);
1198 } else {
1199 /* Save skipped registers */
1200#ifdef CONFIG_X86_64
1201 regs->cs = __KERNEL_CS;
1202#else
1203 regs->cs = __KERNEL_CS | get_kernel_rpl();
1204 regs->gs = 0;
1205#endif
1206 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1207 regs->orig_ax = ~0UL;
1208
1209 __this_cpu_write(current_kprobe, &op->kp);
1210 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1211 opt_pre_handler(&op->kp, regs);
1212 __this_cpu_write(current_kprobe, NULL);
1213 }
1214 local_irq_restore(flags);
1215}
1216
1217static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1218{
1219 int len = 0, ret;
1220
1221 while (len < RELATIVEJUMP_SIZE) {
1222 ret = __copy_instruction(dest + len, src + len, 1);
1223 if (!ret || !can_boost(dest + len))
1224 return -EINVAL;
1225 len += ret;
1226 }
1227 /* Check whether the address range is reserved */
1228 if (ftrace_text_reserved(src, src + len - 1) ||
1229 alternatives_text_reserved(src, src + len - 1) ||
1230 jump_label_text_reserved(src, src + len - 1))
1231 return -EBUSY;
1232
1233 return len;
1234}
1235
1236/* Check whether insn is indirect jump */
1237static int __kprobes insn_is_indirect_jump(struct insn *insn)
1238{
1239 return ((insn->opcode.bytes[0] == 0xff &&
1240 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1241 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1242}
1243
1244/* Check whether insn jumps into specified address range */
1245static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1246{
1247 unsigned long target = 0;
1248
1249 switch (insn->opcode.bytes[0]) {
1250 case 0xe0: /* loopne */
1251 case 0xe1: /* loope */
1252 case 0xe2: /* loop */
1253 case 0xe3: /* jcxz */
1254 case 0xe9: /* near relative jump */
1255 case 0xeb: /* short relative jump */
1256 break;
1257 case 0x0f:
1258 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1259 break;
1260 return 0;
1261 default:
1262 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1263 break;
1264 return 0;
1265 }
1266 target = (unsigned long)insn->next_byte + insn->immediate.value;
1267
1268 return (start <= target && target <= start + len);
1269}
1270
1271/* Decode whole function to ensure any instructions don't jump into target */
1272static int __kprobes can_optimize(unsigned long paddr)
1273{
1274 int ret;
1275 unsigned long addr, size = 0, offset = 0;
1276 struct insn insn;
1277 kprobe_opcode_t buf[MAX_INSN_SIZE];
1278
1279 /* Lookup symbol including addr */
1280 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1281 return 0;
1282
1283 /*
1284 * Do not optimize in the entry code due to the unstable
1285 * stack handling.
1286 */
1287 if ((paddr >= (unsigned long )__entry_text_start) &&
1288 (paddr < (unsigned long )__entry_text_end))
1289 return 0;
1290
1291 /* Check there is enough space for a relative jump. */
1292 if (size - offset < RELATIVEJUMP_SIZE)
1293 return 0;
1294
1295 /* Decode instructions */
1296 addr = paddr - offset;
1297 while (addr < paddr - offset + size) { /* Decode until function end */
1298 if (search_exception_tables(addr))
1299 /*
1300 * Since some fixup code will jumps into this function,
1301 * we can't optimize kprobe in this function.
1302 */
1303 return 0;
1304 kernel_insn_init(&insn, (void *)addr);
1305 insn_get_opcode(&insn);
1306 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1307 ret = recover_probed_instruction(buf, addr);
1308 if (ret)
1309 return 0;
1310 kernel_insn_init(&insn, buf);
1311 }
1312 insn_get_length(&insn);
1313 /* Recover address */
1314 insn.kaddr = (void *)addr;
1315 insn.next_byte = (void *)(addr + insn.length);
1316 /* Check any instructions don't jump into target */
1317 if (insn_is_indirect_jump(&insn) ||
1318 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1319 RELATIVE_ADDR_SIZE))
1320 return 0;
1321 addr += insn.length;
1322 }
1323
1324 return 1;
1325}
1326
1327/* Check optimized_kprobe can actually be optimized. */
1328int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1329{
1330 int i;
1331 struct kprobe *p;
1332
1333 for (i = 1; i < op->optinsn.size; i++) {
1334 p = get_kprobe(op->kp.addr + i);
1335 if (p && !kprobe_disabled(p))
1336 return -EEXIST;
1337 }
1338
1339 return 0;
1340}
1341
1342/* Check the addr is within the optimized instructions. */
1343int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1344 unsigned long addr)
1345{
1346 return ((unsigned long)op->kp.addr <= addr &&
1347 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1348}
1349
1350/* Free optimized instruction slot */
1351static __kprobes
1352void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1353{
1354 if (op->optinsn.insn) {
1355 free_optinsn_slot(op->optinsn.insn, dirty);
1356 op->optinsn.insn = NULL;
1357 op->optinsn.size = 0;
1358 }
1359}
1360
1361void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1362{
1363 __arch_remove_optimized_kprobe(op, 1);
1364}
1365
1366/*
1367 * Copy replacing target instructions
1368 * Target instructions MUST be relocatable (checked inside)
1369 */
1370int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1371{
1372 u8 *buf;
1373 int ret;
1374 long rel;
1375
1376 if (!can_optimize((unsigned long)op->kp.addr))
1377 return -EILSEQ;
1378
1379 op->optinsn.insn = get_optinsn_slot();
1380 if (!op->optinsn.insn)
1381 return -ENOMEM;
1382
1383 /*
1384 * Verify if the address gap is in 2GB range, because this uses
1385 * a relative jump.
1386 */
1387 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1388 if (abs(rel) > 0x7fffffff)
1389 return -ERANGE;
1390
1391 buf = (u8 *)op->optinsn.insn;
1392
1393 /* Copy instructions into the out-of-line buffer */
1394 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1395 if (ret < 0) {
1396 __arch_remove_optimized_kprobe(op, 0);
1397 return ret;
1398 }
1399 op->optinsn.size = ret;
1400
1401 /* Copy arch-dep-instance from template */
1402 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1403
1404 /* Set probe information */
1405 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1406
1407 /* Set probe function call */
1408 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1409
1410 /* Set returning jmp instruction at the tail of out-of-line buffer */
1411 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1412 (u8 *)op->kp.addr + op->optinsn.size);
1413
1414 flush_icache_range((unsigned long) buf,
1415 (unsigned long) buf + TMPL_END_IDX +
1416 op->optinsn.size + RELATIVEJUMP_SIZE);
1417 return 0;
1418}
1419
1420#define MAX_OPTIMIZE_PROBES 256
1421static struct text_poke_param *jump_poke_params;
1422static struct jump_poke_buffer {
1423 u8 buf[RELATIVEJUMP_SIZE];
1424} *jump_poke_bufs;
1425
1426static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
1427 u8 *insn_buf,
1428 struct optimized_kprobe *op)
1429{
1430 s32 rel = (s32)((long)op->optinsn.insn -
1431 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1432
1433 /* Backup instructions which will be replaced by jump address */
1434 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1435 RELATIVE_ADDR_SIZE);
1436
1437 insn_buf[0] = RELATIVEJUMP_OPCODE;
1438 *(s32 *)(&insn_buf[1]) = rel;
1439
1440 tprm->addr = op->kp.addr;
1441 tprm->opcode = insn_buf;
1442 tprm->len = RELATIVEJUMP_SIZE;
1443}
1444
1445/*
1446 * Replace breakpoints (int3) with relative jumps.
1447 * Caller must call with locking kprobe_mutex and text_mutex.
1448 */
1449void __kprobes arch_optimize_kprobes(struct list_head *oplist)
1450{
1451 struct optimized_kprobe *op, *tmp;
1452 int c = 0;
1453
1454 list_for_each_entry_safe(op, tmp, oplist, list) {
1455 WARN_ON(kprobe_disabled(&op->kp));
1456 /* Setup param */
1457 setup_optimize_kprobe(&jump_poke_params[c],
1458 jump_poke_bufs[c].buf, op);
1459 list_del_init(&op->list);
1460 if (++c >= MAX_OPTIMIZE_PROBES)
1461 break;
1462 }
1463
1464 /*
1465 * text_poke_smp doesn't support NMI/MCE code modifying.
1466 * However, since kprobes itself also doesn't support NMI/MCE
1467 * code probing, it's not a problem.
1468 */
1469 text_poke_smp_batch(jump_poke_params, c);
1470}
1471
1472static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
1473 u8 *insn_buf,
1474 struct optimized_kprobe *op)
1475{
1476 /* Set int3 to first byte for kprobes */
1477 insn_buf[0] = BREAKPOINT_INSTRUCTION;
1478 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1479
1480 tprm->addr = op->kp.addr;
1481 tprm->opcode = insn_buf;
1482 tprm->len = RELATIVEJUMP_SIZE;
1483}
1484
1485/*
1486 * Recover original instructions and breakpoints from relative jumps.
1487 * Caller must call with locking kprobe_mutex.
1488 */
1489extern void arch_unoptimize_kprobes(struct list_head *oplist,
1490 struct list_head *done_list)
1491{
1492 struct optimized_kprobe *op, *tmp;
1493 int c = 0;
1494
1495 list_for_each_entry_safe(op, tmp, oplist, list) {
1496 /* Setup param */
1497 setup_unoptimize_kprobe(&jump_poke_params[c],
1498 jump_poke_bufs[c].buf, op);
1499 list_move(&op->list, done_list);
1500 if (++c >= MAX_OPTIMIZE_PROBES)
1501 break;
1502 }
1503
1504 /*
1505 * text_poke_smp doesn't support NMI/MCE code modifying.
1506 * However, since kprobes itself also doesn't support NMI/MCE
1507 * code probing, it's not a problem.
1508 */
1509 text_poke_smp_batch(jump_poke_params, c);
1510}
1511
1512/* Replace a relative jump with a breakpoint (int3). */
1513void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1514{
1515 u8 buf[RELATIVEJUMP_SIZE];
1516
1517 /* Set int3 to first byte for kprobes */
1518 buf[0] = BREAKPOINT_INSTRUCTION;
1519 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1520 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1521}
1522
1523static int __kprobes setup_detour_execution(struct kprobe *p,
1524 struct pt_regs *regs,
1525 int reenter)
1526{
1527 struct optimized_kprobe *op;
1528
1529 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1530 /* This kprobe is really able to run optimized path. */
1531 op = container_of(p, struct optimized_kprobe, kp);
1532 /* Detour through copied instructions */
1533 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1534 if (!reenter)
1535 reset_current_kprobe();
1536 preempt_enable_no_resched();
1537 return 1;
1538 }
1539 return 0;
1540}
1541
1542static int __kprobes init_poke_params(void)
1543{
1544 /* Allocate code buffer and parameter array */
1545 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
1546 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1547 if (!jump_poke_bufs)
1548 return -ENOMEM;
1549
1550 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
1551 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1552 if (!jump_poke_params) {
1553 kfree(jump_poke_bufs);
1554 jump_poke_bufs = NULL;
1555 return -ENOMEM;
1556 }
1557
1558 return 0;
1559}
1560#else /* !CONFIG_OPTPROBES */
1561static int __kprobes init_poke_params(void)
1562{
1563 return 0;
1564}
1565#endif
1566
1567int __init arch_init_kprobes(void) 1055int __init arch_init_kprobes(void)
1568{ 1056{
1569 return init_poke_params(); 1057 return arch_init_optprobes();
1570} 1058}
1571 1059
1572int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1060int __kprobes arch_trampoline_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f0c6fd6f176b..694d801bf606 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -438,9 +438,9 @@ void __init kvm_guest_init(void)
438static __init int activate_jump_labels(void) 438static __init int activate_jump_labels(void)
439{ 439{
440 if (has_steal_clock) { 440 if (has_steal_clock) {
441 jump_label_inc(&paravirt_steal_enabled); 441 static_key_slow_inc(&paravirt_steal_enabled);
442 if (steal_acc) 442 if (steal_acc)
443 jump_label_inc(&paravirt_steal_rq_enabled); 443 static_key_slow_inc(&paravirt_steal_rq_enabled);
444 } 444 }
445 445
446 return 0; 446 return 0;
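
The jump_label_inc() -> static_key_slow_inc() rename here follows the static-key API introduced in the same cycle. Below is a hedged sketch of how such a key is typically declared and tested; my_feature and do_slow_accounting are placeholder names, not kernel symbols.

#include <linux/jump_label.h>

static struct static_key my_feature = STATIC_KEY_INIT_FALSE;

extern void do_slow_accounting(void);	/* placeholder */

void my_feature_enable(void)
{
	static_key_slow_inc(&my_feature);	/* patches the branch site(s) in */
}

void my_hot_path(void)
{
	/* compiles to a single no-op until the key is enabled */
	if (static_key_false(&my_feature))
		do_slow_accounting();
}
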
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index ac0417be9131..73465aab28f8 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -360,7 +360,6 @@ out:
360static enum ucode_state 360static enum ucode_state
361request_microcode_user(int cpu, const void __user *buf, size_t size) 361request_microcode_user(int cpu, const void __user *buf, size_t size)
362{ 362{
363 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
364 return UCODE_ERROR; 363 return UCODE_ERROR;
365} 364}
366 365
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index d90272e6bc40..ada2f99388dd 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,8 +202,8 @@ static void native_flush_tlb_single(unsigned long addr)
202 __native_flush_tlb_single(addr); 202 __native_flush_tlb_single(addr);
203} 203}
204 204
205struct jump_label_key paravirt_steal_enabled; 205struct static_key paravirt_steal_enabled;
206struct jump_label_key paravirt_steal_rq_enabled; 206struct static_key paravirt_steal_rq_enabled;
207 207
208static u64 native_steal_clock(int cpu) 208static u64 native_steal_clock(int cpu)
209{ 209{
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 15763af7bfe3..44eefde92109 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -377,8 +377,8 @@ static inline int hlt_use_halt(void)
377void default_idle(void) 377void default_idle(void)
378{ 378{
379 if (hlt_use_halt()) { 379 if (hlt_use_halt()) {
380 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 380 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
381 trace_cpu_idle(1, smp_processor_id()); 381 trace_cpu_idle_rcuidle(1, smp_processor_id());
382 current_thread_info()->status &= ~TS_POLLING; 382 current_thread_info()->status &= ~TS_POLLING;
383 /* 383 /*
384 * TS_POLLING-cleared state must be visible before we 384 * TS_POLLING-cleared state must be visible before we
@@ -391,8 +391,8 @@ void default_idle(void)
391 else 391 else
392 local_irq_enable(); 392 local_irq_enable();
393 current_thread_info()->status |= TS_POLLING; 393 current_thread_info()->status |= TS_POLLING;
394 trace_power_end(smp_processor_id()); 394 trace_power_end_rcuidle(smp_processor_id());
395 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 395 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
396 } else { 396 } else {
397 local_irq_enable(); 397 local_irq_enable();
398 /* loop is done by the caller */ 398 /* loop is done by the caller */
@@ -450,8 +450,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
450static void mwait_idle(void) 450static void mwait_idle(void)
451{ 451{
452 if (!need_resched()) { 452 if (!need_resched()) {
453 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 453 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
454 trace_cpu_idle(1, smp_processor_id()); 454 trace_cpu_idle_rcuidle(1, smp_processor_id());
455 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) 455 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
456 clflush((void *)&current_thread_info()->flags); 456 clflush((void *)&current_thread_info()->flags);
457 457
@@ -461,8 +461,8 @@ static void mwait_idle(void)
461 __sti_mwait(0, 0); 461 __sti_mwait(0, 0);
462 else 462 else
463 local_irq_enable(); 463 local_irq_enable();
464 trace_power_end(smp_processor_id()); 464 trace_power_end_rcuidle(smp_processor_id());
465 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 465 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
466 } else 466 } else
467 local_irq_enable(); 467 local_irq_enable();
468} 468}
@@ -474,13 +474,13 @@ static void mwait_idle(void)
474 */ 474 */
475static void poll_idle(void) 475static void poll_idle(void)
476{ 476{
477 trace_power_start(POWER_CSTATE, 0, smp_processor_id()); 477 trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id());
478 trace_cpu_idle(0, smp_processor_id()); 478 trace_cpu_idle_rcuidle(0, smp_processor_id());
479 local_irq_enable(); 479 local_irq_enable();
480 while (!need_resched()) 480 while (!need_resched())
481 cpu_relax(); 481 cpu_relax();
482 trace_power_end(smp_processor_id()); 482 trace_power_end_rcuidle(smp_processor_id());
483 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 483 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
484} 484}
485 485
486/* 486/*
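The default_idle(), mwait_idle() and poll_idle() hunks above switch the power/idle tracepoints to their _rcuidle variants, which the tracepoint machinery provides alongside the plain trace_* calls and which remain legal once the CPU has entered RCU's idle (extended quiescent) state. A minimal sketch of the resulting shape of an idle routine; arch_idle_body() is a placeholder for the actual hlt/mwait/poll body:

#include <linux/smp.h>
#include <trace/events/power.h>

extern void arch_idle_body(void);       /* placeholder: hlt, mwait or poll loop */

static void my_idle(void)
{
        /* the _rcuidle variants may fire while RCU considers this CPU idle */
        trace_cpu_idle_rcuidle(1, smp_processor_id());
        arch_idle_body();
        trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}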
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 485204f58cda..49888fefe794 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -119,9 +119,7 @@ void cpu_idle(void)
119 } 119 }
120 rcu_idle_exit(); 120 rcu_idle_exit();
121 tick_nohz_idle_exit(); 121 tick_nohz_idle_exit();
122 preempt_enable_no_resched(); 122 schedule_preempt_disabled();
123 schedule();
124 preempt_disable();
125 } 123 }
126} 124}
127 125
@@ -214,6 +212,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
214 212
215 task_user_gs(p) = get_user_gs(regs); 213 task_user_gs(p) = get_user_gs(regs);
216 214
215 p->fpu_counter = 0;
217 p->thread.io_bitmap_ptr = NULL; 216 p->thread.io_bitmap_ptr = NULL;
218 tsk = current; 217 tsk = current;
219 err = -ENOMEM; 218 err = -ENOMEM;
@@ -299,22 +298,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
299 *next = &next_p->thread; 298 *next = &next_p->thread;
300 int cpu = smp_processor_id(); 299 int cpu = smp_processor_id();
301 struct tss_struct *tss = &per_cpu(init_tss, cpu); 300 struct tss_struct *tss = &per_cpu(init_tss, cpu);
302 bool preload_fpu; 301 fpu_switch_t fpu;
303 302
304 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 303 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
305 304
306 /* 305 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
307 * If the task has used fpu the last 5 timeslices, just do a full
308 * restore of the math state immediately to avoid the trap; the
309 * chances of needing FPU soon are obviously high now
310 */
311 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
312
313 __unlazy_fpu(prev_p);
314
315 /* we're going to use this soon, after a few expensive things */
316 if (preload_fpu)
317 prefetch(next->fpu.state);
318 306
319 /* 307 /*
320 * Reload esp0. 308 * Reload esp0.
@@ -354,11 +342,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
354 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) 342 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
355 __switch_to_xtra(prev_p, next_p, tss); 343 __switch_to_xtra(prev_p, next_p, tss);
356 344
357 /* If we're going to preload the fpu context, make sure clts
358 is run while we're batching the cpu state updates. */
359 if (preload_fpu)
360 clts();
361
362 /* 345 /*
363 * Leave lazy mode, flushing any hypercalls made here. 346 * Leave lazy mode, flushing any hypercalls made here.
364 * This must be done before restoring TLS segments so 347 * This must be done before restoring TLS segments so
@@ -368,15 +351,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
368 */ 351 */
369 arch_end_context_switch(next_p); 352 arch_end_context_switch(next_p);
370 353
371 if (preload_fpu)
372 __math_state_restore();
373
374 /* 354 /*
375 * Restore %gs if needed (which is common) 355 * Restore %gs if needed (which is common)
376 */ 356 */
377 if (prev->gs | next->gs) 357 if (prev->gs | next->gs)
378 lazy_load_gs(next->gs); 358 lazy_load_gs(next->gs);
379 359
360 switch_fpu_finish(next_p, fpu);
361
380 percpu_write(current_task, next_p); 362 percpu_write(current_task, next_p);
381 363
382 return prev_p; 364 return prev_p;
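The cpu_idle() hunk above (and the matching one in process_64.c below) folds the open-coded preempt_enable_no_resched()/schedule()/preempt_disable() sequence into schedule_preempt_disabled(). The helper is equivalent to the removed lines; a sketch, with the real implementation living in the scheduler core:

#include <linux/preempt.h>
#include <linux/sched.h>

/* callers enter with preemption disabled and get it back disabled on return */
static void schedule_preempt_disabled_sketch(void)
{
        preempt_enable_no_resched();
        schedule();
        preempt_disable();
}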
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9b9fe4a85c87..e34257c70c28 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -156,9 +156,7 @@ void cpu_idle(void)
156 } 156 }
157 157
158 tick_nohz_idle_exit(); 158 tick_nohz_idle_exit();
159 preempt_enable_no_resched(); 159 schedule_preempt_disabled();
160 schedule();
161 preempt_disable();
162 } 160 }
163} 161}
164 162
@@ -286,6 +284,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
286 284
287 set_tsk_thread_flag(p, TIF_FORK); 285 set_tsk_thread_flag(p, TIF_FORK);
288 286
287 p->fpu_counter = 0;
289 p->thread.io_bitmap_ptr = NULL; 288 p->thread.io_bitmap_ptr = NULL;
290 289
291 savesegment(gs, p->thread.gsindex); 290 savesegment(gs, p->thread.gsindex);
@@ -386,18 +385,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
386 int cpu = smp_processor_id(); 385 int cpu = smp_processor_id();
387 struct tss_struct *tss = &per_cpu(init_tss, cpu); 386 struct tss_struct *tss = &per_cpu(init_tss, cpu);
388 unsigned fsindex, gsindex; 387 unsigned fsindex, gsindex;
389 bool preload_fpu; 388 fpu_switch_t fpu;
390 389
391 /* 390 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
392 * If the task has used fpu the last 5 timeslices, just do a full
393 * restore of the math state immediately to avoid the trap; the
394 * chances of needing FPU soon are obviously high now
395 */
396 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
397
398 /* we're going to use this soon, after a few expensive things */
399 if (preload_fpu)
400 prefetch(next->fpu.state);
401 391
402 /* 392 /*
403 * Reload esp0, LDT and the page table pointer: 393 * Reload esp0, LDT and the page table pointer:
@@ -427,13 +417,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
427 417
428 load_TLS(next, cpu); 418 load_TLS(next, cpu);
429 419
430 /* Must be after DS reload */
431 __unlazy_fpu(prev_p);
432
433 /* Make sure cpu is ready for new context */
434 if (preload_fpu)
435 clts();
436
437 /* 420 /*
438 * Leave lazy mode, flushing any hypercalls made here. 421 * Leave lazy mode, flushing any hypercalls made here.
439 * This must be done before restoring TLS segments so 422 * This must be done before restoring TLS segments so
@@ -474,6 +457,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
474 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 457 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
475 prev->gsindex = gsindex; 458 prev->gsindex = gsindex;
476 459
460 switch_fpu_finish(next_p, fpu);
461
477 /* 462 /*
478 * Switch the PDA and FPU contexts. 463 * Switch the PDA and FPU contexts.
479 */ 464 */
@@ -492,13 +477,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
492 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) 477 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
493 __switch_to_xtra(prev_p, next_p, tss); 478 __switch_to_xtra(prev_p, next_p, tss);
494 479
495 /*
496 * Preload the FPU context, now that we've determined that the
497 * task is likely to be using it.
498 */
499 if (preload_fpu)
500 __math_state_restore();
501
502 return prev_p; 480 return prev_p;
503} 481}
504 482
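Both __switch_to() hunks replace the open-coded lazy-FPU handling with a switch_fpu_prepare()/switch_fpu_finish() pair bracketing the rest of the context switch. The sketch below only restates, under the new names, the logic those hunks remove; the real helpers in the i387 headers also track FPU ownership, so treat this as an approximation rather than their implementation:

#include <linux/prefetch.h>
#include <linux/sched.h>
#include <asm/i387.h>

typedef struct { int preload; } fpu_switch_sketch_t;

/* before the register state is switched: save the outgoing task's FPU
 * state and decide whether the incoming task is worth preloading */
static inline fpu_switch_sketch_t
switch_fpu_prepare_sketch(struct task_struct *prev, struct task_struct *next)
{
        fpu_switch_sketch_t fpu;

        /* preload if the next task used the FPU in its last few timeslices */
        fpu.preload = tsk_used_math(next) && next->fpu_counter > 5;
        __unlazy_fpu(prev);
        if (fpu.preload) {
                prefetch(next->thread.fpu.state);
                clts();         /* allow FPU use without a #NM trap */
        }
        return fpu;
}

/* after the new task is current: restore its FPU state if we decided to */
static inline void
switch_fpu_finish_sketch(struct task_struct *next, fpu_switch_sketch_t fpu)
{
        if (fpu.preload)
                __math_state_restore();         /* as the removed code did */
}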
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c00d11..58f78165d308 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -291,19 +291,6 @@ notrace static void __cpuinit start_secondary(void *unused)
291 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 291 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
292 x86_platform.nmi_init(); 292 x86_platform.nmi_init();
293 293
294 /*
295 * Wait until the cpu which brought this one up marked it
296 * online before enabling interrupts. If we don't do that then
297 * we can end up waking up the softirq thread before this cpu
298 * reached the active state, which makes the scheduler unhappy
299 * and schedule the softirq thread on the wrong cpu. This is
300 * only observable with forced threaded interrupts, but in
301 * theory it could also happen w/o them. It's just way harder
302 * to achieve.
303 */
304 while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
305 cpu_relax();
306
307 /* enable local interrupts */ 294 /* enable local interrupts */
308 local_irq_enable(); 295 local_irq_enable();
309 296
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8ba27dbc107a..4bbe04d96744 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -571,28 +571,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
571} 571}
572 572
573/* 573/*
574 * __math_state_restore assumes that cr0.TS is already clear and the
575 * fpu state is all ready for use. Used during context switch.
576 */
577void __math_state_restore(void)
578{
579 struct thread_info *thread = current_thread_info();
580 struct task_struct *tsk = thread->task;
581
582 /*
583 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
584 */
585 if (unlikely(restore_fpu_checking(tsk))) {
586 stts();
587 force_sig(SIGSEGV, tsk);
588 return;
589 }
590
591 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
592 tsk->fpu_counter++;
593}
594
595/*
596 * 'math_state_restore()' saves the current math information in the 574 * 'math_state_restore()' saves the current math information in the
597 * old math state array, and gets the new ones from the current task 575 * old math state array, and gets the new ones from the current task
598 * 576 *
@@ -604,8 +582,7 @@ void __math_state_restore(void)
604 */ 582 */
605void math_state_restore(void) 583void math_state_restore(void)
606{ 584{
607 struct thread_info *thread = current_thread_info(); 585 struct task_struct *tsk = current;
608 struct task_struct *tsk = thread->task;
609 586
610 if (!tsk_used_math(tsk)) { 587 if (!tsk_used_math(tsk)) {
611 local_irq_enable(); 588 local_irq_enable();
@@ -622,16 +599,23 @@ void math_state_restore(void)
622 local_irq_disable(); 599 local_irq_disable();
623 } 600 }
624 601
625 clts(); /* Allow maths ops (or we recurse) */ 602 __thread_fpu_begin(tsk);
603 /*
604 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
605 */
606 if (unlikely(restore_fpu_checking(tsk))) {
607 __thread_fpu_end(tsk);
608 force_sig(SIGSEGV, tsk);
609 return;
610 }
626 611
627 __math_state_restore(); 612 tsk->fpu_counter++;
628} 613}
629EXPORT_SYMBOL_GPL(math_state_restore); 614EXPORT_SYMBOL_GPL(math_state_restore);
630 615
631dotraplinkage void __kprobes 616dotraplinkage void __kprobes
632do_device_not_available(struct pt_regs *regs, long error_code) 617do_device_not_available(struct pt_regs *regs, long error_code)
633{ 618{
634 WARN_ON_ONCE(!user_mode_vm(regs));
635#ifdef CONFIG_MATH_EMULATION 619#ifdef CONFIG_MATH_EMULATION
636 if (read_cr0() & X86_CR0_EM) { 620 if (read_cr0() & X86_CR0_EM) {
637 struct math_emu_info info = { }; 621 struct math_emu_info info = { };
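math_state_restore() above now brackets the restore with __thread_fpu_begin()/__thread_fpu_end() and absorbs the SIGSEGV fallback that used to live in the removed __math_state_restore(). Inferred only from the lines these hunks delete, not from the helpers' actual definitions, the pair roughly replaces the old clts()/stts() plus TS_USEDFPU bookkeeping:

#include <linux/sched.h>
#include <asm/i387.h>

/* rough equivalents of the bookkeeping the removed lines performed;
 * the real helpers in the i387 headers track ownership differently */
static inline void thread_fpu_begin_sketch(struct task_struct *tsk)
{
        clts();                         /* permit FPU instructions (no #NM) */
        task_thread_info(tsk)->status |= TS_USEDFPU;
}

static inline void thread_fpu_end_sketch(struct task_struct *tsk)
{
        task_thread_info(tsk)->status &= ~TS_USEDFPU;
        stts();                         /* re-arm the device-not-available trap */
}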
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a62c201c97ec..183c5925a9fe 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
620 620
621 if (cpu_khz) { 621 if (cpu_khz) {
622 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; 622 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
623 *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); 623 *offset = ns_now - mult_frac(tsc_now, *scale,
624 (1UL << CYC2NS_SCALE_FACTOR));
624 } 625 }
625 626
626 sched_clock_idle_wakeup_event(0); 627 sched_clock_idle_wakeup_event(0);
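The set_cyc2ns_scale() hunk above replaces the shift (tsc_now * *scale) >> CYC2NS_SCALE_FACTOR with mult_frac(tsc_now, *scale, 1UL << CYC2NS_SCALE_FACTOR), which computes x * numer / denom without letting the intermediate product overflow. The real mult_frac() is a type-generic macro in linux/kernel.h; the arithmetic it relies on, as a sketch:

static inline unsigned long long
mult_frac_sketch(unsigned long long x, unsigned long long numer,
                 unsigned long long denom)
{
        unsigned long long quot = x / denom;
        unsigned long long rem  = x % denom;

        /* x * numer / denom == quot * numer + (rem * numer) / denom,
         * and rem < denom keeps the second product small */
        return quot * numer + (rem * numer) / denom;
}

For the cyc2ns case denom is only 1 << CYC2NS_SCALE_FACTOR, so the result matches the old shift while the cycle count is small, but it stays correct after the 64-bit product tsc_now * *scale would have wrapped.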
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a3911343976b..711091114119 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
47 if (!fx) 47 if (!fx)
48 return; 48 return;
49 49
50 BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); 50 BUG_ON(__thread_has_fpu(tsk));
51 51
52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; 52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
53 53
@@ -168,7 +168,7 @@ int save_i387_xstate(void __user *buf)
168 if (!used_math()) 168 if (!used_math())
169 return 0; 169 return 0;
170 170
171 if (task_thread_info(tsk)->status & TS_USEDFPU) { 171 if (user_has_fpu()) {
172 if (use_xsave()) 172 if (use_xsave())
173 err = xsave_user(buf); 173 err = xsave_user(buf);
174 else 174 else
@@ -176,8 +176,7 @@ int save_i387_xstate(void __user *buf)
176 176
177 if (err) 177 if (err)
178 return err; 178 return err;
179 task_thread_info(tsk)->status &= ~TS_USEDFPU; 179 user_fpu_end();
180 stts();
181 } else { 180 } else {
182 sanitize_i387_state(tsk); 181 sanitize_i387_state(tsk);
183 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, 182 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
@@ -292,10 +291,7 @@ int restore_i387_xstate(void __user *buf)
292 return err; 291 return err;
293 } 292 }
294 293
295 if (!(task_thread_info(current)->status & TS_USEDFPU)) { 294 user_fpu_begin();
296 clts();
297 task_thread_info(current)->status |= TS_USEDFPU;
298 }
299 if (use_xsave()) 295 if (use_xsave())
300 err = restore_user_xstate(buf); 296 err = restore_user_xstate(buf);
301 else 297 else