Diffstat (limited to 'arch/x86/kernel')
26 files changed, 1673 insertions(+), 814 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5369059c07a9..532d2e090e6f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
 obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_OPTPROBES) += kprobes-opt.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
 obj-$(CONFIG_KGDB) += kgdb.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f4773f4aae35..0a44b90602b0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>

 #include <linux/io.h>
+#include <linux/sched.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
@@ -456,6 +457,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
        if (c->x86_power & (1 << 8)) {
                set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+               if (!check_tsc_unstable())
+                       sched_clock_stable = 1;
        }

 #ifdef CONFIG_X86_64
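The early_init_amd() hunk above keys off c->x86_power bit 8, which the kernel fills from CPUID leaf 0x80000007 (EDX bit 8, the invariant-TSC flag); only when that bit is set, and the TSC has not already been flagged unstable, is sched_clock marked stable. A minimal userspace sketch of the same hardware check (uses GCC's cpuid.h; the check itself is standard, the program is illustrative):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* CPUID 0x80000007: Advanced Power Management information */
            if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx))
                    return 1;

            /* EDX bit 8: invariant TSC (CONSTANT_TSC + NONSTOP_TSC) */
            printf("invariant TSC: %s\n", (edx & (1 << 8)) ? "yes" : "no");
            return 0;
    }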
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d43cad74f166..c0f7d68d318f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1044,6 +1044,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =

 DEFINE_PER_CPU(unsigned int, irq_count) = -1;

+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
+
 /*
  * Special IST stacks which the CPU switches to when it calls
  * an IST-marked descriptor entry. Up to 7 stacks (hardware
@@ -1111,6 +1114,8 @@ void debug_stack_reset(void)

 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+EXPORT_PER_CPU_SYMBOL(fpu_owner_task);

 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b45e5e7a901..73d08ed98a64 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)
        l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }

-static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
-                                       int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
 {
        int node;

@@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
 #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))

 #ifdef CONFIG_SMP
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+
+static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
 {
-       struct _cpuid4_info *this_leaf, *sibling_leaf;
-       unsigned long num_threads_sharing;
-       int index_msb, i, sibling;
+       struct _cpuid4_info *this_leaf;
+       int ret, i, sibling;
        struct cpuinfo_x86 *c = &cpu_data(cpu);

-       if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
+       ret = 0;
+       if (index == 3) {
+               ret = 1;
                for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
                        if (!per_cpu(ici_cpuid4_info, i))
                                continue;
@@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
                                set_bit(sibling, this_leaf->shared_cpu_map);
                        }
                }
-               return;
+       } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) {
+               ret = 1;
+               for_each_cpu(i, cpu_sibling_mask(cpu)) {
+                       if (!per_cpu(ici_cpuid4_info, i))
+                               continue;
+                       this_leaf = CPUID4_INFO_IDX(i, index);
+                       for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+                               if (!cpu_online(sibling))
+                                       continue;
+                               set_bit(sibling, this_leaf->shared_cpu_map);
+                       }
+               }
        }
+
+       return ret;
+}
+
+static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+{
+       struct _cpuid4_info *this_leaf, *sibling_leaf;
+       unsigned long num_threads_sharing;
+       int index_msb, i;
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+       if (c->x86_vendor == X86_VENDOR_AMD) {
+               if (cache_shared_amd_cpu_map_setup(cpu, index))
+                       return;
+       }
+
        this_leaf = CPUID4_INFO_IDX(cpu, index);
        num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 786e76a86322..e4eeaaf58a47 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -528,6 +528,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)

        sprintf(name, "threshold_bank%i", bank);

+#ifdef CONFIG_SMP
        if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {   /* symlink */
                i = cpumask_first(cpu_llc_shared_mask(cpu));
@@ -553,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)

                goto out;
        }
+#endif

        b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
        if (!b) {
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5adce1040b11..0a18d16cb58d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/bitops.h>
+#include <linux/device.h>

 #include <asm/apic.h>
 #include <asm/stacktrace.h>
@@ -31,6 +32,7 @@
 #include <asm/compat.h>
 #include <asm/smp.h>
 #include <asm/alternative.h>
+#include <asm/timer.h>

 #include "perf_event.h"

@@ -351,6 +353,36 @@ int x86_setup_perfctr(struct perf_event *event)
        return 0;
 }

+/*
+ * check that branch_sample_type is compatible with
+ * settings needed for precise_ip > 1 which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+       u64 m = event->attr.branch_sample_type;
+       u64 b = 0;
+
+       /* must capture all branches */
+       if (!(m & PERF_SAMPLE_BRANCH_ANY))
+               return 0;
+
+       m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+       if (!event->attr.exclude_user)
+               b |= PERF_SAMPLE_BRANCH_USER;
+
+       if (!event->attr.exclude_kernel)
+               b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+       /*
+        * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+        */
+
+       return m == b;
+}
+
 int x86_pmu_hw_config(struct perf_event *event)
 {
        if (event->attr.precise_ip) {
@@ -367,6 +399,36 @@ int x86_pmu_hw_config(struct perf_event *event)

                if (event->attr.precise_ip > precise)
                        return -EOPNOTSUPP;
+               /*
+                * check that PEBS LBR correction does not conflict with
+                * whatever the user is asking with attr->branch_sample_type
+                */
+               if (event->attr.precise_ip > 1) {
+                       u64 *br_type = &event->attr.branch_sample_type;
+
+                       if (has_branch_stack(event)) {
+                               if (!precise_br_compat(event))
+                                       return -EOPNOTSUPP;
+
+                               /* branch_sample_type is compatible */
+
+                       } else {
+                               /*
+                                * user did not specify branch_sample_type
+                                *
+                                * For PEBS fixups, we capture all
+                                * the branches at the priv level of the
+                                * event.
+                                */
+                               *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+                               if (!event->attr.exclude_user)
+                                       *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+                               if (!event->attr.exclude_kernel)
+                                       *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+                       }
+               }
        }

        /*
@@ -424,6 +486,10 @@ static int __x86_pmu_event_init(struct perf_event *event)
        /* mark unused */
        event->hw.extra_reg.idx = EXTRA_REG_NONE;

+       /* mark not used */
+       event->hw.extra_reg.idx = EXTRA_REG_NONE;
+       event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
        return x86_pmu.hw_config(event);
 }

@@ -1210,6 +1276,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
                break;

        case CPU_STARTING:
+               if (x86_pmu.attr_rdpmc)
+                       set_in_cr4(X86_CR4_PCE);
                if (x86_pmu.cpu_starting)
                        x86_pmu.cpu_starting(cpu);
                break;
@@ -1319,6 +1387,8 @@ static int __init init_hw_perf_events(void)
                }
        }

+       x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
        pr_info("... version:           %d\n", x86_pmu.version);
        pr_info("... bit width:         %d\n", x86_pmu.cntval_bits);
        pr_info("... generic registers: %d\n", x86_pmu.num_counters);
@@ -1542,23 +1612,106 @@ static int x86_pmu_event_init(struct perf_event *event)
        return err;
 }

+static int x86_pmu_event_idx(struct perf_event *event)
+{
+       int idx = event->hw.idx;
+
+       if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
+               idx -= X86_PMC_IDX_FIXED;
+               idx |= 1 << 30;
+       }
+
+       return idx + 1;
+}
+
+static ssize_t get_attr_rdpmc(struct device *cdev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
+static void change_rdpmc(void *info)
+{
+       bool enable = !!(unsigned long)info;
+
+       if (enable)
+               set_in_cr4(X86_CR4_PCE);
+       else
+               clear_in_cr4(X86_CR4_PCE);
+}
+
+static ssize_t set_attr_rdpmc(struct device *cdev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t count)
+{
+       unsigned long val = simple_strtoul(buf, NULL, 0);
+
+       if (!!val != !!x86_pmu.attr_rdpmc) {
+               x86_pmu.attr_rdpmc = !!val;
+               smp_call_function(change_rdpmc, (void *)val, 1);
+       }
+
+       return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+       &dev_attr_rdpmc.attr,
+       NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group = {
+       .attrs = x86_pmu_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+       &x86_pmu_attr_group,
+       NULL,
+};
+
+static void x86_pmu_flush_branch_stack(void)
+{
+       if (x86_pmu.flush_branch_stack)
+               x86_pmu.flush_branch_stack();
+}
+
 static struct pmu pmu = {
        .pmu_enable = x86_pmu_enable,
        .pmu_disable = x86_pmu_disable,
+
+       .attr_groups = x86_pmu_attr_groups,

        .event_init = x86_pmu_event_init,

        .add = x86_pmu_add,
        .del = x86_pmu_del,
        .start = x86_pmu_start,
        .stop = x86_pmu_stop,
        .read = x86_pmu_read,

        .start_txn = x86_pmu_start_txn,
        .cancel_txn = x86_pmu_cancel_txn,
        .commit_txn = x86_pmu_commit_txn,
+
+       .event_idx = x86_pmu_event_idx,
+       .flush_branch_stack = x86_pmu_flush_branch_stack,
 };

+void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+{
+       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+               return;
+
+       if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+               return;
+
+       userpg->time_mult = this_cpu_read(cyc2ns);
+       userpg->time_shift = CYC2NS_SCALE_FACTOR;
+       userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
+}
+
 /*
  * callchain support
  */
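Taken together, the perf_event.c hunks wire up three things: a sysfs knob (/sys/devices/cpu/rdpmc) that toggles CR4.PCE on every CPU, an event_idx method that reports which counter an event occupies (biased by one so zero can mean "no counter", with bit 30 marking fixed counters, matching the RDPMC ECX encoding), and TSC conversion fields in the mmap control page. A hedged sketch of the userspace half of that handshake; the helper name is mine, only the RDPMC instruction semantics are assumed:

    #include <stdint.h>

    /*
     * Read a counter directly with RDPMC. 'idx' is the value that
     * x86_pmu_event_idx() exports, minus one; bit 30 selects the
     * fixed-counter bank, exactly as the RDPMC ECX encoding requires.
     * Only works while /sys/devices/cpu/rdpmc is 1 (CR4.PCE set),
     * otherwise the instruction faults.
     */
    static inline uint64_t rdpmc_read(uint32_t idx)
    {
            uint32_t lo, hi;

            __asm__ __volatile__("rdpmc" : "=a" (lo), "=d" (hi) : "c" (idx));
            return ((uint64_t)hi << 32) | lo;
    }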
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8944062f46e2..8484e77c211e 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -33,6 +33,7 @@ enum extra_reg_type {

        EXTRA_REG_RSP_0 = 0,    /* offcore_response_0 */
        EXTRA_REG_RSP_1 = 1,    /* offcore_response_1 */
+       EXTRA_REG_LBR   = 2,    /* lbr_select */

        EXTRA_REG_MAX           /* number of entries needed */
 };
@@ -130,6 +131,8 @@ struct cpu_hw_events {
        void *lbr_context;
        struct perf_branch_stack lbr_stack;
        struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+       struct er_account *lbr_sel;
+       u64 br_sel;

        /*
         * Intel host/guest exclude bits
@@ -147,7 +150,9 @@ struct cpu_hw_events {
        /*
         * AMD specific bits
         */
        struct amd_nb *amd_nb;
+       /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+       u64 perf_ctr_virt_mask;

        void *kfree_on_online;
 };
@@ -266,6 +271,29 @@ struct x86_pmu_quirk {
        void (*func)(void);
 };

+union x86_pmu_config {
+       struct {
+               u64 event:8,
+                   umask:8,
+                   usr:1,
+                   os:1,
+                   edge:1,
+                   pc:1,
+                   interrupt:1,
+                   __reserved1:1,
+                   en:1,
+                   inv:1,
+                   cmask:8,
+                   event2:4,
+                   __reserved2:4,
+                   go:1,
+                   ho:1;
+       } bits;
+       u64 value;
+};
+
+#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -307,10 +335,19 @@ struct x86_pmu {
        struct x86_pmu_quirk *quirks;
        int perfctr_second_write;

+       /*
+        * sysfs attrs
+        */
+       int attr_rdpmc;
+
+       /*
+        * CPU Hotplug hooks
+        */
        int (*cpu_prepare)(int cpu);
        void (*cpu_starting)(int cpu);
        void (*cpu_dying)(int cpu);
        void (*cpu_dead)(int cpu);
+       void (*flush_branch_stack)(void);

        /*
         * Intel Arch Perfmon v2+
@@ -332,6 +369,8 @@ struct x86_pmu {
         */
        unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
        int lbr_nr;                              /* hardware stack size */
+       u64 lbr_sel_mask;                        /* LBR_SELECT valid bits */
+       const int *lbr_sel_map;                  /* lbr_select mappings */

        /*
         * Extra registers for events
@@ -417,9 +456,11 @@ void x86_pmu_disable_all(void);
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
                                          u64 enable_mask)
 {
+       u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+
        if (hwc->extra_reg.reg)
                wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
-       wrmsrl(hwc->config_base, hwc->config | enable_mask);
+       wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
 }

 void x86_pmu_enable_all(int added);
@@ -443,6 +484,15 @@ extern struct event_constraint emptyconstraint;

 extern struct event_constraint unconstrained;

+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+       return ip > PAGE_OFFSET;
+#else
+       return (long)ip < 0;
+#endif
+}
+
 #ifdef CONFIG_CPU_SUP_AMD

 int amd_pmu_init(void);
@@ -523,6 +573,10 @@ void intel_pmu_lbr_init_nhm(void);

 void intel_pmu_lbr_init_atom(void);

+void intel_pmu_lbr_init_snb(void);
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event);
+
 int p4_pmu_init(void);

 int p6_pmu_init(void);
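The X86_CONFIG() helper added above replaces raw event-select constants with named fields (this relies on GCC's LSB-first bitfield layout on x86, which the kernel assumes throughout). As a worked check, the INST_RETIRED.TOTAL_CYCLES constant that intel_pmu_hw_config() used to hard-code decomposes as:

    X86_CONFIG(.event=0xc0, .inv=1, .cmask=16)
        = 0xc0                  /* event, bits  0-7  */
        | 1ULL  << 23           /* inv               */
        | 16ULL << 24           /* cmask, bits 24-31 */
        = 0x108000c0

and the stalled-cycles encoding rewritten later in this series:

    X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1)
        = 0x0e | 0x01 << 8 | 1ULL << 23 | 1ULL << 24
        = 0x0180010e

matching the 0x108000c0 and 0x180010e literals that the perf_event_intel.c hunks below delete.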
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 0397b23be8e9..dd002faff7a6 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,4 +1,5 @@
 #include <linux/perf_event.h>
+#include <linux/export.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -138,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event)
        if (ret)
                return ret;

+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
        if (event->attr.exclude_host && event->attr.exclude_guest)
                /*
                 * When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -357,7 +361,9 @@ static void amd_pmu_cpu_starting(int cpu)
        struct amd_nb *nb;
        int i, nb_id;

-       if (boot_cpu_data.x86_max_cores < 2)
+       cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+
+       if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15)
                return;

        nb_id = amd_get_nb_id(cpu);
@@ -587,9 +593,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
        .put_event_constraints = amd_put_event_constraints,

        .cpu_prepare = amd_pmu_cpu_prepare,
-       .cpu_starting = amd_pmu_cpu_starting,
        .cpu_dead = amd_pmu_cpu_dead,
 #endif
+       .cpu_starting = amd_pmu_cpu_starting,
 };

 __init int amd_pmu_init(void)
@@ -621,3 +627,33 @@ __init int amd_pmu_init(void)

        return 0;
 }
+
+void amd_pmu_enable_virt(void)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+       cpuc->perf_ctr_virt_mask = 0;
+
+       /* Reload all events */
+       x86_pmu_disable_all();
+       x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+       /*
+        * We only mask out the Host-only bit so that host-only counting works
+        * when SVM is disabled. If someone sets up a guest-only counter when
+        * SVM is disabled the Guest-only bit still gets set and the counter
+        * will not count anything.
+        */
+       cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
+
+       /* Reload all events */
+       x86_pmu_disable_all();
+       x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
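The two exported functions above are designed to be flipped by the virtualization host around enabling and disabling SVM, so that the Host-only bit in the event-select MSR is only written while it can actually take effect. A hedged sketch of the expected calling convention; the function names and exact call sites here are assumptions, not taken from this diff:

    /* hypothetical hypervisor-side hooks */
    static int my_svm_hardware_enable(void)
    {
            /* ... set EFER.SVME ... */
            amd_pmu_enable_virt();  /* counters may now carry the HO/GO bits */
            return 0;
    }

    static void my_svm_hardware_disable(void)
    {
            amd_pmu_disable_virt(); /* mask the Host-only bit out again */
            /* ... clear EFER.SVME ... */
    }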
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3bd37bdf1b8e..6a84e7f28f05 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -385,14 +385,15 @@ static __initconst const u64 westmere_hw_cache_event_ids
 #define NHM_LOCAL_DRAM         (1 << 14)
 #define NHM_NON_DRAM           (1 << 15)

-#define NHM_ALL_DRAM           (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
+#define NHM_LOCAL              (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_REMOTE             (NHM_REMOTE_DRAM)

 #define NHM_DMND_READ          (NHM_DMND_DATA_RD)
 #define NHM_DMND_WRITE         (NHM_DMND_RFO|NHM_DMND_WB)
 #define NHM_DMND_PREFETCH      (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)

 #define NHM_L3_HIT     (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
-#define NHM_L3_MISS    (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_MISS    (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
 #define NHM_L3_ACCESS  (NHM_L3_HIT|NHM_L3_MISS)

 static __initconst const u64 nehalem_hw_cache_extra_regs
@@ -416,16 +417,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
        },
        [ C(NODE) ] = {
                [ C(OP_READ) ] = {
-                       [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
-                       [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
+                       [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
+                       [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE,
                },
                [ C(OP_WRITE) ] = {
-                       [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
-                       [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
+                       [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
+                       [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE,
                },
                [ C(OP_PREFETCH) ] = {
-                       [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
-                       [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
+                       [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
+                       [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE,
                },
        },
 };
@@ -727,6 +728,19 @@ static __initconst const u64 atom_hw_cache_event_ids
        },
 };

+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+       /* user explicitly requested branch sampling */
+       if (has_branch_stack(event))
+               return true;
+
+       /* implicit branch sampling to correct PEBS skid */
+       if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+               return true;
+
+       return false;
+}
+
 static void intel_pmu_disable_all(void)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -881,6 +895,13 @@ static void intel_pmu_disable_event(struct perf_event *event)
        cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
        cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);

+       /*
+        * must disable the LBR before the actual event,
+        * because any event may be combined with the LBR
+        */
+       if (intel_pmu_needs_lbr_smpl(event))
+               intel_pmu_lbr_disable(event);
+
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                intel_pmu_disable_fixed(hwc);
                return;
@@ -935,6 +956,12 @@ static void intel_pmu_enable_event(struct perf_event *event)
                intel_pmu_enable_bts(hwc->config);
                return;
        }
+       /*
+        * must be enabled before the actual event,
+        * because any event may be combined with the LBR
+        */
+       if (intel_pmu_needs_lbr_smpl(event))
+               intel_pmu_lbr_enable(event);

        if (event->attr.exclude_host)
                cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
@@ -1057,6 +1084,9 @@ again:

                data.period = event->hw.last_period;

+               if (has_branch_stack(event))
+                       data.br_stack = &cpuc->lbr_stack;
+
                if (perf_event_overflow(event, &data, regs))
                        x86_pmu_stop(event, 0);
        }
@@ -1123,17 +1153,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
  */
 static struct event_constraint *
 __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
-                                  struct perf_event *event)
+                                  struct perf_event *event,
+                                  struct hw_perf_event_extra *reg)
 {
        struct event_constraint *c = &emptyconstraint;
-       struct hw_perf_event_extra *reg = &event->hw.extra_reg;
        struct er_account *era;
        unsigned long flags;
        int orig_idx = reg->idx;

        /* already allocated shared msr */
        if (reg->alloc)
-               return &unconstrained;
+               return NULL; /* call x86_get_event_constraints() */

 again:
        era = &cpuc->shared_regs->regs[reg->idx];
@@ -1156,14 +1186,10 @@ again:
                reg->alloc = 1;

                /*
-                * All events using extra_reg are unconstrained.
-                * Avoids calling x86_get_event_constraints()
-                *
-                * Must revisit if extra_reg controlling events
-                * ever have constraints. Worst case we go through
-                * the regular event constraint table.
+                * need to call x86_get_event_constraints()
+                * to check if the associated event has constraints
                 */
-               c = &unconstrained;
+               c = NULL;
        } else if (intel_try_alt_er(event, orig_idx)) {
                raw_spin_unlock_irqrestore(&era->lock, flags);
                goto again;
@@ -1200,11 +1226,23 @@ static struct event_constraint *
 intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
                              struct perf_event *event)
 {
-       struct event_constraint *c = NULL;
-
-       if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
-               c = __intel_shared_reg_get_constraints(cpuc, event);
-
+       struct event_constraint *c = NULL, *d;
+       struct hw_perf_event_extra *xreg, *breg;
+
+       xreg = &event->hw.extra_reg;
+       if (xreg->idx != EXTRA_REG_NONE) {
+               c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
+               if (c == &emptyconstraint)
+                       return c;
+       }
+       breg = &event->hw.branch_reg;
+       if (breg->idx != EXTRA_REG_NONE) {
+               d = __intel_shared_reg_get_constraints(cpuc, event, breg);
+               if (d == &emptyconstraint) {
+                       __intel_shared_reg_put_constraints(cpuc, xreg);
+                       c = d;
+               }
+       }
        return c;
 }

@@ -1252,6 +1290,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
        reg = &event->hw.extra_reg;
        if (reg->idx != EXTRA_REG_NONE)
                __intel_shared_reg_put_constraints(cpuc, reg);
+
+       reg = &event->hw.branch_reg;
+       if (reg->idx != EXTRA_REG_NONE)
+               __intel_shared_reg_put_constraints(cpuc, reg);
 }

 static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
@@ -1287,12 +1329,19 @@ static int intel_pmu_hw_config(struct perf_event *event)
                 *
                 * Thereby we gain a PEBS capable cycle counter.
                 */
-               u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
+               u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+

                alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
                event->hw.config = alt_config;
        }

+       if (intel_pmu_needs_lbr_smpl(event)) {
+               ret = intel_pmu_setup_lbr_filter(event);
+               if (ret)
+                       return ret;
+       }
+
        if (event->attr.type != PERF_TYPE_RAW)
                return 0;

@@ -1431,7 +1480,7 @@ static int intel_pmu_cpu_prepare(int cpu)
 {
        struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);

-       if (!x86_pmu.extra_regs)
+       if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
                return NOTIFY_OK;

        cpuc->shared_regs = allocate_shared_regs(cpu);
@@ -1453,22 +1502,28 @@ static void intel_pmu_cpu_starting(int cpu)
         */
        intel_pmu_lbr_reset();

-       if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
+       cpuc->lbr_sel = NULL;
+
+       if (!cpuc->shared_regs)
                return;

-       for_each_cpu(i, topology_thread_cpumask(cpu)) {
-               struct intel_shared_regs *pc;
+       if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+               for_each_cpu(i, topology_thread_cpumask(cpu)) {
+                       struct intel_shared_regs *pc;

-               pc = per_cpu(cpu_hw_events, i).shared_regs;
-               if (pc && pc->core_id == core_id) {
-                       cpuc->kfree_on_online = cpuc->shared_regs;
-                       cpuc->shared_regs = pc;
-                       break;
+                       pc = per_cpu(cpu_hw_events, i).shared_regs;
+                       if (pc && pc->core_id == core_id) {
+                               cpuc->kfree_on_online = cpuc->shared_regs;
+                               cpuc->shared_regs = pc;
+                               break;
+                       }
                }
+               cpuc->shared_regs->core_id = core_id;
+               cpuc->shared_regs->refcnt++;
        }

-       cpuc->shared_regs->core_id = core_id;
-       cpuc->shared_regs->refcnt++;
+       if (x86_pmu.lbr_sel_map)
+               cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
 }

 static void intel_pmu_cpu_dying(int cpu)
@@ -1486,6 +1541,18 @@ static void intel_pmu_cpu_dying(int cpu)
        fini_debug_store_on_cpu(cpu);
 }

+static void intel_pmu_flush_branch_stack(void)
+{
+       /*
+        * Intel LBR does not tag entries with the
+        * PID of the current task, so we need to
+        * flush it on a context switch.
+        * For now, we simply reset it.
+        */
+       if (x86_pmu.lbr_nr)
+               intel_pmu_lbr_reset();
+}
+
 static __initconst const struct x86_pmu intel_pmu = {
        .name = "Intel",
        .handle_irq = intel_pmu_handle_irq,
@@ -1513,6 +1580,7 @@ static __initconst const struct x86_pmu intel_pmu = {
        .cpu_starting = intel_pmu_cpu_starting,
        .cpu_dying = intel_pmu_cpu_dying,
        .guest_get_msrs = intel_guest_get_msrs,
+       .flush_branch_stack = intel_pmu_flush_branch_stack,
 };

 static __init void intel_clovertown_quirk(void)
@@ -1689,9 +1757,11 @@ __init int intel_pmu_init(void)
                x86_pmu.extra_regs = intel_nehalem_extra_regs;

                /* UOPS_ISSUED.STALLED_CYCLES */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
                /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);

                x86_add_quirk(intel_nehalem_quirk);

@@ -1726,9 +1796,11 @@ __init int intel_pmu_init(void)
                x86_pmu.er_flags |= ERF_HAS_RSP_1;

                /* UOPS_ISSUED.STALLED_CYCLES */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
                /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);

                pr_cont("Westmere events, ");
                break;
@@ -1739,7 +1811,7 @@ __init int intel_pmu_init(void)
                memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
                       sizeof(hw_cache_event_ids));

-               intel_pmu_lbr_init_nhm();
+               intel_pmu_lbr_init_snb();

                x86_pmu.event_constraints = intel_snb_event_constraints;
                x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
@@ -1749,9 +1821,11 @@ __init int intel_pmu_init(void)
                x86_pmu.er_flags |= ERF_NO_HT_SHARING;

                /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
                /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);

                pr_cont("SandyBridge events, ");
                break;
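The Intel side now honors branch_sample_type both explicitly and implicitly (for precise_ip > 1). A hedged userspace sketch of an explicit request; the PERF_SAMPLE_BRANCH_* constants come from the uapi half of this same series, so this assumes a kernel carrying these patches:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <unistd.h>

    /* open a cycles event that samples the LBR-backed branch stack */
    static int open_branch_sampling_event(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000;
            attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
            /* all taken branches, at the user privilege level only */
            attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
                                      PERF_SAMPLE_BRANCH_USER;

            /* measure the calling thread on any CPU */
            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }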
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index d6bd49faa40c..7f64df19e7dd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -3,6 +3,7 @@
 #include <linux/slab.h>

 #include <asm/perf_event.h>
+#include <asm/insn.h>

 #include "perf_event.h"

@@ -439,9 +440,6 @@ void intel_pmu_pebs_enable(struct perf_event *event)
        hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;

        cpuc->pebs_enabled |= 1ULL << hwc->idx;
-
-       if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
-               intel_pmu_lbr_enable(event);
 }

 void intel_pmu_pebs_disable(struct perf_event *event)
@@ -454,9 +452,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
        wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);

        hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
-
-       if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
-               intel_pmu_lbr_disable(event);
 }

 void intel_pmu_pebs_enable_all(void)
@@ -475,17 +470,6 @@ void intel_pmu_pebs_disable_all(void)
        wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 }

-#include <asm/insn.h>
-
-static inline bool kernel_ip(unsigned long ip)
-{
-#ifdef CONFIG_X86_32
-       return ip > PAGE_OFFSET;
-#else
-       return (long)ip < 0;
-#endif
-}
-
 static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -572,6 +556,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
         * both formats and we don't use the other fields in this
         * routine.
         */
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        struct pebs_record_core *pebs = __pebs;
        struct perf_sample_data data;
        struct pt_regs regs;
@@ -602,6 +587,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
        else
                regs.flags &= ~PERF_EFLAGS_EXACT;

+       if (has_branch_stack(event))
+               data.br_stack = &cpuc->lbr_stack;
+
        if (perf_event_overflow(event, &data, &regs))
                x86_pmu_stop(event, 0);
 }
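With kernel_ip() hoisted into perf_event.h (removed from this file above), the PEBS fixup and the new LBR software filter share one predicate. The idea behind it, as a self-contained sketch (the 32-bit PAGE_OFFSET value is the common 3G/1G split, an assumption here):

    #include <stdbool.h>

    /* x86-64: the kernel lives in the upper canonical half, so a set
     * sign bit marks a kernel address */
    static bool kernel_ip_64(unsigned long ip)
    {
            return (long)ip < 0;    /* e.g. 0xffffffff81000000 -> true */
    }

    /* x86-32: the split is at PAGE_OFFSET (typically 0xc0000000) */
    static bool kernel_ip_32(unsigned long ip)
    {
            return ip > 0xc0000000UL;
    }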
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 47a7e63bfe54..520b4265fcd2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -3,6 +3,7 @@

 #include <asm/perf_event.h>
 #include <asm/msr.h>
+#include <asm/insn.h>

 #include "perf_event.h"

@@ -14,6 +15,100 @@ enum {
 };

 /*
+ * Intel LBR_SELECT bits
+ * Intel Vol3a, April 2011, Section 16.7 Table 16-10
+ *
+ * Hardware branch filter (not available on all CPUs)
+ */
+#define LBR_KERNEL_BIT         0 /* do not capture at ring0 */
+#define LBR_USER_BIT           1 /* do not capture at ring > 0 */
+#define LBR_JCC_BIT            2 /* do not capture conditional branches */
+#define LBR_REL_CALL_BIT       3 /* do not capture relative calls */
+#define LBR_IND_CALL_BIT       4 /* do not capture indirect calls */
+#define LBR_RETURN_BIT         5 /* do not capture near returns */
+#define LBR_IND_JMP_BIT        6 /* do not capture indirect jumps */
+#define LBR_REL_JMP_BIT        7 /* do not capture relative jumps */
+#define LBR_FAR_BIT            8 /* do not capture far branches */
+
+#define LBR_KERNEL     (1 << LBR_KERNEL_BIT)
+#define LBR_USER       (1 << LBR_USER_BIT)
+#define LBR_JCC        (1 << LBR_JCC_BIT)
+#define LBR_REL_CALL   (1 << LBR_REL_CALL_BIT)
+#define LBR_IND_CALL   (1 << LBR_IND_CALL_BIT)
+#define LBR_RETURN     (1 << LBR_RETURN_BIT)
+#define LBR_REL_JMP    (1 << LBR_REL_JMP_BIT)
+#define LBR_IND_JMP    (1 << LBR_IND_JMP_BIT)
+#define LBR_FAR        (1 << LBR_FAR_BIT)
+
+#define LBR_PLM        (LBR_KERNEL | LBR_USER)
+
+#define LBR_SEL_MASK   0x1ff   /* valid bits in LBR_SELECT */
+#define LBR_NOT_SUPP   -1      /* LBR filter not supported */
+#define LBR_IGN        0       /* ignored */
+
+#define LBR_ANY                 \
+       (LBR_JCC        |\
+        LBR_REL_CALL   |\
+        LBR_IND_CALL   |\
+        LBR_RETURN     |\
+        LBR_REL_JMP    |\
+        LBR_IND_JMP    |\
+        LBR_FAR)
+
+#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+
+#define for_each_branch_sample_type(x) \
+       for ((x) = PERF_SAMPLE_BRANCH_USER; \
+            (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
+
+/*
+ * x86 control flow change classification
+ * x86 control flow changes include branches, interrupts, traps, faults
+ */
+enum {
+       X86_BR_NONE     = 0,      /* unknown */
+
+       X86_BR_USER     = 1 << 0, /* branch target is user */
+       X86_BR_KERNEL   = 1 << 1, /* branch target is kernel */
+
+       X86_BR_CALL     = 1 << 2, /* call */
+       X86_BR_RET      = 1 << 3, /* return */
+       X86_BR_SYSCALL  = 1 << 4, /* syscall */
+       X86_BR_SYSRET   = 1 << 5, /* syscall return */
+       X86_BR_INT      = 1 << 6, /* sw interrupt */
+       X86_BR_IRET     = 1 << 7, /* return from interrupt */
+       X86_BR_JCC      = 1 << 8, /* conditional */
+       X86_BR_JMP      = 1 << 9, /* jump */
+       X86_BR_IRQ      = 1 << 10,/* hw interrupt or trap or fault */
+       X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+
+#define X86_BR_ANY              \
+       (X86_BR_CALL            |\
+        X86_BR_RET             |\
+        X86_BR_SYSCALL         |\
+        X86_BR_SYSRET          |\
+        X86_BR_INT             |\
+        X86_BR_IRET            |\
+        X86_BR_JCC             |\
+        X86_BR_JMP             |\
+        X86_BR_IRQ             |\
+        X86_BR_IND_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL         \
+       (X86_BR_CALL            |\
+        X86_BR_IND_CALL        |\
+        X86_BR_SYSCALL         |\
+        X86_BR_IRQ             |\
+        X86_BR_INT)
+
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+
+/*
  * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
  * otherwise it becomes near impossible to get a reliable stack.
  */
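Every LBR_SELECT bit suppresses a branch class rather than enabling it, so intel_pmu_setup_hw_lbr_filter() further down stores the inverted request. A worked example with the constants just defined (the arithmetic is mine, not from the patch):

    /* request: all branch types, user privilege level only */
    mask   = LBR_USER | LBR_ANY;        /* 0x002 | 0x1fc = 0x1fe */
    config = ~mask & LBR_SEL_MASK;      /* ~0x1fe & 0x1ff = 0x001 */
           /* = LBR_KERNEL: the only suppress bit left set is
            * "do not capture at ring0" -- exactly the requested
            * user-only filter */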
@@ -21,6 +116,10 @@ enum {
 static void __intel_pmu_lbr_enable(void)
 {
        u64 debugctl;
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+       if (cpuc->lbr_sel)
+               wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);

        rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
        debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
@@ -76,11 +175,11 @@ void intel_pmu_lbr_enable(struct perf_event *event)
         * Reset the LBR stack if we changed task context to
         * avoid data leaks.
         */
-
        if (event->ctx->task && cpuc->lbr_context != event->ctx) {
                intel_pmu_lbr_reset();
                cpuc->lbr_context = event->ctx;
        }
+       cpuc->br_sel = event->hw.branch_reg.reg;

        cpuc->lbr_users++;
 }
@@ -95,8 +194,11 @@ void intel_pmu_lbr_disable(struct perf_event *event)
        cpuc->lbr_users--;
        WARN_ON_ONCE(cpuc->lbr_users < 0);

-       if (cpuc->enabled && !cpuc->lbr_users)
+       if (cpuc->enabled && !cpuc->lbr_users) {
                __intel_pmu_lbr_disable();
+               /* avoid stale pointer */
+               cpuc->lbr_context = NULL;
+       }
 }

 void intel_pmu_lbr_enable_all(void)
@@ -115,6 +217,9 @@ void intel_pmu_lbr_disable_all(void)
                __intel_pmu_lbr_disable();
 }

+/*
+ * TOS = most recently recorded branch
+ */
 static inline u64 intel_pmu_lbr_tos(void)
 {
        u64 tos;
@@ -142,15 +247,15 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)

                rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);

                cpuc->lbr_entries[i].from = msr_lastbranch.from;
                cpuc->lbr_entries[i].to   = msr_lastbranch.to;
-               cpuc->lbr_entries[i].flags = 0;
+               cpuc->lbr_entries[i].mispred   = 0;
+               cpuc->lbr_entries[i].predicted = 0;
+               cpuc->lbr_entries[i].reserved  = 0;
        }
        cpuc->lbr_stack.nr = i;
 }

-#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
-
 /*
  * Due to lack of segmentation in Linux the effective address (offset)
  * is the same as the linear address, allowing us to merge the LIP and EIP
@@ -165,19 +270,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)

        for (i = 0; i < x86_pmu.lbr_nr; i++) {
                unsigned long lbr_idx = (tos - i) & mask;
-               u64 from, to, flags = 0;
+               u64 from, to, mis = 0, pred = 0;

                rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
                rdmsrl(x86_pmu.lbr_to + lbr_idx, to);

                if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
-                       flags = !!(from & LBR_FROM_FLAG_MISPRED);
+                       mis = !!(from & LBR_FROM_FLAG_MISPRED);
+                       pred = !mis;
                        from = (u64)((((s64)from) << 1) >> 1);
                }

                cpuc->lbr_entries[i].from = from;
                cpuc->lbr_entries[i].to   = to;
-               cpuc->lbr_entries[i].flags = flags;
+               cpuc->lbr_entries[i].mispred   = mis;
+               cpuc->lbr_entries[i].predicted = pred;
+               cpuc->lbr_entries[i].reserved  = 0;
        }
        cpuc->lbr_stack.nr = i;
 }
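With LBR_FORMAT_EIP_FLAGS the top bit of LBR_FROM carries the mispredict flag, and the address is recovered by sign extension (the shift-left/shift-right pair above). A sketch of the same split with a sample value:

    #include <stdint.h>

    /* bit 63 = mispredict flag, bits 62:0 = sign-extendable address */
    static void decode_lbr_from(uint64_t from)
    {
            int mispred = !!(from & (1ULL << 63));
            uint64_t addr = (uint64_t)(((int64_t)from << 1) >> 1);

            /* e.g. from = (1ULL << 63) | 0x00007f0012345678
             *   -> mispred = 1, addr = 0x00007f0012345678 */
            (void)mispred;
            (void)addr;
    }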
@@ -193,28 +301,404 @@ void intel_pmu_lbr_read(void) | |||
193 | intel_pmu_lbr_read_32(cpuc); | 301 | intel_pmu_lbr_read_32(cpuc); |
194 | else | 302 | else |
195 | intel_pmu_lbr_read_64(cpuc); | 303 | intel_pmu_lbr_read_64(cpuc); |
304 | |||
305 | intel_pmu_lbr_filter(cpuc); | ||
306 | } | ||
307 | |||
308 | /* | ||
309 | * SW filter is used: | ||
310 | * - in case there is no HW filter | ||
311 | * - in case the HW filter has errata or limitations | ||
312 | */ | ||
313 | static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) | ||
314 | { | ||
315 | u64 br_type = event->attr.branch_sample_type; | ||
316 | int mask = 0; | ||
317 | |||
318 | if (br_type & PERF_SAMPLE_BRANCH_USER) | ||
319 | mask |= X86_BR_USER; | ||
320 | |||
321 | if (br_type & PERF_SAMPLE_BRANCH_KERNEL) | ||
322 | mask |= X86_BR_KERNEL; | ||
323 | |||
324 | /* we ignore BRANCH_HV here */ | ||
325 | |||
326 | if (br_type & PERF_SAMPLE_BRANCH_ANY) | ||
327 | mask |= X86_BR_ANY; | ||
328 | |||
329 | if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) | ||
330 | mask |= X86_BR_ANY_CALL; | ||
331 | |||
332 | if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) | ||
333 | mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; | ||
334 | |||
335 | if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) | ||
336 | mask |= X86_BR_IND_CALL; | ||
337 | /* | ||
338 | * stash the actual user request into reg; it may | ||
339 | * be used by fixup code on some CPUs | ||
340 | */ | ||
341 | event->hw.branch_reg.reg = mask; | ||
342 | } | ||
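The mask built here encodes what user space requested through the new branch_sample_type field. A hedged user-space sketch of a request that exercises this path, sampling kernel-level call branches (the sample period is arbitrary):

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
            struct perf_event_attr attr;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000; /* arbitrary */
            attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
            attr.branch_sample_type = PERF_SAMPLE_BRANCH_KERNEL |
                                      PERF_SAMPLE_BRANCH_ANY_CALL;

            /* pid = 0 (this task), cpu = -1 (any CPU), no group, no flags */
            fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            if (fd < 0)
                    perror("perf_event_open");
            else
                    close(fd);
            return 0;
    }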
343 | |||
344 | /* | ||
345 | * Set up the HW LBR filter. | ||
346 | * Used only when available; it may not be enough to disambiguate | ||
347 | * all branches and may need the help of the SW filter | ||
348 | */ | ||
349 | static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) | ||
350 | { | ||
351 | struct hw_perf_event_extra *reg; | ||
352 | u64 br_type = event->attr.branch_sample_type; | ||
353 | u64 mask = 0, m; | ||
354 | u64 v; | ||
355 | |||
356 | for_each_branch_sample_type(m) { | ||
357 | if (!(br_type & m)) | ||
358 | continue; | ||
359 | |||
360 | v = x86_pmu.lbr_sel_map[m]; | ||
361 | if (v == LBR_NOT_SUPP) | ||
362 | return -EOPNOTSUPP; | ||
363 | |||
364 | if (v != LBR_IGN) | ||
365 | mask |= v; | ||
366 | } | ||
367 | reg = &event->hw.branch_reg; | ||
368 | reg->idx = EXTRA_REG_LBR; | ||
369 | |||
370 | /* LBR_SELECT operates in suppress mode so invert mask */ | ||
371 | reg->config = ~mask & x86_pmu.lbr_sel_mask; | ||
372 | |||
373 | return 0; | ||
374 | } | ||
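Since LBR_SELECT suppresses branch classes instead of enabling them, the last step above inverts the accumulated capture bits. A minimal sketch of that convention (the 9-bit valid mask is an illustrative assumption, not the documented MSR layout):

    #include <stdio.h>

    /* Capture bits in, suppress bits out; valid_mask limits the result
     * to the bits the MSR actually implements. */
    static unsigned long long to_lbr_select(unsigned long long capture,
                                            unsigned long long valid_mask)
    {
            return ~capture & valid_mask;
    }

    int main(void)
    {
            /* Request classes 0 and 3; everything else gets suppressed. */
            unsigned long long sel = to_lbr_select((1ULL << 0) | (1ULL << 3), 0x1ff);

            printf("LBR_SELECT = %#llx\n", sel); /* 0x1f6 */
            return 0;
    }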
375 | |||
376 | int intel_pmu_setup_lbr_filter(struct perf_event *event) | ||
377 | { | ||
378 | int ret = 0; | ||
379 | |||
380 | /* | ||
381 | * no LBR on this PMU | ||
382 | */ | ||
383 | if (!x86_pmu.lbr_nr) | ||
384 | return -EOPNOTSUPP; | ||
385 | |||
386 | /* | ||
387 | * set up the SW LBR filter | ||
388 | */ | ||
389 | intel_pmu_setup_sw_lbr_filter(event); | ||
390 | |||
391 | /* | ||
392 | * set up the HW LBR filter, if any | ||
393 | */ | ||
394 | if (x86_pmu.lbr_sel_map) | ||
395 | ret = intel_pmu_setup_hw_lbr_filter(event); | ||
396 | |||
397 | return ret; | ||
196 | } | 398 | } |
197 | 399 | ||
400 | /* | ||
401 | * Return the type of control flow change at address "from". | ||
402 | * The instruction is not necessarily a branch (e.g., in case of an interrupt). | ||
403 | * | ||
404 | * The branch type returned also includes the priv level of the | ||
405 | * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). | ||
406 | * | ||
407 | * If a branch type is unknown OR the instruction cannot be | ||
408 | * decoded (e.g., text page not present), then X86_BR_NONE is | ||
409 | * returned. | ||
410 | */ | ||
411 | static int branch_type(unsigned long from, unsigned long to) | ||
412 | { | ||
413 | struct insn insn; | ||
414 | void *addr; | ||
415 | int bytes, size = MAX_INSN_SIZE; | ||
416 | int ret = X86_BR_NONE; | ||
417 | int ext, to_plm, from_plm; | ||
418 | u8 buf[MAX_INSN_SIZE]; | ||
419 | int is64 = 0; | ||
420 | |||
421 | to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; | ||
422 | from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; | ||
423 | |||
424 | /* | ||
425 | * may be zero if the LBR did not fill up after a reset by the time | ||
426 | * we get a PMU interrupt | ||
427 | */ | ||
428 | if (from == 0 || to == 0) | ||
429 | return X86_BR_NONE; | ||
430 | |||
431 | if (from_plm == X86_BR_USER) { | ||
432 | /* | ||
433 | * this can happen when measuring at the user level only | ||
434 | * and we interrupt a kernel thread, e.g., idle. | ||
435 | */ | ||
436 | if (!current->mm) | ||
437 | return X86_BR_NONE; | ||
438 | |||
439 | /* may fail if text not present */ | ||
440 | bytes = copy_from_user_nmi(buf, (void __user *)from, size); | ||
441 | if (bytes != size) | ||
442 | return X86_BR_NONE; | ||
443 | |||
444 | addr = buf; | ||
445 | } else | ||
446 | addr = (void *)from; | ||
447 | |||
448 | /* | ||
449 | * the decoder needs to know the ABI, especially | ||
450 | * on 64-bit systems running 32-bit apps | ||
451 | */ | ||
452 | #ifdef CONFIG_X86_64 | ||
453 | is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); | ||
454 | #endif | ||
455 | insn_init(&insn, addr, is64); | ||
456 | insn_get_opcode(&insn); | ||
457 | |||
458 | switch (insn.opcode.bytes[0]) { | ||
459 | case 0xf: | ||
460 | switch (insn.opcode.bytes[1]) { | ||
461 | case 0x05: /* syscall */ | ||
462 | case 0x34: /* sysenter */ | ||
463 | ret = X86_BR_SYSCALL; | ||
464 | break; | ||
465 | case 0x07: /* sysret */ | ||
466 | case 0x35: /* sysexit */ | ||
467 | ret = X86_BR_SYSRET; | ||
468 | break; | ||
469 | case 0x80 ... 0x8f: /* conditional */ | ||
470 | ret = X86_BR_JCC; | ||
471 | break; | ||
472 | default: | ||
473 | ret = X86_BR_NONE; | ||
474 | } | ||
475 | break; | ||
476 | case 0x70 ... 0x7f: /* conditional */ | ||
477 | ret = X86_BR_JCC; | ||
478 | break; | ||
479 | case 0xc2: /* near ret */ | ||
480 | case 0xc3: /* near ret */ | ||
481 | case 0xca: /* far ret */ | ||
482 | case 0xcb: /* far ret */ | ||
483 | ret = X86_BR_RET; | ||
484 | break; | ||
485 | case 0xcf: /* iret */ | ||
486 | ret = X86_BR_IRET; | ||
487 | break; | ||
488 | case 0xcc ... 0xce: /* int */ | ||
489 | ret = X86_BR_INT; | ||
490 | break; | ||
491 | case 0xe8: /* call near rel */ | ||
492 | case 0x9a: /* call far absolute */ | ||
493 | ret = X86_BR_CALL; | ||
494 | break; | ||
495 | case 0xe0 ... 0xe3: /* loop jmp */ | ||
496 | ret = X86_BR_JCC; | ||
497 | break; | ||
498 | case 0xe9 ... 0xeb: /* jmp */ | ||
499 | ret = X86_BR_JMP; | ||
500 | break; | ||
501 | case 0xff: /* call near absolute, call far absolute ind */ | ||
502 | insn_get_modrm(&insn); | ||
503 | ext = (insn.modrm.bytes[0] >> 3) & 0x7; | ||
504 | switch (ext) { | ||
505 | case 2: /* near ind call */ | ||
506 | case 3: /* far ind call */ | ||
507 | ret = X86_BR_IND_CALL; | ||
508 | break; | ||
509 | case 4: | ||
510 | case 5: | ||
511 | ret = X86_BR_JMP; | ||
512 | break; | ||
513 | } | ||
514 | break; | ||
515 | default: | ||
516 | ret = X86_BR_NONE; | ||
517 | } | ||
518 | /* | ||
519 | * interrupts, traps, faults (and thus ring transitions) may | ||
520 | * occur on any instruction. Thus, to classify them correctly, | ||
521 | * we need to first look at the from and to priv levels. If they | ||
522 | * are different and to is in the kernel, then it indicates | ||
523 | * a ring transition. If the from instruction is not a ring | ||
524 | * transition instr (syscall, sysenter, int), then it means | ||
525 | * it was an irq, trap or fault. | ||
526 | * | ||
527 | * We have no way of detecting kernel-to-kernel faults. | ||
528 | */ | ||
529 | if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL | ||
530 | && ret != X86_BR_SYSCALL && ret != X86_BR_INT) | ||
531 | ret = X86_BR_IRQ; | ||
532 | |||
533 | /* | ||
534 | * the branch priv level is determined by the target, as | ||
535 | * is done by HW when LBR_SELECT is implemented | ||
536 | */ | ||
537 | if (ret != X86_BR_NONE) | ||
538 | ret |= to_plm; | ||
539 | |||
540 | return ret; | ||
541 | } | ||
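A reduced user-space mirror of the opcode switch above (first byte only; prefixes and the 0x0f two-byte cases are omitted), handy for sanity-checking the table:

    #include <stdio.h>

    static const char *classify(unsigned char op)
    {
            if (op >= 0x70 && op <= 0x7f)
                    return "JCC";
            if (op >= 0xe0 && op <= 0xe3)
                    return "JCC"; /* loop/jcxz family */
            if (op >= 0xe9 && op <= 0xeb)
                    return "JMP";
            switch (op) {
            case 0xc2: case 0xc3: case 0xca: case 0xcb:
                    return "RET";
            case 0xcf:
                    return "IRET";
            case 0xcc: case 0xcd: case 0xce:
                    return "INT";
            case 0xe8: case 0x9a:
                    return "CALL";
            default:
                    return "NONE";
            }
    }

    int main(void)
    {
            printf("0xc3 -> %s, 0xe8 -> %s, 0x90 -> %s\n",
                   classify(0xc3), classify(0xe8), classify(0x90));
            return 0; /* RET, CALL, NONE */
    }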
542 | |||
543 | /* | ||
544 | * Implement the actual branch filter based on user demand. | ||
545 | * Hardware may not exactly satisfy that request, thus | ||
546 | * we need to inspect opcodes. Mismatched branches are | ||
547 | * discarded. Therefore, the number of branches returned | ||
548 | * in a PERF_SAMPLE_BRANCH_STACK sample may vary. | ||
549 | */ | ||
550 | static void | ||
551 | intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) | ||
552 | { | ||
553 | u64 from, to; | ||
554 | int br_sel = cpuc->br_sel; | ||
555 | int i, j, type; | ||
556 | bool compress = false; | ||
557 | |||
558 | /* if sampling all branches, then nothing to filter */ | ||
559 | if ((br_sel & X86_BR_ALL) == X86_BR_ALL) | ||
560 | return; | ||
561 | |||
562 | for (i = 0; i < cpuc->lbr_stack.nr; i++) { | ||
563 | |||
564 | from = cpuc->lbr_entries[i].from; | ||
565 | to = cpuc->lbr_entries[i].to; | ||
566 | |||
567 | type = branch_type(from, to); | ||
568 | |||
569 | /* if type does not correspond, then discard */ | ||
570 | if (type == X86_BR_NONE || (br_sel & type) != type) { | ||
571 | cpuc->lbr_entries[i].from = 0; | ||
572 | compress = true; | ||
573 | } | ||
574 | } | ||
575 | |||
576 | if (!compress) | ||
577 | return; | ||
578 | |||
579 | /* remove all entries with from=0 */ | ||
580 | for (i = 0; i < cpuc->lbr_stack.nr; ) { | ||
581 | if (!cpuc->lbr_entries[i].from) { | ||
582 | j = i; | ||
583 | while (++j < cpuc->lbr_stack.nr) | ||
584 | cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; | ||
585 | cpuc->lbr_stack.nr--; | ||
586 | if (!cpuc->lbr_entries[i].from) | ||
587 | continue; | ||
588 | } | ||
589 | i++; | ||
590 | } | ||
591 | } | ||
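The removal loop above compacts the stack in place; after a shift the same index must be rechecked because the shifted-in entry may itself be zero. A standalone sketch of the identical compaction:

    #include <stdio.h>

    static int compress(unsigned long *from, int nr)
    {
            int i = 0, j;

            while (i < nr) {
                    if (!from[i]) {
                            for (j = i + 1; j < nr; j++)
                                    from[j - 1] = from[j];
                            nr--;
                            continue; /* slot i changed; recheck it */
                    }
                    i++;
            }
            return nr;
    }

    int main(void)
    {
            unsigned long v[] = { 0x10, 0, 0, 0x20, 0 };
            int nr = compress(v, 5);

            printf("nr=%d v[0]=%#lx v[1]=%#lx\n", nr, v[0], v[1]);
            return 0; /* nr=2 v[0]=0x10 v[1]=0x20 */
    }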
592 | |||
593 | /* | ||
594 | * Map interface branch filters onto LBR filters | ||
595 | */ | ||
596 | static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { | ||
597 | [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, | ||
598 | [PERF_SAMPLE_BRANCH_USER] = LBR_USER, | ||
599 | [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, | ||
600 | [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, | ||
601 | [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP | ||
602 | | LBR_IND_JMP | LBR_FAR, | ||
603 | /* | ||
604 | * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches | ||
605 | */ | ||
606 | [PERF_SAMPLE_BRANCH_ANY_CALL] = | ||
607 | LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, | ||
608 | /* | ||
609 | * NHM/WSM erratum: must include IND_JMP to capture IND_CALL | ||
610 | */ | ||
611 | [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, | ||
612 | }; | ||
613 | |||
614 | static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { | ||
615 | [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, | ||
616 | [PERF_SAMPLE_BRANCH_USER] = LBR_USER, | ||
617 | [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, | ||
618 | [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, | ||
619 | [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, | ||
620 | [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL | ||
621 | | LBR_FAR, | ||
622 | [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, | ||
623 | }; | ||
624 | |||
625 | /* core */ | ||
198 | void intel_pmu_lbr_init_core(void) | 626 | void intel_pmu_lbr_init_core(void) |
199 | { | 627 | { |
200 | x86_pmu.lbr_nr = 4; | 628 | x86_pmu.lbr_nr = 4; |
201 | x86_pmu.lbr_tos = 0x01c9; | 629 | x86_pmu.lbr_tos = MSR_LBR_TOS; |
202 | x86_pmu.lbr_from = 0x40; | 630 | x86_pmu.lbr_from = MSR_LBR_CORE_FROM; |
203 | x86_pmu.lbr_to = 0x60; | 631 | x86_pmu.lbr_to = MSR_LBR_CORE_TO; |
632 | |||
633 | /* | ||
634 | * SW branch filter usage: | ||
635 | * - compensate for lack of HW filter | ||
636 | */ | ||
637 | pr_cont("4-deep LBR, "); | ||
204 | } | 638 | } |
205 | 639 | ||
640 | /* nehalem/westmere */ | ||
206 | void intel_pmu_lbr_init_nhm(void) | 641 | void intel_pmu_lbr_init_nhm(void) |
207 | { | 642 | { |
208 | x86_pmu.lbr_nr = 16; | 643 | x86_pmu.lbr_nr = 16; |
209 | x86_pmu.lbr_tos = 0x01c9; | 644 | x86_pmu.lbr_tos = MSR_LBR_TOS; |
210 | x86_pmu.lbr_from = 0x680; | 645 | x86_pmu.lbr_from = MSR_LBR_NHM_FROM; |
211 | x86_pmu.lbr_to = 0x6c0; | 646 | x86_pmu.lbr_to = MSR_LBR_NHM_TO; |
647 | |||
648 | x86_pmu.lbr_sel_mask = LBR_SEL_MASK; | ||
649 | x86_pmu.lbr_sel_map = nhm_lbr_sel_map; | ||
650 | |||
651 | /* | ||
652 | * SW branch filter usage: | ||
653 | * - workaround LBR_SEL errata (see above) | ||
654 | * - support syscall, sysret capture. | ||
655 | * That requires LBR_FAR, but that means far | ||
656 | * jmps need to be filtered out | ||
657 | */ | ||
658 | pr_cont("16-deep LBR, "); | ||
659 | } | ||
660 | |||
661 | /* sandy bridge */ | ||
662 | void intel_pmu_lbr_init_snb(void) | ||
663 | { | ||
664 | x86_pmu.lbr_nr = 16; | ||
665 | x86_pmu.lbr_tos = MSR_LBR_TOS; | ||
666 | x86_pmu.lbr_from = MSR_LBR_NHM_FROM; | ||
667 | x86_pmu.lbr_to = MSR_LBR_NHM_TO; | ||
668 | |||
669 | x86_pmu.lbr_sel_mask = LBR_SEL_MASK; | ||
670 | x86_pmu.lbr_sel_map = snb_lbr_sel_map; | ||
671 | |||
672 | /* | ||
673 | * SW branch filter usage: | ||
674 | * - support syscall, sysret capture. | ||
675 | * That requires LBR_FAR, but that means far | ||
676 | * jmps need to be filtered out | ||
677 | */ | ||
678 | pr_cont("16-deep LBR, "); | ||
212 | } | 679 | } |
213 | 680 | ||
681 | /* atom */ | ||
214 | void intel_pmu_lbr_init_atom(void) | 682 | void intel_pmu_lbr_init_atom(void) |
215 | { | 683 | { |
684 | /* | ||
685 | * only models starting at stepping 10 seem | ||
686 | * to have an operational LBR which can freeze | ||
687 | * on a PMU interrupt | ||
688 | */ | ||
689 | if (boot_cpu_data.x86_mask < 10) { | ||
690 | pr_cont("LBR disabled due to erratum"); | ||
691 | return; | ||
692 | } | ||
693 | |||
216 | x86_pmu.lbr_nr = 8; | 694 | x86_pmu.lbr_nr = 8; |
217 | x86_pmu.lbr_tos = 0x01c9; | 695 | x86_pmu.lbr_tos = MSR_LBR_TOS; |
218 | x86_pmu.lbr_from = 0x40; | 696 | x86_pmu.lbr_from = MSR_LBR_CORE_FROM; |
219 | x86_pmu.lbr_to = 0x60; | 697 | x86_pmu.lbr_to = MSR_LBR_CORE_TO; |
698 | |||
699 | /* | ||
700 | * SW branch filter usage: | ||
701 | * - compensate for lack of HW filter | ||
702 | */ | ||
703 | pr_cont("8-deep LBR, "); | ||
220 | } | 704 | } |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3fe8239fd8fb..1333d9851778 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -1532,10 +1532,17 @@ ENTRY(nmi) | |||
1532 | pushq_cfi %rdx | 1532 | pushq_cfi %rdx |
1533 | 1533 | ||
1534 | /* | 1534 | /* |
1535 | * If %cs was not the kernel segment, then the NMI triggered in user | ||
1536 | * space, which means it is definitely not nested. | ||
1537 | */ | ||
1538 | cmpl $__KERNEL_CS, 16(%rsp) | ||
1539 | jne first_nmi | ||
1540 | |||
1541 | /* | ||
1535 | * Check the special variable on the stack to see if NMIs are | 1542 | * Check the special variable on the stack to see if NMIs are |
1536 | * executing. | 1543 | * executing. |
1537 | */ | 1544 | */ |
1538 | cmp $1, -8(%rsp) | 1545 | cmpl $1, -8(%rsp) |
1539 | je nested_nmi | 1546 | je nested_nmi |
1540 | 1547 | ||
1541 | /* | 1548 | /* |
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 40fc86161d92..58b7f27cb3e9 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -100,13 +100,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) | |||
100 | irqctx->tinfo.task = curctx->tinfo.task; | 100 | irqctx->tinfo.task = curctx->tinfo.task; |
101 | irqctx->tinfo.previous_esp = current_stack_pointer; | 101 | irqctx->tinfo.previous_esp = current_stack_pointer; |
102 | 102 | ||
103 | /* | 103 | /* Copy the preempt_count so that the [soft]irq checks work. */ |
104 | * Copy the softirq bits in preempt_count so that the | 104 | irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count; |
105 | * softirq checks work in the hardirq context. | ||
106 | */ | ||
107 | irqctx->tinfo.preempt_count = | ||
108 | (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | | ||
109 | (curctx->tinfo.preempt_count & SOFTIRQ_MASK); | ||
110 | 105 | ||
111 | if (unlikely(overflow)) | 106 | if (unlikely(overflow)) |
112 | call_on_stack(print_stack_overflow, isp); | 107 | call_on_stack(print_stack_overflow, isp); |
@@ -196,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) | |||
196 | if (unlikely(!desc)) | 191 | if (unlikely(!desc)) |
197 | return false; | 192 | return false; |
198 | 193 | ||
199 | if (!execute_on_irq_stack(overflow, desc, irq)) { | 194 | if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { |
200 | if (unlikely(overflow)) | 195 | if (unlikely(overflow)) |
201 | print_stack_overflow(); | 196 | print_stack_overflow(); |
202 | desc->handle_irq(irq, desc); | 197 | desc->handle_irq(irq, desc); |
diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h new file mode 100644 index 000000000000..3230b68ef29a --- /dev/null +++ b/arch/x86/kernel/kprobes-common.h | |||
@@ -0,0 +1,102 @@ | |||
1 | #ifndef __X86_KERNEL_KPROBES_COMMON_H | ||
2 | #define __X86_KERNEL_KPROBES_COMMON_H | ||
3 | |||
4 | /* Kprobes and Optprobes common header */ | ||
5 | |||
6 | #ifdef CONFIG_X86_64 | ||
7 | #define SAVE_REGS_STRING \ | ||
8 | /* Skip cs, ip, orig_ax. */ \ | ||
9 | " subq $24, %rsp\n" \ | ||
10 | " pushq %rdi\n" \ | ||
11 | " pushq %rsi\n" \ | ||
12 | " pushq %rdx\n" \ | ||
13 | " pushq %rcx\n" \ | ||
14 | " pushq %rax\n" \ | ||
15 | " pushq %r8\n" \ | ||
16 | " pushq %r9\n" \ | ||
17 | " pushq %r10\n" \ | ||
18 | " pushq %r11\n" \ | ||
19 | " pushq %rbx\n" \ | ||
20 | " pushq %rbp\n" \ | ||
21 | " pushq %r12\n" \ | ||
22 | " pushq %r13\n" \ | ||
23 | " pushq %r14\n" \ | ||
24 | " pushq %r15\n" | ||
25 | #define RESTORE_REGS_STRING \ | ||
26 | " popq %r15\n" \ | ||
27 | " popq %r14\n" \ | ||
28 | " popq %r13\n" \ | ||
29 | " popq %r12\n" \ | ||
30 | " popq %rbp\n" \ | ||
31 | " popq %rbx\n" \ | ||
32 | " popq %r11\n" \ | ||
33 | " popq %r10\n" \ | ||
34 | " popq %r9\n" \ | ||
35 | " popq %r8\n" \ | ||
36 | " popq %rax\n" \ | ||
37 | " popq %rcx\n" \ | ||
38 | " popq %rdx\n" \ | ||
39 | " popq %rsi\n" \ | ||
40 | " popq %rdi\n" \ | ||
41 | /* Skip orig_ax, ip, cs */ \ | ||
42 | " addq $24, %rsp\n" | ||
43 | #else | ||
44 | #define SAVE_REGS_STRING \ | ||
45 | /* Skip cs, ip, orig_ax and gs. */ \ | ||
46 | " subl $16, %esp\n" \ | ||
47 | " pushl %fs\n" \ | ||
48 | " pushl %es\n" \ | ||
49 | " pushl %ds\n" \ | ||
50 | " pushl %eax\n" \ | ||
51 | " pushl %ebp\n" \ | ||
52 | " pushl %edi\n" \ | ||
53 | " pushl %esi\n" \ | ||
54 | " pushl %edx\n" \ | ||
55 | " pushl %ecx\n" \ | ||
56 | " pushl %ebx\n" | ||
57 | #define RESTORE_REGS_STRING \ | ||
58 | " popl %ebx\n" \ | ||
59 | " popl %ecx\n" \ | ||
60 | " popl %edx\n" \ | ||
61 | " popl %esi\n" \ | ||
62 | " popl %edi\n" \ | ||
63 | " popl %ebp\n" \ | ||
64 | " popl %eax\n" \ | ||
65 | /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ | ||
66 | " addl $24, %esp\n" | ||
67 | #endif | ||
68 | |||
69 | /* Check whether the instruction can be boosted */ | ||
70 | extern int can_boost(kprobe_opcode_t *instruction); | ||
71 | /* Recover the instruction if the given address is probed */ | ||
72 | extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, | ||
73 | unsigned long addr); | ||
74 | /* | ||
75 | * Copy an instruction and adjust the displacement if the instruction | ||
76 | * uses the %rip-relative addressing mode. | ||
77 | */ | ||
78 | extern int __copy_instruction(u8 *dest, u8 *src); | ||
79 | |||
80 | /* Generate a relative-jump/call instruction */ | ||
81 | extern void synthesize_reljump(void *from, void *to); | ||
82 | extern void synthesize_relcall(void *from, void *to); | ||
83 | |||
84 | #ifdef CONFIG_OPTPROBES | ||
85 | extern int arch_init_optprobes(void); | ||
86 | extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); | ||
87 | extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); | ||
88 | #else /* !CONFIG_OPTPROBES */ | ||
89 | static inline int arch_init_optprobes(void) | ||
90 | { | ||
91 | return 0; | ||
92 | } | ||
93 | static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) | ||
94 | { | ||
95 | return 0; | ||
96 | } | ||
97 | static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) | ||
98 | { | ||
99 | return addr; | ||
100 | } | ||
101 | #endif | ||
102 | #endif | ||
diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c new file mode 100644 index 000000000000..c5e410eed403 --- /dev/null +++ b/arch/x86/kernel/kprobes-opt.c | |||
@@ -0,0 +1,512 @@ | |||
1 | /* | ||
2 | * Kernel Probes Jump Optimization (Optprobes) | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2002, 2004 | ||
19 | * Copyright (C) Hitachi Ltd., 2012 | ||
20 | */ | ||
21 | #include <linux/kprobes.h> | ||
22 | #include <linux/ptrace.h> | ||
23 | #include <linux/string.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/hardirq.h> | ||
26 | #include <linux/preempt.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/kdebug.h> | ||
29 | #include <linux/kallsyms.h> | ||
30 | #include <linux/ftrace.h> | ||
31 | |||
32 | #include <asm/cacheflush.h> | ||
33 | #include <asm/desc.h> | ||
34 | #include <asm/pgtable.h> | ||
35 | #include <asm/uaccess.h> | ||
36 | #include <asm/alternative.h> | ||
37 | #include <asm/insn.h> | ||
38 | #include <asm/debugreg.h> | ||
39 | |||
40 | #include "kprobes-common.h" | ||
41 | |||
42 | unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) | ||
43 | { | ||
44 | struct optimized_kprobe *op; | ||
45 | struct kprobe *kp; | ||
46 | long offs; | ||
47 | int i; | ||
48 | |||
49 | for (i = 0; i < RELATIVEJUMP_SIZE; i++) { | ||
50 | kp = get_kprobe((void *)addr - i); | ||
51 | /* This function only handles jump-optimized kprobes */ | ||
52 | if (kp && kprobe_optimized(kp)) { | ||
53 | op = container_of(kp, struct optimized_kprobe, kp); | ||
54 | /* If op->list is not empty, op is being optimized */ | ||
55 | if (list_empty(&op->list)) | ||
56 | goto found; | ||
57 | } | ||
58 | } | ||
59 | |||
60 | return addr; | ||
61 | found: | ||
62 | /* | ||
63 | * If the kprobe is optimized, its original bytes may have been | ||
64 | * overwritten by the jump destination address. In this case, the | ||
65 | * original bytes must be recovered from the op->optinsn.copied_insn buffer. | ||
66 | */ | ||
67 | memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); | ||
68 | if (addr == (unsigned long)kp->addr) { | ||
69 | buf[0] = kp->opcode; | ||
70 | memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); | ||
71 | } else { | ||
72 | offs = addr - (unsigned long)kp->addr - 1; | ||
73 | memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); | ||
74 | } | ||
75 | |||
76 | return (unsigned long)buf; | ||
77 | } | ||
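A worked example of the recovery with invented bytes: the 5-byte relative jump overwrites the probe point, so the original text survives split across kp->opcode (first byte) and op->optinsn.copied_insn (the next RELATIVE_ADDR_SIZE = 4 bytes):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            /* Invented original prologue: push %rbp; mov %rsp,%rbp; push %rbx */
            unsigned char opcode    = 0x55;                       /* kp->opcode          */
            unsigned char copied[4] = { 0x48, 0x89, 0xe5, 0x53 }; /* optinsn.copied_insn */
            unsigned char buf[5];

            /* The live text now reads e9 <rel32>; rebuild as for addr == kp->addr. */
            buf[0] = opcode;
            memcpy(buf + 1, copied, sizeof(copied));

            for (int i = 0; i < 5; i++)
                    printf("%02x ", buf[i]);
            printf("\n"); /* 55 48 89 e5 53 */
            return 0;
    }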
78 | |||
79 | /* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ | ||
80 | static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) | ||
81 | { | ||
82 | #ifdef CONFIG_X86_64 | ||
83 | *addr++ = 0x48; | ||
84 | *addr++ = 0xbf; | ||
85 | #else | ||
86 | *addr++ = 0xb8; | ||
87 | #endif | ||
88 | *(unsigned long *)addr = val; | ||
89 | } | ||
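On x86-64, 0x48 0xbf is REX.W plus mov-imm64-into-%rdi, which plants the optimized_kprobe pointer in the first argument register before the template's call; on 32-bit, 0xb8 is mov-imm32-into-%eax. A quick user-space rerun of the 64-bit encoding (the value is a stand-in for the op pointer):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            unsigned char insn[10] = { 0x48, 0xbf };
            unsigned long val = 0xdeadbeefcafef00dUL;

            memcpy(insn + 2, &val, sizeof(val)); /* little-endian immediate */
            for (int i = 0; i < 10; i++)
                    printf("%02x ", insn[i]);
            printf("\n"); /* 48 bf 0d f0 fe ca ef be ad de */
            return 0;
    }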
90 | |||
91 | static void __used __kprobes kprobes_optinsn_template_holder(void) | ||
92 | { | ||
93 | asm volatile ( | ||
94 | ".global optprobe_template_entry\n" | ||
95 | "optprobe_template_entry:\n" | ||
96 | #ifdef CONFIG_X86_64 | ||
97 | /* We don't bother saving the ss register */ | ||
98 | " pushq %rsp\n" | ||
99 | " pushfq\n" | ||
100 | SAVE_REGS_STRING | ||
101 | " movq %rsp, %rsi\n" | ||
102 | ".global optprobe_template_val\n" | ||
103 | "optprobe_template_val:\n" | ||
104 | ASM_NOP5 | ||
105 | ASM_NOP5 | ||
106 | ".global optprobe_template_call\n" | ||
107 | "optprobe_template_call:\n" | ||
108 | ASM_NOP5 | ||
109 | /* Move flags to rsp */ | ||
110 | " movq 144(%rsp), %rdx\n" | ||
111 | " movq %rdx, 152(%rsp)\n" | ||
112 | RESTORE_REGS_STRING | ||
113 | /* Skip flags entry */ | ||
114 | " addq $8, %rsp\n" | ||
115 | " popfq\n" | ||
116 | #else /* CONFIG_X86_32 */ | ||
117 | " pushf\n" | ||
118 | SAVE_REGS_STRING | ||
119 | " movl %esp, %edx\n" | ||
120 | ".global optprobe_template_val\n" | ||
121 | "optprobe_template_val:\n" | ||
122 | ASM_NOP5 | ||
123 | ".global optprobe_template_call\n" | ||
124 | "optprobe_template_call:\n" | ||
125 | ASM_NOP5 | ||
126 | RESTORE_REGS_STRING | ||
127 | " addl $4, %esp\n" /* skip cs */ | ||
128 | " popf\n" | ||
129 | #endif | ||
130 | ".global optprobe_template_end\n" | ||
131 | "optprobe_template_end:\n"); | ||
132 | } | ||
133 | |||
134 | #define TMPL_MOVE_IDX \ | ||
135 | ((long)&optprobe_template_val - (long)&optprobe_template_entry) | ||
136 | #define TMPL_CALL_IDX \ | ||
137 | ((long)&optprobe_template_call - (long)&optprobe_template_entry) | ||
138 | #define TMPL_END_IDX \ | ||
139 | ((long)&optprobe_template_end - (long)&optprobe_template_entry) | ||
140 | |||
141 | #define INT3_SIZE sizeof(kprobe_opcode_t) | ||
142 | |||
143 | /* Optimized kprobe call back function: called from optinsn */ | ||
144 | static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) | ||
145 | { | ||
146 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
147 | unsigned long flags; | ||
148 | |||
149 | /* This is possible if op is under delayed unoptimization */ | ||
150 | if (kprobe_disabled(&op->kp)) | ||
151 | return; | ||
152 | |||
153 | local_irq_save(flags); | ||
154 | if (kprobe_running()) { | ||
155 | kprobes_inc_nmissed_count(&op->kp); | ||
156 | } else { | ||
157 | /* Save skipped registers */ | ||
158 | #ifdef CONFIG_X86_64 | ||
159 | regs->cs = __KERNEL_CS; | ||
160 | #else | ||
161 | regs->cs = __KERNEL_CS | get_kernel_rpl(); | ||
162 | regs->gs = 0; | ||
163 | #endif | ||
164 | regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; | ||
165 | regs->orig_ax = ~0UL; | ||
166 | |||
167 | __this_cpu_write(current_kprobe, &op->kp); | ||
168 | kcb->kprobe_status = KPROBE_HIT_ACTIVE; | ||
169 | opt_pre_handler(&op->kp, regs); | ||
170 | __this_cpu_write(current_kprobe, NULL); | ||
171 | } | ||
172 | local_irq_restore(flags); | ||
173 | } | ||
174 | |||
175 | static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) | ||
176 | { | ||
177 | int len = 0, ret; | ||
178 | |||
179 | while (len < RELATIVEJUMP_SIZE) { | ||
180 | ret = __copy_instruction(dest + len, src + len); | ||
181 | if (!ret || !can_boost(dest + len)) | ||
182 | return -EINVAL; | ||
183 | len += ret; | ||
184 | } | ||
185 | /* Check whether the address range is reserved */ | ||
186 | if (ftrace_text_reserved(src, src + len - 1) || | ||
187 | alternatives_text_reserved(src, src + len - 1) || | ||
188 | jump_label_text_reserved(src, src + len - 1)) | ||
189 | return -EBUSY; | ||
190 | |||
191 | return len; | ||
192 | } | ||
193 | |||
194 | /* Check whether insn is indirect jump */ | ||
195 | static int __kprobes insn_is_indirect_jump(struct insn *insn) | ||
196 | { | ||
197 | return ((insn->opcode.bytes[0] == 0xff && | ||
198 | (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ | ||
199 | insn->opcode.bytes[0] == 0xea); /* Segment based jump */ | ||
200 | } | ||
201 | |||
202 | /* Check whether insn jumps into specified address range */ | ||
203 | static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) | ||
204 | { | ||
205 | unsigned long target = 0; | ||
206 | |||
207 | switch (insn->opcode.bytes[0]) { | ||
208 | case 0xe0: /* loopne */ | ||
209 | case 0xe1: /* loope */ | ||
210 | case 0xe2: /* loop */ | ||
211 | case 0xe3: /* jcxz */ | ||
212 | case 0xe9: /* near relative jump */ | ||
213 | case 0xeb: /* short relative jump */ | ||
214 | break; | ||
215 | case 0x0f: | ||
216 | if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ | ||
217 | break; | ||
218 | return 0; | ||
219 | default: | ||
220 | if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ | ||
221 | break; | ||
222 | return 0; | ||
223 | } | ||
224 | target = (unsigned long)insn->next_byte + insn->immediate.value; | ||
225 | |||
226 | return (start <= target && target <= start + len); | ||
227 | } | ||
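The target arithmetic above is next_byte + immediate, i.e. relative to the end of the branch instruction. Worked numbers for a two-byte short jump (addresses invented):

    #include <stdio.h>

    int main(void)
    {
            unsigned long a = 0x1000;   /* address of 'eb 10' (jmp rel8)  */
            unsigned long next = a + 2; /* insn->next_byte                */
            long imm = 0x10;            /* insn->immediate.value (signed) */
            unsigned long target = next + imm;

            printf("jmp at %#lx reaches %#lx\n", a, target); /* 0x1012 */
            return 0;
    }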
228 | |||
229 | /* Decode the whole function to ensure no instruction jumps into the target */ | ||
230 | static int __kprobes can_optimize(unsigned long paddr) | ||
231 | { | ||
232 | unsigned long addr, size = 0, offset = 0; | ||
233 | struct insn insn; | ||
234 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | ||
235 | |||
236 | /* Lookup symbol including addr */ | ||
237 | if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) | ||
238 | return 0; | ||
239 | |||
240 | /* | ||
241 | * Do not optimize in the entry code due to the unstable | ||
242 | * stack handling. | ||
243 | */ | ||
244 | if ((paddr >= (unsigned long)__entry_text_start) && | ||
245 | (paddr < (unsigned long)__entry_text_end)) | ||
246 | return 0; | ||
247 | |||
248 | /* Check there is enough space for a relative jump. */ | ||
249 | if (size - offset < RELATIVEJUMP_SIZE) | ||
250 | return 0; | ||
251 | |||
252 | /* Decode instructions */ | ||
253 | addr = paddr - offset; | ||
254 | while (addr < paddr - offset + size) { /* Decode until function end */ | ||
255 | if (search_exception_tables(addr)) | ||
256 | /* | ||
257 | * Since some fixup code jumps into this function, | ||
258 | * we can't optimize a kprobe in this function. | ||
259 | */ | ||
260 | return 0; | ||
261 | kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); | ||
262 | insn_get_length(&insn); | ||
263 | /* Another subsystem puts a breakpoint */ | ||
264 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) | ||
265 | return 0; | ||
266 | /* Recover address */ | ||
267 | insn.kaddr = (void *)addr; | ||
268 | insn.next_byte = (void *)(addr + insn.length); | ||
269 | /* Check that no instruction jumps into the target */ | ||
270 | if (insn_is_indirect_jump(&insn) || | ||
271 | insn_jump_into_range(&insn, paddr + INT3_SIZE, | ||
272 | RELATIVE_ADDR_SIZE)) | ||
273 | return 0; | ||
274 | addr += insn.length; | ||
275 | } | ||
276 | |||
277 | return 1; | ||
278 | } | ||
279 | |||
280 | /* Check whether the optimized_kprobe can actually be optimized. */ | ||
281 | int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) | ||
282 | { | ||
283 | int i; | ||
284 | struct kprobe *p; | ||
285 | |||
286 | for (i = 1; i < op->optinsn.size; i++) { | ||
287 | p = get_kprobe(op->kp.addr + i); | ||
288 | if (p && !kprobe_disabled(p)) | ||
289 | return -EEXIST; | ||
290 | } | ||
291 | |||
292 | return 0; | ||
293 | } | ||
294 | |||
295 | /* Check whether the addr is within the optimized instructions. */ | ||
296 | int __kprobes | ||
297 | arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr) | ||
298 | { | ||
299 | return ((unsigned long)op->kp.addr <= addr && | ||
300 | (unsigned long)op->kp.addr + op->optinsn.size > addr); | ||
301 | } | ||
302 | |||
303 | /* Free optimized instruction slot */ | ||
304 | static __kprobes | ||
305 | void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) | ||
306 | { | ||
307 | if (op->optinsn.insn) { | ||
308 | free_optinsn_slot(op->optinsn.insn, dirty); | ||
309 | op->optinsn.insn = NULL; | ||
310 | op->optinsn.size = 0; | ||
311 | } | ||
312 | } | ||
313 | |||
314 | void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) | ||
315 | { | ||
316 | __arch_remove_optimized_kprobe(op, 1); | ||
317 | } | ||
318 | |||
319 | /* | ||
320 | * Copy the target instructions being replaced by the jump. | ||
321 | * The target instructions MUST be relocatable (checked inside). | ||
322 | * This is called when a new aggr(opt)probe is allocated or reused. | ||
323 | */ | ||
324 | int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) | ||
325 | { | ||
326 | u8 *buf; | ||
327 | int ret; | ||
328 | long rel; | ||
329 | |||
330 | if (!can_optimize((unsigned long)op->kp.addr)) | ||
331 | return -EILSEQ; | ||
332 | |||
333 | op->optinsn.insn = get_optinsn_slot(); | ||
334 | if (!op->optinsn.insn) | ||
335 | return -ENOMEM; | ||
336 | |||
337 | /* | ||
338 | * Verify that the address gap is within the 2GB range, because this uses | ||
339 | * a relative jump. | ||
340 | */ | ||
341 | rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; | ||
342 | if (abs(rel) > 0x7fffffff) | ||
343 | return -ERANGE; | ||
344 | |||
345 | buf = (u8 *)op->optinsn.insn; | ||
346 | |||
347 | /* Copy instructions into the out-of-line buffer */ | ||
348 | ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); | ||
349 | if (ret < 0) { | ||
350 | __arch_remove_optimized_kprobe(op, 0); | ||
351 | return ret; | ||
352 | } | ||
353 | op->optinsn.size = ret; | ||
354 | |||
355 | /* Copy arch-dep-instance from template */ | ||
356 | memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); | ||
357 | |||
358 | /* Set probe information */ | ||
359 | synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); | ||
360 | |||
361 | /* Set probe function call */ | ||
362 | synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); | ||
363 | |||
364 | /* Set returning jmp instruction at the tail of out-of-line buffer */ | ||
365 | synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, | ||
366 | (u8 *)op->kp.addr + op->optinsn.size); | ||
367 | |||
368 | flush_icache_range((unsigned long) buf, | ||
369 | (unsigned long) buf + TMPL_END_IDX + | ||
370 | op->optinsn.size + RELATIVEJUMP_SIZE); | ||
371 | return 0; | ||
372 | } | ||
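Putting the pieces together, the detour buffer assembled above ends up with this layout (offsets are the TMPL_*_IDX constants computed from the template symbols):

    optinsn.insn + 0                      template: save regs, then...
    optinsn.insn + TMPL_MOVE_IDX            mov  $op, %rdi           (synthesize_set_arg1)
    optinsn.insn + TMPL_CALL_IDX            call optimized_callback  (synthesize_relcall)
                                            ...restore regs
    optinsn.insn + TMPL_END_IDX           relocated copy of the probed instructions
    optinsn.insn + TMPL_END_IDX + size    jmp  kp->addr + size       (synthesize_reljump)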
373 | |||
374 | #define MAX_OPTIMIZE_PROBES 256 | ||
375 | static struct text_poke_param *jump_poke_params; | ||
376 | static struct jump_poke_buffer { | ||
377 | u8 buf[RELATIVEJUMP_SIZE]; | ||
378 | } *jump_poke_bufs; | ||
379 | |||
380 | static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, | ||
381 | u8 *insn_buf, | ||
382 | struct optimized_kprobe *op) | ||
383 | { | ||
384 | s32 rel = (s32)((long)op->optinsn.insn - | ||
385 | ((long)op->kp.addr + RELATIVEJUMP_SIZE)); | ||
386 | |||
387 | /* Back up the instructions which will be replaced by the jump address */ | ||
388 | memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, | ||
389 | RELATIVE_ADDR_SIZE); | ||
390 | |||
391 | insn_buf[0] = RELATIVEJUMP_OPCODE; | ||
392 | *(s32 *)(&insn_buf[1]) = rel; | ||
393 | |||
394 | tprm->addr = op->kp.addr; | ||
395 | tprm->opcode = insn_buf; | ||
396 | tprm->len = RELATIVEJUMP_SIZE; | ||
397 | } | ||
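The displacement of an e9 relative jump is measured from the end of the five-byte instruction, hence the + RELATIVEJUMP_SIZE term in rel. Worked numbers (addresses invented):

    #include <stdio.h>

    int main(void)
    {
            long kp_addr = 0x1000, detour = 0x5000;
            int rel = (int)(detour - (kp_addr + 5)); /* RELATIVEJUMP_SIZE == 5 */

            /* Encoded as e9 fb 3f 00 00; execution resumes at 0x1005 + 0x3ffb = 0x5000. */
            printf("rel32 = %#x\n", rel);
            return 0;
    }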
398 | |||
399 | /* | ||
400 | * Replace breakpoints (int3) with relative jumps. | ||
401 | * The caller must hold kprobe_mutex and text_mutex. | ||
402 | */ | ||
403 | void __kprobes arch_optimize_kprobes(struct list_head *oplist) | ||
404 | { | ||
405 | struct optimized_kprobe *op, *tmp; | ||
406 | int c = 0; | ||
407 | |||
408 | list_for_each_entry_safe(op, tmp, oplist, list) { | ||
409 | WARN_ON(kprobe_disabled(&op->kp)); | ||
410 | /* Setup param */ | ||
411 | setup_optimize_kprobe(&jump_poke_params[c], | ||
412 | jump_poke_bufs[c].buf, op); | ||
413 | list_del_init(&op->list); | ||
414 | if (++c >= MAX_OPTIMIZE_PROBES) | ||
415 | break; | ||
416 | } | ||
417 | |||
418 | /* | ||
419 | * text_poke_smp doesn't support modifying code in NMI/MCE context. | ||
420 | * However, since kprobes itself also doesn't support probing code | ||
421 | * in NMI/MCE context, this is not a problem. | ||
422 | */ | ||
423 | text_poke_smp_batch(jump_poke_params, c); | ||
424 | } | ||
425 | |||
426 | static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, | ||
427 | u8 *insn_buf, | ||
428 | struct optimized_kprobe *op) | ||
429 | { | ||
430 | /* Set int3 to first byte for kprobes */ | ||
431 | insn_buf[0] = BREAKPOINT_INSTRUCTION; | ||
432 | memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); | ||
433 | |||
434 | tprm->addr = op->kp.addr; | ||
435 | tprm->opcode = insn_buf; | ||
436 | tprm->len = RELATIVEJUMP_SIZE; | ||
437 | } | ||
438 | |||
439 | /* | ||
440 | * Recover original instructions and breakpoints from relative jumps. | ||
441 | * The caller must hold kprobe_mutex. | ||
442 | */ | ||
443 | extern void arch_unoptimize_kprobes(struct list_head *oplist, | ||
444 | struct list_head *done_list) | ||
445 | { | ||
446 | struct optimized_kprobe *op, *tmp; | ||
447 | int c = 0; | ||
448 | |||
449 | list_for_each_entry_safe(op, tmp, oplist, list) { | ||
450 | /* Setup param */ | ||
451 | setup_unoptimize_kprobe(&jump_poke_params[c], | ||
452 | jump_poke_bufs[c].buf, op); | ||
453 | list_move(&op->list, done_list); | ||
454 | if (++c >= MAX_OPTIMIZE_PROBES) | ||
455 | break; | ||
456 | } | ||
457 | |||
458 | /* | ||
459 | * text_poke_smp doesn't support modifying code in NMI/MCE context. | ||
460 | * However, since kprobes itself also doesn't support probing code | ||
461 | * in NMI/MCE context, this is not a problem. | ||
462 | */ | ||
463 | text_poke_smp_batch(jump_poke_params, c); | ||
464 | } | ||
465 | |||
466 | /* Replace a relative jump with a breakpoint (int3). */ | ||
467 | void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) | ||
468 | { | ||
469 | u8 buf[RELATIVEJUMP_SIZE]; | ||
470 | |||
471 | /* Set int3 to first byte for kprobes */ | ||
472 | buf[0] = BREAKPOINT_INSTRUCTION; | ||
473 | memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); | ||
474 | text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); | ||
475 | } | ||
476 | |||
477 | int __kprobes | ||
478 | setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) | ||
479 | { | ||
480 | struct optimized_kprobe *op; | ||
481 | |||
482 | if (p->flags & KPROBE_FLAG_OPTIMIZED) { | ||
483 | /* This kprobe is really able to run the optimized path. */ | ||
484 | op = container_of(p, struct optimized_kprobe, kp); | ||
485 | /* Detour through copied instructions */ | ||
486 | regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; | ||
487 | if (!reenter) | ||
488 | reset_current_kprobe(); | ||
489 | preempt_enable_no_resched(); | ||
490 | return 1; | ||
491 | } | ||
492 | return 0; | ||
493 | } | ||
494 | |||
495 | int __kprobes arch_init_optprobes(void) | ||
496 | { | ||
497 | /* Allocate code buffer and parameter array */ | ||
498 | jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * | ||
499 | MAX_OPTIMIZE_PROBES, GFP_KERNEL); | ||
500 | if (!jump_poke_bufs) | ||
501 | return -ENOMEM; | ||
502 | |||
503 | jump_poke_params = kmalloc(sizeof(struct text_poke_param) * | ||
504 | MAX_OPTIMIZE_PROBES, GFP_KERNEL); | ||
505 | if (!jump_poke_params) { | ||
506 | kfree(jump_poke_bufs); | ||
507 | jump_poke_bufs = NULL; | ||
508 | return -ENOMEM; | ||
509 | } | ||
510 | |||
511 | return 0; | ||
512 | } | ||
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7da647d8b64c..e213fc8408d2 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c | |||
@@ -30,16 +30,15 @@ | |||
30 | * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi | 30 | * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi |
31 | * <prasanna@in.ibm.com> added function-return probes. | 31 | * <prasanna@in.ibm.com> added function-return probes. |
32 | * 2005-May Rusty Lynch <rusty.lynch@intel.com> | 32 | * 2005-May Rusty Lynch <rusty.lynch@intel.com> |
33 | * Added function return probes functionality | 33 | * Added function return probes functionality |
34 | * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added | 34 | * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added |
35 | * kprobe-booster and kretprobe-booster for i386. | 35 | * kprobe-booster and kretprobe-booster for i386. |
36 | * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster | 36 | * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster |
37 | * and kretprobe-booster for x86-64 | 37 | * and kretprobe-booster for x86-64 |
38 | * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven | 38 | * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven |
39 | * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> | 39 | * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> |
40 | * unified x86 kprobes code. | 40 | * unified x86 kprobes code. |
41 | */ | 41 | */ |
42 | |||
43 | #include <linux/kprobes.h> | 42 | #include <linux/kprobes.h> |
44 | #include <linux/ptrace.h> | 43 | #include <linux/ptrace.h> |
45 | #include <linux/string.h> | 44 | #include <linux/string.h> |
@@ -59,6 +58,8 @@ | |||
59 | #include <asm/insn.h> | 58 | #include <asm/insn.h> |
60 | #include <asm/debugreg.h> | 59 | #include <asm/debugreg.h> |
61 | 60 | ||
61 | #include "kprobes-common.h" | ||
62 | |||
62 | void jprobe_return_end(void); | 63 | void jprobe_return_end(void); |
63 | 64 | ||
64 | DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; | 65 | DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; |
@@ -108,6 +109,7 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { | |||
108 | doesn't switch kernel stack.*/ | 109 | doesn't switch kernel stack.*/ |
109 | {NULL, NULL} /* Terminator */ | 110 | {NULL, NULL} /* Terminator */ |
110 | }; | 111 | }; |
112 | |||
111 | const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); | 113 | const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); |
112 | 114 | ||
113 | static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) | 115 | static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) |
@@ -123,11 +125,17 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) | |||
123 | } | 125 | } |
124 | 126 | ||
125 | /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ | 127 | /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ |
126 | static void __kprobes synthesize_reljump(void *from, void *to) | 128 | void __kprobes synthesize_reljump(void *from, void *to) |
127 | { | 129 | { |
128 | __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); | 130 | __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); |
129 | } | 131 | } |
130 | 132 | ||
133 | /* Insert a call instruction at address 'from', which calls address 'to'.*/ | ||
134 | void __kprobes synthesize_relcall(void *from, void *to) | ||
135 | { | ||
136 | __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); | ||
137 | } | ||
138 | |||
131 | /* | 139 | /* |
132 | * Skip the prefixes of the instruction. | 140 | * Skip the prefixes of the instruction. |
133 | */ | 141 | */ |
@@ -151,7 +159,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) | |||
151 | * Returns non-zero if opcode is boostable. | 159 | * Returns non-zero if opcode is boostable. |
152 | * RIP relative instructions are adjusted at copying time in 64 bits mode | 160 | * RIP relative instructions are adjusted at copying time in 64 bits mode |
153 | */ | 161 | */ |
154 | static int __kprobes can_boost(kprobe_opcode_t *opcodes) | 162 | int __kprobes can_boost(kprobe_opcode_t *opcodes) |
155 | { | 163 | { |
156 | kprobe_opcode_t opcode; | 164 | kprobe_opcode_t opcode; |
157 | kprobe_opcode_t *orig_opcodes = opcodes; | 165 | kprobe_opcode_t *orig_opcodes = opcodes; |
@@ -207,13 +215,15 @@ retry: | |||
207 | } | 215 | } |
208 | } | 216 | } |
209 | 217 | ||
210 | /* Recover the probed instruction at addr for further analysis. */ | 218 | static unsigned long |
211 | static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) | 219 | __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) |
212 | { | 220 | { |
213 | struct kprobe *kp; | 221 | struct kprobe *kp; |
222 | |||
214 | kp = get_kprobe((void *)addr); | 223 | kp = get_kprobe((void *)addr); |
224 | /* There is no probe, return original address */ | ||
215 | if (!kp) | 225 | if (!kp) |
216 | return -EINVAL; | 226 | return addr; |
217 | 227 | ||
218 | /* | 228 | /* |
219 | * Basically, kp->ainsn.insn has an original instruction. | 229 | * Basically, kp->ainsn.insn has an original instruction. |
@@ -230,14 +240,29 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) | |||
230 | */ | 240 | */ |
231 | memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); | 241 | memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); |
232 | buf[0] = kp->opcode; | 242 | buf[0] = kp->opcode; |
233 | return 0; | 243 | return (unsigned long)buf; |
244 | } | ||
245 | |||
246 | /* | ||
247 | * Recover the probed instruction at addr for further analysis. | ||
248 | * The caller must lock kprobes via kprobe_mutex, or disable preemption, | ||
249 | * to prevent the referenced kprobes from being released. | ||
250 | */ | ||
251 | unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) | ||
252 | { | ||
253 | unsigned long __addr; | ||
254 | |||
255 | __addr = __recover_optprobed_insn(buf, addr); | ||
256 | if (__addr != addr) | ||
257 | return __addr; | ||
258 | |||
259 | return __recover_probed_insn(buf, addr); | ||
234 | } | 260 | } |
235 | 261 | ||
236 | /* Check if paddr is at an instruction boundary */ | 262 | /* Check if paddr is at an instruction boundary */ |
237 | static int __kprobes can_probe(unsigned long paddr) | 263 | static int __kprobes can_probe(unsigned long paddr) |
238 | { | 264 | { |
239 | int ret; | 265 | unsigned long addr, __addr, offset = 0; |
240 | unsigned long addr, offset = 0; | ||
241 | struct insn insn; | 266 | struct insn insn; |
242 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | 267 | kprobe_opcode_t buf[MAX_INSN_SIZE]; |
243 | 268 | ||
@@ -247,26 +272,24 @@ static int __kprobes can_probe(unsigned long paddr) | |||
247 | /* Decode instructions */ | 272 | /* Decode instructions */ |
248 | addr = paddr - offset; | 273 | addr = paddr - offset; |
249 | while (addr < paddr) { | 274 | while (addr < paddr) { |
250 | kernel_insn_init(&insn, (void *)addr); | ||
251 | insn_get_opcode(&insn); | ||
252 | |||
253 | /* | 275 | /* |
254 | * Check if the instruction has been modified by another | 276 | * Check if the instruction has been modified by another |
255 | * kprobe, in which case we replace the breakpoint by the | 277 | * kprobe, in which case we replace the breakpoint by the |
256 | * original instruction in our buffer. | 278 | * original instruction in our buffer. |
279 | * Also, jump optimization will change the breakpoint to a | ||
280 | * relative-jump. Since a relative-jump is a normally used | ||
281 | * instruction, we simply carry on if there is no kprobe. | ||
257 | */ | 282 | */ |
258 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { | 283 | __addr = recover_probed_instruction(buf, addr); |
259 | ret = recover_probed_instruction(buf, addr); | 284 | kernel_insn_init(&insn, (void *)__addr); |
260 | if (ret) | ||
261 | /* | ||
262 | * Another debugging subsystem might insert | ||
263 | * this breakpoint. In that case, we can't | ||
264 | * recover it. | ||
265 | */ | ||
266 | return 0; | ||
267 | kernel_insn_init(&insn, buf); | ||
268 | } | ||
269 | insn_get_length(&insn); | 285 | insn_get_length(&insn); |
286 | |||
287 | /* | ||
288 | * Another debugging subsystem might insert this breakpoint. | ||
289 | * In that case, we can't recover it. | ||
290 | */ | ||
291 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) | ||
292 | return 0; | ||
270 | addr += insn.length; | 293 | addr += insn.length; |
271 | } | 294 | } |
272 | 295 | ||
@@ -299,24 +322,16 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) | |||
299 | * If not, return null. | 322 | * If not, return null. |
300 | * Only applicable to 64-bit x86. | 323 | * Only applicable to 64-bit x86. |
301 | */ | 324 | */ |
302 | static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) | 325 | int __kprobes __copy_instruction(u8 *dest, u8 *src) |
303 | { | 326 | { |
304 | struct insn insn; | 327 | struct insn insn; |
305 | int ret; | ||
306 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | 328 | kprobe_opcode_t buf[MAX_INSN_SIZE]; |
307 | 329 | ||
308 | kernel_insn_init(&insn, src); | 330 | kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); |
309 | if (recover) { | ||
310 | insn_get_opcode(&insn); | ||
311 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { | ||
312 | ret = recover_probed_instruction(buf, | ||
313 | (unsigned long)src); | ||
314 | if (ret) | ||
315 | return 0; | ||
316 | kernel_insn_init(&insn, buf); | ||
317 | } | ||
318 | } | ||
319 | insn_get_length(&insn); | 331 | insn_get_length(&insn); |
332 | /* Another subsystem has put a breakpoint; we failed to recover it */ | ||
333 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) | ||
334 | return 0; | ||
320 | memcpy(dest, insn.kaddr, insn.length); | 335 | memcpy(dest, insn.kaddr, insn.length); |
321 | 336 | ||
322 | #ifdef CONFIG_X86_64 | 337 | #ifdef CONFIG_X86_64 |
@@ -337,8 +352,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) | |||
337 | * extension of the original signed 32-bit displacement would | 352 | * extension of the original signed 32-bit displacement would |
338 | * have given. | 353 | * have given. |
339 | */ | 354 | */ |
340 | newdisp = (u8 *) src + (s64) insn.displacement.value - | 355 | newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; |
341 | (u8 *) dest; | ||
342 | BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ | 356 | BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ |
343 | disp = (u8 *) dest + insn_offset_displacement(&insn); | 357 | disp = (u8 *) dest + insn_offset_displacement(&insn); |
344 | *(s32 *) disp = (s32) newdisp; | 358 | *(s32 *) disp = (s32) newdisp; |
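The fixup above preserves the absolute target of a RIP-relative operand: the target is the end of the instruction plus the displacement, and because the instruction has the same length at src and dest, those length terms cancel, leaving newdisp = src + disp - dest. Worked numbers (addresses invented):

    #include <stdio.h>

    int main(void)
    {
            long src = 0x1000, dest = 0x9000; /* copy source and destination */
            long disp = 0x2000;               /* original rel32 operand      */
            long newdisp = src + disp - dest; /* same formula as the patch   */

            printf("src+disp  = %#lx\n", src + disp);     /* 0x3000 */
            printf("dest+new  = %#lx\n", dest + newdisp); /* 0x3000 */
            return 0;
    }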
@@ -349,18 +363,20 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) | |||
349 | 363 | ||
350 | static void __kprobes arch_copy_kprobe(struct kprobe *p) | 364 | static void __kprobes arch_copy_kprobe(struct kprobe *p) |
351 | { | 365 | { |
366 | /* Copy the instruction, recovering it if another optprobe has modified it. */ | ||
367 | __copy_instruction(p->ainsn.insn, p->addr); | ||
368 | |||
352 | /* | 369 | /* |
353 | * Copy an instruction without recovering int3, because it will be | 370 | * __copy_instruction can modify the displacement of the instruction, |
354 | * put by another subsystem. | 371 | * but it doesn't affect boostable check. |
355 | */ | 372 | */ |
356 | __copy_instruction(p->ainsn.insn, p->addr, 0); | 373 | if (can_boost(p->ainsn.insn)) |
357 | |||
358 | if (can_boost(p->addr)) | ||
359 | p->ainsn.boostable = 0; | 374 | p->ainsn.boostable = 0; |
360 | else | 375 | else |
361 | p->ainsn.boostable = -1; | 376 | p->ainsn.boostable = -1; |
362 | 377 | ||
363 | p->opcode = *p->addr; | 378 | /* Also, displacement change doesn't affect the first byte */ |
379 | p->opcode = p->ainsn.insn[0]; | ||
364 | } | 380 | } |
365 | 381 | ||
366 | int __kprobes arch_prepare_kprobe(struct kprobe *p) | 382 | int __kprobes arch_prepare_kprobe(struct kprobe *p) |
@@ -442,8 +458,8 @@ static void __kprobes restore_btf(void) | |||
442 | } | 458 | } |
443 | } | 459 | } |
444 | 460 | ||
445 | void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, | 461 | void __kprobes |
446 | struct pt_regs *regs) | 462 | arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) |
447 | { | 463 | { |
448 | unsigned long *sara = stack_addr(regs); | 464 | unsigned long *sara = stack_addr(regs); |
449 | 465 | ||
@@ -453,16 +469,8 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, | |||
453 | *sara = (unsigned long) &kretprobe_trampoline; | 469 | *sara = (unsigned long) &kretprobe_trampoline; |
454 | } | 470 | } |
455 | 471 | ||
456 | #ifdef CONFIG_OPTPROBES | 472 | static void __kprobes |
457 | static int __kprobes setup_detour_execution(struct kprobe *p, | 473 | setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) |
458 | struct pt_regs *regs, | ||
459 | int reenter); | ||
460 | #else | ||
461 | #define setup_detour_execution(p, regs, reenter) (0) | ||
462 | #endif | ||
463 | |||
464 | static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, | ||
465 | struct kprobe_ctlblk *kcb, int reenter) | ||
466 | { | 474 | { |
467 | if (setup_detour_execution(p, regs, reenter)) | 475 | if (setup_detour_execution(p, regs, reenter)) |
468 | return; | 476 | return; |
@@ -504,8 +512,8 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, | |||
504 | * within the handler. We save the original kprobes variables and just single | 512 | * within the handler. We save the original kprobes variables and just single |
505 | * step on the instruction of the new probe without calling any user handlers. | 513 | * step on the instruction of the new probe without calling any user handlers. |
506 | */ | 514 | */ |
507 | static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, | 515 | static int __kprobes |
508 | struct kprobe_ctlblk *kcb) | 516 | reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) |
509 | { | 517 | { |
510 | switch (kcb->kprobe_status) { | 518 | switch (kcb->kprobe_status) { |
511 | case KPROBE_HIT_SSDONE: | 519 | case KPROBE_HIT_SSDONE: |
@@ -600,69 +608,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) | |||
600 | return 0; | 608 | return 0; |
601 | } | 609 | } |
602 | 610 | ||
603 | #ifdef CONFIG_X86_64 | ||
604 | #define SAVE_REGS_STRING \ | ||
605 | /* Skip cs, ip, orig_ax. */ \ | ||
606 | " subq $24, %rsp\n" \ | ||
607 | " pushq %rdi\n" \ | ||
608 | " pushq %rsi\n" \ | ||
609 | " pushq %rdx\n" \ | ||
610 | " pushq %rcx\n" \ | ||
611 | " pushq %rax\n" \ | ||
612 | " pushq %r8\n" \ | ||
613 | " pushq %r9\n" \ | ||
614 | " pushq %r10\n" \ | ||
615 | " pushq %r11\n" \ | ||
616 | " pushq %rbx\n" \ | ||
617 | " pushq %rbp\n" \ | ||
618 | " pushq %r12\n" \ | ||
619 | " pushq %r13\n" \ | ||
620 | " pushq %r14\n" \ | ||
621 | " pushq %r15\n" | ||
622 | #define RESTORE_REGS_STRING \ | ||
623 | " popq %r15\n" \ | ||
624 | " popq %r14\n" \ | ||
625 | " popq %r13\n" \ | ||
626 | " popq %r12\n" \ | ||
627 | " popq %rbp\n" \ | ||
628 | " popq %rbx\n" \ | ||
629 | " popq %r11\n" \ | ||
630 | " popq %r10\n" \ | ||
631 | " popq %r9\n" \ | ||
632 | " popq %r8\n" \ | ||
633 | " popq %rax\n" \ | ||
634 | " popq %rcx\n" \ | ||
635 | " popq %rdx\n" \ | ||
636 | " popq %rsi\n" \ | ||
637 | " popq %rdi\n" \ | ||
638 | /* Skip orig_ax, ip, cs */ \ | ||
639 | " addq $24, %rsp\n" | ||
640 | #else | ||
641 | #define SAVE_REGS_STRING \ | ||
642 | /* Skip cs, ip, orig_ax and gs. */ \ | ||
643 | " subl $16, %esp\n" \ | ||
644 | " pushl %fs\n" \ | ||
645 | " pushl %es\n" \ | ||
646 | " pushl %ds\n" \ | ||
647 | " pushl %eax\n" \ | ||
648 | " pushl %ebp\n" \ | ||
649 | " pushl %edi\n" \ | ||
650 | " pushl %esi\n" \ | ||
651 | " pushl %edx\n" \ | ||
652 | " pushl %ecx\n" \ | ||
653 | " pushl %ebx\n" | ||
654 | #define RESTORE_REGS_STRING \ | ||
655 | " popl %ebx\n" \ | ||
656 | " popl %ecx\n" \ | ||
657 | " popl %edx\n" \ | ||
658 | " popl %esi\n" \ | ||
659 | " popl %edi\n" \ | ||
660 | " popl %ebp\n" \ | ||
661 | " popl %eax\n" \ | ||
662 | /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ | ||
663 | " addl $24, %esp\n" | ||
664 | #endif | ||
665 | |||
666 | /* | 611 | /* |
667 | * When a retprobed function returns, this code saves registers and | 612 | * When a retprobed function returns, this code saves registers and |
668 | * calls trampoline_handler(), which calls the kretprobe's handler. | 613 | * calls trampoline_handler(), which calls the kretprobe's handler. |
@@ -816,8 +761,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) | |||
816 | * jump instruction after the copied instruction, that jumps to the next | 761 | * jump instruction after the copied instruction, that jumps to the next |
817 | * instruction after the probepoint. | 762 | * instruction after the probepoint. |
818 | */ | 763 | */ |
819 | static void __kprobes resume_execution(struct kprobe *p, | 764 | static void __kprobes |
820 | struct pt_regs *regs, struct kprobe_ctlblk *kcb) | 765 | resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) |
821 | { | 766 | { |
822 | unsigned long *tos = stack_addr(regs); | 767 | unsigned long *tos = stack_addr(regs); |
823 | unsigned long copy_ip = (unsigned long)p->ainsn.insn; | 768 | unsigned long copy_ip = (unsigned long)p->ainsn.insn; |
@@ -996,8 +941,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) | |||
996 | /* | 941 | /* |
997 | * Wrapper routine for handling exceptions. | 942 | * Wrapper routine for handling exceptions. |
998 | */ | 943 | */ |
999 | int __kprobes kprobe_exceptions_notify(struct notifier_block *self, | 944 | int __kprobes |
1000 | unsigned long val, void *data) | 945 | kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) |
1001 | { | 946 | { |
1002 | struct die_args *args = data; | 947 | struct die_args *args = data; |
1003 | int ret = NOTIFY_DONE; | 948 | int ret = NOTIFY_DONE; |
@@ -1107,466 +1052,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) | |||
1107 | return 0; | 1052 | return 0; |
1108 | } | 1053 | } |
1109 | 1054 | ||
1110 | |||
1111 | #ifdef CONFIG_OPTPROBES | ||
1112 | |||
1113 | /* Insert a call instruction at address 'from', which calls address 'to'. */ | ||
1114 | static void __kprobes synthesize_relcall(void *from, void *to) | ||
1115 | { | ||
1116 | __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); | ||
1117 | } | ||
1118 | |||
1119 | /* Insert a move instruction that loads a pointer into eax/rdi (1st arg). */ | ||
1120 | static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, | ||
1121 | unsigned long val) | ||
1122 | { | ||
1123 | #ifdef CONFIG_X86_64 | ||
1124 | *addr++ = 0x48; | ||
1125 | *addr++ = 0xbf; | ||
1126 | #else | ||
1127 | *addr++ = 0xb8; | ||
1128 | #endif | ||
1129 | *(unsigned long *)addr = val; | ||
1130 | } | ||
1131 | |||
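synthesize_set_arg1() emits the shortest "load this constant into the first-argument register" instruction for each ABI: 0x48 0xbf is REX.W plus opcode B8+rd with rd = 7, i.e. movabs $imm64, %rdi, while 0xb8 alone is mov $imm32, %eax. A userspace sketch of the same emission (hypothetical helper, illustrative only):

#include <stdio.h>
#include <string.h>

static size_t emit_set_arg1(unsigned char *buf, unsigned long val)
{
	size_t i = 0;
#ifdef __x86_64__
	buf[i++] = 0x48;	/* REX.W prefix */
	buf[i++] = 0xbf;	/* mov $imm64, %rdi (opcode B8+rd, rd = 7) */
#else
	buf[i++] = 0xb8;	/* mov $imm32, %eax */
#endif
	memcpy(&buf[i], &val, sizeof(val));	/* little-endian immediate */
	return i + sizeof(val);
}

int main(void)
{
	unsigned char buf[16];
	size_t n = emit_set_arg1(buf, 0x1234);

	for (size_t j = 0; j < n; j++)
		printf("%02x ", buf[j]);
	printf("\n");
	return 0;
}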
1132 | static void __used __kprobes kprobes_optinsn_template_holder(void) | ||
1133 | { | ||
1134 | asm volatile ( | ||
1135 | ".global optprobe_template_entry\n" | ||
1136 | "optprobe_template_entry: \n" | ||
1137 | #ifdef CONFIG_X86_64 | ||
1138 | /* We don't bother saving the ss register */ | ||
1139 | " pushq %rsp\n" | ||
1140 | " pushfq\n" | ||
1141 | SAVE_REGS_STRING | ||
1142 | " movq %rsp, %rsi\n" | ||
1143 | ".global optprobe_template_val\n" | ||
1144 | "optprobe_template_val: \n" | ||
1145 | ASM_NOP5 | ||
1146 | ASM_NOP5 | ||
1147 | ".global optprobe_template_call\n" | ||
1148 | "optprobe_template_call: \n" | ||
1149 | ASM_NOP5 | ||
1150 | /* Move flags to rsp */ | ||
1151 | " movq 144(%rsp), %rdx\n" | ||
1152 | " movq %rdx, 152(%rsp)\n" | ||
1153 | RESTORE_REGS_STRING | ||
1154 | /* Skip flags entry */ | ||
1155 | " addq $8, %rsp\n" | ||
1156 | " popfq\n" | ||
1157 | #else /* CONFIG_X86_32 */ | ||
1158 | " pushf\n" | ||
1159 | SAVE_REGS_STRING | ||
1160 | " movl %esp, %edx\n" | ||
1161 | ".global optprobe_template_val\n" | ||
1162 | "optprobe_template_val: \n" | ||
1163 | ASM_NOP5 | ||
1164 | ".global optprobe_template_call\n" | ||
1165 | "optprobe_template_call: \n" | ||
1166 | ASM_NOP5 | ||
1167 | RESTORE_REGS_STRING | ||
1168 | " addl $4, %esp\n" /* skip cs */ | ||
1169 | " popf\n" | ||
1170 | #endif | ||
1171 | ".global optprobe_template_end\n" | ||
1172 | "optprobe_template_end: \n"); | ||
1173 | } | ||
1174 | |||
1175 | #define TMPL_MOVE_IDX \ | ||
1176 | ((long)&optprobe_template_val - (long)&optprobe_template_entry) | ||
1177 | #define TMPL_CALL_IDX \ | ||
1178 | ((long)&optprobe_template_call - (long)&optprobe_template_entry) | ||
1179 | #define TMPL_END_IDX \ | ||
1180 | ((long)&optprobe_template_end - (long)&optprobe_template_entry) | ||
1181 | |||
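The TMPL_*_IDX macros convert the global labels planted inside the template into byte offsets, which arch_prepare_optimized_kprobe() below uses to patch its private copy of the template. The same label-arithmetic trick in a tiny userspace analog (hypothetical labels; GCC/Clang on Linux):

#include <stdio.h>

__asm__(".text\n"
	".globl tmpl_entry\n"
	".globl tmpl_val\n"
	".globl tmpl_end\n"
	"tmpl_entry:\n"
	"\t.byte 0x90\n"	/* stand-in for the register-save code */
	"tmpl_val:\n"
	"\t.skip 5, 0x90\n"	/* patchable NOP slide, like ASM_NOP5 */
	"tmpl_end:\n");

extern const char tmpl_entry[], tmpl_val[], tmpl_end[];

int main(void)
{
	printf("val offset = %ld, template size = %ld\n",
	       (long)(tmpl_val - tmpl_entry),
	       (long)(tmpl_end - tmpl_entry));	/* prints 1 and 6 */
	return 0;
}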
1182 | #define INT3_SIZE sizeof(kprobe_opcode_t) | ||
1183 | |||
1184 | /* Optimized kprobe callback function: called from optinsn */ | ||
1185 | static void __kprobes optimized_callback(struct optimized_kprobe *op, | ||
1186 | struct pt_regs *regs) | ||
1187 | { | ||
1188 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
1189 | unsigned long flags; | ||
1190 | |||
1191 | /* This is possible if op is under delayed unoptimization */ | ||
1192 | if (kprobe_disabled(&op->kp)) | ||
1193 | return; | ||
1194 | |||
1195 | local_irq_save(flags); | ||
1196 | if (kprobe_running()) { | ||
1197 | kprobes_inc_nmissed_count(&op->kp); | ||
1198 | } else { | ||
1199 | /* Save skipped registers */ | ||
1200 | #ifdef CONFIG_X86_64 | ||
1201 | regs->cs = __KERNEL_CS; | ||
1202 | #else | ||
1203 | regs->cs = __KERNEL_CS | get_kernel_rpl(); | ||
1204 | regs->gs = 0; | ||
1205 | #endif | ||
1206 | regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; | ||
1207 | regs->orig_ax = ~0UL; | ||
1208 | |||
1209 | __this_cpu_write(current_kprobe, &op->kp); | ||
1210 | kcb->kprobe_status = KPROBE_HIT_ACTIVE; | ||
1211 | opt_pre_handler(&op->kp, regs); | ||
1212 | __this_cpu_write(current_kprobe, NULL); | ||
1213 | } | ||
1214 | local_irq_restore(flags); | ||
1215 | } | ||
1216 | |||
1217 | static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) | ||
1218 | { | ||
1219 | int len = 0, ret; | ||
1220 | |||
1221 | while (len < RELATIVEJUMP_SIZE) { | ||
1222 | ret = __copy_instruction(dest + len, src + len, 1); | ||
1223 | if (!ret || !can_boost(dest + len)) | ||
1224 | return -EINVAL; | ||
1225 | len += ret; | ||
1226 | } | ||
1227 | /* Check whether the address range is reserved */ | ||
1228 | if (ftrace_text_reserved(src, src + len - 1) || | ||
1229 | alternatives_text_reserved(src, src + len - 1) || | ||
1230 | jump_label_text_reserved(src, src + len - 1)) | ||
1231 | return -EBUSY; | ||
1232 | |||
1233 | return len; | ||
1234 | } | ||
1235 | |||
1236 | /* Check whether insn is an indirect jump */ | ||
1237 | static int __kprobes insn_is_indirect_jump(struct insn *insn) | ||
1238 | { | ||
1239 | return ((insn->opcode.bytes[0] == 0xff && | ||
1240 | (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ | ||
1241 | insn->opcode.bytes[0] == 0xea); /* Segment based jump */ | ||
1242 | } | ||
1243 | |||
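The (reg & 6) == 4 test works because, in the 0xff opcode group, ModRM reg field 4 encodes jmp r/m and reg field 5 encodes the far ljmp form; masking with 6 accepts exactly those two values. A hypothetical userspace illustration:

#include <stdio.h>

static int modrm_reg(unsigned char modrm)
{
	return (modrm >> 3) & 7;	/* what X86_MODRM_REG() extracts */
}

static int is_indirect_jump_ff(unsigned char opcode, unsigned char modrm)
{
	return opcode == 0xff && (modrm_reg(modrm) & 6) == 4;
}

int main(void)
{
	printf("%d\n", is_indirect_jump_ff(0xff, 0xe0));	/* jmp *%rax -> 1 */
	printf("%d\n", is_indirect_jump_ff(0xff, 0xd0));	/* call *%rax -> 0 */
	return 0;
}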
1244 | /* Check whether insn jumps into the specified address range */ | ||
1245 | static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) | ||
1246 | { | ||
1247 | unsigned long target = 0; | ||
1248 | |||
1249 | switch (insn->opcode.bytes[0]) { | ||
1250 | case 0xe0: /* loopne */ | ||
1251 | case 0xe1: /* loope */ | ||
1252 | case 0xe2: /* loop */ | ||
1253 | case 0xe3: /* jcxz */ | ||
1254 | case 0xe9: /* near relative jump */ | ||
1255 | case 0xeb: /* short relative jump */ | ||
1256 | break; | ||
1257 | case 0x0f: | ||
1258 | if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ | ||
1259 | break; | ||
1260 | return 0; | ||
1261 | default: | ||
1262 | if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ | ||
1263 | break; | ||
1264 | return 0; | ||
1265 | } | ||
1266 | target = (unsigned long)insn->next_byte + insn->immediate.value; | ||
1267 | |||
1268 | return (start <= target && target <= start + len); | ||
1269 | } | ||
1270 | |||
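insn_jump_into_range() resolves a relative branch the way the CPU does: the target is the address of the next instruction plus the sign-extended displacement, and the range test is inclusive at both ends. A sketch with made-up addresses (illustrative only):

#include <stdio.h>
#include <stdint.h>

static unsigned long jump_target(unsigned long next_insn, int32_t disp)
{
	return next_insn + (long)disp;	/* sign-extended displacement */
}

static int jump_into_range(unsigned long target, unsigned long start, int len)
{
	return start <= target && target <= start + (unsigned long)len;
}

int main(void)
{
	/* a 2-byte short branch at 0x1000 with displacement +3 */
	unsigned long t = jump_target(0x1002, 3);

	printf("target=%#lx, inside [0x1000, 0x1005]? %d\n",
	       t, jump_into_range(t, 0x1000, 5));	/* 0x1005, 1 */
	return 0;
}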
1272 | /* Decode the whole function to ensure no instruction jumps into the target */ | ||
1272 | static int __kprobes can_optimize(unsigned long paddr) | ||
1273 | { | ||
1274 | int ret; | ||
1275 | unsigned long addr, size = 0, offset = 0; | ||
1276 | struct insn insn; | ||
1277 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | ||
1278 | |||
1279 | /* Look up the symbol that contains addr */ | ||
1280 | if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) | ||
1281 | return 0; | ||
1282 | |||
1283 | /* | ||
1284 | * Do not optimize in the entry code due to the unstable | ||
1285 | * stack handling. | ||
1286 | */ | ||
1287 | if ((paddr >= (unsigned long )__entry_text_start) && | ||
1288 | (paddr < (unsigned long )__entry_text_end)) | ||
1289 | return 0; | ||
1290 | |||
1291 | /* Check there is enough space for a relative jump. */ | ||
1292 | if (size - offset < RELATIVEJUMP_SIZE) | ||
1293 | return 0; | ||
1294 | |||
1295 | /* Decode instructions */ | ||
1296 | addr = paddr - offset; | ||
1297 | while (addr < paddr - offset + size) { /* Decode until function end */ | ||
1298 | if (search_exception_tables(addr)) | ||
1299 | /* | ||
1300 | * Since some fixup code will jump into this function, | ||
1301 | * we can't optimize a kprobe in this function. | ||
1302 | */ | ||
1303 | return 0; | ||
1304 | kernel_insn_init(&insn, (void *)addr); | ||
1305 | insn_get_opcode(&insn); | ||
1306 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { | ||
1307 | ret = recover_probed_instruction(buf, addr); | ||
1308 | if (ret) | ||
1309 | return 0; | ||
1310 | kernel_insn_init(&insn, buf); | ||
1311 | } | ||
1312 | insn_get_length(&insn); | ||
1313 | /* Recover address */ | ||
1314 | insn.kaddr = (void *)addr; | ||
1315 | insn.next_byte = (void *)(addr + insn.length); | ||
1316 | /* Check that no instruction jumps into the target */ | ||
1317 | if (insn_is_indirect_jump(&insn) || | ||
1318 | insn_jump_into_range(&insn, paddr + INT3_SIZE, | ||
1319 | RELATIVE_ADDR_SIZE)) | ||
1320 | return 0; | ||
1321 | addr += insn.length; | ||
1322 | } | ||
1323 | |||
1324 | return 1; | ||
1325 | } | ||
1326 | |||
1327 | /* Check whether the optimized_kprobe can actually be optimized. */ | ||
1328 | int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) | ||
1329 | { | ||
1330 | int i; | ||
1331 | struct kprobe *p; | ||
1332 | |||
1333 | for (i = 1; i < op->optinsn.size; i++) { | ||
1334 | p = get_kprobe(op->kp.addr + i); | ||
1335 | if (p && !kprobe_disabled(p)) | ||
1336 | return -EEXIST; | ||
1337 | } | ||
1338 | |||
1339 | return 0; | ||
1340 | } | ||
1341 | |||
1342 | /* Check whether addr is within the optimized instructions. */ | ||
1343 | int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, | ||
1344 | unsigned long addr) | ||
1345 | { | ||
1346 | return ((unsigned long)op->kp.addr <= addr && | ||
1347 | (unsigned long)op->kp.addr + op->optinsn.size > addr); | ||
1348 | } | ||
1349 | |||
1350 | /* Free optimized instruction slot */ | ||
1351 | static __kprobes | ||
1352 | void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) | ||
1353 | { | ||
1354 | if (op->optinsn.insn) { | ||
1355 | free_optinsn_slot(op->optinsn.insn, dirty); | ||
1356 | op->optinsn.insn = NULL; | ||
1357 | op->optinsn.size = 0; | ||
1358 | } | ||
1359 | } | ||
1360 | |||
1361 | void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) | ||
1362 | { | ||
1363 | __arch_remove_optimized_kprobe(op, 1); | ||
1364 | } | ||
1365 | |||
1366 | /* | ||
1367 | * Copy the target instructions that will be replaced by the jump. | ||
1368 | * Target instructions MUST be relocatable (checked inside). | ||
1369 | */ | ||
1370 | int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) | ||
1371 | { | ||
1372 | u8 *buf; | ||
1373 | int ret; | ||
1374 | long rel; | ||
1375 | |||
1376 | if (!can_optimize((unsigned long)op->kp.addr)) | ||
1377 | return -EILSEQ; | ||
1378 | |||
1379 | op->optinsn.insn = get_optinsn_slot(); | ||
1380 | if (!op->optinsn.insn) | ||
1381 | return -ENOMEM; | ||
1382 | |||
1383 | /* | ||
1384 | * Verify that the address gap is within the 2GB range reachable | ||
1385 | * by a relative jump. | ||
1386 | */ | ||
1387 | rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; | ||
1388 | if (abs(rel) > 0x7fffffff) | ||
1389 | return -ERANGE; | ||
1390 | |||
1391 | buf = (u8 *)op->optinsn.insn; | ||
1392 | |||
1393 | /* Copy instructions into the out-of-line buffer */ | ||
1394 | ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); | ||
1395 | if (ret < 0) { | ||
1396 | __arch_remove_optimized_kprobe(op, 0); | ||
1397 | return ret; | ||
1398 | } | ||
1399 | op->optinsn.size = ret; | ||
1400 | |||
1401 | /* Copy the arch-dependent detour template */ | ||
1402 | memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); | ||
1403 | |||
1404 | /* Set probe information */ | ||
1405 | synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); | ||
1406 | |||
1407 | /* Set probe function call */ | ||
1408 | synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); | ||
1409 | |||
1410 | /* Set the returning jmp instruction at the tail of the out-of-line buffer */ | ||
1411 | synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, | ||
1412 | (u8 *)op->kp.addr + op->optinsn.size); | ||
1413 | |||
1414 | flush_icache_range((unsigned long) buf, | ||
1415 | (unsigned long) buf + TMPL_END_IDX + | ||
1416 | op->optinsn.size + RELATIVEJUMP_SIZE); | ||
1417 | return 0; | ||
1418 | } | ||
1419 | |||
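Taken together, the detour buffer assembled above ends up with this layout (offsets per the TMPL_*_IDX macros; illustrative summary, not literal source):

	buf + 0			copy of the template: save regs, set arg1, call, restore regs
	buf + TMPL_END_IDX	relocated copy of the probed instructions (optinsn.size bytes)
	after that		synthesized jmp back to kp.addr + optinsn.size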
1420 | #define MAX_OPTIMIZE_PROBES 256 | ||
1421 | static struct text_poke_param *jump_poke_params; | ||
1422 | static struct jump_poke_buffer { | ||
1423 | u8 buf[RELATIVEJUMP_SIZE]; | ||
1424 | } *jump_poke_bufs; | ||
1425 | |||
1426 | static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, | ||
1427 | u8 *insn_buf, | ||
1428 | struct optimized_kprobe *op) | ||
1429 | { | ||
1430 | s32 rel = (s32)((long)op->optinsn.insn - | ||
1431 | ((long)op->kp.addr + RELATIVEJUMP_SIZE)); | ||
1432 | |||
1433 | /* Back up the instructions that the jump address will overwrite */ | ||
1434 | memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, | ||
1435 | RELATIVE_ADDR_SIZE); | ||
1436 | |||
1437 | insn_buf[0] = RELATIVEJUMP_OPCODE; | ||
1438 | *(s32 *)(&insn_buf[1]) = rel; | ||
1439 | |||
1440 | tprm->addr = op->kp.addr; | ||
1441 | tprm->opcode = insn_buf; | ||
1442 | tprm->len = RELATIVEJUMP_SIZE; | ||
1443 | } | ||
1444 | |||
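The rel computation above is the standard rel32 rule for a 5-byte jmp: the displacement is counted from the end of the jump instruction, i.e. disp = destination - (source + 5). A tiny sketch with hypothetical addresses:

#include <stdio.h>
#include <stdint.h>

#define RELATIVEJUMP_SIZE 5	/* e9 + 4-byte displacement */

static int32_t rel32(unsigned long src, unsigned long dst)
{
	return (int32_t)(dst - (src + RELATIVEJUMP_SIZE));
}

int main(void)
{
	unsigned long probe = 0x1000, slot = 0x2000;

	printf("e9 disp32 = %ld\n", (long)rel32(probe, slot));	/* 4091 == 0xffb */
	return 0;
}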
1445 | /* | ||
1446 | * Replace breakpoints (int3) with relative jumps. | ||
1447 | * Caller must hold kprobe_mutex and text_mutex. | ||
1448 | */ | ||
1449 | void __kprobes arch_optimize_kprobes(struct list_head *oplist) | ||
1450 | { | ||
1451 | struct optimized_kprobe *op, *tmp; | ||
1452 | int c = 0; | ||
1453 | |||
1454 | list_for_each_entry_safe(op, tmp, oplist, list) { | ||
1455 | WARN_ON(kprobe_disabled(&op->kp)); | ||
1456 | /* Setup param */ | ||
1457 | setup_optimize_kprobe(&jump_poke_params[c], | ||
1458 | jump_poke_bufs[c].buf, op); | ||
1459 | list_del_init(&op->list); | ||
1460 | if (++c >= MAX_OPTIMIZE_PROBES) | ||
1461 | break; | ||
1462 | } | ||
1463 | |||
1464 | /* | ||
1465 | * text_poke_smp doesn't support modifying code in NMI/MCE | ||
1466 | * handlers. However, since kprobes itself also doesn't support | ||
1467 | * probing NMI/MCE code, this is not a problem. | ||
1468 | */ | ||
1469 | text_poke_smp_batch(jump_poke_params, c); | ||
1470 | } | ||
1471 | |||
1472 | static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, | ||
1473 | u8 *insn_buf, | ||
1474 | struct optimized_kprobe *op) | ||
1475 | { | ||
1476 | /* Set int3 to first byte for kprobes */ | ||
1477 | insn_buf[0] = BREAKPOINT_INSTRUCTION; | ||
1478 | memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); | ||
1479 | |||
1480 | tprm->addr = op->kp.addr; | ||
1481 | tprm->opcode = insn_buf; | ||
1482 | tprm->len = RELATIVEJUMP_SIZE; | ||
1483 | } | ||
1484 | |||
1485 | /* | ||
1486 | * Recover original instructions and breakpoints from relative jumps. | ||
1487 | * Caller must hold kprobe_mutex. | ||
1488 | */ | ||
1489 | void __kprobes arch_unoptimize_kprobes(struct list_head *oplist, | ||
1490 | struct list_head *done_list) | ||
1491 | { | ||
1492 | struct optimized_kprobe *op, *tmp; | ||
1493 | int c = 0; | ||
1494 | |||
1495 | list_for_each_entry_safe(op, tmp, oplist, list) { | ||
1496 | /* Setup param */ | ||
1497 | setup_unoptimize_kprobe(&jump_poke_params[c], | ||
1498 | jump_poke_bufs[c].buf, op); | ||
1499 | list_move(&op->list, done_list); | ||
1500 | if (++c >= MAX_OPTIMIZE_PROBES) | ||
1501 | break; | ||
1502 | } | ||
1503 | |||
1504 | /* | ||
1505 | * text_poke_smp doesn't support modifying code in NMI/MCE | ||
1506 | * handlers. However, since kprobes itself also doesn't support | ||
1507 | * probing NMI/MCE code, this is not a problem. | ||
1508 | */ | ||
1509 | text_poke_smp_batch(jump_poke_params, c); | ||
1510 | } | ||
1511 | |||
1512 | /* Replace a relative jump with a breakpoint (int3). */ | ||
1513 | void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) | ||
1514 | { | ||
1515 | u8 buf[RELATIVEJUMP_SIZE]; | ||
1516 | |||
1517 | /* Set int3 to first byte for kprobes */ | ||
1518 | buf[0] = BREAKPOINT_INSTRUCTION; | ||
1519 | memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); | ||
1520 | text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); | ||
1521 | } | ||
1522 | |||
1523 | static int __kprobes setup_detour_execution(struct kprobe *p, | ||
1524 | struct pt_regs *regs, | ||
1525 | int reenter) | ||
1526 | { | ||
1527 | struct optimized_kprobe *op; | ||
1528 | |||
1529 | if (p->flags & KPROBE_FLAG_OPTIMIZED) { | ||
1530 | /* This kprobe can really run the optimized path. */ | ||
1531 | op = container_of(p, struct optimized_kprobe, kp); | ||
1532 | /* Detour through copied instructions */ | ||
1533 | regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; | ||
1534 | if (!reenter) | ||
1535 | reset_current_kprobe(); | ||
1536 | preempt_enable_no_resched(); | ||
1537 | return 1; | ||
1538 | } | ||
1539 | return 0; | ||
1540 | } | ||
1541 | |||
1542 | static int __kprobes init_poke_params(void) | ||
1543 | { | ||
1544 | /* Allocate code buffer and parameter array */ | ||
1545 | jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * | ||
1546 | MAX_OPTIMIZE_PROBES, GFP_KERNEL); | ||
1547 | if (!jump_poke_bufs) | ||
1548 | return -ENOMEM; | ||
1549 | |||
1550 | jump_poke_params = kmalloc(sizeof(struct text_poke_param) * | ||
1551 | MAX_OPTIMIZE_PROBES, GFP_KERNEL); | ||
1552 | if (!jump_poke_params) { | ||
1553 | kfree(jump_poke_bufs); | ||
1554 | jump_poke_bufs = NULL; | ||
1555 | return -ENOMEM; | ||
1556 | } | ||
1557 | |||
1558 | return 0; | ||
1559 | } | ||
1560 | #else /* !CONFIG_OPTPROBES */ | ||
1561 | static int __kprobes init_poke_params(void) | ||
1562 | { | ||
1563 | return 0; | ||
1564 | } | ||
1565 | #endif | ||
1566 | |||
1567 | int __init arch_init_kprobes(void) | 1055 | int __init arch_init_kprobes(void) |
1568 | { | 1056 | { |
1569 | return init_poke_params(); | 1057 | return arch_init_optprobes(); |
1570 | } | 1058 | } |
1571 | 1059 | ||
1572 | int __kprobes arch_trampoline_kprobe(struct kprobe *p) | 1060 | int __kprobes arch_trampoline_kprobe(struct kprobe *p) |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f0c6fd6f176b..694d801bf606 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -438,9 +438,9 @@ void __init kvm_guest_init(void) | |||
438 | static __init int activate_jump_labels(void) | 438 | static __init int activate_jump_labels(void) |
439 | { | 439 | { |
440 | if (has_steal_clock) { | 440 | if (has_steal_clock) { |
441 | jump_label_inc(¶virt_steal_enabled); | 441 | static_key_slow_inc(¶virt_steal_enabled); |
442 | if (steal_acc) | 442 | if (steal_acc) |
443 | jump_label_inc(¶virt_steal_rq_enabled); | 443 | static_key_slow_inc(¶virt_steal_rq_enabled); |
444 | } | 444 | } |
445 | 445 | ||
446 | return 0; | 446 | return 0; |
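This hunk tracks the jump-label rework of this cycle: struct jump_label_key became struct static_key, and jump_label_inc()/jump_label_dec() became static_key_slow_inc()/static_key_slow_dec(). A kernel-style sketch of the renamed API (illustrative fragment, not a standalone module):

#include <linux/jump_label.h>
#include <linux/printk.h>

static struct static_key demo_key = STATIC_KEY_INIT_FALSE;

static void demo_fast_path(void)
{
	if (static_key_false(&demo_key))	/* patched to a jump once enabled */
		pr_info("slow path enabled\n");
}

static void demo_enable(void)
{
	static_key_slow_inc(&demo_key);		/* flips the branch at runtime */
}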
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index ac0417be9131..73465aab28f8 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c | |||
@@ -360,7 +360,6 @@ out: | |||
360 | static enum ucode_state | 360 | static enum ucode_state |
361 | request_microcode_user(int cpu, const void __user *buf, size_t size) | 361 | request_microcode_user(int cpu, const void __user *buf, size_t size) |
362 | { | 362 | { |
363 | pr_info("AMD microcode update via /dev/cpu/microcode not supported\n"); | ||
364 | return UCODE_ERROR; | 363 | return UCODE_ERROR; |
365 | } | 364 | } |
366 | 365 | ||
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d90272e6bc40..ada2f99388dd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -202,8 +202,8 @@ static void native_flush_tlb_single(unsigned long addr) | |||
202 | __native_flush_tlb_single(addr); | 202 | __native_flush_tlb_single(addr); |
203 | } | 203 | } |
204 | 204 | ||
205 | struct jump_label_key paravirt_steal_enabled; | 205 | struct static_key paravirt_steal_enabled; |
206 | struct jump_label_key paravirt_steal_rq_enabled; | 206 | struct static_key paravirt_steal_rq_enabled; |
207 | 207 | ||
208 | static u64 native_steal_clock(int cpu) | 208 | static u64 native_steal_clock(int cpu) |
209 | { | 209 | { |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 15763af7bfe3..44eefde92109 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -377,8 +377,8 @@ static inline int hlt_use_halt(void) | |||
377 | void default_idle(void) | 377 | void default_idle(void) |
378 | { | 378 | { |
379 | if (hlt_use_halt()) { | 379 | if (hlt_use_halt()) { |
380 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); | 380 | trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); |
381 | trace_cpu_idle(1, smp_processor_id()); | 381 | trace_cpu_idle_rcuidle(1, smp_processor_id()); |
382 | current_thread_info()->status &= ~TS_POLLING; | 382 | current_thread_info()->status &= ~TS_POLLING; |
383 | /* | 383 | /* |
384 | * TS_POLLING-cleared state must be visible before we | 384 | * TS_POLLING-cleared state must be visible before we |
@@ -391,8 +391,8 @@ void default_idle(void) | |||
391 | else | 391 | else |
392 | local_irq_enable(); | 392 | local_irq_enable(); |
393 | current_thread_info()->status |= TS_POLLING; | 393 | current_thread_info()->status |= TS_POLLING; |
394 | trace_power_end(smp_processor_id()); | 394 | trace_power_end_rcuidle(smp_processor_id()); |
395 | trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); | 395 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
396 | } else { | 396 | } else { |
397 | local_irq_enable(); | 397 | local_irq_enable(); |
398 | /* loop is done by the caller */ | 398 | /* loop is done by the caller */ |
@@ -450,8 +450,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); | |||
450 | static void mwait_idle(void) | 450 | static void mwait_idle(void) |
451 | { | 451 | { |
452 | if (!need_resched()) { | 452 | if (!need_resched()) { |
453 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); | 453 | trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); |
454 | trace_cpu_idle(1, smp_processor_id()); | 454 | trace_cpu_idle_rcuidle(1, smp_processor_id()); |
455 | if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) | 455 | if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) |
456 | clflush((void *)¤t_thread_info()->flags); | 456 | clflush((void *)¤t_thread_info()->flags); |
457 | 457 | ||
@@ -461,8 +461,8 @@ static void mwait_idle(void) | |||
461 | __sti_mwait(0, 0); | 461 | __sti_mwait(0, 0); |
462 | else | 462 | else |
463 | local_irq_enable(); | 463 | local_irq_enable(); |
464 | trace_power_end(smp_processor_id()); | 464 | trace_power_end_rcuidle(smp_processor_id()); |
465 | trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); | 465 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
466 | } else | 466 | } else |
467 | local_irq_enable(); | 467 | local_irq_enable(); |
468 | } | 468 | } |
@@ -474,13 +474,13 @@ static void mwait_idle(void) | |||
474 | */ | 474 | */ |
475 | static void poll_idle(void) | 475 | static void poll_idle(void) |
476 | { | 476 | { |
477 | trace_power_start(POWER_CSTATE, 0, smp_processor_id()); | 477 | trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id()); |
478 | trace_cpu_idle(0, smp_processor_id()); | 478 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
479 | local_irq_enable(); | 479 | local_irq_enable(); |
480 | while (!need_resched()) | 480 | while (!need_resched()) |
481 | cpu_relax(); | 481 | cpu_relax(); |
482 | trace_power_end(smp_processor_id()); | 482 | trace_power_end_rcuidle(smp_processor_id()); |
483 | trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); | 483 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
484 | } | 484 | } |
485 | 485 | ||
486 | /* | 486 | /* |
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 485204f58cda..49888fefe794 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -119,9 +119,7 @@ void cpu_idle(void) | |||
119 | } | 119 | } |
120 | rcu_idle_exit(); | 120 | rcu_idle_exit(); |
121 | tick_nohz_idle_exit(); | 121 | tick_nohz_idle_exit(); |
122 | preempt_enable_no_resched(); | 122 | schedule_preempt_disabled(); |
123 | schedule(); | ||
124 | preempt_disable(); | ||
125 | } | 123 | } |
126 | } | 124 | } |
127 | 125 | ||
@@ -214,6 +212,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
214 | 212 | ||
215 | task_user_gs(p) = get_user_gs(regs); | 213 | task_user_gs(p) = get_user_gs(regs); |
216 | 214 | ||
215 | p->fpu_counter = 0; | ||
217 | p->thread.io_bitmap_ptr = NULL; | 216 | p->thread.io_bitmap_ptr = NULL; |
218 | tsk = current; | 217 | tsk = current; |
219 | err = -ENOMEM; | 218 | err = -ENOMEM; |
@@ -299,22 +298,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
299 | *next = &next_p->thread; | 298 | *next = &next_p->thread; |
300 | int cpu = smp_processor_id(); | 299 | int cpu = smp_processor_id(); |
301 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 300 | struct tss_struct *tss = &per_cpu(init_tss, cpu); |
302 | bool preload_fpu; | 301 | fpu_switch_t fpu; |
303 | 302 | ||
304 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ | 303 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ |
305 | 304 | ||
306 | /* | 305 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
307 | * If the task has used fpu the last 5 timeslices, just do a full | ||
308 | * restore of the math state immediately to avoid the trap; the | ||
309 | * chances of needing FPU soon are obviously high now | ||
310 | */ | ||
311 | preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; | ||
312 | |||
313 | __unlazy_fpu(prev_p); | ||
314 | |||
315 | /* we're going to use this soon, after a few expensive things */ | ||
316 | if (preload_fpu) | ||
317 | prefetch(next->fpu.state); | ||
318 | 306 | ||
319 | /* | 307 | /* |
320 | * Reload esp0. | 308 | * Reload esp0. |
@@ -354,11 +342,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
354 | task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) | 342 | task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) |
355 | __switch_to_xtra(prev_p, next_p, tss); | 343 | __switch_to_xtra(prev_p, next_p, tss); |
356 | 344 | ||
357 | /* If we're going to preload the fpu context, make sure clts | ||
358 | is run while we're batching the cpu state updates. */ | ||
359 | if (preload_fpu) | ||
360 | clts(); | ||
361 | |||
362 | /* | 345 | /* |
363 | * Leave lazy mode, flushing any hypercalls made here. | 346 | * Leave lazy mode, flushing any hypercalls made here. |
364 | * This must be done before restoring TLS segments so | 347 | * This must be done before restoring TLS segments so |
@@ -368,15 +351,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
368 | */ | 351 | */ |
369 | arch_end_context_switch(next_p); | 352 | arch_end_context_switch(next_p); |
370 | 353 | ||
371 | if (preload_fpu) | ||
372 | __math_state_restore(); | ||
373 | |||
374 | /* | 354 | /* |
375 | * Restore %gs if needed (which is common) | 355 | * Restore %gs if needed (which is common) |
376 | */ | 356 | */ |
377 | if (prev->gs | next->gs) | 357 | if (prev->gs | next->gs) |
378 | lazy_load_gs(next->gs); | 358 | lazy_load_gs(next->gs); |
379 | 359 | ||
360 | switch_fpu_finish(next_p, fpu); | ||
361 | |||
380 | percpu_write(current_task, next_p); | 362 | percpu_write(current_task, next_p); |
381 | 363 | ||
382 | return prev_p; | 364 | return prev_p; |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9b9fe4a85c87..e34257c70c28 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -156,9 +156,7 @@ void cpu_idle(void) | |||
156 | } | 156 | } |
157 | 157 | ||
158 | tick_nohz_idle_exit(); | 158 | tick_nohz_idle_exit(); |
159 | preempt_enable_no_resched(); | 159 | schedule_preempt_disabled(); |
160 | schedule(); | ||
161 | preempt_disable(); | ||
162 | } | 160 | } |
163 | } | 161 | } |
164 | 162 | ||
@@ -286,6 +284,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
286 | 284 | ||
287 | set_tsk_thread_flag(p, TIF_FORK); | 285 | set_tsk_thread_flag(p, TIF_FORK); |
288 | 286 | ||
287 | p->fpu_counter = 0; | ||
289 | p->thread.io_bitmap_ptr = NULL; | 288 | p->thread.io_bitmap_ptr = NULL; |
290 | 289 | ||
291 | savesegment(gs, p->thread.gsindex); | 290 | savesegment(gs, p->thread.gsindex); |
@@ -386,18 +385,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
386 | int cpu = smp_processor_id(); | 385 | int cpu = smp_processor_id(); |
387 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 386 | struct tss_struct *tss = &per_cpu(init_tss, cpu); |
388 | unsigned fsindex, gsindex; | 387 | unsigned fsindex, gsindex; |
389 | bool preload_fpu; | 388 | fpu_switch_t fpu; |
390 | 389 | ||
391 | /* | 390 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
392 | * If the task has used the FPU during the last 5 timeslices, just do a full | ||
393 | * restore of the math state immediately to avoid the trap; the | ||
394 | * chances of needing FPU soon are obviously high now | ||
395 | */ | ||
396 | preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; | ||
397 | |||
398 | /* we're going to use this soon, after a few expensive things */ | ||
399 | if (preload_fpu) | ||
400 | prefetch(next->fpu.state); | ||
401 | 391 | ||
402 | /* | 392 | /* |
403 | * Reload esp0, LDT and the page table pointer: | 393 | * Reload esp0, LDT and the page table pointer: |
@@ -427,13 +417,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
427 | 417 | ||
428 | load_TLS(next, cpu); | 418 | load_TLS(next, cpu); |
429 | 419 | ||
430 | /* Must be after DS reload */ | ||
431 | __unlazy_fpu(prev_p); | ||
432 | |||
433 | /* Make sure cpu is ready for new context */ | ||
434 | if (preload_fpu) | ||
435 | clts(); | ||
436 | |||
437 | /* | 420 | /* |
438 | * Leave lazy mode, flushing any hypercalls made here. | 421 | * Leave lazy mode, flushing any hypercalls made here. |
439 | * This must be done before restoring TLS segments so | 422 | * This must be done before restoring TLS segments so |
@@ -474,6 +457,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
474 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | 457 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); |
475 | prev->gsindex = gsindex; | 458 | prev->gsindex = gsindex; |
476 | 459 | ||
460 | switch_fpu_finish(next_p, fpu); | ||
461 | |||
477 | /* | 462 | /* |
478 | * Switch the PDA and FPU contexts. | 463 | * Switch the PDA and FPU contexts. |
479 | */ | 464 | */ |
@@ -492,13 +477,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
492 | task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) | 477 | task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) |
493 | __switch_to_xtra(prev_p, next_p, tss); | 478 | __switch_to_xtra(prev_p, next_p, tss); |
494 | 479 | ||
495 | /* | ||
496 | * Preload the FPU context, now that we've determined that the | ||
497 | * task is likely to be using it. | ||
498 | */ | ||
499 | if (preload_fpu) | ||
500 | __math_state_restore(); | ||
501 | |||
502 | return prev_p; | 480 | return prev_p; |
503 | } | 481 | } |
504 | 482 | ||
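Both __switch_to() hunks above replace the open-coded preload logic with the switch_fpu_prepare()/switch_fpu_finish() pair: the prepare half runs before the register state is switched and decides, among other things, whether to preload, and the finish half completes the restore afterwards. A hedged userspace sketch of the pattern only; the real helpers live elsewhere in this series and do considerably more:

#include <stdbool.h>

struct task {
	bool used_math;
	unsigned int fpu_counter;
};

typedef struct { bool preload; } fpu_switch_t;

/* phase 1, before the switch: keep the "used 5 timeslices" heuristic */
static fpu_switch_t demo_switch_fpu_prepare(struct task *prev, struct task *next)
{
	fpu_switch_t f = { .preload = next->used_math && next->fpu_counter > 5 };

	(void)prev;	/* the real code also saves or drops prev's FPU state */
	return f;
}

/* phase 2, after the switch: finish the restore if we decided to preload */
static void demo_switch_fpu_finish(struct task *next, fpu_switch_t f)
{
	(void)next;
	if (f.preload) {
		/* the real code restores next's FPU registers here */
	}
}

int main(void)
{
	struct task prev = { 0 }, next = { .used_math = true, .fpu_counter = 7 };
	fpu_switch_t f = demo_switch_fpu_prepare(&prev, &next);

	demo_switch_fpu_finish(&next, f);
	return f.preload ? 0 : 1;
}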
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..58f78165d308 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -291,19 +291,6 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
291 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; | 291 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; |
292 | x86_platform.nmi_init(); | 292 | x86_platform.nmi_init(); |
293 | 293 | ||
294 | /* | ||
295 | * Wait until the cpu which brought this one up marked it | ||
296 | * online before enabling interrupts. If we don't do that then | ||
297 | * we can end up waking up the softirq thread before this cpu | ||
298 | * reached the active state, which makes the scheduler unhappy | ||
299 | * and schedule the softirq thread on the wrong cpu. This is | ||
300 | * only observable with forced threaded interrupts, but in | ||
301 | * theory it could also happen w/o them. It's just way harder | ||
302 | * to achieve. | ||
303 | */ | ||
304 | while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) | ||
305 | cpu_relax(); | ||
306 | |||
307 | /* enable local interrupts */ | 294 | /* enable local interrupts */ |
308 | local_irq_enable(); | 295 | local_irq_enable(); |
309 | 296 | ||
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8ba27dbc107a..4bbe04d96744 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -571,28 +571,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) | |||
571 | } | 571 | } |
572 | 572 | ||
573 | /* | 573 | /* |
574 | * __math_state_restore assumes that cr0.TS is already clear and the | ||
575 | * fpu state is all ready for use. Used during context switch. | ||
576 | */ | ||
577 | void __math_state_restore(void) | ||
578 | { | ||
579 | struct thread_info *thread = current_thread_info(); | ||
580 | struct task_struct *tsk = thread->task; | ||
581 | |||
582 | /* | ||
583 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. | ||
584 | */ | ||
585 | if (unlikely(restore_fpu_checking(tsk))) { | ||
586 | stts(); | ||
587 | force_sig(SIGSEGV, tsk); | ||
588 | return; | ||
589 | } | ||
590 | |||
591 | thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ | ||
592 | tsk->fpu_counter++; | ||
593 | } | ||
594 | |||
595 | /* | ||
596 | * 'math_state_restore()' saves the current math information in the | 574 | * 'math_state_restore()' saves the current math information in the |
597 | * old math state array, and gets the new ones from the current task | 575 | * old math state array, and gets the new ones from the current task |
598 | * | 576 | * |
@@ -604,8 +582,7 @@ void __math_state_restore(void) | |||
604 | */ | 582 | */ |
605 | void math_state_restore(void) | 583 | void math_state_restore(void) |
606 | { | 584 | { |
607 | struct thread_info *thread = current_thread_info(); | 585 | struct task_struct *tsk = current; |
608 | struct task_struct *tsk = thread->task; | ||
609 | 586 | ||
610 | if (!tsk_used_math(tsk)) { | 587 | if (!tsk_used_math(tsk)) { |
611 | local_irq_enable(); | 588 | local_irq_enable(); |
@@ -622,16 +599,23 @@ void math_state_restore(void) | |||
622 | local_irq_disable(); | 599 | local_irq_disable(); |
623 | } | 600 | } |
624 | 601 | ||
625 | clts(); /* Allow maths ops (or we recurse) */ | 602 | __thread_fpu_begin(tsk); |
603 | /* | ||
604 | * Paranoid restore. Send a SIGSEGV if we fail to restore the state. | ||
605 | */ | ||
606 | if (unlikely(restore_fpu_checking(tsk))) { | ||
607 | __thread_fpu_end(tsk); | ||
608 | force_sig(SIGSEGV, tsk); | ||
609 | return; | ||
610 | } | ||
626 | 611 | ||
627 | __math_state_restore(); | 612 | tsk->fpu_counter++; |
628 | } | 613 | } |
629 | EXPORT_SYMBOL_GPL(math_state_restore); | 614 | EXPORT_SYMBOL_GPL(math_state_restore); |
630 | 615 | ||
631 | dotraplinkage void __kprobes | 616 | dotraplinkage void __kprobes |
632 | do_device_not_available(struct pt_regs *regs, long error_code) | 617 | do_device_not_available(struct pt_regs *regs, long error_code) |
633 | { | 618 | { |
634 | WARN_ON_ONCE(!user_mode_vm(regs)); | ||
635 | #ifdef CONFIG_MATH_EMULATION | 619 | #ifdef CONFIG_MATH_EMULATION |
636 | if (read_cr0() & X86_CR0_EM) { | 620 | if (read_cr0() & X86_CR0_EM) { |
637 | struct math_emu_info info = { }; | 621 | struct math_emu_info info = { }; |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97ec..183c5925a9fe 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | |||
620 | 620 | ||
621 | if (cpu_khz) { | 621 | if (cpu_khz) { |
622 | *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; | 622 | *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; |
623 | *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); | 623 | *offset = ns_now - mult_frac(tsc_now, *scale, |
624 | (1UL << CYC2NS_SCALE_FACTOR)); | ||
624 | } | 625 | } |
625 | 626 | ||
626 | sched_clock_idle_wakeup_event(0); | 627 | sched_clock_idle_wakeup_event(0); |
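mult_frac() computes x * numer / denom without letting the intermediate product overflow: it splits x into quotient and remainder by denom first, which matters here because tsc_now * *scale can exceed 64 bits on long-running systems. A userspace re-derivation in the same statement-expression shape as the kernel macro (GCC extensions; values are hypothetical):

#include <stdio.h>
#include <stdint.h>

#define mult_frac(x, numer, denom) ({				\
	typeof(x) quot = (x) / (denom);				\
	typeof(x) rem  = (x) % (denom);				\
	(quot * (numer)) + ((rem * (numer)) / (denom));		\
})

int main(void)
{
	uint64_t tsc   = 0xfffffffff0000000ULL;	/* large cycle count */
	uint64_t scale = 341;		/* roughly (10^6 << 10) / cpu_khz at 3 GHz */
	uint64_t denom = 1ULL << 10;	/* 1 << CYC2NS_SCALE_FACTOR */

	/* the naive (tsc * scale) >> 10 would overflow the 64-bit product */
	printf("%llu\n", (unsigned long long)mult_frac(tsc, scale, denom));
	return 0;
}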
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index a3911343976b..711091114119 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c | |||
@@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk) | |||
47 | if (!fx) | 47 | if (!fx) |
48 | return; | 48 | return; |
49 | 49 | ||
50 | BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); | 50 | BUG_ON(__thread_has_fpu(tsk)); |
51 | 51 | ||
52 | xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; | 52 | xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; |
53 | 53 | ||
@@ -168,7 +168,7 @@ int save_i387_xstate(void __user *buf) | |||
168 | if (!used_math()) | 168 | if (!used_math()) |
169 | return 0; | 169 | return 0; |
170 | 170 | ||
171 | if (task_thread_info(tsk)->status & TS_USEDFPU) { | 171 | if (user_has_fpu()) { |
172 | if (use_xsave()) | 172 | if (use_xsave()) |
173 | err = xsave_user(buf); | 173 | err = xsave_user(buf); |
174 | else | 174 | else |
@@ -176,8 +176,7 @@ int save_i387_xstate(void __user *buf) | |||
176 | 176 | ||
177 | if (err) | 177 | if (err) |
178 | return err; | 178 | return err; |
179 | task_thread_info(tsk)->status &= ~TS_USEDFPU; | 179 | user_fpu_end(); |
180 | stts(); | ||
181 | } else { | 180 | } else { |
182 | sanitize_i387_state(tsk); | 181 | sanitize_i387_state(tsk); |
183 | if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, | 182 | if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, |
@@ -292,10 +291,7 @@ int restore_i387_xstate(void __user *buf) | |||
292 | return err; | 291 | return err; |
293 | } | 292 | } |
294 | 293 | ||
295 | if (!(task_thread_info(current)->status & TS_USEDFPU)) { | 294 | user_fpu_begin(); |
296 | clts(); | ||
297 | task_thread_info(current)->status |= TS_USEDFPU; | ||
298 | } | ||
299 | if (use_xsave()) | 295 | if (use_xsave()) |
300 | err = restore_user_xstate(buf); | 296 | err = restore_user_xstate(buf); |
301 | else | 297 | else |