author	Linus Torvalds <torvalds@linux-foundation.org>	2014-03-31 14:13:25 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-03-31 14:13:25 -0400
commit	8c292f11744297dfb3a69f4a0bccbe4a6417b50d (patch)
tree	f1a89560de25a69b697d459a9b5cf2e738038d9f /arch/x86
parent	d31605dc8a63f1df28443ddb3560b1079417af92 (diff)
parent	538592ff0b008237ae88f5ce5fb1247127dc3ce5 (diff)
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf changes from Ingo Molnar:
 "Main changes:

  Kernel side changes:

   - Add SNB/IVB/HSW client uncore memory controller support (Stephane Eranian)
   - Fix various x86/P4 PMU driver bugs (Don Zickus)

  Tooling, user visible changes:

   - Add several futex 'perf bench' microbenchmarks (Davidlohr Bueso)
   - Speed up thread map generation (Don Zickus)
   - Introduce 'perf kvm --list-cmds' command line option for use by scripts (Ramkumar Ramachandra)
   - Print the evsel name in the annotate stdio output, prep to fix support outputting annotation for multiple events, not just for the first one (Arnaldo Carvalho de Melo)
   - Allow setting preferred callchain method in .perfconfig (Jiri Olsa)
   - Show in what binaries/modules 'perf probe's are set (Masami Hiramatsu)
   - Support distro-style debuginfo for uprobe in 'perf probe' (Masami Hiramatsu)

  Tooling, internal changes and fixes:

   - Use tid in mmap/mmap2 events to find maps (Don Zickus)
   - Record the reason for filtering an address_location (Namhyung Kim)
   - Apply all filters to an addr_location (Namhyung Kim)
   - Merge al->filtered with hist_entry->filtered in report/hists (Namhyung Kim)
   - Fix memory leak when synthesizing thread records (Namhyung Kim)
   - Use ui__has_annotation() in 'report' (Namhyung Kim)
   - hists browser refactorings to reuse code across UIs (Namhyung Kim)
   - Add support for the new DWARF unwinder library in elfutils (Jiri Olsa)
   - Fix build race in the generation of bison files (Jiri Olsa)
   - Further streamline the feature detection display, trimming it a bit to show just the libraries detected; using VF=1 gets a more verbose output, showing the less interesting feature checks as well (Jiri Olsa)
   - Check compatible symtab type before loading dso (Namhyung Kim)
   - Check return value of filename__read_debuglink() (Stephane Eranian)
   - Move some hashing and fs related code from tools/perf/util/ to tools/lib/ so that it can be used by more tools/ living utilities (Borislav Petkov)
   - Prepare DWARF unwinding code for using an elfutils alternative unwinding library (Jiri Olsa)
   - Fix DWARF unwind max_stack processing (Jiri Olsa)
   - Add dwarf unwind 'perf test' entry (Jiri Olsa)
   - 'perf probe' improvements including memory leak fixes, sharing the intlist class with other tools, uprobes/kprobes code sharing and use of ref_reloc_sym (Masami Hiramatsu)
   - Shorten sample symbol resolving by adding cpumode to struct addr_location (Arnaldo Carvalho de Melo)
   - Fix synthesizing mmaps for threads (Don Zickus)
   - Fix invalid output on event group stdio report (Namhyung Kim)
   - Fixup header alignment in 'perf sched latency' output (Ramkumar Ramachandra)
   - Fix off-by-one error in 'perf timechart record' argv handling (Ramkumar Ramachandra)

  Tooling, cleanups:

   - Remove unused thread__find_map function (Jiri Olsa)
   - Remove unused simple_strtoul() function (Ramkumar Ramachandra)

  Tooling, documentation updates:

   - Update function names in debug messages (Ramkumar Ramachandra)
   - Update some code references in design.txt (Ramkumar Ramachandra)
   - Clarify load-latency information in the 'perf mem' docs (Andi Kleen)
   - Clarify x86 register naming in 'perf probe' docs (Andi Kleen)"

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (96 commits)
  perf tools: Remove unused simple_strtoul() function
  perf tools: Update some code references in design.txt
  perf evsel: Update function names in debug messages
  perf tools: Remove thread__find_map function
  perf annotate: Print the evsel name in the stdio output
  perf report: Use ui__has_annotation()
  perf tools: Fix memory leak when synthesizing thread records
  perf tools: Use tid in mmap/mmap2 events to find maps
  perf report: Merge al->filtered with hist_entry->filtered
  perf symbols: Apply all filters to an addr_location
  perf symbols: Record the reason for filtering an address_location
  perf sched: Fixup header alignment in 'latency' output
  perf timechart: Fix off-by-one error in 'record' argv handling
  perf machine: Factor machine__find_thread to take tid argument
  perf tools: Speed up thread map generation
  perf kvm: introduce --list-cmds for use by scripts
  perf ui hists: Pass evsel to hpp->header/width functions explicitly
  perf symbols: Introduce thread__find_cpumode_addr_location
  perf session: Change header.misc dump from decimal to hex
  perf ui/tui: Reuse generic __hpp__fmt() code
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--	arch/x86/include/asm/nmi.h	3
-rw-r--r--	arch/x86/kernel/cpu/perf_event.c	47
-rw-r--r--	arch/x86/kernel/cpu/perf_event.h	8
-rw-r--r--	arch/x86/kernel/cpu/perf_event_intel_uncore.c	544
-rw-r--r--	arch/x86/kernel/cpu/perf_event_intel_uncore.h	5
-rw-r--r--	arch/x86/kernel/cpu/perf_event_p4.c	34
-rw-r--r--	arch/x86/kernel/nmi.c	37
7 files changed, 576 insertions, 102 deletions
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 86f9301903c8..5f2fc4441b11 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_NMI_H
 #define _ASM_X86_NMI_H
 
+#include <linux/irq_work.h>
 #include <linux/pm.h>
 #include <asm/irq.h>
 #include <asm/io.h>
@@ -38,6 +39,8 @@ typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *);
 struct nmiaction {
 	struct list_head	list;
 	nmi_handler_t		handler;
+	u64			max_duration;
+	struct irq_work		irq_work;
 	unsigned long		flags;
 	const char		*name;
 };
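
Note that callers never populate the two new fields by hand: struct nmiaction instances are normally created through the register_nmi_handler() macro in this same header, and __register_nmi_handler() wires up the irq_work (see the arch/x86/kernel/nmi.c hunk at the end of this patch). A minimal hedged sketch of a client, with hypothetical handler and init names, showing that the new members stay invisible to users:

/*
 * Sketch only: my_nmi_handler/my_nmi_init are hypothetical names.
 * register_nmi_handler() statically allocates the struct nmiaction,
 * so max_duration and irq_work are zero-initialized and then set up
 * by __register_nmi_handler().
 */
static int my_nmi_handler(unsigned int type, struct pt_regs *regs)
{
	return NMI_DONE;	/* not ours; let other handlers run */
}

static int __init my_nmi_init(void)
{
	return register_nmi_handler(NMI_LOCAL, my_nmi_handler, 0, "my_nmi");
}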
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 79f9f848bee4..ae407f7226c8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -892,7 +892,6 @@ static void x86_pmu_enable(struct pmu *pmu)
 	 * hw_perf_group_sched_in() or x86_pmu_enable()
 	 *
 	 * step1: save events moving to new counters
-	 * step2: reprogram moved events into new counters
 	 */
 	for (i = 0; i < n_running; i++) {
 		event = cpuc->event_list[i];
@@ -918,6 +917,9 @@ static void x86_pmu_enable(struct pmu *pmu)
 		x86_pmu_stop(event, PERF_EF_UPDATE);
 	}
 
+	/*
+	 * step2: reprogram moved events into new counters
+	 */
 	for (i = 0; i < cpuc->n_events; i++) {
 		event = cpuc->event_list[i];
 		hwc = &event->hw;
@@ -1043,7 +1045,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
 	/*
 	 * If group events scheduling transaction was started,
 	 * skip the schedulability test here, it will be performed
-	 * at commit time (->commit_txn) as a whole
+	 * at commit time (->commit_txn) as a whole.
 	 */
 	if (cpuc->group_flag & PERF_EVENT_TXN)
 		goto done_collect;
@@ -1058,6 +1060,10 @@ static int x86_pmu_add(struct perf_event *event, int flags)
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
 done_collect:
+	/*
+	 * Commit the collect_events() state. See x86_pmu_del() and
+	 * x86_pmu_*_txn().
+	 */
 	cpuc->n_events = n;
 	cpuc->n_added += n - n0;
 	cpuc->n_txn += n - n0;
@@ -1183,28 +1189,38 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 	 * If we're called during a txn, we don't need to do anything.
 	 * The events never got scheduled and ->cancel_txn will truncate
 	 * the event_list.
+	 *
+	 * XXX assumes any ->del() called during a TXN will only be on
+	 * an event added during that same TXN.
 	 */
 	if (cpuc->group_flag & PERF_EVENT_TXN)
 		return;
 
+	/*
+	 * Not a TXN, therefore cleanup properly.
+	 */
 	x86_pmu_stop(event, PERF_EF_UPDATE);
 
 	for (i = 0; i < cpuc->n_events; i++) {
-		if (event == cpuc->event_list[i]) {
+		if (event == cpuc->event_list[i])
+			break;
+	}
 
-			if (i >= cpuc->n_events - cpuc->n_added)
-				--cpuc->n_added;
+	if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
+		return;
 
-			if (x86_pmu.put_event_constraints)
-				x86_pmu.put_event_constraints(cpuc, event);
+	/* If we have a newly added event; make sure to decrease n_added. */
+	if (i >= cpuc->n_events - cpuc->n_added)
+		--cpuc->n_added;
 
-			while (++i < cpuc->n_events)
-				cpuc->event_list[i-1] = cpuc->event_list[i];
+	if (x86_pmu.put_event_constraints)
+		x86_pmu.put_event_constraints(cpuc, event);
+
+	/* Delete the array entry. */
+	while (++i < cpuc->n_events)
+		cpuc->event_list[i-1] = cpuc->event_list[i];
+	--cpuc->n_events;
 
-			--cpuc->n_events;
-			break;
-		}
-	}
 	perf_event_update_userpage(event);
 }
 
@@ -1598,7 +1614,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
 {
 	__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
 	/*
-	 * Truncate the collected events.
+	 * Truncate collected array by the number of events added in this
+	 * transaction. See x86_pmu_add() and x86_pmu_*_txn().
 	 */
 	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
 	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
@@ -1609,6 +1626,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
  * Commit group events scheduling transaction
  * Perform the group schedulability test as a whole
  * Return 0 if success
+ *
+ * Does not cancel the transaction on failure; expects the caller to do this.
  */
 static int x86_pmu_commit_txn(struct pmu *pmu)
 {
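
The rewrite of x86_pmu_del() above flattens the old nested loop into explicit lookup, guard, bookkeeping and compaction steps. A standalone sketch of the same remove-and-compact pattern, for readers who want the logic outside the diff (hypothetical names, plain C; not the kernel code):

/*
 * Remove ev from a packed array of *n entries, shifting the tail down.
 * Mirrors the lookup/guard/compact structure of x86_pmu_del() above.
 */
static int event_list_del(struct event **list, int *n, int *n_added,
			  struct event *ev)
{
	int i;

	for (i = 0; i < *n; i++) {
		if (list[i] == ev)
			break;
	}

	if (i == *n)			/* del() without a prior add()? */
		return -1;

	/* Entries past *n - *n_added were added but never enabled yet. */
	if (i >= *n - *n_added)
		--(*n_added);

	while (++i < *n)		/* delete the array entry */
		list[i - 1] = list[i];
	--(*n);

	return 0;
}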
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 4972c244d0bc..3b2f9bdd974b 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -130,9 +130,11 @@ struct cpu_hw_events {
 	unsigned long		running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	int			enabled;
 
-	int			n_events;
-	int			n_added;
-	int			n_txn;
+	int			n_events; /* the # of events in the below arrays */
+	int			n_added; /* the # last events in the below arrays;
+					    they've never been enabled yet */
+	int			n_txn; /* the # last events in the below arrays;
+					  added in the current transaction */
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	u64			tags[X86_PMC_IDX_MAX];
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
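
A worked example of how the three counters interact (illustrative values only): starting from n_events = 3 with no pending additions, adding a group of two inside a transaction leaves n_events = 5, n_added = 2, n_txn = 2. x86_pmu_cancel_txn() then subtracts n_txn from both n_events and n_added, restoring n_events = 3 and n_added = 0, which is exactly the truncation performed in the perf_event.c hunk above.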
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 047f540cf3f7..bd2253d40cff 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -66,6 +66,47 @@ DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
 DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
 DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
 
+static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+static void uncore_pmu_event_read(struct perf_event *event);
+
+static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+{
+	return container_of(event->pmu, struct intel_uncore_pmu, pmu);
+}
+
+static struct intel_uncore_box *
+uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+{
+	struct intel_uncore_box *box;
+
+	box = *per_cpu_ptr(pmu->box, cpu);
+	if (box)
+		return box;
+
+	raw_spin_lock(&uncore_box_lock);
+	list_for_each_entry(box, &pmu->box_list, list) {
+		if (box->phys_id == topology_physical_package_id(cpu)) {
+			atomic_inc(&box->refcnt);
+			*per_cpu_ptr(pmu->box, cpu) = box;
+			break;
+		}
+	}
+	raw_spin_unlock(&uncore_box_lock);
+
+	return *per_cpu_ptr(pmu->box, cpu);
+}
+
+static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+{
+	/*
+	 * perf core schedules event on the basis of cpu, uncore events are
+	 * collected by one of the cpus inside a physical package.
+	 */
+	return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
+}
+
 static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
 {
 	u64 count;
@@ -1639,6 +1680,349 @@ static struct intel_uncore_type *snb_msr_uncores[] = {
 	&snb_uncore_cbox,
 	NULL,
 };
+
+enum {
+	SNB_PCI_UNCORE_IMC,
+};
+
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+	INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
+	INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+	INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+	INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+	INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+	INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+	{ /* end: all zeroes */ },
+};
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK		0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET		0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE		0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS		0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE	0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES		0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE	0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE		SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group snb_uncore_imc_format_group = {
+	.name = "format",
+	.attrs = snb_uncore_imc_formats_attr,
+};
+
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+	resource_size_t addr;
+	u32 pci_dword;
+
+	pci_read_config_dword(pdev, where, &pci_dword);
+	addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+	pci_read_config_dword(pdev, where + 4, &pci_dword);
+	addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+	addr &= ~(PAGE_SIZE - 1);
+
+	box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+	box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
+}
+
+/*
+ * custom event_init() function because we define our own fixed, free
+ * running counters, so we do not want to conflict with generic uncore
+ * logic. Also simplifies processing
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+	struct intel_uncore_pmu *pmu;
+	struct intel_uncore_box *box;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+	int idx, base;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	pmu = uncore_event_to_pmu(event);
+	/* no device found for this pmu */
+	if (pmu->func_id < 0)
+		return -ENOENT;
+
+	/* Sampling not supported yet */
+	if (hwc->sample_period)
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	/*
+	 * Place all uncore events for a particular physical package
+	 * onto a single cpu
+	 */
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	/* check only supported bits are set */
+	if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+		return -EINVAL;
+
+	box = uncore_pmu_to_box(pmu, event->cpu);
+	if (!box || box->cpu < 0)
+		return -EINVAL;
+
+	event->cpu = box->cpu;
+
+	event->hw.idx = -1;
+	event->hw.last_tag = ~0ULL;
+	event->hw.extra_reg.idx = EXTRA_REG_NONE;
+	event->hw.branch_reg.idx = EXTRA_REG_NONE;
+	/*
+	 * check event is known (whitelist, determines counter)
+	 */
+	switch (cfg) {
+	case SNB_UNCORE_PCI_IMC_DATA_READS:
+		base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+		idx = UNCORE_PMC_IDX_FIXED;
+		break;
+	case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+		base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+		idx = UNCORE_PMC_IDX_FIXED + 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* must be done before validate_group */
+	event->hw.event_base = base;
+	event->hw.config = cfg;
+	event->hw.idx = idx;
+
+	/* no group validation needed, we have free running counters */
+
+	return 0;
+}
+
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	return 0;
+}
+
+static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	u64 count;
+
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
+	event->hw.state = 0;
+	box->n_active++;
+
+	list_add_tail(&event->active_entry, &box->active_list);
+
+	count = snb_uncore_imc_read_counter(box, event);
+	local64_set(&event->hw.prev_count, count);
+
+	if (box->n_active == 1)
+		uncore_pmu_start_hrtimer(box);
+}
+
+static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (!(hwc->state & PERF_HES_STOPPED)) {
+		box->n_active--;
+
+		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+		hwc->state |= PERF_HES_STOPPED;
+
+		list_del(&event->active_entry);
+
+		if (box->n_active == 0)
+			uncore_pmu_cancel_hrtimer(box);
+	}
+
+	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		/*
+		 * Drain the remaining delta count out of an event
+		 * that we are disabling:
+		 */
+		uncore_perf_event_update(box, event);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
+}
+
+static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (!box)
+		return -ENODEV;
+
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+	if (!(flags & PERF_EF_START))
+		hwc->state |= PERF_HES_ARCH;
+
+	snb_uncore_imc_event_start(event, 0);
+
+	box->n_events++;
+
+	return 0;
+}
+
+static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	int i;
+
+	snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
+
+	for (i = 0; i < box->n_events; i++) {
+		if (event == box->event_list[i]) {
+			--box->n_events;
+			break;
+		}
+	}
+}
+
+static int snb_pci2phy_map_init(int devid)
+{
+	struct pci_dev *dev = NULL;
+	int bus;
+
+	dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
+	if (!dev)
+		return -ENOTTY;
+
+	bus = dev->bus->number;
+
+	pcibus_to_physid[bus] = 0;
+
+	pci_dev_put(dev);
+
+	return 0;
+}
+
+static struct pmu snb_uncore_imc_pmu = {
+	.task_ctx_nr	= perf_invalid_context,
+	.event_init	= snb_uncore_imc_event_init,
+	.add		= snb_uncore_imc_event_add,
+	.del		= snb_uncore_imc_event_del,
+	.start		= snb_uncore_imc_event_start,
+	.stop		= snb_uncore_imc_event_stop,
+	.read		= uncore_pmu_event_read,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+	.init_box	= snb_uncore_imc_init_box,
+	.enable_box	= snb_uncore_imc_enable_box,
+	.disable_box	= snb_uncore_imc_disable_box,
+	.disable_event	= snb_uncore_imc_disable_event,
+	.enable_event	= snb_uncore_imc_enable_event,
+	.hw_config	= snb_uncore_imc_hw_config,
+	.read_counter	= snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+	.name		= "imc",
+	.num_counters	= 2,
+	.num_boxes	= 1,
+	.fixed_ctr_bits	= 32,
+	.fixed_ctr	= SNB_UNCORE_PCI_IMC_CTR_BASE,
+	.event_descs	= snb_uncore_imc_events,
+	.format_group	= &snb_uncore_imc_format_group,
+	.perf_ctr	= SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+	.event_mask	= SNB_UNCORE_PCI_IMC_EVENT_MASK,
+	.ops		= &snb_uncore_imc_ops,
+	.pmu		= &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+	[SNB_PCI_UNCORE_IMC]	= &snb_uncore_imc,
+	NULL,
+};
+
+static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = {
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* end: all zeroes */ },
+};
+
+static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = {
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* end: all zeroes */ },
+};
+
+static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = {
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+	.name		= "snb_uncore",
+	.id_table	= snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+	.name		= "ivb_uncore",
+	.id_table	= ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+	.name		= "hsw_uncore",
+	.id_table	= hsw_uncore_pci_ids,
+};
+
 /* end of Sandy Bridge uncore support */
 
 /* Nehalem uncore support */
@@ -2789,6 +3173,7 @@ again:
 static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
 {
 	struct intel_uncore_box *box;
+	struct perf_event *event;
 	unsigned long flags;
 	int bit;
 
@@ -2801,19 +3186,27 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
 	 */
 	local_irq_save(flags);
 
+	/*
+	 * handle boxes with an active event list as opposed to active
+	 * counters
+	 */
+	list_for_each_entry(event, &box->active_list, active_entry) {
+		uncore_perf_event_update(box, event);
+	}
+
 	for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
 		uncore_perf_event_update(box, box->events[bit]);
 
 	local_irq_restore(flags);
 
-	hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL));
+	hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
 	return HRTIMER_RESTART;
 }
 
 static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
 {
 	__hrtimer_start_range_ns(&box->hrtimer,
-			ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0,
+			ns_to_ktime(box->hrtimer_duration), 0,
 			HRTIMER_MODE_REL_PINNED, 0);
 }
 
@@ -2847,43 +3240,12 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
 	box->cpu = -1;
 	box->phys_id = -1;
 
+	/* set default hrtimer timeout */
+	box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
+
+	INIT_LIST_HEAD(&box->active_list);
+
 	return box;
 }
-
-static struct intel_uncore_box *
-uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
-{
-	struct intel_uncore_box *box;
-
-	box = *per_cpu_ptr(pmu->box, cpu);
-	if (box)
-		return box;
-
-	raw_spin_lock(&uncore_box_lock);
-	list_for_each_entry(box, &pmu->box_list, list) {
-		if (box->phys_id == topology_physical_package_id(cpu)) {
-			atomic_inc(&box->refcnt);
-			*per_cpu_ptr(pmu->box, cpu) = box;
-			break;
-		}
-	}
-	raw_spin_unlock(&uncore_box_lock);
-
-	return *per_cpu_ptr(pmu->box, cpu);
-}
-
-static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
-{
-	return container_of(event->pmu, struct intel_uncore_pmu, pmu);
-}
-
-static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
-{
-	/*
-	 * perf core schedules event on the basis of cpu, uncore events are
-	 * collected by one of the cpus inside a physical package.
-	 */
-	return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
-}
 
 static int
@@ -3279,16 +3641,21 @@ static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
 {
 	int ret;
 
-	pmu->pmu = (struct pmu) {
-		.attr_groups	= pmu->type->attr_groups,
-		.task_ctx_nr	= perf_invalid_context,
-		.event_init	= uncore_pmu_event_init,
-		.add		= uncore_pmu_event_add,
-		.del		= uncore_pmu_event_del,
-		.start		= uncore_pmu_event_start,
-		.stop		= uncore_pmu_event_stop,
-		.read		= uncore_pmu_event_read,
-	};
+	if (!pmu->type->pmu) {
+		pmu->pmu = (struct pmu) {
+			.attr_groups	= pmu->type->attr_groups,
+			.task_ctx_nr	= perf_invalid_context,
+			.event_init	= uncore_pmu_event_init,
+			.add		= uncore_pmu_event_add,
+			.del		= uncore_pmu_event_del,
+			.start		= uncore_pmu_event_start,
+			.stop		= uncore_pmu_event_stop,
+			.read		= uncore_pmu_event_read,
+		};
+	} else {
+		pmu->pmu = *pmu->type->pmu;
+		pmu->pmu.attr_groups = pmu->type->attr_groups;
+	}
 
 	if (pmu->type->num_boxes == 1) {
 		if (strlen(pmu->type->name) > 0)
@@ -3502,6 +3869,28 @@ static int __init uncore_pci_init(void)
 		pci_uncores = ivt_pci_uncores;
 		uncore_pci_driver = &ivt_uncore_pci_driver;
 		break;
+	case 42: /* Sandy Bridge */
+		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC);
+		if (ret)
+			return ret;
+		pci_uncores = snb_pci_uncores;
+		uncore_pci_driver = &snb_uncore_pci_driver;
+		break;
+	case 58: /* Ivy Bridge */
+		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC);
+		if (ret)
+			return ret;
+		pci_uncores = snb_pci_uncores;
+		uncore_pci_driver = &ivb_uncore_pci_driver;
+		break;
+	case 60: /* Haswell */
+	case 69: /* Haswell Celeron */
+		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC);
+		if (ret)
+			return ret;
+		pci_uncores = snb_pci_uncores;
+		uncore_pci_driver = &hsw_uncore_pci_driver;
+		break;
 	default:
 		return 0;
 	}
@@ -3773,7 +4162,7 @@ static void __init uncore_cpu_setup(void *dummy)
 
 static int __init uncore_cpu_init(void)
 {
-	int ret, cpu, max_cores;
+	int ret, max_cores;
 
 	max_cores = boot_cpu_data.x86_max_cores;
 	switch (boot_cpu_data.x86_model) {
@@ -3817,29 +4206,6 @@ static int __init uncore_cpu_init(void)
 	if (ret)
 		return ret;
 
-	get_online_cpus();
-
-	for_each_online_cpu(cpu) {
-		int i, phys_id = topology_physical_package_id(cpu);
-
-		for_each_cpu(i, &uncore_cpu_mask) {
-			if (phys_id == topology_physical_package_id(i)) {
-				phys_id = -1;
-				break;
-			}
-		}
-		if (phys_id < 0)
-			continue;
-
-		uncore_cpu_prepare(cpu, phys_id);
-		uncore_event_init_cpu(cpu);
-	}
-	on_each_cpu(uncore_cpu_setup, NULL, 1);
-
-	register_cpu_notifier(&uncore_cpu_nb);
-
-	put_online_cpus();
-
 	return 0;
 }
 
@@ -3868,6 +4234,41 @@ static int __init uncore_pmus_register(void)
 	return 0;
 }
 
+static void __init uncore_cpumask_init(void)
+{
+	int cpu;
+
+	/*
+	 * only invoke once from msr or pci init code
+	 */
+	if (!cpumask_empty(&uncore_cpu_mask))
+		return;
+
+	get_online_cpus();
+
+	for_each_online_cpu(cpu) {
+		int i, phys_id = topology_physical_package_id(cpu);
+
+		for_each_cpu(i, &uncore_cpu_mask) {
+			if (phys_id == topology_physical_package_id(i)) {
+				phys_id = -1;
+				break;
+			}
+		}
+		if (phys_id < 0)
+			continue;
+
+		uncore_cpu_prepare(cpu, phys_id);
+		uncore_event_init_cpu(cpu);
+	}
+	on_each_cpu(uncore_cpu_setup, NULL, 1);
+
+	register_cpu_notifier(&uncore_cpu_nb);
+
+	put_online_cpus();
+}
+
+
 static int __init intel_uncore_init(void)
 {
 	int ret;
@@ -3886,6 +4287,7 @@ static int __init intel_uncore_init(void)
 		uncore_pci_exit();
 		goto fail;
 	}
+	uncore_cpumask_init();
 
 	uncore_pmus_register();
 	return 0;
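
Two details of the IMC support above are worth unpacking. The counters are free-running and count 64-byte cachelines, which is where the 6.103515625e-5 scale string comes from: 64 / 2^20 = 6.103515625e-5 MiB per count. And because the fixed counters are only 32 bits wide (fixed_ctr_bits = 32), a cacheline counter wraps after 2^32 x 64 B = 256 GiB of traffic; at memory bandwidths of tens of GB/s that is roughly on the order of ten seconds, which is presumably why these boxes are polled every 5 seconds (UNCORE_SNB_IMC_HRTIMER_INTERVAL) rather than the generic 60. A hedged sketch of the wrap-safe delta arithmetic such polling relies on (standalone C, simplified; not the kernel's uncore_perf_event_update()):

#include <stdint.h>

/*
 * Delta of a free-running counter that is 'width' bits wide. As long
 * as the counter is read more often than it can wrap, the masked
 * difference is the true delta even across one wraparound.
 */
static uint64_t fr_counter_delta(uint64_t prev, uint64_t now, int width)
{
	uint64_t mask = (width == 64) ? ~0ULL : ((1ULL << width) - 1);

	return (now - prev) & mask;
}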
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index a80ab71a883d..90236f0c94a9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -6,6 +6,7 @@
 
 #define UNCORE_PMU_NAME_LEN		32
 #define UNCORE_PMU_HRTIMER_INTERVAL	(60LL * NSEC_PER_SEC)
+#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
 
 #define UNCORE_FIXED_EVENT		0xff
 #define UNCORE_PMC_IDX_MAX_GENERIC	8
@@ -440,6 +441,7 @@ struct intel_uncore_type {
 	struct intel_uncore_ops *ops;
 	struct uncore_event_desc *event_descs;
 	const struct attribute_group *attr_groups[4];
+	struct pmu *pmu; /* for custom pmu ops */
 };
 
 #define pmu_group attr_groups[0]
@@ -488,8 +490,11 @@ struct intel_uncore_box {
 	u64 tags[UNCORE_PMC_IDX_MAX];
 	struct pci_dev *pci_dev;
 	struct intel_uncore_pmu *pmu;
+	u64 hrtimer_duration; /* hrtimer timeout for this box */
 	struct hrtimer hrtimer;
 	struct list_head list;
+	struct list_head active_list;
+	void *io_addr;
 	struct intel_uncore_extra_reg shared_regs[0];
 };
 
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 3486e6660357..5d466b7d8609 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1257,7 +1257,24 @@ again:
 		pass++;
 		goto again;
 	}
-
+	/*
+	 * Perf does test runs to see if a whole group can be assigned
+	 * together successfully. There can be multiple rounds of this.
+	 * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
+	 * bits, such that the next round of group assignments will
+	 * cause the above p4_should_swap_ts to pass instead of fail.
+	 * This leads to counters exclusive to thread0 being used by
+	 * thread1.
+	 *
+	 * Solve this with a cheap hack, reset the idx back to -1 to
+	 * force a new lookup (p4_next_cntr) to get the right counter
+	 * for the right thread.
+	 *
+	 * This probably doesn't comply with the general spirit of how
+	 * perf wants to work, but P4 is special. :-(
+	 */
+	if (p4_should_swap_ts(hwc->config, cpu))
+		hwc->idx = -1;
 	p4_pmu_swap_config_ts(hwc, cpu);
 	if (assign)
 		assign[i] = cntr_idx;
@@ -1322,6 +1339,7 @@ static __initconst const struct x86_pmu p4_pmu = {
1322__init int p4_pmu_init(void) 1339__init int p4_pmu_init(void)
1323{ 1340{
1324 unsigned int low, high; 1341 unsigned int low, high;
1342 int i, reg;
1325 1343
1326 /* If we get stripped -- indexing fails */ 1344 /* If we get stripped -- indexing fails */
1327 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); 1345 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
@@ -1340,5 +1358,19 @@ __init int p4_pmu_init(void)
 
 	x86_pmu = p4_pmu;
 
+	/*
+	 * Even though the counters are configured to interrupt a particular
+	 * logical processor when an overflow happens, testing has shown that
+	 * on kdump kernels (which use a single cpu), thread1's counter
+	 * continues to run and will report an NMI on thread0. Due to the
+	 * overflow bug, this leads to a stream of unknown NMIs.
+	 *
+	 * Solve this by zeroing out the registers to mimic a reset.
+	 */
+	for (i = 0; i < x86_pmu.num_counters; i++) {
+		reg = x86_pmu_config_addr(i);
+		wrmsrl_safe(reg, 0ULL);
+	}
+
 	return 0;
 }
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 6fcb49ce50a1..b4872b999a71 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -87,6 +87,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
 #define nmi_to_desc(type) (&nmi_desc[type])
 
 static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
+
 static int __init nmi_warning_debugfs(void)
 {
 	debugfs_create_u64("nmi_longest_ns", 0644,
@@ -95,6 +96,20 @@ static int __init nmi_warning_debugfs(void)
 }
 fs_initcall(nmi_warning_debugfs);
 
+static void nmi_max_handler(struct irq_work *w)
+{
+	struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
+	int remainder_ns, decimal_msecs;
+	u64 whole_msecs = ACCESS_ONCE(a->max_duration);
+
+	remainder_ns = do_div(whole_msecs, (1000 * 1000));
+	decimal_msecs = remainder_ns / 1000;
+
+	printk_ratelimited(KERN_INFO
+		"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
+		a->handler, whole_msecs, decimal_msecs);
+}
+
 static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
 {
 	struct nmi_desc *desc = nmi_to_desc(type);
@@ -110,26 +125,20 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
 	 * to handle those situations.
 	 */
 	list_for_each_entry_rcu(a, &desc->head, list) {
-		u64 before, delta, whole_msecs;
-		int remainder_ns, decimal_msecs, thishandled;
+		int thishandled;
+		u64 delta;
 
-		before = sched_clock();
+		delta = sched_clock();
 		thishandled = a->handler(type, regs);
 		handled += thishandled;
-		delta = sched_clock() - before;
+		delta = sched_clock() - delta;
 		trace_nmi_handler(a->handler, (int)delta, thishandled);
 
-		if (delta < nmi_longest_ns)
+		if (delta < nmi_longest_ns || delta < a->max_duration)
 			continue;
 
-		nmi_longest_ns = delta;
-		whole_msecs = delta;
-		remainder_ns = do_div(whole_msecs, (1000 * 1000));
-		decimal_msecs = remainder_ns / 1000;
-		printk_ratelimited(KERN_INFO
-			"INFO: NMI handler (%ps) took too long to run: "
-			"%lld.%03d msecs\n", a->handler, whole_msecs,
-			decimal_msecs);
+		a->max_duration = delta;
+		irq_work_queue(&a->irq_work);
 	}
 
 	rcu_read_unlock();
@@ -146,6 +155,8 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
 	if (!action->handler)
 		return -EINVAL;
 
+	init_irq_work(&action->irq_work, nmi_max_handler);
+
 	spin_lock_irqsave(&desc->lock, flags);
 
 	/*
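
The deferral above exists because printk() is not safe to call from NMI context (it can deadlock on console/log-buffer locks), so nmi_handle() now only records the new max_duration and queues an irq_work; nmi_max_handler() prints later from a safe context. The formatting relies on do_div(), which divides in place (quotient left in its first argument) and returns the remainder. A worked standalone example of the same arithmetic, using plain division in place of the kernel macro:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta = 12345678;	/* measured handler duration in ns */
	uint64_t whole_msecs = delta;
	unsigned int remainder_ns, decimal_msecs;

	/* equivalent of: remainder_ns = do_div(whole_msecs, 1000 * 1000); */
	remainder_ns = (unsigned int)(whole_msecs % (1000 * 1000));
	whole_msecs /= 1000 * 1000;

	decimal_msecs = remainder_ns / 1000;

	/* prints "took 12.345 msecs", matching the kernel message format */
	printf("took %llu.%03u msecs\n",
	       (unsigned long long)whole_msecs, decimal_msecs);
	return 0;
}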