perf, x86: Try to handle unknown nmis with an enabled PMU

When the PMU is enabled it is valid to have unhandled nmis: two events could trigger 'simultaneously' raising two back-to-back NMIs. If the first NMI handles both, the latter will be empty and daze the CPU.

The solution to avoid an 'unknown nmi' message in this case was simply to stop the nmi handler chain when the PMU is enabled by stating the nmi was handled. This has the drawback that a) we cannot detect unknown nmis anymore, and b) subsequent nmi handlers are not called.

This patch addresses this. Now, we check whether this unknown NMI could be a PMU back-to-back NMI. Otherwise we pass it and let the kernel handle the unknown nmi.

This is a debug log:

 cpu #6, nmi #32333, skip_nmi #32330, handled = 1, time = 1934364430
 cpu #6, nmi #32334, skip_nmi #32330, handled = 1, time = 1934704616
 cpu #6, nmi #32335, skip_nmi #32336, handled = 2, time = 1936032320
 cpu #6, nmi #32336, skip_nmi #32336, handled = 0, time = 1936034139
 cpu #6, nmi #32337, skip_nmi #32336, handled = 1, time = 1936120100
 cpu #6, nmi #32338, skip_nmi #32336, handled = 1, time = 1936404607
 cpu #6, nmi #32339, skip_nmi #32336, handled = 1, time = 1937983416
 cpu #6, nmi #32340, skip_nmi #32341, handled = 2, time = 1938201032
 cpu #6, nmi #32341, skip_nmi #32341, handled = 0, time = 1938202830
 cpu #6, nmi #32342, skip_nmi #32341, handled = 1, time = 1938443743
 cpu #6, nmi #32343, skip_nmi #32341, handled = 1, time = 1939956552
 cpu #6, nmi #32344, skip_nmi #32341, handled = 1, time = 1940073224
 cpu #6, nmi #32345, skip_nmi #32341, handled = 1, time = 1940485677
 cpu #6, nmi #32346, skip_nmi #32347, handled = 2, time = 1941947772
 cpu #6, nmi #32347, skip_nmi #32347, handled = 1, time = 1941949818
 cpu #6, nmi #32348, skip_nmi #32347, handled = 0, time = 1941951591
 Uhhuh. NMI received for unknown reason 00 on CPU 6.
 Do you have a strange power saving mode enabled?
 Dazed and confused, but trying to continue

Deltas:

 nmi #32334 340186
 nmi #32335 1327704
 nmi #32336 1819 <<<< back-to-back nmi [1]
 nmi #32337 85961
 nmi #32338 284507
 nmi #32339 1578809
 nmi #32340 217616
 nmi #32341 1798 <<<< back-to-back nmi [2]
 nmi #32342 240913
 nmi #32343 1512809
 nmi #32344 116672
 nmi #32345 412453
 nmi #32346 1462095 <<<< 1st nmi (standard) handling 2 counters
 nmi #32347 2046 <<<< 2nd nmi (back-to-back) handling one counter
 nmi #32348 1773 <<<< 3rd nmi (back-to-back) handling no counter! [3]

For back-to-back nmi detection there are the following rules:

The PMU nmi handler was handling more than one counter and no counter was handled in the subsequent nmi (see [1] and [2] above).

There is another case if there are two subsequent back-to-back nmis [3]. The 2nd is detected as back-to-back because the first handled more than one counter. If the second handles one counter and the 3rd handles nothing, we drop the 3rd nmi because it could be a back-to-back nmi.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
[ renamed nmi variable to pmu_nmi to avoid clash with .nmi in entry.S ]
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: peterz@infradead.org
Cc: gorcunov@gmail.com
Cc: fweisbec@gmail.com
Cc: ying.huang@intel.com
Cc: ming.m.lin@intel.com
Cc: eranian@google.com
LKML-Reference: <1283454469-1909-3-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Robert Richter <robert.richter@amd.com> 2010-09-02 15:07:48 -0400
committer: Ingo Molnar <mingo@elte.hu> 2010-09-03 02:05:18 -0400
commit: 4177c42a6301a34c20038ec2771a33dcc30bb338 (patch)
tree: fe6a374cfc0299ea44e1d7edb459465513452cdf /arch/x86
parent: de725dec9de7a7541996176d59cf8542365b8b0e (diff)
1 files changed, 46 insertions, 13 deletions
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f2da20fda02..3efdf2870a3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1154,7 +1154,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
                /*
                 * event overflow
                 */
-                handled         = 1;
+                handled++;
                data.period     = event->hw.last_period;
                if (!x86_perf_event_set_period(event))
@@ -1200,12 +1200,20 @@ void perf_events_lapic_init(void)
        apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
+struct pmu_nmi_state {
+        unsigned int    marked;
+        int             handled;
+};
+static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
 static int __kprobes
 perf_event_nmi_handler(struct notifier_block *self,
                         unsigned long cmd, void *__args)
 {
        struct die_args *args = __args;
-        struct pt_regs *regs;
+        unsigned int this_nmi;
+        int handled;
        if (!atomic_read(&active_events))
                return NOTIFY_DONE;
@@ -1214,22 +1222,47 @@ perf_event_nmi_handler(struct notifier_block *self,
        case DIE_NMI:
        case DIE_NMI_IPI:
                break;
+        case DIE_NMIUNKNOWN:
+                this_nmi = percpu_read(irq_stat.__nmi_count);
+                if (this_nmi != __get_cpu_var(pmu_nmi).marked)
+                        /* let the kernel handle the unknown nmi */
+                        return NOTIFY_DONE;
+                /*
+                 * This one is a PMU back-to-back nmi. Two events
+                 * trigger 'simultaneously' raising two back-to-back
+                 * NMIs. If the first NMI handles both, the latter
+                 * will be empty and daze the CPU. So, we drop it to
+                 * avoid false-positive 'unknown nmi' messages.
+                 */
+                return NOTIFY_STOP;
        default:
                return NOTIFY_DONE;
        }
-        regs = args->regs;
        apic_write(APIC_LVTPC, APIC_DM_NMI);
-        /*
-         * Can't rely on the handled return value to say it was our NMI, two
+        handled = x86_pmu.handle_irq(args->regs);
-         * events could trigger 'simultaneously' raising two back-to-back NMIs.
+        if (!handled)
-         *
+                return NOTIFY_DONE;
-         * If the first NMI handles both, the latter will be empty and daze
-         * the CPU.
+        this_nmi = percpu_read(irq_stat.__nmi_count);
-         */
+        if ((handled > 1) ||
-        x86_pmu.handle_irq(regs);
+                /* the next nmi could be a back-to-back nmi */
+            ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
+             (__get_cpu_var(pmu_nmi).handled > 1))) {
+                /*
+                 * We could have two subsequent back-to-back nmis: The
+                 * first handles more than one counter, the 2nd
+                 * handles only one counter and the 3rd handles no
+                 * counter.
+                 *
+                 * This is the 2nd nmi because the previous was
+                 * handling more than one counter. We will mark the
+                 * next (3rd) and then drop it if unhandled.
+                 */
+                __get_cpu_var(pmu_nmi).marked   = this_nmi + 1;
+                __get_cpu_var(pmu_nmi).handled  = handled;
+        }
        return NOTIFY_STOP;
 }
author	Robert Richter <robert.richter@amd.com>	2010-09-02 15:07:48 -0400
committer	Ingo Molnar <mingo@elte.hu>	2010-09-03 02:05:18 -0400
commit	4177c42a6301a34c20038ec2771a33dcc30bb338 (patch)
tree	fe6a374cfc0299ea44e1d7edb459465513452cdf /arch/x86
parent	de725dec9de7a7541996176d59cf8542365b8b0e (diff)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index f2da20fda02..3efdf2870a3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1154,7 +1154,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1154	/*	1154	/*
1155	* event overflow	1155	* event overflow
1156	*/	1156	*/
1157	handled = 1;	1157	handled++;
1158	data.period = event->hw.last_period;	1158	data.period = event->hw.last_period;
1159		1159
1160	if (!x86_perf_event_set_period(event))	1160	if (!x86_perf_event_set_period(event))
@@ -1200,12 +1200,20 @@ void perf_events_lapic_init(void)
1200	apic_write(APIC_LVTPC, APIC_DM_NMI);	1200	apic_write(APIC_LVTPC, APIC_DM_NMI);
1201	}	1201	}
1202		1202
		1203	struct pmu_nmi_state {
		1204	unsigned int marked;
		1205	int handled;
		1206	};
		1207
		1208	static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
		1209
1203	static int __kprobes	1210	static int __kprobes
1204	perf_event_nmi_handler(struct notifier_block *self,	1211	perf_event_nmi_handler(struct notifier_block *self,
1205	unsigned long cmd, void *__args)	1212	unsigned long cmd, void *__args)
1206	{	1213	{
1207	struct die_args *args = __args;	1214	struct die_args *args = __args;
1208	struct pt_regs *regs;	1215	unsigned int this_nmi;
		1216	int handled;
1209		1217
1210	if (!atomic_read(&active_events))	1218	if (!atomic_read(&active_events))
1211	return NOTIFY_DONE;	1219	return NOTIFY_DONE;
@@ -1214,22 +1222,47 @@ perf_event_nmi_handler(struct notifier_block *self,
1214	case DIE_NMI:	1222	case DIE_NMI:
1215	case DIE_NMI_IPI:	1223	case DIE_NMI_IPI:
1216	break;	1224	break;
1217		1225	case DIE_NMIUNKNOWN:
		1226	this_nmi = percpu_read(irq_stat.__nmi_count);
		1227	if (this_nmi != __get_cpu_var(pmu_nmi).marked)
		1228	/* let the kernel handle the unknown nmi */
		1229	return NOTIFY_DONE;
		1230	/*
		1231	* This one is a PMU back-to-back nmi. Two events
		1232	* trigger 'simultaneously' raising two back-to-back
		1233	* NMIs. If the first NMI handles both, the latter
		1234	* will be empty and daze the CPU. So, we drop it to
		1235	* avoid false-positive 'unknown nmi' messages.
		1236	*/
		1237	return NOTIFY_STOP;
1218	default:	1238	default:
1219	return NOTIFY_DONE;	1239	return NOTIFY_DONE;
1220	}	1240	}
1221		1241
1222	regs = args->regs;
1223
1224	apic_write(APIC_LVTPC, APIC_DM_NMI);	1242	apic_write(APIC_LVTPC, APIC_DM_NMI);
1225	/*	1243
1226	* Can't rely on the handled return value to say it was our NMI, two	1244	handled = x86_pmu.handle_irq(args->regs);
1227	* events could trigger 'simultaneously' raising two back-to-back NMIs.	1245	if (!handled)
1228	*	1246	return NOTIFY_DONE;
1229	* If the first NMI handles both, the latter will be empty and daze	1247
1230	* the CPU.	1248	this_nmi = percpu_read(irq_stat.__nmi_count);
1231	*/	1249	if ((handled > 1) ||
1232	x86_pmu.handle_irq(regs);	1250	/* the next nmi could be a back-to-back nmi */
		1251	((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
		1252	(__get_cpu_var(pmu_nmi).handled > 1))) {
		1253	/*
		1254	* We could have two subsequent back-to-back nmis: The
		1255	* first handles more than one counter, the 2nd
		1256	* handles only one counter and the 3rd handles no
		1257	* counter.
		1258	*
		1259	* This is the 2nd nmi because the previous was
		1260	* handling more than one counter. We will mark the
		1261	* next (3rd) and then drop it if unhandled.
		1262	*/
		1263	__get_cpu_var(pmu_nmi).marked = this_nmi + 1;
		1264	__get_cpu_var(pmu_nmi).handled = handled;
		1265	}
1233		1266
1234	return NOTIFY_STOP;	1267	return NOTIFY_STOP;
1235	}	1268	}