diff options
author | Thomas Gleixner <tglx@linutronix.de> | 2017-08-15 03:50:13 -0400 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2017-08-18 06:35:02 -0400 |
commit | 7edaeb6841dfb27e362288ab8466ebdc4972e867 (patch) | |
tree | 419ce3f71f7ffc17131bb911fb16ff037e3c6bb1 | |
parent | ef954844c7ace62f773f4f23e28d2d915adc419f (diff) |
kernel/watchdog: Prevent false positives with turbo modes
The hardlockup detector on x86 uses a performance counter based on unhalted
CPU cycles and a periodic hrtimer. The hrtimer period is about 2/5 of the
performance counter period, so the hrtimer should fire 2-3 times before the
performance counter NMI fires. The NMI code checks whether the hrtimer
fired since the last invocation. If not, it assumes a hard lockup.
The calculation of those periods is based on the nominal CPU
frequency. Turbo modes increase the CPU clock frequency and therefore
shorten the period of the perf/NMI watchdog. With extreme Turbo-modes (3x
nominal frequency) the perf/NMI period is shorter than the hrtimer period
which leads to false positives.
A simple fix would be to shorten the hrtimer period, but that comes with
the side effect of more frequent hrtimer and softlockup thread wakeups,
which is not desired.
Implement a low pass filter, which checks the perf/NMI period against
kernel time. If the perf/NMI fires before 4/5 of the watchdog period has
elapsed then the event is ignored and postponed to the next perf/NMI.
That solves the problem and avoids the overhead of shorter hrtimer periods
and more frequent softlockup thread wakeups.
Fixes: 58687acba592 ("lockup_detector: Combine nmi_watchdog and softlockup detector")
Reported-and-tested-by: Kan Liang <Kan.liang@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: dzickus@redhat.com
Cc: prarit@redhat.com
Cc: ak@linux.intel.com
Cc: babu.moger@oracle.com
Cc: peterz@infradead.org
Cc: eranian@google.com
Cc: acme@redhat.com
Cc: stable@vger.kernel.org
Cc: atomlin@redhat.com
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1708150931310.1886@nanos
-rw-r--r-- | arch/x86/Kconfig | 1 | ||||
-rw-r--r-- | include/linux/nmi.h | 8 | ||||
-rw-r--r-- | kernel/watchdog.c | 1 | ||||
-rw-r--r-- | kernel/watchdog_hld.c | 59 | ||||
-rw-r--r-- | lib/Kconfig.debug | 7 |
5 files changed, 76 insertions, 0 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..9101bfc85539 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -100,6 +100,7 @@ config X86 | |||
100 | select GENERIC_STRNCPY_FROM_USER | 100 | select GENERIC_STRNCPY_FROM_USER |
101 | select GENERIC_STRNLEN_USER | 101 | select GENERIC_STRNLEN_USER |
102 | select GENERIC_TIME_VSYSCALL | 102 | select GENERIC_TIME_VSYSCALL |
103 | select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 | ||
103 | select HAVE_ACPI_APEI if ACPI | 104 | select HAVE_ACPI_APEI if ACPI |
104 | select HAVE_ACPI_APEI_NMI if ACPI | 105 | select HAVE_ACPI_APEI_NMI if ACPI |
105 | select HAVE_ALIGNED_STRUCT_PAGE if SLUB | 106 | select HAVE_ALIGNED_STRUCT_PAGE if SLUB |
diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 8aa01fd859fb..a36abe2da13e 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h | |||
@@ -168,6 +168,14 @@ extern int sysctl_hardlockup_all_cpu_backtrace; | |||
168 | #define sysctl_softlockup_all_cpu_backtrace 0 | 168 | #define sysctl_softlockup_all_cpu_backtrace 0 |
169 | #define sysctl_hardlockup_all_cpu_backtrace 0 | 169 | #define sysctl_hardlockup_all_cpu_backtrace 0 |
170 | #endif | 170 | #endif |
171 | |||
172 | #if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \ | ||
173 | defined(CONFIG_HARDLOCKUP_DETECTOR) | ||
174 | void watchdog_update_hrtimer_threshold(u64 period); | ||
175 | #else | ||
176 | static inline void watchdog_update_hrtimer_threshold(u64 period) { } | ||
177 | #endif | ||
178 | |||
171 | extern bool is_hardlockup(void); | 179 | extern bool is_hardlockup(void); |
172 | struct ctl_table; | 180 | struct ctl_table; |
173 | extern int proc_watchdog(struct ctl_table *, int , | 181 | extern int proc_watchdog(struct ctl_table *, int , |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 06d3389bca0d..f5d52024f6b7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -240,6 +240,7 @@ static void set_sample_period(void) | |||
240 | * hardlockup detector generates a warning | 240 | * hardlockup detector generates a warning |
241 | */ | 241 | */ |
242 | sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); | 242 | sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); |
243 | watchdog_update_hrtimer_threshold(sample_period); | ||
243 | } | 244 | } |
244 | 245 | ||
245 | /* Commands for resetting the watchdog */ | 246 | /* Commands for resetting the watchdog */ |
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 295a0d84934c..3a09ea1b1d3d 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c | |||
@@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void) | |||
37 | } | 37 | } |
38 | EXPORT_SYMBOL(arch_touch_nmi_watchdog); | 38 | EXPORT_SYMBOL(arch_touch_nmi_watchdog); |
39 | 39 | ||
40 | #ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP | ||
41 | static DEFINE_PER_CPU(ktime_t, last_timestamp); | ||
42 | static DEFINE_PER_CPU(unsigned int, nmi_rearmed); | ||
43 | static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; | ||
44 | |||
45 | void watchdog_update_hrtimer_threshold(u64 period) | ||
46 | { | ||
47 | /* | ||
48 | * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 | ||
49 | * | ||
50 | * So it runs effectively with 2.5 times the rate of the NMI | ||
51 | * watchdog. That means the hrtimer should fire 2-3 times before | ||
52 | * the NMI watchdog expires. The NMI watchdog on x86 is based on | ||
53 | * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles | ||
54 | * might run way faster than expected and the NMI fires in a | ||
55 | * smaller period than the one deduced from the nominal CPU | ||
56 | * frequency. Depending on the Turbo-Mode factor this might be fast | ||
57 | * enough to get the NMI period smaller than the hrtimer watchdog | ||
58 | * period and trigger false positives. | ||
59 | * | ||
60 | * The sample threshold is used to check in the NMI handler whether | ||
61 | * the minimum time between two NMI samples has elapsed. That | ||
62 | * prevents false positives. | ||
63 | * | ||
64 | * Set this to 4/5 of the actual watchdog threshold period so the | ||
65 | * hrtimer is guaranteed to fire at least once within the real | ||
66 | * watchdog threshold. | ||
67 | */ | ||
68 | watchdog_hrtimer_sample_threshold = period * 2; | ||
69 | } | ||
70 | |||
71 | static bool watchdog_check_timestamp(void) | ||
72 | { | ||
73 | ktime_t delta, now = ktime_get_mono_fast_ns(); | ||
74 | |||
75 | delta = now - __this_cpu_read(last_timestamp); | ||
76 | if (delta < watchdog_hrtimer_sample_threshold) { | ||
77 | /* | ||
78 | * If ktime is jiffies based, a stalled timer would prevent | ||
79 | * jiffies from being incremented and the filter would look | ||
80 | * at a stale timestamp and never trigger. | ||
81 | */ | ||
82 | if (__this_cpu_inc_return(nmi_rearmed) < 10) | ||
83 | return false; | ||
84 | } | ||
85 | __this_cpu_write(nmi_rearmed, 0); | ||
86 | __this_cpu_write(last_timestamp, now); | ||
87 | return true; | ||
88 | } | ||
89 | #else | ||
90 | static inline bool watchdog_check_timestamp(void) | ||
91 | { | ||
92 | return true; | ||
93 | } | ||
94 | #endif | ||
95 | |||
40 | static struct perf_event_attr wd_hw_attr = { | 96 | static struct perf_event_attr wd_hw_attr = { |
41 | .type = PERF_TYPE_HARDWARE, | 97 | .type = PERF_TYPE_HARDWARE, |
42 | .config = PERF_COUNT_HW_CPU_CYCLES, | 98 | .config = PERF_COUNT_HW_CPU_CYCLES, |
@@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event, | |||
61 | return; | 117 | return; |
62 | } | 118 | } |
63 | 119 | ||
120 | if (!watchdog_check_timestamp()) | ||
121 | return; | ||
122 | |||
64 | /* check for a hardlockup | 123 | /* check for a hardlockup |
65 | * This is done by making sure our timer interrupt | 124 | * This is done by making sure our timer interrupt |
66 | * is incrementing. The timer interrupt should have | 125 | * is incrementing. The timer interrupt should have |
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 98fe715522e8..c617b9d1d6cb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -798,6 +798,13 @@ config HARDLOCKUP_DETECTOR_PERF | |||
798 | select SOFTLOCKUP_DETECTOR | 798 | select SOFTLOCKUP_DETECTOR |
799 | 799 | ||
800 | # | 800 | # |
801 | # Enables a timestamp based low pass filter to compensate for perf based | ||
802 | # hard lockup detection which runs too fast due to turbo modes. | ||
803 | # | ||
804 | config HARDLOCKUP_CHECK_TIMESTAMP | ||
805 | bool | ||
806 | |||
807 | # | ||
801 | # arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard | 808 | # arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard |
802 | # lockup detector rather than the perf based detector. | 809 | # lockup detector rather than the perf based detector. |
803 | # | 810 | # |