author	Thomas Gleixner <tglx@linutronix.de>	2017-09-12 15:37:05 -0400
committer	Ingo Molnar <mingo@kernel.org>	2017-09-14 05:41:05 -0400
commit	01f0a02701cbcf32d22cfc9d1ab9a3f0ff2ba68c (patch)
tree	afdf0ea590c86b5b32667a53c11ab95b28d3e641
parent	941154bd6937a710ae9193a3c733c0029e5ae7b8 (diff)
watchdog/core: Remove the park_in_progress obfuscation
Commit:

  b94f51183b06 ("kernel/watchdog: prevent false hardlockup on overloaded system")

tries to fix the following issue:

  proc_write()
     set_sample_period()                 <--- New sample period becomes visible
                                         <----- Broken starts
     proc_watchdog_update()
       watchdog_enable_all_cpus()               watchdog_hrtimer_fn()
         update_watchdog_all_cpus()               restart_timer(sample_period)
           watchdog_park_threads()
             thread->park()
               disable_nmi()
                                         <----- Broken ends

The reason why this is broken is that the update of the watchdog
threshold becomes immediately effective and visible to the hrtimer
function, which uses that value to rearm the timer. But the NMI/perf
side still uses the old value up to the point where it is disabled. If
the rate has been lowered, the NMI can run fast enough to 'detect' a
hard lockup because the timer has not fired due to the longer period.

The patch 'fixed' this by adding a variable:

  proc_write()
     set_sample_period()
                                         <----- Broken starts
     proc_watchdog_update()
       watchdog_enable_all_cpus()               watchdog_hrtimer_fn()
         update_watchdog_all_cpus()               restart_timer(sample_period)
           watchdog_park_threads()
             park_in_progress = 1
                                         <----- Broken ends
                                                nmi_watchdog()
                                                  if (park_in_progress)
                                                        return;

The only effect of this variable was to make the window in which the
breakage can hit small enough that it was no longer observable in
testing. From a correctness point of view it is a pointless bandaid
which merely papers over the root cause: the unsynchronized update of
the variable.

Looking deeper into the related code paths unearthed similar problems
in the watchdog_start()/stop() functions:

  watchdog_start()
        perf_nmi_event_start()
        hrtimer_start()

  watchdog_stop()
        hrtimer_cancel()
        perf_nmi_event_stop()

In both cases the call order is wrong, because if the task gets
preempted or the VM gets scheduled out long enough after the first
call, there is a chance that the next NMI will see a stale hrtimer
interrupt count and trigger a false positive hard lockup splat.

Get rid of park_in_progress so the code can be gradually deobfuscated
and pruned from the several layers of duct tape papering over the root
cause, which has been either ignored or not understood at all. Once
this is removed, the underlying problem will be fixed by rewriting the
proc interface to do a properly synchronized update.

Address the start/stop() ordering problem as well by reversing the
call order, so this part is at least correct now.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1709052038270.2393@nanos
Signed-off-by: Ingo Molnar <mingo@kernel.org>
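For illustration, a minimal userspace C sketch of the ordering the patch
establishes (the stub functions below are illustrative placeholders, not the
kernel's watchdog API): arm the timer and initialize the timestamp before the
perf NMI event is enabled, and tear the perf event down before the timer is
cancelled, so the NMI side can never sample a frozen hrtimer interrupt count.
The real changes are in watchdog_enable()/watchdog_disable() in the diff below.

  /* Minimal ordering sketch; stub names are illustrative, not kernel APIs. */
  #include <stdio.h>

  static unsigned long hrtimer_interrupts;   /* models the per-CPU hrtimer tick count */

  static void timer_arm(void)         { hrtimer_interrupts++; puts("timer armed"); }
  static void timer_cancel(void)      { puts("timer cancelled"); }
  static void touch_timestamp(void)   { puts("timestamp initialized"); }
  static void nmi_event_enable(void)  { puts("perf NMI event enabled"); }
  static void nmi_event_disable(void) { puts("perf NMI event disabled"); }

  static void watchdog_enable_sketch(void)
  {
          /* Arm the timer first, so the count the NMI checks can advance ... */
          timer_arm();
          touch_timestamp();
          /* ... and only then let the perf NMI start sampling. */
          nmi_event_enable();
  }

  static void watchdog_disable_sketch(void)
  {
          /* Stop the NMI sampler first ... */
          nmi_event_disable();
          /* ... then cancelling the timer cannot leave an NMI looking at a stale count. */
          timer_cancel();
  }

  int main(void)
  {
          watchdog_enable_sketch();
          watchdog_disable_sketch();
          printf("hrtimer ticks seen: %lu\n", hrtimer_interrupts);
          return 0;
  }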
-rw-r--r--	include/linux/nmi.h	1
-rw-r--r--	kernel/watchdog.c	37
-rw-r--r--	kernel/watchdog_hld.c	7
3 files changed, 19 insertions(+), 26 deletions(-)
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 80354e6fa86d..91a3a4a4c8ae 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -27,7 +27,6 @@ extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int softlockup_panic;
 extern int soft_watchdog_enabled;
-extern atomic_t watchdog_park_in_progress;
 #else
 static inline void touch_softlockup_watchdog_sched(void)
 {
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index dd1fd59683c5..c290135fb415 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -136,8 +136,6 @@ void __weak watchdog_nmi_reconfigure(void)
 #define for_each_watchdog_cpu(cpu) \
 	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
-
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -322,8 +320,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	int duration;
 	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
-	if (!watchdog_enabled ||
-	    atomic_read(&watchdog_park_in_progress) != 0)
+	if (!watchdog_enabled)
 		return HRTIMER_NORESTART;
 
 	/* kick the hardlockup detector */
@@ -437,32 +434,37 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
 
 static void watchdog_enable(unsigned int cpu)
 {
-	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
-	/* kick off the timer for the hardlockup detector */
+	/*
+	 * Start the timer first to prevent the NMI watchdog triggering
+	 * before the timer has a chance to fire.
+	 */
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = watchdog_timer_fn;
+	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+		      HRTIMER_MODE_REL_PINNED);
 
+	/* Initialize timestamp */
+	__touch_watchdog();
 	/* Enable the perf event */
 	watchdog_nmi_enable(cpu);
 
-	/* done here because hrtimer_start can only pin to smp_processor_id() */
-	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
-		      HRTIMER_MODE_REL_PINNED);
-
-	/* initialize timestamp */
 	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
-	__touch_watchdog();
 }
 
 static void watchdog_disable(unsigned int cpu)
 {
-	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
 	watchdog_set_prio(SCHED_NORMAL, 0);
-	hrtimer_cancel(hrtimer);
-	/* disable the perf event */
+	/*
+	 * Disable the perf event first. That prevents that a large delay
+	 * between disabling the timer and disabling the perf event causes
+	 * the perf NMI to detect a false positive.
+	 */
 	watchdog_nmi_disable(cpu);
+	hrtimer_cancel(hrtimer);
 }
 
 static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -518,16 +520,11 @@ static int watchdog_park_threads(void)
 {
 	int cpu, ret = 0;
 
-	atomic_set(&watchdog_park_in_progress, 1);
-
 	for_each_watchdog_cpu(cpu) {
 		ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
 		if (ret)
 			break;
 	}
-
-	atomic_set(&watchdog_park_in_progress, 0);
-
 	return ret;
 }
 
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 94111ccb09b5..0aa191ee3d51 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -106,15 +106,12 @@ static struct perf_event_attr wd_hw_attr = {
 
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
-		 struct perf_sample_data *data,
-		 struct pt_regs *regs)
+				       struct perf_sample_data *data,
+				       struct pt_regs *regs)
 {
 	/* Ensure the watchdog never gets throttled */
 	event->hw.interrupts = 0;
 
-	if (atomic_read(&watchdog_park_in_progress) != 0)
-		return;
-
 	if (__this_cpu_read(watchdog_nmi_touch) == true) {
 		__this_cpu_write(watchdog_nmi_touch, false);
 		return;