Diffstat (limited to 'kernel')
 kernel/Makefile                |   1
 kernel/debug/debug_core.c      |   4
 kernel/debug/kdb/kdb_io.c      |  37
 kernel/debug/kdb/kdb_main.c    |   1
 kernel/debug/kdb/kdb_private.h |   1
 kernel/events/uprobes.c        |   4
 kernel/kcov.c                  |   5
 kernel/kexec_core.c            |   5
 kernel/printk/printk.c         |   3
 kernel/relay.c                 |   4
 kernel/signal.c                |   7
 kernel/sysctl.c                |   8
 kernel/sysctl_binary.c         |   4
 kernel/time/alarmtimer.c       |   3
 kernel/watchdog.c              | 270
 kernel/watchdog_hld.c          | 227
 16 files changed, 287 insertions(+), 297 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index eaee9de224bd..12c679f769c6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0874e2edd275..79517e5549f1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -598,11 +598,11 @@ return_normal:
 	/*
 	 * Wait for the other CPUs to be notified and be waiting for us:
 	 */
-	time_left = loops_per_jiffy * HZ;
+	time_left = MSEC_PER_SEC;
 	while (kgdb_do_roundup && --time_left &&
 	       (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
 			online_cpus)
-		cpu_relax();
+		udelay(1000);
 	if (!time_left)
 		pr_crit("Timed out waiting for secondary CPUs.\n");
 
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 98c9011eac78..e74be38245ad 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -30,6 +30,7 @@
 char kdb_prompt_str[CMD_BUFLEN];
 
 int kdb_trap_printk;
+int kdb_printf_cpu = -1;
 
 static int kgdb_transition_check(char *buffer)
 {
@@ -554,31 +555,26 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
 	int linecount;
 	int colcount;
 	int logging, saved_loglevel = 0;
-	int saved_trap_printk;
-	int got_printf_lock = 0;
 	int retlen = 0;
 	int fnd, len;
+	int this_cpu, old_cpu;
 	char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
 	char *moreprompt = "more> ";
 	struct console *c = console_drivers;
-	static DEFINE_SPINLOCK(kdb_printf_lock);
 	unsigned long uninitialized_var(flags);
 
-	preempt_disable();
-	saved_trap_printk = kdb_trap_printk;
-	kdb_trap_printk = 0;
-
 	/* Serialize kdb_printf if multiple cpus try to write at once.
 	 * But if any cpu goes recursive in kdb, just print the output,
 	 * even if it is interleaved with any other text.
 	 */
-	if (!KDB_STATE(PRINTF_LOCK)) {
-		KDB_STATE_SET(PRINTF_LOCK);
-		spin_lock_irqsave(&kdb_printf_lock, flags);
-		got_printf_lock = 1;
-		atomic_inc(&kdb_event);
-	} else {
-		__acquire(kdb_printf_lock);
+	local_irq_save(flags);
+	this_cpu = smp_processor_id();
+	for (;;) {
+		old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu);
+		if (old_cpu == -1 || old_cpu == this_cpu)
+			break;
+
+		cpu_relax();
 	}
 
 	diag = kdbgetintenv("LINES", &linecount);
@@ -847,16 +843,9 @@ kdb_print_out:
 	suspend_grep = 0; /* end of what may have been a recursive call */
 	if (logging)
 		console_loglevel = saved_loglevel;
-	if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
-		got_printf_lock = 0;
-		spin_unlock_irqrestore(&kdb_printf_lock, flags);
-		KDB_STATE_CLEAR(PRINTF_LOCK);
-		atomic_dec(&kdb_event);
-	} else {
-		__release(kdb_printf_lock);
-	}
-	kdb_trap_printk = saved_trap_printk;
-	preempt_enable();
+	/* kdb_printf_cpu locked the code above. */
+	smp_store_release(&kdb_printf_cpu, old_cpu);
+	local_irq_restore(flags);
 	return retlen;
 }
 
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2a20c0dfdafc..ca183919d302 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -60,7 +60,6 @@ int kdb_grep_trailing;
  * Kernel debugger state flags
  */
 int kdb_flags;
-atomic_t kdb_event;
 
 /*
  * kdb_lock protects updates to kdb_initial_cpu. Used to
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 75014d7f4568..fc224fbcf954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -132,7 +132,6 @@ extern int kdb_state;
 #define KDB_STATE_PAGER		0x00000400	/* pager is available */
 #define KDB_STATE_GO_SWITCH	0x00000800	/* go is switching
 						 * back to initial cpu */
-#define KDB_STATE_PRINTF_LOCK	0x00001000	/* Holds kdb_printf lock */
 #define KDB_STATE_WAIT_IPI	0x00002000	/* Waiting for kdb_ipi() NMI */
 #define KDB_STATE_RECURSE	0x00004000	/* Recursive entry to kdb */
 #define KDB_STATE_IP_ADJUSTED	0x00008000	/* Restart IP has been
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f9ec9add2164..215871bda3a2 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
 retry:
 	/* Read the page with vaddr into memory */
 	ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
-			&vma);
+			&vma, NULL);
 	if (ret <= 0)
 		return ret;
 
@@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	 * essentially a kernel access to the memory.
 	 */
 	result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
-			NULL);
+			NULL, NULL);
 	if (result < 0)
 		return result;
 
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 3cbb0c879705..cc2fa35ca480 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -1,11 +1,16 @@
 #define pr_fmt(fmt) "kcov: " fmt
 
 #define DISABLE_BRANCH_PROFILING
+#include <linux/atomic.h>
 #include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/export.h>
 #include <linux/types.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/preempt.h>
 #include <linux/printk.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 561675589511..5617cc412444 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 	while (hole_end <= crashk_res.end) {
 		unsigned long i;
 
+		cond_resched();
+
 		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
 			break;
 		/* See if I overlap any of the segments */
@@ -1467,9 +1469,6 @@ static int __init crash_save_vmcoreinfo_init(void)
 #endif
 	VMCOREINFO_NUMBER(PG_head_mask);
 	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_X86
-	VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
-#endif
 #ifdef CONFIG_HUGETLB_PAGE
 	VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
 #endif
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 577f2288d19f..a3ce35e0fa1e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1926,7 +1926,8 @@ int vprintk_default(const char *fmt, va_list args)
 	int r;
 
 #ifdef CONFIG_KGDB_KDB
-	if (unlikely(kdb_trap_printk)) {
+	/* Allow to pass printk() to kdb but avoid a recursion. */
+	if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
 		r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
 		return r;
 	}
diff --git a/kernel/relay.c b/kernel/relay.c
index da79a109dbeb..8f18d314a96a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -809,11 +809,11 @@ void relay_subbufs_consumed(struct rchan *chan,
 {
 	struct rchan_buf *buf;
 
-	if (!chan)
+	if (!chan || cpu >= NR_CPUS)
 		return;
 
 	buf = *per_cpu_ptr(chan->buf, cpu);
-	if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs)
+	if (!buf || subbufs_consumed > chan->n_subbufs)
 		return;
 
 	if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
diff --git a/kernel/signal.c b/kernel/signal.c
index 29a410780aa9..ae60996fedff 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2491,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset)
 {
 	struct task_struct *tsk = current;
 
+	/*
+	 * In case the signal mask hasn't changed, there is nothing we need
+	 * to do. The current->blocked shouldn't be modified by other task.
+	 */
+	if (sigequalsets(&tsk->blocked, newset))
+		return;
+
 	spin_lock_irq(&tsk->sighand->siglock);
 	__set_task_blocked(tsk, newset);
 	spin_unlock_irq(&tsk->sighand->siglock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 39b3368f6de6..1475d2545b7e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2389,9 +2389,11 @@ static void validate_coredump_safety(void)
 #ifdef CONFIG_COREDUMP
 	if (suid_dumpable == SUID_DUMP_ROOT &&
 	    core_pattern[0] != '/' && core_pattern[0] != '|') {
-		printk(KERN_WARNING "Unsafe core_pattern used with "\
-			"suid_dumpable=2. Pipe handler or fully qualified "\
-			"core dump path required.\n");
+		printk(KERN_WARNING
+"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
+"Pipe handler or fully qualified core dump path required.\n"
+"Set kernel.core_pattern before fs.suid_dumpable.\n"
+		);
 	}
 #endif
 }
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 6eb99c17dbd8..ece4b177052b 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1354 "warning: process `%s' used the deprecated sysctl " 1354 "warning: process `%s' used the deprecated sysctl "
1355 "system call with ", current->comm); 1355 "system call with ", current->comm);
1356 for (i = 0; i < nlen; i++) 1356 for (i = 0; i < nlen; i++)
1357 printk("%d.", name[i]); 1357 printk(KERN_CONT "%d.", name[i]);
1358 printk("\n"); 1358 printk(KERN_CONT "\n");
1359 } 1359 }
1360 return; 1360 return;
1361} 1361}
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 9b08ca391aed..3921cf7fea8e 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -516,7 +516,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
 
 	spin_lock_irqsave(&ptr->it_lock, flags);
 	if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
-		if (posix_timer_event(ptr, 0) != 0)
+		if (IS_ENABLED(CONFIG_POSIX_TIMERS) &&
+		    posix_timer_event(ptr, 0) != 0)
 			ptr->it_overrun++;
 	}
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9acb29f280ec..d4b0fa01cae3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,32 +24,14 @@
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
-#include <linux/perf_event.h>
 #include <linux/kthread.h>
 
-/*
- * The run state of the lockup detectors is controlled by the content of the
- * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
- * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
- *
- * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
- * are variables that are only used as an 'interface' between the parameters
- * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
- * 'watchdog_thresh' variable is handled differently because its value is not
- * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
- * is equal zero.
- */
-#define NMI_WATCHDOG_ENABLED_BIT   0
-#define SOFT_WATCHDOG_ENABLED_BIT  1
-#define NMI_WATCHDOG_ENABLED      (1 << NMI_WATCHDOG_ENABLED_BIT)
-#define SOFT_WATCHDOG_ENABLED     (1 << SOFT_WATCHDOG_ENABLED_BIT)
-
 static DEFINE_MUTEX(watchdog_proc_mutex);
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
 #else
-static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
 #endif
 int __read_mostly nmi_watchdog_enabled;
 int __read_mostly soft_watchdog_enabled;
@@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10;
 #ifdef CONFIG_SMP
 int __read_mostly sysctl_softlockup_all_cpu_backtrace;
 int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-#else
-#define sysctl_softlockup_all_cpu_backtrace 0
-#define sysctl_hardlockup_all_cpu_backtrace 0
 #endif
 static struct cpumask watchdog_cpumask __read_mostly;
 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
 static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static DEFINE_PER_CPU(bool, hard_watchdog_warn);
-static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
-static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-#endif
 static unsigned long soft_lockup_nmi_warn;
 
-/* boot commands */
-/*
- * Should we panic when a soft-lockup or hard-lockup occurs:
- */
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-unsigned int __read_mostly hardlockup_panic =
-			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
-static unsigned long hardlockup_allcpu_dumped;
-/*
- * We may not want to enable hard lockup detection by default in all cases,
- * for example when running the kernel as a guest on a hypervisor. In these
- * cases this function can be called to disable hard lockup detection. This
- * function should only be executed once by the boot processor before the
- * kernel command line parameters are parsed, because otherwise it is not
- * possible to override this in hardlockup_panic_setup().
- */
-void hardlockup_detector_disable(void)
-{
-	watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
-}
-
-static int __init hardlockup_panic_setup(char *str)
-{
-	if (!strncmp(str, "panic", 5))
-		hardlockup_panic = 1;
-	else if (!strncmp(str, "nopanic", 7))
-		hardlockup_panic = 0;
-	else if (!strncmp(str, "0", 1))
-		watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
-	else if (!strncmp(str, "1", 1))
-		watchdog_enabled |= NMI_WATCHDOG_ENABLED;
-	return 1;
-}
-__setup("nmi_watchdog=", hardlockup_panic_setup);
-#endif
-
 unsigned int __read_mostly softlockup_panic =
 			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
 
@@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void)
 	wq_watchdog_touch(-1);
 }
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-void touch_nmi_watchdog(void)
-{
-	/*
-	 * Using __raw here because some code paths have
-	 * preemption enabled. If preemption is enabled
-	 * then interrupts should be enabled too, in which
-	 * case we shouldn't have to worry about the watchdog
-	 * going off.
-	 */
-	raw_cpu_write(watchdog_nmi_touch, true);
-	touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-#endif
-
 void touch_softlockup_watchdog_sync(void)
 {
 	__this_cpu_write(softlockup_touch_sync, true);
 	__this_cpu_write(watchdog_touch_ts, 0);
 }
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
 /* watchdog detector functions */
-static bool is_hardlockup(void)
+bool is_hardlockup(void)
 {
 	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
 
@@ -299,7 +219,6 @@ static bool is_hardlockup(void)
 	__this_cpu_write(hrtimer_interrupts_saved, hrint);
 	return false;
 }
-#endif
 
 static int is_softlockup(unsigned long touch_ts)
 {
@@ -313,78 +232,22 @@ static int is_softlockup(unsigned long touch_ts)
 	return 0;
 }
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-
-static struct perf_event_attr wd_hw_attr = {
-	.type = PERF_TYPE_HARDWARE,
-	.config = PERF_COUNT_HW_CPU_CYCLES,
-	.size = sizeof(struct perf_event_attr),
-	.pinned = 1,
-	.disabled = 1,
-};
-
-/* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event,
-		 struct perf_sample_data *data,
-		 struct pt_regs *regs)
-{
-	/* Ensure the watchdog never gets throttled */
-	event->hw.interrupts = 0;
-
-	if (__this_cpu_read(watchdog_nmi_touch) == true) {
-		__this_cpu_write(watchdog_nmi_touch, false);
-		return;
-	}
-
-	/* check for a hardlockup
-	 * This is done by making sure our timer interrupt
-	 * is incrementing. The timer interrupt should have
-	 * fired multiple times before we overflow'd. If it hasn't
-	 * then this is a good indication the cpu is stuck
-	 */
-	if (is_hardlockup()) {
-		int this_cpu = smp_processor_id();
-		struct pt_regs *regs = get_irq_regs();
-
-		/* only print hardlockups once */
-		if (__this_cpu_read(hard_watchdog_warn) == true)
-			return;
-
-		pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
-		print_modules();
-		print_irqtrace_events(current);
-		if (regs)
-			show_regs(regs);
-		else
-			dump_stack();
-
-		/*
-		 * Perform all-CPU dump only once to avoid multiple hardlockups
-		 * generating interleaving traces
-		 */
-		if (sysctl_hardlockup_all_cpu_backtrace &&
-				!test_and_set_bit(0, &hardlockup_allcpu_dumped))
-			trigger_allbutself_cpu_backtrace();
-
-		if (hardlockup_panic)
-			nmi_panic(regs, "Hard LOCKUP");
-
-		__this_cpu_write(hard_watchdog_warn, true);
-		return;
-	}
-
-	__this_cpu_write(hard_watchdog_warn, false);
-	return;
-}
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-
 static void watchdog_interrupt_count(void)
 {
 	__this_cpu_inc(hrtimer_interrupts);
 }
 
-static int watchdog_nmi_enable(unsigned int cpu);
-static void watchdog_nmi_disable(unsigned int cpu);
+/*
+ * These two functions are mostly architecture specific
+ * defining them as weak here.
+ */
+int __weak watchdog_nmi_enable(unsigned int cpu)
+{
+	return 0;
+}
+void __weak watchdog_nmi_disable(unsigned int cpu)
+{
+}
 
 static int watchdog_enable_all_cpus(void);
 static void watchdog_disable_all_cpus(void);
@@ -577,109 +440,6 @@ static void watchdog(unsigned int cpu)
 	watchdog_nmi_disable(cpu);
 }
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-/*
- * People like the simple clean cpu node info on boot.
- * Reduce the watchdog noise by only printing messages
- * that are different from what cpu0 displayed.
- */
-static unsigned long cpu0_err;
-
-static int watchdog_nmi_enable(unsigned int cpu)
-{
-	struct perf_event_attr *wd_attr;
-	struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
-	/* nothing to do if the hard lockup detector is disabled */
-	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-		goto out;
-
-	/* is it already setup and enabled? */
-	if (event && event->state > PERF_EVENT_STATE_OFF)
-		goto out;
-
-	/* it is setup but not enabled */
-	if (event != NULL)
-		goto out_enable;
-
-	wd_attr = &wd_hw_attr;
-	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
-
-	/* Try to register using hardware perf events */
-	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-
-	/* save cpu0 error for future comparision */
-	if (cpu == 0 && IS_ERR(event))
-		cpu0_err = PTR_ERR(event);
-
-	if (!IS_ERR(event)) {
-		/* only print for cpu0 or different than cpu0 */
-		if (cpu == 0 || cpu0_err)
-			pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
-		goto out_save;
-	}
-
-	/*
-	 * Disable the hard lockup detector if _any_ CPU fails to set up
-	 * set up the hardware perf event. The watchdog() function checks
-	 * the NMI_WATCHDOG_ENABLED bit periodically.
-	 *
-	 * The barriers are for syncing up watchdog_enabled across all the
-	 * cpus, as clear_bit() does not use barriers.
-	 */
-	smp_mb__before_atomic();
-	clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
-	smp_mb__after_atomic();
-
-	/* skip displaying the same error again */
-	if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
-		return PTR_ERR(event);
-
-	/* vary the KERN level based on the returned errno */
-	if (PTR_ERR(event) == -EOPNOTSUPP)
-		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
-	else if (PTR_ERR(event) == -ENOENT)
-		pr_warn("disabled (cpu%i): hardware events not enabled\n",
-			 cpu);
-	else
-		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
-			cpu, PTR_ERR(event));
-
-	pr_info("Shutting down hard lockup detector on all cpus\n");
-
-	return PTR_ERR(event);
-
-	/* success path */
-out_save:
-	per_cpu(watchdog_ev, cpu) = event;
-out_enable:
-	perf_event_enable(per_cpu(watchdog_ev, cpu));
-out:
-	return 0;
-}
-
-static void watchdog_nmi_disable(unsigned int cpu)
-{
-	struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
-	if (event) {
-		perf_event_disable(event);
-		per_cpu(watchdog_ev, cpu) = NULL;
-
-		/* should be in cleanup, but blocks oprofile */
-		perf_event_release_kernel(event);
-	}
-	if (cpu == 0) {
-		/* watchdog_nmi_enable() expects this to be zero initially. */
-		cpu0_err = 0;
-	}
-}
-
-#else
-static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
-static void watchdog_nmi_disable(unsigned int cpu) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-
 static struct smp_hotplug_thread watchdog_threads = {
 	.store = &softlockup_watchdog,
 	.thread_should_run = watchdog_should_run,
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
new file mode 100644
index 000000000000..84016c8aee6b
--- /dev/null
+++ b/kernel/watchdog_hld.c
@@ -0,0 +1,227 @@
+/*
+ * Detect hard lockups on a system
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Note: Most of this code is borrowed heavily from the original softlockup
+ * detector, so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
+ * to those contributors as well.
+ */
+
+#define pr_fmt(fmt) "NMI watchdog: " fmt
+
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <asm/irq_regs.h>
+#include <linux/perf_event.h>
+
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+
+/* boot commands */
+/*
+ * Should we panic when a soft-lockup or hard-lockup occurs:
+ */
+unsigned int __read_mostly hardlockup_panic =
+			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+static unsigned long hardlockup_allcpu_dumped;
+/*
+ * We may not want to enable hard lockup detection by default in all cases,
+ * for example when running the kernel as a guest on a hypervisor. In these
+ * cases this function can be called to disable hard lockup detection. This
+ * function should only be executed once by the boot processor before the
+ * kernel command line parameters are parsed, because otherwise it is not
+ * possible to override this in hardlockup_panic_setup().
+ */
+void hardlockup_detector_disable(void)
+{
+	watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+}
+
+static int __init hardlockup_panic_setup(char *str)
+{
+	if (!strncmp(str, "panic", 5))
+		hardlockup_panic = 1;
+	else if (!strncmp(str, "nopanic", 7))
+		hardlockup_panic = 0;
+	else if (!strncmp(str, "0", 1))
+		watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+	else if (!strncmp(str, "1", 1))
+		watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+	return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+
+void touch_nmi_watchdog(void)
+{
+	/*
+	 * Using __raw here because some code paths have
+	 * preemption enabled. If preemption is enabled
+	 * then interrupts should be enabled too, in which
+	 * case we shouldn't have to worry about the watchdog
+	 * going off.
+	 */
+	raw_cpu_write(watchdog_nmi_touch, true);
+	touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+static struct perf_event_attr wd_hw_attr = {
+	.type = PERF_TYPE_HARDWARE,
+	.config = PERF_COUNT_HW_CPU_CYCLES,
+	.size = sizeof(struct perf_event_attr),
+	.pinned = 1,
+	.disabled = 1,
+};
+
+/* Callback function for perf event subsystem */
+static void watchdog_overflow_callback(struct perf_event *event,
+		 struct perf_sample_data *data,
+		 struct pt_regs *regs)
+{
+	/* Ensure the watchdog never gets throttled */
+	event->hw.interrupts = 0;
+
+	if (__this_cpu_read(watchdog_nmi_touch) == true) {
+		__this_cpu_write(watchdog_nmi_touch, false);
+		return;
+	}
+
+	/* check for a hardlockup
+	 * This is done by making sure our timer interrupt
+	 * is incrementing. The timer interrupt should have
+	 * fired multiple times before we overflow'd. If it hasn't
+	 * then this is a good indication the cpu is stuck
+	 */
+	if (is_hardlockup()) {
+		int this_cpu = smp_processor_id();
+
+		/* only print hardlockups once */
+		if (__this_cpu_read(hard_watchdog_warn) == true)
+			return;
+
+		pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+		print_modules();
+		print_irqtrace_events(current);
+		if (regs)
+			show_regs(regs);
+		else
+			dump_stack();
+
+		/*
+		 * Perform all-CPU dump only once to avoid multiple hardlockups
+		 * generating interleaving traces
+		 */
+		if (sysctl_hardlockup_all_cpu_backtrace &&
+				!test_and_set_bit(0, &hardlockup_allcpu_dumped))
+			trigger_allbutself_cpu_backtrace();
+
+		if (hardlockup_panic)
+			nmi_panic(regs, "Hard LOCKUP");
+
+		__this_cpu_write(hard_watchdog_warn, true);
+		return;
+	}
+
+	__this_cpu_write(hard_watchdog_warn, false);
+	return;
+}
+
+/*
+ * People like the simple clean cpu node info on boot.
+ * Reduce the watchdog noise by only printing messages
+ * that are different from what cpu0 displayed.
+ */
+static unsigned long cpu0_err;
+
+int watchdog_nmi_enable(unsigned int cpu)
+{
+	struct perf_event_attr *wd_attr;
+	struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+	/* nothing to do if the hard lockup detector is disabled */
+	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+		goto out;
+
+	/* is it already setup and enabled? */
+	if (event && event->state > PERF_EVENT_STATE_OFF)
+		goto out;
+
+	/* it is setup but not enabled */
+	if (event != NULL)
+		goto out_enable;
+
+	wd_attr = &wd_hw_attr;
+	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+
+	/* Try to register using hardware perf events */
+	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+
+	/* save cpu0 error for future comparision */
+	if (cpu == 0 && IS_ERR(event))
+		cpu0_err = PTR_ERR(event);
+
+	if (!IS_ERR(event)) {
+		/* only print for cpu0 or different than cpu0 */
+		if (cpu == 0 || cpu0_err)
+			pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
+		goto out_save;
+	}
+
+	/*
+	 * Disable the hard lockup detector if _any_ CPU fails to set up
+	 * set up the hardware perf event. The watchdog() function checks
+	 * the NMI_WATCHDOG_ENABLED bit periodically.
+	 *
+	 * The barriers are for syncing up watchdog_enabled across all the
+	 * cpus, as clear_bit() does not use barriers.
+	 */
+	smp_mb__before_atomic();
+	clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
+	smp_mb__after_atomic();
+
+	/* skip displaying the same error again */
+	if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
+		return PTR_ERR(event);
+
+	/* vary the KERN level based on the returned errno */
+	if (PTR_ERR(event) == -EOPNOTSUPP)
+		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+	else if (PTR_ERR(event) == -ENOENT)
+		pr_warn("disabled (cpu%i): hardware events not enabled\n",
+			 cpu);
+	else
+		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
+			cpu, PTR_ERR(event));
+
+	pr_info("Shutting down hard lockup detector on all cpus\n");
+
+	return PTR_ERR(event);
+
+	/* success path */
+out_save:
+	per_cpu(watchdog_ev, cpu) = event;
+out_enable:
+	perf_event_enable(per_cpu(watchdog_ev, cpu));
+out:
+	return 0;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+	struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+	if (event) {
+		perf_event_disable(event);
+		per_cpu(watchdog_ev, cpu) = NULL;
+
+		/* should be in cleanup, but blocks oprofile */
+		perf_event_release_kernel(event);
+	}
+	if (cpu == 0) {
+		/* watchdog_nmi_enable() expects this to be zero initially. */
+		cpu0_err = 0;
+	}
+}