aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks9
-rw-r--r--kernel/cgroup.c58
-rw-r--r--kernel/context_tracking.c3
-rw-r--r--kernel/cpuset.c20
-rw-r--r--kernel/events/core.c39
-rw-r--r--kernel/events/uprobes.c6
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/irq/irqdesc.c4
-rw-r--r--kernel/kexec.c1
-rw-r--r--kernel/locking/mcs_spinlock.c64
-rw-r--r--kernel/locking/mcs_spinlock.h9
-rw-r--r--kernel/locking/mutex.c2
-rw-r--r--kernel/locking/rtmutex-debug.h5
-rw-r--r--kernel/locking/rtmutex.c243
-rw-r--r--kernel/locking/rtmutex.h5
-rw-r--r--kernel/locking/rwsem-spinlock.c28
-rw-r--r--kernel/locking/rwsem-xadd.c16
-rw-r--r--kernel/locking/rwsem.c2
-rw-r--r--kernel/power/hibernate.c37
-rw-r--r--kernel/power/main.c6
-rw-r--r--kernel/power/process.c1
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/power/user.c3
-rw-r--r--kernel/printk/printk.c44
-rw-r--r--kernel/rcu/tree.c140
-rw-r--r--kernel/rcu/tree.h6
-rw-r--r--kernel/rcu/tree_plugin.h2
-rw-r--r--kernel/rcu/update.c22
-rw-r--r--kernel/sched/core.c7
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/smp.c57
-rw-r--r--kernel/sysctl.c18
-rw-r--r--kernel/time/alarmtimer.c20
-rw-r--r--kernel/trace/ftrace.c4
-rw-r--r--kernel/trace/ring_buffer.c4
-rw-r--r--kernel/trace/trace.c20
-rw-r--r--kernel/trace/trace_events.c1
-rw-r--r--kernel/trace/trace_uprobe.c46
-rw-r--r--kernel/tracepoint.c26
-rw-r--r--kernel/watchdog.c41
-rw-r--r--kernel/workqueue.c3
41 files changed, 778 insertions, 252 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 35536d9c0964..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,9 +220,16 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
220 220
221endif 221endif
222 222
223config ARCH_SUPPORTS_ATOMIC_RMW
224 bool
225
223config MUTEX_SPIN_ON_OWNER 226config MUTEX_SPIN_ON_OWNER
224 def_bool y 227 def_bool y
225 depends on SMP && !DEBUG_MUTEXES 228 depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
229
230config RWSEM_SPIN_ON_OWNER
231 def_bool y
232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
226 233
227config ARCH_USE_QUEUE_RWLOCK 234config ARCH_USE_QUEUE_RWLOCK
228 bool 235 bool
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7868fc3c0bc5..70776aec2562 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1648 int flags, const char *unused_dev_name, 1648 int flags, const char *unused_dev_name,
1649 void *data) 1649 void *data)
1650{ 1650{
1651 struct super_block *pinned_sb = NULL;
1652 struct cgroup_subsys *ss;
1651 struct cgroup_root *root; 1653 struct cgroup_root *root;
1652 struct cgroup_sb_opts opts; 1654 struct cgroup_sb_opts opts;
1653 struct dentry *dentry; 1655 struct dentry *dentry;
1654 int ret; 1656 int ret;
1657 int i;
1655 bool new_sb; 1658 bool new_sb;
1656 1659
1657 /* 1660 /*
@@ -1677,6 +1680,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1677 goto out_unlock; 1680 goto out_unlock;
1678 } 1681 }
1679 1682
1683 /*
1684 * Destruction of cgroup root is asynchronous, so subsystems may
1685 * still be dying after the previous unmount. Let's drain the
1686 * dying subsystems. We just need to ensure that the ones
1687 * unmounted previously finish dying and don't care about new ones
1688 * starting. Testing ref liveliness is good enough.
1689 */
1690 for_each_subsys(ss, i) {
1691 if (!(opts.subsys_mask & (1 << i)) ||
1692 ss->root == &cgrp_dfl_root)
1693 continue;
1694
1695 if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
1696 mutex_unlock(&cgroup_mutex);
1697 msleep(10);
1698 ret = restart_syscall();
1699 goto out_free;
1700 }
1701 cgroup_put(&ss->root->cgrp);
1702 }
1703
1680 for_each_root(root) { 1704 for_each_root(root) {
1681 bool name_match = false; 1705 bool name_match = false;
1682 1706
@@ -1717,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1717 } 1741 }
1718 1742
1719 /* 1743 /*
1720 * A root's lifetime is governed by its root cgroup. 1744 * We want to reuse @root whose lifetime is governed by its
1721 * tryget_live failure indicate that the root is being 1745 * ->cgrp. Let's check whether @root is alive and keep it
1722 * destroyed. Wait for destruction to complete so that the 1746 * that way. As cgroup_kill_sb() can happen anytime, we
1723 * subsystems are free. We can use wait_queue for the wait 1747 * want to block it by pinning the sb so that @root doesn't
1724 * but this path is super cold. Let's just sleep for a bit 1748 * get killed before mount is complete.
1725 * and retry. 1749 *
1750 * With the sb pinned, tryget_live can reliably indicate
1751 * whether @root can be reused. If it's being killed,
1752 * drain it. We can use wait_queue for the wait but this
1753 * path is super cold. Let's just sleep a bit and retry.
1726 */ 1754 */
1727 if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { 1755 pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
1756 if (IS_ERR(pinned_sb) ||
1757 !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1728 mutex_unlock(&cgroup_mutex); 1758 mutex_unlock(&cgroup_mutex);
1759 if (!IS_ERR_OR_NULL(pinned_sb))
1760 deactivate_super(pinned_sb);
1729 msleep(10); 1761 msleep(10);
1730 ret = restart_syscall(); 1762 ret = restart_syscall();
1731 goto out_free; 1763 goto out_free;
@@ -1770,6 +1802,16 @@ out_free:
1770 CGROUP_SUPER_MAGIC, &new_sb); 1802 CGROUP_SUPER_MAGIC, &new_sb);
1771 if (IS_ERR(dentry) || !new_sb) 1803 if (IS_ERR(dentry) || !new_sb)
1772 cgroup_put(&root->cgrp); 1804 cgroup_put(&root->cgrp);
1805
1806 /*
1807 * If @pinned_sb, we're reusing an existing root and holding an
1808 * extra ref on its sb. Mount is complete. Put the extra ref.
1809 */
1810 if (pinned_sb) {
1811 WARN_ON(new_sb);
1812 deactivate_super(pinned_sb);
1813 }
1814
1773 return dentry; 1815 return dentry;
1774} 1816}
1775 1817
@@ -3328,7 +3370,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
3328 3370
3329 rcu_read_lock(); 3371 rcu_read_lock();
3330 css_for_each_child(child, css) { 3372 css_for_each_child(child, css) {
3331 if (css->flags & CSS_ONLINE) { 3373 if (child->flags & CSS_ONLINE) {
3332 ret = true; 3374 ret = true;
3333 break; 3375 break;
3334 } 3376 }
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 019d45008448..5664985c46a0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -19,6 +19,7 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/kprobes.h>
22 23
23#define CREATE_TRACE_POINTS 24#define CREATE_TRACE_POINTS
24#include <trace/events/context_tracking.h> 25#include <trace/events/context_tracking.h>
@@ -104,6 +105,7 @@ void context_tracking_user_enter(void)
104 } 105 }
105 local_irq_restore(flags); 106 local_irq_restore(flags);
106} 107}
108NOKPROBE_SYMBOL(context_tracking_user_enter);
107 109
108#ifdef CONFIG_PREEMPT 110#ifdef CONFIG_PREEMPT
109/** 111/**
@@ -181,6 +183,7 @@ void context_tracking_user_exit(void)
181 } 183 }
182 local_irq_restore(flags); 184 local_irq_restore(flags);
183} 185}
186NOKPROBE_SYMBOL(context_tracking_user_exit);
184 187
185/** 188/**
186 * __context_tracking_task_switch - context switch the syscall callbacks 189 * __context_tracking_task_switch - context switch the syscall callbacks
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f6b33c696224..116a4164720a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1181,7 +1181,13 @@ done:
1181 1181
1182int current_cpuset_is_being_rebound(void) 1182int current_cpuset_is_being_rebound(void)
1183{ 1183{
1184 return task_cs(current) == cpuset_being_rebound; 1184 int ret;
1185
1186 rcu_read_lock();
1187 ret = task_cs(current) == cpuset_being_rebound;
1188 rcu_read_unlock();
1189
1190 return ret;
1185} 1191}
1186 1192
1187static int update_relax_domain_level(struct cpuset *cs, s64 val) 1193static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1617,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1617 * resources, wait for the previously scheduled operations before 1623 * resources, wait for the previously scheduled operations before
1618 * proceeding, so that we don't end up keep removing tasks added 1624 * proceeding, so that we don't end up keep removing tasks added
1619 * after execution capability is restored. 1625 * after execution capability is restored.
1626 *
1627 * cpuset_hotplug_work calls back into cgroup core via
1628 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
1629 * operation like this one can lead to a deadlock through kernfs
1630 * active_ref protection. Let's break the protection. Losing the
1631 * protection is okay as we check whether @cs is online after
1632 * grabbing cpuset_mutex anyway. This only happens on the legacy
1633 * hierarchies.
1620 */ 1634 */
1635 css_get(&cs->css);
1636 kernfs_break_active_protection(of->kn);
1621 flush_work(&cpuset_hotplug_work); 1637 flush_work(&cpuset_hotplug_work);
1622 1638
1623 mutex_lock(&cpuset_mutex); 1639 mutex_lock(&cpuset_mutex);
@@ -1645,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1645 free_trial_cpuset(trialcs); 1661 free_trial_cpuset(trialcs);
1646out_unlock: 1662out_unlock:
1647 mutex_unlock(&cpuset_mutex); 1663 mutex_unlock(&cpuset_mutex);
1664 kernfs_unbreak_active_protection(of->kn);
1665 css_put(&cs->css);
1648 return retval ?: nbytes; 1666 return retval ?: nbytes;
1649} 1667}
1650 1668
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5fa58e4cffac..b0c95f0f06fd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -40,6 +40,7 @@
40#include <linux/mm_types.h> 40#include <linux/mm_types.h>
41#include <linux/cgroup.h> 41#include <linux/cgroup.h>
42#include <linux/module.h> 42#include <linux/module.h>
43#include <linux/mman.h>
43 44
44#include "internal.h" 45#include "internal.h"
45 46
@@ -2319,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2319 next_parent = rcu_dereference(next_ctx->parent_ctx); 2320 next_parent = rcu_dereference(next_ctx->parent_ctx);
2320 2321
2321 /* If neither context have a parent context; they cannot be clones. */ 2322 /* If neither context have a parent context; they cannot be clones. */
2322 if (!parent && !next_parent) 2323 if (!parent || !next_parent)
2323 goto unlock; 2324 goto unlock;
2324 2325
2325 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2326 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -5128,6 +5129,7 @@ struct perf_mmap_event {
5128 int maj, min; 5129 int maj, min;
5129 u64 ino; 5130 u64 ino;
5130 u64 ino_generation; 5131 u64 ino_generation;
5132 u32 prot, flags;
5131 5133
5132 struct { 5134 struct {
5133 struct perf_event_header header; 5135 struct perf_event_header header;
@@ -5169,6 +5171,8 @@ static void perf_event_mmap_output(struct perf_event *event,
5169 mmap_event->event_id.header.size += sizeof(mmap_event->min); 5171 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5170 mmap_event->event_id.header.size += sizeof(mmap_event->ino); 5172 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5171 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); 5173 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5174 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5175 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5172 } 5176 }
5173 5177
5174 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 5178 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
@@ -5187,6 +5191,8 @@ static void perf_event_mmap_output(struct perf_event *event,
5187 perf_output_put(&handle, mmap_event->min); 5191 perf_output_put(&handle, mmap_event->min);
5188 perf_output_put(&handle, mmap_event->ino); 5192 perf_output_put(&handle, mmap_event->ino);
5189 perf_output_put(&handle, mmap_event->ino_generation); 5193 perf_output_put(&handle, mmap_event->ino_generation);
5194 perf_output_put(&handle, mmap_event->prot);
5195 perf_output_put(&handle, mmap_event->flags);
5190 } 5196 }
5191 5197
5192 __output_copy(&handle, mmap_event->file_name, 5198 __output_copy(&handle, mmap_event->file_name,
@@ -5205,6 +5211,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5205 struct file *file = vma->vm_file; 5211 struct file *file = vma->vm_file;
5206 int maj = 0, min = 0; 5212 int maj = 0, min = 0;
5207 u64 ino = 0, gen = 0; 5213 u64 ino = 0, gen = 0;
5214 u32 prot = 0, flags = 0;
5208 unsigned int size; 5215 unsigned int size;
5209 char tmp[16]; 5216 char tmp[16];
5210 char *buf = NULL; 5217 char *buf = NULL;
@@ -5235,6 +5242,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5235 gen = inode->i_generation; 5242 gen = inode->i_generation;
5236 maj = MAJOR(dev); 5243 maj = MAJOR(dev);
5237 min = MINOR(dev); 5244 min = MINOR(dev);
5245
5246 if (vma->vm_flags & VM_READ)
5247 prot |= PROT_READ;
5248 if (vma->vm_flags & VM_WRITE)
5249 prot |= PROT_WRITE;
5250 if (vma->vm_flags & VM_EXEC)
5251 prot |= PROT_EXEC;
5252
5253 if (vma->vm_flags & VM_MAYSHARE)
5254 flags = MAP_SHARED;
5255 else
5256 flags = MAP_PRIVATE;
5257
5258 if (vma->vm_flags & VM_DENYWRITE)
5259 flags |= MAP_DENYWRITE;
5260 if (vma->vm_flags & VM_MAYEXEC)
5261 flags |= MAP_EXECUTABLE;
5262 if (vma->vm_flags & VM_LOCKED)
5263 flags |= MAP_LOCKED;
5264 if (vma->vm_flags & VM_HUGETLB)
5265 flags |= MAP_HUGETLB;
5266
5238 goto got_name; 5267 goto got_name;
5239 } else { 5268 } else {
5240 name = (char *)arch_vma_name(vma); 5269 name = (char *)arch_vma_name(vma);
@@ -5275,6 +5304,8 @@ got_name:
5275 mmap_event->min = min; 5304 mmap_event->min = min;
5276 mmap_event->ino = ino; 5305 mmap_event->ino = ino;
5277 mmap_event->ino_generation = gen; 5306 mmap_event->ino_generation = gen;
5307 mmap_event->prot = prot;
5308 mmap_event->flags = flags;
5278 5309
5279 if (!(vma->vm_flags & VM_EXEC)) 5310 if (!(vma->vm_flags & VM_EXEC))
5280 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 5311 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
@@ -5315,6 +5346,8 @@ void perf_event_mmap(struct vm_area_struct *vma)
5315 /* .min (attr_mmap2 only) */ 5346 /* .min (attr_mmap2 only) */
5316 /* .ino (attr_mmap2 only) */ 5347 /* .ino (attr_mmap2 only) */
5317 /* .ino_generation (attr_mmap2 only) */ 5348 /* .ino_generation (attr_mmap2 only) */
5349 /* .prot (attr_mmap2 only) */
5350 /* .flags (attr_mmap2 only) */
5318 }; 5351 };
5319 5352
5320 perf_event_mmap_event(&mmap_event); 5353 perf_event_mmap_event(&mmap_event);
@@ -6897,10 +6930,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6897 if (ret) 6930 if (ret)
6898 return -EFAULT; 6931 return -EFAULT;
6899 6932
6900 /* disabled for now */
6901 if (attr->mmap2)
6902 return -EINVAL;
6903
6904 if (attr->__reserved_1) 6933 if (attr->__reserved_1)
6905 return -EINVAL; 6934 return -EINVAL;
6906 6935
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c445e392e93f..6f3254e8c137 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -846,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
846{ 846{
847 int err; 847 int err;
848 848
849 if (!consumer_del(uprobe, uc)) /* WARN? */ 849 if (WARN_ON(!consumer_del(uprobe, uc)))
850 return; 850 return;
851 851
852 err = register_for_each_vma(uprobe, NULL); 852 err = register_for_each_vma(uprobe, NULL);
@@ -927,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
927 int ret = -ENOENT; 927 int ret = -ENOENT;
928 928
929 uprobe = find_uprobe(inode, offset); 929 uprobe = find_uprobe(inode, offset);
930 if (!uprobe) 930 if (WARN_ON(!uprobe))
931 return ret; 931 return ret;
932 932
933 down_write(&uprobe->register_rwsem); 933 down_write(&uprobe->register_rwsem);
@@ -952,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
952 struct uprobe *uprobe; 952 struct uprobe *uprobe;
953 953
954 uprobe = find_uprobe(inode, offset); 954 uprobe = find_uprobe(inode, offset);
955 if (!uprobe) 955 if (WARN_ON(!uprobe))
956 return; 956 return;
957 957
958 down_write(&uprobe->register_rwsem); 958 down_write(&uprobe->register_rwsem);
diff --git a/kernel/fork.c b/kernel/fork.c
index d2799d1fc952..6a13c46cd87d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1487 1487
1488 total_forks++; 1488 total_forks++;
1489 spin_unlock(&current->sighand->siglock); 1489 spin_unlock(&current->sighand->siglock);
1490 syscall_tracepoint_update(p);
1490 write_unlock_irq(&tasklist_lock); 1491 write_unlock_irq(&tasklist_lock);
1492
1491 proc_fork_connector(p); 1493 proc_fork_connector(p);
1492 cgroup_post_fork(p); 1494 cgroup_post_fork(p);
1493 if (clone_flags & CLONE_THREAD) 1495 if (clone_flags & CLONE_THREAD)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 7339e42a85ab..1487a123db5c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -455,9 +455,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
455 */ 455 */
456void irq_free_hwirqs(unsigned int from, int cnt) 456void irq_free_hwirqs(unsigned int from, int cnt)
457{ 457{
458 int i; 458 int i, j;
459 459
460 for (i = from; cnt > 0; i++, cnt--) { 460 for (i = from, j = cnt; j > 0; i++, j--) {
461 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); 461 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
462 arch_teardown_hwirq(i); 462 arch_teardown_hwirq(i);
463 } 463 }
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6748688813d0..369f41a94124 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1617#ifdef CONFIG_MEMORY_FAILURE 1617#ifdef CONFIG_MEMORY_FAILURE
1618 VMCOREINFO_NUMBER(PG_hwpoison); 1618 VMCOREINFO_NUMBER(PG_hwpoison);
1619#endif 1619#endif
1620 VMCOREINFO_NUMBER(PG_head_mask);
1620 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); 1621 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1621 1622
1622 arch_crash_save_vmcoreinfo(); 1623 arch_crash_save_vmcoreinfo();
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..be9ee1559fca 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,21 +14,47 @@
14 * called from interrupt context and we have preemption disabled while 14 * called from interrupt context and we have preemption disabled while
15 * spinning. 15 * spinning.
16 */ 16 */
17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); 17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
18
19/*
20 * We use the value 0 to represent "no CPU", thus the encoded value
21 * will be the CPU number incremented by 1.
22 */
23static inline int encode_cpu(int cpu_nr)
24{
25 return cpu_nr + 1;
26}
27
28static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
29{
30 int cpu_nr = encoded_cpu_val - 1;
31
32 return per_cpu_ptr(&osq_node, cpu_nr);
33}
18 34
19/* 35/*
20 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. 36 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
21 * Can return NULL in case we were the last queued and we updated @lock instead. 37 * Can return NULL in case we were the last queued and we updated @lock instead.
22 */ 38 */
23static inline struct optimistic_spin_queue * 39static inline struct optimistic_spin_node *
24osq_wait_next(struct optimistic_spin_queue **lock, 40osq_wait_next(struct optimistic_spin_queue *lock,
25 struct optimistic_spin_queue *node, 41 struct optimistic_spin_node *node,
26 struct optimistic_spin_queue *prev) 42 struct optimistic_spin_node *prev)
27{ 43{
28 struct optimistic_spin_queue *next = NULL; 44 struct optimistic_spin_node *next = NULL;
45 int curr = encode_cpu(smp_processor_id());
46 int old;
47
48 /*
49 * If there is a prev node in queue, then the 'old' value will be
50 * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
51 * we're currently last in queue, then the queue will then become empty.
52 */
53 old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
29 54
30 for (;;) { 55 for (;;) {
31 if (*lock == node && cmpxchg(lock, node, prev) == node) { 56 if (atomic_read(&lock->tail) == curr &&
57 atomic_cmpxchg(&lock->tail, curr, old) == curr) {
32 /* 58 /*
33 * We were the last queued, we moved @lock back. @prev 59 * We were the last queued, we moved @lock back. @prev
34 * will now observe @lock and will complete its 60 * will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_queue **lock,
59 return next; 85 return next;
60} 86}
61 87
62bool osq_lock(struct optimistic_spin_queue **lock) 88bool osq_lock(struct optimistic_spin_queue *lock)
63{ 89{
64 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 90 struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
65 struct optimistic_spin_queue *prev, *next; 91 struct optimistic_spin_node *prev, *next;
92 int curr = encode_cpu(smp_processor_id());
93 int old;
66 94
67 node->locked = 0; 95 node->locked = 0;
68 node->next = NULL; 96 node->next = NULL;
97 node->cpu = curr;
69 98
70 node->prev = prev = xchg(lock, node); 99 old = atomic_xchg(&lock->tail, curr);
71 if (likely(prev == NULL)) 100 if (old == OSQ_UNLOCKED_VAL)
72 return true; 101 return true;
73 102
103 prev = decode_cpu(old);
104 node->prev = prev;
74 ACCESS_ONCE(prev->next) = node; 105 ACCESS_ONCE(prev->next) = node;
75 106
76 /* 107 /*
@@ -149,20 +180,21 @@ unqueue:
149 return false; 180 return false;
150} 181}
151 182
152void osq_unlock(struct optimistic_spin_queue **lock) 183void osq_unlock(struct optimistic_spin_queue *lock)
153{ 184{
154 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 185 struct optimistic_spin_node *node, *next;
155 struct optimistic_spin_queue *next; 186 int curr = encode_cpu(smp_processor_id());
156 187
157 /* 188 /*
158 * Fast path for the uncontended case. 189 * Fast path for the uncontended case.
159 */ 190 */
160 if (likely(cmpxchg(lock, node, NULL) == node)) 191 if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
161 return; 192 return;
162 193
163 /* 194 /*
164 * Second most likely case. 195 * Second most likely case.
165 */ 196 */
197 node = this_cpu_ptr(&osq_node);
166 next = xchg(&node->next, NULL); 198 next = xchg(&node->next, NULL);
167 if (next) { 199 if (next) {
168 ACCESS_ONCE(next->locked) = 1; 200 ACCESS_ONCE(next->locked) = 1;
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..74356dc0ce29 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
118 * mutex_lock()/rwsem_down_{read,write}() etc. 118 * mutex_lock()/rwsem_down_{read,write}() etc.
119 */ 119 */
120 120
121struct optimistic_spin_queue { 121struct optimistic_spin_node {
122 struct optimistic_spin_queue *next, *prev; 122 struct optimistic_spin_node *next, *prev;
123 int locked; /* 1 if lock acquired */ 123 int locked; /* 1 if lock acquired */
124 int cpu; /* encoded CPU # value */
124}; 125};
125 126
126extern bool osq_lock(struct optimistic_spin_queue **lock); 127extern bool osq_lock(struct optimistic_spin_queue *lock);
127extern void osq_unlock(struct optimistic_spin_queue **lock); 128extern void osq_unlock(struct optimistic_spin_queue *lock);
128 129
129#endif /* __LINUX_MCS_SPINLOCK_H */ 130#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..acca2c1a3c5e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
60 INIT_LIST_HEAD(&lock->wait_list); 60 INIT_LIST_HEAD(&lock->wait_list);
61 mutex_clear_owner(lock); 61 mutex_clear_owner(lock);
62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
63 lock->osq = NULL; 63 osq_lock_init(&lock->osq);
64#endif 64#endif
65 65
66 debug_mutex_init(lock, name, key); 66 debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d78..ab29b6a22669 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
31{ 31{
32 return (waiter != NULL); 32 return (waiter != NULL);
33} 33}
34
35static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
36{
37 debug_rt_mutex_print_deadlock(w);
38}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a620d4d08ca6..fc605941b9b8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
83 owner = *p; 83 owner = *p;
84 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); 84 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
85} 85}
86
87/*
88 * Safe fastpath aware unlock:
89 * 1) Clear the waiters bit
90 * 2) Drop lock->wait_lock
91 * 3) Try to unlock the lock with cmpxchg
92 */
93static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
94 __releases(lock->wait_lock)
95{
96 struct task_struct *owner = rt_mutex_owner(lock);
97
98 clear_rt_mutex_waiters(lock);
99 raw_spin_unlock(&lock->wait_lock);
100 /*
101 * If a new waiter comes in between the unlock and the cmpxchg
102 * we have two situations:
103 *
104 * unlock(wait_lock);
105 * lock(wait_lock);
106 * cmpxchg(p, owner, 0) == owner
107 * mark_rt_mutex_waiters(lock);
108 * acquire(lock);
109 * or:
110 *
111 * unlock(wait_lock);
112 * lock(wait_lock);
113 * mark_rt_mutex_waiters(lock);
114 *
115 * cmpxchg(p, owner, 0) != owner
116 * enqueue_waiter();
117 * unlock(wait_lock);
118 * lock(wait_lock);
119 * wake waiter();
120 * unlock(wait_lock);
121 * lock(wait_lock);
122 * acquire(lock);
123 */
124 return rt_mutex_cmpxchg(lock, owner, NULL);
125}
126
86#else 127#else
87# define rt_mutex_cmpxchg(l,c,n) (0) 128# define rt_mutex_cmpxchg(l,c,n) (0)
88static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) 129static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
@@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
90 lock->owner = (struct task_struct *) 131 lock->owner = (struct task_struct *)
91 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); 132 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
92} 133}
134
135/*
136 * Simple slow path only version: lock->owner is protected by lock->wait_lock.
137 */
138static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
139 __releases(lock->wait_lock)
140{
141 lock->owner = NULL;
142 raw_spin_unlock(&lock->wait_lock);
143 return true;
144}
93#endif 145#endif
94 146
95static inline int 147static inline int
@@ -260,27 +312,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
260 */ 312 */
261int max_lock_depth = 1024; 313int max_lock_depth = 1024;
262 314
315static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
316{
317 return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
318}
319
263/* 320/*
264 * Adjust the priority chain. Also used for deadlock detection. 321 * Adjust the priority chain. Also used for deadlock detection.
265 * Decreases task's usage by one - may thus free the task. 322 * Decreases task's usage by one - may thus free the task.
266 * 323 *
267 * @task: the task owning the mutex (owner) for which a chain walk is probably 324 * @task: the task owning the mutex (owner) for which a chain walk is
268 * needed 325 * probably needed
269 * @deadlock_detect: do we have to carry out deadlock detection? 326 * @deadlock_detect: do we have to carry out deadlock detection?
270 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck 327 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
271 * things for a task that has just got its priority adjusted, and 328 * things for a task that has just got its priority adjusted, and
272 * is waiting on a mutex) 329 * is waiting on a mutex)
330 * @next_lock: the mutex on which the owner of @orig_lock was blocked before
331 * we dropped its pi_lock. Is never dereferenced, only used for
332 * comparison to detect lock chain changes.
273 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated 333 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
274 * its priority to the mutex owner (can be NULL in the case 334 * its priority to the mutex owner (can be NULL in the case
275 * depicted above or if the top waiter is gone away and we are 335 * depicted above or if the top waiter is gone away and we are
276 * actually deboosting the owner) 336 * actually deboosting the owner)
277 * @top_task: the current top waiter 337 * @top_task: the current top waiter
278 * 338 *
279 * Returns 0 or -EDEADLK. 339 * Returns 0 or -EDEADLK.
280 */ 340 */
281static int rt_mutex_adjust_prio_chain(struct task_struct *task, 341static int rt_mutex_adjust_prio_chain(struct task_struct *task,
282 int deadlock_detect, 342 int deadlock_detect,
283 struct rt_mutex *orig_lock, 343 struct rt_mutex *orig_lock,
344 struct rt_mutex *next_lock,
284 struct rt_mutex_waiter *orig_waiter, 345 struct rt_mutex_waiter *orig_waiter,
285 struct task_struct *top_task) 346 struct task_struct *top_task)
286{ 347{
@@ -314,7 +375,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
314 } 375 }
315 put_task_struct(task); 376 put_task_struct(task);
316 377
317 return deadlock_detect ? -EDEADLK : 0; 378 return -EDEADLK;
318 } 379 }
319 retry: 380 retry:
320 /* 381 /*
@@ -339,6 +400,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
339 goto out_unlock_pi; 400 goto out_unlock_pi;
340 401
341 /* 402 /*
403 * We dropped all locks after taking a refcount on @task, so
404 * the task might have moved on in the lock chain or even left
405 * the chain completely and blocks now on an unrelated lock or
406 * on @orig_lock.
407 *
408 * We stored the lock on which @task was blocked in @next_lock,
409 * so we can detect the chain change.
410 */
411 if (next_lock != waiter->lock)
412 goto out_unlock_pi;
413
414 /*
342 * Drop out, when the task has no waiters. Note, 415 * Drop out, when the task has no waiters. Note,
343 * top_waiter can be NULL, when we are in the deboosting 416 * top_waiter can be NULL, when we are in the deboosting
344 * mode! 417 * mode!
@@ -377,7 +450,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
377 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 450 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
378 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 451 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
379 raw_spin_unlock(&lock->wait_lock); 452 raw_spin_unlock(&lock->wait_lock);
380 ret = deadlock_detect ? -EDEADLK : 0; 453 ret = -EDEADLK;
381 goto out_unlock_pi; 454 goto out_unlock_pi;
382 } 455 }
383 456
@@ -422,11 +495,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
422 __rt_mutex_adjust_prio(task); 495 __rt_mutex_adjust_prio(task);
423 } 496 }
424 497
498 /*
499 * Check whether the task which owns the current lock is pi
500 * blocked itself. If yes we store a pointer to the lock for
501 * the lock chain change detection above. After we dropped
502 * task->pi_lock next_lock cannot be dereferenced anymore.
503 */
504 next_lock = task_blocked_on_lock(task);
505
425 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 506 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
426 507
427 top_waiter = rt_mutex_top_waiter(lock); 508 top_waiter = rt_mutex_top_waiter(lock);
428 raw_spin_unlock(&lock->wait_lock); 509 raw_spin_unlock(&lock->wait_lock);
429 510
511 /*
512 * We reached the end of the lock chain. Stop right here. No
513 * point to go back just to figure that out.
514 */
515 if (!next_lock)
516 goto out_put_task;
517
430 if (!detect_deadlock && waiter != top_waiter) 518 if (!detect_deadlock && waiter != top_waiter)
431 goto out_put_task; 519 goto out_put_task;
432 520
@@ -536,8 +624,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
536{ 624{
537 struct task_struct *owner = rt_mutex_owner(lock); 625 struct task_struct *owner = rt_mutex_owner(lock);
538 struct rt_mutex_waiter *top_waiter = waiter; 626 struct rt_mutex_waiter *top_waiter = waiter;
539 unsigned long flags; 627 struct rt_mutex *next_lock;
540 int chain_walk = 0, res; 628 int chain_walk = 0, res;
629 unsigned long flags;
541 630
542 /* 631 /*
543 * Early deadlock detection. We really don't want the task to 632 * Early deadlock detection. We really don't want the task to
@@ -548,7 +637,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
548 * which is wrong, as the other waiter is not in a deadlock 637 * which is wrong, as the other waiter is not in a deadlock
549 * situation. 638 * situation.
550 */ 639 */
551 if (detect_deadlock && owner == task) 640 if (owner == task)
552 return -EDEADLK; 641 return -EDEADLK;
553 642
554 raw_spin_lock_irqsave(&task->pi_lock, flags); 643 raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -569,20 +658,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
569 if (!owner) 658 if (!owner)
570 return 0; 659 return 0;
571 660
661 raw_spin_lock_irqsave(&owner->pi_lock, flags);
572 if (waiter == rt_mutex_top_waiter(lock)) { 662 if (waiter == rt_mutex_top_waiter(lock)) {
573 raw_spin_lock_irqsave(&owner->pi_lock, flags);
574 rt_mutex_dequeue_pi(owner, top_waiter); 663 rt_mutex_dequeue_pi(owner, top_waiter);
575 rt_mutex_enqueue_pi(owner, waiter); 664 rt_mutex_enqueue_pi(owner, waiter);
576 665
577 __rt_mutex_adjust_prio(owner); 666 __rt_mutex_adjust_prio(owner);
578 if (owner->pi_blocked_on) 667 if (owner->pi_blocked_on)
579 chain_walk = 1; 668 chain_walk = 1;
580 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 669 } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
581 }
582 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
583 chain_walk = 1; 670 chain_walk = 1;
671 }
584 672
585 if (!chain_walk) 673 /* Store the lock on which owner is blocked or NULL */
674 next_lock = task_blocked_on_lock(owner);
675
676 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
677 /*
678 * Even if full deadlock detection is on, if the owner is not
679 * blocked itself, we can avoid finding this out in the chain
680 * walk.
681 */
682 if (!chain_walk || !next_lock)
586 return 0; 683 return 0;
587 684
588 /* 685 /*
@@ -594,8 +691,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
594 691
595 raw_spin_unlock(&lock->wait_lock); 692 raw_spin_unlock(&lock->wait_lock);
596 693
597 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 694 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
598 task); 695 next_lock, waiter, task);
599 696
600 raw_spin_lock(&lock->wait_lock); 697 raw_spin_lock(&lock->wait_lock);
601 698
@@ -605,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
605/* 702/*
606 * Wake up the next waiter on the lock. 703 * Wake up the next waiter on the lock.
607 * 704 *
608 * Remove the top waiter from the current tasks waiter list and wake it up. 705 * Remove the top waiter from the current tasks pi waiter list and
706 * wake it up.
609 * 707 *
610 * Called with lock->wait_lock held. 708 * Called with lock->wait_lock held.
611 */ 709 */
@@ -626,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
626 */ 724 */
627 rt_mutex_dequeue_pi(current, waiter); 725 rt_mutex_dequeue_pi(current, waiter);
628 726
629 rt_mutex_set_owner(lock, NULL); 727 /*
728 * As we are waking up the top waiter, and the waiter stays
729 * queued on the lock until it gets the lock, this lock
730 * obviously has waiters. Just set the bit here and this has
731 * the added benefit of forcing all new tasks into the
732 * slow path making sure no task of lower priority than
733 * the top waiter can steal this lock.
734 */
735 lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
630 736
631 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 737 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
632 738
739 /*
740 * It's safe to dereference waiter as it cannot go away as
741 * long as we hold lock->wait_lock. The waiter task needs to
742 * acquire it in order to dequeue the waiter.
743 */
633 wake_up_process(waiter->task); 744 wake_up_process(waiter->task);
634} 745}
635 746
@@ -644,8 +755,8 @@ static void remove_waiter(struct rt_mutex *lock,
644{ 755{
645 int first = (waiter == rt_mutex_top_waiter(lock)); 756 int first = (waiter == rt_mutex_top_waiter(lock));
646 struct task_struct *owner = rt_mutex_owner(lock); 757 struct task_struct *owner = rt_mutex_owner(lock);
758 struct rt_mutex *next_lock = NULL;
647 unsigned long flags; 759 unsigned long flags;
648 int chain_walk = 0;
649 760
650 raw_spin_lock_irqsave(&current->pi_lock, flags); 761 raw_spin_lock_irqsave(&current->pi_lock, flags);
651 rt_mutex_dequeue(lock, waiter); 762 rt_mutex_dequeue(lock, waiter);
@@ -669,13 +780,13 @@ static void remove_waiter(struct rt_mutex *lock,
669 } 780 }
670 __rt_mutex_adjust_prio(owner); 781 __rt_mutex_adjust_prio(owner);
671 782
672 if (owner->pi_blocked_on) 783 /* Store the lock on which owner is blocked or NULL */
673 chain_walk = 1; 784 next_lock = task_blocked_on_lock(owner);
674 785
675 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 786 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
676 } 787 }
677 788
678 if (!chain_walk) 789 if (!next_lock)
679 return; 790 return;
680 791
681 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 792 /* gets dropped in rt_mutex_adjust_prio_chain()! */
@@ -683,7 +794,7 @@ static void remove_waiter(struct rt_mutex *lock,
683 794
684 raw_spin_unlock(&lock->wait_lock); 795 raw_spin_unlock(&lock->wait_lock);
685 796
686 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 797 rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
687 798
688 raw_spin_lock(&lock->wait_lock); 799 raw_spin_lock(&lock->wait_lock);
689} 800}
@@ -696,6 +807,7 @@ static void remove_waiter(struct rt_mutex *lock,
696void rt_mutex_adjust_pi(struct task_struct *task) 807void rt_mutex_adjust_pi(struct task_struct *task)
697{ 808{
698 struct rt_mutex_waiter *waiter; 809 struct rt_mutex_waiter *waiter;
810 struct rt_mutex *next_lock;
699 unsigned long flags; 811 unsigned long flags;
700 812
701 raw_spin_lock_irqsave(&task->pi_lock, flags); 813 raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -706,12 +818,13 @@ void rt_mutex_adjust_pi(struct task_struct *task)
706 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 818 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
707 return; 819 return;
708 } 820 }
709 821 next_lock = waiter->lock;
710 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 822 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
711 823
712 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 824 /* gets dropped in rt_mutex_adjust_prio_chain()! */
713 get_task_struct(task); 825 get_task_struct(task);
714 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 826
827 rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
715} 828}
716 829
717/** 830/**
@@ -763,6 +876,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
763 return ret; 876 return ret;
764} 877}
765 878
879static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
880 struct rt_mutex_waiter *w)
881{
882 /*
883 * If the result is not -EDEADLOCK or the caller requested
884 * deadlock detection, nothing to do here.
885 */
886 if (res != -EDEADLOCK || detect_deadlock)
887 return;
888
889 /*
890 * Yell loudly and stop the task right here.
891 */
892 rt_mutex_print_deadlock(w);
893 while (1) {
894 set_current_state(TASK_INTERRUPTIBLE);
895 schedule();
896 }
897}
898
766/* 899/*
767 * Slow path lock function: 900 * Slow path lock function:
768 */ 901 */
@@ -802,8 +935,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
802 935
803 set_current_state(TASK_RUNNING); 936 set_current_state(TASK_RUNNING);
804 937
805 if (unlikely(ret)) 938 if (unlikely(ret)) {
806 remove_waiter(lock, &waiter); 939 remove_waiter(lock, &waiter);
940 rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
941 }
807 942
808 /* 943 /*
809 * try_to_take_rt_mutex() sets the waiter bit 944 * try_to_take_rt_mutex() sets the waiter bit
@@ -859,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
859 994
860 rt_mutex_deadlock_account_unlock(current); 995 rt_mutex_deadlock_account_unlock(current);
861 996
862 if (!rt_mutex_has_waiters(lock)) { 997 /*
863 lock->owner = NULL; 998 * We must be careful here if the fast path is enabled. If we
864 raw_spin_unlock(&lock->wait_lock); 999 * have no waiters queued we cannot set owner to NULL here
865 return; 1000 * because of:
1001 *
1002 * foo->lock->owner = NULL;
1003 * rtmutex_lock(foo->lock); <- fast path
1004 * free = atomic_dec_and_test(foo->refcnt);
1005 * rtmutex_unlock(foo->lock); <- fast path
1006 * if (free)
1007 * kfree(foo);
1008 * raw_spin_unlock(foo->lock->wait_lock);
1009 *
1010 * So for the fastpath enabled kernel:
1011 *
1012 * Nothing can set the waiters bit as long as we hold
1013 * lock->wait_lock. So we do the following sequence:
1014 *
1015 * owner = rt_mutex_owner(lock);
1016 * clear_rt_mutex_waiters(lock);
1017 * raw_spin_unlock(&lock->wait_lock);
1018 * if (cmpxchg(&lock->owner, owner, 0) == owner)
1019 * return;
1020 * goto retry;
1021 *
1022 * The fastpath disabled variant is simple as all access to
1023 * lock->owner is serialized by lock->wait_lock:
1024 *
1025 * lock->owner = NULL;
1026 * raw_spin_unlock(&lock->wait_lock);
1027 */
1028 while (!rt_mutex_has_waiters(lock)) {
1029 /* Drops lock->wait_lock ! */
1030 if (unlock_rt_mutex_safe(lock) == true)
1031 return;
1032 /* Relock the rtmutex and try again */
1033 raw_spin_lock(&lock->wait_lock);
866 } 1034 }
867 1035
1036 /*
1037 * The wakeup next waiter path does not suffer from the above
1038 * race. See the comments there.
1039 */
868 wakeup_next_waiter(lock); 1040 wakeup_next_waiter(lock);
869 1041
870 raw_spin_unlock(&lock->wait_lock); 1042 raw_spin_unlock(&lock->wait_lock);
@@ -1112,7 +1284,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1112 return 1; 1284 return 1;
1113 } 1285 }
1114 1286
1115 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 1287 /* We enforce deadlock detection for futexes */
1288 ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
1116 1289
1117 if (ret && !rt_mutex_owner(lock)) { 1290 if (ret && !rt_mutex_owner(lock)) {
1118 /* 1291 /*
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421d..f6a1f3c133b1 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -24,3 +24,8 @@
24#define debug_rt_mutex_print_deadlock(w) do { } while (0) 24#define debug_rt_mutex_print_deadlock(w) do { } while (0)
25#define debug_rt_mutex_detect_deadlock(w,d) (d) 25#define debug_rt_mutex_detect_deadlock(w,d) (d)
26#define debug_rt_mutex_reset_waiter(w) do { } while (0) 26#define debug_rt_mutex_reset_waiter(w) do { } while (0)
27
28static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
29{
30 WARN(1, "rtmutex deadlock detected\n");
31}
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
26 unsigned long flags; 26 unsigned long flags;
27 27
28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { 28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
29 ret = (sem->activity != 0); 29 ret = (sem->count != 0);
30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
31 } 31 }
32 return ret; 32 return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
46 debug_check_no_locks_freed((void *)sem, sizeof(*sem)); 46 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
47 lockdep_init_map(&sem->dep_map, name, key, 0); 47 lockdep_init_map(&sem->dep_map, name, key, 0);
48#endif 48#endif
49 sem->activity = 0; 49 sem->count = 0;
50 raw_spin_lock_init(&sem->wait_lock); 50 raw_spin_lock_init(&sem->wait_lock);
51 INIT_LIST_HEAD(&sem->wait_list); 51 INIT_LIST_HEAD(&sem->wait_list);
52} 52}
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
95 waiter = list_entry(next, struct rwsem_waiter, list); 95 waiter = list_entry(next, struct rwsem_waiter, list);
96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE); 96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
97 97
98 sem->activity += woken; 98 sem->count += woken;
99 99
100 out: 100 out:
101 return sem; 101 return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
126 126
127 raw_spin_lock_irqsave(&sem->wait_lock, flags); 127 raw_spin_lock_irqsave(&sem->wait_lock, flags);
128 128
129 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 129 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
130 /* granted */ 130 /* granted */
131 sem->activity++; 131 sem->count++;
132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
133 goto out; 133 goto out;
134 } 134 }
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
170 170
171 raw_spin_lock_irqsave(&sem->wait_lock, flags); 171 raw_spin_lock_irqsave(&sem->wait_lock, flags);
172 172
173 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 173 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
174 /* granted */ 174 /* granted */
175 sem->activity++; 175 sem->count++;
176 ret = 1; 176 ret = 1;
177 } 177 }
178 178
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
206 * itself into sleep and waiting for system woke it or someone 206 * itself into sleep and waiting for system woke it or someone
207 * else in the head of the wait list up. 207 * else in the head of the wait list up.
208 */ 208 */
209 if (sem->activity == 0) 209 if (sem->count == 0)
210 break; 210 break;
211 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 211 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
214 raw_spin_lock_irqsave(&sem->wait_lock, flags); 214 raw_spin_lock_irqsave(&sem->wait_lock, flags);
215 } 215 }
216 /* got the lock */ 216 /* got the lock */
217 sem->activity = -1; 217 sem->count = -1;
218 list_del(&waiter.list); 218 list_del(&waiter.list);
219 219
220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
235 235
236 raw_spin_lock_irqsave(&sem->wait_lock, flags); 236 raw_spin_lock_irqsave(&sem->wait_lock, flags);
237 237
238 if (sem->activity == 0) { 238 if (sem->count == 0) {
239 /* got the lock */ 239 /* got the lock */
240 sem->activity = -1; 240 sem->count = -1;
241 ret = 1; 241 ret = 1;
242 } 242 }
243 243
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
255 255
256 raw_spin_lock_irqsave(&sem->wait_lock, flags); 256 raw_spin_lock_irqsave(&sem->wait_lock, flags);
257 257
258 if (--sem->activity == 0 && !list_empty(&sem->wait_list)) 258 if (--sem->count == 0 && !list_empty(&sem->wait_list))
259 sem = __rwsem_wake_one_writer(sem); 259 sem = __rwsem_wake_one_writer(sem);
260 260
261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
270 270
271 raw_spin_lock_irqsave(&sem->wait_lock, flags); 271 raw_spin_lock_irqsave(&sem->wait_lock, flags);
272 272
273 sem->activity = 0; 273 sem->count = 0;
274 if (!list_empty(&sem->wait_list)) 274 if (!list_empty(&sem->wait_list))
275 sem = __rwsem_do_wake(sem, 1); 275 sem = __rwsem_do_wake(sem, 1);
276 276
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
287 287
288 raw_spin_lock_irqsave(&sem->wait_lock, flags); 288 raw_spin_lock_irqsave(&sem->wait_lock, flags);
289 289
290 sem->activity = 1; 290 sem->count = 1;
291 if (!list_empty(&sem->wait_list)) 291 if (!list_empty(&sem->wait_list))
292 sem = __rwsem_do_wake(sem, 0); 292 sem = __rwsem_do_wake(sem, 0);
293 293
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc32142fcc..a2391ac135c8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -82,9 +82,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
82 sem->count = RWSEM_UNLOCKED_VALUE; 82 sem->count = RWSEM_UNLOCKED_VALUE;
83 raw_spin_lock_init(&sem->wait_lock); 83 raw_spin_lock_init(&sem->wait_lock);
84 INIT_LIST_HEAD(&sem->wait_list); 84 INIT_LIST_HEAD(&sem->wait_list);
85#ifdef CONFIG_SMP 85#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
86 sem->owner = NULL; 86 sem->owner = NULL;
87 sem->osq = NULL; 87 osq_lock_init(&sem->osq);
88#endif 88#endif
89} 89}
90 90
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
262 return false; 262 return false;
263} 263}
264 264
265#ifdef CONFIG_SMP 265#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
266/* 266/*
267 * Try to acquire write lock before the writer has been put on wait queue. 267 * Try to acquire write lock before the writer has been put on wait queue.
268 */ 268 */
@@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
285static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) 285static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
286{ 286{
287 struct task_struct *owner; 287 struct task_struct *owner;
288 bool on_cpu = true; 288 bool on_cpu = false;
289 289
290 if (need_resched()) 290 if (need_resched())
291 return 0; 291 return false;
292 292
293 rcu_read_lock(); 293 rcu_read_lock();
294 owner = ACCESS_ONCE(sem->owner); 294 owner = ACCESS_ONCE(sem->owner);
@@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
297 rcu_read_unlock(); 297 rcu_read_unlock();
298 298
299 /* 299 /*
300 * If sem->owner is not set, the rwsem owner may have 300 * If sem->owner is not set, yet we have just recently entered the
301 * just acquired it and not set the owner yet or the rwsem 301 * slowpath, then there is a possibility reader(s) may have the lock.
302 * has been released. 302 * To be safe, avoid spinning in these situations.
303 */ 303 */
304 return on_cpu; 304 return on_cpu;
305} 305}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 42f806de49d4..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,7 +12,7 @@
12 12
13#include <linux/atomic.h> 13#include <linux/atomic.h>
14 14
15#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) 15#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
16static inline void rwsem_set_owner(struct rw_semaphore *sem) 16static inline void rwsem_set_owner(struct rw_semaphore *sem)
17{ 17{
18 sem->owner = current; 18 sem->owner = current;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 49e0a20fd010..fcc2611d3f14 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -35,6 +35,7 @@
35 35
36static int nocompress; 36static int nocompress;
37static int noresume; 37static int noresume;
38static int nohibernate;
38static int resume_wait; 39static int resume_wait;
39static unsigned int resume_delay; 40static unsigned int resume_delay;
40static char resume_file[256] = CONFIG_PM_STD_PARTITION; 41static char resume_file[256] = CONFIG_PM_STD_PARTITION;
@@ -62,6 +63,11 @@ bool freezer_test_done;
62 63
63static const struct platform_hibernation_ops *hibernation_ops; 64static const struct platform_hibernation_ops *hibernation_ops;
64 65
66bool hibernation_available(void)
67{
68 return (nohibernate == 0);
69}
70
65/** 71/**
66 * hibernation_set_ops - Set the global hibernate operations. 72 * hibernation_set_ops - Set the global hibernate operations.
67 * @ops: Hibernation operations to use in subsequent hibernation transitions. 73 * @ops: Hibernation operations to use in subsequent hibernation transitions.
@@ -642,6 +648,11 @@ int hibernate(void)
642{ 648{
643 int error; 649 int error;
644 650
651 if (!hibernation_available()) {
652 pr_debug("PM: Hibernation not available.\n");
653 return -EPERM;
654 }
655
645 lock_system_sleep(); 656 lock_system_sleep();
646 /* The snapshot device should not be opened while we're running */ 657 /* The snapshot device should not be opened while we're running */
647 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 658 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -734,7 +745,7 @@ static int software_resume(void)
734 /* 745 /*
735 * If the user said "noresume".. bail out early. 746 * If the user said "noresume".. bail out early.
736 */ 747 */
737 if (noresume) 748 if (noresume || !hibernation_available())
738 return 0; 749 return 0;
739 750
740 /* 751 /*
@@ -900,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
900 int i; 911 int i;
901 char *start = buf; 912 char *start = buf;
902 913
914 if (!hibernation_available())
915 return sprintf(buf, "[disabled]\n");
916
903 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 917 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
904 if (!hibernation_modes[i]) 918 if (!hibernation_modes[i])
905 continue; 919 continue;
@@ -934,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
934 char *p; 948 char *p;
935 int mode = HIBERNATION_INVALID; 949 int mode = HIBERNATION_INVALID;
936 950
951 if (!hibernation_available())
952 return -EPERM;
953
937 p = memchr(buf, '\n', n); 954 p = memchr(buf, '\n', n);
938 len = p ? p - buf : n; 955 len = p ? p - buf : n;
939 956
@@ -1101,6 +1118,10 @@ static int __init hibernate_setup(char *str)
1101 noresume = 1; 1118 noresume = 1;
1102 else if (!strncmp(str, "nocompress", 10)) 1119 else if (!strncmp(str, "nocompress", 10))
1103 nocompress = 1; 1120 nocompress = 1;
1121 else if (!strncmp(str, "no", 2)) {
1122 noresume = 1;
1123 nohibernate = 1;
1124 }
1104 return 1; 1125 return 1;
1105} 1126}
1106 1127
@@ -1125,9 +1146,23 @@ static int __init resumedelay_setup(char *str)
1125 return 1; 1146 return 1;
1126} 1147}
1127 1148
1149static int __init nohibernate_setup(char *str)
1150{
1151 noresume = 1;
1152 nohibernate = 1;
1153 return 1;
1154}
1155
1156static int __init kaslr_nohibernate_setup(char *str)
1157{
1158 return nohibernate_setup(str);
1159}
1160
1128__setup("noresume", noresume_setup); 1161__setup("noresume", noresume_setup);
1129__setup("resume_offset=", resume_offset_setup); 1162__setup("resume_offset=", resume_offset_setup);
1130__setup("resume=", resume_setup); 1163__setup("resume=", resume_setup);
1131__setup("hibernate=", hibernate_setup); 1164__setup("hibernate=", hibernate_setup);
1132__setup("resumewait", resumewait_setup); 1165__setup("resumewait", resumewait_setup);
1133__setup("resumedelay=", resumedelay_setup); 1166__setup("resumedelay=", resumedelay_setup);
1167__setup("nohibernate", nohibernate_setup);
1168__setup("kaslr", kaslr_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 573410d6647e..8e90f330f139 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -300,13 +300,11 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
300 s += sprintf(s,"%s ", pm_states[i].label); 300 s += sprintf(s,"%s ", pm_states[i].label);
301 301
302#endif 302#endif
303#ifdef CONFIG_HIBERNATION 303 if (hibernation_available())
304 s += sprintf(s, "%s\n", "disk"); 304 s += sprintf(s, "disk ");
305#else
306 if (s != buf) 305 if (s != buf)
307 /* convert the last space to a newline */ 306 /* convert the last space to a newline */
308 *(s-1) = '\n'; 307 *(s-1) = '\n';
309#endif
310 return (s - buf); 308 return (s - buf);
311} 309}
312 310
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0ca8d83e2369..4ee194eb524b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -186,6 +186,7 @@ void thaw_processes(void)
186 186
187 printk("Restarting tasks ... "); 187 printk("Restarting tasks ... ");
188 188
189 __usermodehelper_set_disable_depth(UMH_FREEZING);
189 thaw_workqueues(); 190 thaw_workqueues();
190 191
191 read_lock(&tasklist_lock); 192 read_lock(&tasklist_lock);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4dd8822f732a..ed35a4790afe 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -306,7 +306,7 @@ int suspend_devices_and_enter(suspend_state_t state)
306 error = suspend_ops->begin(state); 306 error = suspend_ops->begin(state);
307 if (error) 307 if (error)
308 goto Close; 308 goto Close;
309 } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) { 309 } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
310 error = freeze_ops->begin(); 310 error = freeze_ops->begin();
311 if (error) 311 if (error)
312 goto Close; 312 goto Close;
@@ -335,7 +335,7 @@ int suspend_devices_and_enter(suspend_state_t state)
335 Close: 335 Close:
336 if (need_suspend_ops(state) && suspend_ops->end) 336 if (need_suspend_ops(state) && suspend_ops->end)
337 suspend_ops->end(); 337 suspend_ops->end();
338 else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) 338 else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
339 freeze_ops->end(); 339 freeze_ops->end();
340 340
341 return error; 341 return error;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 98d357584cd6..526e8911460a 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
49 struct snapshot_data *data; 49 struct snapshot_data *data;
50 int error; 50 int error;
51 51
52 if (!hibernation_available())
53 return -EPERM;
54
52 lock_system_sleep(); 55 lock_system_sleep();
53 56
54 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 57 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ea2d5f6962ed..13e839dbca07 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1416,9 +1416,10 @@ static int have_callable_console(void)
1416/* 1416/*
1417 * Can we actually use the console at this time on this cpu? 1417 * Can we actually use the console at this time on this cpu?
1418 * 1418 *
1419 * Console drivers may assume that per-cpu resources have been allocated. So 1419 * Console drivers may assume that per-cpu resources have
1420 * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't 1420 * been allocated. So unless they're explicitly marked as
1421 * call them until this CPU is officially up. 1421 * being able to cope (CON_ANYTIME) don't call them until
1422 * this CPU is officially up.
1422 */ 1423 */
1423static inline int can_use_console(unsigned int cpu) 1424static inline int can_use_console(unsigned int cpu)
1424{ 1425{
@@ -1431,10 +1432,8 @@ static inline int can_use_console(unsigned int cpu)
1431 * console_lock held, and 'console_locked' set) if it 1432 * console_lock held, and 'console_locked' set) if it
1432 * is successful, false otherwise. 1433 * is successful, false otherwise.
1433 */ 1434 */
1434static int console_trylock_for_printk(void) 1435static int console_trylock_for_printk(unsigned int cpu)
1435{ 1436{
1436 unsigned int cpu = smp_processor_id();
1437
1438 if (!console_trylock()) 1437 if (!console_trylock())
1439 return 0; 1438 return 0;
1440 /* 1439 /*
@@ -1609,8 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1609 */ 1608 */
1610 if (!oops_in_progress && !lockdep_recursing(current)) { 1609 if (!oops_in_progress && !lockdep_recursing(current)) {
1611 recursion_bug = 1; 1610 recursion_bug = 1;
1612 local_irq_restore(flags); 1611 goto out_restore_irqs;
1613 return 0;
1614 } 1612 }
1615 zap_locks(); 1613 zap_locks();
1616 } 1614 }
@@ -1718,27 +1716,21 @@ asmlinkage int vprintk_emit(int facility, int level,
1718 1716
1719 logbuf_cpu = UINT_MAX; 1717 logbuf_cpu = UINT_MAX;
1720 raw_spin_unlock(&logbuf_lock); 1718 raw_spin_unlock(&logbuf_lock);
1721 lockdep_on();
1722 local_irq_restore(flags);
1723 1719
1724 /* If called from the scheduler, we can not call up(). */ 1720 /* If called from the scheduler, we can not call up(). */
1725 if (in_sched) 1721 if (!in_sched) {
1726 return printed_len; 1722 /*
1727 1723 * Try to acquire and then immediately release the console
1728 /* 1724 * semaphore. The release will print out buffers and wake up
1729 * Disable preemption to avoid being preempted while holding 1725 * /dev/kmsg and syslog() users.
1730 * console_sem which would prevent anyone from printing to console 1726 */
1731 */ 1727 if (console_trylock_for_printk(this_cpu))
1732 preempt_disable(); 1728 console_unlock();
1733 /* 1729 }
1734 * Try to acquire and then immediately release the console semaphore.
1735 * The release will print out buffers and wake up /dev/kmsg and syslog()
1736 * users.
1737 */
1738 if (console_trylock_for_printk())
1739 console_unlock();
1740 preempt_enable();
1741 1730
1731 lockdep_on();
1732out_restore_irqs:
1733 local_irq_restore(flags);
1742 return printed_len; 1734 return printed_len;
1743} 1735}
1744EXPORT_SYMBOL(vprintk_emit); 1736EXPORT_SYMBOL(vprintk_emit);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1ba77363fbb..625d0b0cd75a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu)
206 rdp->passed_quiesce = 1; 206 rdp->passed_quiesce = 1;
207} 207}
208 208
209static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
210
211static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
212 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
213 .dynticks = ATOMIC_INIT(1),
214#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
215 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
216 .dynticks_idle = ATOMIC_INIT(1),
217#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
218};
219
220/*
221 * Let the RCU core know that this CPU has gone through the scheduler,
222 * which is a quiescent state. This is called when the need for a
223 * quiescent state is urgent, so we burn an atomic operation and full
224 * memory barriers to let the RCU core know about it, regardless of what
225 * this CPU might (or might not) do in the near future.
226 *
227 * We inform the RCU core by emulating a zero-duration dyntick-idle
228 * period, which we in turn do by incrementing the ->dynticks counter
229 * by two.
230 */
231static void rcu_momentary_dyntick_idle(void)
232{
233 unsigned long flags;
234 struct rcu_data *rdp;
235 struct rcu_dynticks *rdtp;
236 int resched_mask;
237 struct rcu_state *rsp;
238
239 local_irq_save(flags);
240
241 /*
242 * Yes, we can lose flag-setting operations. This is OK, because
243 * the flag will be set again after some delay.
244 */
245 resched_mask = raw_cpu_read(rcu_sched_qs_mask);
246 raw_cpu_write(rcu_sched_qs_mask, 0);
247
248 /* Find the flavor that needs a quiescent state. */
249 for_each_rcu_flavor(rsp) {
250 rdp = raw_cpu_ptr(rsp->rda);
251 if (!(resched_mask & rsp->flavor_mask))
252 continue;
253 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
254 if (ACCESS_ONCE(rdp->mynode->completed) !=
255 ACCESS_ONCE(rdp->cond_resched_completed))
256 continue;
257
258 /*
259 * Pretend to be momentarily idle for the quiescent state.
260 * This allows the grace-period kthread to record the
261 * quiescent state, with no need for this CPU to do anything
262 * further.
263 */
264 rdtp = this_cpu_ptr(&rcu_dynticks);
265 smp_mb__before_atomic(); /* Earlier stuff before QS. */
266 atomic_add(2, &rdtp->dynticks); /* QS. */
267 smp_mb__after_atomic(); /* Later stuff after QS. */
268 break;
269 }
270 local_irq_restore(flags);
271}
272
209/* 273/*
210 * Note a context switch. This is a quiescent state for RCU-sched, 274 * Note a context switch. This is a quiescent state for RCU-sched,
211 * and requires special handling for preemptible RCU. 275 * and requires special handling for preemptible RCU.
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu)
216 trace_rcu_utilization(TPS("Start context switch")); 280 trace_rcu_utilization(TPS("Start context switch"));
217 rcu_sched_qs(cpu); 281 rcu_sched_qs(cpu);
218 rcu_preempt_note_context_switch(cpu); 282 rcu_preempt_note_context_switch(cpu);
283 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
284 rcu_momentary_dyntick_idle();
219 trace_rcu_utilization(TPS("End context switch")); 285 trace_rcu_utilization(TPS("End context switch"));
220} 286}
221EXPORT_SYMBOL_GPL(rcu_note_context_switch); 287EXPORT_SYMBOL_GPL(rcu_note_context_switch);
222 288
223static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
224 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
225 .dynticks = ATOMIC_INIT(1),
226#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
227 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
228 .dynticks_idle = ATOMIC_INIT(1),
229#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
230};
231
232static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 289static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
233static long qhimark = 10000; /* If this many pending, ignore blimit. */ 290static long qhimark = 10000; /* If this many pending, ignore blimit. */
234static long qlowmark = 100; /* Once only this many pending, use blimit. */ 291static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
243module_param(jiffies_till_first_fqs, ulong, 0644); 300module_param(jiffies_till_first_fqs, ulong, 0644);
244module_param(jiffies_till_next_fqs, ulong, 0644); 301module_param(jiffies_till_next_fqs, ulong, 0644);
245 302
303/*
304 * How long the grace period must be before we start recruiting
305 * quiescent-state help from rcu_note_context_switch().
306 */
307static ulong jiffies_till_sched_qs = HZ / 20;
308module_param(jiffies_till_sched_qs, ulong, 0644);
309
246static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 310static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
247 struct rcu_data *rdp); 311 struct rcu_data *rdp);
248static void force_qs_rnp(struct rcu_state *rsp, 312static void force_qs_rnp(struct rcu_state *rsp,
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
853 bool *isidle, unsigned long *maxj) 917 bool *isidle, unsigned long *maxj)
854{ 918{
855 unsigned int curr; 919 unsigned int curr;
920 int *rcrmp;
856 unsigned int snap; 921 unsigned int snap;
857 922
858 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); 923 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
893 } 958 }
894 959
895 /* 960 /*
896 * There is a possibility that a CPU in adaptive-ticks state 961 * A CPU running for an extended time within the kernel can
897 * might run in the kernel with the scheduling-clock tick disabled 962 * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
898 * for an extended time period. Invoke rcu_kick_nohz_cpu() to 963 * even context-switching back and forth between a pair of
899 * force the CPU to restart the scheduling-clock tick in this 964 * in-kernel CPU-bound tasks cannot advance grace periods.
900 * CPU is in this state. 965 * So if the grace period is old enough, make the CPU pay attention.
901 */ 966 * Note that the unsynchronized assignments to the per-CPU
902 rcu_kick_nohz_cpu(rdp->cpu); 967 * rcu_sched_qs_mask variable are safe. Yes, setting of
903 968 * bits can be lost, but they will be set again on the next
904 /* 969 * force-quiescent-state pass. So lost bit sets do not result
905 * Alternatively, the CPU might be running in the kernel 970 * in incorrect behavior, merely in a grace period lasting
906 * for an extended period of time without a quiescent state. 971 * a few jiffies longer than it might otherwise. Because
907 * Attempt to force the CPU through the scheduler to gain the 972 * there are at most four threads involved, and because the
908 * needed quiescent state, but only if the grace period has gone 973 * updates are only once every few jiffies, the probability of
909 * on for an uncommonly long time. If there are many stuck CPUs, 974 * lossage (and thus of slight grace-period extension) is
910 * we will beat on the first one until it gets unstuck, then move 975 * quite low.
911 * to the next. Only do this for the primary flavor of RCU. 976 *
977 * Note that if the jiffies_till_sched_qs boot/sysfs parameter
978 * is set too high, we override with half of the RCU CPU stall
979 * warning delay.
912 */ 980 */
913 if (rdp->rsp == rcu_state_p && 981 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
982 if (ULONG_CMP_GE(jiffies,
983 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
914 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 984 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
915 rdp->rsp->jiffies_resched += 5; 985 if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
916 resched_cpu(rdp->cpu); 986 ACCESS_ONCE(rdp->cond_resched_completed) =
987 ACCESS_ONCE(rdp->mynode->completed);
988 smp_mb(); /* ->cond_resched_completed before *rcrmp. */
989 ACCESS_ONCE(*rcrmp) =
990 ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
991 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
992 rdp->rsp->jiffies_resched += 5; /* Enable beating. */
993 } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
994 /* Time to beat on that CPU again! */
995 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
996 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
997 }
917 } 998 }
918 999
919 return 0; 1000 return 0;
@@ -3491,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3491 "rcu_node_fqs_1", 3572 "rcu_node_fqs_1",
3492 "rcu_node_fqs_2", 3573 "rcu_node_fqs_2",
3493 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ 3574 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
3575 static u8 fl_mask = 0x1;
3494 int cpustride = 1; 3576 int cpustride = 1;
3495 int i; 3577 int i;
3496 int j; 3578 int j;
@@ -3509,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3509 for (i = 1; i < rcu_num_lvls; i++) 3591 for (i = 1; i < rcu_num_lvls; i++)
3510 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 3592 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
3511 rcu_init_levelspread(rsp); 3593 rcu_init_levelspread(rsp);
3594 rsp->flavor_mask = fl_mask;
3595 fl_mask <<= 1;
3512 3596
3513 /* Initialize the elements themselves, starting from the leaves. */ 3597 /* Initialize the elements themselves, starting from the leaves. */
3514 3598
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf2c1e669691..0f69a79c5b7d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -307,6 +307,9 @@ struct rcu_data {
307 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 307 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
308 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 308 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
309 unsigned long offline_fqs; /* Kicked due to being offline. */ 309 unsigned long offline_fqs; /* Kicked due to being offline. */
310 unsigned long cond_resched_completed;
311 /* Grace period that needs help */
312 /* from cond_resched(). */
310 313
311 /* 5) __rcu_pending() statistics. */ 314 /* 5) __rcu_pending() statistics. */
312 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 315 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -392,6 +395,7 @@ struct rcu_state {
392 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ 395 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
393 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 396 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
394 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 397 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
398 u8 flavor_mask; /* bit in flavor mask. */
395 struct rcu_data __percpu *rda; /* pointer of percpu rcu_data. */ 399 struct rcu_data __percpu *rda; /* pointer of percpu rcu_data. */
396 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 400 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
397 void (*func)(struct rcu_head *head)); 401 void (*func)(struct rcu_head *head));
@@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
563static void do_nocb_deferred_wakeup(struct rcu_data *rdp); 567static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
564static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 568static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
565static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 569static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
566static void rcu_kick_nohz_cpu(int cpu); 570static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
567static bool init_nocb_callback_list(struct rcu_data *rdp); 571static bool init_nocb_callback_list(struct rcu_data *rdp);
568static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 572static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
569static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); 573static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cbc2c45265e2..02ac0fb186b8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2404 * if an adaptive-ticks CPU is failing to respond to the current grace 2404 * if an adaptive-ticks CPU is failing to respond to the current grace
2405 * period and has not been idle from an RCU perspective, kick it. 2405 * period and has not been idle from an RCU perspective, kick it.
2406 */ 2406 */
2407static void rcu_kick_nohz_cpu(int cpu) 2407static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2408{ 2408{
2409#ifdef CONFIG_NO_HZ_FULL 2409#ifdef CONFIG_NO_HZ_FULL
2410 if (tick_nohz_full_cpu(cpu)) 2410 if (tick_nohz_full_cpu(cpu))
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a2aeb4df0f60..bc7883570530 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -200,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf)
200EXPORT_SYMBOL_GPL(wait_rcu_gp); 200EXPORT_SYMBOL_GPL(wait_rcu_gp);
201 201
202#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 202#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
203static inline void debug_init_rcu_head(struct rcu_head *head) 203void init_rcu_head(struct rcu_head *head)
204{ 204{
205 debug_object_init(head, &rcuhead_debug_descr); 205 debug_object_init(head, &rcuhead_debug_descr);
206} 206}
207 207
208static inline void debug_rcu_head_free(struct rcu_head *head) 208void destroy_rcu_head(struct rcu_head *head)
209{ 209{
210 debug_object_free(head, &rcuhead_debug_descr); 210 debug_object_free(head, &rcuhead_debug_descr);
211} 211}
@@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
350early_initcall(check_cpu_stall_init); 350early_initcall(check_cpu_stall_init);
351 351
352#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 352#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
353
354/*
355 * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
356 */
357
358DEFINE_PER_CPU(int, rcu_cond_resched_count);
359
360/*
361 * Report a set of RCU quiescent states, for use by cond_resched()
362 * and friends. Out of line due to being called infrequently.
363 */
364void rcu_resched(void)
365{
366 preempt_disable();
367 __this_cpu_write(rcu_cond_resched_count, 0);
368 rcu_note_context_switch(smp_processor_id());
369 preempt_enable();
370}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b494fe..bc1638b33449 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4147,7 +4147,6 @@ static void __cond_resched(void)
4147 4147
4148int __sched _cond_resched(void) 4148int __sched _cond_resched(void)
4149{ 4149{
4150 rcu_cond_resched();
4151 if (should_resched()) { 4150 if (should_resched()) {
4152 __cond_resched(); 4151 __cond_resched();
4153 return 1; 4152 return 1;
@@ -4166,18 +4165,15 @@ EXPORT_SYMBOL(_cond_resched);
4166 */ 4165 */
4167int __cond_resched_lock(spinlock_t *lock) 4166int __cond_resched_lock(spinlock_t *lock)
4168{ 4167{
4169 bool need_rcu_resched = rcu_should_resched();
4170 int resched = should_resched(); 4168 int resched = should_resched();
4171 int ret = 0; 4169 int ret = 0;
4172 4170
4173 lockdep_assert_held(lock); 4171 lockdep_assert_held(lock);
4174 4172
4175 if (spin_needbreak(lock) || resched || need_rcu_resched) { 4173 if (spin_needbreak(lock) || resched) {
4176 spin_unlock(lock); 4174 spin_unlock(lock);
4177 if (resched) 4175 if (resched)
4178 __cond_resched(); 4176 __cond_resched();
4179 else if (unlikely(need_rcu_resched))
4180 rcu_resched();
4181 else 4177 else
4182 cpu_relax(); 4178 cpu_relax();
4183 ret = 1; 4179 ret = 1;
@@ -4191,7 +4187,6 @@ int __sched __cond_resched_softirq(void)
4191{ 4187{
4192 BUG_ON(!in_softirq()); 4188 BUG_ON(!in_softirq());
4193 4189
4194 rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
4195 if (should_resched()) { 4190 if (should_resched()) {
4196 local_bh_enable(); 4191 local_bh_enable();
4197 __cond_resched(); 4192 __cond_resched();
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 695f9773bb60..627b3c34b821 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
608 608
609 avg_atom = p->se.sum_exec_runtime; 609 avg_atom = p->se.sum_exec_runtime;
610 if (nr_switches) 610 if (nr_switches)
611 do_div(avg_atom, nr_switches); 611 avg_atom = div64_ul(avg_atom, nr_switches);
612 else 612 else
613 avg_atom = -1LL; 613 avg_atom = -1LL;
614 614
diff --git a/kernel/smp.c b/kernel/smp.c
index 306f8180b0d5..80c33f8de14f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
29 29
30static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); 30static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
31 31
32static void flush_smp_call_function_queue(bool warn_cpu_offline);
33
32static int 34static int
33hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 35hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
34{ 36{
@@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
51#ifdef CONFIG_HOTPLUG_CPU 53#ifdef CONFIG_HOTPLUG_CPU
52 case CPU_UP_CANCELED: 54 case CPU_UP_CANCELED:
53 case CPU_UP_CANCELED_FROZEN: 55 case CPU_UP_CANCELED_FROZEN:
56 /* Fall-through to the CPU_DEAD[_FROZEN] case. */
54 57
55 case CPU_DEAD: 58 case CPU_DEAD:
56 case CPU_DEAD_FROZEN: 59 case CPU_DEAD_FROZEN:
57 free_cpumask_var(cfd->cpumask); 60 free_cpumask_var(cfd->cpumask);
58 free_percpu(cfd->csd); 61 free_percpu(cfd->csd);
59 break; 62 break;
63
64 case CPU_DYING:
65 case CPU_DYING_FROZEN:
66 /*
67 * The IPIs for the smp-call-function callbacks queued by other
68 * CPUs might arrive late, either due to hardware latencies or
69 * because this CPU disabled interrupts (inside stop-machine)
70 * before the IPIs were sent. So flush out any pending callbacks
71 * explicitly (without waiting for the IPIs to arrive), to
72 * ensure that the outgoing CPU doesn't go offline with work
73 * still pending.
74 */
75 flush_smp_call_function_queue(false);
76 break;
60#endif 77#endif
61 }; 78 };
62 79
@@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
177 return 0; 194 return 0;
178} 195}
179 196
180/* 197/**
181 * Invoked by arch to handle an IPI for call function single. Must be 198 * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
182 * called from the arch with interrupts disabled. 199 *
200 * Invoked by arch to handle an IPI for call function single.
201 * Must be called with interrupts disabled.
183 */ 202 */
184void generic_smp_call_function_single_interrupt(void) 203void generic_smp_call_function_single_interrupt(void)
185{ 204{
205 flush_smp_call_function_queue(true);
206}
207
208/**
209 * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
210 *
211 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
212 * offline CPU. Skip this check if set to 'false'.
213 *
214 * Flush any pending smp-call-function callbacks queued on this CPU. This is
215 * invoked by the generic IPI handler, as well as by a CPU about to go offline,
216 * to ensure that all pending IPI callbacks are run before it goes completely
217 * offline.
218 *
219 * Loop through the call_single_queue and run all the queued callbacks.
220 * Must be called with interrupts disabled.
221 */
222static void flush_smp_call_function_queue(bool warn_cpu_offline)
223{
224 struct llist_head *head;
186 struct llist_node *entry; 225 struct llist_node *entry;
187 struct call_single_data *csd, *csd_next; 226 struct call_single_data *csd, *csd_next;
188 static bool warned; 227 static bool warned;
189 228
190 entry = llist_del_all(&__get_cpu_var(call_single_queue)); 229 WARN_ON(!irqs_disabled());
230
231 head = &__get_cpu_var(call_single_queue);
232 entry = llist_del_all(head);
191 entry = llist_reverse_order(entry); 233 entry = llist_reverse_order(entry);
192 234
193 /* 235 /* There shouldn't be any pending callbacks on an offline CPU. */
194 * Shouldn't receive this interrupt on a cpu that is not yet online. 236 if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
195 */ 237 !warned && !llist_empty(head))) {
196 if (unlikely(!cpu_online(smp_processor_id()) && !warned)) {
197 warned = true; 238 warned = true;
198 WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); 239 WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
199 240
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba9ed453c4ed..75b22e22a72c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
136/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 136/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
137static int maxolduid = 65535; 137static int maxolduid = 65535;
138static int minolduid; 138static int minolduid;
139static int min_percpu_pagelist_fract = 8;
140 139
141static int ngroups_max = NGROUPS_MAX; 140static int ngroups_max = NGROUPS_MAX;
142static const int cap_last_cap = CAP_LAST_CAP; 141static const int cap_last_cap = CAP_LAST_CAP;
@@ -152,10 +151,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
152#ifdef CONFIG_SPARC 151#ifdef CONFIG_SPARC
153#endif 152#endif
154 153
155#ifdef CONFIG_SPARC64
156extern int sysctl_tsb_ratio;
157#endif
158
159#ifdef __hppa__ 154#ifdef __hppa__
160extern int pwrsw_enabled; 155extern int pwrsw_enabled;
161#endif 156#endif
@@ -865,6 +860,17 @@ static struct ctl_table kern_table[] = {
865 .extra1 = &zero, 860 .extra1 = &zero,
866 .extra2 = &one, 861 .extra2 = &one,
867 }, 862 },
863#ifdef CONFIG_SMP
864 {
865 .procname = "softlockup_all_cpu_backtrace",
866 .data = &sysctl_softlockup_all_cpu_backtrace,
867 .maxlen = sizeof(int),
868 .mode = 0644,
869 .proc_handler = proc_dointvec_minmax,
870 .extra1 = &zero,
871 .extra2 = &one,
872 },
873#endif /* CONFIG_SMP */
868 { 874 {
869 .procname = "nmi_watchdog", 875 .procname = "nmi_watchdog",
870 .data = &watchdog_user_enabled, 876 .data = &watchdog_user_enabled,
@@ -1321,7 +1327,7 @@ static struct ctl_table vm_table[] = {
1321 .maxlen = sizeof(percpu_pagelist_fraction), 1327 .maxlen = sizeof(percpu_pagelist_fraction),
1322 .mode = 0644, 1328 .mode = 0644,
1323 .proc_handler = percpu_pagelist_fraction_sysctl_handler, 1329 .proc_handler = percpu_pagelist_fraction_sysctl_handler,
1324 .extra1 = &min_percpu_pagelist_fract, 1330 .extra1 = &zero,
1325 }, 1331 },
1326#ifdef CONFIG_MMU 1332#ifdef CONFIG_MMU
1327 { 1333 {
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65a430d..fe75444ae7ec 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
585 struct itimerspec *new_setting, 585 struct itimerspec *new_setting,
586 struct itimerspec *old_setting) 586 struct itimerspec *old_setting)
587{ 587{
588 ktime_t exp;
589
588 if (!rtcdev) 590 if (!rtcdev)
589 return -ENOTSUPP; 591 return -ENOTSUPP;
590 592
593 if (flags & ~TIMER_ABSTIME)
594 return -EINVAL;
595
591 if (old_setting) 596 if (old_setting)
592 alarm_timer_get(timr, old_setting); 597 alarm_timer_get(timr, old_setting);
593 598
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
597 602
598 /* start the timer */ 603 /* start the timer */
599 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); 604 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
600 alarm_start(&timr->it.alarm.alarmtimer, 605 exp = timespec_to_ktime(new_setting->it_value);
601 timespec_to_ktime(new_setting->it_value)); 606 /* Convert (if necessary) to absolute time */
607 if (flags != TIMER_ABSTIME) {
608 ktime_t now;
609
610 now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
611 exp = ktime_add(now, exp);
612 }
613
614 alarm_start(&timr->it.alarm.alarmtimer, exp);
602 return 0; 615 return 0;
603} 616}
604 617
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
730 if (!alarmtimer_get_rtcdev()) 743 if (!alarmtimer_get_rtcdev())
731 return -ENOTSUPP; 744 return -ENOTSUPP;
732 745
746 if (flags & ~TIMER_ABSTIME)
747 return -EINVAL;
748
733 if (!capable(CAP_WAKE_ALARM)) 749 if (!capable(CAP_WAKE_ALARM))
734 return -EPERM; 750 return -EPERM;
735 751
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b372e3ed675..ac9d1dad630b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -265,12 +265,12 @@ static void update_ftrace_function(void)
265 func = ftrace_ops_list_func; 265 func = ftrace_ops_list_func;
266 } 266 }
267 267
268 update_function_graph_func();
269
268 /* If there's no change, then do nothing more here */ 270 /* If there's no change, then do nothing more here */
269 if (ftrace_trace_function == func) 271 if (ftrace_trace_function == func)
270 return; 272 return;
271 273
272 update_function_graph_func();
273
274 /* 274 /*
275 * If we are using the list function, it doesn't care 275 * If we are using the list function, it doesn't care
276 * about the function_trace_ops. 276 * about the function_trace_ops.
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7c56c3d06943..ff7027199a9a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -616,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
616 struct ring_buffer_per_cpu *cpu_buffer; 616 struct ring_buffer_per_cpu *cpu_buffer;
617 struct rb_irq_work *work; 617 struct rb_irq_work *work;
618 618
619 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
620 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
621 return POLLIN | POLLRDNORM;
622
623 if (cpu == RING_BUFFER_ALL_CPUS) 619 if (cpu == RING_BUFFER_ALL_CPUS)
624 work = &buffer->irq_work; 620 work = &buffer->irq_work;
625 else { 621 else {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 384ede311717..bda9621638cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -466,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size)
466 struct print_entry *entry; 466 struct print_entry *entry;
467 unsigned long irq_flags; 467 unsigned long irq_flags;
468 int alloc; 468 int alloc;
469 int pc;
470
471 if (!(trace_flags & TRACE_ITER_PRINTK))
472 return 0;
473
474 pc = preempt_count();
469 475
470 if (unlikely(tracing_selftest_running || tracing_disabled)) 476 if (unlikely(tracing_selftest_running || tracing_disabled))
471 return 0; 477 return 0;
@@ -475,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
475 local_save_flags(irq_flags); 481 local_save_flags(irq_flags);
476 buffer = global_trace.trace_buffer.buffer; 482 buffer = global_trace.trace_buffer.buffer;
477 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, 483 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
478 irq_flags, preempt_count()); 484 irq_flags, pc);
479 if (!event) 485 if (!event)
480 return 0; 486 return 0;
481 487
@@ -492,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
492 entry->buf[size] = '\0'; 498 entry->buf[size] = '\0';
493 499
494 __buffer_unlock_commit(buffer, event); 500 __buffer_unlock_commit(buffer, event);
501 ftrace_trace_stack(buffer, irq_flags, 4, pc);
495 502
496 return size; 503 return size;
497} 504}
@@ -509,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str)
509 struct bputs_entry *entry; 516 struct bputs_entry *entry;
510 unsigned long irq_flags; 517 unsigned long irq_flags;
511 int size = sizeof(struct bputs_entry); 518 int size = sizeof(struct bputs_entry);
519 int pc;
520
521 if (!(trace_flags & TRACE_ITER_PRINTK))
522 return 0;
523
524 pc = preempt_count();
512 525
513 if (unlikely(tracing_selftest_running || tracing_disabled)) 526 if (unlikely(tracing_selftest_running || tracing_disabled))
514 return 0; 527 return 0;
@@ -516,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str)
516 local_save_flags(irq_flags); 529 local_save_flags(irq_flags);
517 buffer = global_trace.trace_buffer.buffer; 530 buffer = global_trace.trace_buffer.buffer;
518 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, 531 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
519 irq_flags, preempt_count()); 532 irq_flags, pc);
520 if (!event) 533 if (!event)
521 return 0; 534 return 0;
522 535
@@ -525,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str)
525 entry->str = str; 538 entry->str = str;
526 539
527 __buffer_unlock_commit(buffer, event); 540 __buffer_unlock_commit(buffer, event);
541 ftrace_trace_stack(buffer, irq_flags, 4, pc);
528 542
529 return 1; 543 return 1;
530} 544}
@@ -1396,7 +1410,6 @@ void tracing_start(void)
1396 1410
1397 arch_spin_unlock(&global_trace.max_lock); 1411 arch_spin_unlock(&global_trace.max_lock);
1398 1412
1399 ftrace_start();
1400 out: 1413 out:
1401 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); 1414 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1402} 1415}
@@ -1443,7 +1456,6 @@ void tracing_stop(void)
1443 struct ring_buffer *buffer; 1456 struct ring_buffer *buffer;
1444 unsigned long flags; 1457 unsigned long flags;
1445 1458
1446 ftrace_stop();
1447 raw_spin_lock_irqsave(&global_trace.start_lock, flags); 1459 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1448 if (global_trace.stop_count++) 1460 if (global_trace.stop_count++)
1449 goto out; 1461 goto out;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f99e0b3bca8c..2de53628689f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
470 470
471 list_del(&file->list); 471 list_del(&file->list);
472 remove_subsystem(file->system); 472 remove_subsystem(file->system);
473 free_event_filter(file->filter);
473 kmem_cache_free(file_cachep, file); 474 kmem_cache_free(file_cachep, file);
474} 475}
475 476
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 04fdb5de823c..3c9b97e6b1f4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -893,6 +893,9 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
893 int ret; 893 int ret;
894 894
895 if (file) { 895 if (file) {
896 if (tu->tp.flags & TP_FLAG_PROFILE)
897 return -EINTR;
898
896 link = kmalloc(sizeof(*link), GFP_KERNEL); 899 link = kmalloc(sizeof(*link), GFP_KERNEL);
897 if (!link) 900 if (!link)
898 return -ENOMEM; 901 return -ENOMEM;
@@ -901,29 +904,40 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
901 list_add_tail_rcu(&link->list, &tu->tp.files); 904 list_add_tail_rcu(&link->list, &tu->tp.files);
902 905
903 tu->tp.flags |= TP_FLAG_TRACE; 906 tu->tp.flags |= TP_FLAG_TRACE;
904 } else 907 } else {
905 tu->tp.flags |= TP_FLAG_PROFILE; 908 if (tu->tp.flags & TP_FLAG_TRACE)
909 return -EINTR;
906 910
907 ret = uprobe_buffer_enable(); 911 tu->tp.flags |= TP_FLAG_PROFILE;
908 if (ret < 0) 912 }
909 return ret;
910 913
911 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 914 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
912 915
913 if (enabled) 916 if (enabled)
914 return 0; 917 return 0;
915 918
919 ret = uprobe_buffer_enable();
920 if (ret)
921 goto err_flags;
922
916 tu->consumer.filter = filter; 923 tu->consumer.filter = filter;
917 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); 924 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
918 if (ret) { 925 if (ret)
919 if (file) { 926 goto err_buffer;
920 list_del(&link->list);
921 kfree(link);
922 tu->tp.flags &= ~TP_FLAG_TRACE;
923 } else
924 tu->tp.flags &= ~TP_FLAG_PROFILE;
925 }
926 927
928 return 0;
929
930 err_buffer:
931 uprobe_buffer_disable();
932
933 err_flags:
934 if (file) {
935 list_del(&link->list);
936 kfree(link);
937 tu->tp.flags &= ~TP_FLAG_TRACE;
938 } else {
939 tu->tp.flags &= ~TP_FLAG_PROFILE;
940 }
927 return ret; 941 return ret;
928} 942}
929 943
@@ -1201,12 +1215,6 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
1201 1215
1202 current->utask->vaddr = (unsigned long) &udd; 1216 current->utask->vaddr = (unsigned long) &udd;
1203 1217
1204#ifdef CONFIG_PERF_EVENTS
1205 if ((tu->tp.flags & TP_FLAG_TRACE) == 0 &&
1206 !uprobe_perf_filter(&tu->consumer, 0, current->mm))
1207 return UPROBE_HANDLER_REMOVE;
1208#endif
1209
1210 if (WARN_ON_ONCE(!uprobe_cpu_buffer)) 1218 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1211 return 0; 1219 return 0;
1212 1220
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 33cbd8c203f8..3490407dc7b7 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -492,33 +492,29 @@ static int sys_tracepoint_refcount;
492 492
493void syscall_regfunc(void) 493void syscall_regfunc(void)
494{ 494{
495 unsigned long flags; 495 struct task_struct *p, *t;
496 struct task_struct *g, *t;
497 496
498 if (!sys_tracepoint_refcount) { 497 if (!sys_tracepoint_refcount) {
499 read_lock_irqsave(&tasklist_lock, flags); 498 read_lock(&tasklist_lock);
500 do_each_thread(g, t) { 499 for_each_process_thread(p, t) {
501 /* Skip kernel threads. */ 500 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
502 if (t->mm) 501 }
503 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); 502 read_unlock(&tasklist_lock);
504 } while_each_thread(g, t);
505 read_unlock_irqrestore(&tasklist_lock, flags);
506 } 503 }
507 sys_tracepoint_refcount++; 504 sys_tracepoint_refcount++;
508} 505}
509 506
510void syscall_unregfunc(void) 507void syscall_unregfunc(void)
511{ 508{
512 unsigned long flags; 509 struct task_struct *p, *t;
513 struct task_struct *g, *t;
514 510
515 sys_tracepoint_refcount--; 511 sys_tracepoint_refcount--;
516 if (!sys_tracepoint_refcount) { 512 if (!sys_tracepoint_refcount) {
517 read_lock_irqsave(&tasklist_lock, flags); 513 read_lock(&tasklist_lock);
518 do_each_thread(g, t) { 514 for_each_process_thread(p, t) {
519 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); 515 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
520 } while_each_thread(g, t); 516 }
521 read_unlock_irqrestore(&tasklist_lock, flags); 517 read_unlock(&tasklist_lock);
522 } 518 }
523} 519}
524#endif 520#endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 516203e665fc..c3319bd1b040 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,12 @@
31 31
32int watchdog_user_enabled = 1; 32int watchdog_user_enabled = 1;
33int __read_mostly watchdog_thresh = 10; 33int __read_mostly watchdog_thresh = 10;
34#ifdef CONFIG_SMP
35int __read_mostly sysctl_softlockup_all_cpu_backtrace;
36#else
37#define sysctl_softlockup_all_cpu_backtrace 0
38#endif
39
34static int __read_mostly watchdog_running; 40static int __read_mostly watchdog_running;
35static u64 __read_mostly sample_period; 41static u64 __read_mostly sample_period;
36 42
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
47static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 53static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
48static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 54static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
49#endif 55#endif
56static unsigned long soft_lockup_nmi_warn;
50 57
51/* boot commands */ 58/* boot commands */
52/* 59/*
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str)
95} 102}
96__setup("nosoftlockup", nosoftlockup_setup); 103__setup("nosoftlockup", nosoftlockup_setup);
97/* */ 104/* */
105#ifdef CONFIG_SMP
106static int __init softlockup_all_cpu_backtrace_setup(char *str)
107{
108 sysctl_softlockup_all_cpu_backtrace =
109 !!simple_strtol(str, NULL, 0);
110 return 1;
111}
112__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
113#endif
98 114
99/* 115/*
100 * Hard-lockup warnings should be triggered after just a few seconds. Soft- 116 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
271 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); 287 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
272 struct pt_regs *regs = get_irq_regs(); 288 struct pt_regs *regs = get_irq_regs();
273 int duration; 289 int duration;
290 int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
274 291
275 /* kick the hardlockup detector */ 292 /* kick the hardlockup detector */
276 watchdog_interrupt_count(); 293 watchdog_interrupt_count();
@@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
317 if (__this_cpu_read(soft_watchdog_warn) == true) 334 if (__this_cpu_read(soft_watchdog_warn) == true)
318 return HRTIMER_RESTART; 335 return HRTIMER_RESTART;
319 336
337 if (softlockup_all_cpu_backtrace) {
338 /* Prevent multiple soft-lockup reports if one cpu is already
339 * engaged in dumping cpu back traces
340 */
341 if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
342 /* Someone else will report us. Let's give up */
343 __this_cpu_write(soft_watchdog_warn, true);
344 return HRTIMER_RESTART;
345 }
346 }
347
320 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 348 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
321 smp_processor_id(), duration, 349 smp_processor_id(), duration,
322 current->comm, task_pid_nr(current)); 350 current->comm, task_pid_nr(current));
@@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
327 else 355 else
328 dump_stack(); 356 dump_stack();
329 357
358 if (softlockup_all_cpu_backtrace) {
359 /* Avoid generating two back traces for current
360 * given that one is already made above
361 */
362 trigger_allbutself_cpu_backtrace();
363
364 clear_bit(0, &soft_lockup_nmi_warn);
365 /* Barrier to sync with other cpus */
366 smp_mb__after_atomic();
367 }
368
330 if (softlockup_panic) 369 if (softlockup_panic)
331 panic("softlockup: hung tasks"); 370 panic("softlockup: hung tasks");
332 __this_cpu_write(soft_watchdog_warn, true); 371 __this_cpu_write(soft_watchdog_warn, true);
@@ -527,10 +566,8 @@ static void update_timers_all_cpus(void)
527 int cpu; 566 int cpu;
528 567
529 get_online_cpus(); 568 get_online_cpus();
530 preempt_disable();
531 for_each_online_cpu(cpu) 569 for_each_online_cpu(cpu)
532 update_timers(cpu); 570 update_timers(cpu);
533 preempt_enable();
534 put_online_cpus(); 571 put_online_cpus();
535} 572}
536 573
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6203d2900877..35974ac69600 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3284,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
3284 } 3284 }
3285 } 3285 }
3286 3286
3287 dev_set_uevent_suppress(&wq_dev->dev, false);
3287 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); 3288 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
3288 return 0; 3289 return 0;
3289} 3290}
@@ -4879,7 +4880,7 @@ static void __init wq_numa_init(void)
4879 BUG_ON(!tbl); 4880 BUG_ON(!tbl);
4880 4881
4881 for_each_node(node) 4882 for_each_node(node)
4882 BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, 4883 BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
4883 node_online(node) ? node : NUMA_NO_NODE)); 4884 node_online(node) ? node : NUMA_NO_NODE));
4884 4885
4885 for_each_possible_cpu(cpu) { 4886 for_each_possible_cpu(cpu) {