path: root/kernel
author     Ingo Molnar <mingo@kernel.org>  2014-07-17 05:45:29 -0400
committer  Ingo Molnar <mingo@kernel.org>  2014-07-17 05:45:29 -0400
commit     b5e4111f027c4be85dbe97e090530d03c55c4cf4 (patch)
tree       11e0a37cb59314f4e9a7b2810124a4a7a33140e5 /kernel
parent     72d5305dcb3637913c2c37e847a4de9028e49244 (diff)
parent     9de8033f1bbcce5ed23fe5da9ca1a5060207f7ed (diff)
Merge branch 'locking/urgent' into locking/core, before applying larger changes and to refresh the branch with fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks              9
-rw-r--r--  kernel/cgroup.c                  58
-rw-r--r--  kernel/context_tracking.c         3
-rw-r--r--  kernel/cpuset.c                  20
-rw-r--r--  kernel/events/core.c             37
-rw-r--r--  kernel/events/uprobes.c           6
-rw-r--r--  kernel/fork.c                     2
-rw-r--r--  kernel/irq/irqdesc.c              4
-rw-r--r--  kernel/kexec.c                    1
-rw-r--r--  kernel/locking/mcs_spinlock.c    64
-rw-r--r--  kernel/locking/mcs_spinlock.h     9
-rw-r--r--  kernel/locking/mutex.c            2
-rw-r--r--  kernel/locking/rwsem-spinlock.c  28
-rw-r--r--  kernel/locking/rwsem-xadd.c      16
-rw-r--r--  kernel/locking/rwsem.c            2
-rw-r--r--  kernel/power/hibernate.c         37
-rw-r--r--  kernel/power/main.c               6
-rw-r--r--  kernel/power/user.c               3
-rw-r--r--  kernel/printk/printk.c           44
-rw-r--r--  kernel/smp.c                     57
-rw-r--r--  kernel/sysctl.c                  18
-rw-r--r--  kernel/trace/trace.c              2
-rw-r--r--  kernel/trace/trace_uprobe.c      46
-rw-r--r--  kernel/tracepoint.c              26
-rw-r--r--  kernel/watchdog.c                41
-rw-r--r--  kernel/workqueue.c                3
26 files changed, 397 insertions, 147 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 35536d9c0964..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,9 +220,16 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
 
 endif
 
+config ARCH_SUPPORTS_ATOMIC_RMW
+        bool
+
 config MUTEX_SPIN_ON_OWNER
         def_bool y
-        depends on SMP && !DEBUG_MUTEXES
+        depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
+config RWSEM_SPIN_ON_OWNER
+        def_bool y
+        depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
 
 config ARCH_USE_QUEUE_RWLOCK
         bool
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7868fc3c0bc5..70776aec2562 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                         int flags, const char *unused_dev_name,
                         void *data)
 {
+        struct super_block *pinned_sb = NULL;
+        struct cgroup_subsys *ss;
         struct cgroup_root *root;
         struct cgroup_sb_opts opts;
         struct dentry *dentry;
         int ret;
+        int i;
         bool new_sb;
 
         /*
@@ -1677,6 +1680,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 goto out_unlock;
         }
 
+        /*
+         * Destruction of cgroup root is asynchronous, so subsystems may
+         * still be dying after the previous unmount.  Let's drain the
+         * dying subsystems.  We just need to ensure that the ones
+         * unmounted previously finish dying and don't care about new ones
+         * starting.  Testing ref liveliness is good enough.
+         */
+        for_each_subsys(ss, i) {
+                if (!(opts.subsys_mask & (1 << i)) ||
+                    ss->root == &cgrp_dfl_root)
+                        continue;
+
+                if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+                        mutex_unlock(&cgroup_mutex);
+                        msleep(10);
+                        ret = restart_syscall();
+                        goto out_free;
+                }
+                cgroup_put(&ss->root->cgrp);
+        }
+
         for_each_root(root) {
                 bool name_match = false;
 
@@ -1717,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 }
 
                 /*
-                 * A root's lifetime is governed by its root cgroup.
-                 * tryget_live failure indicate that the root is being
-                 * destroyed.  Wait for destruction to complete so that the
-                 * subsystems are free.  We can use wait_queue for the wait
-                 * but this path is super cold.  Let's just sleep for a bit
-                 * and retry.
+                 * We want to reuse @root whose lifetime is governed by its
+                 * ->cgrp.  Let's check whether @root is alive and keep it
+                 * that way.  As cgroup_kill_sb() can happen anytime, we
+                 * want to block it by pinning the sb so that @root doesn't
+                 * get killed before mount is complete.
+                 *
+                 * With the sb pinned, tryget_live can reliably indicate
+                 * whether @root can be reused.  If it's being killed,
+                 * drain it.  We can use wait_queue for the wait but this
+                 * path is super cold.  Let's just sleep a bit and retry.
                  */
-                if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+                pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+                if (IS_ERR(pinned_sb) ||
+                    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
                         mutex_unlock(&cgroup_mutex);
+                        if (!IS_ERR_OR_NULL(pinned_sb))
+                                deactivate_super(pinned_sb);
                         msleep(10);
                         ret = restart_syscall();
                         goto out_free;
@@ -1770,6 +1802,16 @@ out_free:
                                 CGROUP_SUPER_MAGIC, &new_sb);
         if (IS_ERR(dentry) || !new_sb)
                 cgroup_put(&root->cgrp);
+
+        /*
+         * If @pinned_sb, we're reusing an existing root and holding an
+         * extra ref on its sb.  Mount is complete.  Put the extra ref.
+         */
+        if (pinned_sb) {
+                WARN_ON(new_sb);
+                deactivate_super(pinned_sb);
+        }
+
         return dentry;
 }
 
@@ -3328,7 +3370,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
 
         rcu_read_lock();
         css_for_each_child(child, css) {
-                if (css->flags & CSS_ONLINE) {
+                if (child->flags & CSS_ONLINE) {
                         ret = true;
                         break;
                 }
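
The drain-and-retry pattern above is worth calling out: because cgroup root destruction is asynchronous, mount simply backs off and restarts the whole syscall when it races with a dying root. A minimal sketch of the idiom, assuming cgroup_mutex is held and obj is a hypothetical percpu-refcounted object:

    /* Back-off idiom used in cgroup_mount() above (obj is illustrative). */
    if (!percpu_ref_tryget_live(&obj->refcnt)) {
            mutex_unlock(&cgroup_mutex);
            msleep(10);                  /* let the destruction finish */
            return restart_syscall();    /* re-enter the mount from scratch */
    }
    /* got a live ref; drop it once the pin is no longer needed */
    percpu_ref_put(&obj->refcnt);

The sleep is acceptable because this path is, as the comment says, "super cold".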
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 019d45008448..5664985c46a0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/hardirq.h>
 #include <linux/export.h>
+#include <linux/kprobes.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/context_tracking.h>
@@ -104,6 +105,7 @@ void context_tracking_user_enter(void)
         }
         local_irq_restore(flags);
 }
+NOKPROBE_SYMBOL(context_tracking_user_enter);
 
 #ifdef CONFIG_PREEMPT
 /**
@@ -181,6 +183,7 @@ void context_tracking_user_exit(void)
         }
         local_irq_restore(flags);
 }
+NOKPROBE_SYMBOL(context_tracking_user_exit);
 
 /**
  * __context_tracking_task_switch - context switch the syscall callbacks
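
NOKPROBE_SYMBOL() records a function in the kprobe blacklist so kprobes will refuse to probe it; context_tracking_user_enter/exit run on the user/kernel boundary, where a recursive kprobe trap would be unsafe. A hedged sketch of the usage pattern (my_boundary_hook is a made-up name):

    #include <linux/kprobes.h>

    static void my_boundary_hook(void)
    {
            /* runs while context-tracking state is being switched */
    }
    NOKPROBE_SYMBOL(my_boundary_hook);  /* keep kprobes out of this path */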
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f6b33c696224..116a4164720a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1181,7 +1181,13 @@ done:
 
 int current_cpuset_is_being_rebound(void)
 {
-        return task_cs(current) == cpuset_being_rebound;
+        int ret;
+
+        rcu_read_lock();
+        ret = task_cs(current) == cpuset_being_rebound;
+        rcu_read_unlock();
+
+        return ret;
 }
 
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1617,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
          * resources, wait for the previously scheduled operations before
          * proceeding, so that we don't end up keep removing tasks added
          * after execution capability is restored.
+         *
+         * cpuset_hotplug_work calls back into cgroup core via
+         * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+         * operation like this one can lead to a deadlock through kernfs
+         * active_ref protection.  Let's break the protection.  Losing the
+         * protection is okay as we check whether @cs is online after
+         * grabbing cpuset_mutex anyway.  This only happens on the legacy
+         * hierarchies.
          */
+        css_get(&cs->css);
+        kernfs_break_active_protection(of->kn);
         flush_work(&cpuset_hotplug_work);
 
         mutex_lock(&cpuset_mutex);
@@ -1645,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
         free_trial_cpuset(trialcs);
 out_unlock:
         mutex_unlock(&cpuset_mutex);
+        kernfs_unbreak_active_protection(of->kn);
+        css_put(&cs->css);
         return retval ?: nbytes;
 }
 
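
The current_cpuset_is_being_rebound() change is the standard fix for calling an RCU-protected accessor (task_cs() dereferences the task's css set) outside a read-side critical section: take rcu_read_lock() around the dereference and return a plain value computed inside it. As a generic sketch, with some_ptr as an illustrative RCU-protected pointer:

    rcu_read_lock();
    ret = rcu_dereference(some_ptr) == expected;  /* deref only under the lock */
    rcu_read_unlock();

    return ret;   /* the plain int survives after the critical section ends */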
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5fa58e4cffac..a33d9a2bcbd7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -40,6 +40,7 @@
 #include <linux/mm_types.h>
 #include <linux/cgroup.h>
 #include <linux/module.h>
+#include <linux/mman.h>
 
 #include "internal.h"
 
@@ -5128,6 +5129,7 @@ struct perf_mmap_event {
         int             maj, min;
         u64             ino;
         u64             ino_generation;
+        u32             prot, flags;
 
         struct {
                 struct perf_event_header        header;
@@ -5169,6 +5171,8 @@ static void perf_event_mmap_output(struct perf_event *event,
                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
+                mmap_event->event_id.header.size += sizeof(mmap_event->prot);
+                mmap_event->event_id.header.size += sizeof(mmap_event->flags);
         }
 
         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
@@ -5187,6 +5191,8 @@ static void perf_event_mmap_output(struct perf_event *event,
                 perf_output_put(&handle, mmap_event->min);
                 perf_output_put(&handle, mmap_event->ino);
                 perf_output_put(&handle, mmap_event->ino_generation);
+                perf_output_put(&handle, mmap_event->prot);
+                perf_output_put(&handle, mmap_event->flags);
         }
 
         __output_copy(&handle, mmap_event->file_name,
@@ -5205,6 +5211,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
         struct file *file = vma->vm_file;
         int maj = 0, min = 0;
         u64 ino = 0, gen = 0;
+        u32 prot = 0, flags = 0;
         unsigned int size;
         char tmp[16];
         char *buf = NULL;
@@ -5235,6 +5242,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
                 gen = inode->i_generation;
                 maj = MAJOR(dev);
                 min = MINOR(dev);
+
+                if (vma->vm_flags & VM_READ)
+                        prot |= PROT_READ;
+                if (vma->vm_flags & VM_WRITE)
+                        prot |= PROT_WRITE;
+                if (vma->vm_flags & VM_EXEC)
+                        prot |= PROT_EXEC;
+
+                if (vma->vm_flags & VM_MAYSHARE)
+                        flags = MAP_SHARED;
+                else
+                        flags = MAP_PRIVATE;
+
+                if (vma->vm_flags & VM_DENYWRITE)
+                        flags |= MAP_DENYWRITE;
+                if (vma->vm_flags & VM_MAYEXEC)
+                        flags |= MAP_EXECUTABLE;
+                if (vma->vm_flags & VM_LOCKED)
+                        flags |= MAP_LOCKED;
+                if (vma->vm_flags & VM_HUGETLB)
+                        flags |= MAP_HUGETLB;
+
                 goto got_name;
         } else {
                 name = (char *)arch_vma_name(vma);
@@ -5275,6 +5304,8 @@ got_name:
         mmap_event->min = min;
         mmap_event->ino = ino;
         mmap_event->ino_generation = gen;
+        mmap_event->prot = prot;
+        mmap_event->flags = flags;
 
         if (!(vma->vm_flags & VM_EXEC))
                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
@@ -5315,6 +5346,8 @@ void perf_event_mmap(struct vm_area_struct *vma)
                 /* .min (attr_mmap2 only) */
                 /* .ino (attr_mmap2 only) */
                 /* .ino_generation (attr_mmap2 only) */
+                /* .prot (attr_mmap2 only) */
+                /* .flags (attr_mmap2 only) */
         };
 
         perf_event_mmap_event(&mmap_event);
@@ -6897,10 +6930,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
         if (ret)
                 return -EFAULT;
 
-        /* disabled for now */
-        if (attr->mmap2)
-                return -EINVAL;
-
         if (attr->__reserved_1)
                 return -EINVAL;
 
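
The new prot/flags fields translate the VM_* bits in vma->vm_flags back into the PROT_* and MAP_* constants userspace passed to mmap(), which is why <linux/mman.h> is now included; with the fields populated, the final hunk drops the "disabled for now" guard and lets PERF_RECORD_MMAP2 be requested. A condensed sketch of the protection half of that mapping, wrapped in a hypothetical helper:

    static u32 vma_prot_bits(unsigned long vm_flags)
    {
            u32 prot = 0;   /* same logic as the hunk above */

            if (vm_flags & VM_READ)
                    prot |= PROT_READ;
            if (vm_flags & VM_WRITE)
                    prot |= PROT_WRITE;
            if (vm_flags & VM_EXEC)
                    prot |= PROT_EXEC;
            return prot;
    }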
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c445e392e93f..6f3254e8c137 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -846,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
 {
         int err;
 
-        if (!consumer_del(uprobe, uc))  /* WARN? */
+        if (WARN_ON(!consumer_del(uprobe, uc)))
                 return;
 
         err = register_for_each_vma(uprobe, NULL);
@@ -927,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
         int ret = -ENOENT;
 
         uprobe = find_uprobe(inode, offset);
-        if (!uprobe)
+        if (WARN_ON(!uprobe))
                 return ret;
 
         down_write(&uprobe->register_rwsem);
@@ -952,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
         struct uprobe *uprobe;
 
         uprobe = find_uprobe(inode, offset);
-        if (!uprobe)
+        if (WARN_ON(!uprobe))
                 return;
 
         down_write(&uprobe->register_rwsem);
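
Folding the check into WARN_ON() keeps the early return while making the "should never happen" case loud. The idiom relies on WARN_ON(cond) evaluating to the truth value of cond, so it can sit directly in the condition:

    uprobe = find_uprobe(inode, offset);
    if (WARN_ON(!uprobe))   /* emits a one-shot warning splat and still bails */
            return;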
diff --git a/kernel/fork.c b/kernel/fork.c
index d2799d1fc952..6a13c46cd87d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
         total_forks++;
         spin_unlock(&current->sighand->siglock);
+        syscall_tracepoint_update(p);
         write_unlock_irq(&tasklist_lock);
+
         proc_fork_connector(p);
         cgroup_post_fork(p);
         if (clone_flags & CLONE_THREAD)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 7339e42a85ab..1487a123db5c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -455,9 +455,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
  */
 void irq_free_hwirqs(unsigned int from, int cnt)
 {
-        int i;
+        int i, j;
 
-        for (i = from; cnt > 0; i++, cnt--) {
+        for (i = from, j = cnt; j > 0; i++, j--) {
                 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
                 arch_teardown_hwirq(i);
         }
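
The point of the j copy is that cnt itself is no longer consumed by the loop; per the upstream changelog this matters because irq_free_hwirqs() goes on to pass cnt to irq_free_descs(from, cnt), which previously received 0 and leaked the descriptors. A sketch of the fixed shape (the trailing call is reconstructed from that changelog, not shown in the hunk):

    void irq_free_hwirqs(unsigned int from, int cnt)
    {
            int i, j;

            for (i = from, j = cnt; j > 0; i++, j--) {
                    irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
                    arch_teardown_hwirq(i);
            }
            irq_free_descs(from, cnt);   /* cnt is still intact here */
    }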
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6748688813d0..369f41a94124 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 #ifdef CONFIG_MEMORY_FAILURE
         VMCOREINFO_NUMBER(PG_hwpoison);
 #endif
+        VMCOREINFO_NUMBER(PG_head_mask);
         VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
 
         arch_crash_save_vmcoreinfo();
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..be9ee1559fca 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,21 +14,47 @@
  * called from interrupt context and we have preemption disabled while
  * spinning.
  */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
+
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+        return cpu_nr + 1;
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+        int cpu_nr = encoded_cpu_val - 1;
+
+        return per_cpu_ptr(&osq_node, cpu_nr);
+}
 
 /*
  * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
  * Can return NULL in case we were the last queued and we updated @lock instead.
  */
-static inline struct optimistic_spin_queue *
-osq_wait_next(struct optimistic_spin_queue **lock,
-              struct optimistic_spin_queue *node,
-              struct optimistic_spin_queue *prev)
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_queue *lock,
+              struct optimistic_spin_node *node,
+              struct optimistic_spin_node *prev)
 {
-        struct optimistic_spin_queue *next = NULL;
+        struct optimistic_spin_node *next = NULL;
+        int curr = encode_cpu(smp_processor_id());
+        int old;
+
+        /*
+         * If there is a prev node in queue, then the 'old' value will be
+         * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
+         * we're currently last in queue, then the queue will then become empty.
+         */
+        old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
 
         for (;;) {
-                if (*lock == node && cmpxchg(lock, node, prev) == node) {
+                if (atomic_read(&lock->tail) == curr &&
+                    atomic_cmpxchg(&lock->tail, curr, old) == curr) {
                         /*
                          * We were the last queued, we moved @lock back. @prev
                          * will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_queue **lock,
         return next;
 }
 
-bool osq_lock(struct optimistic_spin_queue **lock)
+bool osq_lock(struct optimistic_spin_queue *lock)
 {
-        struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
-        struct optimistic_spin_queue *prev, *next;
+        struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+        struct optimistic_spin_node *prev, *next;
+        int curr = encode_cpu(smp_processor_id());
+        int old;
 
         node->locked = 0;
         node->next = NULL;
+        node->cpu = curr;
 
-        node->prev = prev = xchg(lock, node);
-        if (likely(prev == NULL))
+        old = atomic_xchg(&lock->tail, curr);
+        if (old == OSQ_UNLOCKED_VAL)
                 return true;
 
+        prev = decode_cpu(old);
+        node->prev = prev;
         ACCESS_ONCE(prev->next) = node;
 
         /*
@@ -149,20 +180,21 @@ unqueue:
         return false;
 }
 
-void osq_unlock(struct optimistic_spin_queue **lock)
+void osq_unlock(struct optimistic_spin_queue *lock)
 {
-        struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
-        struct optimistic_spin_queue *next;
+        struct optimistic_spin_node *node, *next;
+        int curr = encode_cpu(smp_processor_id());
 
         /*
          * Fast path for the uncontended case.
          */
-        if (likely(cmpxchg(lock, node, NULL) == node))
+        if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
                 return;
 
         /*
         * Second most likely case.
         */
+        node = this_cpu_ptr(&osq_node);
         next = xchg(&node->next, NULL);
         if (next) {
                 ACCESS_ONCE(next->locked) = 1;
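
With the tail stored as an encoded CPU number instead of a pointer, the lock word shrinks to a single atomic_t; elsewhere in this series the queue head is reduced to (roughly) the following, with 0 reserved for "unlocked":

    struct optimistic_spin_queue {
            /*
             * Stores an encoded value of the CPU # of the tail node in the
             * queue; OSQ_UNLOCKED_VAL (0) means no CPU is queued.
             */
            atomic_t tail;
    };

    #define OSQ_UNLOCKED_VAL (0)

The per-CPU node is then recovered from the encoded value via decode_cpu() above, so the queue links never need to live in the lock word itself.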
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..74356dc0ce29 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
  * mutex_lock()/rwsem_down_{read,write}() etc.
  */
 
-struct optimistic_spin_queue {
-        struct optimistic_spin_queue *next, *prev;
+struct optimistic_spin_node {
+        struct optimistic_spin_node *next, *prev;
         int locked; /* 1 if lock acquired */
+        int cpu; /* encoded CPU # value */
 };
 
-extern bool osq_lock(struct optimistic_spin_queue **lock);
-extern void osq_unlock(struct optimistic_spin_queue **lock);
+extern bool osq_lock(struct optimistic_spin_queue *lock);
+extern void osq_unlock(struct optimistic_spin_queue *lock);
 
 #endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 11b103d87b27..d3100521388c 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -54,7 +54,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
         INIT_LIST_HEAD(&lock->wait_list);
         mutex_clear_owner(lock);
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-        lock->osq = NULL;
+        osq_lock_init(&lock->osq);
 #endif
 
         debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
         unsigned long flags;
 
         if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
-                ret = (sem->activity != 0);
+                ret = (sem->count != 0);
                 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
         }
         return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
         debug_check_no_locks_freed((void *)sem, sizeof(*sem));
         lockdep_init_map(&sem->dep_map, name, key, 0);
 #endif
-        sem->activity = 0;
+        sem->count = 0;
         raw_spin_lock_init(&sem->wait_lock);
         INIT_LIST_HEAD(&sem->wait_list);
 }
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
                 waiter = list_entry(next, struct rwsem_waiter, list);
         } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
 
-        sem->activity += woken;
+        sem->count += woken;
 
  out:
         return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
 
         raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-        if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+        if (sem->count >= 0 && list_empty(&sem->wait_list)) {
                 /* granted */
-                sem->activity++;
+                sem->count++;
                 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
                 goto out;
         }
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
 
         raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-        if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+        if (sem->count >= 0 && list_empty(&sem->wait_list)) {
                 /* granted */
-                sem->activity++;
+                sem->count++;
                 ret = 1;
         }
 
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
          * itself into sleep and waiting for system woke it or someone
          * else in the head of the wait list up.
          */
-        if (sem->activity == 0)
+        if (sem->count == 0)
                 break;
         set_task_state(tsk, TASK_UNINTERRUPTIBLE);
         raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
                 raw_spin_lock_irqsave(&sem->wait_lock, flags);
         }
         /* got the lock */
-        sem->activity = -1;
+        sem->count = -1;
         list_del(&waiter.list);
 
         raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
 
         raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-        if (sem->activity == 0) {
+        if (sem->count == 0) {
                 /* got the lock */
-                sem->activity = -1;
+                sem->count = -1;
                 ret = 1;
         }
 
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
 
         raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-        if (--sem->activity == 0 && !list_empty(&sem->wait_list))
+        if (--sem->count == 0 && !list_empty(&sem->wait_list))
                 sem = __rwsem_wake_one_writer(sem);
 
         raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
 
         raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-        sem->activity = 0;
+        sem->count = 0;
         if (!list_empty(&sem->wait_list))
                 sem = __rwsem_do_wake(sem, 1);
 
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
 
         raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-        sem->activity = 1;
+        sem->count = 1;
         if (!list_empty(&sem->wait_list))
                 sem = __rwsem_do_wake(sem, 0);
 
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc32142fcc..a2391ac135c8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -82,9 +82,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
         sem->count = RWSEM_UNLOCKED_VALUE;
         raw_spin_lock_init(&sem->wait_lock);
         INIT_LIST_HEAD(&sem->wait_list);
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
         sem->owner = NULL;
-        sem->osq = NULL;
+        osq_lock_init(&sem->osq);
 #endif
 }
 
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
         return false;
 }
 
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 /*
  * Try to acquire write lock before the writer has been put on wait queue.
  */
@@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 {
         struct task_struct *owner;
-        bool on_cpu = true;
+        bool on_cpu = false;
 
         if (need_resched())
-                return 0;
+                return false;
 
         rcu_read_lock();
         owner = ACCESS_ONCE(sem->owner);
@@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
         rcu_read_unlock();
 
         /*
-         * If sem->owner is not set, the rwsem owner may have
-         * just acquired it and not set the owner yet or the rwsem
-         * has been released.
+         * If sem->owner is not set, yet we have just recently entered the
+         * slowpath, then there is a possibility reader(s) may have the lock.
+         * To be safe, avoid spinning in these situations.
          */
         return on_cpu;
 }
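
Flipping the on_cpu default to false makes the conservative case explicit: if sem->owner is NULL the lock may be held by readers (which never set owner), so the would-be spinner bails out instead of optimistically spinning. The elided context presumably only overrides the default when a writer owner is visibly running, roughly:

    rcu_read_lock();
    owner = ACCESS_ONCE(sem->owner);
    if (owner)
            on_cpu = owner->on_cpu;   /* spin only on a running writer */
    rcu_read_unlock();

    return on_cpu;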
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 42f806de49d4..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,7 +12,7 @@
 
 #include <linux/atomic.h>
 
-#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 static inline void rwsem_set_owner(struct rw_semaphore *sem)
 {
         sem->owner = current;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 49e0a20fd010..fcc2611d3f14 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -35,6 +35,7 @@
 
 static int nocompress;
 static int noresume;
+static int nohibernate;
 static int resume_wait;
 static unsigned int resume_delay;
 static char resume_file[256] = CONFIG_PM_STD_PARTITION;
@@ -62,6 +63,11 @@ bool freezer_test_done;
 
 static const struct platform_hibernation_ops *hibernation_ops;
 
+bool hibernation_available(void)
+{
+        return (nohibernate == 0);
+}
+
 /**
  * hibernation_set_ops - Set the global hibernate operations.
  * @ops: Hibernation operations to use in subsequent hibernation transitions.
@@ -642,6 +648,11 @@ int hibernate(void)
 {
         int error;
 
+        if (!hibernation_available()) {
+                pr_debug("PM: Hibernation not available.\n");
+                return -EPERM;
+        }
+
         lock_system_sleep();
         /* The snapshot device should not be opened while we're running */
         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -734,7 +745,7 @@ static int software_resume(void)
         /*
          * If the user said "noresume".. bail out early.
          */
-        if (noresume)
+        if (noresume || !hibernation_available())
                 return 0;
 
         /*
@@ -900,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
         int i;
         char *start = buf;
 
+        if (!hibernation_available())
+                return sprintf(buf, "[disabled]\n");
+
         for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
                 if (!hibernation_modes[i])
                         continue;
@@ -934,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
         char *p;
         int mode = HIBERNATION_INVALID;
 
+        if (!hibernation_available())
+                return -EPERM;
+
         p = memchr(buf, '\n', n);
         len = p ? p - buf : n;
 
@@ -1101,6 +1118,10 @@ static int __init hibernate_setup(char *str)
                 noresume = 1;
         else if (!strncmp(str, "nocompress", 10))
                 nocompress = 1;
+        else if (!strncmp(str, "no", 2)) {
+                noresume = 1;
+                nohibernate = 1;
+        }
         return 1;
 }
 
@@ -1125,9 +1146,23 @@ static int __init resumedelay_setup(char *str)
         return 1;
 }
 
+static int __init nohibernate_setup(char *str)
+{
+        noresume = 1;
+        nohibernate = 1;
+        return 1;
+}
+
+static int __init kaslr_nohibernate_setup(char *str)
+{
+        return nohibernate_setup(str);
+}
+
 __setup("noresume", noresume_setup);
 __setup("resume_offset=", resume_offset_setup);
 __setup("resume=", resume_setup);
 __setup("hibernate=", hibernate_setup);
 __setup("resumewait", resumewait_setup);
 __setup("resumedelay=", resumedelay_setup);
+__setup("nohibernate", nohibernate_setup);
+__setup("kaslr", kaslr_nohibernate_setup);
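
Net effect of the setup hooks: hibernation can now be switched off from the kernel command line, and booting with KASLR does it implicitly (a base-address-randomized kernel cannot safely resume an image laid out for a different address). Illustrative boot parameters under this patch:

    nohibernate      # disable hibernation and resume support
    kaslr            # randomize the kernel base; implies nohibernate
    hibernate=no     # same effect via the existing hibernate= option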
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 573410d6647e..8e90f330f139 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -300,13 +300,11 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
                 s += sprintf(s,"%s ", pm_states[i].label);
 
 #endif
-#ifdef CONFIG_HIBERNATION
-        s += sprintf(s, "%s\n", "disk");
-#else
+        if (hibernation_available())
+                s += sprintf(s, "disk ");
         if (s != buf)
                 /* convert the last space to a newline */
                 *(s-1) = '\n';
-#endif
         return (s - buf);
 }
 
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 98d357584cd6..526e8911460a 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
         struct snapshot_data *data;
         int error;
 
+        if (!hibernation_available())
+                return -EPERM;
+
         lock_system_sleep();
 
         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ea2d5f6962ed..13e839dbca07 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1416,9 +1416,10 @@ static int have_callable_console(void)
 /*
  * Can we actually use the console at this time on this cpu?
  *
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
+ * Console drivers may assume that per-cpu resources have
+ * been allocated. So unless they're explicitly marked as
+ * being able to cope (CON_ANYTIME) don't call them until
+ * this CPU is officially up.
  */
 static inline int can_use_console(unsigned int cpu)
 {
@@ -1431,10 +1432,8 @@ static inline int can_use_console(unsigned int cpu)
  * console_lock held, and 'console_locked' set) if it
  * is successful, false otherwise.
  */
-static int console_trylock_for_printk(void)
+static int console_trylock_for_printk(unsigned int cpu)
 {
-        unsigned int cpu = smp_processor_id();
-
         if (!console_trylock())
                 return 0;
         /*
@@ -1609,8 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level,
                  */
                 if (!oops_in_progress && !lockdep_recursing(current)) {
                         recursion_bug = 1;
-                        local_irq_restore(flags);
-                        return 0;
+                        goto out_restore_irqs;
                 }
                 zap_locks();
         }
@@ -1718,27 +1716,21 @@ asmlinkage int vprintk_emit(int facility, int level,
 
         logbuf_cpu = UINT_MAX;
         raw_spin_unlock(&logbuf_lock);
-        lockdep_on();
-        local_irq_restore(flags);
 
         /* If called from the scheduler, we can not call up(). */
-        if (in_sched)
-                return printed_len;
-
-        /*
-         * Disable preemption to avoid being preempted while holding
-         * console_sem which would prevent anyone from printing to console
-         */
-        preempt_disable();
-        /*
-         * Try to acquire and then immediately release the console semaphore.
-         * The release will print out buffers and wake up /dev/kmsg and syslog()
-         * users.
-         */
-        if (console_trylock_for_printk())
-                console_unlock();
-        preempt_enable();
+        if (!in_sched) {
+                /*
+                 * Try to acquire and then immediately release the console
+                 * semaphore.  The release will print out buffers and wake up
+                 * /dev/kmsg and syslog() users.
+                 */
+                if (console_trylock_for_printk(this_cpu))
+                        console_unlock();
+        }
 
+        lockdep_on();
+out_restore_irqs:
+        local_irq_restore(flags);
         return printed_len;
 }
 EXPORT_SYMBOL(vprintk_emit);
diff --git a/kernel/smp.c b/kernel/smp.c
index 306f8180b0d5..80c33f8de14f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
 
+static void flush_smp_call_function_queue(bool warn_cpu_offline);
+
 static int
 hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
@@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 #ifdef CONFIG_HOTPLUG_CPU
         case CPU_UP_CANCELED:
         case CPU_UP_CANCELED_FROZEN:
+                /* Fall-through to the CPU_DEAD[_FROZEN] case. */
 
         case CPU_DEAD:
         case CPU_DEAD_FROZEN:
                 free_cpumask_var(cfd->cpumask);
                 free_percpu(cfd->csd);
                 break;
+
+        case CPU_DYING:
+        case CPU_DYING_FROZEN:
+                /*
+                 * The IPIs for the smp-call-function callbacks queued by other
+                 * CPUs might arrive late, either due to hardware latencies or
+                 * because this CPU disabled interrupts (inside stop-machine)
+                 * before the IPIs were sent. So flush out any pending callbacks
+                 * explicitly (without waiting for the IPIs to arrive), to
+                 * ensure that the outgoing CPU doesn't go offline with work
+                 * still pending.
+                 */
+                flush_smp_call_function_queue(false);
+                break;
 #endif
         };
 
@@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
         return 0;
 }
 
-/*
- * Invoked by arch to handle an IPI for call function single. Must be
- * called from the arch with interrupts disabled.
+/**
+ * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
+ *
+ * Invoked by arch to handle an IPI for call function single.
+ * Must be called with interrupts disabled.
  */
 void generic_smp_call_function_single_interrupt(void)
 {
+        flush_smp_call_function_queue(true);
+}
+
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ *
+ * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
+ *                    offline CPU. Skip this check if set to 'false'.
+ *
+ * Flush any pending smp-call-function callbacks queued on this CPU. This is
+ * invoked by the generic IPI handler, as well as by a CPU about to go offline,
+ * to ensure that all pending IPI callbacks are run before it goes completely
+ * offline.
+ *
+ * Loop through the call_single_queue and run all the queued callbacks.
+ * Must be called with interrupts disabled.
+ */
+static void flush_smp_call_function_queue(bool warn_cpu_offline)
+{
+        struct llist_head *head;
         struct llist_node *entry;
         struct call_single_data *csd, *csd_next;
         static bool warned;
 
-        entry = llist_del_all(&__get_cpu_var(call_single_queue));
+        WARN_ON(!irqs_disabled());
+
+        head = &__get_cpu_var(call_single_queue);
+        entry = llist_del_all(head);
         entry = llist_reverse_order(entry);
 
-        /*
-         * Shouldn't receive this interrupt on a cpu that is not yet online.
-         */
-        if (unlikely(!cpu_online(smp_processor_id()) && !warned)) {
+        /* There shouldn't be any pending callbacks on an offline CPU. */
+        if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
+                     !warned && !llist_empty(head))) {
                 warned = true;
                 WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
 
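
flush_smp_call_function_queue() centralizes the drain so the IPI handler and the dying CPU share one code path. The core of it is the usual lock-free llist idiom; a sketch, using the variables declared above:

    head = &__get_cpu_var(call_single_queue);
    entry = llist_del_all(head);           /* atomically detach everything */
    entry = llist_reverse_order(entry);    /* restore FIFO submission order */
    llist_for_each_entry_safe(csd, csd_next, entry, llist)
            csd->func(csd->info);          /* run each callback, IRQs off */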
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba9ed453c4ed..75b22e22a72c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
-static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 static const int cap_last_cap = CAP_LAST_CAP;
@@ -152,10 +151,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
 #ifdef CONFIG_SPARC
 #endif
 
-#ifdef CONFIG_SPARC64
-extern int sysctl_tsb_ratio;
-#endif
-
 #ifdef __hppa__
 extern int pwrsw_enabled;
 #endif
@@ -865,6 +860,17 @@ static struct ctl_table kern_table[] = {
                 .extra1         = &zero,
                 .extra2         = &one,
         },
+#ifdef CONFIG_SMP
+        {
+                .procname       = "softlockup_all_cpu_backtrace",
+                .data           = &sysctl_softlockup_all_cpu_backtrace,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &zero,
+                .extra2         = &one,
+        },
+#endif /* CONFIG_SMP */
         {
                 .procname       = "nmi_watchdog",
                 .data           = &watchdog_user_enabled,
@@ -1321,7 +1327,7 @@ static struct ctl_table vm_table[] = {
                 .maxlen         = sizeof(percpu_pagelist_fraction),
                 .mode           = 0644,
                 .proc_handler   = percpu_pagelist_fraction_sysctl_handler,
-                .extra1         = &min_percpu_pagelist_fract,
+                .extra1         = &zero,
         },
 #ifdef CONFIG_MMU
         {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 384ede311717..f243444a3772 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1396,7 +1396,6 @@ void tracing_start(void)
 
         arch_spin_unlock(&global_trace.max_lock);
 
-        ftrace_start();
  out:
         raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
 }
@@ -1443,7 +1442,6 @@ void tracing_stop(void)
         struct ring_buffer *buffer;
         unsigned long flags;
 
-        ftrace_stop();
         raw_spin_lock_irqsave(&global_trace.start_lock, flags);
         if (global_trace.stop_count++)
                 goto out;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 04fdb5de823c..3c9b97e6b1f4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -893,6 +893,9 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
         int ret;
 
         if (file) {
+                if (tu->tp.flags & TP_FLAG_PROFILE)
+                        return -EINTR;
+
                 link = kmalloc(sizeof(*link), GFP_KERNEL);
                 if (!link)
                         return -ENOMEM;
@@ -901,29 +904,40 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
                 list_add_tail_rcu(&link->list, &tu->tp.files);
 
                 tu->tp.flags |= TP_FLAG_TRACE;
-        } else
-                tu->tp.flags |= TP_FLAG_PROFILE;
+        } else {
+                if (tu->tp.flags & TP_FLAG_TRACE)
+                        return -EINTR;
 
-        ret = uprobe_buffer_enable();
-        if (ret < 0)
-                return ret;
+                tu->tp.flags |= TP_FLAG_PROFILE;
+        }
 
         WARN_ON(!uprobe_filter_is_empty(&tu->filter));
 
         if (enabled)
                 return 0;
 
+        ret = uprobe_buffer_enable();
+        if (ret)
+                goto err_flags;
+
         tu->consumer.filter = filter;
         ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
-        if (ret) {
-                if (file) {
-                        list_del(&link->list);
-                        kfree(link);
-                        tu->tp.flags &= ~TP_FLAG_TRACE;
-                } else
-                        tu->tp.flags &= ~TP_FLAG_PROFILE;
-        }
+        if (ret)
+                goto err_buffer;
 
+        return 0;
+
+ err_buffer:
+        uprobe_buffer_disable();
+
+ err_flags:
+        if (file) {
+                list_del(&link->list);
+                kfree(link);
+                tu->tp.flags &= ~TP_FLAG_TRACE;
+        } else {
+                tu->tp.flags &= ~TP_FLAG_PROFILE;
+        }
         return ret;
 }
 
@@ -1201,12 +1215,6 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
 
         current->utask->vaddr = (unsigned long) &udd;
 
-#ifdef CONFIG_PERF_EVENTS
-        if ((tu->tp.flags & TP_FLAG_TRACE) == 0 &&
-            !uprobe_perf_filter(&tu->consumer, 0, current->mm))
-                return UPROBE_HANDLER_REMOVE;
-#endif
-
         if (WARN_ON_ONCE(!uprobe_cpu_buffer))
                 return 0;
 
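
The rewrite converts the enable path to the standard goto-unwind shape: resources are released in reverse order of acquisition (buffer first, then the flags/link set up earlier), and the TRACE/PROFILE exclusivity check now fails fast with -EINTR before anything is allocated. The generic form of the idiom, with illustrative names:

    ret = acquire_a();
    if (ret)
            goto err_none;
    ret = acquire_b();
    if (ret)
            goto err_a;     /* unwind only what has been acquired */
    return 0;

    err_a:
            release_a();
    err_none:
            return ret;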
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 33cbd8c203f8..3490407dc7b7 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -492,33 +492,29 @@ static int sys_tracepoint_refcount;
 
 void syscall_regfunc(void)
 {
-        unsigned long flags;
-        struct task_struct *g, *t;
+        struct task_struct *p, *t;
 
         if (!sys_tracepoint_refcount) {
-                read_lock_irqsave(&tasklist_lock, flags);
-                do_each_thread(g, t) {
-                        /* Skip kernel threads. */
-                        if (t->mm)
-                                set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
-                } while_each_thread(g, t);
-                read_unlock_irqrestore(&tasklist_lock, flags);
+                read_lock(&tasklist_lock);
+                for_each_process_thread(p, t) {
+                        set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+                }
+                read_unlock(&tasklist_lock);
         }
         sys_tracepoint_refcount++;
 }
 
 void syscall_unregfunc(void)
 {
-        unsigned long flags;
-        struct task_struct *g, *t;
+        struct task_struct *p, *t;
 
         sys_tracepoint_refcount--;
         if (!sys_tracepoint_refcount) {
-                read_lock_irqsave(&tasklist_lock, flags);
-                do_each_thread(g, t) {
+                read_lock(&tasklist_lock);
+                for_each_process_thread(p, t) {
                         clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
-                } while_each_thread(g, t);
-                read_unlock_irqrestore(&tasklist_lock, flags);
+                }
+                read_unlock(&tasklist_lock);
         }
 }
 #endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 516203e665fc..c3319bd1b040 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,12 @@
 
 int watchdog_user_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
+#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+#else
+#define sysctl_softlockup_all_cpu_backtrace 0
+#endif
+
 static int __read_mostly watchdog_running;
 static u64 __read_mostly sample_period;
 
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
+static unsigned long soft_lockup_nmi_warn;
 
 /* boot commands */
 /*
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str)
 }
 __setup("nosoftlockup", nosoftlockup_setup);
 /*  */
+#ifdef CONFIG_SMP
+static int __init softlockup_all_cpu_backtrace_setup(char *str)
+{
+        sysctl_softlockup_all_cpu_backtrace =
+                !!simple_strtol(str, NULL, 0);
+        return 1;
+}
+__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#endif
 
 /*
  * Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
         unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
         struct pt_regs *regs = get_irq_regs();
         int duration;
+        int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
         /* kick the hardlockup detector */
         watchdog_interrupt_count();
@@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
                 if (__this_cpu_read(soft_watchdog_warn) == true)
                         return HRTIMER_RESTART;
 
+                if (softlockup_all_cpu_backtrace) {
+                        /* Prevent multiple soft-lockup reports if one cpu is already
+                         * engaged in dumping cpu back traces
+                         */
+                        if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
+                                /* Someone else will report us. Let's give up */
+                                __this_cpu_write(soft_watchdog_warn, true);
+                                return HRTIMER_RESTART;
+                        }
+                }
+
                 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                         smp_processor_id(), duration,
                         current->comm, task_pid_nr(current));
@@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
                 else
                         dump_stack();
 
+                if (softlockup_all_cpu_backtrace) {
+                        /* Avoid generating two back traces for current
+                         * given that one is already made above
+                         */
+                        trigger_allbutself_cpu_backtrace();
+
+                        clear_bit(0, &soft_lockup_nmi_warn);
+                        /* Barrier to sync with other cpus */
+                        smp_mb__after_atomic();
+                }
+
                 if (softlockup_panic)
                         panic("softlockup: hung tasks");
                 __this_cpu_write(soft_watchdog_warn, true);
@@ -527,10 +566,8 @@ static void update_timers_all_cpus(void)
         int cpu;
 
         get_online_cpus();
-        preempt_disable();
         for_each_online_cpu(cpu)
                 update_timers(cpu);
-        preempt_enable();
         put_online_cpus();
 }
 
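
soft_lockup_nmi_warn acts as a global "one reporter at a time" latch: test_and_set_bit() atomically elects the CPU that dumps all backtraces, and every other CPU defers to it. A minimal sketch of the pattern:

    if (test_and_set_bit(0, &soft_lockup_nmi_warn))
            return;                          /* another CPU is already reporting */

    trigger_allbutself_cpu_backtrace();      /* we won the race; do the dump */

    clear_bit(0, &soft_lockup_nmi_warn);
    smp_mb__after_atomic();                  /* order the release against other CPUs */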
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6203d2900877..35974ac69600 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3284,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
                 }
         }
 
+        dev_set_uevent_suppress(&wq_dev->dev, false);
         kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
         return 0;
 }
@@ -4879,7 +4880,7 @@ static void __init wq_numa_init(void)
         BUG_ON(!tbl);
 
         for_each_node(node)
-                BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
+                BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
                                 node_online(node) ? node : NUMA_NO_NODE));
 
         for_each_possible_cpu(cpu) {
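
The zalloc_cpumask_var_node() switch matters because the loop that follows only sets bits for the CPUs it visits; the z-variant hands back an already-zeroed mask, so untouched bits cannot contain allocator garbage. Sketch of the difference, using a hypothetical mask:

    cpumask_var_t mask;

    if (!zalloc_cpumask_var_node(&mask, GFP_KERNEL, node))
            return -ENOMEM;          /* mask starts out all-clear */
    cpumask_set_cpu(cpu, mask);      /* only bits set here are ever 1 */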