aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks9
-rw-r--r--kernel/cgroup.c58
-rw-r--r--kernel/context_tracking.c3
-rw-r--r--kernel/cpuset.c20
-rw-r--r--kernel/events/core.c71
-rw-r--r--kernel/events/uprobes.c6
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/irq/irqdesc.c4
-rw-r--r--kernel/kexec.c1
-rw-r--r--kernel/kprobes.c14
-rw-r--r--kernel/locking/mcs_spinlock.c64
-rw-r--r--kernel/locking/mcs_spinlock.h9
-rw-r--r--kernel/locking/mutex.c2
-rw-r--r--kernel/locking/rtmutex-debug.h5
-rw-r--r--kernel/locking/rtmutex.c243
-rw-r--r--kernel/locking/rtmutex.h5
-rw-r--r--kernel/locking/rwsem-spinlock.c28
-rw-r--r--kernel/locking/rwsem-xadd.c16
-rw-r--r--kernel/locking/rwsem.c2
-rw-r--r--kernel/power/Kconfig3
-rw-r--r--kernel/power/hibernate.c37
-rw-r--r--kernel/power/main.c31
-rw-r--r--kernel/power/power.h7
-rw-r--r--kernel/power/process.c1
-rw-r--r--kernel/power/snapshot.c494
-rw-r--r--kernel/power/suspend.c152
-rw-r--r--kernel/power/suspend_test.c12
-rw-r--r--kernel/power/user.c3
-rw-r--r--kernel/printk/printk.c44
-rw-r--r--kernel/rcu/tree.c140
-rw-r--r--kernel/rcu/tree.h6
-rw-r--r--kernel/rcu/tree_plugin.h2
-rw-r--r--kernel/rcu/update.c22
-rw-r--r--kernel/sched/core.c7
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/idle.c4
-rw-r--r--kernel/smp.c57
-rw-r--r--kernel/sysctl.c18
-rw-r--r--kernel/time/alarmtimer.c20
-rw-r--r--kernel/trace/ftrace.c4
-rw-r--r--kernel/trace/ring_buffer.c4
-rw-r--r--kernel/trace/trace.c22
-rw-r--r--kernel/trace/trace_clock.c9
-rw-r--r--kernel/trace/trace_events.c1
-rw-r--r--kernel/trace/trace_uprobe.c46
-rw-r--r--kernel/tracepoint.c26
-rw-r--r--kernel/watchdog.c41
-rw-r--r--kernel/workqueue.c3
48 files changed, 1296 insertions, 484 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 35536d9c0964..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,9 +220,16 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
220 220
221endif 221endif
222 222
223config ARCH_SUPPORTS_ATOMIC_RMW
224 bool
225
223config MUTEX_SPIN_ON_OWNER 226config MUTEX_SPIN_ON_OWNER
224 def_bool y 227 def_bool y
225 depends on SMP && !DEBUG_MUTEXES 228 depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
229
230config RWSEM_SPIN_ON_OWNER
231 def_bool y
232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
226 233
227config ARCH_USE_QUEUE_RWLOCK 234config ARCH_USE_QUEUE_RWLOCK
228 bool 235 bool
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7868fc3c0bc5..70776aec2562 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1648 int flags, const char *unused_dev_name, 1648 int flags, const char *unused_dev_name,
1649 void *data) 1649 void *data)
1650{ 1650{
1651 struct super_block *pinned_sb = NULL;
1652 struct cgroup_subsys *ss;
1651 struct cgroup_root *root; 1653 struct cgroup_root *root;
1652 struct cgroup_sb_opts opts; 1654 struct cgroup_sb_opts opts;
1653 struct dentry *dentry; 1655 struct dentry *dentry;
1654 int ret; 1656 int ret;
1657 int i;
1655 bool new_sb; 1658 bool new_sb;
1656 1659
1657 /* 1660 /*
@@ -1677,6 +1680,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1677 goto out_unlock; 1680 goto out_unlock;
1678 } 1681 }
1679 1682
1683 /*
1684 * Destruction of cgroup root is asynchronous, so subsystems may
1685 * still be dying after the previous unmount. Let's drain the
1686 * dying subsystems. We just need to ensure that the ones
1687 * unmounted previously finish dying and don't care about new ones
1688 * starting. Testing ref liveliness is good enough.
1689 */
1690 for_each_subsys(ss, i) {
1691 if (!(opts.subsys_mask & (1 << i)) ||
1692 ss->root == &cgrp_dfl_root)
1693 continue;
1694
1695 if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
1696 mutex_unlock(&cgroup_mutex);
1697 msleep(10);
1698 ret = restart_syscall();
1699 goto out_free;
1700 }
1701 cgroup_put(&ss->root->cgrp);
1702 }
1703
1680 for_each_root(root) { 1704 for_each_root(root) {
1681 bool name_match = false; 1705 bool name_match = false;
1682 1706
@@ -1717,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1717 } 1741 }
1718 1742
1719 /* 1743 /*
1720 * A root's lifetime is governed by its root cgroup. 1744 * We want to reuse @root whose lifetime is governed by its
1721 * tryget_live failure indicate that the root is being 1745 * ->cgrp. Let's check whether @root is alive and keep it
1722 * destroyed. Wait for destruction to complete so that the 1746 * that way. As cgroup_kill_sb() can happen anytime, we
1723 * subsystems are free. We can use wait_queue for the wait 1747 * want to block it by pinning the sb so that @root doesn't
1724 * but this path is super cold. Let's just sleep for a bit 1748 * get killed before mount is complete.
1725 * and retry. 1749 *
1750 * With the sb pinned, tryget_live can reliably indicate
1751 * whether @root can be reused. If it's being killed,
1752 * drain it. We can use wait_queue for the wait but this
1753 * path is super cold. Let's just sleep a bit and retry.
1726 */ 1754 */
1727 if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { 1755 pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
1756 if (IS_ERR(pinned_sb) ||
1757 !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1728 mutex_unlock(&cgroup_mutex); 1758 mutex_unlock(&cgroup_mutex);
1759 if (!IS_ERR_OR_NULL(pinned_sb))
1760 deactivate_super(pinned_sb);
1729 msleep(10); 1761 msleep(10);
1730 ret = restart_syscall(); 1762 ret = restart_syscall();
1731 goto out_free; 1763 goto out_free;
@@ -1770,6 +1802,16 @@ out_free:
1770 CGROUP_SUPER_MAGIC, &new_sb); 1802 CGROUP_SUPER_MAGIC, &new_sb);
1771 if (IS_ERR(dentry) || !new_sb) 1803 if (IS_ERR(dentry) || !new_sb)
1772 cgroup_put(&root->cgrp); 1804 cgroup_put(&root->cgrp);
1805
1806 /*
1807 * If @pinned_sb, we're reusing an existing root and holding an
1808 * extra ref on its sb. Mount is complete. Put the extra ref.
1809 */
1810 if (pinned_sb) {
1811 WARN_ON(new_sb);
1812 deactivate_super(pinned_sb);
1813 }
1814
1773 return dentry; 1815 return dentry;
1774} 1816}
1775 1817
@@ -3328,7 +3370,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
3328 3370
3329 rcu_read_lock(); 3371 rcu_read_lock();
3330 css_for_each_child(child, css) { 3372 css_for_each_child(child, css) {
3331 if (css->flags & CSS_ONLINE) { 3373 if (child->flags & CSS_ONLINE) {
3332 ret = true; 3374 ret = true;
3333 break; 3375 break;
3334 } 3376 }
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 019d45008448..5664985c46a0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -19,6 +19,7 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/kprobes.h>
22 23
23#define CREATE_TRACE_POINTS 24#define CREATE_TRACE_POINTS
24#include <trace/events/context_tracking.h> 25#include <trace/events/context_tracking.h>
@@ -104,6 +105,7 @@ void context_tracking_user_enter(void)
104 } 105 }
105 local_irq_restore(flags); 106 local_irq_restore(flags);
106} 107}
108NOKPROBE_SYMBOL(context_tracking_user_enter);
107 109
108#ifdef CONFIG_PREEMPT 110#ifdef CONFIG_PREEMPT
109/** 111/**
@@ -181,6 +183,7 @@ void context_tracking_user_exit(void)
181 } 183 }
182 local_irq_restore(flags); 184 local_irq_restore(flags);
183} 185}
186NOKPROBE_SYMBOL(context_tracking_user_exit);
184 187
185/** 188/**
186 * __context_tracking_task_switch - context switch the syscall callbacks 189 * __context_tracking_task_switch - context switch the syscall callbacks
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f6b33c696224..116a4164720a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1181,7 +1181,13 @@ done:
1181 1181
1182int current_cpuset_is_being_rebound(void) 1182int current_cpuset_is_being_rebound(void)
1183{ 1183{
1184 return task_cs(current) == cpuset_being_rebound; 1184 int ret;
1185
1186 rcu_read_lock();
1187 ret = task_cs(current) == cpuset_being_rebound;
1188 rcu_read_unlock();
1189
1190 return ret;
1185} 1191}
1186 1192
1187static int update_relax_domain_level(struct cpuset *cs, s64 val) 1193static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1617,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1617 * resources, wait for the previously scheduled operations before 1623 * resources, wait for the previously scheduled operations before
1618 * proceeding, so that we don't end up keep removing tasks added 1624 * proceeding, so that we don't end up keep removing tasks added
1619 * after execution capability is restored. 1625 * after execution capability is restored.
1626 *
1627 * cpuset_hotplug_work calls back into cgroup core via
1628 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
1629 * operation like this one can lead to a deadlock through kernfs
1630 * active_ref protection. Let's break the protection. Losing the
1631 * protection is okay as we check whether @cs is online after
1632 * grabbing cpuset_mutex anyway. This only happens on the legacy
1633 * hierarchies.
1620 */ 1634 */
1635 css_get(&cs->css);
1636 kernfs_break_active_protection(of->kn);
1621 flush_work(&cpuset_hotplug_work); 1637 flush_work(&cpuset_hotplug_work);
1622 1638
1623 mutex_lock(&cpuset_mutex); 1639 mutex_lock(&cpuset_mutex);
@@ -1645,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1645 free_trial_cpuset(trialcs); 1661 free_trial_cpuset(trialcs);
1646out_unlock: 1662out_unlock:
1647 mutex_unlock(&cpuset_mutex); 1663 mutex_unlock(&cpuset_mutex);
1664 kernfs_unbreak_active_protection(of->kn);
1665 css_put(&cs->css);
1648 return retval ?: nbytes; 1666 return retval ?: nbytes;
1649} 1667}
1650 1668
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5fa58e4cffac..6b17ac1b0c2a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -40,6 +40,7 @@
40#include <linux/mm_types.h> 40#include <linux/mm_types.h>
41#include <linux/cgroup.h> 41#include <linux/cgroup.h>
42#include <linux/module.h> 42#include <linux/module.h>
43#include <linux/mman.h>
43 44
44#include "internal.h" 45#include "internal.h"
45 46
@@ -2319,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2319 next_parent = rcu_dereference(next_ctx->parent_ctx); 2320 next_parent = rcu_dereference(next_ctx->parent_ctx);
2320 2321
2321 /* If neither context have a parent context; they cannot be clones. */ 2322 /* If neither context have a parent context; they cannot be clones. */
2322 if (!parent && !next_parent) 2323 if (!parent || !next_parent)
2323 goto unlock; 2324 goto unlock;
2324 2325
2325 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2326 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -5128,6 +5129,7 @@ struct perf_mmap_event {
5128 int maj, min; 5129 int maj, min;
5129 u64 ino; 5130 u64 ino;
5130 u64 ino_generation; 5131 u64 ino_generation;
5132 u32 prot, flags;
5131 5133
5132 struct { 5134 struct {
5133 struct perf_event_header header; 5135 struct perf_event_header header;
@@ -5169,6 +5171,8 @@ static void perf_event_mmap_output(struct perf_event *event,
5169 mmap_event->event_id.header.size += sizeof(mmap_event->min); 5171 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5170 mmap_event->event_id.header.size += sizeof(mmap_event->ino); 5172 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5171 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); 5173 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5174 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5175 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5172 } 5176 }
5173 5177
5174 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 5178 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
@@ -5187,6 +5191,8 @@ static void perf_event_mmap_output(struct perf_event *event,
5187 perf_output_put(&handle, mmap_event->min); 5191 perf_output_put(&handle, mmap_event->min);
5188 perf_output_put(&handle, mmap_event->ino); 5192 perf_output_put(&handle, mmap_event->ino);
5189 perf_output_put(&handle, mmap_event->ino_generation); 5193 perf_output_put(&handle, mmap_event->ino_generation);
5194 perf_output_put(&handle, mmap_event->prot);
5195 perf_output_put(&handle, mmap_event->flags);
5190 } 5196 }
5191 5197
5192 __output_copy(&handle, mmap_event->file_name, 5198 __output_copy(&handle, mmap_event->file_name,
@@ -5205,6 +5211,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5205 struct file *file = vma->vm_file; 5211 struct file *file = vma->vm_file;
5206 int maj = 0, min = 0; 5212 int maj = 0, min = 0;
5207 u64 ino = 0, gen = 0; 5213 u64 ino = 0, gen = 0;
5214 u32 prot = 0, flags = 0;
5208 unsigned int size; 5215 unsigned int size;
5209 char tmp[16]; 5216 char tmp[16];
5210 char *buf = NULL; 5217 char *buf = NULL;
@@ -5235,6 +5242,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5235 gen = inode->i_generation; 5242 gen = inode->i_generation;
5236 maj = MAJOR(dev); 5243 maj = MAJOR(dev);
5237 min = MINOR(dev); 5244 min = MINOR(dev);
5245
5246 if (vma->vm_flags & VM_READ)
5247 prot |= PROT_READ;
5248 if (vma->vm_flags & VM_WRITE)
5249 prot |= PROT_WRITE;
5250 if (vma->vm_flags & VM_EXEC)
5251 prot |= PROT_EXEC;
5252
5253 if (vma->vm_flags & VM_MAYSHARE)
5254 flags = MAP_SHARED;
5255 else
5256 flags = MAP_PRIVATE;
5257
5258 if (vma->vm_flags & VM_DENYWRITE)
5259 flags |= MAP_DENYWRITE;
5260 if (vma->vm_flags & VM_MAYEXEC)
5261 flags |= MAP_EXECUTABLE;
5262 if (vma->vm_flags & VM_LOCKED)
5263 flags |= MAP_LOCKED;
5264 if (vma->vm_flags & VM_HUGETLB)
5265 flags |= MAP_HUGETLB;
5266
5238 goto got_name; 5267 goto got_name;
5239 } else { 5268 } else {
5240 name = (char *)arch_vma_name(vma); 5269 name = (char *)arch_vma_name(vma);
@@ -5275,6 +5304,8 @@ got_name:
5275 mmap_event->min = min; 5304 mmap_event->min = min;
5276 mmap_event->ino = ino; 5305 mmap_event->ino = ino;
5277 mmap_event->ino_generation = gen; 5306 mmap_event->ino_generation = gen;
5307 mmap_event->prot = prot;
5308 mmap_event->flags = flags;
5278 5309
5279 if (!(vma->vm_flags & VM_EXEC)) 5310 if (!(vma->vm_flags & VM_EXEC))
5280 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 5311 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
@@ -5315,6 +5346,8 @@ void perf_event_mmap(struct vm_area_struct *vma)
5315 /* .min (attr_mmap2 only) */ 5346 /* .min (attr_mmap2 only) */
5316 /* .ino (attr_mmap2 only) */ 5347 /* .ino (attr_mmap2 only) */
5317 /* .ino_generation (attr_mmap2 only) */ 5348 /* .ino_generation (attr_mmap2 only) */
5349 /* .prot (attr_mmap2 only) */
5350 /* .flags (attr_mmap2 only) */
5318 }; 5351 };
5319 5352
5320 perf_event_mmap_event(&mmap_event); 5353 perf_event_mmap_event(&mmap_event);
@@ -6897,10 +6930,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6897 if (ret) 6930 if (ret)
6898 return -EFAULT; 6931 return -EFAULT;
6899 6932
6900 /* disabled for now */
6901 if (attr->mmap2)
6902 return -EINVAL;
6903
6904 if (attr->__reserved_1) 6933 if (attr->__reserved_1)
6905 return -EINVAL; 6934 return -EINVAL;
6906 6935
@@ -7429,7 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event,
7429 struct perf_event_context *child_ctx, 7458 struct perf_event_context *child_ctx,
7430 struct task_struct *child) 7459 struct task_struct *child)
7431{ 7460{
7432 perf_remove_from_context(child_event, true); 7461 /*
7462 * Do not destroy the 'original' grouping; because of the context
7463 * switch optimization the original events could've ended up in a
7464 * random child task.
7465 *
7466 * If we were to destroy the original group, all group related
7467 * operations would cease to function properly after this random
7468 * child dies.
7469 *
7470 * Do destroy all inherited groups, we don't care about those
7471 * and being thorough is better.
7472 */
7473 perf_remove_from_context(child_event, !!child_event->parent);
7433 7474
7434 /* 7475 /*
7435 * It can happen that the parent exits first, and has events 7476 * It can happen that the parent exits first, and has events
@@ -7445,7 +7486,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7445static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7486static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7446{ 7487{
7447 struct perf_event *child_event, *next; 7488 struct perf_event *child_event, *next;
7448 struct perf_event_context *child_ctx; 7489 struct perf_event_context *child_ctx, *parent_ctx;
7449 unsigned long flags; 7490 unsigned long flags;
7450 7491
7451 if (likely(!child->perf_event_ctxp[ctxn])) { 7492 if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7470,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7470 raw_spin_lock(&child_ctx->lock); 7511 raw_spin_lock(&child_ctx->lock);
7471 task_ctx_sched_out(child_ctx); 7512 task_ctx_sched_out(child_ctx);
7472 child->perf_event_ctxp[ctxn] = NULL; 7513 child->perf_event_ctxp[ctxn] = NULL;
7514
7515 /*
7516 * In order to avoid freeing: child_ctx->parent_ctx->task
7517 * under perf_event_context::lock, grab another reference.
7518 */
7519 parent_ctx = child_ctx->parent_ctx;
7520 if (parent_ctx)
7521 get_ctx(parent_ctx);
7522
7473 /* 7523 /*
7474 * If this context is a clone; unclone it so it can't get 7524 * If this context is a clone; unclone it so it can't get
7475 * swapped to another process while we're removing all 7525 * swapped to another process while we're removing all
@@ -7480,6 +7530,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7480 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 7530 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7481 7531
7482 /* 7532 /*
7533 * Now that we no longer hold perf_event_context::lock, drop
7534 * our extra child_ctx->parent_ctx reference.
7535 */
7536 if (parent_ctx)
7537 put_ctx(parent_ctx);
7538
7539 /*
7483 * Report the task dead after unscheduling the events so that we 7540 * Report the task dead after unscheduling the events so that we
7484 * won't get any samples after PERF_RECORD_EXIT. We can however still 7541 * won't get any samples after PERF_RECORD_EXIT. We can however still
7485 * get a few PERF_RECORD_READ events. 7542 * get a few PERF_RECORD_READ events.
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c445e392e93f..6f3254e8c137 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -846,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
846{ 846{
847 int err; 847 int err;
848 848
849 if (!consumer_del(uprobe, uc)) /* WARN? */ 849 if (WARN_ON(!consumer_del(uprobe, uc)))
850 return; 850 return;
851 851
852 err = register_for_each_vma(uprobe, NULL); 852 err = register_for_each_vma(uprobe, NULL);
@@ -927,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
927 int ret = -ENOENT; 927 int ret = -ENOENT;
928 928
929 uprobe = find_uprobe(inode, offset); 929 uprobe = find_uprobe(inode, offset);
930 if (!uprobe) 930 if (WARN_ON(!uprobe))
931 return ret; 931 return ret;
932 932
933 down_write(&uprobe->register_rwsem); 933 down_write(&uprobe->register_rwsem);
@@ -952,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
952 struct uprobe *uprobe; 952 struct uprobe *uprobe;
953 953
954 uprobe = find_uprobe(inode, offset); 954 uprobe = find_uprobe(inode, offset);
955 if (!uprobe) 955 if (WARN_ON(!uprobe))
956 return; 956 return;
957 957
958 down_write(&uprobe->register_rwsem); 958 down_write(&uprobe->register_rwsem);
diff --git a/kernel/fork.c b/kernel/fork.c
index d2799d1fc952..6a13c46cd87d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1487 1487
1488 total_forks++; 1488 total_forks++;
1489 spin_unlock(&current->sighand->siglock); 1489 spin_unlock(&current->sighand->siglock);
1490 syscall_tracepoint_update(p);
1490 write_unlock_irq(&tasklist_lock); 1491 write_unlock_irq(&tasklist_lock);
1492
1491 proc_fork_connector(p); 1493 proc_fork_connector(p);
1492 cgroup_post_fork(p); 1494 cgroup_post_fork(p);
1493 if (clone_flags & CLONE_THREAD) 1495 if (clone_flags & CLONE_THREAD)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 7339e42a85ab..1487a123db5c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -455,9 +455,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
455 */ 455 */
456void irq_free_hwirqs(unsigned int from, int cnt) 456void irq_free_hwirqs(unsigned int from, int cnt)
457{ 457{
458 int i; 458 int i, j;
459 459
460 for (i = from; cnt > 0; i++, cnt--) { 460 for (i = from, j = cnt; j > 0; i++, j--) {
461 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); 461 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
462 arch_teardown_hwirq(i); 462 arch_teardown_hwirq(i);
463 } 463 }
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6748688813d0..369f41a94124 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1617#ifdef CONFIG_MEMORY_FAILURE 1617#ifdef CONFIG_MEMORY_FAILURE
1618 VMCOREINFO_NUMBER(PG_hwpoison); 1618 VMCOREINFO_NUMBER(PG_hwpoison);
1619#endif 1619#endif
1620 VMCOREINFO_NUMBER(PG_head_mask);
1620 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); 1621 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1621 1622
1622 arch_crash_save_vmcoreinfo(); 1623 arch_crash_save_vmcoreinfo();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3214289df5a7..734e9a7d280b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2037,19 +2037,23 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
2037{ 2037{
2038 unsigned long *iter; 2038 unsigned long *iter;
2039 struct kprobe_blacklist_entry *ent; 2039 struct kprobe_blacklist_entry *ent;
2040 unsigned long offset = 0, size = 0; 2040 unsigned long entry, offset = 0, size = 0;
2041 2041
2042 for (iter = start; iter < end; iter++) { 2042 for (iter = start; iter < end; iter++) {
2043 if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { 2043 entry = arch_deref_entry_point((void *)*iter);
2044 pr_err("Failed to find blacklist %p\n", (void *)*iter); 2044
2045 if (!kernel_text_address(entry) ||
2046 !kallsyms_lookup_size_offset(entry, &size, &offset)) {
2047 pr_err("Failed to find blacklist at %p\n",
2048 (void *)entry);
2045 continue; 2049 continue;
2046 } 2050 }
2047 2051
2048 ent = kmalloc(sizeof(*ent), GFP_KERNEL); 2052 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
2049 if (!ent) 2053 if (!ent)
2050 return -ENOMEM; 2054 return -ENOMEM;
2051 ent->start_addr = *iter; 2055 ent->start_addr = entry;
2052 ent->end_addr = *iter + size; 2056 ent->end_addr = entry + size;
2053 INIT_LIST_HEAD(&ent->list); 2057 INIT_LIST_HEAD(&ent->list);
2054 list_add_tail(&ent->list, &kprobe_blacklist); 2058 list_add_tail(&ent->list, &kprobe_blacklist);
2055 } 2059 }
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..be9ee1559fca 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,21 +14,47 @@
14 * called from interrupt context and we have preemption disabled while 14 * called from interrupt context and we have preemption disabled while
15 * spinning. 15 * spinning.
16 */ 16 */
17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); 17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
18
19/*
20 * We use the value 0 to represent "no CPU", thus the encoded value
21 * will be the CPU number incremented by 1.
22 */
23static inline int encode_cpu(int cpu_nr)
24{
25 return cpu_nr + 1;
26}
27
28static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
29{
30 int cpu_nr = encoded_cpu_val - 1;
31
32 return per_cpu_ptr(&osq_node, cpu_nr);
33}
18 34
19/* 35/*
20 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. 36 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
21 * Can return NULL in case we were the last queued and we updated @lock instead. 37 * Can return NULL in case we were the last queued and we updated @lock instead.
22 */ 38 */
23static inline struct optimistic_spin_queue * 39static inline struct optimistic_spin_node *
24osq_wait_next(struct optimistic_spin_queue **lock, 40osq_wait_next(struct optimistic_spin_queue *lock,
25 struct optimistic_spin_queue *node, 41 struct optimistic_spin_node *node,
26 struct optimistic_spin_queue *prev) 42 struct optimistic_spin_node *prev)
27{ 43{
28 struct optimistic_spin_queue *next = NULL; 44 struct optimistic_spin_node *next = NULL;
45 int curr = encode_cpu(smp_processor_id());
46 int old;
47
48 /*
49 * If there is a prev node in queue, then the 'old' value will be
50 * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
51 * we're currently last in queue, then the queue will then become empty.
52 */
53 old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
29 54
30 for (;;) { 55 for (;;) {
31 if (*lock == node && cmpxchg(lock, node, prev) == node) { 56 if (atomic_read(&lock->tail) == curr &&
57 atomic_cmpxchg(&lock->tail, curr, old) == curr) {
32 /* 58 /*
33 * We were the last queued, we moved @lock back. @prev 59 * We were the last queued, we moved @lock back. @prev
34 * will now observe @lock and will complete its 60 * will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_queue **lock,
59 return next; 85 return next;
60} 86}
61 87
62bool osq_lock(struct optimistic_spin_queue **lock) 88bool osq_lock(struct optimistic_spin_queue *lock)
63{ 89{
64 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 90 struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
65 struct optimistic_spin_queue *prev, *next; 91 struct optimistic_spin_node *prev, *next;
92 int curr = encode_cpu(smp_processor_id());
93 int old;
66 94
67 node->locked = 0; 95 node->locked = 0;
68 node->next = NULL; 96 node->next = NULL;
97 node->cpu = curr;
69 98
70 node->prev = prev = xchg(lock, node); 99 old = atomic_xchg(&lock->tail, curr);
71 if (likely(prev == NULL)) 100 if (old == OSQ_UNLOCKED_VAL)
72 return true; 101 return true;
73 102
103 prev = decode_cpu(old);
104 node->prev = prev;
74 ACCESS_ONCE(prev->next) = node; 105 ACCESS_ONCE(prev->next) = node;
75 106
76 /* 107 /*
@@ -149,20 +180,21 @@ unqueue:
149 return false; 180 return false;
150} 181}
151 182
152void osq_unlock(struct optimistic_spin_queue **lock) 183void osq_unlock(struct optimistic_spin_queue *lock)
153{ 184{
154 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 185 struct optimistic_spin_node *node, *next;
155 struct optimistic_spin_queue *next; 186 int curr = encode_cpu(smp_processor_id());
156 187
157 /* 188 /*
158 * Fast path for the uncontended case. 189 * Fast path for the uncontended case.
159 */ 190 */
160 if (likely(cmpxchg(lock, node, NULL) == node)) 191 if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
161 return; 192 return;
162 193
163 /* 194 /*
164 * Second most likely case. 195 * Second most likely case.
165 */ 196 */
197 node = this_cpu_ptr(&osq_node);
166 next = xchg(&node->next, NULL); 198 next = xchg(&node->next, NULL);
167 if (next) { 199 if (next) {
168 ACCESS_ONCE(next->locked) = 1; 200 ACCESS_ONCE(next->locked) = 1;
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..74356dc0ce29 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
118 * mutex_lock()/rwsem_down_{read,write}() etc. 118 * mutex_lock()/rwsem_down_{read,write}() etc.
119 */ 119 */
120 120
121struct optimistic_spin_queue { 121struct optimistic_spin_node {
122 struct optimistic_spin_queue *next, *prev; 122 struct optimistic_spin_node *next, *prev;
123 int locked; /* 1 if lock acquired */ 123 int locked; /* 1 if lock acquired */
124 int cpu; /* encoded CPU # value */
124}; 125};
125 126
126extern bool osq_lock(struct optimistic_spin_queue **lock); 127extern bool osq_lock(struct optimistic_spin_queue *lock);
127extern void osq_unlock(struct optimistic_spin_queue **lock); 128extern void osq_unlock(struct optimistic_spin_queue *lock);
128 129
129#endif /* __LINUX_MCS_SPINLOCK_H */ 130#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..acca2c1a3c5e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
60 INIT_LIST_HEAD(&lock->wait_list); 60 INIT_LIST_HEAD(&lock->wait_list);
61 mutex_clear_owner(lock); 61 mutex_clear_owner(lock);
62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
63 lock->osq = NULL; 63 osq_lock_init(&lock->osq);
64#endif 64#endif
65 65
66 debug_mutex_init(lock, name, key); 66 debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d78..ab29b6a22669 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
31{ 31{
32 return (waiter != NULL); 32 return (waiter != NULL);
33} 33}
34
35static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
36{
37 debug_rt_mutex_print_deadlock(w);
38}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a620d4d08ca6..fc605941b9b8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
83 owner = *p; 83 owner = *p;
84 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); 84 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
85} 85}
86
87/*
88 * Safe fastpath aware unlock:
89 * 1) Clear the waiters bit
90 * 2) Drop lock->wait_lock
91 * 3) Try to unlock the lock with cmpxchg
92 */
93static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
94 __releases(lock->wait_lock)
95{
96 struct task_struct *owner = rt_mutex_owner(lock);
97
98 clear_rt_mutex_waiters(lock);
99 raw_spin_unlock(&lock->wait_lock);
100 /*
101 * If a new waiter comes in between the unlock and the cmpxchg
102 * we have two situations:
103 *
104 * unlock(wait_lock);
105 * lock(wait_lock);
106 * cmpxchg(p, owner, 0) == owner
107 * mark_rt_mutex_waiters(lock);
108 * acquire(lock);
109 * or:
110 *
111 * unlock(wait_lock);
112 * lock(wait_lock);
113 * mark_rt_mutex_waiters(lock);
114 *
115 * cmpxchg(p, owner, 0) != owner
116 * enqueue_waiter();
117 * unlock(wait_lock);
118 * lock(wait_lock);
119 * wake waiter();
120 * unlock(wait_lock);
121 * lock(wait_lock);
122 * acquire(lock);
123 */
124 return rt_mutex_cmpxchg(lock, owner, NULL);
125}
126
86#else 127#else
87# define rt_mutex_cmpxchg(l,c,n) (0) 128# define rt_mutex_cmpxchg(l,c,n) (0)
88static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) 129static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
@@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
90 lock->owner = (struct task_struct *) 131 lock->owner = (struct task_struct *)
91 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); 132 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
92} 133}
134
135/*
136 * Simple slow path only version: lock->owner is protected by lock->wait_lock.
137 */
138static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
139 __releases(lock->wait_lock)
140{
141 lock->owner = NULL;
142 raw_spin_unlock(&lock->wait_lock);
143 return true;
144}
93#endif 145#endif
94 146
95static inline int 147static inline int
@@ -260,27 +312,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
260 */ 312 */
261int max_lock_depth = 1024; 313int max_lock_depth = 1024;
262 314
315static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
316{
317 return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
318}
319
263/* 320/*
264 * Adjust the priority chain. Also used for deadlock detection. 321 * Adjust the priority chain. Also used for deadlock detection.
265 * Decreases task's usage by one - may thus free the task. 322 * Decreases task's usage by one - may thus free the task.
266 * 323 *
267 * @task: the task owning the mutex (owner) for which a chain walk is probably 324 * @task: the task owning the mutex (owner) for which a chain walk is
268 * needed 325 * probably needed
269 * @deadlock_detect: do we have to carry out deadlock detection? 326 * @deadlock_detect: do we have to carry out deadlock detection?
270 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck 327 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
271 * things for a task that has just got its priority adjusted, and 328 * things for a task that has just got its priority adjusted, and
272 * is waiting on a mutex) 329 * is waiting on a mutex)
330 * @next_lock: the mutex on which the owner of @orig_lock was blocked before
331 * we dropped its pi_lock. Is never dereferenced, only used for
332 * comparison to detect lock chain changes.
273 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated 333 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
274 * its priority to the mutex owner (can be NULL in the case 334 * its priority to the mutex owner (can be NULL in the case
275 * depicted above or if the top waiter is gone away and we are 335 * depicted above or if the top waiter is gone away and we are
276 * actually deboosting the owner) 336 * actually deboosting the owner)
277 * @top_task: the current top waiter 337 * @top_task: the current top waiter
278 * 338 *
279 * Returns 0 or -EDEADLK. 339 * Returns 0 or -EDEADLK.
280 */ 340 */
281static int rt_mutex_adjust_prio_chain(struct task_struct *task, 341static int rt_mutex_adjust_prio_chain(struct task_struct *task,
282 int deadlock_detect, 342 int deadlock_detect,
283 struct rt_mutex *orig_lock, 343 struct rt_mutex *orig_lock,
344 struct rt_mutex *next_lock,
284 struct rt_mutex_waiter *orig_waiter, 345 struct rt_mutex_waiter *orig_waiter,
285 struct task_struct *top_task) 346 struct task_struct *top_task)
286{ 347{
@@ -314,7 +375,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
314 } 375 }
315 put_task_struct(task); 376 put_task_struct(task);
316 377
317 return deadlock_detect ? -EDEADLK : 0; 378 return -EDEADLK;
318 } 379 }
319 retry: 380 retry:
320 /* 381 /*
@@ -339,6 +400,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
339 goto out_unlock_pi; 400 goto out_unlock_pi;
340 401
341 /* 402 /*
403 * We dropped all locks after taking a refcount on @task, so
404 * the task might have moved on in the lock chain or even left
405 * the chain completely and blocks now on an unrelated lock or
406 * on @orig_lock.
407 *
408 * We stored the lock on which @task was blocked in @next_lock,
409 * so we can detect the chain change.
410 */
411 if (next_lock != waiter->lock)
412 goto out_unlock_pi;
413
414 /*
342 * Drop out, when the task has no waiters. Note, 415 * Drop out, when the task has no waiters. Note,
343 * top_waiter can be NULL, when we are in the deboosting 416 * top_waiter can be NULL, when we are in the deboosting
344 * mode! 417 * mode!
@@ -377,7 +450,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
377 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 450 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
378 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 451 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
379 raw_spin_unlock(&lock->wait_lock); 452 raw_spin_unlock(&lock->wait_lock);
380 ret = deadlock_detect ? -EDEADLK : 0; 453 ret = -EDEADLK;
381 goto out_unlock_pi; 454 goto out_unlock_pi;
382 } 455 }
383 456
@@ -422,11 +495,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
422 __rt_mutex_adjust_prio(task); 495 __rt_mutex_adjust_prio(task);
423 } 496 }
424 497
498 /*
499 * Check whether the task which owns the current lock is pi
500 * blocked itself. If yes we store a pointer to the lock for
501 * the lock chain change detection above. After we dropped
502 * task->pi_lock next_lock cannot be dereferenced anymore.
503 */
504 next_lock = task_blocked_on_lock(task);
505
425 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 506 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
426 507
427 top_waiter = rt_mutex_top_waiter(lock); 508 top_waiter = rt_mutex_top_waiter(lock);
428 raw_spin_unlock(&lock->wait_lock); 509 raw_spin_unlock(&lock->wait_lock);
429 510
511 /*
512 * We reached the end of the lock chain. Stop right here. No
513 * point to go back just to figure that out.
514 */
515 if (!next_lock)
516 goto out_put_task;
517
430 if (!detect_deadlock && waiter != top_waiter) 518 if (!detect_deadlock && waiter != top_waiter)
431 goto out_put_task; 519 goto out_put_task;
432 520
@@ -536,8 +624,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
536{ 624{
537 struct task_struct *owner = rt_mutex_owner(lock); 625 struct task_struct *owner = rt_mutex_owner(lock);
538 struct rt_mutex_waiter *top_waiter = waiter; 626 struct rt_mutex_waiter *top_waiter = waiter;
539 unsigned long flags; 627 struct rt_mutex *next_lock;
540 int chain_walk = 0, res; 628 int chain_walk = 0, res;
629 unsigned long flags;
541 630
542 /* 631 /*
543 * Early deadlock detection. We really don't want the task to 632 * Early deadlock detection. We really don't want the task to
@@ -548,7 +637,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
548 * which is wrong, as the other waiter is not in a deadlock 637 * which is wrong, as the other waiter is not in a deadlock
549 * situation. 638 * situation.
550 */ 639 */
551 if (detect_deadlock && owner == task) 640 if (owner == task)
552 return -EDEADLK; 641 return -EDEADLK;
553 642
554 raw_spin_lock_irqsave(&task->pi_lock, flags); 643 raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -569,20 +658,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
569 if (!owner) 658 if (!owner)
570 return 0; 659 return 0;
571 660
661 raw_spin_lock_irqsave(&owner->pi_lock, flags);
572 if (waiter == rt_mutex_top_waiter(lock)) { 662 if (waiter == rt_mutex_top_waiter(lock)) {
573 raw_spin_lock_irqsave(&owner->pi_lock, flags);
574 rt_mutex_dequeue_pi(owner, top_waiter); 663 rt_mutex_dequeue_pi(owner, top_waiter);
575 rt_mutex_enqueue_pi(owner, waiter); 664 rt_mutex_enqueue_pi(owner, waiter);
576 665
577 __rt_mutex_adjust_prio(owner); 666 __rt_mutex_adjust_prio(owner);
578 if (owner->pi_blocked_on) 667 if (owner->pi_blocked_on)
579 chain_walk = 1; 668 chain_walk = 1;
580 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 669 } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
581 }
582 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
583 chain_walk = 1; 670 chain_walk = 1;
671 }
584 672
585 if (!chain_walk) 673 /* Store the lock on which owner is blocked or NULL */
674 next_lock = task_blocked_on_lock(owner);
675
676 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
677 /*
678 * Even if full deadlock detection is on, if the owner is not
679 * blocked itself, we can avoid finding this out in the chain
680 * walk.
681 */
682 if (!chain_walk || !next_lock)
586 return 0; 683 return 0;
587 684
588 /* 685 /*
@@ -594,8 +691,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
594 691
595 raw_spin_unlock(&lock->wait_lock); 692 raw_spin_unlock(&lock->wait_lock);
596 693
597 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 694 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
598 task); 695 next_lock, waiter, task);
599 696
600 raw_spin_lock(&lock->wait_lock); 697 raw_spin_lock(&lock->wait_lock);
601 698
@@ -605,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
605/* 702/*
606 * Wake up the next waiter on the lock. 703 * Wake up the next waiter on the lock.
607 * 704 *
608 * Remove the top waiter from the current tasks waiter list and wake it up. 705 * Remove the top waiter from the current tasks pi waiter list and
706 * wake it up.
609 * 707 *
610 * Called with lock->wait_lock held. 708 * Called with lock->wait_lock held.
611 */ 709 */
@@ -626,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
626 */ 724 */
627 rt_mutex_dequeue_pi(current, waiter); 725 rt_mutex_dequeue_pi(current, waiter);
628 726
629 rt_mutex_set_owner(lock, NULL); 727 /*
728 * As we are waking up the top waiter, and the waiter stays
729 * queued on the lock until it gets the lock, this lock
730 * obviously has waiters. Just set the bit here and this has
731 * the added benefit of forcing all new tasks into the
732 * slow path making sure no task of lower priority than
733 * the top waiter can steal this lock.
734 */
735 lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
630 736
631 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 737 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
632 738
739 /*
740 * It's safe to dereference waiter as it cannot go away as
741 * long as we hold lock->wait_lock. The waiter task needs to
742 * acquire it in order to dequeue the waiter.
743 */
633 wake_up_process(waiter->task); 744 wake_up_process(waiter->task);
634} 745}
635 746
@@ -644,8 +755,8 @@ static void remove_waiter(struct rt_mutex *lock,
644{ 755{
645 int first = (waiter == rt_mutex_top_waiter(lock)); 756 int first = (waiter == rt_mutex_top_waiter(lock));
646 struct task_struct *owner = rt_mutex_owner(lock); 757 struct task_struct *owner = rt_mutex_owner(lock);
758 struct rt_mutex *next_lock = NULL;
647 unsigned long flags; 759 unsigned long flags;
648 int chain_walk = 0;
649 760
650 raw_spin_lock_irqsave(&current->pi_lock, flags); 761 raw_spin_lock_irqsave(&current->pi_lock, flags);
651 rt_mutex_dequeue(lock, waiter); 762 rt_mutex_dequeue(lock, waiter);
@@ -669,13 +780,13 @@ static void remove_waiter(struct rt_mutex *lock,
669 } 780 }
670 __rt_mutex_adjust_prio(owner); 781 __rt_mutex_adjust_prio(owner);
671 782
672 if (owner->pi_blocked_on) 783 /* Store the lock on which owner is blocked or NULL */
673 chain_walk = 1; 784 next_lock = task_blocked_on_lock(owner);
674 785
675 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 786 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
676 } 787 }
677 788
678 if (!chain_walk) 789 if (!next_lock)
679 return; 790 return;
680 791
681 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 792 /* gets dropped in rt_mutex_adjust_prio_chain()! */
@@ -683,7 +794,7 @@ static void remove_waiter(struct rt_mutex *lock,
683 794
684 raw_spin_unlock(&lock->wait_lock); 795 raw_spin_unlock(&lock->wait_lock);
685 796
686 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 797 rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
687 798
688 raw_spin_lock(&lock->wait_lock); 799 raw_spin_lock(&lock->wait_lock);
689} 800}
@@ -696,6 +807,7 @@ static void remove_waiter(struct rt_mutex *lock,
696void rt_mutex_adjust_pi(struct task_struct *task) 807void rt_mutex_adjust_pi(struct task_struct *task)
697{ 808{
698 struct rt_mutex_waiter *waiter; 809 struct rt_mutex_waiter *waiter;
810 struct rt_mutex *next_lock;
699 unsigned long flags; 811 unsigned long flags;
700 812
701 raw_spin_lock_irqsave(&task->pi_lock, flags); 813 raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -706,12 +818,13 @@ void rt_mutex_adjust_pi(struct task_struct *task)
706 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 818 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
707 return; 819 return;
708 } 820 }
709 821 next_lock = waiter->lock;
710 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 822 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
711 823
712 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 824 /* gets dropped in rt_mutex_adjust_prio_chain()! */
713 get_task_struct(task); 825 get_task_struct(task);
714 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 826
827 rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
715} 828}
716 829
717/** 830/**
@@ -763,6 +876,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
763 return ret; 876 return ret;
764} 877}
765 878
879static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
880 struct rt_mutex_waiter *w)
881{
882 /*
883 * If the result is not -EDEADLOCK or the caller requested
884 * deadlock detection, nothing to do here.
885 */
886 if (res != -EDEADLOCK || detect_deadlock)
887 return;
888
889 /*
890 * Yell lowdly and stop the task right here.
891 */
892 rt_mutex_print_deadlock(w);
893 while (1) {
894 set_current_state(TASK_INTERRUPTIBLE);
895 schedule();
896 }
897}
898
766/* 899/*
767 * Slow path lock function: 900 * Slow path lock function:
768 */ 901 */
@@ -802,8 +935,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
802 935
803 set_current_state(TASK_RUNNING); 936 set_current_state(TASK_RUNNING);
804 937
805 if (unlikely(ret)) 938 if (unlikely(ret)) {
806 remove_waiter(lock, &waiter); 939 remove_waiter(lock, &waiter);
940 rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
941 }
807 942
808 /* 943 /*
809 * try_to_take_rt_mutex() sets the waiter bit 944 * try_to_take_rt_mutex() sets the waiter bit
@@ -859,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
859 994
860 rt_mutex_deadlock_account_unlock(current); 995 rt_mutex_deadlock_account_unlock(current);
861 996
862 if (!rt_mutex_has_waiters(lock)) { 997 /*
863 lock->owner = NULL; 998 * We must be careful here if the fast path is enabled. If we
864 raw_spin_unlock(&lock->wait_lock); 999 * have no waiters queued we cannot set owner to NULL here
865 return; 1000 * because of:
1001 *
1002 * foo->lock->owner = NULL;
1003 * rtmutex_lock(foo->lock); <- fast path
1004 * free = atomic_dec_and_test(foo->refcnt);
1005 * rtmutex_unlock(foo->lock); <- fast path
1006 * if (free)
1007 * kfree(foo);
1008 * raw_spin_unlock(foo->lock->wait_lock);
1009 *
1010 * So for the fastpath enabled kernel:
1011 *
1012 * Nothing can set the waiters bit as long as we hold
1013 * lock->wait_lock. So we do the following sequence:
1014 *
1015 * owner = rt_mutex_owner(lock);
1016 * clear_rt_mutex_waiters(lock);
1017 * raw_spin_unlock(&lock->wait_lock);
1018 * if (cmpxchg(&lock->owner, owner, 0) == owner)
1019 * return;
1020 * goto retry;
1021 *
1022 * The fastpath disabled variant is simple as all access to
1023 * lock->owner is serialized by lock->wait_lock:
1024 *
1025 * lock->owner = NULL;
1026 * raw_spin_unlock(&lock->wait_lock);
1027 */
1028 while (!rt_mutex_has_waiters(lock)) {
1029 /* Drops lock->wait_lock ! */
1030 if (unlock_rt_mutex_safe(lock) == true)
1031 return;
1032 /* Relock the rtmutex and try again */
1033 raw_spin_lock(&lock->wait_lock);
866 } 1034 }
867 1035
1036 /*
1037 * The wakeup next waiter path does not suffer from the above
1038 * race. See the comments there.
1039 */
868 wakeup_next_waiter(lock); 1040 wakeup_next_waiter(lock);
869 1041
870 raw_spin_unlock(&lock->wait_lock); 1042 raw_spin_unlock(&lock->wait_lock);
@@ -1112,7 +1284,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1112 return 1; 1284 return 1;
1113 } 1285 }
1114 1286
1115 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 1287 /* We enforce deadlock detection for futexes */
1288 ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
1116 1289
1117 if (ret && !rt_mutex_owner(lock)) { 1290 if (ret && !rt_mutex_owner(lock)) {
1118 /* 1291 /*
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421d..f6a1f3c133b1 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -24,3 +24,8 @@
24#define debug_rt_mutex_print_deadlock(w) do { } while (0) 24#define debug_rt_mutex_print_deadlock(w) do { } while (0)
25#define debug_rt_mutex_detect_deadlock(w,d) (d) 25#define debug_rt_mutex_detect_deadlock(w,d) (d)
26#define debug_rt_mutex_reset_waiter(w) do { } while (0) 26#define debug_rt_mutex_reset_waiter(w) do { } while (0)
27
28static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
29{
30 WARN(1, "rtmutex deadlock detected\n");
31}
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
26 unsigned long flags; 26 unsigned long flags;
27 27
28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { 28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
29 ret = (sem->activity != 0); 29 ret = (sem->count != 0);
30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
31 } 31 }
32 return ret; 32 return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
46 debug_check_no_locks_freed((void *)sem, sizeof(*sem)); 46 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
47 lockdep_init_map(&sem->dep_map, name, key, 0); 47 lockdep_init_map(&sem->dep_map, name, key, 0);
48#endif 48#endif
49 sem->activity = 0; 49 sem->count = 0;
50 raw_spin_lock_init(&sem->wait_lock); 50 raw_spin_lock_init(&sem->wait_lock);
51 INIT_LIST_HEAD(&sem->wait_list); 51 INIT_LIST_HEAD(&sem->wait_list);
52} 52}
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
95 waiter = list_entry(next, struct rwsem_waiter, list); 95 waiter = list_entry(next, struct rwsem_waiter, list);
96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE); 96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
97 97
98 sem->activity += woken; 98 sem->count += woken;
99 99
100 out: 100 out:
101 return sem; 101 return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
126 126
127 raw_spin_lock_irqsave(&sem->wait_lock, flags); 127 raw_spin_lock_irqsave(&sem->wait_lock, flags);
128 128
129 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 129 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
130 /* granted */ 130 /* granted */
131 sem->activity++; 131 sem->count++;
132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
133 goto out; 133 goto out;
134 } 134 }
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
170 170
171 raw_spin_lock_irqsave(&sem->wait_lock, flags); 171 raw_spin_lock_irqsave(&sem->wait_lock, flags);
172 172
173 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 173 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
174 /* granted */ 174 /* granted */
175 sem->activity++; 175 sem->count++;
176 ret = 1; 176 ret = 1;
177 } 177 }
178 178
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
206 * itself into sleep and waiting for system woke it or someone 206 * itself into sleep and waiting for system woke it or someone
207 * else in the head of the wait list up. 207 * else in the head of the wait list up.
208 */ 208 */
209 if (sem->activity == 0) 209 if (sem->count == 0)
210 break; 210 break;
211 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 211 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
214 raw_spin_lock_irqsave(&sem->wait_lock, flags); 214 raw_spin_lock_irqsave(&sem->wait_lock, flags);
215 } 215 }
216 /* got the lock */ 216 /* got the lock */
217 sem->activity = -1; 217 sem->count = -1;
218 list_del(&waiter.list); 218 list_del(&waiter.list);
219 219
220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
235 235
236 raw_spin_lock_irqsave(&sem->wait_lock, flags); 236 raw_spin_lock_irqsave(&sem->wait_lock, flags);
237 237
238 if (sem->activity == 0) { 238 if (sem->count == 0) {
239 /* got the lock */ 239 /* got the lock */
240 sem->activity = -1; 240 sem->count = -1;
241 ret = 1; 241 ret = 1;
242 } 242 }
243 243
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
255 255
256 raw_spin_lock_irqsave(&sem->wait_lock, flags); 256 raw_spin_lock_irqsave(&sem->wait_lock, flags);
257 257
258 if (--sem->activity == 0 && !list_empty(&sem->wait_list)) 258 if (--sem->count == 0 && !list_empty(&sem->wait_list))
259 sem = __rwsem_wake_one_writer(sem); 259 sem = __rwsem_wake_one_writer(sem);
260 260
261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
270 270
271 raw_spin_lock_irqsave(&sem->wait_lock, flags); 271 raw_spin_lock_irqsave(&sem->wait_lock, flags);
272 272
273 sem->activity = 0; 273 sem->count = 0;
274 if (!list_empty(&sem->wait_list)) 274 if (!list_empty(&sem->wait_list))
275 sem = __rwsem_do_wake(sem, 1); 275 sem = __rwsem_do_wake(sem, 1);
276 276
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
287 287
288 raw_spin_lock_irqsave(&sem->wait_lock, flags); 288 raw_spin_lock_irqsave(&sem->wait_lock, flags);
289 289
290 sem->activity = 1; 290 sem->count = 1;
291 if (!list_empty(&sem->wait_list)) 291 if (!list_empty(&sem->wait_list))
292 sem = __rwsem_do_wake(sem, 0); 292 sem = __rwsem_do_wake(sem, 0);
293 293
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc32142fcc..a2391ac135c8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -82,9 +82,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
82 sem->count = RWSEM_UNLOCKED_VALUE; 82 sem->count = RWSEM_UNLOCKED_VALUE;
83 raw_spin_lock_init(&sem->wait_lock); 83 raw_spin_lock_init(&sem->wait_lock);
84 INIT_LIST_HEAD(&sem->wait_list); 84 INIT_LIST_HEAD(&sem->wait_list);
85#ifdef CONFIG_SMP 85#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
86 sem->owner = NULL; 86 sem->owner = NULL;
87 sem->osq = NULL; 87 osq_lock_init(&sem->osq);
88#endif 88#endif
89} 89}
90 90
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
262 return false; 262 return false;
263} 263}
264 264
265#ifdef CONFIG_SMP 265#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
266/* 266/*
267 * Try to acquire write lock before the writer has been put on wait queue. 267 * Try to acquire write lock before the writer has been put on wait queue.
268 */ 268 */
@@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
285static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) 285static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
286{ 286{
287 struct task_struct *owner; 287 struct task_struct *owner;
288 bool on_cpu = true; 288 bool on_cpu = false;
289 289
290 if (need_resched()) 290 if (need_resched())
291 return 0; 291 return false;
292 292
293 rcu_read_lock(); 293 rcu_read_lock();
294 owner = ACCESS_ONCE(sem->owner); 294 owner = ACCESS_ONCE(sem->owner);
@@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
297 rcu_read_unlock(); 297 rcu_read_unlock();
298 298
299 /* 299 /*
300 * If sem->owner is not set, the rwsem owner may have 300 * If sem->owner is not set, yet we have just recently entered the
301 * just acquired it and not set the owner yet or the rwsem 301 * slowpath, then there is a possibility reader(s) may have the lock.
302 * has been released. 302 * To be safe, avoid spinning in these situations.
303 */ 303 */
304 return on_cpu; 304 return on_cpu;
305} 305}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 42f806de49d4..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,7 +12,7 @@
12 12
13#include <linux/atomic.h> 13#include <linux/atomic.h>
14 14
15#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) 15#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
16static inline void rwsem_set_owner(struct rw_semaphore *sem) 16static inline void rwsem_set_owner(struct rw_semaphore *sem)
17{ 17{
18 sem->owner = current; 18 sem->owner = current;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9a83d780facd..e4e4121fa327 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -253,9 +253,6 @@ config APM_EMULATION
253 anything, try disabling/enabling this option (or disabling/enabling 253 anything, try disabling/enabling this option (or disabling/enabling
254 APM in your BIOS). 254 APM in your BIOS).
255 255
256config ARCH_HAS_OPP
257 bool
258
259config PM_OPP 256config PM_OPP
260 bool 257 bool
261 ---help--- 258 ---help---
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 49e0a20fd010..fcc2611d3f14 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -35,6 +35,7 @@
35 35
36static int nocompress; 36static int nocompress;
37static int noresume; 37static int noresume;
38static int nohibernate;
38static int resume_wait; 39static int resume_wait;
39static unsigned int resume_delay; 40static unsigned int resume_delay;
40static char resume_file[256] = CONFIG_PM_STD_PARTITION; 41static char resume_file[256] = CONFIG_PM_STD_PARTITION;
@@ -62,6 +63,11 @@ bool freezer_test_done;
62 63
63static const struct platform_hibernation_ops *hibernation_ops; 64static const struct platform_hibernation_ops *hibernation_ops;
64 65
66bool hibernation_available(void)
67{
68 return (nohibernate == 0);
69}
70
65/** 71/**
66 * hibernation_set_ops - Set the global hibernate operations. 72 * hibernation_set_ops - Set the global hibernate operations.
67 * @ops: Hibernation operations to use in subsequent hibernation transitions. 73 * @ops: Hibernation operations to use in subsequent hibernation transitions.
@@ -642,6 +648,11 @@ int hibernate(void)
642{ 648{
643 int error; 649 int error;
644 650
651 if (!hibernation_available()) {
652 pr_debug("PM: Hibernation not available.\n");
653 return -EPERM;
654 }
655
645 lock_system_sleep(); 656 lock_system_sleep();
646 /* The snapshot device should not be opened while we're running */ 657 /* The snapshot device should not be opened while we're running */
647 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 658 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -734,7 +745,7 @@ static int software_resume(void)
734 /* 745 /*
735 * If the user said "noresume".. bail out early. 746 * If the user said "noresume".. bail out early.
736 */ 747 */
737 if (noresume) 748 if (noresume || !hibernation_available())
738 return 0; 749 return 0;
739 750
740 /* 751 /*
@@ -900,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
900 int i; 911 int i;
901 char *start = buf; 912 char *start = buf;
902 913
914 if (!hibernation_available())
915 return sprintf(buf, "[disabled]\n");
916
903 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 917 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
904 if (!hibernation_modes[i]) 918 if (!hibernation_modes[i])
905 continue; 919 continue;
@@ -934,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
934 char *p; 948 char *p;
935 int mode = HIBERNATION_INVALID; 949 int mode = HIBERNATION_INVALID;
936 950
951 if (!hibernation_available())
952 return -EPERM;
953
937 p = memchr(buf, '\n', n); 954 p = memchr(buf, '\n', n);
938 len = p ? p - buf : n; 955 len = p ? p - buf : n;
939 956
@@ -1101,6 +1118,10 @@ static int __init hibernate_setup(char *str)
1101 noresume = 1; 1118 noresume = 1;
1102 else if (!strncmp(str, "nocompress", 10)) 1119 else if (!strncmp(str, "nocompress", 10))
1103 nocompress = 1; 1120 nocompress = 1;
1121 else if (!strncmp(str, "no", 2)) {
1122 noresume = 1;
1123 nohibernate = 1;
1124 }
1104 return 1; 1125 return 1;
1105} 1126}
1106 1127
@@ -1125,9 +1146,23 @@ static int __init resumedelay_setup(char *str)
1125 return 1; 1146 return 1;
1126} 1147}
1127 1148
1149static int __init nohibernate_setup(char *str)
1150{
1151 noresume = 1;
1152 nohibernate = 1;
1153 return 1;
1154}
1155
1156static int __init kaslr_nohibernate_setup(char *str)
1157{
1158 return nohibernate_setup(str);
1159}
1160
1128__setup("noresume", noresume_setup); 1161__setup("noresume", noresume_setup);
1129__setup("resume_offset=", resume_offset_setup); 1162__setup("resume_offset=", resume_offset_setup);
1130__setup("resume=", resume_setup); 1163__setup("resume=", resume_setup);
1131__setup("hibernate=", hibernate_setup); 1164__setup("hibernate=", hibernate_setup);
1132__setup("resumewait", resumewait_setup); 1165__setup("resumewait", resumewait_setup);
1133__setup("resumedelay=", resumedelay_setup); 1166__setup("resumedelay=", resumedelay_setup);
1167__setup("nohibernate", nohibernate_setup);
1168__setup("kaslr", kaslr_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 573410d6647e..9a59d042ea84 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -296,25 +296,22 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
296 suspend_state_t i; 296 suspend_state_t i;
297 297
298 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) 298 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
299 if (pm_states[i].state) 299 if (pm_states[i])
300 s += sprintf(s,"%s ", pm_states[i].label); 300 s += sprintf(s,"%s ", pm_states[i]);
301 301
302#endif 302#endif
303#ifdef CONFIG_HIBERNATION 303 if (hibernation_available())
304 s += sprintf(s, "%s\n", "disk"); 304 s += sprintf(s, "disk ");
305#else
306 if (s != buf) 305 if (s != buf)
307 /* convert the last space to a newline */ 306 /* convert the last space to a newline */
308 *(s-1) = '\n'; 307 *(s-1) = '\n';
309#endif
310 return (s - buf); 308 return (s - buf);
311} 309}
312 310
313static suspend_state_t decode_state(const char *buf, size_t n) 311static suspend_state_t decode_state(const char *buf, size_t n)
314{ 312{
315#ifdef CONFIG_SUSPEND 313#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_MIN; 314 suspend_state_t state;
317 struct pm_sleep_state *s;
318#endif 315#endif
319 char *p; 316 char *p;
320 int len; 317 int len;
@@ -327,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n)
327 return PM_SUSPEND_MAX; 324 return PM_SUSPEND_MAX;
328 325
329#ifdef CONFIG_SUSPEND 326#ifdef CONFIG_SUSPEND
330 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) 327 for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
331 if (s->state && len == strlen(s->label) 328 const char *label = pm_states[state];
332 && !strncmp(buf, s->label, len)) 329
333 return s->state; 330 if (label && len == strlen(label) && !strncmp(buf, label, len))
331 return state;
332 }
334#endif 333#endif
335 334
336 return PM_SUSPEND_ON; 335 return PM_SUSPEND_ON;
@@ -448,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
448 447
449#ifdef CONFIG_SUSPEND 448#ifdef CONFIG_SUSPEND
450 if (state < PM_SUSPEND_MAX) 449 if (state < PM_SUSPEND_MAX)
451 return sprintf(buf, "%s\n", pm_states[state].state ? 450 return sprintf(buf, "%s\n", pm_states[state] ?
452 pm_states[state].label : "error"); 451 pm_states[state] : "error");
453#endif 452#endif
454#ifdef CONFIG_HIBERNATION 453#ifdef CONFIG_HIBERNATION
455 return sprintf(buf, "disk\n"); 454 return sprintf(buf, "disk\n");
@@ -617,7 +616,6 @@ static struct attribute_group attr_group = {
617 .attrs = g, 616 .attrs = g,
618}; 617};
619 618
620#ifdef CONFIG_PM_RUNTIME
621struct workqueue_struct *pm_wq; 619struct workqueue_struct *pm_wq;
622EXPORT_SYMBOL_GPL(pm_wq); 620EXPORT_SYMBOL_GPL(pm_wq);
623 621
@@ -627,9 +625,6 @@ static int __init pm_start_workqueue(void)
627 625
628 return pm_wq ? 0 : -ENOMEM; 626 return pm_wq ? 0 : -ENOMEM;
629} 627}
630#else
631static inline int pm_start_workqueue(void) { return 0; }
632#endif
633 628
634static int __init pm_init(void) 629static int __init pm_init(void)
635{ 630{
diff --git a/kernel/power/power.h b/kernel/power/power.h
index c60f13b5270a..5d49dcac2537 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
178 unsigned int, char *); 178 unsigned int, char *);
179 179
180#ifdef CONFIG_SUSPEND 180#ifdef CONFIG_SUSPEND
181struct pm_sleep_state {
182 const char *label;
183 suspend_state_t state;
184};
185
186/* kernel/power/suspend.c */ 181/* kernel/power/suspend.c */
187extern struct pm_sleep_state pm_states[]; 182extern const char *pm_states[];
188 183
189extern int suspend_devices_and_enter(suspend_state_t state); 184extern int suspend_devices_and_enter(suspend_state_t state);
190#else /* !CONFIG_SUSPEND */ 185#else /* !CONFIG_SUSPEND */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0ca8d83e2369..4ee194eb524b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -186,6 +186,7 @@ void thaw_processes(void)
186 186
187 printk("Restarting tasks ... "); 187 printk("Restarting tasks ... ");
188 188
189 __usermodehelper_set_disable_depth(UMH_FREEZING);
189 thaw_workqueues(); 190 thaw_workqueues();
190 191
191 read_lock(&tasklist_lock); 192 read_lock(&tasklist_lock);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1ea328aafdc9..4fc5c32422b3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -248,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
248 * information is stored (in the form of a block of bitmap) 248 * information is stored (in the form of a block of bitmap)
249 * It also contains the pfns that correspond to the start and end of 249 * It also contains the pfns that correspond to the start and end of
250 * the represented memory area. 250 * the represented memory area.
251 *
252 * The memory bitmap is organized as a radix tree to guarantee fast random
253 * access to the bits. There is one radix tree for each zone (as returned
254 * from create_mem_extents).
255 *
256 * One radix tree is represented by one struct mem_zone_bm_rtree. There are
257 * two linked lists for the nodes of the tree, one for the inner nodes and
258 * one for the leave nodes. The linked leave nodes are used for fast linear
259 * access of the memory bitmap.
260 *
261 * The struct rtree_node represents one node of the radix tree.
251 */ 262 */
252 263
253#define BM_END_OF_MAP (~0UL) 264#define BM_END_OF_MAP (~0UL)
254 265
255#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) 266#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
267#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3)
268#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1)
256 269
257struct bm_block { 270/*
258 struct list_head hook; /* hook into a list of bitmap blocks */ 271 * struct rtree_node is a wrapper struct to link the nodes
259 unsigned long start_pfn; /* pfn represented by the first bit */ 272 * of the rtree together for easy linear iteration over
260 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ 273 * bits and easy freeing
261 unsigned long *data; /* bitmap representing pages */ 274 */
275struct rtree_node {
276 struct list_head list;
277 unsigned long *data;
262}; 278};
263 279
264static inline unsigned long bm_block_bits(struct bm_block *bb) 280/*
265{ 281 * struct mem_zone_bm_rtree represents a bitmap used for one
266 return bb->end_pfn - bb->start_pfn; 282 * populated memory zone.
267} 283 */
284struct mem_zone_bm_rtree {
285 struct list_head list; /* Link Zones together */
286 struct list_head nodes; /* Radix Tree inner nodes */
287 struct list_head leaves; /* Radix Tree leaves */
288 unsigned long start_pfn; /* Zone start page frame */
289 unsigned long end_pfn; /* Zone end page frame + 1 */
290 struct rtree_node *rtree; /* Radix Tree Root */
291 int levels; /* Number of Radix Tree Levels */
292 unsigned int blocks; /* Number of Bitmap Blocks */
293};
268 294
269/* strcut bm_position is used for browsing memory bitmaps */ 295/* strcut bm_position is used for browsing memory bitmaps */
270 296
271struct bm_position { 297struct bm_position {
272 struct bm_block *block; 298 struct mem_zone_bm_rtree *zone;
273 int bit; 299 struct rtree_node *node;
300 unsigned long node_pfn;
301 int node_bit;
274}; 302};
275 303
276struct memory_bitmap { 304struct memory_bitmap {
277 struct list_head blocks; /* list of bitmap blocks */ 305 struct list_head zones;
278 struct linked_page *p_list; /* list of pages used to store zone 306 struct linked_page *p_list; /* list of pages used to store zone
279 * bitmap objects and bitmap block 307 * bitmap objects and bitmap block
280 * objects 308 * objects
@@ -284,38 +312,178 @@ struct memory_bitmap {
284 312
285/* Functions that operate on memory bitmaps */ 313/* Functions that operate on memory bitmaps */
286 314
287static void memory_bm_position_reset(struct memory_bitmap *bm) 315#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long))
316#if BITS_PER_LONG == 32
317#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2)
318#else
319#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3)
320#endif
321#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
322
323/*
324 * alloc_rtree_node - Allocate a new node and add it to the radix tree.
325 *
326 * This function is used to allocate inner nodes as well as the
327 * leave nodes of the radix tree. It also adds the node to the
328 * corresponding linked list passed in by the *list parameter.
329 */
330static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
331 struct chain_allocator *ca,
332 struct list_head *list)
288{ 333{
289 bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); 334 struct rtree_node *node;
290 bm->cur.bit = 0;
291}
292 335
293static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); 336 node = chain_alloc(ca, sizeof(struct rtree_node));
337 if (!node)
338 return NULL;
294 339
295/** 340 node->data = get_image_page(gfp_mask, safe_needed);
296 * create_bm_block_list - create a list of block bitmap objects 341 if (!node->data)
297 * @pages - number of pages to track 342 return NULL;
298 * @list - list to put the allocated blocks into 343
299 * @ca - chain allocator to be used for allocating memory 344 list_add_tail(&node->list, list);
345
346 return node;
347}
348
349/*
350 * add_rtree_block - Add a new leave node to the radix tree
351 *
352 * The leave nodes need to be allocated in order to keep the leaves
353 * linked list in order. This is guaranteed by the zone->blocks
354 * counter.
300 */ 355 */
301static int create_bm_block_list(unsigned long pages, 356static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
302 struct list_head *list, 357 int safe_needed, struct chain_allocator *ca)
303 struct chain_allocator *ca)
304{ 358{
305 unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); 359 struct rtree_node *node, *block, **dst;
360 unsigned int levels_needed, block_nr;
361 int i;
306 362
307 while (nr_blocks-- > 0) { 363 block_nr = zone->blocks;
308 struct bm_block *bb; 364 levels_needed = 0;
309 365
310 bb = chain_alloc(ca, sizeof(struct bm_block)); 366 /* How many levels do we need for this block nr? */
311 if (!bb) 367 while (block_nr) {
368 levels_needed += 1;
369 block_nr >>= BM_RTREE_LEVEL_SHIFT;
370 }
371
372 /* Make sure the rtree has enough levels */
373 for (i = zone->levels; i < levels_needed; i++) {
374 node = alloc_rtree_node(gfp_mask, safe_needed, ca,
375 &zone->nodes);
376 if (!node)
312 return -ENOMEM; 377 return -ENOMEM;
313 list_add(&bb->hook, list); 378
379 node->data[0] = (unsigned long)zone->rtree;
380 zone->rtree = node;
381 zone->levels += 1;
382 }
383
384 /* Allocate new block */
385 block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves);
386 if (!block)
387 return -ENOMEM;
388
389 /* Now walk the rtree to insert the block */
390 node = zone->rtree;
391 dst = &zone->rtree;
392 block_nr = zone->blocks;
393 for (i = zone->levels; i > 0; i--) {
394 int index;
395
396 if (!node) {
397 node = alloc_rtree_node(gfp_mask, safe_needed, ca,
398 &zone->nodes);
399 if (!node)
400 return -ENOMEM;
401 *dst = node;
402 }
403
404 index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
405 index &= BM_RTREE_LEVEL_MASK;
406 dst = (struct rtree_node **)&((*dst)->data[index]);
407 node = *dst;
314 } 408 }
315 409
410 zone->blocks += 1;
411 *dst = block;
412
316 return 0; 413 return 0;
317} 414}
318 415
416static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
417 int clear_nosave_free);
418
419/*
420 * create_zone_bm_rtree - create a radix tree for one zone
421 *
422 * Allocated the mem_zone_bm_rtree structure and initializes it.
423 * This function also allocated and builds the radix tree for the
424 * zone.
425 */
426static struct mem_zone_bm_rtree *
427create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
428 struct chain_allocator *ca,
429 unsigned long start, unsigned long end)
430{
431 struct mem_zone_bm_rtree *zone;
432 unsigned int i, nr_blocks;
433 unsigned long pages;
434
435 pages = end - start;
436 zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree));
437 if (!zone)
438 return NULL;
439
440 INIT_LIST_HEAD(&zone->nodes);
441 INIT_LIST_HEAD(&zone->leaves);
442 zone->start_pfn = start;
443 zone->end_pfn = end;
444 nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
445
446 for (i = 0; i < nr_blocks; i++) {
447 if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) {
448 free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR);
449 return NULL;
450 }
451 }
452
453 return zone;
454}
455
456/*
457 * free_zone_bm_rtree - Free the memory of the radix tree
458 *
459 * Free all node pages of the radix tree. The mem_zone_bm_rtree
460 * structure itself is not freed here nor are the rtree_node
461 * structs.
462 */
463static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
464 int clear_nosave_free)
465{
466 struct rtree_node *node;
467
468 list_for_each_entry(node, &zone->nodes, list)
469 free_image_page(node->data, clear_nosave_free);
470
471 list_for_each_entry(node, &zone->leaves, list)
472 free_image_page(node->data, clear_nosave_free);
473}
474
475static void memory_bm_position_reset(struct memory_bitmap *bm)
476{
477 bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
478 list);
479 bm->cur.node = list_entry(bm->cur.zone->leaves.next,
480 struct rtree_node, list);
481 bm->cur.node_pfn = 0;
482 bm->cur.node_bit = 0;
483}
484
485static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
486
319struct mem_extent { 487struct mem_extent {
320 struct list_head hook; 488 struct list_head hook;
321 unsigned long start; 489 unsigned long start;
@@ -407,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
407 int error; 575 int error;
408 576
409 chain_init(&ca, gfp_mask, safe_needed); 577 chain_init(&ca, gfp_mask, safe_needed);
410 INIT_LIST_HEAD(&bm->blocks); 578 INIT_LIST_HEAD(&bm->zones);
411 579
412 error = create_mem_extents(&mem_extents, gfp_mask); 580 error = create_mem_extents(&mem_extents, gfp_mask);
413 if (error) 581 if (error)
414 return error; 582 return error;
415 583
416 list_for_each_entry(ext, &mem_extents, hook) { 584 list_for_each_entry(ext, &mem_extents, hook) {
417 struct bm_block *bb; 585 struct mem_zone_bm_rtree *zone;
418 unsigned long pfn = ext->start;
419 unsigned long pages = ext->end - ext->start;
420
421 bb = list_entry(bm->blocks.prev, struct bm_block, hook);
422 586
423 error = create_bm_block_list(pages, bm->blocks.prev, &ca); 587 zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca,
424 if (error) 588 ext->start, ext->end);
589 if (!zone) {
590 error = -ENOMEM;
425 goto Error; 591 goto Error;
426
427 list_for_each_entry_continue(bb, &bm->blocks, hook) {
428 bb->data = get_image_page(gfp_mask, safe_needed);
429 if (!bb->data) {
430 error = -ENOMEM;
431 goto Error;
432 }
433
434 bb->start_pfn = pfn;
435 if (pages >= BM_BITS_PER_BLOCK) {
436 pfn += BM_BITS_PER_BLOCK;
437 pages -= BM_BITS_PER_BLOCK;
438 } else {
439 /* This is executed only once in the loop */
440 pfn += pages;
441 }
442 bb->end_pfn = pfn;
443 } 592 }
593 list_add_tail(&zone->list, &bm->zones);
444 } 594 }
445 595
446 bm->p_list = ca.chain; 596 bm->p_list = ca.chain;
@@ -460,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
460 */ 610 */
461static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) 611static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
462{ 612{
463 struct bm_block *bb; 613 struct mem_zone_bm_rtree *zone;
464 614
465 list_for_each_entry(bb, &bm->blocks, hook) 615 list_for_each_entry(zone, &bm->zones, list)
466 if (bb->data) 616 free_zone_bm_rtree(zone, clear_nosave_free);
467 free_image_page(bb->data, clear_nosave_free);
468 617
469 free_list_of_pages(bm->p_list, clear_nosave_free); 618 free_list_of_pages(bm->p_list, clear_nosave_free);
470 619
471 INIT_LIST_HEAD(&bm->blocks); 620 INIT_LIST_HEAD(&bm->zones);
472} 621}
473 622
474/** 623/**
475 * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds 624 * memory_bm_find_bit - Find the bit for pfn in the memory
476 * to given pfn. The cur_zone_bm member of @bm and the cur_block member 625 * bitmap
477 * of @bm->cur_zone_bm are updated. 626 *
627 * Find the bit in the bitmap @bm that corresponds to given pfn.
628 * The cur.zone, cur.block and cur.node_pfn member of @bm are
629 * updated.
630 * It walks the radix tree to find the page which contains the bit for
631 * pfn and returns the bit position in **addr and *bit_nr.
478 */ 632 */
479static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, 633static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
480 void **addr, unsigned int *bit_nr) 634 void **addr, unsigned int *bit_nr)
481{ 635{
482 struct bm_block *bb; 636 struct mem_zone_bm_rtree *curr, *zone;
637 struct rtree_node *node;
638 int i, block_nr;
483 639
640 zone = bm->cur.zone;
641
642 if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
643 goto zone_found;
644
645 zone = NULL;
646
647 /* Find the right zone */
648 list_for_each_entry(curr, &bm->zones, list) {
649 if (pfn >= curr->start_pfn && pfn < curr->end_pfn) {
650 zone = curr;
651 break;
652 }
653 }
654
655 if (!zone)
656 return -EFAULT;
657
658zone_found:
484 /* 659 /*
485 * Check if the pfn corresponds to the current bitmap block and find 660 * We have a zone. Now walk the radix tree to find the leave
486 * the block where it fits if this is not the case. 661 * node for our pfn.
487 */ 662 */
488 bb = bm->cur.block;
489 if (pfn < bb->start_pfn)
490 list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
491 if (pfn >= bb->start_pfn)
492 break;
493 663
494 if (pfn >= bb->end_pfn) 664 node = bm->cur.node;
495 list_for_each_entry_continue(bb, &bm->blocks, hook) 665 if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
496 if (pfn >= bb->start_pfn && pfn < bb->end_pfn) 666 goto node_found;
497 break;
498 667
499 if (&bb->hook == &bm->blocks) 668 node = zone->rtree;
500 return -EFAULT; 669 block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
670
671 for (i = zone->levels; i > 0; i--) {
672 int index;
673
674 index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
675 index &= BM_RTREE_LEVEL_MASK;
676 BUG_ON(node->data[index] == 0);
677 node = (struct rtree_node *)node->data[index];
678 }
679
680node_found:
681 /* Update last position */
682 bm->cur.zone = zone;
683 bm->cur.node = node;
684 bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
685
686 /* Set return values */
687 *addr = node->data;
688 *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK;
501 689
502 /* The block has been found */
503 bm->cur.block = bb;
504 pfn -= bb->start_pfn;
505 bm->cur.bit = pfn + 1;
506 *bit_nr = pfn;
507 *addr = bb->data;
508 return 0; 690 return 0;
509} 691}
510 692
@@ -528,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
528 error = memory_bm_find_bit(bm, pfn, &addr, &bit); 710 error = memory_bm_find_bit(bm, pfn, &addr, &bit);
529 if (!error) 711 if (!error)
530 set_bit(bit, addr); 712 set_bit(bit, addr);
713
531 return error; 714 return error;
532} 715}
533 716
@@ -542,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
542 clear_bit(bit, addr); 725 clear_bit(bit, addr);
543} 726}
544 727
728static void memory_bm_clear_current(struct memory_bitmap *bm)
729{
730 int bit;
731
732 bit = max(bm->cur.node_bit - 1, 0);
733 clear_bit(bit, bm->cur.node->data);
734}
735
545static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) 736static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
546{ 737{
547 void *addr; 738 void *addr;
@@ -561,38 +752,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
561 return !memory_bm_find_bit(bm, pfn, &addr, &bit); 752 return !memory_bm_find_bit(bm, pfn, &addr, &bit);
562} 753}
563 754
564/** 755/*
565 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit 756 * rtree_next_node - Jumps to the next leave node
566 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is 757 *
567 * returned. 758 * Sets the position to the beginning of the next node in the
759 * memory bitmap. This is either the next node in the current
760 * zone's radix tree or the first node in the radix tree of the
761 * next zone.
568 * 762 *
569 * It is required to run memory_bm_position_reset() before the first call to 763 * Returns true if there is a next node, false otherwise.
570 * this function.
571 */ 764 */
765static bool rtree_next_node(struct memory_bitmap *bm)
766{
767 bm->cur.node = list_entry(bm->cur.node->list.next,
768 struct rtree_node, list);
769 if (&bm->cur.node->list != &bm->cur.zone->leaves) {
770 bm->cur.node_pfn += BM_BITS_PER_BLOCK;
771 bm->cur.node_bit = 0;
772 touch_softlockup_watchdog();
773 return true;
774 }
775
776 /* No more nodes, goto next zone */
777 bm->cur.zone = list_entry(bm->cur.zone->list.next,
778 struct mem_zone_bm_rtree, list);
779 if (&bm->cur.zone->list != &bm->zones) {
780 bm->cur.node = list_entry(bm->cur.zone->leaves.next,
781 struct rtree_node, list);
782 bm->cur.node_pfn = 0;
783 bm->cur.node_bit = 0;
784 return true;
785 }
572 786
787 /* No more zones */
788 return false;
789}
790
791/**
792 * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
793 *
794 * Starting from the last returned position this function searches
795 * for the next set bit in the memory bitmap and returns its
796 * number. If no more bit is set BM_END_OF_MAP is returned.
797 *
798 * It is required to run memory_bm_position_reset() before the
799 * first call to this function.
800 */
573static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) 801static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
574{ 802{
575 struct bm_block *bb; 803 unsigned long bits, pfn, pages;
576 int bit; 804 int bit;
577 805
578 bb = bm->cur.block;
579 do { 806 do {
580 bit = bm->cur.bit; 807 pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
581 bit = find_next_bit(bb->data, bm_block_bits(bb), bit); 808 bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
582 if (bit < bm_block_bits(bb)) 809 bit = find_next_bit(bm->cur.node->data, bits,
583 goto Return_pfn; 810 bm->cur.node_bit);
584 811 if (bit < bits) {
585 bb = list_entry(bb->hook.next, struct bm_block, hook); 812 pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
586 bm->cur.block = bb; 813 bm->cur.node_bit = bit + 1;
587 bm->cur.bit = 0; 814 return pfn;
588 } while (&bb->hook != &bm->blocks); 815 }
816 } while (rtree_next_node(bm));
589 817
590 memory_bm_position_reset(bm);
591 return BM_END_OF_MAP; 818 return BM_END_OF_MAP;
592
593 Return_pfn:
594 bm->cur.bit = bit + 1;
595 return bb->start_pfn + bit;
596} 819}
597 820
598/** 821/**
@@ -816,12 +1039,17 @@ void free_basic_memory_bitmaps(void)
816 1039
817unsigned int snapshot_additional_pages(struct zone *zone) 1040unsigned int snapshot_additional_pages(struct zone *zone)
818{ 1041{
819 unsigned int res; 1042 unsigned int rtree, nodes;
1043
1044 rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
1045 rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node),
1046 LINKED_PAGE_DATA_SIZE);
1047 while (nodes > 1) {
1048 nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
1049 rtree += nodes;
1050 }
820 1051
821 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 1052 return 2 * rtree;
822 res += DIV_ROUND_UP(res * sizeof(struct bm_block),
823 LINKED_PAGE_DATA_SIZE);
824 return 2 * res;
825} 1053}
826 1054
827#ifdef CONFIG_HIGHMEM 1055#ifdef CONFIG_HIGHMEM
@@ -1094,23 +1322,35 @@ static struct memory_bitmap copy_bm;
1094 1322
1095void swsusp_free(void) 1323void swsusp_free(void)
1096{ 1324{
1097 struct zone *zone; 1325 unsigned long fb_pfn, fr_pfn;
1098 unsigned long pfn, max_zone_pfn;
1099 1326
1100 for_each_populated_zone(zone) { 1327 memory_bm_position_reset(forbidden_pages_map);
1101 max_zone_pfn = zone_end_pfn(zone); 1328 memory_bm_position_reset(free_pages_map);
1102 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1329
1103 if (pfn_valid(pfn)) { 1330loop:
1104 struct page *page = pfn_to_page(pfn); 1331 fr_pfn = memory_bm_next_pfn(free_pages_map);
1105 1332 fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
1106 if (swsusp_page_is_forbidden(page) && 1333
1107 swsusp_page_is_free(page)) { 1334 /*
1108 swsusp_unset_page_forbidden(page); 1335 * Find the next bit set in both bitmaps. This is guaranteed to
1109 swsusp_unset_page_free(page); 1336 * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP.
1110 __free_page(page); 1337 */
1111 } 1338 do {
1112 } 1339 if (fb_pfn < fr_pfn)
1340 fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
1341 if (fr_pfn < fb_pfn)
1342 fr_pfn = memory_bm_next_pfn(free_pages_map);
1343 } while (fb_pfn != fr_pfn);
1344
1345 if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
1346 struct page *page = pfn_to_page(fr_pfn);
1347
1348 memory_bm_clear_current(forbidden_pages_map);
1349 memory_bm_clear_current(free_pages_map);
1350 __free_page(page);
1351 goto loop;
1113 } 1352 }
1353
1114 nr_copy_pages = 0; 1354 nr_copy_pages = 0;
1115 nr_meta_pages = 0; 1355 nr_meta_pages = 0;
1116 restore_pblist = NULL; 1356 restore_pblist = NULL;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4dd8822f732a..9a071bea80eb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,20 +31,11 @@
31 31
32#include "power.h" 32#include "power.h"
33 33
34struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { 34static const char *pm_labels[] = { "mem", "standby", "freeze", };
35 [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, 35const char *pm_states[PM_SUSPEND_MAX];
36 [PM_SUSPEND_STANDBY] = { .label = "standby", },
37 [PM_SUSPEND_MEM] = { .label = "mem", },
38};
39 36
40static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
41static const struct platform_freeze_ops *freeze_ops; 38static const struct platform_freeze_ops *freeze_ops;
42
43static bool need_suspend_ops(suspend_state_t state)
44{
45 return state > PM_SUSPEND_FREEZE;
46}
47
48static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 39static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
49static bool suspend_freeze_wake; 40static bool suspend_freeze_wake;
50 41
@@ -97,10 +88,7 @@ static bool relative_states;
97static int __init sleep_states_setup(char *str) 88static int __init sleep_states_setup(char *str)
98{ 89{
99 relative_states = !strncmp(str, "1", 1); 90 relative_states = !strncmp(str, "1", 1);
100 if (relative_states) { 91 pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
101 pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
102 pm_states[PM_SUSPEND_FREEZE].state = 0;
103 }
104 return 1; 92 return 1;
105} 93}
106 94
@@ -113,20 +101,20 @@ __setup("relative_sleep_states=", sleep_states_setup);
113void suspend_set_ops(const struct platform_suspend_ops *ops) 101void suspend_set_ops(const struct platform_suspend_ops *ops)
114{ 102{
115 suspend_state_t i; 103 suspend_state_t i;
116 int j = PM_SUSPEND_MAX - 1; 104 int j = 0;
117 105
118 lock_system_sleep(); 106 lock_system_sleep();
119 107
120 suspend_ops = ops; 108 suspend_ops = ops;
121 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) 109 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
122 if (valid_state(i)) 110 if (valid_state(i)) {
123 pm_states[j--].state = i; 111 pm_states[i] = pm_labels[j++];
124 else if (!relative_states) 112 } else if (!relative_states) {
125 pm_states[j--].state = 0; 113 pm_states[i] = NULL;
114 j++;
115 }
126 116
127 pm_states[j--].state = PM_SUSPEND_FREEZE; 117 pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
128 while (j >= PM_SUSPEND_MIN)
129 pm_states[j--].state = 0;
130 118
131 unlock_system_sleep(); 119 unlock_system_sleep();
132} 120}
@@ -145,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state)
145} 133}
146EXPORT_SYMBOL_GPL(suspend_valid_only_mem); 134EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
147 135
136static bool sleep_state_supported(suspend_state_t state)
137{
138 return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
139}
140
141static int platform_suspend_prepare(suspend_state_t state)
142{
143 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
144 suspend_ops->prepare() : 0;
145}
146
147static int platform_suspend_prepare_late(suspend_state_t state)
148{
149 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
150 suspend_ops->prepare_late() : 0;
151}
152
153static void platform_suspend_wake(suspend_state_t state)
154{
155 if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
156 suspend_ops->wake();
157}
158
159static void platform_suspend_finish(suspend_state_t state)
160{
161 if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
162 suspend_ops->finish();
163}
164
165static int platform_suspend_begin(suspend_state_t state)
166{
167 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
168 return freeze_ops->begin();
169 else if (suspend_ops->begin)
170 return suspend_ops->begin(state);
171 else
172 return 0;
173}
174
175static void platform_suspend_end(suspend_state_t state)
176{
177 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
178 freeze_ops->end();
179 else if (suspend_ops->end)
180 suspend_ops->end();
181}
182
183static void platform_suspend_recover(suspend_state_t state)
184{
185 if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
186 suspend_ops->recover();
187}
188
189static bool platform_suspend_again(suspend_state_t state)
190{
191 return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
192 suspend_ops->suspend_again() : false;
193}
194
148static int suspend_test(int level) 195static int suspend_test(int level)
149{ 196{
150#ifdef CONFIG_PM_DEBUG 197#ifdef CONFIG_PM_DEBUG
@@ -168,7 +215,7 @@ static int suspend_prepare(suspend_state_t state)
168{ 215{
169 int error; 216 int error;
170 217
171 if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) 218 if (!sleep_state_supported(state))
172 return -EPERM; 219 return -EPERM;
173 220
174 pm_prepare_console(); 221 pm_prepare_console();
@@ -214,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
214{ 261{
215 int error; 262 int error;
216 263
217 if (need_suspend_ops(state) && suspend_ops->prepare) { 264 error = platform_suspend_prepare(state);
218 error = suspend_ops->prepare(); 265 if (error)
219 if (error) 266 goto Platform_finish;
220 goto Platform_finish;
221 }
222 267
223 error = dpm_suspend_end(PMSG_SUSPEND); 268 error = dpm_suspend_end(PMSG_SUSPEND);
224 if (error) { 269 if (error) {
225 printk(KERN_ERR "PM: Some devices failed to power down\n"); 270 printk(KERN_ERR "PM: Some devices failed to power down\n");
226 goto Platform_finish; 271 goto Platform_finish;
227 } 272 }
228 273 error = platform_suspend_prepare_late(state);
229 if (need_suspend_ops(state) && suspend_ops->prepare_late) { 274 if (error)
230 error = suspend_ops->prepare_late(); 275 goto Platform_wake;
231 if (error)
232 goto Platform_wake;
233 }
234 276
235 if (suspend_test(TEST_PLATFORM)) 277 if (suspend_test(TEST_PLATFORM))
236 goto Platform_wake; 278 goto Platform_wake;
@@ -278,15 +320,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
278 ftrace_start(); 320 ftrace_start();
279 321
280 Platform_wake: 322 Platform_wake:
281 if (need_suspend_ops(state) && suspend_ops->wake) 323 platform_suspend_wake(state);
282 suspend_ops->wake();
283
284 dpm_resume_start(PMSG_RESUME); 324 dpm_resume_start(PMSG_RESUME);
285 325
286 Platform_finish: 326 Platform_finish:
287 if (need_suspend_ops(state) && suspend_ops->finish) 327 platform_suspend_finish(state);
288 suspend_ops->finish();
289
290 return error; 328 return error;
291} 329}
292 330
@@ -299,18 +337,13 @@ int suspend_devices_and_enter(suspend_state_t state)
299 int error; 337 int error;
300 bool wakeup = false; 338 bool wakeup = false;
301 339
302 if (need_suspend_ops(state) && !suspend_ops) 340 if (!sleep_state_supported(state))
303 return -ENOSYS; 341 return -ENOSYS;
304 342
305 if (need_suspend_ops(state) && suspend_ops->begin) { 343 error = platform_suspend_begin(state);
306 error = suspend_ops->begin(state); 344 if (error)
307 if (error) 345 goto Close;
308 goto Close; 346
309 } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) {
310 error = freeze_ops->begin();
311 if (error)
312 goto Close;
313 }
314 suspend_console(); 347 suspend_console();
315 suspend_test_start(); 348 suspend_test_start();
316 error = dpm_suspend_start(PMSG_SUSPEND); 349 error = dpm_suspend_start(PMSG_SUSPEND);
@@ -324,25 +357,20 @@ int suspend_devices_and_enter(suspend_state_t state)
324 357
325 do { 358 do {
326 error = suspend_enter(state, &wakeup); 359 error = suspend_enter(state, &wakeup);
327 } while (!error && !wakeup && need_suspend_ops(state) 360 } while (!error && !wakeup && platform_suspend_again(state));
328 && suspend_ops->suspend_again && suspend_ops->suspend_again());
329 361
330 Resume_devices: 362 Resume_devices:
331 suspend_test_start(); 363 suspend_test_start();
332 dpm_resume_end(PMSG_RESUME); 364 dpm_resume_end(PMSG_RESUME);
333 suspend_test_finish("resume devices"); 365 suspend_test_finish("resume devices");
334 resume_console(); 366 resume_console();
335 Close:
336 if (need_suspend_ops(state) && suspend_ops->end)
337 suspend_ops->end();
338 else if (state == PM_SUSPEND_FREEZE && freeze_ops->end)
339 freeze_ops->end();
340 367
368 Close:
369 platform_suspend_end(state);
341 return error; 370 return error;
342 371
343 Recover_platform: 372 Recover_platform:
344 if (need_suspend_ops(state) && suspend_ops->recover) 373 platform_suspend_recover(state);
345 suspend_ops->recover();
346 goto Resume_devices; 374 goto Resume_devices;
347} 375}
348 376
@@ -395,7 +423,7 @@ static int enter_state(suspend_state_t state)
395 printk("done.\n"); 423 printk("done.\n");
396 trace_suspend_resume(TPS("sync_filesystems"), 0, false); 424 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
397 425
398 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); 426 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
399 error = suspend_prepare(state); 427 error = suspend_prepare(state);
400 if (error) 428 if (error)
401 goto Unlock; 429 goto Unlock;
@@ -404,7 +432,7 @@ static int enter_state(suspend_state_t state)
404 goto Finish; 432 goto Finish;
405 433
406 trace_suspend_resume(TPS("suspend_enter"), state, false); 434 trace_suspend_resume(TPS("suspend_enter"), state, false);
407 pr_debug("PM: Entering %s sleep\n", pm_states[state].label); 435 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
408 pm_restrict_gfp_mask(); 436 pm_restrict_gfp_mask();
409 error = suspend_devices_and_enter(state); 437 error = suspend_devices_and_enter(state);
410 pm_restore_gfp_mask(); 438 pm_restore_gfp_mask();
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 269b097e78ea..2f524928b6aa 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
92 } 92 }
93 93
94 if (state == PM_SUSPEND_MEM) { 94 if (state == PM_SUSPEND_MEM) {
95 printk(info_test, pm_states[state].label); 95 printk(info_test, pm_states[state]);
96 status = pm_suspend(state); 96 status = pm_suspend(state);
97 if (status == -ENODEV) 97 if (status == -ENODEV)
98 state = PM_SUSPEND_STANDBY; 98 state = PM_SUSPEND_STANDBY;
99 } 99 }
100 if (state == PM_SUSPEND_STANDBY) { 100 if (state == PM_SUSPEND_STANDBY) {
101 printk(info_test, pm_states[state].label); 101 printk(info_test, pm_states[state]);
102 status = pm_suspend(state); 102 status = pm_suspend(state);
103 } 103 }
104 if (status < 0) 104 if (status < 0)
@@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value)
141 /* "=mem" ==> "mem" */ 141 /* "=mem" ==> "mem" */
142 value++; 142 value++;
143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) 143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
144 if (!strcmp(pm_states[i].label, value)) { 144 if (!strcmp(pm_states[i], value)) {
145 test_state = pm_states[i].state; 145 test_state = i;
146 return 0; 146 return 0;
147 } 147 }
148 148
@@ -162,8 +162,8 @@ static int __init test_suspend(void)
162 /* PM is initialized by now; is that state testable? */ 162 /* PM is initialized by now; is that state testable? */
163 if (test_state == PM_SUSPEND_ON) 163 if (test_state == PM_SUSPEND_ON)
164 goto done; 164 goto done;
165 if (!pm_states[test_state].state) { 165 if (!pm_states[test_state]) {
166 printk(warn_bad_state, pm_states[test_state].label); 166 printk(warn_bad_state, pm_states[test_state]);
167 goto done; 167 goto done;
168 } 168 }
169 169
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 98d357584cd6..526e8911460a 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
49 struct snapshot_data *data; 49 struct snapshot_data *data;
50 int error; 50 int error;
51 51
52 if (!hibernation_available())
53 return -EPERM;
54
52 lock_system_sleep(); 55 lock_system_sleep();
53 56
54 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 57 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ea2d5f6962ed..13e839dbca07 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1416,9 +1416,10 @@ static int have_callable_console(void)
1416/* 1416/*
1417 * Can we actually use the console at this time on this cpu? 1417 * Can we actually use the console at this time on this cpu?
1418 * 1418 *
1419 * Console drivers may assume that per-cpu resources have been allocated. So 1419 * Console drivers may assume that per-cpu resources have
1420 * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't 1420 * been allocated. So unless they're explicitly marked as
1421 * call them until this CPU is officially up. 1421 * being able to cope (CON_ANYTIME) don't call them until
1422 * this CPU is officially up.
1422 */ 1423 */
1423static inline int can_use_console(unsigned int cpu) 1424static inline int can_use_console(unsigned int cpu)
1424{ 1425{
@@ -1431,10 +1432,8 @@ static inline int can_use_console(unsigned int cpu)
1431 * console_lock held, and 'console_locked' set) if it 1432 * console_lock held, and 'console_locked' set) if it
1432 * is successful, false otherwise. 1433 * is successful, false otherwise.
1433 */ 1434 */
1434static int console_trylock_for_printk(void) 1435static int console_trylock_for_printk(unsigned int cpu)
1435{ 1436{
1436 unsigned int cpu = smp_processor_id();
1437
1438 if (!console_trylock()) 1437 if (!console_trylock())
1439 return 0; 1438 return 0;
1440 /* 1439 /*
@@ -1609,8 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1609 */ 1608 */
1610 if (!oops_in_progress && !lockdep_recursing(current)) { 1609 if (!oops_in_progress && !lockdep_recursing(current)) {
1611 recursion_bug = 1; 1610 recursion_bug = 1;
1612 local_irq_restore(flags); 1611 goto out_restore_irqs;
1613 return 0;
1614 } 1612 }
1615 zap_locks(); 1613 zap_locks();
1616 } 1614 }
@@ -1718,27 +1716,21 @@ asmlinkage int vprintk_emit(int facility, int level,
1718 1716
1719 logbuf_cpu = UINT_MAX; 1717 logbuf_cpu = UINT_MAX;
1720 raw_spin_unlock(&logbuf_lock); 1718 raw_spin_unlock(&logbuf_lock);
1721 lockdep_on();
1722 local_irq_restore(flags);
1723 1719
1724 /* If called from the scheduler, we can not call up(). */ 1720 /* If called from the scheduler, we can not call up(). */
1725 if (in_sched) 1721 if (!in_sched) {
1726 return printed_len; 1722 /*
1727 1723 * Try to acquire and then immediately release the console
1728 /* 1724 * semaphore. The release will print out buffers and wake up
1729 * Disable preemption to avoid being preempted while holding 1725 * /dev/kmsg and syslog() users.
1730 * console_sem which would prevent anyone from printing to console 1726 */
1731 */ 1727 if (console_trylock_for_printk(this_cpu))
1732 preempt_disable(); 1728 console_unlock();
1733 /* 1729 }
1734 * Try to acquire and then immediately release the console semaphore.
1735 * The release will print out buffers and wake up /dev/kmsg and syslog()
1736 * users.
1737 */
1738 if (console_trylock_for_printk())
1739 console_unlock();
1740 preempt_enable();
1741 1730
1731 lockdep_on();
1732out_restore_irqs:
1733 local_irq_restore(flags);
1742 return printed_len; 1734 return printed_len;
1743} 1735}
1744EXPORT_SYMBOL(vprintk_emit); 1736EXPORT_SYMBOL(vprintk_emit);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1ba77363fbb..625d0b0cd75a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu)
206 rdp->passed_quiesce = 1; 206 rdp->passed_quiesce = 1;
207} 207}
208 208
209static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
210
211static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
212 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
213 .dynticks = ATOMIC_INIT(1),
214#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
215 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
216 .dynticks_idle = ATOMIC_INIT(1),
217#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
218};
219
220/*
221 * Let the RCU core know that this CPU has gone through the scheduler,
222 * which is a quiescent state. This is called when the need for a
223 * quiescent state is urgent, so we burn an atomic operation and full
224 * memory barriers to let the RCU core know about it, regardless of what
225 * this CPU might (or might not) do in the near future.
226 *
227 * We inform the RCU core by emulating a zero-duration dyntick-idle
228 * period, which we in turn do by incrementing the ->dynticks counter
229 * by two.
230 */
231static void rcu_momentary_dyntick_idle(void)
232{
233 unsigned long flags;
234 struct rcu_data *rdp;
235 struct rcu_dynticks *rdtp;
236 int resched_mask;
237 struct rcu_state *rsp;
238
239 local_irq_save(flags);
240
241 /*
242 * Yes, we can lose flag-setting operations. This is OK, because
243 * the flag will be set again after some delay.
244 */
245 resched_mask = raw_cpu_read(rcu_sched_qs_mask);
246 raw_cpu_write(rcu_sched_qs_mask, 0);
247
248 /* Find the flavor that needs a quiescent state. */
249 for_each_rcu_flavor(rsp) {
250 rdp = raw_cpu_ptr(rsp->rda);
251 if (!(resched_mask & rsp->flavor_mask))
252 continue;
253 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
254 if (ACCESS_ONCE(rdp->mynode->completed) !=
255 ACCESS_ONCE(rdp->cond_resched_completed))
256 continue;
257
258 /*
259 * Pretend to be momentarily idle for the quiescent state.
260 * This allows the grace-period kthread to record the
261 * quiescent state, with no need for this CPU to do anything
262 * further.
263 */
264 rdtp = this_cpu_ptr(&rcu_dynticks);
265 smp_mb__before_atomic(); /* Earlier stuff before QS. */
266 atomic_add(2, &rdtp->dynticks); /* QS. */
267 smp_mb__after_atomic(); /* Later stuff after QS. */
268 break;
269 }
270 local_irq_restore(flags);
271}
272
209/* 273/*
210 * Note a context switch. This is a quiescent state for RCU-sched, 274 * Note a context switch. This is a quiescent state for RCU-sched,
211 * and requires special handling for preemptible RCU. 275 * and requires special handling for preemptible RCU.
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu)
216 trace_rcu_utilization(TPS("Start context switch")); 280 trace_rcu_utilization(TPS("Start context switch"));
217 rcu_sched_qs(cpu); 281 rcu_sched_qs(cpu);
218 rcu_preempt_note_context_switch(cpu); 282 rcu_preempt_note_context_switch(cpu);
283 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
284 rcu_momentary_dyntick_idle();
219 trace_rcu_utilization(TPS("End context switch")); 285 trace_rcu_utilization(TPS("End context switch"));
220} 286}
221EXPORT_SYMBOL_GPL(rcu_note_context_switch); 287EXPORT_SYMBOL_GPL(rcu_note_context_switch);
222 288
223static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
224 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
225 .dynticks = ATOMIC_INIT(1),
226#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
227 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
228 .dynticks_idle = ATOMIC_INIT(1),
229#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
230};
231
232static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 289static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
233static long qhimark = 10000; /* If this many pending, ignore blimit. */ 290static long qhimark = 10000; /* If this many pending, ignore blimit. */
234static long qlowmark = 100; /* Once only this many pending, use blimit. */ 291static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
243module_param(jiffies_till_first_fqs, ulong, 0644); 300module_param(jiffies_till_first_fqs, ulong, 0644);
244module_param(jiffies_till_next_fqs, ulong, 0644); 301module_param(jiffies_till_next_fqs, ulong, 0644);
245 302
303/*
304 * How long the grace period must be before we start recruiting
305 * quiescent-state help from rcu_note_context_switch().
306 */
307static ulong jiffies_till_sched_qs = HZ / 20;
308module_param(jiffies_till_sched_qs, ulong, 0644);
309
246static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 310static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
247 struct rcu_data *rdp); 311 struct rcu_data *rdp);
248static void force_qs_rnp(struct rcu_state *rsp, 312static void force_qs_rnp(struct rcu_state *rsp,
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
853 bool *isidle, unsigned long *maxj) 917 bool *isidle, unsigned long *maxj)
854{ 918{
855 unsigned int curr; 919 unsigned int curr;
920 int *rcrmp;
856 unsigned int snap; 921 unsigned int snap;
857 922
858 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); 923 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
893 } 958 }
894 959
895 /* 960 /*
896 * There is a possibility that a CPU in adaptive-ticks state 961 * A CPU running for an extended time within the kernel can
897 * might run in the kernel with the scheduling-clock tick disabled 962 * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
898 * for an extended time period. Invoke rcu_kick_nohz_cpu() to 963 * even context-switching back and forth between a pair of
899 * force the CPU to restart the scheduling-clock tick in this 964 * in-kernel CPU-bound tasks cannot advance grace periods.
900 * CPU is in this state. 965 * So if the grace period is old enough, make the CPU pay attention.
901 */ 966 * Note that the unsynchronized assignments to the per-CPU
902 rcu_kick_nohz_cpu(rdp->cpu); 967 * rcu_sched_qs_mask variable are safe. Yes, setting of
903 968 * bits can be lost, but they will be set again on the next
904 /* 969 * force-quiescent-state pass. So lost bit sets do not result
905 * Alternatively, the CPU might be running in the kernel 970 * in incorrect behavior, merely in a grace period lasting
906 * for an extended period of time without a quiescent state. 971 * a few jiffies longer than it might otherwise. Because
907 * Attempt to force the CPU through the scheduler to gain the 972 * there are at most four threads involved, and because the
908 * needed quiescent state, but only if the grace period has gone 973 * updates are only once every few jiffies, the probability of
909 * on for an uncommonly long time. If there are many stuck CPUs, 974 * lossage (and thus of slight grace-period extension) is
910 * we will beat on the first one until it gets unstuck, then move 975 * quite low.
911 * to the next. Only do this for the primary flavor of RCU. 976 *
977 * Note that if the jiffies_till_sched_qs boot/sysfs parameter
978 * is set too high, we override with half of the RCU CPU stall
979 * warning delay.
912 */ 980 */
913 if (rdp->rsp == rcu_state_p && 981 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
982 if (ULONG_CMP_GE(jiffies,
983 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
914 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 984 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
915 rdp->rsp->jiffies_resched += 5; 985 if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
916 resched_cpu(rdp->cpu); 986 ACCESS_ONCE(rdp->cond_resched_completed) =
987 ACCESS_ONCE(rdp->mynode->completed);
988 smp_mb(); /* ->cond_resched_completed before *rcrmp. */
989 ACCESS_ONCE(*rcrmp) =
990 ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
991 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
992 rdp->rsp->jiffies_resched += 5; /* Enable beating. */
993 } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
994 /* Time to beat on that CPU again! */
995 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
996 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
997 }
917 } 998 }
918 999
919 return 0; 1000 return 0;
@@ -3491,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3491 "rcu_node_fqs_1", 3572 "rcu_node_fqs_1",
3492 "rcu_node_fqs_2", 3573 "rcu_node_fqs_2",
3493 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ 3574 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
3575 static u8 fl_mask = 0x1;
3494 int cpustride = 1; 3576 int cpustride = 1;
3495 int i; 3577 int i;
3496 int j; 3578 int j;
@@ -3509,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3509 for (i = 1; i < rcu_num_lvls; i++) 3591 for (i = 1; i < rcu_num_lvls; i++)
3510 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 3592 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
3511 rcu_init_levelspread(rsp); 3593 rcu_init_levelspread(rsp);
3594 rsp->flavor_mask = fl_mask;
3595 fl_mask <<= 1;
3512 3596
3513 /* Initialize the elements themselves, starting from the leaves. */ 3597 /* Initialize the elements themselves, starting from the leaves. */
3514 3598
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf2c1e669691..0f69a79c5b7d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -307,6 +307,9 @@ struct rcu_data {
307 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 307 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
308 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 308 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
309 unsigned long offline_fqs; /* Kicked due to being offline. */ 309 unsigned long offline_fqs; /* Kicked due to being offline. */
310 unsigned long cond_resched_completed;
311 /* Grace period that needs help */
312 /* from cond_resched(). */
310 313
311 /* 5) __rcu_pending() statistics. */ 314 /* 5) __rcu_pending() statistics. */
312 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 315 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -392,6 +395,7 @@ struct rcu_state {
392 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ 395 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
393 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 396 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
394 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 397 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
398 u8 flavor_mask; /* bit in flavor mask. */
395 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 399 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
396 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 400 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
397 void (*func)(struct rcu_head *head)); 401 void (*func)(struct rcu_head *head));
@@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
563static void do_nocb_deferred_wakeup(struct rcu_data *rdp); 567static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
564static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 568static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
565static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 569static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
566static void rcu_kick_nohz_cpu(int cpu); 570static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
567static bool init_nocb_callback_list(struct rcu_data *rdp); 571static bool init_nocb_callback_list(struct rcu_data *rdp);
568static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 572static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
569static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); 573static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cbc2c45265e2..02ac0fb186b8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2404 * if an adaptive-ticks CPU is failing to respond to the current grace 2404 * if an adaptive-ticks CPU is failing to respond to the current grace
2405 * period and has not be idle from an RCU perspective, kick it. 2405 * period and has not be idle from an RCU perspective, kick it.
2406 */ 2406 */
2407static void rcu_kick_nohz_cpu(int cpu) 2407static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2408{ 2408{
2409#ifdef CONFIG_NO_HZ_FULL 2409#ifdef CONFIG_NO_HZ_FULL
2410 if (tick_nohz_full_cpu(cpu)) 2410 if (tick_nohz_full_cpu(cpu))
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a2aeb4df0f60..bc7883570530 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -200,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf)
200EXPORT_SYMBOL_GPL(wait_rcu_gp); 200EXPORT_SYMBOL_GPL(wait_rcu_gp);
201 201
202#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 202#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
203static inline void debug_init_rcu_head(struct rcu_head *head) 203void init_rcu_head(struct rcu_head *head)
204{ 204{
205 debug_object_init(head, &rcuhead_debug_descr); 205 debug_object_init(head, &rcuhead_debug_descr);
206} 206}
207 207
208static inline void debug_rcu_head_free(struct rcu_head *head) 208void destroy_rcu_head(struct rcu_head *head)
209{ 209{
210 debug_object_free(head, &rcuhead_debug_descr); 210 debug_object_free(head, &rcuhead_debug_descr);
211} 211}
@@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
350early_initcall(check_cpu_stall_init); 350early_initcall(check_cpu_stall_init);
351 351
352#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 352#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
353
354/*
355 * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
356 */
357
358DEFINE_PER_CPU(int, rcu_cond_resched_count);
359
360/*
361 * Report a set of RCU quiescent states, for use by cond_resched()
362 * and friends. Out of line due to being called infrequently.
363 */
364void rcu_resched(void)
365{
366 preempt_disable();
367 __this_cpu_write(rcu_cond_resched_count, 0);
368 rcu_note_context_switch(smp_processor_id());
369 preempt_enable();
370}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b494fe..bc1638b33449 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4147,7 +4147,6 @@ static void __cond_resched(void)
4147 4147
4148int __sched _cond_resched(void) 4148int __sched _cond_resched(void)
4149{ 4149{
4150 rcu_cond_resched();
4151 if (should_resched()) { 4150 if (should_resched()) {
4152 __cond_resched(); 4151 __cond_resched();
4153 return 1; 4152 return 1;
@@ -4166,18 +4165,15 @@ EXPORT_SYMBOL(_cond_resched);
4166 */ 4165 */
4167int __cond_resched_lock(spinlock_t *lock) 4166int __cond_resched_lock(spinlock_t *lock)
4168{ 4167{
4169 bool need_rcu_resched = rcu_should_resched();
4170 int resched = should_resched(); 4168 int resched = should_resched();
4171 int ret = 0; 4169 int ret = 0;
4172 4170
4173 lockdep_assert_held(lock); 4171 lockdep_assert_held(lock);
4174 4172
4175 if (spin_needbreak(lock) || resched || need_rcu_resched) { 4173 if (spin_needbreak(lock) || resched) {
4176 spin_unlock(lock); 4174 spin_unlock(lock);
4177 if (resched) 4175 if (resched)
4178 __cond_resched(); 4176 __cond_resched();
4179 else if (unlikely(need_rcu_resched))
4180 rcu_resched();
4181 else 4177 else
4182 cpu_relax(); 4178 cpu_relax();
4183 ret = 1; 4179 ret = 1;
@@ -4191,7 +4187,6 @@ int __sched __cond_resched_softirq(void)
4191{ 4187{
4192 BUG_ON(!in_softirq()); 4188 BUG_ON(!in_softirq());
4193 4189
4194 rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
4195 if (should_resched()) { 4190 if (should_resched()) {
4196 local_bh_enable(); 4191 local_bh_enable();
4197 __cond_resched(); 4192 __cond_resched();
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 695f9773bb60..627b3c34b821 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
608 608
609 avg_atom = p->se.sum_exec_runtime; 609 avg_atom = p->se.sum_exec_runtime;
610 if (nr_switches) 610 if (nr_switches)
611 do_div(avg_atom, nr_switches); 611 avg_atom = div64_ul(avg_atom, nr_switches);
612 else 612 else
613 avg_atom = -1LL; 613 avg_atom = -1LL;
614 614
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cf009fb0bc25..658a58dc30f4 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,8 +147,6 @@ use_default:
147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default; 148 goto use_default;
149 149
150 trace_cpu_idle_rcuidle(next_state, dev->cpu);
151
152 /* 150 /*
153 * Enter the idle state previously returned by the governor decision. 151 * Enter the idle state previously returned by the governor decision.
154 * This function will block until an interrupt occurs and will take 152 * This function will block until an interrupt occurs and will take
@@ -156,8 +154,6 @@ use_default:
156 */ 154 */
157 entered_state = cpuidle_enter(drv, dev, next_state); 155 entered_state = cpuidle_enter(drv, dev, next_state);
158 156
159 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
160
161 if (broadcast) 157 if (broadcast)
162 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 158 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
163 159
diff --git a/kernel/smp.c b/kernel/smp.c
index 306f8180b0d5..80c33f8de14f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
29 29
30static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); 30static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
31 31
32static void flush_smp_call_function_queue(bool warn_cpu_offline);
33
32static int 34static int
33hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 35hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
34{ 36{
@@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
51#ifdef CONFIG_HOTPLUG_CPU 53#ifdef CONFIG_HOTPLUG_CPU
52 case CPU_UP_CANCELED: 54 case CPU_UP_CANCELED:
53 case CPU_UP_CANCELED_FROZEN: 55 case CPU_UP_CANCELED_FROZEN:
56 /* Fall-through to the CPU_DEAD[_FROZEN] case. */
54 57
55 case CPU_DEAD: 58 case CPU_DEAD:
56 case CPU_DEAD_FROZEN: 59 case CPU_DEAD_FROZEN:
57 free_cpumask_var(cfd->cpumask); 60 free_cpumask_var(cfd->cpumask);
58 free_percpu(cfd->csd); 61 free_percpu(cfd->csd);
59 break; 62 break;
63
64 case CPU_DYING:
65 case CPU_DYING_FROZEN:
66 /*
67 * The IPIs for the smp-call-function callbacks queued by other
68 * CPUs might arrive late, either due to hardware latencies or
69 * because this CPU disabled interrupts (inside stop-machine)
70 * before the IPIs were sent. So flush out any pending callbacks
71 * explicitly (without waiting for the IPIs to arrive), to
72 * ensure that the outgoing CPU doesn't go offline with work
73 * still pending.
74 */
75 flush_smp_call_function_queue(false);
76 break;
60#endif 77#endif
61 }; 78 };
62 79
@@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
177 return 0; 194 return 0;
178} 195}
179 196
180/* 197/**
181 * Invoked by arch to handle an IPI for call function single. Must be 198 * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
182 * called from the arch with interrupts disabled. 199 *
200 * Invoked by arch to handle an IPI for call function single.
201 * Must be called with interrupts disabled.
183 */ 202 */
184void generic_smp_call_function_single_interrupt(void) 203void generic_smp_call_function_single_interrupt(void)
185{ 204{
205 flush_smp_call_function_queue(true);
206}
207
208/**
209 * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
210 *
211 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
212 * offline CPU. Skip this check if set to 'false'.
213 *
214 * Flush any pending smp-call-function callbacks queued on this CPU. This is
215 * invoked by the generic IPI handler, as well as by a CPU about to go offline,
216 * to ensure that all pending IPI callbacks are run before it goes completely
217 * offline.
218 *
219 * Loop through the call_single_queue and run all the queued callbacks.
220 * Must be called with interrupts disabled.
221 */
222static void flush_smp_call_function_queue(bool warn_cpu_offline)
223{
224 struct llist_head *head;
186 struct llist_node *entry; 225 struct llist_node *entry;
187 struct call_single_data *csd, *csd_next; 226 struct call_single_data *csd, *csd_next;
188 static bool warned; 227 static bool warned;
189 228
190 entry = llist_del_all(&__get_cpu_var(call_single_queue)); 229 WARN_ON(!irqs_disabled());
230
231 head = &__get_cpu_var(call_single_queue);
232 entry = llist_del_all(head);
191 entry = llist_reverse_order(entry); 233 entry = llist_reverse_order(entry);
192 234
193 /* 235 /* There shouldn't be any pending callbacks on an offline CPU. */
194 * Shouldn't receive this interrupt on a cpu that is not yet online. 236 if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
195 */ 237 !warned && !llist_empty(head))) {
196 if (unlikely(!cpu_online(smp_processor_id()) && !warned)) {
197 warned = true; 238 warned = true;
198 WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); 239 WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
199 240
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba9ed453c4ed..75b22e22a72c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
136/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 136/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
137static int maxolduid = 65535; 137static int maxolduid = 65535;
138static int minolduid; 138static int minolduid;
139static int min_percpu_pagelist_fract = 8;
140 139
141static int ngroups_max = NGROUPS_MAX; 140static int ngroups_max = NGROUPS_MAX;
142static const int cap_last_cap = CAP_LAST_CAP; 141static const int cap_last_cap = CAP_LAST_CAP;
@@ -152,10 +151,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
152#ifdef CONFIG_SPARC 151#ifdef CONFIG_SPARC
153#endif 152#endif
154 153
155#ifdef CONFIG_SPARC64
156extern int sysctl_tsb_ratio;
157#endif
158
159#ifdef __hppa__ 154#ifdef __hppa__
160extern int pwrsw_enabled; 155extern int pwrsw_enabled;
161#endif 156#endif
@@ -865,6 +860,17 @@ static struct ctl_table kern_table[] = {
865 .extra1 = &zero, 860 .extra1 = &zero,
866 .extra2 = &one, 861 .extra2 = &one,
867 }, 862 },
863#ifdef CONFIG_SMP
864 {
865 .procname = "softlockup_all_cpu_backtrace",
866 .data = &sysctl_softlockup_all_cpu_backtrace,
867 .maxlen = sizeof(int),
868 .mode = 0644,
869 .proc_handler = proc_dointvec_minmax,
870 .extra1 = &zero,
871 .extra2 = &one,
872 },
873#endif /* CONFIG_SMP */
868 { 874 {
869 .procname = "nmi_watchdog", 875 .procname = "nmi_watchdog",
870 .data = &watchdog_user_enabled, 876 .data = &watchdog_user_enabled,
@@ -1321,7 +1327,7 @@ static struct ctl_table vm_table[] = {
1321 .maxlen = sizeof(percpu_pagelist_fraction), 1327 .maxlen = sizeof(percpu_pagelist_fraction),
1322 .mode = 0644, 1328 .mode = 0644,
1323 .proc_handler = percpu_pagelist_fraction_sysctl_handler, 1329 .proc_handler = percpu_pagelist_fraction_sysctl_handler,
1324 .extra1 = &min_percpu_pagelist_fract, 1330 .extra1 = &zero,
1325 }, 1331 },
1326#ifdef CONFIG_MMU 1332#ifdef CONFIG_MMU
1327 { 1333 {
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65a430d..fe75444ae7ec 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
585 struct itimerspec *new_setting, 585 struct itimerspec *new_setting,
586 struct itimerspec *old_setting) 586 struct itimerspec *old_setting)
587{ 587{
588 ktime_t exp;
589
588 if (!rtcdev) 590 if (!rtcdev)
589 return -ENOTSUPP; 591 return -ENOTSUPP;
590 592
593 if (flags & ~TIMER_ABSTIME)
594 return -EINVAL;
595
591 if (old_setting) 596 if (old_setting)
592 alarm_timer_get(timr, old_setting); 597 alarm_timer_get(timr, old_setting);
593 598
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
597 602
598 /* start the timer */ 603 /* start the timer */
599 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); 604 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
600 alarm_start(&timr->it.alarm.alarmtimer, 605 exp = timespec_to_ktime(new_setting->it_value);
601 timespec_to_ktime(new_setting->it_value)); 606 /* Convert (if necessary) to absolute time */
607 if (flags != TIMER_ABSTIME) {
608 ktime_t now;
609
610 now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
611 exp = ktime_add(now, exp);
612 }
613
614 alarm_start(&timr->it.alarm.alarmtimer, exp);
602 return 0; 615 return 0;
603} 616}
604 617
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
730 if (!alarmtimer_get_rtcdev()) 743 if (!alarmtimer_get_rtcdev())
731 return -ENOTSUPP; 744 return -ENOTSUPP;
732 745
746 if (flags & ~TIMER_ABSTIME)
747 return -EINVAL;
748
733 if (!capable(CAP_WAKE_ALARM)) 749 if (!capable(CAP_WAKE_ALARM))
734 return -EPERM; 750 return -EPERM;
735 751
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b372e3ed675..ac9d1dad630b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -265,12 +265,12 @@ static void update_ftrace_function(void)
265 func = ftrace_ops_list_func; 265 func = ftrace_ops_list_func;
266 } 266 }
267 267
268 update_function_graph_func();
269
268 /* If there's no change, then do nothing more here */ 270 /* If there's no change, then do nothing more here */
269 if (ftrace_trace_function == func) 271 if (ftrace_trace_function == func)
270 return; 272 return;
271 273
272 update_function_graph_func();
273
274 /* 274 /*
275 * If we are using the list function, it doesn't care 275 * If we are using the list function, it doesn't care
276 * about the function_trace_ops. 276 * about the function_trace_ops.
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7c56c3d06943..ff7027199a9a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -616,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
616 struct ring_buffer_per_cpu *cpu_buffer; 616 struct ring_buffer_per_cpu *cpu_buffer;
617 struct rb_irq_work *work; 617 struct rb_irq_work *work;
618 618
619 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
620 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
621 return POLLIN | POLLRDNORM;
622
623 if (cpu == RING_BUFFER_ALL_CPUS) 619 if (cpu == RING_BUFFER_ALL_CPUS)
624 work = &buffer->irq_work; 620 work = &buffer->irq_work;
625 else { 621 else {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 384ede311717..291397e66669 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -466,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size)
466 struct print_entry *entry; 466 struct print_entry *entry;
467 unsigned long irq_flags; 467 unsigned long irq_flags;
468 int alloc; 468 int alloc;
469 int pc;
470
471 if (!(trace_flags & TRACE_ITER_PRINTK))
472 return 0;
473
474 pc = preempt_count();
469 475
470 if (unlikely(tracing_selftest_running || tracing_disabled)) 476 if (unlikely(tracing_selftest_running || tracing_disabled))
471 return 0; 477 return 0;
@@ -475,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
475 local_save_flags(irq_flags); 481 local_save_flags(irq_flags);
476 buffer = global_trace.trace_buffer.buffer; 482 buffer = global_trace.trace_buffer.buffer;
477 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, 483 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
478 irq_flags, preempt_count()); 484 irq_flags, pc);
479 if (!event) 485 if (!event)
480 return 0; 486 return 0;
481 487
@@ -492,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
492 entry->buf[size] = '\0'; 498 entry->buf[size] = '\0';
493 499
494 __buffer_unlock_commit(buffer, event); 500 __buffer_unlock_commit(buffer, event);
501 ftrace_trace_stack(buffer, irq_flags, 4, pc);
495 502
496 return size; 503 return size;
497} 504}
@@ -509,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str)
509 struct bputs_entry *entry; 516 struct bputs_entry *entry;
510 unsigned long irq_flags; 517 unsigned long irq_flags;
511 int size = sizeof(struct bputs_entry); 518 int size = sizeof(struct bputs_entry);
519 int pc;
520
521 if (!(trace_flags & TRACE_ITER_PRINTK))
522 return 0;
523
524 pc = preempt_count();
512 525
513 if (unlikely(tracing_selftest_running || tracing_disabled)) 526 if (unlikely(tracing_selftest_running || tracing_disabled))
514 return 0; 527 return 0;
@@ -516,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str)
516 local_save_flags(irq_flags); 529 local_save_flags(irq_flags);
517 buffer = global_trace.trace_buffer.buffer; 530 buffer = global_trace.trace_buffer.buffer;
518 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, 531 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
519 irq_flags, preempt_count()); 532 irq_flags, pc);
520 if (!event) 533 if (!event)
521 return 0; 534 return 0;
522 535
@@ -525,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str)
525 entry->str = str; 538 entry->str = str;
526 539
527 __buffer_unlock_commit(buffer, event); 540 __buffer_unlock_commit(buffer, event);
541 ftrace_trace_stack(buffer, irq_flags, 4, pc);
528 542
529 return 1; 543 return 1;
530} 544}
@@ -809,7 +823,7 @@ static struct {
809 { trace_clock_local, "local", 1 }, 823 { trace_clock_local, "local", 1 },
810 { trace_clock_global, "global", 1 }, 824 { trace_clock_global, "global", 1 },
811 { trace_clock_counter, "counter", 0 }, 825 { trace_clock_counter, "counter", 0 },
812 { trace_clock_jiffies, "uptime", 1 }, 826 { trace_clock_jiffies, "uptime", 0 },
813 { trace_clock, "perf", 1 }, 827 { trace_clock, "perf", 1 },
814 ARCH_TRACE_CLOCKS 828 ARCH_TRACE_CLOCKS
815}; 829};
@@ -1396,7 +1410,6 @@ void tracing_start(void)
1396 1410
1397 arch_spin_unlock(&global_trace.max_lock); 1411 arch_spin_unlock(&global_trace.max_lock);
1398 1412
1399 ftrace_start();
1400 out: 1413 out:
1401 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); 1414 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1402} 1415}
@@ -1443,7 +1456,6 @@ void tracing_stop(void)
1443 struct ring_buffer *buffer; 1456 struct ring_buffer *buffer;
1444 unsigned long flags; 1457 unsigned long flags;
1445 1458
1446 ftrace_stop();
1447 raw_spin_lock_irqsave(&global_trace.start_lock, flags); 1459 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1448 if (global_trace.stop_count++) 1460 if (global_trace.stop_count++)
1449 goto out; 1461 goto out;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 26dc348332b7..57b67b1f24d1 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void)
59 59
60/* 60/*
61 * trace_jiffy_clock(): Simply use jiffies as a clock counter. 61 * trace_jiffy_clock(): Simply use jiffies as a clock counter.
62 * Note that this use of jiffies_64 is not completely safe on
63 * 32-bit systems. But the window is tiny, and the effect if
64 * we are affected is that we will have an obviously bogus
65 * timestamp on a trace event - i.e. not life threatening.
62 */ 66 */
63u64 notrace trace_clock_jiffies(void) 67u64 notrace trace_clock_jiffies(void)
64{ 68{
65 u64 jiffy = jiffies - INITIAL_JIFFIES; 69 return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
66
67 /* Return nsecs */
68 return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
69} 70}
70 71
71/* 72/*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f99e0b3bca8c..2de53628689f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
470 470
471 list_del(&file->list); 471 list_del(&file->list);
472 remove_subsystem(file->system); 472 remove_subsystem(file->system);
473 free_event_filter(file->filter);
473 kmem_cache_free(file_cachep, file); 474 kmem_cache_free(file_cachep, file);
474} 475}
475 476
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 04fdb5de823c..3c9b97e6b1f4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -893,6 +893,9 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
893 int ret; 893 int ret;
894 894
895 if (file) { 895 if (file) {
896 if (tu->tp.flags & TP_FLAG_PROFILE)
897 return -EINTR;
898
896 link = kmalloc(sizeof(*link), GFP_KERNEL); 899 link = kmalloc(sizeof(*link), GFP_KERNEL);
897 if (!link) 900 if (!link)
898 return -ENOMEM; 901 return -ENOMEM;
@@ -901,29 +904,40 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
901 list_add_tail_rcu(&link->list, &tu->tp.files); 904 list_add_tail_rcu(&link->list, &tu->tp.files);
902 905
903 tu->tp.flags |= TP_FLAG_TRACE; 906 tu->tp.flags |= TP_FLAG_TRACE;
904 } else 907 } else {
905 tu->tp.flags |= TP_FLAG_PROFILE; 908 if (tu->tp.flags & TP_FLAG_TRACE)
909 return -EINTR;
906 910
907 ret = uprobe_buffer_enable(); 911 tu->tp.flags |= TP_FLAG_PROFILE;
908 if (ret < 0) 912 }
909 return ret;
910 913
911 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 914 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
912 915
913 if (enabled) 916 if (enabled)
914 return 0; 917 return 0;
915 918
919 ret = uprobe_buffer_enable();
920 if (ret)
921 goto err_flags;
922
916 tu->consumer.filter = filter; 923 tu->consumer.filter = filter;
917 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); 924 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
918 if (ret) { 925 if (ret)
919 if (file) { 926 goto err_buffer;
920 list_del(&link->list);
921 kfree(link);
922 tu->tp.flags &= ~TP_FLAG_TRACE;
923 } else
924 tu->tp.flags &= ~TP_FLAG_PROFILE;
925 }
926 927
928 return 0;
929
930 err_buffer:
931 uprobe_buffer_disable();
932
933 err_flags:
934 if (file) {
935 list_del(&link->list);
936 kfree(link);
937 tu->tp.flags &= ~TP_FLAG_TRACE;
938 } else {
939 tu->tp.flags &= ~TP_FLAG_PROFILE;
940 }
927 return ret; 941 return ret;
928} 942}
929 943
@@ -1201,12 +1215,6 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
1201 1215
1202 current->utask->vaddr = (unsigned long) &udd; 1216 current->utask->vaddr = (unsigned long) &udd;
1203 1217
1204#ifdef CONFIG_PERF_EVENTS
1205 if ((tu->tp.flags & TP_FLAG_TRACE) == 0 &&
1206 !uprobe_perf_filter(&tu->consumer, 0, current->mm))
1207 return UPROBE_HANDLER_REMOVE;
1208#endif
1209
1210 if (WARN_ON_ONCE(!uprobe_cpu_buffer)) 1218 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1211 return 0; 1219 return 0;
1212 1220
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 33cbd8c203f8..3490407dc7b7 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -492,33 +492,29 @@ static int sys_tracepoint_refcount;
492 492
493void syscall_regfunc(void) 493void syscall_regfunc(void)
494{ 494{
495 unsigned long flags; 495 struct task_struct *p, *t;
496 struct task_struct *g, *t;
497 496
498 if (!sys_tracepoint_refcount) { 497 if (!sys_tracepoint_refcount) {
499 read_lock_irqsave(&tasklist_lock, flags); 498 read_lock(&tasklist_lock);
500 do_each_thread(g, t) { 499 for_each_process_thread(p, t) {
501 /* Skip kernel threads. */ 500 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
502 if (t->mm) 501 }
503 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); 502 read_unlock(&tasklist_lock);
504 } while_each_thread(g, t);
505 read_unlock_irqrestore(&tasklist_lock, flags);
506 } 503 }
507 sys_tracepoint_refcount++; 504 sys_tracepoint_refcount++;
508} 505}
509 506
510void syscall_unregfunc(void) 507void syscall_unregfunc(void)
511{ 508{
512 unsigned long flags; 509 struct task_struct *p, *t;
513 struct task_struct *g, *t;
514 510
515 sys_tracepoint_refcount--; 511 sys_tracepoint_refcount--;
516 if (!sys_tracepoint_refcount) { 512 if (!sys_tracepoint_refcount) {
517 read_lock_irqsave(&tasklist_lock, flags); 513 read_lock(&tasklist_lock);
518 do_each_thread(g, t) { 514 for_each_process_thread(p, t) {
519 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); 515 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
520 } while_each_thread(g, t); 516 }
521 read_unlock_irqrestore(&tasklist_lock, flags); 517 read_unlock(&tasklist_lock);
522 } 518 }
523} 519}
524#endif 520#endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 516203e665fc..c3319bd1b040 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,12 @@
31 31
32int watchdog_user_enabled = 1; 32int watchdog_user_enabled = 1;
33int __read_mostly watchdog_thresh = 10; 33int __read_mostly watchdog_thresh = 10;
34#ifdef CONFIG_SMP
35int __read_mostly sysctl_softlockup_all_cpu_backtrace;
36#else
37#define sysctl_softlockup_all_cpu_backtrace 0
38#endif
39
34static int __read_mostly watchdog_running; 40static int __read_mostly watchdog_running;
35static u64 __read_mostly sample_period; 41static u64 __read_mostly sample_period;
36 42
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
47static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 53static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
48static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 54static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
49#endif 55#endif
56static unsigned long soft_lockup_nmi_warn;
50 57
51/* boot commands */ 58/* boot commands */
52/* 59/*
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str)
95} 102}
96__setup("nosoftlockup", nosoftlockup_setup); 103__setup("nosoftlockup", nosoftlockup_setup);
97/* */ 104/* */
105#ifdef CONFIG_SMP
106static int __init softlockup_all_cpu_backtrace_setup(char *str)
107{
108 sysctl_softlockup_all_cpu_backtrace =
109 !!simple_strtol(str, NULL, 0);
110 return 1;
111}
112__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
113#endif
98 114
99/* 115/*
100 * Hard-lockup warnings should be triggered after just a few seconds. Soft- 116 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
271 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); 287 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
272 struct pt_regs *regs = get_irq_regs(); 288 struct pt_regs *regs = get_irq_regs();
273 int duration; 289 int duration;
290 int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
274 291
275 /* kick the hardlockup detector */ 292 /* kick the hardlockup detector */
276 watchdog_interrupt_count(); 293 watchdog_interrupt_count();
@@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
317 if (__this_cpu_read(soft_watchdog_warn) == true) 334 if (__this_cpu_read(soft_watchdog_warn) == true)
318 return HRTIMER_RESTART; 335 return HRTIMER_RESTART;
319 336
337 if (softlockup_all_cpu_backtrace) {
338 /* Prevent multiple soft-lockup reports if one cpu is already
339 * engaged in dumping cpu back traces
340 */
341 if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
342 /* Someone else will report us. Let's give up */
343 __this_cpu_write(soft_watchdog_warn, true);
344 return HRTIMER_RESTART;
345 }
346 }
347
320 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 348 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
321 smp_processor_id(), duration, 349 smp_processor_id(), duration,
322 current->comm, task_pid_nr(current)); 350 current->comm, task_pid_nr(current));
@@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
327 else 355 else
328 dump_stack(); 356 dump_stack();
329 357
358 if (softlockup_all_cpu_backtrace) {
359 /* Avoid generating two back traces for current
360 * given that one is already made above
361 */
362 trigger_allbutself_cpu_backtrace();
363
364 clear_bit(0, &soft_lockup_nmi_warn);
365 /* Barrier to sync with other cpus */
366 smp_mb__after_atomic();
367 }
368
330 if (softlockup_panic) 369 if (softlockup_panic)
331 panic("softlockup: hung tasks"); 370 panic("softlockup: hung tasks");
332 __this_cpu_write(soft_watchdog_warn, true); 371 __this_cpu_write(soft_watchdog_warn, true);
@@ -527,10 +566,8 @@ static void update_timers_all_cpus(void)
527 int cpu; 566 int cpu;
528 567
529 get_online_cpus(); 568 get_online_cpus();
530 preempt_disable();
531 for_each_online_cpu(cpu) 569 for_each_online_cpu(cpu)
532 update_timers(cpu); 570 update_timers(cpu);
533 preempt_enable();
534 put_online_cpus(); 571 put_online_cpus();
535} 572}
536 573
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6203d2900877..35974ac69600 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3284,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
3284 } 3284 }
3285 } 3285 }
3286 3286
3287 dev_set_uevent_suppress(&wq_dev->dev, false);
3287 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); 3288 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
3288 return 0; 3289 return 0;
3289} 3290}
@@ -4879,7 +4880,7 @@ static void __init wq_numa_init(void)
4879 BUG_ON(!tbl); 4880 BUG_ON(!tbl);
4880 4881
4881 for_each_node(node) 4882 for_each_node(node)
4882 BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, 4883 BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
4883 node_online(node) ? node : NUMA_NO_NODE)); 4884 node_online(node) ? node : NUMA_NO_NODE));
4884 4885
4885 for_each_possible_cpu(cpu) { 4886 for_each_possible_cpu(cpu) {