Diffstat (limited to 'kernel')
48 files changed, 1296 insertions, 484 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 35536d9c0964..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,9 +220,16 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE | |||
220 | 220 | ||
221 | endif | 221 | endif |
222 | 222 | ||
223 | config ARCH_SUPPORTS_ATOMIC_RMW | ||
224 | bool | ||
225 | |||
223 | config MUTEX_SPIN_ON_OWNER | 226 | config MUTEX_SPIN_ON_OWNER |
224 | def_bool y | 227 | def_bool y |
225 | depends on SMP && !DEBUG_MUTEXES | 228 | depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW |
229 | |||
230 | config RWSEM_SPIN_ON_OWNER | ||
231 | def_bool y | ||
232 | depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW | ||
226 | 233 | ||
227 | config ARCH_USE_QUEUE_RWLOCK | 234 | config ARCH_USE_QUEUE_RWLOCK |
228 | bool | 235 | bool |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7868fc3c0bc5..70776aec2562 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1648 | int flags, const char *unused_dev_name, | 1648 | int flags, const char *unused_dev_name, |
1649 | void *data) | 1649 | void *data) |
1650 | { | 1650 | { |
1651 | struct super_block *pinned_sb = NULL; | ||
1652 | struct cgroup_subsys *ss; | ||
1651 | struct cgroup_root *root; | 1653 | struct cgroup_root *root; |
1652 | struct cgroup_sb_opts opts; | 1654 | struct cgroup_sb_opts opts; |
1653 | struct dentry *dentry; | 1655 | struct dentry *dentry; |
1654 | int ret; | 1656 | int ret; |
1657 | int i; | ||
1655 | bool new_sb; | 1658 | bool new_sb; |
1656 | 1659 | ||
1657 | /* | 1660 | /* |
@@ -1677,6 +1680,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1677 | goto out_unlock; | 1680 | goto out_unlock; |
1678 | } | 1681 | } |
1679 | 1682 | ||
1683 | /* | ||
1684 | * Destruction of cgroup root is asynchronous, so subsystems may | ||
1685 | * still be dying after the previous unmount. Let's drain the | ||
1686 | * dying subsystems. We just need to ensure that the ones | ||
1687 | * unmounted previously finish dying and don't care about new ones | ||
1688 | * starting. Testing ref liveliness is good enough. | ||
1689 | */ | ||
1690 | for_each_subsys(ss, i) { | ||
1691 | if (!(opts.subsys_mask & (1 << i)) || | ||
1692 | ss->root == &cgrp_dfl_root) | ||
1693 | continue; | ||
1694 | |||
1695 | if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { | ||
1696 | mutex_unlock(&cgroup_mutex); | ||
1697 | msleep(10); | ||
1698 | ret = restart_syscall(); | ||
1699 | goto out_free; | ||
1700 | } | ||
1701 | cgroup_put(&ss->root->cgrp); | ||
1702 | } | ||
1703 | |||
1680 | for_each_root(root) { | 1704 | for_each_root(root) { |
1681 | bool name_match = false; | 1705 | bool name_match = false; |
1682 | 1706 | ||
@@ -1717,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1717 | } | 1741 | } |
1718 | 1742 | ||
1719 | /* | 1743 | /* |
1720 | * A root's lifetime is governed by its root cgroup. | 1744 | * We want to reuse @root whose lifetime is governed by its |
1721 | * tryget_live failure indicate that the root is being | 1745 | * ->cgrp. Let's check whether @root is alive and keep it |
1722 | * destroyed. Wait for destruction to complete so that the | 1746 | * that way. As cgroup_kill_sb() can happen anytime, we |
1723 | * subsystems are free. We can use wait_queue for the wait | 1747 | * want to block it by pinning the sb so that @root doesn't |
1724 | * but this path is super cold. Let's just sleep for a bit | 1748 | * get killed before mount is complete. |
1725 | * and retry. | 1749 | * |
1750 | * With the sb pinned, tryget_live can reliably indicate | ||
1751 | * whether @root can be reused. If it's being killed, | ||
1752 | * drain it. We can use wait_queue for the wait but this | ||
1753 | * path is super cold. Let's just sleep a bit and retry. | ||
1726 | */ | 1754 | */ |
1727 | if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { | 1755 | pinned_sb = kernfs_pin_sb(root->kf_root, NULL); |
1756 | if (IS_ERR(pinned_sb) || | ||
1757 | !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { | ||
1728 | mutex_unlock(&cgroup_mutex); | 1758 | mutex_unlock(&cgroup_mutex); |
1759 | if (!IS_ERR_OR_NULL(pinned_sb)) | ||
1760 | deactivate_super(pinned_sb); | ||
1729 | msleep(10); | 1761 | msleep(10); |
1730 | ret = restart_syscall(); | 1762 | ret = restart_syscall(); |
1731 | goto out_free; | 1763 | goto out_free; |
@@ -1770,6 +1802,16 @@ out_free: | |||
1770 | CGROUP_SUPER_MAGIC, &new_sb); | 1802 | CGROUP_SUPER_MAGIC, &new_sb); |
1771 | if (IS_ERR(dentry) || !new_sb) | 1803 | if (IS_ERR(dentry) || !new_sb) |
1772 | cgroup_put(&root->cgrp); | 1804 | cgroup_put(&root->cgrp); |
1805 | |||
1806 | /* | ||
1807 | * If @pinned_sb, we're reusing an existing root and holding an | ||
1808 | * extra ref on its sb. Mount is complete. Put the extra ref. | ||
1809 | */ | ||
1810 | if (pinned_sb) { | ||
1811 | WARN_ON(new_sb); | ||
1812 | deactivate_super(pinned_sb); | ||
1813 | } | ||
1814 | |||
1773 | return dentry; | 1815 | return dentry; |
1774 | } | 1816 | } |
1775 | 1817 | ||
@@ -3328,7 +3370,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css) | |||
3328 | 3370 | ||
3329 | rcu_read_lock(); | 3371 | rcu_read_lock(); |
3330 | css_for_each_child(child, css) { | 3372 | css_for_each_child(child, css) { |
3331 | if (css->flags & CSS_ONLINE) { | 3373 | if (child->flags & CSS_ONLINE) { |
3332 | ret = true; | 3374 | ret = true; |
3333 | break; | 3375 | break; |
3334 | } | 3376 | } |
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 019d45008448..5664985c46a0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/hardirq.h> | 20 | #include <linux/hardirq.h> |
21 | #include <linux/export.h> | 21 | #include <linux/export.h> |
22 | #include <linux/kprobes.h> | ||
22 | 23 | ||
23 | #define CREATE_TRACE_POINTS | 24 | #define CREATE_TRACE_POINTS |
24 | #include <trace/events/context_tracking.h> | 25 | #include <trace/events/context_tracking.h> |
@@ -104,6 +105,7 @@ void context_tracking_user_enter(void) | |||
104 | } | 105 | } |
105 | local_irq_restore(flags); | 106 | local_irq_restore(flags); |
106 | } | 107 | } |
108 | NOKPROBE_SYMBOL(context_tracking_user_enter); | ||
107 | 109 | ||
108 | #ifdef CONFIG_PREEMPT | 110 | #ifdef CONFIG_PREEMPT |
109 | /** | 111 | /** |
@@ -181,6 +183,7 @@ void context_tracking_user_exit(void) | |||
181 | } | 183 | } |
182 | local_irq_restore(flags); | 184 | local_irq_restore(flags); |
183 | } | 185 | } |
186 | NOKPROBE_SYMBOL(context_tracking_user_exit); | ||
184 | 187 | ||
185 | /** | 188 | /** |
186 | * __context_tracking_task_switch - context switch the syscall callbacks | 189 | * __context_tracking_task_switch - context switch the syscall callbacks |
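The two NOKPROBE_SYMBOL() lines added above keep kprobes off the user-enter/exit context-tracking paths, which can themselves run as part of kprobe handling and would otherwise recurse. For readers unfamiliar with the macro, here is a minimal sketch of the same annotation pattern; the helper name is invented for illustration, only <linux/kprobes.h> and NOKPROBE_SYMBOL() are real:

#include <linux/kprobes.h>

/* Hypothetical helper that may be reached from a kprobe/trap path;
 * NOKPROBE_SYMBOL() prevents a probe from being planted on it. */
static int my_nokprobe_helper(unsigned long addr)
{
	return addr != 0;
}
NOKPROBE_SYMBOL(my_nokprobe_helper);

The macro is placed at file scope right after the function body, exactly as the hunks above do for context_tracking_user_enter() and context_tracking_user_exit().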
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f6b33c696224..116a4164720a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1181,7 +1181,13 @@ done: | |||
1181 | 1181 | ||
1182 | int current_cpuset_is_being_rebound(void) | 1182 | int current_cpuset_is_being_rebound(void) |
1183 | { | 1183 | { |
1184 | return task_cs(current) == cpuset_being_rebound; | 1184 | int ret; |
1185 | |||
1186 | rcu_read_lock(); | ||
1187 | ret = task_cs(current) == cpuset_being_rebound; | ||
1188 | rcu_read_unlock(); | ||
1189 | |||
1190 | return ret; | ||
1185 | } | 1191 | } |
1186 | 1192 | ||
1187 | static int update_relax_domain_level(struct cpuset *cs, s64 val) | 1193 | static int update_relax_domain_level(struct cpuset *cs, s64 val) |
@@ -1617,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, | |||
1617 | * resources, wait for the previously scheduled operations before | 1623 | * resources, wait for the previously scheduled operations before |
1618 | * proceeding, so that we don't end up keep removing tasks added | 1624 | * proceeding, so that we don't end up keep removing tasks added |
1619 | * after execution capability is restored. | 1625 | * after execution capability is restored. |
1626 | * | ||
1627 | * cpuset_hotplug_work calls back into cgroup core via | ||
1628 | * cgroup_transfer_tasks() and waiting for it from a cgroupfs | ||
1629 | * operation like this one can lead to a deadlock through kernfs | ||
1630 | * active_ref protection. Let's break the protection. Losing the | ||
1631 | * protection is okay as we check whether @cs is online after | ||
1632 | * grabbing cpuset_mutex anyway. This only happens on the legacy | ||
1633 | * hierarchies. | ||
1620 | */ | 1634 | */ |
1635 | css_get(&cs->css); | ||
1636 | kernfs_break_active_protection(of->kn); | ||
1621 | flush_work(&cpuset_hotplug_work); | 1637 | flush_work(&cpuset_hotplug_work); |
1622 | 1638 | ||
1623 | mutex_lock(&cpuset_mutex); | 1639 | mutex_lock(&cpuset_mutex); |
@@ -1645,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, | |||
1645 | free_trial_cpuset(trialcs); | 1661 | free_trial_cpuset(trialcs); |
1646 | out_unlock: | 1662 | out_unlock: |
1647 | mutex_unlock(&cpuset_mutex); | 1663 | mutex_unlock(&cpuset_mutex); |
1664 | kernfs_unbreak_active_protection(of->kn); | ||
1665 | css_put(&cs->css); | ||
1648 | return retval ?: nbytes; | 1666 | return retval ?: nbytes; |
1649 | } | 1667 | } |
1650 | 1668 | ||
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5fa58e4cffac..6b17ac1b0c2a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/mm_types.h> | 40 | #include <linux/mm_types.h> |
41 | #include <linux/cgroup.h> | 41 | #include <linux/cgroup.h> |
42 | #include <linux/module.h> | 42 | #include <linux/module.h> |
43 | #include <linux/mman.h> | ||
43 | 44 | ||
44 | #include "internal.h" | 45 | #include "internal.h" |
45 | 46 | ||
@@ -2319,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
2319 | next_parent = rcu_dereference(next_ctx->parent_ctx); | 2320 | next_parent = rcu_dereference(next_ctx->parent_ctx); |
2320 | 2321 | ||
2321 | /* If neither context have a parent context; they cannot be clones. */ | 2322 | /* If neither context have a parent context; they cannot be clones. */ |
2322 | if (!parent && !next_parent) | 2323 | if (!parent || !next_parent) |
2323 | goto unlock; | 2324 | goto unlock; |
2324 | 2325 | ||
2325 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { | 2326 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { |
@@ -5128,6 +5129,7 @@ struct perf_mmap_event { | |||
5128 | int maj, min; | 5129 | int maj, min; |
5129 | u64 ino; | 5130 | u64 ino; |
5130 | u64 ino_generation; | 5131 | u64 ino_generation; |
5132 | u32 prot, flags; | ||
5131 | 5133 | ||
5132 | struct { | 5134 | struct { |
5133 | struct perf_event_header header; | 5135 | struct perf_event_header header; |
@@ -5169,6 +5171,8 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
5169 | mmap_event->event_id.header.size += sizeof(mmap_event->min); | 5171 | mmap_event->event_id.header.size += sizeof(mmap_event->min); |
5170 | mmap_event->event_id.header.size += sizeof(mmap_event->ino); | 5172 | mmap_event->event_id.header.size += sizeof(mmap_event->ino); |
5171 | mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); | 5173 | mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); |
5174 | mmap_event->event_id.header.size += sizeof(mmap_event->prot); | ||
5175 | mmap_event->event_id.header.size += sizeof(mmap_event->flags); | ||
5172 | } | 5176 | } |
5173 | 5177 | ||
5174 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | 5178 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); |
@@ -5187,6 +5191,8 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
5187 | perf_output_put(&handle, mmap_event->min); | 5191 | perf_output_put(&handle, mmap_event->min); |
5188 | perf_output_put(&handle, mmap_event->ino); | 5192 | perf_output_put(&handle, mmap_event->ino); |
5189 | perf_output_put(&handle, mmap_event->ino_generation); | 5193 | perf_output_put(&handle, mmap_event->ino_generation); |
5194 | perf_output_put(&handle, mmap_event->prot); | ||
5195 | perf_output_put(&handle, mmap_event->flags); | ||
5190 | } | 5196 | } |
5191 | 5197 | ||
5192 | __output_copy(&handle, mmap_event->file_name, | 5198 | __output_copy(&handle, mmap_event->file_name, |
@@ -5205,6 +5211,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
5205 | struct file *file = vma->vm_file; | 5211 | struct file *file = vma->vm_file; |
5206 | int maj = 0, min = 0; | 5212 | int maj = 0, min = 0; |
5207 | u64 ino = 0, gen = 0; | 5213 | u64 ino = 0, gen = 0; |
5214 | u32 prot = 0, flags = 0; | ||
5208 | unsigned int size; | 5215 | unsigned int size; |
5209 | char tmp[16]; | 5216 | char tmp[16]; |
5210 | char *buf = NULL; | 5217 | char *buf = NULL; |
@@ -5235,6 +5242,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
5235 | gen = inode->i_generation; | 5242 | gen = inode->i_generation; |
5236 | maj = MAJOR(dev); | 5243 | maj = MAJOR(dev); |
5237 | min = MINOR(dev); | 5244 | min = MINOR(dev); |
5245 | |||
5246 | if (vma->vm_flags & VM_READ) | ||
5247 | prot |= PROT_READ; | ||
5248 | if (vma->vm_flags & VM_WRITE) | ||
5249 | prot |= PROT_WRITE; | ||
5250 | if (vma->vm_flags & VM_EXEC) | ||
5251 | prot |= PROT_EXEC; | ||
5252 | |||
5253 | if (vma->vm_flags & VM_MAYSHARE) | ||
5254 | flags = MAP_SHARED; | ||
5255 | else | ||
5256 | flags = MAP_PRIVATE; | ||
5257 | |||
5258 | if (vma->vm_flags & VM_DENYWRITE) | ||
5259 | flags |= MAP_DENYWRITE; | ||
5260 | if (vma->vm_flags & VM_MAYEXEC) | ||
5261 | flags |= MAP_EXECUTABLE; | ||
5262 | if (vma->vm_flags & VM_LOCKED) | ||
5263 | flags |= MAP_LOCKED; | ||
5264 | if (vma->vm_flags & VM_HUGETLB) | ||
5265 | flags |= MAP_HUGETLB; | ||
5266 | |||
5238 | goto got_name; | 5267 | goto got_name; |
5239 | } else { | 5268 | } else { |
5240 | name = (char *)arch_vma_name(vma); | 5269 | name = (char *)arch_vma_name(vma); |
@@ -5275,6 +5304,8 @@ got_name: | |||
5275 | mmap_event->min = min; | 5304 | mmap_event->min = min; |
5276 | mmap_event->ino = ino; | 5305 | mmap_event->ino = ino; |
5277 | mmap_event->ino_generation = gen; | 5306 | mmap_event->ino_generation = gen; |
5307 | mmap_event->prot = prot; | ||
5308 | mmap_event->flags = flags; | ||
5278 | 5309 | ||
5279 | if (!(vma->vm_flags & VM_EXEC)) | 5310 | if (!(vma->vm_flags & VM_EXEC)) |
5280 | mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; | 5311 | mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; |
@@ -5315,6 +5346,8 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
5315 | /* .min (attr_mmap2 only) */ | 5346 | /* .min (attr_mmap2 only) */ |
5316 | /* .ino (attr_mmap2 only) */ | 5347 | /* .ino (attr_mmap2 only) */ |
5317 | /* .ino_generation (attr_mmap2 only) */ | 5348 | /* .ino_generation (attr_mmap2 only) */ |
5349 | /* .prot (attr_mmap2 only) */ | ||
5350 | /* .flags (attr_mmap2 only) */ | ||
5318 | }; | 5351 | }; |
5319 | 5352 | ||
5320 | perf_event_mmap_event(&mmap_event); | 5353 | perf_event_mmap_event(&mmap_event); |
@@ -6897,10 +6930,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
6897 | if (ret) | 6930 | if (ret) |
6898 | return -EFAULT; | 6931 | return -EFAULT; |
6899 | 6932 | ||
6900 | /* disabled for now */ | ||
6901 | if (attr->mmap2) | ||
6902 | return -EINVAL; | ||
6903 | |||
6904 | if (attr->__reserved_1) | 6933 | if (attr->__reserved_1) |
6905 | return -EINVAL; | 6934 | return -EINVAL; |
6906 | 6935 | ||
@@ -7429,7 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
7429 | struct perf_event_context *child_ctx, | 7458 | struct perf_event_context *child_ctx, |
7430 | struct task_struct *child) | 7459 | struct task_struct *child) |
7431 | { | 7460 | { |
7432 | perf_remove_from_context(child_event, true); | 7461 | /* |
7462 | * Do not destroy the 'original' grouping; because of the context | ||
7463 | * switch optimization the original events could've ended up in a | ||
7464 | * random child task. | ||
7465 | * | ||
7466 | * If we were to destroy the original group, all group related | ||
7467 | * operations would cease to function properly after this random | ||
7468 | * child dies. | ||
7469 | * | ||
7470 | * Do destroy all inherited groups, we don't care about those | ||
7471 | * and being thorough is better. | ||
7472 | */ | ||
7473 | perf_remove_from_context(child_event, !!child_event->parent); | ||
7433 | 7474 | ||
7434 | /* | 7475 | /* |
7435 | * It can happen that the parent exits first, and has events | 7476 | * It can happen that the parent exits first, and has events |
@@ -7445,7 +7486,7 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
7445 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | 7486 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
7446 | { | 7487 | { |
7447 | struct perf_event *child_event, *next; | 7488 | struct perf_event *child_event, *next; |
7448 | struct perf_event_context *child_ctx; | 7489 | struct perf_event_context *child_ctx, *parent_ctx; |
7449 | unsigned long flags; | 7490 | unsigned long flags; |
7450 | 7491 | ||
7451 | if (likely(!child->perf_event_ctxp[ctxn])) { | 7492 | if (likely(!child->perf_event_ctxp[ctxn])) { |
@@ -7470,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
7470 | raw_spin_lock(&child_ctx->lock); | 7511 | raw_spin_lock(&child_ctx->lock); |
7471 | task_ctx_sched_out(child_ctx); | 7512 | task_ctx_sched_out(child_ctx); |
7472 | child->perf_event_ctxp[ctxn] = NULL; | 7513 | child->perf_event_ctxp[ctxn] = NULL; |
7514 | |||
7515 | /* | ||
7516 | * In order to avoid freeing: child_ctx->parent_ctx->task | ||
7517 | * under perf_event_context::lock, grab another reference. | ||
7518 | */ | ||
7519 | parent_ctx = child_ctx->parent_ctx; | ||
7520 | if (parent_ctx) | ||
7521 | get_ctx(parent_ctx); | ||
7522 | |||
7473 | /* | 7523 | /* |
7474 | * If this context is a clone; unclone it so it can't get | 7524 | * If this context is a clone; unclone it so it can't get |
7475 | * swapped to another process while we're removing all | 7525 | * swapped to another process while we're removing all |
@@ -7480,6 +7530,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
7480 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | 7530 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); |
7481 | 7531 | ||
7482 | /* | 7532 | /* |
7533 | * Now that we no longer hold perf_event_context::lock, drop | ||
7534 | * our extra child_ctx->parent_ctx reference. | ||
7535 | */ | ||
7536 | if (parent_ctx) | ||
7537 | put_ctx(parent_ctx); | ||
7538 | |||
7539 | /* | ||
7483 | * Report the task dead after unscheduling the events so that we | 7540 | * Report the task dead after unscheduling the events so that we |
7484 | * won't get any samples after PERF_RECORD_EXIT. We can however still | 7541 | * won't get any samples after PERF_RECORD_EXIT. We can however still |
7485 | * get a few PERF_RECORD_READ events. | 7542 | * get a few PERF_RECORD_READ events. |
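With the hunks above, PERF_RECORD_MMAP2 samples now carry the mapping's protection and flags, filled in from vma->vm_flags using the ordinary PROT_* / MAP_* values, and the "disabled for now" gate on attr.mmap2 is removed. A user-space consumer can therefore interpret the two new u32 fields with nothing more than <sys/mman.h>. A small, hypothetical decoding helper (how the fields are pulled out of the ring buffer is not shown; only the PROT_/MAP_ constants are standard):

#include <stdio.h>
#include <stdint.h>
#include <sys/mman.h>

/* Render the prot/flags of an mmap2 record in /proc/<pid>/maps style. */
static void describe_mapping(uint32_t prot, uint32_t flags)
{
	printf("%c%c%c%c\n",
	       (prot & PROT_READ)   ? 'r' : '-',
	       (prot & PROT_WRITE)  ? 'w' : '-',
	       (prot & PROT_EXEC)   ? 'x' : '-',
	       (flags & MAP_SHARED) ? 's' : 'p');
}

For a typical text mapping the kernel-side code above produces prot = PROT_READ | PROT_EXEC and flags = MAP_PRIVATE (plus MAP_DENYWRITE/MAP_EXECUTABLE when the vma flags warrant it), which this helper prints as "r-xp".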
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c445e392e93f..6f3254e8c137 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -846,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u | |||
846 | { | 846 | { |
847 | int err; | 847 | int err; |
848 | 848 | ||
849 | if (!consumer_del(uprobe, uc)) /* WARN? */ | 849 | if (WARN_ON(!consumer_del(uprobe, uc))) |
850 | return; | 850 | return; |
851 | 851 | ||
852 | err = register_for_each_vma(uprobe, NULL); | 852 | err = register_for_each_vma(uprobe, NULL); |
@@ -927,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset, | |||
927 | int ret = -ENOENT; | 927 | int ret = -ENOENT; |
928 | 928 | ||
929 | uprobe = find_uprobe(inode, offset); | 929 | uprobe = find_uprobe(inode, offset); |
930 | if (!uprobe) | 930 | if (WARN_ON(!uprobe)) |
931 | return ret; | 931 | return ret; |
932 | 932 | ||
933 | down_write(&uprobe->register_rwsem); | 933 | down_write(&uprobe->register_rwsem); |
@@ -952,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume | |||
952 | struct uprobe *uprobe; | 952 | struct uprobe *uprobe; |
953 | 953 | ||
954 | uprobe = find_uprobe(inode, offset); | 954 | uprobe = find_uprobe(inode, offset); |
955 | if (!uprobe) | 955 | if (WARN_ON(!uprobe)) |
956 | return; | 956 | return; |
957 | 957 | ||
958 | down_write(&uprobe->register_rwsem); | 958 | down_write(&uprobe->register_rwsem); |
diff --git a/kernel/fork.c b/kernel/fork.c
index d2799d1fc952..6a13c46cd87d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1487 | 1487 | ||
1488 | total_forks++; | 1488 | total_forks++; |
1489 | spin_unlock(¤t->sighand->siglock); | 1489 | spin_unlock(¤t->sighand->siglock); |
1490 | syscall_tracepoint_update(p); | ||
1490 | write_unlock_irq(&tasklist_lock); | 1491 | write_unlock_irq(&tasklist_lock); |
1492 | |||
1491 | proc_fork_connector(p); | 1493 | proc_fork_connector(p); |
1492 | cgroup_post_fork(p); | 1494 | cgroup_post_fork(p); |
1493 | if (clone_flags & CLONE_THREAD) | 1495 | if (clone_flags & CLONE_THREAD) |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 7339e42a85ab..1487a123db5c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -455,9 +455,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); | |||
455 | */ | 455 | */ |
456 | void irq_free_hwirqs(unsigned int from, int cnt) | 456 | void irq_free_hwirqs(unsigned int from, int cnt) |
457 | { | 457 | { |
458 | int i; | 458 | int i, j; |
459 | 459 | ||
460 | for (i = from; cnt > 0; i++, cnt--) { | 460 | for (i = from, j = cnt; j > 0; i++, j--) { |
461 | irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); | 461 | irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); |
462 | arch_teardown_hwirq(i); | 462 | arch_teardown_hwirq(i); |
463 | } | 463 | } |
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6748688813d0..369f41a94124 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1617 | #ifdef CONFIG_MEMORY_FAILURE | 1617 | #ifdef CONFIG_MEMORY_FAILURE |
1618 | VMCOREINFO_NUMBER(PG_hwpoison); | 1618 | VMCOREINFO_NUMBER(PG_hwpoison); |
1619 | #endif | 1619 | #endif |
1620 | VMCOREINFO_NUMBER(PG_head_mask); | ||
1620 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | 1621 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); |
1621 | 1622 | ||
1622 | arch_crash_save_vmcoreinfo(); | 1623 | arch_crash_save_vmcoreinfo(); |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3214289df5a7..734e9a7d280b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2037,19 +2037,23 @@ static int __init populate_kprobe_blacklist(unsigned long *start, | |||
2037 | { | 2037 | { |
2038 | unsigned long *iter; | 2038 | unsigned long *iter; |
2039 | struct kprobe_blacklist_entry *ent; | 2039 | struct kprobe_blacklist_entry *ent; |
2040 | unsigned long offset = 0, size = 0; | 2040 | unsigned long entry, offset = 0, size = 0; |
2041 | 2041 | ||
2042 | for (iter = start; iter < end; iter++) { | 2042 | for (iter = start; iter < end; iter++) { |
2043 | if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { | 2043 | entry = arch_deref_entry_point((void *)*iter); |
2044 | pr_err("Failed to find blacklist %p\n", (void *)*iter); | 2044 | |
2045 | if (!kernel_text_address(entry) || | ||
2046 | !kallsyms_lookup_size_offset(entry, &size, &offset)) { | ||
2047 | pr_err("Failed to find blacklist at %p\n", | ||
2048 | (void *)entry); | ||
2045 | continue; | 2049 | continue; |
2046 | } | 2050 | } |
2047 | 2051 | ||
2048 | ent = kmalloc(sizeof(*ent), GFP_KERNEL); | 2052 | ent = kmalloc(sizeof(*ent), GFP_KERNEL); |
2049 | if (!ent) | 2053 | if (!ent) |
2050 | return -ENOMEM; | 2054 | return -ENOMEM; |
2051 | ent->start_addr = *iter; | 2055 | ent->start_addr = entry; |
2052 | ent->end_addr = *iter + size; | 2056 | ent->end_addr = entry + size; |
2053 | INIT_LIST_HEAD(&ent->list); | 2057 | INIT_LIST_HEAD(&ent->list); |
2054 | list_add_tail(&ent->list, &kprobe_blacklist); | 2058 | list_add_tail(&ent->list, &kprobe_blacklist); |
2055 | } | 2059 | } |
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..be9ee1559fca 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,21 +14,47 @@ | |||
14 | * called from interrupt context and we have preemption disabled while | 14 | * called from interrupt context and we have preemption disabled while |
15 | * spinning. | 15 | * spinning. |
16 | */ | 16 | */ |
17 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); | 17 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); |
18 | |||
19 | /* | ||
20 | * We use the value 0 to represent "no CPU", thus the encoded value | ||
21 | * will be the CPU number incremented by 1. | ||
22 | */ | ||
23 | static inline int encode_cpu(int cpu_nr) | ||
24 | { | ||
25 | return cpu_nr + 1; | ||
26 | } | ||
27 | |||
28 | static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) | ||
29 | { | ||
30 | int cpu_nr = encoded_cpu_val - 1; | ||
31 | |||
32 | return per_cpu_ptr(&osq_node, cpu_nr); | ||
33 | } | ||
18 | 34 | ||
19 | /* | 35 | /* |
20 | * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. | 36 | * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. |
21 | * Can return NULL in case we were the last queued and we updated @lock instead. | 37 | * Can return NULL in case we were the last queued and we updated @lock instead. |
22 | */ | 38 | */ |
23 | static inline struct optimistic_spin_queue * | 39 | static inline struct optimistic_spin_node * |
24 | osq_wait_next(struct optimistic_spin_queue **lock, | 40 | osq_wait_next(struct optimistic_spin_queue *lock, |
25 | struct optimistic_spin_queue *node, | 41 | struct optimistic_spin_node *node, |
26 | struct optimistic_spin_queue *prev) | 42 | struct optimistic_spin_node *prev) |
27 | { | 43 | { |
28 | struct optimistic_spin_queue *next = NULL; | 44 | struct optimistic_spin_node *next = NULL; |
45 | int curr = encode_cpu(smp_processor_id()); | ||
46 | int old; | ||
47 | |||
48 | /* | ||
49 | * If there is a prev node in queue, then the 'old' value will be | ||
50 | * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if | ||
51 | * we're currently last in queue, then the queue will then become empty. | ||
52 | */ | ||
53 | old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; | ||
29 | 54 | ||
30 | for (;;) { | 55 | for (;;) { |
31 | if (*lock == node && cmpxchg(lock, node, prev) == node) { | 56 | if (atomic_read(&lock->tail) == curr && |
57 | atomic_cmpxchg(&lock->tail, curr, old) == curr) { | ||
32 | /* | 58 | /* |
33 | * We were the last queued, we moved @lock back. @prev | 59 | * We were the last queued, we moved @lock back. @prev |
34 | * will now observe @lock and will complete its | 60 | * will now observe @lock and will complete its |
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_queue **lock, | |||
59 | return next; | 85 | return next; |
60 | } | 86 | } |
61 | 87 | ||
62 | bool osq_lock(struct optimistic_spin_queue **lock) | 88 | bool osq_lock(struct optimistic_spin_queue *lock) |
63 | { | 89 | { |
64 | struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); | 90 | struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); |
65 | struct optimistic_spin_queue *prev, *next; | 91 | struct optimistic_spin_node *prev, *next; |
92 | int curr = encode_cpu(smp_processor_id()); | ||
93 | int old; | ||
66 | 94 | ||
67 | node->locked = 0; | 95 | node->locked = 0; |
68 | node->next = NULL; | 96 | node->next = NULL; |
97 | node->cpu = curr; | ||
69 | 98 | ||
70 | node->prev = prev = xchg(lock, node); | 99 | old = atomic_xchg(&lock->tail, curr); |
71 | if (likely(prev == NULL)) | 100 | if (old == OSQ_UNLOCKED_VAL) |
72 | return true; | 101 | return true; |
73 | 102 | ||
103 | prev = decode_cpu(old); | ||
104 | node->prev = prev; | ||
74 | ACCESS_ONCE(prev->next) = node; | 105 | ACCESS_ONCE(prev->next) = node; |
75 | 106 | ||
76 | /* | 107 | /* |
@@ -149,20 +180,21 @@ unqueue: | |||
149 | return false; | 180 | return false; |
150 | } | 181 | } |
151 | 182 | ||
152 | void osq_unlock(struct optimistic_spin_queue **lock) | 183 | void osq_unlock(struct optimistic_spin_queue *lock) |
153 | { | 184 | { |
154 | struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); | 185 | struct optimistic_spin_node *node, *next; |
155 | struct optimistic_spin_queue *next; | 186 | int curr = encode_cpu(smp_processor_id()); |
156 | 187 | ||
157 | /* | 188 | /* |
158 | * Fast path for the uncontended case. | 189 | * Fast path for the uncontended case. |
159 | */ | 190 | */ |
160 | if (likely(cmpxchg(lock, node, NULL) == node)) | 191 | if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) |
161 | return; | 192 | return; |
162 | 193 | ||
163 | /* | 194 | /* |
164 | * Second most likely case. | 195 | * Second most likely case. |
165 | */ | 196 | */ |
197 | node = this_cpu_ptr(&osq_node); | ||
166 | next = xchg(&node->next, NULL); | 198 | next = xchg(&node->next, NULL); |
167 | if (next) { | 199 | if (next) { |
168 | ACCESS_ONCE(next->locked) = 1; | 200 | ACCESS_ONCE(next->locked) = 1; |
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..74356dc0ce29 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
118 | * mutex_lock()/rwsem_down_{read,write}() etc. | 118 | * mutex_lock()/rwsem_down_{read,write}() etc. |
119 | */ | 119 | */ |
120 | 120 | ||
121 | struct optimistic_spin_queue { | 121 | struct optimistic_spin_node { |
122 | struct optimistic_spin_queue *next, *prev; | 122 | struct optimistic_spin_node *next, *prev; |
123 | int locked; /* 1 if lock acquired */ | 123 | int locked; /* 1 if lock acquired */ |
124 | int cpu; /* encoded CPU # value */ | ||
124 | }; | 125 | }; |
125 | 126 | ||
126 | extern bool osq_lock(struct optimistic_spin_queue **lock); | 127 | extern bool osq_lock(struct optimistic_spin_queue *lock); |
127 | extern void osq_unlock(struct optimistic_spin_queue **lock); | 128 | extern void osq_unlock(struct optimistic_spin_queue *lock); |
128 | 129 | ||
129 | #endif /* __LINUX_MCS_SPINLOCK_H */ | 130 | #endif /* __LINUX_MCS_SPINLOCK_H */ |
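The mcs_spinlock changes above stop storing a per-lock pointer to the tail node and instead keep an encoded CPU number (cpu + 1, with 0 meaning "no CPU queued") in an atomic_t inside struct optimistic_spin_queue; the spin nodes themselves stay in the per-CPU osq_node array and are found again via decode_cpu(). The definition of that slimmed-down queue type, of OSQ_UNLOCKED_VAL, and of the osq_lock_init() used by mutex.c below lives in a header outside kernel/, so it is not part of this kernel-only diffstat. A rough reconstruction, inferred from the callers above and offered only as a reading aid rather than the authoritative header:

/* Inferred shape of the osq type used above; 0 == unlocked/empty queue. */
#define OSQ_UNLOCKED_VAL	(0)

struct optimistic_spin_queue {
	/* Encoded CPU number (cpu_nr + 1) of the tail node, or OSQ_UNLOCKED_VAL. */
	atomic_t tail;
};

static inline void osq_lock_init(struct optimistic_spin_queue *lock)
{
	atomic_set(&lock->tail, OSQ_UNLOCKED_VAL);
}

Replacing the pointer with an int is what lets osq_lock()/osq_unlock() above manipulate the tail with plain atomic_xchg()/atomic_cmpxchg(), and it keeps the field embedded in mutexes and rwsems down to a single 32-bit word.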
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..acca2c1a3c5e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | |||
60 | INIT_LIST_HEAD(&lock->wait_list); | 60 | INIT_LIST_HEAD(&lock->wait_list); |
61 | mutex_clear_owner(lock); | 61 | mutex_clear_owner(lock); |
62 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 62 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
63 | lock->osq = NULL; | 63 | osq_lock_init(&lock->osq); |
64 | #endif | 64 | #endif |
65 | 65 | ||
66 | debug_mutex_init(lock, name, key); | 66 | debug_mutex_init(lock, name, key); |
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d78..ab29b6a22669 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, | |||
31 | { | 31 | { |
32 | return (waiter != NULL); | 32 | return (waiter != NULL); |
33 | } | 33 | } |
34 | |||
35 | static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) | ||
36 | { | ||
37 | debug_rt_mutex_print_deadlock(w); | ||
38 | } | ||
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a620d4d08ca6..fc605941b9b8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | |||
83 | owner = *p; | 83 | owner = *p; |
84 | } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); | 84 | } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); |
85 | } | 85 | } |
86 | |||
87 | /* | ||
88 | * Safe fastpath aware unlock: | ||
89 | * 1) Clear the waiters bit | ||
90 | * 2) Drop lock->wait_lock | ||
91 | * 3) Try to unlock the lock with cmpxchg | ||
92 | */ | ||
93 | static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) | ||
94 | __releases(lock->wait_lock) | ||
95 | { | ||
96 | struct task_struct *owner = rt_mutex_owner(lock); | ||
97 | |||
98 | clear_rt_mutex_waiters(lock); | ||
99 | raw_spin_unlock(&lock->wait_lock); | ||
100 | /* | ||
101 | * If a new waiter comes in between the unlock and the cmpxchg | ||
102 | * we have two situations: | ||
103 | * | ||
104 | * unlock(wait_lock); | ||
105 | * lock(wait_lock); | ||
106 | * cmpxchg(p, owner, 0) == owner | ||
107 | * mark_rt_mutex_waiters(lock); | ||
108 | * acquire(lock); | ||
109 | * or: | ||
110 | * | ||
111 | * unlock(wait_lock); | ||
112 | * lock(wait_lock); | ||
113 | * mark_rt_mutex_waiters(lock); | ||
114 | * | ||
115 | * cmpxchg(p, owner, 0) != owner | ||
116 | * enqueue_waiter(); | ||
117 | * unlock(wait_lock); | ||
118 | * lock(wait_lock); | ||
119 | * wake waiter(); | ||
120 | * unlock(wait_lock); | ||
121 | * lock(wait_lock); | ||
122 | * acquire(lock); | ||
123 | */ | ||
124 | return rt_mutex_cmpxchg(lock, owner, NULL); | ||
125 | } | ||
126 | |||
86 | #else | 127 | #else |
87 | # define rt_mutex_cmpxchg(l,c,n) (0) | 128 | # define rt_mutex_cmpxchg(l,c,n) (0) |
88 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | 129 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) |
@@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | |||
90 | lock->owner = (struct task_struct *) | 131 | lock->owner = (struct task_struct *) |
91 | ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); | 132 | ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); |
92 | } | 133 | } |
134 | |||
135 | /* | ||
136 | * Simple slow path only version: lock->owner is protected by lock->wait_lock. | ||
137 | */ | ||
138 | static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) | ||
139 | __releases(lock->wait_lock) | ||
140 | { | ||
141 | lock->owner = NULL; | ||
142 | raw_spin_unlock(&lock->wait_lock); | ||
143 | return true; | ||
144 | } | ||
93 | #endif | 145 | #endif |
94 | 146 | ||
95 | static inline int | 147 | static inline int |
@@ -260,27 +312,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task) | |||
260 | */ | 312 | */ |
261 | int max_lock_depth = 1024; | 313 | int max_lock_depth = 1024; |
262 | 314 | ||
315 | static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) | ||
316 | { | ||
317 | return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; | ||
318 | } | ||
319 | |||
263 | /* | 320 | /* |
264 | * Adjust the priority chain. Also used for deadlock detection. | 321 | * Adjust the priority chain. Also used for deadlock detection. |
265 | * Decreases task's usage by one - may thus free the task. | 322 | * Decreases task's usage by one - may thus free the task. |
266 | * | 323 | * |
267 | * @task: the task owning the mutex (owner) for which a chain walk is probably | 324 | * @task: the task owning the mutex (owner) for which a chain walk is |
268 | * needed | 325 | * probably needed |
269 | * @deadlock_detect: do we have to carry out deadlock detection? | 326 | * @deadlock_detect: do we have to carry out deadlock detection? |
270 | * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck | 327 | * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck |
271 | * things for a task that has just got its priority adjusted, and | 328 | * things for a task that has just got its priority adjusted, and |
272 | * is waiting on a mutex) | 329 | * is waiting on a mutex) |
330 | * @next_lock: the mutex on which the owner of @orig_lock was blocked before | ||
331 | * we dropped its pi_lock. Is never dereferenced, only used for | ||
332 | * comparison to detect lock chain changes. | ||
273 | * @orig_waiter: rt_mutex_waiter struct for the task that has just donated | 333 | * @orig_waiter: rt_mutex_waiter struct for the task that has just donated |
274 | * its priority to the mutex owner (can be NULL in the case | 334 | * its priority to the mutex owner (can be NULL in the case |
275 | * depicted above or if the top waiter is gone away and we are | 335 | * depicted above or if the top waiter is gone away and we are |
276 | * actually deboosting the owner) | 336 | * actually deboosting the owner) |
277 | * @top_task: the current top waiter | 337 | * @top_task: the current top waiter |
278 | * | 338 | * |
279 | * Returns 0 or -EDEADLK. | 339 | * Returns 0 or -EDEADLK. |
280 | */ | 340 | */ |
281 | static int rt_mutex_adjust_prio_chain(struct task_struct *task, | 341 | static int rt_mutex_adjust_prio_chain(struct task_struct *task, |
282 | int deadlock_detect, | 342 | int deadlock_detect, |
283 | struct rt_mutex *orig_lock, | 343 | struct rt_mutex *orig_lock, |
344 | struct rt_mutex *next_lock, | ||
284 | struct rt_mutex_waiter *orig_waiter, | 345 | struct rt_mutex_waiter *orig_waiter, |
285 | struct task_struct *top_task) | 346 | struct task_struct *top_task) |
286 | { | 347 | { |
@@ -314,7 +375,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
314 | } | 375 | } |
315 | put_task_struct(task); | 376 | put_task_struct(task); |
316 | 377 | ||
317 | return deadlock_detect ? -EDEADLK : 0; | 378 | return -EDEADLK; |
318 | } | 379 | } |
319 | retry: | 380 | retry: |
320 | /* | 381 | /* |
@@ -339,6 +400,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
339 | goto out_unlock_pi; | 400 | goto out_unlock_pi; |
340 | 401 | ||
341 | /* | 402 | /* |
403 | * We dropped all locks after taking a refcount on @task, so | ||
404 | * the task might have moved on in the lock chain or even left | ||
405 | * the chain completely and blocks now on an unrelated lock or | ||
406 | * on @orig_lock. | ||
407 | * | ||
408 | * We stored the lock on which @task was blocked in @next_lock, | ||
409 | * so we can detect the chain change. | ||
410 | */ | ||
411 | if (next_lock != waiter->lock) | ||
412 | goto out_unlock_pi; | ||
413 | |||
414 | /* | ||
342 | * Drop out, when the task has no waiters. Note, | 415 | * Drop out, when the task has no waiters. Note, |
343 | * top_waiter can be NULL, when we are in the deboosting | 416 | * top_waiter can be NULL, when we are in the deboosting |
344 | * mode! | 417 | * mode! |
@@ -377,7 +450,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
377 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { | 450 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { |
378 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); | 451 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); |
379 | raw_spin_unlock(&lock->wait_lock); | 452 | raw_spin_unlock(&lock->wait_lock); |
380 | ret = deadlock_detect ? -EDEADLK : 0; | 453 | ret = -EDEADLK; |
381 | goto out_unlock_pi; | 454 | goto out_unlock_pi; |
382 | } | 455 | } |
383 | 456 | ||
@@ -422,11 +495,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
422 | __rt_mutex_adjust_prio(task); | 495 | __rt_mutex_adjust_prio(task); |
423 | } | 496 | } |
424 | 497 | ||
498 | /* | ||
499 | * Check whether the task which owns the current lock is pi | ||
500 | * blocked itself. If yes we store a pointer to the lock for | ||
501 | * the lock chain change detection above. After we dropped | ||
502 | * task->pi_lock next_lock cannot be dereferenced anymore. | ||
503 | */ | ||
504 | next_lock = task_blocked_on_lock(task); | ||
505 | |||
425 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 506 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
426 | 507 | ||
427 | top_waiter = rt_mutex_top_waiter(lock); | 508 | top_waiter = rt_mutex_top_waiter(lock); |
428 | raw_spin_unlock(&lock->wait_lock); | 509 | raw_spin_unlock(&lock->wait_lock); |
429 | 510 | ||
511 | /* | ||
512 | * We reached the end of the lock chain. Stop right here. No | ||
513 | * point to go back just to figure that out. | ||
514 | */ | ||
515 | if (!next_lock) | ||
516 | goto out_put_task; | ||
517 | |||
430 | if (!detect_deadlock && waiter != top_waiter) | 518 | if (!detect_deadlock && waiter != top_waiter) |
431 | goto out_put_task; | 519 | goto out_put_task; |
432 | 520 | ||
@@ -536,8 +624,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
536 | { | 624 | { |
537 | struct task_struct *owner = rt_mutex_owner(lock); | 625 | struct task_struct *owner = rt_mutex_owner(lock); |
538 | struct rt_mutex_waiter *top_waiter = waiter; | 626 | struct rt_mutex_waiter *top_waiter = waiter; |
539 | unsigned long flags; | 627 | struct rt_mutex *next_lock; |
540 | int chain_walk = 0, res; | 628 | int chain_walk = 0, res; |
629 | unsigned long flags; | ||
541 | 630 | ||
542 | /* | 631 | /* |
543 | * Early deadlock detection. We really don't want the task to | 632 | * Early deadlock detection. We really don't want the task to |
@@ -548,7 +637,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
548 | * which is wrong, as the other waiter is not in a deadlock | 637 | * which is wrong, as the other waiter is not in a deadlock |
549 | * situation. | 638 | * situation. |
550 | */ | 639 | */ |
551 | if (detect_deadlock && owner == task) | 640 | if (owner == task) |
552 | return -EDEADLK; | 641 | return -EDEADLK; |
553 | 642 | ||
554 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 643 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
@@ -569,20 +658,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
569 | if (!owner) | 658 | if (!owner) |
570 | return 0; | 659 | return 0; |
571 | 660 | ||
661 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | ||
572 | if (waiter == rt_mutex_top_waiter(lock)) { | 662 | if (waiter == rt_mutex_top_waiter(lock)) { |
573 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | ||
574 | rt_mutex_dequeue_pi(owner, top_waiter); | 663 | rt_mutex_dequeue_pi(owner, top_waiter); |
575 | rt_mutex_enqueue_pi(owner, waiter); | 664 | rt_mutex_enqueue_pi(owner, waiter); |
576 | 665 | ||
577 | __rt_mutex_adjust_prio(owner); | 666 | __rt_mutex_adjust_prio(owner); |
578 | if (owner->pi_blocked_on) | 667 | if (owner->pi_blocked_on) |
579 | chain_walk = 1; | 668 | chain_walk = 1; |
580 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); | 669 | } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { |
581 | } | ||
582 | else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) | ||
583 | chain_walk = 1; | 670 | chain_walk = 1; |
671 | } | ||
584 | 672 | ||
585 | if (!chain_walk) | 673 | /* Store the lock on which owner is blocked or NULL */ |
674 | next_lock = task_blocked_on_lock(owner); | ||
675 | |||
676 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
677 | /* | ||
678 | * Even if full deadlock detection is on, if the owner is not | ||
679 | * blocked itself, we can avoid finding this out in the chain | ||
680 | * walk. | ||
681 | */ | ||
682 | if (!chain_walk || !next_lock) | ||
586 | return 0; | 683 | return 0; |
587 | 684 | ||
588 | /* | 685 | /* |
@@ -594,8 +691,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
594 | 691 | ||
595 | raw_spin_unlock(&lock->wait_lock); | 692 | raw_spin_unlock(&lock->wait_lock); |
596 | 693 | ||
597 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, | 694 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, |
598 | task); | 695 | next_lock, waiter, task); |
599 | 696 | ||
600 | raw_spin_lock(&lock->wait_lock); | 697 | raw_spin_lock(&lock->wait_lock); |
601 | 698 | ||
@@ -605,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
605 | /* | 702 | /* |
606 | * Wake up the next waiter on the lock. | 703 | * Wake up the next waiter on the lock. |
607 | * | 704 | * |
608 | * Remove the top waiter from the current tasks waiter list and wake it up. | 705 | * Remove the top waiter from the current tasks pi waiter list and |
706 | * wake it up. | ||
609 | * | 707 | * |
610 | * Called with lock->wait_lock held. | 708 | * Called with lock->wait_lock held. |
611 | */ | 709 | */ |
@@ -626,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock) | |||
626 | */ | 724 | */ |
627 | rt_mutex_dequeue_pi(current, waiter); | 725 | rt_mutex_dequeue_pi(current, waiter); |
628 | 726 | ||
629 | rt_mutex_set_owner(lock, NULL); | 727 | /* |
728 | * As we are waking up the top waiter, and the waiter stays | ||
729 | * queued on the lock until it gets the lock, this lock | ||
730 | * obviously has waiters. Just set the bit here and this has | ||
731 | * the added benefit of forcing all new tasks into the | ||
732 | * slow path making sure no task of lower priority than | ||
733 | * the top waiter can steal this lock. | ||
734 | */ | ||
735 | lock->owner = (void *) RT_MUTEX_HAS_WAITERS; | ||
630 | 736 | ||
631 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); | 737 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); |
632 | 738 | ||
739 | /* | ||
740 | * It's safe to dereference waiter as it cannot go away as | ||
741 | * long as we hold lock->wait_lock. The waiter task needs to | ||
742 | * acquire it in order to dequeue the waiter. | ||
743 | */ | ||
633 | wake_up_process(waiter->task); | 744 | wake_up_process(waiter->task); |
634 | } | 745 | } |
635 | 746 | ||
@@ -644,8 +755,8 @@ static void remove_waiter(struct rt_mutex *lock, | |||
644 | { | 755 | { |
645 | int first = (waiter == rt_mutex_top_waiter(lock)); | 756 | int first = (waiter == rt_mutex_top_waiter(lock)); |
646 | struct task_struct *owner = rt_mutex_owner(lock); | 757 | struct task_struct *owner = rt_mutex_owner(lock); |
758 | struct rt_mutex *next_lock = NULL; | ||
647 | unsigned long flags; | 759 | unsigned long flags; |
648 | int chain_walk = 0; | ||
649 | 760 | ||
650 | raw_spin_lock_irqsave(¤t->pi_lock, flags); | 761 | raw_spin_lock_irqsave(¤t->pi_lock, flags); |
651 | rt_mutex_dequeue(lock, waiter); | 762 | rt_mutex_dequeue(lock, waiter); |
@@ -669,13 +780,13 @@ static void remove_waiter(struct rt_mutex *lock, | |||
669 | } | 780 | } |
670 | __rt_mutex_adjust_prio(owner); | 781 | __rt_mutex_adjust_prio(owner); |
671 | 782 | ||
672 | if (owner->pi_blocked_on) | 783 | /* Store the lock on which owner is blocked or NULL */ |
673 | chain_walk = 1; | 784 | next_lock = task_blocked_on_lock(owner); |
674 | 785 | ||
675 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); | 786 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); |
676 | } | 787 | } |
677 | 788 | ||
678 | if (!chain_walk) | 789 | if (!next_lock) |
679 | return; | 790 | return; |
680 | 791 | ||
681 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | 792 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ |
@@ -683,7 +794,7 @@ static void remove_waiter(struct rt_mutex *lock, | |||
683 | 794 | ||
684 | raw_spin_unlock(&lock->wait_lock); | 795 | raw_spin_unlock(&lock->wait_lock); |
685 | 796 | ||
686 | rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); | 797 | rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); |
687 | 798 | ||
688 | raw_spin_lock(&lock->wait_lock); | 799 | raw_spin_lock(&lock->wait_lock); |
689 | } | 800 | } |
@@ -696,6 +807,7 @@ static void remove_waiter(struct rt_mutex *lock, | |||
696 | void rt_mutex_adjust_pi(struct task_struct *task) | 807 | void rt_mutex_adjust_pi(struct task_struct *task) |
697 | { | 808 | { |
698 | struct rt_mutex_waiter *waiter; | 809 | struct rt_mutex_waiter *waiter; |
810 | struct rt_mutex *next_lock; | ||
699 | unsigned long flags; | 811 | unsigned long flags; |
700 | 812 | ||
701 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 813 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
@@ -706,12 +818,13 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
706 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 818 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
707 | return; | 819 | return; |
708 | } | 820 | } |
709 | 821 | next_lock = waiter->lock; | |
710 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 822 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
711 | 823 | ||
712 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | 824 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ |
713 | get_task_struct(task); | 825 | get_task_struct(task); |
714 | rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); | 826 | |
827 | rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); | ||
715 | } | 828 | } |
716 | 829 | ||
717 | /** | 830 | /** |
@@ -763,6 +876,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
763 | return ret; | 876 | return ret; |
764 | } | 877 | } |
765 | 878 | ||
879 | static void rt_mutex_handle_deadlock(int res, int detect_deadlock, | ||
880 | struct rt_mutex_waiter *w) | ||
881 | { | ||
882 | /* | ||
883 | * If the result is not -EDEADLOCK or the caller requested | ||
884 | * deadlock detection, nothing to do here. | ||
885 | */ | ||
886 | if (res != -EDEADLOCK || detect_deadlock) | ||
887 | return; | ||
888 | |||
889 | /* | ||
890 | * Yell lowdly and stop the task right here. | ||
891 | */ | ||
892 | rt_mutex_print_deadlock(w); | ||
893 | while (1) { | ||
894 | set_current_state(TASK_INTERRUPTIBLE); | ||
895 | schedule(); | ||
896 | } | ||
897 | } | ||
898 | |||
766 | /* | 899 | /* |
767 | * Slow path lock function: | 900 | * Slow path lock function: |
768 | */ | 901 | */ |
@@ -802,8 +935,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
802 | 935 | ||
803 | set_current_state(TASK_RUNNING); | 936 | set_current_state(TASK_RUNNING); |
804 | 937 | ||
805 | if (unlikely(ret)) | 938 | if (unlikely(ret)) { |
806 | remove_waiter(lock, &waiter); | 939 | remove_waiter(lock, &waiter); |
940 | rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); | ||
941 | } | ||
807 | 942 | ||
808 | /* | 943 | /* |
809 | * try_to_take_rt_mutex() sets the waiter bit | 944 | * try_to_take_rt_mutex() sets the waiter bit |
@@ -859,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock) | |||
859 | 994 | ||
860 | rt_mutex_deadlock_account_unlock(current); | 995 | rt_mutex_deadlock_account_unlock(current); |
861 | 996 | ||
862 | if (!rt_mutex_has_waiters(lock)) { | 997 | /* |
863 | lock->owner = NULL; | 998 | * We must be careful here if the fast path is enabled. If we |
864 | raw_spin_unlock(&lock->wait_lock); | 999 | * have no waiters queued we cannot set owner to NULL here |
865 | return; | 1000 | * because of: |
1001 | * | ||
1002 | * foo->lock->owner = NULL; | ||
1003 | * rtmutex_lock(foo->lock); <- fast path | ||
1004 | * free = atomic_dec_and_test(foo->refcnt); | ||
1005 | * rtmutex_unlock(foo->lock); <- fast path | ||
1006 | * if (free) | ||
1007 | * kfree(foo); | ||
1008 | * raw_spin_unlock(foo->lock->wait_lock); | ||
1009 | * | ||
1010 | * So for the fastpath enabled kernel: | ||
1011 | * | ||
1012 | * Nothing can set the waiters bit as long as we hold | ||
1013 | * lock->wait_lock. So we do the following sequence: | ||
1014 | * | ||
1015 | * owner = rt_mutex_owner(lock); | ||
1016 | * clear_rt_mutex_waiters(lock); | ||
1017 | * raw_spin_unlock(&lock->wait_lock); | ||
1018 | * if (cmpxchg(&lock->owner, owner, 0) == owner) | ||
1019 | * return; | ||
1020 | * goto retry; | ||
1021 | * | ||
1022 | * The fastpath disabled variant is simple as all access to | ||
1023 | * lock->owner is serialized by lock->wait_lock: | ||
1024 | * | ||
1025 | * lock->owner = NULL; | ||
1026 | * raw_spin_unlock(&lock->wait_lock); | ||
1027 | */ | ||
1028 | while (!rt_mutex_has_waiters(lock)) { | ||
1029 | /* Drops lock->wait_lock ! */ | ||
1030 | if (unlock_rt_mutex_safe(lock) == true) | ||
1031 | return; | ||
1032 | /* Relock the rtmutex and try again */ | ||
1033 | raw_spin_lock(&lock->wait_lock); | ||
866 | } | 1034 | } |
867 | 1035 | ||
1036 | /* | ||
1037 | * The wakeup next waiter path does not suffer from the above | ||
1038 | * race. See the comments there. | ||
1039 | */ | ||
868 | wakeup_next_waiter(lock); | 1040 | wakeup_next_waiter(lock); |
869 | 1041 | ||
870 | raw_spin_unlock(&lock->wait_lock); | 1042 | raw_spin_unlock(&lock->wait_lock); |
@@ -1112,7 +1284,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
1112 | return 1; | 1284 | return 1; |
1113 | } | 1285 | } |
1114 | 1286 | ||
1115 | ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); | 1287 | /* We enforce deadlock detection for futexes */ |
1288 | ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); | ||
1116 | 1289 | ||
1117 | if (ret && !rt_mutex_owner(lock)) { | 1290 | if (ret && !rt_mutex_owner(lock)) { |
1118 | /* | 1291 | /* |
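The long comment added to rt_mutex_slowunlock() above describes the race that unlock_rt_mutex_safe() exists to close: with the cmpxchg fast path compiled in, clearing lock->owner while still holding wait_lock lets another task lock, unlock and free the structure containing the rtmutex before the original unlocker is finished with it. Spelled out as a condensed, purely illustrative kernel-style fragment, where struct foo and foo_put() are invented names and only the rt_mutex/atomic/kfree calls are real APIs:

/* Illustrative refcounted object embedding an rtmutex (names invented). */
struct foo {
	struct rt_mutex	lock;
	atomic_t	refcnt;
};

static void foo_put(struct foo *foo)
{
	bool free;

	rt_mutex_lock(&foo->lock);	/* may take the inline fast path */
	free = atomic_dec_and_test(&foo->refcnt);
	rt_mutex_unlock(&foo->lock);	/* may take the inline fast path */
	if (free)
		kfree(foo);		/* the rtmutex is freed with it */
}

If a concurrent slow-path unlock had already set lock->owner to NULL but still held wait_lock, foo_put() could complete entirely through the fast path and kfree() the lock underneath it; the unlocker's final raw_spin_unlock(&lock->wait_lock) would then touch freed memory. unlock_rt_mutex_safe() therefore drops wait_lock first and only then attempts the owner cmpxchg, looping back if a waiter queued up in the window.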
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421d..f6a1f3c133b1 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -24,3 +24,8 @@ | |||
24 | #define debug_rt_mutex_print_deadlock(w) do { } while (0) | 24 | #define debug_rt_mutex_print_deadlock(w) do { } while (0) |
25 | #define debug_rt_mutex_detect_deadlock(w,d) (d) | 25 | #define debug_rt_mutex_detect_deadlock(w,d) (d) |
26 | #define debug_rt_mutex_reset_waiter(w) do { } while (0) | 26 | #define debug_rt_mutex_reset_waiter(w) do { } while (0) |
27 | |||
28 | static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) | ||
29 | { | ||
30 | WARN(1, "rtmutex deadlock detected\n"); | ||
31 | } | ||
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem) | |||
26 | unsigned long flags; | 26 | unsigned long flags; |
27 | 27 | ||
28 | if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { | 28 | if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { |
29 | ret = (sem->activity != 0); | 29 | ret = (sem->count != 0); |
30 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 30 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
31 | } | 31 | } |
32 | return ret; | 32 | return ret; |
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, | |||
46 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); | 46 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); |
47 | lockdep_init_map(&sem->dep_map, name, key, 0); | 47 | lockdep_init_map(&sem->dep_map, name, key, 0); |
48 | #endif | 48 | #endif |
49 | sem->activity = 0; | 49 | sem->count = 0; |
50 | raw_spin_lock_init(&sem->wait_lock); | 50 | raw_spin_lock_init(&sem->wait_lock); |
51 | INIT_LIST_HEAD(&sem->wait_list); | 51 | INIT_LIST_HEAD(&sem->wait_list); |
52 | } | 52 | } |
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) | |||
95 | waiter = list_entry(next, struct rwsem_waiter, list); | 95 | waiter = list_entry(next, struct rwsem_waiter, list); |
96 | } while (waiter->type != RWSEM_WAITING_FOR_WRITE); | 96 | } while (waiter->type != RWSEM_WAITING_FOR_WRITE); |
97 | 97 | ||
98 | sem->activity += woken; | 98 | sem->count += woken; |
99 | 99 | ||
100 | out: | 100 | out: |
101 | return sem; | 101 | return sem; |
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem) | |||
126 | 126 | ||
127 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 127 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
128 | 128 | ||
129 | if (sem->activity >= 0 && list_empty(&sem->wait_list)) { | 129 | if (sem->count >= 0 && list_empty(&sem->wait_list)) { |
130 | /* granted */ | 130 | /* granted */ |
131 | sem->activity++; | 131 | sem->count++; |
132 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 132 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
133 | goto out; | 133 | goto out; |
134 | } | 134 | } |
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem) | |||
170 | 170 | ||
171 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 171 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
172 | 172 | ||
173 | if (sem->activity >= 0 && list_empty(&sem->wait_list)) { | 173 | if (sem->count >= 0 && list_empty(&sem->wait_list)) { |
174 | /* granted */ | 174 | /* granted */ |
175 | sem->activity++; | 175 | sem->count++; |
176 | ret = 1; | 176 | ret = 1; |
177 | } | 177 | } |
178 | 178 | ||
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) | |||
206 | * itself into sleep and waiting for system woke it or someone | 206 | * itself into sleep and waiting for system woke it or someone |
207 | * else in the head of the wait list up. | 207 | * else in the head of the wait list up. |
208 | */ | 208 | */ |
209 | if (sem->activity == 0) | 209 | if (sem->count == 0) |
210 | break; | 210 | break; |
211 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 211 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
212 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 212 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) | |||
214 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 214 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
215 | } | 215 | } |
216 | /* got the lock */ | 216 | /* got the lock */ |
217 | sem->activity = -1; | 217 | sem->count = -1; |
218 | list_del(&waiter.list); | 218 | list_del(&waiter.list); |
219 | 219 | ||
220 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 220 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem) | |||
235 | 235 | ||
236 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 236 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
237 | 237 | ||
238 | if (sem->activity == 0) { | 238 | if (sem->count == 0) { |
239 | /* got the lock */ | 239 | /* got the lock */ |
240 | sem->activity = -1; | 240 | sem->count = -1; |
241 | ret = 1; | 241 | ret = 1; |
242 | } | 242 | } |
243 | 243 | ||
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem) | |||
255 | 255 | ||
256 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 256 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
257 | 257 | ||
258 | if (--sem->activity == 0 && !list_empty(&sem->wait_list)) | 258 | if (--sem->count == 0 && !list_empty(&sem->wait_list)) |
259 | sem = __rwsem_wake_one_writer(sem); | 259 | sem = __rwsem_wake_one_writer(sem); |
260 | 260 | ||
261 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 261 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem) | |||
270 | 270 | ||
271 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 271 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
272 | 272 | ||
273 | sem->activity = 0; | 273 | sem->count = 0; |
274 | if (!list_empty(&sem->wait_list)) | 274 | if (!list_empty(&sem->wait_list)) |
275 | sem = __rwsem_do_wake(sem, 1); | 275 | sem = __rwsem_do_wake(sem, 1); |
276 | 276 | ||
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem) | |||
287 | 287 | ||
288 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 288 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
289 | 289 | ||
290 | sem->activity = 1; | 290 | sem->count = 1; |
291 | if (!list_empty(&sem->wait_list)) | 291 | if (!list_empty(&sem->wait_list)) |
292 | sem = __rwsem_do_wake(sem, 0); | 292 | sem = __rwsem_do_wake(sem, 0); |
293 | 293 | ||
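The rename above is mechanical, but the encoding it exposes is worth spelling out: in the spinlock-based rwsem, 0 means unlocked, a positive count is the number of active readers, and -1 marks a single writer. A minimal userspace sketch of that accounting under a plain mutex (the toy_* names are illustrative, not the kernel API):

#include <assert.h>
#include <pthread.h>

/* Illustrative only: mirrors the sem->count encoding used above. */
struct toy_rwsem {
	pthread_mutex_t wait_lock;
	long count;		/* 0: free, >0: number of readers, -1: writer */
};

static int toy_down_read_trylock(struct toy_rwsem *sem)
{
	int got = 0;

	pthread_mutex_lock(&sem->wait_lock);
	if (sem->count >= 0) {		/* no writer active */
		sem->count++;
		got = 1;
	}
	pthread_mutex_unlock(&sem->wait_lock);
	return got;
}

static int toy_down_write_trylock(struct toy_rwsem *sem)
{
	int got = 0;

	pthread_mutex_lock(&sem->wait_lock);
	if (sem->count == 0) {		/* neither readers nor a writer */
		sem->count = -1;
		got = 1;
	}
	pthread_mutex_unlock(&sem->wait_lock);
	return got;
}

int main(void)
{
	struct toy_rwsem sem = { PTHREAD_MUTEX_INITIALIZER, 0 };

	assert(toy_down_read_trylock(&sem));	/* count becomes 1 */
	assert(!toy_down_write_trylock(&sem));	/* writer excluded */
	return 0;
}

The kernel variant additionally queues blocked tasks on sem->wait_list and wakes them explicitly, but the count transitions are the same before and after the rename.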
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index dacc32142fcc..a2391ac135c8 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
@@ -82,9 +82,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, | |||
82 | sem->count = RWSEM_UNLOCKED_VALUE; | 82 | sem->count = RWSEM_UNLOCKED_VALUE; |
83 | raw_spin_lock_init(&sem->wait_lock); | 83 | raw_spin_lock_init(&sem->wait_lock); |
84 | INIT_LIST_HEAD(&sem->wait_list); | 84 | INIT_LIST_HEAD(&sem->wait_list); |
85 | #ifdef CONFIG_SMP | 85 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
86 | sem->owner = NULL; | 86 | sem->owner = NULL; |
87 | sem->osq = NULL; | 87 | osq_lock_init(&sem->osq); |
88 | #endif | 88 | #endif |
89 | } | 89 | } |
90 | 90 | ||
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
262 | return false; | 262 | return false; |
263 | } | 263 | } |
264 | 264 | ||
265 | #ifdef CONFIG_SMP | 265 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
266 | /* | 266 | /* |
267 | * Try to acquire write lock before the writer has been put on wait queue. | 267 | * Try to acquire write lock before the writer has been put on wait queue. |
268 | */ | 268 | */ |
@@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | |||
285 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | 285 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) |
286 | { | 286 | { |
287 | struct task_struct *owner; | 287 | struct task_struct *owner; |
288 | bool on_cpu = true; | 288 | bool on_cpu = false; |
289 | 289 | ||
290 | if (need_resched()) | 290 | if (need_resched()) |
291 | return 0; | 291 | return false; |
292 | 292 | ||
293 | rcu_read_lock(); | 293 | rcu_read_lock(); |
294 | owner = ACCESS_ONCE(sem->owner); | 294 | owner = ACCESS_ONCE(sem->owner); |
@@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | |||
297 | rcu_read_unlock(); | 297 | rcu_read_unlock(); |
298 | 298 | ||
299 | /* | 299 | /* |
300 | * If sem->owner is not set, the rwsem owner may have | 300 | * If sem->owner is not set, yet we have just recently entered the |
301 | * just acquired it and not set the owner yet or the rwsem | 301 | * slowpath, then there is a possibility reader(s) may have the lock. |
302 | * has been released. | 302 | * To be safe, avoid spinning in these situations. |
303 | */ | 303 | */ |
304 | return on_cpu; | 304 | return on_cpu; |
305 | } | 305 | } |
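The behavioural change in rwsem_can_spin_on_owner() is the default when no owner is recorded: the old code assumed it could spin, the new code assumes it cannot, because a NULL owner in the slowpath may mean the lock is held by readers, which never set sem->owner. A standalone sketch of that decision, using stand-in types rather than the kernel's:

#include <stdbool.h>
#include <stddef.h>

struct toy_task { bool on_cpu; };
struct toy_rwsem { struct toy_task *owner; };	/* NULL: unknown or readers */

/* Illustrative decision only: spin solely when a writer owner is known
 * to be running; an unknown owner may mean readers hold the lock. */
static bool can_spin_on_owner(struct toy_rwsem *sem, bool need_resched)
{
	bool on_cpu = false;	/* was "true" before this change */

	if (need_resched)
		return false;

	if (sem->owner)
		on_cpu = sem->owner->on_cpu;

	return on_cpu;
}

int main(void)
{
	struct toy_task writer = { .on_cpu = true };
	struct toy_rwsem a = { .owner = &writer };
	struct toy_rwsem b = { .owner = NULL };

	return can_spin_on_owner(&a, false) && !can_spin_on_owner(&b, false) ? 0 : 1;
}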
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 42f806de49d4..e2d3bc7f03b4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
@@ -12,7 +12,7 @@ | |||
12 | 12 | ||
13 | #include <linux/atomic.h> | 13 | #include <linux/atomic.h> |
14 | 14 | ||
15 | #if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) | 15 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
16 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | 16 | static inline void rwsem_set_owner(struct rw_semaphore *sem) |
17 | { | 17 | { |
18 | sem->owner = current; | 18 | sem->owner = current; |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9a83d780facd..e4e4121fa327 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -253,9 +253,6 @@ config APM_EMULATION | |||
253 | anything, try disabling/enabling this option (or disabling/enabling | 253 | anything, try disabling/enabling this option (or disabling/enabling |
254 | APM in your BIOS). | 254 | APM in your BIOS). |
255 | 255 | ||
256 | config ARCH_HAS_OPP | ||
257 | bool | ||
258 | |||
259 | config PM_OPP | 256 | config PM_OPP |
260 | bool | 257 | bool |
261 | ---help--- | 258 | ---help--- |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 49e0a20fd010..fcc2611d3f14 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -35,6 +35,7 @@ | |||
35 | 35 | ||
36 | static int nocompress; | 36 | static int nocompress; |
37 | static int noresume; | 37 | static int noresume; |
38 | static int nohibernate; | ||
38 | static int resume_wait; | 39 | static int resume_wait; |
39 | static unsigned int resume_delay; | 40 | static unsigned int resume_delay; |
40 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 41 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
@@ -62,6 +63,11 @@ bool freezer_test_done; | |||
62 | 63 | ||
63 | static const struct platform_hibernation_ops *hibernation_ops; | 64 | static const struct platform_hibernation_ops *hibernation_ops; |
64 | 65 | ||
66 | bool hibernation_available(void) | ||
67 | { | ||
68 | return (nohibernate == 0); | ||
69 | } | ||
70 | |||
65 | /** | 71 | /** |
66 | * hibernation_set_ops - Set the global hibernate operations. | 72 | * hibernation_set_ops - Set the global hibernate operations. |
67 | * @ops: Hibernation operations to use in subsequent hibernation transitions. | 73 | * @ops: Hibernation operations to use in subsequent hibernation transitions. |
@@ -642,6 +648,11 @@ int hibernate(void) | |||
642 | { | 648 | { |
643 | int error; | 649 | int error; |
644 | 650 | ||
651 | if (!hibernation_available()) { | ||
652 | pr_debug("PM: Hibernation not available.\n"); | ||
653 | return -EPERM; | ||
654 | } | ||
655 | |||
645 | lock_system_sleep(); | 656 | lock_system_sleep(); |
646 | /* The snapshot device should not be opened while we're running */ | 657 | /* The snapshot device should not be opened while we're running */ |
647 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { | 658 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { |
@@ -734,7 +745,7 @@ static int software_resume(void) | |||
734 | /* | 745 | /* |
735 | * If the user said "noresume".. bail out early. | 746 | * If the user said "noresume".. bail out early. |
736 | */ | 747 | */ |
737 | if (noresume) | 748 | if (noresume || !hibernation_available()) |
738 | return 0; | 749 | return 0; |
739 | 750 | ||
740 | /* | 751 | /* |
@@ -900,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
900 | int i; | 911 | int i; |
901 | char *start = buf; | 912 | char *start = buf; |
902 | 913 | ||
914 | if (!hibernation_available()) | ||
915 | return sprintf(buf, "[disabled]\n"); | ||
916 | |||
903 | for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { | 917 | for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { |
904 | if (!hibernation_modes[i]) | 918 | if (!hibernation_modes[i]) |
905 | continue; | 919 | continue; |
@@ -934,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
934 | char *p; | 948 | char *p; |
935 | int mode = HIBERNATION_INVALID; | 949 | int mode = HIBERNATION_INVALID; |
936 | 950 | ||
951 | if (!hibernation_available()) | ||
952 | return -EPERM; | ||
953 | |||
937 | p = memchr(buf, '\n', n); | 954 | p = memchr(buf, '\n', n); |
938 | len = p ? p - buf : n; | 955 | len = p ? p - buf : n; |
939 | 956 | ||
@@ -1101,6 +1118,10 @@ static int __init hibernate_setup(char *str) | |||
1101 | noresume = 1; | 1118 | noresume = 1; |
1102 | else if (!strncmp(str, "nocompress", 10)) | 1119 | else if (!strncmp(str, "nocompress", 10)) |
1103 | nocompress = 1; | 1120 | nocompress = 1; |
1121 | else if (!strncmp(str, "no", 2)) { | ||
1122 | noresume = 1; | ||
1123 | nohibernate = 1; | ||
1124 | } | ||
1104 | return 1; | 1125 | return 1; |
1105 | } | 1126 | } |
1106 | 1127 | ||
@@ -1125,9 +1146,23 @@ static int __init resumedelay_setup(char *str) | |||
1125 | return 1; | 1146 | return 1; |
1126 | } | 1147 | } |
1127 | 1148 | ||
1149 | static int __init nohibernate_setup(char *str) | ||
1150 | { | ||
1151 | noresume = 1; | ||
1152 | nohibernate = 1; | ||
1153 | return 1; | ||
1154 | } | ||
1155 | |||
1156 | static int __init kaslr_nohibernate_setup(char *str) | ||
1157 | { | ||
1158 | return nohibernate_setup(str); | ||
1159 | } | ||
1160 | |||
1128 | __setup("noresume", noresume_setup); | 1161 | __setup("noresume", noresume_setup); |
1129 | __setup("resume_offset=", resume_offset_setup); | 1162 | __setup("resume_offset=", resume_offset_setup); |
1130 | __setup("resume=", resume_setup); | 1163 | __setup("resume=", resume_setup); |
1131 | __setup("hibernate=", hibernate_setup); | 1164 | __setup("hibernate=", hibernate_setup); |
1132 | __setup("resumewait", resumewait_setup); | 1165 | __setup("resumewait", resumewait_setup); |
1133 | __setup("resumedelay=", resumedelay_setup); | 1166 | __setup("resumedelay=", resumedelay_setup); |
1167 | __setup("nohibernate", nohibernate_setup); | ||
1168 | __setup("kaslr", kaslr_nohibernate_setup); | ||
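Both new __setup() handlers funnel into the same flag, and every hibernation entry point checks hibernation_available() before doing any work. A compressed userspace model of that gating, with parse_boot_param() and try_hibernate() as made-up stand-ins for the kernel paths:

#include <stdio.h>
#include <string.h>

/* Illustrative model of the gating added above: both "nohibernate" and
 * "kaslr" set the same flag, and every entry point checks it. */
static int nohibernate;

static int hibernation_available(void)
{
	return nohibernate == 0;
}

static void parse_boot_param(const char *param)
{
	if (!strcmp(param, "nohibernate") || !strcmp(param, "kaslr"))
		nohibernate = 1;
}

static int try_hibernate(void)
{
	if (!hibernation_available()) {
		printf("PM: Hibernation not available.\n");
		return -1;	/* the kernel returns -EPERM here */
	}
	printf("hibernating...\n");
	return 0;
}

int main(void)
{
	parse_boot_param("kaslr");	/* KASLR also disables hibernation */
	return try_hibernate() ? 0 : 1;
}

Tying "kaslr" to the same flag reflects the idea that a kernel loaded at a randomized address cannot reliably resume an image created by a kernel loaded elsewhere.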
diff --git a/kernel/power/main.c b/kernel/power/main.c index 573410d6647e..9a59d042ea84 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -296,25 +296,22 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
296 | suspend_state_t i; | 296 | suspend_state_t i; |
297 | 297 | ||
298 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) | 298 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) |
299 | if (pm_states[i].state) | 299 | if (pm_states[i]) |
300 | s += sprintf(s,"%s ", pm_states[i].label); | 300 | s += sprintf(s,"%s ", pm_states[i]); |
301 | 301 | ||
302 | #endif | 302 | #endif |
303 | #ifdef CONFIG_HIBERNATION | 303 | if (hibernation_available()) |
304 | s += sprintf(s, "%s\n", "disk"); | 304 | s += sprintf(s, "disk "); |
305 | #else | ||
306 | if (s != buf) | 305 | if (s != buf) |
307 | /* convert the last space to a newline */ | 306 | /* convert the last space to a newline */ |
308 | *(s-1) = '\n'; | 307 | *(s-1) = '\n'; |
309 | #endif | ||
310 | return (s - buf); | 308 | return (s - buf); |
311 | } | 309 | } |
312 | 310 | ||
313 | static suspend_state_t decode_state(const char *buf, size_t n) | 311 | static suspend_state_t decode_state(const char *buf, size_t n) |
314 | { | 312 | { |
315 | #ifdef CONFIG_SUSPEND | 313 | #ifdef CONFIG_SUSPEND |
316 | suspend_state_t state = PM_SUSPEND_MIN; | 314 | suspend_state_t state; |
317 | struct pm_sleep_state *s; | ||
318 | #endif | 315 | #endif |
319 | char *p; | 316 | char *p; |
320 | int len; | 317 | int len; |
@@ -327,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n) | |||
327 | return PM_SUSPEND_MAX; | 324 | return PM_SUSPEND_MAX; |
328 | 325 | ||
329 | #ifdef CONFIG_SUSPEND | 326 | #ifdef CONFIG_SUSPEND |
330 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) | 327 | for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { |
331 | if (s->state && len == strlen(s->label) | 328 | const char *label = pm_states[state]; |
332 | && !strncmp(buf, s->label, len)) | 329 | |
333 | return s->state; | 330 | if (label && len == strlen(label) && !strncmp(buf, label, len)) |
331 | return state; | ||
332 | } | ||
334 | #endif | 333 | #endif |
335 | 334 | ||
336 | return PM_SUSPEND_ON; | 335 | return PM_SUSPEND_ON; |
@@ -448,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj, | |||
448 | 447 | ||
449 | #ifdef CONFIG_SUSPEND | 448 | #ifdef CONFIG_SUSPEND |
450 | if (state < PM_SUSPEND_MAX) | 449 | if (state < PM_SUSPEND_MAX) |
451 | return sprintf(buf, "%s\n", pm_states[state].state ? | 450 | return sprintf(buf, "%s\n", pm_states[state] ? |
452 | pm_states[state].label : "error"); | 451 | pm_states[state] : "error"); |
453 | #endif | 452 | #endif |
454 | #ifdef CONFIG_HIBERNATION | 453 | #ifdef CONFIG_HIBERNATION |
455 | return sprintf(buf, "disk\n"); | 454 | return sprintf(buf, "disk\n"); |
@@ -617,7 +616,6 @@ static struct attribute_group attr_group = { | |||
617 | .attrs = g, | 616 | .attrs = g, |
618 | }; | 617 | }; |
619 | 618 | ||
620 | #ifdef CONFIG_PM_RUNTIME | ||
621 | struct workqueue_struct *pm_wq; | 619 | struct workqueue_struct *pm_wq; |
622 | EXPORT_SYMBOL_GPL(pm_wq); | 620 | EXPORT_SYMBOL_GPL(pm_wq); |
623 | 621 | ||
@@ -627,9 +625,6 @@ static int __init pm_start_workqueue(void) | |||
627 | 625 | ||
628 | return pm_wq ? 0 : -ENOMEM; | 626 | return pm_wq ? 0 : -ENOMEM; |
629 | } | 627 | } |
630 | #else | ||
631 | static inline int pm_start_workqueue(void) { return 0; } | ||
632 | #endif | ||
633 | 628 | ||
634 | static int __init pm_init(void) | 629 | static int __init pm_init(void) |
635 | { | 630 | { |
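With pm_states[] reduced to an array of label strings (NULL meaning the state is not valid), decode_state() becomes a plain label lookup. A self-contained sketch of that lookup; the enum values and labels here are chosen for illustration only:

#include <stddef.h>
#include <string.h>

/* Stand-in for the enum and table; indices and labels are illustrative. */
enum { SUSPEND_ON, SUSPEND_FREEZE, SUSPEND_STANDBY, SUSPEND_MEM, SUSPEND_MAX };
static const char *pm_states[SUSPEND_MAX];	/* NULL: state not valid */

static int decode_state(const char *buf, size_t len)
{
	int state;

	for (state = SUSPEND_FREEZE; state < SUSPEND_MAX; state++) {
		const char *label = pm_states[state];

		if (label && len == strlen(label) && !strncmp(buf, label, len))
			return state;
	}
	return SUSPEND_ON;	/* unrecognized input */
}

int main(void)
{
	pm_states[SUSPEND_FREEZE] = "freeze";
	pm_states[SUSPEND_MEM] = "mem";

	return decode_state("mem", 3) == SUSPEND_MEM ? 0 : 1;
}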
diff --git a/kernel/power/power.h b/kernel/power/power.h index c60f13b5270a..5d49dcac2537 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, | |||
178 | unsigned int, char *); | 178 | unsigned int, char *); |
179 | 179 | ||
180 | #ifdef CONFIG_SUSPEND | 180 | #ifdef CONFIG_SUSPEND |
181 | struct pm_sleep_state { | ||
182 | const char *label; | ||
183 | suspend_state_t state; | ||
184 | }; | ||
185 | |||
186 | /* kernel/power/suspend.c */ | 181 | /* kernel/power/suspend.c */ |
187 | extern struct pm_sleep_state pm_states[]; | 182 | extern const char *pm_states[]; |
188 | 183 | ||
189 | extern int suspend_devices_and_enter(suspend_state_t state); | 184 | extern int suspend_devices_and_enter(suspend_state_t state); |
190 | #else /* !CONFIG_SUSPEND */ | 185 | #else /* !CONFIG_SUSPEND */ |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 0ca8d83e2369..4ee194eb524b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -186,6 +186,7 @@ void thaw_processes(void) | |||
186 | 186 | ||
187 | printk("Restarting tasks ... "); | 187 | printk("Restarting tasks ... "); |
188 | 188 | ||
189 | __usermodehelper_set_disable_depth(UMH_FREEZING); | ||
189 | thaw_workqueues(); | 190 | thaw_workqueues(); |
190 | 191 | ||
191 | read_lock(&tasklist_lock); | 192 | read_lock(&tasklist_lock); |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1ea328aafdc9..4fc5c32422b3 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -248,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) | |||
248 | * information is stored (in the form of a block of bitmap) | 248 | * information is stored (in the form of a block of bitmap) |
249 | * It also contains the pfns that correspond to the start and end of | 249 | * It also contains the pfns that correspond to the start and end of |
250 | * the represented memory area. | 250 | * the represented memory area. |
251 | * | ||
252 | * The memory bitmap is organized as a radix tree to guarantee fast random | ||
253 | * access to the bits. There is one radix tree for each zone (as returned | ||
254 | * from create_mem_extents). | ||
255 | * | ||
256 | * One radix tree is represented by one struct mem_zone_bm_rtree. There are | ||
257 | * two linked lists for the nodes of the tree, one for the inner nodes and | ||
258 | * one for the leaf nodes. The linked leaf nodes are used for fast linear | ||
259 | * access of the memory bitmap. | ||
260 | * | ||
261 | * The struct rtree_node represents one node of the radix tree. | ||
251 | */ | 262 | */ |
252 | 263 | ||
253 | #define BM_END_OF_MAP (~0UL) | 264 | #define BM_END_OF_MAP (~0UL) |
254 | 265 | ||
255 | #define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) | 266 | #define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) |
267 | #define BM_BLOCK_SHIFT (PAGE_SHIFT + 3) | ||
268 | #define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1) | ||
256 | 269 | ||
257 | struct bm_block { | 270 | /* |
258 | struct list_head hook; /* hook into a list of bitmap blocks */ | 271 | * struct rtree_node is a wrapper struct to link the nodes |
259 | unsigned long start_pfn; /* pfn represented by the first bit */ | 272 | * of the rtree together for easy linear iteration over |
260 | unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ | 273 | * bits and easy freeing |
261 | unsigned long *data; /* bitmap representing pages */ | 274 | */ |
275 | struct rtree_node { | ||
276 | struct list_head list; | ||
277 | unsigned long *data; | ||
262 | }; | 278 | }; |
263 | 279 | ||
264 | static inline unsigned long bm_block_bits(struct bm_block *bb) | 280 | /* |
265 | { | 281 | * struct mem_zone_bm_rtree represents a bitmap used for one |
266 | return bb->end_pfn - bb->start_pfn; | 282 | * populated memory zone. |
267 | } | 283 | */ |
284 | struct mem_zone_bm_rtree { | ||
285 | struct list_head list; /* Link Zones together */ | ||
286 | struct list_head nodes; /* Radix Tree inner nodes */ | ||
287 | struct list_head leaves; /* Radix Tree leaves */ | ||
288 | unsigned long start_pfn; /* Zone start page frame */ | ||
289 | unsigned long end_pfn; /* Zone end page frame + 1 */ | ||
290 | struct rtree_node *rtree; /* Radix Tree Root */ | ||
291 | int levels; /* Number of Radix Tree Levels */ | ||
292 | unsigned int blocks; /* Number of Bitmap Blocks */ | ||
293 | }; | ||
268 | 294 | ||
269 | /* struct bm_position is used for browsing memory bitmaps */ | 295 | /* struct bm_position is used for browsing memory bitmaps */ |
270 | 296 | ||
271 | struct bm_position { | 297 | struct bm_position { |
272 | struct bm_block *block; | 298 | struct mem_zone_bm_rtree *zone; |
273 | int bit; | 299 | struct rtree_node *node; |
300 | unsigned long node_pfn; | ||
301 | int node_bit; | ||
274 | }; | 302 | }; |
275 | 303 | ||
276 | struct memory_bitmap { | 304 | struct memory_bitmap { |
277 | struct list_head blocks; /* list of bitmap blocks */ | 305 | struct list_head zones; |
278 | struct linked_page *p_list; /* list of pages used to store zone | 306 | struct linked_page *p_list; /* list of pages used to store zone |
279 | * bitmap objects and bitmap block | 307 | * bitmap objects and bitmap block |
280 | * objects | 308 | * objects |
@@ -284,38 +312,178 @@ struct memory_bitmap { | |||
284 | 312 | ||
285 | /* Functions that operate on memory bitmaps */ | 313 | /* Functions that operate on memory bitmaps */ |
286 | 314 | ||
287 | static void memory_bm_position_reset(struct memory_bitmap *bm) | 315 | #define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long)) |
316 | #if BITS_PER_LONG == 32 | ||
317 | #define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2) | ||
318 | #else | ||
319 | #define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3) | ||
320 | #endif | ||
321 | #define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) | ||
322 | |||
323 | /* | ||
324 | * alloc_rtree_node - Allocate a new node and add it to the radix tree. | ||
325 | * | ||
326 | * This function is used to allocate inner nodes as well as the | ||
327 | * leaf nodes of the radix tree. It also adds the node to the | ||
328 | * corresponding linked list passed in by the *list parameter. | ||
329 | */ | ||
330 | static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, | ||
331 | struct chain_allocator *ca, | ||
332 | struct list_head *list) | ||
288 | { | 333 | { |
289 | bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); | 334 | struct rtree_node *node; |
290 | bm->cur.bit = 0; | ||
291 | } | ||
292 | 335 | ||
293 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); | 336 | node = chain_alloc(ca, sizeof(struct rtree_node)); |
337 | if (!node) | ||
338 | return NULL; | ||
294 | 339 | ||
295 | /** | 340 | node->data = get_image_page(gfp_mask, safe_needed); |
296 | * create_bm_block_list - create a list of block bitmap objects | 341 | if (!node->data) |
297 | * @pages - number of pages to track | 342 | return NULL; |
298 | * @list - list to put the allocated blocks into | 343 | |
299 | * @ca - chain allocator to be used for allocating memory | 344 | list_add_tail(&node->list, list); |
345 | |||
346 | return node; | ||
347 | } | ||
348 | |||
349 | /* | ||
350 | * add_rtree_block - Add a new leave node to the radix tree | ||
351 | * | ||
352 | * The leave nodes need to be allocated in order to keep the leaves | ||
353 | * linked list in order. This is guaranteed by the zone->blocks | ||
354 | * counter. | ||
300 | */ | 355 | */ |
301 | static int create_bm_block_list(unsigned long pages, | 356 | static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, |
302 | struct list_head *list, | 357 | int safe_needed, struct chain_allocator *ca) |
303 | struct chain_allocator *ca) | ||
304 | { | 358 | { |
305 | unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); | 359 | struct rtree_node *node, *block, **dst; |
360 | unsigned int levels_needed, block_nr; | ||
361 | int i; | ||
306 | 362 | ||
307 | while (nr_blocks-- > 0) { | 363 | block_nr = zone->blocks; |
308 | struct bm_block *bb; | 364 | levels_needed = 0; |
309 | 365 | ||
310 | bb = chain_alloc(ca, sizeof(struct bm_block)); | 366 | /* How many levels do we need for this block nr? */ |
311 | if (!bb) | 367 | while (block_nr) { |
368 | levels_needed += 1; | ||
369 | block_nr >>= BM_RTREE_LEVEL_SHIFT; | ||
370 | } | ||
371 | |||
372 | /* Make sure the rtree has enough levels */ | ||
373 | for (i = zone->levels; i < levels_needed; i++) { | ||
374 | node = alloc_rtree_node(gfp_mask, safe_needed, ca, | ||
375 | &zone->nodes); | ||
376 | if (!node) | ||
312 | return -ENOMEM; | 377 | return -ENOMEM; |
313 | list_add(&bb->hook, list); | 378 | |
379 | node->data[0] = (unsigned long)zone->rtree; | ||
380 | zone->rtree = node; | ||
381 | zone->levels += 1; | ||
382 | } | ||
383 | |||
384 | /* Allocate new block */ | ||
385 | block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves); | ||
386 | if (!block) | ||
387 | return -ENOMEM; | ||
388 | |||
389 | /* Now walk the rtree to insert the block */ | ||
390 | node = zone->rtree; | ||
391 | dst = &zone->rtree; | ||
392 | block_nr = zone->blocks; | ||
393 | for (i = zone->levels; i > 0; i--) { | ||
394 | int index; | ||
395 | |||
396 | if (!node) { | ||
397 | node = alloc_rtree_node(gfp_mask, safe_needed, ca, | ||
398 | &zone->nodes); | ||
399 | if (!node) | ||
400 | return -ENOMEM; | ||
401 | *dst = node; | ||
402 | } | ||
403 | |||
404 | index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); | ||
405 | index &= BM_RTREE_LEVEL_MASK; | ||
406 | dst = (struct rtree_node **)&((*dst)->data[index]); | ||
407 | node = *dst; | ||
314 | } | 408 | } |
315 | 409 | ||
410 | zone->blocks += 1; | ||
411 | *dst = block; | ||
412 | |||
316 | return 0; | 413 | return 0; |
317 | } | 414 | } |
318 | 415 | ||
416 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, | ||
417 | int clear_nosave_free); | ||
418 | |||
419 | /* | ||
420 | * create_zone_bm_rtree - create a radix tree for one zone | ||
421 | * | ||
422 | * Allocated the mem_zone_bm_rtree structure and initializes it. | ||
423 | * This function also allocated and builds the radix tree for the | ||
424 | * zone. | ||
425 | */ | ||
426 | static struct mem_zone_bm_rtree * | ||
427 | create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, | ||
428 | struct chain_allocator *ca, | ||
429 | unsigned long start, unsigned long end) | ||
430 | { | ||
431 | struct mem_zone_bm_rtree *zone; | ||
432 | unsigned int i, nr_blocks; | ||
433 | unsigned long pages; | ||
434 | |||
435 | pages = end - start; | ||
436 | zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree)); | ||
437 | if (!zone) | ||
438 | return NULL; | ||
439 | |||
440 | INIT_LIST_HEAD(&zone->nodes); | ||
441 | INIT_LIST_HEAD(&zone->leaves); | ||
442 | zone->start_pfn = start; | ||
443 | zone->end_pfn = end; | ||
444 | nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); | ||
445 | |||
446 | for (i = 0; i < nr_blocks; i++) { | ||
447 | if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) { | ||
448 | free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR); | ||
449 | return NULL; | ||
450 | } | ||
451 | } | ||
452 | |||
453 | return zone; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * free_zone_bm_rtree - Free the memory of the radix tree | ||
458 | * | ||
459 | * Free all node pages of the radix tree. The mem_zone_bm_rtree | ||
460 | * structure itself is not freed here nor are the rtree_node | ||
461 | * structs. | ||
462 | */ | ||
463 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, | ||
464 | int clear_nosave_free) | ||
465 | { | ||
466 | struct rtree_node *node; | ||
467 | |||
468 | list_for_each_entry(node, &zone->nodes, list) | ||
469 | free_image_page(node->data, clear_nosave_free); | ||
470 | |||
471 | list_for_each_entry(node, &zone->leaves, list) | ||
472 | free_image_page(node->data, clear_nosave_free); | ||
473 | } | ||
474 | |||
475 | static void memory_bm_position_reset(struct memory_bitmap *bm) | ||
476 | { | ||
477 | bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, | ||
478 | list); | ||
479 | bm->cur.node = list_entry(bm->cur.zone->leaves.next, | ||
480 | struct rtree_node, list); | ||
481 | bm->cur.node_pfn = 0; | ||
482 | bm->cur.node_bit = 0; | ||
483 | } | ||
484 | |||
485 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); | ||
486 | |||
319 | struct mem_extent { | 487 | struct mem_extent { |
320 | struct list_head hook; | 488 | struct list_head hook; |
321 | unsigned long start; | 489 | unsigned long start; |
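The constants introduced above drive the tree walk in add_rtree_block(): each leaf page covers PAGE_SIZE * 8 pfns, each inner node holds PAGE_SIZE / sizeof(long) child pointers, and the block number is consumed BM_RTREE_LEVEL_SHIFT bits per level from the top down. A small worked example of that arithmetic, assuming 4 KiB pages and 64-bit longs:

#include <stdio.h>

#define PAGE_SHIFT		12
#define BM_BLOCK_SHIFT		(PAGE_SHIFT + 3)	/* bits per leaf page */
#define BM_RTREE_LEVEL_SHIFT	(PAGE_SHIFT - 3)	/* pointers per node */
#define BM_RTREE_LEVEL_MASK	((1UL << BM_RTREE_LEVEL_SHIFT) - 1)

int main(void)
{
	unsigned long pfn_offset = 5000000;	/* pfn relative to zone start */
	unsigned long block_nr = pfn_offset >> BM_BLOCK_SHIFT;
	unsigned long tmp = block_nr;
	int levels = 0, i;

	/* Same loop shape as add_rtree_block(): how many levels are needed? */
	while (tmp) {
		levels++;
		tmp >>= BM_RTREE_LEVEL_SHIFT;
	}
	printf("block %lu needs %d level(s)\n", block_nr, levels);

	/* Per-level child index, walked from the root down to the leaf. */
	for (i = levels; i > 0; i--)
		printf("level %d index %lu\n", i,
		       (block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT)) &
		       BM_RTREE_LEVEL_MASK);
	return 0;
}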
@@ -407,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
407 | int error; | 575 | int error; |
408 | 576 | ||
409 | chain_init(&ca, gfp_mask, safe_needed); | 577 | chain_init(&ca, gfp_mask, safe_needed); |
410 | INIT_LIST_HEAD(&bm->blocks); | 578 | INIT_LIST_HEAD(&bm->zones); |
411 | 579 | ||
412 | error = create_mem_extents(&mem_extents, gfp_mask); | 580 | error = create_mem_extents(&mem_extents, gfp_mask); |
413 | if (error) | 581 | if (error) |
414 | return error; | 582 | return error; |
415 | 583 | ||
416 | list_for_each_entry(ext, &mem_extents, hook) { | 584 | list_for_each_entry(ext, &mem_extents, hook) { |
417 | struct bm_block *bb; | 585 | struct mem_zone_bm_rtree *zone; |
418 | unsigned long pfn = ext->start; | ||
419 | unsigned long pages = ext->end - ext->start; | ||
420 | |||
421 | bb = list_entry(bm->blocks.prev, struct bm_block, hook); | ||
422 | 586 | ||
423 | error = create_bm_block_list(pages, bm->blocks.prev, &ca); | 587 | zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca, |
424 | if (error) | 588 | ext->start, ext->end); |
589 | if (!zone) { | ||
590 | error = -ENOMEM; | ||
425 | goto Error; | 591 | goto Error; |
426 | |||
427 | list_for_each_entry_continue(bb, &bm->blocks, hook) { | ||
428 | bb->data = get_image_page(gfp_mask, safe_needed); | ||
429 | if (!bb->data) { | ||
430 | error = -ENOMEM; | ||
431 | goto Error; | ||
432 | } | ||
433 | |||
434 | bb->start_pfn = pfn; | ||
435 | if (pages >= BM_BITS_PER_BLOCK) { | ||
436 | pfn += BM_BITS_PER_BLOCK; | ||
437 | pages -= BM_BITS_PER_BLOCK; | ||
438 | } else { | ||
439 | /* This is executed only once in the loop */ | ||
440 | pfn += pages; | ||
441 | } | ||
442 | bb->end_pfn = pfn; | ||
443 | } | 592 | } |
593 | list_add_tail(&zone->list, &bm->zones); | ||
444 | } | 594 | } |
445 | 595 | ||
446 | bm->p_list = ca.chain; | 596 | bm->p_list = ca.chain; |
@@ -460,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
460 | */ | 610 | */ |
461 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) | 611 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) |
462 | { | 612 | { |
463 | struct bm_block *bb; | 613 | struct mem_zone_bm_rtree *zone; |
464 | 614 | ||
465 | list_for_each_entry(bb, &bm->blocks, hook) | 615 | list_for_each_entry(zone, &bm->zones, list) |
466 | if (bb->data) | 616 | free_zone_bm_rtree(zone, clear_nosave_free); |
467 | free_image_page(bb->data, clear_nosave_free); | ||
468 | 617 | ||
469 | free_list_of_pages(bm->p_list, clear_nosave_free); | 618 | free_list_of_pages(bm->p_list, clear_nosave_free); |
470 | 619 | ||
471 | INIT_LIST_HEAD(&bm->blocks); | 620 | INIT_LIST_HEAD(&bm->zones); |
472 | } | 621 | } |
473 | 622 | ||
474 | /** | 623 | /** |
475 | * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds | 624 | * memory_bm_find_bit - Find the bit for pfn in the memory |
476 | * to given pfn. The cur_zone_bm member of @bm and the cur_block member | 625 | * bitmap |
477 | * of @bm->cur_zone_bm are updated. | 626 | * |
627 | * Find the bit in the bitmap @bm that corresponds to given pfn. | ||
628 | * The cur.zone, cur.block and cur.node_pfn member of @bm are | ||
629 | * updated. | ||
630 | * It walks the radix tree to find the page which contains the bit for | ||
631 | * pfn and returns the bit position in **addr and *bit_nr. | ||
478 | */ | 632 | */ |
479 | static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, | 633 | static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, |
480 | void **addr, unsigned int *bit_nr) | 634 | void **addr, unsigned int *bit_nr) |
481 | { | 635 | { |
482 | struct bm_block *bb; | 636 | struct mem_zone_bm_rtree *curr, *zone; |
637 | struct rtree_node *node; | ||
638 | int i, block_nr; | ||
483 | 639 | ||
640 | zone = bm->cur.zone; | ||
641 | |||
642 | if (pfn >= zone->start_pfn && pfn < zone->end_pfn) | ||
643 | goto zone_found; | ||
644 | |||
645 | zone = NULL; | ||
646 | |||
647 | /* Find the right zone */ | ||
648 | list_for_each_entry(curr, &bm->zones, list) { | ||
649 | if (pfn >= curr->start_pfn && pfn < curr->end_pfn) { | ||
650 | zone = curr; | ||
651 | break; | ||
652 | } | ||
653 | } | ||
654 | |||
655 | if (!zone) | ||
656 | return -EFAULT; | ||
657 | |||
658 | zone_found: | ||
484 | /* | 659 | /* |
485 | * Check if the pfn corresponds to the current bitmap block and find | 660 | * We have a zone. Now walk the radix tree to find the leave |
486 | * the block where it fits if this is not the case. | 661 | * node for our pfn. |
487 | */ | 662 | */ |
488 | bb = bm->cur.block; | ||
489 | if (pfn < bb->start_pfn) | ||
490 | list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) | ||
491 | if (pfn >= bb->start_pfn) | ||
492 | break; | ||
493 | 663 | ||
494 | if (pfn >= bb->end_pfn) | 664 | node = bm->cur.node; |
495 | list_for_each_entry_continue(bb, &bm->blocks, hook) | 665 | if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) |
496 | if (pfn >= bb->start_pfn && pfn < bb->end_pfn) | 666 | goto node_found; |
497 | break; | ||
498 | 667 | ||
499 | if (&bb->hook == &bm->blocks) | 668 | node = zone->rtree; |
500 | return -EFAULT; | 669 | block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT; |
670 | |||
671 | for (i = zone->levels; i > 0; i--) { | ||
672 | int index; | ||
673 | |||
674 | index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); | ||
675 | index &= BM_RTREE_LEVEL_MASK; | ||
676 | BUG_ON(node->data[index] == 0); | ||
677 | node = (struct rtree_node *)node->data[index]; | ||
678 | } | ||
679 | |||
680 | node_found: | ||
681 | /* Update last position */ | ||
682 | bm->cur.zone = zone; | ||
683 | bm->cur.node = node; | ||
684 | bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; | ||
685 | |||
686 | /* Set return values */ | ||
687 | *addr = node->data; | ||
688 | *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK; | ||
501 | 689 | ||
502 | /* The block has been found */ | ||
503 | bm->cur.block = bb; | ||
504 | pfn -= bb->start_pfn; | ||
505 | bm->cur.bit = pfn + 1; | ||
506 | *bit_nr = pfn; | ||
507 | *addr = bb->data; | ||
508 | return 0; | 690 | return 0; |
509 | } | 691 | } |
510 | 692 | ||
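The lookup above is cheap in the common case because memory_bm_find_bit() first checks whether the cached leaf (bm->cur.node_pfn) already covers the pfn and only walks the tree on a miss. A toy version of that cache check and of how a zone-relative pfn splits into a leaf identifier and a bit index (toy_cursor and leaf_cached() are stand-ins, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define BM_BLOCK_SHIFT	(PAGE_SHIFT + 3)
#define BM_BLOCK_MASK	((1UL << BM_BLOCK_SHIFT) - 1)

/* Cached position, mirroring bm->cur: which leaf page was used last. */
struct toy_cursor { unsigned long node_pfn; };

/* Returns true when the cached leaf already covers this pfn, i.e. the
 * radix-tree walk in memory_bm_find_bit() could be skipped entirely. */
static bool leaf_cached(struct toy_cursor *cur, unsigned long zone_rel_pfn,
			unsigned int *bit_nr)
{
	*bit_nr = zone_rel_pfn & BM_BLOCK_MASK;		/* bit inside leaf */
	return (zone_rel_pfn & ~BM_BLOCK_MASK) == cur->node_pfn;
}

int main(void)
{
	struct toy_cursor cur = { .node_pfn = 3UL << BM_BLOCK_SHIFT };
	unsigned int bit;

	/* Two pfns in leaf #3 hit the cache; one in leaf #4 misses it. */
	printf("%d\n", leaf_cached(&cur, (3UL << BM_BLOCK_SHIFT) + 7, &bit));
	printf("%d\n", leaf_cached(&cur, (3UL << BM_BLOCK_SHIFT) + 42, &bit));
	printf("%d\n", leaf_cached(&cur, (4UL << BM_BLOCK_SHIFT) + 7, &bit));
	return 0;
}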
@@ -528,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) | |||
528 | error = memory_bm_find_bit(bm, pfn, &addr, &bit); | 710 | error = memory_bm_find_bit(bm, pfn, &addr, &bit); |
529 | if (!error) | 711 | if (!error) |
530 | set_bit(bit, addr); | 712 | set_bit(bit, addr); |
713 | |||
531 | return error; | 714 | return error; |
532 | } | 715 | } |
533 | 716 | ||
@@ -542,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) | |||
542 | clear_bit(bit, addr); | 725 | clear_bit(bit, addr); |
543 | } | 726 | } |
544 | 727 | ||
728 | static void memory_bm_clear_current(struct memory_bitmap *bm) | ||
729 | { | ||
730 | int bit; | ||
731 | |||
732 | bit = max(bm->cur.node_bit - 1, 0); | ||
733 | clear_bit(bit, bm->cur.node->data); | ||
734 | } | ||
735 | |||
545 | static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) | 736 | static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) |
546 | { | 737 | { |
547 | void *addr; | 738 | void *addr; |
@@ -561,38 +752,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) | |||
561 | return !memory_bm_find_bit(bm, pfn, &addr, &bit); | 752 | return !memory_bm_find_bit(bm, pfn, &addr, &bit); |
562 | } | 753 | } |
563 | 754 | ||
564 | /** | 755 | /* |
565 | * memory_bm_next_pfn - find the pfn that corresponds to the next set bit | 756 | * rtree_next_node - Jumps to the next leaf node |
566 | * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is | 757 | * |
567 | * returned. | 758 | * Sets the position to the beginning of the next node in the |
759 | * memory bitmap. This is either the next node in the current | ||
760 | * zone's radix tree or the first node in the radix tree of the | ||
761 | * next zone. | ||
568 | * | 762 | * |
569 | * It is required to run memory_bm_position_reset() before the first call to | 763 | * Returns true if there is a next node, false otherwise. |
570 | * this function. | ||
571 | */ | 764 | */ |
765 | static bool rtree_next_node(struct memory_bitmap *bm) | ||
766 | { | ||
767 | bm->cur.node = list_entry(bm->cur.node->list.next, | ||
768 | struct rtree_node, list); | ||
769 | if (&bm->cur.node->list != &bm->cur.zone->leaves) { | ||
770 | bm->cur.node_pfn += BM_BITS_PER_BLOCK; | ||
771 | bm->cur.node_bit = 0; | ||
772 | touch_softlockup_watchdog(); | ||
773 | return true; | ||
774 | } | ||
775 | |||
776 | /* No more nodes, goto next zone */ | ||
777 | bm->cur.zone = list_entry(bm->cur.zone->list.next, | ||
778 | struct mem_zone_bm_rtree, list); | ||
779 | if (&bm->cur.zone->list != &bm->zones) { | ||
780 | bm->cur.node = list_entry(bm->cur.zone->leaves.next, | ||
781 | struct rtree_node, list); | ||
782 | bm->cur.node_pfn = 0; | ||
783 | bm->cur.node_bit = 0; | ||
784 | return true; | ||
785 | } | ||
572 | 786 | ||
787 | /* No more zones */ | ||
788 | return false; | ||
789 | } | ||
790 | |||
791 | /** | ||
792 | * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm | ||
793 | * | ||
794 | * Starting from the last returned position this function searches | ||
795 | * for the next set bit in the memory bitmap and returns its | ||
796 | * number. If no more bit is set BM_END_OF_MAP is returned. | ||
797 | * | ||
798 | * It is required to run memory_bm_position_reset() before the | ||
799 | * first call to this function. | ||
800 | */ | ||
573 | static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) | 801 | static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) |
574 | { | 802 | { |
575 | struct bm_block *bb; | 803 | unsigned long bits, pfn, pages; |
576 | int bit; | 804 | int bit; |
577 | 805 | ||
578 | bb = bm->cur.block; | ||
579 | do { | 806 | do { |
580 | bit = bm->cur.bit; | 807 | pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; |
581 | bit = find_next_bit(bb->data, bm_block_bits(bb), bit); | 808 | bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); |
582 | if (bit < bm_block_bits(bb)) | 809 | bit = find_next_bit(bm->cur.node->data, bits, |
583 | goto Return_pfn; | 810 | bm->cur.node_bit); |
584 | 811 | if (bit < bits) { | |
585 | bb = list_entry(bb->hook.next, struct bm_block, hook); | 812 | pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; |
586 | bm->cur.block = bb; | 813 | bm->cur.node_bit = bit + 1; |
587 | bm->cur.bit = 0; | 814 | return pfn; |
588 | } while (&bb->hook != &bm->blocks); | 815 | } |
816 | } while (rtree_next_node(bm)); | ||
589 | 817 | ||
590 | memory_bm_position_reset(bm); | ||
591 | return BM_END_OF_MAP; | 818 | return BM_END_OF_MAP; |
592 | |||
593 | Return_pfn: | ||
594 | bm->cur.bit = bit + 1; | ||
595 | return bb->start_pfn + bit; | ||
596 | } | 819 | } |
597 | 820 | ||
598 | /** | 821 | /** |
@@ -816,12 +1039,17 @@ void free_basic_memory_bitmaps(void) | |||
816 | 1039 | ||
817 | unsigned int snapshot_additional_pages(struct zone *zone) | 1040 | unsigned int snapshot_additional_pages(struct zone *zone) |
818 | { | 1041 | { |
819 | unsigned int res; | 1042 | unsigned int rtree, nodes; |
1043 | |||
1044 | rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); | ||
1045 | rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node), | ||
1046 | LINKED_PAGE_DATA_SIZE); | ||
1047 | while (nodes > 1) { | ||
1048 | nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL); | ||
1049 | rtree += nodes; | ||
1050 | } | ||
820 | 1051 | ||
821 | res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); | 1052 | return 2 * rtree; |
822 | res += DIV_ROUND_UP(res * sizeof(struct bm_block), | ||
823 | LINKED_PAGE_DATA_SIZE); | ||
824 | return 2 * res; | ||
825 | } | 1053 | } |
826 | 1054 | ||
827 | #ifdef CONFIG_HIGHMEM | 1055 | #ifdef CONFIG_HIGHMEM |
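The new estimate counts one page per bitmap leaf, extra pages for the rtree_node bookkeeping structs, and one page per inner node summed over the levels, then doubles the result because two bitmaps (forbidden and free pages) are kept. A worked example of that arithmetic; NODE_STRUCTS_PER_PAGE is a rough stand-in for LINKED_PAGE_DATA_SIZE / sizeof(struct rtree_node), not the exact kernel value:

#include <stdio.h>

/* Worked example of the estimate above, assuming 4 KiB pages, 64-bit
 * longs and a zone spanning 4 GiB (1048576 pfns). */
#define BM_BITS_PER_BLOCK	32768UL		/* PAGE_SIZE * 8 */
#define BM_ENTRIES_PER_LEVEL	512UL		/* PAGE_SIZE / sizeof(long) */
#define NODE_STRUCTS_PER_PAGE	100UL		/* rough stand-in value */

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long spanned = 1048576;
	unsigned long nodes = DIV_ROUND_UP(spanned, BM_BITS_PER_BLOCK);
	unsigned long rtree = nodes;

	/* Pages needed for the rtree_node bookkeeping structs themselves. */
	rtree += DIV_ROUND_UP(rtree, NODE_STRUCTS_PER_PAGE);

	/* Inner nodes: one extra level per factor of 512 leaves. */
	while (nodes > 1) {
		nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
		rtree += nodes;
	}

	/* Two bitmaps (forbidden + free pages) are kept in parallel. */
	printf("extra pages reserved: %lu\n", 2 * rtree);
	return 0;
}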
@@ -1094,23 +1322,35 @@ static struct memory_bitmap copy_bm; | |||
1094 | 1322 | ||
1095 | void swsusp_free(void) | 1323 | void swsusp_free(void) |
1096 | { | 1324 | { |
1097 | struct zone *zone; | 1325 | unsigned long fb_pfn, fr_pfn; |
1098 | unsigned long pfn, max_zone_pfn; | ||
1099 | 1326 | ||
1100 | for_each_populated_zone(zone) { | 1327 | memory_bm_position_reset(forbidden_pages_map); |
1101 | max_zone_pfn = zone_end_pfn(zone); | 1328 | memory_bm_position_reset(free_pages_map); |
1102 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1329 | |
1103 | if (pfn_valid(pfn)) { | 1330 | loop: |
1104 | struct page *page = pfn_to_page(pfn); | 1331 | fr_pfn = memory_bm_next_pfn(free_pages_map); |
1105 | 1332 | fb_pfn = memory_bm_next_pfn(forbidden_pages_map); | |
1106 | if (swsusp_page_is_forbidden(page) && | 1333 | |
1107 | swsusp_page_is_free(page)) { | 1334 | /* |
1108 | swsusp_unset_page_forbidden(page); | 1335 | * Find the next bit set in both bitmaps. This is guaranteed to |
1109 | swsusp_unset_page_free(page); | 1336 | * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. |
1110 | __free_page(page); | 1337 | */ |
1111 | } | 1338 | do { |
1112 | } | 1339 | if (fb_pfn < fr_pfn) |
1340 | fb_pfn = memory_bm_next_pfn(forbidden_pages_map); | ||
1341 | if (fr_pfn < fb_pfn) | ||
1342 | fr_pfn = memory_bm_next_pfn(free_pages_map); | ||
1343 | } while (fb_pfn != fr_pfn); | ||
1344 | |||
1345 | if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { | ||
1346 | struct page *page = pfn_to_page(fr_pfn); | ||
1347 | |||
1348 | memory_bm_clear_current(forbidden_pages_map); | ||
1349 | memory_bm_clear_current(free_pages_map); | ||
1350 | __free_page(page); | ||
1351 | goto loop; | ||
1113 | } | 1352 | } |
1353 | |||
1114 | nr_copy_pages = 0; | 1354 | nr_copy_pages = 0; |
1115 | nr_meta_pages = 0; | 1355 | nr_meta_pages = 0; |
1116 | restore_pblist = NULL; | 1356 | restore_pblist = NULL; |
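The reworked swsusp_free() no longer scans every pfn in every zone; it walks the two bitmaps as sorted streams and frees only pages whose bit is set in both, a merge-style intersection. A self-contained sketch of that loop shape over two toy iterators:

#include <stdio.h>

#define END	(~0UL)

/* Toy iterators standing in for memory_bm_next_pfn() on the two maps;
 * both return strictly increasing pfns terminated by END. */
static unsigned long next(const unsigned long *a, int *i) { return a[(*i)++]; }

int main(void)
{
	unsigned long forbidden[] = { 3, 7, 10, 15, END };
	unsigned long free_map[]  = { 5, 7, 9, 15, END };
	int fi = 0, ri = 0;
	unsigned long fb = next(forbidden, &fi);
	unsigned long fr = next(free_map, &ri);

	/* Advance whichever iterator is behind until the pfns meet; this
	 * is the same loop shape as the swsusp_free() rework above. */
	while (fb != END || fr != END) {
		if (fb < fr) {
			fb = next(forbidden, &fi);
		} else if (fr < fb) {
			fr = next(free_map, &ri);
		} else {
			printf("would __free_page(pfn %lu)\n", fb);
			fb = next(forbidden, &fi);
			fr = next(free_map, &ri);
		}
	}
	return 0;
}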
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4dd8822f732a..9a071bea80eb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -31,20 +31,11 @@ | |||
31 | 31 | ||
32 | #include "power.h" | 32 | #include "power.h" |
33 | 33 | ||
34 | struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { | 34 | static const char *pm_labels[] = { "mem", "standby", "freeze", }; |
35 | [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, | 35 | const char *pm_states[PM_SUSPEND_MAX]; |
36 | [PM_SUSPEND_STANDBY] = { .label = "standby", }, | ||
37 | [PM_SUSPEND_MEM] = { .label = "mem", }, | ||
38 | }; | ||
39 | 36 | ||
40 | static const struct platform_suspend_ops *suspend_ops; | 37 | static const struct platform_suspend_ops *suspend_ops; |
41 | static const struct platform_freeze_ops *freeze_ops; | 38 | static const struct platform_freeze_ops *freeze_ops; |
42 | |||
43 | static bool need_suspend_ops(suspend_state_t state) | ||
44 | { | ||
45 | return state > PM_SUSPEND_FREEZE; | ||
46 | } | ||
47 | |||
48 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); | 39 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); |
49 | static bool suspend_freeze_wake; | 40 | static bool suspend_freeze_wake; |
50 | 41 | ||
@@ -97,10 +88,7 @@ static bool relative_states; | |||
97 | static int __init sleep_states_setup(char *str) | 88 | static int __init sleep_states_setup(char *str) |
98 | { | 89 | { |
99 | relative_states = !strncmp(str, "1", 1); | 90 | relative_states = !strncmp(str, "1", 1); |
100 | if (relative_states) { | 91 | pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; |
101 | pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; | ||
102 | pm_states[PM_SUSPEND_FREEZE].state = 0; | ||
103 | } | ||
104 | return 1; | 92 | return 1; |
105 | } | 93 | } |
106 | 94 | ||
@@ -113,20 +101,20 @@ __setup("relative_sleep_states=", sleep_states_setup); | |||
113 | void suspend_set_ops(const struct platform_suspend_ops *ops) | 101 | void suspend_set_ops(const struct platform_suspend_ops *ops) |
114 | { | 102 | { |
115 | suspend_state_t i; | 103 | suspend_state_t i; |
116 | int j = PM_SUSPEND_MAX - 1; | 104 | int j = 0; |
117 | 105 | ||
118 | lock_system_sleep(); | 106 | lock_system_sleep(); |
119 | 107 | ||
120 | suspend_ops = ops; | 108 | suspend_ops = ops; |
121 | for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) | 109 | for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) |
122 | if (valid_state(i)) | 110 | if (valid_state(i)) { |
123 | pm_states[j--].state = i; | 111 | pm_states[i] = pm_labels[j++]; |
124 | else if (!relative_states) | 112 | } else if (!relative_states) { |
125 | pm_states[j--].state = 0; | 113 | pm_states[i] = NULL; |
114 | j++; | ||
115 | } | ||
126 | 116 | ||
127 | pm_states[j--].state = PM_SUSPEND_FREEZE; | 117 | pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; |
128 | while (j >= PM_SUSPEND_MIN) | ||
129 | pm_states[j--].state = 0; | ||
130 | 118 | ||
131 | unlock_system_sleep(); | 119 | unlock_system_sleep(); |
132 | } | 120 | } |
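The new suspend_set_ops() hands out labels from pm_labels[] in order: each valid deep state takes the next label, an invalid state either consumes its label (absolute naming) or leaves it for the next state (relative_sleep_states=1), and whatever label remains lands on freeze. A compact model of that assignment, with valid_state() hardcoded purely for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins mirroring the label handling above; validity of "mem" and
 * "standby" is a platform property and is just hardcoded here. */
enum { S_FREEZE = 1, S_STANDBY, S_MEM, S_MAX };
static const char *pm_labels[] = { "mem", "standby", "freeze" };
static const char *pm_states[S_MAX];
static bool relative_states;

static bool valid_state(int s) { return s == S_MEM; }	/* standby invalid */

static void set_ops(void)
{
	int i, j = 0;

	for (i = S_MEM; i >= S_STANDBY; i--) {
		if (valid_state(i))
			pm_states[i] = pm_labels[j++];
		else if (!relative_states)
			j++;	/* keep labels anchored to their states */
	}
	pm_states[S_FREEZE] = pm_labels[j];
}

int main(void)
{
	set_ops();
	printf("mem=%s standby=%s freeze=%s\n",
	       pm_states[S_MEM] ? pm_states[S_MEM] : "-",
	       pm_states[S_STANDBY] ? pm_states[S_STANDBY] : "-",
	       pm_states[S_FREEZE]);
	return 0;
}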
@@ -145,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state) | |||
145 | } | 133 | } |
146 | EXPORT_SYMBOL_GPL(suspend_valid_only_mem); | 134 | EXPORT_SYMBOL_GPL(suspend_valid_only_mem); |
147 | 135 | ||
136 | static bool sleep_state_supported(suspend_state_t state) | ||
137 | { | ||
138 | return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter); | ||
139 | } | ||
140 | |||
141 | static int platform_suspend_prepare(suspend_state_t state) | ||
142 | { | ||
143 | return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ? | ||
144 | suspend_ops->prepare() : 0; | ||
145 | } | ||
146 | |||
147 | static int platform_suspend_prepare_late(suspend_state_t state) | ||
148 | { | ||
149 | return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? | ||
150 | suspend_ops->prepare_late() : 0; | ||
151 | } | ||
152 | |||
153 | static void platform_suspend_wake(suspend_state_t state) | ||
154 | { | ||
155 | if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) | ||
156 | suspend_ops->wake(); | ||
157 | } | ||
158 | |||
159 | static void platform_suspend_finish(suspend_state_t state) | ||
160 | { | ||
161 | if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) | ||
162 | suspend_ops->finish(); | ||
163 | } | ||
164 | |||
165 | static int platform_suspend_begin(suspend_state_t state) | ||
166 | { | ||
167 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) | ||
168 | return freeze_ops->begin(); | ||
169 | else if (suspend_ops->begin) | ||
170 | return suspend_ops->begin(state); | ||
171 | else | ||
172 | return 0; | ||
173 | } | ||
174 | |||
175 | static void platform_suspend_end(suspend_state_t state) | ||
176 | { | ||
177 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) | ||
178 | freeze_ops->end(); | ||
179 | else if (suspend_ops->end) | ||
180 | suspend_ops->end(); | ||
181 | } | ||
182 | |||
183 | static void platform_suspend_recover(suspend_state_t state) | ||
184 | { | ||
185 | if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) | ||
186 | suspend_ops->recover(); | ||
187 | } | ||
188 | |||
189 | static bool platform_suspend_again(suspend_state_t state) | ||
190 | { | ||
191 | return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ? | ||
192 | suspend_ops->suspend_again() : false; | ||
193 | } | ||
194 | |||
148 | static int suspend_test(int level) | 195 | static int suspend_test(int level) |
149 | { | 196 | { |
150 | #ifdef CONFIG_PM_DEBUG | 197 | #ifdef CONFIG_PM_DEBUG |
@@ -168,7 +215,7 @@ static int suspend_prepare(suspend_state_t state) | |||
168 | { | 215 | { |
169 | int error; | 216 | int error; |
170 | 217 | ||
171 | if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) | 218 | if (!sleep_state_supported(state)) |
172 | return -EPERM; | 219 | return -EPERM; |
173 | 220 | ||
174 | pm_prepare_console(); | 221 | pm_prepare_console(); |
@@ -214,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
214 | { | 261 | { |
215 | int error; | 262 | int error; |
216 | 263 | ||
217 | if (need_suspend_ops(state) && suspend_ops->prepare) { | 264 | error = platform_suspend_prepare(state); |
218 | error = suspend_ops->prepare(); | 265 | if (error) |
219 | if (error) | 266 | goto Platform_finish; |
220 | goto Platform_finish; | ||
221 | } | ||
222 | 267 | ||
223 | error = dpm_suspend_end(PMSG_SUSPEND); | 268 | error = dpm_suspend_end(PMSG_SUSPEND); |
224 | if (error) { | 269 | if (error) { |
225 | printk(KERN_ERR "PM: Some devices failed to power down\n"); | 270 | printk(KERN_ERR "PM: Some devices failed to power down\n"); |
226 | goto Platform_finish; | 271 | goto Platform_finish; |
227 | } | 272 | } |
228 | 273 | error = platform_suspend_prepare_late(state); | |
229 | if (need_suspend_ops(state) && suspend_ops->prepare_late) { | 274 | if (error) |
230 | error = suspend_ops->prepare_late(); | 275 | goto Platform_wake; |
231 | if (error) | ||
232 | goto Platform_wake; | ||
233 | } | ||
234 | 276 | ||
235 | if (suspend_test(TEST_PLATFORM)) | 277 | if (suspend_test(TEST_PLATFORM)) |
236 | goto Platform_wake; | 278 | goto Platform_wake; |
@@ -278,15 +320,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
278 | ftrace_start(); | 320 | ftrace_start(); |
279 | 321 | ||
280 | Platform_wake: | 322 | Platform_wake: |
281 | if (need_suspend_ops(state) && suspend_ops->wake) | 323 | platform_suspend_wake(state); |
282 | suspend_ops->wake(); | ||
283 | |||
284 | dpm_resume_start(PMSG_RESUME); | 324 | dpm_resume_start(PMSG_RESUME); |
285 | 325 | ||
286 | Platform_finish: | 326 | Platform_finish: |
287 | if (need_suspend_ops(state) && suspend_ops->finish) | 327 | platform_suspend_finish(state); |
288 | suspend_ops->finish(); | ||
289 | |||
290 | return error; | 328 | return error; |
291 | } | 329 | } |
292 | 330 | ||
@@ -299,18 +337,13 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
299 | int error; | 337 | int error; |
300 | bool wakeup = false; | 338 | bool wakeup = false; |
301 | 339 | ||
302 | if (need_suspend_ops(state) && !suspend_ops) | 340 | if (!sleep_state_supported(state)) |
303 | return -ENOSYS; | 341 | return -ENOSYS; |
304 | 342 | ||
305 | if (need_suspend_ops(state) && suspend_ops->begin) { | 343 | error = platform_suspend_begin(state); |
306 | error = suspend_ops->begin(state); | 344 | if (error) |
307 | if (error) | 345 | goto Close; |
308 | goto Close; | 346 | |
309 | } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) { | ||
310 | error = freeze_ops->begin(); | ||
311 | if (error) | ||
312 | goto Close; | ||
313 | } | ||
314 | suspend_console(); | 347 | suspend_console(); |
315 | suspend_test_start(); | 348 | suspend_test_start(); |
316 | error = dpm_suspend_start(PMSG_SUSPEND); | 349 | error = dpm_suspend_start(PMSG_SUSPEND); |
@@ -324,25 +357,20 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
324 | 357 | ||
325 | do { | 358 | do { |
326 | error = suspend_enter(state, &wakeup); | 359 | error = suspend_enter(state, &wakeup); |
327 | } while (!error && !wakeup && need_suspend_ops(state) | 360 | } while (!error && !wakeup && platform_suspend_again(state)); |
328 | && suspend_ops->suspend_again && suspend_ops->suspend_again()); | ||
329 | 361 | ||
330 | Resume_devices: | 362 | Resume_devices: |
331 | suspend_test_start(); | 363 | suspend_test_start(); |
332 | dpm_resume_end(PMSG_RESUME); | 364 | dpm_resume_end(PMSG_RESUME); |
333 | suspend_test_finish("resume devices"); | 365 | suspend_test_finish("resume devices"); |
334 | resume_console(); | 366 | resume_console(); |
335 | Close: | ||
336 | if (need_suspend_ops(state) && suspend_ops->end) | ||
337 | suspend_ops->end(); | ||
338 | else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) | ||
339 | freeze_ops->end(); | ||
340 | 367 | ||
368 | Close: | ||
369 | platform_suspend_end(state); | ||
341 | return error; | 370 | return error; |
342 | 371 | ||
343 | Recover_platform: | 372 | Recover_platform: |
344 | if (need_suspend_ops(state) && suspend_ops->recover) | 373 | platform_suspend_recover(state); |
345 | suspend_ops->recover(); | ||
346 | goto Resume_devices; | 374 | goto Resume_devices; |
347 | } | 375 | } |
348 | 376 | ||
@@ -395,7 +423,7 @@ static int enter_state(suspend_state_t state) | |||
395 | printk("done.\n"); | 423 | printk("done.\n"); |
396 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); | 424 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); |
397 | 425 | ||
398 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); | 426 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); |
399 | error = suspend_prepare(state); | 427 | error = suspend_prepare(state); |
400 | if (error) | 428 | if (error) |
401 | goto Unlock; | 429 | goto Unlock; |
@@ -404,7 +432,7 @@ static int enter_state(suspend_state_t state) | |||
404 | goto Finish; | 432 | goto Finish; |
405 | 433 | ||
406 | trace_suspend_resume(TPS("suspend_enter"), state, false); | 434 | trace_suspend_resume(TPS("suspend_enter"), state, false); |
407 | pr_debug("PM: Entering %s sleep\n", pm_states[state].label); | 435 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); |
408 | pm_restrict_gfp_mask(); | 436 | pm_restrict_gfp_mask(); |
409 | error = suspend_devices_and_enter(state); | 437 | error = suspend_devices_and_enter(state); |
410 | pm_restore_gfp_mask(); | 438 | pm_restore_gfp_mask(); |
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 269b097e78ea..2f524928b6aa 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c | |||
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
92 | } | 92 | } |
93 | 93 | ||
94 | if (state == PM_SUSPEND_MEM) { | 94 | if (state == PM_SUSPEND_MEM) { |
95 | printk(info_test, pm_states[state].label); | 95 | printk(info_test, pm_states[state]); |
96 | status = pm_suspend(state); | 96 | status = pm_suspend(state); |
97 | if (status == -ENODEV) | 97 | if (status == -ENODEV) |
98 | state = PM_SUSPEND_STANDBY; | 98 | state = PM_SUSPEND_STANDBY; |
99 | } | 99 | } |
100 | if (state == PM_SUSPEND_STANDBY) { | 100 | if (state == PM_SUSPEND_STANDBY) { |
101 | printk(info_test, pm_states[state].label); | 101 | printk(info_test, pm_states[state]); |
102 | status = pm_suspend(state); | 102 | status = pm_suspend(state); |
103 | } | 103 | } |
104 | if (status < 0) | 104 | if (status < 0) |
@@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value) | |||
141 | /* "=mem" ==> "mem" */ | 141 | /* "=mem" ==> "mem" */ |
142 | value++; | 142 | value++; |
143 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) | 143 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) |
144 | if (!strcmp(pm_states[i].label, value)) { | 144 | if (!strcmp(pm_states[i], value)) { |
145 | test_state = pm_states[i].state; | 145 | test_state = i; |
146 | return 0; | 146 | return 0; |
147 | } | 147 | } |
148 | 148 | ||
@@ -162,8 +162,8 @@ static int __init test_suspend(void) | |||
162 | /* PM is initialized by now; is that state testable? */ | 162 | /* PM is initialized by now; is that state testable? */ |
163 | if (test_state == PM_SUSPEND_ON) | 163 | if (test_state == PM_SUSPEND_ON) |
164 | goto done; | 164 | goto done; |
165 | if (!pm_states[test_state].state) { | 165 | if (!pm_states[test_state]) { |
166 | printk(warn_bad_state, pm_states[test_state].label); | 166 | printk(warn_bad_state, pm_states[test_state]); |
167 | goto done; | 167 | goto done; |
168 | } | 168 | } |
169 | 169 | ||
diff --git a/kernel/power/user.c b/kernel/power/user.c index 98d357584cd6..526e8911460a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
49 | struct snapshot_data *data; | 49 | struct snapshot_data *data; |
50 | int error; | 50 | int error; |
51 | 51 | ||
52 | if (!hibernation_available()) | ||
53 | return -EPERM; | ||
54 | |||
52 | lock_system_sleep(); | 55 | lock_system_sleep(); |
53 | 56 | ||
54 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { | 57 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ea2d5f6962ed..13e839dbca07 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -1416,9 +1416,10 @@ static int have_callable_console(void) | |||
1416 | /* | 1416 | /* |
1417 | * Can we actually use the console at this time on this cpu? | 1417 | * Can we actually use the console at this time on this cpu? |
1418 | * | 1418 | * |
1419 | * Console drivers may assume that per-cpu resources have been allocated. So | 1419 | * Console drivers may assume that per-cpu resources have |
1420 | * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't | 1420 | * been allocated. So unless they're explicitly marked as |
1421 | * call them until this CPU is officially up. | 1421 | * being able to cope (CON_ANYTIME) don't call them until |
1422 | * this CPU is officially up. | ||
1422 | */ | 1423 | */ |
1423 | static inline int can_use_console(unsigned int cpu) | 1424 | static inline int can_use_console(unsigned int cpu) |
1424 | { | 1425 | { |
@@ -1431,10 +1432,8 @@ static inline int can_use_console(unsigned int cpu) | |||
1431 | * console_lock held, and 'console_locked' set) if it | 1432 | * console_lock held, and 'console_locked' set) if it |
1432 | * is successful, false otherwise. | 1433 | * is successful, false otherwise. |
1433 | */ | 1434 | */ |
1434 | static int console_trylock_for_printk(void) | 1435 | static int console_trylock_for_printk(unsigned int cpu) |
1435 | { | 1436 | { |
1436 | unsigned int cpu = smp_processor_id(); | ||
1437 | |||
1438 | if (!console_trylock()) | 1437 | if (!console_trylock()) |
1439 | return 0; | 1438 | return 0; |
1440 | /* | 1439 | /* |
@@ -1609,8 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1609 | */ | 1608 | */ |
1610 | if (!oops_in_progress && !lockdep_recursing(current)) { | 1609 | if (!oops_in_progress && !lockdep_recursing(current)) { |
1611 | recursion_bug = 1; | 1610 | recursion_bug = 1; |
1612 | local_irq_restore(flags); | 1611 | goto out_restore_irqs; |
1613 | return 0; | ||
1614 | } | 1612 | } |
1615 | zap_locks(); | 1613 | zap_locks(); |
1616 | } | 1614 | } |
@@ -1718,27 +1716,21 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1718 | 1716 | ||
1719 | logbuf_cpu = UINT_MAX; | 1717 | logbuf_cpu = UINT_MAX; |
1720 | raw_spin_unlock(&logbuf_lock); | 1718 | raw_spin_unlock(&logbuf_lock); |
1721 | lockdep_on(); | ||
1722 | local_irq_restore(flags); | ||
1723 | 1719 | ||
1724 | /* If called from the scheduler, we can not call up(). */ | 1720 | /* If called from the scheduler, we can not call up(). */ |
1725 | if (in_sched) | 1721 | if (!in_sched) { |
1726 | return printed_len; | 1722 | /* |
1727 | 1723 | * Try to acquire and then immediately release the console | |
1728 | /* | 1724 | * semaphore. The release will print out buffers and wake up |
1729 | * Disable preemption to avoid being preempted while holding | 1725 | * /dev/kmsg and syslog() users. |
1730 | * console_sem which would prevent anyone from printing to console | 1726 | */ |
1731 | */ | 1727 | if (console_trylock_for_printk(this_cpu)) |
1732 | preempt_disable(); | 1728 | console_unlock(); |
1733 | /* | 1729 | } |
1734 | * Try to acquire and then immediately release the console semaphore. | ||
1735 | * The release will print out buffers and wake up /dev/kmsg and syslog() | ||
1736 | * users. | ||
1737 | */ | ||
1738 | if (console_trylock_for_printk()) | ||
1739 | console_unlock(); | ||
1740 | preempt_enable(); | ||
1741 | 1730 | ||
1731 | lockdep_on(); | ||
1732 | out_restore_irqs: | ||
1733 | local_irq_restore(flags); | ||
1742 | return printed_len; | 1734 | return printed_len; |
1743 | } | 1735 | } |
1744 | EXPORT_SYMBOL(vprintk_emit); | 1736 | EXPORT_SYMBOL(vprintk_emit); |
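The printk hunks consolidate the exit path behind a single out_restore_irqs label (so lockdep_on() and local_irq_restore() always run) and keep the idiom the comment describes: try to take the console lock and immediately release it, because the release is what flushes the buffered messages. A rough single-threaded userspace model of that trylock-then-flush-on-unlock idiom; the names (console_lock, log_store, emit) and the fixed-size pending array are invented for illustration.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t console_lock = PTHREAD_MUTEX_INITIALIZER;
    static const char *pending[16];
    static int npending;

    /* Stash a message in the buffer; cheap, and safe to call from almost anywhere. */
    static void log_store(const char *msg)
    {
            if (npending < 16)
                    pending[npending++] = msg;
    }

    /* Releasing the console also prints whatever accumulated while it was held. */
    static void console_unlock_and_flush(void)
    {
            for (int i = 0; i < npending; i++)
                    printf("%s\n", pending[i]);
            npending = 0;
            pthread_mutex_unlock(&console_lock);
    }

    static void emit(const char *msg)
    {
            log_store(msg);
            /*
             * Try to take the console and immediately release it: if someone
             * else already holds it, they will flush our message when they
             * unlock, so a failed trylock is not an error.
             */
            if (pthread_mutex_trylock(&console_lock) == 0)
                    console_unlock_and_flush();
    }

    int main(void)
    {
            emit("hello");
            emit("world");
            return 0;
    }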
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f1ba77363fbb..625d0b0cd75a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu) | |||
206 | rdp->passed_quiesce = 1; | 206 | rdp->passed_quiesce = 1; |
207 | } | 207 | } |
208 | 208 | ||
209 | static DEFINE_PER_CPU(int, rcu_sched_qs_mask); | ||
210 | |||
211 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | ||
212 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | ||
213 | .dynticks = ATOMIC_INIT(1), | ||
214 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
215 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
216 | .dynticks_idle = ATOMIC_INIT(1), | ||
217 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
218 | }; | ||
219 | |||
220 | /* | ||
221 | * Let the RCU core know that this CPU has gone through the scheduler, | ||
222 | * which is a quiescent state. This is called when the need for a | ||
223 | * quiescent state is urgent, so we burn an atomic operation and full | ||
224 | * memory barriers to let the RCU core know about it, regardless of what | ||
225 | * this CPU might (or might not) do in the near future. | ||
226 | * | ||
227 | * We inform the RCU core by emulating a zero-duration dyntick-idle | ||
228 | * period, which we in turn do by incrementing the ->dynticks counter | ||
229 | * by two. | ||
230 | */ | ||
231 | static void rcu_momentary_dyntick_idle(void) | ||
232 | { | ||
233 | unsigned long flags; | ||
234 | struct rcu_data *rdp; | ||
235 | struct rcu_dynticks *rdtp; | ||
236 | int resched_mask; | ||
237 | struct rcu_state *rsp; | ||
238 | |||
239 | local_irq_save(flags); | ||
240 | |||
241 | /* | ||
242 | * Yes, we can lose flag-setting operations. This is OK, because | ||
243 | * the flag will be set again after some delay. | ||
244 | */ | ||
245 | resched_mask = raw_cpu_read(rcu_sched_qs_mask); | ||
246 | raw_cpu_write(rcu_sched_qs_mask, 0); | ||
247 | |||
248 | /* Find the flavor that needs a quiescent state. */ | ||
249 | for_each_rcu_flavor(rsp) { | ||
250 | rdp = raw_cpu_ptr(rsp->rda); | ||
251 | if (!(resched_mask & rsp->flavor_mask)) | ||
252 | continue; | ||
253 | smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ | ||
254 | if (ACCESS_ONCE(rdp->mynode->completed) != | ||
255 | ACCESS_ONCE(rdp->cond_resched_completed)) | ||
256 | continue; | ||
257 | |||
258 | /* | ||
259 | * Pretend to be momentarily idle for the quiescent state. | ||
260 | * This allows the grace-period kthread to record the | ||
261 | * quiescent state, with no need for this CPU to do anything | ||
262 | * further. | ||
263 | */ | ||
264 | rdtp = this_cpu_ptr(&rcu_dynticks); | ||
265 | smp_mb__before_atomic(); /* Earlier stuff before QS. */ | ||
266 | atomic_add(2, &rdtp->dynticks); /* QS. */ | ||
267 | smp_mb__after_atomic(); /* Later stuff after QS. */ | ||
268 | break; | ||
269 | } | ||
270 | local_irq_restore(flags); | ||
271 | } | ||
272 | |||
209 | /* | 273 | /* |
210 | * Note a context switch. This is a quiescent state for RCU-sched, | 274 | * Note a context switch. This is a quiescent state for RCU-sched, |
211 | * and requires special handling for preemptible RCU. | 275 | * and requires special handling for preemptible RCU. |
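As the comment in rcu_momentary_dyntick_idle() above explains, a quiescent state is reported by adding two to the per-CPU ->dynticks counter, so a grace-period observer comparing before/after snapshots sees the counter move without the CPU ever really going idle. A compressed C11 sketch of that snapshot-and-compare idea; the parity convention is modelled after the counter initialised to 1, but the bookkeeping is deliberately simplified and not the kernel's actual check.

    #include <stdatomic.h>
    #include <stdio.h>

    /* Even value: CPU idle; odd value: CPU active. Adding 2 preserves the parity. */
    static atomic_uint dynticks = 1;

    /* Observer side: remember the counter at grace-period start. */
    static unsigned int snapshot(void) { return atomic_load(&dynticks); }

    /* Has the CPU either idled or reported a momentary quiescent state since? */
    static int passed_quiescent_state(unsigned int snap)
    {
            unsigned int curr = atomic_load(&dynticks);
            return (curr & 1) == 0 || curr != snap;
    }

    /* CPU side: pretend to idle for zero time by moving the counter by 2. */
    static void momentary_quiescent_state(void)
    {
            atomic_fetch_add(&dynticks, 2);   /* stays odd, but visibly changed */
    }

    int main(void)
    {
            unsigned int snap = snapshot();
            printf("before: %d\n", passed_quiescent_state(snap));   /* 0 */
            momentary_quiescent_state();
            printf("after:  %d\n", passed_quiescent_state(snap));   /* 1 */
            return 0;
    }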
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu) | |||
216 | trace_rcu_utilization(TPS("Start context switch")); | 280 | trace_rcu_utilization(TPS("Start context switch")); |
217 | rcu_sched_qs(cpu); | 281 | rcu_sched_qs(cpu); |
218 | rcu_preempt_note_context_switch(cpu); | 282 | rcu_preempt_note_context_switch(cpu); |
283 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | ||
284 | rcu_momentary_dyntick_idle(); | ||
219 | trace_rcu_utilization(TPS("End context switch")); | 285 | trace_rcu_utilization(TPS("End context switch")); |
220 | } | 286 | } |
221 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 287 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
222 | 288 | ||
223 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | ||
224 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | ||
225 | .dynticks = ATOMIC_INIT(1), | ||
226 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
227 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
228 | .dynticks_idle = ATOMIC_INIT(1), | ||
229 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
230 | }; | ||
231 | |||
232 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 289 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
233 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ | 290 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ |
234 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ | 291 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ |
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX; | |||
243 | module_param(jiffies_till_first_fqs, ulong, 0644); | 300 | module_param(jiffies_till_first_fqs, ulong, 0644); |
244 | module_param(jiffies_till_next_fqs, ulong, 0644); | 301 | module_param(jiffies_till_next_fqs, ulong, 0644); |
245 | 302 | ||
303 | /* | ||
304 | * How long the grace period must be before we start recruiting | ||
305 | * quiescent-state help from rcu_note_context_switch(). | ||
306 | */ | ||
307 | static ulong jiffies_till_sched_qs = HZ / 20; | ||
308 | module_param(jiffies_till_sched_qs, ulong, 0644); | ||
309 | |||
246 | static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 310 | static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
247 | struct rcu_data *rdp); | 311 | struct rcu_data *rdp); |
248 | static void force_qs_rnp(struct rcu_state *rsp, | 312 | static void force_qs_rnp(struct rcu_state *rsp, |
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
853 | bool *isidle, unsigned long *maxj) | 917 | bool *isidle, unsigned long *maxj) |
854 | { | 918 | { |
855 | unsigned int curr; | 919 | unsigned int curr; |
920 | int *rcrmp; | ||
856 | unsigned int snap; | 921 | unsigned int snap; |
857 | 922 | ||
858 | curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); | 923 | curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); |
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
893 | } | 958 | } |
894 | 959 | ||
895 | /* | 960 | /* |
896 | * There is a possibility that a CPU in adaptive-ticks state | 961 | * A CPU running for an extended time within the kernel can |
897 | * might run in the kernel with the scheduling-clock tick disabled | 962 | * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, |
898 | * for an extended time period. Invoke rcu_kick_nohz_cpu() to | 963 | * even context-switching back and forth between a pair of |
899 | * force the CPU to restart the scheduling-clock tick in this | 964 | * in-kernel CPU-bound tasks cannot advance grace periods. |
900 | * CPU is in this state. | 965 | * So if the grace period is old enough, make the CPU pay attention. |
901 | */ | 966 | * Note that the unsynchronized assignments to the per-CPU |
902 | rcu_kick_nohz_cpu(rdp->cpu); | 967 | * rcu_sched_qs_mask variable are safe. Yes, setting of |
903 | 968 | * bits can be lost, but they will be set again on the next | |
904 | /* | 969 | * force-quiescent-state pass. So lost bit sets do not result |
905 | * Alternatively, the CPU might be running in the kernel | 970 | * in incorrect behavior, merely in a grace period lasting |
906 | * for an extended period of time without a quiescent state. | 971 | * a few jiffies longer than it might otherwise. Because |
907 | * Attempt to force the CPU through the scheduler to gain the | 972 | * there are at most four threads involved, and because the |
908 | * needed quiescent state, but only if the grace period has gone | 973 | * updates are only once every few jiffies, the probability of |
909 | * on for an uncommonly long time. If there are many stuck CPUs, | 974 | * lossage (and thus of slight grace-period extension) is |
910 | * we will beat on the first one until it gets unstuck, then move | 975 | * quite low. |
911 | * to the next. Only do this for the primary flavor of RCU. | 976 | * |
977 | * Note that if the jiffies_till_sched_qs boot/sysfs parameter | ||
978 | * is set too high, we override with half of the RCU CPU stall | ||
979 | * warning delay. | ||
912 | */ | 980 | */ |
913 | if (rdp->rsp == rcu_state_p && | 981 | rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); |
982 | if (ULONG_CMP_GE(jiffies, | ||
983 | rdp->rsp->gp_start + jiffies_till_sched_qs) || | ||
914 | ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { | 984 | ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { |
915 | rdp->rsp->jiffies_resched += 5; | 985 | if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { |
916 | resched_cpu(rdp->cpu); | 986 | ACCESS_ONCE(rdp->cond_resched_completed) = |
987 | ACCESS_ONCE(rdp->mynode->completed); | ||
988 | smp_mb(); /* ->cond_resched_completed before *rcrmp. */ | ||
989 | ACCESS_ONCE(*rcrmp) = | ||
990 | ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask; | ||
991 | resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ | ||
992 | rdp->rsp->jiffies_resched += 5; /* Enable beating. */ | ||
993 | } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { | ||
994 | /* Time to beat on that CPU again! */ | ||
995 | resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ | ||
996 | rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ | ||
997 | } | ||
917 | } | 998 | } |
918 | 999 | ||
919 | return 0; | 1000 | return 0; |
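Two small tricks carry the hunk above: unsynchronized writes to the per-CPU rcu_sched_qs_mask are tolerated because a lost bit simply gets set again on the next force-quiescent-state pass, and every "is the grace period old enough?" test uses a wraparound-safe jiffies comparison (ULONG_CMP_GE). Below is a sketch of that wraparound-safe comparison for an unsigned tick counter that may roll over; the function name and tick values are illustrative.

    #include <limits.h>
    #include <stdio.h>

    /* Wraparound-safe "a is at or after b" for an unsigned tick counter. */
    static int ulong_cmp_ge(unsigned long a, unsigned long b)
    {
            return a - b <= ULONG_MAX / 2;
    }

    int main(void)
    {
            unsigned long gp_start = ULONG_MAX - 4;   /* just before the counter wraps */
            unsigned long now      = 3;               /* 8 ticks later, after the wrap */

            printf("%d\n", ulong_cmp_ge(now, gp_start));        /* 1: now >= gp_start */
            printf("%d\n", ulong_cmp_ge(now, gp_start + 20));   /* 0: deadline still ahead */
            return 0;
    }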
@@ -3491,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3491 | "rcu_node_fqs_1", | 3572 | "rcu_node_fqs_1", |
3492 | "rcu_node_fqs_2", | 3573 | "rcu_node_fqs_2", |
3493 | "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ | 3574 | "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ |
3575 | static u8 fl_mask = 0x1; | ||
3494 | int cpustride = 1; | 3576 | int cpustride = 1; |
3495 | int i; | 3577 | int i; |
3496 | int j; | 3578 | int j; |
@@ -3509,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3509 | for (i = 1; i < rcu_num_lvls; i++) | 3591 | for (i = 1; i < rcu_num_lvls; i++) |
3510 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; | 3592 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; |
3511 | rcu_init_levelspread(rsp); | 3593 | rcu_init_levelspread(rsp); |
3594 | rsp->flavor_mask = fl_mask; | ||
3595 | fl_mask <<= 1; | ||
3512 | 3596 | ||
3513 | /* Initialize the elements themselves, starting from the leaves. */ | 3597 | /* Initialize the elements themselves, starting from the leaves. */ |
3514 | 3598 | ||
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bf2c1e669691..0f69a79c5b7d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -307,6 +307,9 @@ struct rcu_data { | |||
307 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ | 307 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ |
308 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ | 308 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ |
309 | unsigned long offline_fqs; /* Kicked due to being offline. */ | 309 | unsigned long offline_fqs; /* Kicked due to being offline. */ |
310 | unsigned long cond_resched_completed; | ||
311 | /* Grace period that needs help */ | ||
312 | /* from cond_resched(). */ | ||
310 | 313 | ||
311 | /* 5) __rcu_pending() statistics. */ | 314 | /* 5) __rcu_pending() statistics. */ |
312 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ | 315 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ |
@@ -392,6 +395,7 @@ struct rcu_state { | |||
392 | struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ | 395 | struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ |
393 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ | 396 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ |
394 | u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ | 397 | u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ |
398 | u8 flavor_mask; /* bit in flavor mask. */ | ||
395 | struct rcu_data __percpu *rda; /* pointer to per-cpu rcu_data. */ | 399 | struct rcu_data __percpu *rda; /* pointer to per-cpu rcu_data. */ |
396 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | 400 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ |
397 | void (*func)(struct rcu_head *head)); | 401 | void (*func)(struct rcu_head *head)); |
@@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); | |||
563 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp); | 567 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp); |
564 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | 568 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); |
565 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 569 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); |
566 | static void rcu_kick_nohz_cpu(int cpu); | 570 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu); |
567 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 571 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
568 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); | 572 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); |
569 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); | 573 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index cbc2c45265e2..02ac0fb186b8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) | |||
2404 | * if an adaptive-ticks CPU is failing to respond to the current grace | 2404 | * if an adaptive-ticks CPU is failing to respond to the current grace |
2405 | * period and has not been idle from an RCU perspective, kick it. | 2405 | * period and has not been idle from an RCU perspective, kick it. |
2406 | */ | 2406 | */ |
2407 | static void rcu_kick_nohz_cpu(int cpu) | 2407 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu) |
2408 | { | 2408 | { |
2409 | #ifdef CONFIG_NO_HZ_FULL | 2409 | #ifdef CONFIG_NO_HZ_FULL |
2410 | if (tick_nohz_full_cpu(cpu)) | 2410 | if (tick_nohz_full_cpu(cpu)) |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a2aeb4df0f60..bc7883570530 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -200,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf) | |||
200 | EXPORT_SYMBOL_GPL(wait_rcu_gp); | 200 | EXPORT_SYMBOL_GPL(wait_rcu_gp); |
201 | 201 | ||
202 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | 202 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD |
203 | static inline void debug_init_rcu_head(struct rcu_head *head) | 203 | void init_rcu_head(struct rcu_head *head) |
204 | { | 204 | { |
205 | debug_object_init(head, &rcuhead_debug_descr); | 205 | debug_object_init(head, &rcuhead_debug_descr); |
206 | } | 206 | } |
207 | 207 | ||
208 | static inline void debug_rcu_head_free(struct rcu_head *head) | 208 | void destroy_rcu_head(struct rcu_head *head) |
209 | { | 209 | { |
210 | debug_object_free(head, &rcuhead_debug_descr); | 210 | debug_object_free(head, &rcuhead_debug_descr); |
211 | } | 211 | } |
@@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void) | |||
350 | early_initcall(check_cpu_stall_init); | 350 | early_initcall(check_cpu_stall_init); |
351 | 351 | ||
352 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | 352 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ |
353 | |||
354 | /* | ||
355 | * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. | ||
356 | */ | ||
357 | |||
358 | DEFINE_PER_CPU(int, rcu_cond_resched_count); | ||
359 | |||
360 | /* | ||
361 | * Report a set of RCU quiescent states, for use by cond_resched() | ||
362 | * and friends. Out of line due to being called infrequently. | ||
363 | */ | ||
364 | void rcu_resched(void) | ||
365 | { | ||
366 | preempt_disable(); | ||
367 | __this_cpu_write(rcu_cond_resched_count, 0); | ||
368 | rcu_note_context_switch(smp_processor_id()); | ||
369 | preempt_enable(); | ||
370 | } | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3bdf01b494fe..bc1638b33449 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -4147,7 +4147,6 @@ static void __cond_resched(void) | |||
4147 | 4147 | ||
4148 | int __sched _cond_resched(void) | 4148 | int __sched _cond_resched(void) |
4149 | { | 4149 | { |
4150 | rcu_cond_resched(); | ||
4151 | if (should_resched()) { | 4150 | if (should_resched()) { |
4152 | __cond_resched(); | 4151 | __cond_resched(); |
4153 | return 1; | 4152 | return 1; |
@@ -4166,18 +4165,15 @@ EXPORT_SYMBOL(_cond_resched); | |||
4166 | */ | 4165 | */ |
4167 | int __cond_resched_lock(spinlock_t *lock) | 4166 | int __cond_resched_lock(spinlock_t *lock) |
4168 | { | 4167 | { |
4169 | bool need_rcu_resched = rcu_should_resched(); | ||
4170 | int resched = should_resched(); | 4168 | int resched = should_resched(); |
4171 | int ret = 0; | 4169 | int ret = 0; |
4172 | 4170 | ||
4173 | lockdep_assert_held(lock); | 4171 | lockdep_assert_held(lock); |
4174 | 4172 | ||
4175 | if (spin_needbreak(lock) || resched || need_rcu_resched) { | 4173 | if (spin_needbreak(lock) || resched) { |
4176 | spin_unlock(lock); | 4174 | spin_unlock(lock); |
4177 | if (resched) | 4175 | if (resched) |
4178 | __cond_resched(); | 4176 | __cond_resched(); |
4179 | else if (unlikely(need_rcu_resched)) | ||
4180 | rcu_resched(); | ||
4181 | else | 4177 | else |
4182 | cpu_relax(); | 4178 | cpu_relax(); |
4183 | ret = 1; | 4179 | ret = 1; |
@@ -4191,7 +4187,6 @@ int __sched __cond_resched_softirq(void) | |||
4191 | { | 4187 | { |
4192 | BUG_ON(!in_softirq()); | 4188 | BUG_ON(!in_softirq()); |
4193 | 4189 | ||
4194 | rcu_cond_resched(); /* BH disabled OK, just recording QSes. */ | ||
4195 | if (should_resched()) { | 4190 | if (should_resched()) { |
4196 | local_bh_enable(); | 4191 | local_bh_enable(); |
4197 | __cond_resched(); | 4192 | __cond_resched(); |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 695f9773bb60..627b3c34b821 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
608 | 608 | ||
609 | avg_atom = p->se.sum_exec_runtime; | 609 | avg_atom = p->se.sum_exec_runtime; |
610 | if (nr_switches) | 610 | if (nr_switches) |
611 | do_div(avg_atom, nr_switches); | 611 | avg_atom = div64_ul(avg_atom, nr_switches); |
612 | else | 612 | else |
613 | avg_atom = -1LL; | 613 | avg_atom = -1LL; |
614 | 614 | ||
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cf009fb0bc25..658a58dc30f4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -147,8 +147,6 @@ use_default: | |||
147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | 147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) |
148 | goto use_default; | 148 | goto use_default; |
149 | 149 | ||
150 | trace_cpu_idle_rcuidle(next_state, dev->cpu); | ||
151 | |||
152 | /* | 150 | /* |
153 | * Enter the idle state previously returned by the governor decision. | 151 | * Enter the idle state previously returned by the governor decision. |
154 | * This function will block until an interrupt occurs and will take | 152 | * This function will block until an interrupt occurs and will take |
@@ -156,8 +154,6 @@ use_default: | |||
156 | */ | 154 | */ |
157 | entered_state = cpuidle_enter(drv, dev, next_state); | 155 | entered_state = cpuidle_enter(drv, dev, next_state); |
158 | 156 | ||
159 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); | ||
160 | |||
161 | if (broadcast) | 157 | if (broadcast) |
162 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 158 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); |
163 | 159 | ||
diff --git a/kernel/smp.c b/kernel/smp.c index 306f8180b0d5..80c33f8de14f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); | |||
29 | 29 | ||
30 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); | 30 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); |
31 | 31 | ||
32 | static void flush_smp_call_function_queue(bool warn_cpu_offline); | ||
33 | |||
32 | static int | 34 | static int |
33 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | 35 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) |
34 | { | 36 | { |
@@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
51 | #ifdef CONFIG_HOTPLUG_CPU | 53 | #ifdef CONFIG_HOTPLUG_CPU |
52 | case CPU_UP_CANCELED: | 54 | case CPU_UP_CANCELED: |
53 | case CPU_UP_CANCELED_FROZEN: | 55 | case CPU_UP_CANCELED_FROZEN: |
56 | /* Fall-through to the CPU_DEAD[_FROZEN] case. */ | ||
54 | 57 | ||
55 | case CPU_DEAD: | 58 | case CPU_DEAD: |
56 | case CPU_DEAD_FROZEN: | 59 | case CPU_DEAD_FROZEN: |
57 | free_cpumask_var(cfd->cpumask); | 60 | free_cpumask_var(cfd->cpumask); |
58 | free_percpu(cfd->csd); | 61 | free_percpu(cfd->csd); |
59 | break; | 62 | break; |
63 | |||
64 | case CPU_DYING: | ||
65 | case CPU_DYING_FROZEN: | ||
66 | /* | ||
67 | * The IPIs for the smp-call-function callbacks queued by other | ||
68 | * CPUs might arrive late, either due to hardware latencies or | ||
69 | * because this CPU disabled interrupts (inside stop-machine) | ||
70 | * before the IPIs were sent. So flush out any pending callbacks | ||
71 | * explicitly (without waiting for the IPIs to arrive), to | ||
72 | * ensure that the outgoing CPU doesn't go offline with work | ||
73 | * still pending. | ||
74 | */ | ||
75 | flush_smp_call_function_queue(false); | ||
76 | break; | ||
60 | #endif | 77 | #endif |
61 | }; | 78 | }; |
62 | 79 | ||
@@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, | |||
177 | return 0; | 194 | return 0; |
178 | } | 195 | } |
179 | 196 | ||
180 | /* | 197 | /** |
181 | * Invoked by arch to handle an IPI for call function single. Must be | 198 | * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks |
182 | * called from the arch with interrupts disabled. | 199 | * |
200 | * Invoked by arch to handle an IPI for call function single. | ||
201 | * Must be called with interrupts disabled. | ||
183 | */ | 202 | */ |
184 | void generic_smp_call_function_single_interrupt(void) | 203 | void generic_smp_call_function_single_interrupt(void) |
185 | { | 204 | { |
205 | flush_smp_call_function_queue(true); | ||
206 | } | ||
207 | |||
208 | /** | ||
209 | * flush_smp_call_function_queue - Flush pending smp-call-function callbacks | ||
210 | * | ||
211 | * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an | ||
212 | * offline CPU. Skip this check if set to 'false'. | ||
213 | * | ||
214 | * Flush any pending smp-call-function callbacks queued on this CPU. This is | ||
215 | * invoked by the generic IPI handler, as well as by a CPU about to go offline, | ||
216 | * to ensure that all pending IPI callbacks are run before it goes completely | ||
217 | * offline. | ||
218 | * | ||
219 | * Loop through the call_single_queue and run all the queued callbacks. | ||
220 | * Must be called with interrupts disabled. | ||
221 | */ | ||
222 | static void flush_smp_call_function_queue(bool warn_cpu_offline) | ||
223 | { | ||
224 | struct llist_head *head; | ||
186 | struct llist_node *entry; | 225 | struct llist_node *entry; |
187 | struct call_single_data *csd, *csd_next; | 226 | struct call_single_data *csd, *csd_next; |
188 | static bool warned; | 227 | static bool warned; |
189 | 228 | ||
190 | entry = llist_del_all(&__get_cpu_var(call_single_queue)); | 229 | WARN_ON(!irqs_disabled()); |
230 | |||
231 | head = &__get_cpu_var(call_single_queue); | ||
232 | entry = llist_del_all(head); | ||
191 | entry = llist_reverse_order(entry); | 233 | entry = llist_reverse_order(entry); |
192 | 234 | ||
193 | /* | 235 | /* There shouldn't be any pending callbacks on an offline CPU. */ |
194 | * Shouldn't receive this interrupt on a cpu that is not yet online. | 236 | if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && |
195 | */ | 237 | !warned && !llist_empty(head))) { |
196 | if (unlikely(!cpu_online(smp_processor_id()) && !warned)) { | ||
197 | warned = true; | 238 | warned = true; |
198 | WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); | 239 | WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); |
199 | 240 | ||
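flush_smp_call_function_queue() above drains a per-CPU lock-free list: producers push entries with a single atomic operation, the consumer steals the whole list at once (llist_del_all) and reverses it to recover FIFO order before running the callbacks. A self-contained C11 sketch of that push / del-all / reverse pattern; it is a simplified stand-in for the kernel's llist API, not a reimplementation of it.

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    struct node { int value; struct node *next; };

    static _Atomic(struct node *) queue_head;   /* NULL when empty */

    /* Producer: push onto the head; newest entries end up first (LIFO). */
    static void push(struct node *n)
    {
            struct node *old = atomic_load(&queue_head);
            do {
                    n->next = old;
            } while (!atomic_compare_exchange_weak(&queue_head, &old, n));
    }

    /* Consumer: atomically steal the whole list, then reverse it into FIFO order. */
    static struct node *del_all_fifo(void)
    {
            struct node *cur = atomic_exchange(&queue_head, NULL);
            struct node *fifo = NULL;

            while (cur) {
                    struct node *next = cur->next;
                    cur->next = fifo;
                    fifo = cur;
                    cur = next;
            }
            return fifo;
    }

    int main(void)
    {
            struct node a = { .value = 1 }, b = { .value = 2 }, c = { .value = 3 };

            push(&a); push(&b); push(&c);
            for (struct node *n = del_all_fifo(); n; n = n->next)
                    printf("%d\n", n->value);   /* prints 1 2 3: submission order */
            return 0;
    }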
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba9ed453c4ed..75b22e22a72c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; | |||
136 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | 136 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
137 | static int maxolduid = 65535; | 137 | static int maxolduid = 65535; |
138 | static int minolduid; | 138 | static int minolduid; |
139 | static int min_percpu_pagelist_fract = 8; | ||
140 | 139 | ||
141 | static int ngroups_max = NGROUPS_MAX; | 140 | static int ngroups_max = NGROUPS_MAX; |
142 | static const int cap_last_cap = CAP_LAST_CAP; | 141 | static const int cap_last_cap = CAP_LAST_CAP; |
@@ -152,10 +151,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); | |||
152 | #ifdef CONFIG_SPARC | 151 | #ifdef CONFIG_SPARC |
153 | #endif | 152 | #endif |
154 | 153 | ||
155 | #ifdef CONFIG_SPARC64 | ||
156 | extern int sysctl_tsb_ratio; | ||
157 | #endif | ||
158 | |||
159 | #ifdef __hppa__ | 154 | #ifdef __hppa__ |
160 | extern int pwrsw_enabled; | 155 | extern int pwrsw_enabled; |
161 | #endif | 156 | #endif |
@@ -865,6 +860,17 @@ static struct ctl_table kern_table[] = { | |||
865 | .extra1 = &zero, | 860 | .extra1 = &zero, |
866 | .extra2 = &one, | 861 | .extra2 = &one, |
867 | }, | 862 | }, |
863 | #ifdef CONFIG_SMP | ||
864 | { | ||
865 | .procname = "softlockup_all_cpu_backtrace", | ||
866 | .data = &sysctl_softlockup_all_cpu_backtrace, | ||
867 | .maxlen = sizeof(int), | ||
868 | .mode = 0644, | ||
869 | .proc_handler = proc_dointvec_minmax, | ||
870 | .extra1 = &zero, | ||
871 | .extra2 = &one, | ||
872 | }, | ||
873 | #endif /* CONFIG_SMP */ | ||
868 | { | 874 | { |
869 | .procname = "nmi_watchdog", | 875 | .procname = "nmi_watchdog", |
870 | .data = &watchdog_user_enabled, | 876 | .data = &watchdog_user_enabled, |
@@ -1321,7 +1327,7 @@ static struct ctl_table vm_table[] = { | |||
1321 | .maxlen = sizeof(percpu_pagelist_fraction), | 1327 | .maxlen = sizeof(percpu_pagelist_fraction), |
1322 | .mode = 0644, | 1328 | .mode = 0644, |
1323 | .proc_handler = percpu_pagelist_fraction_sysctl_handler, | 1329 | .proc_handler = percpu_pagelist_fraction_sysctl_handler, |
1324 | .extra1 = &min_percpu_pagelist_fract, | 1330 | .extra1 = &zero, |
1325 | }, | 1331 | }, |
1326 | #ifdef CONFIG_MMU | 1332 | #ifdef CONFIG_MMU |
1327 | { | 1333 | { |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 88c9c65a430d..fe75444ae7ec 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, | |||
585 | struct itimerspec *new_setting, | 585 | struct itimerspec *new_setting, |
586 | struct itimerspec *old_setting) | 586 | struct itimerspec *old_setting) |
587 | { | 587 | { |
588 | ktime_t exp; | ||
589 | |||
588 | if (!rtcdev) | 590 | if (!rtcdev) |
589 | return -ENOTSUPP; | 591 | return -ENOTSUPP; |
590 | 592 | ||
593 | if (flags & ~TIMER_ABSTIME) | ||
594 | return -EINVAL; | ||
595 | |||
591 | if (old_setting) | 596 | if (old_setting) |
592 | alarm_timer_get(timr, old_setting); | 597 | alarm_timer_get(timr, old_setting); |
593 | 598 | ||
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, | |||
597 | 602 | ||
598 | /* start the timer */ | 603 | /* start the timer */ |
599 | timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); | 604 | timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); |
600 | alarm_start(&timr->it.alarm.alarmtimer, | 605 | exp = timespec_to_ktime(new_setting->it_value); |
601 | timespec_to_ktime(new_setting->it_value)); | 606 | /* Convert (if necessary) to absolute time */ |
607 | if (flags != TIMER_ABSTIME) { | ||
608 | ktime_t now; | ||
609 | |||
610 | now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); | ||
611 | exp = ktime_add(now, exp); | ||
612 | } | ||
613 | |||
614 | alarm_start(&timr->it.alarm.alarmtimer, exp); | ||
602 | return 0; | 615 | return 0; |
603 | } | 616 | } |
604 | 617 | ||
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, | |||
730 | if (!alarmtimer_get_rtcdev()) | 743 | if (!alarmtimer_get_rtcdev()) |
731 | return -ENOTSUPP; | 744 | return -ENOTSUPP; |
732 | 745 | ||
746 | if (flags & ~TIMER_ABSTIME) | ||
747 | return -EINVAL; | ||
748 | |||
733 | if (!capable(CAP_WAKE_ALARM)) | 749 | if (!capable(CAP_WAKE_ALARM)) |
734 | return -EPERM; | 750 | return -EPERM; |
735 | 751 | ||
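The alarmtimer hunks apply two small patterns: reject any flag bits the interface does not understand (flags & ~TIMER_ABSTIME), and convert a relative expiry into an absolute one by adding "now" read from the same clock the timer will later be compared against. A userspace sketch of the conversion, assuming CLOCK_MONOTONIC and plain timespec arithmetic; the flag constant and function name are invented for illustration.

    #include <stdio.h>
    #include <time.h>

    #define MY_TIMER_ABSTIME 0x01   /* illustrative flag value */

    /* Turn a possibly-relative expiry into an absolute CLOCK_MONOTONIC time. */
    static int expiry_to_absolute(int flags, struct timespec *exp)
    {
            if (flags & ~MY_TIMER_ABSTIME)
                    return -1;                        /* unknown flag bits: refuse */

            if (!(flags & MY_TIMER_ABSTIME)) {
                    struct timespec now;

                    clock_gettime(CLOCK_MONOTONIC, &now);
                    exp->tv_sec  += now.tv_sec;
                    exp->tv_nsec += now.tv_nsec;
                    if (exp->tv_nsec >= 1000000000L) {   /* carry into seconds */
                            exp->tv_sec++;
                            exp->tv_nsec -= 1000000000L;
                    }
            }
            return 0;
    }

    int main(void)
    {
            struct timespec exp = { .tv_sec = 2, .tv_nsec = 0 };   /* "2s from now" */

            if (expiry_to_absolute(0, &exp) == 0)
                    printf("fires at %ld.%09ld\n", (long)exp.tv_sec, exp.tv_nsec);
            return 0;
    }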
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b372e3ed675..ac9d1dad630b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -265,12 +265,12 @@ static void update_ftrace_function(void) | |||
265 | func = ftrace_ops_list_func; | 265 | func = ftrace_ops_list_func; |
266 | } | 266 | } |
267 | 267 | ||
268 | update_function_graph_func(); | ||
269 | |||
268 | /* If there's no change, then do nothing more here */ | 270 | /* If there's no change, then do nothing more here */ |
269 | if (ftrace_trace_function == func) | 271 | if (ftrace_trace_function == func) |
270 | return; | 272 | return; |
271 | 273 | ||
272 | update_function_graph_func(); | ||
273 | |||
274 | /* | 274 | /* |
275 | * If we are using the list function, it doesn't care | 275 | * If we are using the list function, it doesn't care |
276 | * about the function_trace_ops. | 276 | * about the function_trace_ops. |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7c56c3d06943..ff7027199a9a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -616,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, | |||
616 | struct ring_buffer_per_cpu *cpu_buffer; | 616 | struct ring_buffer_per_cpu *cpu_buffer; |
617 | struct rb_irq_work *work; | 617 | struct rb_irq_work *work; |
618 | 618 | ||
619 | if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || | ||
620 | (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) | ||
621 | return POLLIN | POLLRDNORM; | ||
622 | |||
623 | if (cpu == RING_BUFFER_ALL_CPUS) | 619 | if (cpu == RING_BUFFER_ALL_CPUS) |
624 | work = &buffer->irq_work; | 620 | work = &buffer->irq_work; |
625 | else { | 621 | else { |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 384ede311717..291397e66669 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -466,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size) | |||
466 | struct print_entry *entry; | 466 | struct print_entry *entry; |
467 | unsigned long irq_flags; | 467 | unsigned long irq_flags; |
468 | int alloc; | 468 | int alloc; |
469 | int pc; | ||
470 | |||
471 | if (!(trace_flags & TRACE_ITER_PRINTK)) | ||
472 | return 0; | ||
473 | |||
474 | pc = preempt_count(); | ||
469 | 475 | ||
470 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 476 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
471 | return 0; | 477 | return 0; |
@@ -475,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) | |||
475 | local_save_flags(irq_flags); | 481 | local_save_flags(irq_flags); |
476 | buffer = global_trace.trace_buffer.buffer; | 482 | buffer = global_trace.trace_buffer.buffer; |
477 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, | 483 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, |
478 | irq_flags, preempt_count()); | 484 | irq_flags, pc); |
479 | if (!event) | 485 | if (!event) |
480 | return 0; | 486 | return 0; |
481 | 487 | ||
@@ -492,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) | |||
492 | entry->buf[size] = '\0'; | 498 | entry->buf[size] = '\0'; |
493 | 499 | ||
494 | __buffer_unlock_commit(buffer, event); | 500 | __buffer_unlock_commit(buffer, event); |
501 | ftrace_trace_stack(buffer, irq_flags, 4, pc); | ||
495 | 502 | ||
496 | return size; | 503 | return size; |
497 | } | 504 | } |
@@ -509,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str) | |||
509 | struct bputs_entry *entry; | 516 | struct bputs_entry *entry; |
510 | unsigned long irq_flags; | 517 | unsigned long irq_flags; |
511 | int size = sizeof(struct bputs_entry); | 518 | int size = sizeof(struct bputs_entry); |
519 | int pc; | ||
520 | |||
521 | if (!(trace_flags & TRACE_ITER_PRINTK)) | ||
522 | return 0; | ||
523 | |||
524 | pc = preempt_count(); | ||
512 | 525 | ||
513 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 526 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
514 | return 0; | 527 | return 0; |
@@ -516,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str) | |||
516 | local_save_flags(irq_flags); | 529 | local_save_flags(irq_flags); |
517 | buffer = global_trace.trace_buffer.buffer; | 530 | buffer = global_trace.trace_buffer.buffer; |
518 | event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, | 531 | event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, |
519 | irq_flags, preempt_count()); | 532 | irq_flags, pc); |
520 | if (!event) | 533 | if (!event) |
521 | return 0; | 534 | return 0; |
522 | 535 | ||
@@ -525,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str) | |||
525 | entry->str = str; | 538 | entry->str = str; |
526 | 539 | ||
527 | __buffer_unlock_commit(buffer, event); | 540 | __buffer_unlock_commit(buffer, event); |
541 | ftrace_trace_stack(buffer, irq_flags, 4, pc); | ||
528 | 542 | ||
529 | return 1; | 543 | return 1; |
530 | } | 544 | } |
@@ -809,7 +823,7 @@ static struct { | |||
809 | { trace_clock_local, "local", 1 }, | 823 | { trace_clock_local, "local", 1 }, |
810 | { trace_clock_global, "global", 1 }, | 824 | { trace_clock_global, "global", 1 }, |
811 | { trace_clock_counter, "counter", 0 }, | 825 | { trace_clock_counter, "counter", 0 }, |
812 | { trace_clock_jiffies, "uptime", 1 }, | 826 | { trace_clock_jiffies, "uptime", 0 }, |
813 | { trace_clock, "perf", 1 }, | 827 | { trace_clock, "perf", 1 }, |
814 | ARCH_TRACE_CLOCKS | 828 | ARCH_TRACE_CLOCKS |
815 | }; | 829 | }; |
@@ -1396,7 +1410,6 @@ void tracing_start(void) | |||
1396 | 1410 | ||
1397 | arch_spin_unlock(&global_trace.max_lock); | 1411 | arch_spin_unlock(&global_trace.max_lock); |
1398 | 1412 | ||
1399 | ftrace_start(); | ||
1400 | out: | 1413 | out: |
1401 | raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); | 1414 | raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); |
1402 | } | 1415 | } |
@@ -1443,7 +1456,6 @@ void tracing_stop(void) | |||
1443 | struct ring_buffer *buffer; | 1456 | struct ring_buffer *buffer; |
1444 | unsigned long flags; | 1457 | unsigned long flags; |
1445 | 1458 | ||
1446 | ftrace_stop(); | ||
1447 | raw_spin_lock_irqsave(&global_trace.start_lock, flags); | 1459 | raw_spin_lock_irqsave(&global_trace.start_lock, flags); |
1448 | if (global_trace.stop_count++) | 1460 | if (global_trace.stop_count++) |
1449 | goto out; | 1461 | goto out; |
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 26dc348332b7..57b67b1f24d1 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void) | |||
59 | 59 | ||
60 | /* | 60 | /* |
61 | * trace_jiffy_clock(): Simply use jiffies as a clock counter. | 61 | * trace_jiffy_clock(): Simply use jiffies as a clock counter. |
62 | * Note that this use of jiffies_64 is not completely safe on | ||
63 | * 32-bit systems. But the window is tiny, and the effect if | ||
64 | * we are affected is that we will have an obviously bogus | ||
65 | * timestamp on a trace event - i.e. not life threatening. | ||
62 | */ | 66 | */ |
63 | u64 notrace trace_clock_jiffies(void) | 67 | u64 notrace trace_clock_jiffies(void) |
64 | { | 68 | { |
65 | u64 jiffy = jiffies - INITIAL_JIFFIES; | 69 | return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); |
66 | |||
67 | /* Return nsecs */ | ||
68 | return (u64)jiffies_to_usecs(jiffy) * 1000ULL; | ||
69 | } | 70 | } |
70 | 71 | ||
71 | /* | 72 | /* |
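The trace_clock_jiffies() change above returns the raw 64-bit tick count scaled to clock_t-style units instead of scaling it to nanoseconds, and the new comment accepts that the unlocked 64-bit read can tear on 32-bit machines, producing at worst one obviously bogus timestamp. A sketch of the tick-to-clock_t scaling in the spirit of jiffies_64_to_clock_t(); the HZ and USER_HZ values are chosen purely for illustration.

    #include <stdint.h>
    #include <stdio.h>

    #define TICK_HZ   250   /* internal tick rate (illustrative) */
    #define USER_HZ   100   /* tick rate exposed to userspace (illustrative) */

    /* Scale an internal tick count to USER_HZ units. */
    static uint64_t ticks_to_clock_t(uint64_t ticks)
    {
            /* Multiply before dividing to keep precision; 64 bits is plenty here. */
            return ticks * USER_HZ / TICK_HZ;
    }

    int main(void)
    {
            uint64_t uptime_ticks = 12345;   /* e.g. jiffies_64 - INITIAL_JIFFIES */

            printf("%llu clock_t units\n",
                   (unsigned long long)ticks_to_clock_t(uptime_ticks));   /* 4938 */
            return 0;
    }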
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f99e0b3bca8c..2de53628689f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -470,6 +470,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) | |||
470 | 470 | ||
471 | list_del(&file->list); | 471 | list_del(&file->list); |
472 | remove_subsystem(file->system); | 472 | remove_subsystem(file->system); |
473 | free_event_filter(file->filter); | ||
473 | kmem_cache_free(file_cachep, file); | 474 | kmem_cache_free(file_cachep, file); |
474 | } | 475 | } |
475 | 476 | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 04fdb5de823c..3c9b97e6b1f4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -893,6 +893,9 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, | |||
893 | int ret; | 893 | int ret; |
894 | 894 | ||
895 | if (file) { | 895 | if (file) { |
896 | if (tu->tp.flags & TP_FLAG_PROFILE) | ||
897 | return -EINTR; | ||
898 | |||
896 | link = kmalloc(sizeof(*link), GFP_KERNEL); | 899 | link = kmalloc(sizeof(*link), GFP_KERNEL); |
897 | if (!link) | 900 | if (!link) |
898 | return -ENOMEM; | 901 | return -ENOMEM; |
@@ -901,29 +904,40 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, | |||
901 | list_add_tail_rcu(&link->list, &tu->tp.files); | 904 | list_add_tail_rcu(&link->list, &tu->tp.files); |
902 | 905 | ||
903 | tu->tp.flags |= TP_FLAG_TRACE; | 906 | tu->tp.flags |= TP_FLAG_TRACE; |
904 | } else | 907 | } else { |
905 | tu->tp.flags |= TP_FLAG_PROFILE; | 908 | if (tu->tp.flags & TP_FLAG_TRACE) |
909 | return -EINTR; | ||
906 | 910 | ||
907 | ret = uprobe_buffer_enable(); | 911 | tu->tp.flags |= TP_FLAG_PROFILE; |
908 | if (ret < 0) | 912 | } |
909 | return ret; | ||
910 | 913 | ||
911 | WARN_ON(!uprobe_filter_is_empty(&tu->filter)); | 914 | WARN_ON(!uprobe_filter_is_empty(&tu->filter)); |
912 | 915 | ||
913 | if (enabled) | 916 | if (enabled) |
914 | return 0; | 917 | return 0; |
915 | 918 | ||
919 | ret = uprobe_buffer_enable(); | ||
920 | if (ret) | ||
921 | goto err_flags; | ||
922 | |||
916 | tu->consumer.filter = filter; | 923 | tu->consumer.filter = filter; |
917 | ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); | 924 | ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); |
918 | if (ret) { | 925 | if (ret) |
919 | if (file) { | 926 | goto err_buffer; |
920 | list_del(&link->list); | ||
921 | kfree(link); | ||
922 | tu->tp.flags &= ~TP_FLAG_TRACE; | ||
923 | } else | ||
924 | tu->tp.flags &= ~TP_FLAG_PROFILE; | ||
925 | } | ||
926 | 927 | ||
928 | return 0; | ||
929 | |||
930 | err_buffer: | ||
931 | uprobe_buffer_disable(); | ||
932 | |||
933 | err_flags: | ||
934 | if (file) { | ||
935 | list_del(&link->list); | ||
936 | kfree(link); | ||
937 | tu->tp.flags &= ~TP_FLAG_TRACE; | ||
938 | } else { | ||
939 | tu->tp.flags &= ~TP_FLAG_PROFILE; | ||
940 | } | ||
927 | return ret; | 941 | return ret; |
928 | } | 942 | } |
929 | 943 | ||
@@ -1201,12 +1215,6 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) | |||
1201 | 1215 | ||
1202 | current->utask->vaddr = (unsigned long) &udd; | 1216 | current->utask->vaddr = (unsigned long) &udd; |
1203 | 1217 | ||
1204 | #ifdef CONFIG_PERF_EVENTS | ||
1205 | if ((tu->tp.flags & TP_FLAG_TRACE) == 0 && | ||
1206 | !uprobe_perf_filter(&tu->consumer, 0, current->mm)) | ||
1207 | return UPROBE_HANDLER_REMOVE; | ||
1208 | #endif | ||
1209 | |||
1210 | if (WARN_ON_ONCE(!uprobe_cpu_buffer)) | 1218 | if (WARN_ON_ONCE(!uprobe_cpu_buffer)) |
1211 | return 0; | 1219 | return 0; |
1212 | 1220 | ||
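probe_event_enable() above is reshaped into the usual staged-cleanup form: each acquisition gets a matching error label, and a failure late in the sequence unwinds only the stages that actually succeeded. A generic C sketch of that structure with made-up resource names; stage three is forced to fail so the unwind path is exercised.

    #include <stdio.h>
    #include <stdlib.h>

    struct probe { void *buffer; void *link; };

    static int enable_probe(struct probe *p)
    {
            int ret;

            p->link = malloc(32);                 /* stage 1: bookkeeping */
            if (!p->link)
                    return -1;

            p->buffer = malloc(4096);             /* stage 2: per-probe buffer */
            if (!p->buffer) {
                    ret = -1;
                    goto err_link;
            }

            ret = -1;                             /* stage 3: pretend registration fails */
            if (ret)
                    goto err_buffer;

            return 0;

    err_buffer:                                   /* undo stage 2 */
            free(p->buffer);
            p->buffer = NULL;
    err_link:                                     /* undo stage 1 */
            free(p->link);
            p->link = NULL;
            return ret;
    }

    int main(void)
    {
            struct probe p = { 0 };

            printf("enable: %d\n", enable_probe(&p));   /* -1, everything rolled back */
            return 0;
    }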
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 33cbd8c203f8..3490407dc7b7 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -492,33 +492,29 @@ static int sys_tracepoint_refcount; | |||
492 | 492 | ||
493 | void syscall_regfunc(void) | 493 | void syscall_regfunc(void) |
494 | { | 494 | { |
495 | unsigned long flags; | 495 | struct task_struct *p, *t; |
496 | struct task_struct *g, *t; | ||
497 | 496 | ||
498 | if (!sys_tracepoint_refcount) { | 497 | if (!sys_tracepoint_refcount) { |
499 | read_lock_irqsave(&tasklist_lock, flags); | 498 | read_lock(&tasklist_lock); |
500 | do_each_thread(g, t) { | 499 | for_each_process_thread(p, t) { |
501 | /* Skip kernel threads. */ | 500 | set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); |
502 | if (t->mm) | 501 | } |
503 | set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); | 502 | read_unlock(&tasklist_lock); |
504 | } while_each_thread(g, t); | ||
505 | read_unlock_irqrestore(&tasklist_lock, flags); | ||
506 | } | 503 | } |
507 | sys_tracepoint_refcount++; | 504 | sys_tracepoint_refcount++; |
508 | } | 505 | } |
509 | 506 | ||
510 | void syscall_unregfunc(void) | 507 | void syscall_unregfunc(void) |
511 | { | 508 | { |
512 | unsigned long flags; | 509 | struct task_struct *p, *t; |
513 | struct task_struct *g, *t; | ||
514 | 510 | ||
515 | sys_tracepoint_refcount--; | 511 | sys_tracepoint_refcount--; |
516 | if (!sys_tracepoint_refcount) { | 512 | if (!sys_tracepoint_refcount) { |
517 | read_lock_irqsave(&tasklist_lock, flags); | 513 | read_lock(&tasklist_lock); |
518 | do_each_thread(g, t) { | 514 | for_each_process_thread(p, t) { |
519 | clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); | 515 | clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); |
520 | } while_each_thread(g, t); | 516 | } |
521 | read_unlock_irqrestore(&tasklist_lock, flags); | 517 | read_unlock(&tasklist_lock); |
522 | } | 518 | } |
523 | } | 519 | } |
524 | #endif | 520 | #endif |
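syscall_regfunc()/syscall_unregfunc() above keep a plain reference count: only the 0-to-1 transition walks every thread to set the per-task flag, and only the 1-to-0 transition clears it again. A tiny single-threaded sketch of that refcount-gated enable/disable; the task array and flag are stand-ins for the kernel's task list and TIF_SYSCALL_TRACEPOINT.

    #include <stdbool.h>
    #include <stdio.h>

    #define NTASKS 4

    static bool task_flag[NTASKS];    /* stand-in for the per-thread flag */
    static int tracepoint_refcount;

    static void set_all(bool on)
    {
            for (int i = 0; i < NTASKS; i++)
                    task_flag[i] = on;
    }

    static void tracepoint_register(void)
    {
            if (!tracepoint_refcount)     /* only the first user flips the flags */
                    set_all(true);
            tracepoint_refcount++;
    }

    static void tracepoint_unregister(void)
    {
            tracepoint_refcount--;
            if (!tracepoint_refcount)     /* only the last user clears them */
                    set_all(false);
    }

    int main(void)
    {
            tracepoint_register();
            tracepoint_register();
            tracepoint_unregister();
            printf("still enabled: %d\n", task_flag[0]);   /* 1: one user remains */
            tracepoint_unregister();
            printf("now disabled:  %d\n", task_flag[0]);   /* 0 */
            return 0;
    }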
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 516203e665fc..c3319bd1b040 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -31,6 +31,12 @@ | |||
31 | 31 | ||
32 | int watchdog_user_enabled = 1; | 32 | int watchdog_user_enabled = 1; |
33 | int __read_mostly watchdog_thresh = 10; | 33 | int __read_mostly watchdog_thresh = 10; |
34 | #ifdef CONFIG_SMP | ||
35 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | ||
36 | #else | ||
37 | #define sysctl_softlockup_all_cpu_backtrace 0 | ||
38 | #endif | ||
39 | |||
34 | static int __read_mostly watchdog_running; | 40 | static int __read_mostly watchdog_running; |
35 | static u64 __read_mostly sample_period; | 41 | static u64 __read_mostly sample_period; |
36 | 42 | ||
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | |||
47 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | 53 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); |
48 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 54 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
49 | #endif | 55 | #endif |
56 | static unsigned long soft_lockup_nmi_warn; | ||
50 | 57 | ||
51 | /* boot commands */ | 58 | /* boot commands */ |
52 | /* | 59 | /* |
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str) | |||
95 | } | 102 | } |
96 | __setup("nosoftlockup", nosoftlockup_setup); | 103 | __setup("nosoftlockup", nosoftlockup_setup); |
97 | /* */ | 104 | /* */ |
105 | #ifdef CONFIG_SMP | ||
106 | static int __init softlockup_all_cpu_backtrace_setup(char *str) | ||
107 | { | ||
108 | sysctl_softlockup_all_cpu_backtrace = | ||
109 | !!simple_strtol(str, NULL, 0); | ||
110 | return 1; | ||
111 | } | ||
112 | __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); | ||
113 | #endif | ||
98 | 114 | ||
99 | /* | 115 | /* |
100 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- | 116 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- |
@@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
271 | unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); | 287 | unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); |
272 | struct pt_regs *regs = get_irq_regs(); | 288 | struct pt_regs *regs = get_irq_regs(); |
273 | int duration; | 289 | int duration; |
290 | int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; | ||
274 | 291 | ||
275 | /* kick the hardlockup detector */ | 292 | /* kick the hardlockup detector */ |
276 | watchdog_interrupt_count(); | 293 | watchdog_interrupt_count(); |
@@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
317 | if (__this_cpu_read(soft_watchdog_warn) == true) | 334 | if (__this_cpu_read(soft_watchdog_warn) == true) |
318 | return HRTIMER_RESTART; | 335 | return HRTIMER_RESTART; |
319 | 336 | ||
337 | if (softlockup_all_cpu_backtrace) { | ||
338 | /* Prevent multiple soft-lockup reports if one cpu is already | ||
339 | * engaged in dumping cpu back traces | ||
340 | */ | ||
341 | if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { | ||
342 | /* Someone else will report us. Let's give up */ | ||
343 | __this_cpu_write(soft_watchdog_warn, true); | ||
344 | return HRTIMER_RESTART; | ||
345 | } | ||
346 | } | ||
347 | |||
320 | printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | 348 | printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", |
321 | smp_processor_id(), duration, | 349 | smp_processor_id(), duration, |
322 | current->comm, task_pid_nr(current)); | 350 | current->comm, task_pid_nr(current)); |
@@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
327 | else | 355 | else |
328 | dump_stack(); | 356 | dump_stack(); |
329 | 357 | ||
358 | if (softlockup_all_cpu_backtrace) { | ||
359 | /* Avoid generating two back traces for current | ||
360 | * given that one is already made above | ||
361 | */ | ||
362 | trigger_allbutself_cpu_backtrace(); | ||
363 | |||
364 | clear_bit(0, &soft_lockup_nmi_warn); | ||
365 | /* Barrier to sync with other cpus */ | ||
366 | smp_mb__after_atomic(); | ||
367 | } | ||
368 | |||
330 | if (softlockup_panic) | 369 | if (softlockup_panic) |
331 | panic("softlockup: hung tasks"); | 370 | panic("softlockup: hung tasks"); |
332 | __this_cpu_write(soft_watchdog_warn, true); | 371 | __this_cpu_write(soft_watchdog_warn, true); |
@@ -527,10 +566,8 @@ static void update_timers_all_cpus(void) | |||
527 | int cpu; | 566 | int cpu; |
528 | 567 | ||
529 | get_online_cpus(); | 568 | get_online_cpus(); |
530 | preempt_disable(); | ||
531 | for_each_online_cpu(cpu) | 569 | for_each_online_cpu(cpu) |
532 | update_timers(cpu); | 570 | update_timers(cpu); |
533 | preempt_enable(); | ||
534 | put_online_cpus(); | 571 | put_online_cpus(); |
535 | } | 572 | } |
536 | 573 | ||
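The softlockup_all_cpu_backtrace path above elects exactly one reporter with test_and_set_bit(): whoever wins dumps the backtraces for all CPUs, everyone else backs off, and the winner clears the bit when it is done. A C11 sketch of that one-winner election using atomic_flag; the thread bodies are simplified placeholders for the per-CPU watchdog timers (build with -pthread).

    #include <stdatomic.h>
    #include <pthread.h>
    #include <stdio.h>

    static atomic_flag backtrace_in_progress = ATOMIC_FLAG_INIT;

    static void *detect_lockup(void *arg)
    {
            long cpu = (long)arg;

            /* Only the first detector to set the flag reports; the rest give up. */
            if (atomic_flag_test_and_set(&backtrace_in_progress)) {
                    printf("cpu%ld: someone else is already dumping, backing off\n", cpu);
                    return NULL;
            }

            printf("cpu%ld: dumping backtraces for all CPUs\n", cpu);
            atomic_flag_clear(&backtrace_in_progress);   /* let future lockups report */
            return NULL;
    }

    int main(void)
    {
            pthread_t t[3];

            for (long i = 0; i < 3; i++)
                    pthread_create(&t[i], NULL, detect_lockup, (void *)i);
            for (int i = 0; i < 3; i++)
                    pthread_join(t[i], NULL);
            return 0;
    }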
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6203d2900877..35974ac69600 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -3284,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) | |||
3284 | } | 3284 | } |
3285 | } | 3285 | } |
3286 | 3286 | ||
3287 | dev_set_uevent_suppress(&wq_dev->dev, false); | ||
3287 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | 3288 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); |
3288 | return 0; | 3289 | return 0; |
3289 | } | 3290 | } |
@@ -4879,7 +4880,7 @@ static void __init wq_numa_init(void) | |||
4879 | BUG_ON(!tbl); | 4880 | BUG_ON(!tbl); |
4880 | 4881 | ||
4881 | for_each_node(node) | 4882 | for_each_node(node) |
4882 | BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, | 4883 | BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, |
4883 | node_online(node) ? node : NUMA_NO_NODE)); | 4884 | node_online(node) ? node : NUMA_NO_NODE)); |
4884 | 4885 | ||
4885 | for_each_possible_cpu(cpu) { | 4886 | for_each_possible_cpu(cpu) { |