Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 2
-rw-r--r--  kernel/Makefile | 7
-rw-r--r--  kernel/auditsc.c | 27
-rw-r--r--  kernel/capability.c | 4
-rw-r--r--  kernel/cgroup.c | 614
-rw-r--r--  kernel/cgroup_freezer.c | 26
-rw-r--r--  kernel/compat.c | 55
-rw-r--r--  kernel/cpuset.c | 109
-rw-r--r--  kernel/cred.c | 8
-rw-r--r--  kernel/events/Makefile | 6
-rw-r--r--  kernel/events/core.c (renamed from kernel/perf_event.c) | 92
-rw-r--r--  kernel/events/hw_breakpoint.c (renamed from kernel/hw_breakpoint.c) | 0
-rw-r--r--  kernel/exit.c | 141
-rw-r--r--  kernel/extable.c | 18
-rw-r--r--  kernel/fork.c | 97
-rw-r--r--  kernel/freezer.c | 4
-rw-r--r--  kernel/gcov/Kconfig | 3
-rw-r--r--  kernel/hrtimer.c | 164
-rw-r--r--  kernel/hung_task.c | 2
-rw-r--r--  kernel/irq/Kconfig | 4
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/chip.c | 3
-rw-r--r--  kernel/irq/debug.h | 1
-rw-r--r--  kernel/irq/generic-chip.c | 368
-rw-r--r--  kernel/irq/handle.c | 6
-rw-r--r--  kernel/irq/irqdesc.c | 36
-rw-r--r--  kernel/irq/manage.c | 30
-rw-r--r--  kernel/irq/proc.c | 55
-rw-r--r--  kernel/irq/settings.h | 17
-rw-r--r--  kernel/irq/spurious.c | 31
-rw-r--r--  kernel/jump_label.c | 551
-rw-r--r--  kernel/kexec.c | 9
-rw-r--r--  kernel/kmod.c | 124
-rw-r--r--  kernel/ksysfs.c | 10
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/lockdep.c | 208
-rw-r--r--  kernel/module.c | 109
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex-debug.h | 2
-rw-r--r--  kernel/mutex.c | 34
-rw-r--r--  kernel/mutex.h | 2
-rw-r--r--  kernel/ns_cgroup.c | 118
-rw-r--r--  kernel/nsproxy.c | 46
-rw-r--r--  kernel/params.c | 23
-rw-r--r--  kernel/pm_qos_params.c | 72
-rw-r--r--  kernel/posix-cpu-timers.c | 4
-rw-r--r--  kernel/posix-timers.c | 27
-rw-r--r--  kernel/power/Kconfig | 10
-rw-r--r--  kernel/power/hibernate.c | 278
-rw-r--r--  kernel/power/main.c | 1
-rw-r--r--  kernel/power/power.h | 4
-rw-r--r--  kernel/power/snapshot.c | 39
-rw-r--r--  kernel/power/suspend.c | 10
-rw-r--r--  kernel/power/user.c | 4
-rw-r--r--  kernel/printk.c | 87
-rw-r--r--  kernel/profile.c | 22
-rw-r--r--  kernel/ptrace.c | 120
-rw-r--r--  kernel/rcupdate.c | 32
-rw-r--r--  kernel/rcutiny.c | 46
-rw-r--r--  kernel/rcutiny_plugin.h | 203
-rw-r--r--  kernel/rcutorture.c | 26
-rw-r--r--  kernel/rcutree.c | 353
-rw-r--r--  kernel/rcutree.h | 128
-rw-r--r--  kernel/rcutree_plugin.h | 973
-rw-r--r--  kernel/rcutree_trace.c | 208
-rw-r--r--  kernel/resource.c | 116
-rw-r--r--  kernel/sched.c | 1796
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 183
-rw-r--r--  kernel/sched_features.h | 6
-rw-r--r--  kernel/sched_idletask.c | 2
-rw-r--r--  kernel/sched_rt.c | 99
-rw-r--r--  kernel/sched_stats.h | 4
-rw-r--r--  kernel/sched_stoptask.c | 5
-rw-r--r--  kernel/signal.c | 686
-rw-r--r--  kernel/smp.c | 5
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/sys.c | 5
-rw-r--r--  kernel/sys_ni.c | 9
-rw-r--r--  kernel/sysctl.c | 26
-rw-r--r--  kernel/taskstats.c | 15
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 720
-rw-r--r--  kernel/time/clockevents.c | 69
-rw-r--r--  kernel/time/clocksource.c | 62
-rw-r--r--  kernel/time/tick-broadcast.c | 16
-rw-r--r--  kernel/time/timekeeping.c | 73
-rw-r--r--  kernel/timer.c | 15
-rw-r--r--  kernel/trace/ftrace.c | 1279
-rw-r--r--  kernel/trace/ring_buffer.c | 10
-rw-r--r--  kernel/trace/trace.c | 15
-rw-r--r--  kernel/trace/trace.h | 17
-rw-r--r--  kernel/trace/trace_events.c | 7
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_irqsoff.c | 1
-rw-r--r--  kernel/trace/trace_kprobe.c | 9
-rw-r--r--  kernel/trace/trace_output.c | 30
-rw-r--r--  kernel/trace/trace_printk.c | 117
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 1
-rw-r--r--  kernel/trace/trace_selftest.c | 214
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 6
-rw-r--r--  kernel/trace/trace_stack.c | 1
-rw-r--r--  kernel/tracepoint.c | 23
-rw-r--r--  kernel/utsname.c | 39
-rw-r--r--  kernel/watchdog.c | 61
-rw-r--r--  kernel/workqueue.c | 4
106 files changed, 7918 insertions, 3660 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE 199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
200 200
201config MUTEX_SPIN_ON_OWNER 201config MUTEX_SPIN_ON_OWNER
202 def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES 202 def_bool SMP && !DEBUG_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb31e73e..2d64cfcc8b42 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_perf_event.o = -pg
25CFLAGS_REMOVE_irq_work.o = -pg 24CFLAGS_REMOVE_irq_work.o = -pg
26endif 25endif
27 26
@@ -62,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
62obj-$(CONFIG_CGROUPS) += cgroup.o 61obj-$(CONFIG_CGROUPS) += cgroup.o
63obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
64obj-$(CONFIG_CPUSETS) += cpuset.o 63obj-$(CONFIG_CPUSETS) += cpuset.o
65obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
66obj-$(CONFIG_UTS_NS) += utsname.o 64obj-$(CONFIG_UTS_NS) += utsname.o
67obj-$(CONFIG_USER_NS) += user_namespace.o 65obj-$(CONFIG_USER_NS) += user_namespace.o
68obj-$(CONFIG_PID_NS) += pid_namespace.o 66obj-$(CONFIG_PID_NS) += pid_namespace.o
@@ -103,8 +101,9 @@ obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
104obj-$(CONFIG_SMP) += sched_cpupri.o 102obj-$(CONFIG_SMP) += sched_cpupri.o
105obj-$(CONFIG_IRQ_WORK) += irq_work.o 103obj-$(CONFIG_IRQ_WORK) += irq_work.o
106obj-$(CONFIG_PERF_EVENTS) += perf_event.o 104
107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 105obj-$(CONFIG_PERF_EVENTS) += events/
106
108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
109obj-$(CONFIG_PADATA) += padata.o 108obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b33513a08beb..00d79df03e76 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
443 443
444/* Determine if any context name data matches a rule's watch data */ 444/* Determine if any context name data matches a rule's watch data */
445/* Compare a task_struct with an audit_rule. Return 1 on match, 0 445/* Compare a task_struct with an audit_rule. Return 1 on match, 0
446 * otherwise. */ 446 * otherwise.
447 *
448 * If task_creation is true, this is an explicit indication that we are
449 * filtering a task rule at task creation time. This and tsk == current are
450 * the only situations where tsk->cred may be accessed without an rcu read lock.
451 */
447static int audit_filter_rules(struct task_struct *tsk, 452static int audit_filter_rules(struct task_struct *tsk,
448 struct audit_krule *rule, 453 struct audit_krule *rule,
449 struct audit_context *ctx, 454 struct audit_context *ctx,
450 struct audit_names *name, 455 struct audit_names *name,
451 enum audit_state *state) 456 enum audit_state *state,
457 bool task_creation)
452{ 458{
453 const struct cred *cred = get_task_cred(tsk); 459 const struct cred *cred;
454 int i, j, need_sid = 1; 460 int i, j, need_sid = 1;
455 u32 sid; 461 u32 sid;
456 462
463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
464
457 for (i = 0; i < rule->field_count; i++) { 465 for (i = 0; i < rule->field_count; i++) {
458 struct audit_field *f = &rule->fields[i]; 466 struct audit_field *f = &rule->fields[i];
459 int result = 0; 467 int result = 0;
@@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,
637 break; 645 break;
638 } 646 }
639 647
640 if (!result) { 648 if (!result)
641 put_cred(cred);
642 return 0; 649 return 0;
643 }
644 } 650 }
645 651
646 if (ctx) { 652 if (ctx) {
@@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,
656 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 662 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
657 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 663 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
658 } 664 }
659 put_cred(cred);
660 return 1; 665 return 1;
661} 666}
662 667
@@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
671 676
672 rcu_read_lock(); 677 rcu_read_lock();
673 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 678 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
674 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { 679 if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
680 &state, true)) {
675 if (state == AUDIT_RECORD_CONTEXT) 681 if (state == AUDIT_RECORD_CONTEXT)
676 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); 682 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
677 rcu_read_unlock(); 683 rcu_read_unlock();
@@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
705 list_for_each_entry_rcu(e, list, list) { 711 list_for_each_entry_rcu(e, list, list) {
706 if ((e->rule.mask[word] & bit) == bit && 712 if ((e->rule.mask[word] & bit) == bit &&
707 audit_filter_rules(tsk, &e->rule, ctx, NULL, 713 audit_filter_rules(tsk, &e->rule, ctx, NULL,
708 &state)) { 714 &state, false)) {
709 rcu_read_unlock(); 715 rcu_read_unlock();
710 ctx->current_state = state; 716 ctx->current_state = state;
711 return state; 717 return state;
@@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
743 749
744 list_for_each_entry_rcu(e, list, list) { 750 list_for_each_entry_rcu(e, list, list) {
745 if ((e->rule.mask[word] & bit) == bit && 751 if ((e->rule.mask[word] & bit) == bit &&
746 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { 752 audit_filter_rules(tsk, &e->rule, ctx, n,
753 &state, false)) {
747 rcu_read_unlock(); 754 rcu_read_unlock();
748 ctx->current_state = state; 755 ctx->current_state = state;
749 return; 756 return;
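
Note: the audit_filter_rules() change above replaces get_task_cred()/put_cred() with a lockless read of tsk->cred. A minimal sketch of that dereference pattern, assuming kernel context; the surrounding function and the field comparison are illustrative only, the lockdep condition mirrors the one in the hunk.

#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/rcupdate.h>

/*
 * Sketch: read an RCU-protected credential pointer without taking a
 * reference.  The second argument to rcu_dereference_check() names the
 * cases the caller asserts are safe outside rcu_read_lock(): the task is
 * current, or it is still being created (task_creation) and so cannot
 * change its credentials concurrently.
 */
static int example_rule_matches(struct task_struct *tsk, bool task_creation)
{
	const struct cred *cred;

	cred = rcu_dereference_check(tsk->cred,
				     tsk == current || task_creation);

	/* compare fields as audit_filter_rules() does; no put_cred() needed */
	return cred->uid == 0;
}
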
diff --git a/kernel/capability.c b/kernel/capability.c
index 32a80e08ff4b..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -22,12 +22,8 @@
22 */ 22 */
23 23
24const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; 24const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
25const kernel_cap_t __cap_full_set = CAP_FULL_SET;
26const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
27 25
28EXPORT_SYMBOL(__cap_empty_set); 26EXPORT_SYMBOL(__cap_empty_set);
29EXPORT_SYMBOL(__cap_full_set);
30EXPORT_SYMBOL(__cap_init_eff_set);
31 27
32int file_caps_enabled = 1; 28int file_caps_enabled = 1;
33 29
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 25c7eb52de1a..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58#include <linux/eventfd.h> 58#include <linux/eventfd.h>
59#include <linux/poll.h> 59#include <linux/poll.h>
60#include <linux/flex_array.h> /* used in cgroup_attach_proc */
60 61
61#include <asm/atomic.h> 62#include <asm/atomic.h>
62 63
@@ -326,12 +327,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
326 return &css_set_table[index]; 327 return &css_set_table[index];
327} 328}
328 329
329static void free_css_set_rcu(struct rcu_head *obj)
330{
331 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
332 kfree(cg);
333}
334
335/* We don't maintain the lists running through each css_set to its 330/* We don't maintain the lists running through each css_set to its
336 * task until after the first call to cgroup_iter_start(). This 331 * task until after the first call to cgroup_iter_start(). This
337 * reduces the fork()/exit() overhead for people who have cgroups 332 * reduces the fork()/exit() overhead for people who have cgroups
@@ -375,7 +370,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
375 } 370 }
376 371
377 write_unlock(&css_set_lock); 372 write_unlock(&css_set_lock);
378 call_rcu(&cg->rcu_head, free_css_set_rcu); 373 kfree_rcu(cg, rcu_head);
379} 374}
380 375
381/* 376/*
@@ -812,13 +807,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
812 return ret; 807 return ret;
813} 808}
814 809
815static void free_cgroup_rcu(struct rcu_head *obj)
816{
817 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
818
819 kfree(cgrp);
820}
821
822static void cgroup_diput(struct dentry *dentry, struct inode *inode) 810static void cgroup_diput(struct dentry *dentry, struct inode *inode)
823{ 811{
824 /* is dentry a directory ? if so, kfree() associated cgroup */ 812 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -856,7 +844,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
856 */ 844 */
857 BUG_ON(!list_empty(&cgrp->pidlists)); 845 BUG_ON(!list_empty(&cgrp->pidlists));
858 846
859 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 847 kfree_rcu(cgrp, rcu_head);
860 } 848 }
861 iput(inode); 849 iput(inode);
862} 850}
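
Note: the two hunks above (and several later ones in this series) are instances of the same mechanical conversion: when an RCU callback does nothing but kfree() the enclosing object, call_rcu() plus the hand-written callback collapses into kfree_rcu(), which takes the pointer and the name of the struct's rcu_head member. A before/after sketch with a hypothetical struct:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu_head;
};

/* Before: a callback whose only job is to free the object. */
static void foo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu_head));
}

static void foo_release_old(struct foo *f)
{
	call_rcu(&f->rcu_head, foo_free_rcu);
}

/* After: equivalent, with the callback generated by the macro. */
static void foo_release_new(struct foo *f)
{
	kfree_rcu(f, rcu_head);
}
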
@@ -1748,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1748} 1736}
1749EXPORT_SYMBOL_GPL(cgroup_path); 1737EXPORT_SYMBOL_GPL(cgroup_path);
1750 1738
1739/*
1740 * cgroup_task_migrate - move a task from one cgroup to another.
1741 *
1742 * 'guarantee' is set if the caller promises that a new css_set for the task
1743 * will already exist. If not set, this function might sleep, and can fail with
1744 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
1745 */
1746static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1747 struct task_struct *tsk, bool guarantee)
1748{
1749 struct css_set *oldcg;
1750 struct css_set *newcg;
1751
1752 /*
1753 * get old css_set. we need to take task_lock and refcount it, because
1754 * an exiting task can change its css_set to init_css_set and drop its
1755 * old one without taking cgroup_mutex.
1756 */
1757 task_lock(tsk);
1758 oldcg = tsk->cgroups;
1759 get_css_set(oldcg);
1760 task_unlock(tsk);
1761
1762 /* locate or allocate a new css_set for this task. */
1763 if (guarantee) {
1764 /* we know the css_set we want already exists. */
1765 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1766 read_lock(&css_set_lock);
1767 newcg = find_existing_css_set(oldcg, cgrp, template);
1768 BUG_ON(!newcg);
1769 get_css_set(newcg);
1770 read_unlock(&css_set_lock);
1771 } else {
1772 might_sleep();
1773 /* find_css_set will give us newcg already referenced. */
1774 newcg = find_css_set(oldcg, cgrp);
1775 if (!newcg) {
1776 put_css_set(oldcg);
1777 return -ENOMEM;
1778 }
1779 }
1780 put_css_set(oldcg);
1781
1782 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1783 task_lock(tsk);
1784 if (tsk->flags & PF_EXITING) {
1785 task_unlock(tsk);
1786 put_css_set(newcg);
1787 return -ESRCH;
1788 }
1789 rcu_assign_pointer(tsk->cgroups, newcg);
1790 task_unlock(tsk);
1791
1792 /* Update the css_set linked lists if we're using them */
1793 write_lock(&css_set_lock);
1794 if (!list_empty(&tsk->cg_list))
1795 list_move(&tsk->cg_list, &newcg->tasks);
1796 write_unlock(&css_set_lock);
1797
1798 /*
1799 * We just gained a reference on oldcg by taking it from the task. As
1800 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1801 * it here; it will be freed under RCU.
1802 */
1803 put_css_set(oldcg);
1804
1805 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1806 return 0;
1807}
1808
1751/** 1809/**
1752 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1810 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1753 * @cgrp: the cgroup the task is attaching to 1811 * @cgrp: the cgroup the task is attaching to
@@ -1758,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1758 */ 1816 */
1759int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1817int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1760{ 1818{
1761 int retval = 0; 1819 int retval;
1762 struct cgroup_subsys *ss, *failed_ss = NULL; 1820 struct cgroup_subsys *ss, *failed_ss = NULL;
1763 struct cgroup *oldcgrp; 1821 struct cgroup *oldcgrp;
1764 struct css_set *cg;
1765 struct css_set *newcg;
1766 struct cgroupfs_root *root = cgrp->root; 1822 struct cgroupfs_root *root = cgrp->root;
1767 1823
1768 /* Nothing to do if the task is already in that cgroup */ 1824 /* Nothing to do if the task is already in that cgroup */
@@ -1772,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1772 1828
1773 for_each_subsys(root, ss) { 1829 for_each_subsys(root, ss) {
1774 if (ss->can_attach) { 1830 if (ss->can_attach) {
1775 retval = ss->can_attach(ss, cgrp, tsk, false); 1831 retval = ss->can_attach(ss, cgrp, tsk);
1776 if (retval) { 1832 if (retval) {
1777 /* 1833 /*
1778 * Remember on which subsystem the can_attach() 1834 * Remember on which subsystem the can_attach()
@@ -1784,46 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1784 goto out; 1840 goto out;
1785 } 1841 }
1786 } 1842 }
1843 if (ss->can_attach_task) {
1844 retval = ss->can_attach_task(cgrp, tsk);
1845 if (retval) {
1846 failed_ss = ss;
1847 goto out;
1848 }
1849 }
1787 } 1850 }
1788 1851
1789 task_lock(tsk); 1852 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1790 cg = tsk->cgroups; 1853 if (retval)
1791 get_css_set(cg);
1792 task_unlock(tsk);
1793 /*
1794 * Locate or allocate a new css_set for this task,
1795 * based on its final set of cgroups
1796 */
1797 newcg = find_css_set(cg, cgrp);
1798 put_css_set(cg);
1799 if (!newcg) {
1800 retval = -ENOMEM;
1801 goto out; 1854 goto out;
1802 }
1803
1804 task_lock(tsk);
1805 if (tsk->flags & PF_EXITING) {
1806 task_unlock(tsk);
1807 put_css_set(newcg);
1808 retval = -ESRCH;
1809 goto out;
1810 }
1811 rcu_assign_pointer(tsk->cgroups, newcg);
1812 task_unlock(tsk);
1813
1814 /* Update the css_set linked lists if we're using them */
1815 write_lock(&css_set_lock);
1816 if (!list_empty(&tsk->cg_list))
1817 list_move(&tsk->cg_list, &newcg->tasks);
1818 write_unlock(&css_set_lock);
1819 1855
1820 for_each_subsys(root, ss) { 1856 for_each_subsys(root, ss) {
1857 if (ss->pre_attach)
1858 ss->pre_attach(cgrp);
1859 if (ss->attach_task)
1860 ss->attach_task(cgrp, tsk);
1821 if (ss->attach) 1861 if (ss->attach)
1822 ss->attach(ss, cgrp, oldcgrp, tsk, false); 1862 ss->attach(ss, cgrp, oldcgrp, tsk);
1823 } 1863 }
1824 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1864
1825 synchronize_rcu(); 1865 synchronize_rcu();
1826 put_css_set(cg);
1827 1866
1828 /* 1867 /*
1829 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1868 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1842,7 +1881,7 @@ out:
1842 */ 1881 */
1843 break; 1882 break;
1844 if (ss->cancel_attach) 1883 if (ss->cancel_attach)
1845 ss->cancel_attach(ss, cgrp, tsk, false); 1884 ss->cancel_attach(ss, cgrp, tsk);
1846 } 1885 }
1847 } 1886 }
1848 return retval; 1887 return retval;
@@ -1873,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1873EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1912EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1874 1913
1875/* 1914/*
1876 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1915 * cgroup_attach_proc works in two stages, the first of which prefetches all
1877 * held. May take task_lock of task 1916 * new css_sets needed (to make sure we have enough memory before committing
1917 * to the move) and stores them in a list of entries of the following type.
1918 * TODO: possible optimization: use css_set->rcu_head for chaining instead
1919 */
1920struct cg_list_entry {
1921 struct css_set *cg;
1922 struct list_head links;
1923};
1924
1925static bool css_set_check_fetched(struct cgroup *cgrp,
1926 struct task_struct *tsk, struct css_set *cg,
1927 struct list_head *newcg_list)
1928{
1929 struct css_set *newcg;
1930 struct cg_list_entry *cg_entry;
1931 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1932
1933 read_lock(&css_set_lock);
1934 newcg = find_existing_css_set(cg, cgrp, template);
1935 if (newcg)
1936 get_css_set(newcg);
1937 read_unlock(&css_set_lock);
1938
1939 /* doesn't exist at all? */
1940 if (!newcg)
1941 return false;
1942 /* see if it's already in the list */
1943 list_for_each_entry(cg_entry, newcg_list, links) {
1944 if (cg_entry->cg == newcg) {
1945 put_css_set(newcg);
1946 return true;
1947 }
1948 }
1949
1950 /* not found */
1951 put_css_set(newcg);
1952 return false;
1953}
1954
1955/*
1956 * Find the new css_set and store it in the list in preparation for moving the
1957 * given task to the given cgroup. Returns 0 or -ENOMEM.
1958 */
1959static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1960 struct list_head *newcg_list)
1961{
1962 struct css_set *newcg;
1963 struct cg_list_entry *cg_entry;
1964
1965 /* ensure a new css_set will exist for this thread */
1966 newcg = find_css_set(cg, cgrp);
1967 if (!newcg)
1968 return -ENOMEM;
1969 /* add it to the list */
1970 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
1971 if (!cg_entry) {
1972 put_css_set(newcg);
1973 return -ENOMEM;
1974 }
1975 cg_entry->cg = newcg;
1976 list_add(&cg_entry->links, newcg_list);
1977 return 0;
1978}
1979
1980/**
1981 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
1982 * @cgrp: the cgroup to attach to
1983 * @leader: the threadgroup leader task_struct of the group to be attached
1984 *
1985 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
1986 * take task_lock of each thread in leader's threadgroup individually in turn.
1987 */
1988int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1989{
1990 int retval, i, group_size;
1991 struct cgroup_subsys *ss, *failed_ss = NULL;
1992 bool cancel_failed_ss = false;
1993 /* guaranteed to be initialized later, but the compiler needs this */
1994 struct cgroup *oldcgrp = NULL;
1995 struct css_set *oldcg;
1996 struct cgroupfs_root *root = cgrp->root;
1997 /* threadgroup list cursor and array */
1998 struct task_struct *tsk;
1999 struct flex_array *group;
2000 /*
2001 * we need to make sure we have css_sets for all the tasks we're
2002 * going to move -before- we actually start moving them, so that in
2003 * case we get an ENOMEM we can bail out before making any changes.
2004 */
2005 struct list_head newcg_list;
2006 struct cg_list_entry *cg_entry, *temp_nobe;
2007
2008 /*
2009 * step 0: in order to do expensive, possibly blocking operations for
2010 * every thread, we cannot iterate the thread group list, since it needs
2011 * rcu or tasklist locked. instead, build an array of all threads in the
2012 * group - threadgroup_fork_lock prevents new threads from appearing,
2013 * and if threads exit, this will just be an over-estimate.
2014 */
2015 group_size = get_nr_threads(leader);
2016 /* flex_array supports very large thread-groups better than kmalloc. */
2017 group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2018 GFP_KERNEL);
2019 if (!group)
2020 return -ENOMEM;
2021 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2022 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
2023 if (retval)
2024 goto out_free_group_list;
2025
2026 /* prevent changes to the threadgroup list while we take a snapshot. */
2027 rcu_read_lock();
2028 if (!thread_group_leader(leader)) {
2029 /*
2030 * a race with de_thread from another thread's exec() may strip
2031 * us of our leadership, making while_each_thread unsafe to use
2032 * on this task. if this happens, there is no choice but to
2033 * throw this task away and try again (from cgroup_procs_write);
2034 * this is "double-double-toil-and-trouble-check locking".
2035 */
2036 rcu_read_unlock();
2037 retval = -EAGAIN;
2038 goto out_free_group_list;
2039 }
2040 /* take a reference on each task in the group to go in the array. */
2041 tsk = leader;
2042 i = 0;
2043 do {
2044 /* as per above, nr_threads may decrease, but not increase. */
2045 BUG_ON(i >= group_size);
2046 get_task_struct(tsk);
2047 /*
2048 * saying GFP_ATOMIC has no effect here because we did prealloc
2049 * earlier, but it's good form to communicate our expectations.
2050 */
2051 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2052 BUG_ON(retval != 0);
2053 i++;
2054 } while_each_thread(leader, tsk);
2055 /* remember the number of threads in the array for later. */
2056 group_size = i;
2057 rcu_read_unlock();
2058
2059 /*
2060 * step 1: check that we can legitimately attach to the cgroup.
2061 */
2062 for_each_subsys(root, ss) {
2063 if (ss->can_attach) {
2064 retval = ss->can_attach(ss, cgrp, leader);
2065 if (retval) {
2066 failed_ss = ss;
2067 goto out_cancel_attach;
2068 }
2069 }
2070 /* a callback to be run on every thread in the threadgroup. */
2071 if (ss->can_attach_task) {
2072 /* run on each task in the threadgroup. */
2073 for (i = 0; i < group_size; i++) {
2074 tsk = flex_array_get_ptr(group, i);
2075 retval = ss->can_attach_task(cgrp, tsk);
2076 if (retval) {
2077 failed_ss = ss;
2078 cancel_failed_ss = true;
2079 goto out_cancel_attach;
2080 }
2081 }
2082 }
2083 }
2084
2085 /*
2086 * step 2: make sure css_sets exist for all threads to be migrated.
2087 * we use find_css_set, which allocates a new one if necessary.
2088 */
2089 INIT_LIST_HEAD(&newcg_list);
2090 for (i = 0; i < group_size; i++) {
2091 tsk = flex_array_get_ptr(group, i);
2092 /* nothing to do if this task is already in the cgroup */
2093 oldcgrp = task_cgroup_from_root(tsk, root);
2094 if (cgrp == oldcgrp)
2095 continue;
2096 /* get old css_set pointer */
2097 task_lock(tsk);
2098 if (tsk->flags & PF_EXITING) {
2099 /* ignore this task if it's going away */
2100 task_unlock(tsk);
2101 continue;
2102 }
2103 oldcg = tsk->cgroups;
2104 get_css_set(oldcg);
2105 task_unlock(tsk);
2106 /* see if the new one for us is already in the list? */
2107 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2108 /* was already there, nothing to do. */
2109 put_css_set(oldcg);
2110 } else {
2111 /* we don't already have it. get new one. */
2112 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2113 put_css_set(oldcg);
2114 if (retval)
2115 goto out_list_teardown;
2116 }
2117 }
2118
2119 /*
2120 * step 3: now that we're guaranteed success wrt the css_sets, proceed
2121 * to move all tasks to the new cgroup, calling ss->attach_task for each
2122 * one along the way. there are no failure cases after here, so this is
2123 * the commit point.
2124 */
2125 for_each_subsys(root, ss) {
2126 if (ss->pre_attach)
2127 ss->pre_attach(cgrp);
2128 }
2129 for (i = 0; i < group_size; i++) {
2130 tsk = flex_array_get_ptr(group, i);
2131 /* leave current thread as it is if it's already there */
2132 oldcgrp = task_cgroup_from_root(tsk, root);
2133 if (cgrp == oldcgrp)
2134 continue;
2135 /* attach each task to each subsystem */
2136 for_each_subsys(root, ss) {
2137 if (ss->attach_task)
2138 ss->attach_task(cgrp, tsk);
2139 }
2140 /* if the thread is PF_EXITING, it can just get skipped. */
2141 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2142 BUG_ON(retval != 0 && retval != -ESRCH);
2143 }
2144 /* nothing is sensitive to fork() after this point. */
2145
2146 /*
2147 * step 4: do expensive, non-thread-specific subsystem callbacks.
2148 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2149 * being moved, this call will need to be reworked to communicate that.
2150 */
2151 for_each_subsys(root, ss) {
2152 if (ss->attach)
2153 ss->attach(ss, cgrp, oldcgrp, leader);
2154 }
2155
2156 /*
2157 * step 5: success! and cleanup
2158 */
2159 synchronize_rcu();
2160 cgroup_wakeup_rmdir_waiter(cgrp);
2161 retval = 0;
2162out_list_teardown:
2163 /* clean up the list of prefetched css_sets. */
2164 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2165 list_del(&cg_entry->links);
2166 put_css_set(cg_entry->cg);
2167 kfree(cg_entry);
2168 }
2169out_cancel_attach:
2170 /* same deal as in cgroup_attach_task */
2171 if (retval) {
2172 for_each_subsys(root, ss) {
2173 if (ss == failed_ss) {
2174 if (cancel_failed_ss && ss->cancel_attach)
2175 ss->cancel_attach(ss, cgrp, leader);
2176 break;
2177 }
2178 if (ss->cancel_attach)
2179 ss->cancel_attach(ss, cgrp, leader);
2180 }
2181 }
2182 /* clean up the array of referenced threads in the group. */
2183 for (i = 0; i < group_size; i++) {
2184 tsk = flex_array_get_ptr(group, i);
2185 put_task_struct(tsk);
2186 }
2187out_free_group_list:
2188 flex_array_free(group);
2189 return retval;
2190}
2191
2192/*
2193 * Find the task_struct of the task to attach by vpid and pass it along to the
2194 * function to attach either it or all tasks in its threadgroup. Will take
2195 * cgroup_mutex; may take task_lock of task.
1878 */ 2196 */
1879static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) 2197static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
1880{ 2198{
1881 struct task_struct *tsk; 2199 struct task_struct *tsk;
1882 const struct cred *cred = current_cred(), *tcred; 2200 const struct cred *cred = current_cred(), *tcred;
1883 int ret; 2201 int ret;
1884 2202
2203 if (!cgroup_lock_live_group(cgrp))
2204 return -ENODEV;
2205
1885 if (pid) { 2206 if (pid) {
1886 rcu_read_lock(); 2207 rcu_read_lock();
1887 tsk = find_task_by_vpid(pid); 2208 tsk = find_task_by_vpid(pid);
1888 if (!tsk || tsk->flags & PF_EXITING) { 2209 if (!tsk) {
1889 rcu_read_unlock(); 2210 rcu_read_unlock();
2211 cgroup_unlock();
2212 return -ESRCH;
2213 }
2214 if (threadgroup) {
2215 /*
2216 * RCU protects this access, since tsk was found in the
2217 * tid map. a race with de_thread may cause group_leader
2218 * to stop being the leader, but cgroup_attach_proc will
2219 * detect it later.
2220 */
2221 tsk = tsk->group_leader;
2222 } else if (tsk->flags & PF_EXITING) {
2223 /* optimization for the single-task-only case */
2224 rcu_read_unlock();
2225 cgroup_unlock();
1890 return -ESRCH; 2226 return -ESRCH;
1891 } 2227 }
1892 2228
2229 /*
2230 * even if we're attaching all tasks in the thread group, we
2231 * only need to check permissions on one of them.
2232 */
1893 tcred = __task_cred(tsk); 2233 tcred = __task_cred(tsk);
1894 if (cred->euid && 2234 if (cred->euid &&
1895 cred->euid != tcred->uid && 2235 cred->euid != tcred->uid &&
1896 cred->euid != tcred->suid) { 2236 cred->euid != tcred->suid) {
1897 rcu_read_unlock(); 2237 rcu_read_unlock();
2238 cgroup_unlock();
1898 return -EACCES; 2239 return -EACCES;
1899 } 2240 }
1900 get_task_struct(tsk); 2241 get_task_struct(tsk);
1901 rcu_read_unlock(); 2242 rcu_read_unlock();
1902 } else { 2243 } else {
1903 tsk = current; 2244 if (threadgroup)
2245 tsk = current->group_leader;
2246 else
2247 tsk = current;
1904 get_task_struct(tsk); 2248 get_task_struct(tsk);
1905 } 2249 }
1906 2250
1907 ret = cgroup_attach_task(cgrp, tsk); 2251 if (threadgroup) {
2252 threadgroup_fork_write_lock(tsk);
2253 ret = cgroup_attach_proc(cgrp, tsk);
2254 threadgroup_fork_write_unlock(tsk);
2255 } else {
2256 ret = cgroup_attach_task(cgrp, tsk);
2257 }
1908 put_task_struct(tsk); 2258 put_task_struct(tsk);
2259 cgroup_unlock();
1909 return ret; 2260 return ret;
1910} 2261}
1911 2262
1912static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2263static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1913{ 2264{
2265 return attach_task_by_pid(cgrp, pid, false);
2266}
2267
2268static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2269{
1914 int ret; 2270 int ret;
1915 if (!cgroup_lock_live_group(cgrp)) 2271 do {
1916 return -ENODEV; 2272 /*
1917 ret = attach_task_by_pid(cgrp, pid); 2273 * attach_proc fails with -EAGAIN if threadgroup leadership
1918 cgroup_unlock(); 2274 * changes in the middle of the operation, in which case we need
2275 * to find the task_struct for the new leader and start over.
2276 */
2277 ret = attach_task_by_pid(cgrp, tgid, true);
2278 } while (ret == -EAGAIN);
1919 return ret; 2279 return ret;
1920} 2280}
1921 2281
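
Note: cgroup_procs_write() above retries on -EAGAIN because the task found by tgid can lose thread-group leadership to de_thread() between the lookup and the attach; cgroup_attach_proc() detects that under rcu_read_lock() and bails out so the caller can redo the lookup against the new leader. The shape of that retry, with a hypothetical helper standing in for the lookup-plus-attach done by the real code:

#include <linux/cgroup.h>

/* Hypothetical helper; the real path uses find_task_by_vpid() and
 * cgroup_attach_proc() as shown in the hunks above. */
extern int lookup_and_attach_group(struct cgroup *cgrp, u64 tgid);

static int attach_group_with_retry(struct cgroup *cgrp, u64 tgid)
{
	int ret;

	do {
		/*
		 * Re-run the lookup every time: after a race with exec(),
		 * the previously found task may no longer be the leader.
		 */
		ret = lookup_and_attach_group(cgrp, tgid);
	} while (ret == -EAGAIN);

	return ret;
}
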
@@ -3272,9 +3632,9 @@ static struct cftype files[] = {
3272 { 3632 {
3273 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 3633 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
3274 .open = cgroup_procs_open, 3634 .open = cgroup_procs_open,
3275 /* .write_u64 = cgroup_procs_write, TODO */ 3635 .write_u64 = cgroup_procs_write,
3276 .release = cgroup_pidlist_release, 3636 .release = cgroup_pidlist_release,
3277 .mode = S_IRUGO, 3637 .mode = S_IRUGO | S_IWUSR,
3278 }, 3638 },
3279 { 3639 {
3280 .name = "notify_on_release", 3640 .name = "notify_on_release",
@@ -4270,122 +4630,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4270} 4630}
4271 4631
4272/** 4632/**
4273 * cgroup_clone - clone the cgroup the given subsystem is attached to
4274 * @tsk: the task to be moved
4275 * @subsys: the given subsystem
4276 * @nodename: the name for the new cgroup
4277 *
4278 * Duplicate the current cgroup in the hierarchy that the given
4279 * subsystem is attached to, and move this task into the new
4280 * child.
4281 */
4282int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
4283 char *nodename)
4284{
4285 struct dentry *dentry;
4286 int ret = 0;
4287 struct cgroup *parent, *child;
4288 struct inode *inode;
4289 struct css_set *cg;
4290 struct cgroupfs_root *root;
4291 struct cgroup_subsys *ss;
4292
4293 /* We shouldn't be called by an unregistered subsystem */
4294 BUG_ON(!subsys->active);
4295
4296 /* First figure out what hierarchy and cgroup we're dealing
4297 * with, and pin them so we can drop cgroup_mutex */
4298 mutex_lock(&cgroup_mutex);
4299 again:
4300 root = subsys->root;
4301 if (root == &rootnode) {
4302 mutex_unlock(&cgroup_mutex);
4303 return 0;
4304 }
4305
4306 /* Pin the hierarchy */
4307 if (!atomic_inc_not_zero(&root->sb->s_active)) {
4308 /* We race with the final deactivate_super() */
4309 mutex_unlock(&cgroup_mutex);
4310 return 0;
4311 }
4312
4313 /* Keep the cgroup alive */
4314 task_lock(tsk);
4315 parent = task_cgroup(tsk, subsys->subsys_id);
4316 cg = tsk->cgroups;
4317 get_css_set(cg);
4318 task_unlock(tsk);
4319
4320 mutex_unlock(&cgroup_mutex);
4321
4322 /* Now do the VFS work to create a cgroup */
4323 inode = parent->dentry->d_inode;
4324
4325 /* Hold the parent directory mutex across this operation to
4326 * stop anyone else deleting the new cgroup */
4327 mutex_lock(&inode->i_mutex);
4328 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
4329 if (IS_ERR(dentry)) {
4330 printk(KERN_INFO
4331 "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
4332 PTR_ERR(dentry));
4333 ret = PTR_ERR(dentry);
4334 goto out_release;
4335 }
4336
4337 /* Create the cgroup directory, which also creates the cgroup */
4338 ret = vfs_mkdir(inode, dentry, 0755);
4339 child = __d_cgrp(dentry);
4340 dput(dentry);
4341 if (ret) {
4342 printk(KERN_INFO
4343 "Failed to create cgroup %s: %d\n", nodename,
4344 ret);
4345 goto out_release;
4346 }
4347
4348 /* The cgroup now exists. Retake cgroup_mutex and check
4349 * that we're still in the same state that we thought we
4350 * were. */
4351 mutex_lock(&cgroup_mutex);
4352 if ((root != subsys->root) ||
4353 (parent != task_cgroup(tsk, subsys->subsys_id))) {
4354 /* Aargh, we raced ... */
4355 mutex_unlock(&inode->i_mutex);
4356 put_css_set(cg);
4357
4358 deactivate_super(root->sb);
4359 /* The cgroup is still accessible in the VFS, but
4360 * we're not going to try to rmdir() it at this
4361 * point. */
4362 printk(KERN_INFO
4363 "Race in cgroup_clone() - leaking cgroup %s\n",
4364 nodename);
4365 goto again;
4366 }
4367
4368 /* do any required auto-setup */
4369 for_each_subsys(root, ss) {
4370 if (ss->post_clone)
4371 ss->post_clone(ss, child);
4372 }
4373
4374 /* All seems fine. Finish by moving the task into the new cgroup */
4375 ret = cgroup_attach_task(child, tsk);
4376 mutex_unlock(&cgroup_mutex);
4377
4378 out_release:
4379 mutex_unlock(&inode->i_mutex);
4380
4381 mutex_lock(&cgroup_mutex);
4382 put_css_set(cg);
4383 mutex_unlock(&cgroup_mutex);
4384 deactivate_super(root->sb);
4385 return ret;
4386}
4387
4388/**
4389 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp 4633 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
4390 * @cgrp: the cgroup in question 4634 * @cgrp: the cgroup in question
4391 * @task: the task in question 4635 * @task: the task in question
@@ -4623,14 +4867,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
4623 return ret; 4867 return ret;
4624} 4868}
4625 4869
4626static void __free_css_id_cb(struct rcu_head *head)
4627{
4628 struct css_id *id;
4629
4630 id = container_of(head, struct css_id, rcu_head);
4631 kfree(id);
4632}
4633
4634void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4870void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4635{ 4871{
4636 struct css_id *id = css->id; 4872 struct css_id *id = css->id;
@@ -4645,7 +4881,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4645 spin_lock(&ss->id_lock); 4881 spin_lock(&ss->id_lock);
4646 idr_remove(&ss->idr, id->id); 4882 idr_remove(&ss->idr, id->id);
4647 spin_unlock(&ss->id_lock); 4883 spin_unlock(&ss->id_lock);
4648 call_rcu(&id->rcu_head, __free_css_id_cb); 4884 kfree_rcu(id, rcu_head);
4649} 4885}
4650EXPORT_SYMBOL_GPL(free_css_id); 4886EXPORT_SYMBOL_GPL(free_css_id);
4651 4887
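
Note: taken together, the cgroup.c changes split attachment into whole-operation and per-thread callbacks. A sketch of a subsystem wiring them up, listed in the order cgroup_attach_task()/cgroup_attach_proc() invoke them; the example subsystem is hypothetical and omits the unrelated cgroup_subsys fields (create, destroy, subsys_id, ...):

#include <linux/cgroup.h>

/* once per operation: admission check, given the (leader) task */
static int ex_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			 struct task_struct *tsk)
{
	return 0;
}

/* once per thread: per-task admission check */
static int ex_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	return 0;
}

/* once per operation, before any thread is moved */
static void ex_pre_attach(struct cgroup *cgrp)
{
}

/* once per thread, at commit time */
static void ex_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
}

/* once per operation, after all threads have moved */
static void ex_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
		      struct cgroup *oldcgrp, struct task_struct *tsk)
{
}

struct cgroup_subsys example_subsys = {
	.name		 = "example",
	.can_attach	 = ex_can_attach,
	.can_attach_task = ex_can_attach_task,
	.pre_attach	 = ex_pre_attach,
	.attach_task	 = ex_attach_task,
	.attach		 = ex_attach,
};
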
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7c6c38..e691818d7e45 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
160 */ 160 */
161static int freezer_can_attach(struct cgroup_subsys *ss, 161static int freezer_can_attach(struct cgroup_subsys *ss,
162 struct cgroup *new_cgroup, 162 struct cgroup *new_cgroup,
163 struct task_struct *task, bool threadgroup) 163 struct task_struct *task)
164{ 164{
165 struct freezer *freezer; 165 struct freezer *freezer;
166 166
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
172 if (freezer->state != CGROUP_THAWED) 172 if (freezer->state != CGROUP_THAWED)
173 return -EBUSY; 173 return -EBUSY;
174 174
175 return 0;
176}
177
178static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
179{
175 rcu_read_lock(); 180 rcu_read_lock();
176 if (__cgroup_freezing_or_frozen(task)) { 181 if (__cgroup_freezing_or_frozen(tsk)) {
177 rcu_read_unlock(); 182 rcu_read_unlock();
178 return -EBUSY; 183 return -EBUSY;
179 } 184 }
180 rcu_read_unlock(); 185 rcu_read_unlock();
181
182 if (threadgroup) {
183 struct task_struct *c;
184
185 rcu_read_lock();
186 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
187 if (__cgroup_freezing_or_frozen(c)) {
188 rcu_read_unlock();
189 return -EBUSY;
190 }
191 }
192 rcu_read_unlock();
193 }
194
195 return 0; 186 return 0;
196} 187}
197 188
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
390 .populate = freezer_populate, 381 .populate = freezer_populate,
391 .subsys_id = freezer_subsys_id, 382 .subsys_id = freezer_subsys_id,
392 .can_attach = freezer_can_attach, 383 .can_attach = freezer_can_attach,
384 .can_attach_task = freezer_can_attach_task,
385 .pre_attach = NULL,
386 .attach_task = NULL,
393 .attach = NULL, 387 .attach = NULL,
394 .fork = freezer_fork, 388 .fork = freezer_fork,
395 .exit = NULL, 389 .exit = NULL,
diff --git a/kernel/compat.c b/kernel/compat.c
index 38b1d2c1cbe8..fc9eb093acd5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -293,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
293 return compat_jiffies_to_clock_t(jiffies); 293 return compat_jiffies_to_clock_t(jiffies);
294} 294}
295 295
296#ifdef __ARCH_WANT_SYS_SIGPENDING
297
296/* 298/*
297 * Assumption: old_sigset_t and compat_old_sigset_t are both 299 * Assumption: old_sigset_t and compat_old_sigset_t are both
298 * types that can be passed to put_user()/get_user(). 300 * types that can be passed to put_user()/get_user().
@@ -312,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
312 return ret; 314 return ret;
313} 315}
314 316
317#endif
318
319#ifdef __ARCH_WANT_SYS_SIGPROCMASK
320
315asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, 321asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
316 compat_old_sigset_t __user *oset) 322 compat_old_sigset_t __user *oset)
317{ 323{
@@ -333,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
333 return ret; 339 return ret;
334} 340}
335 341
342#endif
343
336asmlinkage long compat_sys_setrlimit(unsigned int resource, 344asmlinkage long compat_sys_setrlimit(unsigned int resource,
337 struct compat_rlimit __user *rlim) 345 struct compat_rlimit __user *rlim)
338{ 346{
@@ -890,10 +898,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
890{ 898{
891 compat_sigset_t s32; 899 compat_sigset_t s32;
892 sigset_t s; 900 sigset_t s;
893 int sig;
894 struct timespec t; 901 struct timespec t;
895 siginfo_t info; 902 siginfo_t info;
896 long ret, timeout = 0; 903 long ret;
897 904
898 if (sigsetsize != sizeof(sigset_t)) 905 if (sigsetsize != sizeof(sigset_t))
899 return -EINVAL; 906 return -EINVAL;
@@ -901,51 +908,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
901 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) 908 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
902 return -EFAULT; 909 return -EFAULT;
903 sigset_from_compat(&s, &s32); 910 sigset_from_compat(&s, &s32);
904 sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
905 signotset(&s);
906 911
907 if (uts) { 912 if (uts) {
908 if (get_compat_timespec (&t, uts)) 913 if (get_compat_timespec(&t, uts))
909 return -EFAULT; 914 return -EFAULT;
910 if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
911 || t.tv_sec < 0)
912 return -EINVAL;
913 } 915 }
914 916
915 spin_lock_irq(&current->sighand->siglock); 917 ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
916 sig = dequeue_signal(current, &s, &info);
917 if (!sig) {
918 timeout = MAX_SCHEDULE_TIMEOUT;
919 if (uts)
920 timeout = timespec_to_jiffies(&t)
921 +(t.tv_sec || t.tv_nsec);
922 if (timeout) {
923 current->real_blocked = current->blocked;
924 sigandsets(&current->blocked, &current->blocked, &s);
925
926 recalc_sigpending();
927 spin_unlock_irq(&current->sighand->siglock);
928
929 timeout = schedule_timeout_interruptible(timeout);
930
931 spin_lock_irq(&current->sighand->siglock);
932 sig = dequeue_signal(current, &s, &info);
933 current->blocked = current->real_blocked;
934 siginitset(&current->real_blocked, 0);
935 recalc_sigpending();
936 }
937 }
938 spin_unlock_irq(&current->sighand->siglock);
939 918
940 if (sig) { 919 if (ret > 0 && uinfo) {
941 ret = sig; 920 if (copy_siginfo_to_user32(uinfo, &info))
942 if (uinfo) { 921 ret = -EFAULT;
943 if (copy_siginfo_to_user32(uinfo, &info))
944 ret = -EFAULT;
945 }
946 }else {
947 ret = timeout?-EINTR:-EAGAIN;
948 } 922 }
923
949 return ret; 924 return ret;
950 925
951} 926}
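
Note: after this change the compat path no longer open-codes the dequeue/block/timeout dance; both the native and compat syscalls call do_sigtimedwait(), and the wrapper is reduced to converting the sigset, the timespec and, on success, the siginfo. For readability, the resulting function reconstructed from the right-hand side of the hunk above (parameter types assumed from kernel/compat.c of this era, not shown in the hunk header):

asmlinkage long
compat_sys_rt_sigtimedwait(compat_sigset_t __user *uthese,
			   struct compat_siginfo __user *uinfo,
			   struct compat_timespec __user *uts,
			   compat_size_t sigsetsize)
{
	compat_sigset_t s32;
	sigset_t s;
	struct timespec t;
	siginfo_t info;
	long ret;

	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
		return -EFAULT;
	sigset_from_compat(&s, &s32);

	if (uts) {
		if (get_compat_timespec(&t, uts))
			return -EFAULT;
	}

	/* does the blocking, timeout and dequeue work for both ABIs */
	ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);

	if (ret > 0 && uinfo) {
		if (copy_siginfo_to_user32(uinfo, &info))
			ret = -EFAULT;
	}

	return ret;
}
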
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 33eee16addb8..9c9b7545c810 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
1159static int update_relax_domain_level(struct cpuset *cs, s64 val) 1159static int update_relax_domain_level(struct cpuset *cs, s64 val)
1160{ 1160{
1161#ifdef CONFIG_SMP 1161#ifdef CONFIG_SMP
1162 if (val < -1 || val >= SD_LV_MAX) 1162 if (val < -1 || val >= sched_domain_level_max)
1163 return -EINVAL; 1163 return -EINVAL;
1164#endif 1164#endif
1165 1165
@@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1367 return val; 1367 return val;
1368} 1368}
1369 1369
1370/* Protected by cgroup_lock */
1371static cpumask_var_t cpus_attach;
1372
1373/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1370/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1374static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1371static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1375 struct task_struct *tsk, bool threadgroup) 1372 struct task_struct *tsk)
1376{ 1373{
1377 int ret;
1378 struct cpuset *cs = cgroup_cs(cont); 1374 struct cpuset *cs = cgroup_cs(cont);
1379 1375
1380 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1391 if (tsk->flags & PF_THREAD_BOUND) 1387 if (tsk->flags & PF_THREAD_BOUND)
1392 return -EINVAL; 1388 return -EINVAL;
1393 1389
1394 ret = security_task_setscheduler(tsk);
1395 if (ret)
1396 return ret;
1397 if (threadgroup) {
1398 struct task_struct *c;
1399
1400 rcu_read_lock();
1401 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1402 ret = security_task_setscheduler(c);
1403 if (ret) {
1404 rcu_read_unlock();
1405 return ret;
1406 }
1407 }
1408 rcu_read_unlock();
1409 }
1410 return 0; 1390 return 0;
1411} 1391}
1412 1392
1413static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, 1393static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1414 struct cpuset *cs) 1394{
1395 return security_task_setscheduler(task);
1396}
1397
1398/*
1399 * Protected by cgroup_lock. The nodemasks must be stored globally because
1400 * dynamically allocating them is not allowed in pre_attach, and they must
1401 * persist among pre_attach, attach_task, and attach.
1402 */
1403static cpumask_var_t cpus_attach;
1404static nodemask_t cpuset_attach_nodemask_from;
1405static nodemask_t cpuset_attach_nodemask_to;
1406
1407/* Set-up work for before attaching each task. */
1408static void cpuset_pre_attach(struct cgroup *cont)
1409{
1410 struct cpuset *cs = cgroup_cs(cont);
1411
1412 if (cs == &top_cpuset)
1413 cpumask_copy(cpus_attach, cpu_possible_mask);
1414 else
1415 guarantee_online_cpus(cs, cpus_attach);
1416
1417 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1418}
1419
1420/* Per-thread attachment work. */
1421static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1415{ 1422{
1416 int err; 1423 int err;
1424 struct cpuset *cs = cgroup_cs(cont);
1425
1417 /* 1426 /*
1418 * can_attach beforehand should guarantee that this doesn't fail. 1427 * can_attach beforehand should guarantee that this doesn't fail.
1419 * TODO: have a better way to handle failure here 1428 * TODO: have a better way to handle failure here
@@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1421 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1430 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1422 WARN_ON_ONCE(err); 1431 WARN_ON_ONCE(err);
1423 1432
1424 cpuset_change_task_nodemask(tsk, to); 1433 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1425 cpuset_update_task_spread_flag(cs, tsk); 1434 cpuset_update_task_spread_flag(cs, tsk);
1426
1427} 1435}
1428 1436
1429static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1437static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1430 struct cgroup *oldcont, struct task_struct *tsk, 1438 struct cgroup *oldcont, struct task_struct *tsk)
1431 bool threadgroup)
1432{ 1439{
1433 struct mm_struct *mm; 1440 struct mm_struct *mm;
1434 struct cpuset *cs = cgroup_cs(cont); 1441 struct cpuset *cs = cgroup_cs(cont);
1435 struct cpuset *oldcs = cgroup_cs(oldcont); 1442 struct cpuset *oldcs = cgroup_cs(oldcont);
1436 static nodemask_t to; /* protected by cgroup_mutex */
1437 1443
1438 if (cs == &top_cpuset) { 1444 /*
1439 cpumask_copy(cpus_attach, cpu_possible_mask); 1445 * Change mm, possibly for multiple threads in a threadgroup. This is
1440 } else { 1446 * expensive and may sleep.
1441 guarantee_online_cpus(cs, cpus_attach); 1447 */
1442 } 1448 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1443 guarantee_online_mems(cs, &to); 1449 cpuset_attach_nodemask_to = cs->mems_allowed;
1444
1445 /* do per-task migration stuff possibly for each in the threadgroup */
1446 cpuset_attach_task(tsk, &to, cs);
1447 if (threadgroup) {
1448 struct task_struct *c;
1449 rcu_read_lock();
1450 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1451 cpuset_attach_task(c, &to, cs);
1452 }
1453 rcu_read_unlock();
1454 }
1455
1456 /* change mm; only needs to be done once even if threadgroup */
1457 to = cs->mems_allowed;
1458 mm = get_task_mm(tsk); 1450 mm = get_task_mm(tsk);
1459 if (mm) { 1451 if (mm) {
1460 mpol_rebind_mm(mm, &to); 1452 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1461 if (is_memory_migrate(cs)) 1453 if (is_memory_migrate(cs))
1462 cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); 1454 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
1455 &cpuset_attach_nodemask_to);
1463 mmput(mm); 1456 mmput(mm);
1464 } 1457 }
1465} 1458}
@@ -1809,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1809} 1802}
1810 1803
1811/* 1804/*
1812 * post_clone() is called at the end of cgroup_clone(). 1805 * post_clone() is called during cgroup_create() when the
1813 * 'cgroup' was just created automatically as a result of 1806 * clone_children mount argument was specified. The cgroup
1814 * a cgroup_clone(), and the current task is about to 1807 * can not yet have any tasks.
1815 * be moved into 'cgroup'.
1816 * 1808 *
1817 * Currently we refuse to set up the cgroup - thereby 1809 * Currently we refuse to set up the cgroup - thereby
1818 * refusing the task to be entered, and as a result refusing 1810 * refusing the task to be entered, and as a result refusing
@@ -1911,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
1911 .create = cpuset_create, 1903 .create = cpuset_create,
1912 .destroy = cpuset_destroy, 1904 .destroy = cpuset_destroy,
1913 .can_attach = cpuset_can_attach, 1905 .can_attach = cpuset_can_attach,
1906 .can_attach_task = cpuset_can_attach_task,
1907 .pre_attach = cpuset_pre_attach,
1908 .attach_task = cpuset_attach_task,
1914 .attach = cpuset_attach, 1909 .attach = cpuset_attach,
1915 .populate = cpuset_populate, 1910 .populate = cpuset_populate,
1916 .post_clone = cpuset_post_clone, 1911 .post_clone = cpuset_post_clone,
@@ -2195,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2195 rcu_read_lock(); 2190 rcu_read_lock();
2196 cs = task_cs(tsk); 2191 cs = task_cs(tsk);
2197 if (cs) 2192 if (cs)
2198 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); 2193 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2199 rcu_read_unlock(); 2194 rcu_read_unlock();
2200 2195
2201 /* 2196 /*
@@ -2222,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2222 * Like above we can temporary set any mask and rely on 2217 * Like above we can temporary set any mask and rely on
2223 * set_cpus_allowed_ptr() as synchronization point. 2218 * set_cpus_allowed_ptr() as synchronization point.
2224 */ 2219 */
2225 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); 2220 do_set_cpus_allowed(tsk, cpu_possible_mask);
2226 cpu = cpumask_any(cpu_active_mask); 2221 cpu = cpumask_any(cpu_active_mask);
2227 } 2222 }
2228 2223
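
Note: the cpuset conversion also shows why cpuset_attach_nodemask_from/to became file-scope statics: pre_attach() and attach_task() have no argument through which per-attach state can be passed and must not allocate, so state computed once per operation lives in globals serialized by cgroup_mutex. The same pattern for a hypothetical subsystem:

#include <linux/cgroup.h>
#include <linux/nodemask.h>

/*
 * All attach callbacks run under cgroup_mutex, so one set of globals per
 * subsystem is enough; this mirrors cpus_attach and the two nodemasks in
 * the hunks above.  The subsystem and its state are hypothetical.
 */
static nodemask_t example_mems_to;	/* protected by cgroup_mutex */

static void example_pre_attach(struct cgroup *cgrp)
{
	/* computed once, before any thread is moved */
	nodes_setall(example_mems_to);
}

static void example_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	/* applied to each thread being moved, using example_mems_to */
}
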
diff --git a/kernel/cred.c b/kernel/cred.c
index 8093c16b84b1..174fa84eca30 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
1/* Task credentials management - see Documentation/credentials.txt 1/* Task credentials management - see Documentation/security/credentials.txt
2 * 2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -49,10 +49,10 @@ struct cred init_cred = {
49 .magic = CRED_MAGIC, 49 .magic = CRED_MAGIC,
50#endif 50#endif
51 .securebits = SECUREBITS_DEFAULT, 51 .securebits = SECUREBITS_DEFAULT,
52 .cap_inheritable = CAP_INIT_INH_SET, 52 .cap_inheritable = CAP_EMPTY_SET,
53 .cap_permitted = CAP_FULL_SET, 53 .cap_permitted = CAP_FULL_SET,
54 .cap_effective = CAP_INIT_EFF_SET, 54 .cap_effective = CAP_FULL_SET,
55 .cap_bset = CAP_INIT_BSET, 55 .cap_bset = CAP_FULL_SET,
56 .user = INIT_USER, 56 .user = INIT_USER,
57 .user_ns = &init_user_ns, 57 .user_ns = &init_user_ns,
58 .group_info = &init_groups, 58 .group_info = &init_groups,
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000000..1ce23d3d8394
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,6 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg
3endif
4
5obj-y := core.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/perf_event.c b/kernel/events/core.c
index 8e81a9860a0d..9efe7108ccaf 100644
--- a/kernel/perf_event.c
+++ b/kernel/events/core.c
@@ -2,8 +2,8 @@
2 * Performance events core code: 2 * Performance events core code:
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
@@ -39,10 +39,10 @@
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41struct remote_function_call { 41struct remote_function_call {
42 struct task_struct *p; 42 struct task_struct *p;
43 int (*func)(void *info); 43 int (*func)(void *info);
44 void *info; 44 void *info;
45 int ret; 45 int ret;
46}; 46};
47 47
48static void remote_function(void *data) 48static void remote_function(void *data)
@@ -76,10 +76,10 @@ static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info) 76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{ 77{
78 struct remote_function_call data = { 78 struct remote_function_call data = {
79 .p = p, 79 .p = p,
80 .func = func, 80 .func = func,
81 .info = info, 81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */ 82 .ret = -ESRCH, /* No such (running) process */
83 }; 83 };
84 84
85 if (task_curr(p)) 85 if (task_curr(p))
@@ -100,10 +100,10 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info) 100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{ 101{
102 struct remote_function_call data = { 102 struct remote_function_call data = {
103 .p = NULL, 103 .p = NULL,
104 .func = func, 104 .func = func,
105 .info = info, 105 .info = info,
106 .ret = -ENXIO, /* No such CPU */ 106 .ret = -ENXIO, /* No such CPU */
107 }; 107 };
108 108
109 smp_call_function_single(cpu, remote_function, &data, 1); 109 smp_call_function_single(cpu, remote_function, &data, 1);
@@ -125,7 +125,7 @@ enum event_type_t {
125 * perf_sched_events : >0 events exist 125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */ 127 */
128atomic_t perf_sched_events __read_mostly; 128struct jump_label_key perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130 130
131static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
@@ -586,14 +586,6 @@ static void get_ctx(struct perf_event_context *ctx)
586 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 586 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
587} 587}
588 588
589static void free_ctx(struct rcu_head *head)
590{
591 struct perf_event_context *ctx;
592
593 ctx = container_of(head, struct perf_event_context, rcu_head);
594 kfree(ctx);
595}
596
597static void put_ctx(struct perf_event_context *ctx) 589static void put_ctx(struct perf_event_context *ctx)
598{ 590{
599 if (atomic_dec_and_test(&ctx->refcount)) { 591 if (atomic_dec_and_test(&ctx->refcount)) {
@@ -601,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx)
601 put_ctx(ctx->parent_ctx); 593 put_ctx(ctx->parent_ctx);
602 if (ctx->task) 594 if (ctx->task)
603 put_task_struct(ctx->task); 595 put_task_struct(ctx->task);
604 call_rcu(&ctx->rcu_head, free_ctx); 596 kfree_rcu(ctx, rcu_head);
605 } 597 }
606} 598}
607 599
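Both RCU-related hunks in this file are the same transformation: a call_rcu() callback whose only job was container_of() plus kfree() is replaced by kfree_rcu(), which takes the object and the name of its rcu_head member. A before/after sketch with a made-up structure (not code from this commit):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct demo_obj {
        int data;
        struct rcu_head rcu_head;
};

/* Old style: hand-written callback that only frees the object. */
static void demo_obj_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct demo_obj, rcu_head));
}

static void demo_release_old(struct demo_obj *obj)
{
        call_rcu(&obj->rcu_head, demo_obj_free_rcu);
}

/* New style: kfree_rcu() generates the equivalent callback. */
static void demo_release_new(struct demo_obj *obj)
{
        kfree_rcu(obj, rcu_head);
}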
@@ -5036,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
5036 else 5028 else
5037 perf_event_output(event, nmi, data, regs); 5029 perf_event_output(event, nmi, data, regs);
5038 5030
5031 if (event->fasync && event->pending_kill) {
5032 if (nmi) {
5033 event->pending_wakeup = 1;
5034 irq_work_queue(&event->pending);
5035 } else
5036 perf_event_wakeup(event);
5037 }
5038
5039 return ret; 5039 return ret;
5040} 5040}
5041 5041
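The added hunk delivers the pending SIGIO/poll wakeup for the overflow, but defers it through irq_work when the overflow fired in NMI context, where a direct wakeup (which takes locks) is not allowed. The general shape of that deferral, sketched with hypothetical names:

#include <linux/kernel.h>
#include <linux/irq_work.h>

struct demo_event {
        struct irq_work pending;
        int pending_wakeup;
};

static void demo_wakeup(struct demo_event *e)
{
        /* hypothetical: kill_fasync()/wake_up() would go here */
}

static void demo_pending_work(struct irq_work *entry)
{
        struct demo_event *e = container_of(entry, struct demo_event, pending);

        if (e->pending_wakeup) {
                e->pending_wakeup = 0;
                demo_wakeup(e);
        }
}

static void demo_event_init(struct demo_event *e)
{
        init_irq_work(&e->pending, demo_pending_work);
}

static void demo_overflow(struct demo_event *e, int nmi)
{
        if (nmi) {
                /* NMI: defer, the wakeup runs later from the irq_work path. */
                e->pending_wakeup = 1;
                irq_work_queue(&e->pending);
        } else {
                demo_wakeup(e);
        }
}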
@@ -5331,14 +5331,6 @@ swevent_hlist_deref(struct swevent_htable *swhash)
5331 lockdep_is_held(&swhash->hlist_mutex)); 5331 lockdep_is_held(&swhash->hlist_mutex));
5332} 5332}
5333 5333
5334static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
5335{
5336 struct swevent_hlist *hlist;
5337
5338 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
5339 kfree(hlist);
5340}
5341
5342static void swevent_hlist_release(struct swevent_htable *swhash) 5334static void swevent_hlist_release(struct swevent_htable *swhash)
5343{ 5335{
5344 struct swevent_hlist *hlist = swevent_hlist_deref(swhash); 5336 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
@@ -5347,7 +5339,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
5347 return; 5339 return;
5348 5340
5349 rcu_assign_pointer(swhash->swevent_hlist, NULL); 5341 rcu_assign_pointer(swhash->swevent_hlist, NULL);
5350 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 5342 kfree_rcu(hlist, rcu_head);
5351} 5343}
5352 5344
5353static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 5345static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
@@ -5429,7 +5421,7 @@ fail:
5429 return err; 5421 return err;
5430} 5422}
5431 5423
5432atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5424struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5433 5425
5434static void sw_perf_event_destroy(struct perf_event *event) 5426static void sw_perf_event_destroy(struct perf_event *event)
5435{ 5427{
@@ -7410,26 +7402,12 @@ static int __perf_cgroup_move(void *info)
7410 return 0; 7402 return 0;
7411} 7403}
7412 7404
7413static void perf_cgroup_move(struct task_struct *task) 7405static void
7406perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
7414{ 7407{
7415 task_function_call(task, __perf_cgroup_move, task); 7408 task_function_call(task, __perf_cgroup_move, task);
7416} 7409}
7417 7410
7418static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7419 struct cgroup *old_cgrp, struct task_struct *task,
7420 bool threadgroup)
7421{
7422 perf_cgroup_move(task);
7423 if (threadgroup) {
7424 struct task_struct *c;
7425 rcu_read_lock();
7426 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7427 perf_cgroup_move(c);
7428 }
7429 rcu_read_unlock();
7430 }
7431}
7432
7433static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7411static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7434 struct cgroup *old_cgrp, struct task_struct *task) 7412 struct cgroup *old_cgrp, struct task_struct *task)
7435{ 7413{
@@ -7441,15 +7419,15 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7441 if (!(task->flags & PF_EXITING)) 7419 if (!(task->flags & PF_EXITING))
7442 return; 7420 return;
7443 7421
7444 perf_cgroup_move(task); 7422 perf_cgroup_attach_task(cgrp, task);
7445} 7423}
7446 7424
7447struct cgroup_subsys perf_subsys = { 7425struct cgroup_subsys perf_subsys = {
7448 .name = "perf_event", 7426 .name = "perf_event",
7449 .subsys_id = perf_subsys_id, 7427 .subsys_id = perf_subsys_id,
7450 .create = perf_cgroup_create, 7428 .create = perf_cgroup_create,
7451 .destroy = perf_cgroup_destroy, 7429 .destroy = perf_cgroup_destroy,
7452 .exit = perf_cgroup_exit, 7430 .exit = perf_cgroup_exit,
7453 .attach = perf_cgroup_attach, 7431 .attach_task = perf_cgroup_attach_task,
7454}; 7432};
7455#endif /* CONFIG_CGROUP_PERF */ 7433#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
diff --git a/kernel/exit.c b/kernel/exit.c
index 8dd874181542..f2b321bae440 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -561,29 +561,28 @@ void exit_files(struct task_struct *tsk)
561 561
562#ifdef CONFIG_MM_OWNER 562#ifdef CONFIG_MM_OWNER
563/* 563/*
564 * Task p is exiting and it owned mm, lets find a new owner for it 564 * A task is exiting. If it owned this mm, find a new owner for the mm.
565 */ 565 */
566static inline int
567mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
568{
569 /*
570 * If there are other users of the mm and the owner (us) is exiting
571 * we need to find a new owner to take on the responsibility.
572 */
573 if (atomic_read(&mm->mm_users) <= 1)
574 return 0;
575 if (mm->owner != p)
576 return 0;
577 return 1;
578}
579
580void mm_update_next_owner(struct mm_struct *mm) 566void mm_update_next_owner(struct mm_struct *mm)
581{ 567{
582 struct task_struct *c, *g, *p = current; 568 struct task_struct *c, *g, *p = current;
583 569
584retry: 570retry:
585 if (!mm_need_new_owner(mm, p)) 571 /*
572 * If the exiting or execing task is not the owner, it's
573 * someone else's problem.
574 */
575 if (mm->owner != p)
586 return; 576 return;
577 /*
578 * The current owner is exiting/execing and there are no other
579 * candidates. Do not leave the mm pointing to a possibly
580 * freed task structure.
581 */
582 if (atomic_read(&mm->mm_users) <= 1) {
583 mm->owner = NULL;
584 return;
585 }
587 586
588 read_lock(&tasklist_lock); 587 read_lock(&tasklist_lock);
589 /* 588 /*
@@ -1377,11 +1376,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1377 return NULL; 1376 return NULL;
1378} 1377}
1379 1378
1380/* 1379/**
1381 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold 1380 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1382 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1381 * @wo: wait options
1383 * the lock and this task is uninteresting. If we return nonzero, we have 1382 * @ptrace: is the wait for ptrace
1384 * released the lock and the system call should return. 1383 * @p: task to wait for
1384 *
1385 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1386 *
1387 * CONTEXT:
1388 * read_lock(&tasklist_lock), which is released if return value is
1389 * non-zero. Also, grabs and releases @p->sighand->siglock.
1390 *
1391 * RETURNS:
1392 * 0 if wait condition didn't exist and search for other wait conditions
1393 * should continue. Non-zero return, -errno on failure and @p's pid on
1394 * success, implies that tasklist_lock is released and wait condition
1395 * search should terminate.
1385 */ 1396 */
1386static int wait_task_stopped(struct wait_opts *wo, 1397static int wait_task_stopped(struct wait_opts *wo,
1387 int ptrace, struct task_struct *p) 1398 int ptrace, struct task_struct *p)
@@ -1397,6 +1408,9 @@ static int wait_task_stopped(struct wait_opts *wo,
1397 if (!ptrace && !(wo->wo_flags & WUNTRACED)) 1408 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1398 return 0; 1409 return 0;
1399 1410
1411 if (!task_stopped_code(p, ptrace))
1412 return 0;
1413
1400 exit_code = 0; 1414 exit_code = 0;
1401 spin_lock_irq(&p->sighand->siglock); 1415 spin_lock_irq(&p->sighand->siglock);
1402 1416
@@ -1538,33 +1552,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1538 return 0; 1552 return 0;
1539 } 1553 }
1540 1554
1541 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1555 /* dead body doesn't have much to contribute */
1556 if (p->exit_state == EXIT_DEAD)
1557 return 0;
1558
1559 /* slay zombie? */
1560 if (p->exit_state == EXIT_ZOMBIE) {
1561 /*
1562 * A zombie ptracee is only visible to its ptracer.
1563 * Notification and reaping will be cascaded to the real
1564 * parent when the ptracer detaches.
1565 */
1566 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1567 /* it will become visible, clear notask_error */
1568 wo->notask_error = 0;
1569 return 0;
1570 }
1571
1572 /* we don't reap group leaders with subthreads */
1573 if (!delay_group_leader(p))
1574 return wait_task_zombie(wo, p);
1575
1542 /* 1576 /*
1543 * This child is hidden by ptrace. 1577 * Allow access to stopped/continued state via zombie by
1544 * We aren't allowed to see it now, but eventually we will. 1578 * falling through. Clearing of notask_error is complex.
1579 *
1580 * When !@ptrace:
1581 *
1582 * If WEXITED is set, notask_error should naturally be
1583 * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
1584 * so, if there are live subthreads, there are events to
1585 * wait for. If all subthreads are dead, it's still safe
1586 * to clear - this function will be called again in finite
1587 * amount of time once all the subthreads are released and
1588 * will then return without clearing.
1589 *
1590 * When @ptrace:
1591 *
1592 * Stopped state is per-task and thus can't change once the
1593 * target task dies. Only continued and exited can happen.
1594 * Clear notask_error if WCONTINUED | WEXITED.
1595 */
1596 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1597 wo->notask_error = 0;
1598 } else {
1599 /*
1600 * If @p is ptraced by a task in its real parent's group,
1601 * hide group stop/continued state when looking at @p as
1602 * the real parent; otherwise, a single stop can be
1603 * reported twice as group and ptrace stops.
1604 *
1605 * If a ptracer wants to distinguish the two events for its
1606 * own children, it should create a separate process which
1607 * takes the role of real parent.
1608 */
1609 if (likely(!ptrace) && task_ptrace(p) &&
1610 same_thread_group(p->parent, p->real_parent))
1611 return 0;
1612
1613 /*
1614 * @p is alive and it's gonna stop, continue or exit, so
1615 * there always is something to wait for.
1545 */ 1616 */
1546 wo->notask_error = 0; 1617 wo->notask_error = 0;
1547 return 0;
1548 } 1618 }
1549 1619
1550 if (p->exit_state == EXIT_DEAD)
1551 return 0;
1552
1553 /* 1620 /*
1554 * We don't reap group leaders with subthreads. 1621 * Wait for stopped. Depending on @ptrace, different stopped state
1622 * is used and the two don't interact with each other.
1555 */ 1623 */
1556 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1624 ret = wait_task_stopped(wo, ptrace, p);
1557 return wait_task_zombie(wo, p); 1625 if (ret)
1626 return ret;
1558 1627
1559 /* 1628 /*
1560 * It's stopped or running now, so it might 1629 * Wait for continued. There's only one continued state and the
1561 * later continue, exit, or stop again. 1630 * ptracer can consume it which can confuse the real parent. Don't
1631 * use WCONTINUED from ptracer. You don't need or want it.
1562 */ 1632 */
1563 wo->notask_error = 0;
1564
1565 if (task_stopped_code(p, ptrace))
1566 return wait_task_stopped(wo, ptrace, p);
1567
1568 return wait_task_continued(wo, p); 1633 return wait_task_continued(wo, p);
1569} 1634}
1570 1635
diff --git a/kernel/extable.c b/kernel/extable.c
index 7f8f263f8524..5339705b8241 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -72,6 +72,24 @@ int core_kernel_text(unsigned long addr)
72 return 0; 72 return 0;
73} 73}
74 74
75/**
76 * core_kernel_data - tell if addr points to kernel data
77 * @addr: address to test
78 *
79 * Returns true if @addr passed in is from the core kernel data
80 * section.
81 *
82 * Note: On some archs it may return true for core RODATA, and false
83 * for others. But will always be true for core RW data.
84 */
85int core_kernel_data(unsigned long addr)
86{
87 if (addr >= (unsigned long)_sdata &&
88 addr < (unsigned long)_edata)
89 return 1;
90 return 0;
91}
92
75int __kernel_text_address(unsigned long addr) 93int __kernel_text_address(unsigned long addr)
76{ 94{
77 if (core_kernel_text(addr)) 95 if (core_kernel_text(addr))
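core_kernel_data() gives callers a cheap way to ask whether a pointer refers to statically allocated core-kernel data rather than module data or heap memory; the ftrace rework later in this series is the intended user. A hedged usage sketch (the pinning policy shown is illustrative, not taken from this commit):

#include <linux/kernel.h>
#include <linux/module.h>

/*
 * Pin the owning module when @obj lives in module memory; core kernel
 * data can never go away, so no reference is needed for it.
 */
static int demo_pin_owner(void *obj, struct module *mod)
{
        if (core_kernel_data((unsigned long)obj))
                return 0;
        if (!try_module_get(mod))
                return -ENODEV;
        return 0;
}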
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548dee636b..0276c30401a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -59,7 +59,6 @@
59#include <linux/taskstats_kern.h> 59#include <linux/taskstats_kern.h>
60#include <linux/random.h> 60#include <linux/random.h>
61#include <linux/tty.h> 61#include <linux/tty.h>
62#include <linux/proc_fs.h>
63#include <linux/blkdev.h> 62#include <linux/blkdev.h>
64#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
65#include <linux/magic.h> 64#include <linux/magic.h>
@@ -383,15 +382,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
383 get_file(file); 382 get_file(file);
384 if (tmp->vm_flags & VM_DENYWRITE) 383 if (tmp->vm_flags & VM_DENYWRITE)
385 atomic_dec(&inode->i_writecount); 384 atomic_dec(&inode->i_writecount);
386 spin_lock(&mapping->i_mmap_lock); 385 mutex_lock(&mapping->i_mmap_mutex);
387 if (tmp->vm_flags & VM_SHARED) 386 if (tmp->vm_flags & VM_SHARED)
388 mapping->i_mmap_writable++; 387 mapping->i_mmap_writable++;
389 tmp->vm_truncate_count = mpnt->vm_truncate_count;
390 flush_dcache_mmap_lock(mapping); 388 flush_dcache_mmap_lock(mapping);
391 /* insert tmp into the share list, just after mpnt */ 389 /* insert tmp into the share list, just after mpnt */
392 vma_prio_tree_add(tmp, mpnt); 390 vma_prio_tree_add(tmp, mpnt);
393 flush_dcache_mmap_unlock(mapping); 391 flush_dcache_mmap_unlock(mapping);
394 spin_unlock(&mapping->i_mmap_lock); 392 mutex_unlock(&mapping->i_mmap_mutex);
395 } 393 }
396 394
397 /* 395 /*
@@ -522,11 +520,12 @@ struct mm_struct * mm_alloc(void)
522 struct mm_struct * mm; 520 struct mm_struct * mm;
523 521
524 mm = allocate_mm(); 522 mm = allocate_mm();
525 if (mm) { 523 if (!mm)
526 memset(mm, 0, sizeof(*mm)); 524 return NULL;
527 mm = mm_init(mm, current); 525
528 } 526 memset(mm, 0, sizeof(*mm));
529 return mm; 527 mm_init_cpumask(mm);
528 return mm_init(mm, current);
530} 529}
531 530
532/* 531/*
@@ -573,6 +572,57 @@ void mmput(struct mm_struct *mm)
573} 572}
574EXPORT_SYMBOL_GPL(mmput); 573EXPORT_SYMBOL_GPL(mmput);
575 574
575/*
576 * We added or removed a vma mapping the executable. The vmas are only mapped
577 * during exec and are not mapped with the mmap system call.
578 * Callers must hold down_write() on the mm's mmap_sem for these
579 */
580void added_exe_file_vma(struct mm_struct *mm)
581{
582 mm->num_exe_file_vmas++;
583}
584
585void removed_exe_file_vma(struct mm_struct *mm)
586{
587 mm->num_exe_file_vmas--;
588 if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
589 fput(mm->exe_file);
590 mm->exe_file = NULL;
591 }
592
593}
594
595void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
596{
597 if (new_exe_file)
598 get_file(new_exe_file);
599 if (mm->exe_file)
600 fput(mm->exe_file);
601 mm->exe_file = new_exe_file;
602 mm->num_exe_file_vmas = 0;
603}
604
605struct file *get_mm_exe_file(struct mm_struct *mm)
606{
607 struct file *exe_file;
608
609 /* We need mmap_sem to protect against races with removal of
610 * VM_EXECUTABLE vmas */
611 down_read(&mm->mmap_sem);
612 exe_file = mm->exe_file;
613 if (exe_file)
614 get_file(exe_file);
615 up_read(&mm->mmap_sem);
616 return exe_file;
617}
618
619static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
620{
621 /* It's safe to write the exe_file pointer without exe_file_lock because
622 * this is called during fork when the task is not yet in /proc */
623 newmm->exe_file = get_mm_exe_file(oldmm);
624}
625
576/** 626/**
577 * get_task_mm - acquire a reference to the task's mm 627 * get_task_mm - acquire a reference to the task's mm
578 * 628 *
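These helpers move the exe_file bookkeeping into the core: set_mm_exe_file() is called from exec, added_exe_file_vma()/removed_exe_file_vma() are called with mmap_sem held for write as VM_EXECUTABLE mappings come and go, and get_mm_exe_file() is the read-side accessor that returns a referenced struct file. A sketch of a hypothetical reader (not from this commit) showing the reference discipline:

#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/mm.h>

/* Print the path of the binary backing @mm, if there is one. */
static void demo_show_exe(struct mm_struct *mm)
{
        struct file *exe = get_mm_exe_file(mm);         /* takes a reference */
        char buf[256], *p;

        if (!exe)
                return;
        p = d_path(&exe->f_path, buf, sizeof(buf));
        if (!IS_ERR(p))
                pr_info("exe: %s\n", p);
        fput(exe);                                      /* drop the reference */
}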
@@ -679,6 +729,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
679 goto fail_nomem; 729 goto fail_nomem;
680 730
681 memcpy(mm, oldmm, sizeof(*mm)); 731 memcpy(mm, oldmm, sizeof(*mm));
732 mm_init_cpumask(mm);
682 733
683 /* Initializing for Swap token stuff */ 734 /* Initializing for Swap token stuff */
684 mm->token_priority = 0; 735 mm->token_priority = 0;
@@ -927,6 +978,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
927 tty_audit_fork(sig); 978 tty_audit_fork(sig);
928 sched_autogroup_fork(sig); 979 sched_autogroup_fork(sig);
929 980
981#ifdef CONFIG_CGROUPS
982 init_rwsem(&sig->threadgroup_fork_lock);
983#endif
984
930 sig->oom_adj = current->signal->oom_adj; 985 sig->oom_adj = current->signal->oom_adj;
931 sig->oom_score_adj = current->signal->oom_score_adj; 986 sig->oom_score_adj = current->signal->oom_score_adj;
932 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 987 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1103,12 +1158,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1103 1158
1104 posix_cpu_timers_init(p); 1159 posix_cpu_timers_init(p);
1105 1160
1106 p->lock_depth = -1; /* -1 = no lock */
1107 do_posix_clock_monotonic_gettime(&p->start_time); 1161 do_posix_clock_monotonic_gettime(&p->start_time);
1108 p->real_start_time = p->start_time; 1162 p->real_start_time = p->start_time;
1109 monotonic_to_bootbased(&p->real_start_time); 1163 monotonic_to_bootbased(&p->real_start_time);
1110 p->io_context = NULL; 1164 p->io_context = NULL;
1111 p->audit_context = NULL; 1165 p->audit_context = NULL;
1166 if (clone_flags & CLONE_THREAD)
1167 threadgroup_fork_read_lock(current);
1112 cgroup_fork(p); 1168 cgroup_fork(p);
1113#ifdef CONFIG_NUMA 1169#ifdef CONFIG_NUMA
1114 p->mempolicy = mpol_dup(p->mempolicy); 1170 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1153,7 +1209,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1153#endif 1209#endif
1154 1210
1155 /* Perform scheduler related setup. Assign this task to a CPU. */ 1211 /* Perform scheduler related setup. Assign this task to a CPU. */
1156 sched_fork(p, clone_flags); 1212 sched_fork(p);
1157 1213
1158 retval = perf_event_init_task(p); 1214 retval = perf_event_init_task(p);
1159 if (retval) 1215 if (retval)
@@ -1194,12 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1194 if (clone_flags & CLONE_THREAD) 1250 if (clone_flags & CLONE_THREAD)
1195 p->tgid = current->tgid; 1251 p->tgid = current->tgid;
1196 1252
1197 if (current->nsproxy != p->nsproxy) {
1198 retval = ns_cgroup_clone(p, pid);
1199 if (retval)
1200 goto bad_fork_free_pid;
1201 }
1202
1203 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1253 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1204 /* 1254 /*
1205 * Clear TID on mm_release()? 1255 * Clear TID on mm_release()?
@@ -1313,6 +1363,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1313 write_unlock_irq(&tasklist_lock); 1363 write_unlock_irq(&tasklist_lock);
1314 proc_fork_connector(p); 1364 proc_fork_connector(p);
1315 cgroup_post_fork(p); 1365 cgroup_post_fork(p);
1366 if (clone_flags & CLONE_THREAD)
1367 threadgroup_fork_read_unlock(current);
1316 perf_event_fork(p); 1368 perf_event_fork(p);
1317 return p; 1369 return p;
1318 1370
@@ -1351,6 +1403,8 @@ bad_fork_cleanup_policy:
1351 mpol_put(p->mempolicy); 1403 mpol_put(p->mempolicy);
1352bad_fork_cleanup_cgroup: 1404bad_fork_cleanup_cgroup:
1353#endif 1405#endif
1406 if (clone_flags & CLONE_THREAD)
1407 threadgroup_fork_read_unlock(current);
1354 cgroup_exit(p, cgroup_callbacks_done); 1408 cgroup_exit(p, cgroup_callbacks_done);
1355 delayacct_tsk_free(p); 1409 delayacct_tsk_free(p);
1356 module_put(task_thread_info(p)->exec_domain->module); 1410 module_put(task_thread_info(p)->exec_domain->module);
@@ -1464,7 +1518,7 @@ long do_fork(unsigned long clone_flags,
1464 */ 1518 */
1465 p->flags &= ~PF_STARTING; 1519 p->flags &= ~PF_STARTING;
1466 1520
1467 wake_up_new_task(p, clone_flags); 1521 wake_up_new_task(p);
1468 1522
1469 tracehook_report_clone_complete(trace, regs, 1523 tracehook_report_clone_complete(trace, regs,
1470 clone_flags, nr, p); 1524 clone_flags, nr, p);
@@ -1508,6 +1562,13 @@ void __init proc_caches_init(void)
1508 fs_cachep = kmem_cache_create("fs_cache", 1562 fs_cachep = kmem_cache_create("fs_cache",
1509 sizeof(struct fs_struct), 0, 1563 sizeof(struct fs_struct), 0,
1510 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1564 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1565 /*
1566 * FIXME! The "sizeof(struct mm_struct)" currently includes the
1567 * whole struct cpumask for the OFFSTACK case. We could change
1568 * this to *only* allocate as much of it as required by the
1569 * maximum number of CPU's we can ever have. The cpumask_allocation
1570 * is at the end of the structure, exactly for that reason.
1571 */
1511 mm_cachep = kmem_cache_create("mm_struct", 1572 mm_cachep = kmem_cache_create("mm_struct",
1512 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1573 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1513 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1574 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
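The FIXME refers to the CONFIG_CPUMASK_OFFSTACK layout: mm_init_cpumask() points the mask pointer at a cpumask_allocation field deliberately placed at the very end of mm_struct, so the slab object could in principle be trimmed to only the bits nr_cpu_ids actually needs. The same trailing-bitmap idea in standalone form (a generic illustration, not the mm_struct code):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpumask.h>

struct demo_tracker {
        int id;
        /* Must stay last: only cpumask_size() bytes of it are ever used. */
        struct cpumask cpus;
};

static struct demo_tracker *demo_tracker_alloc(void)
{
        /* Allocate just enough of the trailing mask for nr_cpu_ids bits. */
        size_t sz = offsetof(struct demo_tracker, cpus) + cpumask_size();

        return kzalloc(sz, GFP_KERNEL);
}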
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 66ecd2ead215..7b01de98bb6a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -17,7 +17,7 @@ static inline void frozen_process(void)
17{ 17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN; 19 current->flags |= PF_FROZEN;
20 wmb(); 20 smp_wmb();
21 } 21 }
22 clear_freeze_flag(current); 22 clear_freeze_flag(current);
23} 23}
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only)
93 * the task as frozen and next clears its TIF_FREEZE. 93 * the task as frozen and next clears its TIF_FREEZE.
94 */ 94 */
95 if (!freezing(p)) { 95 if (!freezing(p)) {
96 rmb(); 96 smp_rmb();
97 if (frozen(p)) 97 if (frozen(p))
98 return false; 98 return false;
99 99
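wmb()/rmb() become smp_wmb()/smp_rmb() because the ordering here only has to be visible to another CPU, not to a device; the smp_ variants also compile away on uniprocessor builds. The pairing being relied on, in generic form (a sketch; the flag and payload names are made up):

#include <linux/kernel.h>

static int demo_payload;
static int demo_ready;

/* Writer side (one CPU): publish the payload before the flag. */
static void demo_publish(int val)
{
        demo_payload = val;
        smp_wmb();              /* order payload store before the flag store */
        demo_ready = 1;
}

/* Reader side (another CPU): check the flag before trusting the payload. */
static int demo_consume(void)
{
        if (!demo_ready)
                return -1;
        smp_rmb();              /* pairs with smp_wmb() in demo_publish() */
        return demo_payload;
}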
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index b8cadf70b1fb..5bf924d80b5c 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling"
2 2
3config GCOV_KERNEL 3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling" 4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS && CONSTRUCTORS 5 depends on DEBUG_FS
6 select CONSTRUCTORS
6 default n 7 default n
7 ---help--- 8 ---help---
8 This option enables gcov-based code profiling (e.g. for code coverage 9 This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 87fdb3f8db14..a9205e32a059 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -64,24 +64,27 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
64 .clock_base = 64 .clock_base =
65 { 65 {
66 { 66 {
67 .index = CLOCK_REALTIME, 67 .index = HRTIMER_BASE_MONOTONIC,
68 .get_time = &ktime_get_real, 68 .clockid = CLOCK_MONOTONIC,
69 .get_time = &ktime_get,
69 .resolution = KTIME_LOW_RES, 70 .resolution = KTIME_LOW_RES,
70 }, 71 },
71 { 72 {
72 .index = CLOCK_MONOTONIC, 73 .index = HRTIMER_BASE_REALTIME,
73 .get_time = &ktime_get, 74 .clockid = CLOCK_REALTIME,
75 .get_time = &ktime_get_real,
74 .resolution = KTIME_LOW_RES, 76 .resolution = KTIME_LOW_RES,
75 }, 77 },
76 { 78 {
77 .index = CLOCK_BOOTTIME, 79 .index = HRTIMER_BASE_BOOTTIME,
80 .clockid = CLOCK_BOOTTIME,
78 .get_time = &ktime_get_boottime, 81 .get_time = &ktime_get_boottime,
79 .resolution = KTIME_LOW_RES, 82 .resolution = KTIME_LOW_RES,
80 }, 83 },
81 } 84 }
82}; 85};
83 86
84static int hrtimer_clock_to_base_table[MAX_CLOCKS] = { 87static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
85 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, 88 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
86 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, 89 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
87 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, 90 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
@@ -196,7 +199,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
196 struct hrtimer_cpu_base *new_cpu_base; 199 struct hrtimer_cpu_base *new_cpu_base;
197 int this_cpu = smp_processor_id(); 200 int this_cpu = smp_processor_id();
198 int cpu = hrtimer_get_target(this_cpu, pinned); 201 int cpu = hrtimer_get_target(this_cpu, pinned);
199 int basenum = hrtimer_clockid_to_base(base->index); 202 int basenum = base->index;
200 203
201again: 204again:
202 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 205 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -621,66 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer,
621 return res; 624 return res;
622} 625}
623 626
624
625/*
626 * Retrigger next event is called after clock was set
627 *
628 * Called with interrupts disabled via on_each_cpu()
629 */
630static void retrigger_next_event(void *arg)
631{
632 struct hrtimer_cpu_base *base;
633 struct timespec realtime_offset, wtm, sleep;
634
635 if (!hrtimer_hres_active())
636 return;
637
638 get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
639 &sleep);
640 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
641
642 base = &__get_cpu_var(hrtimer_bases);
643
644 /* Adjust CLOCK_REALTIME offset */
645 raw_spin_lock(&base->lock);
646 base->clock_base[HRTIMER_BASE_REALTIME].offset =
647 timespec_to_ktime(realtime_offset);
648 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
649 timespec_to_ktime(sleep);
650
651 hrtimer_force_reprogram(base, 0);
652 raw_spin_unlock(&base->lock);
653}
654
655/*
656 * Clock realtime was set
657 *
658 * Change the offset of the realtime clock vs. the monotonic
659 * clock.
660 *
661 * We might have to reprogram the high resolution timer interrupt. On
662 * SMP we call the architecture specific code to retrigger _all_ high
663 * resolution timer interrupts. On UP we just disable interrupts and
664 * call the high resolution interrupt code.
665 */
666void clock_was_set(void)
667{
668 /* Retrigger the CPU local events everywhere */
669 on_each_cpu(retrigger_next_event, NULL, 1);
670}
671
672/*
673 * During resume we might have to reprogram the high resolution timer
674 * interrupt (on the local CPU):
675 */
676void hres_timers_resume(void)
677{
678 WARN_ONCE(!irqs_disabled(),
679 KERN_INFO "hres_timers_resume() called with IRQs enabled!");
680
681 retrigger_next_event(NULL);
682}
683
684/* 627/*
685 * Initialize the high resolution related parts of cpu_base 628 * Initialize the high resolution related parts of cpu_base
686 */ 629 */
@@ -715,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
715} 658}
716 659
717/* 660/*
661 * Retrigger next event is called after clock was set
662 *
663 * Called with interrupts disabled via on_each_cpu()
664 */
665static void retrigger_next_event(void *arg)
666{
667 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
668 struct timespec realtime_offset, xtim, wtm, sleep;
669
670 if (!hrtimer_hres_active())
671 return;
672
673 /* Optimized out for !HIGH_RES */
674 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
675 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
676
677 /* Adjust CLOCK_REALTIME offset */
678 raw_spin_lock(&base->lock);
679 base->clock_base[HRTIMER_BASE_REALTIME].offset =
680 timespec_to_ktime(realtime_offset);
681 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
682 timespec_to_ktime(sleep);
683
684 hrtimer_force_reprogram(base, 0);
685 raw_spin_unlock(&base->lock);
686}
687
688/*
718 * Switch to high resolution mode 689 * Switch to high resolution mode
719 */ 690 */
720static int hrtimer_switch_to_hres(void) 691static int hrtimer_switch_to_hres(void)
721{ 692{
722 int cpu = smp_processor_id(); 693 int i, cpu = smp_processor_id();
723 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); 694 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
724 unsigned long flags; 695 unsigned long flags;
725 696
@@ -735,9 +706,8 @@ static int hrtimer_switch_to_hres(void)
735 return 0; 706 return 0;
736 } 707 }
737 base->hres_active = 1; 708 base->hres_active = 1;
738 base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; 709 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
739 base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; 710 base->clock_base[i].resolution = KTIME_HIGH_RES;
740 base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
741 711
742 tick_setup_sched_timer(); 712 tick_setup_sched_timer();
743 713
@@ -761,9 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
761 return 0; 731 return 0;
762} 732}
763static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 733static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
734static inline void retrigger_next_event(void *arg) { }
764 735
765#endif /* CONFIG_HIGH_RES_TIMERS */ 736#endif /* CONFIG_HIGH_RES_TIMERS */
766 737
738/*
739 * Clock realtime was set
740 *
741 * Change the offset of the realtime clock vs. the monotonic
742 * clock.
743 *
744 * We might have to reprogram the high resolution timer interrupt. On
745 * SMP we call the architecture specific code to retrigger _all_ high
746 * resolution timer interrupts. On UP we just disable interrupts and
747 * call the high resolution interrupt code.
748 */
749void clock_was_set(void)
750{
751#ifdef CONFIG_HIGH_RES_TIMERS
752 /* Retrigger the CPU local events everywhere */
753 on_each_cpu(retrigger_next_event, NULL, 1);
754#endif
755 timerfd_clock_was_set();
756}
757
758/*
759 * During resume we might have to reprogram the high resolution timer
760 * interrupt (on the local CPU):
761 */
762void hrtimers_resume(void)
763{
764 WARN_ONCE(!irqs_disabled(),
765 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
766
767 retrigger_next_event(NULL);
768 timerfd_clock_was_set();
769}
770
767static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) 771static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
768{ 772{
769#ifdef CONFIG_TIMER_STATS 773#ifdef CONFIG_TIMER_STATS
@@ -856,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
856 debug_activate(timer); 860 debug_activate(timer);
857 861
858 timerqueue_add(&base->active, &timer->node); 862 timerqueue_add(&base->active, &timer->node);
863 base->cpu_base->active_bases |= 1 << base->index;
859 864
860 /* 865 /*
861 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 866 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
@@ -897,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
897#endif 902#endif
898 } 903 }
899 timerqueue_del(&base->active, &timer->node); 904 timerqueue_del(&base->active, &timer->node);
905 if (!timerqueue_getnext(&base->active))
906 base->cpu_base->active_bases &= ~(1 << base->index);
900out: 907out:
901 timer->state = newstate; 908 timer->state = newstate;
902} 909}
@@ -1234,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1234void hrtimer_interrupt(struct clock_event_device *dev) 1241void hrtimer_interrupt(struct clock_event_device *dev)
1235{ 1242{
1236 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1243 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1237 struct hrtimer_clock_base *base;
1238 ktime_t expires_next, now, entry_time, delta; 1244 ktime_t expires_next, now, entry_time, delta;
1239 int i, retries = 0; 1245 int i, retries = 0;
1240 1246
@@ -1256,12 +1262,15 @@ retry:
1256 */ 1262 */
1257 cpu_base->expires_next.tv64 = KTIME_MAX; 1263 cpu_base->expires_next.tv64 = KTIME_MAX;
1258 1264
1259 base = cpu_base->clock_base;
1260
1261 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1265 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1262 ktime_t basenow; 1266 struct hrtimer_clock_base *base;
1263 struct timerqueue_node *node; 1267 struct timerqueue_node *node;
1268 ktime_t basenow;
1269
1270 if (!(cpu_base->active_bases & (1 << i)))
1271 continue;
1264 1272
1273 base = cpu_base->clock_base + i;
1265 basenow = ktime_add(now, base->offset); 1274 basenow = ktime_add(now, base->offset);
1266 1275
1267 while ((node = timerqueue_getnext(&base->active))) { 1276 while ((node = timerqueue_getnext(&base->active))) {
@@ -1294,7 +1303,6 @@ retry:
1294 1303
1295 __run_hrtimer(timer, &basenow); 1304 __run_hrtimer(timer, &basenow);
1296 } 1305 }
1297 base++;
1298 } 1306 }
1299 1307
1300 /* 1308 /*
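active_bases is a small bitmask per hrtimer_cpu_base: enqueue_hrtimer() sets the bit for a clock base when a timer is queued on it, __remove_hrtimer() clears it once the base runs empty, and hrtimer_interrupt() now skips bases whose bit is clear instead of walking all of them. The bookkeeping pattern in isolation (a simplified illustration, not the kernel code):

#define DEMO_MAX_BASES 3

struct demo_cpu_base {
        unsigned long active_bases;             /* bit i set => base i non-empty */
        unsigned int nr_queued[DEMO_MAX_BASES]; /* stand-in for the timer queues */
};

static void demo_enqueue(struct demo_cpu_base *cb, int base)
{
        if (cb->nr_queued[base]++ == 0)
                cb->active_bases |= 1UL << base;
}

static void demo_remove(struct demo_cpu_base *cb, int base)
{
        if (--cb->nr_queued[base] == 0)
                cb->active_bases &= ~(1UL << base);
}

static void demo_expire_all(struct demo_cpu_base *cb)
{
        int i;

        for (i = 0; i < DEMO_MAX_BASES; i++) {
                if (!(cb->active_bases & (1UL << i)))
                        continue;               /* nothing queued on this base */
                /* ... expire the timers of base i ... */
        }
}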
@@ -1525,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1525 struct timespec __user *rmtp; 1533 struct timespec __user *rmtp;
1526 int ret = 0; 1534 int ret = 0;
1527 1535
1528 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, 1536 hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
1529 HRTIMER_MODE_ABS); 1537 HRTIMER_MODE_ABS);
1530 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); 1538 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1531 1539
@@ -1577,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1577 1585
1578 restart = &current_thread_info()->restart_block; 1586 restart = &current_thread_info()->restart_block;
1579 restart->fn = hrtimer_nanosleep_restart; 1587 restart->fn = hrtimer_nanosleep_restart;
1580 restart->nanosleep.index = t.timer.base->index; 1588 restart->nanosleep.clockid = t.timer.base->clockid;
1581 restart->nanosleep.rmtp = rmtp; 1589 restart->nanosleep.rmtp = rmtp;
1582 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); 1590 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1583 1591
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 53ead174da2f..ea640120ab86 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
33/* 33/*
34 * Zero means infinite timeout - no checking done: 34 * Zero means infinite timeout - no checking done:
35 */ 35 */
36unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 36unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
37 37
38unsigned long __read_mostly sysctl_hung_task_warnings = 10; 38unsigned long __read_mostly sysctl_hung_task_warnings = 10;
39 39
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index c574f9a12c48..d1d051b38e0b 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -48,6 +48,10 @@ config IRQ_PREFLOW_FASTEOI
48config IRQ_EDGE_EOI_HANDLER 48config IRQ_EDGE_EOI_HANDLER
49 bool 49 bool
50 50
51# Generic configurable interrupt chip implementation
52config GENERIC_IRQ_CHIP
53 bool
54
51# Support forced irq threading 55# Support forced irq threading
52config IRQ_FORCED_THREADING 56config IRQ_FORCED_THREADING
53 bool 57 bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 54329cd7b3ee..73290056cfb6 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,6 @@
1 1
2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 5obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 6obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 4af1e2b244cb..d5a3009da71a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -310,6 +310,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
310out_unlock: 310out_unlock:
311 raw_spin_unlock(&desc->lock); 311 raw_spin_unlock(&desc->lock);
312} 312}
313EXPORT_SYMBOL_GPL(handle_simple_irq);
313 314
314/** 315/**
315 * handle_level_irq - Level type irq handler 316 * handle_level_irq - Level type irq handler
@@ -573,6 +574,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
573 if (handle != handle_bad_irq && is_chained) { 574 if (handle != handle_bad_irq && is_chained) {
574 irq_settings_set_noprobe(desc); 575 irq_settings_set_noprobe(desc);
575 irq_settings_set_norequest(desc); 576 irq_settings_set_norequest(desc);
577 irq_settings_set_nothread(desc);
576 irq_startup(desc); 578 irq_startup(desc);
577 } 579 }
578out: 580out:
@@ -612,6 +614,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
612 614
613 irq_put_desc_unlock(desc, flags); 615 irq_put_desc_unlock(desc, flags);
614} 616}
617EXPORT_SYMBOL_GPL(irq_modify_status);
615 618
616/** 619/**
617 * irq_cpu_online - Invoke all irq_cpu_online functions. 620 * irq_cpu_online - Invoke all irq_cpu_online functions.
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 306cba37e9a5..97a8bfadc88a 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -27,6 +27,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
27 P(IRQ_PER_CPU); 27 P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE); 28 P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST); 29 P(IRQ_NOREQUEST);
30 P(IRQ_NOTHREAD);
30 P(IRQ_NOAUTOEN); 31 P(IRQ_NOAUTOEN);
31 32
32 PS(IRQS_AUTODETECT); 33 PS(IRQS_AUTODETECT);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
new file mode 100644
index 000000000000..3a2cab407b93
--- /dev/null
+++ b/kernel/irq/generic-chip.c
@@ -0,0 +1,368 @@
1/*
2 * Library implementing the most common irq chip callback functions
3 *
4 * Copyright (C) 2011, Thomas Gleixner
5 */
6#include <linux/io.h>
7#include <linux/irq.h>
8#include <linux/slab.h>
9#include <linux/interrupt.h>
10#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h>
12
13#include "internals.h"
14
15static LIST_HEAD(gc_list);
16static DEFINE_RAW_SPINLOCK(gc_lock);
17
18static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
19{
20 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
21}
22
23/**
24 * irq_gc_noop - NOOP function
25 * @d: irq_data
26 */
27void irq_gc_noop(struct irq_data *d)
28{
29}
30
31/**
32 * irq_gc_mask_disable_reg - Mask chip via disable register
33 * @d: irq_data
34 *
35 * Chip has separate enable/disable registers instead of a single mask
36 * register.
37 */
38void irq_gc_mask_disable_reg(struct irq_data *d)
39{
40 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
41 u32 mask = 1 << (d->irq - gc->irq_base);
42
43 irq_gc_lock(gc);
44 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable);
45 gc->mask_cache &= ~mask;
46 irq_gc_unlock(gc);
47}
48
49/**
50 * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
51 * @d: irq_data
52 *
53 * Chip has a single mask register. Values of this register are cached
54 * and protected by gc->lock
55 */
56void irq_gc_mask_set_bit(struct irq_data *d)
57{
58 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
59 u32 mask = 1 << (d->irq - gc->irq_base);
60
61 irq_gc_lock(gc);
62 gc->mask_cache |= mask;
63 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
64 irq_gc_unlock(gc);
65}
66
67/**
68 * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
69 * @d: irq_data
70 *
71 * Chip has a single mask register. Values of this register are cached
72 * and protected by gc->lock
73 */
74void irq_gc_mask_clr_bit(struct irq_data *d)
75{
76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
77 u32 mask = 1 << (d->irq - gc->irq_base);
78
79 irq_gc_lock(gc);
80 gc->mask_cache &= ~mask;
81 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
82 irq_gc_unlock(gc);
83}
84
85/**
86 * irq_gc_unmask_enable_reg - Unmask chip via enable register
87 * @d: irq_data
88 *
89 * Chip has separate enable/disable registers instead of a single mask
90 * register.
91 */
92void irq_gc_unmask_enable_reg(struct irq_data *d)
93{
94 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
95 u32 mask = 1 << (d->irq - gc->irq_base);
96
97 irq_gc_lock(gc);
98 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable);
99 gc->mask_cache |= mask;
100 irq_gc_unlock(gc);
101}
102
103/**
104 * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
105 * @d: irq_data
106 */
107void irq_gc_ack_set_bit(struct irq_data *d)
108{
109 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
110 u32 mask = 1 << (d->irq - gc->irq_base);
111
112 irq_gc_lock(gc);
113 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
114 irq_gc_unlock(gc);
115}
116
117/**
118 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
119 * @d: irq_data
120 */
121void irq_gc_ack_clr_bit(struct irq_data *d)
122{
123 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
124 u32 mask = ~(1 << (d->irq - gc->irq_base));
125
126 irq_gc_lock(gc);
127 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
128 irq_gc_unlock(gc);
129}
130
131/**
132 * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
133 * @d: irq_data
134 */
135void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
136{
137 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
138 u32 mask = 1 << (d->irq - gc->irq_base);
139
140 irq_gc_lock(gc);
141 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
143 irq_gc_unlock(gc);
144}
145
146/**
147 * irq_gc_eoi - EOI interrupt
148 * @d: irq_data
149 */
150void irq_gc_eoi(struct irq_data *d)
151{
152 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
153 u32 mask = 1 << (d->irq - gc->irq_base);
154
155 irq_gc_lock(gc);
156 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi);
157 irq_gc_unlock(gc);
158}
159
160/**
161 * irq_gc_set_wake - Set/clr wake bit for an interrupt
162 * @d: irq_data
163 *
164 * For chips where the wake from suspend functionality is not
165 * configured in a separate register and the wakeup active state is
166 * just stored in a bitmask.
167 */
168int irq_gc_set_wake(struct irq_data *d, unsigned int on)
169{
170 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
171 u32 mask = 1 << (d->irq - gc->irq_base);
172
173 if (!(mask & gc->wake_enabled))
174 return -EINVAL;
175
176 irq_gc_lock(gc);
177 if (on)
178 gc->wake_active |= mask;
179 else
180 gc->wake_active &= ~mask;
181 irq_gc_unlock(gc);
182 return 0;
183}
184
185/**
186 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
187 * @name: Name of the irq chip
188 * @num_ct: Number of irq_chip_type instances associated with this
189 * @irq_base: Interrupt base nr for this chip
190 * @reg_base: Register base address (virtual)
191 * @handler: Default flow handler associated with this chip
192 *
193 * Returns an initialized irq_chip_generic structure. The chip defaults
194 * to the primary (index 0) irq_chip_type and @handler
195 */
196struct irq_chip_generic *
197irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
198 void __iomem *reg_base, irq_flow_handler_t handler)
199{
200 struct irq_chip_generic *gc;
201 unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
202
203 gc = kzalloc(sz, GFP_KERNEL);
204 if (gc) {
205 raw_spin_lock_init(&gc->lock);
206 gc->num_ct = num_ct;
207 gc->irq_base = irq_base;
208 gc->reg_base = reg_base;
209 gc->chip_types->chip.name = name;
210 gc->chip_types->handler = handler;
211 }
212 return gc;
213}
214
215/*
216 * Separate lockdep class for interrupt chip which can nest irq_desc
217 * lock.
218 */
219static struct lock_class_key irq_nested_lock_class;
220
221/**
222 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
223 * @gc: Generic irq chip holding all data
224 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
225 * @flags: Flags for initialization
226 * @clr: IRQ_* bits to clear
227 * @set: IRQ_* bits to set
228 *
229 * Set up max. 32 interrupts starting from gc->irq_base. Note, this
230 * initializes all interrupts to the primary irq_chip_type and its
231 * associated handler.
232 */
233void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
234 enum irq_gc_flags flags, unsigned int clr,
235 unsigned int set)
236{
237 struct irq_chip_type *ct = gc->chip_types;
238 unsigned int i;
239
240 raw_spin_lock(&gc_lock);
241 list_add_tail(&gc->list, &gc_list);
242 raw_spin_unlock(&gc_lock);
243
244 /* Init mask cache ? */
245 if (flags & IRQ_GC_INIT_MASK_CACHE)
246 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
247
248 for (i = gc->irq_base; msk; msk >>= 1, i++) {
249 if (!(msk & 0x01))
250 continue;
251
252 if (flags & IRQ_GC_INIT_NESTED_LOCK)
253 irq_set_lockdep_class(i, &irq_nested_lock_class);
254
255 irq_set_chip_and_handler(i, &ct->chip, ct->handler);
256 irq_set_chip_data(i, gc);
257 irq_modify_status(i, clr, set);
258 }
259 gc->irq_cnt = i - gc->irq_base;
260}
261
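Together, irq_alloc_generic_chip() and irq_setup_generic_chip() let an interrupt-controller driver describe its registers declaratively and reuse the callbacks above. A hedged sketch of the intended call sequence for a 32-interrupt controller whose mask register masks a line when its bit is set; the register offsets and names are invented, not taken from this commit:

#include <linux/init.h>
#include <linux/irq.h>
#include <linux/io.h>

static void __init demo_irq_init(void __iomem *regs, unsigned int irq_base)
{
        struct irq_chip_generic *gc;
        struct irq_chip_type *ct;

        gc = irq_alloc_generic_chip("DEMO", 1, irq_base, regs,
                                    handle_level_irq);
        if (!gc)
                return;

        ct = gc->chip_types;
        ct->regs.mask = 0x04;           /* invented register offsets */
        ct->regs.ack = 0x08;
        ct->chip.irq_mask = irq_gc_mask_set_bit;
        ct->chip.irq_unmask = irq_gc_mask_clr_bit;
        ct->chip.irq_ack = irq_gc_ack_set_bit;

        /* Wire up all 32 interrupts, priming the mask cache from hardware. */
        irq_setup_generic_chip(gc, 0xffffffff, IRQ_GC_INIT_MASK_CACHE,
                               IRQ_NOREQUEST, 0);
}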
262/**
263 * irq_setup_alt_chip - Switch to alternative chip
264 * @d: irq_data for this interrupt
265 * @type: Flow type to be initialized
266 *
267 * Only to be called from chip->irq_set_type() callbacks.
268 */
269int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
270{
271 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
272 struct irq_chip_type *ct = gc->chip_types;
273 unsigned int i;
274
275 for (i = 0; i < gc->num_ct; i++, ct++) {
276 if (ct->type & type) {
277 d->chip = &ct->chip;
278 irq_data_to_desc(d)->handle_irq = ct->handler;
279 return 0;
280 }
281 }
282 return -EINVAL;
283}
284
285/**
286 * irq_remove_generic_chip - Remove a chip
287 * @gc: Generic irq chip holding all data
288 * @msk: Bitmask holding the irqs to remove relative to gc->irq_base
289 * @clr: IRQ_* bits to clear
290 * @set: IRQ_* bits to set
291 *
292 * Remove up to 32 interrupts starting from gc->irq_base.
293 */
294void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
295 unsigned int clr, unsigned int set)
296{
297 unsigned int i = gc->irq_base;
298
299 raw_spin_lock(&gc_lock);
300 list_del(&gc->list);
301 raw_spin_unlock(&gc_lock);
302
303 for (; msk; msk >>= 1, i++) {
304 if (!(msk & 0x01))
305 continue;
306
307 /* Remove handler first. That will mask the irq line */
308 irq_set_handler(i, NULL);
309 irq_set_chip(i, &no_irq_chip);
310 irq_set_chip_data(i, NULL);
311 irq_modify_status(i, clr, set);
312 }
313}
314
315#ifdef CONFIG_PM
316static int irq_gc_suspend(void)
317{
318 struct irq_chip_generic *gc;
319
320 list_for_each_entry(gc, &gc_list, list) {
321 struct irq_chip_type *ct = gc->chip_types;
322
323 if (ct->chip.irq_suspend)
324 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base));
325 }
326 return 0;
327}
328
329static void irq_gc_resume(void)
330{
331 struct irq_chip_generic *gc;
332
333 list_for_each_entry(gc, &gc_list, list) {
334 struct irq_chip_type *ct = gc->chip_types;
335
336 if (ct->chip.irq_resume)
337 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base));
338 }
339}
340#else
341#define irq_gc_suspend NULL
342#define irq_gc_resume NULL
343#endif
344
345static void irq_gc_shutdown(void)
346{
347 struct irq_chip_generic *gc;
348
349 list_for_each_entry(gc, &gc_list, list) {
350 struct irq_chip_type *ct = gc->chip_types;
351
352 if (ct->chip.irq_pm_shutdown)
353 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base));
354 }
355}
356
357static struct syscore_ops irq_gc_syscore_ops = {
358 .suspend = irq_gc_suspend,
359 .resume = irq_gc_resume,
360 .shutdown = irq_gc_shutdown,
361};
362
363static int __init irq_gc_init_ops(void)
364{
365 register_syscore_ops(&irq_gc_syscore_ops);
366 return 0;
367}
368device_initcall(irq_gc_init_ops);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 90cb55f6d7eb..470d08c82bbe 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,12 +133,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
133 switch (res) { 133 switch (res) {
134 case IRQ_WAKE_THREAD: 134 case IRQ_WAKE_THREAD:
135 /* 135 /*
136 * Set result to handled so the spurious check
137 * does not trigger.
138 */
139 res = IRQ_HANDLED;
140
141 /*
142 * Catch drivers which return WAKE_THREAD but 136 * Catch drivers which return WAKE_THREAD but
143 * did not set up a thread function 137 * did not set up a thread function
144 */ 138 */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2c039c9b9383..4c60a50e66b2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -22,7 +22,7 @@
22 */ 22 */
23static struct lock_class_key irq_desc_lock_class; 23static struct lock_class_key irq_desc_lock_class;
24 24
25#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) 25#if defined(CONFIG_SMP)
26static void __init init_irq_default_affinity(void) 26static void __init init_irq_default_affinity(void)
27{ 27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); 28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
@@ -257,13 +257,11 @@ int __init early_irq_init(void)
257 count = ARRAY_SIZE(irq_desc); 257 count = ARRAY_SIZE(irq_desc);
258 258
259 for (i = 0; i < count; i++) { 259 for (i = 0; i < count; i++) {
260 desc[i].irq_data.irq = i;
261 desc[i].irq_data.chip = &no_irq_chip;
262 desc[i].kstat_irqs = alloc_percpu(unsigned int); 260 desc[i].kstat_irqs = alloc_percpu(unsigned int);
263 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); 261 alloc_masks(&desc[i], GFP_KERNEL, node);
264 alloc_masks(desc + i, GFP_KERNEL, node); 262 raw_spin_lock_init(&desc[i].lock);
265 desc_smp_init(desc + i, node);
266 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 263 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
264 desc_set_defaults(i, &desc[i], node);
267 } 265 }
268 return arch_early_irq_init(); 266 return arch_early_irq_init();
269} 267}
@@ -290,6 +288,22 @@ static int irq_expand_nr_irqs(unsigned int nr)
290 288
291#endif /* !CONFIG_SPARSE_IRQ */ 289#endif /* !CONFIG_SPARSE_IRQ */
292 290
291/**
292 * generic_handle_irq - Invoke the handler for a particular irq
293 * @irq: The irq number to handle
294 *
295 */
296int generic_handle_irq(unsigned int irq)
297{
298 struct irq_desc *desc = irq_to_desc(irq);
299
300 if (!desc)
301 return -EINVAL;
302 generic_handle_irq_desc(irq, desc);
303 return 0;
304}
305EXPORT_SYMBOL_GPL(generic_handle_irq);
306
293/* Dynamic interrupt handling */ 307/* Dynamic interrupt handling */
294 308
295/** 309/**
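generic_handle_irq() is exported so that chained handlers (GPIO expanders, cascaded interrupt controllers) can inject a demultiplexed interrupt into the core without touching irq_desc internals. A sketch of the usual demux pattern; the status register, child irq base and wiring function are hypothetical:

#include <linux/init.h>
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/bitops.h>

#define DEMO_CHILD_IRQ_BASE     64      /* assumed child irq range */

/* Cascaded flow handler: one parent line fans out to 32 child interrupts. */
static void demo_demux_handler(unsigned int irq, struct irq_desc *desc)
{
        void __iomem *status_reg = irq_desc_get_handler_data(desc);
        unsigned long pending = readl(status_reg);

        while (pending) {
                int bit = __ffs(pending);

                generic_handle_irq(DEMO_CHILD_IRQ_BASE + bit);
                pending &= ~BIT(bit);
        }
}

static void __init demo_wire_cascade(unsigned int parent_irq,
                                     void __iomem *status_reg)
{
        irq_set_handler_data(parent_irq, status_reg);
        irq_set_chained_handler(parent_irq, demo_demux_handler);
}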
@@ -311,6 +325,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
311 bitmap_clear(allocated_irqs, from, cnt); 325 bitmap_clear(allocated_irqs, from, cnt);
312 mutex_unlock(&sparse_irq_lock); 326 mutex_unlock(&sparse_irq_lock);
313} 327}
328EXPORT_SYMBOL_GPL(irq_free_descs);
314 329
315/** 330/**
316 * irq_alloc_descs - allocate and initialize a range of irq descriptors 331 * irq_alloc_descs - allocate and initialize a range of irq descriptors
@@ -329,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
329 if (!cnt) 344 if (!cnt)
330 return -EINVAL; 345 return -EINVAL;
331 346
347 if (irq >= 0) {
348 if (from > irq)
349 return -EINVAL;
350 from = irq;
351 }
352
332 mutex_lock(&sparse_irq_lock); 353 mutex_lock(&sparse_irq_lock);
333 354
334 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, 355 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
@@ -351,6 +372,7 @@ err:
351 mutex_unlock(&sparse_irq_lock); 372 mutex_unlock(&sparse_irq_lock);
352 return ret; 373 return ret;
353} 374}
375EXPORT_SYMBOL_GPL(irq_alloc_descs);
354 376
355/** 377/**
356 * irq_reserve_irqs - mark irqs allocated 378 * irq_reserve_irqs - mark irqs allocated
@@ -430,7 +452,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
430 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 452 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
431} 453}
432 454
433#ifdef CONFIG_GENERIC_HARDIRQS
434unsigned int kstat_irqs(unsigned int irq) 455unsigned int kstat_irqs(unsigned int irq)
435{ 456{
436 struct irq_desc *desc = irq_to_desc(irq); 457 struct irq_desc *desc = irq_to_desc(irq);
@@ -443,4 +464,3 @@ unsigned int kstat_irqs(unsigned int irq)
443 sum += *per_cpu_ptr(desc->kstat_irqs, cpu); 464 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
444 return sum; 465 return sum;
445} 466}
446#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 07c1611f3899..0a7840aeb0fb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -491,6 +491,9 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
492 int ret = 0; 492 int ret = 0;
493 493
494 if (!desc)
495 return -EINVAL;
496
494 /* wakeup-capable irqs can be shared between drivers that 497 /* wakeup-capable irqs can be shared between drivers that
495 * don't need to have the same sleep mode behaviors. 498 * don't need to have the same sleep mode behaviors.
496 */ 499 */
@@ -723,13 +726,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
723 * context. So we need to disable bh here to avoid deadlocks and other 726 * context. So we need to disable bh here to avoid deadlocks and other
724 * side effects. 727 * side effects.
725 */ 728 */
726static void 729static irqreturn_t
727irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) 730irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
728{ 731{
732 irqreturn_t ret;
733
729 local_bh_disable(); 734 local_bh_disable();
730 action->thread_fn(action->irq, action->dev_id); 735 ret = action->thread_fn(action->irq, action->dev_id);
731 irq_finalize_oneshot(desc, action, false); 736 irq_finalize_oneshot(desc, action, false);
732 local_bh_enable(); 737 local_bh_enable();
738 return ret;
733} 739}
734 740
735/* 741/*
@@ -737,10 +743,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
737 * preemptible - many of them need to sleep and wait for slow buses to 743
738 * complete. 744 * complete.
739 */ 745 */
740static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) 746static irqreturn_t irq_thread_fn(struct irq_desc *desc,
747 struct irqaction *action)
741{ 748{
742 action->thread_fn(action->irq, action->dev_id); 749 irqreturn_t ret;
750
751 ret = action->thread_fn(action->irq, action->dev_id);
743 irq_finalize_oneshot(desc, action, false); 752 irq_finalize_oneshot(desc, action, false);
753 return ret;
744} 754}
745 755
746/* 756/*
@@ -753,7 +763,8 @@ static int irq_thread(void *data)
753 }; 763 };
754 struct irqaction *action = data; 764 struct irqaction *action = data;
755 struct irq_desc *desc = irq_to_desc(action->irq); 765 struct irq_desc *desc = irq_to_desc(action->irq);
756 void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); 766 irqreturn_t (*handler_fn)(struct irq_desc *desc,
767 struct irqaction *action);
757 int wake; 768 int wake;
758 769
759 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, 770 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
@@ -783,8 +794,12 @@ static int irq_thread(void *data)
783 desc->istate |= IRQS_PENDING; 794 desc->istate |= IRQS_PENDING;
784 raw_spin_unlock_irq(&desc->lock); 795 raw_spin_unlock_irq(&desc->lock);
785 } else { 796 } else {
797 irqreturn_t action_ret;
798
786 raw_spin_unlock_irq(&desc->lock); 799 raw_spin_unlock_irq(&desc->lock);
787 handler_fn(desc, action); 800 action_ret = handler_fn(desc, action);
801 if (!noirqdebug)
802 note_interrupt(action->irq, desc, action_ret);
788 } 803 }
789 804
790 wake = atomic_dec_and_test(&desc->threads_active); 805 wake = atomic_dec_and_test(&desc->threads_active);
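Because the thread functions now return irqreturn_t, the spurious-interrupt logic (note_interrupt()) finally sees the verdict of threaded handlers too, so a threaded handler on a shared line should return IRQ_NONE when the interrupt was not from its device. A hedged request_threaded_irq() sketch; the device helpers are hypothetical:

#include <linux/interrupt.h>

static bool demo_device_raised_irq(void *dev_id)
{
        return true;            /* hypothetical: read the device's status */
}

static void demo_device_process(void *dev_id)
{
        /* hypothetical: slow, sleeping work */
}

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
        /* Hardirq part: decide whether it is ours, then punt to the thread. */
        if (!demo_device_raised_irq(dev_id))
                return IRQ_NONE;        /* feeds the spurious detector */
        return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
        demo_device_process(dev_id);    /* may sleep */
        return IRQ_HANDLED;             /* now reported to note_interrupt() */
}

static int demo_request(int irq, void *dev)
{
        return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
                                    IRQF_SHARED, "demo", dev);
}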
@@ -900,7 +915,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
900 */ 915 */
901 new->handler = irq_nested_primary_handler; 916 new->handler = irq_nested_primary_handler;
902 } else { 917 } else {
903 irq_setup_forced_threading(new); 918 if (irq_settings_can_thread(desc))
919 irq_setup_forced_threading(new);
904 } 920 }
905 921
906 /* 922 /*
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 834899f2500f..4bd4faa6323a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir;
19 19
20#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
21 21
22static int irq_affinity_proc_show(struct seq_file *m, void *v) 22static int show_irq_affinity(int type, struct seq_file *m, void *v)
23{ 23{
24 struct irq_desc *desc = irq_to_desc((long)m->private); 24 struct irq_desc *desc = irq_to_desc((long)m->private);
25 const struct cpumask *mask = desc->irq_data.affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
@@ -28,7 +28,10 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
28 if (irqd_is_setaffinity_pending(&desc->irq_data)) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
29 mask = desc->pending_mask; 29 mask = desc->pending_mask;
30#endif 30#endif
31 seq_cpumask(m, mask); 31 if (type)
32 seq_cpumask_list(m, mask);
33 else
34 seq_cpumask(m, mask);
32 seq_putc(m, '\n'); 35 seq_putc(m, '\n');
33 return 0; 36 return 0;
34} 37}
@@ -59,7 +62,18 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
59#endif 62#endif
60 63
61int no_irq_affinity; 64int no_irq_affinity;
62static ssize_t irq_affinity_proc_write(struct file *file, 65static int irq_affinity_proc_show(struct seq_file *m, void *v)
66{
67 return show_irq_affinity(0, m, v);
68}
69
70static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
71{
72 return show_irq_affinity(1, m, v);
73}
74
75
76static ssize_t write_irq_affinity(int type, struct file *file,
63 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
64{ 78{
65 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 79 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
@@ -72,7 +86,10 @@ static ssize_t irq_affinity_proc_write(struct file *file,
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 86 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
73 return -ENOMEM; 87 return -ENOMEM;
74 88
75 err = cpumask_parse_user(buffer, count, new_value); 89 if (type)
90 err = cpumask_parselist_user(buffer, count, new_value);
91 else
92 err = cpumask_parse_user(buffer, count, new_value);
76 if (err) 93 if (err)
77 goto free_cpumask; 94 goto free_cpumask;
78 95
@@ -100,11 +117,28 @@ free_cpumask:
100 return err; 117 return err;
101} 118}
102 119
120static ssize_t irq_affinity_proc_write(struct file *file,
121 const char __user *buffer, size_t count, loff_t *pos)
122{
123 return write_irq_affinity(0, file, buffer, count, pos);
124}
125
126static ssize_t irq_affinity_list_proc_write(struct file *file,
127 const char __user *buffer, size_t count, loff_t *pos)
128{
129 return write_irq_affinity(1, file, buffer, count, pos);
130}
131
103static int irq_affinity_proc_open(struct inode *inode, struct file *file) 132static int irq_affinity_proc_open(struct inode *inode, struct file *file)
104{ 133{
105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 134 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
106} 135}
107 136
137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
138{
139 return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
140}
141
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) 142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{ 143{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); 144 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = {
125 .release = single_release, 159 .release = single_release,
126}; 160};
127 161
162static const struct file_operations irq_affinity_list_proc_fops = {
163 .open = irq_affinity_list_proc_open,
164 .read = seq_read,
165 .llseek = seq_lseek,
166 .release = single_release,
167 .write = irq_affinity_list_proc_write,
168};
169
128static int default_affinity_show(struct seq_file *m, void *v) 170static int default_affinity_show(struct seq_file *m, void *v)
129{ 171{
130 seq_cpumask(m, irq_default_affinity); 172 seq_cpumask(m, irq_default_affinity);
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
289 proc_create_data("affinity_hint", 0400, desc->dir, 331 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq); 332 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291 333
334 /* create /proc/irq/<irq>/smp_affinity_list */
335 proc_create_data("smp_affinity_list", 0600, desc->dir,
336 &irq_affinity_list_proc_fops, (void *)(long)irq);
337
292 proc_create_data("node", 0444, desc->dir, 338 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq); 339 &irq_node_proc_fops, (void *)(long)irq);
294#endif 340#endif
@@ -306,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
306#ifdef CONFIG_SMP 352#ifdef CONFIG_SMP
307 remove_proc_entry("smp_affinity", desc->dir); 353 remove_proc_entry("smp_affinity", desc->dir);
308 remove_proc_entry("affinity_hint", desc->dir); 354 remove_proc_entry("affinity_hint", desc->dir);
355 remove_proc_entry("smp_affinity_list", desc->dir);
309 remove_proc_entry("node", desc->dir); 356 remove_proc_entry("node", desc->dir);
310#endif 357#endif
311 remove_proc_entry("spurious", desc->dir); 358 remove_proc_entry("spurious", desc->dir);
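
The proc.c changes add /proc/irq/<irq>/smp_affinity_list alongside the existing smp_affinity file: the new entry parses and prints a CPU list such as "0,2-3" instead of a hex mask, backed by cpumask_parselist_user() and seq_cpumask_list(). A small userspace sketch, assuming IRQ 30 exists and the caller has write permission on the file:

#include <stdio.h>

int main(void)
{
        /* pin (hypothetical) IRQ 30 to CPUs 0-3 using the new list format */
        FILE *f = fopen("/proc/irq/30/smp_affinity_list", "w");

        if (!f)
                return 1;
        fprintf(f, "0-3\n");
        return fclose(f) != 0;
}
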
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 0d91730b6330..f1667833d444 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -8,6 +8,7 @@ enum {
8 _IRQ_LEVEL = IRQ_LEVEL, 8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE, 9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST, 10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOTHREAD = IRQ_NOTHREAD,
11 _IRQ_NOAUTOEN = IRQ_NOAUTOEN, 12 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
12 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, 13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
13 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
@@ -20,6 +21,7 @@ enum {
20#define IRQ_LEVEL GOT_YOU_MORON 21#define IRQ_LEVEL GOT_YOU_MORON
21#define IRQ_NOPROBE GOT_YOU_MORON 22#define IRQ_NOPROBE GOT_YOU_MORON
22#define IRQ_NOREQUEST GOT_YOU_MORON 23#define IRQ_NOREQUEST GOT_YOU_MORON
24#define IRQ_NOTHREAD GOT_YOU_MORON
23#define IRQ_NOAUTOEN GOT_YOU_MORON 25#define IRQ_NOAUTOEN GOT_YOU_MORON
24#define IRQ_NESTED_THREAD GOT_YOU_MORON 26#define IRQ_NESTED_THREAD GOT_YOU_MORON
25#undef IRQF_MODIFY_MASK 27#undef IRQF_MODIFY_MASK
@@ -94,6 +96,21 @@ static inline void irq_settings_set_norequest(struct irq_desc *desc)
94 desc->status_use_accessors |= _IRQ_NOREQUEST; 96 desc->status_use_accessors |= _IRQ_NOREQUEST;
95} 97}
96 98
99static inline bool irq_settings_can_thread(struct irq_desc *desc)
100{
101 return !(desc->status_use_accessors & _IRQ_NOTHREAD);
102}
103
104static inline void irq_settings_clr_nothread(struct irq_desc *desc)
105{
106 desc->status_use_accessors &= ~_IRQ_NOTHREAD;
107}
108
109static inline void irq_settings_set_nothread(struct irq_desc *desc)
110{
111 desc->status_use_accessors |= _IRQ_NOTHREAD;
112}
113
97static inline bool irq_settings_can_probe(struct irq_desc *desc) 114static inline bool irq_settings_can_probe(struct irq_desc *desc)
98{ 115{
99 return !(desc->status_use_accessors & _IRQ_NOPROBE); 116 return !(desc->status_use_accessors & _IRQ_NOPROBE);
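
settings.h grows the _IRQ_NOTHREAD accessors that __setup_irq() now consults before applying forced threading, so individual interrupts can opt out of threadirqs. A sketch of how platform code could flag such an interrupt, assuming the generic irq_set_status_flags()/irq_clear_status_flags() helpers from include/linux/irq.h and a hypothetical TIMER_IRQ number:

#include <linux/irq.h>

/* e.g. in platform setup for a timer interrupt that must stay in hard irq context */
irq_set_status_flags(TIMER_IRQ, IRQ_NOTHREAD);

/* and, should threading become acceptable again later */
irq_clear_status_flags(TIMER_IRQ, IRQ_NOTHREAD);
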
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dfbd550401b2..aa57d5da18c1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -167,6 +167,13 @@ out:
167 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 167 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
168} 168}
169 169
170static inline int bad_action_ret(irqreturn_t action_ret)
171{
172 if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
173 return 0;
174 return 1;
175}
176
170/* 177/*
171 * If 99,900 of the previous 100,000 interrupts have not been handled 178 * If 99,900 of the previous 100,000 interrupts have not been handled
172 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 179 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
182 struct irqaction *action; 189 struct irqaction *action;
183 unsigned long flags; 190 unsigned long flags;
184 191
185 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { 192 if (bad_action_ret(action_ret)) {
186 printk(KERN_ERR "irq event %d: bogus return value %x\n", 193 printk(KERN_ERR "irq event %d: bogus return value %x\n",
187 irq, action_ret); 194 irq, action_ret);
188 } else { 195 } else {
@@ -201,10 +208,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
201 raw_spin_lock_irqsave(&desc->lock, flags); 208 raw_spin_lock_irqsave(&desc->lock, flags);
202 action = desc->action; 209 action = desc->action;
203 while (action) { 210 while (action) {
204 printk(KERN_ERR "[<%p>]", action->handler); 211 printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
205 print_symbol(" (%s)", 212 if (action->thread_fn)
206 (unsigned long)action->handler); 213 printk(KERN_CONT " threaded [<%p>] %pf",
207 printk("\n"); 214 action->thread_fn, action->thread_fn);
215 printk(KERN_CONT "\n");
208 action = action->next; 216 action = action->next;
209 } 217 }
210 raw_spin_unlock_irqrestore(&desc->lock, flags); 218 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -262,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
262 if (desc->istate & IRQS_POLL_INPROGRESS) 270 if (desc->istate & IRQS_POLL_INPROGRESS)
263 return; 271 return;
264 272
265 if (unlikely(action_ret != IRQ_HANDLED)) { 273 /* we get here again via the threaded handler */
274 if (action_ret == IRQ_WAKE_THREAD)
275 return;
276
277 if (bad_action_ret(action_ret)) {
278 report_bad_irq(irq, desc, action_ret);
279 return;
280 }
281
282 if (unlikely(action_ret == IRQ_NONE)) {
266 /* 283 /*
267 * If we are seeing only the odd spurious IRQ caused by 284 * If we are seeing only the odd spurious IRQ caused by
268 * bus asynchronicity then don't eventually trigger an error, 285 * bus asynchronicity then don't eventually trigger an error,
@@ -274,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
274 else 291 else
275 desc->irqs_unhandled++; 292 desc->irqs_unhandled++;
276 desc->last_unhandled = jiffies; 293 desc->last_unhandled = jiffies;
277 if (unlikely(action_ret != IRQ_NONE))
278 report_bad_irq(irq, desc, action_ret);
279 } 294 }
280 295
281 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { 296 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
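
The spurious.c change widens what note_interrupt() accepts: any combination of IRQ_NONE, IRQ_HANDLED and IRQ_WAKE_THREAD passes bad_action_ret(), a bare IRQ_WAKE_THREAD returns early (the threaded handler reports its own result later), and only IRQ_NONE still feeds the unhandled counter. A short sketch of the resulting contract, assuming the usual encoding IRQ_NONE=0, IRQ_HANDLED=1, IRQ_WAKE_THREAD=2:

/* return values a handler may legally produce after this change */
irqreturn_t ok[] = {
        IRQ_NONE,                       /* not ours: bumps irqs_unhandled */
        IRQ_HANDLED,                    /* fully handled in hard irq context */
        IRQ_WAKE_THREAD,                /* deferred: note_interrupt() returns early */
        IRQ_HANDLED | IRQ_WAKE_THREAD,  /* partly handled, rest deferred */
};

/* anything above (IRQ_HANDLED | IRQ_WAKE_THREAD), e.g. 0x10, trips
 * bad_action_ret() and is reported via __report_bad_irq(). */
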
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 3b79bd938330..a8ce45097f3d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -2,43 +2,23 @@
2 * jump label support 2 * jump label support
3 * 3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> 4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
5 * 6 *
6 */ 7 */
7#include <linux/jump_label.h>
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
14#include <linux/sort.h> 13#include <linux/sort.h>
15#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/jump_label.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
23/* mutex to protect coming/going of the the jump_label table */ 19/* mutex to protect coming/going of the the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex); 20static DEFINE_MUTEX(jump_label_mutex);
25 21
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42void jump_label_lock(void) 22void jump_label_lock(void)
43{ 23{
44 mutex_lock(&jump_label_mutex); 24 mutex_lock(&jump_label_mutex);
@@ -49,6 +29,11 @@ void jump_label_unlock(void)
49 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
50} 30}
51 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
52static int jump_label_cmp(const void *a, const void *b) 37static int jump_label_cmp(const void *a, const void *b)
53{ 38{
54 const struct jump_entry *jea = a; 39 const struct jump_entry *jea = a;
@@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b)
64} 49}
65 50
66static void 51static void
67sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) 52jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
68{ 53{
69 unsigned long size; 54 unsigned long size;
70 55
@@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
73 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
74} 59}
75 60
76static struct jump_label_entry *get_jump_label_entry(jump_label_t key) 61static void jump_label_update(struct jump_label_key *key, int enable);
77{
78 struct hlist_head *head;
79 struct hlist_node *node;
80 struct jump_label_entry *e;
81 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
82
83 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
84 hlist_for_each_entry(e, node, head, hlist) {
85 if (key == e->key)
86 return e;
87 }
88 return NULL;
89}
90 62
91static struct jump_label_entry * 63void jump_label_inc(struct jump_label_key *key)
92add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
93{ 64{
94 struct hlist_head *head; 65 if (atomic_inc_not_zero(&key->enabled))
95 struct jump_label_entry *e; 66 return;
96 u32 hash;
97
98 e = get_jump_label_entry(key);
99 if (e)
100 return ERR_PTR(-EEXIST);
101
102 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
103 if (!e)
104 return ERR_PTR(-ENOMEM);
105
106 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
107 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
108 e->key = key;
109 e->table = table;
110 e->nr_entries = nr_entries;
111 INIT_HLIST_HEAD(&(e->modules));
112 hlist_add_head(&e->hlist, head);
113 return e;
114}
115 67
116static int 68 jump_label_lock();
117build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) 69 if (atomic_add_return(1, &key->enabled) == 1)
118{ 70 jump_label_update(key, JUMP_LABEL_ENABLE);
119 struct jump_entry *iter, *iter_begin; 71 jump_label_unlock();
120 struct jump_label_entry *entry;
121 int count;
122
123 sort_jump_label_entries(start, stop);
124 iter = start;
125 while (iter < stop) {
126 entry = get_jump_label_entry(iter->key);
127 if (!entry) {
128 iter_begin = iter;
129 count = 0;
130 while ((iter < stop) &&
131 (iter->key == iter_begin->key)) {
132 iter++;
133 count++;
134 }
135 entry = add_jump_label_entry(iter_begin->key,
136 count, iter_begin);
137 if (IS_ERR(entry))
138 return PTR_ERR(entry);
139 } else {
140 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
141 return -1;
142 }
143 }
144 return 0;
145} 72}
146 73
147/*** 74void jump_label_dec(struct jump_label_key *key)
148 * jump_label_update - update jump label text
149 * @key - key value associated with a a jump label
150 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
151 *
152 * Will enable/disable the jump for jump label @key, depending on the
153 * value of @type.
154 *
155 */
156
157void jump_label_update(unsigned long key, enum jump_label_type type)
158{ 75{
159 struct jump_entry *iter; 76 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
160 struct jump_label_entry *entry; 77 return;
161 struct hlist_node *module_node;
162 struct jump_label_module_entry *e_module;
163 int count;
164 78
165 jump_label_lock(); 79 jump_label_update(key, JUMP_LABEL_DISABLE);
166 entry = get_jump_label_entry((jump_label_t)key);
167 if (entry) {
168 count = entry->nr_entries;
169 iter = entry->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 /* eanble/disable jump labels in modules */
176 hlist_for_each_entry(e_module, module_node, &(entry->modules),
177 hlist) {
178 count = e_module->nr_entries;
179 iter = e_module->table;
180 while (count--) {
181 if (iter->key &&
182 kernel_text_address(iter->code))
183 arch_jump_label_transform(iter, type);
184 iter++;
185 }
186 }
187 }
188 jump_label_unlock(); 80 jump_label_unlock();
189} 81}
190 82
@@ -197,77 +89,36 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end)
197 return 0; 89 return 0;
198} 90}
199 91
200#ifdef CONFIG_MODULES 92static int __jump_label_text_reserved(struct jump_entry *iter_start,
201 93 struct jump_entry *iter_stop, void *start, void *end)
202static int module_conflict(void *start, void *end)
203{
204 struct hlist_head *head;
205 struct hlist_node *node, *node_next, *module_node, *module_node_next;
206 struct jump_label_entry *e;
207 struct jump_label_module_entry *e_module;
208 struct jump_entry *iter;
209 int i, count;
210 int conflict = 0;
211
212 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
213 head = &jump_label_table[i];
214 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
215 hlist_for_each_entry_safe(e_module, module_node,
216 module_node_next,
217 &(e->modules), hlist) {
218 count = e_module->nr_entries;
219 iter = e_module->table;
220 while (count--) {
221 if (addr_conflict(iter, start, end)) {
222 conflict = 1;
223 goto out;
224 }
225 iter++;
226 }
227 }
228 }
229 }
230out:
231 return conflict;
232}
233
234#endif
235
236/***
237 * jump_label_text_reserved - check if addr range is reserved
238 * @start: start text addr
239 * @end: end text addr
240 *
241 * checks if the text addr located between @start and @end
242 * overlaps with any of the jump label patch addresses. Code
243 * that wants to modify kernel text should first verify that
244 * it does not overlap with any of the jump label addresses.
245 * Caller must hold jump_label_mutex.
246 *
247 * returns 1 if there is an overlap, 0 otherwise
248 */
249int jump_label_text_reserved(void *start, void *end)
250{ 94{
251 struct jump_entry *iter; 95 struct jump_entry *iter;
252 struct jump_entry *iter_start = __start___jump_table;
253 struct jump_entry *iter_stop = __start___jump_table;
254 int conflict = 0;
255 96
256 iter = iter_start; 97 iter = iter_start;
257 while (iter < iter_stop) { 98 while (iter < iter_stop) {
258 if (addr_conflict(iter, start, end)) { 99 if (addr_conflict(iter, start, end))
259 conflict = 1; 100 return 1;
260 goto out;
261 }
262 iter++; 101 iter++;
263 } 102 }
264 103
265 /* now check modules */ 104 return 0;
266#ifdef CONFIG_MODULES 105}
267 conflict = module_conflict(start, end); 106
268#endif 107static void __jump_label_update(struct jump_label_key *key,
269out: 108 struct jump_entry *entry,
270 return conflict; 109 struct jump_entry *stop, int enable)
110{
111 for (; (entry < stop) &&
112 (entry->key == (jump_label_t)(unsigned long)key);
113 entry++) {
114 /*
115 * entry->code set to 0 invalidates module init text sections
116 * kernel_text_address() verifies we are not in core kernel
117 * init code, see jump_label_invalidate_module_init().
118 */
119 if (entry->code && kernel_text_address(entry->code))
120 arch_jump_label_transform(entry, enable);
121 }
271} 122}
272 123
273/* 124/*
@@ -277,145 +128,181 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr)
277{ 128{
278} 129}
279 130
280static __init int init_jump_label(void) 131static __init int jump_label_init(void)
281{ 132{
282 int ret;
283 struct jump_entry *iter_start = __start___jump_table; 133 struct jump_entry *iter_start = __start___jump_table;
284 struct jump_entry *iter_stop = __stop___jump_table; 134 struct jump_entry *iter_stop = __stop___jump_table;
135 struct jump_label_key *key = NULL;
285 struct jump_entry *iter; 136 struct jump_entry *iter;
286 137
287 jump_label_lock(); 138 jump_label_lock();
288 ret = build_jump_label_hashtable(__start___jump_table, 139 jump_label_sort_entries(iter_start, iter_stop);
289 __stop___jump_table); 140
290 iter = iter_start; 141 for (iter = iter_start; iter < iter_stop; iter++) {
291 while (iter < iter_stop) {
292 arch_jump_label_text_poke_early(iter->code); 142 arch_jump_label_text_poke_early(iter->code);
293 iter++; 143 if (iter->key == (jump_label_t)(unsigned long)key)
144 continue;
145
146 key = (struct jump_label_key *)(unsigned long)iter->key;
147 atomic_set(&key->enabled, 0);
148 key->entries = iter;
149#ifdef CONFIG_MODULES
150 key->next = NULL;
151#endif
294 } 152 }
295 jump_label_unlock(); 153 jump_label_unlock();
296 return ret; 154
155 return 0;
297} 156}
298early_initcall(init_jump_label); 157early_initcall(jump_label_init);
299 158
300#ifdef CONFIG_MODULES 159#ifdef CONFIG_MODULES
301 160
302static struct jump_label_module_entry * 161struct jump_label_mod {
303add_jump_label_module_entry(struct jump_label_entry *entry, 162 struct jump_label_mod *next;
304 struct jump_entry *iter_begin, 163 struct jump_entry *entries;
305 int count, struct module *mod) 164 struct module *mod;
165};
166
167static int __jump_label_mod_text_reserved(void *start, void *end)
306{ 168{
307 struct jump_label_module_entry *e; 169 struct module *mod;
308 170
309 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); 171 mod = __module_text_address((unsigned long)start);
310 if (!e) 172 if (!mod)
311 return ERR_PTR(-ENOMEM); 173 return 0;
312 e->mod = mod; 174
313 e->nr_entries = count; 175 WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
314 e->table = iter_begin; 176
315 hlist_add_head(&e->hlist, &entry->modules); 177 return __jump_label_text_reserved(mod->jump_entries,
316 return e; 178 mod->jump_entries + mod->num_jump_entries,
179 start, end);
317} 180}
318 181
319static int add_jump_label_module(struct module *mod) 182static void __jump_label_mod_update(struct jump_label_key *key, int enable)
320{ 183{
321 struct jump_entry *iter, *iter_begin; 184 struct jump_label_mod *mod = key->next;
322 struct jump_label_entry *entry;
323 struct jump_label_module_entry *module_entry;
324 int count;
325 185
326 /* if the module doesn't have jump label entries, just return */ 186 while (mod) {
327 if (!mod->num_jump_entries) 187 struct module *m = mod->mod;
328 return 0;
329 188
330 sort_jump_label_entries(mod->jump_entries, 189 __jump_label_update(key, mod->entries,
331 mod->jump_entries + mod->num_jump_entries); 190 m->jump_entries + m->num_jump_entries,
332 iter = mod->jump_entries; 191 enable);
333 while (iter < mod->jump_entries + mod->num_jump_entries) { 192 mod = mod->next;
334 entry = get_jump_label_entry(iter->key);
335 iter_begin = iter;
336 count = 0;
337 while ((iter < mod->jump_entries + mod->num_jump_entries) &&
338 (iter->key == iter_begin->key)) {
339 iter++;
340 count++;
341 }
342 if (!entry) {
343 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
344 if (IS_ERR(entry))
345 return PTR_ERR(entry);
346 }
347 module_entry = add_jump_label_module_entry(entry, iter_begin,
348 count, mod);
349 if (IS_ERR(module_entry))
350 return PTR_ERR(module_entry);
351 } 193 }
352 return 0;
353} 194}
354 195
355static void remove_jump_label_module(struct module *mod) 196/***
197 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
198 * @mod: module to patch
199 *
200 * Allow for run-time selection of the optimal nops. Before the module
201 * loads patch these with arch_get_jump_label_nop(), which is specified by
202 * the arch specific jump label code.
203 */
204void jump_label_apply_nops(struct module *mod)
356{ 205{
357 struct hlist_head *head; 206 struct jump_entry *iter_start = mod->jump_entries;
358 struct hlist_node *node, *node_next, *module_node, *module_node_next; 207 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
359 struct jump_label_entry *e; 208 struct jump_entry *iter;
360 struct jump_label_module_entry *e_module;
361 int i;
362 209
363 /* if the module doesn't have jump label entries, just return */ 210 /* if the module doesn't have jump label entries, just return */
364 if (!mod->num_jump_entries) 211 if (iter_start == iter_stop)
365 return; 212 return;
366 213
367 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 214 for (iter = iter_start; iter < iter_stop; iter++)
368 head = &jump_label_table[i]; 215 arch_jump_label_text_poke_early(iter->code);
369 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 216}
370 hlist_for_each_entry_safe(e_module, module_node, 217
371 module_node_next, 218static int jump_label_add_module(struct module *mod)
372 &(e->modules), hlist) { 219{
373 if (e_module->mod == mod) { 220 struct jump_entry *iter_start = mod->jump_entries;
374 hlist_del(&e_module->hlist); 221 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
375 kfree(e_module); 222 struct jump_entry *iter;
376 } 223 struct jump_label_key *key = NULL;
377 } 224 struct jump_label_mod *jlm;
378 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { 225
379 hlist_del(&e->hlist); 226 /* if the module doesn't have jump label entries, just return */
380 kfree(e); 227 if (iter_start == iter_stop)
381 } 228 return 0;
229
230 jump_label_sort_entries(iter_start, iter_stop);
231
232 for (iter = iter_start; iter < iter_stop; iter++) {
233 if (iter->key == (jump_label_t)(unsigned long)key)
234 continue;
235
236 key = (struct jump_label_key *)(unsigned long)iter->key;
237
238 if (__module_address(iter->key) == mod) {
239 atomic_set(&key->enabled, 0);
240 key->entries = iter;
241 key->next = NULL;
242 continue;
382 } 243 }
244
245 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
246 if (!jlm)
247 return -ENOMEM;
248
249 jlm->mod = mod;
250 jlm->entries = iter;
251 jlm->next = key->next;
252 key->next = jlm;
253
254 if (jump_label_enabled(key))
255 __jump_label_update(key, iter, iter_stop,
256 JUMP_LABEL_ENABLE);
383 } 257 }
258
259 return 0;
384} 260}
385 261
386static void remove_jump_label_module_init(struct module *mod) 262static void jump_label_del_module(struct module *mod)
387{ 263{
388 struct hlist_head *head; 264 struct jump_entry *iter_start = mod->jump_entries;
389 struct hlist_node *node, *node_next, *module_node, *module_node_next; 265 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
390 struct jump_label_entry *e;
391 struct jump_label_module_entry *e_module;
392 struct jump_entry *iter; 266 struct jump_entry *iter;
393 int i, count; 267 struct jump_label_key *key = NULL;
268 struct jump_label_mod *jlm, **prev;
394 269
395 /* if the module doesn't have jump label entries, just return */ 270 for (iter = iter_start; iter < iter_stop; iter++) {
396 if (!mod->num_jump_entries) 271 if (iter->key == (jump_label_t)(unsigned long)key)
397 return; 272 continue;
273
274 key = (struct jump_label_key *)(unsigned long)iter->key;
275
276 if (__module_address(iter->key) == mod)
277 continue;
278
279 prev = &key->next;
280 jlm = key->next;
281
282 while (jlm && jlm->mod != mod) {
283 prev = &jlm->next;
284 jlm = jlm->next;
285 }
398 286
399 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 287 if (jlm) {
400 head = &jump_label_table[i]; 288 *prev = jlm->next;
401 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 289 kfree(jlm);
402 hlist_for_each_entry_safe(e_module, module_node,
403 module_node_next,
404 &(e->modules), hlist) {
405 if (e_module->mod != mod)
406 continue;
407 count = e_module->nr_entries;
408 iter = e_module->table;
409 while (count--) {
410 if (within_module_init(iter->code, mod))
411 iter->key = 0;
412 iter++;
413 }
414 }
415 } 290 }
416 } 291 }
417} 292}
418 293
294static void jump_label_invalidate_module_init(struct module *mod)
295{
296 struct jump_entry *iter_start = mod->jump_entries;
297 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
298 struct jump_entry *iter;
299
300 for (iter = iter_start; iter < iter_stop; iter++) {
301 if (within_module_init(iter->code, mod))
302 iter->code = 0;
303 }
304}
305
419static int 306static int
420jump_label_module_notify(struct notifier_block *self, unsigned long val, 307jump_label_module_notify(struct notifier_block *self, unsigned long val,
421 void *data) 308 void *data)
@@ -426,59 +313,81 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
426 switch (val) { 313 switch (val) {
427 case MODULE_STATE_COMING: 314 case MODULE_STATE_COMING:
428 jump_label_lock(); 315 jump_label_lock();
429 ret = add_jump_label_module(mod); 316 ret = jump_label_add_module(mod);
430 if (ret) 317 if (ret)
431 remove_jump_label_module(mod); 318 jump_label_del_module(mod);
432 jump_label_unlock(); 319 jump_label_unlock();
433 break; 320 break;
434 case MODULE_STATE_GOING: 321 case MODULE_STATE_GOING:
435 jump_label_lock(); 322 jump_label_lock();
436 remove_jump_label_module(mod); 323 jump_label_del_module(mod);
437 jump_label_unlock(); 324 jump_label_unlock();
438 break; 325 break;
439 case MODULE_STATE_LIVE: 326 case MODULE_STATE_LIVE:
440 jump_label_lock(); 327 jump_label_lock();
441 remove_jump_label_module_init(mod); 328 jump_label_invalidate_module_init(mod);
442 jump_label_unlock(); 329 jump_label_unlock();
443 break; 330 break;
444 } 331 }
445 return ret;
446}
447
448/***
449 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
450 * @mod: module to patch
451 *
452 * Allow for run-time selection of the optimal nops. Before the module
453 * loads patch these with arch_get_jump_label_nop(), which is specified by
454 * the arch specific jump label code.
455 */
456void jump_label_apply_nops(struct module *mod)
457{
458 struct jump_entry *iter;
459
460 /* if the module doesn't have jump label entries, just return */
461 if (!mod->num_jump_entries)
462 return;
463 332
464 iter = mod->jump_entries; 333 return notifier_from_errno(ret);
465 while (iter < mod->jump_entries + mod->num_jump_entries) {
466 arch_jump_label_text_poke_early(iter->code);
467 iter++;
468 }
469} 334}
470 335
471struct notifier_block jump_label_module_nb = { 336struct notifier_block jump_label_module_nb = {
472 .notifier_call = jump_label_module_notify, 337 .notifier_call = jump_label_module_notify,
473 .priority = 0, 338 .priority = 1, /* higher than tracepoints */
474}; 339};
475 340
476static __init int init_jump_label_module(void) 341static __init int jump_label_init_module(void)
477{ 342{
478 return register_module_notifier(&jump_label_module_nb); 343 return register_module_notifier(&jump_label_module_nb);
479} 344}
480early_initcall(init_jump_label_module); 345early_initcall(jump_label_init_module);
481 346
482#endif /* CONFIG_MODULES */ 347#endif /* CONFIG_MODULES */
483 348
349/***
350 * jump_label_text_reserved - check if addr range is reserved
351 * @start: start text addr
352 * @end: end text addr
353 *
354 * checks if the text addr located between @start and @end
355 * overlaps with any of the jump label patch addresses. Code
356 * that wants to modify kernel text should first verify that
357 * it does not overlap with any of the jump label addresses.
358 * Caller must hold jump_label_mutex.
359 *
360 * returns 1 if there is an overlap, 0 otherwise
361 */
362int jump_label_text_reserved(void *start, void *end)
363{
364 int ret = __jump_label_text_reserved(__start___jump_table,
365 __stop___jump_table, start, end);
366
367 if (ret)
368 return ret;
369
370#ifdef CONFIG_MODULES
371 ret = __jump_label_mod_text_reserved(start, end);
372#endif
373 return ret;
374}
375
376static void jump_label_update(struct jump_label_key *key, int enable)
377{
378 struct jump_entry *entry = key->entries, *stop = __stop___jump_table;
379
380#ifdef CONFIG_MODULES
381 struct module *mod = __module_address((jump_label_t)key);
382
383 __jump_label_mod_update(key, enable);
384
385 if (mod)
386 stop = mod->jump_entries + mod->num_jump_entries;
387#endif
388 /* if there are no users, entry can be NULL */
389 if (entry)
390 __jump_label_update(key, entry, stop, enable);
391}
392
484#endif 393#endif
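
The rewritten jump_label.c drops the hash table and hangs the sorted jump entries directly off each struct jump_label_key, with jump_label_inc()/jump_label_dec() acting as a reference count that patches the branch sites on the 0->1 and 1->0 transitions. A minimal sketch of a user of the new API; do_slow_extra_work() is a placeholder, and the static_branch() test is assumed to come from the matching include/linux/jump_label.h update, which is outside this file's diff:

static struct jump_label_key foo_key;   /* zero-initialized: branch disabled */

void foo_enable(void)
{
        jump_label_inc(&foo_key);       /* first user: patch sites to the jump */
}

void foo_disable(void)
{
        jump_label_dec(&foo_key);       /* last user gone: patch back to nops */
}

void foo_hotpath(void)
{
        if (static_branch(&foo_key))    /* assumed helper, see note above */
                do_slow_extra_work();
}
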
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87b77de03dd3..8d814cbc8109 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1531,13 +1531,7 @@ int kernel_kexec(void)
1531 if (error) 1531 if (error)
1532 goto Enable_cpus; 1532 goto Enable_cpus;
1533 local_irq_disable(); 1533 local_irq_disable();
1534 /* Suspend system devices */ 1534 error = syscore_suspend();
1535 error = sysdev_suspend(PMSG_FREEZE);
1536 if (!error) {
1537 error = syscore_suspend();
1538 if (error)
1539 sysdev_resume();
1540 }
1541 if (error) 1535 if (error)
1542 goto Enable_irqs; 1536 goto Enable_irqs;
1543 } else 1537 } else
@@ -1553,7 +1547,6 @@ int kernel_kexec(void)
1553#ifdef CONFIG_KEXEC_JUMP 1547#ifdef CONFIG_KEXEC_JUMP
1554 if (kexec_image->preserve_context) { 1548 if (kexec_image->preserve_context) {
1555 syscore_resume(); 1549 syscore_resume();
1556 sysdev_resume();
1557 Enable_irqs: 1550 Enable_irqs:
1558 local_irq_enable(); 1551 local_irq_enable();
1559 Enable_cpus: 1552 Enable_cpus:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9cd0591c96a2..47613dfb7b28 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,7 @@
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/completion.h> 27#include <linux/completion.h>
28#include <linux/cred.h>
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/fdtable.h> 30#include <linux/fdtable.h>
30#include <linux/workqueue.h> 31#include <linux/workqueue.h>
@@ -43,6 +44,13 @@ extern int max_threads;
43 44
44static struct workqueue_struct *khelper_wq; 45static struct workqueue_struct *khelper_wq;
45 46
47#define CAP_BSET (void *)1
48#define CAP_PI (void *)2
49
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock);
53
46#ifdef CONFIG_MODULES 54#ifdef CONFIG_MODULES
47 55
48/* 56/*
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module);
132static int ____call_usermodehelper(void *data) 140static int ____call_usermodehelper(void *data)
133{ 141{
134 struct subprocess_info *sub_info = data; 142 struct subprocess_info *sub_info = data;
143 struct cred *new;
135 int retval; 144 int retval;
136 145
137 spin_lock_irq(&current->sighand->siglock); 146 spin_lock_irq(&current->sighand->siglock);
@@ -147,12 +156,27 @@ static int ____call_usermodehelper(void *data)
147 */ 156 */
148 set_user_nice(current, 0); 157 set_user_nice(current, 0);
149 158
159 retval = -ENOMEM;
160 new = prepare_kernel_cred(current);
161 if (!new)
162 goto fail;
163
164 spin_lock(&umh_sysctl_lock);
165 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
166 new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
167 new->cap_inheritable);
168 spin_unlock(&umh_sysctl_lock);
169
150 if (sub_info->init) { 170 if (sub_info->init) {
151 retval = sub_info->init(sub_info); 171 retval = sub_info->init(sub_info, new);
152 if (retval) 172 if (retval) {
173 abort_creds(new);
153 goto fail; 174 goto fail;
175 }
154 } 176 }
155 177
178 commit_creds(new);
179
156 retval = kernel_execve(sub_info->path, 180 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv, 181 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp); 182 (const char *const *)sub_info->envp);
@@ -245,7 +269,6 @@ static void __call_usermodehelper(struct work_struct *work)
245 } 269 }
246} 270}
247 271
248#ifdef CONFIG_PM_SLEEP
249/* 272/*
250 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 273 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
251 * (used for preventing user land processes from being created after the user 274 * (used for preventing user land processes from being created after the user
@@ -301,6 +324,15 @@ void usermodehelper_enable(void)
301 usermodehelper_disabled = 0; 324 usermodehelper_disabled = 0;
302} 325}
303 326
327/**
328 * usermodehelper_is_disabled - check if new helpers are allowed to be started
329 */
330bool usermodehelper_is_disabled(void)
331{
332 return usermodehelper_disabled;
333}
334EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
335
304static void helper_lock(void) 336static void helper_lock(void)
305{ 337{
306 atomic_inc(&running_helpers); 338 atomic_inc(&running_helpers);
@@ -312,12 +344,6 @@ static void helper_unlock(void)
312 if (atomic_dec_and_test(&running_helpers)) 344 if (atomic_dec_and_test(&running_helpers))
313 wake_up(&running_helpers_waitq); 345 wake_up(&running_helpers_waitq);
314} 346}
315#else /* CONFIG_PM_SLEEP */
316#define usermodehelper_disabled 0
317
318static inline void helper_lock(void) {}
319static inline void helper_unlock(void) {}
320#endif /* CONFIG_PM_SLEEP */
321 347
322/** 348/**
323 * call_usermodehelper_setup - prepare to call a usermode helper 349 * call_usermodehelper_setup - prepare to call a usermode helper
@@ -364,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
364 * context in which call_usermodehelper_exec is called. 390 * context in which call_usermodehelper_exec is called.
365 */ 391 */
366void call_usermodehelper_setfns(struct subprocess_info *info, 392void call_usermodehelper_setfns(struct subprocess_info *info,
367 int (*init)(struct subprocess_info *info), 393 int (*init)(struct subprocess_info *info, struct cred *new),
368 void (*cleanup)(struct subprocess_info *info), 394 void (*cleanup)(struct subprocess_info *info),
369 void *data) 395 void *data)
370{ 396{
@@ -418,6 +444,84 @@ unlock:
418} 444}
419EXPORT_SYMBOL(call_usermodehelper_exec); 445EXPORT_SYMBOL(call_usermodehelper_exec);
420 446
447static int proc_cap_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, loff_t *ppos)
449{
450 struct ctl_table t;
451 unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
452 kernel_cap_t new_cap;
453 int err, i;
454
455 if (write && (!capable(CAP_SETPCAP) ||
456 !capable(CAP_SYS_MODULE)))
457 return -EPERM;
458
459 /*
460 * convert from the global kernel_cap_t to the ulong array to print to
461 * userspace if this is a read.
462 */
463 spin_lock(&umh_sysctl_lock);
464 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
465 if (table->data == CAP_BSET)
466 cap_array[i] = usermodehelper_bset.cap[i];
467 else if (table->data == CAP_PI)
468 cap_array[i] = usermodehelper_inheritable.cap[i];
469 else
470 BUG();
471 }
472 spin_unlock(&umh_sysctl_lock);
473
474 t = *table;
475 t.data = &cap_array;
476
477 /*
478 * actually read or write and array of ulongs from userspace. Remember
479 * these are least significant 32 bits first
480 */
481 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
482 if (err < 0)
483 return err;
484
485 /*
486 * convert from the sysctl array of ulongs to the kernel_cap_t
487 * internal representation
488 */
489 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
490 new_cap.cap[i] = cap_array[i];
491
492 /*
493 * Drop everything not in the new_cap (but don't add things)
494 */
495 spin_lock(&umh_sysctl_lock);
496 if (write) {
497 if (table->data == CAP_BSET)
498 usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
499 if (table->data == CAP_PI)
500 usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
501 }
502 spin_unlock(&umh_sysctl_lock);
503
504 return 0;
505}
506
507struct ctl_table usermodehelper_table[] = {
508 {
509 .procname = "bset",
510 .data = CAP_BSET,
511 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
512 .mode = 0600,
513 .proc_handler = proc_cap_handler,
514 },
515 {
516 .procname = "inheritable",
517 .data = CAP_PI,
518 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
519 .mode = 0600,
520 .proc_handler = proc_cap_handler,
521 },
522 { }
523};
524
421void __init usermodehelper_init(void) 525void __init usermodehelper_init(void)
422{ 526{
423 khelper_wq = create_singlethread_workqueue("khelper"); 527 khelper_wq = create_singlethread_workqueue("khelper");
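
kmod.c now prepares a dedicated cred for every usermode helper, clips its bounding and inheritable sets against the new bset/inheritable sysctls defined above (registered elsewhere in this series, presumably under /proc/sys/kernel/usermodehelper/), and passes that cred to the caller's init() hook before commit_creds(). A sketch of a caller adapted to the changed callback signature; the path/argv/envp values and whatever is done to 'new' are placeholders:

static int my_umh_init(struct subprocess_info *info, struct cred *new)
{
        /* adjust 'new' here (e.g. install a keyring); a non-zero return
         * aborts the helper and the prepared creds are discarded. */
        return 0;
}

/* somewhere with path, argv, envp in scope: */
struct subprocess_info *sub;

sub = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
if (!sub)
        return -ENOMEM;
call_usermodehelper_setfns(sub, my_umh_init, NULL, NULL);
return call_usermodehelper_exec(sub, UMH_WAIT_PROC);
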
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0b624e791805..3b053c04dd86 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -16,6 +16,7 @@
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/capability.h>
19 20
20#define KERNEL_ATTR_RO(_name) \ 21#define KERNEL_ATTR_RO(_name) \
21static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 22static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo);
131 132
132#endif /* CONFIG_KEXEC */ 133#endif /* CONFIG_KEXEC */
133 134
135/* whether file capabilities are enabled */
136static ssize_t fscaps_show(struct kobject *kobj,
137 struct kobj_attribute *attr, char *buf)
138{
139 return sprintf(buf, "%d\n", file_caps_enabled);
140}
141KERNEL_ATTR_RO(fscaps);
142
134/* 143/*
135 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 144 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
136 */ 145 */
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj;
158EXPORT_SYMBOL_GPL(kernel_kobj); 167EXPORT_SYMBOL_GPL(kernel_kobj);
159 168
160static struct attribute * kernel_attrs[] = { 169static struct attribute * kernel_attrs[] = {
170 &fscaps_attr.attr,
161#if defined(CONFIG_HOTPLUG) 171#if defined(CONFIG_HOTPLUG)
162 &uevent_seqnum_attr.attr, 172 &uevent_seqnum_attr.attr,
163 &uevent_helper_attr.attr, 173 &uevent_helper_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3b34d2732bce..4ba7cccb4994 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
202 return; 202 return;
203 } 203 }
204 204
205 p->cpus_allowed = cpumask_of_cpu(cpu); 205 /* It's safe because the task is inactive. */
206 p->rt.nr_cpus_allowed = 1; 206 do_set_cpus_allowed(p, cpumask_of(cpu));
207 p->flags |= PF_THREAD_BOUND; 207 p->flags |= PF_THREAD_BOUND;
208} 208}
209EXPORT_SYMBOL(kthread_bind); 209EXPORT_SYMBOL(kthread_bind);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 53a68956f131..298c9276dfdb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
490 usage[i] = '\0'; 490 usage[i] = '\0';
491} 491}
492 492
493static int __print_lock_name(struct lock_class *class)
494{
495 char str[KSYM_NAME_LEN];
496 const char *name;
497
498 name = class->name;
499 if (!name)
500 name = __get_key_name(class->key, str);
501
502 return printk("%s", name);
503}
504
493static void print_lock_name(struct lock_class *class) 505static void print_lock_name(struct lock_class *class)
494{ 506{
495 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; 507 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
@@ -1053,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth)
1053 return 0; 1065 return 0;
1054} 1066}
1055 1067
1068static void
1069print_circular_lock_scenario(struct held_lock *src,
1070 struct held_lock *tgt,
1071 struct lock_list *prt)
1072{
1073 struct lock_class *source = hlock_class(src);
1074 struct lock_class *target = hlock_class(tgt);
1075 struct lock_class *parent = prt->class;
1076
1077 /*
1078 * A direct locking problem where unsafe_class lock is taken
1079 * directly by safe_class lock, then all we need to show
1080 * is the deadlock scenario, as it is obvious that the
1081 * unsafe lock is taken under the safe lock.
1082 *
1083 * But if there is a chain instead, where the safe lock takes
1084 * an intermediate lock (middle_class) where this lock is
1085 * not the same as the safe lock, then the lock chain is
1086 * used to describe the problem. Otherwise we would need
1087 * to show a different CPU case for each link in the chain
1088 * from the safe_class lock to the unsafe_class lock.
1089 */
1090 if (parent != source) {
1091 printk("Chain exists of:\n ");
1092 __print_lock_name(source);
1093 printk(" --> ");
1094 __print_lock_name(parent);
1095 printk(" --> ");
1096 __print_lock_name(target);
1097 printk("\n\n");
1098 }
1099
1100 printk(" Possible unsafe locking scenario:\n\n");
1101 printk(" CPU0 CPU1\n");
1102 printk(" ---- ----\n");
1103 printk(" lock(");
1104 __print_lock_name(target);
1105 printk(");\n");
1106 printk(" lock(");
1107 __print_lock_name(parent);
1108 printk(");\n");
1109 printk(" lock(");
1110 __print_lock_name(target);
1111 printk(");\n");
1112 printk(" lock(");
1113 __print_lock_name(source);
1114 printk(");\n");
1115 printk("\n *** DEADLOCK ***\n\n");
1116}
1117
1056/* 1118/*
1057 * When a circular dependency is detected, print the 1119 * When a circular dependency is detected, print the
1058 * header first: 1120 * header first:
@@ -1096,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1096{ 1158{
1097 struct task_struct *curr = current; 1159 struct task_struct *curr = current;
1098 struct lock_list *parent; 1160 struct lock_list *parent;
1161 struct lock_list *first_parent;
1099 int depth; 1162 int depth;
1100 1163
1101 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1164 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
@@ -1109,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1109 print_circular_bug_header(target, depth, check_src, check_tgt); 1172 print_circular_bug_header(target, depth, check_src, check_tgt);
1110 1173
1111 parent = get_lock_parent(target); 1174 parent = get_lock_parent(target);
1175 first_parent = parent;
1112 1176
1113 while (parent) { 1177 while (parent) {
1114 print_circular_bug_entry(parent, --depth); 1178 print_circular_bug_entry(parent, --depth);
@@ -1116,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this,
1116 } 1180 }
1117 1181
1118 printk("\nother info that might help us debug this:\n\n"); 1182 printk("\nother info that might help us debug this:\n\n");
1183 print_circular_lock_scenario(check_src, check_tgt,
1184 first_parent);
1185
1119 lockdep_print_held_locks(curr); 1186 lockdep_print_held_locks(curr);
1120 1187
1121 printk("\nstack backtrace:\n"); 1188 printk("\nstack backtrace:\n");
@@ -1314,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1314 printk("\n"); 1381 printk("\n");
1315 1382
1316 if (depth == 0 && (entry != root)) { 1383 if (depth == 0 && (entry != root)) {
1317 printk("lockdep:%s bad BFS generated tree\n", __func__); 1384 printk("lockdep:%s bad path found in chain graph\n", __func__);
1318 break; 1385 break;
1319 } 1386 }
1320 1387
@@ -1325,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1325 return; 1392 return;
1326} 1393}
1327 1394
1395static void
1396print_irq_lock_scenario(struct lock_list *safe_entry,
1397 struct lock_list *unsafe_entry,
1398 struct lock_class *prev_class,
1399 struct lock_class *next_class)
1400{
1401 struct lock_class *safe_class = safe_entry->class;
1402 struct lock_class *unsafe_class = unsafe_entry->class;
1403 struct lock_class *middle_class = prev_class;
1404
1405 if (middle_class == safe_class)
1406 middle_class = next_class;
1407
1408 /*
1409 * A direct locking problem where unsafe_class lock is taken
1410 * directly by safe_class lock, then all we need to show
1411 * is the deadlock scenario, as it is obvious that the
1412 * unsafe lock is taken under the safe lock.
1413 *
1414 * But if there is a chain instead, where the safe lock takes
1415 * an intermediate lock (middle_class) where this lock is
1416 * not the same as the safe lock, then the lock chain is
1417 * used to describe the problem. Otherwise we would need
1418 * to show a different CPU case for each link in the chain
1419 * from the safe_class lock to the unsafe_class lock.
1420 */
1421 if (middle_class != unsafe_class) {
1422 printk("Chain exists of:\n ");
1423 __print_lock_name(safe_class);
1424 printk(" --> ");
1425 __print_lock_name(middle_class);
1426 printk(" --> ");
1427 __print_lock_name(unsafe_class);
1428 printk("\n\n");
1429 }
1430
1431 printk(" Possible interrupt unsafe locking scenario:\n\n");
1432 printk(" CPU0 CPU1\n");
1433 printk(" ---- ----\n");
1434 printk(" lock(");
1435 __print_lock_name(unsafe_class);
1436 printk(");\n");
1437 printk(" local_irq_disable();\n");
1438 printk(" lock(");
1439 __print_lock_name(safe_class);
1440 printk(");\n");
1441 printk(" lock(");
1442 __print_lock_name(middle_class);
1443 printk(");\n");
1444 printk(" <Interrupt>\n");
1445 printk(" lock(");
1446 __print_lock_name(safe_class);
1447 printk(");\n");
1448 printk("\n *** DEADLOCK ***\n\n");
1449}
1450
1328static int 1451static int
1329print_bad_irq_dependency(struct task_struct *curr, 1452print_bad_irq_dependency(struct task_struct *curr,
1330 struct lock_list *prev_root, 1453 struct lock_list *prev_root,
@@ -1376,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr,
1376 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); 1499 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1377 1500
1378 printk("\nother info that might help us debug this:\n\n"); 1501 printk("\nother info that might help us debug this:\n\n");
1502 print_irq_lock_scenario(backwards_entry, forwards_entry,
1503 hlock_class(prev), hlock_class(next));
1504
1379 lockdep_print_held_locks(curr); 1505 lockdep_print_held_locks(curr);
1380 1506
1381 printk("\nthe dependencies between %s-irq-safe lock", irqclass); 1507 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
@@ -1539,6 +1665,26 @@ static inline void inc_chains(void)
1539 1665
1540#endif 1666#endif
1541 1667
1668static void
1669print_deadlock_scenario(struct held_lock *nxt,
1670 struct held_lock *prv)
1671{
1672 struct lock_class *next = hlock_class(nxt);
1673 struct lock_class *prev = hlock_class(prv);
1674
1675 printk(" Possible unsafe locking scenario:\n\n");
1676 printk(" CPU0\n");
1677 printk(" ----\n");
1678 printk(" lock(");
1679 __print_lock_name(prev);
1680 printk(");\n");
1681 printk(" lock(");
1682 __print_lock_name(next);
1683 printk(");\n");
1684 printk("\n *** DEADLOCK ***\n\n");
1685 printk(" May be due to missing lock nesting notation\n\n");
1686}
1687
1542static int 1688static int
1543print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 1689print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1544 struct held_lock *next) 1690 struct held_lock *next)
@@ -1557,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1557 print_lock(prev); 1703 print_lock(prev);
1558 1704
1559 printk("\nother info that might help us debug this:\n"); 1705 printk("\nother info that might help us debug this:\n");
1706 print_deadlock_scenario(next, prev);
1560 lockdep_print_held_locks(curr); 1707 lockdep_print_held_locks(curr);
1561 1708
1562 printk("\nstack backtrace:\n"); 1709 printk("\nstack backtrace:\n");
@@ -1826,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1826 struct list_head *hash_head = chainhashentry(chain_key); 1973 struct list_head *hash_head = chainhashentry(chain_key);
1827 struct lock_chain *chain; 1974 struct lock_chain *chain;
1828 struct held_lock *hlock_curr, *hlock_next; 1975 struct held_lock *hlock_curr, *hlock_next;
1829 int i, j, n, cn; 1976 int i, j;
1830 1977
1831 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1832 return 0; 1979 return 0;
@@ -1886,15 +2033,9 @@ cache_hit:
1886 } 2033 }
1887 i++; 2034 i++;
1888 chain->depth = curr->lockdep_depth + 1 - i; 2035 chain->depth = curr->lockdep_depth + 1 - i;
1889 cn = nr_chain_hlocks; 2036 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1890 while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { 2037 chain->base = nr_chain_hlocks;
1891 n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); 2038 nr_chain_hlocks += chain->depth;
1892 if (n == cn)
1893 break;
1894 cn = n;
1895 }
1896 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1897 chain->base = cn;
1898 for (j = 0; j < chain->depth - 1; j++, i++) { 2039 for (j = 0; j < chain->depth - 1; j++, i++) {
1899 int lock_id = curr->held_locks[i].class_idx - 1; 2040 int lock_id = curr->held_locks[i].class_idx - 1;
1900 chain_hlocks[chain->base + j] = lock_id; 2041 chain_hlocks[chain->base + j] = lock_id;
@@ -2011,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr)
2011#endif 2152#endif
2012} 2153}
2013 2154
2155static void
2156print_usage_bug_scenario(struct held_lock *lock)
2157{
2158 struct lock_class *class = hlock_class(lock);
2159
2160 printk(" Possible unsafe locking scenario:\n\n");
2161 printk(" CPU0\n");
2162 printk(" ----\n");
2163 printk(" lock(");
2164 __print_lock_name(class);
2165 printk(");\n");
2166 printk(" <Interrupt>\n");
2167 printk(" lock(");
2168 __print_lock_name(class);
2169 printk(");\n");
2170 printk("\n *** DEADLOCK ***\n\n");
2171}
2172
2014static int 2173static int
2015print_usage_bug(struct task_struct *curr, struct held_lock *this, 2174print_usage_bug(struct task_struct *curr, struct held_lock *this,
2016 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 2175 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
@@ -2039,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2039 2198
2040 print_irqtrace_events(curr); 2199 print_irqtrace_events(curr);
2041 printk("\nother info that might help us debug this:\n"); 2200 printk("\nother info that might help us debug this:\n");
2201 print_usage_bug_scenario(this);
2202
2042 lockdep_print_held_locks(curr); 2203 lockdep_print_held_locks(curr);
2043 2204
2044 printk("\nstack backtrace:\n"); 2205 printk("\nstack backtrace:\n");
@@ -2073,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2073 struct held_lock *this, int forwards, 2234 struct held_lock *this, int forwards,
2074 const char *irqclass) 2235 const char *irqclass)
2075{ 2236{
2237 struct lock_list *entry = other;
2238 struct lock_list *middle = NULL;
2239 int depth;
2240
2076 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2077 return 0; 2242 return 0;
2078 2243
@@ -2091,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr,
2091 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2256 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
2092 2257
2093 printk("\nother info that might help us debug this:\n"); 2258 printk("\nother info that might help us debug this:\n");
2259
2260 /* Find a middle lock (if one exists) */
2261 depth = get_lock_depth(other);
2262 do {
2263 if (depth == 0 && (entry != root)) {
2264 printk("lockdep:%s bad path found in chain graph\n", __func__);
2265 break;
2266 }
2267 middle = entry;
2268 entry = get_lock_parent(entry);
2269 depth--;
2270 } while (entry && entry != root && (depth >= 0));
2271 if (forwards)
2272 print_irq_lock_scenario(root, other,
2273 middle ? middle->class : root->class, other->class);
2274 else
2275 print_irq_lock_scenario(other, root,
2276 middle ? middle->class : other->class, root->class);
2277
2094 lockdep_print_held_locks(curr); 2278 lockdep_print_held_locks(curr);
2095 2279
2096 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); 2280 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
@@ -3242,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock)
3242 int ret = 0; 3426 int ret = 0;
3243 3427
3244 if (unlikely(current->lockdep_recursion)) 3428 if (unlikely(current->lockdep_recursion))
3245 return ret; 3429 return 1; /* avoid false negative lockdep_assert_held() */
3246 3430
3247 raw_local_irq_save(flags); 3431 raw_local_irq_save(flags);
3248 check_flags(flags); 3432 check_flags(flags);
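
The lockdep hunks add the "Possible unsafe locking scenario" diagrams to the splats and simplify the chain-hlocks bookkeeping. For the circular-dependency report, the classic trigger is an ABBA ordering like the hypothetical sketch below; with this change the resulting splat also contains the two-CPU lock() diagram emitted by print_circular_lock_scenario():

#include <linux/mutex.h>

static DEFINE_MUTEX(a);
static DEFINE_MUTEX(b);

void path_one(void)                     /* takes a, then b */
{
        mutex_lock(&a);
        mutex_lock(&b);
        mutex_unlock(&b);
        mutex_unlock(&a);
}

void path_two(void)                     /* takes b, then a: lockdep flags the cycle */
{
        mutex_lock(&b);
        mutex_lock(&a);
        mutex_unlock(&a);
        mutex_unlock(&b);
}
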
diff --git a/kernel/module.c b/kernel/module.c
index d5938a5c19c4..795bdc7f5c3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -57,6 +57,7 @@
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h> 59#include <linux/pfn.h>
60#include <linux/bsearch.h>
60 61
61#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
62#include <trace/events/module.h> 63#include <trace/events/module.h>
@@ -240,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr,
240 struct module *owner, 241 struct module *owner,
241 bool (*fn)(const struct symsearch *syms, 242 bool (*fn)(const struct symsearch *syms,
242 struct module *owner, 243 struct module *owner,
243 unsigned int symnum, void *data), 244 void *data),
244 void *data) 245 void *data)
245{ 246{
246 unsigned int i, j; 247 unsigned int j;
247 248
248 for (j = 0; j < arrsize; j++) { 249 for (j = 0; j < arrsize; j++) {
249 for (i = 0; i < arr[j].stop - arr[j].start; i++) 250 if (fn(&arr[j], owner, data))
250 if (fn(&arr[j], owner, i, data)) 251 return true;
251 return true;
252 } 252 }
253 253
254 return false; 254 return false;
255} 255}
256 256
257/* Returns true as soon as fn returns true, otherwise false. */ 257/* Returns true as soon as fn returns true, otherwise false. */
258bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, 258bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
259 unsigned int symnum, void *data), void *data) 259 struct module *owner,
260 void *data),
261 void *data)
260{ 262{
261 struct module *mod; 263 struct module *mod;
262 static const struct symsearch arr[] = { 264 static const struct symsearch arr[] = {
@@ -309,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
309 } 311 }
310 return false; 312 return false;
311} 313}
312EXPORT_SYMBOL_GPL(each_symbol); 314EXPORT_SYMBOL_GPL(each_symbol_section);
313 315
314struct find_symbol_arg { 316struct find_symbol_arg {
315 /* Input */ 317 /* Input */
@@ -323,15 +325,12 @@ struct find_symbol_arg {
323 const struct kernel_symbol *sym; 325 const struct kernel_symbol *sym;
324}; 326};
325 327
326static bool find_symbol_in_section(const struct symsearch *syms, 328static bool check_symbol(const struct symsearch *syms,
327 struct module *owner, 329 struct module *owner,
328 unsigned int symnum, void *data) 330 unsigned int symnum, void *data)
329{ 331{
330 struct find_symbol_arg *fsa = data; 332 struct find_symbol_arg *fsa = data;
331 333
332 if (strcmp(syms->start[symnum].name, fsa->name) != 0)
333 return false;
334
335 if (!fsa->gplok) { 334 if (!fsa->gplok) {
336 if (syms->licence == GPL_ONLY) 335 if (syms->licence == GPL_ONLY)
337 return false; 336 return false;
@@ -365,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms,
365 return true; 364 return true;
366} 365}
367 366
367static int cmp_name(const void *va, const void *vb)
368{
369 const char *a;
370 const struct kernel_symbol *b;
371 a = va; b = vb;
372 return strcmp(a, b->name);
373}
374
375static bool find_symbol_in_section(const struct symsearch *syms,
376 struct module *owner,
377 void *data)
378{
379 struct find_symbol_arg *fsa = data;
380 struct kernel_symbol *sym;
381
382 sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
383 sizeof(struct kernel_symbol), cmp_name);
384
385 if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
386 return true;
387
388 return false;
389}
390
368/* Find a symbol and return it, along with, (optional) crc and 391/* Find a symbol and return it, along with, (optional) crc and
369 * (optional) module which owns it. Needs preempt disabled or module_mutex. */ 392 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
370const struct kernel_symbol *find_symbol(const char *name, 393const struct kernel_symbol *find_symbol(const char *name,
@@ -379,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name,
379 fsa.gplok = gplok; 402 fsa.gplok = gplok;
380 fsa.warn = warn; 403 fsa.warn = warn;
381 404
382 if (each_symbol(find_symbol_in_section, &fsa)) { 405 if (each_symbol_section(find_symbol_in_section, &fsa)) {
383 if (owner) 406 if (owner)
384 *owner = fsa.owner; 407 *owner = fsa.owner;
385 if (crc) 408 if (crc)
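The switch from a per-symbol callback to find_symbol_in_section() doing one bsearch() per export section relies on the export tables being sorted by symbol name at build time; cmp_name() compares a bare string key against a struct kernel_symbol element. A stand-alone illustration of that bsearch()-with-string-key pattern (the symbol table below is made up for the example):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct sym { const char *name; unsigned long value; };

    static int cmp_name(const void *va, const void *vb)
    {
            const char *key = va;             /* bsearch key: plain string */
            const struct sym *s = vb;         /* array element */
            return strcmp(key, s->name);
    }

    int main(void)
    {
            static const struct sym tab[] = { /* must be sorted by name */
                    { "alloc_pages", 0x1000 },
                    { "printk",      0x2000 },
                    { "vmalloc",     0x3000 },
            };
            const struct sym *s = bsearch("printk", tab,
                                          sizeof(tab) / sizeof(tab[0]),
                                          sizeof(tab[0]), cmp_name);
            printf("%s -> %#lx\n", s ? s->name : "?", s ? s->value : 0UL);
            return 0;
    }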
@@ -1607,27 +1630,28 @@ static void set_section_ro_nx(void *base,
1607 } 1630 }
1608} 1631}
1609 1632
1610/* Setting memory back to RW+NX before releasing it */ 1633static void unset_module_core_ro_nx(struct module *mod)
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{ 1634{
1613 unsigned long total_pages; 1635 set_page_attributes(mod->module_core + mod->core_text_size,
1614 1636 mod->module_core + mod->core_size,
1615 if (mod->module_core == module_region) { 1637 set_memory_x);
1616 /* Set core as NX+RW */ 1638 set_page_attributes(mod->module_core,
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); 1639 mod->module_core + mod->core_ro_size,
1618 set_memory_nx((unsigned long)mod->module_core, total_pages); 1640 set_memory_rw);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages); 1641}
1620 1642
1621 } else if (mod->module_init == module_region) { 1643static void unset_module_init_ro_nx(struct module *mod)
1622 /* Set init as NX+RW */ 1644{
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); 1645 set_page_attributes(mod->module_init + mod->init_text_size,
1624 set_memory_nx((unsigned long)mod->module_init, total_pages); 1646 mod->module_init + mod->init_size,
1625 set_memory_rw((unsigned long)mod->module_init, total_pages); 1647 set_memory_x);
1626 } 1648 set_page_attributes(mod->module_init,
1649 mod->module_init + mod->init_ro_size,
1650 set_memory_rw);
1627} 1651}
1628 1652
1629/* Iterate through all modules and set each module's text as RW */ 1653/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw() 1654void set_all_modules_text_rw(void)
1631{ 1655{
1632 struct module *mod; 1656 struct module *mod;
1633 1657
@@ -1648,7 +1672,7 @@ void set_all_modules_text_rw()
1648} 1672}
1649 1673
1650/* Iterate through all modules and set each module's text as RO */ 1674/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro() 1675void set_all_modules_text_ro(void)
1652{ 1676{
1653 struct module *mod; 1677 struct module *mod;
1654 1678
@@ -1669,7 +1693,8 @@ void set_all_modules_text_ro()
1669} 1693}
1670#else 1694#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } 1695static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } 1696static void unset_module_core_ro_nx(struct module *mod) { }
1697static void unset_module_init_ro_nx(struct module *mod) { }
1673#endif 1698#endif
1674 1699
1675/* Free a module, remove from lists, etc. */ 1700/* Free a module, remove from lists, etc. */
@@ -1696,7 +1721,7 @@ static void free_module(struct module *mod)
1696 destroy_params(mod->kp, mod->num_kp); 1721 destroy_params(mod->kp, mod->num_kp);
1697 1722
1698 /* This may be NULL, but that's OK */ 1723 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init); 1724 unset_module_init_ro_nx(mod);
1700 module_free(mod, mod->module_init); 1725 module_free(mod, mod->module_init);
1701 kfree(mod->args); 1726 kfree(mod->args);
1702 percpu_modfree(mod); 1727 percpu_modfree(mod);
@@ -1705,7 +1730,7 @@ static void free_module(struct module *mod)
1705 lockdep_free_key_range(mod->module_core, mod->core_size); 1730 lockdep_free_key_range(mod->module_core, mod->core_size);
1706 1731
1707 /* Finally, free the core (containing the module structure) */ 1732 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core); 1733 unset_module_core_ro_nx(mod);
1709 module_free(mod, mod->module_core); 1734 module_free(mod, mod->module_core);
1710 1735
1711#ifdef CONFIG_MPU 1736#ifdef CONFIG_MPU
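Both unset helpers above lean on set_page_attributes(), which sits elsewhere in module.c and is not part of these hunks. As an assumption about the helper in this period, it is roughly a page-granular wrapper that hands a PFN range to set_memory_x()/set_memory_rw():

    static void set_page_attributes(void *start, void *end,
                                    int (*set)(unsigned long start, int num_pages))
    {
            /* round both boundaries down to page frames ... */
            unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
            unsigned long end_pfn = PFN_DOWN((unsigned long)end);

            /* ... and apply the setter to the resulting page range, if any */
            if (end_pfn > begin_pfn)
                    set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
    }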
@@ -2030,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
2030 const struct kernel_symbol *start, 2055 const struct kernel_symbol *start,
2031 const struct kernel_symbol *stop) 2056 const struct kernel_symbol *stop)
2032{ 2057{
2033 const struct kernel_symbol *ks = start; 2058 return bsearch(name, start, stop - start,
2034 for (; ks < stop; ks++) 2059 sizeof(struct kernel_symbol), cmp_name);
2035 if (strcmp(ks->name, name) == 0)
2036 return ks;
2037 return NULL;
2038} 2060}
2039 2061
2040static int is_exported(const char *name, unsigned long value, 2062static int is_exported(const char *name, unsigned long value,
@@ -2790,7 +2812,7 @@ static struct module *load_module(void __user *umod,
2790 } 2812 }
2791 2813
2792 /* This has to be done once we're sure module name is unique. */ 2814 /* This has to be done once we're sure module name is unique. */
2793 if (!mod->taints) 2815 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2794 dynamic_debug_setup(info.debug, info.num_debug); 2816 dynamic_debug_setup(info.debug, info.num_debug);
2795 2817
2796 /* Find duplicate symbols */ 2818 /* Find duplicate symbols */
@@ -2827,7 +2849,7 @@ static struct module *load_module(void __user *umod,
2827 module_bug_cleanup(mod); 2849 module_bug_cleanup(mod);
2828 2850
2829 ddebug: 2851 ddebug:
2830 if (!mod->taints) 2852 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2831 dynamic_debug_remove(info.debug); 2853 dynamic_debug_remove(info.debug);
2832 unlock: 2854 unlock:
2833 mutex_unlock(&module_mutex); 2855 mutex_unlock(&module_mutex);
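The relaxed taint test above keeps dynamic debug descriptors for modules whose only taint is TAINT_CRAP (staging drivers), while still skipping them for any other taint. A small stand-alone check of the same bitmask logic (the TAINT_CRAP value is taken from memory of the kernel headers of this period and should be treated as an assumption):

    #include <stdio.h>

    #define TAINT_CRAP 10   /* staging-driver taint bit (assumed value) */

    static int keep_dynamic_debug(unsigned int taints)
    {
            /* untainted, or tainted by exactly the staging bit and nothing else */
            return !taints || taints == (1U << TAINT_CRAP);
    }

    int main(void)
    {
            printf("%d\n", keep_dynamic_debug(0));                        /* 1 */
            printf("%d\n", keep_dynamic_debug(1U << TAINT_CRAP));         /* 1 */
            printf("%d\n", keep_dynamic_debug((1U << TAINT_CRAP) | 1U));  /* 0 */
            return 0;
    }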
@@ -2931,10 +2953,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2931 mod->symtab = mod->core_symtab; 2953 mod->symtab = mod->core_symtab;
2932 mod->strtab = mod->core_strtab; 2954 mod->strtab = mod->core_strtab;
2933#endif 2955#endif
2934 unset_section_ro_nx(mod, mod->module_init); 2956 unset_module_init_ro_nx(mod);
2935 module_free(mod, mod->module_init); 2957 module_free(mod, mod->module_init);
2936 mod->module_init = NULL; 2958 mod->module_init = NULL;
2937 mod->init_size = 0; 2959 mod->init_size = 0;
2960 mod->init_ro_size = 0;
2938 mod->init_text_size = 0; 2961 mod->init_text_size = 0;
2939 mutex_unlock(&module_mutex); 2962 mutex_unlock(&module_mutex);
2940 2963
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 78 DEBUG_LOCKS_WARN_ON(lock->owner != current);
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 80 mutex_clear_owner(lock);
81} 81}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current_thread_info(); 32 lock->owner = current;
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index c4195fa98900..d607ed5dd441 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock);
131 */ 131 */
132static inline int __sched 132static inline int __sched
133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
134 unsigned long ip) 134 struct lockdep_map *nest_lock, unsigned long ip)
135{ 135{
136 struct task_struct *task = current; 136 struct task_struct *task = current;
137 struct mutex_waiter waiter; 137 struct mutex_waiter waiter;
138 unsigned long flags; 138 unsigned long flags;
139 139
140 preempt_disable(); 140 preempt_disable();
141 mutex_acquire(&lock->dep_map, subclass, 0, ip); 141 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
142 142
143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
144 /* 144 /*
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
160 */ 160 */
161 161
162 for (;;) { 162 for (;;) {
163 struct thread_info *owner; 163 struct task_struct *owner;
164
165 /*
166 * If we own the BKL, then don't spin. The owner of
167 * the mutex might be waiting on us to release the BKL.
168 */
169 if (unlikely(current->lock_depth >= 0))
170 break;
171 164
172 /* 165 /*
173 * If there's an owner, wait for it to either 166 * If there's an owner, wait for it to either
@@ -276,16 +269,25 @@ void __sched
276mutex_lock_nested(struct mutex *lock, unsigned int subclass) 269mutex_lock_nested(struct mutex *lock, unsigned int subclass)
277{ 270{
278 might_sleep(); 271 might_sleep();
279 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); 272 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
280} 273}
281 274
282EXPORT_SYMBOL_GPL(mutex_lock_nested); 275EXPORT_SYMBOL_GPL(mutex_lock_nested);
283 276
277void __sched
278_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
279{
280 might_sleep();
281 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
282}
283
284EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
285
284int __sched 286int __sched
285mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) 287mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
286{ 288{
287 might_sleep(); 289 might_sleep();
288 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); 290 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
289} 291}
290EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 292EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
291 293
@@ -294,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
294{ 296{
295 might_sleep(); 297 might_sleep();
296 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 298 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
297 subclass, _RET_IP_); 299 subclass, NULL, _RET_IP_);
298} 300}
299 301
300EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 302EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -400,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
400{ 402{
401 struct mutex *lock = container_of(lock_count, struct mutex, count); 403 struct mutex *lock = container_of(lock_count, struct mutex, count);
402 404
403 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); 405 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
404} 406}
405 407
406static noinline int __sched 408static noinline int __sched
@@ -408,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
408{ 410{
409 struct mutex *lock = container_of(lock_count, struct mutex, count); 411 struct mutex *lock = container_of(lock_count, struct mutex, count);
410 412
411 return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); 413 return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
412} 414}
413 415
414static noinline int __sched 416static noinline int __sched
@@ -416,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count)
416{ 418{
417 struct mutex *lock = container_of(lock_count, struct mutex, count); 419 struct mutex *lock = container_of(lock_count, struct mutex, count);
418 420
419 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); 421 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
420} 422}
421#endif 423#endif
422 424
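_mutex_lock_nest_lock(), added above, is the out-of-line half of a lockdep annotation for taking many mutexes of the same class while a single outer lock already serializes them. The header-side wrapper lives in include/linux/mutex.h and is not shown in this diff, so its exact text below is an assumption; the usage fragment uses illustrative names:

    /* assumed wrapper: type-check the nest lock and hand over its dep_map */
    #define mutex_lock_nest_lock(lock, nest_lock)                           \
    do {                                                                    \
            typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
            _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
    } while (0)

    /* take an arbitrary number of same-class child locks; lockdep accepts
     * the nesting because parent->lock is held across all of them */
    mutex_lock(&parent->lock);
    list_for_each_entry(child, &parent->children, node)
            mutex_lock_nest_lock(&child->lock, &parent->lock);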
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
19#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current_thread_info(); 22 lock->owner = current;
23} 23}
24 24
25static inline void mutex_clear_owner(struct mutex *lock) 25static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
deleted file mode 100644
index 2c98ad94ba0e..000000000000
--- a/kernel/ns_cgroup.c
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * ns_cgroup.c - namespace cgroup subsystem
3 *
4 * Copyright 2006, 2007 IBM Corp
5 */
6
7#include <linux/module.h>
8#include <linux/cgroup.h>
9#include <linux/fs.h>
10#include <linux/proc_fs.h>
11#include <linux/slab.h>
12#include <linux/nsproxy.h>
13
14struct ns_cgroup {
15 struct cgroup_subsys_state css;
16};
17
18struct cgroup_subsys ns_subsys;
19
20static inline struct ns_cgroup *cgroup_to_ns(
21 struct cgroup *cgroup)
22{
23 return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
24 struct ns_cgroup, css);
25}
26
27int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{
29 char name[PROC_NUMBUF];
30
31 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
32 return cgroup_clone(task, &ns_subsys, name);
33}
34
35/*
36 * Rules:
37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup
39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN
41 * b. your cgroup is an ancestor of task's destination cgroup
42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof)
44 */
45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct task_struct *task, bool threadgroup)
47{
48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN))
50 return -EPERM;
51
52 if (!cgroup_is_descendant(new_cgroup, current))
53 return -EPERM;
54 }
55
56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM;
58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
71 return 0;
72}
73
74/*
75 * Rules: you can only create a cgroup if
76 * 1. you are capable(CAP_SYS_ADMIN)
77 * 2. the target cgroup is a descendant of your own cgroup
78 */
79static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
80 struct cgroup *cgroup)
81{
82 struct ns_cgroup *ns_cgroup;
83
84 if (!capable(CAP_SYS_ADMIN))
85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
96
97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
98 if (!ns_cgroup)
99 return ERR_PTR(-ENOMEM);
100 return &ns_cgroup->css;
101}
102
103static void ns_destroy(struct cgroup_subsys *ss,
104 struct cgroup *cgroup)
105{
106 struct ns_cgroup *ns_cgroup;
107
108 ns_cgroup = cgroup_to_ns(cgroup);
109 kfree(ns_cgroup);
110}
111
112struct cgroup_subsys ns_subsys = {
113 .name = "ns",
114 .can_attach = ns_can_attach,
115 .create = ns_create,
116 .destroy = ns_destroy,
117 .subsys_id = ns_subsys_id,
118};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a05d191ffdd9..d6a00f3de15d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h>
26#include <linux/file.h>
27#include <linux/syscalls.h>
25 28
26static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
27 30
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
198 goto out; 201 goto out;
199 } 202 }
200 203
201 err = ns_cgroup_clone(current, task_pid(current));
202 if (err)
203 put_nsproxy(*new_nsp);
204
205out: 204out:
206 return err; 205 return err;
207} 206}
@@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p)
233 switch_task_namespaces(p, NULL); 232 switch_task_namespaces(p, NULL);
234} 233}
235 234
235SYSCALL_DEFINE2(setns, int, fd, int, nstype)
236{
237 const struct proc_ns_operations *ops;
238 struct task_struct *tsk = current;
239 struct nsproxy *new_nsproxy;
240 struct proc_inode *ei;
241 struct file *file;
242 int err;
243
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd);
248 if (IS_ERR(file))
249 return PTR_ERR(file);
250
251 err = -EINVAL;
252 ei = PROC_I(file->f_dentry->d_inode);
253 ops = ei->ns_ops;
254 if (nstype && (ops->type != nstype))
255 goto out;
256
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
258 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy);
260 goto out;
261 }
262
263 err = ops->install(new_nsproxy, ei->ns);
264 if (err) {
265 free_nsproxy(new_nsproxy);
266 goto out;
267 }
268 switch_task_namespaces(tsk, new_nsproxy);
269out:
270 fput(file);
271 return err;
272}
273
236static int __init nsproxy_cache_init(void) 274static int __init nsproxy_cache_init(void)
237{ 275{
238 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
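The new setns() syscall above takes an open file descriptor for one of the /proc/<pid>/ns/* files and moves the calling task into that namespace. A minimal userspace sketch (the PID is an example; this needs CAP_SYS_ADMIN, and on a libc without a setns() wrapper the call would go through syscall(2) instead):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/1234/ns/net", O_RDONLY);   /* target task */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* CLONE_NEWNET: fail unless fd really refers to a net namespace;
             * passing 0 as nstype would accept any namespace type. */
            if (setns(fd, CLONE_NEWNET)) {
                    perror("setns");
                    return 1;
            }
            close(fd);
            /* sockets created from here on live in the target's netns */
            return 0;
    }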
diff --git a/kernel/params.c b/kernel/params.c
index 7ab388a48a2e..ed72e1330862 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp);
297int param_set_bool(const char *val, const struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
298{ 298{
299 bool v; 299 bool v;
300 int ret;
300 301
301 /* No equals means "set"... */ 302 /* No equals means "set"... */
302 if (!val) val = "1"; 303 if (!val) val = "1";
303 304
304 /* One of =[yYnN01] */ 305 /* One of =[yYnN01] */
305 switch (val[0]) { 306 ret = strtobool(val, &v);
306 case 'y': case 'Y': case '1': 307 if (ret)
307 v = true; 308 return ret;
308 break;
309 case 'n': case 'N': case '0':
310 v = false;
311 break;
312 default:
313 return -EINVAL;
314 }
315 309
316 if (kp->flags & KPARAM_ISBOOL) 310 if (kp->flags & KPARAM_ISBOOL)
317 *(bool *)kp->arg = v; 311 *(bool *)kp->arg = v;
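param_set_bool() now delegates the [yYnN01] parsing to strtobool(). That helper lives in lib/string.c and is not part of this diff; as an assumption about its shape in this period, it is essentially the switch statement that used to sit here:

    int strtobool(const char *s, bool *res)
    {
            switch (s[0]) {
            case 'y':
            case 'Y':
            case '1':
                    *res = true;
                    break;
            case 'n':
            case 'N':
            case '0':
                    *res = false;
                    break;
            default:
                    return -EINVAL;
            }
            return 0;
    }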
@@ -821,15 +815,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
821 return sprintf(buf, "%s\n", vattr->version); 815 return sprintf(buf, "%s\n", vattr->version);
822} 816}
823 817
824extern struct module_version_attribute __start___modver[], __stop___modver[]; 818extern const struct module_version_attribute *__start___modver[];
819extern const struct module_version_attribute *__stop___modver[];
825 820
826static void __init version_sysfs_builtin(void) 821static void __init version_sysfs_builtin(void)
827{ 822{
828 const struct module_version_attribute *vattr; 823 const struct module_version_attribute **p;
829 struct module_kobject *mk; 824 struct module_kobject *mk;
830 int err; 825 int err;
831 826
832 for (vattr = __start___modver; vattr < __stop___modver; vattr++) { 827 for (p = __start___modver; p < __stop___modver; p++) {
828 const struct module_version_attribute *vattr = *p;
829
833 mk = locate_module_kobject(vattr->module_name); 830 mk = locate_module_kobject(vattr->module_name);
834 if (mk) { 831 if (mk) {
835 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); 832 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 0da058bff8eb..6824ca7d4d0c 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -40,6 +40,7 @@
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/platform_device.h> 41#include <linux/platform_device.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/kernel.h>
43 44
44#include <linux/uaccess.h> 45#include <linux/uaccess.h>
45 46
@@ -53,11 +54,17 @@ enum pm_qos_type {
53 PM_QOS_MIN /* return the smallest value */ 54 PM_QOS_MIN /* return the smallest value */
54}; 55};
55 56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
60 * types linux supports for 32 bit quantites
61 */
56struct pm_qos_object { 62struct pm_qos_object {
57 struct plist_head requests; 63 struct plist_head requests;
58 struct blocking_notifier_head *notifiers; 64 struct blocking_notifier_head *notifiers;
59 struct miscdevice pm_qos_power_miscdev; 65 struct miscdevice pm_qos_power_miscdev;
60 char *name; 66 char *name;
67 s32 target_value; /* Do not change to 64 bit */
61 s32 default_value; 68 s32 default_value;
62 enum pm_qos_type type; 69 enum pm_qos_type type;
63}; 70};
@@ -70,7 +77,8 @@ static struct pm_qos_object cpu_dma_pm_qos = {
70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), 77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
71 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
72 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
73 .default_value = 2000 * USEC_PER_SEC, 80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
74 .type = PM_QOS_MIN, 82 .type = PM_QOS_MIN,
75}; 83};
76 84
@@ -79,7 +87,8 @@ static struct pm_qos_object network_lat_pm_qos = {
79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), 87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
80 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
81 .name = "network_latency", 89 .name = "network_latency",
82 .default_value = 2000 * USEC_PER_SEC, 90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
83 .type = PM_QOS_MIN 92 .type = PM_QOS_MIN
84}; 93};
85 94
@@ -89,7 +98,8 @@ static struct pm_qos_object network_throughput_pm_qos = {
89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), 98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
90 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
91 .name = "network_throughput", 100 .name = "network_throughput",
92 .default_value = 0, 101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
93 .type = PM_QOS_MAX, 103 .type = PM_QOS_MAX,
94}; 104};
95 105
@@ -135,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
135 } 145 }
136} 146}
137 147
148static inline s32 pm_qos_read_value(struct pm_qos_object *o)
149{
150 return o->target_value;
151}
152
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
154{
155 o->target_value = value;
156}
157
138static void update_target(struct pm_qos_object *o, struct plist_node *node, 158static void update_target(struct pm_qos_object *o, struct plist_node *node,
139 int del, int value) 159 int del, int value)
140{ 160{
@@ -159,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
159 plist_add(node, &o->requests); 179 plist_add(node, &o->requests);
160 } 180 }
161 curr_value = pm_qos_get_value(o); 181 curr_value = pm_qos_get_value(o);
182 pm_qos_set_value(o, curr_value);
162 spin_unlock_irqrestore(&pm_qos_lock, flags); 183 spin_unlock_irqrestore(&pm_qos_lock, flags);
163 184
164 if (prev_value != curr_value) 185 if (prev_value != curr_value)
@@ -193,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor)
193 * pm_qos_request - returns current system wide qos expectation 214 * pm_qos_request - returns current system wide qos expectation
194 * @pm_qos_class: identification of which qos value is requested 215 * @pm_qos_class: identification of which qos value is requested
195 * 216 *
196 * This function returns the current target value in an atomic manner. 217 * This function returns the current target value.
197 */ 218 */
198int pm_qos_request(int pm_qos_class) 219int pm_qos_request(int pm_qos_class)
199{ 220{
200 unsigned long flags; 221 return pm_qos_read_value(pm_qos_array[pm_qos_class]);
201 int value;
202
203 spin_lock_irqsave(&pm_qos_lock, flags);
204 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
205 spin_unlock_irqrestore(&pm_qos_lock, flags);
206
207 return value;
208} 222}
209EXPORT_SYMBOL_GPL(pm_qos_request); 223EXPORT_SYMBOL_GPL(pm_qos_request);
210 224
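The new target_value field is what makes the lock-free pm_qos_request() above safe: writers still recompute the aggregate under pm_qos_lock, but they publish the result as a single aligned 32-bit store that readers may load without the lock. A userspace analogue of the same pattern, with C11 atomics making explicit the natural-alignment assumption the kernel code relies on implicitly:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>

    static pthread_mutex_t qos_lock = PTHREAD_MUTEX_INITIALIZER;
    static _Atomic int32_t target_value;

    static void update_target(int32_t new_value)
    {
            pthread_mutex_lock(&qos_lock);          /* writers serialize */
            atomic_store_explicit(&target_value, new_value,
                                  memory_order_relaxed);
            pthread_mutex_unlock(&qos_lock);
    }

    static int32_t read_target(void)
    {
            /* no lock: an aligned 32-bit load cannot be torn */
            return atomic_load_explicit(&target_value, memory_order_relaxed);
    }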
@@ -385,7 +399,7 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
385 s32 value; 399 s32 value;
386 unsigned long flags; 400 unsigned long flags;
387 struct pm_qos_object *o; 401 struct pm_qos_object *o;
388 struct pm_qos_request_list *pm_qos_req = filp->private_data;; 402 struct pm_qos_request_list *pm_qos_req = filp->private_data;
389 403
390 if (!pm_qos_req) 404 if (!pm_qos_req)
391 return -EINVAL; 405 return -EINVAL;
@@ -404,24 +418,36 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
404 size_t count, loff_t *f_pos) 418 size_t count, loff_t *f_pos)
405{ 419{
406 s32 value; 420 s32 value;
407 int x;
408 char ascii_value[11];
409 struct pm_qos_request_list *pm_qos_req; 421 struct pm_qos_request_list *pm_qos_req;
410 422
411 if (count == sizeof(s32)) { 423 if (count == sizeof(s32)) {
412 if (copy_from_user(&value, buf, sizeof(s32))) 424 if (copy_from_user(&value, buf, sizeof(s32)))
413 return -EFAULT; 425 return -EFAULT;
414 } else if (count == 11) { /* len('0x12345678/0') */ 426 } else if (count <= 11) { /* ASCII perhaps? */
415 if (copy_from_user(ascii_value, buf, 11)) 427 char ascii_value[11];
428 unsigned long int ulval;
429 int ret;
430
431 if (copy_from_user(ascii_value, buf, count))
416 return -EFAULT; 432 return -EFAULT;
417 if (strlen(ascii_value) != 10) 433
418 return -EINVAL; 434 if (count > 10) {
419 x = sscanf(ascii_value, "%x", &value); 435 if (ascii_value[10] == '\n')
420 if (x != 1) 436 ascii_value[10] = '\0';
437 else
438 return -EINVAL;
439 } else {
440 ascii_value[count] = '\0';
441 }
442 ret = strict_strtoul(ascii_value, 16, &ulval);
443 if (ret) {
444 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
421 return -EINVAL; 445 return -EINVAL;
422 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); 446 }
423 } else 447 value = (s32)lower_32_bits(ulval);
448 } else {
424 return -EINVAL; 449 return -EINVAL;
450 }
425 451
426 pm_qos_req = filp->private_data; 452 pm_qos_req = filp->private_data;
427 pm_qos_update_request(pm_qos_req, value); 453 pm_qos_update_request(pm_qos_req, value);
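For reference on what the relaxed pm_qos_power_write() accepts: a request made through the misc device is held for as long as the file descriptor stays open, and the value can now be written either as a raw 4-byte s32 or as ASCII (parsed as hex by the strict_strtoul(..., 16, ...) call above) without having to be exactly eleven bytes. A userspace sketch, using the cpu_dma_latency device defined earlier in this file:

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/dev/cpu_dma_latency", O_WRONLY);
            int32_t usec = 20;                     /* request <= 20 us */

            if (fd < 0)
                    return 1;
            write(fd, &usec, sizeof(usec));        /* binary form */
            /* equivalently, ASCII hex: write(fd, "14", 2); */
            pause();                               /* hold the request */
            close(fd);
            return 0;
    }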
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 0791b13df7bf..58f405b581e7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1514,7 +1514,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1514 return -EFAULT; 1514 return -EFAULT;
1515 1515
1516 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1517 restart_block->nanosleep.index = which_clock; 1517 restart_block->nanosleep.clockid = which_clock;
1518 restart_block->nanosleep.rmtp = rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp); 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1520 } 1520 }
@@ -1523,7 +1523,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1523 1523
1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1525{ 1525{
1526 clockid_t which_clock = restart_block->nanosleep.index; 1526 clockid_t which_clock = restart_block->nanosleep.clockid;
1527 struct timespec t; 1527 struct timespec t;
1528 struct itimerspec it; 1528 struct itimerspec it;
1529 int error; 1529 int error;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e5498d7405c3..4556182527f3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)
491 return tmr; 491 return tmr;
492} 492}
493 493
494static void k_itimer_rcu_free(struct rcu_head *head)
495{
496 struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
497
498 kmem_cache_free(posix_timers_cache, tmr);
499}
500
494#define IT_ID_SET 1 501#define IT_ID_SET 1
495#define IT_ID_NOT_SET 0 502#define IT_ID_NOT_SET 0
496static void release_posix_timer(struct k_itimer *tmr, int it_id_set) 503static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
@@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
503 } 510 }
504 put_pid(tmr->it_pid); 511 put_pid(tmr->it_pid);
505 sigqueue_free(tmr->sigq); 512 sigqueue_free(tmr->sigq);
506 kmem_cache_free(posix_timers_cache, tmr); 513 call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
507} 514}
508 515
509static struct k_clock *clockid_to_kclock(const clockid_t id) 516static struct k_clock *clockid_to_kclock(const clockid_t id)
@@ -631,22 +638,18 @@ out:
631static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) 638static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
632{ 639{
633 struct k_itimer *timr; 640 struct k_itimer *timr;
634 /* 641
635 * Watch out here. We do a irqsave on the idr_lock and pass the 642 rcu_read_lock();
636 * flags part over to the timer lock. Must not let interrupts in
637 * while we are moving the lock.
638 */
639 spin_lock_irqsave(&idr_lock, *flags);
640 timr = idr_find(&posix_timers_id, (int)timer_id); 643 timr = idr_find(&posix_timers_id, (int)timer_id);
641 if (timr) { 644 if (timr) {
642 spin_lock(&timr->it_lock); 645 spin_lock_irqsave(&timr->it_lock, *flags);
643 if (timr->it_signal == current->signal) { 646 if (timr->it_signal == current->signal) {
644 spin_unlock(&idr_lock); 647 rcu_read_unlock();
645 return timr; 648 return timr;
646 } 649 }
647 spin_unlock(&timr->it_lock); 650 spin_unlock_irqrestore(&timr->it_lock, *flags);
648 } 651 }
649 spin_unlock_irqrestore(&idr_lock, *flags); 652 rcu_read_unlock();
650 653
651 return NULL; 654 return NULL;
652} 655}
@@ -1056,7 +1059,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1056 */ 1059 */
1057long clock_nanosleep_restart(struct restart_block *restart_block) 1060long clock_nanosleep_restart(struct restart_block *restart_block)
1058{ 1061{
1059 clockid_t which_clock = restart_block->nanosleep.index; 1062 clockid_t which_clock = restart_block->nanosleep.clockid;
1060 struct k_clock *kc = clockid_to_kclock(which_clock); 1063 struct k_clock *kc = clockid_to_kclock(which_clock);
1061 1064
1062 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) 1065 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
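The __lock_timer() rework above is the classic RCU lookup pattern: the idr is walked inside an RCU read-side section, the per-timer spinlock is taken while still inside it, and release_posix_timer() frees the k_itimer only through call_rcu(), so a lookup racing with deletion can never dereference freed memory. The shape of the pattern, as a sketch with illustrative names rather than the exact code:

    rcu_read_lock();
    obj = idr_find(&table, id);                     /* may race with removal */
    if (obj) {
            spin_lock_irqsave(&obj->lock, *flags);  /* still under RCU */
            if (obj->owner == current->signal) {
                    rcu_read_unlock();              /* obj->lock now pins it */
                    return obj;
            }
            spin_unlock_irqrestore(&obj->lock, *flags);
    }
    rcu_read_unlock();
    return NULL;    /* the freeing side defers kfree via call_rcu() */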
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6de9a8fc3417..87f4d24b55b0 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -125,12 +125,6 @@ config PM_DEBUG
125 code. This is helpful when debugging and reporting PM bugs, like 125 code. This is helpful when debugging and reporting PM bugs, like
126 suspend support. 126 suspend support.
127 127
128config PM_VERBOSE
129 bool "Verbose Power Management debugging"
130 depends on PM_DEBUG
131 ---help---
132 This option enables verbose messages from the Power Management code.
133
134config PM_ADVANCED_DEBUG 128config PM_ADVANCED_DEBUG
135 bool "Extra PM attributes in sysfs for low-level debugging/testing" 129 bool "Extra PM attributes in sysfs for low-level debugging/testing"
136 depends on PM_DEBUG 130 depends on PM_DEBUG
@@ -229,3 +223,7 @@ config PM_OPP
229 representing individual voltage domains and provides SOC 223 representing individual voltage domains and provides SOC
230 implementations a ready to use framework to manage OPPs. 224 implementations a ready to use framework to manage OPPs.
231 For more information, read <file:Documentation/power/opp.txt> 225 For more information, read <file:Documentation/power/opp.txt>
226
227config PM_RUNTIME_CLK
228 def_bool y
229 depends on PM_RUNTIME && HAVE_CLK
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 50aae660174d..8f7b1db1ece1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -25,7 +25,6 @@
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
27#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
28#include <asm/suspend.h>
29 28
30#include "power.h" 29#include "power.h"
31 30
@@ -55,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN;
55static const struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
56 55
57/** 56/**
58 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - Set the global hibernate operations.
59 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: Hibernation operations to use in subsequent hibernation transitions.
60 */ 59 */
61
62void hibernation_set_ops(const struct platform_hibernation_ops *ops) 60void hibernation_set_ops(const struct platform_hibernation_ops *ops)
63{ 61{
64 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 62 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
@@ -115,10 +113,9 @@ static int hibernation_test(int level) { return 0; }
115#endif /* !CONFIG_PM_DEBUG */ 113#endif /* !CONFIG_PM_DEBUG */
116 114
117/** 115/**
118 * platform_begin - tell the platform driver that we're starting 116 * platform_begin - Call platform to start hibernation.
119 * hibernation 117 * @platform_mode: Whether or not to use the platform driver.
120 */ 118 */
121
122static int platform_begin(int platform_mode) 119static int platform_begin(int platform_mode)
123{ 120{
124 return (platform_mode && hibernation_ops) ? 121 return (platform_mode && hibernation_ops) ?
@@ -126,10 +123,9 @@ static int platform_begin(int platform_mode)
126} 123}
127 124
128/** 125/**
129 * platform_end - tell the platform driver that we've entered the 126 * platform_end - Call platform to finish transition to the working state.
130 * working state 127 * @platform_mode: Whether or not to use the platform driver.
131 */ 128 */
132
133static void platform_end(int platform_mode) 129static void platform_end(int platform_mode)
134{ 130{
135 if (platform_mode && hibernation_ops) 131 if (platform_mode && hibernation_ops)
@@ -137,8 +133,11 @@ static void platform_end(int platform_mode)
137} 133}
138 134
139/** 135/**
140 * platform_pre_snapshot - prepare the machine for hibernation using the 136 * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
141 * platform driver if so configured and return an error code if it fails 137 * @platform_mode: Whether or not to use the platform driver.
138 *
139 * Use the platform driver to prepare the system for creating a hibernate image,
140 * if so configured, and return an error code if that fails.
142 */ 141 */
143 142
144static int platform_pre_snapshot(int platform_mode) 143static int platform_pre_snapshot(int platform_mode)
@@ -148,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode)
148} 147}
149 148
150/** 149/**
151 * platform_leave - prepare the machine for switching to the normal mode 150 * platform_leave - Call platform to prepare a transition to the working state.
152 * of operation using the platform driver (called with interrupts disabled) 151 * @platform_mode: Whether or not to use the platform driver.
152 *
153 * Use the platform driver prepare to prepare the machine for switching to the
154 * normal mode of operation.
155 *
156 * This routine is called on one CPU with interrupts disabled.
153 */ 157 */
154
155static void platform_leave(int platform_mode) 158static void platform_leave(int platform_mode)
156{ 159{
157 if (platform_mode && hibernation_ops) 160 if (platform_mode && hibernation_ops)
@@ -159,10 +162,14 @@ static void platform_leave(int platform_mode)
159} 162}
160 163
161/** 164/**
162 * platform_finish - switch the machine to the normal mode of operation 165 * platform_finish - Call platform to switch the system to the working state.
163 * using the platform driver (must be called after platform_prepare()) 166 * @platform_mode: Whether or not to use the platform driver.
167 *
168 * Use the platform driver to switch the machine to the normal mode of
169 * operation.
170 *
171 * This routine must be called after platform_prepare().
164 */ 172 */
165
166static void platform_finish(int platform_mode) 173static void platform_finish(int platform_mode)
167{ 174{
168 if (platform_mode && hibernation_ops) 175 if (platform_mode && hibernation_ops)
@@ -170,11 +177,15 @@ static void platform_finish(int platform_mode)
170} 177}
171 178
172/** 179/**
173 * platform_pre_restore - prepare the platform for the restoration from a 180 * platform_pre_restore - Prepare for hibernate image restoration.
174 * hibernation image. If the restore fails after this function has been 181 * @platform_mode: Whether or not to use the platform driver.
175 * called, platform_restore_cleanup() must be called. 182 *
183 * Use the platform driver to prepare the system for resume from a hibernation
184 * image.
185 *
186 * If the restore fails after this function has been called,
187 * platform_restore_cleanup() must be called.
176 */ 188 */
177
178static int platform_pre_restore(int platform_mode) 189static int platform_pre_restore(int platform_mode)
179{ 190{
180 return (platform_mode && hibernation_ops) ? 191 return (platform_mode && hibernation_ops) ?
@@ -182,12 +193,16 @@ static int platform_pre_restore(int platform_mode)
182} 193}
183 194
184/** 195/**
185 * platform_restore_cleanup - switch the platform to the normal mode of 196 * platform_restore_cleanup - Switch to the working state after failing restore.
186 * operation after a failing restore. If platform_pre_restore() has been 197 * @platform_mode: Whether or not to use the platform driver.
187 * called before the failing restore, this function must be called too, 198 *
188 * regardless of the result of platform_pre_restore(). 199 * Use the platform driver to switch the system to the normal mode of operation
200 * after a failing restore.
201 *
202 * If platform_pre_restore() has been called before the failing restore, this
203 * function must be called too, regardless of the result of
204 * platform_pre_restore().
189 */ 205 */
190
191static void platform_restore_cleanup(int platform_mode) 206static void platform_restore_cleanup(int platform_mode)
192{ 207{
193 if (platform_mode && hibernation_ops) 208 if (platform_mode && hibernation_ops)
@@ -195,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode)
195} 210}
196 211
197/** 212/**
198 * platform_recover - recover the platform from a failure to suspend 213 * platform_recover - Recover from a failure to suspend devices.
199 * devices. 214 * @platform_mode: Whether or not to use the platform driver.
200 */ 215 */
201
202static void platform_recover(int platform_mode) 216static void platform_recover(int platform_mode)
203{ 217{
204 if (platform_mode && hibernation_ops && hibernation_ops->recover) 218 if (platform_mode && hibernation_ops && hibernation_ops->recover)
@@ -206,13 +220,12 @@ static void platform_recover(int platform_mode)
206} 220}
207 221
208/** 222/**
209 * swsusp_show_speed - print the time elapsed between two events. 223 * swsusp_show_speed - Print time elapsed between two events during hibernation.
210 * @start: Starting event. 224 * @start: Starting event.
211 * @stop: Final event. 225 * @stop: Final event.
212 * @nr_pages - number of pages processed between @start and @stop 226 * @nr_pages: Number of memory pages processed between @start and @stop.
213 * @msg - introductory message to print 227 * @msg: Additional diagnostic message to print.
214 */ 228 */
215
216void swsusp_show_speed(struct timeval *start, struct timeval *stop, 229void swsusp_show_speed(struct timeval *start, struct timeval *stop,
217 unsigned nr_pages, char *msg) 230 unsigned nr_pages, char *msg)
218{ 231{
@@ -235,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
235} 248}
236 249
237/** 250/**
238 * create_image - freeze devices that need to be frozen with interrupts 251 * create_image - Create a hibernation image.
239 * off, create the hibernation image and thaw those devices. Control 252 * @platform_mode: Whether or not to use the platform driver.
240 * reappears in this routine after a restore. 253 *
254 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
255 * and execute the drivers' .thaw_noirq() callbacks.
256 *
257 * Control reappears in this routine after the subsequent restore.
241 */ 258 */
242
243static int create_image(int platform_mode) 259static int create_image(int platform_mode)
244{ 260{
245 int error; 261 int error;
246 262
247 error = arch_prepare_suspend();
248 if (error)
249 return error;
250
251 /* At this point, dpm_suspend_start() has been called, but *not*
252 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
253 * Otherwise, drivers for some devices (e.g. interrupt controllers)
254 * become desynchronized with the actual state of the hardware
255 * at resume time, and evil weirdness ensues.
256 */
257 error = dpm_suspend_noirq(PMSG_FREEZE); 263 error = dpm_suspend_noirq(PMSG_FREEZE);
258 if (error) { 264 if (error) {
259 printk(KERN_ERR "PM: Some devices failed to power down, " 265 printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -272,12 +278,7 @@ static int create_image(int platform_mode)
272 278
273 local_irq_disable(); 279 local_irq_disable();
274 280
275 error = sysdev_suspend(PMSG_FREEZE); 281 error = syscore_suspend();
276 if (!error) {
277 error = syscore_suspend();
278 if (error)
279 sysdev_resume();
280 }
281 if (error) { 282 if (error) {
282 printk(KERN_ERR "PM: Some system devices failed to power down, " 283 printk(KERN_ERR "PM: Some system devices failed to power down, "
283 "aborting hibernation\n"); 284 "aborting hibernation\n");
@@ -302,10 +303,6 @@ static int create_image(int platform_mode)
302 303
303 Power_up: 304 Power_up:
304 syscore_resume(); 305 syscore_resume();
305 sysdev_resume();
306 /* NOTE: dpm_resume_noirq() is just a resume() for devices
307 * that suspended with irqs off ... no overall powerup.
308 */
309 306
310 Enable_irqs: 307 Enable_irqs:
311 local_irq_enable(); 308 local_irq_enable();
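The dropped sysdev_suspend()/sysdev_resume() pair in create_image() reflects the move from sysdev class PM callbacks to syscore ops, which run on one CPU with interrupts disabled, exactly the context this function provides. A minimal registration sketch for a subsystem using the replacement interface (names are illustrative):

    #include <linux/init.h>
    #include <linux/syscore_ops.h>

    static int my_syscore_suspend(void)
    {
            /* save hardware state; interrupts are off, only one CPU runs */
            return 0;
    }

    static void my_syscore_resume(void)
    {
            /* restore the state saved in my_syscore_suspend() */
    }

    static struct syscore_ops my_syscore_ops = {
            .suspend = my_syscore_suspend,
            .resume  = my_syscore_resume,
    };

    static int __init my_subsys_init(void)
    {
            register_syscore_ops(&my_syscore_ops);
            return 0;
    }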
@@ -323,30 +320,32 @@ static int create_image(int platform_mode)
323} 320}
324 321
325/** 322/**
326 * hibernation_snapshot - quiesce devices and create the hibernation 323 * hibernation_snapshot - Quiesce devices and create a hibernation image.
327 * snapshot image. 324 * @platform_mode: If set, use platform driver to prepare for the transition.
328 * @platform_mode - if set, use the platform driver, if available, to
329 * prepare the platform firmware for the power transition.
330 * 325 *
331 * Must be called with pm_mutex held 326 * This routine must be called with pm_mutex held.
332 */ 327 */
333
334int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
335{ 329{
330 pm_message_t msg = PMSG_RECOVER;
336 int error; 331 int error;
337 332
338 error = platform_begin(platform_mode); 333 error = platform_begin(platform_mode);
339 if (error) 334 if (error)
340 goto Close; 335 goto Close;
341 336
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
342 /* Preallocate image memory before shutting down devices. */ 341 /* Preallocate image memory before shutting down devices. */
343 error = hibernate_preallocate_memory(); 342 error = hibernate_preallocate_memory();
344 if (error) 343 if (error)
345 goto Close; 344 goto Complete_devices;
346 345
347 suspend_console(); 346 suspend_console();
348 pm_restrict_gfp_mask(); 347 pm_restrict_gfp_mask();
349 error = dpm_suspend_start(PMSG_FREEZE); 348 error = dpm_suspend(PMSG_FREEZE);
350 if (error) 349 if (error)
351 goto Recover_platform; 350 goto Recover_platform;
352 351
@@ -364,13 +363,17 @@ int hibernation_snapshot(int platform_mode)
364 if (error || !in_suspend) 363 if (error || !in_suspend)
365 swsusp_free(); 364 swsusp_free();
366 365
367 dpm_resume_end(in_suspend ? 366 msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE;
368 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 367 dpm_resume(msg);
369 368
370 if (error || !in_suspend) 369 if (error || !in_suspend)
371 pm_restore_gfp_mask(); 370 pm_restore_gfp_mask();
372 371
373 resume_console(); 372 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg);
376
374 Close: 377 Close:
375 platform_end(platform_mode); 378 platform_end(platform_mode);
376 return error; 379 return error;
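The reshuffled error handling in hibernation_snapshot() works because dpm_suspend_start() is just dpm_prepare() followed by dpm_suspend(); calling the two halves separately lets the memory-preallocation failure path still reach dpm_complete() with the right pm_message_t. For reference, the combined helper (in drivers/base/power/main.c, not part of this diff, so treat the exact body as an assumption) is essentially:

    int dpm_suspend_start(pm_message_t state)
    {
            int error;

            error = dpm_prepare(state);          /* ->prepare() callbacks */
            if (!error)
                    error = dpm_suspend(state);  /* ->suspend() callbacks */
            return error;
    }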
@@ -381,13 +384,14 @@ int hibernation_snapshot(int platform_mode)
381} 384}
382 385
383/** 386/**
384 * resume_target_kernel - prepare devices that need to be suspended with 387 * resume_target_kernel - Restore system state from a hibernation image.
385 * interrupts off, restore the contents of highmem that have not been 388 * @platform_mode: Whether or not to use the platform driver.
386 * restored yet from the image and run the low level code that will restore 389 *
387 * the remaining contents of memory and switch to the just restored target 390 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
388 * kernel. 391 * highmem that have not been restored yet from the image and run the low-level
392 * code that will restore the remaining contents of memory and switch to the
393 * just restored target kernel.
389 */ 394 */
390
391static int resume_target_kernel(bool platform_mode) 395static int resume_target_kernel(bool platform_mode)
392{ 396{
393 int error; 397 int error;
@@ -409,40 +413,36 @@ static int resume_target_kernel(bool platform_mode)
409 413
410 local_irq_disable(); 414 local_irq_disable();
411 415
412 error = sysdev_suspend(PMSG_QUIESCE); 416 error = syscore_suspend();
413 if (!error) {
414 error = syscore_suspend();
415 if (error)
416 sysdev_resume();
417 }
418 if (error) 417 if (error)
419 goto Enable_irqs; 418 goto Enable_irqs;
420 419
421 /* We'll ignore saved state, but this gets preempt count (etc) right */
422 save_processor_state(); 420 save_processor_state();
423 error = restore_highmem(); 421 error = restore_highmem();
424 if (!error) { 422 if (!error) {
425 error = swsusp_arch_resume(); 423 error = swsusp_arch_resume();
426 /* 424 /*
427 * The code below is only ever reached in case of a failure. 425 * The code below is only ever reached in case of a failure.
428 * Otherwise execution continues at place where 426 * Otherwise, execution continues at the place where
429 * swsusp_arch_suspend() was called 427 * swsusp_arch_suspend() was called.
430 */ 428 */
431 BUG_ON(!error); 429 BUG_ON(!error);
432 /* This call to restore_highmem() undos the previous one */ 430 /*
431 * This call to restore_highmem() reverts the changes made by
432 * the previous one.
433 */
433 restore_highmem(); 434 restore_highmem();
434 } 435 }
435 /* 436 /*
436 * The only reason why swsusp_arch_resume() can fail is memory being 437 * The only reason why swsusp_arch_resume() can fail is memory being
437 * very tight, so we have to free it as soon as we can to avoid 438 * very tight, so we have to free it as soon as we can to avoid
438 * subsequent failures 439 * subsequent failures.
439 */ 440 */
440 swsusp_free(); 441 swsusp_free();
441 restore_processor_state(); 442 restore_processor_state();
442 touch_softlockup_watchdog(); 443 touch_softlockup_watchdog();
443 444
444 syscore_resume(); 445 syscore_resume();
445 sysdev_resume();
446 446
447 Enable_irqs: 447 Enable_irqs:
448 local_irq_enable(); 448 local_irq_enable();
@@ -459,14 +459,12 @@ static int resume_target_kernel(bool platform_mode)
459} 459}
460 460
461/** 461/**
462 * hibernation_restore - quiesce devices and restore the hibernation 462 * hibernation_restore - Quiesce devices and restore from a hibernation image.
463 * snapshot image. If successful, control returns in hibernation_snaphot() 463 * @platform_mode: If set, use platform driver to prepare for the transition.
464 * @platform_mode - if set, use the platform driver, if available, to
465 * prepare the platform firmware for the transition.
466 * 464 *
467 * Must be called with pm_mutex held 465 * This routine must be called with pm_mutex held. If it is successful, control
466 * reappears in the restored target kernel in hibernation_snaphot().
468 */ 467 */
469
470int hibernation_restore(int platform_mode) 468int hibernation_restore(int platform_mode)
471{ 469{
472 int error; 470 int error;
@@ -486,10 +484,8 @@ int hibernation_restore(int platform_mode)
486} 484}
487 485
488/** 486/**
489 * hibernation_platform_enter - enter the hibernation state using the 487 * hibernation_platform_enter - Power off the system using the platform driver.
490 * platform driver (if available)
491 */ 488 */
492
493int hibernation_platform_enter(void) 489int hibernation_platform_enter(void)
494{ 490{
495 int error; 491 int error;
@@ -528,7 +524,6 @@ int hibernation_platform_enter(void)
528 goto Platform_finish; 524 goto Platform_finish;
529 525
530 local_irq_disable(); 526 local_irq_disable();
531 sysdev_suspend(PMSG_HIBERNATE);
532 syscore_suspend(); 527 syscore_suspend();
533 if (pm_wakeup_pending()) { 528 if (pm_wakeup_pending()) {
534 error = -EAGAIN; 529 error = -EAGAIN;
@@ -541,7 +536,6 @@ int hibernation_platform_enter(void)
541 536
542 Power_up: 537 Power_up:
543 syscore_resume(); 538 syscore_resume();
544 sysdev_resume();
545 local_irq_enable(); 539 local_irq_enable();
546 enable_nonboot_cpus(); 540 enable_nonboot_cpus();
547 541
@@ -562,12 +556,12 @@ int hibernation_platform_enter(void)
562} 556}
563 557
564/** 558/**
565 * power_down - Shut the machine down for hibernation. 559 * power_down - Shut the machine down for hibernation.
566 * 560 *
567 * Use the platform driver, if configured so; otherwise try 561 * Use the platform driver, if configured, to put the system into the sleep
568 * to power off or reboot. 562 * state corresponding to hibernation, or try to power it off or reboot,
563 * depending on the value of hibernation_mode.
569 */ 564 */
570
571static void power_down(void) 565static void power_down(void)
572{ 566{
573 switch (hibernation_mode) { 567 switch (hibernation_mode) {
@@ -604,9 +598,8 @@ static int prepare_processes(void)
604} 598}
605 599
606/** 600/**
607 * hibernate - The granpappy of the built-in hibernation management 601 * hibernate - Carry out system hibernation, including saving the image.
608 */ 602 */
609
610int hibernate(void) 603int hibernate(void)
611{ 604{
612 int error; 605 int error;
@@ -684,17 +677,20 @@ int hibernate(void)
684 677
685 678
686/** 679/**
687 * software_resume - Resume from a saved image. 680 * software_resume - Resume from a saved hibernation image.
681 *
682 * This routine is called as a late initcall, when all devices have been
683 * discovered and initialized already.
688 * 684 *
689 * Called as a late_initcall (so all devices are discovered and 685 * The image reading code is called to see if there is a hibernation image
690 * initialized), we call swsusp to see if we have a saved image or not. 686 * available for reading. If that is the case, devices are quiesced and the
691 * If so, we quiesce devices, the restore the saved image. We will 687 * contents of memory is restored from the saved image.
692 * return above (in hibernate() ) if everything goes well.
693 * Otherwise, we fail gracefully and return to the normally
694 * scheduled program.
695 * 688 *
689 * If this is successful, control reappears in the restored target kernel in
690 * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine
691 * attempts to recover gracefully and make the kernel return to the normal mode
692 * of operation.
696 */ 693 */
697
698static int software_resume(void) 694static int software_resume(void)
699{ 695{
700 int error; 696 int error;
@@ -824,21 +820,17 @@ static const char * const hibernation_modes[] = {
824 [HIBERNATION_TESTPROC] = "testproc", 820 [HIBERNATION_TESTPROC] = "testproc",
825}; 821};
826 822
827/** 823/*
828 * disk - Control hibernation mode 824 * /sys/power/disk - Control hibernation mode.
829 *
830 * Suspend-to-disk can be handled in several ways. We have a few options
831 * for putting the system to sleep - using the platform driver (e.g. ACPI
832 * or other hibernation_ops), powering off the system or rebooting the
833 * system (for testing) as well as the two test modes.
834 * 825 *
835 * The system can support 'platform', and that is known a priori (and 826 * Hibernation can be handled in several ways. There are a few different ways
836 * encoded by the presence of hibernation_ops). However, the user may 827 * to put the system into the sleep state: using the platform driver (e.g. ACPI
837 * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the 828 * or other hibernation_ops), powering it off or rebooting it (for testing
838 * test modes, 'test' or 'testproc'. 829 * mostly), or using one of the two available test modes.
839 * 830 *
840 * show() will display what the mode is currently set to. 831 * The sysfs file /sys/power/disk provides an interface for selecting the
841 * store() will accept one of 832 * hibernation mode to use. Reading from this file causes the available modes
833 * to be printed. There are 5 modes that can be supported:
842 * 834 *
843 * 'platform' 835 * 'platform'
844 * 'shutdown' 836 * 'shutdown'
@@ -846,8 +838,14 @@ static const char * const hibernation_modes[] = {
846 * 'test' 838 * 'test'
847 * 'testproc' 839 * 'testproc'
848 * 840 *
849 * It will only change to 'platform' if the system 841 * If a platform hibernation driver is in use, 'platform' will be supported
850 * supports it (as determined by having hibernation_ops). 842 * and will be used by default. Otherwise, 'shutdown' will be used by default.
843 * The selected option (i.e. the one corresponding to the current value of
844 * hibernation_mode) is enclosed in square brackets.
845 *
846 * To select a given hibernation mode it is necessary to write the mode's
847 * string representation (as returned by reading from /sys/power/disk) back
848 * into /sys/power/disk.
851 */ 849 */
852 850
853static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, 851static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -880,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
880 return buf-start; 878 return buf-start;
881} 879}
882 880
883
884static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, 881static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
885 const char *buf, size_t n) 882 const char *buf, size_t n)
886{ 883{
@@ -982,10 +979,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att
982 979
983power_attr(image_size); 980power_attr(image_size);
984 981
982static ssize_t reserved_size_show(struct kobject *kobj,
983 struct kobj_attribute *attr, char *buf)
984{
985 return sprintf(buf, "%lu\n", reserved_size);
986}
987
988static ssize_t reserved_size_store(struct kobject *kobj,
989 struct kobj_attribute *attr,
990 const char *buf, size_t n)
991{
992 unsigned long size;
993
994 if (sscanf(buf, "%lu", &size) == 1) {
995 reserved_size = size;
996 return n;
997 }
998
999 return -EINVAL;
1000}
1001
1002power_attr(reserved_size);
1003
985static struct attribute * g[] = { 1004static struct attribute * g[] = {
986 &disk_attr.attr, 1005 &disk_attr.attr,
987 &resume_attr.attr, 1006 &resume_attr.attr,
988 &image_size_attr.attr, 1007 &image_size_attr.attr,
1008 &reserved_size_attr.attr,
989 NULL, 1009 NULL,
990}; 1010};
991 1011
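
The /sys/power/disk semantics documented in the hunk above, together with the new /sys/power/reserved_size attribute, can be exercised directly from userspace. The following is a minimal illustrative C sketch (editorial example, not part of the patch); it assumes sysfs is mounted at /sys and that the process may read the power attributes.

#include <stdio.h>

/* Dump /sys/power/disk (available modes, current one in brackets) and
 * /sys/power/reserved_size (bytes kept free for driver allocations
 * during hibernation). Writing a mode name back into /sys/power/disk,
 * e.g. "echo shutdown > /sys/power/disk", selects that mode. */
static void dump_attr(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	if (f)
		fclose(f);
}

int main(void)
{
	dump_attr("/sys/power/disk");
	dump_attr("/sys/power/reserved_size");
	return 0;
}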
diff --git a/kernel/power/main.c b/kernel/power/main.c
index de9aef8742f4..2981af4ce7cb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -337,6 +337,7 @@ static int __init pm_init(void)
337 if (error) 337 if (error)
338 return error; 338 return error;
339 hibernate_image_size_init(); 339 hibernate_image_size_init();
340 hibernate_reserved_size_init();
340 power_kobj = kobject_create_and_add("power", NULL); 341 power_kobj = kobject_create_and_add("power", NULL);
341 if (!power_kobj) 342 if (!power_kobj)
342 return -ENOMEM; 343 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 03634be55f62..9a00a0a26280 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -15,6 +15,7 @@ struct swsusp_info {
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */ 17/* kernel/power/snapshot.c */
18extern void __init hibernate_reserved_size_init(void);
18extern void __init hibernate_image_size_init(void); 19extern void __init hibernate_image_size_init(void);
19 20
20#ifdef CONFIG_ARCH_HIBERNATION_HEADER 21#ifdef CONFIG_ARCH_HIBERNATION_HEADER
@@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void);
55 56
56#else /* !CONFIG_HIBERNATION */ 57#else /* !CONFIG_HIBERNATION */
57 58
59static inline void hibernate_reserved_size_init(void) {}
58static inline void hibernate_image_size_init(void) {} 60static inline void hibernate_image_size_init(void) {}
59#endif /* !CONFIG_HIBERNATION */ 61#endif /* !CONFIG_HIBERNATION */
60 62
@@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \
72 74
73/* Preferred image size in bytes (default 500 MB) */ 75/* Preferred image size in bytes (default 500 MB) */
74extern unsigned long image_size; 76extern unsigned long image_size;
77/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
78extern unsigned long reserved_size;
75extern int in_suspend; 79extern int in_suspend;
76extern dev_t swsusp_resume_device; 80extern dev_t swsusp_resume_device;
77extern sector_t swsusp_resume_block; 81extern sector_t swsusp_resume_block;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ca0aacc24874..06efa54f93d6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -41,16 +41,28 @@ static void swsusp_set_page_forbidden(struct page *);
41static void swsusp_unset_page_forbidden(struct page *); 41static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Number of bytes to reserve for memory allocations made by device drivers
45 * from their ->freeze() and ->freeze_noirq() callbacks so that they don't
46 * cause image creation to fail (tunable via /sys/power/reserved_size).
47 */
48unsigned long reserved_size;
49
50void __init hibernate_reserved_size_init(void)
51{
52 reserved_size = SPARE_PAGES * PAGE_SIZE;
53}
54
55/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 56 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, the image creating code will do its best to 57 * When it is set to N, swsusp will do its best to ensure the image
46 * ensure the image size will not exceed N bytes, but if that is 58 * size will not exceed N bytes, but if that is impossible, it will
47 * impossible, it will try to create the smallest image possible. 59 * try to create the smallest image possible.
48 */ 60 */
49unsigned long image_size; 61unsigned long image_size;
50 62
51void __init hibernate_image_size_init(void) 63void __init hibernate_image_size_init(void)
52{ 64{
53 image_size = (totalram_pages / 3) * PAGE_SIZE; 65 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
54} 66}
55 67
56/* List of PBEs needed for restoring the pages that were allocated before 68/* List of PBEs needed for restoring the pages that were allocated before
@@ -1199,7 +1211,11 @@ static void free_unnecessary_pages(void)
1199 to_free_highmem = alloc_highmem - save; 1211 to_free_highmem = alloc_highmem - save;
1200 } else { 1212 } else {
1201 to_free_highmem = 0; 1213 to_free_highmem = 0;
1202 to_free_normal -= save - alloc_highmem; 1214 save -= alloc_highmem;
1215 if (to_free_normal > save)
1216 to_free_normal -= save;
1217 else
1218 to_free_normal = 0;
1203 } 1219 }
1204 1220
1205 memory_bm_position_reset(&copy_bm); 1221 memory_bm_position_reset(&copy_bm);
@@ -1263,11 +1279,13 @@ static unsigned long minimum_image_size(unsigned long saveable)
1263 * frame in use. We also need a number of page frames to be free during 1279 * frame in use. We also need a number of page frames to be free during
1264 * hibernation for allocations made while saving the image and for device 1280 * hibernation for allocations made while saving the image and for device
1265 * drivers, in case they need to allocate memory from their hibernation 1281 * drivers, in case they need to allocate memory from their hibernation
1266 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, 1282 * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
1267 * respectively, both of which are rough estimates). To make this happen, we 1283 * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through
1268 * compute the total number of available page frames and allocate at least 1284 * /sys/power/reserved_size, respectively). To make this happen, we compute the
1285 * total number of available page frames and allocate at least
1269 * 1286 *
1270 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES 1287 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
1288 * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
1271 * 1289 *
1272 * of them, which corresponds to the maximum size of a hibernation image. 1290 * of them, which corresponds to the maximum size of a hibernation image.
1273 * 1291 *
@@ -1322,7 +1340,8 @@ int hibernate_preallocate_memory(void)
1322 count -= totalreserve_pages; 1340 count -= totalreserve_pages;
1323 1341
1324 /* Compute the maximum number of saveable pages to leave in memory. */ 1342 /* Compute the maximum number of saveable pages to leave in memory. */
1325 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1343 max_size = (count - (size + PAGES_FOR_IO)) / 2
1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
1326 /* Compute the desired number of image pages specified by image_size. */ 1345 /* Compute the desired number of image pages specified by image_size. */
1327 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1346 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1328 if (size > max_size) 1347 if (size > max_size)
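
To make the updated bound concrete, here is a small standalone C sketch that mirrors the max_size expression from the hunk above. All numbers in it are placeholders chosen purely for illustration (PAGES_FOR_IO is a kernel constant whose value is not quoted here), so the printed result only shows how reserved_size feeds into the bound.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	/* Placeholder figures, for illustration only. */
	unsigned long count = 250000;			/* available page frames */
	unsigned long meta = 1000;			/* metadata pages ("size") */
	unsigned long pages_for_io = 1024;		/* stand-in for PAGES_FOR_IO */
	unsigned long page_size = 4096;
	unsigned long reserved_size = 4UL << 20;	/* 4 MB via /sys/power/reserved_size */

	/* Mirrors: max_size = (count - (size + PAGES_FOR_IO)) / 2
	 *			- 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); */
	unsigned long max_size = (count - (meta + pages_for_io)) / 2
				 - 2 * DIV_ROUND_UP(reserved_size, page_size);

	printf("maximum image size: %lu pages\n", max_size);
	return 0;
}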
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6275970b2189..1c41ba215419 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -163,19 +163,13 @@ static int suspend_enter(suspend_state_t state)
163 arch_suspend_disable_irqs(); 163 arch_suspend_disable_irqs();
164 BUG_ON(!irqs_disabled()); 164 BUG_ON(!irqs_disabled());
165 165
166 error = sysdev_suspend(PMSG_SUSPEND); 166 error = syscore_suspend();
167 if (!error) {
168 error = syscore_suspend();
169 if (error)
170 sysdev_resume();
171 }
172 if (!error) { 167 if (!error) {
173 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 168 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
174 error = suspend_ops->enter(state); 169 error = suspend_ops->enter(state);
175 events_check_enabled = false; 170 events_check_enabled = false;
176 } 171 }
177 syscore_resume(); 172 syscore_resume();
178 sysdev_resume();
179 } 173 }
180 174
181 arch_suspend_enable_irqs(); 175 arch_suspend_enable_irqs();
@@ -226,7 +220,7 @@ int suspend_devices_and_enter(suspend_state_t state)
226 if (suspend_test(TEST_DEVICES)) 220 if (suspend_test(TEST_DEVICES))
227 goto Recover_platform; 221 goto Recover_platform;
228 222
229 suspend_enter(state); 223 error = suspend_enter(state);
230 224
231 Resume_devices: 225 Resume_devices:
232 suspend_test_start(); 226 suspend_test_start();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 7d02d33be699..42ddbc6f0de6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
113 if (error) 113 if (error)
114 pm_notifier_call_chain(PM_POST_RESTORE); 114 pm_notifier_call_chain(PM_POST_RESTORE);
115 } 115 }
116 if (error) 116 if (error) {
117 free_basic_memory_bitmaps();
117 atomic_inc(&snapshot_device_available); 118 atomic_inc(&snapshot_device_available);
119 }
118 data->frozen = 0; 120 data->frozen = 0;
119 data->ready = 0; 121 data->ready = 0;
120 data->platform_support = 0; 122 data->platform_support = 0;
diff --git a/kernel/printk.c b/kernel/printk.c
index da8ca817eae3..35185392173f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
31#include <linux/smp.h> 31#include <linux/smp.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
35#include <linux/kexec.h> 36#include <linux/kexec.h>
36#include <linux/kdb.h> 37#include <linux/kdb.h>
@@ -167,46 +168,74 @@ void log_buf_kexec_setup(void)
167} 168}
168#endif 169#endif
169 170
171/* requested log_buf_len from kernel cmdline */
172static unsigned long __initdata new_log_buf_len;
173
174/* save requested log_buf_len since it's too early to process it */
170static int __init log_buf_len_setup(char *str) 175static int __init log_buf_len_setup(char *str)
171{ 176{
172 unsigned size = memparse(str, &str); 177 unsigned size = memparse(str, &str);
173 unsigned long flags;
174 178
175 if (size) 179 if (size)
176 size = roundup_pow_of_two(size); 180 size = roundup_pow_of_two(size);
177 if (size > log_buf_len) { 181 if (size > log_buf_len)
178 unsigned start, dest_idx, offset; 182 new_log_buf_len = size;
179 char *new_log_buf;
180 183
181 new_log_buf = alloc_bootmem(size); 184 return 0;
182 if (!new_log_buf) { 185}
183 printk(KERN_WARNING "log_buf_len: allocation failed\n"); 186early_param("log_buf_len", log_buf_len_setup);
184 goto out;
185 }
186 187
187 spin_lock_irqsave(&logbuf_lock, flags); 188void __init setup_log_buf(int early)
188 log_buf_len = size; 189{
189 log_buf = new_log_buf; 190 unsigned long flags;
190 191 unsigned start, dest_idx, offset;
191 offset = start = min(con_start, log_start); 192 char *new_log_buf;
192 dest_idx = 0; 193 int free;
193 while (start != log_end) { 194
194 log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; 195 if (!new_log_buf_len)
195 start++; 196 return;
196 dest_idx++; 197
197 } 198 if (early) {
198 log_start -= offset; 199 unsigned long mem;
199 con_start -= offset;
200 log_end -= offset;
201 spin_unlock_irqrestore(&logbuf_lock, flags);
202 200
203 printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); 201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
202 if (mem == MEMBLOCK_ERROR)
203 return;
204 new_log_buf = __va(mem);
205 } else {
206 new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
204 } 207 }
205out:
206 return 1;
207}
208 208
209__setup("log_buf_len=", log_buf_len_setup); 209 if (unlikely(!new_log_buf)) {
210 pr_err("log_buf_len: %ld bytes not available\n",
211 new_log_buf_len);
212 return;
213 }
214
215 spin_lock_irqsave(&logbuf_lock, flags);
216 log_buf_len = new_log_buf_len;
217 log_buf = new_log_buf;
218 new_log_buf_len = 0;
219 free = __LOG_BUF_LEN - log_end;
220
221 offset = start = min(con_start, log_start);
222 dest_idx = 0;
223 while (start != log_end) {
224 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
225
226 log_buf[dest_idx] = __log_buf[log_idx_mask];
227 start++;
228 dest_idx++;
229 }
230 log_start -= offset;
231 con_start -= offset;
232 log_end -= offset;
233 spin_unlock_irqrestore(&logbuf_lock, flags);
234
235 pr_info("log_buf_len: %d\n", log_buf_len);
236 pr_info("early log buf free: %d(%d%%)\n",
237 free, (free * 100) / __LOG_BUF_LEN);
238}
210 239
211#ifdef CONFIG_BOOT_PRINTK_DELAY 240#ifdef CONFIG_BOOT_PRINTK_DELAY
212 241
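
Because log_buf_len_setup() now only records the request (rounded up to a power of two) in new_log_buf_len, the size eventually allocated by setup_log_buf() is always a power of two; a boot option such as log_buf_len=3M therefore yields a 4 MiB buffer. A small standalone sketch of that rounding, using a local stand-in for the kernel's roundup_pow_of_two():

#include <stdio.h>

/* Local stand-in for the kernel's roundup_pow_of_two(). */
static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned long requested = 3UL * 1024 * 1024;	/* log_buf_len=3M */

	printf("requested %lu bytes -> buffer of %lu bytes\n",
	       requested, roundup_pow_of_two(requested));
	return 0;
}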
diff --git a/kernel/profile.c b/kernel/profile.c
index 66f841b7fbd3..961b389fe52f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -126,11 +126,9 @@ int __ref profile_init(void)
126 if (prof_buffer) 126 if (prof_buffer)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vzalloc(buffer_bytes);
130 if (prof_buffer) { 130 if (prof_buffer)
131 memset(prof_buffer, 0, buffer_bytes);
132 return 0; 131 return 0;
133 }
134 132
135 free_cpumask_var(prof_cpu_mask); 133 free_cpumask_var(prof_cpu_mask);
136 return -ENOMEM; 134 return -ENOMEM;
@@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void)
305 mutex_unlock(&profile_flip_mutex); 303 mutex_unlock(&profile_flip_mutex);
306} 304}
307 305
308void profile_hits(int type, void *__pc, unsigned int nr_hits) 306static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
309{ 307{
310 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 308 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
311 int i, j, cpu; 309 int i, j, cpu;
312 struct profile_hit *hits; 310 struct profile_hit *hits;
313 311
314 if (prof_on != type || !prof_buffer)
315 return;
316 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); 312 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
317 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 313 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
318 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 314 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
@@ -419,16 +415,20 @@ out_free:
419#define profile_discard_flip_buffers() do { } while (0) 415#define profile_discard_flip_buffers() do { } while (0)
420#define profile_cpu_callback NULL 416#define profile_cpu_callback NULL
421 417
422void profile_hits(int type, void *__pc, unsigned int nr_hits) 418static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
423{ 419{
424 unsigned long pc; 420 unsigned long pc;
425
426 if (prof_on != type || !prof_buffer)
427 return;
428 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 421 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
429 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 422 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
430} 423}
431#endif /* !CONFIG_SMP */ 424#endif /* !CONFIG_SMP */
425
426void profile_hits(int type, void *__pc, unsigned int nr_hits)
427{
428 if (prof_on != type || !prof_buffer)
429 return;
430 do_profile_hits(type, __pc, nr_hits);
431}
432EXPORT_SYMBOL_GPL(profile_hits); 432EXPORT_SYMBOL_GPL(profile_hits);
433 433
434void profile_tick(int type) 434void profile_tick(int type)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dc7ab65f3b36..2df115790cd9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -38,35 +38,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
38 child->parent = new_parent; 38 child->parent = new_parent;
39} 39}
40 40
41/* 41/**
42 * Turn a tracing stop into a normal stop now, since with no tracer there 42 * __ptrace_unlink - unlink ptracee and restore its execution state
43 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a 43 * @child: ptracee to be unlinked
44 * signal sent that would resume the child, but didn't because it was in
45 * TASK_TRACED, resume it now.
46 * Requires that irqs be disabled.
47 */
48static void ptrace_untrace(struct task_struct *child)
49{
50 spin_lock(&child->sighand->siglock);
51 if (task_is_traced(child)) {
52 /*
53 * If the group stop is completed or in progress,
54 * this thread was already counted as stopped.
55 */
56 if (child->signal->flags & SIGNAL_STOP_STOPPED ||
57 child->signal->group_stop_count)
58 __set_task_state(child, TASK_STOPPED);
59 else
60 signal_wake_up(child, 1);
61 }
62 spin_unlock(&child->sighand->siglock);
63}
64
65/*
66 * unptrace a task: move it back to its original parent and
67 * remove it from the ptrace list.
68 * 44 *
69 * Must be called with the tasklist lock write-held. 45 * Remove @child from the ptrace list, move it back to the original parent,
46 * and restore the execution state so that it conforms to the group stop
47 * state.
48 *
49 * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
50 * exiting. For PTRACE_DETACH, unless the ptracee has been killed between
51 * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
52 * If the ptracer is exiting, the ptracee can be in any state.
53 *
54 * After detach, the ptracee should be in a state which conforms to the
55 * group stop. If the group is stopped or in the process of stopping, the
56 * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
57 * up from TASK_TRACED.
58 *
59 * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
60 * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
61 * to but in the opposite direction of what happens while attaching to a
62 * stopped task. However, in this direction, the intermediate RUNNING
63 * state is not hidden even from the current ptracer and if it immediately
64 * re-attaches and performs a WNOHANG wait(2), it may fail.
65 *
66 * CONTEXT:
67 * write_lock_irq(tasklist_lock)
70 */ 68 */
71void __ptrace_unlink(struct task_struct *child) 69void __ptrace_unlink(struct task_struct *child)
72{ 70{
@@ -76,8 +74,27 @@ void __ptrace_unlink(struct task_struct *child)
76 child->parent = child->real_parent; 74 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_entry); 75 list_del_init(&child->ptrace_entry);
78 76
79 if (task_is_traced(child)) 77 spin_lock(&child->sighand->siglock);
80 ptrace_untrace(child); 78
79 /*
80 * Reinstate GROUP_STOP_PENDING if group stop is in effect and
81 * @child isn't dead.
82 */
83 if (!(child->flags & PF_EXITING) &&
84 (child->signal->flags & SIGNAL_STOP_STOPPED ||
85 child->signal->group_stop_count))
86 child->group_stop |= GROUP_STOP_PENDING;
87
88 /*
89 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
90 * @child in the butt. Note that @resume should be used iff @child
91 * is in TASK_TRACED; otherwise, we might unduly disrupt
92 * TASK_KILLABLE sleeps.
93 */
94 if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child))
95 signal_wake_up(child, task_is_traced(child));
96
97 spin_unlock(&child->sighand->siglock);
81} 98}
82 99
83/* 100/*
@@ -96,16 +113,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
96 */ 113 */
97 read_lock(&tasklist_lock); 114 read_lock(&tasklist_lock);
98 if ((child->ptrace & PT_PTRACED) && child->parent == current) { 115 if ((child->ptrace & PT_PTRACED) && child->parent == current) {
99 ret = 0;
100 /* 116 /*
101 * child->sighand can't be NULL, release_task() 117 * child->sighand can't be NULL, release_task()
102 * does ptrace_unlink() before __exit_signal(). 118 * does ptrace_unlink() before __exit_signal().
103 */ 119 */
104 spin_lock_irq(&child->sighand->siglock); 120 spin_lock_irq(&child->sighand->siglock);
105 if (task_is_stopped(child)) 121 WARN_ON_ONCE(task_is_stopped(child));
106 child->state = TASK_TRACED; 122 if (task_is_traced(child) || kill)
107 else if (!task_is_traced(child) && !kill) 123 ret = 0;
108 ret = -ESRCH;
109 spin_unlock_irq(&child->sighand->siglock); 124 spin_unlock_irq(&child->sighand->siglock);
110 } 125 }
111 read_unlock(&tasklist_lock); 126 read_unlock(&tasklist_lock);
@@ -169,6 +184,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
169 184
170static int ptrace_attach(struct task_struct *task) 185static int ptrace_attach(struct task_struct *task)
171{ 186{
187 bool wait_trap = false;
172 int retval; 188 int retval;
173 189
174 audit_ptrace(task); 190 audit_ptrace(task);
@@ -208,12 +224,42 @@ static int ptrace_attach(struct task_struct *task)
208 __ptrace_link(task, current); 224 __ptrace_link(task, current);
209 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 225 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
210 226
227 spin_lock(&task->sighand->siglock);
228
229 /*
230 * If the task is already STOPPED, set GROUP_STOP_PENDING and
231 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
232 * will be cleared if the child completes the transition or any
233 * event which clears the group stop states happens. We'll wait
234 * for the transition to complete before returning from this
235 * function.
236 *
237 * This hides STOPPED -> RUNNING -> TRACED transition from the
238 * attaching thread but a different thread in the same group can
239 * still observe the transient RUNNING state. IOW, if another
240 * thread's WNOHANG wait(2) on the stopped tracee races against
241 * ATTACH, the wait(2) may fail due to the transient RUNNING.
242 *
243 * The following task_is_stopped() test is safe as both transitions
244 * in and out of STOPPED are protected by siglock.
245 */
246 if (task_is_stopped(task)) {
247 task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
248 signal_wake_up(task, 1);
249 wait_trap = true;
250 }
251
252 spin_unlock(&task->sighand->siglock);
253
211 retval = 0; 254 retval = 0;
212unlock_tasklist: 255unlock_tasklist:
213 write_unlock_irq(&tasklist_lock); 256 write_unlock_irq(&tasklist_lock);
214unlock_creds: 257unlock_creds:
215 mutex_unlock(&task->signal->cred_guard_mutex); 258 mutex_unlock(&task->signal->cred_guard_mutex);
216out: 259out:
260 if (wait_trap)
261 wait_event(current->signal->wait_chldexit,
262 !(task->group_stop & GROUP_STOP_TRAPPING));
217 return retval; 263 return retval;
218} 264}
219 265
@@ -316,8 +362,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
316 if (child->ptrace) { 362 if (child->ptrace) {
317 child->exit_code = data; 363 child->exit_code = data;
318 dead = __ptrace_detach(current, child); 364 dead = __ptrace_detach(current, child);
319 if (!child->exit_state)
320 wake_up_state(child, TASK_TRACED | TASK_STOPPED);
321 } 365 }
322 write_unlock_irq(&tasklist_lock); 366 write_unlock_irq(&tasklist_lock);
323 367
@@ -518,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request,
518 } 562 }
519 563
520 child->exit_code = data; 564 child->exit_code = data;
521 wake_up_process(child); 565 wake_up_state(child, __TASK_TRACED);
522 566
523 return 0; 567 return 0;
524} 568}
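
From userspace, the attach path above keeps the familiar attach-then-wait contract; the new GROUP_STOP_TRAPPING wait only ensures that an already-stopped tracee has finished its STOPPED -> TRACED transition before PTRACE_ATTACH returns. A minimal illustrative sketch using standard ptrace(2)/waitpid(2) calls (editorial example, not part of the patch):

#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(int argc, char **argv)
{
	pid_t pid;
	int status;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	pid = (pid_t)atol(argv[1]);

	/* Attach; the kernel arranges for the tracee to end up in
	 * TASK_TRACED (sending SIGSTOP if it was running). */
	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
		perror("PTRACE_ATTACH");
		return 1;
	}

	/* A blocking wait reliably reaps the stop notification. As the
	 * comments above note, a WNOHANG wait issued from another thread
	 * can still race with the transient RUNNING state and fail. */
	if (waitpid(pid, &status, 0) == -1)
		perror("waitpid");

	ptrace(PTRACE_DETACH, pid, NULL, NULL);
	return 0;
}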
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f3240e987928..7784bd216b6a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
142 * Ensure that queued callbacks are all executed. 142 * Ensure that queued callbacks are all executed.
143 * If we detect that we are nested in a RCU read-side critical 143 * If we detect that we are nested in a RCU read-side critical
144 * section, we should simply fail, otherwise we would deadlock. 144 * section, we should simply fail, otherwise we would deadlock.
145 * In !PREEMPT configurations, there is no way to tell if we are
146 * in a RCU read-side critical section or not, so we never
147 * attempt any fixup and just print a warning.
145 */ 148 */
149#ifndef CONFIG_PREEMPT
150 WARN_ON_ONCE(1);
151 return 0;
152#endif
146 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 153 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
147 irqs_disabled()) { 154 irqs_disabled()) {
148 WARN_ON(1); 155 WARN_ON_ONCE(1);
149 return 0; 156 return 0;
150 } 157 }
151 rcu_barrier(); 158 rcu_barrier();
@@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
184 * Ensure that queued callbacks are all executed. 191 * Ensure that queued callbacks are all executed.
185 * If we detect that we are nested in a RCU read-side critical 192 * If we detect that we are nested in a RCU read-side critical
186 * section, we should simply fail, otherwise we would deadlock. 193 * section, we should simply fail, otherwise we would deadlock.
194 * In !PREEMPT configurations, there is no way to tell if we are
195 * in a RCU read-side critical section or not, so we never
196 * attempt any fixup and just print a warning.
187 */ 197 */
198#ifndef CONFIG_PREEMPT
199 WARN_ON_ONCE(1);
200 return 0;
201#endif
188 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 202 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
189 irqs_disabled()) { 203 irqs_disabled()) {
190 WARN_ON(1); 204 WARN_ON_ONCE(1);
191 return 0; 205 return 0;
192 } 206 }
193 rcu_barrier(); 207 rcu_barrier();
@@ -214,15 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 228 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 229 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 230 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether 231 * In !PREEMPT configurations, there is no way to tell if we are
218 * or not we are in an RCU read-side critical section 232 * in a RCU read-side critical section or not, so we never
219 * exists only in the preemptible RCU implementations 233 * attempt any fixup and just print a warning.
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
222 */ 234 */
235#ifndef CONFIG_PREEMPT
236 WARN_ON_ONCE(1);
237 return 0;
238#endif
223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 239 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
224 irqs_disabled()) { 240 irqs_disabled()) {
225 WARN_ON(1); 241 WARN_ON_ONCE(1);
226 return 0; 242 return 0;
227 } 243 }
228 rcu_barrier(); 244 rcu_barrier();
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0c343b9a46d5..7bbac7d0f5ab 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -35,15 +35,16 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/prefetch.h>
38 39
39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ 40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40static struct task_struct *rcu_kthread_task; 41static struct task_struct *rcu_kthread_task;
41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42static unsigned long have_rcu_kthread_work; 43static unsigned long have_rcu_kthread_work;
43static void invoke_rcu_kthread(void);
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void);
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg); 49static int rcu_kthread(void *arg);
49static void __call_rcu(struct rcu_head *head, 50static void __call_rcu(struct rcu_head *head,
@@ -79,36 +80,45 @@ void rcu_exit_nohz(void)
79#endif /* #ifdef CONFIG_NO_HZ */ 80#endif /* #ifdef CONFIG_NO_HZ */
80 81
81/* 82/*
82 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). 83 * Helper function for rcu_sched_qs() and rcu_bh_qs().
83 * Also disable irqs to avoid confusion due to interrupt handlers 84 * Also irqs are disabled to avoid confusion due to interrupt handlers
84 * invoking call_rcu(). 85 * invoking call_rcu().
85 */ 86 */
86static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 87static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
87{ 88{
88 unsigned long flags;
89
90 local_irq_save(flags);
91 if (rcp->rcucblist != NULL && 89 if (rcp->rcucblist != NULL &&
92 rcp->donetail != rcp->curtail) { 90 rcp->donetail != rcp->curtail) {
93 rcp->donetail = rcp->curtail; 91 rcp->donetail = rcp->curtail;
94 local_irq_restore(flags);
95 return 1; 92 return 1;
96 } 93 }
97 local_irq_restore(flags);
98 94
99 return 0; 95 return 0;
100} 96}
101 97
102/* 98/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
103 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
104 * are at it, given that any rcu quiescent state is also an rcu_bh 110 * are at it, given that any rcu quiescent state is also an rcu_bh
105 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 111 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
106 */ 112 */
107void rcu_sched_qs(int cpu) 113void rcu_sched_qs(int cpu)
108{ 114{
115 unsigned long flags;
116
117 local_irq_save(flags);
109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
110 rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 rcu_qsctr_help(&rcu_bh_ctrlblk))
111 invoke_rcu_kthread(); 120 invoke_rcu_kthread();
121 local_irq_restore(flags);
112} 122}
113 123
114/* 124/*
@@ -116,8 +126,12 @@ void rcu_sched_qs(int cpu)
116 */ 126 */
117void rcu_bh_qs(int cpu) 127void rcu_bh_qs(int cpu)
118{ 128{
129 unsigned long flags;
130
131 local_irq_save(flags);
119 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 132 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 133 invoke_rcu_kthread();
134 local_irq_restore(flags);
121} 135}
122 136
123/* 137/*
@@ -167,7 +181,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
167 prefetch(next); 181 prefetch(next);
168 debug_rcu_head_unqueue(list); 182 debug_rcu_head_unqueue(list);
169 local_bh_disable(); 183 local_bh_disable();
170 list->func(list); 184 __rcu_reclaim(list);
171 local_bh_enable(); 185 local_bh_enable();
172 list = next; 186 list = next;
173 RCU_TRACE(cb_count++); 187 RCU_TRACE(cb_count++);
@@ -208,20 +222,6 @@ static int rcu_kthread(void *arg)
208} 222}
209 223
210/* 224/*
211 * Wake up rcu_kthread() to process callbacks now eligible for invocation
212 * or to boost readers.
213 */
214static void invoke_rcu_kthread(void)
215{
216 unsigned long flags;
217
218 local_irq_save(flags);
219 have_rcu_kthread_work = 1;
220 wake_up(&rcu_kthread_wq);
221 local_irq_restore(flags);
222}
223
224/*
225 * Wait for a grace period to elapse. But it is illegal to invoke 225 * Wait for a grace period to elapse. But it is illegal to invoke
226 * synchronize_sched() from within an RCU read-side critical section. 226 * synchronize_sched() from within an RCU read-side critical section.
227 * Therefore, any legal call to synchronize_sched() is a quiescent 227 * Therefore, any legal call to synchronize_sched() is a quiescent
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3cb8e362e883..f259c676195f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk {
100 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
101 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST 102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */ 103 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */ 104#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE 105#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods; 106 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST 107#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted; 108 unsigned long n_tasks_boosted;
109 /* Total number of tasks boosted. */
110 unsigned long n_exp_boosts; 110 unsigned long n_exp_boosts;
111 /* Number of tasks boosted for expedited GP. */
111 unsigned long n_normal_boosts; 112 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks; 113 /* Number of tasks boosted for normal GP. */
113 unsigned long n_normal_balk_gp_tasks; 114 unsigned long n_balk_blkd_tasks;
114 unsigned long n_normal_balk_boost_tasks; 115 /* Refused to boost: no blocked tasks. */
115 unsigned long n_normal_balk_boosted; 116 unsigned long n_balk_exp_gp_tasks;
116 unsigned long n_normal_balk_notyet; 117 /* Refused to boost: nothing blocking GP. */
117 unsigned long n_normal_balk_nos; 118 unsigned long n_balk_boost_tasks;
118 unsigned long n_exp_balk_blkd_tasks; 119 /* Refused to boost: already boosting. */
119 unsigned long n_exp_balk_nos; 120 unsigned long n_balk_notyet;
121 /* Refused to boost: not yet time. */
122 unsigned long n_balk_nos;
123 /* Refused to boost: not sure why, though. */
124 /* This can happen due to race conditions. */
120#endif /* #ifdef CONFIG_RCU_BOOST */ 125#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */ 126#endif /* #ifdef CONFIG_RCU_TRACE */
122}; 127};
@@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t)
201 206
202#ifdef CONFIG_RCU_BOOST 207#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void); 208static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */ 209#endif /* #ifdef CONFIG_RCU_BOOST */
206 210
207/* 211/*
@@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m)
219 "N."[!rcu_preempt_ctrlblk.gp_tasks], 223 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]); 224 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST 225#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=", 226 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]); 227 " ",
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) { 228 "B."[!rcu_preempt_ctrlblk.boost_tasks],
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted, 229 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts, 230 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts, 231 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff), 232 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); 233 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", 234 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
247 "normal balk", 235 " balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, 236 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, 237 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, 238 rcu_preempt_ctrlblk.n_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted, 239 rcu_preempt_ctrlblk.n_balk_notyet,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet, 240 rcu_preempt_ctrlblk.n_balk_nos);
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */ 241#endif /* #ifdef CONFIG_RCU_BOOST */
258} 242}
259 243
@@ -271,25 +255,59 @@ static int rcu_boost(void)
271{ 255{
272 unsigned long flags; 256 unsigned long flags;
273 struct rt_mutex mtx; 257 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t; 258 struct task_struct *t;
259 struct list_head *tb;
276 260
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL) 261 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
262 rcu_preempt_ctrlblk.exp_tasks == NULL)
278 return 0; /* Nothing to boost. */ 263 return 0; /* Nothing to boost. */
264
279 raw_local_irq_save(flags); 265 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++; 266
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, 267 /*
282 rcu_node_entry); 268 * Recheck with irqs disabled: all tasks in need of boosting
283 np = rcu_next_node_entry(t); 269 * might exit their RCU read-side critical sections on their own
270 * if we are preempted just before disabling irqs.
271 */
272 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
273 rcu_preempt_ctrlblk.exp_tasks == NULL) {
274 raw_local_irq_restore(flags);
275 return 0;
276 }
277
278 /*
279 * Preferentially boost tasks blocking expedited grace periods.
280 * This cannot starve the normal grace periods because a second
281 * expedited grace period must boost all blocked tasks, including
282 * those blocking the pre-existing normal grace period.
283 */
284 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
285 tb = rcu_preempt_ctrlblk.exp_tasks;
286 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
287 } else {
288 tb = rcu_preempt_ctrlblk.boost_tasks;
289 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
290 }
291 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
292
293 /*
294 * We boost task t by manufacturing an rt_mutex that appears to
295 * be held by task t. We leave a pointer to that rt_mutex where
296 * task t can find it, and task t will release the mutex when it
297 * exits its outermost RCU read-side critical section. Then
298 * simply acquiring this artificial rt_mutex will boost task
299 * t's priority. (Thanks to tglx for suggesting this approach!)
300 */
301 t = container_of(tb, struct task_struct, rcu_node_entry);
284 rt_mutex_init_proxy_locked(&mtx, t); 302 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx; 303 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; 304 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags); 305 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx); 306 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); 307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
290 rcu_preempt_ctrlblk.boosted_this_gp++; 308
291 rt_mutex_unlock(&mtx); 309 return rcu_preempt_ctrlblk.boost_tasks != NULL ||
292 return rcu_preempt_ctrlblk.boost_tasks != NULL; 310 rcu_preempt_ctrlblk.exp_tasks != NULL;
293} 311}
294 312
295/* 313/*
@@ -304,42 +322,25 @@ static int rcu_boost(void)
304 */ 322 */
305static int rcu_initiate_boost(void) 323static int rcu_initiate_boost(void)
306{ 324{
307 if (!rcu_preempt_blocked_readers_cgp()) { 325 if (!rcu_preempt_blocked_readers_cgp() &&
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); 326 rcu_preempt_ctrlblk.exp_tasks == NULL) {
327 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
309 return 0; 328 return 0;
310 } 329 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL && 330 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
312 rcu_preempt_ctrlblk.boost_tasks == NULL && 331 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 && 332 rcu_preempt_ctrlblk.boost_tasks == NULL &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { 333 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; 334 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread(); 337 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else 338 } else
319 RCU_TRACE(rcu_initiate_boost_trace()); 339 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1; 340 return 1;
321} 341}
322 342
323/* 343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
343 344
344/* 345/*
345 * Do priority-boost accounting for the start of a new grace period. 346 * Do priority-boost accounting for the start of a new grace period.
@@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void)
347static void rcu_preempt_boost_start_gp(void) 348static void rcu_preempt_boost_start_gp(void)
348{ 349{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; 350 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352} 351}
353 352
354#else /* #ifdef CONFIG_RCU_BOOST */ 353#else /* #ifdef CONFIG_RCU_BOOST */
@@ -372,13 +371,6 @@ static int rcu_initiate_boost(void)
372} 371}
373 372
374/* 373/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start. 374 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */ 375 */
384static void rcu_preempt_boost_start_gp(void) 376static void rcu_preempt_boost_start_gp(void)
@@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void)
418 if (!rcu_preempt_gp_in_progress()) 410 if (!rcu_preempt_gp_in_progress())
419 return; 411 return;
420 /* 412 /*
421 * Check up on boosting. If there are no readers blocking the 413 * Check up on boosting. If there are readers blocking the
422 * current grace period, leave. 414 * current grace period, leave.
423 */ 415 */
424 if (rcu_initiate_boost()) 416 if (rcu_initiate_boost())
@@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
578 empty = !rcu_preempt_blocked_readers_cgp(); 570 empty = !rcu_preempt_blocked_readers_cgp();
579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 571 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
580 np = rcu_next_node_entry(t); 572 np = rcu_next_node_entry(t);
581 list_del(&t->rcu_node_entry); 573 list_del_init(&t->rcu_node_entry);
582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 574 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
583 rcu_preempt_ctrlblk.gp_tasks = np; 575 rcu_preempt_ctrlblk.gp_tasks = np;
584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 576 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
@@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) 579 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np; 580 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */ 581#endif /* #ifdef CONFIG_RCU_BOOST */
590 INIT_LIST_HEAD(&t->rcu_node_entry);
591 582
592 /* 583 /*
593 * If this was the last task on the current list, and if 584 * If this was the last task on the current list, and if
@@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void)
812 rpcp->exp_tasks = rpcp->blkd_tasks.next; 803 rpcp->exp_tasks = rpcp->blkd_tasks.next;
813 if (rpcp->exp_tasks == &rpcp->blkd_tasks) 804 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
814 rpcp->exp_tasks = NULL; 805 rpcp->exp_tasks = NULL;
815 local_irq_restore(flags);
816 806
817 /* Wait for tail of ->blkd_tasks list to drain. */ 807 /* Wait for tail of ->blkd_tasks list to drain. */
818 if (rcu_preempted_readers_exp()) 808 if (!rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost(); 809 local_irq_restore(flags);
810 else {
811 rcu_initiate_boost();
812 local_irq_restore(flags);
820 wait_event(sync_rcu_preempt_exp_wq, 813 wait_event(sync_rcu_preempt_exp_wq,
821 !rcu_preempted_readers_exp()); 814 !rcu_preempted_readers_exp());
815 }
822 816
823 /* Clean up and exit. */ 817 /* Clean up and exit. */
824 barrier(); /* ensure expedited GP seen before counter increment. */ 818 barrier(); /* ensure expedited GP seen before counter increment. */
@@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void)
931 925
932static void rcu_initiate_boost_trace(void) 926static void rcu_initiate_boost_trace(void)
933{ 927{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL) 928 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; 929 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
930 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
931 rcu_preempt_ctrlblk.exp_tasks == NULL)
932 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL) 933 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; 934 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) 935 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++; 936 rcu_preempt_ctrlblk.n_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else 937 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++; 938 rcu_preempt_ctrlblk.n_balk_nos++;
952} 939}
953 940
954#endif /* #ifdef CONFIG_RCU_BOOST */ 941#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c224da41890c..2e138db03382 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -131,7 +131,7 @@ struct rcu_torture {
131 131
132static LIST_HEAD(rcu_torture_freelist); 132static LIST_HEAD(rcu_torture_freelist);
133static struct rcu_torture __rcu *rcu_torture_current; 133static struct rcu_torture __rcu *rcu_torture_current;
134static long rcu_torture_current_version; 134static unsigned long rcu_torture_current_version;
135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
136static DEFINE_SPINLOCK(rcu_torture_lock); 136static DEFINE_SPINLOCK(rcu_torture_lock);
137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror;
146static atomic_t n_rcu_torture_error; 146static atomic_t n_rcu_torture_error;
147static long n_rcu_torture_boost_ktrerror; 147static long n_rcu_torture_boost_ktrerror;
148static long n_rcu_torture_boost_rterror; 148static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_allocerror;
150static long n_rcu_torture_boost_afferror;
151static long n_rcu_torture_boost_failure; 149static long n_rcu_torture_boost_failure;
152static long n_rcu_torture_boosts; 150static long n_rcu_torture_boosts;
153static long n_rcu_torture_timers; 151static long n_rcu_torture_timers;
@@ -163,11 +161,11 @@ static int stutter_pause_test;
163#endif 161#endif
164int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
165 163
166#ifdef CONFIG_RCU_BOOST 164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
167#define rcu_can_boost() 1 165#define rcu_can_boost() 1
168#else /* #ifdef CONFIG_RCU_BOOST */ 166#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169#define rcu_can_boost() 0 167#define rcu_can_boost() 0
170#endif /* #else #ifdef CONFIG_RCU_BOOST */ 168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
171 169
172static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
173DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -751,6 +749,7 @@ static int rcu_torture_boost(void *arg)
751 n_rcu_torture_boost_rterror++; 749 n_rcu_torture_boost_rterror++;
752 } 750 }
753 751
752 init_rcu_head_on_stack(&rbi.rcu);
754 /* Each pass through the following loop does one boost-test cycle. */ 753 /* Each pass through the following loop does one boost-test cycle. */
755 do { 754 do {
756 /* Wait for the next test interval. */ 755 /* Wait for the next test interval. */
@@ -810,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
810 809
811 /* Clean up and exit. */ 810 /* Clean up and exit. */
812 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 813 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 814 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 815 schedule_timeout_uninterruptible(1);
@@ -886,7 +886,7 @@ rcu_torture_writer(void *arg)
886 old_rp->rtort_pipe_count++; 886 old_rp->rtort_pipe_count++;
887 cur_ops->deferred_free(old_rp); 887 cur_ops->deferred_free(old_rp);
888 } 888 }
889 rcu_torture_current_version++; 889 rcutorture_record_progress(++rcu_torture_current_version);
890 oldbatch = cur_ops->completed(); 890 oldbatch = cur_ops->completed();
891 rcu_stutter_wait("rcu_torture_writer"); 891 rcu_stutter_wait("rcu_torture_writer");
892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -1066,8 +1066,8 @@ rcu_torture_printk(char *page)
1066 } 1066 }
1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1068 cnt += sprintf(&page[cnt], 1068 cnt += sprintf(&page[cnt],
1069 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1069 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1070 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " 1070 "rtmbe: %d rtbke: %ld rtbre: %ld "
1071 "rtbf: %ld rtb: %ld nt: %ld", 1071 "rtbf: %ld rtb: %ld nt: %ld",
1072 rcu_torture_current, 1072 rcu_torture_current,
1073 rcu_torture_current_version, 1073 rcu_torture_current_version,
@@ -1078,16 +1078,12 @@ rcu_torture_printk(char *page)
1078 atomic_read(&n_rcu_torture_mberror), 1078 atomic_read(&n_rcu_torture_mberror),
1079 n_rcu_torture_boost_ktrerror, 1079 n_rcu_torture_boost_ktrerror,
1080 n_rcu_torture_boost_rterror, 1080 n_rcu_torture_boost_rterror,
1081 n_rcu_torture_boost_allocerror,
1082 n_rcu_torture_boost_afferror,
1083 n_rcu_torture_boost_failure, 1081 n_rcu_torture_boost_failure,
1084 n_rcu_torture_boosts, 1082 n_rcu_torture_boosts,
1085 n_rcu_torture_timers); 1083 n_rcu_torture_timers);
1086 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1084 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1087 n_rcu_torture_boost_ktrerror != 0 || 1085 n_rcu_torture_boost_ktrerror != 0 ||
1088 n_rcu_torture_boost_rterror != 0 || 1086 n_rcu_torture_boost_rterror != 0 ||
1089 n_rcu_torture_boost_allocerror != 0 ||
1090 n_rcu_torture_boost_afferror != 0 ||
1091 n_rcu_torture_boost_failure != 0) 1087 n_rcu_torture_boost_failure != 0)
1092 cnt += sprintf(&page[cnt], " !!!"); 1088 cnt += sprintf(&page[cnt], " !!!");
1093 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1089 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
@@ -1331,6 +1327,7 @@ rcu_torture_cleanup(void)
1331 int i; 1327 int i;
1332 1328
1333 mutex_lock(&fullstop_mutex); 1329 mutex_lock(&fullstop_mutex);
1330 rcutorture_record_test_transition();
1334 if (fullstop == FULLSTOP_SHUTDOWN) { 1331 if (fullstop == FULLSTOP_SHUTDOWN) {
1335 printk(KERN_WARNING /* but going down anyway, so... */ 1332 printk(KERN_WARNING /* but going down anyway, so... */
1336 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1333 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
@@ -1486,8 +1483,6 @@ rcu_torture_init(void)
1486 atomic_set(&n_rcu_torture_error, 0); 1483 atomic_set(&n_rcu_torture_error, 0);
1487 n_rcu_torture_boost_ktrerror = 0; 1484 n_rcu_torture_boost_ktrerror = 0;
1488 n_rcu_torture_boost_rterror = 0; 1485 n_rcu_torture_boost_rterror = 0;
1489 n_rcu_torture_boost_allocerror = 0;
1490 n_rcu_torture_boost_afferror = 0;
1491 n_rcu_torture_boost_failure = 0; 1486 n_rcu_torture_boost_failure = 0;
1492 n_rcu_torture_boosts = 0; 1487 n_rcu_torture_boosts = 0;
1493 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1488 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1624,6 +1619,7 @@ rcu_torture_init(void)
1624 } 1619 }
1625 } 1620 }
1626 register_reboot_notifier(&rcutorture_shutdown_nb); 1621 register_reboot_notifier(&rcutorture_shutdown_nb);
1622 rcutorture_record_test_transition();
1627 mutex_unlock(&fullstop_mutex); 1623 mutex_unlock(&fullstop_mutex);
1628 return 0; 1624 return 0;
1629 1625
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8e..ba06207b1dd3 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -36,7 +36,7 @@
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <asm/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
@@ -47,6 +47,9 @@
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h> 49#include <linux/kernel_stat.h>
50#include <linux/wait.h>
51#include <linux/kthread.h>
52#include <linux/prefetch.h>
50 53
51#include "rcutree.h" 54#include "rcutree.h"
52 55
@@ -79,10 +82,67 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
79struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
80DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
81 84
85static struct rcu_state *rcu_state;
86
87/*
88 * The rcu_scheduler_active variable transitions from zero to one just
89 * before the first task is spawned. So when this variable is zero, RCU
90 * can assume that there is but one task, allowing RCU to (for example)
91 * optimized synchronize_sched() to a simple barrier(). When this variable
92 * is one, RCU must actually do all the hard work required to detect real
93 * grace periods. This variable is also used to suppress boot-time false
94 * positives from lockdep-RCU error checking.
95 */
82int rcu_scheduler_active __read_mostly; 96int rcu_scheduler_active __read_mostly;
83EXPORT_SYMBOL_GPL(rcu_scheduler_active); 97EXPORT_SYMBOL_GPL(rcu_scheduler_active);
84 98
85/* 99/*
100 * The rcu_scheduler_fully_active variable transitions from zero to one
101 * during the early_initcall() processing, which is after the scheduler
102 * is capable of creating new tasks. So RCU processing (for example,
103 * creating tasks for RCU priority boosting) must be delayed until after
104 * rcu_scheduler_fully_active transitions from zero to one. We also
105 * currently delay invocation of any RCU callbacks until after this point.
106 *
107 * It might later prove better for people registering RCU callbacks during
108 * early boot to take responsibility for these callbacks, but one step at
109 * a time.
110 */
111static int rcu_scheduler_fully_active __read_mostly;
112
113#ifdef CONFIG_RCU_BOOST
114
115/*
116 * Control variables for per-CPU and per-rcu_node kthreads. These
117 * handle all flavors of RCU.
118 */
119static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
120DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
121DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
122DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
123DEFINE_PER_CPU(char, rcu_cpu_has_work);
124
125#endif /* #ifdef CONFIG_RCU_BOOST */
126
127static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
128static void invoke_rcu_core(void);
129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
130
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
133/*
134 * Track the rcutorture test sequence number and the update version
135 * number within a given test. The rcutorture_testseq is incremented
136 * on every rcutorture module load and unload, so has an odd value
137 * when a test is running. The rcutorture_vernum is set to zero
138 * when rcutorture starts and is incremented on each rcutorture update.
139 * These variables enable correlating rcutorture output with the
140 * RCU tracing information.
141 */
142unsigned long rcutorture_testseq;
143unsigned long rcutorture_vernum;
144
145/*
86 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 146 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
87 * permit this function to be invoked without holding the root rcu_node 147 * permit this function to be invoked without holding the root rcu_node
88 * structure's ->lock, but of course results can be subject to change. 148 * structure's ->lock, but of course results can be subject to change.
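The two boot-phase flags introduced above gate how much work RCU performs before and shortly after the scheduler comes up. A minimal userspace-flavored sketch of the first flag's effect (all names here are illustrative stand-ins, not the kernel's):

	/* Illustrative sketch: while the "scheduler active" flag is still zero
	 * there is exactly one task, so no reader can run concurrently and a
	 * grace-period wait can collapse to a compiler barrier. */
	static int scheduler_active;		/* cf. rcu_scheduler_active */

	static void wait_for_grace_period(void (*slow_path)(void))
	{
		if (!scheduler_active) {
			__asm__ __volatile__("" ::: "memory");	/* barrier() analogue */
			return;				/* nothing to wait for */
		}
		slow_path();			/* real grace-period machinery */
	}

The second flag, rcu_scheduler_fully_active, plays the complementary role for work that needs kthreads, such as the priority-boosting tasks declared in the same hunk.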
@@ -124,11 +184,12 @@ void rcu_note_context_switch(int cpu)
124 rcu_sched_qs(cpu); 184 rcu_sched_qs(cpu);
125 rcu_preempt_note_context_switch(cpu); 185 rcu_preempt_note_context_switch(cpu);
126} 186}
187EXPORT_SYMBOL_GPL(rcu_note_context_switch);
127 188
128#ifdef CONFIG_NO_HZ 189#ifdef CONFIG_NO_HZ
129DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 190DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
130 .dynticks_nesting = 1, 191 .dynticks_nesting = 1,
131 .dynticks = 1, 192 .dynticks = ATOMIC_INIT(1),
132}; 193};
133#endif /* #ifdef CONFIG_NO_HZ */ 194#endif /* #ifdef CONFIG_NO_HZ */
134 195
@@ -140,10 +201,8 @@ module_param(blimit, int, 0);
140module_param(qhimark, int, 0); 201module_param(qhimark, int, 0);
141module_param(qlowmark, int, 0); 202module_param(qlowmark, int, 0);
142 203
143#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 204int rcu_cpu_stall_suppress __read_mostly;
144int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
145module_param(rcu_cpu_stall_suppress, int, 0644); 205module_param(rcu_cpu_stall_suppress, int, 0644);
146#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
147 206
148static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 207static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
149static int rcu_pending(int cpu); 208static int rcu_pending(int cpu);
@@ -176,6 +235,31 @@ void rcu_bh_force_quiescent_state(void)
176EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 235EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
177 236
178/* 237/*
238 * Record the number of times rcutorture tests have been initiated and
239 * terminated. This information allows the debugfs tracing stats to be
240 * correlated to the rcutorture messages, even when the rcutorture module
241 * is being repeatedly loaded and unloaded. In other words, we cannot
242 * store this state in rcutorture itself.
243 */
244void rcutorture_record_test_transition(void)
245{
246 rcutorture_testseq++;
247 rcutorture_vernum = 0;
248}
249EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
250
251/*
252 * Record the number of writer passes through the current rcutorture test.
253 * This is also used to correlate debugfs tracing stats with the rcutorture
254 * messages.
255 */
256void rcutorture_record_progress(unsigned long vernum)
257{
258 rcutorture_vernum++;
259}
260EXPORT_SYMBOL_GPL(rcutorture_record_progress);
261
262/*
179 * Force a quiescent state for RCU-sched. 263 * Force a quiescent state for RCU-sched.
180 */ 264 */
181void rcu_sched_force_quiescent_state(void) 265void rcu_sched_force_quiescent_state(void)
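The two record functions above exist only so that debugfs tracing output can be lined up with rcutorture runs across module load/unload cycles; as the earlier comment notes, rcutorture_testseq is odd exactly while a test is in flight. A small, self-contained sketch of how a log post-processor might exploit that convention (the helper is hypothetical, not part of the patch):

	#include <stdio.h>

	/* Hypothetical post-processing helper: a trace record stamped with this
	 * test-sequence number was emitted while an rcutorture test was running
	 * if and only if the number is odd (incremented on load and on unload). */
	static int rcutorture_test_running(unsigned long testseq)
	{
		return testseq & 0x1;
	}

	int main(void)
	{
		unsigned long seq[] = { 0, 1, 2, 3 };
		unsigned int i;

		for (i = 0; i < sizeof(seq) / sizeof(seq[0]); i++)
			printf("testseq %lu: %s\n", seq[i],
			       rcutorture_test_running(seq[i]) ? "running" : "idle");
		return 0;
	}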
@@ -234,8 +318,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
234 return 1; 318 return 1;
235 } 319 }
236 320
237 /* If preemptable RCU, no point in sending reschedule IPI. */ 321 /* If preemptible RCU, no point in sending reschedule IPI. */
238 if (rdp->preemptable) 322 if (rdp->preemptible)
239 return 0; 323 return 0;
240 324
241 /* The CPU is online, so send it a reschedule IPI. */ 325 /* The CPU is online, so send it a reschedule IPI. */
@@ -264,13 +348,25 @@ void rcu_enter_nohz(void)
264 unsigned long flags; 348 unsigned long flags;
265 struct rcu_dynticks *rdtp; 349 struct rcu_dynticks *rdtp;
266 350
267 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
268 local_irq_save(flags); 351 local_irq_save(flags);
269 rdtp = &__get_cpu_var(rcu_dynticks); 352 rdtp = &__get_cpu_var(rcu_dynticks);
270 rdtp->dynticks++; 353 if (--rdtp->dynticks_nesting) {
271 rdtp->dynticks_nesting--; 354 local_irq_restore(flags);
272 WARN_ON_ONCE(rdtp->dynticks & 0x1); 355 return;
356 }
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
273 local_irq_restore(flags); 362 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
274} 370}
275 371
276/* 372/*
@@ -286,11 +382,16 @@ void rcu_exit_nohz(void)
286 382
287 local_irq_save(flags); 383 local_irq_save(flags);
288 rdtp = &__get_cpu_var(rcu_dynticks); 384 rdtp = &__get_cpu_var(rcu_dynticks);
289 rdtp->dynticks++; 385 if (rdtp->dynticks_nesting++) {
290 rdtp->dynticks_nesting++; 386 local_irq_restore(flags);
291 WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); 387 return;
388 }
389 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
390 atomic_inc(&rdtp->dynticks);
391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
392 smp_mb__after_atomic_inc(); /* See above. */
393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
292 local_irq_restore(flags); 394 local_irq_restore(flags);
293 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
294} 395}
295 396
296/** 397/**
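The rewritten rcu_enter_nohz()/rcu_exit_nohz() collapse the old dynticks/dynticks_nmi pair into one atomic counter plus a per-CPU nesting count: the counter is even while the CPU is dyntick-idle and odd otherwise, and only the outermost transition increments it. A compact C11 analogue of that protocol (assuming a starting state of nesting == 1 and counter == 1, as in the kernel's initializer; names are illustrative):

	#include <stdatomic.h>
	#include <assert.h>

	/* Userspace model of the new per-CPU dynticks state.  The seq_cst atomic
	 * increment stands in for the atomic_inc() bracketed by
	 * smp_mb__before/after_atomic_inc() in the patch. */
	struct dynticks_model {
		int nesting;			/* process/irq nesting depth   */
		atomic_long dynticks;		/* even == idle, odd == active */
	};

	static void enter_idle(struct dynticks_model *d)	/* cf. rcu_enter_nohz() */
	{
		if (--d->nesting)
			return;			/* still inside an outer section */
		atomic_fetch_add(&d->dynticks, 1);
		assert(!(atomic_load(&d->dynticks) & 0x1));	/* now even: idle */
	}

	static void exit_idle(struct dynticks_model *d)		/* cf. rcu_exit_nohz() */
	{
		if (d->nesting++)
			return;			/* was already non-idle */
		atomic_fetch_add(&d->dynticks, 1);
		assert(atomic_load(&d->dynticks) & 0x1);	/* now odd: active */
	}

Because nested interrupts only adjust the nesting count, rcu_irq_enter()/rcu_irq_exit() can simply call rcu_exit_nohz()/rcu_enter_nohz(), as the later hunks in this file do.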
@@ -304,11 +405,15 @@ void rcu_nmi_enter(void)
304{ 405{
305 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 406 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
306 407
307 if (rdtp->dynticks & 0x1) 408 if (rdtp->dynticks_nmi_nesting == 0 &&
409 (atomic_read(&rdtp->dynticks) & 0x1))
308 return; 410 return;
309 rdtp->dynticks_nmi++; 411 rdtp->dynticks_nmi_nesting++;
310 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); 412 smp_mb__before_atomic_inc(); /* Force delay from prior write. */
311 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 413 atomic_inc(&rdtp->dynticks);
414 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
415 smp_mb__after_atomic_inc(); /* See above. */
416 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
312} 417}
313 418
314/** 419/**
@@ -322,11 +427,14 @@ void rcu_nmi_exit(void)
322{ 427{
323 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 428 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
324 429
325 if (rdtp->dynticks & 0x1) 430 if (rdtp->dynticks_nmi_nesting == 0 ||
431 --rdtp->dynticks_nmi_nesting != 0)
326 return; 432 return;
327 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 433 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
328 rdtp->dynticks_nmi++; 434 smp_mb__before_atomic_inc(); /* See above. */
329 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); 435 atomic_inc(&rdtp->dynticks);
436 smp_mb__after_atomic_inc(); /* Force delay to next write. */
437 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
330} 438}
331 439
332/** 440/**
@@ -337,13 +445,7 @@ void rcu_nmi_exit(void)
337 */ 445 */
338void rcu_irq_enter(void) 446void rcu_irq_enter(void)
339{ 447{
340 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 448 rcu_exit_nohz();
341
342 if (rdtp->dynticks_nesting++)
343 return;
344 rdtp->dynticks++;
345 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
346 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
347} 449}
348 450
349/** 451/**
@@ -355,18 +457,7 @@ void rcu_irq_enter(void)
355 */ 457 */
356void rcu_irq_exit(void) 458void rcu_irq_exit(void)
357{ 459{
358 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 460 rcu_enter_nohz();
359
360 if (--rdtp->dynticks_nesting)
361 return;
362 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
363 rdtp->dynticks++;
364 WARN_ON_ONCE(rdtp->dynticks & 0x1);
365
366 /* If the interrupt queued a callback, get out of dyntick mode. */
367 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
368 __this_cpu_read(rcu_bh_data.nxtlist))
369 set_need_resched();
370} 461}
371 462
372#ifdef CONFIG_SMP 463#ifdef CONFIG_SMP
@@ -378,19 +469,8 @@ void rcu_irq_exit(void)
378 */ 469 */
379static int dyntick_save_progress_counter(struct rcu_data *rdp) 470static int dyntick_save_progress_counter(struct rcu_data *rdp)
380{ 471{
381 int ret; 472 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
382 int snap; 473 return 0;
383 int snap_nmi;
384
385 snap = rdp->dynticks->dynticks;
386 snap_nmi = rdp->dynticks->dynticks_nmi;
387 smp_mb(); /* Order sampling of snap with end of grace period. */
388 rdp->dynticks_snap = snap;
389 rdp->dynticks_nmi_snap = snap_nmi;
390 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
391 if (ret)
392 rdp->dynticks_fqs++;
393 return ret;
394} 474}
395 475
396/* 476/*
@@ -401,16 +481,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
401 */ 481 */
402static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
403{ 483{
404 long curr; 484 unsigned long curr;
405 long curr_nmi; 485 unsigned long snap;
406 long snap;
407 long snap_nmi;
408 486
409 curr = rdp->dynticks->dynticks; 487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
410 snap = rdp->dynticks_snap; 488 snap = (unsigned long)rdp->dynticks_snap;
411 curr_nmi = rdp->dynticks->dynticks_nmi;
412 snap_nmi = rdp->dynticks_nmi_snap;
413 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
414 489
415 /* 490 /*
416 * If the CPU passed through or entered a dynticks idle phase with 491 * If the CPU passed through or entered a dynticks idle phase with
@@ -420,8 +495,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
420 * read-side critical section that started before the beginning 495 * read-side critical section that started before the beginning
421 * of the current RCU grace period. 496 * of the current RCU grace period.
422 */ 497 */
423 if ((curr != snap || (curr & 0x1) == 0) && 498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
424 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
425 rdp->dynticks_fqs++; 499 rdp->dynticks_fqs++;
426 return 1; 500 return 1;
427 } 501 }
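With that counter protocol, rcu_implicit_dynticks_qs() can prove a remote CPU passed through a quiescent state from just two samples: the current value is even (the CPU is idle right now) or it has advanced by at least two since the grace period began (the CPU went idle at least once in between). A minimal sketch of that test, with the wrap-safe comparison written out (assumed here to match the ULONG_CMP_GE() definition the patch relies on):

	#include <limits.h>

	#define CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))	/* wrap-safe (a) >= (b) */

	/* snap: dynticks value sampled at grace-period start
	 *       (dyntick_save_progress_counter)
	 * curr: dynticks value sampled now (rcu_implicit_dynticks_qs) */
	static int cpu_passed_quiescent_state(unsigned long snap, unsigned long curr)
	{
		return (curr & 0x1) == 0 ||	/* even: CPU is dyntick-idle now    */
		       CMP_GE(curr, snap + 2);	/* moved >= 2: went idle since snap */
	}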
@@ -450,8 +524,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 524
451#endif /* #else #ifdef CONFIG_NO_HZ */ 525#endif /* #else #ifdef CONFIG_NO_HZ */
452 526
453#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
454
455int rcu_cpu_stall_suppress __read_mostly; 527int rcu_cpu_stall_suppress __read_mostly;
456 528
457static void record_gp_stall_check_time(struct rcu_state *rsp) 529static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -537,21 +609,24 @@ static void print_cpu_stall(struct rcu_state *rsp)
537 609
538static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 610static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 611{
540 long delta; 612 unsigned long j;
613 unsigned long js;
541 struct rcu_node *rnp; 614 struct rcu_node *rnp;
542 615
543 if (rcu_cpu_stall_suppress) 616 if (rcu_cpu_stall_suppress)
544 return; 617 return;
545 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); 618 j = ACCESS_ONCE(jiffies);
619 js = ACCESS_ONCE(rsp->jiffies_stall);
546 rnp = rdp->mynode; 620 rnp = rdp->mynode;
547 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { 621 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
548 622
549 /* We haven't checked in, so go dump stack. */ 623 /* We haven't checked in, so go dump stack. */
550 print_cpu_stall(rsp); 624 print_cpu_stall(rsp);
551 625
552 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { 626 } else if (rcu_gp_in_progress(rsp) &&
627 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
553 628
554 /* They had two time units to dump stack, so complain. */ 629 /* They had a few time units to dump stack, so complain. */
555 print_other_cpu_stall(rsp); 630 print_other_cpu_stall(rsp);
556 } 631 }
557} 632}
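check_cpu_stall() now keeps both timestamps as unsigned jiffies and compares them with ULONG_CMP_GE() rather than a signed delta, so the stall check keeps working across a jiffies wrap. A self-contained demonstration of the difference (the macro body is written here to mirror the kernel's; the program around it is only illustrative):

	#include <stdio.h>
	#include <limits.h>

	#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

	int main(void)
	{
		unsigned long js = ULONG_MAX - 5;	/* stall deadline armed pre-wrap */
		unsigned long j  = 10;			/* "now", after jiffies wrapped  */

		printf("naive j >= js      : %d\n", j >= js);		/* 0: misses the stall */
		printf("ULONG_CMP_GE(j, js): %d\n", ULONG_CMP_GE(j, js));	/* 1: still fires */
		return 0;
	}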
@@ -587,26 +662,6 @@ static void __init check_cpu_stall_init(void)
587 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); 662 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
588} 663}
589 664
590#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
591
592static void record_gp_stall_check_time(struct rcu_state *rsp)
593{
594}
595
596static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
597{
598}
599
600void rcu_cpu_stall_reset(void)
601{
602}
603
604static void __init check_cpu_stall_init(void)
605{
606}
607
608#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
609
610/* 665/*
611 * Update CPU-local rcu_data state to record the newly noticed grace period. 666 * Update CPU-local rcu_data state to record the newly noticed grace period.
612 * This is used both when we started the grace period and when we notice 667 * This is used both when we started the grace period and when we notice
@@ -809,6 +864,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
809 rnp->completed = rsp->completed; 864 rnp->completed = rsp->completed;
810 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
811 rcu_start_gp_per_cpu(rsp, rnp, rdp); 866 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp);
812 raw_spin_unlock_irqrestore(&rnp->lock, flags); 868 raw_spin_unlock_irqrestore(&rnp->lock, flags);
813 return; 869 return;
814 } 870 }
@@ -844,6 +900,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
844 rnp->completed = rsp->completed; 900 rnp->completed = rsp->completed;
845 if (rnp == rdp->mynode) 901 if (rnp == rdp->mynode)
846 rcu_start_gp_per_cpu(rsp, rnp, rdp); 902 rcu_start_gp_per_cpu(rsp, rnp, rdp);
903 rcu_preempt_boost_start_gp(rnp);
847 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
848 } 905 }
849 906
@@ -864,7 +921,18 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
864static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 921static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
865 __releases(rcu_get_root(rsp)->lock) 922 __releases(rcu_get_root(rsp)->lock)
866{ 923{
924 unsigned long gp_duration;
925
867 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
927
928 /*
929 * Ensure that all grace-period and pre-grace-period activity
930 * is seen before the assignment to rsp->completed.
931 */
932 smp_mb(); /* See above block comment. */
933 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration;
868 rsp->completed = rsp->gpnum; 936 rsp->completed = rsp->gpnum;
869 rsp->signaled = RCU_GP_IDLE; 937 rsp->signaled = RCU_GP_IDLE;
870 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -894,7 +962,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
894 return; 962 return;
895 } 963 }
896 rnp->qsmask &= ~mask; 964 rnp->qsmask &= ~mask;
897 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
898 966
899 /* Other bits still set at this level, so done. */ 967 /* Other bits still set at this level, so done. */
900 raw_spin_unlock_irqrestore(&rnp->lock, flags); 968 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1037,6 +1105,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1037/* 1105/*
1038 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1106 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1039 * and move all callbacks from the outgoing CPU to the current one. 1107 * and move all callbacks from the outgoing CPU to the current one.
1108 * There can only be one CPU hotplug operation at a time, so no other
1109 * CPU can be attempting to update rcu_cpu_kthread_task.
1040 */ 1110 */
1041static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1111static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1042{ 1112{
@@ -1046,6 +1116,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1046 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1116 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1047 struct rcu_node *rnp; 1117 struct rcu_node *rnp;
1048 1118
1119 rcu_stop_cpu_kthread(cpu);
1120
1049 /* Exclude any attempts to start a new grace period. */ 1121 /* Exclude any attempts to start a new grace period. */
1050 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1122 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1051 1123
@@ -1082,6 +1154,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1082 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1154 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1083 if (need_report & RCU_OFL_TASKS_EXP_GP) 1155 if (need_report & RCU_OFL_TASKS_EXP_GP)
1084 rcu_report_exp_rnp(rsp, rnp); 1156 rcu_report_exp_rnp(rsp, rnp);
1157 rcu_node_kthread_setaffinity(rnp, -1);
1085} 1158}
1086 1159
1087/* 1160/*
@@ -1143,7 +1216,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1143 next = list->next; 1216 next = list->next;
1144 prefetch(next); 1217 prefetch(next);
1145 debug_rcu_head_unqueue(list); 1218 debug_rcu_head_unqueue(list);
1146 list->func(list); 1219 __rcu_reclaim(list);
1147 list = next; 1220 list = next;
1148 if (++count >= rdp->blimit) 1221 if (++count >= rdp->blimit)
1149 break; 1222 break;
@@ -1179,7 +1252,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1179 1252
1180 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1253 /* Re-raise the RCU softirq if there are callbacks remaining. */
1181 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1254 if (cpu_has_callbacks_ready_to_invoke(rdp))
1182 raise_softirq(RCU_SOFTIRQ); 1255 invoke_rcu_core();
1183} 1256}
1184 1257
1185/* 1258/*
@@ -1225,7 +1298,7 @@ void rcu_check_callbacks(int cpu, int user)
1225 } 1298 }
1226 rcu_preempt_check_callbacks(cpu); 1299 rcu_preempt_check_callbacks(cpu);
1227 if (rcu_pending(cpu)) 1300 if (rcu_pending(cpu))
1228 raise_softirq(RCU_SOFTIRQ); 1301 invoke_rcu_core();
1229} 1302}
1230 1303
1231#ifdef CONFIG_SMP 1304#ifdef CONFIG_SMP
@@ -1233,6 +1306,8 @@ void rcu_check_callbacks(int cpu, int user)
1233/* 1306/*
1234 * Scan the leaf rcu_node structures, processing dyntick state for any that 1307 * Scan the leaf rcu_node structures, processing dyntick state for any that
1235 * have not yet encountered a quiescent state, using the function specified. 1308 * have not yet encountered a quiescent state, using the function specified.
1309 * Also initiate boosting for any threads blocked on the root rcu_node.
1310 *
1236 * The caller must have suppressed start of new grace periods. 1311 * The caller must have suppressed start of new grace periods.
1237 */ 1312 */
1238static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 1313static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
@@ -1251,7 +1326,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1251 return; 1326 return;
1252 } 1327 }
1253 if (rnp->qsmask == 0) { 1328 if (rnp->qsmask == 0) {
1254 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1329 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
1255 continue; 1330 continue;
1256 } 1331 }
1257 cpu = rnp->grplo; 1332 cpu = rnp->grplo;
@@ -1269,6 +1344,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1269 } 1344 }
1270 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1345 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1271 } 1346 }
1347 rnp = rcu_get_root(rsp);
1348 if (rnp->qsmask == 0) {
1349 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1351 }
1272} 1352}
1273 1353
1274/* 1354/*
@@ -1383,7 +1463,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1383 } 1463 }
1384 1464
1385 /* If there are callbacks ready, invoke them. */ 1465 /* If there are callbacks ready, invoke them. */
1386 rcu_do_batch(rsp, rdp); 1466 if (cpu_has_callbacks_ready_to_invoke(rdp))
1467 invoke_rcu_callbacks(rsp, rdp);
1387} 1468}
1388 1469
1389/* 1470/*
@@ -1391,29 +1472,37 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1391 */ 1472 */
1392static void rcu_process_callbacks(struct softirq_action *unused) 1473static void rcu_process_callbacks(struct softirq_action *unused)
1393{ 1474{
1394 /*
1395 * Memory references from any prior RCU read-side critical sections
1396 * executed by the interrupted code must be seen before any RCU
1397 * grace-period manipulations below.
1398 */
1399 smp_mb(); /* See above block comment. */
1400
1401 __rcu_process_callbacks(&rcu_sched_state, 1475 __rcu_process_callbacks(&rcu_sched_state,
1402 &__get_cpu_var(rcu_sched_data)); 1476 &__get_cpu_var(rcu_sched_data));
1403 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1404 rcu_preempt_process_callbacks(); 1478 rcu_preempt_process_callbacks();
1405 1479
1406 /*
1407 * Memory references from any later RCU read-side critical sections
1408 * executed by the interrupted code must be seen after any RCU
1409 * grace-period manipulations above.
1410 */
1411 smp_mb(); /* See above block comment. */
1412
1413 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ 1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1414 rcu_needs_cpu_flush(); 1481 rcu_needs_cpu_flush();
1415} 1482}
1416 1483
1484/*
1485 * Wake up the current CPU's kthread. This replaces raise_softirq()
1486 * in earlier versions of RCU. Note that because we are running on
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1488 * cannot disappear out from under us.
1489 */
1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1491{
1492 if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
1493 return;
1494 if (likely(!rsp->boost)) {
1495 rcu_do_batch(rsp, rdp);
1496 return;
1497 }
1498 invoke_rcu_callbacks_kthread();
1499}
1500
1501static void invoke_rcu_core(void)
1502{
1503 raise_softirq(RCU_SOFTIRQ);
1504}
1505
1417static void 1506static void
1418__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1507__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1419 struct rcu_state *rsp) 1508 struct rcu_state *rsp)
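The pair of helpers above splits what used to be a single raise_softirq() call: invoke_rcu_core() still kicks RCU_SOFTIRQ for core grace-period work, while invoke_rcu_callbacks() runs the callback batch inline unless the flavor is subject to priority boosting, in which case the per-CPU kthread does it (the real function also bails out until rcu_scheduler_fully_active is set). A bare-bones sketch of that dispatch shape (stand-in names, not the kernel's):

	/* Sketch of the callback-dispatch decision added by this series. */
	struct flavor {
		int boost;				/* cf. rsp->boost */
	};

	static void run_batch_inline(void)       { /* cf. rcu_do_batch() */ }
	static void wake_callback_kthread(void)  { /* cf. invoke_rcu_callbacks_kthread() */ }

	static void dispatch_callbacks(const struct flavor *f)
	{
		if (!f->boost)
			run_batch_inline();		/* common case: softirq context     */
		else
			wake_callback_kthread();	/* boosted flavor: defer to kthread */
	}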
@@ -1439,6 +1528,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1439 /* Add the callback to our list. */ 1528 /* Add the callback to our list. */
1440 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1529 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++;
1532
1533 /* If interrupts were disabled, don't dive into RCU core. */
1534 if (irqs_disabled_flags(flags)) {
1535 local_irq_restore(flags);
1536 return;
1537 }
1442 1538
1443 /* 1539 /*
1444 * Force the grace period if too many callbacks or too long waiting. 1540 * Force the grace period if too many callbacks or too long waiting.
@@ -1447,7 +1543,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1447 * invoking force_quiescent_state() if the newly enqueued callback 1543 * invoking force_quiescent_state() if the newly enqueued callback
1448 * is the only one waiting for a grace period to complete. 1544 * is the only one waiting for a grace period to complete.
1449 */ 1545 */
1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1546 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1451 1547
1452 /* Are we ignoring a completed grace period? */ 1548 /* Are we ignoring a completed grace period? */
1453 rcu_process_gp_end(rsp, rdp); 1549 rcu_process_gp_end(rsp, rdp);
@@ -1583,7 +1679,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1583 * or RCU-bh, force a local reschedule. 1679 * or RCU-bh, force a local reschedule.
1584 */ 1680 */
1585 rdp->n_rp_qs_pending++; 1681 rdp->n_rp_qs_pending++;
1586 if (!rdp->preemptable && 1682 if (!rdp->preemptible &&
1587 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1588 jiffies)) 1684 jiffies))
1589 set_need_resched(); 1685 set_need_resched();
@@ -1760,7 +1856,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1760 * that this CPU cannot possibly have any RCU callbacks in flight yet. 1856 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1761 */ 1857 */
1762static void __cpuinit 1858static void __cpuinit
1763rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1859rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1764{ 1860{
1765 unsigned long flags; 1861 unsigned long flags;
1766 unsigned long mask; 1862 unsigned long mask;
@@ -1772,7 +1868,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1772 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1773 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1774 rdp->beenonline = 1; /* We have now been online. */ 1870 rdp->beenonline = 1; /* We have now been online. */
1775 rdp->preemptable = preemptable; 1871 rdp->preemptible = preemptible;
1776 rdp->qlen_last_fqs_check = 0; 1872 rdp->qlen_last_fqs_check = 0;
1777 rdp->n_force_qs_snap = rsp->n_force_qs; 1873 rdp->n_force_qs_snap = rsp->n_force_qs;
1778 rdp->blimit = blimit; 1874 rdp->blimit = blimit;
@@ -1806,7 +1902,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1806 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1902 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1807} 1903}
1808 1904
1809static void __cpuinit rcu_online_cpu(int cpu) 1905static void __cpuinit rcu_prepare_cpu(int cpu)
1810{ 1906{
1811 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 1907 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1812 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 1908 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
@@ -1820,11 +1916,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1820 unsigned long action, void *hcpu) 1916 unsigned long action, void *hcpu)
1821{ 1917{
1822 long cpu = (long)hcpu; 1918 long cpu = (long)hcpu;
1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1920 struct rcu_node *rnp = rdp->mynode;
1823 1921
1824 switch (action) { 1922 switch (action) {
1825 case CPU_UP_PREPARE: 1923 case CPU_UP_PREPARE:
1826 case CPU_UP_PREPARE_FROZEN: 1924 case CPU_UP_PREPARE_FROZEN:
1827 rcu_online_cpu(cpu); 1925 rcu_prepare_cpu(cpu);
1926 rcu_prepare_kthreads(cpu);
1927 break;
1928 case CPU_ONLINE:
1929 case CPU_DOWN_FAILED:
1930 rcu_node_kthread_setaffinity(rnp, -1);
1931 rcu_cpu_kthread_setrt(cpu, 1);
1932 break;
1933 case CPU_DOWN_PREPARE:
1934 rcu_node_kthread_setaffinity(rnp, cpu);
1935 rcu_cpu_kthread_setrt(cpu, 0);
1828 break; 1936 break;
1829 case CPU_DYING: 1937 case CPU_DYING:
1830 case CPU_DYING_FROZEN: 1938 case CPU_DYING_FROZEN:
@@ -1943,10 +2051,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
1943 j / rsp->levelspread[i - 1]; 2051 j / rsp->levelspread[i - 1];
1944 } 2052 }
1945 rnp->level = i; 2053 rnp->level = i;
1946 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 2054 INIT_LIST_HEAD(&rnp->blkd_tasks);
1947 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1948 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1949 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1950 } 2055 }
1951 } 2056 }
1952 2057
@@ -1968,7 +2073,7 @@ void __init rcu_init(void)
1968 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2073 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1969 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2074 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1970 __rcu_init_preempt(); 2075 __rcu_init_preempt();
1971 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 2076 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1972 2077
1973 /* 2078 /*
1974 * We don't need protection against CPU-hotplug here because 2079 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index e8f057e44e3e..01b2ccda26fb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,13 +84,19 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track nesting level, sort of. */ 87 int dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks; /* Even value for dynticks-idle, else odd. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 int dynticks_nmi; /* Even value for either dynticks-idle or */ 89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
90 /* not in nmi handler, else odd. So this */
91 /* remains even for nmi from irq handler. */
92}; 90};
93 91
92/* RCU's kthread states for tracing. */
93#define RCU_KTHREAD_STOPPED 0
94#define RCU_KTHREAD_RUNNING 1
95#define RCU_KTHREAD_WAITING 2
96#define RCU_KTHREAD_OFFCPU 3
97#define RCU_KTHREAD_YIELDING 4
98#define RCU_KTHREAD_MAX 4
99
94/* 100/*
95 * Definition for node within the RCU grace-period-detection hierarchy. 101 * Definition for node within the RCU grace-period-detection hierarchy.
96 */ 102 */
@@ -109,10 +115,13 @@ struct rcu_node {
109 /* an rcu_data structure, otherwise, each */ 115 /* an rcu_data structure, otherwise, each */
110 /* bit corresponds to a child rcu_node */ 116 /* bit corresponds to a child rcu_node */
111 /* structure. */ 117 /* structure. */
112 unsigned long expmask; /* Groups that have ->blocked_tasks[] */ 118 unsigned long expmask; /* Groups that have ->blkd_tasks */
113 /* elements that need to drain to allow the */ 119 /* elements that need to drain to allow the */
114 /* current expedited grace period to */ 120 /* current expedited grace period to */
115 /* complete (only for TREE_PREEMPT_RCU). */ 121 /* complete (only for TREE_PREEMPT_RCU). */
122 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
123 /* Since this has meaning only for leaf */
124 /* rcu_node structures, 32 bits suffices. */
116 unsigned long qsmaskinit; 125 unsigned long qsmaskinit;
117 /* Per-GP initial value for qsmask & expmask. */ 126 /* Per-GP initial value for qsmask & expmask. */
118 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 127 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -122,11 +131,62 @@ struct rcu_node {
122 u8 grpnum; /* CPU/group number for next level up. */ 131 u8 grpnum; /* CPU/group number for next level up. */
123 u8 level; /* root is at level 0. */ 132 u8 level; /* root is at level 0. */
124 struct rcu_node *parent; 133 struct rcu_node *parent;
125 struct list_head blocked_tasks[4]; 134 struct list_head blkd_tasks;
126 /* Tasks blocked in RCU read-side critsect. */ 135 /* Tasks blocked in RCU read-side critical */
127 /* Grace period number (->gpnum) x blocked */ 136 /* section. Tasks are placed at the head */
128 /* by tasks on the (x & 0x1) element of the */ 137 /* of this list and age towards the tail. */
129 /* blocked_tasks[] array. */ 138 struct list_head *gp_tasks;
139 /* Pointer to the first task blocking the */
140 /* current grace period, or NULL if there */
141 /* is no such task. */
142 struct list_head *exp_tasks;
143 /* Pointer to the first task blocking the */
144 /* current expedited grace period, or NULL */
145 /* if there is no such task. If there */
146 /* is no current expedited grace period, */
 147 /* then there cannot be any such task. */
148#ifdef CONFIG_RCU_BOOST
149 struct list_head *boost_tasks;
150 /* Pointer to first task that needs to be */
151 /* priority boosted, or NULL if no priority */
152 /* boosting is needed for this rcu_node */
153 /* structure. If there are no tasks */
154 /* queued on this rcu_node structure that */
155 /* are blocking the current grace period, */
156 /* there can be no such task. */
157 unsigned long boost_time;
158 /* When to start boosting (jiffies). */
159 struct task_struct *boost_kthread_task;
160 /* kthread that takes care of priority */
161 /* boosting for this rcu_node structure. */
162 unsigned int boost_kthread_status;
163 /* State of boost_kthread_task for tracing. */
164 unsigned long n_tasks_boosted;
165 /* Total number of tasks boosted. */
166 unsigned long n_exp_boosts;
167 /* Number of tasks boosted for expedited GP. */
168 unsigned long n_normal_boosts;
169 /* Number of tasks boosted for normal GP. */
170 unsigned long n_balk_blkd_tasks;
171 /* Refused to boost: no blocked tasks. */
172 unsigned long n_balk_exp_gp_tasks;
173 /* Refused to boost: nothing blocking GP. */
174 unsigned long n_balk_boost_tasks;
175 /* Refused to boost: already boosting. */
176 unsigned long n_balk_notblocked;
177 /* Refused to boost: RCU RS CS still running. */
178 unsigned long n_balk_notyet;
179 /* Refused to boost: not yet time. */
180 unsigned long n_balk_nos;
181 /* Refused to boost: not sure why, though. */
182 /* This can happen due to race conditions. */
183#endif /* #ifdef CONFIG_RCU_BOOST */
184 struct task_struct *node_kthread_task;
185 /* kthread that takes care of this rcu_node */
186 /* structure, for example, awakening the */
187 /* per-CPU kthreads as needed. */
188 unsigned int node_kthread_status;
189 /* State of node_kthread_task for tracing. */
130} ____cacheline_internodealigned_in_smp; 190} ____cacheline_internodealigned_in_smp;
131 191
132/* 192/*
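The four blocked_tasks[] lists collapse into a single ->blkd_tasks list per rcu_node, with ->gp_tasks, ->exp_tasks and (under CONFIG_RCU_BOOST) ->boost_tasks each pointing at the first entry that still blocks the corresponding grace period; new blockers are added at the head and age toward the tail. A small userspace model of the structure, including the pointer fix-up needed when the referenced entry is removed (all names illustrative):

	#include <stddef.h>

	struct list_node {
		struct list_node *prev, *next;
	};

	/* One rcu_node's worth of state: blkd is the circular list head (newest
	 * entry at blkd.next), gp_tasks marks the oldest entry still blocking
	 * the current grace period, or NULL if none does. */
	struct node_model {
		struct list_node blkd;
		struct list_node *gp_tasks;
	};

	/* cf. rcu_next_node_entry(): step toward the tail, NULL at the list head. */
	static struct list_node *next_entry(struct node_model *n, struct list_node *e)
	{
		return e->next == &n->blkd ? NULL : e->next;
	}

	/* cf. the fix-up in rcu_read_unlock_special(): advance any segment
	 * pointer that references the entry before unlinking it. */
	static void remove_entry(struct node_model *n, struct list_node *e)
	{
		if (n->gp_tasks == e)
			n->gp_tasks = next_entry(n, e);
		e->prev->next = e->next;
		e->next->prev = e->prev;
		e->next = e->prev = e;		/* list_del_init() analogue */
	}

The same fix-up is applied to ->exp_tasks and ->boost_tasks in the rcutree_plugin.h hunks further down.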
@@ -175,7 +235,7 @@ struct rcu_data {
175 bool passed_quiesc; /* User-mode/idle loop etc. */ 235 bool passed_quiesc; /* User-mode/idle loop etc. */
176 bool qs_pending; /* Core waits for quiesc state. */ 236 bool qs_pending; /* Core waits for quiesc state. */
177 bool beenonline; /* CPU online at least once. */ 237 bool beenonline; /* CPU online at least once. */
178 bool preemptable; /* Preemptable RCU? */ 238 bool preemptible; /* Preemptible RCU? */
179 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 239 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
180 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 240 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
181 241
@@ -218,7 +278,6 @@ struct rcu_data {
218 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
219 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
220 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
221 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
222#endif /* #ifdef CONFIG_NO_HZ */ 281#endif /* #ifdef CONFIG_NO_HZ */
223 282
224 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -254,7 +313,6 @@ struct rcu_data {
254#endif /* #else #ifdef CONFIG_NO_HZ */ 313#endif /* #else #ifdef CONFIG_NO_HZ */
255 314
256#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 315#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
257#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
258 316
259#ifdef CONFIG_PROVE_RCU 317#ifdef CONFIG_PROVE_RCU
260#define RCU_STALL_DELAY_DELTA (5 * HZ) 318#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -272,13 +330,16 @@ struct rcu_data {
272 /* scheduling clock irq */ 330 /* scheduling clock irq */
273 /* before ratting on them. */ 331 /* before ratting on them. */
274 332
275#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE 333#define rcu_wait(cond) \
276#define RCU_CPU_STALL_SUPPRESS_INIT 0 334do { \
277#else 335 for (;;) { \
278#define RCU_CPU_STALL_SUPPRESS_INIT 1 336 set_current_state(TASK_INTERRUPTIBLE); \
279#endif 337 if (cond) \
280 338 break; \
281#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 339 schedule(); \
340 } \
341 __set_current_state(TASK_RUNNING); \
342} while (0)
282 343
283/* 344/*
284 * RCU global state, including node hierarchy. This hierarchy is 345 * RCU global state, including node hierarchy. This hierarchy is
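rcu_wait() is the classic kthread wait loop: mark the task interruptible, re-test the condition, and schedule() until it holds, so a wakeup that races with the test is never lost. The per-CPU and per-node kthreads introduced by this series presumably sleep on it until work arrives. A userspace analogue of the same shape, using a condition variable in place of set_current_state()/schedule():

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
	static bool have_work;

	/* Analogue of rcu_wait(cond): the condition is always re-checked inside
	 * the loop, so a wakeup delivered between the check and the sleep cannot
	 * be lost (the mutex/condvar protocol plays the role of
	 * set_current_state() in the kernel macro). */
	static void wait_for_work(void)
	{
		pthread_mutex_lock(&lock);
		while (!have_work)
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);
	}

	static void post_work(void)
	{
		pthread_mutex_lock(&lock);
		have_work = true;
		pthread_cond_signal(&cond);
		pthread_mutex_unlock(&lock);
	}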
@@ -308,6 +369,7 @@ struct rcu_state {
308 /* period because */ 369 /* period because */
309 /* force_quiescent_state() */ 370 /* force_quiescent_state() */
310 /* was running. */ 371 /* was running. */
372 u8 boost; /* Subject to priority boost. */
311 unsigned long gpnum; /* Current gp number. */ 373 unsigned long gpnum; /* Current gp number. */
312 unsigned long completed; /* # of last completed gp. */ 374 unsigned long completed; /* # of last completed gp. */
313 375
@@ -325,12 +387,12 @@ struct rcu_state {
325 /* due to lock unavailable. */ 387 /* due to lock unavailable. */
326 unsigned long n_force_qs_ngp; /* Number of calls leaving */ 388 unsigned long n_force_qs_ngp; /* Number of calls leaving */
327 /* due to no GP active. */ 389 /* due to no GP active. */
328#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
329 unsigned long gp_start; /* Time at which GP started, */ 390 unsigned long gp_start; /* Time at which GP started, */
330 /* but in jiffies. */ 391 /* but in jiffies. */
331 unsigned long jiffies_stall; /* Time at which to check */ 392 unsigned long jiffies_stall; /* Time at which to check */
332 /* for CPU stalls. */ 393 /* for CPU stalls. */
333#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 394 unsigned long gp_max; /* Maximum GP duration in */
395 /* jiffies. */
334 char *name; /* Name of structure. */ 396 char *name; /* Name of structure. */
335}; 397};
336 398
@@ -361,16 +423,15 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
361static void rcu_bootup_announce(void); 423static void rcu_bootup_announce(void);
362long rcu_batches_completed(void); 424long rcu_batches_completed(void);
363static void rcu_preempt_note_context_switch(int cpu); 425static void rcu_preempt_note_context_switch(int cpu);
364static int rcu_preempted_readers(struct rcu_node *rnp); 426static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
365#ifdef CONFIG_HOTPLUG_CPU 427#ifdef CONFIG_HOTPLUG_CPU
366static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 428static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
367 unsigned long flags); 429 unsigned long flags);
430static void rcu_stop_cpu_kthread(int cpu);
368#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 431#endif /* #ifdef CONFIG_HOTPLUG_CPU */
369#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
370static void rcu_print_detail_task_stall(struct rcu_state *rsp); 432static void rcu_print_detail_task_stall(struct rcu_state *rsp);
371static void rcu_print_task_stall(struct rcu_node *rnp); 433static void rcu_print_task_stall(struct rcu_node *rnp);
372static void rcu_preempt_stall_reset(void); 434static void rcu_preempt_stall_reset(void);
373#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
374static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
375#ifdef CONFIG_HOTPLUG_CPU 436#ifdef CONFIG_HOTPLUG_CPU
376static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 437static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
@@ -390,5 +451,20 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
390static void rcu_preempt_send_cbs_to_online(void); 451static void rcu_preempt_send_cbs_to_online(void);
391static void __init __rcu_init_preempt(void); 452static void __init __rcu_init_preempt(void);
392static void rcu_needs_cpu_flush(void); 453static void rcu_needs_cpu_flush(void);
454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
456static void invoke_rcu_callbacks_kthread(void);
457#ifdef CONFIG_RCU_BOOST
458static void rcu_preempt_do_callbacks(void);
459static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
460 cpumask_var_t cm);
461static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
462 struct rcu_node *rnp,
463 int rnp_index);
464static void invoke_rcu_node_kthread(struct rcu_node *rnp);
465static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
466#endif /* #ifdef CONFIG_RCU_BOOST */
467static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
468static void __cpuinit rcu_prepare_kthreads(int cpu);
393 469
394#endif /* #ifndef RCU_TREE_NONCORE */ 470#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a3638710dc67..75113cb7c4fb 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void)
54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
56#endif 56#endif
57#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
58 printk(KERN_INFO
59 "\tRCU-based detection of stalled CPUs is disabled.\n");
60#endif
61#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
62 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 58 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
63#endif 59#endif
@@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void)
70 66
71struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
72DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state;
73 70
74static int rcu_preempted_readers_exp(struct rcu_node *rnp); 71static int rcu_preempted_readers_exp(struct rcu_node *rnp);
75 72
@@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
78 */ 75 */
79static void __init rcu_bootup_announce(void) 76static void __init rcu_bootup_announce(void)
80{ 77{
81 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); 78 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
82 rcu_bootup_announce_oddness(); 79 rcu_bootup_announce_oddness();
83} 80}
84 81
@@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void)
111EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 108EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
112 109
113/* 110/*
114 * Record a preemptable-RCU quiescent state for the specified CPU. Note 111 * Record a preemptible-RCU quiescent state for the specified CPU. Note
115 * that this just means that the task currently running on the CPU is 112 * that this just means that the task currently running on the CPU is
116 * not in a quiescent state. There might be any number of tasks blocked 113 * not in a quiescent state. There might be any number of tasks blocked
117 * while in an RCU read-side critical section. 114 * while in an RCU read-side critical section.
@@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu)
134 * We have entered the scheduler, and the current task might soon be 131 * We have entered the scheduler, and the current task might soon be
135 * context-switched away from. If this task is in an RCU read-side 132 * context-switched away from. If this task is in an RCU read-side
136 * critical section, we will no longer be able to rely on the CPU to 133 * critical section, we will no longer be able to rely on the CPU to
137 * record that fact, so we enqueue the task on the appropriate entry 134 * record that fact, so we enqueue the task on the blkd_tasks list.
138 * of the blocked_tasks[] array. The task will dequeue itself when 135 * The task will dequeue itself when it exits the outermost enclosing
139 * it exits the outermost enclosing RCU read-side critical section. 136 * RCU read-side critical section. Therefore, the current grace period
140 * Therefore, the current grace period cannot be permitted to complete 137 * cannot be permitted to complete until the blkd_tasks list entries
141 * until the blocked_tasks[] entry indexed by the low-order bit of 138 * predating the current grace period drain, in other words, until
142 * rnp->gpnum empties. 139 * rnp->gp_tasks becomes NULL.
143 * 140 *
144 * Caller must disable preemption. 141 * Caller must disable preemption.
145 */ 142 */
@@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu)
147{ 144{
148 struct task_struct *t = current; 145 struct task_struct *t = current;
149 unsigned long flags; 146 unsigned long flags;
150 int phase;
151 struct rcu_data *rdp; 147 struct rcu_data *rdp;
152 struct rcu_node *rnp; 148 struct rcu_node *rnp;
153 149
@@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu)
169 * (i.e., this CPU has not yet passed through a quiescent 165 * (i.e., this CPU has not yet passed through a quiescent
170 * state for the current grace period), then as long 166 * state for the current grace period), then as long
171 * as that task remains queued, the current grace period 167 * as that task remains queued, the current grace period
172 * cannot end. 168 * cannot end. Note that there is some uncertainty as
169 * to exactly when the current grace period started.
170 * We take a conservative approach, which can result
171 * in unnecessarily waiting on tasks that started very
172 * slightly after the current grace period began. C'est
173 * la vie!!!
173 * 174 *
174 * But first, note that the current CPU must still be 175 * But first, note that the current CPU must still be
175 * on line! 176 * on line!
176 */ 177 */
177 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 178 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
178 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 179 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
179 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 180 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
180 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 181 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
182 rnp->gp_tasks = &t->rcu_node_entry;
183#ifdef CONFIG_RCU_BOOST
184 if (rnp->boost_tasks != NULL)
185 rnp->boost_tasks = rnp->gp_tasks;
186#endif /* #ifdef CONFIG_RCU_BOOST */
187 } else {
188 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
189 if (rnp->qsmask & rdp->grpmask)
190 rnp->gp_tasks = &t->rcu_node_entry;
191 }
181 raw_spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
182 } 193 }
183 194
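The enqueue logic above is the counterpart of the list layout described in the rcutree.h hunk: a task preempted while blocking the current grace period is inserted immediately ahead of ->gp_tasks and becomes the new ->gp_tasks, while a task that does not block it simply goes to the head of the list (becoming ->gp_tasks only if this CPU still owes a quiescent state). A self-contained sketch of that placement rule (illustrative names, same list model as the earlier sketch):

	#include <stddef.h>

	struct list_node {
		struct list_node *prev, *next;
	};

	struct node_model {
		struct list_node blkd;		/* circular list head              */
		struct list_node *gp_tasks;	/* first entry blocking current GP */
	};

	static void insert_before(struct list_node *e, struct list_node *at)
	{
		e->prev = at->prev;
		e->next = at;
		at->prev->next = e;
		at->prev = e;
	}

	/* cf. rcu_preempt_note_context_switch(): blocks_gp corresponds to
	 * "(rnp->qsmask & rdp->grpmask)", i.e. this CPU has not yet reported a
	 * quiescent state for the current grace period. */
	static void add_blocked_task(struct node_model *n, struct list_node *e,
				     int blocks_gp)
	{
		if (blocks_gp && n->gp_tasks != NULL) {
			insert_before(e, n->gp_tasks);	/* just ahead of the GP segment */
			n->gp_tasks = e;
		} else {
			insert_before(e, n->blkd.next);	/* head of the list */
			if (blocks_gp)
				n->gp_tasks = e;
		}
	}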
@@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu)
196} 207}
197 208
198/* 209/*
199 * Tree-preemptable RCU implementation for rcu_read_lock(). 210 * Tree-preemptible RCU implementation for rcu_read_lock().
200 * Just increment ->rcu_read_lock_nesting, shared state will be updated 211 * Just increment ->rcu_read_lock_nesting, shared state will be updated
201 * if we block. 212 * if we block.
202 */ 213 */
@@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
212 * for the specified rcu_node structure. If the caller needs a reliable 223 * for the specified rcu_node structure. If the caller needs a reliable
213 * answer, it must hold the rcu_node's ->lock. 224 * answer, it must hold the rcu_node's ->lock.
214 */ 225 */
215static int rcu_preempted_readers(struct rcu_node *rnp) 226static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
216{ 227{
217 int phase = rnp->gpnum & 0x1; 228 return rnp->gp_tasks != NULL;
218
219 return !list_empty(&rnp->blocked_tasks[phase]) ||
220 !list_empty(&rnp->blocked_tasks[phase + 2]);
221} 229}
222 230
223/* 231/*
@@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
233 unsigned long mask; 241 unsigned long mask;
234 struct rcu_node *rnp_p; 242 struct rcu_node *rnp_p;
235 243
236 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 244 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
237 raw_spin_unlock_irqrestore(&rnp->lock, flags); 245 raw_spin_unlock_irqrestore(&rnp->lock, flags);
238 return; /* Still need more quiescent states! */ 246 return; /* Still need more quiescent states! */
239 } 247 }
@@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
257} 265}
258 266
259/* 267/*
268 * Advance a ->blkd_tasks-list pointer to the next entry, instead
269 * returning NULL if at the end of the list.
270 */
271static struct list_head *rcu_next_node_entry(struct task_struct *t,
272 struct rcu_node *rnp)
273{
274 struct list_head *np;
275
276 np = t->rcu_node_entry.next;
277 if (np == &rnp->blkd_tasks)
278 np = NULL;
279 return np;
280}
281
282/*
260 * Handle special cases during rcu_read_unlock(), such as needing to 283 * Handle special cases during rcu_read_unlock(), such as needing to
261 * notify RCU core processing or task having blocked during the RCU 284 * notify RCU core processing or task having blocked during the RCU
262 * read-side critical section. 285 * read-side critical section.
@@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
266 int empty; 289 int empty;
267 int empty_exp; 290 int empty_exp;
268 unsigned long flags; 291 unsigned long flags;
292 struct list_head *np;
269 struct rcu_node *rnp; 293 struct rcu_node *rnp;
270 int special; 294 int special;
271 295
@@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t)
306 break; 330 break;
307 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 331 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
308 } 332 }
309 empty = !rcu_preempted_readers(rnp); 333 empty = !rcu_preempt_blocked_readers_cgp(rnp);
310 empty_exp = !rcu_preempted_readers_exp(rnp); 334 empty_exp = !rcu_preempted_readers_exp(rnp);
311 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 335 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
336 np = rcu_next_node_entry(t, rnp);
312 list_del_init(&t->rcu_node_entry); 337 list_del_init(&t->rcu_node_entry);
338 if (&t->rcu_node_entry == rnp->gp_tasks)
339 rnp->gp_tasks = np;
340 if (&t->rcu_node_entry == rnp->exp_tasks)
341 rnp->exp_tasks = np;
342#ifdef CONFIG_RCU_BOOST
343 if (&t->rcu_node_entry == rnp->boost_tasks)
344 rnp->boost_tasks = np;
345#endif /* #ifdef CONFIG_RCU_BOOST */
313 t->rcu_blocked_node = NULL; 346 t->rcu_blocked_node = NULL;
314 347
315 /* 348 /*
@@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t)
322 else 355 else
323 rcu_report_unblock_qs_rnp(rnp, flags); 356 rcu_report_unblock_qs_rnp(rnp, flags);
324 357
358#ifdef CONFIG_RCU_BOOST
359 /* Unboost if we were boosted. */
360 if (special & RCU_READ_UNLOCK_BOOSTED) {
361 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
362 rt_mutex_unlock(t->rcu_boost_mutex);
363 t->rcu_boost_mutex = NULL;
364 }
365#endif /* #ifdef CONFIG_RCU_BOOST */
366
325 /* 367 /*
326 * If this was the last task on the expedited lists, 368 * If this was the last task on the expedited lists,
327 * then we need to report up the rcu_node hierarchy. 369 * then we need to report up the rcu_node hierarchy.
@@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
334} 376}
335 377
336/* 378/*
337 * Tree-preemptable RCU implementation for rcu_read_unlock(). 379 * Tree-preemptible RCU implementation for rcu_read_unlock().
338 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost 380 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
339 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then 381 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
340 * invoke rcu_read_unlock_special() to clean up after a context switch 382 * invoke rcu_read_unlock_special() to clean up after a context switch
@@ -356,8 +398,6 @@ void __rcu_read_unlock(void)
356} 398}
357EXPORT_SYMBOL_GPL(__rcu_read_unlock); 399EXPORT_SYMBOL_GPL(__rcu_read_unlock);
358 400
359#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
360
361#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 401#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
362 402
363/* 403/*
@@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
367static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 407static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
368{ 408{
369 unsigned long flags; 409 unsigned long flags;
370 struct list_head *lp;
371 int phase;
372 struct task_struct *t; 410 struct task_struct *t;
373 411
374 if (rcu_preempted_readers(rnp)) { 412 if (!rcu_preempt_blocked_readers_cgp(rnp))
375 raw_spin_lock_irqsave(&rnp->lock, flags); 413 return;
376 phase = rnp->gpnum & 0x1; 414 raw_spin_lock_irqsave(&rnp->lock, flags);
377 lp = &rnp->blocked_tasks[phase]; 415 t = list_entry(rnp->gp_tasks,
378 list_for_each_entry(t, lp, rcu_node_entry) 416 struct task_struct, rcu_node_entry);
379 sched_show_task(t); 417 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
380 raw_spin_unlock_irqrestore(&rnp->lock, flags); 418 sched_show_task(t);
381 } 419 raw_spin_unlock_irqrestore(&rnp->lock, flags);
382} 420}
383 421
384/* 422/*
@@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
408 */ 446 */
409static void rcu_print_task_stall(struct rcu_node *rnp) 447static void rcu_print_task_stall(struct rcu_node *rnp)
410{ 448{
411 struct list_head *lp;
412 int phase;
413 struct task_struct *t; 449 struct task_struct *t;
414 450
415 if (rcu_preempted_readers(rnp)) { 451 if (!rcu_preempt_blocked_readers_cgp(rnp))
416 phase = rnp->gpnum & 0x1; 452 return;
417 lp = &rnp->blocked_tasks[phase]; 453 t = list_entry(rnp->gp_tasks,
418 list_for_each_entry(t, lp, rcu_node_entry) 454 struct task_struct, rcu_node_entry);
419 printk(" P%d", t->pid); 455 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
420 } 456 printk(" P%d", t->pid);
421} 457}
422 458
423/* 459/*
@@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void)
430 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; 466 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
431} 467}
432 468
433#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
434
435/* 469/*
436 * Check that the list of blocked tasks for the newly completed grace 470 * Check that the list of blocked tasks for the newly completed grace
437 * period is in fact empty. It is a serious bug to complete a grace 471 * period is in fact empty. It is a serious bug to complete a grace
438 * period that still has RCU readers blocked! This function must be 472 * period that still has RCU readers blocked! This function must be
439 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock 473 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
440 * must be held by the caller. 474 * must be held by the caller.
475 *
476 * Also, if there are blocked tasks on the list, they automatically
477 * block the newly created grace period, so set up ->gp_tasks accordingly.
441 */ 478 */
442static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 479static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
443{ 480{
444 WARN_ON_ONCE(rcu_preempted_readers(rnp)); 481 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
482 if (!list_empty(&rnp->blkd_tasks))
483 rnp->gp_tasks = rnp->blkd_tasks.next;
445 WARN_ON_ONCE(rnp->qsmask); 484 WARN_ON_ONCE(rnp->qsmask);
446} 485}
447 486
@@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
465 struct rcu_node *rnp, 504 struct rcu_node *rnp,
466 struct rcu_data *rdp) 505 struct rcu_data *rdp)
467{ 506{
468 int i;
469 struct list_head *lp; 507 struct list_head *lp;
470 struct list_head *lp_root; 508 struct list_head *lp_root;
471 int retval = 0; 509 int retval = 0;
472 struct rcu_node *rnp_root = rcu_get_root(rsp); 510 struct rcu_node *rnp_root = rcu_get_root(rsp);
473 struct task_struct *tp; 511 struct task_struct *t;
474 512
475 if (rnp == rnp_root) { 513 if (rnp == rnp_root) {
476 WARN_ONCE(1, "Last CPU thought to be offlined?"); 514 WARN_ONCE(1, "Last CPU thought to be offlined?");
477 return 0; /* Shouldn't happen: at least one CPU online. */ 515 return 0; /* Shouldn't happen: at least one CPU online. */
478 } 516 }
479 WARN_ON_ONCE(rnp != rdp->mynode && 517
480 (!list_empty(&rnp->blocked_tasks[0]) || 518 /* If we are on an internal node, complain bitterly. */
481 !list_empty(&rnp->blocked_tasks[1]) || 519 WARN_ON_ONCE(rnp != rdp->mynode);
482 !list_empty(&rnp->blocked_tasks[2]) ||
483 !list_empty(&rnp->blocked_tasks[3])));
484 520
485 /* 521 /*
486 * Move tasks up to root rcu_node. Rely on the fact that the 522 * Move tasks up to root rcu_node. Don't try to get fancy for
487 * root rcu_node can be at most one ahead of the rest of the 523 * this corner-case operation -- just put this node's tasks
488 * rcu_nodes in terms of gp_num value. This fact allows us to 524 * at the head of the root node's list, and update the root node's
489 * move the blocked_tasks[] array directly, element by element. 525 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
526 * if non-NULL. This might result in waiting for more tasks than
527 * absolutely necessary, but this is a good performance/complexity
528 * tradeoff.
490 */ 529 */
491 if (rcu_preempted_readers(rnp)) 530 if (rcu_preempt_blocked_readers_cgp(rnp))
492 retval |= RCU_OFL_TASKS_NORM_GP; 531 retval |= RCU_OFL_TASKS_NORM_GP;
493 if (rcu_preempted_readers_exp(rnp)) 532 if (rcu_preempted_readers_exp(rnp))
494 retval |= RCU_OFL_TASKS_EXP_GP; 533 retval |= RCU_OFL_TASKS_EXP_GP;
495 for (i = 0; i < 4; i++) { 534 lp = &rnp->blkd_tasks;
496 lp = &rnp->blocked_tasks[i]; 535 lp_root = &rnp_root->blkd_tasks;
497 lp_root = &rnp_root->blocked_tasks[i]; 536 while (!list_empty(lp)) {
498 while (!list_empty(lp)) { 537 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
499 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 538 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
500 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 539 list_del(&t->rcu_node_entry);
501 list_del(&tp->rcu_node_entry); 540 t->rcu_blocked_node = rnp_root;
502 tp->rcu_blocked_node = rnp_root; 541 list_add(&t->rcu_node_entry, lp_root);
503 list_add(&tp->rcu_node_entry, lp_root); 542 if (&t->rcu_node_entry == rnp->gp_tasks)
504 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 543 rnp_root->gp_tasks = rnp->gp_tasks;
505 } 544 if (&t->rcu_node_entry == rnp->exp_tasks)
545 rnp_root->exp_tasks = rnp->exp_tasks;
546#ifdef CONFIG_RCU_BOOST
547 if (&t->rcu_node_entry == rnp->boost_tasks)
548 rnp_root->boost_tasks = rnp->boost_tasks;
549#endif /* #ifdef CONFIG_RCU_BOOST */
550 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
506 } 551 }
552
553#ifdef CONFIG_RCU_BOOST
554 /* In case root is being boosted and leaf is not. */
555 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
556 if (rnp_root->boost_tasks != NULL &&
557 rnp_root->boost_tasks != rnp_root->gp_tasks)
558 rnp_root->boost_tasks = rnp_root->gp_tasks;
559 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
560#endif /* #ifdef CONFIG_RCU_BOOST */
561
562 rnp->gp_tasks = NULL;
563 rnp->exp_tasks = NULL;
507 return retval; 564 return retval;
508} 565}
509 566
510/* 567/*
511 * Do CPU-offline processing for preemptable RCU. 568 * Do CPU-offline processing for preemptible RCU.
512 */ 569 */
513static void rcu_preempt_offline_cpu(int cpu) 570static void rcu_preempt_offline_cpu(int cpu)
514{ 571{
@@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu)
537} 594}
538 595
539/* 596/*
540 * Process callbacks for preemptable RCU. 597 * Process callbacks for preemptible RCU.
541 */ 598 */
542static void rcu_preempt_process_callbacks(void) 599static void rcu_preempt_process_callbacks(void)
543{ 600{
@@ -545,8 +602,17 @@ static void rcu_preempt_process_callbacks(void)
545 &__get_cpu_var(rcu_preempt_data)); 602 &__get_cpu_var(rcu_preempt_data));
546} 603}
547 604
605#ifdef CONFIG_RCU_BOOST
606
607static void rcu_preempt_do_callbacks(void)
608{
609 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
610}
611
612#endif /* #ifdef CONFIG_RCU_BOOST */
613
548/* 614/*
549 * Queue a preemptable-RCU callback for invocation after a grace period. 615 * Queue a preemptible-RCU callback for invocation after a grace period.
550 */ 616 */
551void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 617void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
552{ 618{
@@ -594,8 +660,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
594 */ 660 */
595static int rcu_preempted_readers_exp(struct rcu_node *rnp) 661static int rcu_preempted_readers_exp(struct rcu_node *rnp)
596{ 662{
597 return !list_empty(&rnp->blocked_tasks[2]) || 663 return rnp->exp_tasks != NULL;
598 !list_empty(&rnp->blocked_tasks[3]);
599} 664}
600 665
601/* 666/*
@@ -655,13 +720,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
655static void 720static void
656sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 721sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
657{ 722{
658 int must_wait; 723 unsigned long flags;
724 int must_wait = 0;
659 725
660 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 726 raw_spin_lock_irqsave(&rnp->lock, flags);
661 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 727 if (list_empty(&rnp->blkd_tasks))
662 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 728 raw_spin_unlock_irqrestore(&rnp->lock, flags);
663 must_wait = rcu_preempted_readers_exp(rnp); 729 else {
664 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 730 rnp->exp_tasks = rnp->blkd_tasks.next;
731 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
732 must_wait = 1;
733 }
665 if (!must_wait) 734 if (!must_wait)
666 rcu_report_exp_rnp(rsp, rnp); 735 rcu_report_exp_rnp(rsp, rnp);
667} 736}
@@ -669,9 +738,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
669/* 738/*
670 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 739 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
671 * is to invoke synchronize_sched_expedited() to push all the tasks to 740 * is to invoke synchronize_sched_expedited() to push all the tasks to
672 * the ->blocked_tasks[] lists, move all entries from the first set of 741 * the ->blkd_tasks lists and wait for this list to drain.
673 * ->blocked_tasks[] lists to the second set, and finally wait for this
674 * second set to drain.
675 */ 742 */
676void synchronize_rcu_expedited(void) 743void synchronize_rcu_expedited(void)
677{ 744{
@@ -703,7 +770,7 @@ void synchronize_rcu_expedited(void)
703 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 770 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
704 goto unlock_mb_ret; /* Others did our work for us. */ 771 goto unlock_mb_ret; /* Others did our work for us. */
705 772
706 /* force all RCU readers onto blocked_tasks[]. */ 773 /* force all RCU readers onto ->blkd_tasks lists. */
707 synchronize_sched_expedited(); 774 synchronize_sched_expedited();
708 775
709 raw_spin_lock_irqsave(&rsp->onofflock, flags); 776 raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -715,7 +782,7 @@ void synchronize_rcu_expedited(void)
715 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 782 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
716 } 783 }
717 784
718 /* Snapshot current state of ->blocked_tasks[] lists. */ 785 /* Snapshot current state of ->blkd_tasks lists. */
719 rcu_for_each_leaf_node(rsp, rnp) 786 rcu_for_each_leaf_node(rsp, rnp)
720 sync_rcu_preempt_exp_init(rsp, rnp); 787 sync_rcu_preempt_exp_init(rsp, rnp);
721 if (NUM_RCU_NODES > 1) 788 if (NUM_RCU_NODES > 1)
@@ -723,7 +790,7 @@ void synchronize_rcu_expedited(void)
723 790
724 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 791 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
725 792
726 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 793 /* Wait for snapshotted ->blkd_tasks lists to drain. */
727 rnp = rcu_get_root(rsp); 794 rnp = rcu_get_root(rsp);
728 wait_event(sync_rcu_preempt_exp_wq, 795 wait_event(sync_rcu_preempt_exp_wq,
729 sync_rcu_preempt_exp_done(rnp)); 796 sync_rcu_preempt_exp_done(rnp));
@@ -739,7 +806,7 @@ mb_ret:
739EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 806EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
740 807
741/* 808/*
742 * Check to see if there is any immediate preemptable-RCU-related work 809 * Check to see if there is any immediate preemptible-RCU-related work
743 * to be done. 810 * to be done.
744 */ 811 */
745static int rcu_preempt_pending(int cpu) 812static int rcu_preempt_pending(int cpu)
@@ -749,7 +816,7 @@ static int rcu_preempt_pending(int cpu)
749} 816}
750 817
751/* 818/*
752 * Does preemptable RCU need the CPU to stay out of dynticks mode? 819 * Does preemptible RCU need the CPU to stay out of dynticks mode?
753 */ 820 */
754static int rcu_preempt_needs_cpu(int cpu) 821static int rcu_preempt_needs_cpu(int cpu)
755{ 822{
@@ -766,7 +833,7 @@ void rcu_barrier(void)
766EXPORT_SYMBOL_GPL(rcu_barrier); 833EXPORT_SYMBOL_GPL(rcu_barrier);
767 834
768/* 835/*
769 * Initialize preemptable RCU's per-CPU data. 836 * Initialize preemptible RCU's per-CPU data.
770 */ 837 */
771static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 838static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
772{ 839{
@@ -774,7 +841,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
774} 841}
775 842
776/* 843/*
777 * Move preemptable RCU's callbacks from dying CPU to other online CPU. 844 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
778 */ 845 */
779static void rcu_preempt_send_cbs_to_online(void) 846static void rcu_preempt_send_cbs_to_online(void)
780{ 847{
@@ -782,7 +849,7 @@ static void rcu_preempt_send_cbs_to_online(void)
782} 849}
783 850
784/* 851/*
785 * Initialize preemptable RCU's state structures. 852 * Initialize preemptible RCU's state structures.
786 */ 853 */
787static void __init __rcu_init_preempt(void) 854static void __init __rcu_init_preempt(void)
788{ 855{
@@ -790,7 +857,7 @@ static void __init __rcu_init_preempt(void)
790} 857}
791 858
792/* 859/*
793 * Check for a task exiting while in a preemptable-RCU read-side 860 * Check for a task exiting while in a preemptible-RCU read-side
794 * critical section, clean up if so. No need to issue warnings, 861 * critical section, clean up if so. No need to issue warnings,
795 * as debug_check_no_locks_held() already does this if lockdep 862 * as debug_check_no_locks_held() already does this if lockdep
796 * is enabled. 863 * is enabled.
@@ -802,11 +869,13 @@ void exit_rcu(void)
802 if (t->rcu_read_lock_nesting == 0) 869 if (t->rcu_read_lock_nesting == 0)
803 return; 870 return;
804 t->rcu_read_lock_nesting = 1; 871 t->rcu_read_lock_nesting = 1;
805 rcu_read_unlock(); 872 __rcu_read_unlock();
806} 873}
807 874
808#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 875#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
809 876
877static struct rcu_state *rcu_state = &rcu_sched_state;
878
810/* 879/*
811 * Tell them what RCU they are running. 880 * Tell them what RCU they are running.
812 */ 881 */
@@ -836,7 +905,7 @@ void rcu_force_quiescent_state(void)
836EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 905EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
837 906
838/* 907/*
839 * Because preemptable RCU does not exist, we never have to check for 908 * Because preemptible RCU does not exist, we never have to check for
840 * CPUs being in quiescent states. 909 * CPUs being in quiescent states.
841 */ 910 */
842static void rcu_preempt_note_context_switch(int cpu) 911static void rcu_preempt_note_context_switch(int cpu)
@@ -844,10 +913,10 @@ static void rcu_preempt_note_context_switch(int cpu)
844} 913}
845 914
846/* 915/*
847 * Because preemptable RCU does not exist, there are never any preempted 916 * Because preemptible RCU does not exist, there are never any preempted
848 * RCU readers. 917 * RCU readers.
849 */ 918 */
850static int rcu_preempted_readers(struct rcu_node *rnp) 919static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
851{ 920{
852 return 0; 921 return 0;
853} 922}
@@ -862,10 +931,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
862 931
863#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 932#endif /* #ifdef CONFIG_HOTPLUG_CPU */
864 933
865#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
866
867/* 934/*
868 * Because preemptable RCU does not exist, we never have to check for 935 * Because preemptible RCU does not exist, we never have to check for
869 * tasks blocked within RCU read-side critical sections. 936 * tasks blocked within RCU read-side critical sections.
870 */ 937 */
871static void rcu_print_detail_task_stall(struct rcu_state *rsp) 938static void rcu_print_detail_task_stall(struct rcu_state *rsp)
@@ -873,7 +940,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
873} 940}
874 941
875/* 942/*
876 * Because preemptable RCU does not exist, we never have to check for 943 * Because preemptible RCU does not exist, we never have to check for
877 * tasks blocked within RCU read-side critical sections. 944 * tasks blocked within RCU read-side critical sections.
878 */ 945 */
879static void rcu_print_task_stall(struct rcu_node *rnp) 946static void rcu_print_task_stall(struct rcu_node *rnp)
@@ -888,10 +955,8 @@ static void rcu_preempt_stall_reset(void)
888{ 955{
889} 956}
890 957
891#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
892
893/* 958/*
894 * Because there is no preemptable RCU, there can be no readers blocked, 959 * Because there is no preemptible RCU, there can be no readers blocked,
895 * so there is no need to check for blocked tasks. So check only for 960 * so there is no need to check for blocked tasks. So check only for
896 * bogus qsmask values. 961 * bogus qsmask values.
897 */ 962 */
@@ -903,7 +968,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
903#ifdef CONFIG_HOTPLUG_CPU 968#ifdef CONFIG_HOTPLUG_CPU
904 969
905/* 970/*
906 * Because preemptable RCU does not exist, it never needs to migrate 971 * Because preemptible RCU does not exist, it never needs to migrate
907 * tasks that were blocked within RCU read-side critical sections, and 972 * tasks that were blocked within RCU read-side critical sections, and
908 * such non-existent tasks cannot possibly have been blocking the current 973 * such non-existent tasks cannot possibly have been blocking the current
909 * grace period. 974 * grace period.
@@ -916,7 +981,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
916} 981}
917 982
918/* 983/*
919 * Because preemptable RCU does not exist, it never needs CPU-offline 984 * Because preemptible RCU does not exist, it never needs CPU-offline
920 * processing. 985 * processing.
921 */ 986 */
922static void rcu_preempt_offline_cpu(int cpu) 987static void rcu_preempt_offline_cpu(int cpu)
@@ -926,7 +991,7 @@ static void rcu_preempt_offline_cpu(int cpu)
926#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 991#endif /* #ifdef CONFIG_HOTPLUG_CPU */
927 992
928/* 993/*
929 * Because preemptable RCU does not exist, it never has any callbacks 994 * Because preemptible RCU does not exist, it never has any callbacks
930 * to check. 995 * to check.
931 */ 996 */
932static void rcu_preempt_check_callbacks(int cpu) 997static void rcu_preempt_check_callbacks(int cpu)
@@ -934,7 +999,7 @@ static void rcu_preempt_check_callbacks(int cpu)
934} 999}
935 1000
936/* 1001/*
937 * Because preemptable RCU does not exist, it never has any callbacks 1002 * Because preemptible RCU does not exist, it never has any callbacks
938 * to process. 1003 * to process.
939 */ 1004 */
940static void rcu_preempt_process_callbacks(void) 1005static void rcu_preempt_process_callbacks(void)
@@ -943,7 +1008,7 @@ static void rcu_preempt_process_callbacks(void)
943 1008
944/* 1009/*
945 * Wait for an rcu-preempt grace period, but make it happen quickly. 1010 * Wait for an rcu-preempt grace period, but make it happen quickly.
946 * But because preemptable RCU does not exist, map to rcu-sched. 1011 * But because preemptible RCU does not exist, map to rcu-sched.
947 */ 1012 */
948void synchronize_rcu_expedited(void) 1013void synchronize_rcu_expedited(void)
949{ 1014{
@@ -954,7 +1019,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
954#ifdef CONFIG_HOTPLUG_CPU 1019#ifdef CONFIG_HOTPLUG_CPU
955 1020
956/* 1021/*
957 * Because preemptable RCU does not exist, there is never any need to 1022 * Because preemptible RCU does not exist, there is never any need to
958 * report on tasks preempted in RCU read-side critical sections during 1023 * report on tasks preempted in RCU read-side critical sections during
959 * expedited RCU grace periods. 1024 * expedited RCU grace periods.
960 */ 1025 */
@@ -966,7 +1031,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
966#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1031#endif /* #ifdef CONFIG_HOTPLUG_CPU */
967 1032
968/* 1033/*
969 * Because preemptable RCU does not exist, it never has any work to do. 1034 * Because preemptible RCU does not exist, it never has any work to do.
970 */ 1035 */
971static int rcu_preempt_pending(int cpu) 1036static int rcu_preempt_pending(int cpu)
972{ 1037{
@@ -974,7 +1039,7 @@ static int rcu_preempt_pending(int cpu)
974} 1039}
975 1040
976/* 1041/*
977 * Because preemptable RCU does not exist, it never needs any CPU. 1042 * Because preemptible RCU does not exist, it never needs any CPU.
978 */ 1043 */
979static int rcu_preempt_needs_cpu(int cpu) 1044static int rcu_preempt_needs_cpu(int cpu)
980{ 1045{
@@ -982,7 +1047,7 @@ static int rcu_preempt_needs_cpu(int cpu)
982} 1047}
983 1048
984/* 1049/*
985 * Because preemptable RCU does not exist, rcu_barrier() is just 1050 * Because preemptible RCU does not exist, rcu_barrier() is just
986 * another name for rcu_barrier_sched(). 1051 * another name for rcu_barrier_sched().
987 */ 1052 */
988void rcu_barrier(void) 1053void rcu_barrier(void)
@@ -992,7 +1057,7 @@ void rcu_barrier(void)
992EXPORT_SYMBOL_GPL(rcu_barrier); 1057EXPORT_SYMBOL_GPL(rcu_barrier);
993 1058
994/* 1059/*
995 * Because preemptable RCU does not exist, there is no per-CPU 1060 * Because preemptible RCU does not exist, there is no per-CPU
996 * data to initialize. 1061 * data to initialize.
997 */ 1062 */
998static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 1063static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
@@ -1000,14 +1065,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1000} 1065}
1001 1066
1002/* 1067/*
1003 * Because there is no preemptable RCU, there are no callbacks to move. 1068 * Because there is no preemptible RCU, there are no callbacks to move.
1004 */ 1069 */
1005static void rcu_preempt_send_cbs_to_online(void) 1070static void rcu_preempt_send_cbs_to_online(void)
1006{ 1071{
1007} 1072}
1008 1073
1009/* 1074/*
1010 * Because preemptable RCU does not exist, it need not be initialized. 1075 * Because preemptible RCU does not exist, it need not be initialized.
1011 */ 1076 */
1012static void __init __rcu_init_preempt(void) 1077static void __init __rcu_init_preempt(void)
1013{ 1078{
@@ -1015,6 +1080,665 @@ static void __init __rcu_init_preempt(void)
1015 1080
1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1081#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1017 1082
1083#ifdef CONFIG_RCU_BOOST
1084
1085#include "rtmutex_common.h"
1086
1087#ifdef CONFIG_RCU_TRACE
1088
1089static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1090{
1091 if (list_empty(&rnp->blkd_tasks))
1092 rnp->n_balk_blkd_tasks++;
1093 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1094 rnp->n_balk_exp_gp_tasks++;
1095 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1096 rnp->n_balk_boost_tasks++;
1097 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1098 rnp->n_balk_notblocked++;
1099 else if (rnp->gp_tasks != NULL &&
1100 ULONG_CMP_LT(jiffies, rnp->boost_time))
1101 rnp->n_balk_notyet++;
1102 else
1103 rnp->n_balk_nos++;
1104}
1105
1106#else /* #ifdef CONFIG_RCU_TRACE */
1107
1108static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1109{
1110}
1111
1112#endif /* #else #ifdef CONFIG_RCU_TRACE */
1113
1114/*
1115 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1116 * or ->boost_tasks, advancing the pointer to the next task in the
1117 * ->blkd_tasks list.
1118 *
1119 * Note that irqs must be enabled: boosting the task can block.
1120 * Returns 1 if there are more tasks needing to be boosted.
1121 */
1122static int rcu_boost(struct rcu_node *rnp)
1123{
1124 unsigned long flags;
1125 struct rt_mutex mtx;
1126 struct task_struct *t;
1127 struct list_head *tb;
1128
1129 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1130 return 0; /* Nothing left to boost. */
1131
1132 raw_spin_lock_irqsave(&rnp->lock, flags);
1133
1134 /*
1135 * Recheck under the lock: all tasks in need of boosting
1136 * might exit their RCU read-side critical sections on their own.
1137 */
1138 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1139 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1140 return 0;
1141 }
1142
1143 /*
1144 * Preferentially boost tasks blocking expedited grace periods.
1145 * This cannot starve the normal grace periods because a second
1146 * expedited grace period must boost all blocked tasks, including
1147 * those blocking the pre-existing normal grace period.
1148 */
1149 if (rnp->exp_tasks != NULL) {
1150 tb = rnp->exp_tasks;
1151 rnp->n_exp_boosts++;
1152 } else {
1153 tb = rnp->boost_tasks;
1154 rnp->n_normal_boosts++;
1155 }
1156 rnp->n_tasks_boosted++;
1157
1158 /*
1159 * We boost task t by manufacturing an rt_mutex that appears to
1160 * be held by task t. We leave a pointer to that rt_mutex where
1161 * task t can find it, and task t will release the mutex when it
1162 * exits its outermost RCU read-side critical section. Then
1163 * simply acquiring this artificial rt_mutex will boost task
1164 * t's priority. (Thanks to tglx for suggesting this approach!)
1165 *
1166 * Note that task t must acquire rnp->lock to remove itself from
1167 * the ->blkd_tasks list, which it will do from exit() if from
1168 * nowhere else. We therefore are guaranteed that task t will
1169 * stay around at least until we drop rnp->lock. Note that
1170 * rnp->lock also resolves races between our priority boosting
1171 * and task t's exiting its outermost RCU read-side critical
1172 * section.
1173 */
1174 t = container_of(tb, struct task_struct, rcu_node_entry);
1175 rt_mutex_init_proxy_locked(&mtx, t);
1176 t->rcu_boost_mutex = &mtx;
1177 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
1178 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1179 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1180 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1181
1182 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1183}
1184
1185/*
1186 * Timer handler to initiate waking up of boost kthreads that
1187 * have yielded the CPU due to excessive numbers of tasks to
1188 * boost. We wake up the per-rcu_node kthread, which in turn
1189 * will wake up the booster kthread.
1190 */
1191static void rcu_boost_kthread_timer(unsigned long arg)
1192{
1193 invoke_rcu_node_kthread((struct rcu_node *)arg);
1194}
1195
1196/*
1197 * Priority-boosting kthread. One per leaf rcu_node and one for the
1198 * root rcu_node.
1199 */
1200static int rcu_boost_kthread(void *arg)
1201{
1202 struct rcu_node *rnp = (struct rcu_node *)arg;
1203 int spincnt = 0;
1204 int more2boost;
1205
1206 for (;;) {
1207 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1208 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1209 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1210 more2boost = rcu_boost(rnp);
1211 if (more2boost)
1212 spincnt++;
1213 else
1214 spincnt = 0;
1215 if (spincnt > 10) {
1216 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1217 spincnt = 0;
1218 }
1219 }
1220 /* NOTREACHED */
1221 return 0;
1222}
1223
1224/*
1225 * Check to see if it is time to start boosting RCU readers that are
1226 * blocking the current grace period, and, if so, tell the per-rcu_node
1227 * kthread to start boosting them. If there is an expedited grace
1228 * period in progress, it is always time to boost.
1229 *
1230 * The caller must hold rnp->lock, which this function releases,
1231 * but irqs remain disabled. The ->boost_kthread_task is immortal,
1232 * so we don't need to worry about it going away.
1233 */
1234static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1235{
1236 struct task_struct *t;
1237
1238 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1239 rnp->n_balk_exp_gp_tasks++;
1240 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1241 return;
1242 }
1243 if (rnp->exp_tasks != NULL ||
1244 (rnp->gp_tasks != NULL &&
1245 rnp->boost_tasks == NULL &&
1246 rnp->qsmask == 0 &&
1247 ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1248 if (rnp->exp_tasks == NULL)
1249 rnp->boost_tasks = rnp->gp_tasks;
1250 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1251 t = rnp->boost_kthread_task;
1252 if (t != NULL)
1253 wake_up_process(t);
1254 } else {
1255 rcu_initiate_boost_trace(rnp);
1256 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1257 }
1258}
1259
1260/*
1261 * Wake up the per-CPU kthread to invoke RCU callbacks.
1262 */
1263static void invoke_rcu_callbacks_kthread(void)
1264{
1265 unsigned long flags;
1266
1267 local_irq_save(flags);
1268 __this_cpu_write(rcu_cpu_has_work, 1);
1269 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1270 local_irq_restore(flags);
1271 return;
1272 }
1273 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1274 local_irq_restore(flags);
1275}
1276
1277/*
1278 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1279 * held, so no one should be messing with the existence of the boost
1280 * kthread.
1281 */
1282static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1283 cpumask_var_t cm)
1284{
1285 struct task_struct *t;
1286
1287 t = rnp->boost_kthread_task;
1288 if (t != NULL)
1289 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1290}
1291
1292#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1293
1294/*
1295 * Do priority-boost accounting for the start of a new grace period.
1296 */
1297static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1298{
1299 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1300}
1301
1302/*
1303 * Create an RCU-boost kthread for the specified node if one does not
1304 * already exist. We only create this kthread for preemptible RCU.
1305 * Returns zero if all is well, a negated errno otherwise.
1306 */
1307static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1308 struct rcu_node *rnp,
1309 int rnp_index)
1310{
1311 unsigned long flags;
1312 struct sched_param sp;
1313 struct task_struct *t;
1314
1315 if (&rcu_preempt_state != rsp)
1316 return 0;
1317 rsp->boost = 1;
1318 if (rnp->boost_kthread_task != NULL)
1319 return 0;
1320 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1321 "rcub%d", rnp_index);
1322 if (IS_ERR(t))
1323 return PTR_ERR(t);
1324 raw_spin_lock_irqsave(&rnp->lock, flags);
1325 rnp->boost_kthread_task = t;
1326 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1327 sp.sched_priority = RCU_KTHREAD_PRIO;
1328 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1329 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1330 return 0;
1331}
1332
1333#ifdef CONFIG_HOTPLUG_CPU
1334
1335/*
 1336 * Stop the RCU's per-CPU kthread when its CPU goes offline.
1337 */
1338static void rcu_stop_cpu_kthread(int cpu)
1339{
1340 struct task_struct *t;
1341
1342 /* Stop the CPU's kthread. */
1343 t = per_cpu(rcu_cpu_kthread_task, cpu);
1344 if (t != NULL) {
1345 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1346 kthread_stop(t);
1347 }
1348}
1349
1350#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1351
1352static void rcu_kthread_do_work(void)
1353{
1354 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
1355 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1356 rcu_preempt_do_callbacks();
1357}
1358
1359/*
1360 * Wake up the specified per-rcu_node-structure kthread.
1361 * Because the per-rcu_node kthreads are immortal, we don't need
1362 * to do anything to keep them alive.
1363 */
1364static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1365{
1366 struct task_struct *t;
1367
1368 t = rnp->node_kthread_task;
1369 if (t != NULL)
1370 wake_up_process(t);
1371}
1372
1373/*
1374 * Set the specified CPU's kthread to run RT or not, as specified by
1375 * the to_rt argument. The CPU-hotplug locks are held, so the task
1376 * is not going away.
1377 */
1378static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1379{
1380 int policy;
1381 struct sched_param sp;
1382 struct task_struct *t;
1383
1384 t = per_cpu(rcu_cpu_kthread_task, cpu);
1385 if (t == NULL)
1386 return;
1387 if (to_rt) {
1388 policy = SCHED_FIFO;
1389 sp.sched_priority = RCU_KTHREAD_PRIO;
1390 } else {
1391 policy = SCHED_NORMAL;
1392 sp.sched_priority = 0;
1393 }
1394 sched_setscheduler_nocheck(t, policy, &sp);
1395}
1396
1397/*
1398 * Timer handler to initiate the waking up of per-CPU kthreads that
1399 * have yielded the CPU due to excess numbers of RCU callbacks.
1400 * We wake up the per-rcu_node kthread, which in turn will wake up
1401 * the booster kthread.
1402 */
1403static void rcu_cpu_kthread_timer(unsigned long arg)
1404{
1405 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1406 struct rcu_node *rnp = rdp->mynode;
1407
1408 atomic_or(rdp->grpmask, &rnp->wakemask);
1409 invoke_rcu_node_kthread(rnp);
1410}
1411
1412/*
1413 * Drop to non-real-time priority and yield, but only after posting a
1414 * timer that will cause us to regain our real-time priority if we
1415 * remain preempted. Either way, we restore our real-time priority
1416 * before returning.
1417 */
1418static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1419{
1420 struct sched_param sp;
1421 struct timer_list yield_timer;
1422
1423 setup_timer_on_stack(&yield_timer, f, arg);
1424 mod_timer(&yield_timer, jiffies + 2);
1425 sp.sched_priority = 0;
1426 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1427 set_user_nice(current, 19);
1428 schedule();
1429 sp.sched_priority = RCU_KTHREAD_PRIO;
1430 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1431 del_timer(&yield_timer);
1432}
1433
1434/*
1435 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1436 * This can happen while the corresponding CPU is either coming online
1437 * or going offline. We cannot wait until the CPU is fully online
1438 * before starting the kthread, because the various notifier functions
1439 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1440 * the corresponding CPU is online.
1441 *
1442 * Return 1 if the kthread needs to stop, 0 otherwise.
1443 *
1444 * Caller must disable bh. This function can momentarily enable it.
1445 */
1446static int rcu_cpu_kthread_should_stop(int cpu)
1447{
1448 while (cpu_is_offline(cpu) ||
1449 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1450 smp_processor_id() != cpu) {
1451 if (kthread_should_stop())
1452 return 1;
1453 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1454 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1455 local_bh_enable();
1456 schedule_timeout_uninterruptible(1);
1457 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1458 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1459 local_bh_disable();
1460 }
1461 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1462 return 0;
1463}
1464
1465/*
1466 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1467 * earlier RCU softirq.
1468 */
1469static int rcu_cpu_kthread(void *arg)
1470{
1471 int cpu = (int)(long)arg;
1472 unsigned long flags;
1473 int spincnt = 0;
1474 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1475 char work;
1476 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1477
1478 for (;;) {
1479 *statusp = RCU_KTHREAD_WAITING;
1480 rcu_wait(*workp != 0 || kthread_should_stop());
1481 local_bh_disable();
1482 if (rcu_cpu_kthread_should_stop(cpu)) {
1483 local_bh_enable();
1484 break;
1485 }
1486 *statusp = RCU_KTHREAD_RUNNING;
1487 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1488 local_irq_save(flags);
1489 work = *workp;
1490 *workp = 0;
1491 local_irq_restore(flags);
1492 if (work)
1493 rcu_kthread_do_work();
1494 local_bh_enable();
1495 if (*workp != 0)
1496 spincnt++;
1497 else
1498 spincnt = 0;
1499 if (spincnt > 10) {
1500 *statusp = RCU_KTHREAD_YIELDING;
1501 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1502 spincnt = 0;
1503 }
1504 }
1505 *statusp = RCU_KTHREAD_STOPPED;
1506 return 0;
1507}
1508
1509/*
1510 * Spawn a per-CPU kthread, setting up affinity and priority.
1511 * Because the CPU hotplug lock is held, no other CPU will be attempting
1512 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1513 * attempting to access it during boot, but the locking in kthread_bind()
1514 * will enforce sufficient ordering.
1515 *
1516 * Please note that we cannot simply refuse to wake up the per-CPU
1517 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1518 * which can result in softlockup complaints if the task ends up being
1519 * idle for more than a couple of minutes.
1520 *
1521 * However, please note also that we cannot bind the per-CPU kthread to its
1522 * CPU until that CPU is fully online. We also cannot wait until the
1523 * CPU is fully online before we create its per-CPU kthread, as this would
1524 * deadlock the system when CPU notifiers tried waiting for grace
1525 * periods. So we bind the per-CPU kthread to its CPU only if the CPU
1526 * is online. If its CPU is not yet fully online, then the code in
1527 * rcu_cpu_kthread() will wait until it is fully online, and then do
1528 * the binding.
1529 */
1530static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1531{
1532 struct sched_param sp;
1533 struct task_struct *t;
1534
1535 if (!rcu_scheduler_fully_active ||
1536 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1537 return 0;
1538 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1539 if (IS_ERR(t))
1540 return PTR_ERR(t);
1541 if (cpu_online(cpu))
1542 kthread_bind(t, cpu);
1543 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1544 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1545 sp.sched_priority = RCU_KTHREAD_PRIO;
1546 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1547 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1548 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1549 return 0;
1550}
1551
1552/*
1553 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1554 * kthreads when needed. We ignore requests to wake up kthreads
1555 * for offline CPUs, which is OK because force_quiescent_state()
1556 * takes care of this case.
1557 */
1558static int rcu_node_kthread(void *arg)
1559{
1560 int cpu;
1561 unsigned long flags;
1562 unsigned long mask;
1563 struct rcu_node *rnp = (struct rcu_node *)arg;
1564 struct sched_param sp;
1565 struct task_struct *t;
1566
1567 for (;;) {
1568 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1569 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1570 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1571 raw_spin_lock_irqsave(&rnp->lock, flags);
1572 mask = atomic_xchg(&rnp->wakemask, 0);
1573 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1574 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1575 if ((mask & 0x1) == 0)
1576 continue;
1577 preempt_disable();
1578 t = per_cpu(rcu_cpu_kthread_task, cpu);
1579 if (!cpu_online(cpu) || t == NULL) {
1580 preempt_enable();
1581 continue;
1582 }
1583 per_cpu(rcu_cpu_has_work, cpu) = 1;
1584 sp.sched_priority = RCU_KTHREAD_PRIO;
1585 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1586 preempt_enable();
1587 }
1588 }
1589 /* NOTREACHED */
1590 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1591 return 0;
1592}
1593
1594/*
1595 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1596 * served by the rcu_node in question. The CPU hotplug lock is still
1597 * held, so the value of rnp->qsmaskinit will be stable.
1598 *
1599 * We don't include outgoingcpu in the affinity set, use -1 if there is
1600 * no outgoing CPU. If there are no CPUs left in the affinity set,
1601 * this function allows the kthread to execute on any CPU.
1602 */
1603static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1604{
1605 cpumask_var_t cm;
1606 int cpu;
1607 unsigned long mask = rnp->qsmaskinit;
1608
1609 if (rnp->node_kthread_task == NULL)
1610 return;
1611 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1612 return;
1613 cpumask_clear(cm);
1614 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1615 if ((mask & 0x1) && cpu != outgoingcpu)
1616 cpumask_set_cpu(cpu, cm);
1617 if (cpumask_weight(cm) == 0) {
1618 cpumask_setall(cm);
1619 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1620 cpumask_clear_cpu(cpu, cm);
1621 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1622 }
1623 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1624 rcu_boost_kthread_setaffinity(rnp, cm);
1625 free_cpumask_var(cm);
1626}
1627
1628/*
1629 * Spawn a per-rcu_node kthread, setting priority and affinity.
1630 * Called during boot before online/offline can happen, or, if
1631 * during runtime, with the main CPU-hotplug locks held. So only
1632 * one of these can be executing at a time.
1633 */
1634static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1635 struct rcu_node *rnp)
1636{
1637 unsigned long flags;
1638 int rnp_index = rnp - &rsp->node[0];
1639 struct sched_param sp;
1640 struct task_struct *t;
1641
1642 if (!rcu_scheduler_fully_active ||
1643 rnp->qsmaskinit == 0)
1644 return 0;
1645 if (rnp->node_kthread_task == NULL) {
1646 t = kthread_create(rcu_node_kthread, (void *)rnp,
1647 "rcun%d", rnp_index);
1648 if (IS_ERR(t))
1649 return PTR_ERR(t);
1650 raw_spin_lock_irqsave(&rnp->lock, flags);
1651 rnp->node_kthread_task = t;
1652 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1653 sp.sched_priority = 99;
1654 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1655 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1656 }
1657 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1658}
1659
1660/*
1661 * Spawn all kthreads -- called as soon as the scheduler is running.
1662 */
1663static int __init rcu_spawn_kthreads(void)
1664{
1665 int cpu;
1666 struct rcu_node *rnp;
1667
1668 rcu_scheduler_fully_active = 1;
1669 for_each_possible_cpu(cpu) {
1670 per_cpu(rcu_cpu_has_work, cpu) = 0;
1671 if (cpu_online(cpu))
1672 (void)rcu_spawn_one_cpu_kthread(cpu);
1673 }
1674 rnp = rcu_get_root(rcu_state);
1675 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1676 if (NUM_RCU_NODES > 1) {
1677 rcu_for_each_leaf_node(rcu_state, rnp)
1678 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1679 }
1680 return 0;
1681}
1682early_initcall(rcu_spawn_kthreads);
1683
1684static void __cpuinit rcu_prepare_kthreads(int cpu)
1685{
1686 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1687 struct rcu_node *rnp = rdp->mynode;
1688
1689 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1690 if (rcu_scheduler_fully_active) {
1691 (void)rcu_spawn_one_cpu_kthread(cpu);
1692 if (rnp->node_kthread_task == NULL)
1693 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1694 }
1695}
1696
1697#else /* #ifdef CONFIG_RCU_BOOST */
1698
1699static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1700{
1701 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1702}
1703
1704static void invoke_rcu_callbacks_kthread(void)
1705{
1706 WARN_ON_ONCE(1);
1707}
1708
1709static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1710{
1711}
1712
1713#ifdef CONFIG_HOTPLUG_CPU
1714
1715static void rcu_stop_cpu_kthread(int cpu)
1716{
1717}
1718
1719#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1720
1721static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1722{
1723}
1724
1725static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1726{
1727}
1728
1729static int __init rcu_scheduler_really_started(void)
1730{
1731 rcu_scheduler_fully_active = 1;
1732 return 0;
1733}
1734early_initcall(rcu_scheduler_really_started);
1735
1736static void __cpuinit rcu_prepare_kthreads(int cpu)
1737{
1738}
1739
1740#endif /* #else #ifdef CONFIG_RCU_BOOST */
1741
1018#ifndef CONFIG_SMP 1742#ifndef CONFIG_SMP
1019 1743
1020void synchronize_sched_expedited(void) 1744void synchronize_sched_expedited(void)
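
The boost machinery added in the hunk above hinges on rt_mutex priority inheritance: rcu_boost() fabricates a proxy-locked rt_mutex that appears to be held by the blocked reader, then acquires it, so the reader runs at the booster kthread's priority until __rcu_read_unlock() drops the mutex. The user-space C sketch below is only an analogy under stated assumptions, not the kernel mechanism: it cannot proxy-lock a mutex on another task's behalf, so it shows the same priority-lending effect with a PTHREAD_PRIO_INHERIT mutex, and the thread roles and delays are invented for illustration.

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical stand-in for a task blocked in an RCU read-side section. */
static pthread_mutex_t boost_lock;

static void *reader(void *unused)
{
        (void)unused;
        pthread_mutex_lock(&boost_lock);    /* "enter" the critical section */
        sleep(1);                           /* linger, as a preempted reader would */
        pthread_mutex_unlock(&boost_lock);  /* analogue of the unboost step in
                                             * rcu_read_unlock_special() */
        return NULL;
}

int main(void)
{
        pthread_mutexattr_t attr;
        pthread_t t;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
        pthread_mutex_init(&boost_lock, &attr);

        pthread_create(&t, NULL, reader, NULL);
        usleep(100 * 1000);                 /* let the reader grab the lock */

        /*
         * If this thread runs SCHED_FIFO (needs privileges), blocking here
         * lends its priority to the reader until the reader unlocks, which
         * is the effect rt_mutex_lock(&mtx) has in rcu_boost() above.
         */
        pthread_mutex_lock(&boost_lock);
        pthread_mutex_unlock(&boost_lock);

        pthread_join(t, NULL);
        printf("reader finished; boost (if any) has been dropped\n");
        return 0;
}

Build with "gcc pi_boost.c -lpthread"; without real-time privileges the program still runs, it just never gets an actual priority lift to lend.
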
@@ -1187,14 +1911,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1187 * 1911 *
1188 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1912 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1189 * disabled, we do one pass of force_quiescent_state(), then do a 1913 * disabled, we do one pass of force_quiescent_state(), then do a
1190 * raise_softirq() to cause rcu_process_callbacks() to be invoked later. 1914 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1191 * The per-cpu rcu_dyntick_drain variable controls the sequencing. 1915 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1192 */ 1916 */
1193int rcu_needs_cpu(int cpu) 1917int rcu_needs_cpu(int cpu)
1194{ 1918{
1195 int c = 0; 1919 int c = 0;
1196 int snap; 1920 int snap;
1197 int snap_nmi;
1198 int thatcpu; 1921 int thatcpu;
1199 1922
1200 /* Check for being in the holdoff period. */ 1923 /* Check for being in the holdoff period. */
@@ -1205,10 +1928,10 @@ int rcu_needs_cpu(int cpu)
1205 for_each_online_cpu(thatcpu) { 1928 for_each_online_cpu(thatcpu) {
1206 if (thatcpu == cpu) 1929 if (thatcpu == cpu)
1207 continue; 1930 continue;
1208 snap = per_cpu(rcu_dynticks, thatcpu).dynticks; 1931 snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1209 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; 1932 thatcpu).dynticks);
1210 smp_mb(); /* Order sampling of snap with end of grace period. */ 1933 smp_mb(); /* Order sampling of snap with end of grace period. */
1211 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { 1934 if ((snap & 0x1) != 0) {
1212 per_cpu(rcu_dyntick_drain, cpu) = 0; 1935 per_cpu(rcu_dyntick_drain, cpu) = 0;
1213 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1936 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1214 return rcu_needs_cpu_quick_check(cpu); 1937 return rcu_needs_cpu_quick_check(cpu);
@@ -1239,7 +1962,7 @@ int rcu_needs_cpu(int cpu)
1239 1962
1240 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1963 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1241 if (c) 1964 if (c)
1242 raise_softirq(RCU_SOFTIRQ); 1965 invoke_rcu_core();
1243 return c; 1966 return c;
1244} 1967}
1245 1968
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c8e97853b970..4e144876dc68 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,22 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#ifdef CONFIG_RCU_BOOST
50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
56static char convert_kthread_status(unsigned int kthread_status)
57{
58 if (kthread_status > RCU_KTHREAD_MAX)
59 return '?';
60 return "SRWOY"[kthread_status];
61}
62
63#endif /* #ifdef CONFIG_RCU_BOOST */
64
49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 65static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 66{
51 if (!rdp->beenonline) 67 if (!rdp->beenonline)
@@ -57,14 +73,31 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
57 rdp->passed_quiesc, rdp->passed_quiesc_completed, 73 rdp->passed_quiesc, rdp->passed_quiesc_completed,
58 rdp->qs_pending); 74 rdp->qs_pending);
59#ifdef CONFIG_NO_HZ 75#ifdef CONFIG_NO_HZ
60 seq_printf(m, " dt=%d/%d dn=%d df=%lu", 76 seq_printf(m, " dt=%d/%d/%d df=%lu",
61 rdp->dynticks->dynticks, 77 atomic_read(&rdp->dynticks->dynticks),
62 rdp->dynticks->dynticks_nesting, 78 rdp->dynticks->dynticks_nesting,
63 rdp->dynticks->dynticks_nmi, 79 rdp->dynticks->dynticks_nmi_nesting,
64 rdp->dynticks_fqs); 80 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 81#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 82 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); 83 seq_printf(m, " ql=%ld qs=%c%c%c%c",
84 rdp->qlen,
85 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
86 rdp->nxttail[RCU_NEXT_TAIL]],
87 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
88 rdp->nxttail[RCU_NEXT_READY_TAIL]],
89 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
90 rdp->nxttail[RCU_WAIT_TAIL]],
91 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
92#ifdef CONFIG_RCU_BOOST
93 seq_printf(m, " kt=%d/%c/%d ktl=%x",
94 per_cpu(rcu_cpu_has_work, rdp->cpu),
95 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
96 rdp->cpu)),
97 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
98 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
99#endif /* #ifdef CONFIG_RCU_BOOST */
100 seq_printf(m, " b=%ld", rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 101 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 102 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
70} 103}
@@ -115,13 +148,27 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
115 rdp->qs_pending); 148 rdp->qs_pending);
116#ifdef CONFIG_NO_HZ 149#ifdef CONFIG_NO_HZ
117 seq_printf(m, ",%d,%d,%d,%lu", 150 seq_printf(m, ",%d,%d,%d,%lu",
118 rdp->dynticks->dynticks, 151 atomic_read(&rdp->dynticks->dynticks),
119 rdp->dynticks->dynticks_nesting, 152 rdp->dynticks->dynticks_nesting,
120 rdp->dynticks->dynticks_nmi, 153 rdp->dynticks->dynticks_nmi_nesting,
121 rdp->dynticks_fqs); 154 rdp->dynticks_fqs);
122#endif /* #ifdef CONFIG_NO_HZ */ 155#endif /* #ifdef CONFIG_NO_HZ */
123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 156 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); 157 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
158 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
159 rdp->nxttail[RCU_NEXT_TAIL]],
160 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
161 rdp->nxttail[RCU_NEXT_READY_TAIL]],
162 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
163 rdp->nxttail[RCU_WAIT_TAIL]],
164 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
165#ifdef CONFIG_RCU_BOOST
166 seq_printf(m, ",%d,\"%c\"",
167 per_cpu(rcu_cpu_has_work, rdp->cpu),
168 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
169 rdp->cpu)));
170#endif /* #ifdef CONFIG_RCU_BOOST */
171 seq_printf(m, ",%ld", rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n", 172 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 173 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
127} 174}
@@ -130,9 +177,13 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
130{ 177{
131 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
132#ifdef CONFIG_NO_HZ 179#ifdef CONFIG_NO_HZ
133 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
134#endif /* #ifdef CONFIG_NO_HZ */ 181#endif /* #ifdef CONFIG_NO_HZ */
135 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); 182 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
183#ifdef CONFIG_RCU_BOOST
184 seq_puts(m, "\"kt\",\"ktl\"");
185#endif /* #ifdef CONFIG_RCU_BOOST */
186 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
136#ifdef CONFIG_TREE_PREEMPT_RCU 187#ifdef CONFIG_TREE_PREEMPT_RCU
137 seq_puts(m, "\"rcu_preempt:\"\n"); 188 seq_puts(m, "\"rcu_preempt:\"\n");
138 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 189 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -157,11 +208,76 @@ static const struct file_operations rcudata_csv_fops = {
157 .release = single_release, 208 .release = single_release,
158}; 209};
159 210
211#ifdef CONFIG_RCU_BOOST
212
213static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
214{
215 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
216 "j=%04x bt=%04x\n",
217 rnp->grplo, rnp->grphi,
218 "T."[list_empty(&rnp->blkd_tasks)],
219 "N."[!rnp->gp_tasks],
220 "E."[!rnp->exp_tasks],
221 "B."[!rnp->boost_tasks],
222 convert_kthread_status(rnp->boost_kthread_status),
223 rnp->n_tasks_boosted, rnp->n_exp_boosts,
224 rnp->n_normal_boosts,
225 (int)(jiffies & 0xffff),
226 (int)(rnp->boost_time & 0xffff));
227 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
228 " balk",
229 rnp->n_balk_blkd_tasks,
230 rnp->n_balk_exp_gp_tasks,
231 rnp->n_balk_boost_tasks,
232 rnp->n_balk_notblocked,
233 rnp->n_balk_notyet,
234 rnp->n_balk_nos);
235}
236
237static int show_rcu_node_boost(struct seq_file *m, void *unused)
238{
239 struct rcu_node *rnp;
240
241 rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
242 print_one_rcu_node_boost(m, rnp);
243 return 0;
244}
245
246static int rcu_node_boost_open(struct inode *inode, struct file *file)
247{
248 return single_open(file, show_rcu_node_boost, NULL);
249}
250
251static const struct file_operations rcu_node_boost_fops = {
252 .owner = THIS_MODULE,
253 .open = rcu_node_boost_open,
254 .read = seq_read,
255 .llseek = seq_lseek,
256 .release = single_release,
257};
258
259/*
260 * Create the rcuboost debugfs entry. Standard error return.
261 */
262static int rcu_boost_trace_create_file(struct dentry *rcudir)
263{
264 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
265 &rcu_node_boost_fops);
266}
267
268#else /* #ifdef CONFIG_RCU_BOOST */
269
270static int rcu_boost_trace_create_file(struct dentry *rcudir)
271{
272 return 0; /* There cannot be an error if we didn't create it! */
273}
274
275#endif /* #else #ifdef CONFIG_RCU_BOOST */
276
160static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 277static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
161{ 278{
162 unsigned long gpnum; 279 unsigned long gpnum;
163 int level = 0; 280 int level = 0;
164 int phase;
165 struct rcu_node *rnp; 281 struct rcu_node *rnp;
166 282
167 gpnum = rsp->gpnum; 283 gpnum = rsp->gpnum;
@@ -178,13 +294,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
178 seq_puts(m, "\n"); 294 seq_puts(m, "\n");
179 level = rnp->level; 295 level = rnp->level;
180 } 296 }
181 phase = gpnum & 0x1; 297 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
182 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
183 rnp->qsmask, rnp->qsmaskinit, 298 rnp->qsmask, rnp->qsmaskinit,
184 "T."[list_empty(&rnp->blocked_tasks[phase])], 299 ".G"[rnp->gp_tasks != NULL],
185 "E."[list_empty(&rnp->blocked_tasks[phase + 2])], 300 ".E"[rnp->exp_tasks != NULL],
186 "T."[list_empty(&rnp->blocked_tasks[!phase])], 301 ".T"[!list_empty(&rnp->blkd_tasks)],
187 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
188 rnp->grplo, rnp->grphi, rnp->grpnum); 302 rnp->grplo, rnp->grphi, rnp->grpnum);
189 } 303 }
190 seq_puts(m, "\n"); 304 seq_puts(m, "\n");
@@ -216,16 +330,35 @@ static const struct file_operations rcuhier_fops = {
216 .release = single_release, 330 .release = single_release,
217}; 331};
218 332
333static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
334{
335 unsigned long flags;
336 unsigned long completed;
337 unsigned long gpnum;
338 unsigned long gpage;
339 unsigned long gpmax;
340 struct rcu_node *rnp = &rsp->node[0];
341
342 raw_spin_lock_irqsave(&rnp->lock, flags);
343 completed = rsp->completed;
344 gpnum = rsp->gpnum;
345 if (rsp->completed == rsp->gpnum)
346 gpage = 0;
347 else
348 gpage = jiffies - rsp->gp_start;
349 gpmax = rsp->gp_max;
350 raw_spin_unlock_irqrestore(&rnp->lock, flags);
351 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
352 rsp->name, completed, gpnum, gpage, gpmax);
353}
354
219static int show_rcugp(struct seq_file *m, void *unused) 355static int show_rcugp(struct seq_file *m, void *unused)
220{ 356{
221#ifdef CONFIG_TREE_PREEMPT_RCU 357#ifdef CONFIG_TREE_PREEMPT_RCU
222 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", 358 show_one_rcugp(m, &rcu_preempt_state);
223 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
224#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 359#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
225 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", 360 show_one_rcugp(m, &rcu_sched_state);
226 rcu_sched_state.completed, rcu_sched_state.gpnum); 361 show_one_rcugp(m, &rcu_bh_state);
227 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
228 rcu_bh_state.completed, rcu_bh_state.gpnum);
229 return 0; 362 return 0;
230} 363}
231 364
@@ -298,6 +431,29 @@ static const struct file_operations rcu_pending_fops = {
298 .release = single_release, 431 .release = single_release,
299}; 432};
300 433
434static int show_rcutorture(struct seq_file *m, void *unused)
435{
436 seq_printf(m, "rcutorture test sequence: %lu %s\n",
437 rcutorture_testseq >> 1,
438 (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
439 seq_printf(m, "rcutorture update version number: %lu\n",
440 rcutorture_vernum);
441 return 0;
442}
443
444static int rcutorture_open(struct inode *inode, struct file *file)
445{
446 return single_open(file, show_rcutorture, NULL);
447}
448
449static const struct file_operations rcutorture_fops = {
450 .owner = THIS_MODULE,
451 .open = rcutorture_open,
452 .read = seq_read,
453 .llseek = seq_lseek,
454 .release = single_release,
455};
456
301static struct dentry *rcudir; 457static struct dentry *rcudir;
302 458
303static int __init rcutree_trace_init(void) 459static int __init rcutree_trace_init(void)
@@ -318,6 +474,9 @@ static int __init rcutree_trace_init(void)
318 if (!retval) 474 if (!retval)
319 goto free_out; 475 goto free_out;
320 476
477 if (rcu_boost_trace_create_file(rcudir))
478 goto free_out;
479
321 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 480 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
322 if (!retval) 481 if (!retval)
323 goto free_out; 482 goto free_out;
@@ -331,6 +490,11 @@ static int __init rcutree_trace_init(void)
331 NULL, &rcu_pending_fops); 490 NULL, &rcu_pending_fops);
332 if (!retval) 491 if (!retval)
333 goto free_out; 492 goto free_out;
493
494 retval = debugfs_create_file("rcutorture", 0444, rcudir,
495 NULL, &rcutorture_fops);
496 if (!retval)
497 goto free_out;
334 return 0; 498 return 0;
335free_out: 499free_out:
336 debugfs_remove_recursive(rcudir); 500 debugfs_remove_recursive(rcudir);
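
The rcutree_trace.c hunks above all follow one debugfs idiom: a show function is wrapped with single_open(), registered under the shared "rcu" directory, and any creation failure unwinds everything with debugfs_remove_recursive(). A minimal sketch of that idiom in isolation follows; the "demo" directory, file and counter names are hypothetical, not part of the patch.

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static unsigned long demo_events;       /* counter exported read-only */
static struct dentry *demo_dir;

static int demo_show(struct seq_file *m, void *unused)
{
        seq_printf(m, "events: %lu\n", demo_events);
        return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
        return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
        .owner   = THIS_MODULE,
        .open    = demo_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

static int __init demo_init(void)
{
        demo_dir = debugfs_create_dir("demo", NULL);
        if (!demo_dir)
                return -ENOMEM;
        if (!debugfs_create_file("events", 0444, demo_dir, NULL, &demo_fops)) {
                /* same recovery as rcutree_trace_init(): tear it all down */
                debugfs_remove_recursive(demo_dir);
                return -ENOMEM;
        }
        return 0;
}

static void __exit demo_exit(void)
{
        debugfs_remove_recursive(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
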
diff --git a/kernel/resource.c b/kernel/resource.c
index 798e2fae2a06..3ff40178dce7 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,6 +38,14 @@ struct resource iomem_resource = {
38}; 38};
39EXPORT_SYMBOL(iomem_resource); 39EXPORT_SYMBOL(iomem_resource);
40 40
41/* constraints to be met while allocating resources */
42struct resource_constraint {
43 resource_size_t min, max, align;
44 resource_size_t (*alignf)(void *, const struct resource *,
45 resource_size_t, resource_size_t);
46 void *alignf_data;
47};
48
41static DEFINE_RWLOCK(resource_lock); 49static DEFINE_RWLOCK(resource_lock);
42 50
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 51static void *r_next(struct seq_file *m, void *v, loff_t *pos)
@@ -384,16 +392,13 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
384} 392}
385 393
386/* 394/*
387 * Find empty slot in the resource tree given range and alignment. 395 * Find empty slot in the resource tree with the given range and
396 * alignment constraints
388 */ 397 */
389static int find_resource(struct resource *root, struct resource *new, 398static int __find_resource(struct resource *root, struct resource *old,
390 resource_size_t size, resource_size_t min, 399 struct resource *new,
391 resource_size_t max, resource_size_t align, 400 resource_size_t size,
392 resource_size_t (*alignf)(void *, 401 struct resource_constraint *constraint)
393 const struct resource *,
394 resource_size_t,
395 resource_size_t),
396 void *alignf_data)
397{ 402{
398 struct resource *this = root->child; 403 struct resource *this = root->child;
399 struct resource tmp = *new, avail, alloc; 404 struct resource tmp = *new, avail, alloc;
@@ -404,25 +409,26 @@ static int find_resource(struct resource *root, struct resource *new,
404 * Skip past an allocated resource that starts at 0, since the assignment 409 * Skip past an allocated resource that starts at 0, since the assignment
405 * of this->start - 1 to tmp->end below would cause an underflow. 410 * of this->start - 1 to tmp->end below would cause an underflow.
406 */ 411 */
407 if (this && this->start == 0) { 412 if (this && this->start == root->start) {
408 tmp.start = this->end + 1; 413 tmp.start = (this == old) ? old->start : this->end + 1;
409 this = this->sibling; 414 this = this->sibling;
410 } 415 }
411 for(;;) { 416 for(;;) {
412 if (this) 417 if (this)
413 tmp.end = this->start - 1; 418 tmp.end = (this == old) ? this->end : this->start - 1;
414 else 419 else
415 tmp.end = root->end; 420 tmp.end = root->end;
416 421
417 resource_clip(&tmp, min, max); 422 resource_clip(&tmp, constraint->min, constraint->max);
418 arch_remove_reservations(&tmp); 423 arch_remove_reservations(&tmp);
419 424
420 /* Check for overflow after ALIGN() */ 425 /* Check for overflow after ALIGN() */
421 avail = *new; 426 avail = *new;
422 avail.start = ALIGN(tmp.start, align); 427 avail.start = ALIGN(tmp.start, constraint->align);
423 avail.end = tmp.end; 428 avail.end = tmp.end;
424 if (avail.start >= tmp.start) { 429 if (avail.start >= tmp.start) {
425 alloc.start = alignf(alignf_data, &avail, size, align); 430 alloc.start = constraint->alignf(constraint->alignf_data, &avail,
431 size, constraint->align);
426 alloc.end = alloc.start + size - 1; 432 alloc.end = alloc.start + size - 1;
427 if (resource_contains(&avail, &alloc)) { 433 if (resource_contains(&avail, &alloc)) {
428 new->start = alloc.start; 434 new->start = alloc.start;
@@ -432,14 +438,75 @@ static int find_resource(struct resource *root, struct resource *new,
432 } 438 }
433 if (!this) 439 if (!this)
434 break; 440 break;
435 tmp.start = this->end + 1; 441 if (this != old)
442 tmp.start = this->end + 1;
436 this = this->sibling; 443 this = this->sibling;
437 } 444 }
438 return -EBUSY; 445 return -EBUSY;
439} 446}
440 447
448/*
449 * Find empty slot in the resource tree given range and alignment.
450 */
451static int find_resource(struct resource *root, struct resource *new,
452 resource_size_t size,
453 struct resource_constraint *constraint)
454{
455 return __find_resource(root, NULL, new, size, constraint);
456}
457
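
The "this == old" special cases in __find_resource() are what make in-place growth possible: while walking the sibling list, the resource being reallocated is treated as if its own range were free, so the returned window may overlap its current slot. A standalone toy in plain C (sorted 1-D integer intervals, no alignment or clipping, nothing from the kernel API) shows the same trick:

#include <stdio.h>

struct ival { long start, end; };       /* inclusive, sorted, non-overlapping */

/*
 * Find a gap of 'size' inside [root_start, root_end], treating the
 * interval at index 'old' (none if old < 0) as if it were free --
 * the role the 'this == old' tests play in __find_resource().
 */
static int find_gap(const struct ival *v, int n, int old,
                    long root_start, long root_end, long size,
                    long *out_start)
{
        long start = root_start;

        for (int i = 0; i <= n; i++) {
                long end = (i == n) ? root_end
                                    : (i == old) ? v[i].end : v[i].start - 1;

                if (end - start + 1 >= size) {
                        *out_start = start;
                        return 0;
                }
                if (i < n && i != old)  /* keep the old slot in the window */
                        start = v[i].end + 1;
        }
        return -1;                      /* -EBUSY in the kernel version */
}

int main(void)
{
        struct ival v[] = { { 0, 99 }, { 100, 199 }, { 300, 399 } };
        long s;

        /* a fresh 150-unit region does not fit anywhere in [0, 399] ... */
        printf("fresh allocation: %d\n", find_gap(v, 3, -1, 0, 399, 150, &s));
        /* ... but v[1] can grow to 150 in place, into the gap behind it */
        if (find_gap(v, 3, 1, 0, 399, 150, &s) == 0)
                printf("reallocated slot starts at %ld\n", s);
        return 0;
}
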
441/** 458/**
442 * allocate_resource - allocate empty slot in the resource tree given range & alignment 459 * reallocate_resource - allocate a slot in the resource tree given range & alignment.
460 * The resource will be relocated if the new size cannot be reallocated in the
461 * current location.
462 *
463 * @root: root resource descriptor
464 * @old: resource descriptor desired by caller
465 * @newsize: new size of the resource descriptor
466 * @constraint: the size and alignment constraints to be met.
467 */
468int reallocate_resource(struct resource *root, struct resource *old,
469 resource_size_t newsize,
470 struct resource_constraint *constraint)
471{
472 int err=0;
473 struct resource new = *old;
474 struct resource *conflict;
475
476 write_lock(&resource_lock);
477
478 if ((err = __find_resource(root, old, &new, newsize, constraint)))
479 goto out;
480
481 if (resource_contains(&new, old)) {
482 old->start = new.start;
483 old->end = new.end;
484 goto out;
485 }
486
487 if (old->child) {
488 err = -EBUSY;
489 goto out;
490 }
491
492 if (resource_contains(old, &new)) {
493 old->start = new.start;
494 old->end = new.end;
495 } else {
496 __release_resource(old);
497 *old = new;
498 conflict = __request_resource(root, old);
499 BUG_ON(conflict);
500 }
501out:
502 write_unlock(&resource_lock);
503 return err;
504}
505
506
507/**
508 * allocate_resource - allocate empty slot in the resource tree given range & alignment.
509 * The resource will be reallocated with a new size if it was already allocated
443 * @root: root resource descriptor 510 * @root: root resource descriptor
444 * @new: resource descriptor desired by caller 511 * @new: resource descriptor desired by caller
445 * @size: requested resource region size 512 * @size: requested resource region size
@@ -459,12 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new,
459 void *alignf_data) 526 void *alignf_data)
460{ 527{
461 int err; 528 int err;
529 struct resource_constraint constraint;
462 530
463 if (!alignf) 531 if (!alignf)
464 alignf = simple_align_resource; 532 alignf = simple_align_resource;
465 533
534 constraint.min = min;
535 constraint.max = max;
536 constraint.align = align;
537 constraint.alignf = alignf;
538 constraint.alignf_data = alignf_data;
539
540 if ( new->parent ) {
541 /* resource is already allocated, try reallocating with
542 the new constraints */
543 return reallocate_resource(root, new, size, &constraint);
544 }
545
466 write_lock(&resource_lock); 546 write_lock(&resource_lock);
467 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 547 err = find_resource(root, new, size, &constraint);
468 if (err >= 0 && __request_resource(root, new)) 548 if (err >= 0 && __request_resource(root, new))
469 err = -EBUSY; 549 err = -EBUSY;
470 write_unlock(&resource_lock); 550 write_unlock(&resource_lock);
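
The upshot for callers is that allocate_resource() keeps its old eight-argument interface, but a resource that already has a parent is transparently resized via reallocate_resource(): it grows in place when the space behind it is free, and is relocated otherwise (unless it has children). A hedged caller-side sketch; the resource, function name and sizes below are illustrative, not from the patch:

#include <linux/ioport.h>

static struct resource demo_window = {
        .name  = "demo-window",
        .flags = IORESOURCE_MEM,
};

static int demo_grow_window(void)
{
        int ret;

        /* first call: no parent yet, so an empty 1 MB slot is found */
        ret = allocate_resource(&iomem_resource, &demo_window, 0x100000,
                                0, (resource_size_t)-1, 0x100000, NULL, NULL);
        if (ret)
                return ret;

        /*
         * second call: demo_window.parent is now set, so allocate_resource()
         * takes the reallocate_resource() path and the region is resized
         * or moved under the same constraints.
         */
        return allocate_resource(&iomem_resource, &demo_window, 0x200000,
                                 0, (resource_size_t)-1, 0x100000, NULL, NULL);
}
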
diff --git a/kernel/sched.c b/kernel/sched.c
index 312f8b95c2d4..3dc716f6d8ad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
231#endif 231#endif
232 232
233/* 233/*
234 * sched_domains_mutex serializes calls to arch_init_sched_domains, 234 * sched_domains_mutex serializes calls to init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
236 */ 236 */
237static DEFINE_MUTEX(sched_domains_mutex); 237static DEFINE_MUTEX(sched_domains_mutex);
@@ -292,7 +292,7 @@ static DEFINE_SPINLOCK(task_group_lock);
292 * (The default weight is 1024 - so there's no practical 292 * (The default weight is 1024 - so there's no practical
293 * limitation from this.) 293 * limitation from this.)
294 */ 294 */
295#define MIN_SHARES 2 295#define MIN_SHARES (1UL << 1)
296#define MAX_SHARES (1UL << 18) 296#define MAX_SHARES (1UL << 18)
297 297
298static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 298static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
@@ -312,6 +312,9 @@ struct cfs_rq {
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315#ifndef CONFIG_64BIT
316 u64 min_vruntime_copy;
317#endif
315 318
316 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
317 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
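
The new min_vruntime_copy field exists only on 32-bit builds, where a u64 load can tear; the companion sched_fair.c changes in this series appear to use it as a retry-until-consistent mirror so the lockless wakeup path introduced below can sample min_vruntime without rq->lock. Roughly, the idiom looks like this sketch (a paraphrase of the pattern, not the verbatim patch):

/* writer side, rq->lock held, after computing the new minimum */
static inline void set_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
{
        cfs_rq->min_vruntime = vruntime;
#ifndef CONFIG_64BIT
        smp_wmb();
        cfs_rq->min_vruntime_copy = vruntime;
#endif
}

/* lockless reader, e.g. on the remote wakeup path */
static inline u64 get_min_vruntime(struct cfs_rq *cfs_rq)
{
#ifndef CONFIG_64BIT
        u64 vruntime, copy;

        do {
                copy = cfs_rq->min_vruntime_copy;
                smp_rmb();
                vruntime = cfs_rq->min_vruntime;
        } while (vruntime != copy);

        return vruntime;
#else
        return cfs_rq->min_vruntime;
#endif
}
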
@@ -325,7 +328,9 @@ struct cfs_rq {
325 */ 328 */
326 struct sched_entity *curr, *next, *last, *skip; 329 struct sched_entity *curr, *next, *last, *skip;
327 330
331#ifdef CONFIG_SCHED_DEBUG
328 unsigned int nr_spread_over; 332 unsigned int nr_spread_over;
333#endif
329 334
330#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
331 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -417,6 +422,7 @@ struct rt_rq {
417 */ 422 */
418struct root_domain { 423struct root_domain {
419 atomic_t refcount; 424 atomic_t refcount;
425 struct rcu_head rcu;
420 cpumask_var_t span; 426 cpumask_var_t span;
421 cpumask_var_t online; 427 cpumask_var_t online;
422 428
@@ -460,7 +466,7 @@ struct rq {
460 u64 nohz_stamp; 466 u64 nohz_stamp;
461 unsigned char nohz_balance_kick; 467 unsigned char nohz_balance_kick;
462#endif 468#endif
463 unsigned int skip_clock_update; 469 int skip_clock_update;
464 470
465 /* capture load from *all* tasks on this cpu: */ 471 /* capture load from *all* tasks on this cpu: */
466 struct load_weight load; 472 struct load_weight load;
@@ -553,6 +559,10 @@ struct rq {
553 unsigned int ttwu_count; 559 unsigned int ttwu_count;
554 unsigned int ttwu_local; 560 unsigned int ttwu_local;
555#endif 561#endif
562
563#ifdef CONFIG_SMP
564 struct task_struct *wake_list;
565#endif
556}; 566};
557 567
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 568static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -571,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
571 581
572#define rcu_dereference_check_sched_domain(p) \ 582#define rcu_dereference_check_sched_domain(p) \
573 rcu_dereference_check((p), \ 583 rcu_dereference_check((p), \
574 rcu_read_lock_sched_held() || \ 584 rcu_read_lock_held() || \
575 lockdep_is_held(&sched_domains_mutex)) 585 lockdep_is_held(&sched_domains_mutex))
576 586
577/* 587/*
@@ -595,10 +605,10 @@ static inline int cpu_of(struct rq *rq)
595/* 605/*
596 * Return the group to which this task belongs. 606 * Return the group to which this task belongs.
597 * 607 *
598 * We use task_subsys_state_check() and extend the RCU verification 608 * We use task_subsys_state_check() and extend the RCU verification with
599 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 609 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
600 * holds that lock for each task it moves into the cgroup. Therefore 610 * task it moves into the cgroup. Therefore by holding either of those locks,
601 * by holding that lock, we pin the task to the current cgroup. 611 * we pin the task to the current cgroup.
602 */ 612 */
603static inline struct task_group *task_group(struct task_struct *p) 613static inline struct task_group *task_group(struct task_struct *p)
604{ 614{
@@ -606,6 +616,7 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct cgroup_subsys_state *css; 616 struct cgroup_subsys_state *css;
607 617
608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
619 lockdep_is_held(&p->pi_lock) ||
609 lockdep_is_held(&task_rq(p)->lock)); 620 lockdep_is_held(&task_rq(p)->lock));
610 tg = container_of(css, struct task_group, css); 621 tg = container_of(css, struct task_group, css);
611 622
@@ -642,7 +653,7 @@ static void update_rq_clock(struct rq *rq)
642{ 653{
643 s64 delta; 654 s64 delta;
644 655
645 if (rq->skip_clock_update) 656 if (rq->skip_clock_update > 0)
646 return; 657 return;
647 658
648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 659 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -838,18 +849,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
838 return rq->curr == p; 849 return rq->curr == p;
839} 850}
840 851
841#ifndef __ARCH_WANT_UNLOCKED_CTXSW
842static inline int task_running(struct rq *rq, struct task_struct *p) 852static inline int task_running(struct rq *rq, struct task_struct *p)
843{ 853{
854#ifdef CONFIG_SMP
855 return p->on_cpu;
856#else
844 return task_current(rq, p); 857 return task_current(rq, p);
858#endif
845} 859}
846 860
861#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 862static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
848{ 863{
864#ifdef CONFIG_SMP
865 /*
866 * We can optimise this out completely for !SMP, because the
867 * SMP rebalancing from interrupt is the only thing that cares
868 * here.
869 */
870 next->on_cpu = 1;
871#endif
849} 872}
850 873
851static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 874static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
852{ 875{
876#ifdef CONFIG_SMP
877 /*
878 * After ->on_cpu is cleared, the task can be moved to a different CPU.
879 * We must ensure this doesn't happen until the switch is completely
880 * finished.
881 */
882 smp_wmb();
883 prev->on_cpu = 0;
884#endif
853#ifdef CONFIG_DEBUG_SPINLOCK 885#ifdef CONFIG_DEBUG_SPINLOCK
854 /* this is a valid case when another task releases the spinlock */ 886 /* this is a valid case when another task releases the spinlock */
855 rq->lock.owner = current; 887 rq->lock.owner = current;
@@ -865,15 +897,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
865} 897}
866 898
867#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 899#else /* __ARCH_WANT_UNLOCKED_CTXSW */
868static inline int task_running(struct rq *rq, struct task_struct *p)
869{
870#ifdef CONFIG_SMP
871 return p->oncpu;
872#else
873 return task_current(rq, p);
874#endif
875}
876
877static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 900static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
878{ 901{
879#ifdef CONFIG_SMP 902#ifdef CONFIG_SMP
@@ -882,7 +905,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
882 * SMP rebalancing from interrupt is the only thing that cares 905 * SMP rebalancing from interrupt is the only thing that cares
883 * here. 906 * here.
884 */ 907 */
885 next->oncpu = 1; 908 next->on_cpu = 1;
886#endif 909#endif
887#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 910#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
888 raw_spin_unlock_irq(&rq->lock); 911 raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +918,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
895{ 918{
896#ifdef CONFIG_SMP 919#ifdef CONFIG_SMP
897 /* 920 /*
898 * After ->oncpu is cleared, the task can be moved to a different CPU. 921 * After ->on_cpu is cleared, the task can be moved to a different CPU.
899 * We must ensure this doesn't happen until the switch is completely 922 * We must ensure this doesn't happen until the switch is completely
900 * finished. 923 * finished.
901 */ 924 */
902 smp_wmb(); 925 smp_wmb();
903 prev->oncpu = 0; 926 prev->on_cpu = 0;
904#endif 927#endif
905#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 928#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
906 local_irq_enable(); 929 local_irq_enable();
@@ -909,23 +932,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
909#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 932#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
910 933
911/* 934/*
912 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 935 * __task_rq_lock - lock the rq @p resides on.
913 * against ttwu().
914 */
915static inline int task_is_waking(struct task_struct *p)
916{
917 return unlikely(p->state == TASK_WAKING);
918}
919
920/*
921 * __task_rq_lock - lock the runqueue a given task resides on.
922 * Must be called interrupts disabled.
923 */ 936 */
924static inline struct rq *__task_rq_lock(struct task_struct *p) 937static inline struct rq *__task_rq_lock(struct task_struct *p)
925 __acquires(rq->lock) 938 __acquires(rq->lock)
926{ 939{
927 struct rq *rq; 940 struct rq *rq;
928 941
942 lockdep_assert_held(&p->pi_lock);
943
929 for (;;) { 944 for (;;) {
930 rq = task_rq(p); 945 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 946 raw_spin_lock(&rq->lock);
@@ -936,22 +951,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936} 951}
937 952
938/* 953/*
939 * task_rq_lock - lock the runqueue a given task resides on and disable 954 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
940 * interrupts. Note the ordering: we can safely lookup the task_rq without
941 * explicitly disabling preemption.
942 */ 955 */
943static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 956static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
957 __acquires(p->pi_lock)
944 __acquires(rq->lock) 958 __acquires(rq->lock)
945{ 959{
946 struct rq *rq; 960 struct rq *rq;
947 961
948 for (;;) { 962 for (;;) {
949 local_irq_save(*flags); 963 raw_spin_lock_irqsave(&p->pi_lock, *flags);
950 rq = task_rq(p); 964 rq = task_rq(p);
951 raw_spin_lock(&rq->lock); 965 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 966 if (likely(rq == task_rq(p)))
953 return rq; 967 return rq;
954 raw_spin_unlock_irqrestore(&rq->lock, *flags); 968 raw_spin_unlock(&rq->lock);
969 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
955 } 970 }
956} 971}
957 972
@@ -961,10 +976,13 @@ static void __task_rq_unlock(struct rq *rq)
961 raw_spin_unlock(&rq->lock); 976 raw_spin_unlock(&rq->lock);
962} 977}
963 978
964static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 979static inline void
980task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
965 __releases(rq->lock) 981 __releases(rq->lock)
982 __releases(p->pi_lock)
966{ 983{
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 984 raw_spin_unlock(&rq->lock);
985 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
968} 986}
969 987
970/* 988/*
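
task_rq_lock() now nests rq->lock inside p->pi_lock and rechecks task_rq(p) after acquiring it, because the task may have migrated between the snapshot and the lock. The shape of that loop, take the stable per-object lock, lock the container it currently points at, revalidate, otherwise drop both and retry, is generic; here is a userspace pthread analogue with made-up types (not the scheduler's):

#include <pthread.h>

struct queue {
        pthread_mutex_t lock;
        /* ... queue contents ... */
};

struct item {
        pthread_mutex_t pi_lock;        /* stable, per-item lock             */
        struct queue *q;                /* current queue; changed only while */
                                        /* holding the relevant locks        */
};

/* lock the item and whatever queue it currently sits on, in canonical order */
static struct queue *item_queue_lock(struct item *it)
{
        struct queue *q;

        for (;;) {
                pthread_mutex_lock(&it->pi_lock);
                q = it->q;                      /* snapshot                  */
                pthread_mutex_lock(&q->lock);
                if (q == it->q)                 /* still on that queue?      */
                        return q;               /* success: both locks held  */
                pthread_mutex_unlock(&q->lock); /* migrated meanwhile: retry */
                pthread_mutex_unlock(&it->pi_lock);
        }
}

static void item_queue_unlock(struct item *it, struct queue *q)
{
        pthread_mutex_unlock(&q->lock);
        pthread_mutex_unlock(&it->pi_lock);
}
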
@@ -1193,11 +1211,17 @@ int get_nohz_timer_target(void)
1193 int i; 1211 int i;
1194 struct sched_domain *sd; 1212 struct sched_domain *sd;
1195 1213
1214 rcu_read_lock();
1196 for_each_domain(cpu, sd) { 1215 for_each_domain(cpu, sd) {
1197 for_each_cpu(i, sched_domain_span(sd)) 1216 for_each_cpu(i, sched_domain_span(sd)) {
1198 if (!idle_cpu(i)) 1217 if (!idle_cpu(i)) {
1199 return i; 1218 cpu = i;
1219 goto unlock;
1220 }
1221 }
1200 } 1222 }
1223unlock:
1224 rcu_read_unlock();
1201 return cpu; 1225 return cpu;
1202} 1226}
1203/* 1227/*
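
get_nohz_timer_target() and several later hunks now bracket the sched-domain walk with rcu_read_lock()/rcu_read_unlock(), and root_domain gains a struct rcu_head above, which suggests these structures are now freed only after an RCU grace period. For reference, a generic kernel-style sketch of RCU-protected pointer chasing; the struct cfg example is hypothetical, only the RCU API calls are real:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
        int value;
        struct rcu_head rcu;
};

static struct cfg __rcu *cur_cfg;

static int cfg_read_value(void)
{
        struct cfg *c;
        int v = -1;

        rcu_read_lock();                /* pins whatever we dereference */
        c = rcu_dereference(cur_cfg);
        if (c)
                v = c->value;
        rcu_read_unlock();
        return v;
}

static void cfg_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct cfg, rcu));
}

/* caller serializes updaters, e.g. with a mutex */
static void cfg_replace(struct cfg *newc)
{
        struct cfg *old = rcu_dereference_protected(cur_cfg, 1);

        rcu_assign_pointer(cur_cfg, newc);
        if (old)
                call_rcu(&old->rcu, cfg_free_rcu); /* freed once readers drain */
}
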
@@ -1307,15 +1331,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1307{ 1331{
1308 u64 tmp; 1332 u64 tmp;
1309 1333
1334 /*
1335 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1336 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1337 * 2^SCHED_LOAD_RESOLUTION.
1338 */
1339 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1340 tmp = (u64)delta_exec * scale_load_down(weight);
1341 else
1342 tmp = (u64)delta_exec;
1343
1310 if (!lw->inv_weight) { 1344 if (!lw->inv_weight) {
1311 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1345 unsigned long w = scale_load_down(lw->weight);
1346
1347 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1312 lw->inv_weight = 1; 1348 lw->inv_weight = 1;
1349 else if (unlikely(!w))
1350 lw->inv_weight = WMULT_CONST;
1313 else 1351 else
1314 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1352 lw->inv_weight = WMULT_CONST / w;
1315 / (lw->weight+1);
1316 } 1353 }
1317 1354
1318 tmp = (u64)delta_exec * weight;
1319 /* 1355 /*
1320 * Check whether we'd overflow the 64-bit multiplication: 1356 * Check whether we'd overflow the 64-bit multiplication:
1321 */ 1357 */
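
The rewritten calc_delta_mine() works on scaled weights: with the SCHED_LOAD_RESOLUTION changes elsewhere in this series, weights are stored shifted up for extra precision, scale_load_down() brings them back to the legacy range, and the division delta * weight / lw->weight is performed as a multiply by a cached fixed-point inverse (inv_weight = WMULT_CONST / w, with WMULT_CONST on the order of 2^32). A small userspace demonstration of that arithmetic, using illustrative constants rather than the kernel's exact ones and ignoring the overflow handling:

#include <stdio.h>
#include <stdint.h>

#define DEMO_WMULT_SHIFT        32
#define DEMO_WMULT_CONST        (1ULL << DEMO_WMULT_SHIFT)
#define DEMO_RESOLUTION         10      /* stand-in for SCHED_LOAD_RESOLUTION */

static uint64_t scale_down(uint64_t w) { return w >> DEMO_RESOLUTION; }

/*
 * delta * weight / total_weight, with the division replaced by a
 * multiply with a cached inverse: the calc_delta_mine() idea.
 */
static uint64_t calc_delta(uint64_t delta, uint64_t weight,
                           uint64_t total_weight)
{
        uint64_t w = scale_down(total_weight);
        uint64_t inv = w ? DEMO_WMULT_CONST / w : DEMO_WMULT_CONST;

        return (delta * scale_down(weight) * inv) >> DEMO_WMULT_SHIFT;
}

int main(void)
{
        /*
         * A nice-0 task (legacy weight 1024, scaled by 2^10) on a runqueue
         * whose total weight is three such tasks gets about 1/3 of 6 ms.
         */
        uint64_t nice0 = 1024ULL << DEMO_RESOLUTION;

        printf("%llu ns\n",
               (unsigned long long)calc_delta(6000000, nice0, 3 * nice0));
        return 0;
}
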
@@ -1755,17 +1791,20 @@ static void dec_nr_running(struct rq *rq)
1755 1791
1756static void set_load_weight(struct task_struct *p) 1792static void set_load_weight(struct task_struct *p)
1757{ 1793{
1794 int prio = p->static_prio - MAX_RT_PRIO;
1795 struct load_weight *load = &p->se.load;
1796
1758 /* 1797 /*
1759 * SCHED_IDLE tasks get minimal weight: 1798 * SCHED_IDLE tasks get minimal weight:
1760 */ 1799 */
1761 if (p->policy == SCHED_IDLE) { 1800 if (p->policy == SCHED_IDLE) {
1762 p->se.load.weight = WEIGHT_IDLEPRIO; 1801 load->weight = scale_load(WEIGHT_IDLEPRIO);
1763 p->se.load.inv_weight = WMULT_IDLEPRIO; 1802 load->inv_weight = WMULT_IDLEPRIO;
1764 return; 1803 return;
1765 } 1804 }
1766 1805
1767 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1806 load->weight = scale_load(prio_to_weight[prio]);
1768 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1807 load->inv_weight = prio_to_wmult[prio];
1769} 1808}
1770 1809
1771static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1810static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1773,7 +1812,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1773 update_rq_clock(rq); 1812 update_rq_clock(rq);
1774 sched_info_queued(p); 1813 sched_info_queued(p);
1775 p->sched_class->enqueue_task(rq, p, flags); 1814 p->sched_class->enqueue_task(rq, p, flags);
1776 p->se.on_rq = 1;
1777} 1815}
1778 1816
1779static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1817static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1819,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1781 update_rq_clock(rq); 1819 update_rq_clock(rq);
1782 sched_info_dequeued(p); 1820 sched_info_dequeued(p);
1783 p->sched_class->dequeue_task(rq, p, flags); 1821 p->sched_class->dequeue_task(rq, p, flags);
1784 p->se.on_rq = 0;
1785} 1822}
1786 1823
1787/* 1824/*
@@ -2116,7 +2153,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2116 * A queue event has occurred, and we're going to schedule. In 2153 * A queue event has occurred, and we're going to schedule. In
2117 * this case, we can save a useless back to back clock update. 2154 * this case, we can save a useless back to back clock update.
2118 */ 2155 */
2119 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2156 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2120 rq->skip_clock_update = 1; 2157 rq->skip_clock_update = 1;
2121} 2158}
2122 2159
@@ -2162,6 +2199,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2162 */ 2199 */
2163 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2200 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2164 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2201 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2202
2203#ifdef CONFIG_LOCKDEP
2204 /*
2205 * The caller should hold either p->pi_lock or rq->lock, when changing
2206 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2207 *
2208 * sched_move_task() holds both and thus holding either pins the cgroup,
2209 * see set_task_rq().
2210 *
2211 * Furthermore, all task_rq users should acquire both locks, see
2212 * task_rq_lock().
2213 */
2214 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2215 lockdep_is_held(&task_rq(p)->lock)));
2216#endif
2165#endif 2217#endif
2166 2218
2167 trace_sched_migrate_task(p, new_cpu); 2219 trace_sched_migrate_task(p, new_cpu);
@@ -2182,19 +2234,6 @@ struct migration_arg {
2182static int migration_cpu_stop(void *data); 2234static int migration_cpu_stop(void *data);
2183 2235
2184/* 2236/*
2185 * The task's runqueue lock must be held.
2186 * Returns true if you have to wait for migration thread.
2187 */
2188static bool migrate_task(struct task_struct *p, struct rq *rq)
2189{
2190 /*
2191 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task.
2193 */
2194 return p->se.on_rq || task_running(rq, p);
2195}
2196
2197/*
2198 * wait_task_inactive - wait for a thread to unschedule. 2237 * wait_task_inactive - wait for a thread to unschedule.
2199 * 2238 *
2200 * If @match_state is nonzero, it's the @p->state value just checked and 2239 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2251,11 +2290,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2251 rq = task_rq_lock(p, &flags); 2290 rq = task_rq_lock(p, &flags);
2252 trace_sched_wait_task(p); 2291 trace_sched_wait_task(p);
2253 running = task_running(rq, p); 2292 running = task_running(rq, p);
2254 on_rq = p->se.on_rq; 2293 on_rq = p->on_rq;
2255 ncsw = 0; 2294 ncsw = 0;
2256 if (!match_state || p->state == match_state) 2295 if (!match_state || p->state == match_state)
2257 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2296 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2258 task_rq_unlock(rq, &flags); 2297 task_rq_unlock(rq, p, &flags);
2259 2298
2260 /* 2299 /*
2261 * If it changed from the expected state, bail out now. 2300 * If it changed from the expected state, bail out now.
@@ -2330,7 +2369,7 @@ EXPORT_SYMBOL_GPL(kick_process);
2330 2369
2331#ifdef CONFIG_SMP 2370#ifdef CONFIG_SMP
2332/* 2371/*
2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2372 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2334 */ 2373 */
2335static int select_fallback_rq(int cpu, struct task_struct *p) 2374static int select_fallback_rq(int cpu, struct task_struct *p)
2336{ 2375{
@@ -2363,12 +2402,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2363} 2402}
2364 2403
2365/* 2404/*
2366 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2405 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2367 */ 2406 */
2368static inline 2407static inline
2369int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2408int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2370{ 2409{
2371 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2410 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2372 2411
2373 /* 2412 /*
2374 * In order not to call set_task_cpu() on a blocking task we need 2413 * In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2433,63 @@ static void update_avg(u64 *avg, u64 sample)
2394} 2433}
2395#endif 2434#endif
2396 2435
2397static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2436static void
2398 bool is_sync, bool is_migrate, bool is_local, 2437ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2399 unsigned long en_flags)
2400{ 2438{
2401 schedstat_inc(p, se.statistics.nr_wakeups); 2439#ifdef CONFIG_SCHEDSTATS
2402 if (is_sync) 2440 struct rq *rq = this_rq();
2403 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2441
2404 if (is_migrate) 2442#ifdef CONFIG_SMP
2405 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2443 int this_cpu = smp_processor_id();
2406 if (is_local) 2444
2445 if (cpu == this_cpu) {
2446 schedstat_inc(rq, ttwu_local);
2407 schedstat_inc(p, se.statistics.nr_wakeups_local); 2447 schedstat_inc(p, se.statistics.nr_wakeups_local);
2408 else 2448 } else {
2449 struct sched_domain *sd;
2450
2409 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2451 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2452 rcu_read_lock();
2453 for_each_domain(this_cpu, sd) {
2454 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2455 schedstat_inc(sd, ttwu_wake_remote);
2456 break;
2457 }
2458 }
2459 rcu_read_unlock();
2460 }
2461
2462 if (wake_flags & WF_MIGRATED)
2463 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2464
2465#endif /* CONFIG_SMP */
2466
2467 schedstat_inc(rq, ttwu_count);
2468 schedstat_inc(p, se.statistics.nr_wakeups);
2410 2469
2470 if (wake_flags & WF_SYNC)
2471 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2472
2473#endif /* CONFIG_SCHEDSTATS */
2474}
2475
2476static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2477{
2411 activate_task(rq, p, en_flags); 2478 activate_task(rq, p, en_flags);
2479 p->on_rq = 1;
2480
2481 /* if a worker is waking up, notify workqueue */
2482 if (p->flags & PF_WQ_WORKER)
2483 wq_worker_waking_up(p, cpu_of(rq));
2412} 2484}
2413 2485
2414static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2486/*
2415 int wake_flags, bool success) 2487 * Mark the task runnable and perform wakeup-preemption.
2488 */
2489static void
2490ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2416{ 2491{
2417 trace_sched_wakeup(p, success); 2492 trace_sched_wakeup(p, true);
2418 check_preempt_curr(rq, p, wake_flags); 2493 check_preempt_curr(rq, p, wake_flags);
2419 2494
2420 p->state = TASK_RUNNING; 2495 p->state = TASK_RUNNING;
@@ -2433,9 +2508,119 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2433 rq->idle_stamp = 0; 2508 rq->idle_stamp = 0;
2434 } 2509 }
2435#endif 2510#endif
2436 /* if a worker is waking up, notify workqueue */ 2511}
2437 if ((p->flags & PF_WQ_WORKER) && success) 2512
2438 wq_worker_waking_up(p, cpu_of(rq)); 2513static void
2514ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2515{
2516#ifdef CONFIG_SMP
2517 if (p->sched_contributes_to_load)
2518 rq->nr_uninterruptible--;
2519#endif
2520
2521 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2522 ttwu_do_wakeup(rq, p, wake_flags);
2523}
2524
2525/*
2526 * Called in case the task @p isn't fully descheduled from its runqueue,
2527 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2528 * since all we need to do is flip p->state to TASK_RUNNING, since
2529 * the task is still ->on_rq.
2530 */
2531static int ttwu_remote(struct task_struct *p, int wake_flags)
2532{
2533 struct rq *rq;
2534 int ret = 0;
2535
2536 rq = __task_rq_lock(p);
2537 if (p->on_rq) {
2538 ttwu_do_wakeup(rq, p, wake_flags);
2539 ret = 1;
2540 }
2541 __task_rq_unlock(rq);
2542
2543 return ret;
2544}
2545
2546#ifdef CONFIG_SMP
2547static void sched_ttwu_pending(void)
2548{
2549 struct rq *rq = this_rq();
2550 struct task_struct *list = xchg(&rq->wake_list, NULL);
2551
2552 if (!list)
2553 return;
2554
2555 raw_spin_lock(&rq->lock);
2556
2557 while (list) {
2558 struct task_struct *p = list;
2559 list = list->wake_entry;
2560 ttwu_do_activate(rq, p, 0);
2561 }
2562
2563 raw_spin_unlock(&rq->lock);
2564}
2565
2566void scheduler_ipi(void)
2567{
2568 sched_ttwu_pending();
2569}
2570
2571static void ttwu_queue_remote(struct task_struct *p, int cpu)
2572{
2573 struct rq *rq = cpu_rq(cpu);
2574 struct task_struct *next = rq->wake_list;
2575
2576 for (;;) {
2577 struct task_struct *old = next;
2578
2579 p->wake_entry = next;
2580 next = cmpxchg(&rq->wake_list, old, p);
2581 if (next == old)
2582 break;
2583 }
2584
2585 if (!next)
2586 smp_send_reschedule(cpu);
2587}
2588
2589#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2590static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2591{
2592 struct rq *rq;
2593 int ret = 0;
2594
2595 rq = __task_rq_lock(p);
2596 if (p->on_cpu) {
2597 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2598 ttwu_do_wakeup(rq, p, wake_flags);
2599 ret = 1;
2600 }
2601 __task_rq_unlock(rq);
2602
2603 return ret;
2604
2605}
2606#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2607#endif /* CONFIG_SMP */
2608
2609static void ttwu_queue(struct task_struct *p, int cpu)
2610{
2611 struct rq *rq = cpu_rq(cpu);
2612
2613#if defined(CONFIG_SMP)
2614 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2615 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2616 ttwu_queue_remote(p, cpu);
2617 return;
2618 }
2619#endif
2620
2621 raw_spin_lock(&rq->lock);
2622 ttwu_do_activate(rq, p, 0);
2623 raw_spin_unlock(&rq->lock);
2439} 2624}
2440 2625
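
ttwu_queue_remote() pushes the task onto the target CPU's rq->wake_list with a cmpxchg loop and sends the reschedule IPI only when the list was previously empty; scheduler_ipi() then drains the whole list with a single xchg in sched_ttwu_pending(). That structure is a classic multi-producer, single-consumer stack; a standalone C11 sketch with generic nodes (no scheduler types):

#include <stdatomic.h>
#include <stddef.h>

struct node {
        struct node *next;
        int payload;
};

/* one list head per consumer ("CPU") */
static _Atomic(struct node *) wake_list;

/* producer side: returns 1 if the consumer needs a kick (list was empty) */
static int push(struct node *n)
{
        struct node *old = atomic_load(&wake_list);

        do {
                n->next = old;
        } while (!atomic_compare_exchange_weak(&wake_list, &old, n));

        return old == NULL;     /* only the first producer sends the IPI */
}

/* consumer side: grab the whole list at once, then walk it locally */
static void drain(void (*fn)(struct node *))
{
        struct node *n = atomic_exchange(&wake_list, NULL);

        while (n) {
                /* read ->next before fn(), which may reuse the node,
                 * just as sched_ttwu_pending() reads ->wake_entry first */
                struct node *next = n->next;

                fn(n);
                n = next;
        }
}
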
2441/** 2626/**
@@ -2453,92 +2638,66 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2453 * Returns %true if @p was woken up, %false if it was already running 2638 * Returns %true if @p was woken up, %false if it was already running
2454 * or @state didn't match @p's state. 2639 * or @state didn't match @p's state.
2455 */ 2640 */
2456static int try_to_wake_up(struct task_struct *p, unsigned int state, 2641static int
2457 int wake_flags) 2642try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2458{ 2643{
2459 int cpu, orig_cpu, this_cpu, success = 0;
2460 unsigned long flags; 2644 unsigned long flags;
2461 unsigned long en_flags = ENQUEUE_WAKEUP; 2645 int cpu, success = 0;
2462 struct rq *rq;
2463
2464 this_cpu = get_cpu();
2465 2646
2466 smp_wmb(); 2647 smp_wmb();
2467 rq = task_rq_lock(p, &flags); 2648 raw_spin_lock_irqsave(&p->pi_lock, flags);
2468 if (!(p->state & state)) 2649 if (!(p->state & state))
2469 goto out; 2650 goto out;
2470 2651
2471 if (p->se.on_rq) 2652 success = 1; /* we're going to change ->state */
2472 goto out_running;
2473
2474 cpu = task_cpu(p); 2653 cpu = task_cpu(p);
2475 orig_cpu = cpu;
2476 2654
2477#ifdef CONFIG_SMP 2655 if (p->on_rq && ttwu_remote(p, wake_flags))
2478 if (unlikely(task_running(rq, p))) 2656 goto stat;
2479 goto out_activate;
2480 2657
2658#ifdef CONFIG_SMP
2481 /* 2659 /*
2482 * In order to handle concurrent wakeups and release the rq->lock 2660 * If the owning (remote) cpu is still in the middle of schedule() with
2483 * we put the task in TASK_WAKING state. 2661 * this task as prev, wait until its done referencing the task.
2484 *
2485 * First fix up the nr_uninterruptible count:
2486 */ 2662 */
2487 if (task_contributes_to_load(p)) { 2663 while (p->on_cpu) {
2488 if (likely(cpu_online(orig_cpu))) 2664#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2489 rq->nr_uninterruptible--; 2665 /*
2490 else 2666 * In case the architecture enables interrupts in
2491 this_rq()->nr_uninterruptible--; 2667 * context_switch(), we cannot busy wait, since that
2668 * would lead to deadlocks when an interrupt hits and
2669 * tries to wake up @prev. So bail and do a complete
2670 * remote wakeup.
2671 */
2672 if (ttwu_activate_remote(p, wake_flags))
2673 goto stat;
2674#else
2675 cpu_relax();
2676#endif
2492 } 2677 }
2678 /*
2679 * Pairs with the smp_wmb() in finish_lock_switch().
2680 */
2681 smp_rmb();
2682
2683 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2493 p->state = TASK_WAKING; 2684 p->state = TASK_WAKING;
2494 2685
2495 if (p->sched_class->task_waking) { 2686 if (p->sched_class->task_waking)
2496 p->sched_class->task_waking(rq, p); 2687 p->sched_class->task_waking(p);
2497 en_flags |= ENQUEUE_WAKING;
2498 }
2499 2688
2500 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); 2689 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2501 if (cpu != orig_cpu) 2690 if (task_cpu(p) != cpu) {
2691 wake_flags |= WF_MIGRATED;
2502 set_task_cpu(p, cpu); 2692 set_task_cpu(p, cpu);
2503 __task_rq_unlock(rq);
2504
2505 rq = cpu_rq(cpu);
2506 raw_spin_lock(&rq->lock);
2507
2508 /*
2509 * We migrated the task without holding either rq->lock, however
2510 * since the task is not on the task list itself, nobody else
2511 * will try and migrate the task, hence the rq should match the
2512 * cpu we just moved it to.
2513 */
2514 WARN_ON(task_cpu(p) != cpu);
2515 WARN_ON(p->state != TASK_WAKING);
2516
2517#ifdef CONFIG_SCHEDSTATS
2518 schedstat_inc(rq, ttwu_count);
2519 if (cpu == this_cpu)
2520 schedstat_inc(rq, ttwu_local);
2521 else {
2522 struct sched_domain *sd;
2523 for_each_domain(this_cpu, sd) {
2524 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2525 schedstat_inc(sd, ttwu_wake_remote);
2526 break;
2527 }
2528 }
2529 } 2693 }
2530#endif /* CONFIG_SCHEDSTATS */
2531
2532out_activate:
2533#endif /* CONFIG_SMP */ 2694#endif /* CONFIG_SMP */
2534 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2695
2535 cpu == this_cpu, en_flags); 2696 ttwu_queue(p, cpu);
2536 success = 1; 2697stat:
2537out_running: 2698 ttwu_stat(p, cpu, wake_flags);
2538 ttwu_post_activation(p, rq, wake_flags, success);
2539out: 2699out:
2540 task_rq_unlock(rq, &flags); 2700 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2541 put_cpu();
2542 2701
2543 return success; 2702 return success;
2544} 2703}
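
The interesting ordering in the new try_to_wake_up(): the waker spins until p->on_cpu clears, and the smp_rmb() there pairs with the smp_wmb() before prev->on_cpu = 0 in finish_lock_switch(), so once the waker observes the flag clear it also observes every store the previous schedule() made on p's behalf. A minimal C11 analogue of that publish/consume handshake, with illustrative fields rather than the scheduler's:

#include <stdatomic.h>
#include <stdbool.h>

struct ctx {
        int state;                      /* written while "on cpu"          */
        atomic_bool on_cpu;             /* handoff flag                    */
};

/* the side finishing with the context (finish_lock_switch in the kernel) */
static void ctx_release(struct ctx *c, int final_state)
{
        c->state = final_state;                 /* plain store             */
        /* smp_wmb(): order the data before the flag */
        atomic_store_explicit(&c->on_cpu, false, memory_order_release);
}

/* the waker: spin until released, then the data is safe to read */
static int ctx_acquire(struct ctx *c)
{
        while (atomic_load_explicit(&c->on_cpu, memory_order_acquire))
                ;                       /* cpu_relax() in the kernel       */
        /* the acquire load plays the part of the smp_rmb() */
        return c->state;
}
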
@@ -2547,31 +2706,34 @@ out:
2547 * try_to_wake_up_local - try to wake up a local task with rq lock held 2706 * try_to_wake_up_local - try to wake up a local task with rq lock held
2548 * @p: the thread to be awakened 2707 * @p: the thread to be awakened
2549 * 2708 *
2550 * Put @p on the run-queue if it's not already there. The caller must 2709 * Put @p on the run-queue if it's not already there. The caller must
2551 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2710 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2552 * the current task. this_rq() stays locked over invocation. 2711 * the current task.
2553 */ 2712 */
2554static void try_to_wake_up_local(struct task_struct *p) 2713static void try_to_wake_up_local(struct task_struct *p)
2555{ 2714{
2556 struct rq *rq = task_rq(p); 2715 struct rq *rq = task_rq(p);
2557 bool success = false;
2558 2716
2559 BUG_ON(rq != this_rq()); 2717 BUG_ON(rq != this_rq());
2560 BUG_ON(p == current); 2718 BUG_ON(p == current);
2561 lockdep_assert_held(&rq->lock); 2719 lockdep_assert_held(&rq->lock);
2562 2720
2721 if (!raw_spin_trylock(&p->pi_lock)) {
2722 raw_spin_unlock(&rq->lock);
2723 raw_spin_lock(&p->pi_lock);
2724 raw_spin_lock(&rq->lock);
2725 }
2726
2563 if (!(p->state & TASK_NORMAL)) 2727 if (!(p->state & TASK_NORMAL))
2564 return; 2728 goto out;
2565 2729
2566 if (!p->se.on_rq) { 2730 if (!p->on_rq)
2567 if (likely(!task_running(rq, p))) { 2731 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2568 schedstat_inc(rq, ttwu_count); 2732
2569 schedstat_inc(rq, ttwu_local); 2733 ttwu_do_wakeup(rq, p, 0);
2570 } 2734 ttwu_stat(p, smp_processor_id(), 0);
2571 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2735out:
2572 success = true; 2736 raw_spin_unlock(&p->pi_lock);
2573 }
2574 ttwu_post_activation(p, rq, 0, success);
2575} 2737}
2576 2738
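
try_to_wake_up_local() is entered with rq->lock already held, but the new nesting rule is p->pi_lock before rq->lock; the trylock dance at its top (try pi_lock, and on failure drop rq->lock, take pi_lock, retake rq->lock) restores the canonical order without deadlocking. The same maneuver in plain pthreads, with hypothetical outer/inner locks:

#include <pthread.h>

/* canonical order: outer (pi_lock) before inner (rq->lock) */
static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;

/* called with 'inner' already held; returns with both locks held */
static void take_outer_while_holding_inner(void)
{
        if (pthread_mutex_trylock(&outer) == 0)
                return;                 /* lucky: no ordering violation */

        /* contended: back off and reacquire in the canonical order */
        pthread_mutex_unlock(&inner);
        pthread_mutex_lock(&outer);
        pthread_mutex_lock(&inner);
}

As in the kernel function, anything observed under the inner lock before the back-off must be revalidated afterwards, which is why try_to_wake_up_local() only checks p->state and p->on_rq once both locks are held.
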
2577/** 2739/**
@@ -2604,19 +2766,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2604 */ 2766 */
2605static void __sched_fork(struct task_struct *p) 2767static void __sched_fork(struct task_struct *p)
2606{ 2768{
2769 p->on_rq = 0;
2770
2771 p->se.on_rq = 0;
2607 p->se.exec_start = 0; 2772 p->se.exec_start = 0;
2608 p->se.sum_exec_runtime = 0; 2773 p->se.sum_exec_runtime = 0;
2609 p->se.prev_sum_exec_runtime = 0; 2774 p->se.prev_sum_exec_runtime = 0;
2610 p->se.nr_migrations = 0; 2775 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0; 2776 p->se.vruntime = 0;
2777 INIT_LIST_HEAD(&p->se.group_node);
2612 2778
2613#ifdef CONFIG_SCHEDSTATS 2779#ifdef CONFIG_SCHEDSTATS
2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2780 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2615#endif 2781#endif
2616 2782
2617 INIT_LIST_HEAD(&p->rt.run_list); 2783 INIT_LIST_HEAD(&p->rt.run_list);
2618 p->se.on_rq = 0;
2619 INIT_LIST_HEAD(&p->se.group_node);
2620 2784
2621#ifdef CONFIG_PREEMPT_NOTIFIERS 2785#ifdef CONFIG_PREEMPT_NOTIFIERS
2622 INIT_HLIST_HEAD(&p->preempt_notifiers); 2786 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2626,8 +2790,9 @@ static void __sched_fork(struct task_struct *p)
2626/* 2790/*
2627 * fork()/clone()-time setup: 2791 * fork()/clone()-time setup:
2628 */ 2792 */
2629void sched_fork(struct task_struct *p, int clone_flags) 2793void sched_fork(struct task_struct *p)
2630{ 2794{
2795 unsigned long flags;
2631 int cpu = get_cpu(); 2796 int cpu = get_cpu();
2632 2797
2633 __sched_fork(p); 2798 __sched_fork(p);
@@ -2678,16 +2843,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2678 * 2843 *
2679 * Silence PROVE_RCU. 2844 * Silence PROVE_RCU.
2680 */ 2845 */
2681 rcu_read_lock(); 2846 raw_spin_lock_irqsave(&p->pi_lock, flags);
2682 set_task_cpu(p, cpu); 2847 set_task_cpu(p, cpu);
2683 rcu_read_unlock(); 2848 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2684 2849
2685#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2850#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2686 if (likely(sched_info_on())) 2851 if (likely(sched_info_on()))
2687 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2852 memset(&p->sched_info, 0, sizeof(p->sched_info));
2688#endif 2853#endif
2689#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2854#if defined(CONFIG_SMP)
2690 p->oncpu = 0; 2855 p->on_cpu = 0;
2691#endif 2856#endif
2692#ifdef CONFIG_PREEMPT 2857#ifdef CONFIG_PREEMPT
2693 /* Want to start with kernel preemption disabled. */ 2858 /* Want to start with kernel preemption disabled. */
@@ -2707,41 +2872,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2707 * that must be done for every newly created context, then puts the task 2872 * that must be done for every newly created context, then puts the task
2708 * on the runqueue and wakes it. 2873 * on the runqueue and wakes it.
2709 */ 2874 */
2710void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2875void wake_up_new_task(struct task_struct *p)
2711{ 2876{
2712 unsigned long flags; 2877 unsigned long flags;
2713 struct rq *rq; 2878 struct rq *rq;
2714 int cpu __maybe_unused = get_cpu();
2715 2879
2880 raw_spin_lock_irqsave(&p->pi_lock, flags);
2716#ifdef CONFIG_SMP 2881#ifdef CONFIG_SMP
2717 rq = task_rq_lock(p, &flags);
2718 p->state = TASK_WAKING;
2719
2720 /* 2882 /*
2721 * Fork balancing, do it here and not earlier because: 2883 * Fork balancing, do it here and not earlier because:
2722 * - cpus_allowed can change in the fork path 2884 * - cpus_allowed can change in the fork path
2723 * - any previously selected cpu might disappear through hotplug 2885 * - any previously selected cpu might disappear through hotplug
2724 *
2725 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2726 * without people poking at ->cpus_allowed.
2727 */ 2886 */
2728 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2887 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2729 set_task_cpu(p, cpu);
2730
2731 p->state = TASK_RUNNING;
2732 task_rq_unlock(rq, &flags);
2733#endif 2888#endif
2734 2889
2735 rq = task_rq_lock(p, &flags); 2890 rq = __task_rq_lock(p);
2736 activate_task(rq, p, 0); 2891 activate_task(rq, p, 0);
2737 trace_sched_wakeup_new(p, 1); 2892 p->on_rq = 1;
2893 trace_sched_wakeup_new(p, true);
2738 check_preempt_curr(rq, p, WF_FORK); 2894 check_preempt_curr(rq, p, WF_FORK);
2739#ifdef CONFIG_SMP 2895#ifdef CONFIG_SMP
2740 if (p->sched_class->task_woken) 2896 if (p->sched_class->task_woken)
2741 p->sched_class->task_woken(rq, p); 2897 p->sched_class->task_woken(rq, p);
2742#endif 2898#endif
2743 task_rq_unlock(rq, &flags); 2899 task_rq_unlock(rq, p, &flags);
2744 put_cpu();
2745} 2900}
2746 2901
2747#ifdef CONFIG_PREEMPT_NOTIFIERS 2902#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3450,27 +3605,22 @@ void sched_exec(void)
3450{ 3605{
3451 struct task_struct *p = current; 3606 struct task_struct *p = current;
3452 unsigned long flags; 3607 unsigned long flags;
3453 struct rq *rq;
3454 int dest_cpu; 3608 int dest_cpu;
3455 3609
3456 rq = task_rq_lock(p, &flags); 3610 raw_spin_lock_irqsave(&p->pi_lock, flags);
3457 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3611 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3458 if (dest_cpu == smp_processor_id()) 3612 if (dest_cpu == smp_processor_id())
3459 goto unlock; 3613 goto unlock;
3460 3614
3461 /* 3615 if (likely(cpu_active(dest_cpu))) {
3462 * select_task_rq() can race against ->cpus_allowed
3463 */
3464 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3465 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3466 struct migration_arg arg = { p, dest_cpu }; 3616 struct migration_arg arg = { p, dest_cpu };
3467 3617
3468 task_rq_unlock(rq, &flags); 3618 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3469 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3619 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3470 return; 3620 return;
3471 } 3621 }
3472unlock: 3622unlock:
3473 task_rq_unlock(rq, &flags); 3623 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3474} 3624}
3475 3625
3476#endif 3626#endif
@@ -3507,7 +3657,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3507 3657
3508 rq = task_rq_lock(p, &flags); 3658 rq = task_rq_lock(p, &flags);
3509 ns = do_task_delta_exec(p, rq); 3659 ns = do_task_delta_exec(p, rq);
3510 task_rq_unlock(rq, &flags); 3660 task_rq_unlock(rq, p, &flags);
3511 3661
3512 return ns; 3662 return ns;
3513} 3663}
@@ -3525,7 +3675,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3525 3675
3526 rq = task_rq_lock(p, &flags); 3676 rq = task_rq_lock(p, &flags);
3527 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3677 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3528 task_rq_unlock(rq, &flags); 3678 task_rq_unlock(rq, p, &flags);
3529 3679
3530 return ns; 3680 return ns;
3531} 3681}
@@ -3549,7 +3699,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3549 rq = task_rq_lock(p, &flags); 3699 rq = task_rq_lock(p, &flags);
3550 thread_group_cputime(p, &totals); 3700 thread_group_cputime(p, &totals);
3551 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3701 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3552 task_rq_unlock(rq, &flags); 3702 task_rq_unlock(rq, p, &flags);
3553 3703
3554 return ns; 3704 return ns;
3555} 3705}
@@ -3903,9 +4053,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3903/* 4053/*
3904 * This function gets called by the timer code, with HZ frequency. 4054 * This function gets called by the timer code, with HZ frequency.
3905 * We call it with interrupts disabled. 4055 * We call it with interrupts disabled.
3906 *
3907 * It also gets called by the fork code, when changing the parent's
3908 * timeslices.
3909 */ 4056 */
3910void scheduler_tick(void) 4057void scheduler_tick(void)
3911{ 4058{
@@ -4025,17 +4172,11 @@ static inline void schedule_debug(struct task_struct *prev)
4025 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4172 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4026 4173
4027 schedstat_inc(this_rq(), sched_count); 4174 schedstat_inc(this_rq(), sched_count);
4028#ifdef CONFIG_SCHEDSTATS
4029 if (unlikely(prev->lock_depth >= 0)) {
4030 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4031 schedstat_inc(prev, sched_info.bkl_count);
4032 }
4033#endif
4034} 4175}
4035 4176
4036static void put_prev_task(struct rq *rq, struct task_struct *prev) 4177static void put_prev_task(struct rq *rq, struct task_struct *prev)
4037{ 4178{
4038 if (prev->se.on_rq) 4179 if (prev->on_rq || rq->skip_clock_update < 0)
4039 update_rq_clock(rq); 4180 update_rq_clock(rq);
4040 prev->sched_class->put_prev_task(rq, prev); 4181 prev->sched_class->put_prev_task(rq, prev);
4041} 4182}
@@ -4097,11 +4238,13 @@ need_resched:
4097 if (unlikely(signal_pending_state(prev->state, prev))) { 4238 if (unlikely(signal_pending_state(prev->state, prev))) {
4098 prev->state = TASK_RUNNING; 4239 prev->state = TASK_RUNNING;
4099 } else { 4240 } else {
4241 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4242 prev->on_rq = 0;
4243
4100 /* 4244 /*
4101 * If a worker is going to sleep, notify and 4245 * If a worker went to sleep, notify and ask workqueue
4102 * ask workqueue whether it wants to wake up a 4246 * whether it wants to wake up a task to maintain
4103 * task to maintain concurrency. If so, wake 4247 * concurrency.
4104 * up the task.
4105 */ 4248 */
4106 if (prev->flags & PF_WQ_WORKER) { 4249 if (prev->flags & PF_WQ_WORKER) {
4107 struct task_struct *to_wakeup; 4250 struct task_struct *to_wakeup;
@@ -4110,11 +4253,10 @@ need_resched:
4110 if (to_wakeup) 4253 if (to_wakeup)
4111 try_to_wake_up_local(to_wakeup); 4254 try_to_wake_up_local(to_wakeup);
4112 } 4255 }
4113 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4114 4256
4115 /* 4257 /*
4116 * If we are going to sleep and we have plugged IO queued, make 4258 * If we are going to sleep and we have plugged IO
4117 * sure to submit it to avoid deadlocks. 4259 * queued, make sure to submit it to avoid deadlocks.
4118 */ 4260 */
4119 if (blk_needs_flush_plug(prev)) { 4261 if (blk_needs_flush_plug(prev)) {
4120 raw_spin_unlock(&rq->lock); 4262 raw_spin_unlock(&rq->lock);
@@ -4161,70 +4303,53 @@ need_resched:
4161EXPORT_SYMBOL(schedule); 4303EXPORT_SYMBOL(schedule);
4162 4304
4163#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4305#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4164/*
4165 * Look out! "owner" is an entirely speculative pointer
4166 * access and not reliable.
4167 */
4168int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4169{
4170 unsigned int cpu;
4171 struct rq *rq;
4172 4306
4173 if (!sched_feat(OWNER_SPIN)) 4307static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4174 return 0; 4308{
4309 bool ret = false;
4175 4310
4176#ifdef CONFIG_DEBUG_PAGEALLOC 4311 rcu_read_lock();
4177 /* 4312 if (lock->owner != owner)
4178 * Need to access the cpu field knowing that 4313 goto fail;
4179 * DEBUG_PAGEALLOC could have unmapped it if
4180 * the mutex owner just released it and exited.
4181 */
4182 if (probe_kernel_address(&owner->cpu, cpu))
4183 return 0;
4184#else
4185 cpu = owner->cpu;
4186#endif
4187 4314
4188 /* 4315 /*
4189 * Even if the access succeeded (likely case), 4316 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4190 * the cpu field may no longer be valid. 4317 * lock->owner still matches owner, if that fails, owner might
4318 * point to free()d memory, if it still matches, the rcu_read_lock()
4319 * ensures the memory stays valid.
4191 */ 4320 */
4192 if (cpu >= nr_cpumask_bits) 4321 barrier();
4193 return 0;
4194 4322
4195 /* 4323 ret = owner->on_cpu;
4196 * We need to validate that we can do a 4324fail:
4197 * get_cpu() and that we have the percpu area. 4325 rcu_read_unlock();
4198 */
4199 if (!cpu_online(cpu))
4200 return 0;
4201 4326
4202 rq = cpu_rq(cpu); 4327 return ret;
4328}
4203 4329
4204 for (;;) { 4330/*
4205 /* 4331 * Look out! "owner" is an entirely speculative pointer
4206 * Owner changed, break to re-assess state. 4332 * access and not reliable.
4207 */ 4333 */
4208 if (lock->owner != owner) { 4334int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4209 /* 4335{
4210 * If the lock has switched to a different owner, 4336 if (!sched_feat(OWNER_SPIN))
4211 * we likely have heavy contention. Return 0 to quit 4337 return 0;
4212 * optimistic spinning and not contend further:
4213 */
4214 if (lock->owner)
4215 return 0;
4216 break;
4217 }
4218 4338
4219 /* 4339 while (owner_running(lock, owner)) {
4220 * Is that owner really running on that cpu? 4340 if (need_resched())
4221 */
4222 if (task_thread_info(rq->curr) != owner || need_resched())
4223 return 0; 4341 return 0;
4224 4342
4225 arch_mutex_cpu_relax(); 4343 arch_mutex_cpu_relax();
4226 } 4344 }
4227 4345
4346 /*
4347 * If the owner changed to another task there is likely
4348 * heavy contention, stop spinning.
4349 */
4350 if (lock->owner)
4351 return 0;
4352
4228 return 1; 4353 return 1;
4229} 4354}
4230#endif 4355#endif
@@ -4684,19 +4809,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4684 */ 4809 */
4685void rt_mutex_setprio(struct task_struct *p, int prio) 4810void rt_mutex_setprio(struct task_struct *p, int prio)
4686{ 4811{
4687 unsigned long flags;
4688 int oldprio, on_rq, running; 4812 int oldprio, on_rq, running;
4689 struct rq *rq; 4813 struct rq *rq;
4690 const struct sched_class *prev_class; 4814 const struct sched_class *prev_class;
4691 4815
4692 BUG_ON(prio < 0 || prio > MAX_PRIO); 4816 BUG_ON(prio < 0 || prio > MAX_PRIO);
4693 4817
4694 rq = task_rq_lock(p, &flags); 4818 rq = __task_rq_lock(p);
4695 4819
4696 trace_sched_pi_setprio(p, prio); 4820 trace_sched_pi_setprio(p, prio);
4697 oldprio = p->prio; 4821 oldprio = p->prio;
4698 prev_class = p->sched_class; 4822 prev_class = p->sched_class;
4699 on_rq = p->se.on_rq; 4823 on_rq = p->on_rq;
4700 running = task_current(rq, p); 4824 running = task_current(rq, p);
4701 if (on_rq) 4825 if (on_rq)
4702 dequeue_task(rq, p, 0); 4826 dequeue_task(rq, p, 0);
@@ -4716,7 +4840,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4840 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4717 4841
4718 check_class_changed(rq, p, prev_class, oldprio); 4842 check_class_changed(rq, p, prev_class, oldprio);
4719 task_rq_unlock(rq, &flags); 4843 __task_rq_unlock(rq);
4720} 4844}
4721 4845
4722#endif 4846#endif
@@ -4744,7 +4868,7 @@ void set_user_nice(struct task_struct *p, long nice)
4744 p->static_prio = NICE_TO_PRIO(nice); 4868 p->static_prio = NICE_TO_PRIO(nice);
4745 goto out_unlock; 4869 goto out_unlock;
4746 } 4870 }
4747 on_rq = p->se.on_rq; 4871 on_rq = p->on_rq;
4748 if (on_rq) 4872 if (on_rq)
4749 dequeue_task(rq, p, 0); 4873 dequeue_task(rq, p, 0);
4750 4874
@@ -4764,7 +4888,7 @@ void set_user_nice(struct task_struct *p, long nice)
4764 resched_task(rq->curr); 4888 resched_task(rq->curr);
4765 } 4889 }
4766out_unlock: 4890out_unlock:
4767 task_rq_unlock(rq, &flags); 4891 task_rq_unlock(rq, p, &flags);
4768} 4892}
4769EXPORT_SYMBOL(set_user_nice); 4893EXPORT_SYMBOL(set_user_nice);
4770 4894
@@ -4878,8 +5002,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4878static void 5002static void
4879__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 5003__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4880{ 5004{
4881 BUG_ON(p->se.on_rq);
4882
4883 p->policy = policy; 5005 p->policy = policy;
4884 p->rt_priority = prio; 5006 p->rt_priority = prio;
4885 p->normal_prio = normal_prio(p); 5007 p->normal_prio = normal_prio(p);
@@ -4994,20 +5116,17 @@ recheck:
4994 /* 5116 /*
4995 * make sure no PI-waiters arrive (or leave) while we are 5117 * make sure no PI-waiters arrive (or leave) while we are
4996 * changing the priority of the task: 5118 * changing the priority of the task:
4997 */ 5119 *
4998 raw_spin_lock_irqsave(&p->pi_lock, flags);
4999 /*
5000 * To be able to change p->policy safely, the appropriate 5120 * To be able to change p->policy safely, the appropriate
5001 * runqueue lock must be held. 5121 * runqueue lock must be held.
5002 */ 5122 */
5003 rq = __task_rq_lock(p); 5123 rq = task_rq_lock(p, &flags);
5004 5124
5005 /* 5125 /*
5006 * Changing the policy of the stop threads is a very bad idea 5126 * Changing the policy of the stop threads is a very bad idea
5007 */ 5127 */
5008 if (p == rq->stop) { 5128 if (p == rq->stop) {
5009 __task_rq_unlock(rq); 5129 task_rq_unlock(rq, p, &flags);
5010 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5011 return -EINVAL; 5130 return -EINVAL;
5012 } 5131 }
5013 5132
@@ -5031,8 +5150,7 @@ recheck:
5031 if (rt_bandwidth_enabled() && rt_policy(policy) && 5150 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5032 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5151 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5033 !task_group_is_autogroup(task_group(p))) { 5152 !task_group_is_autogroup(task_group(p))) {
5034 __task_rq_unlock(rq); 5153 task_rq_unlock(rq, p, &flags);
5035 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5036 return -EPERM; 5154 return -EPERM;
5037 } 5155 }
5038 } 5156 }
@@ -5041,11 +5159,10 @@ recheck:
5041 /* recheck policy now with rq lock held */ 5159 /* recheck policy now with rq lock held */
5042 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5160 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5043 policy = oldpolicy = -1; 5161 policy = oldpolicy = -1;
5044 __task_rq_unlock(rq); 5162 task_rq_unlock(rq, p, &flags);
5045 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5046 goto recheck; 5163 goto recheck;
5047 } 5164 }
5048 on_rq = p->se.on_rq; 5165 on_rq = p->on_rq;
5049 running = task_current(rq, p); 5166 running = task_current(rq, p);
5050 if (on_rq) 5167 if (on_rq)
5051 deactivate_task(rq, p, 0); 5168 deactivate_task(rq, p, 0);
@@ -5064,8 +5181,7 @@ recheck:
5064 activate_task(rq, p, 0); 5181 activate_task(rq, p, 0);
5065 5182
5066 check_class_changed(rq, p, prev_class, oldprio); 5183 check_class_changed(rq, p, prev_class, oldprio);
5067 __task_rq_unlock(rq); 5184 task_rq_unlock(rq, p, &flags);
5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5069 5185
5070 rt_mutex_adjust_pi(p); 5186 rt_mutex_adjust_pi(p);
5071 5187
@@ -5316,7 +5432,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5316{ 5432{
5317 struct task_struct *p; 5433 struct task_struct *p;
5318 unsigned long flags; 5434 unsigned long flags;
5319 struct rq *rq;
5320 int retval; 5435 int retval;
5321 5436
5322 get_online_cpus(); 5437 get_online_cpus();
@@ -5331,9 +5446,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5331 if (retval) 5446 if (retval)
5332 goto out_unlock; 5447 goto out_unlock;
5333 5448
5334 rq = task_rq_lock(p, &flags); 5449 raw_spin_lock_irqsave(&p->pi_lock, flags);
5335 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5450 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5336 task_rq_unlock(rq, &flags); 5451 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5337 5452
5338out_unlock: 5453out_unlock:
5339 rcu_read_unlock(); 5454 rcu_read_unlock();
@@ -5658,7 +5773,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5658 5773
5659 rq = task_rq_lock(p, &flags); 5774 rq = task_rq_lock(p, &flags);
5660 time_slice = p->sched_class->get_rr_interval(rq, p); 5775 time_slice = p->sched_class->get_rr_interval(rq, p);
5661 task_rq_unlock(rq, &flags); 5776 task_rq_unlock(rq, p, &flags);
5662 5777
5663 rcu_read_unlock(); 5778 rcu_read_unlock();
5664 jiffies_to_timespec(time_slice, &t); 5779 jiffies_to_timespec(time_slice, &t);
@@ -5760,7 +5875,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5760 idle->state = TASK_RUNNING; 5875 idle->state = TASK_RUNNING;
5761 idle->se.exec_start = sched_clock(); 5876 idle->se.exec_start = sched_clock();
5762 5877
5763 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5878 do_set_cpus_allowed(idle, cpumask_of(cpu));
5764 /* 5879 /*
5765 * We're having a chicken and egg problem, even though we are 5880 * We're having a chicken and egg problem, even though we are
5766 * holding rq->lock, the cpu isn't yet set to this cpu so the 5881 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5776,17 +5891,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5776 rcu_read_unlock(); 5891 rcu_read_unlock();
5777 5892
5778 rq->curr = rq->idle = idle; 5893 rq->curr = rq->idle = idle;
5779#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5894#if defined(CONFIG_SMP)
5780 idle->oncpu = 1; 5895 idle->on_cpu = 1;
5781#endif 5896#endif
5782 raw_spin_unlock_irqrestore(&rq->lock, flags); 5897 raw_spin_unlock_irqrestore(&rq->lock, flags);
5783 5898
5784 /* Set the preempt count _outside_ the spinlocks! */ 5899 /* Set the preempt count _outside_ the spinlocks! */
5785#if defined(CONFIG_PREEMPT)
5786 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5787#else
5788 task_thread_info(idle)->preempt_count = 0; 5900 task_thread_info(idle)->preempt_count = 0;
5789#endif 5901
5790 /* 5902 /*
5791 * The idle tasks have their own, simple scheduling class: 5903 * The idle tasks have their own, simple scheduling class:
5792 */ 5904 */
@@ -5851,6 +5963,16 @@ static inline void sched_init_granularity(void)
5851} 5963}
5852 5964
5853#ifdef CONFIG_SMP 5965#ifdef CONFIG_SMP
5966void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
5967{
5968 if (p->sched_class && p->sched_class->set_cpus_allowed)
5969 p->sched_class->set_cpus_allowed(p, new_mask);
5970 else {
5971 cpumask_copy(&p->cpus_allowed, new_mask);
5972 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5973 }
5974}
5975
5854/* 5976/*
5855 * This is how migration works: 5977 * This is how migration works:
5856 * 5978 *
@@ -5881,52 +6003,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5881 unsigned int dest_cpu; 6003 unsigned int dest_cpu;
5882 int ret = 0; 6004 int ret = 0;
5883 6005
5884 /*
5885 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5886 * drop the rq->lock and still rely on ->cpus_allowed.
5887 */
5888again:
5889 while (task_is_waking(p))
5890 cpu_relax();
5891 rq = task_rq_lock(p, &flags); 6006 rq = task_rq_lock(p, &flags);
5892 if (task_is_waking(p)) { 6007
5893 task_rq_unlock(rq, &flags); 6008 if (cpumask_equal(&p->cpus_allowed, new_mask))
5894 goto again; 6009 goto out;
5895 }
5896 6010
5897 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 6011 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5898 ret = -EINVAL; 6012 ret = -EINVAL;
5899 goto out; 6013 goto out;
5900 } 6014 }
5901 6015
5902 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6016 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5903 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5904 ret = -EINVAL; 6017 ret = -EINVAL;
5905 goto out; 6018 goto out;
5906 } 6019 }
5907 6020
5908 if (p->sched_class->set_cpus_allowed) 6021 do_set_cpus_allowed(p, new_mask);
5909 p->sched_class->set_cpus_allowed(p, new_mask);
5910 else {
5911 cpumask_copy(&p->cpus_allowed, new_mask);
5912 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5913 }
5914 6022
5915 /* Can the task run on the task's current CPU? If so, we're done */ 6023 /* Can the task run on the task's current CPU? If so, we're done */
5916 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6024 if (cpumask_test_cpu(task_cpu(p), new_mask))
5917 goto out; 6025 goto out;
5918 6026
5919 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 6027 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5920 if (migrate_task(p, rq)) { 6028 if (p->on_rq) {
5921 struct migration_arg arg = { p, dest_cpu }; 6029 struct migration_arg arg = { p, dest_cpu };
5922 /* Need help from migration thread: drop lock and wait. */ 6030 /* Need help from migration thread: drop lock and wait. */
5923 task_rq_unlock(rq, &flags); 6031 task_rq_unlock(rq, p, &flags);
5924 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 6032 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5925 tlb_migrate_finish(p->mm); 6033 tlb_migrate_finish(p->mm);
5926 return 0; 6034 return 0;
5927 } 6035 }
5928out: 6036out:
5929 task_rq_unlock(rq, &flags); 6037 task_rq_unlock(rq, p, &flags);
5930 6038
5931 return ret; 6039 return ret;
5932} 6040}
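
The reworked set_cpus_allowed_ptr() above now early-exits when the new mask equals the current one, validates the mask against the active CPUs, and delegates the actual copy to do_set_cpus_allowed(). Below is a toy userspace sketch of that validate-then-delegate shape, using a 64-bit bitmask instead of struct cpumask; the real PF_THREAD_BOUND check also exempts the current task, a detail dropped here. All toy_* names are invented.

/*
 * Editorial sketch: early-exit, validate, then delegate the unconditional
 * copy to a helper, as in the hunk above.
 */
#include <stdint.h>
#include <stdio.h>
#include <errno.h>

struct toy_task {
	uint64_t cpus_allowed;
	int      bound;          /* analogue of PF_THREAD_BOUND */
};

static void toy_do_set_cpus_allowed(struct toy_task *p, uint64_t new_mask)
{
	p->cpus_allowed = new_mask;          /* unconditional copy, no checks */
}

static int toy_set_cpus_allowed(struct toy_task *p, uint64_t new_mask,
				uint64_t active_mask)
{
	if (p->cpus_allowed == new_mask)     /* nothing to do */
		return 0;
	if (!(new_mask & active_mask))       /* no active CPU in the mask */
		return -EINVAL;
	if (p->bound)                        /* bound tasks may not be moved */
		return -EINVAL;

	toy_do_set_cpus_allowed(p, new_mask);
	return 0;
}

int main(void)
{
	struct toy_task p = { .cpus_allowed = 0x1, .bound = 0 };

	printf("%d\n", toy_set_cpus_allowed(&p, 0xc, 0xf));  /* 0: accepted   */
	printf("%d\n", toy_set_cpus_allowed(&p, 0x30, 0xf)); /* -EINVAL: none */
	return 0;
}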
@@ -5954,6 +6062,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5954 rq_src = cpu_rq(src_cpu); 6062 rq_src = cpu_rq(src_cpu);
5955 rq_dest = cpu_rq(dest_cpu); 6063 rq_dest = cpu_rq(dest_cpu);
5956 6064
6065 raw_spin_lock(&p->pi_lock);
5957 double_rq_lock(rq_src, rq_dest); 6066 double_rq_lock(rq_src, rq_dest);
5958 /* Already moved. */ 6067 /* Already moved. */
5959 if (task_cpu(p) != src_cpu) 6068 if (task_cpu(p) != src_cpu)
@@ -5966,7 +6075,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5966 * If we're not on a rq, the next wake-up will ensure we're 6075 * If we're not on a rq, the next wake-up will ensure we're
5967 * placed properly. 6076 * placed properly.
5968 */ 6077 */
5969 if (p->se.on_rq) { 6078 if (p->on_rq) {
5970 deactivate_task(rq_src, p, 0); 6079 deactivate_task(rq_src, p, 0);
5971 set_task_cpu(p, dest_cpu); 6080 set_task_cpu(p, dest_cpu);
5972 activate_task(rq_dest, p, 0); 6081 activate_task(rq_dest, p, 0);
@@ -5976,6 +6085,7 @@ done:
5976 ret = 1; 6085 ret = 1;
5977fail: 6086fail:
5978 double_rq_unlock(rq_src, rq_dest); 6087 double_rq_unlock(rq_src, rq_dest);
6088 raw_spin_unlock(&p->pi_lock);
5979 return ret; 6089 return ret;
5980} 6090}
5981 6091
@@ -6316,6 +6426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6316 6426
6317#ifdef CONFIG_HOTPLUG_CPU 6427#ifdef CONFIG_HOTPLUG_CPU
6318 case CPU_DYING: 6428 case CPU_DYING:
6429 sched_ttwu_pending();
6319 /* Update our root-domain */ 6430 /* Update our root-domain */
6320 raw_spin_lock_irqsave(&rq->lock, flags); 6431 raw_spin_lock_irqsave(&rq->lock, flags);
6321 if (rq->rd) { 6432 if (rq->rd) {
@@ -6394,6 +6505,8 @@ early_initcall(migration_init);
6394 6505
6395#ifdef CONFIG_SMP 6506#ifdef CONFIG_SMP
6396 6507
6508static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6509
6397#ifdef CONFIG_SCHED_DEBUG 6510#ifdef CONFIG_SCHED_DEBUG
6398 6511
6399static __read_mostly int sched_domain_debug_enabled; 6512static __read_mostly int sched_domain_debug_enabled;
@@ -6468,7 +6581,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6468 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6581 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6469 6582
6470 printk(KERN_CONT " %s", str); 6583 printk(KERN_CONT " %s", str);
6471 if (group->cpu_power != SCHED_LOAD_SCALE) { 6584 if (group->cpu_power != SCHED_POWER_SCALE) {
6472 printk(KERN_CONT " (cpu_power = %d)", 6585 printk(KERN_CONT " (cpu_power = %d)",
6473 group->cpu_power); 6586 group->cpu_power);
6474 } 6587 }
@@ -6489,7 +6602,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6489 6602
6490static void sched_domain_debug(struct sched_domain *sd, int cpu) 6603static void sched_domain_debug(struct sched_domain *sd, int cpu)
6491{ 6604{
6492 cpumask_var_t groupmask;
6493 int level = 0; 6605 int level = 0;
6494 6606
6495 if (!sched_domain_debug_enabled) 6607 if (!sched_domain_debug_enabled)
@@ -6502,20 +6614,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6502 6614
6503 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6615 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6504 6616
6505 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6506 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6507 return;
6508 }
6509
6510 for (;;) { 6617 for (;;) {
6511 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6618 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6512 break; 6619 break;
6513 level++; 6620 level++;
6514 sd = sd->parent; 6621 sd = sd->parent;
6515 if (!sd) 6622 if (!sd)
6516 break; 6623 break;
6517 } 6624 }
6518 free_cpumask_var(groupmask);
6519} 6625}
6520#else /* !CONFIG_SCHED_DEBUG */ 6626#else /* !CONFIG_SCHED_DEBUG */
6521# define sched_domain_debug(sd, cpu) do { } while (0) 6627# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6572,12 +6678,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6572 return 1; 6678 return 1;
6573} 6679}
6574 6680
6575static void free_rootdomain(struct root_domain *rd) 6681static void free_rootdomain(struct rcu_head *rcu)
6576{ 6682{
6577 synchronize_sched(); 6683 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6578 6684
6579 cpupri_cleanup(&rd->cpupri); 6685 cpupri_cleanup(&rd->cpupri);
6580
6581 free_cpumask_var(rd->rto_mask); 6686 free_cpumask_var(rd->rto_mask);
6582 free_cpumask_var(rd->online); 6687 free_cpumask_var(rd->online);
6583 free_cpumask_var(rd->span); 6688 free_cpumask_var(rd->span);
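
The new free_rootdomain() above is an RCU callback: it receives only the embedded rcu_head and recovers the enclosing root_domain with container_of(). A self-contained userspace sketch of that pattern, using only offsetof and invented toy_* names (the kernel queues the callback via call_rcu_sched(); here it is simply invoked directly):

/*
 * Editorial sketch: recover the container structure from a pointer to an
 * embedded callback head, as free_rootdomain() does above.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define toy_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_cb_head { void (*func)(struct toy_cb_head *head); };

struct toy_root_domain {
	int refcount;
	struct toy_cb_head rcu;      /* embedded, as in struct root_domain */
};

/* Deferred-free callback: gets the embedded head, frees the container. */
static void toy_free_rootdomain(struct toy_cb_head *head)
{
	struct toy_root_domain *rd =
		toy_container_of(head, struct toy_root_domain, rcu);
	free(rd);
}

int main(void)
{
	struct toy_root_domain *rd = calloc(1, sizeof(*rd));

	rd->rcu.func = toy_free_rootdomain;
	/* Stand-in for the grace period: invoke once no readers remain. */
	rd->rcu.func(&rd->rcu);
	printf("freed\n");
	return 0;
}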
@@ -6618,7 +6723,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6618 raw_spin_unlock_irqrestore(&rq->lock, flags); 6723 raw_spin_unlock_irqrestore(&rq->lock, flags);
6619 6724
6620 if (old_rd) 6725 if (old_rd)
6621 free_rootdomain(old_rd); 6726 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6622} 6727}
6623 6728
6624static int init_rootdomain(struct root_domain *rd) 6729static int init_rootdomain(struct root_domain *rd)
@@ -6669,6 +6774,25 @@ static struct root_domain *alloc_rootdomain(void)
6669 return rd; 6774 return rd;
6670} 6775}
6671 6776
6777static void free_sched_domain(struct rcu_head *rcu)
6778{
6779 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6780 if (atomic_dec_and_test(&sd->groups->ref))
6781 kfree(sd->groups);
6782 kfree(sd);
6783}
6784
6785static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6786{
6787 call_rcu(&sd->rcu, free_sched_domain);
6788}
6789
6790static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6791{
6792 for (; sd; sd = sd->parent)
6793 destroy_sched_domain(sd, cpu);
6794}
6795
6672/* 6796/*
6673 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6797 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6674 * hold the hotplug lock. 6798 * hold the hotplug lock.
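
free_sched_domain() above drops a reference on the shared sched_group and frees it only when the last domain using it goes away. A small sketch of that "last reference frees the shared object" idea with C11 atomics; names are invented and the kernel's atomic_dec_and_test() is approximated with atomic_fetch_sub():

/*
 * Editorial sketch: two toy domains share one group; whichever domain is
 * freed last also frees the group.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_group  { atomic_int ref; int power; };
struct toy_domain { struct toy_group *groups; };

static void toy_free_domain(struct toy_domain *sd)
{
	/* fetch_sub returns the old value: 1 means we dropped the last ref */
	if (atomic_fetch_sub(&sd->groups->ref, 1) == 1)
		free(sd->groups);
	free(sd);
}

int main(void)
{
	struct toy_group *sg = calloc(1, sizeof(*sg));
	struct toy_domain *a = calloc(1, sizeof(*a));
	struct toy_domain *b = calloc(1, sizeof(*b));

	atomic_init(&sg->ref, 2);            /* two domains share this group */
	a->groups = b->groups = sg;

	toy_free_domain(a);                  /* group survives */
	toy_free_domain(b);                  /* last ref: group freed too */
	printf("done\n");
	return 0;
}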
@@ -6679,9 +6803,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6679 struct rq *rq = cpu_rq(cpu); 6803 struct rq *rq = cpu_rq(cpu);
6680 struct sched_domain *tmp; 6804 struct sched_domain *tmp;
6681 6805
6682 for (tmp = sd; tmp; tmp = tmp->parent)
6683 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6684
6685 /* Remove the sched domains which do not contribute to scheduling. */ 6806 /* Remove the sched domains which do not contribute to scheduling. */
6686 for (tmp = sd; tmp; ) { 6807 for (tmp = sd; tmp; ) {
6687 struct sched_domain *parent = tmp->parent; 6808 struct sched_domain *parent = tmp->parent;
@@ -6692,12 +6813,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6692 tmp->parent = parent->parent; 6813 tmp->parent = parent->parent;
6693 if (parent->parent) 6814 if (parent->parent)
6694 parent->parent->child = tmp; 6815 parent->parent->child = tmp;
6816 destroy_sched_domain(parent, cpu);
6695 } else 6817 } else
6696 tmp = tmp->parent; 6818 tmp = tmp->parent;
6697 } 6819 }
6698 6820
6699 if (sd && sd_degenerate(sd)) { 6821 if (sd && sd_degenerate(sd)) {
6822 tmp = sd;
6700 sd = sd->parent; 6823 sd = sd->parent;
6824 destroy_sched_domain(tmp, cpu);
6701 if (sd) 6825 if (sd)
6702 sd->child = NULL; 6826 sd->child = NULL;
6703 } 6827 }
@@ -6705,7 +6829,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6705 sched_domain_debug(sd, cpu); 6829 sched_domain_debug(sd, cpu);
6706 6830
6707 rq_attach_root(rq, rd); 6831 rq_attach_root(rq, rd);
6832 tmp = rq->sd;
6708 rcu_assign_pointer(rq->sd, sd); 6833 rcu_assign_pointer(rq->sd, sd);
6834 destroy_sched_domains(tmp, cpu);
6709} 6835}
6710 6836
6711/* cpus with isolated domains */ 6837/* cpus with isolated domains */
@@ -6721,56 +6847,6 @@ static int __init isolated_cpu_setup(char *str)
6721 6847
6722__setup("isolcpus=", isolated_cpu_setup); 6848__setup("isolcpus=", isolated_cpu_setup);
6723 6849
6724/*
6725 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6726 * to a function which identifies what group(along with sched group) a CPU
6727 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6728 * (due to the fact that we keep track of groups covered with a struct cpumask).
6729 *
6730 * init_sched_build_groups will build a circular linked list of the groups
6731 * covered by the given span, and will set each group's ->cpumask correctly,
6732 * and ->cpu_power to 0.
6733 */
6734static void
6735init_sched_build_groups(const struct cpumask *span,
6736 const struct cpumask *cpu_map,
6737 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6738 struct sched_group **sg,
6739 struct cpumask *tmpmask),
6740 struct cpumask *covered, struct cpumask *tmpmask)
6741{
6742 struct sched_group *first = NULL, *last = NULL;
6743 int i;
6744
6745 cpumask_clear(covered);
6746
6747 for_each_cpu(i, span) {
6748 struct sched_group *sg;
6749 int group = group_fn(i, cpu_map, &sg, tmpmask);
6750 int j;
6751
6752 if (cpumask_test_cpu(i, covered))
6753 continue;
6754
6755 cpumask_clear(sched_group_cpus(sg));
6756 sg->cpu_power = 0;
6757
6758 for_each_cpu(j, span) {
6759 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6760 continue;
6761
6762 cpumask_set_cpu(j, covered);
6763 cpumask_set_cpu(j, sched_group_cpus(sg));
6764 }
6765 if (!first)
6766 first = sg;
6767 if (last)
6768 last->next = sg;
6769 last = sg;
6770 }
6771 last->next = first;
6772}
6773
6774#define SD_NODES_PER_DOMAIN 16 6850#define SD_NODES_PER_DOMAIN 16
6775 6851
6776#ifdef CONFIG_NUMA 6852#ifdef CONFIG_NUMA
@@ -6787,7 +6863,7 @@ init_sched_build_groups(const struct cpumask *span,
6787 */ 6863 */
6788static int find_next_best_node(int node, nodemask_t *used_nodes) 6864static int find_next_best_node(int node, nodemask_t *used_nodes)
6789{ 6865{
6790 int i, n, val, min_val, best_node = 0; 6866 int i, n, val, min_val, best_node = -1;
6791 6867
6792 min_val = INT_MAX; 6868 min_val = INT_MAX;
6793 6869
@@ -6811,7 +6887,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6811 } 6887 }
6812 } 6888 }
6813 6889
6814 node_set(best_node, *used_nodes); 6890 if (best_node != -1)
6891 node_set(best_node, *used_nodes);
6815 return best_node; 6892 return best_node;
6816} 6893}
6817 6894
@@ -6837,315 +6914,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6837 6914
6838 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6915 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6839 int next_node = find_next_best_node(node, &used_nodes); 6916 int next_node = find_next_best_node(node, &used_nodes);
6840 6917 if (next_node < 0)
6918 break;
6841 cpumask_or(span, span, cpumask_of_node(next_node)); 6919 cpumask_or(span, span, cpumask_of_node(next_node));
6842 } 6920 }
6843} 6921}
6922
6923static const struct cpumask *cpu_node_mask(int cpu)
6924{
6925 lockdep_assert_held(&sched_domains_mutex);
6926
6927 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6928
6929 return sched_domains_tmpmask;
6930}
6931
6932static const struct cpumask *cpu_allnodes_mask(int cpu)
6933{
6934 return cpu_possible_mask;
6935}
6844#endif /* CONFIG_NUMA */ 6936#endif /* CONFIG_NUMA */
6845 6937
6846int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6938static const struct cpumask *cpu_cpu_mask(int cpu)
6939{
6940 return cpumask_of_node(cpu_to_node(cpu));
6941}
6847 6942
6848/* 6943int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6849 * The cpus mask in sched_group and sched_domain hangs off the end.
6850 *
6851 * ( See the the comments in include/linux/sched.h:struct sched_group
6852 * and struct sched_domain. )
6853 */
6854struct static_sched_group {
6855 struct sched_group sg;
6856 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6857};
6858 6944
6859struct static_sched_domain { 6945struct sd_data {
6860 struct sched_domain sd; 6946 struct sched_domain **__percpu sd;
6861 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6947 struct sched_group **__percpu sg;
6862}; 6948};
6863 6949
6864struct s_data { 6950struct s_data {
6865#ifdef CONFIG_NUMA 6951 struct sched_domain ** __percpu sd;
6866 int sd_allnodes;
6867 cpumask_var_t domainspan;
6868 cpumask_var_t covered;
6869 cpumask_var_t notcovered;
6870#endif
6871 cpumask_var_t nodemask;
6872 cpumask_var_t this_sibling_map;
6873 cpumask_var_t this_core_map;
6874 cpumask_var_t this_book_map;
6875 cpumask_var_t send_covered;
6876 cpumask_var_t tmpmask;
6877 struct sched_group **sched_group_nodes;
6878 struct root_domain *rd; 6952 struct root_domain *rd;
6879}; 6953};
6880 6954
6881enum s_alloc { 6955enum s_alloc {
6882 sa_sched_groups = 0,
6883 sa_rootdomain, 6956 sa_rootdomain,
6884 sa_tmpmask, 6957 sa_sd,
6885 sa_send_covered, 6958 sa_sd_storage,
6886 sa_this_book_map,
6887 sa_this_core_map,
6888 sa_this_sibling_map,
6889 sa_nodemask,
6890 sa_sched_group_nodes,
6891#ifdef CONFIG_NUMA
6892 sa_notcovered,
6893 sa_covered,
6894 sa_domainspan,
6895#endif
6896 sa_none, 6959 sa_none,
6897}; 6960};
6898 6961
6899/* 6962struct sched_domain_topology_level;
6900 * SMT sched-domains:
6901 */
6902#ifdef CONFIG_SCHED_SMT
6903static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6904static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6905 6963
6906static int 6964typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6907cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6965typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6908 struct sched_group **sg, struct cpumask *unused)
6909{
6910 if (sg)
6911 *sg = &per_cpu(sched_groups, cpu).sg;
6912 return cpu;
6913}
6914#endif /* CONFIG_SCHED_SMT */
6915
6916/*
6917 * multi-core sched-domains:
6918 */
6919#ifdef CONFIG_SCHED_MC
6920static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6921static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6922 6966
6923static int 6967struct sched_domain_topology_level {
6924cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6968 sched_domain_init_f init;
6925 struct sched_group **sg, struct cpumask *mask) 6969 sched_domain_mask_f mask;
6926{ 6970 struct sd_data data;
6927 int group; 6971};
6928#ifdef CONFIG_SCHED_SMT
6929 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6930 group = cpumask_first(mask);
6931#else
6932 group = cpu;
6933#endif
6934 if (sg)
6935 *sg = &per_cpu(sched_group_core, group).sg;
6936 return group;
6937}
6938#endif /* CONFIG_SCHED_MC */
6939 6972
6940/* 6973/*
6941 * book sched-domains: 6974 * Assumes the sched_domain tree is fully constructed
6942 */ 6975 */
6943#ifdef CONFIG_SCHED_BOOK 6976static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6944static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6945static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6946
6947static int
6948cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6949 struct sched_group **sg, struct cpumask *mask)
6950{ 6977{
6951 int group = cpu; 6978 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6952#ifdef CONFIG_SCHED_MC 6979 struct sched_domain *child = sd->child;
6953 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6954 group = cpumask_first(mask);
6955#elif defined(CONFIG_SCHED_SMT)
6956 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6957 group = cpumask_first(mask);
6958#endif
6959 if (sg)
6960 *sg = &per_cpu(sched_group_book, group).sg;
6961 return group;
6962}
6963#endif /* CONFIG_SCHED_BOOK */
6964 6980
6965static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6981 if (child)
6966static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6982 cpu = cpumask_first(sched_domain_span(child));
6967 6983
6968static int
6969cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6970 struct sched_group **sg, struct cpumask *mask)
6971{
6972 int group;
6973#ifdef CONFIG_SCHED_BOOK
6974 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6975 group = cpumask_first(mask);
6976#elif defined(CONFIG_SCHED_MC)
6977 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6978 group = cpumask_first(mask);
6979#elif defined(CONFIG_SCHED_SMT)
6980 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6981 group = cpumask_first(mask);
6982#else
6983 group = cpu;
6984#endif
6985 if (sg) 6984 if (sg)
6986 *sg = &per_cpu(sched_group_phys, group).sg; 6985 *sg = *per_cpu_ptr(sdd->sg, cpu);
6987 return group; 6986
6987 return cpu;
6988} 6988}
6989 6989
6990#ifdef CONFIG_NUMA
6991/* 6990/*
6992 * The init_sched_build_groups can't handle what we want to do with node 6991 * build_sched_groups takes the cpumask we wish to span, and a pointer
6993 * groups, so roll our own. Now each node has its own list of groups which 6992 * to a function which identifies what group (along with sched group) a CPU
6994 * gets dynamically allocated. 6993 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6994 * (due to the fact that we keep track of groups covered with a struct cpumask).
6995 *
6996 * build_sched_groups will build a circular linked list of the groups
6997 * covered by the given span, and will set each group's ->cpumask correctly,
6998 * and ->cpu_power to 0.
6995 */ 6999 */
6996static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 7000static void
6997static struct sched_group ***sched_group_nodes_bycpu; 7001build_sched_groups(struct sched_domain *sd)
6998
6999static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
7000static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7001
7002static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7003 struct sched_group **sg,
7004 struct cpumask *nodemask)
7005{
7006 int group;
7007
7008 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7009 group = cpumask_first(nodemask);
7010
7011 if (sg)
7012 *sg = &per_cpu(sched_group_allnodes, group).sg;
7013 return group;
7014}
7015
7016static void init_numa_sched_groups_power(struct sched_group *group_head)
7017{
7018 struct sched_group *sg = group_head;
7019 int j;
7020
7021 if (!sg)
7022 return;
7023 do {
7024 for_each_cpu(j, sched_group_cpus(sg)) {
7025 struct sched_domain *sd;
7026
7027 sd = &per_cpu(phys_domains, j).sd;
7028 if (j != group_first_cpu(sd->groups)) {
7029 /*
7030 * Only add "power" once for each
7031 * physical package.
7032 */
7033 continue;
7034 }
7035
7036 sg->cpu_power += sd->groups->cpu_power;
7037 }
7038 sg = sg->next;
7039 } while (sg != group_head);
7040}
7041
7042static int build_numa_sched_groups(struct s_data *d,
7043 const struct cpumask *cpu_map, int num)
7044{ 7002{
7045 struct sched_domain *sd; 7003 struct sched_group *first = NULL, *last = NULL;
7046 struct sched_group *sg, *prev; 7004 struct sd_data *sdd = sd->private;
7047 int n, j; 7005 const struct cpumask *span = sched_domain_span(sd);
7048 7006 struct cpumask *covered;
7049 cpumask_clear(d->covered); 7007 int i;
7050 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
7051 if (cpumask_empty(d->nodemask)) {
7052 d->sched_group_nodes[num] = NULL;
7053 goto out;
7054 }
7055
7056 sched_domain_node_span(num, d->domainspan);
7057 cpumask_and(d->domainspan, d->domainspan, cpu_map);
7058
7059 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7060 GFP_KERNEL, num);
7061 if (!sg) {
7062 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
7063 num);
7064 return -ENOMEM;
7065 }
7066 d->sched_group_nodes[num] = sg;
7067
7068 for_each_cpu(j, d->nodemask) {
7069 sd = &per_cpu(node_domains, j).sd;
7070 sd->groups = sg;
7071 }
7072 7008
7073 sg->cpu_power = 0; 7009 lockdep_assert_held(&sched_domains_mutex);
7074 cpumask_copy(sched_group_cpus(sg), d->nodemask); 7010 covered = sched_domains_tmpmask;
7075 sg->next = sg;
7076 cpumask_or(d->covered, d->covered, d->nodemask);
7077 7011
7078 prev = sg; 7012 cpumask_clear(covered);
7079 for (j = 0; j < nr_node_ids; j++) {
7080 n = (num + j) % nr_node_ids;
7081 cpumask_complement(d->notcovered, d->covered);
7082 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
7083 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
7084 if (cpumask_empty(d->tmpmask))
7085 break;
7086 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
7087 if (cpumask_empty(d->tmpmask))
7088 continue;
7089 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7090 GFP_KERNEL, num);
7091 if (!sg) {
7092 printk(KERN_WARNING
7093 "Can not alloc domain group for node %d\n", j);
7094 return -ENOMEM;
7095 }
7096 sg->cpu_power = 0;
7097 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
7098 sg->next = prev->next;
7099 cpumask_or(d->covered, d->covered, d->tmpmask);
7100 prev->next = sg;
7101 prev = sg;
7102 }
7103out:
7104 return 0;
7105}
7106#endif /* CONFIG_NUMA */
7107
7108#ifdef CONFIG_NUMA
7109/* Free memory allocated for various sched_group structures */
7110static void free_sched_groups(const struct cpumask *cpu_map,
7111 struct cpumask *nodemask)
7112{
7113 int cpu, i;
7114 7013
7115 for_each_cpu(cpu, cpu_map) { 7014 for_each_cpu(i, span) {
7116 struct sched_group **sched_group_nodes 7015 struct sched_group *sg;
7117 = sched_group_nodes_bycpu[cpu]; 7016 int group = get_group(i, sdd, &sg);
7017 int j;
7118 7018
7119 if (!sched_group_nodes) 7019 if (cpumask_test_cpu(i, covered))
7120 continue; 7020 continue;
7121 7021
7122 for (i = 0; i < nr_node_ids; i++) { 7022 cpumask_clear(sched_group_cpus(sg));
7123 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7023 sg->cpu_power = 0;
7124 7024
7125 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 7025 for_each_cpu(j, span) {
7126 if (cpumask_empty(nodemask)) 7026 if (get_group(j, sdd, NULL) != group)
7127 continue; 7027 continue;
7128 7028
7129 if (sg == NULL) 7029 cpumask_set_cpu(j, covered);
7130 continue; 7030 cpumask_set_cpu(j, sched_group_cpus(sg));
7131 sg = sg->next;
7132next_sg:
7133 oldsg = sg;
7134 sg = sg->next;
7135 kfree(oldsg);
7136 if (oldsg != sched_group_nodes[i])
7137 goto next_sg;
7138 } 7031 }
7139 kfree(sched_group_nodes); 7032
7140 sched_group_nodes_bycpu[cpu] = NULL; 7033 if (!first)
7034 first = sg;
7035 if (last)
7036 last->next = sg;
7037 last = sg;
7141 } 7038 }
7039 last->next = first;
7142} 7040}
7143#else /* !CONFIG_NUMA */
7144static void free_sched_groups(const struct cpumask *cpu_map,
7145 struct cpumask *nodemask)
7146{
7147}
7148#endif /* CONFIG_NUMA */
7149 7041
7150/* 7042/*
7151 * Initialize sched groups cpu_power. 7043 * Initialize sched groups cpu_power.
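
As its comment says, the new build_sched_groups() above links the groups covering a span into a circular, singly linked list (last->next = first). A minimal standalone sketch of building and walking such a ring, with invented toy_* names:

/*
 * Editorial sketch: build a ring of groups and take one full lap around
 * it, the way sd->groups is traversed.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_group {
	int cpu_power;
	struct toy_group *next;
};

/* Link n freshly allocated groups into a ring and return the first one. */
static struct toy_group *toy_build_ring(int n)
{
	struct toy_group *first = NULL, *last = NULL;
	int i;

	for (i = 0; i < n; i++) {
		struct toy_group *sg = calloc(1, sizeof(*sg));

		sg->cpu_power = 1024;        /* one nominal power unit */
		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;                  /* close the ring */
	return first;
}

int main(void)
{
	struct toy_group *head = toy_build_ring(4), *sg = head;
	int total = 0;

	do {                                 /* one full lap around the ring */
		total += sg->cpu_power;
		sg = sg->next;
	} while (sg != head);

	printf("total power: %d\n", total);
	return 0;
}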
@@ -7159,11 +7051,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
7159 */ 7051 */
7160static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7052static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7161{ 7053{
7162 struct sched_domain *child;
7163 struct sched_group *group;
7164 long power;
7165 int weight;
7166
7167 WARN_ON(!sd || !sd->groups); 7054 WARN_ON(!sd || !sd->groups);
7168 7055
7169 if (cpu != group_first_cpu(sd->groups)) 7056 if (cpu != group_first_cpu(sd->groups))
@@ -7171,36 +7058,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7171 7058
7172 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7059 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7173 7060
7174 child = sd->child; 7061 update_group_power(sd, cpu);
7175
7176 sd->groups->cpu_power = 0;
7177
7178 if (!child) {
7179 power = SCHED_LOAD_SCALE;
7180 weight = cpumask_weight(sched_domain_span(sd));
7181 /*
7182 * SMT siblings share the power of a single core.
7183 * Usually multiple threads get a better yield out of
7184 * that one core than a single thread would have,
7185 * reflect that in sd->smt_gain.
7186 */
7187 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
7188 power *= sd->smt_gain;
7189 power /= weight;
7190 power >>= SCHED_LOAD_SHIFT;
7191 }
7192 sd->groups->cpu_power += power;
7193 return;
7194 }
7195
7196 /*
7197 * Add cpu_power of each child group to this groups cpu_power.
7198 */
7199 group = child->groups;
7200 do {
7201 sd->groups->cpu_power += group->cpu_power;
7202 group = group->next;
7203 } while (group != child->groups);
7204} 7062}
7205 7063
7206/* 7064/*
@@ -7214,15 +7072,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7214# define SD_INIT_NAME(sd, type) do { } while (0) 7072# define SD_INIT_NAME(sd, type) do { } while (0)
7215#endif 7073#endif
7216 7074
7217#define SD_INIT(sd, type) sd_init_##type(sd) 7075#define SD_INIT_FUNC(type) \
7218 7076static noinline struct sched_domain * \
7219#define SD_INIT_FUNC(type) \ 7077sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7220static noinline void sd_init_##type(struct sched_domain *sd) \ 7078{ \
7221{ \ 7079 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7222 memset(sd, 0, sizeof(*sd)); \ 7080 *sd = SD_##type##_INIT; \
7223 *sd = SD_##type##_INIT; \ 7081 SD_INIT_NAME(sd, type); \
7224 sd->level = SD_LV_##type; \ 7082 sd->private = &tl->data; \
7225 SD_INIT_NAME(sd, type); \ 7083 return sd; \
7226} 7084}
7227 7085
7228SD_INIT_FUNC(CPU) 7086SD_INIT_FUNC(CPU)
@@ -7241,13 +7099,14 @@ SD_INIT_FUNC(CPU)
7241#endif 7099#endif
7242 7100
7243static int default_relax_domain_level = -1; 7101static int default_relax_domain_level = -1;
7102int sched_domain_level_max;
7244 7103
7245static int __init setup_relax_domain_level(char *str) 7104static int __init setup_relax_domain_level(char *str)
7246{ 7105{
7247 unsigned long val; 7106 unsigned long val;
7248 7107
7249 val = simple_strtoul(str, NULL, 0); 7108 val = simple_strtoul(str, NULL, 0);
7250 if (val < SD_LV_MAX) 7109 if (val < sched_domain_level_max)
7251 default_relax_domain_level = val; 7110 default_relax_domain_level = val;
7252 7111
7253 return 1; 7112 return 1;
@@ -7275,37 +7134,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7275 } 7134 }
7276} 7135}
7277 7136
7137static void __sdt_free(const struct cpumask *cpu_map);
7138static int __sdt_alloc(const struct cpumask *cpu_map);
7139
7278static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7140static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7279 const struct cpumask *cpu_map) 7141 const struct cpumask *cpu_map)
7280{ 7142{
7281 switch (what) { 7143 switch (what) {
7282 case sa_sched_groups:
7283 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7284 d->sched_group_nodes = NULL;
7285 case sa_rootdomain: 7144 case sa_rootdomain:
7286 free_rootdomain(d->rd); /* fall through */ 7145 if (!atomic_read(&d->rd->refcount))
7287 case sa_tmpmask: 7146 free_rootdomain(&d->rd->rcu); /* fall through */
7288 free_cpumask_var(d->tmpmask); /* fall through */ 7147 case sa_sd:
7289 case sa_send_covered: 7148 free_percpu(d->sd); /* fall through */
7290 free_cpumask_var(d->send_covered); /* fall through */ 7149 case sa_sd_storage:
7291 case sa_this_book_map: 7150 __sdt_free(cpu_map); /* fall through */
7292 free_cpumask_var(d->this_book_map); /* fall through */
7293 case sa_this_core_map:
7294 free_cpumask_var(d->this_core_map); /* fall through */
7295 case sa_this_sibling_map:
7296 free_cpumask_var(d->this_sibling_map); /* fall through */
7297 case sa_nodemask:
7298 free_cpumask_var(d->nodemask); /* fall through */
7299 case sa_sched_group_nodes:
7300#ifdef CONFIG_NUMA
7301 kfree(d->sched_group_nodes); /* fall through */
7302 case sa_notcovered:
7303 free_cpumask_var(d->notcovered); /* fall through */
7304 case sa_covered:
7305 free_cpumask_var(d->covered); /* fall through */
7306 case sa_domainspan:
7307 free_cpumask_var(d->domainspan); /* fall through */
7308#endif
7309 case sa_none: 7151 case sa_none:
7310 break; 7152 break;
7311 } 7153 }
@@ -7314,308 +7156,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7314static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7156static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7315 const struct cpumask *cpu_map) 7157 const struct cpumask *cpu_map)
7316{ 7158{
7317#ifdef CONFIG_NUMA 7159 memset(d, 0, sizeof(*d));
7318 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7160
7319 return sa_none; 7161 if (__sdt_alloc(cpu_map))
7320 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7162 return sa_sd_storage;
7321 return sa_domainspan; 7163 d->sd = alloc_percpu(struct sched_domain *);
7322 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7164 if (!d->sd)
7323 return sa_covered; 7165 return sa_sd_storage;
7324 /* Allocate the per-node list of sched groups */
7325 d->sched_group_nodes = kcalloc(nr_node_ids,
7326 sizeof(struct sched_group *), GFP_KERNEL);
7327 if (!d->sched_group_nodes) {
7328 printk(KERN_WARNING "Can not alloc sched group node list\n");
7329 return sa_notcovered;
7330 }
7331 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7332#endif
7333 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7334 return sa_sched_group_nodes;
7335 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7336 return sa_nodemask;
7337 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7338 return sa_this_sibling_map;
7339 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7340 return sa_this_core_map;
7341 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7342 return sa_this_book_map;
7343 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7344 return sa_send_covered;
7345 d->rd = alloc_rootdomain(); 7166 d->rd = alloc_rootdomain();
7346 if (!d->rd) { 7167 if (!d->rd)
7347 printk(KERN_WARNING "Cannot alloc root domain\n"); 7168 return sa_sd;
7348 return sa_tmpmask;
7349 }
7350 return sa_rootdomain; 7169 return sa_rootdomain;
7351} 7170}
7352 7171
7353static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7172/*
7354 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7173 * NULL the sd_data elements we've used to build the sched_domain and
7174 * sched_group structure so that the subsequent __free_domain_allocs()
7175 * will not free the data we're using.
7176 */
7177static void claim_allocations(int cpu, struct sched_domain *sd)
7355{ 7178{
7356 struct sched_domain *sd = NULL; 7179 struct sd_data *sdd = sd->private;
7357#ifdef CONFIG_NUMA 7180 struct sched_group *sg = sd->groups;
7358 struct sched_domain *parent;
7359
7360 d->sd_allnodes = 0;
7361 if (cpumask_weight(cpu_map) >
7362 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7363 sd = &per_cpu(allnodes_domains, i).sd;
7364 SD_INIT(sd, ALLNODES);
7365 set_domain_attribute(sd, attr);
7366 cpumask_copy(sched_domain_span(sd), cpu_map);
7367 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7368 d->sd_allnodes = 1;
7369 }
7370 parent = sd;
7371
7372 sd = &per_cpu(node_domains, i).sd;
7373 SD_INIT(sd, NODE);
7374 set_domain_attribute(sd, attr);
7375 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7376 sd->parent = parent;
7377 if (parent)
7378 parent->child = sd;
7379 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7380#endif
7381 return sd;
7382}
7383 7181
7384static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7182 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7385 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7183 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7386 struct sched_domain *parent, int i)
7387{
7388 struct sched_domain *sd;
7389 sd = &per_cpu(phys_domains, i).sd;
7390 SD_INIT(sd, CPU);
7391 set_domain_attribute(sd, attr);
7392 cpumask_copy(sched_domain_span(sd), d->nodemask);
7393 sd->parent = parent;
7394 if (parent)
7395 parent->child = sd;
7396 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7397 return sd;
7398}
7399 7184
7400static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7185 if (cpu == cpumask_first(sched_group_cpus(sg))) {
7401 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7186 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7402 struct sched_domain *parent, int i) 7187 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7403{ 7188 }
7404 struct sched_domain *sd = parent;
7405#ifdef CONFIG_SCHED_BOOK
7406 sd = &per_cpu(book_domains, i).sd;
7407 SD_INIT(sd, BOOK);
7408 set_domain_attribute(sd, attr);
7409 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7410 sd->parent = parent;
7411 parent->child = sd;
7412 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7413#endif
7414 return sd;
7415} 7189}
7416 7190
7417static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7191#ifdef CONFIG_SCHED_SMT
7418 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7192static const struct cpumask *cpu_smt_mask(int cpu)
7419 struct sched_domain *parent, int i)
7420{ 7193{
7421 struct sched_domain *sd = parent; 7194 return topology_thread_cpumask(cpu);
7422#ifdef CONFIG_SCHED_MC
7423 sd = &per_cpu(core_domains, i).sd;
7424 SD_INIT(sd, MC);
7425 set_domain_attribute(sd, attr);
7426 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7427 sd->parent = parent;
7428 parent->child = sd;
7429 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7430#endif
7431 return sd;
7432} 7195}
7433
7434static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7435 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7436 struct sched_domain *parent, int i)
7437{
7438 struct sched_domain *sd = parent;
7439#ifdef CONFIG_SCHED_SMT
7440 sd = &per_cpu(cpu_domains, i).sd;
7441 SD_INIT(sd, SIBLING);
7442 set_domain_attribute(sd, attr);
7443 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7444 sd->parent = parent;
7445 parent->child = sd;
7446 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7447#endif 7196#endif
7448 return sd;
7449}
7450 7197
7451static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7198/*
7452 const struct cpumask *cpu_map, int cpu) 7199 * Topology list, bottom-up.
7453{ 7200 */
7454 switch (l) { 7201static struct sched_domain_topology_level default_topology[] = {
7455#ifdef CONFIG_SCHED_SMT 7202#ifdef CONFIG_SCHED_SMT
7456 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7203 { sd_init_SIBLING, cpu_smt_mask, },
7457 cpumask_and(d->this_sibling_map, cpu_map,
7458 topology_thread_cpumask(cpu));
7459 if (cpu == cpumask_first(d->this_sibling_map))
7460 init_sched_build_groups(d->this_sibling_map, cpu_map,
7461 &cpu_to_cpu_group,
7462 d->send_covered, d->tmpmask);
7463 break;
7464#endif 7204#endif
7465#ifdef CONFIG_SCHED_MC 7205#ifdef CONFIG_SCHED_MC
7466 case SD_LV_MC: /* set up multi-core groups */ 7206 { sd_init_MC, cpu_coregroup_mask, },
7467 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7468 if (cpu == cpumask_first(d->this_core_map))
7469 init_sched_build_groups(d->this_core_map, cpu_map,
7470 &cpu_to_core_group,
7471 d->send_covered, d->tmpmask);
7472 break;
7473#endif 7207#endif
7474#ifdef CONFIG_SCHED_BOOK 7208#ifdef CONFIG_SCHED_BOOK
7475 case SD_LV_BOOK: /* set up book groups */ 7209 { sd_init_BOOK, cpu_book_mask, },
7476 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7477 if (cpu == cpumask_first(d->this_book_map))
7478 init_sched_build_groups(d->this_book_map, cpu_map,
7479 &cpu_to_book_group,
7480 d->send_covered, d->tmpmask);
7481 break;
7482#endif 7210#endif
7483 case SD_LV_CPU: /* set up physical groups */ 7211 { sd_init_CPU, cpu_cpu_mask, },
7484 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7485 if (!cpumask_empty(d->nodemask))
7486 init_sched_build_groups(d->nodemask, cpu_map,
7487 &cpu_to_phys_group,
7488 d->send_covered, d->tmpmask);
7489 break;
7490#ifdef CONFIG_NUMA 7212#ifdef CONFIG_NUMA
7491 case SD_LV_ALLNODES: 7213 { sd_init_NODE, cpu_node_mask, },
7492 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7214 { sd_init_ALLNODES, cpu_allnodes_mask, },
7493 d->send_covered, d->tmpmask);
7494 break;
7495#endif 7215#endif
7496 default: 7216 { NULL, },
7497 break; 7217};
7218
7219static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7220
7221static int __sdt_alloc(const struct cpumask *cpu_map)
7222{
7223 struct sched_domain_topology_level *tl;
7224 int j;
7225
7226 for (tl = sched_domain_topology; tl->init; tl++) {
7227 struct sd_data *sdd = &tl->data;
7228
7229 sdd->sd = alloc_percpu(struct sched_domain *);
7230 if (!sdd->sd)
7231 return -ENOMEM;
7232
7233 sdd->sg = alloc_percpu(struct sched_group *);
7234 if (!sdd->sg)
7235 return -ENOMEM;
7236
7237 for_each_cpu(j, cpu_map) {
7238 struct sched_domain *sd;
7239 struct sched_group *sg;
7240
7241 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7242 GFP_KERNEL, cpu_to_node(j));
7243 if (!sd)
7244 return -ENOMEM;
7245
7246 *per_cpu_ptr(sdd->sd, j) = sd;
7247
7248 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7249 GFP_KERNEL, cpu_to_node(j));
7250 if (!sg)
7251 return -ENOMEM;
7252
7253 *per_cpu_ptr(sdd->sg, j) = sg;
7254 }
7498 } 7255 }
7256
7257 return 0;
7258}
7259
7260static void __sdt_free(const struct cpumask *cpu_map)
7261{
7262 struct sched_domain_topology_level *tl;
7263 int j;
7264
7265 for (tl = sched_domain_topology; tl->init; tl++) {
7266 struct sd_data *sdd = &tl->data;
7267
7268 for_each_cpu(j, cpu_map) {
7269 kfree(*per_cpu_ptr(sdd->sd, j));
7270 kfree(*per_cpu_ptr(sdd->sg, j));
7271 }
7272 free_percpu(sdd->sd);
7273 free_percpu(sdd->sg);
7274 }
7275}
7276
7277struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7278 struct s_data *d, const struct cpumask *cpu_map,
7279 struct sched_domain_attr *attr, struct sched_domain *child,
7280 int cpu)
7281{
7282 struct sched_domain *sd = tl->init(tl, cpu);
7283 if (!sd)
7284 return child;
7285
7286 set_domain_attribute(sd, attr);
7287 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7288 if (child) {
7289 sd->level = child->level + 1;
7290 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7291 child->parent = sd;
7292 }
7293 sd->child = child;
7294
7295 return sd;
7499} 7296}
7500 7297
7501/* 7298/*
7502 * Build sched domains for a given set of cpus and attach the sched domains 7299 * Build sched domains for a given set of cpus and attach the sched domains
7503 * to the individual cpus 7300 * to the individual cpus
7504 */ 7301 */
7505static int __build_sched_domains(const struct cpumask *cpu_map, 7302static int build_sched_domains(const struct cpumask *cpu_map,
7506 struct sched_domain_attr *attr) 7303 struct sched_domain_attr *attr)
7507{ 7304{
7508 enum s_alloc alloc_state = sa_none; 7305 enum s_alloc alloc_state = sa_none;
7509 struct s_data d;
7510 struct sched_domain *sd; 7306 struct sched_domain *sd;
7511 int i; 7307 struct s_data d;
7512#ifdef CONFIG_NUMA 7308 int i, ret = -ENOMEM;
7513 d.sd_allnodes = 0;
7514#endif
7515 7309
7516 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7310 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7517 if (alloc_state != sa_rootdomain) 7311 if (alloc_state != sa_rootdomain)
7518 goto error; 7312 goto error;
7519 alloc_state = sa_sched_groups;
7520 7313
7521 /* 7314 /* Set up domains for cpus specified by the cpu_map. */
7522 * Set up domains for cpus specified by the cpu_map.
7523 */
7524 for_each_cpu(i, cpu_map) { 7315 for_each_cpu(i, cpu_map) {
7525 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7316 struct sched_domain_topology_level *tl;
7526 cpu_map);
7527 7317
7528 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7318 sd = NULL;
7529 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7319 for (tl = sched_domain_topology; tl->init; tl++)
7530 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); 7320 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7531 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7532 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7533 }
7534 7321
7535 for_each_cpu(i, cpu_map) { 7322 while (sd->child)
7536 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7323 sd = sd->child;
7537 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7538 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7539 }
7540 7324
7541 /* Set up physical groups */ 7325 *per_cpu_ptr(d.sd, i) = sd;
7542 for (i = 0; i < nr_node_ids; i++)
7543 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7544
7545#ifdef CONFIG_NUMA
7546 /* Set up node groups */
7547 if (d.sd_allnodes)
7548 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7549
7550 for (i = 0; i < nr_node_ids; i++)
7551 if (build_numa_sched_groups(&d, cpu_map, i))
7552 goto error;
7553#endif
7554
7555 /* Calculate CPU power for physical packages and nodes */
7556#ifdef CONFIG_SCHED_SMT
7557 for_each_cpu(i, cpu_map) {
7558 sd = &per_cpu(cpu_domains, i).sd;
7559 init_sched_groups_power(i, sd);
7560 }
7561#endif
7562#ifdef CONFIG_SCHED_MC
7563 for_each_cpu(i, cpu_map) {
7564 sd = &per_cpu(core_domains, i).sd;
7565 init_sched_groups_power(i, sd);
7566 }
7567#endif
7568#ifdef CONFIG_SCHED_BOOK
7569 for_each_cpu(i, cpu_map) {
7570 sd = &per_cpu(book_domains, i).sd;
7571 init_sched_groups_power(i, sd);
7572 } 7326 }
7573#endif
7574 7327
7328 /* Build the groups for the domains */
7575 for_each_cpu(i, cpu_map) { 7329 for_each_cpu(i, cpu_map) {
7576 sd = &per_cpu(phys_domains, i).sd; 7330 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7577 init_sched_groups_power(i, sd); 7331 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7578 } 7332 get_group(i, sd->private, &sd->groups);
7333 atomic_inc(&sd->groups->ref);
7579 7334
7580#ifdef CONFIG_NUMA 7335 if (i != cpumask_first(sched_domain_span(sd)))
7581 for (i = 0; i < nr_node_ids; i++) 7336 continue;
7582 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7583 7337
7584 if (d.sd_allnodes) { 7338 build_sched_groups(sd);
7585 struct sched_group *sg; 7339 }
7340 }
7341
7342 /* Calculate CPU power for physical packages and nodes */
7343 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7344 if (!cpumask_test_cpu(i, cpu_map))
7345 continue;
7586 7346
7587 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7347 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7588 d.tmpmask); 7348 claim_allocations(i, sd);
7589 init_numa_sched_groups_power(sg); 7349 init_sched_groups_power(i, sd);
7350 }
7590 } 7351 }
7591#endif
7592 7352
7593 /* Attach the domains */ 7353 /* Attach the domains */
7354 rcu_read_lock();
7594 for_each_cpu(i, cpu_map) { 7355 for_each_cpu(i, cpu_map) {
7595#ifdef CONFIG_SCHED_SMT 7356 sd = *per_cpu_ptr(d.sd, i);
7596 sd = &per_cpu(cpu_domains, i).sd;
7597#elif defined(CONFIG_SCHED_MC)
7598 sd = &per_cpu(core_domains, i).sd;
7599#elif defined(CONFIG_SCHED_BOOK)
7600 sd = &per_cpu(book_domains, i).sd;
7601#else
7602 sd = &per_cpu(phys_domains, i).sd;
7603#endif
7604 cpu_attach_domain(sd, d.rd, i); 7357 cpu_attach_domain(sd, d.rd, i);
7605 } 7358 }
7359 rcu_read_unlock();
7606 7360
7607 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7361 ret = 0;
7608 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7609 return 0;
7610
7611error: 7362error:
7612 __free_domain_allocs(&d, alloc_state, cpu_map); 7363 __free_domain_allocs(&d, alloc_state, cpu_map);
7613 return -ENOMEM; 7364 return ret;
7614}
7615
7616static int build_sched_domains(const struct cpumask *cpu_map)
7617{
7618 return __build_sched_domains(cpu_map, NULL);
7619} 7365}
7620 7366
7621static cpumask_var_t *doms_cur; /* current sched domains */ 7367static cpumask_var_t *doms_cur; /* current sched domains */
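
The hunk above replaces the per-level #ifdef chains with a table-driven build: a NULL-terminated default_topology[] of { init, mask } levels, walked bottom-up, each level producing a domain whose child is the previous one. A compact userspace sketch of that control flow follows; the per-level init/mask operations are reduced to a name, and all toy_* identifiers are invented for illustration.

/*
 * Editorial sketch: iterate a NULL-terminated topology table bottom-up,
 * chaining each new domain above the previous level's domain.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_domain {
	const char *name;
	int level;
	struct toy_domain *parent, *child;
};

struct toy_topology_level {
	const char *name;                    /* stands in for the init/mask ops */
};

/* Bottom-up list, analogous to default_topology[]: SMT, MC, CPU, sentinel. */
static const struct toy_topology_level toy_topology[] = {
	{ "SIBLING" },
	{ "MC"      },
	{ "CPU"     },
	{ NULL      },
};

static struct toy_domain *toy_build_domain(const struct toy_topology_level *tl,
					   struct toy_domain *child)
{
	struct toy_domain *sd = calloc(1, sizeof(*sd));

	sd->name = tl->name;
	if (child) {
		sd->level = child->level + 1;
		child->parent = sd;
	}
	sd->child = child;
	return sd;
}

int main(void)
{
	const struct toy_topology_level *tl;
	struct toy_domain *sd = NULL;

	for (tl = toy_topology; tl->name; tl++)      /* stop at the sentinel */
		sd = toy_build_domain(tl, sd);

	/* Walk back down from the top-most domain to the base one. */
	for (; sd; sd = sd->child)
		printf("level %d: %s\n", sd->level, sd->name);
	return 0;
}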
@@ -7670,7 +7416,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7670 * For now this just excludes isolated cpus, but could be used to 7416 * For now this just excludes isolated cpus, but could be used to
7671 * exclude other special cases in the future. 7417 * exclude other special cases in the future.
7672 */ 7418 */
7673static int arch_init_sched_domains(const struct cpumask *cpu_map) 7419static int init_sched_domains(const struct cpumask *cpu_map)
7674{ 7420{
7675 int err; 7421 int err;
7676 7422
@@ -7681,32 +7427,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7681 doms_cur = &fallback_doms; 7427 doms_cur = &fallback_doms;
7682 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7428 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7683 dattr_cur = NULL; 7429 dattr_cur = NULL;
7684 err = build_sched_domains(doms_cur[0]); 7430 err = build_sched_domains(doms_cur[0], NULL);
7685 register_sched_domain_sysctl(); 7431 register_sched_domain_sysctl();
7686 7432
7687 return err; 7433 return err;
7688} 7434}
7689 7435
7690static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7691 struct cpumask *tmpmask)
7692{
7693 free_sched_groups(cpu_map, tmpmask);
7694}
7695
7696/* 7436/*
7697 * Detach sched domains from a group of cpus specified in cpu_map 7437 * Detach sched domains from a group of cpus specified in cpu_map
7698 * These cpus will now be attached to the NULL domain 7438 * These cpus will now be attached to the NULL domain
7699 */ 7439 */
7700static void detach_destroy_domains(const struct cpumask *cpu_map) 7440static void detach_destroy_domains(const struct cpumask *cpu_map)
7701{ 7441{
7702 /* Save because hotplug lock held. */
7703 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7704 int i; 7442 int i;
7705 7443
7444 rcu_read_lock();
7706 for_each_cpu(i, cpu_map) 7445 for_each_cpu(i, cpu_map)
7707 cpu_attach_domain(NULL, &def_root_domain, i); 7446 cpu_attach_domain(NULL, &def_root_domain, i);
7708 synchronize_sched(); 7447 rcu_read_unlock();
7709 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7710} 7448}
7711 7449
7712/* handle null as "default" */ 7450/* handle null as "default" */
@@ -7795,8 +7533,7 @@ match1:
7795 goto match2; 7533 goto match2;
7796 } 7534 }
7797 /* no match - add a new doms_new */ 7535 /* no match - add a new doms_new */
7798 __build_sched_domains(doms_new[i], 7536 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7799 dattr_new ? dattr_new + i : NULL);
7800match2: 7537match2:
7801 ; 7538 ;
7802 } 7539 }
@@ -7815,7 +7552,7 @@ match2:
7815} 7552}
7816 7553
7817#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7554#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7818static void arch_reinit_sched_domains(void) 7555static void reinit_sched_domains(void)
7819{ 7556{
7820 get_online_cpus(); 7557 get_online_cpus();
7821 7558
@@ -7848,7 +7585,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7848 else 7585 else
7849 sched_mc_power_savings = level; 7586 sched_mc_power_savings = level;
7850 7587
7851 arch_reinit_sched_domains(); 7588 reinit_sched_domains();
7852 7589
7853 return count; 7590 return count;
7854} 7591}
@@ -7967,14 +7704,9 @@ void __init sched_init_smp(void)
7967 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7704 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7968 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7705 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7969 7706
7970#if defined(CONFIG_NUMA)
7971 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7972 GFP_KERNEL);
7973 BUG_ON(sched_group_nodes_bycpu == NULL);
7974#endif
7975 get_online_cpus(); 7707 get_online_cpus();
7976 mutex_lock(&sched_domains_mutex); 7708 mutex_lock(&sched_domains_mutex);
7977 arch_init_sched_domains(cpu_active_mask); 7709 init_sched_domains(cpu_active_mask);
7978 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7710 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7979 if (cpumask_empty(non_isolated_cpus)) 7711 if (cpumask_empty(non_isolated_cpus))
7980 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7712 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -8025,6 +7757,9 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
8025#endif 7757#endif
8026#endif 7758#endif
8027 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7759 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7760#ifndef CONFIG_64BIT
7761 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7762#endif
8028} 7763}
8029 7764
8030static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7765static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -8224,7 +7959,7 @@ void __init sched_init(void)
8224#ifdef CONFIG_SMP 7959#ifdef CONFIG_SMP
8225 rq->sd = NULL; 7960 rq->sd = NULL;
8226 rq->rd = NULL; 7961 rq->rd = NULL;
8227 rq->cpu_power = SCHED_LOAD_SCALE; 7962 rq->cpu_power = SCHED_POWER_SCALE;
8228 rq->post_schedule = 0; 7963 rq->post_schedule = 0;
8229 rq->active_balance = 0; 7964 rq->active_balance = 0;
8230 rq->next_balance = jiffies; 7965 rq->next_balance = jiffies;
@@ -8281,6 +8016,7 @@ void __init sched_init(void)
8281 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8016 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8282 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8017 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8283#ifdef CONFIG_SMP 8018#ifdef CONFIG_SMP
8019 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8284#ifdef CONFIG_NO_HZ 8020#ifdef CONFIG_NO_HZ
8285 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8021 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8286 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8022 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8340,7 +8076,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8340 int old_prio = p->prio; 8076 int old_prio = p->prio;
8341 int on_rq; 8077 int on_rq;
8342 8078
8343 on_rq = p->se.on_rq; 8079 on_rq = p->on_rq;
8344 if (on_rq) 8080 if (on_rq)
8345 deactivate_task(rq, p, 0); 8081 deactivate_task(rq, p, 0);
8346 __setscheduler(rq, p, SCHED_NORMAL, 0); 8082 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8553,7 +8289,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8553{ 8289{
8554 struct rt_rq *rt_rq; 8290 struct rt_rq *rt_rq;
8555 struct sched_rt_entity *rt_se; 8291 struct sched_rt_entity *rt_se;
8556 struct rq *rq;
8557 int i; 8292 int i;
8558 8293
8559 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8294 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8567,8 +8302,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8567 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8302 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8568 8303
8569 for_each_possible_cpu(i) { 8304 for_each_possible_cpu(i) {
8570 rq = cpu_rq(i);
8571
8572 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8305 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8573 GFP_KERNEL, cpu_to_node(i)); 8306 GFP_KERNEL, cpu_to_node(i));
8574 if (!rt_rq) 8307 if (!rt_rq)
@@ -8683,7 +8416,7 @@ void sched_move_task(struct task_struct *tsk)
8683 rq = task_rq_lock(tsk, &flags); 8416 rq = task_rq_lock(tsk, &flags);
8684 8417
8685 running = task_current(rq, tsk); 8418 running = task_current(rq, tsk);
8686 on_rq = tsk->se.on_rq; 8419 on_rq = tsk->on_rq;
8687 8420
8688 if (on_rq) 8421 if (on_rq)
8689 dequeue_task(rq, tsk, 0); 8422 dequeue_task(rq, tsk, 0);
@@ -8702,7 +8435,7 @@ void sched_move_task(struct task_struct *tsk)
8702 if (on_rq) 8435 if (on_rq)
8703 enqueue_task(rq, tsk, 0); 8436 enqueue_task(rq, tsk, 0);
8704 8437
8705 task_rq_unlock(rq, &flags); 8438 task_rq_unlock(rq, tsk, &flags);
8706} 8439}
8707#endif /* CONFIG_CGROUP_SCHED */ 8440#endif /* CONFIG_CGROUP_SCHED */
8708 8441
@@ -8720,10 +8453,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8720 if (!tg->se[0]) 8453 if (!tg->se[0])
8721 return -EINVAL; 8454 return -EINVAL;
8722 8455
8723 if (shares < MIN_SHARES) 8456 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8724 shares = MIN_SHARES;
8725 else if (shares > MAX_SHARES)
8726 shares = MAX_SHARES;
8727 8457
8728 mutex_lock(&shares_mutex); 8458 mutex_lock(&shares_mutex);
8729 if (tg->shares == shares) 8459 if (tg->shares == shares)
@@ -9073,42 +8803,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9073 return 0; 8803 return 0;
9074} 8804}
9075 8805
9076static int
9077cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9078 struct task_struct *tsk, bool threadgroup)
9079{
9080 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
9081 if (retval)
9082 return retval;
9083 if (threadgroup) {
9084 struct task_struct *c;
9085 rcu_read_lock();
9086 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9087 retval = cpu_cgroup_can_attach_task(cgrp, c);
9088 if (retval) {
9089 rcu_read_unlock();
9090 return retval;
9091 }
9092 }
9093 rcu_read_unlock();
9094 }
9095 return 0;
9096}
9097
9098static void 8806static void
9099cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8807cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9100 struct cgroup *old_cont, struct task_struct *tsk,
9101 bool threadgroup)
9102{ 8808{
9103 sched_move_task(tsk); 8809 sched_move_task(tsk);
9104 if (threadgroup) {
9105 struct task_struct *c;
9106 rcu_read_lock();
9107 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9108 sched_move_task(c);
9109 }
9110 rcu_read_unlock();
9111 }
9112} 8810}
9113 8811
9114static void 8812static void
@@ -9130,14 +8828,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9130static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8828static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9131 u64 shareval) 8829 u64 shareval)
9132{ 8830{
9133 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8831 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
9134} 8832}
9135 8833
9136static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8834static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9137{ 8835{
9138 struct task_group *tg = cgroup_tg(cgrp); 8836 struct task_group *tg = cgroup_tg(cgrp);
9139 8837
9140 return (u64) tg->shares; 8838 return (u64) scale_load_down(tg->shares);
9141} 8839}
9142#endif /* CONFIG_FAIR_GROUP_SCHED */ 8840#endif /* CONFIG_FAIR_GROUP_SCHED */
9143 8841
@@ -9196,8 +8894,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9196 .name = "cpu", 8894 .name = "cpu",
9197 .create = cpu_cgroup_create, 8895 .create = cpu_cgroup_create,
9198 .destroy = cpu_cgroup_destroy, 8896 .destroy = cpu_cgroup_destroy,
9199 .can_attach = cpu_cgroup_can_attach, 8897 .can_attach_task = cpu_cgroup_can_attach_task,
9200 .attach = cpu_cgroup_attach, 8898 .attach_task = cpu_cgroup_attach_task,
9201 .exit = cpu_cgroup_exit, 8899 .exit = cpu_cgroup_exit,
9202 .populate = cpu_cgroup_populate, 8900 .populate = cpu_cgroup_populate,
9203 .subsys_id = cpu_cgroup_subsys_id, 8901 .subsys_id = cpu_cgroup_subsys_id,
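A note on the two cpu-cgroup changes visible above. First, the controller now wires up the per-task .can_attach_task/.attach_task callbacks instead of open-coding the thread-group walk, so cgroup core iterates the threadgroup and the scheduler only handles one task at a time. Second, cpu_shares_write_u64()/cpu_shares_read_u64() convert between the user-visible cpu.shares value and the kernel's internal load weight with scale_load()/scale_load_down(). A minimal sketch of that scaling, assuming the increased-resolution weight macros this series adds elsewhere (the authoritative definitions live in include/linux/sched.h, not in this hunk):

    #if BITS_PER_LONG > 32
    # define SCHED_LOAD_RESOLUTION  10      /* extra precision bits on 64-bit */
    # define scale_load(w)          ((w) << SCHED_LOAD_RESOLUTION)
    # define scale_load_down(w)     ((w) >> SCHED_LOAD_RESOLUTION)
    #else
    # define SCHED_LOAD_RESOLUTION  0       /* 32-bit keeps the old resolution */
    # define scale_load(w)          (w)
    # define scale_load_down(w)     (w)
    #endif

With that in place, writes are scaled up before sched_group_set_shares() clamps against scale_load(MIN_SHARES)..scale_load(MAX_SHARES), and reads scale tg->shares back down, so userspace keeps seeing shares in the usual 2..262144 range.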
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 7bacd83a4158..a6710a112b4f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
152 read_lock_irqsave(&tasklist_lock, flags); 152 read_lock_irqsave(&tasklist_lock, flags);
153 153
154 do_each_thread(g, p) { 154 do_each_thread(g, p) {
155 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 155 if (!p->on_rq || task_cpu(p) != rq_cpu)
156 continue; 156 continue;
157 157
158 print_task(m, rq, p); 158 print_task(m, rq, p);
@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
301
302#undef P 299#undef P
303#undef P64 300#undef P64
304#endif 301#endif
@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
441 P(se.statistics.wait_count); 438 P(se.statistics.wait_count);
442 PN(se.statistics.iowait_sum); 439 PN(se.statistics.iowait_sum);
443 P(se.statistics.iowait_count); 440 P(se.statistics.iowait_count);
444 P(sched_info.bkl_count);
445 P(se.nr_migrations); 441 P(se.nr_migrations);
446 P(se.statistics.nr_migrations_cold); 442 P(se.statistics.nr_migrations_cold);
447 P(se.statistics.nr_failed_migrations_affine); 443 P(se.statistics.nr_failed_migrations_affine);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6fa833ab2cb8..433491c2dc8f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
358 } 358 }
359 359
360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
361#ifndef CONFIG_64BIT
362 smp_wmb();
363 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
364#endif
361} 365}
362 366
363/* 367/*
@@ -1072,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1072 se->on_rq = 0; 1076 se->on_rq = 0;
1073 update_cfs_load(cfs_rq, 0); 1077 update_cfs_load(cfs_rq, 0);
1074 account_entity_dequeue(cfs_rq, se); 1078 account_entity_dequeue(cfs_rq, se);
1075 update_min_vruntime(cfs_rq);
1076 update_cfs_shares(cfs_rq);
1077 1079
1078 /* 1080 /*
1079 * Normalize the entity after updating the min_vruntime because the 1081 * Normalize the entity after updating the min_vruntime because the
@@ -1082,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1082 */ 1084 */
1083 if (!(flags & DEQUEUE_SLEEP)) 1085 if (!(flags & DEQUEUE_SLEEP))
1084 se->vruntime -= cfs_rq->min_vruntime; 1086 se->vruntime -= cfs_rq->min_vruntime;
1087
1088 update_min_vruntime(cfs_rq);
1089 update_cfs_shares(cfs_rq);
1085} 1090}
1086 1091
1087/* 1092/*
@@ -1340,6 +1345,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1340 hrtick_update(rq); 1345 hrtick_update(rq);
1341} 1346}
1342 1347
1348static void set_next_buddy(struct sched_entity *se);
1349
1343/* 1350/*
1344 * The dequeue_task method is called before nr_running is 1351 * The dequeue_task method is called before nr_running is
1345 * decreased. We remove the task from the rbtree and 1352 * decreased. We remove the task from the rbtree and
@@ -1349,14 +1356,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1349{ 1356{
1350 struct cfs_rq *cfs_rq; 1357 struct cfs_rq *cfs_rq;
1351 struct sched_entity *se = &p->se; 1358 struct sched_entity *se = &p->se;
1359 int task_sleep = flags & DEQUEUE_SLEEP;
1352 1360
1353 for_each_sched_entity(se) { 1361 for_each_sched_entity(se) {
1354 cfs_rq = cfs_rq_of(se); 1362 cfs_rq = cfs_rq_of(se);
1355 dequeue_entity(cfs_rq, se, flags); 1363 dequeue_entity(cfs_rq, se, flags);
1356 1364
1357 /* Don't dequeue parent if it has other entities besides us */ 1365 /* Don't dequeue parent if it has other entities besides us */
1358 if (cfs_rq->load.weight) 1366 if (cfs_rq->load.weight) {
1367 /*
1368 * Bias pick_next to pick a task from this cfs_rq, as
1369 * p is sleeping when it is within its sched_slice.
1370 */
1371 if (task_sleep && parent_entity(se))
1372 set_next_buddy(parent_entity(se));
1359 break; 1373 break;
1374 }
1360 flags |= DEQUEUE_SLEEP; 1375 flags |= DEQUEUE_SLEEP;
1361 } 1376 }
1362 1377
@@ -1372,12 +1387,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1372 1387
1373#ifdef CONFIG_SMP 1388#ifdef CONFIG_SMP
1374 1389
1375static void task_waking_fair(struct rq *rq, struct task_struct *p) 1390static void task_waking_fair(struct task_struct *p)
1376{ 1391{
1377 struct sched_entity *se = &p->se; 1392 struct sched_entity *se = &p->se;
1378 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1393 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1394 u64 min_vruntime;
1379 1395
1380 se->vruntime -= cfs_rq->min_vruntime; 1396#ifndef CONFIG_64BIT
1397 u64 min_vruntime_copy;
1398
1399 do {
1400 min_vruntime_copy = cfs_rq->min_vruntime_copy;
1401 smp_rmb();
1402 min_vruntime = cfs_rq->min_vruntime;
1403 } while (min_vruntime != min_vruntime_copy);
1404#else
1405 min_vruntime = cfs_rq->min_vruntime;
1406#endif
1407
1408 se->vruntime -= min_vruntime;
1381} 1409}
1382 1410
1383#ifdef CONFIG_FAIR_GROUP_SCHED 1411#ifdef CONFIG_FAIR_GROUP_SCHED
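The smp_wmb() publish added to update_min_vruntime() and the retry loop added to task_waking_fair() are two halves of the same idea (together with the min_vruntime_copy initialisation in kernel/sched.c above): a u64 store or load can tear on 32-bit machines, and task_waking_fair() now runs without the runqueue lock. A sketch of the pattern with illustrative names (publish()/snapshot() are not kernel functions; the writer is assumed to be serialised by the rq lock):

    static u64 value, value_copy;           /* value_copy trails value */

    static void publish(u64 v)              /* writer, under the owner's lock */
    {
            value = v;
    #ifndef CONFIG_64BIT
            smp_wmb();                      /* order value before value_copy */
            value_copy = v;
    #endif
    }

    static u64 snapshot(void)               /* lockless reader */
    {
    #ifndef CONFIG_64BIT
            u64 copy, v;

            do {
                    copy = value_copy;
                    smp_rmb();              /* pairs with the writer's smp_wmb() */
                    v = value;
            } while (v != copy);            /* retry if we straddled an update */
            return v;
    #else
            return value;                   /* 64-bit loads do not tear */
    #endif
    }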
@@ -1557,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1557 } 1585 }
1558 1586
1559 /* Adjust by relative CPU power of the group */ 1587 /* Adjust by relative CPU power of the group */
1560 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1588 avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
1561 1589
1562 if (local_group) { 1590 if (local_group) {
1563 this_load = avg_load; 1591 this_load = avg_load;
@@ -1622,6 +1650,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1622 /* 1650 /*
1623 * Otherwise, iterate the domains and find an elegible idle cpu. 1651 * Otherwise, iterate the domains and find an elegible idle cpu.
1624 */ 1652 */
1653 rcu_read_lock();
1625 for_each_domain(target, sd) { 1654 for_each_domain(target, sd) {
1626 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 1655 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1627 break; 1656 break;
@@ -1641,6 +1670,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1641 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 1670 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1642 break; 1671 break;
1643 } 1672 }
1673 rcu_read_unlock();
1644 1674
1645 return target; 1675 return target;
1646} 1676}
@@ -1657,7 +1687,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1657 * preempt must be disabled. 1687 * preempt must be disabled.
1658 */ 1688 */
1659static int 1689static int
1660select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1690select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1661{ 1691{
1662 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1692 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1663 int cpu = smp_processor_id(); 1693 int cpu = smp_processor_id();
@@ -1673,6 +1703,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1673 new_cpu = prev_cpu; 1703 new_cpu = prev_cpu;
1674 } 1704 }
1675 1705
1706 rcu_read_lock();
1676 for_each_domain(cpu, tmp) { 1707 for_each_domain(cpu, tmp) {
1677 if (!(tmp->flags & SD_LOAD_BALANCE)) 1708 if (!(tmp->flags & SD_LOAD_BALANCE))
1678 continue; 1709 continue;
@@ -1692,7 +1723,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1692 nr_running += cpu_rq(i)->cfs.nr_running; 1723 nr_running += cpu_rq(i)->cfs.nr_running;
1693 } 1724 }
1694 1725
1695 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 1726 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
1696 1727
1697 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1728 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1698 nr_running /= 2; 1729 nr_running /= 2;
@@ -1723,9 +1754,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1723 1754
1724 if (affine_sd) { 1755 if (affine_sd) {
1725 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1756 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1726 return select_idle_sibling(p, cpu); 1757 prev_cpu = cpu;
1727 else 1758
1728 return select_idle_sibling(p, prev_cpu); 1759 new_cpu = select_idle_sibling(p, prev_cpu);
1760 goto unlock;
1729 } 1761 }
1730 1762
1731 while (sd) { 1763 while (sd) {
@@ -1766,6 +1798,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1766 } 1798 }
1767 /* while loop will break here if sd == NULL */ 1799 /* while loop will break here if sd == NULL */
1768 } 1800 }
1801unlock:
1802 rcu_read_unlock();
1769 1803
1770 return new_cpu; 1804 return new_cpu;
1771} 1805}
@@ -1789,10 +1823,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1789 * This is especially important for buddies when the leftmost 1823 * This is especially important for buddies when the leftmost
1790 * task is higher priority than the buddy. 1824 * task is higher priority than the buddy.
1791 */ 1825 */
1792 if (unlikely(se->load.weight != NICE_0_LOAD)) 1826 return calc_delta_fair(gran, se);
1793 gran = calc_delta_fair(gran, se);
1794
1795 return gran;
1796} 1827}
1797 1828
1798/* 1829/*
@@ -1826,26 +1857,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1826 1857
1827static void set_last_buddy(struct sched_entity *se) 1858static void set_last_buddy(struct sched_entity *se)
1828{ 1859{
1829 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1860 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1830 for_each_sched_entity(se) 1861 return;
1831 cfs_rq_of(se)->last = se; 1862
1832 } 1863 for_each_sched_entity(se)
1864 cfs_rq_of(se)->last = se;
1833} 1865}
1834 1866
1835static void set_next_buddy(struct sched_entity *se) 1867static void set_next_buddy(struct sched_entity *se)
1836{ 1868{
1837 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1869 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1838 for_each_sched_entity(se) 1870 return;
1839 cfs_rq_of(se)->next = se; 1871
1840 } 1872 for_each_sched_entity(se)
1873 cfs_rq_of(se)->next = se;
1841} 1874}
1842 1875
1843static void set_skip_buddy(struct sched_entity *se) 1876static void set_skip_buddy(struct sched_entity *se)
1844{ 1877{
1845 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1878 for_each_sched_entity(se)
1846 for_each_sched_entity(se) 1879 cfs_rq_of(se)->skip = se;
1847 cfs_rq_of(se)->skip = se;
1848 }
1849} 1880}
1850 1881
1851/* 1882/*
@@ -1857,12 +1888,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 struct sched_entity *se = &curr->se, *pse = &p->se; 1888 struct sched_entity *se = &curr->se, *pse = &p->se;
1858 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1889 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1859 int scale = cfs_rq->nr_running >= sched_nr_latency; 1890 int scale = cfs_rq->nr_running >= sched_nr_latency;
1891 int next_buddy_marked = 0;
1860 1892
1861 if (unlikely(se == pse)) 1893 if (unlikely(se == pse))
1862 return; 1894 return;
1863 1895
1864 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) 1896 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1865 set_next_buddy(pse); 1897 set_next_buddy(pse);
1898 next_buddy_marked = 1;
1899 }
1866 1900
1867 /* 1901 /*
1868 * We can come here with TIF_NEED_RESCHED already set from new task 1902 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1890,8 +1924,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1890 update_curr(cfs_rq); 1924 update_curr(cfs_rq);
1891 find_matching_se(&se, &pse); 1925 find_matching_se(&se, &pse);
1892 BUG_ON(!pse); 1926 BUG_ON(!pse);
1893 if (wakeup_preempt_entity(se, pse) == 1) 1927 if (wakeup_preempt_entity(se, pse) == 1) {
1928 /*
1929 * Bias pick_next to pick the sched entity that is
1930 * triggering this preemption.
1931 */
1932 if (!next_buddy_marked)
1933 set_next_buddy(pse);
1894 goto preempt; 1934 goto preempt;
1935 }
1895 1936
1896 return; 1937 return;
1897 1938
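The set_next_buddy() calls added in dequeue_task_fair() and check_preempt_wakeup() only record a hint; the payoff comes at the next pick. A simplified sketch of how pick_next_entity() (not part of these hunks) consumes the hint, honouring it only while that is not too unfair to the leftmost task:

    static struct sched_entity *pick_next_entity_sketch(struct cfs_rq *cfs_rq)
    {
            struct sched_entity *se = __pick_first_entity(cfs_rq); /* leftmost vruntime */

            /* Prefer the "next" buddy unless running it would be too unfair. */
            if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
                    se = cfs_rq->next;

            /* The real function also weighs the last/skip buddies and then
             * clears the hints, so each buddy nomination is one-shot. */
            return se;
    }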
@@ -2102,7 +2143,7 @@ static unsigned long
2102balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2143balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2103 unsigned long max_load_move, struct sched_domain *sd, 2144 unsigned long max_load_move, struct sched_domain *sd,
2104 enum cpu_idle_type idle, int *all_pinned, 2145 enum cpu_idle_type idle, int *all_pinned,
2105 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2146 struct cfs_rq *busiest_cfs_rq)
2106{ 2147{
2107 int loops = 0, pulled = 0; 2148 int loops = 0, pulled = 0;
2108 long rem_load_move = max_load_move; 2149 long rem_load_move = max_load_move;
@@ -2140,9 +2181,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2140 */ 2181 */
2141 if (rem_load_move <= 0) 2182 if (rem_load_move <= 0)
2142 break; 2183 break;
2143
2144 if (p->prio < *this_best_prio)
2145 *this_best_prio = p->prio;
2146 } 2184 }
2147out: 2185out:
2148 /* 2186 /*
@@ -2202,7 +2240,7 @@ static unsigned long
2202load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2240load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2203 unsigned long max_load_move, 2241 unsigned long max_load_move,
2204 struct sched_domain *sd, enum cpu_idle_type idle, 2242 struct sched_domain *sd, enum cpu_idle_type idle,
2205 int *all_pinned, int *this_best_prio) 2243 int *all_pinned)
2206{ 2244{
2207 long rem_load_move = max_load_move; 2245 long rem_load_move = max_load_move;
2208 int busiest_cpu = cpu_of(busiest); 2246 int busiest_cpu = cpu_of(busiest);
@@ -2227,7 +2265,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2227 rem_load = div_u64(rem_load, busiest_h_load + 1); 2265 rem_load = div_u64(rem_load, busiest_h_load + 1);
2228 2266
2229 moved_load = balance_tasks(this_rq, this_cpu, busiest, 2267 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2230 rem_load, sd, idle, all_pinned, this_best_prio, 2268 rem_load, sd, idle, all_pinned,
2231 busiest_cfs_rq); 2269 busiest_cfs_rq);
2232 2270
2233 if (!moved_load) 2271 if (!moved_load)
@@ -2253,11 +2291,11 @@ static unsigned long
2253load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2291load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2254 unsigned long max_load_move, 2292 unsigned long max_load_move,
2255 struct sched_domain *sd, enum cpu_idle_type idle, 2293 struct sched_domain *sd, enum cpu_idle_type idle,
2256 int *all_pinned, int *this_best_prio) 2294 int *all_pinned)
2257{ 2295{
2258 return balance_tasks(this_rq, this_cpu, busiest, 2296 return balance_tasks(this_rq, this_cpu, busiest,
2259 max_load_move, sd, idle, all_pinned, 2297 max_load_move, sd, idle, all_pinned,
2260 this_best_prio, &busiest->cfs); 2298 &busiest->cfs);
2261} 2299}
2262#endif 2300#endif
2263 2301
@@ -2274,12 +2312,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2274 int *all_pinned) 2312 int *all_pinned)
2275{ 2313{
2276 unsigned long total_load_moved = 0, load_moved; 2314 unsigned long total_load_moved = 0, load_moved;
2277 int this_best_prio = this_rq->curr->prio;
2278 2315
2279 do { 2316 do {
2280 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 2317 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2281 max_load_move - total_load_moved, 2318 max_load_move - total_load_moved,
2282 sd, idle, all_pinned, &this_best_prio); 2319 sd, idle, all_pinned);
2283 2320
2284 total_load_moved += load_moved; 2321 total_load_moved += load_moved;
2285 2322
@@ -2534,7 +2571,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2534 2571
2535unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 2572unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2536{ 2573{
2537 return SCHED_LOAD_SCALE; 2574 return SCHED_POWER_SCALE;
2538} 2575}
2539 2576
2540unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 2577unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -2571,10 +2608,10 @@ unsigned long scale_rt_power(int cpu)
2571 available = total - rq->rt_avg; 2608 available = total - rq->rt_avg;
2572 } 2609 }
2573 2610
2574 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2611 if (unlikely((s64)total < SCHED_POWER_SCALE))
2575 total = SCHED_LOAD_SCALE; 2612 total = SCHED_POWER_SCALE;
2576 2613
2577 total >>= SCHED_LOAD_SHIFT; 2614 total >>= SCHED_POWER_SHIFT;
2578 2615
2579 return div_u64(available, total); 2616 return div_u64(available, total);
2580} 2617}
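The SCHED_LOAD_SCALE -> SCHED_POWER_SCALE switch in these hunks gives CPU power its own fixed-point unit instead of borrowing the load-weight one, so the two resolutions can diverge later. Both are still 1024 at this point; a rough worked example of the capacity rounding used above (numbers are illustrative): with the default smt_gain of 1178 split across two hardware threads, each sibling ends up with a cpu_power of roughly 589, and

    unsigned long capacity = DIV_ROUND_CLOSEST(589, SCHED_POWER_SCALE);  /* == 1 */

still yields one task's worth of capacity; fix_small_capacity() only has to step in when the division rounds all the way down to zero.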
@@ -2582,7 +2619,7 @@ unsigned long scale_rt_power(int cpu)
2582static void update_cpu_power(struct sched_domain *sd, int cpu) 2619static void update_cpu_power(struct sched_domain *sd, int cpu)
2583{ 2620{
2584 unsigned long weight = sd->span_weight; 2621 unsigned long weight = sd->span_weight;
2585 unsigned long power = SCHED_LOAD_SCALE; 2622 unsigned long power = SCHED_POWER_SCALE;
2586 struct sched_group *sdg = sd->groups; 2623 struct sched_group *sdg = sd->groups;
2587 2624
2588 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2625 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -2591,7 +2628,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2591 else 2628 else
2592 power *= default_scale_smt_power(sd, cpu); 2629 power *= default_scale_smt_power(sd, cpu);
2593 2630
2594 power >>= SCHED_LOAD_SHIFT; 2631 power >>= SCHED_POWER_SHIFT;
2595 } 2632 }
2596 2633
2597 sdg->cpu_power_orig = power; 2634 sdg->cpu_power_orig = power;
@@ -2601,10 +2638,10 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2601 else 2638 else
2602 power *= default_scale_freq_power(sd, cpu); 2639 power *= default_scale_freq_power(sd, cpu);
2603 2640
2604 power >>= SCHED_LOAD_SHIFT; 2641 power >>= SCHED_POWER_SHIFT;
2605 2642
2606 power *= scale_rt_power(cpu); 2643 power *= scale_rt_power(cpu);
2607 power >>= SCHED_LOAD_SHIFT; 2644 power >>= SCHED_POWER_SHIFT;
2608 2645
2609 if (!power) 2646 if (!power)
2610 power = 1; 2647 power = 1;
@@ -2646,9 +2683,9 @@ static inline int
2646fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 2683fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2647{ 2684{
2648 /* 2685 /*
2649 * Only siblings can have significantly less than SCHED_LOAD_SCALE 2686 * Only siblings can have significantly less than SCHED_POWER_SCALE
2650 */ 2687 */
2651 if (sd->level != SD_LV_SIBLING) 2688 if (!(sd->flags & SD_SHARE_CPUPOWER))
2652 return 0; 2689 return 0;
2653 2690
2654 /* 2691 /*
@@ -2734,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2734 } 2771 }
2735 2772
2736 /* Adjust by relative CPU power of the group */ 2773 /* Adjust by relative CPU power of the group */
2737 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2774 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
2738 2775
2739 /* 2776 /*
2740 * Consider the group unbalanced when the imbalance is larger 2777 * Consider the group unbalanced when the imbalance is larger
@@ -2751,7 +2788,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2751 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 2788 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2752 sgs->group_imb = 1; 2789 sgs->group_imb = 1;
2753 2790
2754 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2791 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
2792 SCHED_POWER_SCALE);
2755 if (!sgs->group_capacity) 2793 if (!sgs->group_capacity)
2756 sgs->group_capacity = fix_small_capacity(sd, group); 2794 sgs->group_capacity = fix_small_capacity(sd, group);
2757 sgs->group_weight = group->group_weight; 2795 sgs->group_weight = group->group_weight;
@@ -2925,7 +2963,7 @@ static int check_asym_packing(struct sched_domain *sd,
2925 return 0; 2963 return 0;
2926 2964
2927 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, 2965 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2928 SCHED_LOAD_SCALE); 2966 SCHED_POWER_SCALE);
2929 return 1; 2967 return 1;
2930} 2968}
2931 2969
@@ -2954,7 +2992,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2954 cpu_avg_load_per_task(this_cpu); 2992 cpu_avg_load_per_task(this_cpu);
2955 2993
2956 scaled_busy_load_per_task = sds->busiest_load_per_task 2994 scaled_busy_load_per_task = sds->busiest_load_per_task
2957 * SCHED_LOAD_SCALE; 2995 * SCHED_POWER_SCALE;
2958 scaled_busy_load_per_task /= sds->busiest->cpu_power; 2996 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2959 2997
2960 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 2998 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
@@ -2973,10 +3011,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2973 min(sds->busiest_load_per_task, sds->max_load); 3011 min(sds->busiest_load_per_task, sds->max_load);
2974 pwr_now += sds->this->cpu_power * 3012 pwr_now += sds->this->cpu_power *
2975 min(sds->this_load_per_task, sds->this_load); 3013 min(sds->this_load_per_task, sds->this_load);
2976 pwr_now /= SCHED_LOAD_SCALE; 3014 pwr_now /= SCHED_POWER_SCALE;
2977 3015
2978 /* Amount of load we'd subtract */ 3016 /* Amount of load we'd subtract */
2979 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3017 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2980 sds->busiest->cpu_power; 3018 sds->busiest->cpu_power;
2981 if (sds->max_load > tmp) 3019 if (sds->max_load > tmp)
2982 pwr_move += sds->busiest->cpu_power * 3020 pwr_move += sds->busiest->cpu_power *
@@ -2984,15 +3022,15 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2984 3022
2985 /* Amount of load we'd add */ 3023 /* Amount of load we'd add */
2986 if (sds->max_load * sds->busiest->cpu_power < 3024 if (sds->max_load * sds->busiest->cpu_power <
2987 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3025 sds->busiest_load_per_task * SCHED_POWER_SCALE)
2988 tmp = (sds->max_load * sds->busiest->cpu_power) / 3026 tmp = (sds->max_load * sds->busiest->cpu_power) /
2989 sds->this->cpu_power; 3027 sds->this->cpu_power;
2990 else 3028 else
2991 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3029 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2992 sds->this->cpu_power; 3030 sds->this->cpu_power;
2993 pwr_move += sds->this->cpu_power * 3031 pwr_move += sds->this->cpu_power *
2994 min(sds->this_load_per_task, sds->this_load + tmp); 3032 min(sds->this_load_per_task, sds->this_load + tmp);
2995 pwr_move /= SCHED_LOAD_SCALE; 3033 pwr_move /= SCHED_POWER_SCALE;
2996 3034
2997 /* Move if we gain throughput */ 3035 /* Move if we gain throughput */
2998 if (pwr_move > pwr_now) 3036 if (pwr_move > pwr_now)
@@ -3034,7 +3072,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3034 load_above_capacity = (sds->busiest_nr_running - 3072 load_above_capacity = (sds->busiest_nr_running -
3035 sds->busiest_group_capacity); 3073 sds->busiest_group_capacity);
3036 3074
3037 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); 3075 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
3038 3076
3039 load_above_capacity /= sds->busiest->cpu_power; 3077 load_above_capacity /= sds->busiest->cpu_power;
3040 } 3078 }
@@ -3054,7 +3092,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3054 /* How much load to actually move to equalise the imbalance */ 3092 /* How much load to actually move to equalise the imbalance */
3055 *imbalance = min(max_pull * sds->busiest->cpu_power, 3093 *imbalance = min(max_pull * sds->busiest->cpu_power,
3056 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3094 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3057 / SCHED_LOAD_SCALE; 3095 / SCHED_POWER_SCALE;
3058 3096
3059 /* 3097 /*
3060 * if *imbalance is less than the average load per runnable task 3098 * if *imbalance is less than the average load per runnable task
@@ -3123,7 +3161,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3123 if (!sds.busiest || sds.busiest_nr_running == 0) 3161 if (!sds.busiest || sds.busiest_nr_running == 0)
3124 goto out_balanced; 3162 goto out_balanced;
3125 3163
3126 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3164 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
3127 3165
3128 /* 3166 /*
3129 * If the busiest group is imbalanced the below checks don't 3167 * If the busiest group is imbalanced the below checks don't
@@ -3202,7 +3240,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3202 3240
3203 for_each_cpu(i, sched_group_cpus(group)) { 3241 for_each_cpu(i, sched_group_cpus(group)) {
3204 unsigned long power = power_of(i); 3242 unsigned long power = power_of(i);
3205 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 3243 unsigned long capacity = DIV_ROUND_CLOSEST(power,
3244 SCHED_POWER_SCALE);
3206 unsigned long wl; 3245 unsigned long wl;
3207 3246
3208 if (!capacity) 3247 if (!capacity)
@@ -3227,7 +3266,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3227 * the load can be moved away from the cpu that is potentially 3266 * the load can be moved away from the cpu that is potentially
3228 * running at a lower capacity. 3267 * running at a lower capacity.
3229 */ 3268 */
3230 wl = (wl * SCHED_LOAD_SCALE) / power; 3269 wl = (wl * SCHED_POWER_SCALE) / power;
3231 3270
3232 if (wl > max_load) { 3271 if (wl > max_load) {
3233 max_load = wl; 3272 max_load = wl;
@@ -3465,6 +3504,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3465 raw_spin_unlock(&this_rq->lock); 3504 raw_spin_unlock(&this_rq->lock);
3466 3505
3467 update_shares(this_cpu); 3506 update_shares(this_cpu);
3507 rcu_read_lock();
3468 for_each_domain(this_cpu, sd) { 3508 for_each_domain(this_cpu, sd) {
3469 unsigned long interval; 3509 unsigned long interval;
3470 int balance = 1; 3510 int balance = 1;
@@ -3486,6 +3526,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3486 break; 3526 break;
3487 } 3527 }
3488 } 3528 }
3529 rcu_read_unlock();
3489 3530
3490 raw_spin_lock(&this_rq->lock); 3531 raw_spin_lock(&this_rq->lock);
3491 3532
@@ -3534,6 +3575,7 @@ static int active_load_balance_cpu_stop(void *data)
3534 double_lock_balance(busiest_rq, target_rq); 3575 double_lock_balance(busiest_rq, target_rq);
3535 3576
3536 /* Search for an sd spanning us and the target CPU. */ 3577 /* Search for an sd spanning us and the target CPU. */
3578 rcu_read_lock();
3537 for_each_domain(target_cpu, sd) { 3579 for_each_domain(target_cpu, sd) {
3538 if ((sd->flags & SD_LOAD_BALANCE) && 3580 if ((sd->flags & SD_LOAD_BALANCE) &&
3539 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 3581 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3549,6 +3591,7 @@ static int active_load_balance_cpu_stop(void *data)
3549 else 3591 else
3550 schedstat_inc(sd, alb_failed); 3592 schedstat_inc(sd, alb_failed);
3551 } 3593 }
3594 rcu_read_unlock();
3552 double_unlock_balance(busiest_rq, target_rq); 3595 double_unlock_balance(busiest_rq, target_rq);
3553out_unlock: 3596out_unlock:
3554 busiest_rq->active_balance = 0; 3597 busiest_rq->active_balance = 0;
@@ -3675,6 +3718,7 @@ static int find_new_ilb(int cpu)
3675{ 3718{
3676 struct sched_domain *sd; 3719 struct sched_domain *sd;
3677 struct sched_group *ilb_group; 3720 struct sched_group *ilb_group;
3721 int ilb = nr_cpu_ids;
3678 3722
3679 /* 3723 /*
3680 * Have idle load balancer selection from semi-idle packages only 3724 * Have idle load balancer selection from semi-idle packages only
@@ -3690,20 +3734,25 @@ static int find_new_ilb(int cpu)
3690 if (cpumask_weight(nohz.idle_cpus_mask) < 2) 3734 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3691 goto out_done; 3735 goto out_done;
3692 3736
3737 rcu_read_lock();
3693 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3738 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3694 ilb_group = sd->groups; 3739 ilb_group = sd->groups;
3695 3740
3696 do { 3741 do {
3697 if (is_semi_idle_group(ilb_group)) 3742 if (is_semi_idle_group(ilb_group)) {
3698 return cpumask_first(nohz.grp_idle_mask); 3743 ilb = cpumask_first(nohz.grp_idle_mask);
3744 goto unlock;
3745 }
3699 3746
3700 ilb_group = ilb_group->next; 3747 ilb_group = ilb_group->next;
3701 3748
3702 } while (ilb_group != sd->groups); 3749 } while (ilb_group != sd->groups);
3703 } 3750 }
3751unlock:
3752 rcu_read_unlock();
3704 3753
3705out_done: 3754out_done:
3706 return nr_cpu_ids; 3755 return ilb;
3707} 3756}
3708#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3757#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3709static inline int find_new_ilb(int call_cpu) 3758static inline int find_new_ilb(int call_cpu)
@@ -3848,6 +3897,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3848 3897
3849 update_shares(cpu); 3898 update_shares(cpu);
3850 3899
3900 rcu_read_lock();
3851 for_each_domain(cpu, sd) { 3901 for_each_domain(cpu, sd) {
3852 if (!(sd->flags & SD_LOAD_BALANCE)) 3902 if (!(sd->flags & SD_LOAD_BALANCE))
3853 continue; 3903 continue;
@@ -3893,6 +3943,7 @@ out:
3893 if (!balance) 3943 if (!balance)
3894 break; 3944 break;
3895 } 3945 }
3946 rcu_read_unlock();
3896 3947
3897 /* 3948 /*
3898 * next_balance will be updated only when there is a need. 3949 * next_balance will be updated only when there is a need.
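A recurring pattern in this file's hunks: every for_each_domain() walk is now bracketed by rcu_read_lock()/rcu_read_unlock(), because the sched-domain tree is torn down via RCU in this series rather than being pinned by preempt_disable(); the same conversion shows up in sched_rt.c, sched_stats.h and sched_stoptask.c below. Early returns have to become a jump to the unlock, as in select_task_rq_fair() and find_new_ilb() above. The canonical shape, as a sketch rather than any one function from the patch (want_this_domain()/use() are hypothetical):

    static void walk_domains(int cpu)
    {
            struct sched_domain *sd;

            rcu_read_lock();
            for_each_domain(cpu, sd) {
                    if (!(sd->flags & SD_LOAD_BALANCE))
                            continue;
                    if (want_this_domain(sd)) {     /* hypothetical predicate */
                            use(sd);                /* hypothetical; must finish
                                                     * before the unlock below */
                            break;
                    }
            }
            rcu_read_unlock();
    }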
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69acc29b9..be40f7371ee1 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1)
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on irq activity
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONIRQ_POWER, 1)
67
68/*
69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */
72SCHED_FEAT(TTWU_QUEUE, 1)
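TTWU_QUEUE refers to the try_to_wake_up() rework earlier in this series (in kernel/sched.c, outside the hunks shown here). Conceptually, instead of the waking CPU taking the remote runqueue lock, it parks the task on the target CPU's pending-wakeup list and kicks it with the scheduler IPI, which drains the list under the local rq->lock. A deliberately simplified sketch; push_wakeup() is a placeholder, not a real API:

    static void ttwu_queue_remote_sketch(struct task_struct *p, int cpu)
    {
            struct rq *rq = cpu_rq(cpu);

            /* lockless push onto the target's pending-wakeup list */
            if (push_wakeup(&rq->wake_list, p))     /* placeholder helper */
                    smp_send_reschedule(cpu);       /* scheduler IPI */
    }

    /* On the target, the IPI handler pops the list and activates each task
     * with its own rq->lock held, so rq->lock never bounces to the waker. */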
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a776a6396427..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
11{ 11{
12 return task_cpu(p); /* IDLE tasks are never migrated */ 12 return task_cpu(p); /* IDLE tasks are never migrated */
13} 13}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e7cebdc65f82..10d018212bab 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186typedef struct task_group *rt_rq_iter_t;
187
188#define for_each_rt_rq(rt_rq, iter, rq) \
189 for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
190 (&iter->list != &task_groups) && \
191 (rt_rq = iter->rt_rq[cpu_of(rq)]); \
192 iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
193
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 194static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{ 195{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list, 196 list_add_rcu(&rt_rq->leaf_rt_rq_list,
@@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
288 return ktime_to_ns(def_rt_bandwidth.rt_period); 296 return ktime_to_ns(def_rt_bandwidth.rt_period);
289} 297}
290 298
299typedef struct rt_rq *rt_rq_iter_t;
300
301#define for_each_rt_rq(rt_rq, iter, rq) \
302 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
303
291static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 304static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
292{ 305{
293} 306}
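The two for_each_rt_rq() definitions above give the runtime-balancing code one iterator for both configurations: with RT group scheduling it walks the task_groups list and picks each group's rt_rq for this runqueue; without it, it degenerates to visiting rq->rt once. Unlike for_each_leaf_rt_rq(), it also reaches rt_rqs that currently have no runnable tasks, which __disable_runtime()/__enable_runtime() below rely on. Usage sketch (touch() is hypothetical):

    rt_rq_iter_t iter;              /* type differs per configuration */
    struct rt_rq *rt_rq;

    for_each_rt_rq(rt_rq, iter, rq) {
            /* visits every rt_rq attached to @rq, including throttled or
             * empty ones that are absent from the leaf list */
            touch(rt_rq);
    }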
@@ -402,12 +415,13 @@ next:
402static void __disable_runtime(struct rq *rq) 415static void __disable_runtime(struct rq *rq)
403{ 416{
404 struct root_domain *rd = rq->rd; 417 struct root_domain *rd = rq->rd;
418 rt_rq_iter_t iter;
405 struct rt_rq *rt_rq; 419 struct rt_rq *rt_rq;
406 420
407 if (unlikely(!scheduler_running)) 421 if (unlikely(!scheduler_running))
408 return; 422 return;
409 423
410 for_each_leaf_rt_rq(rt_rq, rq) { 424 for_each_rt_rq(rt_rq, iter, rq) {
411 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 425 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
412 s64 want; 426 s64 want;
413 int i; 427 int i;
@@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq)
487 501
488static void __enable_runtime(struct rq *rq) 502static void __enable_runtime(struct rq *rq)
489{ 503{
504 rt_rq_iter_t iter;
490 struct rt_rq *rt_rq; 505 struct rt_rq *rt_rq;
491 506
492 if (unlikely(!scheduler_running)) 507 if (unlikely(!scheduler_running))
@@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq)
495 /* 510 /*
496 * Reset each runqueue's bandwidth settings 511 * Reset each runqueue's bandwidth settings
497 */ 512 */
498 for_each_leaf_rt_rq(rt_rq, rq) { 513 for_each_rt_rq(rt_rq, iter, rq) {
499 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 514 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
500 515
501 raw_spin_lock(&rt_b->rt_runtime_lock); 516 raw_spin_lock(&rt_b->rt_runtime_lock);
@@ -562,6 +577,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
562 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 577 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
563 rt_rq->rt_throttled = 0; 578 rt_rq->rt_throttled = 0;
564 enqueue = 1; 579 enqueue = 1;
580
581 /*
582 * Force a clock update if the CPU was idle,
583 * lest wakeup -> unthrottle time accumulate.
584 */
585 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
586 rq->skip_clock_update = -1;
565 } 587 }
566 if (rt_rq->rt_time || rt_rq->rt_nr_running) 588 if (rt_rq->rt_time || rt_rq->rt_nr_running)
567 idle = 0; 589 idle = 0;
@@ -977,13 +999,23 @@ static void yield_task_rt(struct rq *rq)
977static int find_lowest_rq(struct task_struct *task); 999static int find_lowest_rq(struct task_struct *task);
978 1000
979static int 1001static int
980select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 1002select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
981{ 1003{
1004 struct task_struct *curr;
1005 struct rq *rq;
1006 int cpu;
1007
982 if (sd_flag != SD_BALANCE_WAKE) 1008 if (sd_flag != SD_BALANCE_WAKE)
983 return smp_processor_id(); 1009 return smp_processor_id();
984 1010
1011 cpu = task_cpu(p);
1012 rq = cpu_rq(cpu);
1013
1014 rcu_read_lock();
1015 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1016
985 /* 1017 /*
986 * If the current task is an RT task, then 1018 * If the current task on @p's runqueue is an RT task, then
987 * try to see if we can wake this RT task up on another 1019 * try to see if we can wake this RT task up on another
988 * runqueue. Otherwise simply start this RT task 1020 * runqueue. Otherwise simply start this RT task
989 * on its current runqueue. 1021 * on its current runqueue.
@@ -997,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
997 * lock? 1029 * lock?
998 * 1030 *
999 * For equal prio tasks, we just let the scheduler sort it out. 1031 * For equal prio tasks, we just let the scheduler sort it out.
1032 *
1033 * Otherwise, just let it ride on the affined RQ and the
1034 * post-schedule router will push the preempted task away
1035 *
1036 * This test is optimistic, if we get it wrong the load-balancer
1037 * will have to sort it out.
1000 */ 1038 */
1001 if (unlikely(rt_task(rq->curr)) && 1039 if (curr && unlikely(rt_task(curr)) &&
1002 (rq->curr->rt.nr_cpus_allowed < 2 || 1040 (curr->rt.nr_cpus_allowed < 2 ||
1003 rq->curr->prio < p->prio) && 1041 curr->prio < p->prio) &&
1004 (p->rt.nr_cpus_allowed > 1)) { 1042 (p->rt.nr_cpus_allowed > 1)) {
1005 int cpu = find_lowest_rq(p); 1043 int target = find_lowest_rq(p);
1006 1044
1007 return (cpu == -1) ? task_cpu(p) : cpu; 1045 if (target != -1)
1046 cpu = target;
1008 } 1047 }
1048 rcu_read_unlock();
1009 1049
1010 /* 1050 return cpu;
1011 * Otherwise, just let it ride on the affined RQ and the
1012 * post-schedule router will push the preempted task away
1013 */
1014 return task_cpu(p);
1015} 1051}
1016 1052
1017static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1053static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
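select_task_rq_rt() no longer receives a locked runqueue, so it peeks at the target's current task with a single unlocked ACCESS_ONCE() read under rcu_read_lock() and treats the answer as a hint; if the snapshot is stale, the push/pull machinery repairs the placement afterwards. The pattern in isolation, as a sketch (prefer_other_cpu is an illustrative variable, not kernel code):

    bool prefer_other_cpu = false;
    struct task_struct *curr;

    rcu_read_lock();
    curr = ACCESS_ONCE(cpu_rq(cpu)->curr);  /* unlocked read, may be stale */
    if (curr && rt_task(curr) && curr->prio < p->prio)
            prefer_other_cpu = true;        /* only a hint, never relied upon */
    rcu_read_unlock();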
@@ -1060,7 +1096,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
1060 * to move current somewhere else, making room for our non-migratable 1096 * to move current somewhere else, making room for our non-migratable
1061 * task. 1097 * task.
1062 */ 1098 */
1063 if (p->prio == rq->curr->prio && !need_resched()) 1099 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1064 check_preempt_equal_prio(rq, p); 1100 check_preempt_equal_prio(rq, p);
1065#endif 1101#endif
1066} 1102}
@@ -1136,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1136 * The previous task needs to be made eligible for pushing 1172 * The previous task needs to be made eligible for pushing
1137 * if it is still active 1173 * if it is still active
1138 */ 1174 */
1139 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1175 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1140 enqueue_pushable_task(rq, p); 1176 enqueue_pushable_task(rq, p);
1141} 1177}
1142 1178
@@ -1203,6 +1239,10 @@ static int find_lowest_rq(struct task_struct *task)
1203 int this_cpu = smp_processor_id(); 1239 int this_cpu = smp_processor_id();
1204 int cpu = task_cpu(task); 1240 int cpu = task_cpu(task);
1205 1241
1242 /* Make sure the mask is initialized first */
1243 if (unlikely(!lowest_mask))
1244 return -1;
1245
1206 if (task->rt.nr_cpus_allowed == 1) 1246 if (task->rt.nr_cpus_allowed == 1)
1207 return -1; /* No other targets possible */ 1247 return -1; /* No other targets possible */
1208 1248
@@ -1227,6 +1267,7 @@ static int find_lowest_rq(struct task_struct *task)
1227 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1267 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1228 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1268 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1229 1269
1270 rcu_read_lock();
1230 for_each_domain(cpu, sd) { 1271 for_each_domain(cpu, sd) {
1231 if (sd->flags & SD_WAKE_AFFINE) { 1272 if (sd->flags & SD_WAKE_AFFINE) {
1232 int best_cpu; 1273 int best_cpu;
@@ -1236,15 +1277,20 @@ static int find_lowest_rq(struct task_struct *task)
1236 * remote processor. 1277 * remote processor.
1237 */ 1278 */
1238 if (this_cpu != -1 && 1279 if (this_cpu != -1 &&
1239 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) 1280 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1281 rcu_read_unlock();
1240 return this_cpu; 1282 return this_cpu;
1283 }
1241 1284
1242 best_cpu = cpumask_first_and(lowest_mask, 1285 best_cpu = cpumask_first_and(lowest_mask,
1243 sched_domain_span(sd)); 1286 sched_domain_span(sd));
1244 if (best_cpu < nr_cpu_ids) 1287 if (best_cpu < nr_cpu_ids) {
1288 rcu_read_unlock();
1245 return best_cpu; 1289 return best_cpu;
1290 }
1246 } 1291 }
1247 } 1292 }
1293 rcu_read_unlock();
1248 1294
1249 /* 1295 /*
1250 * And finally, if there were no matches within the domains 1296 * And finally, if there were no matches within the domains
@@ -1287,7 +1333,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1287 !cpumask_test_cpu(lowest_rq->cpu, 1333 !cpumask_test_cpu(lowest_rq->cpu,
1288 &task->cpus_allowed) || 1334 &task->cpus_allowed) ||
1289 task_running(rq, task) || 1335 task_running(rq, task) ||
1290 !task->se.on_rq)) { 1336 !task->on_rq)) {
1291 1337
1292 raw_spin_unlock(&lowest_rq->lock); 1338 raw_spin_unlock(&lowest_rq->lock);
1293 lowest_rq = NULL; 1339 lowest_rq = NULL;
@@ -1321,7 +1367,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1321 BUG_ON(task_current(rq, p)); 1367 BUG_ON(task_current(rq, p));
1322 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1368 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1323 1369
1324 BUG_ON(!p->se.on_rq); 1370 BUG_ON(!p->on_rq);
1325 BUG_ON(!rt_task(p)); 1371 BUG_ON(!rt_task(p));
1326 1372
1327 return p; 1373 return p;
@@ -1467,7 +1513,7 @@ static int pull_rt_task(struct rq *this_rq)
1467 */ 1513 */
1468 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1514 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1469 WARN_ON(p == src_rq->curr); 1515 WARN_ON(p == src_rq->curr);
1470 WARN_ON(!p->se.on_rq); 1516 WARN_ON(!p->on_rq);
1471 1517
1472 /* 1518 /*
1473 * There's a chance that p is higher in priority 1519 * There's a chance that p is higher in priority
@@ -1538,7 +1584,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1538 * Update the migration status of the RQ if we have an RT task 1584 * Update the migration status of the RQ if we have an RT task
1539 * which is running AND changing its weight value. 1585 * which is running AND changing its weight value.
1540 */ 1586 */
1541 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1587 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1542 struct rq *rq = task_rq(p); 1588 struct rq *rq = task_rq(p);
1543 1589
1544 if (!task_current(rq, p)) { 1590 if (!task_current(rq, p)) {
@@ -1608,7 +1654,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1608 * we may need to handle the pulling of RT tasks 1654 * we may need to handle the pulling of RT tasks
1609 * now. 1655 * now.
1610 */ 1656 */
1611 if (p->se.on_rq && !rq->rt.rt_nr_running) 1657 if (p->on_rq && !rq->rt.rt_nr_running)
1612 pull_rt_task(rq); 1658 pull_rt_task(rq);
1613} 1659}
1614 1660
@@ -1638,7 +1684,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1638 * If that current running task is also an RT task 1684 * If that current running task is also an RT task
1639 * then see if we can move to another run queue. 1685 * then see if we can move to another run queue.
1640 */ 1686 */
1641 if (p->se.on_rq && rq->curr != p) { 1687 if (p->on_rq && rq->curr != p) {
1642#ifdef CONFIG_SMP 1688#ifdef CONFIG_SMP
1643 if (rq->rt.overloaded && push_rt_task(rq) && 1689 if (rq->rt.overloaded && push_rt_task(rq) &&
1644 /* Don't resched if we changed runqueues */ 1690 /* Don't resched if we changed runqueues */
@@ -1657,7 +1703,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1657static void 1703static void
1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1704prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1659{ 1705{
1660 if (!p->se.on_rq) 1706 if (!p->on_rq)
1661 return; 1707 return;
1662 1708
1663 if (rq->curr == p) { 1709 if (rq->curr == p) {
@@ -1796,10 +1842,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1796 1842
1797static void print_rt_stats(struct seq_file *m, int cpu) 1843static void print_rt_stats(struct seq_file *m, int cpu)
1798{ 1844{
1845 rt_rq_iter_t iter;
1799 struct rt_rq *rt_rq; 1846 struct rt_rq *rt_rq;
1800 1847
1801 rcu_read_lock(); 1848 rcu_read_lock();
1802 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) 1849 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1803 print_rt_rq(m, cpu, rt_rq); 1850 print_rt_rq(m, cpu, rt_rq);
1804 rcu_read_unlock(); 1851 rcu_read_unlock();
1805} 1852}
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 48ddf431db0e..331e01bcd026 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
37 37
38#ifdef CONFIG_SMP 38#ifdef CONFIG_SMP
39 /* domain-specific stats */ 39 /* domain-specific stats */
40 preempt_disable(); 40 rcu_read_lock();
41 for_each_domain(cpu, sd) { 41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype; 42 enum cpu_idle_type itype;
43 43
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
64 sd->ttwu_wake_remote, sd->ttwu_move_affine, 64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance); 65 sd->ttwu_move_balance);
66 } 66 }
67 preempt_enable(); 67 rcu_read_unlock();
68#endif 68#endif
69 } 69 }
70 kfree(mask_str); 70 kfree(mask_str);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 1ba2bd40fdac..6f437632afab 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -9,8 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p, 12select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
13 int sd_flag, int flags)
14{ 13{
15 return task_cpu(p); /* stop tasks never migrate */ 14 return task_cpu(p); /* stop tasks never migrate */
16} 15}
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 25{
27 struct task_struct *stop = rq->stop; 26 struct task_struct *stop = rq->stop;
28 27
29 if (stop && stop->se.on_rq) 28 if (stop && stop->on_rq)
30 return stop; 29 return stop;
31 30
32 return NULL; 31 return NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index 7165af5f1b11..ff7678603328 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
124 124
125static int recalc_sigpending_tsk(struct task_struct *t) 125static int recalc_sigpending_tsk(struct task_struct *t)
126{ 126{
127 if (t->signal->group_stop_count > 0 || 127 if ((t->group_stop & GROUP_STOP_PENDING) ||
128 PENDING(&t->pending, &t->blocked) || 128 PENDING(&t->pending, &t->blocked) ||
129 PENDING(&t->signal->shared_pending, &t->blocked)) { 129 PENDING(&t->signal->shared_pending, &t->blocked)) {
130 set_tsk_thread_flag(t, TIF_SIGPENDING); 130 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -223,6 +223,83 @@ static inline void print_dropped_signal(int sig)
223 current->comm, current->pid, sig); 223 current->comm, current->pid, sig);
224} 224}
225 225
226/**
227 * task_clear_group_stop_trapping - clear group stop trapping bit
228 * @task: target task
229 *
230 * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it
231 * and wake up the ptracer. Note that we don't need any further locking.
232 * @task->siglock guarantees that @task->parent points to the ptracer.
233 *
234 * CONTEXT:
235 * Must be called with @task->sighand->siglock held.
236 */
237static void task_clear_group_stop_trapping(struct task_struct *task)
238{
239 if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) {
240 task->group_stop &= ~GROUP_STOP_TRAPPING;
241 __wake_up_sync_key(&task->parent->signal->wait_chldexit,
242 TASK_UNINTERRUPTIBLE, 1, task);
243 }
244}
245
246/**
247 * task_clear_group_stop_pending - clear pending group stop
248 * @task: target task
249 *
250 * Clear group stop states for @task.
251 *
252 * CONTEXT:
253 * Must be called with @task->sighand->siglock held.
254 */
255void task_clear_group_stop_pending(struct task_struct *task)
256{
257 task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME |
258 GROUP_STOP_DEQUEUED);
259}
260
261/**
262 * task_participate_group_stop - participate in a group stop
263 * @task: task participating in a group stop
264 *
265 * @task has GROUP_STOP_PENDING set and is participating in a group stop.
266 * Group stop states are cleared and the group stop count is consumed if
267 * %GROUP_STOP_CONSUME was set. If the consumption completes the group
268 * stop, the appropriate %SIGNAL_* flags are set.
269 *
270 * CONTEXT:
271 * Must be called with @task->sighand->siglock held.
272 *
273 * RETURNS:
274 * %true if group stop completion should be notified to the parent, %false
275 * otherwise.
276 */
277static bool task_participate_group_stop(struct task_struct *task)
278{
279 struct signal_struct *sig = task->signal;
280 bool consume = task->group_stop & GROUP_STOP_CONSUME;
281
282 WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING));
283
284 task_clear_group_stop_pending(task);
285
286 if (!consume)
287 return false;
288
289 if (!WARN_ON_ONCE(sig->group_stop_count == 0))
290 sig->group_stop_count--;
291
292 /*
293 * Tell the caller to notify completion iff we are entering into a
294 * fresh group stop. Read comment in do_signal_stop() for details.
295 */
296 if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
297 sig->flags = SIGNAL_STOP_STOPPED;
298 return true;
299 }
300 return false;
301}
302
226/* 303/*
227 * allocate a new signal queue record 304 * allocate a new signal queue record
228 * - this may be called without locks if and only if t == current, otherwise an 305 * - this may be called without locks if and only if t == current, otherwise an
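The helpers added above manipulate per-task group-stop bookkeeping bits that this series introduces in task_struct->group_stop. For orientation, an illustrative sketch of the flag layout assumed by this code (the authoritative definitions are added to include/linux/sched.h by the same series and may differ in detail):

    #define GROUP_STOP_SIGMASK      0xffff          /* signr of the last group stop */
    #define GROUP_STOP_PENDING      (1 << 16)       /* task should participate in group stop */
    #define GROUP_STOP_CONSUME      (1 << 17)       /* consume group stop count when stopping */
    #define GROUP_STOP_TRAPPING     (1 << 18)       /* switching into TASK_TRACED, ptracer waits */
    #define GROUP_STOP_DEQUEUED     (1 << 19)       /* stop signal dequeued, stop not yet finished */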
@@ -527,7 +604,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
527 * is to alert stop-signal processing code when another 604 * is to alert stop-signal processing code when another
528 * processor has come along and cleared the flag. 605 * processor has come along and cleared the flag.
529 */ 606 */
530 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 607 current->group_stop |= GROUP_STOP_DEQUEUED;
531 } 608 }
532 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 609 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
533 /* 610 /*
@@ -592,7 +669,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
592 if (sigisemptyset(&m)) 669 if (sigisemptyset(&m))
593 return 0; 670 return 0;
594 671
595 signandsets(&s->signal, &s->signal, mask); 672 sigandnsets(&s->signal, &s->signal, mask);
596 list_for_each_entry_safe(q, n, &s->list, list) { 673 list_for_each_entry_safe(q, n, &s->list, list) {
597 if (sigismember(mask, q->info.si_signo)) { 674 if (sigismember(mask, q->info.si_signo)) {
598 list_del_init(&q->list); 675 list_del_init(&q->list);
@@ -727,34 +804,14 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
727 } else if (sig == SIGCONT) { 804 } else if (sig == SIGCONT) {
728 unsigned int why; 805 unsigned int why;
729 /* 806 /*
730 * Remove all stop signals from all queues, 807 * Remove all stop signals from all queues, wake all threads.
731 * and wake all threads.
732 */ 808 */
733 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 809 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
734 t = p; 810 t = p;
735 do { 811 do {
736 unsigned int state; 812 task_clear_group_stop_pending(t);
737 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 813 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
738 /* 814 wake_up_state(t, __TASK_STOPPED);
739 * If there is a handler for SIGCONT, we must make
740 * sure that no thread returns to user mode before
741 * we post the signal, in case it was the only
742 * thread eligible to run the signal handler--then
743 * it must not do anything between resuming and
744 * running the handler. With the TIF_SIGPENDING
745 * flag set, the thread will pause and acquire the
746 * siglock that we hold now and until we've queued
747 * the pending signal.
748 *
749 * Wake up the stopped thread _after_ setting
750 * TIF_SIGPENDING
751 */
752 state = __TASK_STOPPED;
753 if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
754 set_tsk_thread_flag(t, TIF_SIGPENDING);
755 state |= TASK_INTERRUPTIBLE;
756 }
757 wake_up_state(t, state);
758 } while_each_thread(p, t); 815 } while_each_thread(p, t);
759 816
760 /* 817 /*
@@ -780,13 +837,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
780 signal->flags = why | SIGNAL_STOP_CONTINUED; 837 signal->flags = why | SIGNAL_STOP_CONTINUED;
781 signal->group_stop_count = 0; 838 signal->group_stop_count = 0;
782 signal->group_exit_code = 0; 839 signal->group_exit_code = 0;
783 } else {
784 /*
785 * We are not stopped, but there could be a stop
786 * signal in the middle of being processed after
787 * being removed from the queue. Clear that too.
788 */
789 signal->flags &= ~SIGNAL_STOP_DEQUEUED;
790 } 840 }
791 } 841 }
792 842
@@ -875,6 +925,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
875 signal->group_stop_count = 0; 925 signal->group_stop_count = 0;
876 t = p; 926 t = p;
877 do { 927 do {
928 task_clear_group_stop_pending(t);
878 sigaddset(&t->pending.signal, SIGKILL); 929 sigaddset(&t->pending.signal, SIGKILL);
879 signal_wake_up(t, 1); 930 signal_wake_up(t, 1);
880 } while_each_thread(p, t); 931 } while_each_thread(p, t);
@@ -1109,6 +1160,7 @@ int zap_other_threads(struct task_struct *p)
1109 p->signal->group_stop_count = 0; 1160 p->signal->group_stop_count = 0;
1110 1161
1111 while_each_thread(p, t) { 1162 while_each_thread(p, t) {
1163 task_clear_group_stop_pending(t);
1112 count++; 1164 count++;
1113 1165
1114 /* Don't bother with already dead threads */ 1166 /* Don't bother with already dead threads */
@@ -1536,16 +1588,30 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1536 return ret; 1588 return ret;
1537} 1589}
1538 1590
1539static void do_notify_parent_cldstop(struct task_struct *tsk, int why) 1591/**
1592 * do_notify_parent_cldstop - notify parent of stopped/continued state change
1593 * @tsk: task reporting the state change
1594 * @for_ptracer: the notification is for ptracer
1595 * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
1596 *
1597 * Notify @tsk's parent that the stopped/continued state has changed. If
1598 * @for_ptracer is %false, @tsk's group leader notifies to its real parent.
1599 * If %true, @tsk reports to @tsk->parent which should be the ptracer.
1600 *
1601 * CONTEXT:
1602 * Must be called with tasklist_lock at least read locked.
1603 */
1604static void do_notify_parent_cldstop(struct task_struct *tsk,
1605 bool for_ptracer, int why)
1540{ 1606{
1541 struct siginfo info; 1607 struct siginfo info;
1542 unsigned long flags; 1608 unsigned long flags;
1543 struct task_struct *parent; 1609 struct task_struct *parent;
1544 struct sighand_struct *sighand; 1610 struct sighand_struct *sighand;
1545 1611
1546 if (task_ptrace(tsk)) 1612 if (for_ptracer) {
1547 parent = tsk->parent; 1613 parent = tsk->parent;
1548 else { 1614 } else {
1549 tsk = tsk->group_leader; 1615 tsk = tsk->group_leader;
1550 parent = tsk->real_parent; 1616 parent = tsk->real_parent;
1551 } 1617 }
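
How the notification generated by do_notify_parent_cldstop() surfaces in the parent can be observed from user space with a SA_SIGINFO SIGCHLD handler. The handler below is purely illustrative (standard POSIX API, not part of the patch); printf() is used only for brevity even though it is not async-signal-safe:

#include <signal.h>
#include <stdio.h>
#include <string.h>

static void chld_handler(int sig, siginfo_t *info, void *ctx)
{
	(void)sig;
	(void)ctx;
	switch (info->si_code) {		/* reason filled in by the kernel */
	case CLD_STOPPED:
		printf("child %d stopped\n", (int)info->si_pid);
		break;
	case CLD_CONTINUED:
		printf("child %d continued\n", (int)info->si_pid);
		break;
	case CLD_TRAPPED:
		printf("child %d trapped (ptrace)\n", (int)info->si_pid);
		break;
	}
}

static void install_chld_handler(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = chld_handler;
	sa.sa_flags = SA_SIGINFO;		/* request the siginfo_t argument */
	sigaction(SIGCHLD, &sa, NULL);
}
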
@@ -1621,6 +1687,15 @@ static int sigkill_pending(struct task_struct *tsk)
1621} 1687}
1622 1688
1623/* 1689/*
1690 * Test whether the target task of the usual cldstop notification - the
1691 * real_parent of @child - is in the same group as the ptracer.
1692 */
1693static bool real_parent_is_ptracer(struct task_struct *child)
1694{
1695 return same_thread_group(child->parent, child->real_parent);
1696}
1697
1698/*
1624 * This must be called with current->sighand->siglock held. 1699 * This must be called with current->sighand->siglock held.
1625 * 1700 *
1626 * This should be the path for all ptrace stops. 1701 * This should be the path for all ptrace stops.
@@ -1631,10 +1706,12 @@ static int sigkill_pending(struct task_struct *tsk)
1631 * If we actually decide not to stop at all because the tracer 1706 * If we actually decide not to stop at all because the tracer
1632 * is gone, we keep current->exit_code unless clear_code. 1707 * is gone, we keep current->exit_code unless clear_code.
1633 */ 1708 */
1634static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1709static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1635 __releases(&current->sighand->siglock) 1710 __releases(&current->sighand->siglock)
1636 __acquires(&current->sighand->siglock) 1711 __acquires(&current->sighand->siglock)
1637{ 1712{
1713 bool gstop_done = false;
1714
1638 if (arch_ptrace_stop_needed(exit_code, info)) { 1715 if (arch_ptrace_stop_needed(exit_code, info)) {
1639 /* 1716 /*
1640 * The arch code has something special to do before a 1717 * The arch code has something special to do before a
@@ -1655,21 +1732,49 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1655 } 1732 }
1656 1733
1657 /* 1734 /*
1658 * If there is a group stop in progress, 1735 * If @why is CLD_STOPPED, we're trapping to participate in a group
1659 * we must participate in the bookkeeping. 1736 * stop. Do the bookkeeping. Note that if SIGCONT was delivered
1737 * while siglock was released for the arch hook, PENDING could be
1738 * clear now. We act as if SIGCONT is received after TASK_TRACED
1739 * is entered - ignore it.
1660 */ 1740 */
1661 if (current->signal->group_stop_count > 0) 1741 if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING))
1662 --current->signal->group_stop_count; 1742 gstop_done = task_participate_group_stop(current);
1663 1743
1664 current->last_siginfo = info; 1744 current->last_siginfo = info;
1665 current->exit_code = exit_code; 1745 current->exit_code = exit_code;
1666 1746
1667 /* Let the debugger run. */ 1747 /*
1668 __set_current_state(TASK_TRACED); 1748 * TRACED should be visible before TRAPPING is cleared; otherwise,
1749 * the tracer might fail do_wait().
1750 */
1751 set_current_state(TASK_TRACED);
1752
1753 /*
1754 * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and
1755 * transition to TASK_TRACED should be atomic with respect to
1756 * siglock. This should be done after the arch hook as siglock is
1757 * released and regrabbed across it.
1758 */
1759 task_clear_group_stop_trapping(current);
1760
1669 spin_unlock_irq(&current->sighand->siglock); 1761 spin_unlock_irq(&current->sighand->siglock);
1670 read_lock(&tasklist_lock); 1762 read_lock(&tasklist_lock);
1671 if (may_ptrace_stop()) { 1763 if (may_ptrace_stop()) {
1672 do_notify_parent_cldstop(current, CLD_TRAPPED); 1764 /*
1765 * Notify parents of the stop.
1766 *
1767 * While ptraced, there are two parents - the ptracer and
1768 * the real_parent of the group_leader. The ptracer should
1769 * know about every stop while the real parent is only
1770 * interested in the completion of group stop. The states
1771 * for the two don't interact with each other. Notify
1772 * separately unless they're gonna be duplicates.
1773 */
1774 do_notify_parent_cldstop(current, true, why);
1775 if (gstop_done && !real_parent_is_ptracer(current))
1776 do_notify_parent_cldstop(current, false, why);
1777
1673 /* 1778 /*
1674 * Don't want to allow preemption here, because 1779 * Don't want to allow preemption here, because
1675 * sys_ptrace() needs this task to be inactive. 1780 * sys_ptrace() needs this task to be inactive.
@@ -1684,7 +1789,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1684 /* 1789 /*
1685 * By the time we got the lock, our tracer went away. 1790 * By the time we got the lock, our tracer went away.
1686 * Don't drop the lock yet, another tracer may come. 1791 * Don't drop the lock yet, another tracer may come.
1792 *
1793 * If @gstop_done, the ptracer went away between group stop
1794 * completion and here. During detach, it would have set
1795 * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED
1796 * in do_signal_stop() on return, so notifying the real
1797 * parent of the group stop completion is enough.
1687 */ 1798 */
1799 if (gstop_done)
1800 do_notify_parent_cldstop(current, false, why);
1801
1688 __set_current_state(TASK_RUNNING); 1802 __set_current_state(TASK_RUNNING);
1689 if (clear_code) 1803 if (clear_code)
1690 current->exit_code = 0; 1804 current->exit_code = 0;
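
For reference, the tracer side of the path above is ordinary ptrace(2); a minimal sketch (standard API, not part of the patch) that attaches, observes the resulting trap via waitpid() and detaches again:

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

static int trace_once(pid_t pid)
{
	int status;

	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1)
		return -1;
	if (waitpid(pid, &status, 0) == -1)	/* tracee enters TASK_TRACED */
		return -1;
	if (WIFSTOPPED(status))
		printf("tracee stopped by signal %d\n", WSTOPSIG(status));

	/* resume the tracee and drop the attachment again */
	return ptrace(PTRACE_DETACH, pid, NULL, NULL);
}
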
@@ -1728,7 +1842,7 @@ void ptrace_notify(int exit_code)
1728 1842
1729 /* Let the debugger run. */ 1843 /* Let the debugger run. */
1730 spin_lock_irq(&current->sighand->siglock); 1844 spin_lock_irq(&current->sighand->siglock);
1731 ptrace_stop(exit_code, 1, &info); 1845 ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
1732 spin_unlock_irq(&current->sighand->siglock); 1846 spin_unlock_irq(&current->sighand->siglock);
1733} 1847}
1734 1848
@@ -1741,66 +1855,115 @@ void ptrace_notify(int exit_code)
1741static int do_signal_stop(int signr) 1855static int do_signal_stop(int signr)
1742{ 1856{
1743 struct signal_struct *sig = current->signal; 1857 struct signal_struct *sig = current->signal;
1744 int notify;
1745 1858
1746 if (!sig->group_stop_count) { 1859 if (!(current->group_stop & GROUP_STOP_PENDING)) {
1860 unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
1747 struct task_struct *t; 1861 struct task_struct *t;
1748 1862
1749 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1863 /* signr will be recorded in task->group_stop for retries */
1864 WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK);
1865
1866 if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) ||
1750 unlikely(signal_group_exit(sig))) 1867 unlikely(signal_group_exit(sig)))
1751 return 0; 1868 return 0;
1752 /* 1869 /*
1753 * There is no group stop already in progress. 1870 * There is no group stop already in progress. We must
1754 * We must initiate one now. 1871 * initiate one now.
1872 *
1873 * While ptraced, a task may be resumed while group stop is
1874 * still in effect and then receive a stop signal and
1875 * initiate another group stop. This deviates from the
1876 * usual behavior as two consecutive stop signals can't
1877 * cause two group stops when !ptraced. That is why we
1878 * also check !task_is_stopped(t) below.
1879 *
1880 * The condition can be distinguished by testing whether
1881 * SIGNAL_STOP_STOPPED is already set. Don't generate
1882 * group_exit_code in such case.
1883 *
1884 * This is not necessary for SIGNAL_STOP_CONTINUED because
1885 * an intervening stop signal is required to cause two
1886 * continued events regardless of ptrace.
1755 */ 1887 */
1756 sig->group_exit_code = signr; 1888 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1889 sig->group_exit_code = signr;
1890 else
1891 WARN_ON_ONCE(!task_ptrace(current));
1757 1892
1893 current->group_stop &= ~GROUP_STOP_SIGMASK;
1894 current->group_stop |= signr | gstop;
1758 sig->group_stop_count = 1; 1895 sig->group_stop_count = 1;
1759 for (t = next_thread(current); t != current; t = next_thread(t)) 1896 for (t = next_thread(current); t != current;
1897 t = next_thread(t)) {
1898 t->group_stop &= ~GROUP_STOP_SIGMASK;
1760 /* 1899 /*
1761 * Setting state to TASK_STOPPED for a group 1900 * Setting state to TASK_STOPPED for a group
1762 * stop is always done with the siglock held, 1901 * stop is always done with the siglock held,
1763 * so this check has no races. 1902 * so this check has no races.
1764 */ 1903 */
1765 if (!(t->flags & PF_EXITING) && 1904 if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
1766 !task_is_stopped_or_traced(t)) { 1905 t->group_stop |= signr | gstop;
1767 sig->group_stop_count++; 1906 sig->group_stop_count++;
1768 signal_wake_up(t, 0); 1907 signal_wake_up(t, 0);
1769 } 1908 }
1909 }
1770 } 1910 }
1771 /* 1911retry:
1772 * If there are no other threads in the group, or if there is 1912 if (likely(!task_ptrace(current))) {
1773 * a group stop in progress and we are the last to stop, report 1913 int notify = 0;
1774 * to the parent. When ptraced, every thread reports itself. 1914
1775 */ 1915 /*
1776 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; 1916 * If there are no other threads in the group, or if there
1777 notify = tracehook_notify_jctl(notify, CLD_STOPPED); 1917 * is a group stop in progress and we are the last to stop,
1778 /* 1918 * report to the parent.
1779 * tracehook_notify_jctl() can drop and reacquire siglock, so 1919 */
1780 * we keep ->group_stop_count != 0 before the call. If SIGCONT 1920 if (task_participate_group_stop(current))
1781 * or SIGKILL comes in between ->group_stop_count == 0. 1921 notify = CLD_STOPPED;
1782 */ 1922
1783 if (sig->group_stop_count) {
1784 if (!--sig->group_stop_count)
1785 sig->flags = SIGNAL_STOP_STOPPED;
1786 current->exit_code = sig->group_exit_code;
1787 __set_current_state(TASK_STOPPED); 1923 __set_current_state(TASK_STOPPED);
1924 spin_unlock_irq(&current->sighand->siglock);
1925
1926 /*
1927 * Notify the parent of the group stop completion. Because
1928 * we're not holding either the siglock or tasklist_lock
1929 * here, a ptracer may attach in between; however, this is for
1930 * group stop and should always be delivered to the real
1931 * parent of the group leader. The new ptracer will get
1932 * its notification when this task transitions into
1933 * TASK_TRACED.
1934 */
1935 if (notify) {
1936 read_lock(&tasklist_lock);
1937 do_notify_parent_cldstop(current, false, notify);
1938 read_unlock(&tasklist_lock);
1939 }
1940
1941 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1942 schedule();
1943
1944 spin_lock_irq(&current->sighand->siglock);
1945 } else {
1946 ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK,
1947 CLD_STOPPED, 0, NULL);
1948 current->exit_code = 0;
1788 } 1949 }
1789 spin_unlock_irq(&current->sighand->siglock);
1790 1950
1791 if (notify) { 1951 /*
1792 read_lock(&tasklist_lock); 1952 * GROUP_STOP_PENDING could be set if another group stop has
1793 do_notify_parent_cldstop(current, notify); 1953 * started since being woken up or ptrace wants us to transit
1794 read_unlock(&tasklist_lock); 1954 * between TASK_STOPPED and TRACED. Retry group stop.
1955 */
1956 if (current->group_stop & GROUP_STOP_PENDING) {
1957 WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK));
1958 goto retry;
1795 } 1959 }
1796 1960
1797 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 1961 /* PTRACE_ATTACH might have raced with task killing, clear trapping */
1798 do { 1962 task_clear_group_stop_trapping(current);
1799 schedule(); 1963
1800 } while (try_to_freeze()); 1964 spin_unlock_irq(&current->sighand->siglock);
1801 1965
1802 tracehook_finish_jctl(); 1966 tracehook_finish_jctl();
1803 current->exit_code = 0;
1804 1967
1805 return 1; 1968 return 1;
1806} 1969}
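
The rewritten do_signal_stop() keeps the stop signal number in the low bits of ->group_stop (GROUP_STOP_SIGMASK) and state flags in the high bits, so one word survives the retry loop above. A stand-alone sketch of that packing idea, using made-up names rather than the kernel's:

/* Hypothetical constants, chosen only to mirror the flag/signal packing. */
#define DEMO_STOP_SIGMASK	0xffff		/* low bits: stop signal number */
#define DEMO_STOP_PENDING	(1 << 16)	/* example state flag */
#define DEMO_STOP_CONSUME	(1 << 17)	/* example state flag */

static unsigned int demo_pack_stop(int signr)
{
	/* record the signal and mark the stop as pending/consumable */
	return (signr & DEMO_STOP_SIGMASK) | DEMO_STOP_PENDING | DEMO_STOP_CONSUME;
}

static int demo_stop_signr(unsigned int group_stop)
{
	return group_stop & DEMO_STOP_SIGMASK;	/* recover the signal number */
}
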
@@ -1814,7 +1977,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
1814 ptrace_signal_deliver(regs, cookie); 1977 ptrace_signal_deliver(regs, cookie);
1815 1978
1816 /* Let the debugger run. */ 1979 /* Let the debugger run. */
1817 ptrace_stop(signr, 0, info); 1980 ptrace_stop(signr, CLD_TRAPPED, 0, info);
1818 1981
1819 /* We're back. Did the debugger cancel the sig? */ 1982 /* We're back. Did the debugger cancel the sig? */
1820 signr = current->exit_code; 1983 signr = current->exit_code;
@@ -1869,18 +2032,36 @@ relock:
1869 * the CLD_ si_code into SIGNAL_CLD_MASK bits. 2032 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
1870 */ 2033 */
1871 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { 2034 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
1872 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 2035 struct task_struct *leader;
1873 ? CLD_CONTINUED : CLD_STOPPED; 2036 int why;
2037
2038 if (signal->flags & SIGNAL_CLD_CONTINUED)
2039 why = CLD_CONTINUED;
2040 else
2041 why = CLD_STOPPED;
2042
1874 signal->flags &= ~SIGNAL_CLD_MASK; 2043 signal->flags &= ~SIGNAL_CLD_MASK;
1875 2044
1876 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1877 spin_unlock_irq(&sighand->siglock); 2045 spin_unlock_irq(&sighand->siglock);
1878 2046
1879 if (why) { 2047 /*
1880 read_lock(&tasklist_lock); 2048 * Notify the parent that we're continuing. This event is
1881 do_notify_parent_cldstop(current->group_leader, why); 2049 * always per-process and doesn't make a whole lot of sense
1882 read_unlock(&tasklist_lock); 2050 * for ptracers, who shouldn't consume the state via
1883 } 2051 * wait(2) either, but, for backward compatibility, notify
2052 * the ptracer of the group leader too unless it's gonna be
2053 * a duplicate.
2054 */
2055 read_lock(&tasklist_lock);
2056
2057 do_notify_parent_cldstop(current, false, why);
2058
2059 leader = current->group_leader;
2060 if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
2061 do_notify_parent_cldstop(leader, true, why);
2062
2063 read_unlock(&tasklist_lock);
2064
1884 goto relock; 2065 goto relock;
1885 } 2066 }
1886 2067
@@ -1897,8 +2078,8 @@ relock:
1897 if (unlikely(signr != 0)) 2078 if (unlikely(signr != 0))
1898 ka = return_ka; 2079 ka = return_ka;
1899 else { 2080 else {
1900 if (unlikely(signal->group_stop_count > 0) && 2081 if (unlikely(current->group_stop &
1901 do_signal_stop(0)) 2082 GROUP_STOP_PENDING) && do_signal_stop(0))
1902 goto relock; 2083 goto relock;
1903 2084
1904 signr = dequeue_signal(current, &current->blocked, 2085 signr = dequeue_signal(current, &current->blocked,
@@ -2017,10 +2198,42 @@ relock:
2017 return signr; 2198 return signr;
2018} 2199}
2019 2200
2201/*
2202 * It could be that complete_signal() picked us to notify about the
2203 * group-wide signal. Other threads should be notified now to take
2204 * the shared signals in @which since we will not.
2205 */
2206static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
2207{
2208 sigset_t retarget;
2209 struct task_struct *t;
2210
2211 sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
2212 if (sigisemptyset(&retarget))
2213 return;
2214
2215 t = tsk;
2216 while_each_thread(tsk, t) {
2217 if (t->flags & PF_EXITING)
2218 continue;
2219
2220 if (!has_pending_signals(&retarget, &t->blocked))
2221 continue;
2222 /* Remove the signals this thread can handle. */
2223 sigandsets(&retarget, &retarget, &t->blocked);
2224
2225 if (!signal_pending(t))
2226 signal_wake_up(t, 0);
2227
2228 if (sigisemptyset(&retarget))
2229 break;
2230 }
2231}
2232
2020void exit_signals(struct task_struct *tsk) 2233void exit_signals(struct task_struct *tsk)
2021{ 2234{
2022 int group_stop = 0; 2235 int group_stop = 0;
2023 struct task_struct *t; 2236 sigset_t unblocked;
2024 2237
2025 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2238 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2026 tsk->flags |= PF_EXITING; 2239 tsk->flags |= PF_EXITING;
@@ -2036,26 +2249,23 @@ void exit_signals(struct task_struct *tsk)
2036 if (!signal_pending(tsk)) 2249 if (!signal_pending(tsk))
2037 goto out; 2250 goto out;
2038 2251
2039 /* 2252 unblocked = tsk->blocked;
2040 * It could be that __group_complete_signal() choose us to 2253 signotset(&unblocked);
2041 * notify about group-wide signal. Another thread should be 2254 retarget_shared_pending(tsk, &unblocked);
2042 * woken now to take the signal since we will not.
2043 */
2044 for (t = tsk; (t = next_thread(t)) != tsk; )
2045 if (!signal_pending(t) && !(t->flags & PF_EXITING))
2046 recalc_sigpending_and_wake(t);
2047 2255
2048 if (unlikely(tsk->signal->group_stop_count) && 2256 if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) &&
2049 !--tsk->signal->group_stop_count) { 2257 task_participate_group_stop(tsk))
2050 tsk->signal->flags = SIGNAL_STOP_STOPPED; 2258 group_stop = CLD_STOPPED;
2051 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
2052 }
2053out: 2259out:
2054 spin_unlock_irq(&tsk->sighand->siglock); 2260 spin_unlock_irq(&tsk->sighand->siglock);
2055 2261
2262 /*
2263 * If group stop has completed, deliver the notification. This
2264 * should always go to the real parent of the group leader.
2265 */
2056 if (unlikely(group_stop)) { 2266 if (unlikely(group_stop)) {
2057 read_lock(&tasklist_lock); 2267 read_lock(&tasklist_lock);
2058 do_notify_parent_cldstop(tsk, group_stop); 2268 do_notify_parent_cldstop(tsk, false, group_stop);
2059 read_unlock(&tasklist_lock); 2269 read_unlock(&tasklist_lock);
2060 } 2270 }
2061} 2271}
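
The retargeting above mirrors the user-visible rule that a process-directed signal is delivered to some thread that has it unblocked. A small pthreads sketch of that rule (standard POSIX, not part of the patch; link with -lpthread; the one-second sleep is a deliberately crude way to let the worker unblock first):

#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_usr1;

static void usr1_handler(int sig)
{
	(void)sig;
	got_usr1 = 1;
}

static void *worker(void *arg)
{
	sigset_t set;

	(void)arg;
	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	pthread_sigmask(SIG_UNBLOCK, &set, NULL);	/* only this thread accepts it */
	while (!got_usr1)
		pause();				/* handler runs here */
	printf("worker handled SIGUSR1\n");
	return NULL;
}

int main(void)
{
	pthread_t tid;
	sigset_t set;

	signal(SIGUSR1, usr1_handler);
	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	pthread_sigmask(SIG_BLOCK, &set, NULL);	/* main thread keeps it blocked */

	pthread_create(&tid, NULL, worker, NULL);
	sleep(1);
	kill(getpid(), SIGUSR1);		/* process-directed: lands in the worker */
	pthread_join(tid, NULL);
	return 0;
}
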
@@ -2089,11 +2299,33 @@ long do_no_restart_syscall(struct restart_block *param)
2089 return -EINTR; 2299 return -EINTR;
2090} 2300}
2091 2301
2092/* 2302static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2093 * We don't need to get the kernel lock - this is all local to this 2303{
2094 * particular thread.. (and that's good, because this is _heavily_ 2304 if (signal_pending(tsk) && !thread_group_empty(tsk)) {
2095 * used by various programs) 2305 sigset_t newblocked;
2306 /* A set of now blocked but previously unblocked signals. */
2307 sigandnsets(&newblocked, newset, &current->blocked);
2308 retarget_shared_pending(tsk, &newblocked);
2309 }
2310 tsk->blocked = *newset;
2311 recalc_sigpending();
2312}
2313
2314/**
2315 * set_current_blocked - change current->blocked mask
2316 * @newset: new mask
2317 *
2318 * It is wrong to change ->blocked directly, this helper should be used
2319 * to ensure the process can't miss a shared signal we are going to block.
2096 */ 2320 */
2321void set_current_blocked(const sigset_t *newset)
2322{
2323 struct task_struct *tsk = current;
2324
2325 spin_lock_irq(&tsk->sighand->siglock);
2326 __set_task_blocked(tsk, newset);
2327 spin_unlock_irq(&tsk->sighand->siglock);
2328}
2097 2329
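
A hedged sketch of the call pattern the new helper expects; the caller below is hypothetical, while set_current_blocked() and the sigset helpers are the ones used in the code above:

static void example_block_sigusr1(void)
{
	sigset_t newset;

	newset = current->blocked;	/* start from the current mask */
	sigaddset(&newset, SIGUSR1);	/* add the signal we want blocked */
	set_current_blocked(&newset);	/* never write ->blocked directly */
}
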
2098/* 2330/*
2099 * This is also useful for kernel threads that want to temporarily 2331 * This is also useful for kernel threads that want to temporarily
@@ -2105,73 +2337,66 @@ long do_no_restart_syscall(struct restart_block *param)
2105 */ 2337 */
2106int sigprocmask(int how, sigset_t *set, sigset_t *oldset) 2338int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2107{ 2339{
2108 int error; 2340 struct task_struct *tsk = current;
2341 sigset_t newset;
2109 2342
2110 spin_lock_irq(&current->sighand->siglock); 2343 /* Lockless, only current can change ->blocked, never from irq */
2111 if (oldset) 2344 if (oldset)
2112 *oldset = current->blocked; 2345 *oldset = tsk->blocked;
2113 2346
2114 error = 0;
2115 switch (how) { 2347 switch (how) {
2116 case SIG_BLOCK: 2348 case SIG_BLOCK:
2117 sigorsets(&current->blocked, &current->blocked, set); 2349 sigorsets(&newset, &tsk->blocked, set);
2118 break; 2350 break;
2119 case SIG_UNBLOCK: 2351 case SIG_UNBLOCK:
2120 signandsets(&current->blocked, &current->blocked, set); 2352 sigandnsets(&newset, &tsk->blocked, set);
2121 break; 2353 break;
2122 case SIG_SETMASK: 2354 case SIG_SETMASK:
2123 current->blocked = *set; 2355 newset = *set;
2124 break; 2356 break;
2125 default: 2357 default:
2126 error = -EINVAL; 2358 return -EINVAL;
2127 } 2359 }
2128 recalc_sigpending();
2129 spin_unlock_irq(&current->sighand->siglock);
2130 2360
2131 return error; 2361 set_current_blocked(&newset);
2362 return 0;
2132} 2363}
2133 2364
2134/** 2365/**
2135 * sys_rt_sigprocmask - change the list of currently blocked signals 2366 * sys_rt_sigprocmask - change the list of currently blocked signals
2136 * @how: whether to add, remove, or set signals 2367 * @how: whether to add, remove, or set signals
2137 * @set: stores pending signals 2368 * @nset: stores pending signals
2138 * @oset: previous value of signal mask if non-null 2369 * @oset: previous value of signal mask if non-null
2139 * @sigsetsize: size of sigset_t type 2370 * @sigsetsize: size of sigset_t type
2140 */ 2371 */
2141SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, 2372SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
2142 sigset_t __user *, oset, size_t, sigsetsize) 2373 sigset_t __user *, oset, size_t, sigsetsize)
2143{ 2374{
2144 int error = -EINVAL;
2145 sigset_t old_set, new_set; 2375 sigset_t old_set, new_set;
2376 int error;
2146 2377
2147 /* XXX: Don't preclude handling different sized sigset_t's. */ 2378 /* XXX: Don't preclude handling different sized sigset_t's. */
2148 if (sigsetsize != sizeof(sigset_t)) 2379 if (sigsetsize != sizeof(sigset_t))
2149 goto out; 2380 return -EINVAL;
2150 2381
2151 if (set) { 2382 old_set = current->blocked;
2152 error = -EFAULT; 2383
2153 if (copy_from_user(&new_set, set, sizeof(*set))) 2384 if (nset) {
2154 goto out; 2385 if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
2386 return -EFAULT;
2155 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2387 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2156 2388
2157 error = sigprocmask(how, &new_set, &old_set); 2389 error = sigprocmask(how, &new_set, NULL);
2158 if (error) 2390 if (error)
2159 goto out; 2391 return error;
2160 if (oset) 2392 }
2161 goto set_old;
2162 } else if (oset) {
2163 spin_lock_irq(&current->sighand->siglock);
2164 old_set = current->blocked;
2165 spin_unlock_irq(&current->sighand->siglock);
2166 2393
2167 set_old: 2394 if (oset) {
2168 error = -EFAULT; 2395 if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
2169 if (copy_to_user(oset, &old_set, sizeof(*oset))) 2396 return -EFAULT;
2170 goto out;
2171 } 2397 }
2172 error = 0; 2398
2173out: 2399 return 0;
2174 return error;
2175} 2400}
2176 2401
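
From user space the syscall above backs sigprocmask(3)/pthread_sigmask(3). A typical block-then-restore sequence, shown only as an illustration of the `how` modes handled above (the critical callback is hypothetical):

#include <signal.h>

static int run_without_sigint(void (*critical)(void))
{
	sigset_t block, old;

	sigemptyset(&block);
	sigaddset(&block, SIGINT);
	if (sigprocmask(SIG_BLOCK, &block, &old) == -1)	/* old mask comes back via oset */
		return -1;

	critical();				/* must not be interrupted by SIGINT */

	return sigprocmask(SIG_SETMASK, &old, NULL);	/* restore the previous mask */
}
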
2177long do_sigpending(void __user *set, unsigned long sigsetsize) 2402long do_sigpending(void __user *set, unsigned long sigsetsize)
@@ -2284,6 +2509,66 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2284#endif 2509#endif
2285 2510
2286/** 2511/**
2512 * do_sigtimedwait - wait for queued signals specified in @which
2513 * @which: queued signals to wait for
2514 * @info: if non-null, the signal's siginfo is returned here
2515 * @ts: upper bound on process time suspension
2516 */
2517int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2518 const struct timespec *ts)
2519{
2520 struct task_struct *tsk = current;
2521 long timeout = MAX_SCHEDULE_TIMEOUT;
2522 sigset_t mask = *which;
2523 int sig;
2524
2525 if (ts) {
2526 if (!timespec_valid(ts))
2527 return -EINVAL;
2528 timeout = timespec_to_jiffies(ts);
2529 /*
2530 * We can be close to the next tick, add another one
2531 * to ensure we will wait at least the time asked for.
2532 */
2533 if (ts->tv_sec || ts->tv_nsec)
2534 timeout++;
2535 }
2536
2537 /*
2538 * Invert the set of allowed signals to get those we want to block.
2539 */
2540 sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
2541 signotset(&mask);
2542
2543 spin_lock_irq(&tsk->sighand->siglock);
2544 sig = dequeue_signal(tsk, &mask, info);
2545 if (!sig && timeout) {
2546 /*
2547 * None ready, temporarily unblock those we're interested in
2548 * while we are sleeping so that we'll be awakened when
2549 * they arrive. Unblocking is always fine, we can avoid
2550 * set_current_blocked().
2551 */
2552 tsk->real_blocked = tsk->blocked;
2553 sigandsets(&tsk->blocked, &tsk->blocked, &mask);
2554 recalc_sigpending();
2555 spin_unlock_irq(&tsk->sighand->siglock);
2556
2557 timeout = schedule_timeout_interruptible(timeout);
2558
2559 spin_lock_irq(&tsk->sighand->siglock);
2560 __set_task_blocked(tsk, &tsk->real_blocked);
2561 siginitset(&tsk->real_blocked, 0);
2562 sig = dequeue_signal(tsk, &mask, info);
2563 }
2564 spin_unlock_irq(&tsk->sighand->siglock);
2565
2566 if (sig)
2567 return sig;
2568 return timeout ? -EINTR : -EAGAIN;
2569}
2570
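
The user-space counterpart of the new helper is sigtimedwait(2); an illustrative caller (standard POSIX, not part of the patch) that waits up to five seconds for SIGUSR1:

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>

static int wait_for_usr1(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
	int sig;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* must be blocked to be waited for */

	sig = sigtimedwait(&set, &info, &ts);
	if (sig == -1)
		return errno == EAGAIN ? 0 : -1;	/* EAGAIN: timed out */

	printf("got signal %d from pid %d\n", sig, (int)info.si_pid);
	return sig;
}
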
2571/**
2287 * sys_rt_sigtimedwait - synchronously wait for queued signals specified 2572 * sys_rt_sigtimedwait - synchronously wait for queued signals specified
2288 * in @uthese 2573 * in @uthese
2289 * @uthese: queued signals to wait for 2574 * @uthese: queued signals to wait for
@@ -2295,11 +2580,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2295 siginfo_t __user *, uinfo, const struct timespec __user *, uts, 2580 siginfo_t __user *, uinfo, const struct timespec __user *, uts,
2296 size_t, sigsetsize) 2581 size_t, sigsetsize)
2297{ 2582{
2298 int ret, sig;
2299 sigset_t these; 2583 sigset_t these;
2300 struct timespec ts; 2584 struct timespec ts;
2301 siginfo_t info; 2585 siginfo_t info;
2302 long timeout = 0; 2586 int ret;
2303 2587
2304 /* XXX: Don't preclude handling different sized sigset_t's. */ 2588 /* XXX: Don't preclude handling different sized sigset_t's. */
2305 if (sigsetsize != sizeof(sigset_t)) 2589 if (sigsetsize != sizeof(sigset_t))
@@ -2308,61 +2592,16 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2308 if (copy_from_user(&these, uthese, sizeof(these))) 2592 if (copy_from_user(&these, uthese, sizeof(these)))
2309 return -EFAULT; 2593 return -EFAULT;
2310 2594
2311 /*
2312 * Invert the set of allowed signals to get those we
2313 * want to block.
2314 */
2315 sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
2316 signotset(&these);
2317
2318 if (uts) { 2595 if (uts) {
2319 if (copy_from_user(&ts, uts, sizeof(ts))) 2596 if (copy_from_user(&ts, uts, sizeof(ts)))
2320 return -EFAULT; 2597 return -EFAULT;
2321 if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
2322 || ts.tv_sec < 0)
2323 return -EINVAL;
2324 } 2598 }
2325 2599
2326 spin_lock_irq(&current->sighand->siglock); 2600 ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
2327 sig = dequeue_signal(current, &these, &info);
2328 if (!sig) {
2329 timeout = MAX_SCHEDULE_TIMEOUT;
2330 if (uts)
2331 timeout = (timespec_to_jiffies(&ts)
2332 + (ts.tv_sec || ts.tv_nsec));
2333
2334 if (timeout) {
2335 /*
2336 * None ready -- temporarily unblock those we're
2337 * interested while we are sleeping in so that we'll
2338 * be awakened when they arrive.
2339 */
2340 current->real_blocked = current->blocked;
2341 sigandsets(&current->blocked, &current->blocked, &these);
2342 recalc_sigpending();
2343 spin_unlock_irq(&current->sighand->siglock);
2344
2345 timeout = schedule_timeout_interruptible(timeout);
2346
2347 spin_lock_irq(&current->sighand->siglock);
2348 sig = dequeue_signal(current, &these, &info);
2349 current->blocked = current->real_blocked;
2350 siginitset(&current->real_blocked, 0);
2351 recalc_sigpending();
2352 }
2353 }
2354 spin_unlock_irq(&current->sighand->siglock);
2355 2601
2356 if (sig) { 2602 if (ret > 0 && uinfo) {
2357 ret = sig; 2603 if (copy_siginfo_to_user(uinfo, &info))
2358 if (uinfo) { 2604 ret = -EFAULT;
2359 if (copy_siginfo_to_user(uinfo, &info))
2360 ret = -EFAULT;
2361 }
2362 } else {
2363 ret = -EAGAIN;
2364 if (timeout)
2365 ret = -EINTR;
2366 } 2605 }
2367 2606
2368 return ret; 2607 return ret;
@@ -2650,60 +2889,51 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2650/** 2889/**
2651 * sys_sigprocmask - examine and change blocked signals 2890 * sys_sigprocmask - examine and change blocked signals
2652 * @how: whether to add, remove, or set signals 2891 * @how: whether to add, remove, or set signals
2653 * @set: signals to add or remove (if non-null) 2892 * @nset: signals to add or remove (if non-null)
2654 * @oset: previous value of signal mask if non-null 2893 * @oset: previous value of signal mask if non-null
2655 * 2894 *
2656 * Some platforms have their own version with special arguments; 2895 * Some platforms have their own version with special arguments;
2657 * others support only sys_rt_sigprocmask. 2896 * others support only sys_rt_sigprocmask.
2658 */ 2897 */
2659 2898
2660SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, 2899SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
2661 old_sigset_t __user *, oset) 2900 old_sigset_t __user *, oset)
2662{ 2901{
2663 int error;
2664 old_sigset_t old_set, new_set; 2902 old_sigset_t old_set, new_set;
2903 sigset_t new_blocked;
2665 2904
2666 if (set) { 2905 old_set = current->blocked.sig[0];
2667 error = -EFAULT; 2906
2668 if (copy_from_user(&new_set, set, sizeof(*set))) 2907 if (nset) {
2669 goto out; 2908 if (copy_from_user(&new_set, nset, sizeof(*nset)))
2909 return -EFAULT;
2670 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); 2910 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
2671 2911
2672 spin_lock_irq(&current->sighand->siglock); 2912 new_blocked = current->blocked;
2673 old_set = current->blocked.sig[0];
2674 2913
2675 error = 0;
2676 switch (how) { 2914 switch (how) {
2677 default:
2678 error = -EINVAL;
2679 break;
2680 case SIG_BLOCK: 2915 case SIG_BLOCK:
2681 sigaddsetmask(&current->blocked, new_set); 2916 sigaddsetmask(&new_blocked, new_set);
2682 break; 2917 break;
2683 case SIG_UNBLOCK: 2918 case SIG_UNBLOCK:
2684 sigdelsetmask(&current->blocked, new_set); 2919 sigdelsetmask(&new_blocked, new_set);
2685 break; 2920 break;
2686 case SIG_SETMASK: 2921 case SIG_SETMASK:
2687 current->blocked.sig[0] = new_set; 2922 new_blocked.sig[0] = new_set;
2688 break; 2923 break;
2924 default:
2925 return -EINVAL;
2689 } 2926 }
2690 2927
2691 recalc_sigpending(); 2928 set_current_blocked(&new_blocked);
2692 spin_unlock_irq(&current->sighand->siglock); 2929 }
2693 if (error) 2930
2694 goto out; 2931 if (oset) {
2695 if (oset)
2696 goto set_old;
2697 } else if (oset) {
2698 old_set = current->blocked.sig[0];
2699 set_old:
2700 error = -EFAULT;
2701 if (copy_to_user(oset, &old_set, sizeof(*oset))) 2932 if (copy_to_user(oset, &old_set, sizeof(*oset)))
2702 goto out; 2933 return -EFAULT;
2703 } 2934 }
2704 error = 0; 2935
2705out: 2936 return 0;
2706 return error;
2707} 2937}
2708#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 2938#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2709 2939
@@ -2793,8 +3023,10 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
2793 3023
2794SYSCALL_DEFINE0(pause) 3024SYSCALL_DEFINE0(pause)
2795{ 3025{
2796 current->state = TASK_INTERRUPTIBLE; 3026 while (!signal_pending(current)) {
2797 schedule(); 3027 current->state = TASK_INTERRUPTIBLE;
3028 schedule();
3029 }
2798 return -ERESTARTNOHAND; 3030 return -ERESTARTNOHAND;
2799} 3031}
2800 3032
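
The pause() change above follows the usual sleep idiom: publish the intent to sleep, re-check the wakeup condition, and only then schedule, so a signal that races with the check cannot be lost. A generic sketch of that idiom (the event_pending callback is hypothetical; set_current_state(), signal_pending() and schedule() are the real primitives used above):

static void wait_for_event(int (*event_pending)(void))
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);	/* visible before the check */
		if (event_pending() || signal_pending(current))
			break;
		schedule();				/* sleep until woken */
	}
	__set_current_state(TASK_RUNNING);
}
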
diff --git a/kernel/smp.c b/kernel/smp.c
index 73a195193558..fb67dfa8394e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -74,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
74 .notifier_call = hotplug_cfd, 74 .notifier_call = hotplug_cfd,
75}; 75};
76 76
77static int __cpuinit init_call_single_data(void) 77void __init call_function_init(void)
78{ 78{
79 void *cpu = (void *)(long)smp_processor_id(); 79 void *cpu = (void *)(long)smp_processor_id();
80 int i; 80 int i;
@@ -88,10 +88,7 @@ static int __cpuinit init_call_single_data(void)
88 88
89 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); 89 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
90 register_cpu_notifier(&hotplug_cfd_notifier); 90 register_cpu_notifier(&hotplug_cfd_notifier);
91
92 return 0;
93} 91}
94early_initcall(init_call_single_data);
95 92
96/* 93/*
97 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources 94 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 174f976c2874..40cf63ddd4b3 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
64/* 64/*
diff --git a/kernel/sys.c b/kernel/sys.c
index af468edf096a..e4128b278f23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -314,8 +314,8 @@ void kernel_restart_prepare(char *cmd)
314{ 314{
315 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 315 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
316 system_state = SYSTEM_RESTART; 316 system_state = SYSTEM_RESTART;
317 usermodehelper_disable();
317 device_shutdown(); 318 device_shutdown();
318 sysdev_shutdown();
319 syscore_shutdown(); 319 syscore_shutdown();
320} 320}
321 321
@@ -344,6 +344,7 @@ static void kernel_shutdown_prepare(enum system_states state)
344 blocking_notifier_call_chain(&reboot_notifier_list, 344 blocking_notifier_call_chain(&reboot_notifier_list,
345 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 345 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
346 system_state = state; 346 system_state = state;
347 usermodehelper_disable();
347 device_shutdown(); 348 device_shutdown();
348} 349}
349/** 350/**
@@ -354,7 +355,6 @@ static void kernel_shutdown_prepare(enum system_states state)
354void kernel_halt(void) 355void kernel_halt(void)
355{ 356{
356 kernel_shutdown_prepare(SYSTEM_HALT); 357 kernel_shutdown_prepare(SYSTEM_HALT);
357 sysdev_shutdown();
358 syscore_shutdown(); 358 syscore_shutdown();
359 printk(KERN_EMERG "System halted.\n"); 359 printk(KERN_EMERG "System halted.\n");
360 kmsg_dump(KMSG_DUMP_HALT); 360 kmsg_dump(KMSG_DUMP_HALT);
@@ -374,7 +374,6 @@ void kernel_power_off(void)
374 if (pm_power_off_prepare) 374 if (pm_power_off_prepare)
375 pm_power_off_prepare(); 375 pm_power_off_prepare();
376 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
377 sysdev_shutdown();
378 syscore_shutdown(); 377 syscore_shutdown();
379 printk(KERN_EMERG "Power down.\n"); 378 printk(KERN_EMERG "Power down.\n");
380 kmsg_dump(KMSG_DUMP_POWEROFF); 379 kmsg_dump(KMSG_DUMP_POWEROFF);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 25cc41cd8f33..62cbc8877fef 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt);
46cond_syscall(compat_sys_getsockopt); 46cond_syscall(compat_sys_getsockopt);
47cond_syscall(sys_shutdown); 47cond_syscall(sys_shutdown);
48cond_syscall(sys_sendmsg); 48cond_syscall(sys_sendmsg);
49cond_syscall(sys_sendmmsg);
49cond_syscall(compat_sys_sendmsg); 50cond_syscall(compat_sys_sendmsg);
51cond_syscall(compat_sys_sendmmsg);
50cond_syscall(sys_recvmsg); 52cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg); 53cond_syscall(sys_recvmmsg);
52cond_syscall(compat_sys_recvmsg); 54cond_syscall(compat_sys_recvmsg);
@@ -69,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait);
69cond_syscall(sys_semget); 71cond_syscall(sys_semget);
70cond_syscall(sys_semop); 72cond_syscall(sys_semop);
71cond_syscall(sys_semtimedop); 73cond_syscall(sys_semtimedop);
74cond_syscall(compat_sys_semtimedop);
72cond_syscall(sys_semctl); 75cond_syscall(sys_semctl);
76cond_syscall(compat_sys_semctl);
73cond_syscall(sys_msgget); 77cond_syscall(sys_msgget);
74cond_syscall(sys_msgsnd); 78cond_syscall(sys_msgsnd);
79cond_syscall(compat_sys_msgsnd);
75cond_syscall(sys_msgrcv); 80cond_syscall(sys_msgrcv);
81cond_syscall(compat_sys_msgrcv);
76cond_syscall(sys_msgctl); 82cond_syscall(sys_msgctl);
83cond_syscall(compat_sys_msgctl);
77cond_syscall(sys_shmget); 84cond_syscall(sys_shmget);
78cond_syscall(sys_shmat); 85cond_syscall(sys_shmat);
86cond_syscall(compat_sys_shmat);
79cond_syscall(sys_shmdt); 87cond_syscall(sys_shmdt);
80cond_syscall(sys_shmctl); 88cond_syscall(sys_shmctl);
89cond_syscall(compat_sys_shmctl);
81cond_syscall(sys_mq_open); 90cond_syscall(sys_mq_open);
82cond_syscall(sys_mq_unlink); 91cond_syscall(sys_mq_unlink);
83cond_syscall(sys_mq_timedsend); 92cond_syscall(sys_mq_timedsend);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c0bb32414b17..f175d98bd355 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -56,6 +56,7 @@
56#include <linux/kprobes.h> 56#include <linux/kprobes.h>
57#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
58#include <linux/oom.h> 58#include <linux/oom.h>
59#include <linux/kmod.h>
59 60
60#include <asm/uaccess.h> 61#include <asm/uaccess.h>
61#include <asm/processor.h> 62#include <asm/processor.h>
@@ -616,6 +617,11 @@ static struct ctl_table kern_table[] = {
616 .child = random_table, 617 .child = random_table,
617 }, 618 },
618 { 619 {
620 .procname = "usermodehelper",
621 .mode = 0555,
622 .child = usermodehelper_table,
623 },
624 {
619 .procname = "overflowuid", 625 .procname = "overflowuid",
620 .data = &overflowuid, 626 .data = &overflowuid,
621 .maxlen = sizeof(int), 627 .maxlen = sizeof(int),
@@ -730,14 +736,16 @@ static struct ctl_table kern_table[] = {
730 .data = &watchdog_enabled, 736 .data = &watchdog_enabled,
731 .maxlen = sizeof (int), 737 .maxlen = sizeof (int),
732 .mode = 0644, 738 .mode = 0644,
733 .proc_handler = proc_dowatchdog_enabled, 739 .proc_handler = proc_dowatchdog,
740 .extra1 = &zero,
741 .extra2 = &one,
734 }, 742 },
735 { 743 {
736 .procname = "watchdog_thresh", 744 .procname = "watchdog_thresh",
737 .data = &softlockup_thresh, 745 .data = &watchdog_thresh,
738 .maxlen = sizeof(int), 746 .maxlen = sizeof(int),
739 .mode = 0644, 747 .mode = 0644,
740 .proc_handler = proc_dowatchdog_thresh, 748 .proc_handler = proc_dowatchdog,
741 .extra1 = &neg_one, 749 .extra1 = &neg_one,
742 .extra2 = &sixty, 750 .extra2 = &sixty,
743 }, 751 },
@@ -755,7 +763,9 @@ static struct ctl_table kern_table[] = {
755 .data = &watchdog_enabled, 763 .data = &watchdog_enabled,
756 .maxlen = sizeof (int), 764 .maxlen = sizeof (int),
757 .mode = 0644, 765 .mode = 0644,
758 .proc_handler = proc_dowatchdog_enabled, 766 .proc_handler = proc_dowatchdog,
767 .extra1 = &zero,
768 .extra2 = &one,
759 }, 769 },
760#endif 770#endif
761#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 771#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
@@ -928,6 +938,12 @@ static struct ctl_table kern_table[] = {
928 }, 938 },
929#endif 939#endif
930#ifdef CONFIG_PERF_EVENTS 940#ifdef CONFIG_PERF_EVENTS
941 /*
942 * User-space scripts rely on the existence of this file
943 * as a feature check for perf_events being enabled.
944 *
945 * So it's an ABI, do not remove!
946 */
931 { 947 {
932 .procname = "perf_event_paranoid", 948 .procname = "perf_event_paranoid",
933 .data = &sysctl_perf_event_paranoid, 949 .data = &sysctl_perf_event_paranoid,
@@ -1496,7 +1512,7 @@ static struct ctl_table fs_table[] = {
1496 1512
1497static struct ctl_table debug_table[] = { 1513static struct ctl_table debug_table[] = {
1498#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ 1514#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1499 defined(CONFIG_S390) 1515 defined(CONFIG_S390) || defined(CONFIG_TILE)
1500 { 1516 {
1501 .procname = "exception-trace", 1517 .procname = "exception-trace",
1502 .data = &show_unhandled_signals, 1518 .data = &show_unhandled_signals,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 9ffea360a778..fc0f22005417 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -285,16 +285,18 @@ ret:
285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
286{ 286{
287 struct listener_list *listeners; 287 struct listener_list *listeners;
288 struct listener *s, *tmp; 288 struct listener *s, *tmp, *s2;
289 unsigned int cpu; 289 unsigned int cpu;
290 290
291 if (!cpumask_subset(mask, cpu_possible_mask)) 291 if (!cpumask_subset(mask, cpu_possible_mask))
292 return -EINVAL; 292 return -EINVAL;
293 293
294 s = NULL;
294 if (isadd == REGISTER) { 295 if (isadd == REGISTER) {
295 for_each_cpu(cpu, mask) { 296 for_each_cpu(cpu, mask) {
296 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 297 if (!s)
297 cpu_to_node(cpu)); 298 s = kmalloc_node(sizeof(struct listener),
299 GFP_KERNEL, cpu_to_node(cpu));
298 if (!s) 300 if (!s)
299 goto cleanup; 301 goto cleanup;
300 s->pid = pid; 302 s->pid = pid;
@@ -303,9 +305,16 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
303 305
304 listeners = &per_cpu(listener_array, cpu); 306 listeners = &per_cpu(listener_array, cpu);
305 down_write(&listeners->sem); 307 down_write(&listeners->sem);
308 list_for_each_entry_safe(s2, tmp, &listeners->list, list) {
309 if (s2->pid == pid)
310 goto next_cpu;
311 }
306 list_add(&s->list, &listeners->list); 312 list_add(&s->list, &listeners->list);
313 s = NULL;
314next_cpu:
307 up_write(&listeners->sem); 315 up_write(&listeners->sem);
308 } 316 }
317 kfree(s);
309 return 0; 318 return 0;
310 } 319 }
311 320
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index b0425991e9ac..e2fd74b8e8c2 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,5 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o 2obj-y += timeconv.o posix-clock.o alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
new file mode 100644
index 000000000000..59f369f98a04
--- /dev/null
+++ b/kernel/time/alarmtimer.c
@@ -0,0 +1,720 @@
1/*
2 * Alarmtimer interface
3 *
4 * This interface provides a timer which is similar to hrtimers,
5 * but triggers an RTC alarm if the box is suspended.
6 *
7 * This interface is influenced by the Android RTC Alarm timer
8 * interface.
9 *
10 * Copyright (C) 2010 IBM Corporation
11 *
12 * Author: John Stultz <john.stultz@linaro.org>
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License version 2 as
16 * published by the Free Software Foundation.
17 */
18#include <linux/time.h>
19#include <linux/hrtimer.h>
20#include <linux/timerqueue.h>
21#include <linux/rtc.h>
22#include <linux/alarmtimer.h>
23#include <linux/mutex.h>
24#include <linux/platform_device.h>
25#include <linux/posix-timers.h>
26#include <linux/workqueue.h>
27#include <linux/freezer.h>
28
29/**
30 * struct alarm_base - Alarm timer bases
31 * @lock: Lock for synchronized access to the base
32 * @timerqueue: Timerqueue head managing the list of events
33 * @timer: hrtimer used to schedule events while running
34 * @gettime: Function to read the time correlating to the base
35 * @base_clockid: clockid for the base
36 */
37static struct alarm_base {
38 spinlock_t lock;
39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void);
42 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE];
44
45/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
46static ktime_t freezer_delta;
47static DEFINE_SPINLOCK(freezer_delta_lock);
48
49#ifdef CONFIG_RTC_CLASS
50/* rtc timer and device for setting alarm wakeups at suspend */
51static struct rtc_timer rtctimer;
52static struct rtc_device *rtcdev;
53static DEFINE_SPINLOCK(rtcdev_lock);
54
55/**
56 * has_wakealarm - check rtc device has wakealarm ability
57 * @dev: current device
58 * @name_ptr: name to be returned
59 *
60 * This helper function checks to see if the rtc device can wake
61 * from suspend.
62 */
63static int has_wakealarm(struct device *dev, void *name_ptr)
64{
65 struct rtc_device *candidate = to_rtc_device(dev);
66
67 if (!candidate->ops->set_alarm)
68 return 0;
69 if (!device_may_wakeup(candidate->dev.parent))
70 return 0;
71
72 *(const char **)name_ptr = dev_name(dev);
73 return 1;
74}
75
76/**
77 * alarmtimer_get_rtcdev - Return selected rtcdevice
78 *
79 * This function returns the rtc device to use for wakealarms.
80 * If one has not already been chosen, it checks to see if a
81 * functional rtc device is available.
82 */
83static struct rtc_device *alarmtimer_get_rtcdev(void)
84{
85 struct device *dev;
86 char *str;
87 unsigned long flags;
88 struct rtc_device *ret;
89
90 spin_lock_irqsave(&rtcdev_lock, flags);
91 if (!rtcdev) {
92 /* Find an rtc device and init the rtc_timer */
93 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
94 /* If we have a device then str is valid. See has_wakealarm() */
95 if (dev) {
96 rtcdev = rtc_class_open(str);
97 /*
98 * Drop the reference we got in class_find_device,
99 * rtc_open takes its own.
100 */
101 put_device(dev);
102 rtc_timer_init(&rtctimer, NULL, NULL);
103 }
104 }
105 ret = rtcdev;
106 spin_unlock_irqrestore(&rtcdev_lock, flags);
107
108 return ret;
109}
110#else
111#define alarmtimer_get_rtcdev() (0)
112#define rtcdev (0)
113#endif
114
115
116/**
117 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
118 * @base: pointer to the base where the timer is being run
119 * @alarm: pointer to alarm being enqueued.
120 *
121 * Adds alarm to an alarm_base timerqueue and if necessary sets
122 * an hrtimer to run.
123 *
124 * Must hold base->lock when calling.
125 */
126static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
127{
128 timerqueue_add(&base->timerqueue, &alarm->node);
129 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
130 hrtimer_try_to_cancel(&base->timer);
131 hrtimer_start(&base->timer, alarm->node.expires,
132 HRTIMER_MODE_ABS);
133 }
134}
135
136/**
137 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
138 * @base: pointer to the base where the timer is running
139 * @alarm: pointer to alarm being removed
140 *
141 * Removes alarm from an alarm_base timerqueue and if necessary sets
142 * a new timer to run.
143 *
144 * Must hold base->lock when calling.
145 */
146static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
147{
148 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
149
150 timerqueue_del(&base->timerqueue, &alarm->node);
151 if (next == &alarm->node) {
152 hrtimer_try_to_cancel(&base->timer);
153 next = timerqueue_getnext(&base->timerqueue);
154 if (!next)
155 return;
156 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
157 }
158}
159
160
161/**
162 * alarmtimer_fired - Handles alarm hrtimer being fired.
163 * @timer: pointer to hrtimer being run
164 *
165 * When an alarm timer fires, this runs through the timerqueue to
166 * see which alarms expired, and runs those. If there are more alarm
167 * timers queued for the future, we set the hrtimer to fire when
168 * the next future alarm timer expires.
169 */
170static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
171{
172 struct alarm_base *base = container_of(timer, struct alarm_base, timer);
173 struct timerqueue_node *next;
174 unsigned long flags;
175 ktime_t now;
176 int ret = HRTIMER_NORESTART;
177
178 spin_lock_irqsave(&base->lock, flags);
179 now = base->gettime();
180 while ((next = timerqueue_getnext(&base->timerqueue))) {
181 struct alarm *alarm;
182 ktime_t expired = next->expires;
183
184 if (expired.tv64 >= now.tv64)
185 break;
186
187 alarm = container_of(next, struct alarm, node);
188
189 timerqueue_del(&base->timerqueue, &alarm->node);
190 alarm->enabled = 0;
191 /* Re-add periodic timers */
192 if (alarm->period.tv64) {
193 alarm->node.expires = ktime_add(expired, alarm->period);
194 timerqueue_add(&base->timerqueue, &alarm->node);
195 alarm->enabled = 1;
196 }
197 spin_unlock_irqrestore(&base->lock, flags);
198 if (alarm->function)
199 alarm->function(alarm);
200 spin_lock_irqsave(&base->lock, flags);
201 }
202
203 if (next) {
204 hrtimer_set_expires(&base->timer, next->expires);
205 ret = HRTIMER_RESTART;
206 }
207 spin_unlock_irqrestore(&base->lock, flags);
208
209 return ret;
210
211}
212
213#ifdef CONFIG_RTC_CLASS
214/**
215 * alarmtimer_suspend - Suspend time callback
216 * @dev: unused
217 * @state: unused
218 *
219 * When we are going into suspend, we look through the bases
220 * to see which is the soonest timer to expire. We then
221 * set an rtc timer to fire that far into the future, which
222 * will wake us from suspend.
223 */
224static int alarmtimer_suspend(struct device *dev)
225{
226 struct rtc_time tm;
227 ktime_t min, now;
228 unsigned long flags;
229 struct rtc_device *rtc;
230 int i;
231
232 spin_lock_irqsave(&freezer_delta_lock, flags);
233 min = freezer_delta;
234 freezer_delta = ktime_set(0, 0);
235 spin_unlock_irqrestore(&freezer_delta_lock, flags);
236
237 rtc = rtcdev;
238 /* If we have no rtcdev, just return */
239 if (!rtc)
240 return 0;
241
242 /* Find the soonest timer to expire */
243 for (i = 0; i < ALARM_NUMTYPE; i++) {
244 struct alarm_base *base = &alarm_bases[i];
245 struct timerqueue_node *next;
246 ktime_t delta;
247
248 spin_lock_irqsave(&base->lock, flags);
249 next = timerqueue_getnext(&base->timerqueue);
250 spin_unlock_irqrestore(&base->lock, flags);
251 if (!next)
252 continue;
253 delta = ktime_sub(next->expires, base->gettime());
254 if (!min.tv64 || (delta.tv64 < min.tv64))
255 min = delta;
256 }
257 if (min.tv64 == 0)
258 return 0;
259
260 /* XXX - Should we enforce a minimum sleep time? */
261 WARN_ON(min.tv64 < NSEC_PER_SEC);
262
263 /* Setup an rtc timer to fire that far in the future */
264 rtc_timer_cancel(rtc, &rtctimer);
265 rtc_read_time(rtc, &tm);
266 now = rtc_tm_to_ktime(tm);
267 now = ktime_add(now, min);
268
269 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
270
271 return 0;
272}
273#else
274static int alarmtimer_suspend(struct device *dev)
275{
276 return 0;
277}
278#endif
279
280static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
281{
282 ktime_t delta;
283 unsigned long flags;
284 struct alarm_base *base = &alarm_bases[type];
285
286 delta = ktime_sub(absexp, base->gettime());
287
288 spin_lock_irqsave(&freezer_delta_lock, flags);
289 if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
290 freezer_delta = delta;
291 spin_unlock_irqrestore(&freezer_delta_lock, flags);
292}
293
294
295/**
296 * alarm_init - Initialize an alarm structure
297 * @alarm: ptr to alarm to be initialized
298 * @type: the type of the alarm
299 * @function: callback that is run when the alarm fires
300 */
301void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
302 void (*function)(struct alarm *))
303{
304 timerqueue_init(&alarm->node);
305 alarm->period = ktime_set(0, 0);
306 alarm->function = function;
307 alarm->type = type;
308 alarm->enabled = 0;
309}
310
311/**
312 * alarm_start - Sets an alarm to fire
313 * @alarm: ptr to alarm to set
314 * @start: time to run the alarm
315 * @period: period at which the alarm will recur
316 */
317void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
318{
319 struct alarm_base *base = &alarm_bases[alarm->type];
320 unsigned long flags;
321
322 spin_lock_irqsave(&base->lock, flags);
323 if (alarm->enabled)
324 alarmtimer_remove(base, alarm);
325 alarm->node.expires = start;
326 alarm->period = period;
327 alarmtimer_enqueue(base, alarm);
328 alarm->enabled = 1;
329 spin_unlock_irqrestore(&base->lock, flags);
330}
331
332/**
333 * alarm_cancel - Tries to cancel an alarm timer
334 * @alarm: ptr to alarm to be canceled
335 */
336void alarm_cancel(struct alarm *alarm)
337{
338 struct alarm_base *base = &alarm_bases[alarm->type];
339 unsigned long flags;
340
341 spin_lock_irqsave(&base->lock, flags);
342 if (alarm->enabled)
343 alarmtimer_remove(base, alarm);
344 alarm->enabled = 0;
345 spin_unlock_irqrestore(&base->lock, flags);
346}
347
348
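
A hedged sketch of how a kernel user might arm one of these alarms; the callback and the 30-second timeout are invented, while alarm_init()/alarm_start() and ALARM_REALTIME come from the code above and ktime_get_real() is the usual ktime helper:

static struct alarm demo_alarm;

static void demo_alarm_fired(struct alarm *alarm)
{
	pr_info("demo alarm expired\n");
}

static void demo_arm_alarm(void)
{
	/* fire once, 30 seconds of wall-clock time from now */
	ktime_t expires = ktime_add(ktime_get_real(), ktime_set(30, 0));

	alarm_init(&demo_alarm, ALARM_REALTIME, demo_alarm_fired);
	alarm_start(&demo_alarm, expires, ktime_set(0, 0));	/* zero period: one-shot */
}
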
349/**
350 * clock2alarm - helper that converts from clockid to alarm types
351 * @clockid: clockid.
352 */
353static enum alarmtimer_type clock2alarm(clockid_t clockid)
354{
355 if (clockid == CLOCK_REALTIME_ALARM)
356 return ALARM_REALTIME;
357 if (clockid == CLOCK_BOOTTIME_ALARM)
358 return ALARM_BOOTTIME;
359 return -1;
360}
361
362/**
363 * alarm_handle_timer - Callback for posix timers
364 * @alarm: alarm that fired
365 *
366 * Posix timer callback for expired alarm timers.
367 */
368static void alarm_handle_timer(struct alarm *alarm)
369{
370 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
371 it.alarmtimer);
372 if (posix_timer_event(ptr, 0) != 0)
373 ptr->it_overrun++;
374}
375
376/**
377 * alarm_clock_getres - posix getres interface
378 * @which_clock: clockid
379 * @tp: timespec to fill
380 *
381 * Returns the granularity of underlying alarm base clock
382 */
383static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
384{
385 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
386
387 if (!alarmtimer_get_rtcdev())
388 return -ENOTSUPP;
389
390 return hrtimer_get_res(baseid, tp);
391}
392
393/**
394 * alarm_clock_get - posix clock_get interface
395 * @which_clock: clockid
396 * @tp: timespec to fill.
397 *
398 * Provides the underlying alarm base time.
399 */
400static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
401{
402 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
403
404 if (!alarmtimer_get_rtcdev())
405 return -ENOTSUPP;
406
407 *tp = ktime_to_timespec(base->gettime());
408 return 0;
409}
410
411/**
412 * alarm_timer_create - posix timer_create interface
413 * @new_timer: k_itimer pointer to manage
414 *
415 * Initializes the k_itimer structure.
416 */
417static int alarm_timer_create(struct k_itimer *new_timer)
418{
419 enum alarmtimer_type type;
420 struct alarm_base *base;
421
422 if (!alarmtimer_get_rtcdev())
423 return -ENOTSUPP;
424
425 if (!capable(CAP_WAKE_ALARM))
426 return -EPERM;
427
428 type = clock2alarm(new_timer->it_clock);
429 base = &alarm_bases[type];
430 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
431 return 0;
432}
433
434/**
435 * alarm_timer_get - posix timer_get interface
436 * @timr: k_itimer pointer
437 * @cur_setting: itimerspec data to fill
438 *
439 * Copies the itimerspec data out from the k_itimer
440 */
441static void alarm_timer_get(struct k_itimer *timr,
442 struct itimerspec *cur_setting)
443{
444 cur_setting->it_interval =
445 ktime_to_timespec(timr->it.alarmtimer.period);
446 cur_setting->it_value =
447 ktime_to_timespec(timr->it.alarmtimer.node.expires);
448 return;
449}
450
451/**
452 * alarm_timer_del - posix timer_del interface
453 * @timr: k_itimer pointer to be deleted
454 *
455 * Cancels any programmed alarms for the given timer.
456 */
457static int alarm_timer_del(struct k_itimer *timr)
458{
459 if (!rtcdev)
460 return -ENOTSUPP;
461
462 alarm_cancel(&timr->it.alarmtimer);
463 return 0;
464}
465
466/**
467 * alarm_timer_set - posix timer_set interface
468 * @timr: k_itimer pointer to be set
469 * @flags: timer flags
470 * @new_setting: itimerspec to be used
471 * @old_setting: itimerspec being replaced
472 *
473 * Sets the timer to new_setting, and starts the timer.
474 */
475static int alarm_timer_set(struct k_itimer *timr, int flags,
476 struct itimerspec *new_setting,
477 struct itimerspec *old_setting)
478{
479 if (!rtcdev)
480 return -ENOTSUPP;
481
482 /* Save old values */
483 old_setting->it_interval =
484 ktime_to_timespec(timr->it.alarmtimer.period);
485 old_setting->it_value =
486 ktime_to_timespec(timr->it.alarmtimer.node.expires);
487
488 /* If the timer was already set, cancel it */
489 alarm_cancel(&timr->it.alarmtimer);
490
491 /* start the timer */
492 alarm_start(&timr->it.alarmtimer,
493 timespec_to_ktime(new_setting->it_value),
494 timespec_to_ktime(new_setting->it_interval));
495 return 0;
496}
497
498/**
499 * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
500 * @alarm: ptr to alarm that fired
501 *
502 * Wakes up the task that set the alarmtimer
503 */
504static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
505{
506 struct task_struct *task = (struct task_struct *)alarm->data;
507
508 alarm->data = NULL;
509 if (task)
510 wake_up_process(task);
511}
512
513/**
514 * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
515 * @alarm: ptr to alarmtimer
516 * @absexp: absolute expiration time
517 *
518 * Sets the alarm timer and sleeps until it is fired or interrupted.
519 */
520static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
521{
522 alarm->data = (void *)current;
523 do {
524 set_current_state(TASK_INTERRUPTIBLE);
525 alarm_start(alarm, absexp, ktime_set(0, 0));
526 if (likely(alarm->data))
527 schedule();
528
529 alarm_cancel(alarm);
530 } while (alarm->data && !signal_pending(current));
531
532 __set_current_state(TASK_RUNNING);
533
534 return (alarm->data == NULL);
535}
536
537
538/**
539 * update_rmtp - Update remaining timespec value
540 * @exp: expiration time
541 * @type: timer type
542 * @rmtp: user pointer to remaining timespec value
543 *
544 * Helper function that fills in rmtp value with time between
545 * now and the exp value
546 */
547static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
548 struct timespec __user *rmtp)
549{
550 struct timespec rmt;
551 ktime_t rem;
552
553 rem = ktime_sub(exp, alarm_bases[type].gettime());
554
555 if (rem.tv64 <= 0)
556 return 0;
557 rmt = ktime_to_timespec(rem);
558
559 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
560 return -EFAULT;
561
562 return 1;
563
564}
565
566/**
567 * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
568 * @restart: ptr to restart block
569 *
570 * Handles restarted clock_nanosleep calls
571 */
572static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
573{
574 enum alarmtimer_type type = restart->nanosleep.clockid;
575 ktime_t exp;
576 struct timespec __user *rmtp;
577 struct alarm alarm;
578 int ret = 0;
579
580 exp.tv64 = restart->nanosleep.expires;
581 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
582
583 if (alarmtimer_do_nsleep(&alarm, exp))
584 goto out;
585
586 if (freezing(current))
587 alarmtimer_freezerset(exp, type);
588
589 rmtp = restart->nanosleep.rmtp;
590 if (rmtp) {
591 ret = update_rmtp(exp, type, rmtp);
592 if (ret <= 0)
593 goto out;
594 }
595
596
597 /* The other values in restart are already filled in */
598 ret = -ERESTART_RESTARTBLOCK;
599out:
600 return ret;
601}
602
603/**
604 * alarm_timer_nsleep - alarmtimer nanosleep
605 * @which_clock: clockid
606 * @flags: determines abstime or relative
607 * @tsreq: requested sleep time (abs or rel)
608 * @rmtp: remaining sleep time saved
609 *
610 * Handles clock_nanosleep calls against _ALARM clockids
611 */
612static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
613 struct timespec *tsreq, struct timespec __user *rmtp)
614{
615 enum alarmtimer_type type = clock2alarm(which_clock);
616 struct alarm alarm;
617 ktime_t exp;
618 int ret = 0;
619 struct restart_block *restart;
620
621 if (!alarmtimer_get_rtcdev())
622 return -ENOTSUPP;
623
624 if (!capable(CAP_WAKE_ALARM))
625 return -EPERM;
626
627 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
628
629 exp = timespec_to_ktime(*tsreq);
630 /* Convert (if necessary) to absolute time */
631 if (flags != TIMER_ABSTIME) {
632 ktime_t now = alarm_bases[type].gettime();
633 exp = ktime_add(now, exp);
634 }
635
636 if (alarmtimer_do_nsleep(&alarm, exp))
637 goto out;
638
639 if (freezing(current))
640 alarmtimer_freezerset(exp, type);
641
642 /* abs timers don't set remaining time or restart */
643 if (flags == TIMER_ABSTIME) {
644 ret = -ERESTARTNOHAND;
645 goto out;
646 }
647
648 if (rmtp) {
649 ret = update_rmtp(exp, type, rmtp);
650 if (ret <= 0)
651 goto out;
652 }
653
654 restart = &current_thread_info()->restart_block;
655 restart->fn = alarm_timer_nsleep_restart;
656 restart->nanosleep.clockid = type;
657 restart->nanosleep.expires = exp.tv64;
658 restart->nanosleep.rmtp = rmtp;
659 ret = -ERESTART_RESTARTBLOCK;
660
661out:
662 return ret;
663}
664
665
666/* Suspend hook structures */
667static const struct dev_pm_ops alarmtimer_pm_ops = {
668 .suspend = alarmtimer_suspend,
669};
670
671static struct platform_driver alarmtimer_driver = {
672 .driver = {
673 .name = "alarmtimer",
674 .pm = &alarmtimer_pm_ops,
675 }
676};
677
678/**
679 * alarmtimer_init - Initialize alarm timer code
680 *
681 * This function initializes the alarm bases and registers
682 * the posix clock ids.
683 */
684static int __init alarmtimer_init(void)
685{
686 int error = 0;
687 int i;
688 struct k_clock alarm_clock = {
689 .clock_getres = alarm_clock_getres,
690 .clock_get = alarm_clock_get,
691 .timer_create = alarm_timer_create,
692 .timer_set = alarm_timer_set,
693 .timer_del = alarm_timer_del,
694 .timer_get = alarm_timer_get,
695 .nsleep = alarm_timer_nsleep,
696 };
697
698 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
699 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
700
701 /* Initialize alarm bases */
702 alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
703 alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
704 alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
705 alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
706 for (i = 0; i < ALARM_NUMTYPE; i++) {
707 timerqueue_init_head(&alarm_bases[i].timerqueue);
708 spin_lock_init(&alarm_bases[i].lock);
709 hrtimer_init(&alarm_bases[i].timer,
710 alarm_bases[i].base_clockid,
711 HRTIMER_MODE_ABS);
712 alarm_bases[i].timer.function = alarmtimer_fired;
713 }
714 error = platform_driver_register(&alarmtimer_driver);
715 platform_device_register_simple("alarmtimer", -1, NULL, 0);
716
717 return error;
718}
719device_initcall(alarmtimer_init);
720
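For context: once the two _ALARM clockids are registered above, they are driven from userspace through the ordinary posix timer syscalls. A minimal, hedged sketch (illustrative only; it assumes CAP_WAKE_ALARM, a wakeup-capable RTC, and linking with -lrt on older glibc; the clockid constant comes from the kernel uapi headers and may need a manual define):

#include <signal.h>
#include <time.h>
#include <unistd.h>

#ifndef CLOCK_REALTIME_ALARM
#define CLOCK_REALTIME_ALARM 8		/* value from the kernel uapi headers */
#endif

static void on_alarm(int sig) { (void)sig; }

int main(void)
{
	timer_t tid;
	struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
				.sigev_signo  = SIGALRM };
	struct itimerspec its = { .it_value = { .tv_sec = 60 } };

	signal(SIGALRM, on_alarm);
	/* -EPERM without CAP_WAKE_ALARM; fails if no usable RTC is registered */
	if (timer_create(CLOCK_REALTIME_ALARM, &sev, &tid) != 0)
		return 1;
	if (timer_settime(tid, 0, &its, NULL) != 0)
		return 1;
	pause();	/* the alarm can pull the system out of suspend */
	return 0;
}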
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 0d74b9ba90c8..e4c699dfa4e8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev)
182 unsigned long flags; 182 unsigned long flags;
183 183
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 185 if (!dev->cpumask) {
186 WARN_ON(num_possible_cpus() > 1);
187 dev->cpumask = cpumask_of(smp_processor_id());
188 }
186 189
187 raw_spin_lock_irqsave(&clockevents_lock, flags); 190 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 191
@@ -194,6 +197,70 @@ void clockevents_register_device(struct clock_event_device *dev)
194} 197}
195EXPORT_SYMBOL_GPL(clockevents_register_device); 198EXPORT_SYMBOL_GPL(clockevents_register_device);
196 199
200static void clockevents_config(struct clock_event_device *dev,
201 u32 freq)
202{
203 u64 sec;
204
205 if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
206 return;
207
208 /*
209 * Calculate the maximum number of seconds we can sleep. Limit
210 * to 10 minutes for hardware which can program more than
211 * 32bit ticks so we still get reasonable conversion values.
212 */
213 sec = dev->max_delta_ticks;
214 do_div(sec, freq);
215 if (!sec)
216 sec = 1;
217 else if (sec > 600 && dev->max_delta_ticks > UINT_MAX)
218 sec = 600;
219
220 clockevents_calc_mult_shift(dev, freq, sec);
221 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
222 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
223}
224
225/**
226 * clockevents_config_and_register - Configure and register a clock event device
227 * @dev: device to register
228 * @freq: The clock frequency
229 * @min_delta: The minimum clock ticks to program in oneshot mode
230 * @max_delta: The maximum clock ticks to program in oneshot mode
231 *
232 * min/max_delta can be 0 for devices which do not support oneshot mode.
233 */
234void clockevents_config_and_register(struct clock_event_device *dev,
235 u32 freq, unsigned long min_delta,
236 unsigned long max_delta)
237{
238 dev->min_delta_ticks = min_delta;
239 dev->max_delta_ticks = max_delta;
240 clockevents_config(dev, freq);
241 clockevents_register_device(dev);
242}
243
244/**
245 * clockevents_update_freq - Update frequency and reprogram a clock event device.
246 * @dev: device to modify
247 * @freq: new device frequency
248 *
249 * Reconfigure and reprogram a clock event device in oneshot
250 * mode. Must be called on the cpu for which the device delivers per
251 * cpu timer events with interrupts disabled! Returns 0 on success,
252 * -ETIME when the event is in the past.
253 */
254int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
255{
256 clockevents_config(dev, freq);
257
258 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
259 return 0;
260
261 return clockevents_program_event(dev, dev->next_event, ktime_get());
262}
263
197/* 264/*
198 * Noop handler when we shut down an event device 265 * Noop handler when we shut down an event device
199 */ 266 */
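A hedged caller sketch for the new helper above, with illustrative numbers and a hypothetical device (the matching declaration is assumed to live in <linux/clockchips.h>):

#include <linux/clockchips.h>

/*
 * Illustrative only: a 13.5 MHz per-cpu timer with a 32bit counter.
 * clockevents_config() derives sec = 0xffffffff / 13500000 ~= 318,
 * computes a mult/shift pair good for that range, and converts the
 * 1..0xffffffff tick limits into min/max_delta_ns before registering.
 */
static void example_timer_register(struct clock_event_device *evt)
{
	evt->features |= CLOCK_EVT_FEAT_ONESHOT;
	clockevents_config_and_register(evt, 13500000, 1, 0xffffffff);
}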
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 0e17c10f8a9d..e0980f0d9a0a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -185,7 +185,6 @@ static struct clocksource *watchdog;
185static struct timer_list watchdog_timer; 185static struct timer_list watchdog_timer;
186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); 186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
187static DEFINE_SPINLOCK(watchdog_lock); 187static DEFINE_SPINLOCK(watchdog_lock);
188static cycle_t watchdog_last;
189static int watchdog_running; 188static int watchdog_running;
190 189
191static int clocksource_watchdog_kthread(void *data); 190static int clocksource_watchdog_kthread(void *data);
@@ -254,11 +253,6 @@ static void clocksource_watchdog(unsigned long data)
254 if (!watchdog_running) 253 if (!watchdog_running)
255 goto out; 254 goto out;
256 255
257 wdnow = watchdog->read(watchdog);
258 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
259 watchdog->mult, watchdog->shift);
260 watchdog_last = wdnow;
261
262 list_for_each_entry(cs, &watchdog_list, wd_list) { 256 list_for_each_entry(cs, &watchdog_list, wd_list) {
263 257
264 /* Clocksource already marked unstable? */ 258 /* Clocksource already marked unstable? */
@@ -268,19 +262,28 @@ static void clocksource_watchdog(unsigned long data)
268 continue; 262 continue;
269 } 263 }
270 264
265 local_irq_disable();
271 csnow = cs->read(cs); 266 csnow = cs->read(cs);
267 wdnow = watchdog->read(watchdog);
268 local_irq_enable();
272 269
273 /* Clocksource initialized ? */ 270 /* Clocksource initialized ? */
274 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 271 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
275 cs->flags |= CLOCK_SOURCE_WATCHDOG; 272 cs->flags |= CLOCK_SOURCE_WATCHDOG;
276 cs->wd_last = csnow; 273 cs->wd_last = wdnow;
274 cs->cs_last = csnow;
277 continue; 275 continue;
278 } 276 }
279 277
280 /* Check the deviation from the watchdog clocksource. */ 278 wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
281 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & 279 watchdog->mult, watchdog->shift);
280
281 cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
282 cs->mask, cs->mult, cs->shift); 282 cs->mask, cs->mult, cs->shift);
283 cs->wd_last = csnow; 283 cs->cs_last = csnow;
284 cs->wd_last = wdnow;
285
286 /* Check the deviation from the watchdog clocksource. */
284 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { 287 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
285 clocksource_unstable(cs, cs_nsec - wd_nsec); 288 clocksource_unstable(cs, cs_nsec - wd_nsec);
286 continue; 289 continue;
@@ -318,7 +321,6 @@ static inline void clocksource_start_watchdog(void)
318 return; 321 return;
319 init_timer(&watchdog_timer); 322 init_timer(&watchdog_timer);
320 watchdog_timer.function = clocksource_watchdog; 323 watchdog_timer.function = clocksource_watchdog;
321 watchdog_last = watchdog->read(watchdog);
322 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 324 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
323 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); 325 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
324 watchdog_running = 1; 326 watchdog_running = 1;
@@ -626,19 +628,6 @@ static void clocksource_enqueue(struct clocksource *cs)
626 list_add(&cs->list, entry); 628 list_add(&cs->list, entry);
627} 629}
628 630
629
630/*
631 * Maximum time we expect to go between ticks. This includes idle
632 * tickless time. It provides the trade off between selecting a
633 * mult/shift pair that is very precise but can only handle a short
634 * period of time, vs. a mult/shift pair that can handle long periods
635 * of time but isn't as precise.
636 *
637 * This is a subsystem constant, and actual hardware limitations
638 * may override it (ie: clocksources that wrap every 3 seconds).
639 */
640#define MAX_UPDATE_LENGTH 5 /* Seconds */
641
642/** 631/**
643 * __clocksource_updatefreq_scale - Used update clocksource with new freq 632 * __clocksource_updatefreq_scale - Used update clocksource with new freq
644 * @t: clocksource to be registered 633 * @t: clocksource to be registered
@@ -652,15 +641,28 @@ static void clocksource_enqueue(struct clocksource *cs)
652 */ 641 */
653void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 642void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
654{ 643{
644 u64 sec;
645
655 /* 646 /*
656 * Ideally we want to use some of the limits used in 647 * Calc the maximum number of seconds which we can run before
657 * clocksource_max_deferment, to provide a more informed 648 * wrapping around. For clocksources which have a mask > 32bit
658 * MAX_UPDATE_LENGTH. But for now this just gets the 649 * we need to limit the max sleep time to have a good
659 * register interface working properly. 650 * conversion precision. 10 minutes is still a reasonable
651 * amount. That results in a shift value of 24 for a
652 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
653 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
654 * margin as we do in clocksource_max_deferment()
660 */ 655 */
656 sec = (cs->mask - (cs->mask >> 5));
657 do_div(sec, freq);
658 do_div(sec, scale);
659 if (!sec)
660 sec = 1;
661 else if (sec > 600 && cs->mask > UINT_MAX)
662 sec = 600;
663
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 664 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale, 665 NSEC_PER_SEC / scale, sec * scale);
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs); 666 cs->max_idle_ns = clocksource_max_deferment(cs);
665} 667}
666EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 668EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
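A hedged sketch of the clamp above with made-up numbers: a hypothetical 56bit counter running at 19.2 MHz, registered with scale = 1 (frequency given in Hz).

#include <linux/clocksource.h>

/*
 * sec = (mask - mask/32) / 19200000 is roughly 3.6e9 seconds here, far
 * over the 600s cap that applies once the mask is wider than 32bit, so
 * clocks_calc_mult_shift() is asked for a 600s range and gets to spend
 * the headroom on mult/shift precision instead.
 */
static void example_updatefreq(struct clocksource *cs)
{
	cs->mask = CLOCKSOURCE_MASK(56);
	__clocksource_updatefreq_scale(cs, 1, 19200000);
}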
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 723c7637e55a..c7218d132738 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -456,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason)
456 unsigned long flags; 456 unsigned long flags;
457 int cpu; 457 int cpu;
458 458
459 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
460
461 /* 459 /*
462 * Periodic mode does not care about the enter/exit of power 460 * Periodic mode does not care about the enter/exit of power
463 * states 461 * states
464 */ 462 */
465 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 463 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
466 goto out; 464 return;
467 465
468 bc = tick_broadcast_device.evtdev; 466 /*
467 * We are called with preemption disabled from the depth of the
468 * idle code, so we can't be moved away.
469 */
469 cpu = smp_processor_id(); 470 cpu = smp_processor_id();
470 td = &per_cpu(tick_cpu_device, cpu); 471 td = &per_cpu(tick_cpu_device, cpu);
471 dev = td->evtdev; 472 dev = td->evtdev;
472 473
473 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) 474 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
474 goto out; 475 return;
476
477 bc = tick_broadcast_device.evtdev;
475 478
479 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
476 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 480 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
477 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 481 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
478 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); 482 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
@@ -489,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
489 tick_program_event(dev->next_event, 1); 493 tick_program_event(dev->next_event, 1);
490 } 494 }
491 } 495 }
492
493out:
494 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 496 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
495} 497}
496 498
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8ad5d576755e..342408cf68dd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -596,6 +596,58 @@ void __init timekeeping_init(void)
596static struct timespec timekeeping_suspend_time; 596static struct timespec timekeeping_suspend_time;
597 597
598/** 598/**
599 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
600 * @delta: pointer to a timespec delta value
601 *
602 * Takes a timespec offset measuring a suspend interval and properly
603 * adds the sleep offset to the timekeeping variables.
604 */
605static void __timekeeping_inject_sleeptime(struct timespec *delta)
606{
607 xtime = timespec_add(xtime, *delta);
608 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
609 total_sleep_time = timespec_add(total_sleep_time, *delta);
610}
611
612
613/**
614 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
615 * @delta: pointer to a timespec delta value
616 *
617 * This hook is for architectures that cannot support read_persistent_clock
618 * because their RTC/persistent clock is only accessible when irqs are enabled.
619 *
620 * This function should only be called by rtc_resume(), and allows
621 * a suspend offset to be injected into the timekeeping values.
622 */
623void timekeeping_inject_sleeptime(struct timespec *delta)
624{
625 unsigned long flags;
626 struct timespec ts;
627
628 /* Make sure we don't set the clock twice */
629 read_persistent_clock(&ts);
630 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
631 return;
632
633 write_seqlock_irqsave(&xtime_lock, flags);
634 timekeeping_forward_now();
635
636 __timekeeping_inject_sleeptime(delta);
637
638 timekeeper.ntp_error = 0;
639 ntp_clear();
640 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
641 timekeeper.mult);
642
643 write_sequnlock_irqrestore(&xtime_lock, flags);
644
645 /* signal hrtimers about time change */
646 clock_was_set();
647}
648
649
650/**
599 * timekeeping_resume - Resumes the generic timekeeping subsystem. 651 * timekeeping_resume - Resumes the generic timekeeping subsystem.
600 * 652 *
601 * This is for the generic clocksource timekeeping. 653 * This is for the generic clocksource timekeeping.
@@ -615,9 +667,7 @@ static void timekeeping_resume(void)
615 667
616 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 668 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
617 ts = timespec_sub(ts, timekeeping_suspend_time); 669 ts = timespec_sub(ts, timekeeping_suspend_time);
618 xtime = timespec_add(xtime, ts); 670 __timekeeping_inject_sleeptime(&ts);
619 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
620 total_sleep_time = timespec_add(total_sleep_time, ts);
621 } 671 }
622 /* re-base the last cycle value */ 672 /* re-base the last cycle value */
623 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 673 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -630,7 +680,7 @@ static void timekeeping_resume(void)
630 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); 680 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
631 681
632 /* Resume hrtimers */ 682 /* Resume hrtimers */
633 hres_timers_resume(); 683 hrtimers_resume();
634} 684}
635 685
636static int timekeeping_suspend(void) 686static int timekeeping_suspend(void)
@@ -1049,6 +1099,21 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1049} 1099}
1050 1100
1051/** 1101/**
1102 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1103 */
1104ktime_t ktime_get_monotonic_offset(void)
1105{
1106 unsigned long seq;
1107 struct timespec wtom;
1108
1109 do {
1110 seq = read_seqbegin(&xtime_lock);
1111 wtom = wall_to_monotonic;
1112 } while (read_seqretry(&xtime_lock, seq));
1113 return timespec_to_ktime(wtom);
1114}
1115
1116/**
1052 * xtime_update() - advances the timekeeping infrastructure 1117 * xtime_update() - advances the timekeeping infrastructure
1053 * @ticks: number of ticks, that have elapsed since the last call. 1118 * @ticks: number of ticks, that have elapsed since the last call.
1054 * 1119 *
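A hedged sketch of the intended caller of timekeeping_inject_sleeptime() above (the real user is rtc_resume() in the RTC core; the function and variable names here are illustrative, and the declaration is assumed to be exported via <linux/time.h>):

#include <linux/time.h>

/*
 * An RTC driver's resume path that measured 'secs' seconds of suspend
 * from its hardware hands that interval to the timekeeping core. The
 * call bails out early on platforms where read_persistent_clock()
 * already reports the sleep time, so the interval is not counted twice.
 */
static void example_account_sleep(long secs)
{
	struct timespec delta = { .tv_sec = secs, .tv_nsec = 0 };

	timekeeping_inject_sleeptime(&delta);
}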
diff --git a/kernel/timer.c b/kernel/timer.c
index fd6198692b57..8cff36119e4d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749 unsigned long expires_limit, mask; 749 unsigned long expires_limit, mask;
750 int bit; 750 int bit;
751 751
752 expires_limit = expires;
753
754 if (timer->slack >= 0) { 752 if (timer->slack >= 0) {
755 expires_limit = expires + timer->slack; 753 expires_limit = expires + timer->slack;
756 } else { 754 } else {
757 unsigned long now = jiffies; 755 long delta = expires - jiffies;
756
757 if (delta < 256)
758 return expires;
758 759
759 /* No slack, if already expired else auto slack 0.4% */ 760 expires_limit = expires + delta / 256;
760 if (time_after(expires, now))
761 expires_limit = expires + (expires - now)/256;
762 } 761 }
763 mask = expires ^ expires_limit; 762 mask = expires ^ expires_limit;
764 if (mask == 0) 763 if (mask == 0)
@@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
795 */ 794 */
796int mod_timer(struct timer_list *timer, unsigned long expires) 795int mod_timer(struct timer_list *timer, unsigned long expires)
797{ 796{
797 expires = apply_slack(timer, expires);
798
798 /* 799 /*
799 * This is a common optimization triggered by the 800 * This is a common optimization triggered by the
800 * networking code - if the timer is re-modified 801 * networking code - if the timer is re-modified
@@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
803 if (timer_pending(timer) && timer->expires == expires) 804 if (timer_pending(timer) && timer->expires == expires)
804 return 1; 805 return 1;
805 806
806 expires = apply_slack(timer, expires);
807
808 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 807 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
809} 808}
810EXPORT_SYMBOL(mod_timer); 809EXPORT_SYMBOL(mod_timer);
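Illustrative arithmetic for the reworked auto-slack path above (values made up): timers due in under 256 jiffies are now returned untouched, longer ones get roughly 0.4% of their remaining delta as a coalescing window.

static unsigned long slack_example(unsigned long now)
{
	unsigned long expires = now + 10000;	/* 10000 jiffies out */
	long delta = expires - now;

	if (delta < 256)
		return expires;			/* short timer: no slack */
	return expires + delta / 256;		/* now + 10039: ~0.4% window */
}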
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ee24fa1935ac..908038f57440 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -39,20 +39,26 @@
39#include "trace_stat.h" 39#include "trace_stat.h"
40 40
41#define FTRACE_WARN_ON(cond) \ 41#define FTRACE_WARN_ON(cond) \
42 do { \ 42 ({ \
43 if (WARN_ON(cond)) \ 43 int ___r = cond; \
44 if (WARN_ON(___r)) \
44 ftrace_kill(); \ 45 ftrace_kill(); \
45 } while (0) 46 ___r; \
47 })
46 48
47#define FTRACE_WARN_ON_ONCE(cond) \ 49#define FTRACE_WARN_ON_ONCE(cond) \
48 do { \ 50 ({ \
49 if (WARN_ON_ONCE(cond)) \ 51 int ___r = cond; \
52 if (WARN_ON_ONCE(___r)) \
50 ftrace_kill(); \ 53 ftrace_kill(); \
51 } while (0) 54 ___r; \
55 })
52 56
53/* hash bits for specific function selection */ 57/* hash bits for specific function selection */
54#define FTRACE_HASH_BITS 7 58#define FTRACE_HASH_BITS 7
55#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) 59#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
60#define FTRACE_HASH_DEFAULT_BITS 10
61#define FTRACE_HASH_MAX_BITS 12
56 62
57/* ftrace_enabled is a method to turn ftrace on or off */ 63/* ftrace_enabled is a method to turn ftrace on or off */
58int ftrace_enabled __read_mostly; 64int ftrace_enabled __read_mostly;
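The switch from do { } while (0) to a GCC statement expression lets callers branch on the condition while ftrace is still killed on failure; the hash code later in this patch uses exactly that pattern:

	/* usable standalone or inside an if(), as in __ftrace_hash_rec_update() */
	if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
		return;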
@@ -81,28 +87,40 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
81 .func = ftrace_stub, 87 .func = ftrace_stub,
82}; 88};
83 89
84static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 90static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
91static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
85ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 93ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 94ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
95static struct ftrace_ops global_ops;
96
97static void
98ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
88 99
89/* 100/*
90 * Traverse the ftrace_list, invoking all entries. The reason that we 101 * Traverse the ftrace_global_list, invoking all entries. The reason that we
91 * can use rcu_dereference_raw() is that elements removed from this list 102 * can use rcu_dereference_raw() is that elements removed from this list
92 * are simply leaked, so there is no need to interact with a grace-period 103 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle 104 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list. 105 * concurrent insertions into the ftrace_global_list.
95 * 106 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations! 107 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */ 108 */
98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 109static void ftrace_global_list_func(unsigned long ip,
110 unsigned long parent_ip)
99{ 111{
100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ 112 struct ftrace_ops *op;
101 113
114 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
115 return;
116
117 trace_recursion_set(TRACE_GLOBAL_BIT);
118 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
102 while (op != &ftrace_list_end) { 119 while (op != &ftrace_list_end) {
103 op->func(ip, parent_ip); 120 op->func(ip, parent_ip);
104 op = rcu_dereference_raw(op->next); /*see above*/ 121 op = rcu_dereference_raw(op->next); /*see above*/
105 }; 122 };
123 trace_recursion_clear(TRACE_GLOBAL_BIT);
106} 124}
107 125
108static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) 126static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
@@ -147,46 +165,69 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
147} 165}
148#endif 166#endif
149 167
150static int __register_ftrace_function(struct ftrace_ops *ops) 168static void update_global_ops(void)
151{ 169{
152 ops->next = ftrace_list; 170 ftrace_func_t func;
171
153 /* 172 /*
154 * We are entering ops into the ftrace_list but another 173 * If there's only one function registered, then call that
155 * CPU might be walking that list. We need to make sure 174 * function directly. Otherwise, we need to iterate over the
156 * the ops->next pointer is valid before another CPU sees 175 * registered callers.
157 * the ops pointer included into the ftrace_list.
158 */ 176 */
159 rcu_assign_pointer(ftrace_list, ops); 177 if (ftrace_global_list == &ftrace_list_end ||
178 ftrace_global_list->next == &ftrace_list_end)
179 func = ftrace_global_list->func;
180 else
181 func = ftrace_global_list_func;
160 182
161 if (ftrace_enabled) { 183 /* If we filter on pids, update to use the pid function */
162 ftrace_func_t func; 184 if (!list_empty(&ftrace_pids)) {
185 set_ftrace_pid_function(func);
186 func = ftrace_pid_func;
187 }
163 188
164 if (ops->next == &ftrace_list_end) 189 global_ops.func = func;
165 func = ops->func; 190}
166 else
167 func = ftrace_list_func;
168 191
169 if (!list_empty(&ftrace_pids)) { 192static void update_ftrace_function(void)
170 set_ftrace_pid_function(func); 193{
171 func = ftrace_pid_func; 194 ftrace_func_t func;
172 } 195
196 update_global_ops();
197
198 /*
199 * If we are at the end of the list and this ops is
200 * not dynamic, then have the mcount trampoline call
201 * the function directly
202 */
203 if (ftrace_ops_list == &ftrace_list_end ||
204 (ftrace_ops_list->next == &ftrace_list_end &&
205 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
206 func = ftrace_ops_list->func;
207 else
208 func = ftrace_ops_list_func;
173 209
174 /*
175 * For one func, simply call it directly.
176 * For more than one func, call the chain.
177 */
178#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
179 ftrace_trace_function = func; 211 ftrace_trace_function = func;
180#else 212#else
181 __ftrace_trace_function = func; 213 __ftrace_trace_function = func;
182 ftrace_trace_function = ftrace_test_stop_func; 214 ftrace_trace_function = ftrace_test_stop_func;
183#endif 215#endif
184 } 216}
185 217
186 return 0; 218static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
219{
220 ops->next = *list;
221 /*
222 * We are entering ops into the list but another
223 * CPU might be walking that list. We need to make sure
224 * the ops->next pointer is valid before another CPU sees
225 * the ops pointer included into the list.
226 */
227 rcu_assign_pointer(*list, ops);
187} 228}
188 229
189static int __unregister_ftrace_function(struct ftrace_ops *ops) 230static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
190{ 231{
191 struct ftrace_ops **p; 232 struct ftrace_ops **p;
192 233
@@ -194,13 +235,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
194 * If we are removing the last function, then simply point 235 * If we are removing the last function, then simply point
195 * to the ftrace_stub. 236 * to the ftrace_stub.
196 */ 237 */
197 if (ftrace_list == ops && ops->next == &ftrace_list_end) { 238 if (*list == ops && ops->next == &ftrace_list_end) {
198 ftrace_trace_function = ftrace_stub; 239 *list = &ftrace_list_end;
199 ftrace_list = &ftrace_list_end;
200 return 0; 240 return 0;
201 } 241 }
202 242
203 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) 243 for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
204 if (*p == ops) 244 if (*p == ops)
205 break; 245 break;
206 246
@@ -208,53 +248,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
208 return -1; 248 return -1;
209 249
210 *p = (*p)->next; 250 *p = (*p)->next;
251 return 0;
252}
211 253
212 if (ftrace_enabled) { 254static int __register_ftrace_function(struct ftrace_ops *ops)
213 /* If we only have one func left, then call that directly */ 255{
214 if (ftrace_list->next == &ftrace_list_end) { 256 if (ftrace_disabled)
215 ftrace_func_t func = ftrace_list->func; 257 return -ENODEV;
216 258
217 if (!list_empty(&ftrace_pids)) { 259 if (FTRACE_WARN_ON(ops == &global_ops))
218 set_ftrace_pid_function(func); 260 return -EINVAL;
219 func = ftrace_pid_func; 261
220 } 262 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
221#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 263 return -EBUSY;
222 ftrace_trace_function = func; 264
223#else 265 if (!core_kernel_data((unsigned long)ops))
224 __ftrace_trace_function = func; 266 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
225#endif 267
226 } 268 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
227 } 269 int first = ftrace_global_list == &ftrace_list_end;
270 add_ftrace_ops(&ftrace_global_list, ops);
271 ops->flags |= FTRACE_OPS_FL_ENABLED;
272 if (first)
273 add_ftrace_ops(&ftrace_ops_list, &global_ops);
274 } else
275 add_ftrace_ops(&ftrace_ops_list, ops);
276
277 if (ftrace_enabled)
278 update_ftrace_function();
228 279
229 return 0; 280 return 0;
230} 281}
231 282
232static void ftrace_update_pid_func(void) 283static int __unregister_ftrace_function(struct ftrace_ops *ops)
233{ 284{
234 ftrace_func_t func; 285 int ret;
235 286
236 if (ftrace_trace_function == ftrace_stub) 287 if (ftrace_disabled)
237 return; 288 return -ENODEV;
238 289
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 290 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
240 func = ftrace_trace_function; 291 return -EBUSY;
241#else
242 func = __ftrace_trace_function;
243#endif
244 292
245 if (!list_empty(&ftrace_pids)) { 293 if (FTRACE_WARN_ON(ops == &global_ops))
246 set_ftrace_pid_function(func); 294 return -EINVAL;
247 func = ftrace_pid_func;
248 } else {
249 if (func == ftrace_pid_func)
250 func = ftrace_pid_function;
251 }
252 295
253#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 296 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
254 ftrace_trace_function = func; 297 ret = remove_ftrace_ops(&ftrace_global_list, ops);
255#else 298 if (!ret && ftrace_global_list == &ftrace_list_end)
256 __ftrace_trace_function = func; 299 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
257#endif 300 if (!ret)
301 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
302 } else
303 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
304
305 if (ret < 0)
306 return ret;
307
308 if (ftrace_enabled)
309 update_ftrace_function();
310
311 /*
312 * Dynamic ops may be freed, we must make sure that all
313 * callers are done before leaving this function.
314 */
315 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
316 synchronize_sched();
317
318 return 0;
319}
320
321static void ftrace_update_pid_func(void)
322{
323 /* Only do something if we are tracing something */
324 if (ftrace_trace_function == ftrace_stub)
325 return;
326
327 update_ftrace_function();
258} 328}
259 329
260#ifdef CONFIG_FUNCTION_PROFILER 330#ifdef CONFIG_FUNCTION_PROFILER
@@ -888,8 +958,35 @@ enum {
888 FTRACE_START_FUNC_RET = (1 << 3), 958 FTRACE_START_FUNC_RET = (1 << 3),
889 FTRACE_STOP_FUNC_RET = (1 << 4), 959 FTRACE_STOP_FUNC_RET = (1 << 4),
890}; 960};
961struct ftrace_func_entry {
962 struct hlist_node hlist;
963 unsigned long ip;
964};
965
966struct ftrace_hash {
967 unsigned long size_bits;
968 struct hlist_head *buckets;
969 unsigned long count;
970 struct rcu_head rcu;
971};
972
973/*
974 * We make these constant because no one should touch them,
975 * but they are used as the default "empty hash", to avoid allocating
976 * it all the time. These are in a read only section such that if
977 * anyone does try to modify it, it will cause an exception.
978 */
979static const struct hlist_head empty_buckets[1];
980static const struct ftrace_hash empty_hash = {
981 .buckets = (struct hlist_head *)empty_buckets,
982};
983#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
891 984
892static int ftrace_filtered; 985static struct ftrace_ops global_ops = {
986 .func = ftrace_stub,
987 .notrace_hash = EMPTY_HASH,
988 .filter_hash = EMPTY_HASH,
989};
893 990
894static struct dyn_ftrace *ftrace_new_addrs; 991static struct dyn_ftrace *ftrace_new_addrs;
895 992
@@ -912,6 +1009,269 @@ static struct ftrace_page *ftrace_pages;
912 1009
913static struct dyn_ftrace *ftrace_free_records; 1010static struct dyn_ftrace *ftrace_free_records;
914 1011
1012static struct ftrace_func_entry *
1013ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1014{
1015 unsigned long key;
1016 struct ftrace_func_entry *entry;
1017 struct hlist_head *hhd;
1018 struct hlist_node *n;
1019
1020 if (!hash->count)
1021 return NULL;
1022
1023 if (hash->size_bits > 0)
1024 key = hash_long(ip, hash->size_bits);
1025 else
1026 key = 0;
1027
1028 hhd = &hash->buckets[key];
1029
1030 hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
1031 if (entry->ip == ip)
1032 return entry;
1033 }
1034 return NULL;
1035}
1036
1037static void __add_hash_entry(struct ftrace_hash *hash,
1038 struct ftrace_func_entry *entry)
1039{
1040 struct hlist_head *hhd;
1041 unsigned long key;
1042
1043 if (hash->size_bits)
1044 key = hash_long(entry->ip, hash->size_bits);
1045 else
1046 key = 0;
1047
1048 hhd = &hash->buckets[key];
1049 hlist_add_head(&entry->hlist, hhd);
1050 hash->count++;
1051}
1052
1053static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
1054{
1055 struct ftrace_func_entry *entry;
1056
1057 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1058 if (!entry)
1059 return -ENOMEM;
1060
1061 entry->ip = ip;
1062 __add_hash_entry(hash, entry);
1063
1064 return 0;
1065}
1066
1067static void
1068free_hash_entry(struct ftrace_hash *hash,
1069 struct ftrace_func_entry *entry)
1070{
1071 hlist_del(&entry->hlist);
1072 kfree(entry);
1073 hash->count--;
1074}
1075
1076static void
1077remove_hash_entry(struct ftrace_hash *hash,
1078 struct ftrace_func_entry *entry)
1079{
1080 hlist_del(&entry->hlist);
1081 hash->count--;
1082}
1083
1084static void ftrace_hash_clear(struct ftrace_hash *hash)
1085{
1086 struct hlist_head *hhd;
1087 struct hlist_node *tp, *tn;
1088 struct ftrace_func_entry *entry;
1089 int size = 1 << hash->size_bits;
1090 int i;
1091
1092 if (!hash->count)
1093 return;
1094
1095 for (i = 0; i < size; i++) {
1096 hhd = &hash->buckets[i];
1097 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
1098 free_hash_entry(hash, entry);
1099 }
1100 FTRACE_WARN_ON(hash->count);
1101}
1102
1103static void free_ftrace_hash(struct ftrace_hash *hash)
1104{
1105 if (!hash || hash == EMPTY_HASH)
1106 return;
1107 ftrace_hash_clear(hash);
1108 kfree(hash->buckets);
1109 kfree(hash);
1110}
1111
1112static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
1113{
1114 struct ftrace_hash *hash;
1115
1116 hash = container_of(rcu, struct ftrace_hash, rcu);
1117 free_ftrace_hash(hash);
1118}
1119
1120static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1121{
1122 if (!hash || hash == EMPTY_HASH)
1123 return;
1124 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1125}
1126
1127static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1128{
1129 struct ftrace_hash *hash;
1130 int size;
1131
1132 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
1133 if (!hash)
1134 return NULL;
1135
1136 size = 1 << size_bits;
1137 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
1138
1139 if (!hash->buckets) {
1140 kfree(hash);
1141 return NULL;
1142 }
1143
1144 hash->size_bits = size_bits;
1145
1146 return hash;
1147}
1148
1149static struct ftrace_hash *
1150alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1151{
1152 struct ftrace_func_entry *entry;
1153 struct ftrace_hash *new_hash;
1154 struct hlist_node *tp;
1155 int size;
1156 int ret;
1157 int i;
1158
1159 new_hash = alloc_ftrace_hash(size_bits);
1160 if (!new_hash)
1161 return NULL;
1162
1163 /* Empty hash? */
1164 if (!hash || !hash->count)
1165 return new_hash;
1166
1167 size = 1 << hash->size_bits;
1168 for (i = 0; i < size; i++) {
1169 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
1170 ret = add_hash_entry(new_hash, entry->ip);
1171 if (ret < 0)
1172 goto free_hash;
1173 }
1174 }
1175
1176 FTRACE_WARN_ON(new_hash->count != hash->count);
1177
1178 return new_hash;
1179
1180 free_hash:
1181 free_ftrace_hash(new_hash);
1182 return NULL;
1183}
1184
1185static int
1186ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1187{
1188 struct ftrace_func_entry *entry;
1189 struct hlist_node *tp, *tn;
1190 struct hlist_head *hhd;
1191 struct ftrace_hash *old_hash;
1192 struct ftrace_hash *new_hash;
1193 unsigned long key;
1194 int size = src->count;
1195 int bits = 0;
1196 int i;
1197
1198 /*
1199 * If the new source is empty, just free dst and assign it
1200 * the empty_hash.
1201 */
1202 if (!src->count) {
1203 free_ftrace_hash_rcu(*dst);
1204 rcu_assign_pointer(*dst, EMPTY_HASH);
1205 return 0;
1206 }
1207
1208 /*
1209 * Make the hash size about 1/2 the # found
1210 */
1211 for (size /= 2; size; size >>= 1)
1212 bits++;
1213
1214 /* Don't allocate too much */
1215 if (bits > FTRACE_HASH_MAX_BITS)
1216 bits = FTRACE_HASH_MAX_BITS;
1217
1218 new_hash = alloc_ftrace_hash(bits);
1219 if (!new_hash)
1220 return -ENOMEM;
1221
1222 size = 1 << src->size_bits;
1223 for (i = 0; i < size; i++) {
1224 hhd = &src->buckets[i];
1225 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
1226 if (bits > 0)
1227 key = hash_long(entry->ip, bits);
1228 else
1229 key = 0;
1230 remove_hash_entry(src, entry);
1231 __add_hash_entry(new_hash, entry);
1232 }
1233 }
1234
1235 old_hash = *dst;
1236 rcu_assign_pointer(*dst, new_hash);
1237 free_ftrace_hash_rcu(old_hash);
1238
1239 return 0;
1240}
1241
1242/*
1243 * Test the hashes for this ops to see if we want to call
1244 * the ops->func or not.
1245 *
1246 * It's a match if the ip is in the ops->filter_hash or
1247 * the filter_hash does not exist or is empty,
1248 * AND
1249 * the ip is not in the ops->notrace_hash.
1250 *
1251 * This needs to be called with preemption disabled as
1252 * the hashes are freed with call_rcu_sched().
1253 */
1254static int
1255ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1256{
1257 struct ftrace_hash *filter_hash;
1258 struct ftrace_hash *notrace_hash;
1259 int ret;
1260
1261 filter_hash = rcu_dereference_raw(ops->filter_hash);
1262 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1263
1264 if ((!filter_hash || !filter_hash->count ||
1265 ftrace_lookup_ip(filter_hash, ip)) &&
1266 (!notrace_hash || !notrace_hash->count ||
1267 !ftrace_lookup_ip(notrace_hash, ip)))
1268 ret = 1;
1269 else
1270 ret = 0;
1271
1272 return ret;
1273}
1274
915/* 1275/*
916 * This is a double for. Do not use 'break' to break out of the loop, 1276 * This is a double for. Do not use 'break' to break out of the loop,
917 * you must use a goto. 1277 * you must use a goto.
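For orientation, a hedged module-style sketch of a plain (non-global) ftrace_ops user of the reworked list and hash test above, assuming the two-argument callback signature of this series; the names are illustrative:

#include <linux/ftrace.h>
#include <linux/module.h>

/* Called for every function that passes ftrace_ops_test(), i.e. one that
 * matches filter_hash (or the hash is empty) and misses notrace_hash. */
static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
}

/* Module data is not core_kernel_data(), so this ops gets marked
 * FTRACE_OPS_FL_DYNAMIC and unregistering will synchronize_sched(). */
static struct ftrace_ops my_ops = {
	.func = my_trace_func,
};

static int __init my_trace_init(void)
{
	return register_ftrace_function(&my_ops);
}

static void __exit my_trace_exit(void)
{
	unregister_ftrace_function(&my_ops);
}

module_init(my_trace_init);
module_exit(my_trace_exit);
MODULE_LICENSE("GPL");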
@@ -926,6 +1286,105 @@ static struct dyn_ftrace *ftrace_free_records;
926 } \ 1286 } \
927 } 1287 }
928 1288
1289static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1290 int filter_hash,
1291 bool inc)
1292{
1293 struct ftrace_hash *hash;
1294 struct ftrace_hash *other_hash;
1295 struct ftrace_page *pg;
1296 struct dyn_ftrace *rec;
1297 int count = 0;
1298 int all = 0;
1299
1300 /* Only update if the ops has been registered */
1301 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1302 return;
1303
1304 /*
1305 * In the filter_hash case:
1306 * If the count is zero, we update all records.
1307 * Otherwise we just update the items in the hash.
1308 *
1309 * In the notrace_hash case:
1310 * We enable the update in the hash.
1311 * As disabling notrace means enabling the tracing,
1312 * and enabling notrace means disabling, the inc variable
1313 * gets inversed.
1314 */
1315 if (filter_hash) {
1316 hash = ops->filter_hash;
1317 other_hash = ops->notrace_hash;
1318 if (!hash || !hash->count)
1319 all = 1;
1320 } else {
1321 inc = !inc;
1322 hash = ops->notrace_hash;
1323 other_hash = ops->filter_hash;
1324 /*
1325 * If the notrace hash has no items,
1326 * then there's nothing to do.
1327 */
1328 if (hash && !hash->count)
1329 return;
1330 }
1331
1332 do_for_each_ftrace_rec(pg, rec) {
1333 int in_other_hash = 0;
1334 int in_hash = 0;
1335 int match = 0;
1336
1337 if (all) {
1338 /*
1339 * Only the filter_hash affects all records.
1340 * Update if the record is not in the notrace hash.
1341 */
1342 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1343 match = 1;
1344 } else {
1345 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
1346 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
1347
1348 /*
1349 *
1350 */
1351 if (filter_hash && in_hash && !in_other_hash)
1352 match = 1;
1353 else if (!filter_hash && in_hash &&
1354 (in_other_hash || !other_hash->count))
1355 match = 1;
1356 }
1357 if (!match)
1358 continue;
1359
1360 if (inc) {
1361 rec->flags++;
1362 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1363 return;
1364 } else {
1365 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1366 return;
1367 rec->flags--;
1368 }
1369 count++;
1370 /* Shortcut, if we handled all records, we are done. */
1371 if (!all && count == hash->count)
1372 return;
1373 } while_for_each_ftrace_rec();
1374}
1375
1376static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
1377 int filter_hash)
1378{
1379 __ftrace_hash_rec_update(ops, filter_hash, 0);
1380}
1381
1382static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1383 int filter_hash)
1384{
1385 __ftrace_hash_rec_update(ops, filter_hash, 1);
1386}
1387
929static void ftrace_free_rec(struct dyn_ftrace *rec) 1388static void ftrace_free_rec(struct dyn_ftrace *rec)
930{ 1389{
931 rec->freelist = ftrace_free_records; 1390 rec->freelist = ftrace_free_records;
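Restating the ref-count bookkeeping above in one helper (the helper name is mine, not from the patch): a record should be patched to call the trampoline iff at least one registered ops wants it.

static inline int rec_is_wanted(struct dyn_ftrace *rec)
{
	/* the bits outside FTRACE_FL_MASK count interested ftrace_ops;
	 * __ftrace_replace_code() in the next hunk tests exactly this */
	return (rec->flags & ~FTRACE_FL_MASK) != 0;
}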
@@ -1047,18 +1506,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1047 ftrace_addr = (unsigned long)FTRACE_ADDR; 1506 ftrace_addr = (unsigned long)FTRACE_ADDR;
1048 1507
1049 /* 1508 /*
1050 * If this record is not to be traced or we want to disable it, 1509 * If we are enabling tracing:
1051 * then disable it. 1510 *
1511 * If the record has a ref count, then we need to enable it
1512 * because someone is using it.
1052 * 1513 *
1053 * If we want to enable it and filtering is off, then enable it. 1514 * Otherwise we make sure its disabled.
1054 * 1515 *
1055 * If we want to enable it and filtering is on, enable it only if 1516 * If we are disabling tracing, then disable all records that
1056 * it's filtered 1517 * are enabled.
1057 */ 1518 */
1058 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { 1519 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1059 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) 1520 flag = FTRACE_FL_ENABLED;
1060 flag = FTRACE_FL_ENABLED;
1061 }
1062 1521
1063 /* If the state of this record hasn't changed, then do nothing */ 1522 /* If the state of this record hasn't changed, then do nothing */
1064 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1523 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1079,19 +1538,16 @@ static void ftrace_replace_code(int enable)
1079 struct ftrace_page *pg; 1538 struct ftrace_page *pg;
1080 int failed; 1539 int failed;
1081 1540
1541 if (unlikely(ftrace_disabled))
1542 return;
1543
1082 do_for_each_ftrace_rec(pg, rec) { 1544 do_for_each_ftrace_rec(pg, rec) {
1083 /* 1545 /* Skip over free records */
1084 * Skip over free records, records that have 1546 if (rec->flags & FTRACE_FL_FREE)
1085 * failed and not converted.
1086 */
1087 if (rec->flags & FTRACE_FL_FREE ||
1088 rec->flags & FTRACE_FL_FAILED ||
1089 !(rec->flags & FTRACE_FL_CONVERTED))
1090 continue; 1547 continue;
1091 1548
1092 failed = __ftrace_replace_code(rec, enable); 1549 failed = __ftrace_replace_code(rec, enable);
1093 if (failed) { 1550 if (failed) {
1094 rec->flags |= FTRACE_FL_FAILED;
1095 ftrace_bug(failed, rec->ip); 1551 ftrace_bug(failed, rec->ip);
1096 /* Stop processing */ 1552 /* Stop processing */
1097 return; 1553 return;
@@ -1107,10 +1563,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1107 1563
1108 ip = rec->ip; 1564 ip = rec->ip;
1109 1565
1566 if (unlikely(ftrace_disabled))
1567 return 0;
1568
1110 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 1569 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
1111 if (ret) { 1570 if (ret) {
1112 ftrace_bug(ret, ip); 1571 ftrace_bug(ret, ip);
1113 rec->flags |= FTRACE_FL_FAILED;
1114 return 0; 1572 return 0;
1115 } 1573 }
1116 return 1; 1574 return 1;
@@ -1171,6 +1629,7 @@ static void ftrace_run_update_code(int command)
1171 1629
1172static ftrace_func_t saved_ftrace_func; 1630static ftrace_func_t saved_ftrace_func;
1173static int ftrace_start_up; 1631static int ftrace_start_up;
1632static int global_start_up;
1174 1633
1175static void ftrace_startup_enable(int command) 1634static void ftrace_startup_enable(int command)
1176{ 1635{
@@ -1185,19 +1644,38 @@ static void ftrace_startup_enable(int command)
1185 ftrace_run_update_code(command); 1644 ftrace_run_update_code(command);
1186} 1645}
1187 1646
1188static void ftrace_startup(int command) 1647static int ftrace_startup(struct ftrace_ops *ops, int command)
1189{ 1648{
1649 bool hash_enable = true;
1650
1190 if (unlikely(ftrace_disabled)) 1651 if (unlikely(ftrace_disabled))
1191 return; 1652 return -ENODEV;
1192 1653
1193 ftrace_start_up++; 1654 ftrace_start_up++;
1194 command |= FTRACE_ENABLE_CALLS; 1655 command |= FTRACE_ENABLE_CALLS;
1195 1656
1657 /* ops marked global share the filter hashes */
1658 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1659 ops = &global_ops;
1660 /* Don't update hash if global is already set */
1661 if (global_start_up)
1662 hash_enable = false;
1663 global_start_up++;
1664 }
1665
1666 ops->flags |= FTRACE_OPS_FL_ENABLED;
1667 if (hash_enable)
1668 ftrace_hash_rec_enable(ops, 1);
1669
1196 ftrace_startup_enable(command); 1670 ftrace_startup_enable(command);
1671
1672 return 0;
1197} 1673}
1198 1674
1199static void ftrace_shutdown(int command) 1675static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1200{ 1676{
1677 bool hash_disable = true;
1678
1201 if (unlikely(ftrace_disabled)) 1679 if (unlikely(ftrace_disabled))
1202 return; 1680 return;
1203 1681
@@ -1209,6 +1687,23 @@ static void ftrace_shutdown(int command)
1209 */ 1687 */
1210 WARN_ON_ONCE(ftrace_start_up < 0); 1688 WARN_ON_ONCE(ftrace_start_up < 0);
1211 1689
1690 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1691 ops = &global_ops;
1692 global_start_up--;
1693 WARN_ON_ONCE(global_start_up < 0);
1694 /* Don't update hash if global still has users */
1695 if (global_start_up) {
1696 WARN_ON_ONCE(!ftrace_start_up);
1697 hash_disable = false;
1698 }
1699 }
1700
1701 if (hash_disable)
1702 ftrace_hash_rec_disable(ops, 1);
1703
1704 if (ops != &global_ops || !global_start_up)
1705 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1706
1212 if (!ftrace_start_up) 1707 if (!ftrace_start_up)
1213 command |= FTRACE_DISABLE_CALLS; 1708 command |= FTRACE_DISABLE_CALLS;
1214 1709
@@ -1273,10 +1768,10 @@ static int ftrace_update_code(struct module *mod)
1273 */ 1768 */
1274 if (!ftrace_code_disable(mod, p)) { 1769 if (!ftrace_code_disable(mod, p)) {
1275 ftrace_free_rec(p); 1770 ftrace_free_rec(p);
1276 continue; 1771 /* Game over */
1772 break;
1277 } 1773 }
1278 1774
1279 p->flags |= FTRACE_FL_CONVERTED;
1280 ftrace_update_cnt++; 1775 ftrace_update_cnt++;
1281 1776
1282 /* 1777 /*
@@ -1351,9 +1846,9 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1351enum { 1846enum {
1352 FTRACE_ITER_FILTER = (1 << 0), 1847 FTRACE_ITER_FILTER = (1 << 0),
1353 FTRACE_ITER_NOTRACE = (1 << 1), 1848 FTRACE_ITER_NOTRACE = (1 << 1),
1354 FTRACE_ITER_FAILURES = (1 << 2), 1849 FTRACE_ITER_PRINTALL = (1 << 2),
1355 FTRACE_ITER_PRINTALL = (1 << 3), 1850 FTRACE_ITER_HASH = (1 << 3),
1356 FTRACE_ITER_HASH = (1 << 4), 1851 FTRACE_ITER_ENABLED = (1 << 4),
1357}; 1852};
1358 1853
1359#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1854#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1365,6 +1860,8 @@ struct ftrace_iterator {
1365 struct dyn_ftrace *func; 1860 struct dyn_ftrace *func;
1366 struct ftrace_func_probe *probe; 1861 struct ftrace_func_probe *probe;
1367 struct trace_parser parser; 1862 struct trace_parser parser;
1863 struct ftrace_hash *hash;
1864 struct ftrace_ops *ops;
1368 int hidx; 1865 int hidx;
1369 int idx; 1866 int idx;
1370 unsigned flags; 1867 unsigned flags;
@@ -1461,8 +1958,12 @@ static void *
1461t_next(struct seq_file *m, void *v, loff_t *pos) 1958t_next(struct seq_file *m, void *v, loff_t *pos)
1462{ 1959{
1463 struct ftrace_iterator *iter = m->private; 1960 struct ftrace_iterator *iter = m->private;
1961 struct ftrace_ops *ops = &global_ops;
1464 struct dyn_ftrace *rec = NULL; 1962 struct dyn_ftrace *rec = NULL;
1465 1963
1964 if (unlikely(ftrace_disabled))
1965 return NULL;
1966
1466 if (iter->flags & FTRACE_ITER_HASH) 1967 if (iter->flags & FTRACE_ITER_HASH)
1467 return t_hash_next(m, pos); 1968 return t_hash_next(m, pos);
1468 1969
@@ -1483,17 +1984,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1483 rec = &iter->pg->records[iter->idx++]; 1984 rec = &iter->pg->records[iter->idx++];
1484 if ((rec->flags & FTRACE_FL_FREE) || 1985 if ((rec->flags & FTRACE_FL_FREE) ||
1485 1986
1486 (!(iter->flags & FTRACE_ITER_FAILURES) &&
1487 (rec->flags & FTRACE_FL_FAILED)) ||
1488
1489 ((iter->flags & FTRACE_ITER_FAILURES) &&
1490 !(rec->flags & FTRACE_FL_FAILED)) ||
1491
1492 ((iter->flags & FTRACE_ITER_FILTER) && 1987 ((iter->flags & FTRACE_ITER_FILTER) &&
1493 !(rec->flags & FTRACE_FL_FILTER)) || 1988 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
1494 1989
1495 ((iter->flags & FTRACE_ITER_NOTRACE) && 1990 ((iter->flags & FTRACE_ITER_NOTRACE) &&
1496 !(rec->flags & FTRACE_FL_NOTRACE))) { 1991 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
1992
1993 ((iter->flags & FTRACE_ITER_ENABLED) &&
1994 !(rec->flags & ~FTRACE_FL_MASK))) {
1995
1497 rec = NULL; 1996 rec = NULL;
1498 goto retry; 1997 goto retry;
1499 } 1998 }
@@ -1517,10 +2016,15 @@ static void reset_iter_read(struct ftrace_iterator *iter)
1517static void *t_start(struct seq_file *m, loff_t *pos) 2016static void *t_start(struct seq_file *m, loff_t *pos)
1518{ 2017{
1519 struct ftrace_iterator *iter = m->private; 2018 struct ftrace_iterator *iter = m->private;
2019 struct ftrace_ops *ops = &global_ops;
1520 void *p = NULL; 2020 void *p = NULL;
1521 loff_t l; 2021 loff_t l;
1522 2022
1523 mutex_lock(&ftrace_lock); 2023 mutex_lock(&ftrace_lock);
2024
2025 if (unlikely(ftrace_disabled))
2026 return NULL;
2027
1524 /* 2028 /*
1525 * If an lseek was done, then reset and start from beginning. 2029 * If an lseek was done, then reset and start from beginning.
1526 */ 2030 */
@@ -1532,7 +2036,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1532 * off, we can short cut and just print out that all 2036 * off, we can short cut and just print out that all
1533 * functions are enabled. 2037 * functions are enabled.
1534 */ 2038 */
1535 if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { 2039 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
1536 if (*pos > 0) 2040 if (*pos > 0)
1537 return t_hash_start(m, pos); 2041 return t_hash_start(m, pos);
1538 iter->flags |= FTRACE_ITER_PRINTALL; 2042 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -1590,7 +2094,11 @@ static int t_show(struct seq_file *m, void *v)
1590 if (!rec) 2094 if (!rec)
1591 return 0; 2095 return 0;
1592 2096
1593 seq_printf(m, "%ps\n", (void *)rec->ip); 2097 seq_printf(m, "%ps", (void *)rec->ip);
2098 if (iter->flags & FTRACE_ITER_ENABLED)
2099 seq_printf(m, " (%ld)",
2100 rec->flags & ~FTRACE_FL_MASK);
2101 seq_printf(m, "\n");
1594 2102
1595 return 0; 2103 return 0;
1596} 2104}
@@ -1630,44 +2138,46 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1630} 2138}
1631 2139
1632static int 2140static int
1633ftrace_failures_open(struct inode *inode, struct file *file) 2141ftrace_enabled_open(struct inode *inode, struct file *file)
1634{ 2142{
1635 int ret;
1636 struct seq_file *m;
1637 struct ftrace_iterator *iter; 2143 struct ftrace_iterator *iter;
2144 int ret;
2145
2146 if (unlikely(ftrace_disabled))
2147 return -ENODEV;
2148
2149 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2150 if (!iter)
2151 return -ENOMEM;
1638 2152
1639 ret = ftrace_avail_open(inode, file); 2153 iter->pg = ftrace_pages_start;
2154 iter->flags = FTRACE_ITER_ENABLED;
2155
2156 ret = seq_open(file, &show_ftrace_seq_ops);
1640 if (!ret) { 2157 if (!ret) {
1641 m = file->private_data; 2158 struct seq_file *m = file->private_data;
1642 iter = m->private; 2159
1643 iter->flags = FTRACE_ITER_FAILURES; 2160 m->private = iter;
2161 } else {
2162 kfree(iter);
1644 } 2163 }
1645 2164
1646 return ret; 2165 return ret;
1647} 2166}
1648 2167
1649 2168static void ftrace_filter_reset(struct ftrace_hash *hash)
1650static void ftrace_filter_reset(int enable)
1651{ 2169{
1652 struct ftrace_page *pg;
1653 struct dyn_ftrace *rec;
1654 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1655
1656 mutex_lock(&ftrace_lock); 2170 mutex_lock(&ftrace_lock);
1657 if (enable) 2171 ftrace_hash_clear(hash);
1658 ftrace_filtered = 0;
1659 do_for_each_ftrace_rec(pg, rec) {
1660 if (rec->flags & FTRACE_FL_FAILED)
1661 continue;
1662 rec->flags &= ~type;
1663 } while_for_each_ftrace_rec();
1664 mutex_unlock(&ftrace_lock); 2172 mutex_unlock(&ftrace_lock);
1665} 2173}
1666 2174
1667static int 2175static int
1668ftrace_regex_open(struct inode *inode, struct file *file, int enable) 2176ftrace_regex_open(struct ftrace_ops *ops, int flag,
2177 struct inode *inode, struct file *file)
1669{ 2178{
1670 struct ftrace_iterator *iter; 2179 struct ftrace_iterator *iter;
2180 struct ftrace_hash *hash;
1671 int ret = 0; 2181 int ret = 0;
1672 2182
1673 if (unlikely(ftrace_disabled)) 2183 if (unlikely(ftrace_disabled))
@@ -1682,21 +2192,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1682 return -ENOMEM; 2192 return -ENOMEM;
1683 } 2193 }
1684 2194
2195 if (flag & FTRACE_ITER_NOTRACE)
2196 hash = ops->notrace_hash;
2197 else
2198 hash = ops->filter_hash;
2199
2200 iter->ops = ops;
2201 iter->flags = flag;
2202
2203 if (file->f_mode & FMODE_WRITE) {
2204 mutex_lock(&ftrace_lock);
2205 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
2206 mutex_unlock(&ftrace_lock);
2207
2208 if (!iter->hash) {
2209 trace_parser_put(&iter->parser);
2210 kfree(iter);
2211 return -ENOMEM;
2212 }
2213 }
2214
1685 mutex_lock(&ftrace_regex_lock); 2215 mutex_lock(&ftrace_regex_lock);
2216
1686 if ((file->f_mode & FMODE_WRITE) && 2217 if ((file->f_mode & FMODE_WRITE) &&
1687 (file->f_flags & O_TRUNC)) 2218 (file->f_flags & O_TRUNC))
1688 ftrace_filter_reset(enable); 2219 ftrace_filter_reset(iter->hash);
1689 2220
1690 if (file->f_mode & FMODE_READ) { 2221 if (file->f_mode & FMODE_READ) {
1691 iter->pg = ftrace_pages_start; 2222 iter->pg = ftrace_pages_start;
1692 iter->flags = enable ? FTRACE_ITER_FILTER :
1693 FTRACE_ITER_NOTRACE;
1694 2223
1695 ret = seq_open(file, &show_ftrace_seq_ops); 2224 ret = seq_open(file, &show_ftrace_seq_ops);
1696 if (!ret) { 2225 if (!ret) {
1697 struct seq_file *m = file->private_data; 2226 struct seq_file *m = file->private_data;
1698 m->private = iter; 2227 m->private = iter;
1699 } else { 2228 } else {
2229 /* Failed */
2230 free_ftrace_hash(iter->hash);
1700 trace_parser_put(&iter->parser); 2231 trace_parser_put(&iter->parser);
1701 kfree(iter); 2232 kfree(iter);
1702 } 2233 }
@@ -1710,13 +2241,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1710static int 2241static int
1711ftrace_filter_open(struct inode *inode, struct file *file) 2242ftrace_filter_open(struct inode *inode, struct file *file)
1712{ 2243{
1713 return ftrace_regex_open(inode, file, 1); 2244 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
2245 inode, file);
1714} 2246}
1715 2247
1716static int 2248static int
1717ftrace_notrace_open(struct inode *inode, struct file *file) 2249ftrace_notrace_open(struct inode *inode, struct file *file)
1718{ 2250{
1719 return ftrace_regex_open(inode, file, 0); 2251 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
2252 inode, file);
1720} 2253}
1721 2254
1722static loff_t 2255static loff_t
@@ -1761,86 +2294,99 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1761} 2294}
1762 2295
1763static int 2296static int
1764ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) 2297enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
2298{
2299 struct ftrace_func_entry *entry;
2300 int ret = 0;
2301
2302 entry = ftrace_lookup_ip(hash, rec->ip);
2303 if (not) {
2304 /* Do nothing if it doesn't exist */
2305 if (!entry)
2306 return 0;
2307
2308 free_hash_entry(hash, entry);
2309 } else {
2310 /* Do nothing if it exists */
2311 if (entry)
2312 return 0;
2313
2314 ret = add_hash_entry(hash, rec->ip);
2315 }
2316 return ret;
2317}
2318
2319static int
2320ftrace_match_record(struct dyn_ftrace *rec, char *mod,
2321 char *regex, int len, int type)
1765{ 2322{
1766 char str[KSYM_SYMBOL_LEN]; 2323 char str[KSYM_SYMBOL_LEN];
2324 char *modname;
2325
2326 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
2327
2328 if (mod) {
2329 /* module lookup requires matching the module */
2330 if (!modname || strcmp(modname, mod))
2331 return 0;
2332
2333 /* blank search means to match all funcs in the mod */
2334 if (!len)
2335 return 1;
2336 }
1767 2337
1768 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1769 return ftrace_match(str, regex, len, type); 2338 return ftrace_match(str, regex, len, type);
1770} 2339}
1771 2340
1772static int ftrace_match_records(char *buff, int len, int enable) 2341static int
2342match_records(struct ftrace_hash *hash, char *buff,
2343 int len, char *mod, int not)
1773{ 2344{
1774 unsigned int search_len; 2345 unsigned search_len = 0;
1775 struct ftrace_page *pg; 2346 struct ftrace_page *pg;
1776 struct dyn_ftrace *rec; 2347 struct dyn_ftrace *rec;
1777 unsigned long flag; 2348 int type = MATCH_FULL;
1778 char *search; 2349 char *search = buff;
1779 int type;
1780 int not;
1781 int found = 0; 2350 int found = 0;
2351 int ret;
1782 2352
1783 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 2353 if (len) {
1784 type = filter_parse_regex(buff, len, &search, &not); 2354 type = filter_parse_regex(buff, len, &search, &not);
1785 2355 search_len = strlen(search);
1786 search_len = strlen(search); 2356 }
1787 2357
1788 mutex_lock(&ftrace_lock); 2358 mutex_lock(&ftrace_lock);
1789 do_for_each_ftrace_rec(pg, rec) {
1790 2359
1791 if (rec->flags & FTRACE_FL_FAILED) 2360 if (unlikely(ftrace_disabled))
1792 continue; 2361 goto out_unlock;
1793 2362
1794 if (ftrace_match_record(rec, search, search_len, type)) { 2363 do_for_each_ftrace_rec(pg, rec) {
1795 if (not) 2364
1796 rec->flags &= ~flag; 2365 if (ftrace_match_record(rec, mod, search, search_len, type)) {
1797 else 2366 ret = enter_record(hash, rec, not);
1798 rec->flags |= flag; 2367 if (ret < 0) {
2368 found = ret;
2369 goto out_unlock;
2370 }
1799 found = 1; 2371 found = 1;
1800 } 2372 }
1801 /*
1802 * Only enable filtering if we have a function that
1803 * is filtered on.
1804 */
1805 if (enable && (rec->flags & FTRACE_FL_FILTER))
1806 ftrace_filtered = 1;
1807 } while_for_each_ftrace_rec(); 2373 } while_for_each_ftrace_rec();
2374 out_unlock:
1808 mutex_unlock(&ftrace_lock); 2375 mutex_unlock(&ftrace_lock);
1809 2376
1810 return found; 2377 return found;
1811} 2378}
1812 2379
1813static int 2380static int
1814ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, 2381ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
1815 char *regex, int len, int type)
1816{ 2382{
1817 char str[KSYM_SYMBOL_LEN]; 2383 return match_records(hash, buff, len, NULL, 0);
1818 char *modname;
1819
1820 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
1821
1822 if (!modname || strcmp(modname, mod))
1823 return 0;
1824
1825 /* blank search means to match all funcs in the mod */
1826 if (len)
1827 return ftrace_match(str, regex, len, type);
1828 else
1829 return 1;
1830} 2384}
1831 2385
1832static int ftrace_match_module_records(char *buff, char *mod, int enable) 2386static int
2387ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
1833{ 2388{
1834 unsigned search_len = 0;
1835 struct ftrace_page *pg;
1836 struct dyn_ftrace *rec;
1837 int type = MATCH_FULL;
1838 char *search = buff;
1839 unsigned long flag;
1840 int not = 0; 2389 int not = 0;
1841 int found = 0;
1842
1843 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1844 2390
1845 /* blank or '*' mean the same */ 2391 /* blank or '*' mean the same */
1846 if (strcmp(buff, "*") == 0) 2392 if (strcmp(buff, "*") == 0)
@@ -1852,32 +2398,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1852 not = 1; 2398 not = 1;
1853 } 2399 }
1854 2400
1855 if (strlen(buff)) { 2401 return match_records(hash, buff, strlen(buff), mod, not);
1856 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1857 search_len = strlen(search);
1858 }
1859
1860 mutex_lock(&ftrace_lock);
1861 do_for_each_ftrace_rec(pg, rec) {
1862
1863 if (rec->flags & FTRACE_FL_FAILED)
1864 continue;
1865
1866 if (ftrace_match_module_record(rec, mod,
1867 search, search_len, type)) {
1868 if (not)
1869 rec->flags &= ~flag;
1870 else
1871 rec->flags |= flag;
1872 found = 1;
1873 }
1874 if (enable && (rec->flags & FTRACE_FL_FILTER))
1875 ftrace_filtered = 1;
1876
1877 } while_for_each_ftrace_rec();
1878 mutex_unlock(&ftrace_lock);
1879
1880 return found;
1881} 2402}
1882 2403
1883/* 2404/*
@@ -1888,7 +2409,10 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1888static int 2409static int
1889ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2410ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1890{ 2411{
2412 struct ftrace_ops *ops = &global_ops;
2413 struct ftrace_hash *hash;
1891 char *mod; 2414 char *mod;
2415 int ret = -EINVAL;
1892 2416
1893 /* 2417 /*
1894 * cmd == 'mod' because we only registered this func 2418 * cmd == 'mod' because we only registered this func
@@ -1900,15 +2424,24 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1900 2424
1901 /* we must have a module name */ 2425 /* we must have a module name */
1902 if (!param) 2426 if (!param)
1903 return -EINVAL; 2427 return ret;
1904 2428
1905 mod = strsep(&param, ":"); 2429 mod = strsep(&param, ":");
1906 if (!strlen(mod)) 2430 if (!strlen(mod))
1907 return -EINVAL; 2431 return ret;
1908 2432
1909 if (ftrace_match_module_records(func, mod, enable)) 2433 if (enable)
1910 return 0; 2434 hash = ops->filter_hash;
1911 return -EINVAL; 2435 else
2436 hash = ops->notrace_hash;
2437
2438 ret = ftrace_match_module_records(hash, func, mod);
2439 if (!ret)
2440 ret = -EINVAL;
2441 if (ret < 0)
2442 return ret;
2443
2444 return 0;
1912} 2445}
1913 2446
1914static struct ftrace_func_command ftrace_mod_cmd = { 2447static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1959,6 +2492,7 @@ static int ftrace_probe_registered;
1959 2492
1960static void __enable_ftrace_function_probe(void) 2493static void __enable_ftrace_function_probe(void)
1961{ 2494{
2495 int ret;
1962 int i; 2496 int i;
1963 2497
1964 if (ftrace_probe_registered) 2498 if (ftrace_probe_registered)
@@ -1973,13 +2507,16 @@ static void __enable_ftrace_function_probe(void)
1973 if (i == FTRACE_FUNC_HASHSIZE) 2507 if (i == FTRACE_FUNC_HASHSIZE)
1974 return; 2508 return;
1975 2509
1976 __register_ftrace_function(&trace_probe_ops); 2510 ret = __register_ftrace_function(&trace_probe_ops);
1977 ftrace_startup(0); 2511 if (!ret)
2512 ret = ftrace_startup(&trace_probe_ops, 0);
2513
1978 ftrace_probe_registered = 1; 2514 ftrace_probe_registered = 1;
1979} 2515}
1980 2516
1981static void __disable_ftrace_function_probe(void) 2517static void __disable_ftrace_function_probe(void)
1982{ 2518{
2519 int ret;
1983 int i; 2520 int i;
1984 2521
1985 if (!ftrace_probe_registered) 2522 if (!ftrace_probe_registered)
@@ -1992,8 +2529,10 @@ static void __disable_ftrace_function_probe(void)
1992 } 2529 }
1993 2530
1994 /* no more funcs left */ 2531 /* no more funcs left */
1995 __unregister_ftrace_function(&trace_probe_ops); 2532 ret = __unregister_ftrace_function(&trace_probe_ops);
1996 ftrace_shutdown(0); 2533 if (!ret)
2534 ftrace_shutdown(&trace_probe_ops, 0);
2535
1997 ftrace_probe_registered = 0; 2536 ftrace_probe_registered = 0;
1998} 2537}
1999 2538
@@ -2029,12 +2568,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2029 return -EINVAL; 2568 return -EINVAL;
2030 2569
2031 mutex_lock(&ftrace_lock); 2570 mutex_lock(&ftrace_lock);
2032 do_for_each_ftrace_rec(pg, rec) {
2033 2571
2034 if (rec->flags & FTRACE_FL_FAILED) 2572 if (unlikely(ftrace_disabled))
2035 continue; 2573 goto out_unlock;
2574
2575 do_for_each_ftrace_rec(pg, rec) {
2036 2576
2037 if (!ftrace_match_record(rec, search, len, type)) 2577 if (!ftrace_match_record(rec, NULL, search, len, type))
2038 continue; 2578 continue;
2039 2579
2040 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2580 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -2195,7 +2735,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)
2195 return ret; 2735 return ret;
2196} 2736}
2197 2737
2198static int ftrace_process_regex(char *buff, int len, int enable) 2738static int ftrace_process_regex(struct ftrace_hash *hash,
2739 char *buff, int len, int enable)
2199{ 2740{
2200 char *func, *command, *next = buff; 2741 char *func, *command, *next = buff;
2201 struct ftrace_func_command *p; 2742 struct ftrace_func_command *p;
@@ -2204,9 +2745,12 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2204 func = strsep(&next, ":"); 2745 func = strsep(&next, ":");
2205 2746
2206 if (!next) { 2747 if (!next) {
2207 if (ftrace_match_records(func, len, enable)) 2748 ret = ftrace_match_records(hash, func, len);
2208 return 0; 2749 if (!ret)
2209 return ret; 2750 ret = -EINVAL;
2751 if (ret < 0)
2752 return ret;
2753 return 0;
2210 } 2754 }
2211 2755
2212 /* command found */ 2756 /* command found */
@@ -2239,6 +2783,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2239 2783
2240 mutex_lock(&ftrace_regex_lock); 2784 mutex_lock(&ftrace_regex_lock);
2241 2785
2786 ret = -ENODEV;
2787 if (unlikely(ftrace_disabled))
2788 goto out_unlock;
2789
2242 if (file->f_mode & FMODE_READ) { 2790 if (file->f_mode & FMODE_READ) {
2243 struct seq_file *m = file->private_data; 2791 struct seq_file *m = file->private_data;
2244 iter = m->private; 2792 iter = m->private;
@@ -2250,7 +2798,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2250 2798
2251 if (read >= 0 && trace_parser_loaded(parser) && 2799 if (read >= 0 && trace_parser_loaded(parser) &&
2252 !trace_parser_cont(parser)) { 2800 !trace_parser_cont(parser)) {
2253 ret = ftrace_process_regex(parser->buffer, 2801 ret = ftrace_process_regex(iter->hash, parser->buffer,
2254 parser->idx, enable); 2802 parser->idx, enable);
2255 trace_parser_clear(parser); 2803 trace_parser_clear(parser);
2256 if (ret) 2804 if (ret)
@@ -2278,22 +2826,83 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
2278 return ftrace_regex_write(file, ubuf, cnt, ppos, 0); 2826 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
2279} 2827}
2280 2828
2281static void 2829static int
2282ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) 2830ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2831 int reset, int enable)
2283{ 2832{
2833 struct ftrace_hash **orig_hash;
2834 struct ftrace_hash *hash;
2835 int ret;
2836
2837 /* All global ops use the global ops filters */
2838 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
2839 ops = &global_ops;
2840
2284 if (unlikely(ftrace_disabled)) 2841 if (unlikely(ftrace_disabled))
2285 return; 2842 return -ENODEV;
2843
2844 if (enable)
2845 orig_hash = &ops->filter_hash;
2846 else
2847 orig_hash = &ops->notrace_hash;
2848
2849 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2850 if (!hash)
2851 return -ENOMEM;
2286 2852
2287 mutex_lock(&ftrace_regex_lock); 2853 mutex_lock(&ftrace_regex_lock);
2288 if (reset) 2854 if (reset)
2289 ftrace_filter_reset(enable); 2855 ftrace_filter_reset(hash);
2290 if (buf) 2856 if (buf)
2291 ftrace_match_records(buf, len, enable); 2857 ftrace_match_records(hash, buf, len);
2858
2859 mutex_lock(&ftrace_lock);
2860 ret = ftrace_hash_move(orig_hash, hash);
2861 mutex_unlock(&ftrace_lock);
2862
2292 mutex_unlock(&ftrace_regex_lock); 2863 mutex_unlock(&ftrace_regex_lock);
2864
2865 free_ftrace_hash(hash);
2866 return ret;
2867}
2868
2869/**
2870 * ftrace_set_filter - set a function to filter on in ftrace
2871 * @ops - the ops to set the filter with
2872 * @buf - the string that holds the function filter text.
2873 * @len - the length of the string.
2874 * @reset - non zero to reset all filters before applying this filter.
2875 *
2876 * Filters denote which functions should be enabled when tracing is enabled.
2877 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2878 */
2879void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
2880 int len, int reset)
2881{
2882 ftrace_set_regex(ops, buf, len, reset, 1);
2293} 2883}
2884EXPORT_SYMBOL_GPL(ftrace_set_filter);
2294 2885
2295/** 2886/**
2887 * ftrace_set_notrace - set a function to not trace in ftrace
2888 * @ops - the ops to set the notrace filter with
2889 * @buf - the string that holds the function notrace text.
2890 * @len - the length of the string.
2891 * @reset - non zero to reset all filters before applying this filter.
2892 *
2893 * Notrace Filters denote which functions should not be enabled when tracing
2894 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2895 * for tracing.
2896 */
2897void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
2898 int len, int reset)
2899{
2900 ftrace_set_regex(ops, buf, len, reset, 0);
2901}
2902EXPORT_SYMBOL_GPL(ftrace_set_notrace);
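The two exported setters above are the heart of the per-ops interface: each ftrace_ops now carries its own filter_hash and notrace_hash instead of sharing the global FTRACE_FL_FILTER/NOTRACE record flags. A minimal sketch of a caller follows; the names my_ops, my_trace_func and the "kfree" pattern are hypothetical, but the callback prototype and the set-filter-then-register ordering mirror the selftest probes added to trace_selftest.c later in this patch (assumes CONFIG_DYNAMIC_FTRACE).

#include <linux/ftrace.h>
#include <linux/string.h>

/* Prototype matches the probe functions in trace_selftest.c. */
static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* runs only for functions left in my_ops.filter_hash */
}

static struct ftrace_ops my_ops = {
	.func	= my_trace_func,
};

static int __init my_tracer_init(void)
{
	char *pat = "kfree";	/* hypothetical filter pattern */

	/* reset=1 drops any previous filter before applying this one */
	ftrace_set_filter(&my_ops, pat, strlen(pat), 1);
	return register_ftrace_function(&my_ops);
}

Because my_ops does not set FTRACE_OPS_FL_GLOBAL, ftrace_set_regex() above leaves global_ops untouched and only this ops' hashes change.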
2903/**
2296 * ftrace_set_filter - set a function to filter on in ftrace 2904 * ftrace_set_filter - set a function to filter on in ftrace
2905 * @ops - the ops to set the filter with
2297 * @buf - the string that holds the function filter text. 2906 * @buf - the string that holds the function filter text.
2298 * @len - the length of the string. 2907 * @len - the length of the string.
2299 * @reset - non zero to reset all filters before applying this filter. 2908 * @reset - non zero to reset all filters before applying this filter.
@@ -2301,13 +2910,15 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
2301 * Filters denote which functions should be enabled when tracing is enabled. 2910 * Filters denote which functions should be enabled when tracing is enabled.
2302 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 2911 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2303 */ 2912 */
2304void ftrace_set_filter(unsigned char *buf, int len, int reset) 2913void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
2305{ 2914{
2306 ftrace_set_regex(buf, len, reset, 1); 2915 ftrace_set_regex(&global_ops, buf, len, reset, 1);
2307} 2916}
2917EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
2308 2918
2309/** 2919/**
2310 * ftrace_set_notrace - set a function to not trace in ftrace 2920 * ftrace_set_notrace - set a function to not trace in ftrace
2921 * @ops - the ops to set the notrace filter with
2311 * @buf - the string that holds the function notrace text. 2922 * @buf - the string that holds the function notrace text.
2312 * @len - the length of the string. 2923 * @len - the length of the string.
2313 * @reset - non zero to reset all filters before applying this filter. 2924 * @reset - non zero to reset all filters before applying this filter.
@@ -2316,10 +2927,11 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset)
2316 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 2927 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2317 * for tracing. 2928 * for tracing.
2318 */ 2929 */
2319void ftrace_set_notrace(unsigned char *buf, int len, int reset) 2930void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
2320{ 2931{
2321 ftrace_set_regex(buf, len, reset, 0); 2932 ftrace_set_regex(&global_ops, buf, len, reset, 0);
2322} 2933}
2934EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
2323 2935
2324/* 2936/*
2325 * command line interface to allow users to set filters on boot up. 2937 * command line interface to allow users to set filters on boot up.
@@ -2370,22 +2982,23 @@ static void __init set_ftrace_early_graph(char *buf)
2370} 2982}
2371#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2983#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2372 2984
2373static void __init set_ftrace_early_filter(char *buf, int enable) 2985static void __init
2986set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
2374{ 2987{
2375 char *func; 2988 char *func;
2376 2989
2377 while (buf) { 2990 while (buf) {
2378 func = strsep(&buf, ","); 2991 func = strsep(&buf, ",");
2379 ftrace_set_regex(func, strlen(func), 0, enable); 2992 ftrace_set_regex(ops, func, strlen(func), 0, enable);
2380 } 2993 }
2381} 2994}
2382 2995
2383static void __init set_ftrace_early_filters(void) 2996static void __init set_ftrace_early_filters(void)
2384{ 2997{
2385 if (ftrace_filter_buf[0]) 2998 if (ftrace_filter_buf[0])
2386 set_ftrace_early_filter(ftrace_filter_buf, 1); 2999 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
2387 if (ftrace_notrace_buf[0]) 3000 if (ftrace_notrace_buf[0])
2388 set_ftrace_early_filter(ftrace_notrace_buf, 0); 3001 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
2389#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3002#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2390 if (ftrace_graph_buf[0]) 3003 if (ftrace_graph_buf[0])
2391 set_ftrace_early_graph(ftrace_graph_buf); 3004 set_ftrace_early_graph(ftrace_graph_buf);
@@ -2393,11 +3006,14 @@ static void __init set_ftrace_early_filters(void)
2393} 3006}
2394 3007
2395static int 3008static int
2396ftrace_regex_release(struct inode *inode, struct file *file, int enable) 3009ftrace_regex_release(struct inode *inode, struct file *file)
2397{ 3010{
2398 struct seq_file *m = (struct seq_file *)file->private_data; 3011 struct seq_file *m = (struct seq_file *)file->private_data;
2399 struct ftrace_iterator *iter; 3012 struct ftrace_iterator *iter;
3013 struct ftrace_hash **orig_hash;
2400 struct trace_parser *parser; 3014 struct trace_parser *parser;
3015 int filter_hash;
3016 int ret;
2401 3017
2402 mutex_lock(&ftrace_regex_lock); 3018 mutex_lock(&ftrace_regex_lock);
2403 if (file->f_mode & FMODE_READ) { 3019 if (file->f_mode & FMODE_READ) {
@@ -2410,33 +3026,41 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2410 parser = &iter->parser; 3026 parser = &iter->parser;
2411 if (trace_parser_loaded(parser)) { 3027 if (trace_parser_loaded(parser)) {
2412 parser->buffer[parser->idx] = 0; 3028 parser->buffer[parser->idx] = 0;
2413 ftrace_match_records(parser->buffer, parser->idx, enable); 3029 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
2414 } 3030 }
2415 3031
2416 mutex_lock(&ftrace_lock);
2417 if (ftrace_start_up && ftrace_enabled)
2418 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2419 mutex_unlock(&ftrace_lock);
2420
2421 trace_parser_put(parser); 3032 trace_parser_put(parser);
3033
3034 if (file->f_mode & FMODE_WRITE) {
3035 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3036
3037 if (filter_hash)
3038 orig_hash = &iter->ops->filter_hash;
3039 else
3040 orig_hash = &iter->ops->notrace_hash;
3041
3042 mutex_lock(&ftrace_lock);
3043 /*
3044 * Remove the current set, update the hash and add
3045 * them back.
3046 */
3047 ftrace_hash_rec_disable(iter->ops, filter_hash);
3048 ret = ftrace_hash_move(orig_hash, iter->hash);
3049 if (!ret) {
3050 ftrace_hash_rec_enable(iter->ops, filter_hash);
3051 if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
3052 && ftrace_enabled)
3053 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3054 }
3055 mutex_unlock(&ftrace_lock);
3056 }
3057 free_ftrace_hash(iter->hash);
2422 kfree(iter); 3058 kfree(iter);
2423 3059
2424 mutex_unlock(&ftrace_regex_lock); 3060 mutex_unlock(&ftrace_regex_lock);
2425 return 0; 3061 return 0;
2426} 3062}
2427 3063
2428static int
2429ftrace_filter_release(struct inode *inode, struct file *file)
2430{
2431 return ftrace_regex_release(inode, file, 1);
2432}
2433
2434static int
2435ftrace_notrace_release(struct inode *inode, struct file *file)
2436{
2437 return ftrace_regex_release(inode, file, 0);
2438}
2439
2440static const struct file_operations ftrace_avail_fops = { 3064static const struct file_operations ftrace_avail_fops = {
2441 .open = ftrace_avail_open, 3065 .open = ftrace_avail_open,
2442 .read = seq_read, 3066 .read = seq_read,
@@ -2444,8 +3068,8 @@ static const struct file_operations ftrace_avail_fops = {
2444 .release = seq_release_private, 3068 .release = seq_release_private,
2445}; 3069};
2446 3070
2447static const struct file_operations ftrace_failures_fops = { 3071static const struct file_operations ftrace_enabled_fops = {
2448 .open = ftrace_failures_open, 3072 .open = ftrace_enabled_open,
2449 .read = seq_read, 3073 .read = seq_read,
2450 .llseek = seq_lseek, 3074 .llseek = seq_lseek,
2451 .release = seq_release_private, 3075 .release = seq_release_private,
@@ -2456,7 +3080,7 @@ static const struct file_operations ftrace_filter_fops = {
2456 .read = seq_read, 3080 .read = seq_read,
2457 .write = ftrace_filter_write, 3081 .write = ftrace_filter_write,
2458 .llseek = ftrace_regex_lseek, 3082 .llseek = ftrace_regex_lseek,
2459 .release = ftrace_filter_release, 3083 .release = ftrace_regex_release,
2460}; 3084};
2461 3085
2462static const struct file_operations ftrace_notrace_fops = { 3086static const struct file_operations ftrace_notrace_fops = {
@@ -2464,7 +3088,7 @@ static const struct file_operations ftrace_notrace_fops = {
2464 .read = seq_read, 3088 .read = seq_read,
2465 .write = ftrace_notrace_write, 3089 .write = ftrace_notrace_write,
2466 .llseek = ftrace_regex_lseek, 3090 .llseek = ftrace_regex_lseek,
2467 .release = ftrace_notrace_release, 3091 .release = ftrace_regex_release,
2468}; 3092};
2469 3093
2470#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3094#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -2573,9 +3197,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2573 bool exists; 3197 bool exists;
2574 int i; 3198 int i;
2575 3199
2576 if (ftrace_disabled)
2577 return -ENODEV;
2578
2579 /* decode regex */ 3200 /* decode regex */
2580 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3201 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2581 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3202 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
@@ -2584,12 +3205,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2584 search_len = strlen(search); 3205 search_len = strlen(search);
2585 3206
2586 mutex_lock(&ftrace_lock); 3207 mutex_lock(&ftrace_lock);
3208
3209 if (unlikely(ftrace_disabled)) {
3210 mutex_unlock(&ftrace_lock);
3211 return -ENODEV;
3212 }
3213
2587 do_for_each_ftrace_rec(pg, rec) { 3214 do_for_each_ftrace_rec(pg, rec) {
2588 3215
2589 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 3216 if (rec->flags & FTRACE_FL_FREE)
2590 continue; 3217 continue;
2591 3218
2592 if (ftrace_match_record(rec, search, search_len, type)) { 3219 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
2593 /* if it is in the array */ 3220 /* if it is in the array */
2594 exists = false; 3221 exists = false;
2595 for (i = 0; i < *idx; i++) { 3222 for (i = 0; i < *idx; i++) {
@@ -2679,8 +3306,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2679 trace_create_file("available_filter_functions", 0444, 3306 trace_create_file("available_filter_functions", 0444,
2680 d_tracer, NULL, &ftrace_avail_fops); 3307 d_tracer, NULL, &ftrace_avail_fops);
2681 3308
2682 trace_create_file("failures", 0444, 3309 trace_create_file("enabled_functions", 0444,
2683 d_tracer, NULL, &ftrace_failures_fops); 3310 d_tracer, NULL, &ftrace_enabled_fops);
2684 3311
2685 trace_create_file("set_ftrace_filter", 0644, d_tracer, 3312 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2686 NULL, &ftrace_filter_fops); 3313 NULL, &ftrace_filter_fops);
@@ -2720,7 +3347,10 @@ static int ftrace_process_locs(struct module *mod,
2720 ftrace_record_ip(addr); 3347 ftrace_record_ip(addr);
2721 } 3348 }
2722 3349
2723 /* disable interrupts to prevent kstop machine */ 3350 /*
3351 * Disable interrupts to prevent interrupts from executing
3352 * code that is being modified.
3353 */
2724 local_irq_save(flags); 3354 local_irq_save(flags);
2725 ftrace_update_code(mod); 3355 ftrace_update_code(mod);
2726 local_irq_restore(flags); 3356 local_irq_restore(flags);
@@ -2735,10 +3365,11 @@ void ftrace_release_mod(struct module *mod)
2735 struct dyn_ftrace *rec; 3365 struct dyn_ftrace *rec;
2736 struct ftrace_page *pg; 3366 struct ftrace_page *pg;
2737 3367
3368 mutex_lock(&ftrace_lock);
3369
2738 if (ftrace_disabled) 3370 if (ftrace_disabled)
2739 return; 3371 goto out_unlock;
2740 3372
2741 mutex_lock(&ftrace_lock);
2742 do_for_each_ftrace_rec(pg, rec) { 3373 do_for_each_ftrace_rec(pg, rec) {
2743 if (within_module_core(rec->ip, mod)) { 3374 if (within_module_core(rec->ip, mod)) {
2744 /* 3375 /*
@@ -2749,6 +3380,7 @@ void ftrace_release_mod(struct module *mod)
2749 ftrace_free_rec(rec); 3380 ftrace_free_rec(rec);
2750 } 3381 }
2751 } while_for_each_ftrace_rec(); 3382 } while_for_each_ftrace_rec();
3383 out_unlock:
2752 mutex_unlock(&ftrace_lock); 3384 mutex_unlock(&ftrace_lock);
2753} 3385}
2754 3386
@@ -2835,6 +3467,10 @@ void __init ftrace_init(void)
2835 3467
2836#else 3468#else
2837 3469
3470static struct ftrace_ops global_ops = {
3471 .func = ftrace_stub,
3472};
3473
2838static int __init ftrace_nodyn_init(void) 3474static int __init ftrace_nodyn_init(void)
2839{ 3475{
2840 ftrace_enabled = 1; 3476 ftrace_enabled = 1;
@@ -2845,12 +3481,47 @@ device_initcall(ftrace_nodyn_init);
2845static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3481static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
2846static inline void ftrace_startup_enable(int command) { } 3482static inline void ftrace_startup_enable(int command) { }
2847/* Keep as macros so we do not need to define the commands */ 3483/* Keep as macros so we do not need to define the commands */
2848# define ftrace_startup(command) do { } while (0) 3484# define ftrace_startup(ops, command) \
2849# define ftrace_shutdown(command) do { } while (0) 3485 ({ \
3486 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
3487 0; \
3488 })
3489# define ftrace_shutdown(ops, command) do { } while (0)
2850# define ftrace_startup_sysctl() do { } while (0) 3490# define ftrace_startup_sysctl() do { } while (0)
2851# define ftrace_shutdown_sysctl() do { } while (0) 3491# define ftrace_shutdown_sysctl() do { } while (0)
3492
3493static inline int
3494ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3495{
3496 return 1;
3497}
3498
2852#endif /* CONFIG_DYNAMIC_FTRACE */ 3499#endif /* CONFIG_DYNAMIC_FTRACE */
2853 3500
3501static void
3502ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3503{
3504 struct ftrace_ops *op;
3505
3506 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
3507 return;
3508
3509 trace_recursion_set(TRACE_INTERNAL_BIT);
3510 /*
3511 * Some of the ops may be dynamically allocated,
3512 * they must be freed after a synchronize_sched().
3513 */
3514 preempt_disable_notrace();
3515 op = rcu_dereference_raw(ftrace_ops_list);
3516 while (op != &ftrace_list_end) {
3517 if (ftrace_ops_test(op, ip))
3518 op->func(ip, parent_ip);
3519 op = rcu_dereference_raw(op->next);
3520 };
3521 preempt_enable_notrace();
3522 trace_recursion_clear(TRACE_INTERNAL_BIT);
3523}
3524
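The comment inside ftrace_ops_list_func() above doubles as the lifetime rule for dynamically allocated ops: the list is walked under preempt_disable_notrace(), so an ops that was kzalloc()'d (as the selftest's dyn_ops is) must only be freed once every walker has left the list. A short sketch of that teardown ordering, using a hypothetical helper name:

#include <linux/ftrace.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical teardown for a dynamically allocated ftrace_ops. */
static void my_free_dyn_ops(struct ftrace_ops *dyn_ops)
{
	unregister_ftrace_function(dyn_ops);
	/* walkers run with preemption disabled; wait for them to finish */
	synchronize_sched();
	kfree(dyn_ops);
}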
2854static void clear_ftrace_swapper(void) 3525static void clear_ftrace_swapper(void)
2855{ 3526{
2856 struct task_struct *p; 3527 struct task_struct *p;
@@ -3143,19 +3814,23 @@ void ftrace_kill(void)
3143 */ 3814 */
3144int register_ftrace_function(struct ftrace_ops *ops) 3815int register_ftrace_function(struct ftrace_ops *ops)
3145{ 3816{
3146 int ret; 3817 int ret = -1;
3147
3148 if (unlikely(ftrace_disabled))
3149 return -1;
3150 3818
3151 mutex_lock(&ftrace_lock); 3819 mutex_lock(&ftrace_lock);
3152 3820
3821 if (unlikely(ftrace_disabled))
3822 goto out_unlock;
3823
3153 ret = __register_ftrace_function(ops); 3824 ret = __register_ftrace_function(ops);
3154 ftrace_startup(0); 3825 if (!ret)
3826 ret = ftrace_startup(ops, 0);
3155 3827
3828
3829 out_unlock:
3156 mutex_unlock(&ftrace_lock); 3830 mutex_unlock(&ftrace_lock);
3157 return ret; 3831 return ret;
3158} 3832}
3833EXPORT_SYMBOL_GPL(register_ftrace_function);
3159 3834
3160/** 3835/**
3161 * unregister_ftrace_function - unregister a function for profiling. 3836 * unregister_ftrace_function - unregister a function for profiling.
@@ -3169,25 +3844,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3169 3844
3170 mutex_lock(&ftrace_lock); 3845 mutex_lock(&ftrace_lock);
3171 ret = __unregister_ftrace_function(ops); 3846 ret = __unregister_ftrace_function(ops);
3172 ftrace_shutdown(0); 3847 if (!ret)
3848 ftrace_shutdown(ops, 0);
3173 mutex_unlock(&ftrace_lock); 3849 mutex_unlock(&ftrace_lock);
3174 3850
3175 return ret; 3851 return ret;
3176} 3852}
3853EXPORT_SYMBOL_GPL(unregister_ftrace_function);
3177 3854
3178int 3855int
3179ftrace_enable_sysctl(struct ctl_table *table, int write, 3856ftrace_enable_sysctl(struct ctl_table *table, int write,
3180 void __user *buffer, size_t *lenp, 3857 void __user *buffer, size_t *lenp,
3181 loff_t *ppos) 3858 loff_t *ppos)
3182{ 3859{
3183 int ret; 3860 int ret = -ENODEV;
3184
3185 if (unlikely(ftrace_disabled))
3186 return -ENODEV;
3187 3861
3188 mutex_lock(&ftrace_lock); 3862 mutex_lock(&ftrace_lock);
3189 3863
3190 ret = proc_dointvec(table, write, buffer, lenp, ppos); 3864 if (unlikely(ftrace_disabled))
3865 goto out;
3866
3867 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3191 3868
3192 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3869 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3193 goto out; 3870 goto out;
@@ -3199,11 +3876,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3199 ftrace_startup_sysctl(); 3876 ftrace_startup_sysctl();
3200 3877
3201 /* we are starting ftrace again */ 3878 /* we are starting ftrace again */
3202 if (ftrace_list != &ftrace_list_end) { 3879 if (ftrace_ops_list != &ftrace_list_end) {
3203 if (ftrace_list->next == &ftrace_list_end) 3880 if (ftrace_ops_list->next == &ftrace_list_end)
3204 ftrace_trace_function = ftrace_list->func; 3881 ftrace_trace_function = ftrace_ops_list->func;
3205 else 3882 else
3206 ftrace_trace_function = ftrace_list_func; 3883 ftrace_trace_function = ftrace_ops_list_func;
3207 } 3884 }
3208 3885
3209 } else { 3886 } else {
@@ -3392,7 +4069,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
3392 ftrace_graph_return = retfunc; 4069 ftrace_graph_return = retfunc;
3393 ftrace_graph_entry = entryfunc; 4070 ftrace_graph_entry = entryfunc;
3394 4071
3395 ftrace_startup(FTRACE_START_FUNC_RET); 4072 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
3396 4073
3397out: 4074out:
3398 mutex_unlock(&ftrace_lock); 4075 mutex_unlock(&ftrace_lock);
@@ -3409,7 +4086,7 @@ void unregister_ftrace_graph(void)
3409 ftrace_graph_active--; 4086 ftrace_graph_active--;
3410 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 4087 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3411 ftrace_graph_entry = ftrace_graph_entry_stub; 4088 ftrace_graph_entry = ftrace_graph_entry_stub;
3412 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 4089 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
3413 unregister_pm_notifier(&ftrace_suspend_notifier); 4090 unregister_pm_notifier(&ftrace_suspend_notifier);
3414 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4091 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3415 4092
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0ef7b4b2a1f7..b0c7aa407943 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2216,7 +2216,7 @@ static noinline void trace_recursive_fail(void)
2216 2216
2217 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" 2217 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2218 "HC[%lu]:SC[%lu]:NMI[%lu]\n", 2218 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2219 current->trace_recursion, 2219 trace_recursion_buffer(),
2220 hardirq_count() >> HARDIRQ_SHIFT, 2220 hardirq_count() >> HARDIRQ_SHIFT,
2221 softirq_count() >> SOFTIRQ_SHIFT, 2221 softirq_count() >> SOFTIRQ_SHIFT,
2222 in_nmi()); 2222 in_nmi());
@@ -2226,9 +2226,9 @@ static noinline void trace_recursive_fail(void)
2226 2226
2227static inline int trace_recursive_lock(void) 2227static inline int trace_recursive_lock(void)
2228{ 2228{
2229 current->trace_recursion++; 2229 trace_recursion_inc();
2230 2230
2231 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) 2231 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
2232 return 0; 2232 return 0;
2233 2233
2234 trace_recursive_fail(); 2234 trace_recursive_fail();
@@ -2238,9 +2238,9 @@ static inline int trace_recursive_lock(void)
2238 2238
2239static inline void trace_recursive_unlock(void) 2239static inline void trace_recursive_unlock(void)
2240{ 2240{
2241 WARN_ON_ONCE(!current->trace_recursion); 2241 WARN_ON_ONCE(!trace_recursion_buffer());
2242 2242
2243 current->trace_recursion--; 2243 trace_recursion_dec();
2244} 2244}
2245 2245
2246#else 2246#else
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1cb49be7c7fb..ee9c921d7f21 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2014,9 +2014,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2014{ 2014{
2015 enum print_line_t ret; 2015 enum print_line_t ret;
2016 2016
2017 if (iter->lost_events) 2017 if (iter->lost_events &&
2018 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2018 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2019 iter->cpu, iter->lost_events); 2019 iter->cpu, iter->lost_events))
2020 return TRACE_TYPE_PARTIAL_LINE;
2020 2021
2021 if (iter->trace && iter->trace->print_line) { 2022 if (iter->trace && iter->trace->print_line) {
2022 ret = iter->trace->print_line(iter); 2023 ret = iter->trace->print_line(iter);
@@ -3230,6 +3231,14 @@ waitagain:
3230 3231
3231 if (iter->seq.len >= cnt) 3232 if (iter->seq.len >= cnt)
3232 break; 3233 break;
3234
3235 /*
3236 * Setting the full flag means we reached the trace_seq buffer
3237 * size and we should leave by partial output condition above.
3238 * One of the trace_seq_* functions is not used properly.
3239 */
3240 WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
3241 iter->ent->type);
3233 } 3242 }
3234 trace_access_unlock(iter->cpu_file); 3243 trace_access_unlock(iter->cpu_file);
3235 trace_event_read_unlock(); 3244 trace_event_read_unlock();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5e9dfc6286dd..229f8591f61d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -419,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]);
419extern unsigned long ftrace_update_tot_cnt; 419extern unsigned long ftrace_update_tot_cnt;
420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
421extern int DYN_FTRACE_TEST_NAME(void); 421extern int DYN_FTRACE_TEST_NAME(void);
422#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
423extern int DYN_FTRACE_TEST_NAME2(void);
422#endif 424#endif
423 425
424extern int ring_buffer_expanded; 426extern int ring_buffer_expanded;
@@ -782,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[];
782 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
783#include "trace_entries.h" 785#include "trace_entries.h"
784 786
787/* Only current can touch trace_recursion */
788#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
789#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
790
791/* Ring buffer has the 10 LSB bits to count */
792#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
793
794/* for function tracing recursion */
795#define TRACE_INTERNAL_BIT (1<<11)
796#define TRACE_GLOBAL_BIT (1<<12)
797
798#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
799#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
800#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
801
785#endif /* _LINUX_KERNEL_TRACE_H */ 802#endif /* _LINUX_KERNEL_TRACE_H */
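The new per-task recursion word is split: the low 10 bits remain a counter for the ring buffer (trace_recursion_buffer()), while bits 11 and 12 flag that the internal or the global ops list walker is already running on this task. The guard pattern is the one ftrace_ops_list_func() in ftrace.c uses; a sketch with a hypothetical handler name, assuming this header is included:

/* Hypothetical callback guarded the same way as ftrace_ops_list_func(). */
static void my_handler(unsigned long ip, unsigned long parent_ip)
{
	if (trace_recursion_test(TRACE_INTERNAL_BIT))
		return;			/* already tracing on this task */

	trace_recursion_set(TRACE_INTERNAL_BIT);
	/* ... work that may itself hit traced functions ... */
	trace_recursion_clear(TRACE_INTERNAL_BIT);
}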
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2fe110341359..686ec399f2a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1657,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata =
1657 1657
1658static __init void event_trace_self_test_with_function(void) 1658static __init void event_trace_self_test_with_function(void)
1659{ 1659{
1660 register_ftrace_function(&trace_ops); 1660 int ret;
1661 ret = register_ftrace_function(&trace_ops);
1662 if (WARN_ON(ret < 0)) {
1663 pr_info("Failed to enable function tracer for event tests\n");
1664 return;
1665 }
1661 pr_info("Running tests again, along with the function tracer\n"); 1666 pr_info("Running tests again, along with the function tracer\n");
1662 event_trace_self_tests(); 1667 event_trace_self_tests();
1663 unregister_ftrace_function(&trace_ops); 1668 unregister_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 16aee4d44e8f..8d0e1cc4e974 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
149static struct ftrace_ops trace_ops __read_mostly = 149static struct ftrace_ops trace_ops __read_mostly =
150{ 150{
151 .func = function_trace_call, 151 .func = function_trace_call,
152 .flags = FTRACE_OPS_FL_GLOBAL,
152}; 153};
153 154
154static struct ftrace_ops trace_stack_ops __read_mostly = 155static struct ftrace_ops trace_stack_ops __read_mostly =
155{ 156{
156 .func = function_stack_trace_call, 157 .func = function_stack_trace_call,
158 .flags = FTRACE_OPS_FL_GLOBAL,
157}; 159};
158 160
159/* Our two options */ 161/* Our two options */
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a4969b47afc1..c77424be284d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
153static struct ftrace_ops trace_ops __read_mostly = 153static struct ftrace_ops trace_ops __read_mostly =
154{ 154{
155 .func = irqsoff_tracer_call, 155 .func = irqsoff_tracer_call,
156 .flags = FTRACE_OPS_FL_GLOBAL,
156}; 157};
157#endif /* CONFIG_FUNCTION_TRACER */ 158#endif /* CONFIG_FUNCTION_TRACER */
158 159
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 35d55a386145..27d13b36b8be 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -53,7 +53,6 @@ const char *reserved_field_names[] = {
53 "common_preempt_count", 53 "common_preempt_count",
54 "common_pid", 54 "common_pid",
55 "common_tgid", 55 "common_tgid",
56 "common_lock_depth",
57 FIELD_STRING_IP, 56 FIELD_STRING_IP,
58 FIELD_STRING_RETIP, 57 FIELD_STRING_RETIP,
59 FIELD_STRING_FUNC, 58 FIELD_STRING_FUNC,
@@ -1871,8 +1870,12 @@ fs_initcall(init_kprobe_trace);
1871 1870
1872#ifdef CONFIG_FTRACE_STARTUP_TEST 1871#ifdef CONFIG_FTRACE_STARTUP_TEST
1873 1872
1874static int kprobe_trace_selftest_target(int a1, int a2, int a3, 1873/*
1875 int a4, int a5, int a6) 1874 * The "__used" keeps gcc from removing the function symbol
1875 * from the kallsyms table.
1876 */
1877static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
1878 int a4, int a5, int a6)
1876{ 1879{
1877 return a1 + a2 + a3 + a4 + a5 + a6; 1880 return a1 + a2 + a3 + a4 + a5 + a6;
1878} 1881}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 456be9063c2d..e37de492a9e1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
353} 353}
354EXPORT_SYMBOL(ftrace_print_symbols_seq); 354EXPORT_SYMBOL(ftrace_print_symbols_seq);
355 355
356#if BITS_PER_LONG == 32
357const char *
358ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
359 const struct trace_print_flags_u64 *symbol_array)
360{
361 int i;
362 const char *ret = p->buffer + p->len;
363
364 for (i = 0; symbol_array[i].name; i++) {
365
366 if (val != symbol_array[i].mask)
367 continue;
368
369 trace_seq_puts(p, symbol_array[i].name);
370 break;
371 }
372
373 if (!p->len)
374 trace_seq_printf(p, "0x%llx", val);
375
376 trace_seq_putc(p, 0);
377
378 return ret;
379}
380EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
381#endif
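On a 32-bit kernel an unsigned long cannot hold a 64-bit event field, which is why this variant takes unsigned long long and is only built when BITS_PER_LONG == 32; on 64-bit builds the existing ftrace_print_symbols_seq() already covers it. A hedged usage sketch with a made-up table (field names follow the loop above; the trace_seq pointer would normally come from a print handler, and the declarations are assumed to live alongside the existing helpers):

#include <linux/ftrace_event.h>

/* Hypothetical value/name table; a NULL name terminates the scan. */
static const struct trace_print_flags_u64 my_states[] = {
	{ .mask = 0x0000000000000001ULL,	.name = "INIT" },
	{ .mask = 0x0000000100000000ULL,	.name = "HIGH" },
	{ .mask = 0,				.name = NULL }
};

static const char *my_show_state(struct trace_seq *p, u64 val)
{
	return ftrace_print_symbols_seq_u64(p, val, my_states);
}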
382
356const char * 383const char *
357ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 384ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
358{ 385{
@@ -830,6 +857,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
830enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 857enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
831 struct trace_event *event) 858 struct trace_event *event)
832{ 859{
860 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
861 return TRACE_TYPE_PARTIAL_LINE;
862
833 return TRACE_TYPE_HANDLED; 863 return TRACE_TYPE_HANDLED;
834} 864}
835 865
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf0..1f06468a10d7 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex);
32 32
33struct trace_bprintk_fmt { 33struct trace_bprintk_fmt {
34 struct list_head list; 34 struct list_head list;
35 char fmt[0]; 35 const char *fmt;
36}; 36};
37 37
38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) 38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
@@ -49,6 +49,7 @@ static
49void hold_module_trace_bprintk_format(const char **start, const char **end) 49void hold_module_trace_bprintk_format(const char **start, const char **end)
50{ 50{
51 const char **iter; 51 const char **iter;
52 char *fmt;
52 53
53 mutex_lock(&btrace_mutex); 54 mutex_lock(&btrace_mutex);
54 for (iter = start; iter < end; iter++) { 55 for (iter = start; iter < end; iter++) {
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
58 continue; 59 continue;
59 } 60 }
60 61
61 tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) 62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
62 + strlen(*iter) + 1, GFP_KERNEL); 63 if (tb_fmt)
63 if (tb_fmt) { 64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) {
64 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
65 strcpy(tb_fmt->fmt, *iter); 67 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt;
66 *iter = tb_fmt->fmt; 69 *iter = tb_fmt->fmt;
67 } else 70 } else {
71 kfree(tb_fmt);
68 *iter = NULL; 72 *iter = NULL;
73 }
69 } 74 }
70 mutex_unlock(&btrace_mutex); 75 mutex_unlock(&btrace_mutex);
71} 76}
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,
84 return 0; 89 return 0;
85} 90}
86 91
92/*
93 * The debugfs/tracing/printk_formats file maps the addresses with
94 * the ASCII formats that are used in the bprintk events in the
95 * buffer. For userspace tools to be able to decode the events from
96 * the buffer, they need to be able to map the address with the format.
97 *
98 * The addresses of the bprintk formats are in their own section
 99 * __trace_printk_fmt. But for modules we copy them into a linked list.
100 * The code to print the formats and their addresses passes around the
101 * address of the fmt string. If the fmt address passed into the seq
102 * functions is within the kernel core __trace_printk_fmt section, then
103 * it simply uses the next pointer in the list.
104 *
105 * When the fmt pointer is outside the kernel core __trace_printk_fmt
106 * section, then we need to read the linked list pointers. The trick is
107 * we pass the address of the string to the seq function just like
108 * we do for the kernel core formats. To get back the structure that
109 * holds the format, we simply use container_of() and then go to the
110 * next format in the list.
111 */
112static const char **
113find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
114{
115 struct trace_bprintk_fmt *mod_fmt;
116
117 if (list_empty(&trace_bprintk_fmt_list))
118 return NULL;
119
120 /*
121 * v will point to the address of the fmt record from t_next
122 * v will be NULL from t_start.
123 * If this is the first pointer or called from start
124 * then we need to walk the list.
125 */
126 if (!v || start_index == *pos) {
127 struct trace_bprintk_fmt *p;
128
129 /* search the module list */
130 list_for_each_entry(p, &trace_bprintk_fmt_list, list) {
131 if (start_index == *pos)
132 return &p->fmt;
133 start_index++;
134 }
135 /* pos > index */
136 return NULL;
137 }
138
139 /*
140 * v points to the address of the fmt field in the mod list
141 * structure that holds the module print format.
142 */
143 mod_fmt = container_of(v, typeof(*mod_fmt), fmt);
144 if (mod_fmt->list.next == &trace_bprintk_fmt_list)
145 return NULL;
146
147 mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list);
148
149 return &mod_fmt->fmt;
150}
151
152static void format_mod_start(void)
153{
154 mutex_lock(&btrace_mutex);
155}
156
157static void format_mod_stop(void)
158{
159 mutex_unlock(&btrace_mutex);
160}
161
87#else /* !CONFIG_MODULES */ 162#else /* !CONFIG_MODULES */
88__init static int 163__init static int
89module_trace_bprintk_format_notify(struct notifier_block *self, 164module_trace_bprintk_format_notify(struct notifier_block *self,
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self,
91{ 166{
92 return 0; 167 return 0;
93} 168}
169static inline const char **
170find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
171{
172 return NULL;
173}
174static inline void format_mod_start(void) { }
175static inline void format_mod_stop(void) { }
94#endif /* CONFIG_MODULES */ 176#endif /* CONFIG_MODULES */
95 177
96 178
@@ -153,20 +235,30 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
153} 235}
154EXPORT_SYMBOL_GPL(__ftrace_vprintk); 236EXPORT_SYMBOL_GPL(__ftrace_vprintk);
155 237
238static const char **find_next(void *v, loff_t *pos)
239{
240 const char **fmt = v;
241 int start_index;
242
243 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
244
245 if (*pos < start_index)
246 return __start___trace_bprintk_fmt + *pos;
247
248 return find_next_mod_format(start_index, v, fmt, pos);
249}
250
156static void * 251static void *
157t_start(struct seq_file *m, loff_t *pos) 252t_start(struct seq_file *m, loff_t *pos)
158{ 253{
159 const char **fmt = __start___trace_bprintk_fmt + *pos; 254 format_mod_start();
160 255 return find_next(NULL, pos);
161 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
162 return NULL;
163 return fmt;
164} 256}
165 257
166static void *t_next(struct seq_file *m, void * v, loff_t *pos) 258static void *t_next(struct seq_file *m, void * v, loff_t *pos)
167{ 259{
168 (*pos)++; 260 (*pos)++;
169 return t_start(m, pos); 261 return find_next(v, pos);
170} 262}
171 263
172static int t_show(struct seq_file *m, void *v) 264static int t_show(struct seq_file *m, void *v)
@@ -205,6 +297,7 @@ static int t_show(struct seq_file *m, void *v)
205 297
206static void t_stop(struct seq_file *m, void *p) 298static void t_stop(struct seq_file *m, void *p)
207{ 299{
300 format_mod_stop();
208} 301}
209 302
210static const struct seq_operations show_format_seq_ops = { 303static const struct seq_operations show_format_seq_ops = {
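What format_mod_start()/format_mod_stop() buy is that btrace_mutex is held for the whole seq_file pass when modules are enabled: t_start() takes it before the first find_next() and t_stop() releases it, so the module format list cannot change while userspace reads printk_formats. The initializer cut off by the diff context above is, plausibly, just the standard wiring of those four callbacks; a sketch assuming no other fields are set:

static const struct seq_operations show_format_seq_ops = {
	.start	= t_start,	/* format_mod_start(): takes btrace_mutex   */
	.next	= t_next,	/* core section first, then the module list */
	.show	= t_show,
	.stop	= t_stop,	/* format_mod_stop(): drops btrace_mutex    */
};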
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7319559ed59f..f029dd4fd2ca 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
129static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
130{ 130{
131 .func = wakeup_tracer_call, 131 .func = wakeup_tracer_call,
132 .flags = FTRACE_OPS_FL_GLOBAL,
132}; 133};
133#endif /* CONFIG_FUNCTION_TRACER */ 134#endif /* CONFIG_FUNCTION_TRACER */
134 135
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 659732eba07c..288541f977fb 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
101 101
102#ifdef CONFIG_DYNAMIC_FTRACE 102#ifdef CONFIG_DYNAMIC_FTRACE
103 103
104static int trace_selftest_test_probe1_cnt;
105static void trace_selftest_test_probe1_func(unsigned long ip,
106 unsigned long pip)
107{
108 trace_selftest_test_probe1_cnt++;
109}
110
111static int trace_selftest_test_probe2_cnt;
112static void trace_selftest_test_probe2_func(unsigned long ip,
113 unsigned long pip)
114{
115 trace_selftest_test_probe2_cnt++;
116}
117
118static int trace_selftest_test_probe3_cnt;
119static void trace_selftest_test_probe3_func(unsigned long ip,
120 unsigned long pip)
121{
122 trace_selftest_test_probe3_cnt++;
123}
124
125static int trace_selftest_test_global_cnt;
126static void trace_selftest_test_global_func(unsigned long ip,
+					 unsigned long pip)
+{
+	trace_selftest_test_global_cnt++;
+}
+
+static int trace_selftest_test_dyn_cnt;
+static void trace_selftest_test_dyn_func(unsigned long ip,
+					 unsigned long pip)
+{
+	trace_selftest_test_dyn_cnt++;
+}
+
+static struct ftrace_ops test_probe1 = {
+	.func = trace_selftest_test_probe1_func,
+};
+
+static struct ftrace_ops test_probe2 = {
+	.func = trace_selftest_test_probe2_func,
+};
+
+static struct ftrace_ops test_probe3 = {
+	.func = trace_selftest_test_probe3_func,
+};
+
+static struct ftrace_ops test_global = {
+	.func = trace_selftest_test_global_func,
+	.flags = FTRACE_OPS_FL_GLOBAL,
+};
+
+static void print_counts(void)
+{
+	printk("(%d %d %d %d %d) ",
+	       trace_selftest_test_probe1_cnt,
+	       trace_selftest_test_probe2_cnt,
+	       trace_selftest_test_probe3_cnt,
+	       trace_selftest_test_global_cnt,
+	       trace_selftest_test_dyn_cnt);
+}
+
+static void reset_counts(void)
+{
+	trace_selftest_test_probe1_cnt = 0;
+	trace_selftest_test_probe2_cnt = 0;
+	trace_selftest_test_probe3_cnt = 0;
+	trace_selftest_test_global_cnt = 0;
+	trace_selftest_test_dyn_cnt = 0;
+}
+
+static int trace_selftest_ops(int cnt)
+{
+	int save_ftrace_enabled = ftrace_enabled;
+	struct ftrace_ops *dyn_ops;
+	char *func1_name;
+	char *func2_name;
+	int len1;
+	int len2;
+	int ret = -1;
+
+	printk(KERN_CONT "PASSED\n");
+	pr_info("Testing dynamic ftrace ops #%d: ", cnt);
+
+	ftrace_enabled = 1;
+	reset_counts();
+
+	/* Handle PPC64 '.' name */
+	func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+	func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2);
+	len1 = strlen(func1_name);
+	len2 = strlen(func2_name);
+
+	/*
+	 * Probe 1 will trace function 1.
+	 * Probe 2 will trace function 2.
+	 * Probe 3 will trace functions 1 and 2.
+	 */
+	ftrace_set_filter(&test_probe1, func1_name, len1, 1);
+	ftrace_set_filter(&test_probe2, func2_name, len2, 1);
+	ftrace_set_filter(&test_probe3, func1_name, len1, 1);
+	ftrace_set_filter(&test_probe3, func2_name, len2, 0);
+
+	register_ftrace_function(&test_probe1);
+	register_ftrace_function(&test_probe2);
+	register_ftrace_function(&test_probe3);
+	register_ftrace_function(&test_global);
+
+	DYN_FTRACE_TEST_NAME();
+
+	print_counts();
+
+	if (trace_selftest_test_probe1_cnt != 1)
+		goto out;
+	if (trace_selftest_test_probe2_cnt != 0)
+		goto out;
+	if (trace_selftest_test_probe3_cnt != 1)
+		goto out;
+	if (trace_selftest_test_global_cnt == 0)
+		goto out;
+
+	DYN_FTRACE_TEST_NAME2();
+
+	print_counts();
+
+	if (trace_selftest_test_probe1_cnt != 1)
+		goto out;
+	if (trace_selftest_test_probe2_cnt != 1)
+		goto out;
+	if (trace_selftest_test_probe3_cnt != 2)
+		goto out;
+
+	/* Add a dynamic probe */
+	dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL);
+	if (!dyn_ops) {
+		printk("MEMORY ERROR ");
+		goto out;
+	}
+
+	dyn_ops->func = trace_selftest_test_dyn_func;
+
+	register_ftrace_function(dyn_ops);
+
+	trace_selftest_test_global_cnt = 0;
+
+	DYN_FTRACE_TEST_NAME();
+
+	print_counts();
+
+	if (trace_selftest_test_probe1_cnt != 2)
+		goto out_free;
+	if (trace_selftest_test_probe2_cnt != 1)
+		goto out_free;
+	if (trace_selftest_test_probe3_cnt != 3)
+		goto out_free;
+	if (trace_selftest_test_global_cnt == 0)
+		goto out;
+	if (trace_selftest_test_dyn_cnt == 0)
+		goto out_free;
+
+	DYN_FTRACE_TEST_NAME2();
+
+	print_counts();
+
+	if (trace_selftest_test_probe1_cnt != 2)
+		goto out_free;
+	if (trace_selftest_test_probe2_cnt != 2)
+		goto out_free;
+	if (trace_selftest_test_probe3_cnt != 4)
+		goto out_free;
+
+	ret = 0;
+ out_free:
+	unregister_ftrace_function(dyn_ops);
+	kfree(dyn_ops);
+
+ out:
+	/* Purposely unregister in the same order */
+	unregister_ftrace_function(&test_probe1);
+	unregister_ftrace_function(&test_probe2);
+	unregister_ftrace_function(&test_probe3);
+	unregister_ftrace_function(&test_global);
+
+	/* Make sure everything is off */
+	reset_counts();
+	DYN_FTRACE_TEST_NAME();
+	DYN_FTRACE_TEST_NAME();
+
+	if (trace_selftest_test_probe1_cnt ||
+	    trace_selftest_test_probe2_cnt ||
+	    trace_selftest_test_probe3_cnt ||
+	    trace_selftest_test_global_cnt ||
+	    trace_selftest_test_dyn_cnt)
+		ret = -1;
+
+	ftrace_enabled = save_ftrace_enabled;
+
+	return ret;
+}
+
 /* Test dynamic code modification and ftrace filters */
 int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 					   struct trace_array *tr,
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
 
 	/* filter only on our function */
-	ftrace_set_filter(func_name, strlen(func_name), 1);
+	ftrace_set_global_filter(func_name, strlen(func_name), 1);
 
 	/* enable tracing */
 	ret = tracer_init(trace, tr);
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 
 	/* check the trace buffer */
 	ret = trace_test_buffer(tr, &count);
-	trace->reset(tr);
 	tracing_start();
 
 	/* we should only have one item */
 	if (!ret && count != 1) {
+		trace->reset(tr);
 		printk(KERN_CONT ".. filter failed count=%ld ..", count);
 		ret = -1;
 		goto out;
 	}
 
+	/* Test the ops with global tracing running */
+	ret = trace_selftest_ops(1);
+	trace->reset(tr);
+
  out:
 	ftrace_enabled = save_ftrace_enabled;
 	tracer_enabled = save_tracer_enabled;
 
 	/* Enable tracing on all functions again */
-	ftrace_set_filter(NULL, 0, 1);
+	ftrace_set_global_filter(NULL, 0, 1);
+
+	/* Test the ops with global tracing off */
+	if (!ret)
+		ret = trace_selftest_ops(2);
 
 	return ret;
 }
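
Editorial note: the new trace_selftest_ops() above exercises the per-ops filter interface, where ftrace_set_filter() now takes the struct ftrace_ops whose filter it modifies, while the old shared filter (the set_ftrace_filter file) is reached through ftrace_set_global_filter(). A minimal sketch of a callback using a private filter, not part of the patch; the function name "my_func" and the module wiring are hypothetical:

#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>

/* callback invoked for every hit on functions matching my_ops' filter */
static void my_trace_callback(unsigned long ip, unsigned long parent_ip)
{
	/* keep this lock-free and fast: it runs on the function-entry path */
}

static struct ftrace_ops my_ops = {
	.func = my_trace_callback,
	/* no FTRACE_OPS_FL_GLOBAL: this ops keeps its own filter hash */
};

static int __init my_init(void)
{
	/* install a filter on this ops only (last argument resets the hash) */
	ftrace_set_filter(&my_ops, "my_func", strlen("my_func"), 1);
	return register_ftrace_function(&my_ops);
}

static void __exit my_exit(void)
{
	unregister_ftrace_function(&my_ops);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

Registering an ops with .flags = FTRACE_OPS_FL_GLOBAL instead (as test_global does above) makes it follow the shared global filter rather than a private one.
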
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 54dd77cce5bf..b4c475a0a48b 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void)
 	/* used to call mcount */
 	return 0;
 }
+
+int DYN_FTRACE_TEST_NAME2(void)
+{
+	/* used to call mcount */
+	return 0;
+}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4c5dead0c239..b0b53b8e4c25 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
 static struct ftrace_ops trace_ops __read_mostly =
 {
 	.func = stack_trace_call,
+	.flags = FTRACE_OPS_FL_GLOBAL,
 };
 
 static ssize_t
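
Editorial gloss, not from the patch: with per-ops filtering an ftrace_ops either carries its own filter or, via FTRACE_OPS_FL_GLOBAL, stays on the shared list so it keeps honouring the global set_ftrace_filter file, which is what the stack tracer opts for here. A sketch of the two declarations, with hypothetical callback names:

/* Sketch only: private-filter ops vs. an ops on the shared global list */
static struct ftrace_ops private_ops = {
	.func	= my_private_callback,	/* filtered via ftrace_set_filter(&private_ops, ...) */
};

static struct ftrace_ops shared_ops = {
	.func	= my_shared_callback,
	.flags	= FTRACE_OPS_FL_GLOBAL, /* filtered via the global set_ftrace_filter */
};
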
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 68187af4889e..b219f1449c54 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
 {
 	WARN_ON(strcmp((*entry)->name, elem->name) != 0);
 
-	if (elem->regfunc && !elem->state && active)
+	if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
 		elem->regfunc();
-	else if (elem->unregfunc && elem->state && !active)
+	else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
 		elem->unregfunc();
 
 	/*
@@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
 	 * is used.
 	 */
 	rcu_assign_pointer(elem->funcs, (*entry)->funcs);
-	if (!elem->state && active) {
-		jump_label_enable(&elem->state);
-		elem->state = active;
-	} else if (elem->state && !active) {
-		jump_label_disable(&elem->state);
-		elem->state = active;
-	}
+	if (active && !jump_label_enabled(&elem->key))
+		jump_label_inc(&elem->key);
+	else if (!active && jump_label_enabled(&elem->key))
+		jump_label_dec(&elem->key);
 }
 
 /*
@@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
  */
 static void disable_tracepoint(struct tracepoint *elem)
 {
-	if (elem->unregfunc && elem->state)
+	if (elem->unregfunc && jump_label_enabled(&elem->key))
 		elem->unregfunc();
 
-	if (elem->state) {
-		jump_label_disable(&elem->state);
-		elem->state = 0;
-	}
+	if (jump_label_enabled(&elem->key))
+		jump_label_dec(&elem->key);
 	rcu_assign_pointer(elem->funcs, NULL);
 }
 
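
Editorial note: the tracepoint hunks above replace the old boolean elem->state (flipped with jump_label_enable/disable) with a reference-counted key. A minimal sketch of that counted pattern, assuming the jump_label interface of this series (struct jump_label_key, jump_label_inc/dec, jump_label_enabled, static_branch); the feature key and helpers are hypothetical:

#include <linux/jump_label.h>

static struct jump_label_key my_feature_key;	/* branch defaults to "off" */

static void my_slow_feature_work(void) { /* hypothetical slow path */ }

static void my_feature_set(int active)
{
	/* mirror set_tracepoint(): take or drop exactly one reference */
	if (active && !jump_label_enabled(&my_feature_key))
		jump_label_inc(&my_feature_key);	/* patches the branch in */
	else if (!active && jump_label_enabled(&my_feature_key))
		jump_label_dec(&my_feature_key);	/* patches it back out */
}

static void my_hot_path(void)
{
	if (static_branch(&my_feature_key))	/* near-free when disabled */
		my_slow_feature_work();
}

Without CONFIG_JUMP_LABEL the same calls fall back to plain atomic counting, so the pattern works either way.
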
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 44646179eaba..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
 
 static struct uts_namespace *create_uts_ns(void)
 {
@@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref)
 	put_user_ns(ns->user_ns);
 	kfree(ns);
 }
+
+static void *utsns_get(struct task_struct *task)
+{
+	struct uts_namespace *ns = NULL;
+	struct nsproxy *nsproxy;
+
+	rcu_read_lock();
+	nsproxy = task_nsproxy(task);
+	if (nsproxy) {
+		ns = nsproxy->uts_ns;
+		get_uts_ns(ns);
+	}
+	rcu_read_unlock();
+
+	return ns;
+}
+
+static void utsns_put(void *ns)
+{
+	put_uts_ns(ns);
+}
+
+static int utsns_install(struct nsproxy *nsproxy, void *ns)
+{
+	get_uts_ns(ns);
+	put_uts_ns(nsproxy->uts_ns);
+	nsproxy->uts_ns = ns;
+	return 0;
+}
+
+const struct proc_ns_operations utsns_operations = {
+	.name = "uts",
+	.type = CLONE_NEWUTS,
+	.get = utsns_get,
+	.put = utsns_put,
+	.install = utsns_install,
+};
+
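
Editorial note: utsns_operations is the kernel-side table behind a /proc/<pid>/ns/uts file, letting a task's UTS namespace be pinned (.get/.put) and entered (.install). A hedged user-space sketch of the intended consumer, assuming the companion setns(2) syscall from the same series (use syscall(__NR_setns, ...) where libc lacks a wrapper); error handling is trimmed:

/* usage: ./join-uts <pid> */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/utsname.h>

int main(int argc, char **argv)
{
	char path[64];
	struct utsname uts;
	int fd;

	if (argc != 2)
		return 1;
	snprintf(path, sizeof(path), "/proc/%s/ns/uts", argv[1]);
	fd = open(path, O_RDONLY);		/* pins the namespace via .get */
	if (fd < 0 || setns(fd, CLONE_NEWUTS) != 0) {	/* .install swaps it in */
		perror("setns");
		return 1;
	}
	uname(&uts);
	printf("hostname in target namespace: %s\n", uts.nodename);
	close(fd);
	return 0;
}
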
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14733d4d156b..3d0c56ad4792 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -28,7 +28,7 @@
 #include <linux/perf_event.h>
 
 int watchdog_enabled = 1;
-int __read_mostly softlockup_thresh = 60;
+int __read_mostly watchdog_thresh = 10;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str)
 __setup("nosoftlockup", nosoftlockup_setup);
 /* */
 
+/*
+ * Hard-lockup warnings should be triggered after just a few seconds. Soft-
+ * lockups can have false positives under extreme conditions. So we generally
+ * want a higher threshold for soft lockups than for hard lockups. So we couple
+ * the thresholds with a factor: we make the soft threshold twice the amount of
+ * time the hard threshold is.
+ */
+static int get_softlockup_thresh(void)
+{
+	return watchdog_thresh * 2;
+}
 
 /*
  * Returns seconds, approximately. We don't need nanosecond
@@ -105,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu)
 static unsigned long get_sample_period(void)
 {
 	/*
-	 * convert softlockup_thresh from seconds to ns
+	 * convert watchdog_thresh from seconds to ns
 	 * the divide by 5 is to give hrtimer 5 chances to
 	 * increment before the hardlockup detector generates
 	 * a warning
 	 */
-	return softlockup_thresh / 5 * NSEC_PER_SEC;
+	return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
 }
 
 /* Commands for resetting the watchdog */
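
Editorial worked example of the threshold coupling above, using the new default rather than anything stated in the patch:

/*   watchdog_thresh          = 10 s  (hard-lockup window, new default)
 *   get_softlockup_thresh()  = 10 * 2 = 20 s
 *   get_sample_period()      = 20 * (NSEC_PER_SEC / 5) = 4e9 ns = 4 s
 * so the per-CPU hrtimer fires five times inside the 20 s soft-lockup
 * window before is_softlockup() can report a stall (previously:
 * softlockup_thresh = 60 s gave a 12 s sample period). */
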
@@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts)
 	unsigned long now = get_timestamp(smp_processor_id());
 
 	/* Warn about unreasonable delays: */
-	if (time_after(now, touch_ts + softlockup_thresh))
+	if (time_after(now, touch_ts + get_softlockup_thresh()))
 		return now - touch_ts;
 
 	return 0;
@@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu)
 
 	/* Try to register using hardware perf events */
 	wd_attr = &wd_hw_attr;
-	wd_attr->sample_period = hw_nmi_get_sample_period();
+	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
 	if (!IS_ERR(event)) {
 		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
@@ -404,15 +415,13 @@ static void watchdog_nmi_disable(int cpu) { return; }
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 /* prepare/enable/disable routines */
-static int watchdog_prepare_cpu(int cpu)
+static void watchdog_prepare_cpu(int cpu)
 {
 	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
 
 	WARN_ON(per_cpu(softlockup_watchdog, cpu));
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = watchdog_timer_fn;
-
-	return 0;
 }
 
 static int watchdog_enable(int cpu)
@@ -501,28 +510,25 @@ static void watchdog_disable_all_cpus(void)
 /* sysctl functions */
 #ifdef CONFIG_SYSCTL
 /*
- * proc handler for /proc/sys/kernel/nmi_watchdog
+ * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
  */
 
-int proc_dowatchdog_enabled(struct ctl_table *table, int write,
-		     void __user *buffer, size_t *length, loff_t *ppos)
+int proc_dowatchdog(struct ctl_table *table, int write,
+		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	proc_dointvec(table, write, buffer, length, ppos);
+	int ret;
 
-	if (write) {
-		if (watchdog_enabled)
-			watchdog_enable_all_cpus();
-		else
-			watchdog_disable_all_cpus();
-	}
-	return 0;
-}
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		goto out;
 
-int proc_dowatchdog_thresh(struct ctl_table *table, int write,
-			   void __user *buffer,
-			   size_t *lenp, loff_t *ppos)
-{
-	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (watchdog_enabled && watchdog_thresh)
+		watchdog_enable_all_cpus();
+	else
+		watchdog_disable_all_cpus();
+
+out:
+	return ret;
 }
 #endif /* CONFIG_SYSCTL */
 
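
Editorial sketch of how a sysctl entry might route through the merged handler; the actual table lives in kernel/sysctl.c, which is changed elsewhere in this series and not shown here. The bounds variables are hypothetical, and the snippet assumes watchdog_thresh and proc_dowatchdog are in scope:

#include <linux/sysctl.h>

static int zero;
static int sixty = 60;	/* assumed upper bound for proc_dointvec_minmax */

static struct ctl_table watchdog_sysctls[] = {
	{
		.procname	= "watchdog_thresh",
		.data		= &watchdog_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dowatchdog,	/* merged enable/thresh handler above */
		.extra1		= &zero,
		.extra2		= &sixty,
	},
	{ }
};
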
@@ -534,17 +540,16 @@ static int __cpuinit
 cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	int hotcpu = (unsigned long)hcpu;
-	int err = 0;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		err = watchdog_prepare_cpu(hotcpu);
+		watchdog_prepare_cpu(hotcpu);
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		if (watchdog_enabled)
-			err = watchdog_enable(hotcpu);
+			watchdog_enable(hotcpu);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e3378e8d3a5c..0400553f0d04 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2866,9 +2866,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
 		}
 	}
 
-	/* just in case, make sure it's actually aligned
-	 * - this is affected by PERCPU() alignment in vmlinux.lds.S
-	 */
+	/* just in case, make sure it's actually aligned */
 	BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
 	return wq->cpu_wq.v ? 0 : -ENOMEM;
 }